# Boston 311 Tutorial

This notebook walks you through the basic usage of this package: training 3 models on the Boston 311 data and using them to predict the outcome of cases from the last 30 days.

In [1]:
!pip install keras-tuner

Defaulting to user installation because normal site-packages is not writeable


In [2]:
! pip install ../

Defaulting to user installation because normal site-packages is not writeable
Processing /home/briarmoss/Documents/Boston_311
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: boston311
  Building wheel for boston311 (pyproject.toml) ... [?25ldone
[?25h  Created wheel for boston311: filename=boston311-0.1.0-py3-none-any.whl size=18860 sha256=62694168d1873a6af6b0e2d78647beb2c7d7efd63b775df240cbcf46b10ef62f
  Stored in directory: /tmp/pip-ephem-wheel-cache-bhrrp125/wheels/3d/69/ee/0a6ac96b9c09c948fc0e74f2724a9703aa39749a41fa757c9e
Successfully built boston311
Installing collected packages: boston311
  Attempting uninstall: boston311
    Found existing installation: boston311 0.1.0
    Uninstalling boston311-0.1.0:
      Successfully uninstalled boston311-0.1.0
Successfully installed boston311-0.1.0


## Import the Boston 311 model classes

In [3]:
import os
import pandas as pd

In [4]:
from boston311 import Boston311LogReg, Boston311EventDecTree, Boston311SurvDecTree, Boston311KerasNLP


2023-10-16 12:48:39.170054: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-16 12:48:39.306707: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-16 12:48:39.307534: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Using TensorFlow backend


## Get the latest file URLs and current date ranges

In [5]:
# Fetch the mapping of year -> CSV download URL (printed in the next cell).
# The same mapping is passed to every model below as files_dict.
latest_URLS = Boston311LogReg.get311URLs()

In [6]:
print(latest_URLS)

{'2023': 'https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/e6013a93-1321-4f2a-bf91-8d8a02f1e62f/download/tmp1bctvalo.csv', '2022': 'https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/81a7b022-f8fc-4da5-80e4-b160058ca207/download/tmpfm8veglw.csv', '2021': 'https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/f53ebccd-bc61-49f9-83db-625f209c95f5/download/tmp88p9g82n.csv', '2020': 'https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/6ff6a6fd-3141-4440-a880-6f60a37fe789/download/tmpcv_10m2s.csv', '2019': 'https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/ea2e4696-4a2d-429c-9807-d02eb92e0222/download/tmpcje3ep_w.csv', '2018': 'https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/2be28d90-3a90-4af1-a3f6-f28c1e25880a/download/tmp7602cia8.csv', '2017': 'https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/300221

In [7]:
from datetime import datetime, timedelta

# Date strings that bound the training and prediction windows used below.
# The clock is read exactly once so that "today", "thirty days ago" and
# "tomorrow" are always mutually consistent, even if the cell runs across
# midnight.
# NOTE: this is naive local time; a later cell recomputes today_datestring
# in the US/Eastern timezone.
DATE_FORMAT = "%Y-%m-%d"

now = datetime.now()
thirty_days = timedelta(days=30)
thirty_days_ago = now - thirty_days
today_datestring = now.strftime(DATE_FORMAT)
thirty_days_ago_datestring = thirty_days_ago.strftime(DATE_FORMAT)
tomorrow_datestring = (now + timedelta(days=1)).strftime(DATE_FORMAT)

print(today_datestring, thirty_days_ago_datestring, tomorrow_datestring)

2023-10-16 2023-09-16 2023-10-17


In [8]:
# Root folder under which trained models are saved; save_model_to_dir joins
# a per-model sub-folder name onto this path.
MODEL_FOLDER = './daily_models'

## Load extra features

In [9]:
# Path to the CSV holding the extra features (the embedding columns that are
# loaded and expanded further down).
EXTRA_mydata_FILE = './cls_and_pooled_embeddings_with_three_cols.csv'


In [10]:
def pkl_load_data(csv_path, pkl_path):
    """Load a DataFrame from ``csv_path``, using ``pkl_path`` as an on-disk cache.

    The pickle is used when it exists and is at least as new as the CSV
    (compared by modification time), or when the CSV is no longer present.
    Otherwise the CSV is parsed and the pickle is (re)written from it.

    Parameters
    ----------
    csv_path : str
        Path to the source CSV file.
    pkl_path : str
        Path to the pickle cache; created or overwritten when the CSV is read.

    Returns
    -------
    pandas.DataFrame
        The frame read from the cache or from the CSV.

    Raises
    ------
    FileNotFoundError
        If neither the CSV nor the pickle exists.
    """
    cache_exists = os.path.exists(pkl_path)
    csv_exists = os.path.exists(csv_path)

    if cache_exists:
        # Checking csv_exists first avoids calling getmtime on a missing CSV,
        # which previously raised even though a usable cache was available.
        cache_is_fresh = (
            not csv_exists
            or os.path.getmtime(csv_path) <= os.path.getmtime(pkl_path)
        )
        if cache_is_fresh:
            # NOTE: unpickling can execute arbitrary code; only point
            # pkl_path at files written by this notebook.
            return pd.read_pickle(pkl_path)

    df = pd.read_csv(csv_path)
    df.to_pickle(pkl_path)
    return df

## Define several models

In [11]:
# Boston311SurvDecTree instance: the training range runs from 2022-01-01 to
# thirty days ago, and the prediction range covers the last thirty days.
# 'survivalTimeFill' is set to tomorrow's date string.
surv_tree_scenario = {
    'dropColumnValues': {'source': ['City Worker App', 'Employee Generated']},
    'survivalTimeMin': 0,
    'survivalTimeFill': tomorrow_datestring,
}
linear_tree_model = Boston311SurvDecTree(
    train_date_range=dict(start='2022-01-01', end=thirty_days_ago_datestring),
    predict_date_range=dict(start=thirty_days_ago_datestring, end=today_datestring),
    feature_columns=['type', 'queue'],
    scenario=surv_tree_scenario,
    files_dict=latest_URLS,
)

In [12]:
# Boston311LogReg instance with the same date ranges and feature columns as
# the tree model; its scenario has no 'survivalTimeFill' entry.
logreg_scenario = {
    'dropColumnValues': {'source': ['City Worker App', 'Employee Generated']},
    'survivalTimeMin': 0,
}
logistic_model = Boston311LogReg(
    train_date_range=dict(start='2022-01-01', end=thirty_days_ago_datestring),
    predict_date_range=dict(start=thirty_days_ago_datestring, end=today_datestring),
    feature_columns=['type', 'queue'],
    scenario=logreg_scenario,
    files_dict=latest_URLS,
)

In [13]:
# Second Boston311LogReg instance; in this notebook it is configured with
# exactly the same arguments as logistic_model.
old_logreg_scenario = {
    'dropColumnValues': {'source': ['City Worker App', 'Employee Generated']},
    'survivalTimeMin': 0,
}
oldlogistic_model = Boston311LogReg(
    train_date_range=dict(start='2022-01-01', end=thirty_days_ago_datestring),
    predict_date_range=dict(start=thirty_days_ago_datestring, end=today_datestring),
    feature_columns=['type', 'queue'],
    scenario=old_logreg_scenario,
    files_dict=latest_URLS,
)

In [14]:
# Boston311EventDecTree instance using the same date ranges, feature columns
# and scenario values as the logistic-regression models.
event_tree_scenario = {
    'dropColumnValues': {'source': ['City Worker App', 'Employee Generated']},
    'survivalTimeMin': 0,
}
logistic_tree_model = Boston311EventDecTree(
    train_date_range=dict(start='2022-01-01', end=thirty_days_ago_datestring),
    predict_date_range=dict(start=thirty_days_ago_datestring, end=today_datestring),
    feature_columns=['type', 'queue'],
    scenario=event_tree_scenario,
    files_dict=latest_URLS,
)

In [15]:
# Boston311KerasNLP instance. Unlike the other models its training range
# starts on 2022-03-01 and it uses four feature columns; its scenario also
# sets 'survivalTimeFill' to tomorrow's date string.
keras_nlp_scenario = {
    'dropColumnValues': {'source': ['City Worker App', 'Employee Generated']},
    'survivalTimeMin': 0,
    'survivalTimeFill': tomorrow_datestring,
}
kerasNLP_model = Boston311KerasNLP(
    train_date_range=dict(start='2022-03-01', end=thirty_days_ago_datestring),
    predict_date_range=dict(start=thirty_days_ago_datestring, end=today_datestring),
    feature_columns=['queue', 'subject', 'reason', 'department'],
    scenario=keras_nlp_scenario,
    files_dict=latest_URLS,
)

In [16]:
# Build a filename-friendly timestamp for the current moment in Boston.
from datetime import datetime
from pytz import timezone
import pytz
boston = timezone('US/Eastern')
# Read the clock once: the date part and the time part must describe the same
# instant, otherwise a run that straddles midnight pairs yesterday's date
# with today's time.
now = datetime.now(boston)
today_datestring = now.strftime("%Y-%m-%d")
time_string = now.strftime("%H-%M-%S")
# Combined "YYYY-MM-DD_HH-MM-SS" string.
my_datetime = today_datestring + '_' + time_string

In [17]:
import pandas as pd
import numpy as np
import pickle

# Reset both names before loading (presumably so frames left over from an
# earlier run of this cell can be garbage-collected first).
mydata = None
X = None

# Raw case export and the pickle cache that pkl_load_data keeps next to it.
case_data_csv = 'all_311_cases.csv'
case_data_file = 'case_data.pkl'

# Read the cases (through the cache) and hand the frame to the model's loader.
data = pkl_load_data(case_data_csv, case_data_file)
mydata = kerasNLP_model.load_data(data)




In [18]:
mydata['case_enquiry_id']

2235124    101004204966
2235125    101004204967
2235126    101004204970
2235127    101004204968
2235128    101004204972
               ...     
2696297    101005056200
2696298    101005056201
2696299    101005056204
2696300    101005056205
2696301    101005056207
Name: case_enquiry_id, Length: 461178, dtype: int64

In [19]:
# Rebind mydata to the frame returned by the model's enhance step. The
# warnings recorded below show this step converting closed_dt/open_dt to
# datetimes and deriving survival_time.
mydata = kerasNLP_model.enhance_data(mydata)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['closed_dt'] = pd.to_datetime(data['closed_dt'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['open_dt'] = pd.to_datetime(data['open_dt'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['survival_time'] = data['closed_dt'] - data['open_dt']
A value is trying to be set on a copy of 

In [20]:
# Apply the model's scenario step (the scenario dict was supplied when the
# model was constructed) and rebind mydata to the result.
mydata = kerasNLP_model.apply_scenario(mydata)


In [21]:

# Run the model's cleaning step. Across the scenario and cleaning steps the
# recorded row count falls from 461178 to 406032.
mydata = kerasNLP_model.clean_data(mydata)


In [22]:
print(mydata['case_enquiry_id'])

2235124    101004204966
2235125    101004204967
2235126    101004204970
2235127    101004204968
2235128    101004204972
               ...     
2696297    101005056200
2696298    101005056201
2696299    101005056204
2696300    101005056205
2696301    101005056207
Name: case_enquiry_id, Length: 406032, dtype: int64


In [23]:
import pandas as pd
import numpy as np
from ast import literal_eval
import pickle

# Pickle cache for the embeddings CSV.
pickle_file = 'dataframe.pkl'

X = None

X = pkl_load_data(EXTRA_mydata_FILE, pickle_file)

# A frame that still has a 'service_request_id' column has not been
# post-processed yet: rename the key column, parse the string-encoded
# embeddings, and overwrite the cache with the processed frame. Later loads
# then find 'case_enquiry_id' instead and skip this branch.
if 'service_request_id' in X.columns:
    X.rename(columns={'service_request_id':'case_enquiry_id'}, inplace=True)
    # Each embedding cell holds a Python-literal list; turn it into an array.
    for col in ['desc_cls_embedding', 'desc_pooled_embedding', 'name_cls_embedding', 'name_pooled_embedding', 'code_cls_embedding', 'code_pooled_embedding']:
        X[col] = X[col].apply(literal_eval).apply(np.array)

    # Use a context manager so the file handle is flushed and closed
    # deterministically instead of being left to the garbage collector.
    with open(pickle_file, "wb") as cache_file:
        pickle.dump(X, cache_file)



In [24]:
# Summarise the embeddings frame. DataFrame.info() writes its report itself
# and returns None, so wrapping it in print() only appends a stray "None".
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 266206 entries, 0 to 266205
Data columns (total 7 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   case_enquiry_id        266206 non-null  int64 
 1   desc_cls_embedding     266206 non-null  object
 2   desc_pooled_embedding  266206 non-null  object
 3   name_cls_embedding     266206 non-null  object
 4   name_pooled_embedding  266206 non-null  object
 5   code_cls_embedding     266206 non-null  object
 6   code_pooled_embedding  266206 non-null  object
dtypes: int64(1), object(6)
memory usage: 14.2+ MB
None


In [25]:
# Continue with the embeddings frame under the name df. This is an alias, not
# a copy: in-place changes made through df also change X.
df = X

In [26]:
df.shape

(266206, 7)

In [27]:

# Expand every '<prefix>_embedding' column of df (one array per row) into one
# numeric column per embedding dimension, named '<prefix>_0', '<prefix>_1', ...
embedding_prefixes = ['desc_cls', 'desc_pooled', 'name_cls', 'name_pooled', 'code_cls', 'code_pooled']


def stack_embedding(frame, prefix):
    """Stack the '<prefix>_embedding' column of ``frame`` into one array, one row per record."""
    return np.stack(frame[f'{prefix}_embedding'].to_numpy())


def embedding_frame(matrix, prefix):
    """Wrap ``matrix`` in a DataFrame with columns '<prefix>_0' .. '<prefix>_{k-1}'."""
    return pd.DataFrame(matrix, columns=[f'{prefix}_{i}' for i in range(matrix.shape[1])])


# 1) Stack all six embedding columns while they are still present.
desc_cls_embedding_flattened = stack_embedding(df, 'desc_cls')
desc_pooled_embedding_flattened = stack_embedding(df, 'desc_pooled')
name_cls_embedding_flattened = stack_embedding(df, 'name_cls')
name_pooled_embedding_flattened = stack_embedding(df, 'name_pooled')
code_cls_embedding_flattened = stack_embedding(df, 'code_cls')
code_pooled_embedding_flattened = stack_embedding(df, 'code_pooled')

# 2) Remove the original object columns. The drop is in place, so it also
#    affects X, which df was assigned from without copying.
df.drop(columns=[f'{prefix}_embedding' for prefix in embedding_prefixes], inplace=True)

# 3) One wide numeric frame per embedding.
df_desc_cls = embedding_frame(desc_cls_embedding_flattened, 'desc_cls')
df_desc_pooled = embedding_frame(desc_pooled_embedding_flattened, 'desc_pooled')
df_name_cls = embedding_frame(name_cls_embedding_flattened, 'name_cls')
df_name_pooled = embedding_frame(name_pooled_embedding_flattened, 'name_pooled')
df_code_cls = embedding_frame(code_cls_embedding_flattened, 'code_cls')
df_code_pooled = embedding_frame(code_pooled_embedding_flattened, 'code_pooled')

# 4) Join the remaining columns and the expanded frames side by side.
expanded_frames = [df_desc_cls, df_desc_pooled, df_name_cls, df_name_pooled, df_code_cls, df_code_pooled]
df = pd.concat([df] + expanded_frames, axis=1)

In [28]:
# Work with the IDs as strings so that non-numeric values can be detected.
df['case_enquiry_id'] = df['case_enquiry_id'].astype(str)
# Boolean mask: True where the ID string consists only of numeric characters.
is_numeric = df['case_enquiry_id'].str.isnumeric()

In [29]:
# Keep only the rows whose ID passed the numeric check.
df = df[is_numeric]

In [30]:
# Convert the IDs back to int64, the dtype shown for mydata['case_enquiry_id']
# in the recorded output, ahead of the merge on this column.
# NOTE(review): df comes from a boolean filter, so this column assignment may
# emit SettingWithCopyWarning -- confirm, and consider filtering with .copy().
df['case_enquiry_id'] = df['case_enquiry_id'].astype('int64')

In [31]:
df.shape

(266206, 769)

In [32]:
# Keep one row per case ID (the recorded shape is unchanged by this step).
df = df.drop_duplicates(subset=['case_enquiry_id']) 

In [33]:
df.shape

(266206, 769)

In [34]:
df.head()

Unnamed: 0,case_enquiry_id,desc_cls_0,desc_cls_1,desc_cls_2,desc_cls_3,desc_cls_4,desc_cls_5,desc_cls_6,desc_cls_7,desc_cls_8,...,code_pooled_118,code_pooled_119,code_pooled_120,code_pooled_121,code_pooled_122,code_pooled_123,code_pooled_124,code_pooled_125,code_pooled_126,code_pooled_127
0,101004113559,-1.100319,0.180848,-3.067521,-2.449431,0.048062,0.750774,-1.156688,1.701071,-1.121157,...,0.121683,-0.998929,0.036228,-0.999963,-0.742293,0.969737,-0.998525,0.986848,0.982758,0.931549
1,101004113295,-0.13696,0.691521,-3.540846,-1.352687,1.2992,-0.141181,0.158119,2.410162,0.071449,...,0.086003,-0.998614,0.014076,-0.999995,-0.893147,0.982163,-0.999566,0.996798,0.977021,0.915342
2,101004113630,0.175361,0.668518,-3.55681,-1.355421,1.444425,0.603148,-1.361185,1.510217,-0.07356,...,0.120324,-0.999417,0.059866,-0.999934,-0.765923,0.960436,-0.99833,0.985888,0.973252,0.943457
3,101004113228,-0.649289,0.929046,-2.988562,-1.7672,-0.438132,-0.361119,0.010478,1.114518,-0.448996,...,0.183652,-0.997964,0.057766,-0.999915,-0.724165,0.798789,-0.999548,0.995305,0.995679,0.948751
4,101004113229,-0.649289,0.929046,-2.988562,-1.7672,-0.438132,-0.361119,0.010478,1.114518,-0.448996,...,0.183652,-0.997964,0.057766,-0.999915,-0.724165,0.798789,-0.999548,0.995305,0.995679,0.948751


In [35]:
mydata.shape

(406032, 246)

In [36]:
# Keep one row per case ID in the case data as well (the recorded shape is
# unchanged by this step).
mydata = mydata.drop_duplicates(subset=['case_enquiry_id'])

In [37]:
mydata.shape

(406032, 246)

In [38]:
# Inner join on the case ID: keep only the cases that are present in both the
# case data and the embeddings frame.
new_mydata = mydata.merge(df, on='case_enquiry_id', how='inner')



In [39]:
new_mydata.shape

(156620, 1014)

In [40]:
# Earlier, coarser bucket scheme: nine buckets whose edges are in hours (per
# the labels), ending in a catch-all bucket. Not referenced again in the
# visible part of this notebook.
old_bin_edges = [0, 12, 24, 72, 168, 336, 672, 1344, 2688, 9999999]
old_bin_labels = [
                "0-12 hours",      # Less than half a day
                "12-24 hours",     # Half to one day
                "1-3 days",        # One to three days
                "4-7 days",        # Four to seven days
                "1-2 weeks",       # One to two weeks
                "2-4 weeks",       # Two to four weeks
                "1-2 months",      # One to two months
                "2-4 months",      # Two to four months
                "4+ months"        # More than four months
            ]

In [41]:
# Time buckets passed to split_data: one bucket per day for the first 180 days
# (edges in hours, 24 hours apart), followed by one open-ended "180+ days"
# bucket whose upper edge is a large sentinel. The values are generated rather
# than typed out so that edges and labels cannot drift apart.
HOURS_PER_DAY = 24
max_binned_days = 180
overflow_edge_hours = 1000000

# 0, 24, ..., 4320, then the sentinel: 182 edges delimiting 181 buckets.
bin_edges = [day * HOURS_PER_DAY for day in range(max_binned_days + 1)] + [overflow_edge_hours]

# "0-24 hours", "1-2 days", ..., "179-180 days", "180+ days".
bin_labels = (
    ["0-24 hours"]
    + [f"{day}-{day + 1} days" for day in range(1, max_binned_days)]
    + [f"{max_binned_days}+ days"]
)

# Every label needs a lower and an upper edge.
assert len(bin_edges) == len(bin_labels) + 1

# Number of buckets; used as the size of the model's final layer.
bin_number = len(bin_labels)

In [42]:
# Render the bucket table as PHP source: one '"label" => [lower, upper],'
# row per bucket, wrapped in a $prediction_timespans array literal.
php_rows = [
    f'    "{bucket_label}" => [{bin_edges[idx]}, {bin_edges[idx + 1]}],\n'
    for idx, bucket_label in enumerate(bin_labels)
]
php_array = "$prediction_timespans = [\n" + "".join(php_rows) + "];"

print(php_array)


$prediction_timespans = [
    "0-24 hours" => [0, 24],
    "1-2 days" => [24, 48],
    "2-3 days" => [48, 72],
    "3-4 days" => [72, 96],
    "4-5 days" => [96, 120],
    "5-6 days" => [120, 144],
    "6-7 days" => [144, 168],
    "7-8 days" => [168, 192],
    "8-9 days" => [192, 216],
    "9-10 days" => [216, 240],
    "10-11 days" => [240, 264],
    "11-12 days" => [264, 288],
    "12-13 days" => [288, 312],
    "13-14 days" => [312, 336],
    "14-15 days" => [336, 360],
    "15-16 days" => [360, 384],
    "16-17 days" => [384, 408],
    "17-18 days" => [408, 432],
    "18-19 days" => [432, 456],
    "19-20 days" => [456, 480],
    "20-21 days" => [480, 504],
    "21-22 days" => [504, 528],
    "22-23 days" => [528, 552],
    "23-24 days" => [552, 576],
    "24-25 days" => [576, 600],
    "25-26 days" => [600, 624],
    "26-27 days" => [624, 648],
    "27-28 days" => [648, 672],
    "28-29 days" => [672, 696],
    "29-30 days" => [696, 720],
    "30-31 days" => [720, 744],
    "31-3

In [43]:

# Split the merged frame using the bucket edges and labels defined above;
# presumably df holds the features and y the bucketed target (confirm against
# split_data). Note that this rebinds df, which until now referred to the
# embeddings frame.
df, y = kerasNLP_model.split_data(new_mydata, bin_edges=bin_edges, bin_labels=bin_labels)

In [44]:
# Convert every boolean column to float64. The column names are collected
# first, then each column is replaced with its float version.
bool_column_names = [name for name in df.columns if df[name].dtype == 'bool']
for name in bool_column_names:
    df[name] = df[name].astype('float64')

In [45]:
#list the number of rows in X and y
print(df.shape)
print(y.shape)


(156620, 1011)
(156620,)


In [46]:
#best_model, best_hyperparameters = kerasNLP_model.tune_model(df, y, '/home/briarmoss/Documents/Boston_311/models/tuning')

In [47]:
#define hyperparameters
from kerastuner import HyperParameters

# Hyperparameters pinned to fixed values (nothing is searched here).
start_nodes = 1024
end_nodes = 256
l2_0 = 0.001            # alternative left commented out in the original: 0.00001
learning_rate = 0.0001  # alternative left commented out in the original: 7.5842e-05

# 'final_layer' is the number of buckets and 'final_activation' is softmax.
fixed_hyperparameters = {
    'start_nodes': start_nodes,
    'end_nodes': end_nodes,
    'l2_0': l2_0,
    'learning_rate': learning_rate,
    'final_layer': bin_number,
    'final_activation': 'softmax',
}

hp = HyperParameters()
for hp_name, hp_value in fixed_hyperparameters.items():
    hp.Fixed(hp_name, hp_value)

# NOTE(review): the save cell further down records "TypeError: Object of type
# HyperParameters is not JSON serializable"; presumably this attribute is what
# the save step tries to serialise -- confirm against the package's save().
kerasNLP_model.best_hyperparameters = hp


# Alternative settings with a single output unit and a linear activation.
# They are stored in a string, so none of this code runs; the only effect of
# the cell is to bind the name `linear` to the text.
linear='''
hp = HyperParameters()
hp.Fixed('start_nodes', start_nodes)
hp.Fixed('end_nodes', end_nodes)
hp.Fixed('l2_0', l2_0)
hp.Fixed('learning_rate', learning_rate)
hp.Fixed('final_layer', 1)
hp.Fixed('final_activation', 'linear')
kerasNLP_model.best_hyperparameters = hp
'''

In [48]:
#free all unused dataframes
try :
    df_to_delete = [X, new_mydata, is_numeric, mydata, merged_data]
    df_to_delete.extend([df_desc_cls, df_desc_pooled, df_name_cls, df_name_pooled, df_code_cls, df_code_pooled])
except NameError:
    pass
try :
    for data_frame in df_to_delete:
        try:
            del data_frame
        #if the dataframe doesn't exist, pass
        except NameError:
            pass
except NameError:
    pass
        

In [49]:
import gc
gc.collect()

25

In [50]:

# Train the Keras model on the df and y returned by split_data; the value
# returned by train_model is kept in test_acc.
test_acc = kerasNLP_model.train_model( df, y )

Starting Training at 2023-10-16 12:48:58.170583


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 1024)              1036288   
                                                                 
 dense_1 (Dense)             (None, 512)               524800    
                                                                 
 dense_2 (Dense)             (None, 256)               131328    
                                                                 
 dense_3 (Dense)             (None, 181)               46517     
                                                                 
Total params: 1738933 (6.63 MB)
Trainable params: 1738933 (6.63 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
<class 'pandas.core.frame.DataFrame'> (125296, 181)
<class 'pandas.core.frame.DataFrame'> (31324, 181)
run fit

Epoch 1/100


2023-10-16 12:49:01.098418: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1013394048 exceeds 10% of free system memory.




2023-10-16 12:49:40.790765: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 253348512 exceeds 10% of free system memory.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
126/979 [==>...........................] - ETA: 1s - loss: 1.5479 - accuracy: 0.6634 - top_k_categorical_accuracy: 0.7431

2023-10-16 13:21:21.092972: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 253348512 exceeds 10% of free system memory.


Testing accuracy: 0.6633571982383728 
Top-2 accuracy: 0.7431681752204895 
Test loss: 1.5464606285095215
Ending Training at 2023-10-16 13:21:22.348510
Training took 0:32:24.177927


## Train several models

In [51]:
print("learning is fun!") 

learning is fun!


In [52]:
#logistic_tree_model.run_pipeline()

In [53]:
#logistic_model.run_pipeline()

In [54]:
import gc
gc.collect()

2294

In [55]:
#linear_tree_model.run_pipeline()

In [56]:
import datetime

def save_model_to_dir(model, folder_name, base_dir=None):
    """Save ``model`` into ``<base_dir>/<folder_name>`` under a timestamped name.

    Parameters
    ----------
    model
        Object exposing a ``model_type`` string and a
        ``save(dir_path, model_name, properties_name)`` method.
    folder_name : str
        Sub-folder (below ``base_dir``) to save into.
    base_dir : str, optional
        Root folder for saved models. Defaults to ``MODEL_FOLDER``.
    """
    if base_dir is None:
        base_dir = MODEL_FOLDER
    dir_path = os.path.join(base_dir, folder_name)

    # makedirs also creates base_dir itself when it is missing (os.mkdir
    # raised FileNotFoundError in that case), and exist_ok makes the call safe
    # when the folder is already there.
    os.makedirs(dir_path, exist_ok=True)

    # The same "<YYYYmmdd_HHMMSS>_<model_type>" stem is used for both the
    # model and its properties.
    timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    model_name = timestamp + "_" + model.model_type
    properties_name = model_name

    model.save(dir_path, model_name, properties_name)

# Models to persist; only the Keras NLP model is listed.
models = [kerasNLP_model]


# Save each model into a sub-folder named after its model_type.
# NOTE(review): the recorded output of this cell ends with "TypeError: Object
# of type HyperParameters is not JSON serializable", so the save did not
# complete in the recorded run -- presumably raised inside model.save();
# confirm and fix in the package or in how best_hyperparameters is stored.
for model in models:
    save_model_to_dir(model, model.model_type)


  saving_api.save_model(


TypeError: Object of type HyperParameters is not JSON serializable

: 

In [None]:
"""
data = kerasNLP_model.load_data( 'predict' )
data = kerasNLP_model.enhance_data( data, 'predict')
clean_data = kerasNLP_model.clean_data_for_prediction( data )

X_predict, y_predict = kerasNLP_model.split_data( clean_data )
y_predict = kerasNLP_model.model.predict(X_predict)
data['survival_prediction'] = y_predict
return data
"""


"\ndata = kerasNLP_model.load_data( 'predict' )\ndata = kerasNLP_model.enhance_data( data, 'predict')\nclean_data = kerasNLP_model.clean_data_for_prediction( data )\n\nX_predict, y_predict = kerasNLP_model.split_data( clean_data )\ny_predict = kerasNLP_model.model.predict(X_predict)\ndata['survival_prediction'] = y_predict\nreturn data\n"

In [None]:

import datetime

# NOTE(review): this cell repeats the save_model_to_dir definition from the
# earlier save cell; consider keeping only one of them.
def save_model_to_dir(model, folder_name, base_dir=None):
    """Save ``model`` into ``<base_dir>/<folder_name>`` under a timestamped name.

    Parameters
    ----------
    model
        Object exposing a ``model_type`` string and a
        ``save(dir_path, model_name, properties_name)`` method.
    folder_name : str
        Sub-folder (below ``base_dir``) to save into.
    base_dir : str, optional
        Root folder for saved models. Defaults to ``MODEL_FOLDER``.
    """
    if base_dir is None:
        base_dir = MODEL_FOLDER
    dir_path = os.path.join(base_dir, folder_name)

    # makedirs also creates base_dir itself when it is missing (os.mkdir
    # raised FileNotFoundError in that case), and exist_ok makes the call safe
    # when the folder is already there.
    os.makedirs(dir_path, exist_ok=True)

    # The same "<YYYYmmdd_HHMMSS>_<model_type>" stem is used for both the
    # model and its properties.
    timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    model_name = timestamp + "_" + model.model_type
    properties_name = model_name

    model.save(dir_path, model_name, properties_name)

# Models to persist; only the Keras NLP model is listed.
models = [kerasNLP_model]


# Save each model into a sub-folder named after its model_type.
# NOTE(review): this repeats the earlier save cell, whose recorded output ends
# with a "not JSON serializable" TypeError; this copy has no recorded output.
for model in models:
    save_model_to_dir(model, model.model_type)



: 