# Boston 311 Tutorial

This notebook walks you through the basic usage of this package: training 3 models on the Boston 311 data and using them to predict the outcome of cases from the last 30 days.

In [1]:
# !pip install keras-tuner

In [2]:
! pip install  ../

Defaulting to user installation because normal site-packages is not writeable
Processing /home/briarmoss/Documents/Boston_311
  Installing build dependencies ... [?25l

done
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hBuilding wheels for collected packages: boston311
  Building wheel for boston311 (pyproject.toml) ... [?25ldone
[?25h  Created wheel for boston311: filename=boston311-0.1.0-py3-none-any.whl size=19337 sha256=262d411199ca3ed497ec5881e21c6555e84e0530d280a4300641b14ca453bbe9
  Stored in directory: /tmp/pip-ephem-wheel-cache-mrr6oydl/wheels/3d/69/ee/0a6ac96b9c09c948fc0e74f2724a9703aa39749a41fa757c9e
Successfully built boston311
Installing collected packages: boston311
  Attempting uninstall: boston311
    Found existing installation: boston311 0.1.0
    Uninstalling boston311-0.1.0:
      Successfully uninstalled boston311-0.1.0
Successfully installed boston311-0.1.0


## Import the Boston311 model classes

In [3]:
import os
import pandas as pd

In [4]:

#os.environ['HSA_OVERRIDE_GFX_VERSION'] = '10.3.0'

In [5]:
from boston311 import Boston311LogReg, Boston311EventDecTree, Boston311SurvDecTree, Boston311KerasNLP


2023-10-27 21:22:16.587756: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-27 21:22:16.809722: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-27 21:22:16.810713: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Using TensorFlow backend


## Get latest file URLS and Current Date Ranges

In [6]:
latest_URLS = Boston311LogReg.get311URLs()

In [7]:
print(latest_URLS)

{'2023': 'https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/e6013a93-1321-4f2a-bf91-8d8a02f1e62f/download/tmp16mi5hsp.csv', '2022': 'https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/81a7b022-f8fc-4da5-80e4-b160058ca207/download/tmpfm8veglw.csv', '2021': 'https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/f53ebccd-bc61-49f9-83db-625f209c95f5/download/tmp88p9g82n.csv', '2020': 'https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/6ff6a6fd-3141-4440-a880-6f60a37fe789/download/tmpcv_10m2s.csv', '2019': 'https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/ea2e4696-4a2d-429c-9807-d02eb92e0222/download/tmpcje3ep_w.csv', '2018': 'https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/2be28d90-3a90-4af1-a3f6-f28c1e25880a/download/tmp7602cia8.csv', '2017': 'https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/300221

In [8]:
from datetime import datetime, timedelta
# Take one snapshot of the current local time and derive every date string from
# it, so today / 30-days-ago / tomorrow are always mutually consistent (reading
# the clock a second time could land on the other side of midnight).
now = datetime.now()
thirty_days = timedelta(days=30)
thirty_days_ago = now - thirty_days
today_datestring = now.strftime("%Y-%m-%d")
thirty_days_ago_datestring = thirty_days_ago.strftime("%Y-%m-%d")
tomorrow_datestring = (now + timedelta(days=1)).strftime('%Y-%m-%d')

print(today_datestring, thirty_days_ago_datestring, tomorrow_datestring)

2023-10-27 2023-09-27 2023-10-28


In [9]:
#set model folder constant
MODEL_FOLDER = './daily_models'

## Load extra features

In [10]:
# Path to the CSV holding the extra features (the embedding columns that are
# loaded and flattened further down in this notebook).
EXTRA_mydata_FILE = './cls_and_pooled_embeddings_with_three_cols.csv'


In [11]:
#define a function that takes a path to a csv file and a pkl file and checks if the csv file is newer than the pkl file, and if so, loads the csv file into a dataframe and saves it as a pkl file, else loads the pkl file into a dataframe
def pkl_load_data(csv_path, pkl_path):
    """Load a DataFrame from a CSV file, using a pickle file as a cache.

    The pickle is used when it exists and is at least as new as the CSV
    (compared by modification time), or when the CSV is no longer available.
    Otherwise the CSV is parsed and the pickle is (re)written from it.

    Args:
        csv_path: Path to the source CSV file.
        pkl_path: Path to the pickle cache file.

    Returns:
        The loaded pandas DataFrame.

    Raises:
        FileNotFoundError: If neither the CSV nor the pickle exists.
    """
    csv_exists = os.path.exists(csv_path)
    pkl_exists = os.path.exists(pkl_path)

    # Without this guard a missing CSV made getmtime() raise even though a
    # perfectly usable cache was sitting on disk.
    cache_is_fresh = pkl_exists and (
        not csv_exists
        or os.path.getmtime(csv_path) <= os.path.getmtime(pkl_path)
    )
    if cache_is_fresh:
        # NOTE: unpickling can execute arbitrary code; only point this at
        # cache files written by this notebook.
        return pd.read_pickle(pkl_path)

    df = pd.read_csv(csv_path)
    df.to_pickle(pkl_path)
    return df

## Define several models

In [12]:
# Boston311SurvDecTree configuration: train from 2022-01-01 up to 30 days ago,
# predict on the last 30 days, using the 'type' and 'queue' columns.
linear_tree_model = Boston311SurvDecTree(
    train_date_range={'start': '2022-01-01', 'end': thirty_days_ago_datestring},
    predict_date_range={'start': thirty_days_ago_datestring, 'end': today_datestring},
    feature_columns=['type', 'queue'],
    scenario={
        'dropColumnValues': {'source': ['City Worker App', 'Employee Generated']},
        'survivalTimeMin': 0,
        'survivalTimeFill': tomorrow_datestring,
    },
    files_dict=latest_URLS,
)

In [13]:
# Boston311LogReg configuration: train from 2022-01-01 up to 30 days ago,
# predict on the last 30 days, using the 'type' and 'queue' columns.
logistic_model = Boston311LogReg(
    train_date_range={'start': '2022-01-01', 'end': thirty_days_ago_datestring},
    predict_date_range={'start': thirty_days_ago_datestring, 'end': today_datestring},
    feature_columns=['type', 'queue'],
    scenario={
        'dropColumnValues': {'source': ['City Worker App', 'Employee Generated']},
        'survivalTimeMin': 0,
    },
    files_dict=latest_URLS,
)

In [14]:
# Second Boston311LogReg instance, built with exactly the same arguments as
# logistic_model.
oldlogistic_model = Boston311LogReg(
    train_date_range={'start': '2022-01-01', 'end': thirty_days_ago_datestring},
    predict_date_range={'start': thirty_days_ago_datestring, 'end': today_datestring},
    feature_columns=['type', 'queue'],
    scenario={
        'dropColumnValues': {'source': ['City Worker App', 'Employee Generated']},
        'survivalTimeMin': 0,
    },
    files_dict=latest_URLS,
)

In [15]:
# Boston311EventDecTree configuration: train from 2022-01-01 up to 30 days ago,
# predict on the last 30 days, using the 'type' and 'queue' columns.
logistic_tree_model = Boston311EventDecTree(
    train_date_range={'start': '2022-01-01', 'end': thirty_days_ago_datestring},
    predict_date_range={'start': thirty_days_ago_datestring, 'end': today_datestring},
    feature_columns=['type', 'queue'],
    scenario={
        'dropColumnValues': {'source': ['City Worker App', 'Employee Generated']},
        'survivalTimeMin': 0,
    },
    files_dict=latest_URLS,
)

In [16]:
# Boston311KerasNLP configuration: train from 2020-01-01 up to 30 days ago,
# predict on the last 30 days. This is the model whose load/enhance/clean/
# split/train methods are called in the rest of the notebook.
kerasNLP_model = Boston311KerasNLP(
    train_date_range={'start': '2020-01-01', 'end': thirty_days_ago_datestring},
    predict_date_range={'start': thirty_days_ago_datestring, 'end': today_datestring},
    feature_columns=['queue', 'subject', 'reason', 'department'],
    scenario={
        'dropColumnValues': {'source': ['City Worker App', 'Employee Generated']},
        'survivalTimeMin': 0,
        'survivalTimeFill': tomorrow_datestring,
    },
    files_dict=latest_URLS,
)

In [17]:
#get current datetime in Boston timezone as string
from datetime import datetime
from pytz import timezone
import pytz
# Take a single timezone-aware snapshot of "now" in Boston so that the date and
# time parts of my_datetime always describe the same instant (two separate
# clock reads could straddle midnight).
boston = timezone('US/Eastern')
now = datetime.now(boston)
today_datestring = now.strftime("%Y-%m-%d")
# time of day with dashes as separators, so it is safe to use in a filename
time_string = now.strftime("%H-%M-%S")
#define datetime string
my_datetime = today_datestring + '_' + time_string

In [18]:
mydata = None

import pandas as pd
import numpy as np
import pickle

# Pickle cache and source CSV for the raw 311 case records.
case_data_file = 'case_data.pkl'
case_data_csv = 'all_311_cases.csv'
mydata = None

X = None


# Read the cases (from the pickle cache when it is up to date, else from the
# CSV) and hand the resulting frame to the model's load_data step.
data = pkl_load_data(case_data_csv, case_data_file)
mydata = kerasNLP_model.load_data(data)




In [19]:
mydata['case_enquiry_id']

1666441    101003148265
1666442    101003148266
1666443    101003148268
1666444    101003148269
1666445    101003148271
               ...     
2706494    101005082957
2706495    101005082960
2706496    101005082961
2706497    101005082964
2706498    101005082966
Name: case_enquiry_id, Length: 1040058, dtype: int64

In [20]:
mydata = kerasNLP_model.enhance_data(mydata)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['closed_dt'] = pd.to_datetime(data['closed_dt'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['open_dt'] = pd.to_datetime(data['open_dt'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['survival_time'] = data['closed_dt'] - data['open_dt']
A value is trying to be set on a copy of 

In [21]:
cyclical_df = mydata[['case_enquiry_id', 'season_cos', 'weekday_cos']].copy()

In [22]:
mydata = kerasNLP_model.apply_scenario(mydata)


In [23]:

mydata = kerasNLP_model.clean_data(mydata)


In [24]:
print(mydata['case_enquiry_id'])

1666441    101003148265
1666442    101003148266
1666443    101003148268
1666444    101003148269
1666445    101003148271
               ...     
2706492    101005082955
2706493    101005082956
2706494    101005082957
2706495    101005082960
2706498    101005082966
Name: case_enquiry_id, Length: 914132, dtype: int64


In [25]:
import pandas as pd
import numpy as np
from ast import literal_eval
import pickle

pickle_file = 'dataframe.pkl'

# Load the extra-feature (embedding) table, going through the pickle cache.
X = pkl_load_data(EXTRA_mydata_FILE, pickle_file)

# A 'service_request_id' column means the frame is still in its raw CSV form:
# rename the key column and parse the six embedding columns from their string
# representation into numpy arrays, then overwrite the cache with the parsed
# frame so this work is not repeated on the next run.
if 'service_request_id' in X.columns:
    X = X.rename(columns={'service_request_id': 'case_enquiry_id'})
    for col in ['desc_cls_embedding', 'desc_pooled_embedding', 'name_cls_embedding', 'name_pooled_embedding', 'code_cls_embedding', 'code_pooled_embedding']:
        X[col] = X[col].apply(literal_eval).apply(np.array)

    # Context manager so the file handle is flushed and closed deterministically.
    with open(pickle_file, "wb") as cache_file:
        pickle.dump(X, cache_file)



In [26]:
# Print a summary of X (columns, non-null counts, dtypes, memory usage)
print(X.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 291559 entries, 0 to 291558
Data columns (total 7 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   case_enquiry_id        291559 non-null  int64 
 1   desc_cls_embedding     291559 non-null  object
 2   desc_pooled_embedding  291559 non-null  object
 3   name_cls_embedding     291559 non-null  object
 4   name_pooled_embedding  291559 non-null  object
 5   code_cls_embedding     291559 non-null  object
 6   code_pooled_embedding  291559 non-null  object
dtypes: int64(1), object(6)
memory usage: 15.6+ MB
None


In [27]:
# Bind the name df to the same DataFrame object as X (no copy is made)
df = X

In [28]:
df.shape

(291559, 7)

In [29]:

# Expand each embedding column (one array per row) into one numeric column per
# embedding dimension, named '<prefix>_<i>' (e.g. 'desc_cls_0', 'desc_cls_1', ...).
embedding_columns = [
    'desc_cls_embedding', 'desc_pooled_embedding',
    'name_cls_embedding', 'name_pooled_embedding',
    'code_cls_embedding', 'code_pooled_embedding',
]

flattened_frames = []
for col in embedding_columns:
    prefix = col[:-len('_embedding')]          # 'desc_cls_embedding' -> 'desc_cls'
    matrix = np.stack(df[col].to_numpy())      # 2-D array: one row per record
    flattened_frames.append(
        pd.DataFrame(
            matrix,
            columns=[f'{prefix}_{i}' for i in range(matrix.shape[1])],
            # Reuse df's index so the axis=1 concat below lines rows up by
            # position even if df does not have a default 0..n-1 index.
            index=df.index,
        )
    )

# Remove the original array-valued columns. Done in place, as before, so the
# object-dtype columns are dropped from the existing frame rather than kept
# alive in a second copy.
df.drop(columns=embedding_columns, inplace=True)

# Remaining columns first, then the flattened blocks in the same order as the
# original embedding columns.
df = pd.concat([df] + flattened_frames, axis=1)

In [30]:
df['case_enquiry_id'] = df['case_enquiry_id'].astype(str)
is_numeric = df['case_enquiry_id'].str.isnumeric()

In [31]:
df = df[is_numeric]

In [32]:
df['case_enquiry_id'] = df['case_enquiry_id'].astype('int64')

In [33]:
df.shape

(291559, 769)

In [34]:
df = df.drop_duplicates(subset=['case_enquiry_id']) 

In [35]:
df.shape

(291559, 769)

In [36]:
df.head()

Unnamed: 0,case_enquiry_id,desc_cls_0,desc_cls_1,desc_cls_2,desc_cls_3,desc_cls_4,desc_cls_5,desc_cls_6,desc_cls_7,desc_cls_8,...,code_pooled_118,code_pooled_119,code_pooled_120,code_pooled_121,code_pooled_122,code_pooled_123,code_pooled_124,code_pooled_125,code_pooled_126,code_pooled_127
0,101004113559,-1.100319,0.180848,-3.067521,-2.449431,0.048062,0.750774,-1.156688,1.701071,-1.121157,...,0.121683,-0.998929,0.036228,-0.999963,-0.742293,0.969737,-0.998525,0.986848,0.982758,0.931549
1,101004113295,-0.13696,0.691521,-3.540846,-1.352687,1.2992,-0.141181,0.158119,2.410162,0.071449,...,0.086003,-0.998614,0.014076,-0.999995,-0.893147,0.982163,-0.999566,0.996798,0.977021,0.915342
2,101004113630,0.175361,0.668518,-3.55681,-1.355421,1.444425,0.603148,-1.361185,1.510217,-0.07356,...,0.120324,-0.999417,0.059866,-0.999934,-0.765923,0.960436,-0.99833,0.985888,0.973252,0.943457
3,101004113228,-0.649289,0.929046,-2.988562,-1.7672,-0.438132,-0.361119,0.010478,1.114518,-0.448996,...,0.183652,-0.997964,0.057766,-0.999915,-0.724165,0.798789,-0.999548,0.995305,0.995679,0.948751
4,101004113229,-0.649289,0.929046,-2.988562,-1.7672,-0.438132,-0.361119,0.010478,1.114518,-0.448996,...,0.183652,-0.997964,0.057766,-0.999915,-0.724165,0.798789,-0.999548,0.995305,0.995679,0.948751


In [37]:
mydata.shape

(914132, 255)

In [38]:
mydata = mydata.drop_duplicates(subset=['case_enquiry_id'])

In [39]:
mydata.shape

(914132, 255)

In [40]:
# Inner join on case_enquiry_id: keep only the cases that appear in both
# mydata (cleaned case records) and df (flattened embeddings)
new_mydata = mydata.merge(df, on='case_enquiry_id', how='inner')



In [41]:
new_mydata = new_mydata.merge(cyclical_df, on='case_enquiry_id', how='inner')

In [42]:
new_mydata = new_mydata.sort_values(by='case_enquiry_id')
new_mydata.shape

(210858, 1025)

In [43]:
old_bin_edges = [0, 12, 24, 72, 168, 336, 672, 1344, 2688, 9999999]
old_bin_labels = [
                "0-12 hours",      # Less than half a day
                "12-24 hours",     # Half to one day
                "1-3 days",        # One to three days
                "4-7 days",        # Four to seven days
                "1-2 weeks",       # One to two weeks
                "2-4 weeks",       # Two to four weeks
                "1-2 months",      # One to two months
                "2-4 months",      # Two to four months
                "4+ months"        # More than four months
            ]

In [44]:
def generate_time_bins(hour_interval, max_days, overflow_label=None):
    """Build hour-based bin edges and day-based labels for duration buckets.

    Edges run from 0 to ``max_days * 24`` hours in steps of ``hour_interval``,
    followed by one extra edge of 1000000 hours that closes the overflow bin.

    Each label is computed from the bin's two edges with floor division by 24:
    the first day is ``lower // 24`` and the inclusive last day is
    ``upper // 24 - 1``. A bin whose last day is not greater than its first
    day is labelled with the first day only. Because of the floor division,
    intervals that are not a multiple of 24 hours can yield repeated labels.

    Args:
        hour_interval: Width of each regular bin, in hours.
        max_days: Number of days covered by the regular bins.
        overflow_label: Optional text that replaces the label of the last bin.

    Returns:
        Tuple ``(bin_edges, bin_labels)``; there is one label per pair of
        consecutive edges.
    """
    overflow_edge = 1000000  # upper edge of the final catch-all bin, in hours

    bin_edges = list(range(0, max_days * 24 + 1, hour_interval))
    bin_edges.append(overflow_edge)

    bin_labels = []
    for lower, upper in zip(bin_edges, bin_edges[1:]):
        first_day = lower // 24
        last_day = upper // 24 - 1  # inclusive end of the bin
        if last_day > first_day:
            label = f"{first_day}-{last_day} days"
        else:
            label = f"{first_day} days"
        bin_labels.append(label)

    if overflow_label is not None:
        bin_labels[-1] = overflow_label

    return bin_edges, bin_labels

# Example usage
hour_interval = 72
max_days = 180
bin_edges, bin_labels = generate_time_bins(hour_interval, max_days, "180+ days")
bin_number = len(bin_labels)


In [45]:
# Render the bins as PHP source text: one '"label" => [lower, upper],' line per
# bin. zip() stops at the shortest input, so a label without a matching pair of
# edges is simply left out.
php_entries = [
    f'    "{label}" => [{lower}, {upper}],\n'
    for label, lower, upper in zip(bin_labels, bin_edges, bin_edges[1:])
]
php_array = "$prediction_timespans = [\n" + "".join(php_entries) + "];"

print(php_array)


$prediction_timespans = [
    "0-2 days" => [0, 72],
    "3-5 days" => [72, 144],
    "6-8 days" => [144, 216],
    "9-11 days" => [216, 288],
    "12-14 days" => [288, 360],
    "15-17 days" => [360, 432],
    "18-20 days" => [432, 504],
    "21-23 days" => [504, 576],
    "24-26 days" => [576, 648],
    "27-29 days" => [648, 720],
    "30-32 days" => [720, 792],
    "33-35 days" => [792, 864],
    "36-38 days" => [864, 936],
    "39-41 days" => [936, 1008],
    "42-44 days" => [1008, 1080],
    "45-47 days" => [1080, 1152],
    "48-50 days" => [1152, 1224],
    "51-53 days" => [1224, 1296],
    "54-56 days" => [1296, 1368],
    "57-59 days" => [1368, 1440],
    "60-62 days" => [1440, 1512],
    "63-65 days" => [1512, 1584],
    "66-68 days" => [1584, 1656],
    "69-71 days" => [1656, 1728],
    "72-74 days" => [1728, 1800],
    "75-77 days" => [1800, 1872],
    "78-80 days" => [1872, 1944],
    "81-83 days" => [1944, 2016],
    "84-86 days" => [2016, 2088],
    "87-89 days" => [2088,

In [46]:

df, y = kerasNLP_model.split_data(new_mydata, bin_edges=bin_edges, bin_labels=bin_labels)

In [47]:
# Convert every boolean column to float64; all other columns are left as they are.
for column_name in list(df.columns):
    column_is_bool = df[column_name].dtype == 'bool'
    if not column_is_bool:
        continue
    df[column_name] = df[column_name].astype('float64')

In [48]:
#list the number of rows in X and y
print(df.shape)
print(y.shape)


(210858, 1022)
(210858,)


In [49]:
#best_model, best_hyperparameters = kerasNLP_model.tune_model(df, y, '/home/briarmoss/Documents/Boston_311/models/tuning')

In [50]:
#define hyperparameters
from kerastuner import HyperParameters

# Fixed hyperparameter values used instead of running the tuner.
start_nodes = 2048
end_nodes = 128
l2_0 = 0.001             # alternative kept for reference: 0.00001
learning_rate = 0.0001   # alternative kept for reference: 7.5842e-05

# Register each value as a fixed hyperparameter, in this order, and attach the
# result to the model. The output layer gets one unit per time bin (bin_number).
hp = HyperParameters()
for hp_name, hp_value in [
    ('start_nodes', start_nodes),
    ('end_nodes', end_nodes),
    ('l2_0', l2_0),
    ('learning_rate', learning_rate),
    ('final_layer', bin_number),
    ('final_activation', 'softmax'),
]:
    hp.Fixed(hp_name, hp_value)
kerasNLP_model.best_hyperparameters = hp


#parameters for linear regression
linear='''
hp = HyperParameters()
hp.Fixed('start_nodes', start_nodes)
hp.Fixed('end_nodes', end_nodes)
hp.Fixed('l2_0', l2_0)
hp.Fixed('learning_rate', learning_rate)
hp.Fixed('final_layer', 1)
hp.Fixed('final_activation', 'linear')
kerasNLP_model.best_hyperparameters = hp
'''

In [51]:
# Free the large intermediate objects that are no longer needed.
# The names are removed from the notebook namespace by string, which
#   * skips any name that was never defined instead of aborting the cleanup, and
#   * drops the global binding itself (`del` on a loop variable would only
#     unbind the loop variable and leave the original name alive).
# Memory is reclaimed only for objects that nothing else still references.
_names_to_free = [
    'X', 'new_mydata', 'is_numeric', 'mydata', 'merged_data',
    'df_desc_cls', 'df_desc_pooled', 'df_name_cls', 'df_name_pooled',
    'df_code_cls', 'df_code_pooled',
]
for _name in _names_to_free:
    globals().pop(_name, None)
        

In [52]:
import gc
gc.collect()

47

In [53]:
import logging

# Suppress specific TensorFlow log messages.
# The filter inspects record.getMessage() rather than record.msg: msg may be a
# non-string object (the `in` test would then raise TypeError), and
# getMessage() also applies any %-style arguments before the text is checked.
logging.getLogger('tensorflow').addFilter(
    lambda record: "ROCm Fusion is enabled" not in record.getMessage()
)


In [54]:

# Train the Keras model on the feature frame df and targets y for 30 epochs;
# the value returned by train_model is kept in test_acc
test_acc = kerasNLP_model.train_model( df, y , epochs=30)

Starting Training at 2023-10-27 21:22:46.943497
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 2048)              2095104   
                                                                 
 dense_1 (Dense)             (None, 1024)              2098176   
                                                                 
 dense_2 (Dense)             (None, 512)               524800    
                                                                 
 dense_3 (Dense)             (None, 256)               131328    
                                                                 
 dense_4 (Dense)             (None, 128)               32896     
                                                                 
 dense_5 (Dense)             (None, 61)                7869      
                                                                 
Total pa

2023-10-27 21:22:51.482382: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1379176736 exceeds 10% of free system memory.


Epoch 1/30

2023-10-27 21:23:41.054716: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 689596544 exceeds 10% of free system memory.


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
  42/1318 [..............................] - ETA: 4s - loss: 0.6859 - accuracy: 0.8452 - top_k_categorical_accuracy: 0.8966

2023-10-27 21:47:18.109479: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 344798272 exceeds 10% of free system memory.


Testing accuracy: 0.7809447050094604 
Top-2 accuracy: 0.8504458069801331 
Test loss: 0.9516450762748718
Ending Training at 2023-10-27 21:47:23.265237
Training took 0:24:36.321740


## Train several models

In [55]:
print("learning is fun!") 

learning is fun!


In [56]:
#logistic_tree_model.run_pipeline()

In [57]:
#logistic_model.run_pipeline()

In [58]:
import gc
gc.collect()

2294

In [59]:
kerasNLP_model.best_hyperparameters = None

In [60]:
import datetime

def save_model_to_dir(model, folder_name):
    """Save ``model`` into ``MODEL_FOLDER/<folder_name>`` under a timestamped name.

    The target directory, including a missing ``MODEL_FOLDER``, is created when
    needed. The file name is ``<YYYYmmdd_HHMMSS>_<model.model_type>`` and is
    passed to ``model.save`` as both the model name and the properties name.

    Args:
        model: Object exposing a ``model_type`` string and a
            ``save(dir_path, model_name, properties_name)`` method.
        folder_name: Sub-folder of ``MODEL_FOLDER`` to save into.
    """
    dir_path = os.path.join(MODEL_FOLDER, folder_name)

    # makedirs (not mkdir) so a missing parent folder is created as well;
    # exist_ok=True replaces the separate exists() check.
    os.makedirs(dir_path, exist_ok=True)

    timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    model_name = timestamp + "_" + model.model_type
    properties_name = model_name

    model.save(dir_path, model_name, properties_name)

# List of models
models = [kerasNLP_model]


# Iterate over models and save
for model in models:
    save_model_to_dir(model, model.model_type)


  saving_api.save_model(


In [61]:
"""
data = kerasNLP_model.load_data( 'predict' )
data = kerasNLP_model.enhance_data( data, 'predict')
clean_data = kerasNLP_model.clean_data_for_prediction( data )

X_predict, y_predict = kerasNLP_model.split_data( clean_data )
y_predict = kerasNLP_model.model.predict(X_predict)
data['survival_prediction'] = y_predict
return data
"""


"\ndata = kerasNLP_model.load_data( 'predict' )\ndata = kerasNLP_model.enhance_data( data, 'predict')\nclean_data = kerasNLP_model.clean_data_for_prediction( data )\n\nX_predict, y_predict = kerasNLP_model.split_data( clean_data )\ny_predict = kerasNLP_model.model.predict(X_predict)\ndata['survival_prediction'] = y_predict\nreturn data\n"