# Boston 311 Tutorial

This notebook walks you through the basic usage of this package: training three models on the Boston 311 data and using them to predict the outcome of cases from the last 30 days.

In [39]:
# %pip (unlike !pip) installs into the environment of the running kernel.
%pip install keras-tuner

Defaulting to user installation because normal site-packages is not writeable


In [40]:
# Install the package found in the parent directory into the running kernel's
# environment (%pip targets the kernel; !pip may hit a different interpreter).
%pip install ../

Defaulting to user installation because normal site-packages is not writeable
Processing /home/briarmoss/Documents/Boston_311
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: boston311
  Building wheel for boston311 (pyproject.toml) ... [?25ldone
[?25h  Created wheel for boston311: filename=boston311-0.1.0-py3-none-any.whl size=18836 sha256=f912ed33224f63b5d669fda420ab5b96d5fb49249db9dfafc8c379e0d93f4f38
  Stored in directory: /tmp/pip-ephem-wheel-cache-nv6ymqkc/wheels/3d/69/ee/0a6ac96b9c09c948fc0e74f2724a9703aa39749a41fa757c9e
Successfully built boston311
Installing collected packages: boston311
  Attempting uninstall: boston311
    Found existing installation: boston311 0.1.0
    Uninstalling boston311-0.1.0:
      Successfully uninstalled boston311-0.1.0
Successfully installed boston311-0.1.0


## Import the Boston311 model classes

In [41]:
import os
import pandas as pd

In [42]:
from boston311 import Boston311LogReg, Boston311EventDecTree, Boston311SurvDecTree, Boston311KerasNLP


## Get latest file URLs and current date ranges

In [43]:
latest_URLS = Boston311LogReg.get311URLs()

In [44]:
print(latest_URLS)

{'2023': 'https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/e6013a93-1321-4f2a-bf91-8d8a02f1e62f/download/tmp518q5snq.csv', '2022': 'https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/81a7b022-f8fc-4da5-80e4-b160058ca207/download/tmpfm8veglw.csv', '2021': 'https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/f53ebccd-bc61-49f9-83db-625f209c95f5/download/tmp88p9g82n.csv', '2020': 'https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/6ff6a6fd-3141-4440-a880-6f60a37fe789/download/tmpcv_10m2s.csv', '2019': 'https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/ea2e4696-4a2d-429c-9807-d02eb92e0222/download/tmpcje3ep_w.csv', '2018': 'https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/2be28d90-3a90-4af1-a3f6-f28c1e25880a/download/tmp7602cia8.csv', '2017': 'https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/300221

In [45]:
from datetime import datetime, timedelta

# Read the clock exactly once and derive every date string from that single
# instant, so "today", "30 days ago" and "tomorrow" always agree with each
# other even if the cell happens to run across midnight.
now = datetime.now()
thirty_days = timedelta(days=30)
thirty_days_ago = now - thirty_days
today_datestring = now.strftime("%Y-%m-%d")
thirty_days_ago_datestring = thirty_days_ago.strftime("%Y-%m-%d")
tomorrow_datestring = (now + timedelta(days=1)).strftime('%Y-%m-%d')

print(today_datestring, thirty_days_ago_datestring, tomorrow_datestring)

2023-10-03 2023-09-03 2023-10-04


In [46]:
#set model folder constant
MODEL_FOLDER = './daily_models'

## Load extra features

In [47]:
#set path to the CSV file holding the extra embedding features (2023)
EXTRA_mydata_FILE = './cls_and_pooled_embeddings_with_service_id_2023.csv'


## Define several models

In [48]:
# Boston311SurvDecTree instance: trains on cases from 2022-01-01 up to the date
# 30 days ago and predicts on the window from 30 days ago to today.
# The scenario dict is handed to the package unchanged.
linear_tree_model = Boston311SurvDecTree(
    files_dict=latest_URLS,
    feature_columns=['type', 'queue'],
    train_date_range={'start': '2022-01-01', 'end': thirty_days_ago_datestring},
    predict_date_range={'start': thirty_days_ago_datestring, 'end': today_datestring},
    scenario={
        'dropColumnValues': {'source': ['City Worker App', 'Employee Generated']},
        'survivalTimeMin': 0,
        'survivalTimeFill': tomorrow_datestring,
    },
)

In [49]:
# Boston311LogReg instance with the same date windows and feature columns as
# the tree models; its scenario has no 'survivalTimeFill' entry.
logistic_model = Boston311LogReg(
    files_dict=latest_URLS,
    feature_columns=['type', 'queue'],
    train_date_range={'start': '2022-01-01', 'end': thirty_days_ago_datestring},
    predict_date_range={'start': thirty_days_ago_datestring, 'end': today_datestring},
    scenario={
        'dropColumnValues': {'source': ['City Worker App', 'Employee Generated']},
        'survivalTimeMin': 0,
    },
)

In [50]:
# Second Boston311LogReg instance.
# NOTE(review): every argument here is identical to logistic_model's - confirm
# whether a separate "old" configuration was intended.
oldlogistic_model = Boston311LogReg(
    files_dict=latest_URLS,
    feature_columns=['type', 'queue'],
    train_date_range={'start': '2022-01-01', 'end': thirty_days_ago_datestring},
    predict_date_range={'start': thirty_days_ago_datestring, 'end': today_datestring},
    scenario={
        'dropColumnValues': {'source': ['City Worker App', 'Employee Generated']},
        'survivalTimeMin': 0,
    },
)

In [51]:
# Boston311EventDecTree instance configured with the same date windows,
# feature columns and scenario as logistic_model.
logistic_tree_model = Boston311EventDecTree(
    files_dict=latest_URLS,
    feature_columns=['type', 'queue'],
    train_date_range={'start': '2022-01-01', 'end': thirty_days_ago_datestring},
    predict_date_range={'start': thirty_days_ago_datestring, 'end': today_datestring},
    scenario={
        'dropColumnValues': {'source': ['City Worker App', 'Employee Generated']},
        'survivalTimeMin': 0,
    },
)

In [52]:
# Boston311KerasNLP instance. Unlike the other models it starts training at
# 2022-03-01 and is given seven feature columns instead of two.
kerasNLP_model = Boston311KerasNLP(
    files_dict=latest_URLS,
    feature_columns=[
        'type', 'queue', 'source', 'subject', 'reason', 'department', 'ward_number',
    ],
    train_date_range={'start': '2022-03-01', 'end': thirty_days_ago_datestring},
    predict_date_range={'start': thirty_days_ago_datestring, 'end': today_datestring},
    scenario={
        'dropColumnValues': {'source': ['City Worker App', 'Employee Generated']},
        'survivalTimeMin': 0,
        'survivalTimeFill': tomorrow_datestring,
    },
)

In [53]:
#get current date and time in the Boston timezone as strings
from datetime import datetime
from pytz import timezone
import pytz

boston = timezone('US/Eastern')
#read the clock once so the date part and the time part describe the same instant
#(two separate reads could straddle midnight and produce a mismatched pair)
now = datetime.now(boston)
#note: this rebinds today_datestring, which an earlier cell set from the machine's local clock
today_datestring = now.strftime("%Y-%m-%d")
#time of day with '-' separators so it can be used in a filename
time_string = now.strftime("%H-%M-%S")
#define datetime string
my_datetime = today_datestring + '_' + time_string

In [54]:
import pandas as pd
import numpy as np
import pickle

# Local cache of the raw case data. It is reused whenever the file exists, so
# delete it to force a fresh download through kerasNLP_model.load_data().
case_data_file = 'case_data.pkl'

mydata = None
X = None

if os.path.exists(case_data_file):
    # NOTE(review): pickle.load can execute arbitrary code - only load cache
    # files that this notebook wrote itself.
    with open(case_data_file, "rb") as cache_file:
        mydata = pickle.load(cache_file)
else:
    mydata = kerasNLP_model.load_data()

    # The context manager closes (and flushes) the file even if dumping fails.
    with open(case_data_file, "wb") as cache_file:
        pickle.dump(mydata, cache_file)




  df = pd.read_csv(file)
  df = pd.read_csv(file)


Files with different number of columns from File 0:  []
Files with same number of columns as File 0:  [0, 1]
Files with different column order from File 0:  []
Files with same column order as File 0:  [0, 1]


In [55]:
mydata['case_enquiry_id']

43677     101004204966
43678     101004204967
43679     101004204970
43680     101004204968
43681     101004204972
              ...     
489419    101005031811
489420    101005031812
489421    101005031813
489422    101005031814
489423    101005031815
Name: case_enquiry_id, Length: 445747, dtype: int64

In [56]:
mydata = kerasNLP_model.enhance_data(mydata)


In [57]:
mydata = kerasNLP_model.apply_scenario(mydata)


In [58]:

mydata = kerasNLP_model.clean_data(mydata)

In [59]:
print(mydata['case_enquiry_id'])

43677     101004204966
43678     101004204967
43679     101004204970
43680     101004204968
43681     101004204972
              ...     
489405    101005031796
489407    101005031798
489417    101005031808
489418    101005031810
489420    101005031812
Name: case_enquiry_id, Length: 392488, dtype: int64


In [60]:
import pandas as pd
import numpy as np
from ast import literal_eval
import pickle

# Local cache of the parsed embeddings; reused whenever the file exists.
pickle_file = 'dataframe.pkl'

X = None

if os.path.exists(pickle_file):
    # NOTE(review): pickle.load can execute arbitrary code - only load cache
    # files that this notebook wrote itself.
    with open(pickle_file, "rb") as cache_file:
        X = pickle.load(cache_file)
else:
    X = pd.read_csv(EXTRA_mydata_FILE)

    #rename service_request_id to case_enquiry_id
    X = X.rename(columns={'service_request_id': 'case_enquiry_id'})
    #case_enquiry_id is left exactly as read here; non-numeric ids are filtered
    #and the column is cast to int64 later, after the yearly files are concatenated

    # Convert stringified arrays back to NumPy arrays
    for embedding_column in ('cls_embedding', 'pooled_embedding'):
        X[embedding_column] = X[embedding_column].apply(literal_eval).apply(np.array)

    # The context manager closes (and flushes) the file even if dumping fails.
    with open(pickle_file, "wb") as cache_file:
        pickle.dump(X, cache_file)



  X = pd.read_csv(EXTRA_mydata_FILE)


In [61]:
#load the second (2022) embeddings file from its pickle cache, or from CSV if no cache exists
pickle_file2022 = 'dataframe2.pkl'
EXTRA_mydata_FILE_2022 = './cls_and_pooled_embeddings_with_service_id_2022.csv'

X2022 = None

if os.path.exists(pickle_file2022):
    # NOTE(review): pickle.load can execute arbitrary code - only load cache
    # files that this notebook wrote itself.
    with open(pickle_file2022, "rb") as cache_file:
        X2022 = pickle.load(cache_file)
else:
    X2022 = pd.read_csv(EXTRA_mydata_FILE_2022)

    #rename service_request_id to case_enquiry_id
    X2022 = X2022.rename(columns={'service_request_id': 'case_enquiry_id'})
    #case_enquiry_id is left exactly as read here; non-numeric ids are filtered
    #and the column is cast to int64 later, after the yearly files are concatenated

    # Convert stringified arrays back to NumPy arrays
    for embedding_column in ('cls_embedding', 'pooled_embedding'):
        X2022[embedding_column] = X2022[embedding_column].apply(literal_eval).apply(np.array)

    # The context manager closes (and flushes) the file even if dumping fails.
    with open(pickle_file2022, "wb") as cache_file:
        pickle.dump(X2022, cache_file)

In [62]:
#print summary information about X and X2022
#DataFrame.info() writes its report itself and returns None, so wrapping it in
#print() only appends a stray "None" line after each report
X.info()
X2022.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 275503 entries, 0 to 275502
Data columns (total 3 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   case_enquiry_id   275503 non-null  object
 1   cls_embedding     275503 non-null  object
 2   pooled_embedding  275503 non-null  object
dtypes: object(3)
memory usage: 6.3+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 305359 entries, 0 to 305358
Data columns (total 3 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   case_enquiry_id   305359 non-null  int64 
 1   cls_embedding     305359 non-null  object
 2   pooled_embedding  305359 non-null  object
dtypes: int64(1), object(2)
memory usage: 7.0+ MB
None


In [63]:
#concatenate the two dataframes and reindex
df = pd.concat([X, X2022], ignore_index=True)

In [64]:
df.shape

(580862, 3)

In [65]:

# df must still carry the array-valued 'cls_embedding' and 'pooled_embedding'
# columns. Each one is stacked into a 2-D array (one row per record), which
# requires every vector within a column to have the same length.
cls_embedding_flattened = np.stack(df['cls_embedding'].to_numpy())
pooled_embedding_flattened = np.stack(df['pooled_embedding'].to_numpy())

# One scalar column per vector component, named cls_0.. and pooled_0..
df_cls = pd.DataFrame(
    cls_embedding_flattened,
    columns=[f'cls_{i}' for i in range(cls_embedding_flattened.shape[1])],
)
df_pooled = pd.DataFrame(
    pooled_embedding_flattened,
    columns=[f'pooled_{i}' for i in range(pooled_embedding_flattened.shape[1])],
)

# Replace the two array-valued columns with the expanded scalar columns;
# the three frames are aligned on the row index.
df = pd.concat(
    [df.drop(['cls_embedding', 'pooled_embedding'], axis=1), df_cls, df_pooled],
    axis=1,
)

In [66]:
# The ids arrive as a mix of strings and integers, so normalise them to str
# before applying a single digit test to the whole column.
df['case_enquiry_id'] = df['case_enquiry_id'].astype(str)
# isdecimal() accepts only decimal digit characters, i.e. exactly what the
# later int64 cast can parse. isnumeric() would also let through characters
# such as superscripts or vulgar fractions, which would make that cast fail.
is_numeric = df['case_enquiry_id'].str.isdecimal()

In [67]:
df = df[is_numeric]

In [68]:
df['case_enquiry_id'] = df['case_enquiry_id'].astype('int64')

In [69]:
df.shape

(580861, 257)

In [70]:
df = df.drop_duplicates(subset=['case_enquiry_id']) 

In [71]:
df.shape

(235842, 257)

In [72]:
mydata.shape

(392488, 438)

In [73]:
mydata = mydata.drop_duplicates(subset=['case_enquiry_id'])

In [74]:
mydata.shape

(392488, 438)

In [75]:
#inner join on case_enquiry_id so that only records present in both the case data and the embeddings are kept
new_mydata = mydata.merge(df, on='case_enquiry_id', how='inner')


In [76]:
new_mydata.shape

(143205, 694)

In [77]:

df, y = kerasNLP_model.split_data(new_mydata)

In [78]:
#convert every bool-typed column of df to float64, leaving all other columns untouched
bool_columns = [name for name in df.columns if df[name].dtype == 'bool']
for name in bool_columns:
    df[name] = df[name].astype('float64')

In [79]:
#show the shapes of the feature frame (df) and the target (y)
print(df.shape)
print(y.shape)


(143205, 691)
(143205,)


In [80]:
#best_model, best_hyperparameters = kerasNLP_model.tune_model(df, y, '/home/briarmoss/Documents/Boston_311/models/tuning')

In [81]:
#define hyperparameters
# `keras_tuner` is the current import name of the keras-tuner package; the
# legacy `kerastuner` name is only tried as a fallback for old installations.
try:
    from keras_tuner import HyperParameters
except ImportError:
    from kerastuner import HyperParameters

#set constants
start_nodes = 128
end_nodes = 64
#alternative values kept for reference: l2_0 = 0.00001, learning_rate = 7.5842e-05
l2_0 = 0.001
learning_rate = 0.0001


# Register every value as Fixed so the model is built with exactly these
# settings, then hand the container to the model object.
hp = HyperParameters()
hp.Fixed('start_nodes', start_nodes)
hp.Fixed('end_nodes', end_nodes)
hp.Fixed('l2_0', l2_0)
hp.Fixed('learning_rate', learning_rate)

kerasNLP_model.best_hyperparameters = hp

In [82]:
#free all unused dataframes
import gc

# `del loop_variable` inside a for loop only unbinds the loop variable: the
# notebook-level names - and a list holding the objects - would keep every
# frame alive. So the *names* are listed here and removed from the namespace.
df_to_delete = ['cls_embedding_flattened', 'pooled_embedding_flattened', 'df_cls', 'df_pooled',
                'X', 'X2022', 'new_mydata', 'is_numeric', 'mydata']

for _name in df_to_delete:
    # pop() with a default keeps the cell re-runnable once the names are gone
    globals().pop(_name, None)

# Ask the collector to reclaim anything that is now unreachable
gc.collect();

In [83]:

#train the Keras NLP model on the prepared features (df) and target (y); the value returned by train_model is kept in test_acc
test_acc = kerasNLP_model.train_model( df, y )

Starting Training at 2023-10-03 03:05:44.713034


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               88576     
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 9)                 585       
                                                                 
Total params: 97417 (380.54 KB)
Trainable params: 97417 (380.54 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
<class 'pandas.core.frame.DataFrame'> (114564, 9)
<class 'pandas.core.frame.DataFrame'> (28641, 9)
run fit

Epoch 1/100


2023-10-03 03:05:45.903434: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 633309792 exceeds 10% of free system memory.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Testing accuracy: 0.5793792009353638 
Top-2 accuracy: 0.7641842365264893 
Test loss: 1.1890246868133545
Ending Training at 2023-10-03 03:10:59.019105
Training took 0:05:14.306071


## Train several models

In [84]:
print("learning is fun!") 

learning is fun!


In [85]:
#logistic_tree_model.run_pipeline()

In [86]:
#logistic_model.run_pipeline()

In [87]:
import gc
gc.collect()

2730

In [88]:
#linear_tree_model.run_pipeline()

In [89]:
import datetime

def save_model_to_dir(model, folder_name):
    """Save `model` under MODEL_FOLDER/<folder_name> with a timestamped name.

    Args:
        model: object exposing a `model_type` string attribute and a
            `save(dir_path, model_name, properties_name)` method.
        folder_name: name of the sub-directory of MODEL_FOLDER to save into.
    """
    dir_path = os.path.join(MODEL_FOLDER, folder_name)

    # makedirs also creates MODEL_FOLDER itself when it is missing (os.mkdir
    # would raise), and exist_ok avoids the check-then-create race.
    os.makedirs(dir_path, exist_ok=True)

    timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    model_name = timestamp + "_" + model.model_type
    # the properties file shares the model's base name
    properties_name = model_name

    model.save(dir_path, model_name, properties_name)

# List of models
models = [kerasNLP_model]


# Iterate over models and save, one sub-directory per model type
for model in models:
    save_model_to_dir(model, model.model_type)


  saving_api.save_model(


In [90]:
"""
data = kerasNLP_model.load_data( 'predict' )
data = kerasNLP_model.enhance_data( data, 'predict')
clean_data = kerasNLP_model.clean_data_for_prediction( data )

X_predict, y_predict = kerasNLP_model.split_data( clean_data )
y_predict = kerasNLP_model.model.predict(X_predict)
data['survival_prediction'] = y_predict
return data
"""


"\ndata = kerasNLP_model.load_data( 'predict' )\ndata = kerasNLP_model.enhance_data( data, 'predict')\nclean_data = kerasNLP_model.clean_data_for_prediction( data )\n\nX_predict, y_predict = kerasNLP_model.split_data( clean_data )\ny_predict = kerasNLP_model.model.predict(X_predict)\ndata['survival_prediction'] = y_predict\nreturn data\n"

In [92]:

# Save the models once more. save_model_to_dir is already defined in an
# earlier cell, so it is reused here rather than defined a second time;
# each call builds a new timestamped file name.

# List of models
models = [kerasNLP_model]


# Iterate over models and save
for model in models:
    save_model_to_dir(model, model.model_type)



  saving_api.save_model(
