#Boston 311 Tutorial

This notebook will run you through the basic usage of this package to train 3 models on the Boston 311 mydata and use them to predict the outcome of cases from the last 30 days

In [1]:
!pip install keras-tuner

Defaulting to user installation because normal site-packages is not writeable


In [2]:
! pip install ../

Defaulting to user installation because normal site-packages is not writeable
Processing /home/briarmoss/Documents/Boston_311
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: boston311
  Building wheel for boston311 (pyproject.toml) ... [?25ldone
[?25h  Created wheel for boston311: filename=boston311-0.1.0-py3-none-any.whl size=18654 sha256=039375785611263ce8daebf8e83d216464b49cdc8815e1cfd0ad7dad2261b28b
  Stored in directory: /tmp/pip-ephem-wheel-cache-lmfszsle/wheels/3d/69/ee/0a6ac96b9c09c948fc0e74f2724a9703aa39749a41fa757c9e
Successfully built boston311
Installing collected packages: boston311
  Attempting uninstall: boston311
    Found existing installation: boston311 0.1.0
    Uninstalling boston311-0.1.0:
      Successfully uninstalled boston311-0.1.0
Successfully installed boston311-0.1.0


##Import the Boston311Model class

In [3]:
import os
import pandas as pd

In [4]:
from boston311 import Boston311LogReg, Boston311EventDecTree, Boston311SurvDecTree, Boston311KerasNLP


2023-09-24 00:07:30.689101: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-24 00:07:30.723593: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-24 00:07:30.724078: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Using TensorFlow backend


## Get latest file URLS and Current Date Ranges

In [5]:
latest_URLS = Boston311LogReg.get311URLs()

In [6]:
print(latest_URLS)

{'2023': 'https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/e6013a93-1321-4f2a-bf91-8d8a02f1e62f/download/tmpn3l7dphs.csv', '2022': 'https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/81a7b022-f8fc-4da5-80e4-b160058ca207/download/tmpfm8veglw.csv', '2021': 'https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/f53ebccd-bc61-49f9-83db-625f209c95f5/download/tmp88p9g82n.csv', '2020': 'https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/6ff6a6fd-3141-4440-a880-6f60a37fe789/download/tmpcv_10m2s.csv', '2019': 'https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/ea2e4696-4a2d-429c-9807-d02eb92e0222/download/tmpcje3ep_w.csv', '2018': 'https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/2be28d90-3a90-4af1-a3f6-f28c1e25880a/download/tmp7602cia8.csv', '2017': 'https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/300221

In [7]:
from datetime import datetime, timedelta
now = datetime.now()
thirty_days = timedelta(days=30)
thirty_days_ago = now - thirty_days
today_datestring = now.strftime("%Y-%m-%d")
thirty_days_ago_datestring = thirty_days_ago.strftime("%Y-%m-%d")
tomorrow_datestring = (datetime.today() + timedelta(days=1)).strftime('%Y-%m-%d')

print(today_datestring, thirty_days_ago_datestring, tomorrow_datestring)

2023-09-24 2023-08-25 2023-09-25


In [8]:
#set model folder constant
MODEL_FOLDER = './daily_models'

## Load extra features

In [9]:
#set path to mydata
EXTRA_mydata_FILE = './cls_and_pooled_embeddings_with_service_id.csv'


##Define several models

In [10]:
linear_tree_model = Boston311SurvDecTree(train_date_range={'start':'2022-01-01','end':thirty_days_ago_datestring},
                            predict_date_range={'start':thirty_days_ago_datestring,'end':today_datestring},
                            feature_columns=['type','queue'],
                            scenario={'dropColumnValues': {'source':['City Worker App', 'Employee Generated']},
                                      'survivalTimeMin':0,
                                      'survivalTimeFill':tomorrow_datestring},
                            files_dict=latest_URLS)

In [11]:
logistic_model = Boston311LogReg(train_date_range={'start':'2022-01-01','end':thirty_days_ago_datestring},
                            predict_date_range={'start':thirty_days_ago_datestring,'end':today_datestring},
                            feature_columns=['type', 'queue'],
                            scenario={'dropColumnValues': {'source':['City Worker App', 'Employee Generated']},
                                      'survivalTimeMin':0},
                            files_dict=latest_URLS)

In [12]:
oldlogistic_model = Boston311LogReg(train_date_range={'start':'2022-01-01','end':thirty_days_ago_datestring},
                            predict_date_range={'start':thirty_days_ago_datestring,'end':today_datestring},
                            feature_columns=['type', 'queue'],
                            scenario={'dropColumnValues': {'source':['City Worker App', 'Employee Generated']},
                                      'survivalTimeMin':0},
                            files_dict=latest_URLS)

In [13]:
logistic_tree_model = Boston311EventDecTree(train_date_range={'start':'2022-01-01','end':thirty_days_ago_datestring},
                            predict_date_range={'start':thirty_days_ago_datestring,'end':today_datestring},
                            feature_columns=['type', 'queue'],
                            scenario={'dropColumnValues': {'source':['City Worker App', 'Employee Generated']},
                                      'survivalTimeMin':0},
                            files_dict=latest_URLS)

In [14]:
kerasNLP_model = Boston311KerasNLP(train_date_range={'start':'2022-01-01','end':thirty_days_ago_datestring},
                            predict_date_range={'start':thirty_days_ago_datestring,'end':today_datestring},
                            feature_columns=['type', 'queue', 'source', 'subject', 'reason', 'department', 'ward_number'],
                            scenario={'dropColumnValues': {'source':['City Worker App', 'Employee Generated']},
                                      'survivalTimeMin':0,
                                      'survivalTimeFill':tomorrow_datestring},
                            files_dict=latest_URLS)

In [15]:
#get current datetime in Boston timezone as string
from datetime import datetime
from pytz import timezone
import pytz
boston = timezone('US/Eastern')
now = datetime.now(boston)
today_datestring = now.strftime("%Y-%m-%d")
#get time in Boston timezone as string for a filename
now = datetime.now(boston)
time_string = now.strftime("%H-%M-%S")
#define datetime string
my_datetime = today_datestring + '_' + time_string 

In [16]:
mydata = kerasNLP_model.load_data()



  df = pd.read_csv(file)
  df = pd.read_csv(file)


Files with different number of columns from File 0:  []
Files with same number of columns as File 0:  [0, 1]
Files with different column order from File 0:  []
Files with same column order as File 0:  [0, 1]


In [17]:
mydata['case_enquiry_id']

0         101004113346
1         101004113347
2         101004113348
3         101004113349
4         101004113351
              ...     
476126    101005012397
476127    101005012398
476128    101005012399
476129    101005012400
476130    101005012401
Name: case_enquiry_id, Length: 476131, dtype: int64

In [18]:
mydata = kerasNLP_model.enhance_data(mydata)


In [19]:
mydata = kerasNLP_model.apply_scenario(mydata)


In [20]:

mydata = kerasNLP_model.clean_data(mydata)

In [21]:
print(mydata['case_enquiry_id'])

0         101004113346
1         101004113347
2         101004113348
3         101004113349
4         101004113351
              ...     
476126    101005012397
476127    101005012398
476128    101005012399
476129    101005012400
476130    101005012401
Name: case_enquiry_id, Length: 423409, dtype: int64


In [22]:
import pandas as pd
import numpy as np
from ast import literal_eval
import pickle

pickle_file = 'dataframe.pkl'

X = None

if os.path.exists(pickle_file):
    X = pickle.load(open(pickle_file, "rb"))
else:
    X = pd.read_csv(EXTRA_mydata_FILE)

    #rename service_request_id to case_enquiry_id
    X.rename(columns={'service_request_id':'case_enquiry_id'}, inplace=True)
    #remove all rows where case_enquiry_id is non-numeric
    #X = X[X['case_enquiry_id'].str.isnumeric()]
    #convert case_enquiry_id to int64
    #X['case_enquiry_id'] = X['case_enquiry_id'].astype('int64')

    # Convert stringified arrays back to NumPy arrays
    X['cls_embedding'] = X['cls_embedding'].apply(literal_eval).apply(np.array)
    X['pooled_embedding'] = X['pooled_embedding'].apply(literal_eval).apply(np.array)

    pickle.dump(X, open(pickle_file, "wb"))



In [23]:
X.shape

(275503, 3)

In [24]:
df = X
# Assuming df is your DataFrame and it has columns 'cls_embedding' and 'pooled_embedding'
cls_embedding_flattened = np.stack(df['cls_embedding'].to_numpy())
pooled_embedding_flattened = np.stack(df['pooled_embedding'].to_numpy())

# Remove the old columns
df.drop(['cls_embedding', 'pooled_embedding'], axis=1, inplace=True)

# Add the new flattened columns
df_cls = pd.DataFrame(cls_embedding_flattened, columns=[f'cls_{i}' for i in range(cls_embedding_flattened.shape[1])])
df_pooled = pd.DataFrame(pooled_embedding_flattened, columns=[f'pooled_{i}' for i in range(pooled_embedding_flattened.shape[1])])

df = pd.concat([df, df_cls, df_pooled], axis=1)
X = df

In [25]:
X['case_enquiry_id']

0               101004615710
1               101004615710
2         service_request_id
3               101004616099
4               101004616098
                 ...        
275498          101005063009
275499          101005062363
275500          101005061781
275501          101005063039
275502          101005062077
Name: case_enquiry_id, Length: 275503, dtype: object

In [26]:
X['case_enquiry_id'] = X['case_enquiry_id'].astype(str)
is_numeric = X['case_enquiry_id'].str.isnumeric()

In [27]:
X = X[is_numeric]

In [28]:
X['case_enquiry_id'] = X['case_enquiry_id'].astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['case_enquiry_id'] = X['case_enquiry_id'].astype('int64')


In [29]:
X.shape

(275502, 257)

In [30]:
mydata.shape

(423409, 443)

In [31]:
#join them so we are left only with records that have mydata in both files
new_mydata = mydata.merge(X, on='case_enquiry_id', how='inner')


In [32]:
new_mydata.shape

(156025, 699)

In [33]:

X, y = kerasNLP_model.split_data(new_mydata)

In [34]:
#cast all columns that are type bool to float
for col in X.columns:
    if X[col].dtype == 'bool':
        X[col] = X[col].astype('float64')

In [35]:
#list the number of rows in X and y
print(X.shape)
print(y.shape)


(156025, 696)
(156025,)


In [36]:
best_model, best_hyperparameters = kerasNLP_model.tune_model(X, y, '/home/briarmoss/Documents/Boston_311/models/tuning')

Trial 46 Complete [00h 00m 40s]
val_accuracy: 0.2944720387458801

Best val_accuracy So Far: 0.37686267495155334
Total elapsed time: 00h 27m 18s

Search: Running Trial #47

Value             |Best Value So Far |Hyperparameter
4                 |2                 |num_layers
32                |128               |units_0
1                 |0                 |batch_normalization_0
128               |32                |units_1
1                 |1                 |batch_normalization_1
0.0071271         |0.00060448        |learning_rate
128               |128               |units_2
0                 |0                 |batch_normalization_2
32                |128               |units_3
1                 |1                 |batch_normalization_3





Epoch 1/5


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

In [None]:

#parse CLS embedding column as array
test_acc = kerasNLP_model.train_model( X, y )

Starting Training at 2023-09-23 23:57:04.590091




Epoch 1/100


2023-09-23 23:57:05.647125: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 657781248 exceeds 10% of free system memory.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Testing accuracy: 0.6836634278297424 
Top-2 accuracy: 0.8255290389060974 
Test loss: 1.067360281944275
Ending Training at 2023-09-24 00:00:53.011925
Training took 0:03:48.421834


## Train several models

In [None]:
print("learning is fun!") 

learning is fun!


In [None]:
#logistic_tree_model.run_pipeline()

In [None]:
#logistic_model.run_pipeline()

In [None]:
import gc
gc.collect()

3148

In [None]:
#linear_tree_model.run_pipeline()

In [None]:
"""
import datetime

def save_model_to_dir(model, folder_name):
    dir_path = os.path.join(MODEL_FOLDER, folder_name)
    
    if not os.path.exists(dir_path):
        os.mkdir(dir_path)
    
    timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    model_name = timestamp + "_" + model.model_type
    properties_name = model_name
    
    model.save(dir_path, model_name, properties_name)

# List of models
models = [linear_tree_model, logistic_tree_model, logistic_model]


# Iterate over models and save
for model in models:
    save_model_to_dir(model, model.model_type)
"""


'\nimport datetime\n\ndef save_model_to_dir(model, folder_name):\n    dir_path = os.path.join(MODEL_FOLDER, folder_name)\n    \n    if not os.path.exists(dir_path):\n        os.mkdir(dir_path)\n    \n    timestamp = datetime.datetime.now().strftime(\'%Y%m%d_%H%M%S\')\n    model_name = timestamp + "_" + model.model_type\n    properties_name = model_name\n    \n    model.save(dir_path, model_name, properties_name)\n\n# List of models\nmodels = [linear_tree_model, logistic_tree_model, logistic_model]\n\n\n# Iterate over models and save\nfor model in models:\n    save_model_to_dir(model, model.model_type)\n'