# Boston 311 Tutorial

This notebook walks you through the basic usage of this package: training 3 models on the Boston 311 data and using them to predict the outcome of cases from the last 30 days.

In [130]:
! pip install keras-tuner

Defaulting to user installation because normal site-packages is not writeable


In [131]:
! pip install ../

Defaulting to user installation because normal site-packages is not writeable
[0mProcessing /home/briarmoss/Documents/Boston_311
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: boston311
  Building wheel for boston311 (pyproject.toml) ... [?25ldone
[?25h  Created wheel for boston311: filename=boston311-0.1.0-py3-none-any.whl size=22841 sha256=65bc71c9af6028a187585993fc5edf327663a32cf9b76eb7e2d73e3b4881b174
  Stored in directory: /tmp/pip-ephem-wheel-cache-h_iiulbc/wheels/3d/69/ee/0a6ac96b9c09c948fc0e74f2724a9703aa39749a41fa757c9e
Successfully built boston311
[0mInstalling collected packages: boston311
  Attempting uninstall: boston311
[0m    Found existing installation: boston311 0.1.0
    Uninstalling boston311-0.1.0:
      Successfully uninstalled boston311-0.1.0
[0mSuccessfully installed boston311-0.1.0


## Import the Boston311 model classes

In [132]:
import os
import pandas as pd
import numpy as np
import pickle
import re
import sys
import time

In [133]:
from boston311 import Boston311LogReg, Boston311EventDecTree, Boston311SurvDecTree, Boston311KerasNLP


## Load extra features

In [134]:
# Build the date/time strings (Boston time) used by the rest of the notebook.
from datetime import datetime
from pytz import timezone
import pytz
boston = timezone('US/Eastern')
# Take ONE timestamp and derive every string from it. Calling datetime.now()
# separately for the date and for the time could pair yesterday's date with
# today's time if the cell ran across midnight.
now = datetime.now(boston)
today_datestring = now.strftime("%Y-%m-%d")
# time of day, written with dashes so it is safe inside a file name
time_string = now.strftime("%H-%M-%S")
# combined date_time stamp
my_datetime = today_datestring + '_' + time_string

# tomorrow's date formatted as yyyy-mm-dd
tomorrows_date = now + pd.DateOffset(days=1)
tomorrows_datestring = tomorrows_date.strftime("%Y-%m-%d")

In [135]:
# Path to the embeddings CSV and to the two files of the saved Keras NLP model.
EXTRA_mydata_FILE = './cls_and_pooled_embeddings_with_service_id.csv'
json_file = './daily_models/Boston311KerasNLP/20230925_143704_Boston311KerasNLP.json'
model_file = './daily_models/Boston311KerasNLP/20230925_143704_Boston311KerasNLP.h5'
kerasNLP_model = Boston311KerasNLP()
kerasNLP_model.load( json_file, model_file)
# Fixed spelling: this was `predict_date_rang`, which Python accepts silently by
# creating a brand-new attribute that nothing reads.
# NOTE(review): assumes the model class reads `predict_date_range` when loading
# data for prediction - confirm against the boston311 package.
kerasNLP_model.predict_date_range = {'start':'2023-08-27', 'end':tomorrows_datestring}


In [136]:
# Load the cases in 'predict' mode, then run them through the model's enhance_data step.
# NOTE(review): presumably 'predict' mode is what applies the prediction date range - confirm in boston311.
data = kerasNLP_model.load_data( 'predict' )
data = kerasNLP_model.enhance_data( data, 'predict')

  df = pd.read_csv(file)


Files with different number of columns from File 0:  []
Files with same number of columns as File 0:  [0]
Files with different column order from File 0:  []
Files with same column order as File 0:  [0]


In [137]:
# Prepare the loaded cases for prediction with the model's own cleaning step.
clean_data = kerasNLP_model.clean_data_for_prediction( data )

In [138]:
# Preview the first rows of the cleaned frame.
clean_data.head()

Unnamed: 0,case_enquiry_id,type_Abandoned Bicycle,type_Abandoned Building,type_Abandoned Vehicles,type_Abandoned Vehicles - Private Tow,type_Aircraft Noise Disturbance,type_Alert Boston,type_Animal Found,type_Animal Generic Request,type_Animal Lost,...,ward_number_20,ward_number_21,ward_number_22,ward_number_3,ward_number_4,ward_number_5,ward_number_6,ward_number_7,ward_number_8,ward_number_9
0,101005013792,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
1,101005013793,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
2,101005013794,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
3,101005013795,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,101005013796,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False


In [139]:
# `mydata` is a second name for the same object as clean_data (no copy is made).
mydata = clean_data

In [140]:
# Inspect the case id column (values and dtype).
mydata['case_enquiry_id']

0        101005013792
1        101005013793
2        101005013794
3        101005013795
4        101005013796
             ...     
29778    101005079312
29779    101005079313
29780    101005079315
29781    101005079316
29782    101005079318
Name: case_enquiry_id, Length: 29783, dtype: object

In [141]:
import pandas as pd
import numpy as np
from ast import literal_eval
import pickle

pickle_file = 'dataframe.pkl'

# The pickle is a cache of the parsed embeddings CSV. It is reused only when it
# exists AND is at least as new as the CSV. Modification times are compared as
# numbers: comparing time.ctime() strings sorts them alphabetically (by weekday
# and month name) rather than chronologically, and calling getmtime() on a
# pickle that does not exist yet raises FileNotFoundError on the first run.
cache_is_fresh = (
    os.path.exists(pickle_file)
    and os.path.getmtime(pickle_file) >= os.path.getmtime(EXTRA_mydata_FILE)
)

if cache_is_fresh:
    # NOTE(review): pickle.load can execute arbitrary code - only load a cache
    # file that this notebook wrote itself.
    with open(pickle_file, "rb") as cache_in:
        X = pickle.load(cache_in)
else:
    X = pd.read_csv(EXTRA_mydata_FILE)

    #rename service_request_id to case_enquiry_id
    X = X.rename(columns={'service_request_id':'case_enquiry_id'})
    # Rows with a non-numeric case_enquiry_id are intentionally NOT removed
    # here, and the id is not converted to int64 at this point.

    # Convert stringified arrays back to NumPy arrays
    X['cls_embedding'] = X['cls_embedding'].apply(literal_eval).apply(np.array)
    X['pooled_embedding'] = X['pooled_embedding'].apply(literal_eval).apply(np.array)

    # Overwrite any stale cache; `with` guarantees the file is flushed and closed.
    with open(pickle_file, "wb") as cache_out:
        pickle.dump(X, cache_out)



In [142]:
# Alias the embeddings frame as `df`; nothing is concatenated or reindexed in this cell.
df = X

In [143]:
# (rows, columns) of the embeddings frame.
df.shape

(28265, 3)

In [144]:

# Expand the two embedding columns of X (one NumPy vector per row) into one
# numeric column per vector component.
# The work reads from X and builds a NEW frame instead of dropping columns from
# `df` in place: `df` is only another name for X, so an in-place drop also
# removed the columns from X and made this cell fail when run a second time.
cls_embedding_flattened = np.stack(X['cls_embedding'].to_numpy())
pooled_embedding_flattened = np.stack(X['pooled_embedding'].to_numpy())

# One column per component, sharing X's index so the concat below aligns row by row.
df_cls = pd.DataFrame(
    cls_embedding_flattened,
    index=X.index,
    columns=[f'cls_{i}' for i in range(cls_embedding_flattened.shape[1])],
)
df_pooled = pd.DataFrame(
    pooled_embedding_flattened,
    index=X.index,
    columns=[f'pooled_{i}' for i in range(pooled_embedding_flattened.shape[1])],
)

# Remaining columns of X followed by the expanded embedding columns.
df = pd.concat(
    [X.drop(columns=['cls_embedding', 'pooled_embedding']), df_cls, df_pooled],
    axis=1,
)

In [145]:
# Treat ids as strings so that rows whose id is not made only of digits can be flagged.
df['case_enquiry_id'] = df['case_enquiry_id'].astype(str)
# Boolean mask: True where the id string is numeric.
is_numeric = df['case_enquiry_id'].str.isnumeric()

In [146]:
# Keep only the rows flagged as numeric. `.copy()` makes the result an
# independent frame, so assigning into its columns afterwards does not trigger
# pandas' SettingWithCopyWarning.
df = df[is_numeric].copy()

In [147]:
# Convert the id column to int64.
df['case_enquiry_id'] = df['case_enquiry_id'].astype('int64')

In [148]:
# (rows, columns) of the embeddings frame at this point.
df.shape

(28265, 257)

In [149]:
# (rows, columns) of the cleaned case frame.
mydata.shape

(29783, 436)

In [150]:
# Inner join on the case id: a row survives only if the id is present in both frames.
new_mydata = pd.merge(mydata, df, how='inner', on='case_enquiry_id')


In [151]:
# (rows, columns) of the merged frame.
new_mydata.shape

(18186, 692)

In [152]:

# From here on `df` refers to the merged frame (same object as new_mydata, not a copy).
df = new_mydata

In [153]:
# Cast every bool AND int64 column to float64 in a single astype call.
# Columns of any other dtype are left untouched. astype returns a new frame, so
# the merged frame that `df` was aliasing is not modified column by column.
columns_to_cast = df.select_dtypes(include=['bool', 'int64']).columns
df = df.astype({column: 'float64' for column in columns_to_cast})

In [154]:
# Show the dtype of every column to confirm the cast above.
print(df.dtypes)


case_enquiry_id                           object
type_Abandoned Bicycle                   float64
type_Abandoned Building                  float64
type_Abandoned Vehicles                  float64
type_Abandoned Vehicles - Private Tow    float64
                                          ...   
pooled_123                               float64
pooled_124                               float64
pooled_125                               float64
pooled_126                               float64
pooled_127                               float64
Length: 692, dtype: object


In [155]:
#free all unused dataframes
#df_to_delete = [cls_embedding_flattened, pooled_embedding_flattened, df_cls, df_pooled, X, new_mydata, is_numeric, mydata]

#for data_frame in df_to_delete:
#    if data_frame is not None:
#        del data_frame

In [156]:
# Feature matrix = every column except the case id; the ids are kept separately
# so they can be paired with the predictions afterwards.
X_predict = df.drop(columns=['case_enquiry_id'])
case_enquiry_id = df['case_enquiry_id']


In [157]:

# Run the loaded Keras model on the feature matrix.
predictions = kerasNLP_model.model.predict(X_predict)

2023-10-09 10:07:41.358800: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 100532208 exceeds 10% of free system memory.




In [158]:
# Render a 1-D sequence as a single string, elements separated by one space.
def array_to_string(arr):
    """Return the str() of every element of `arr`, joined by single spaces."""
    pieces = [str(element) for element in arr]
    return ' '.join(pieces)

# Build one space-separated string per row of `predictions`.
# np.apply_along_axis is deliberately not used: it allocates its output with the
# dtype of the FIRST result, so with string results every later row that is
# longer than the first row's string is silently truncated. Building the array
# from a list lets NumPy size the string dtype to the longest row.
string_predictions = np.array([array_to_string(row) for row in predictions])

# string_predictions is a 1D NumPy array holding one string per row of the
# original 2D array.
print(string_predictions)

['0.7559566 0.20721163 0.033212524 0.0025828772 0.0008951307 8.179453e-05 3.803741e-05 1.7574697e-05 3.7406842e-06'
 '0.7104199 0.2381293 0.046981774 0.003711577 0.00040469103 0.00014026689 0.00020275456 9.391938e-06 2.3866696e-07'
 '0.6704167 0.28465536 0.03871845 0.0035374311 0.0015684962 0.0002003041 0.00016235236 0.000731696 9.161791e-06'
 ...
 '0.15320297 0.09023995 0.06491842 0.029716609 0.0429548 0.044575464 0.45883057 0.049090143 0.06647106'
 '0.15320298 0.09023996 0.06491843 0.02971661 0.042954803 0.044575468 0.4588306 0.049090147 0.06647107'
 '0.15320298 0.09023996 0.06491843 0.02971661 0.042954803 0.044575468 0.4588306 0.049090143 0.06647107']


In [159]:
# Pair each case id with its prediction string in a two-column frame.
predictions_df = pd.DataFrame({'case_enquiry_id':case_enquiry_id, 'prediction':string_predictions})

In [160]:
# Human-readable labels for nine duration bins.
# NOTE(review): presumably label i corresponds to the i-th value of each
# prediction row - confirm against the model's training bins.
# The list is currently unused: the only line that would apply it is commented out below.
bin_labels = [
    "0-12 hours",      # Less than half a day
    "12-24 hours",     # Half to one day
    "1-3 days",        # One to three days
    "4-7 days",        # Four to seven days
    "1-2 weeks",       # One to two weeks
    "2-4 weeks",       # Two to four weeks
    "1-2 months",      # One to two months
    "2-4 months",      # Two to four months
    "4+ months"        # More than four months
]

#predictions_df['prediction'] = predictions_df['prediction'].apply(lambda x: bin_labels[x])

In [161]:
# Number of rows in the predictions frame.
len(predictions_df)

18186

In [162]:
# Raw output of model.predict, before it was converted to strings.
print(predictions)

[[7.5595659e-01 2.0721163e-01 3.3212524e-02 ... 3.8037411e-05
  1.7574697e-05 3.7406842e-06]
 [7.1041989e-01 2.3812930e-01 4.6981774e-02 ... 2.0275456e-04
  9.3919380e-06 2.3866696e-07]
 [6.7041671e-01 2.8465536e-01 3.8718451e-02 ... 1.6235236e-04
  7.3169603e-04 9.1617912e-06]
 ...
 [1.5320297e-01 9.0239950e-02 6.4918421e-02 ... 4.5883057e-01
  4.9090143e-02 6.6471063e-02]
 [1.5320298e-01 9.0239957e-02 6.4918429e-02 ... 4.5883060e-01
  4.9090147e-02 6.6471070e-02]
 [1.5320298e-01 9.0239957e-02 6.4918429e-02 ... 4.5883060e-01
  4.9090143e-02 6.6471070e-02]]


In [163]:
# Type label stored on the loaded model object.
kerasNLP_model.model_type

'Boston311KerasNLP'

In [164]:
import pandas as pd

model_name = '20230925_143704_Boston311KerasNLP'
ml_model_date = '2023-09-25'

# One-row table describing the model that produced the predictions.
# The frame is built directly from the record: creating an empty frame and
# concatenating onto it is unnecessary, and concatenating with empty frames is
# behaviour that newer pandas versions warn about.
ml_model_df = pd.DataFrame(
    [{'ml_model_name': model_name,
      'ml_model_type': kerasNLP_model.model_type,
      'ml_model_date': ml_model_date}],
    columns=['ml_model_name', 'ml_model_type', 'ml_model_date'],
)

In [165]:
# Independent copy of the loaded cases without the geometry and survival/event columns.
columns_not_exported = ['geom_4326','survival_time_hours', 'survival_time', 'event']
model_cases = data.drop(columns=columns_not_exported).copy()

In [166]:
# Aliases, not copies: any column added to all_model_predictions is also added to predictions_df.
all_model_cases = model_cases 
all_model_predictions = predictions_df

In [167]:
# Tag every prediction row with the model name.
all_model_predictions['ml_model_name'] = model_name

In [168]:
# Tag every prediction row with today's date string.
all_model_predictions['prediction_date'] = today_datestring

In [169]:
# %%
# Take a fresh timestamp (Boston time) for naming the export files.
from datetime import datetime
from pytz import timezone
import pytz
boston = timezone('US/Eastern')
# A single call to now() feeds both the date and the time string, so the two
# parts of the stamp cannot come from different days around midnight.
now = datetime.now(boston)
today_datestring = now.strftime("%Y-%m-%d")
# time of day, written with dashes so it is safe inside a file name
time_string = now.strftime("%H-%M-%S")
# combined date_time stamp used as the file name prefix
my_datetime = today_datestring + '_' + time_string

In [170]:


# %%
# Export the case records; every export file name is prefixed with my_datetime.
all_model_cases.to_csv(my_datetime+'_311_cases.csv', index=False)


# %%
# Export the per-case predictions.
all_model_predictions.to_csv(my_datetime+'_311_predictions.csv', index=False)

# %%
# Export the model metadata table.
ml_model_df.to_csv(my_datetime+'_311_ml_models.csv', index=False)

# %%
# Write <my_datetime>_manifest.txt listing the three exported CSV files, one name per line.
manifest_entries = [
    my_datetime+'_311_cases.csv',
    my_datetime+'_311_predictions.csv',
    my_datetime+'_311_ml_models.csv',
]
with open(my_datetime+'_manifest.txt', 'w') as manifest:
    manifest.write(''.join(entry + '\n' for entry in manifest_entries))

# %%
# Copy the export files into the local seeders folder.
import shutil

EXPORT_FOLDER = '~/Documents/BODC-DEI-site/database/seeders'
# Expand '~' explicitly (Python does not do it the way the shell does) and make
# sure the folder really exists: copying into a missing folder would otherwise
# fail, or create a plain file named after the folder.
export_dir = os.path.expanduser(EXPORT_FOLDER)
os.makedirs(export_dir, exist_ok=True)

for export_name in (
    f"{my_datetime}_311_cases.csv",
    f"{my_datetime}_311_predictions.csv",
    f"{my_datetime}_311_ml_models.csv",
    f"{my_datetime}_manifest.txt",
):
    shutil.copy(export_name, export_dir)



# %% [markdown]
# ** Copy the files to the production server **

# %%
# Connection details and remote folders for the deployment servers.
# Each value can be overridden with an environment variable, so the account and
# host do not have to be edited (or committed) in the notebook. The defaults are
# exactly the values that used to be hard-coded here.
PROD_USER = os.environ.get('BODC_PROD_USER', 'u353344964')
PROD_HOSTNAME = os.environ.get('BODC_PROD_HOSTNAME', '195.179.236.61')
PORT_NUMBER = int(os.environ.get('BODC_SSH_PORT', '65002'))
PROD_BASE_FOLDER = os.environ.get('BODC_PROD_BASE_FOLDER', '/home/u353344964/domains/bodc-dei.org/laravel')
STAGE_BASE_FOLDER = os.environ.get('BODC_STAGE_BASE_FOLDER', '/home/u353344964/domains/bodc-dei.org/stagelaravel')
# The seeder folders sit under the base folders; deriving them keeps the two
# paths from drifting apart when a base folder changes.
PROD_EXPORT_FOLDER = PROD_BASE_FOLDER + '/database/seeders'
STAGE_EXPORT_FOLDER = STAGE_BASE_FOLDER + '/database/seeders'

# %%







In [171]:
import os


def scp_to_server(filename, user=PROD_USER, hostname=PROD_HOSTNAME, port=PORT_NUMBER, export_folder=PROD_EXPORT_FOLDER):
    """Copy a local file to a folder on a remote server using scp.

    Args:
        filename: path of the local file to upload.
        user: account name on the remote server.
        hostname: name or address of the remote server.
        port: SSH port of the remote server.
        export_folder: destination folder on the remote server.

    Returns:
        int: exit status of scp (0 means success), or 127 if scp could not be
        started at all. Failures are reported but not raised, so a failed
        upload does not abort the caller.
    """
    import subprocess

    target = f"{user}@{hostname}:{export_folder}"
    command = f"scp -P {port} {filename} {target}"
    print(f"Executing: {command}")
    try:
        # Argument list instead of a shell string: a file name containing
        # spaces or shell metacharacters is passed through unchanged.
        completed = subprocess.run(["scp", "-P", str(port), str(filename), target])
    except OSError as error:
        print(f"Could not run scp for {filename}: {error}")
        return 127
    if completed.returncode != 0:
        # The exit status used to be discarded, so a failed upload went unnoticed.
        print(f"scp failed with exit status {completed.returncode} for {filename}")
    return completed.returncode

# Files produced by this run that have to be uploaded.
files_to_copy = [
    my_datetime + suffix
    for suffix in ("_311_cases.csv", "_311_predictions.csv", "_311_ml_models.csv", "_manifest.txt")
]

# Switches selecting the destination servers.
copy_to_prod = True
copy_to_stage = True

# Enabled destinations, production first, so each file goes to prod and then to stage.
destination_folders = []
if copy_to_prod:
    destination_folders.append(PROD_EXPORT_FOLDER)
if copy_to_stage:
    destination_folders.append(STAGE_EXPORT_FOLDER)

for file in files_to_copy:
    for destination_folder in destination_folders:
        scp_to_server(file, export_folder=destination_folder)


# %%
# Display the configured SSH port (inspection only; it has no effect on the run).
PORT_NUMBER

Executing: scp -P 65002 2023-10-09_10-07-48_311_cases.csv u353344964@195.179.236.61:/home/u353344964/domains/bodc-dei.org/laravel/database/seeders


Executing: scp -P 65002 2023-10-09_10-07-48_311_cases.csv u353344964@195.179.236.61:/home/u353344964/domains/bodc-dei.org/stagelaravel/database/seeders
Executing: scp -P 65002 2023-10-09_10-07-48_311_predictions.csv u353344964@195.179.236.61:/home/u353344964/domains/bodc-dei.org/laravel/database/seeders
Executing: scp -P 65002 2023-10-09_10-07-48_311_predictions.csv u353344964@195.179.236.61:/home/u353344964/domains/bodc-dei.org/stagelaravel/database/seeders
Executing: scp -P 65002 2023-10-09_10-07-48_311_ml_models.csv u353344964@195.179.236.61:/home/u353344964/domains/bodc-dei.org/laravel/database/seeders
Executing: scp -P 65002 2023-10-09_10-07-48_311_ml_models.csv u353344964@195.179.236.61:/home/u353344964/domains/bodc-dei.org/stagelaravel/database/seeders
Executing: scp -P 65002 2023-10-09_10-07-48_manifest.txt u353344964@195.179.236.61:/home/u353344964/domains/bodc-dei.org/laravel/database/seeders
Executing: scp -P 65002 2023-10-09_10-07-48_manifest.txt u353344964@195.179.236.61:/

65002

In [172]:
# %%
if copy_to_prod:
    !ssh -p {PORT_NUMBER} {PROD_USER}@{PROD_HOSTNAME} 'cd {PROD_BASE_FOLDER}; php artisan db:seed --class=ThreeOneOneSeeder'

if copy_to_stage:
    !ssh -p {PORT_NUMBER} {PROD_USER}@{PROD_HOSTNAME} 'cd {STAGE_BASE_FOLDER}; php artisan db:seed --class=ThreeOneOneSeeder'


   INFO  Seeding database.  


Manifest files:
/home/u353344964/domains/bodc-dei.org/laravel/database/seeders/2023-10-09_10-07-48_manifest.txt

Total records to process: 47970

Processing /home/u353344964/domains/bodc-dei.org/laravel/database/seeders/2023-10-09_10-07-48_311_cases.csv
[6A[K100 App\Models\ThreeOneOneCase records processed.
[KRecords remaining in this file: 29683.
[KTotal records remaining: 47870.
[KTime for last 100 records: 0.03 seconds.
[KEstimated time remaining for this file: 8 seconds.
[KEstimated time for all files: 14 seconds.
[6A[K200 App\Models\ThreeOneOneCase records processed.
[KRecords remaining in this file: 29583.
[KTotal records remaining: 47770.
[KTime for last 100 records: 0.01 seconds.
[KEstimated time remaining for this file: 1 second.
[KEstimated time for all files: 3 seconds.
[6A[K300 App\Models\ThreeOneOneCase records processed.
[KRecords remaining in this file: 29483.
[KTotal records remaining: 47670.
[KTime for last 100 records