#Boston 311 Tutorial

This notebook will run you through the basic usage of this package to train 3 models on the Boston 311 data and use them to predict the outcome of cases from the last 30 days

##Install the package from github using pip

In [1]:
#This library is only needed for the Cox Regression Model, which is not included in this tutorial
#! pip install lifelines

In [2]:
#pwd()

In [3]:
#! pip install ../

##Import the Boston311Model class

In [1]:
! pip show boston311

Name: boston311
Version: 0.1.0
Summary: A package for training machine learning models on Boston 311 data
Home-page: https://github.com/mindfulcoder49/Boston_311
Author: Alex Alcivar
Author-email: alex.g.alcivar49@gmail.com
License: UNKNOWN
Location: /home/briarmoss/.local/lib/python3.10/site-packages
Requires: matplotlib, numpy, pandas, scikit-learn, tensorflow
Required-by: 


In [1]:
from boston311 import Boston311LogReg, Boston311EventDecTree, Boston311SurvDecTree

2023-09-08 20:11:55.359441: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-08 20:11:55.973234: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-08 20:11:55.976288: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


#Get latest file URLS and Current Date Ranges

In [28]:
import os

#define daily model folder constant
DAILY_MODEL_FOLDER = './daily_models'

#define a dict for each type of 311 model, then scan all the folders in the daily model folder and load the correct model based on the folder name if it contains logtree or logreg and add it to the dict with the key being the folder name
daily_model_dict = {}
for folder in os.listdir(DAILY_MODEL_FOLDER):
    if 'logtree' in folder:
        daily_model_dict[folder] = Boston311EventDecTree()
        daily_model_dict[folder].load(os.path.join(DAILY_MODEL_FOLDER, folder, 'logtreeproperties.json'), os.path.join(DAILY_MODEL_FOLDER, folder, 'logtree.pkl'))
    elif 'logreg' in folder:
        daily_model_dict[folder] = Boston311LogReg()
        daily_model_dict[folder].load(os.path.join(DAILY_MODEL_FOLDER, folder, 'logregproperties.json'), os.path.join(DAILY_MODEL_FOLDER, folder, 'logreg.h5'))

daily_model_dict

{'logtree': <boston311.Boston311EventDecTree.Boston311EventDecTree at 0x7f1f794a2920>,
 'logreg': <boston311.Boston311LogReg.Boston311LogReg at 0x7f1ffbab5720>}

In [7]:
from datetime import datetime, timedelta
now = datetime.now()
today_datestring = now.strftime("%Y-%m-%d")

In [30]:
import pandas as pd

#define an empt pandas dataframe ml_model_df
ml_model_df = pd.DataFrame(columns=['ml_model_name', 'ml_model_type', 'id', 'ml_model_date'])
all_model_cases = pd.DataFrame()
all_model_predictions = pd.DataFrame()


ml_model_df

Unnamed: 0,ml_model_name,ml_model_type,id,ml_model_date


In [31]:



#foreach model in the daily_model_dict set the predict_dat_range to the last 30 days and then call the predict method and save the results to a csv file
for model_name, model in daily_model_dict.items():

    print(f"Processing model: {model_name}")

    print(ml_model_df)
    model.predict_date_range = {'start': '2023-08-09', 'end': today_datestring}



    #get file creation date for the .json file in the folder with the model_name
    #use os.path.getctime to get the creation time of the .json file in the folder with the model_name
    #convert the creation time to a datetime object
    #convert the datetime object to a string in the format of %Y-%m-%d
    #add to ml_model_df dataframe with  columns, ml_model_name, ml_model_type,ml_model_id, ml_model_date
    ml_model_datetime = os.path.getctime(os.path.join(DAILY_MODEL_FOLDER, model_name, model_name + 'properties.json'))
    ml_model_date = datetime.fromtimestamp(ml_model_datetime).strftime('%Y-%m-%d')
    

    ml_model_df = pd.concat([ml_model_df, pd.DataFrame([{'ml_model_name': model_name, 
                                    'ml_model_type': model.__class__.__name__,
                                    'id': model_name,
                                    'ml_model_date': ml_model_date}])], ignore_index=True)
    
    print(ml_model_df)

    model_prediction = model.predict()
    #get new dataframe with just the event_prediction column from the model_prediction dataframe
    model_prediction_event = model_prediction[['event_prediction','case_enquiry_id']].copy()
    model_prediction_event['ml_model_id'] = model_name
    #add today's date to the dataframe
    model_prediction_event['prediction_date'] = today_datestring
    #rename the event_prediction column to prediction
    model_prediction_event.rename(columns={'event_prediction': 'prediction'}, inplace=True)

    #remove model_prediction event_prediction column
    model_cases = model_prediction.drop('event_prediction', axis=1).copy()

    #remove geom column in model_cases
    model_cases = model_cases.drop('geom_4326', axis=1).copy()

    # Add the model_cases dataframe to the all_model_cases dataframe
    all_model_cases = pd.concat([all_model_cases, model_cases], ignore_index=True).drop_duplicates().reset_index(drop=True)

    # Add the model_prediction_event dataframe to the all_model_predictions dataframe
    all_model_predictions = pd.concat([all_model_predictions, model_prediction_event], ignore_index=True)

    

    



Processing model: logtree
Empty DataFrame
Columns: [ml_model_name, ml_model_type, id, ml_model_date]
Index: []
  ml_model_name          ml_model_type       id ml_model_date
0       logtree  Boston311EventDecTree  logtree    2023-09-08


  df = pd.read_csv(file)


Files with different number of columns from File 0:  []
Files with same number of columns as File 0:  [0]
Files with different column order from File 0:  []
Files with same column order as File 0:  [0]
Processing model: logreg
  ml_model_name          ml_model_type       id ml_model_date
0       logtree  Boston311EventDecTree  logtree    2023-09-08
  ml_model_name          ml_model_type       id ml_model_date
0       logtree  Boston311EventDecTree  logtree    2023-09-08
1        logreg        Boston311LogReg   logreg    2023-09-08


  df = pd.read_csv(file)


Files with different number of columns from File 0:  []
Files with same number of columns as File 0:  [0]
Files with different column order from File 0:  []
Files with same column order as File 0:  [0]


##Save the prediction data

In [37]:
all_model_cases.to_csv(today_datestring+'_311_cases.csv', index=False)


In [35]:

all_model_predictions.to_csv(today_datestring+'_311_predictions.csv', index=False)

In [36]:

ml_model_df.to_csv(today_datestring+'_311_ml_models.csv', index=False)

In [38]:
#create an export folder
EXPORT_FOLDER = '~/Documents/BODC-DEI-site/database/seeders'
#copy the csv files to the export folder
!cp {today_datestring}_311_cases.csv {EXPORT_FOLDER}
!cp {today_datestring}_311_predictions.csv {EXPORT_FOLDER}
!cp {today_datestring}_311_ml_models.csv {EXPORT_FOLDER}

