#Boston 311 Tutorial

This notebook will run you through the basic usage of this package to train 3 models on the Boston 311 data and use them to predict the outcome of cases from the last 30 days

##Install the package from github using pip

In [1]:
#This library is only needed for the Cox Regression Model, which is not included in this tutorial
#! pip install lifelines

In [2]:
#pwd()

In [3]:
! pip install ../

Defaulting to user installation because normal site-packages is not writeable
Processing /home/briarmoss/Documents/Boston_311
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: boston311
  Building wheel for boston311 (pyproject.toml) ... [?25ldone
[?25h  Created wheel for boston311: filename=boston311-0.1.0-py3-none-any.whl size=20043 sha256=0b0b050beee6707f1dc4c376f0fc1aa439b79bf768309c3f53dae8c5830d7e44
  Stored in directory: /tmp/pip-ephem-wheel-cache-q37q1ygc/wheels/3d/69/ee/0a6ac96b9c09c948fc0e74f2724a9703aa39749a41fa757c9e
Successfully built boston311
Installing collected packages: boston311
  Attempting uninstall: boston311
    Found existing installation: boston311 0.1.0
    Uninstalling boston311-0.1.0:
      Successfully uninstalled boston311-0.1.0
Successfully installed boston311-0.1.0


##Import the Boston311Model class

In [4]:
! pip show boston311

Name: boston311
Version: 0.1.0
Summary: A package for training machine learning models on Boston 311 data
Home-page: https://github.com/mindfulcoder49/Boston_311
Author: Alex Alcivar
Author-email: alex.g.alcivar49@gmail.com
License: UNKNOWN
Location: /home/briarmoss/.local/lib/python3.10/site-packages
Requires: matplotlib, numpy, pandas, scikit-learn, tensorflow
Required-by: 


In [5]:
from boston311 import Boston311LogReg, Boston311EventDecTree, Boston311SurvDecTree

2023-09-13 10:50:12.253282: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-13 10:50:12.616594: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-13 10:50:12.619236: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


#Get latest file URLS and Current Date Ranges

In [6]:
! ls .

2023-09-11_19-11-02_311_cases.csv	 2023-09-12_17-01-30_manifest.txt
2023-09-11_19-11-02_311_ml_models.csv	 daily_models
2023-09-11_19-11-02_311_predictions.csv  daily_prediction_and_export.ipynb
2023-09-11_19-11-02_manifest.txt	 daily.py
2023-09-12_17-01-30_311_cases.csv	 __pycache__
2023-09-12_17-01-30_311_ml_models.csv	 save_models_and_predictions.ipynb
2023-09-12_17-01-30_311_predictions.csv  train_and_save_models.ipynb


In [7]:
import os

#define daily model folder constant
DAILY_MODEL_FOLDER = './daily_models'


# The helper function load_model_from_file is adjusted to load a model 
# based on its type and the provided timestamp.
# The main loop iterates through each folder in DAILY_MODEL_FOLDER.
# For each folder, it checks for model files (.pkl or .h5).
# If a model file is found, it extracts the timestamp and model type 
# from the filename and uses the helper function to load the model.
# The loaded model is added to the daily_model_dict with the key being 
# the model's filename without the extension.


def load_model_from_file(model_type, folder_path, timestamp):
    """Load a model based on its type from a given folder."""
    if model_type == 'Boston311EventDecTree':
        model_instance = Boston311EventDecTree()
        model_file = f'{timestamp}_{model_type}.pkl'
    elif model_type == 'Boston311LogReg':
        model_instance = Boston311LogReg()
        model_file = f'{timestamp}_{model_type}.h5'
    elif model_type == 'Boston311SurvDecTree':
        model_instance = Boston311SurvDecTree()
        model_file = f'{timestamp}_{model_type}.pkl'
    else:
        raise ValueError(f"Unknown model type: {model_type}")

    properties_file = f'{timestamp}_{model_type}.json'
    model_instance.load(os.path.join(folder_path, properties_file), os.path.join(folder_path, model_file))
    
    return model_instance

daily_model_dict = {}

for folder in os.listdir(DAILY_MODEL_FOLDER):
    folder_path = os.path.join(DAILY_MODEL_FOLDER, folder)
    if os.path.isdir(folder_path):
        for file in os.listdir(folder_path):
            if file.count('_') == 2 and any(ext in file for ext in ['.pkl', '.h5']):
                parts = file.rsplit('.', 1)[0].split('_')
                timestamp = f"{parts[0]}_{parts[1]}"
                model_type = parts[2]
                try:
                    daily_model_dict[f'{timestamp}_{model_type}'] = load_model_from_file(model_type, folder_path, timestamp)
                except ValueError:
                    # Skip files with unknown model types
                    continue

daily_model_dict



{'20230911_185132_Boston311LogReg': <boston311.Boston311LogReg.Boston311LogReg at 0x7fbd866fb040>,
 '20230911_185132_Boston311EventDecTree': <boston311.Boston311EventDecTree.Boston311EventDecTree at 0x7fbd866fb100>,
 '20230911_185132_Boston311SurvDecTree': <boston311.Boston311SurvDecTree.Boston311SurvDecTree at 0x7fbd866fb070>}

In [8]:
from datetime import datetime, timedelta
now = datetime.now()
today_datestring = now.strftime("%Y-%m-%d")

In [9]:
import pandas as pd

#define an empt pandas dataframe ml_model_df
ml_model_df = pd.DataFrame(columns=['ml_model_name', 'ml_model_type', 'ml_model_date'])
all_model_cases = pd.DataFrame()
all_model_predictions = pd.DataFrame()


ml_model_df

Unnamed: 0,ml_model_name,ml_model_type,ml_model_date


In [10]:



#foreach model in the daily_model_dict set the predict_dat_range to the last 30 days and then call the predict method and save the results to a csv file
for model_name, model in daily_model_dict.items():

    print(f"Processing model: {model_name}")

    print(ml_model_df)
    model.predict_date_range = {'start': '2023-08-09', 'end': today_datestring}



    #get file creation date for the .json file in the folder with the model_name
    #use os.path.getctime to get the creation time of the .json file in the folder with the model_name
    #convert the creation time to a datetime object
    #convert the datetime object to a string in the format of %Y-%m-%d
    #add to ml_model_df dataframe with  columns, ml_model_name, ml_model_type,ml_model_id, ml_model_date
    ml_model_datetime = os.path.getctime(os.path.join(DAILY_MODEL_FOLDER, model.__class__.__name__, model_name + '.json'))
    ml_model_date = datetime.fromtimestamp(ml_model_datetime).strftime('%Y-%m-%d')
    

    ml_model_df = pd.concat([ml_model_df, pd.DataFrame([{'ml_model_name': model_name, 
                                    'ml_model_type': model.__class__.__name__,
                                    'ml_model_date': ml_model_date}])], ignore_index=True)
    
    print(ml_model_df)

    model_prediction = model.predict()

    #check if the model_prediction dataframe contains an event_prediction column
    if 'event_prediction' in model_prediction.columns:
    #get new dataframe with just the event_prediction column from the model_prediction dataframe
        model_prediction_event = model_prediction[['event_prediction','case_enquiry_id']].copy()
        model_prediction_event.rename(columns={'event_prediction': 'prediction'}, inplace=True)
        #remove model_prediction event_prediction column
        model_cases = model_prediction.drop('event_prediction', axis=1).copy()
    elif 'survival_prediction' in model_prediction.columns:
        model_prediction_event = model_prediction[['survival_prediction','case_enquiry_id']].copy()
        model_prediction_event.rename(columns={'survival_prediction': 'prediction'}, inplace=True)
        #remove model_prediction survival_prediction column
        model_cases = model_prediction.drop('survival_prediction', axis=1).copy()

    model_prediction_event['ml_model_name'] = model_name
    #add today's date to the dataframe
    model_prediction_event['prediction_date'] = today_datestring
    #rename the event_prediction column to prediction
    
    #remove geom column in model_cases
    model_cases = model_cases.drop(['geom_4326','event','survival_time_hours', 'survival_time'], axis=1).copy()

    # Add the model_cases dataframe to the all_model_cases dataframe
    all_model_cases = pd.concat([all_model_cases, model_cases], ignore_index=True).drop_duplicates().reset_index(drop=True)

    # Add the model_prediction_event dataframe to the all_model_predictions dataframe
    all_model_predictions = pd.concat([all_model_predictions, model_prediction_event], ignore_index=True)

    

    



Processing model: 20230911_185132_Boston311LogReg
Empty DataFrame
Columns: [ml_model_name, ml_model_type, ml_model_date]
Index: []
                     ml_model_name    ml_model_type ml_model_date
0  20230911_185132_Boston311LogReg  Boston311LogReg    2023-09-11


  df = pd.read_csv(file)


Files with different number of columns from File 0:  []
Files with same number of columns as File 0:  [0]
Files with different column order from File 0:  []
Files with same column order as File 0:  [0]
Processing model: 20230911_185132_Boston311EventDecTree
                     ml_model_name    ml_model_type ml_model_date
0  20230911_185132_Boston311LogReg  Boston311LogReg    2023-09-11
                           ml_model_name          ml_model_type ml_model_date
0        20230911_185132_Boston311LogReg        Boston311LogReg    2023-09-11
1  20230911_185132_Boston311EventDecTree  Boston311EventDecTree    2023-09-11


  df = pd.read_csv(file)


Files with different number of columns from File 0:  []
Files with same number of columns as File 0:  [0]
Files with different column order from File 0:  []
Files with same column order as File 0:  [0]
Processing model: 20230911_185132_Boston311SurvDecTree
                           ml_model_name          ml_model_type ml_model_date
0        20230911_185132_Boston311LogReg        Boston311LogReg    2023-09-11
1  20230911_185132_Boston311EventDecTree  Boston311EventDecTree    2023-09-11
                           ml_model_name          ml_model_type ml_model_date
0        20230911_185132_Boston311LogReg        Boston311LogReg    2023-09-11
1  20230911_185132_Boston311EventDecTree  Boston311EventDecTree    2023-09-11
2   20230911_185132_Boston311SurvDecTree   Boston311SurvDecTree    2023-09-11


  df = pd.read_csv(file)


Files with different number of columns from File 0:  []
Files with same number of columns as File 0:  [0]
Files with different column order from File 0:  []
Files with same column order as File 0:  [0]


In [11]:
#count rows in prediction dataframe
print(f"Number of rows in all_model_predictions: {len(all_model_predictions)}")

Number of rows in all_model_predictions: 103716


In [12]:
# Assuming the dataframe with all case data is named all_cases
closed_case_ids = all_model_cases[all_model_cases['case_status'] == 'Closed']['case_enquiry_id'].unique()

# Drop rows from all_model_predictions where case_enquiry_id matches those in closed_case_ids
all_model_predictions = all_model_predictions[~all_model_predictions['case_enquiry_id'].isin(closed_case_ids)]


In [13]:
#count rows in prediction dataframe
print(f"Number of rows in all_model_predictions: {len(all_model_predictions)}")

Number of rows in all_model_predictions: 20997


## Save the prediction data


In [14]:
#get current datetime in Boston timezone as string
from datetime import datetime
from pytz import timezone
import pytz
boston = timezone('US/Eastern')
now = datetime.now(boston)
today_datestring = now.strftime("%Y-%m-%d")
#get time in Boston timezone as string for a filename
now = datetime.now(boston)
time_string = now.strftime("%H-%M-%S")
#define datetime string
my_datetime = today_datestring + '_' + time_string 

my_datetime

'2023-09-13_10-51-36'

In [27]:
import hashlib

# Function to compute the checksum for a row
def compute_checksum(row):
    # Convert row to string and encode
    row_str = ''.join(map(str, row.values)).encode('utf-8')
    
    # Compute MD5 hash (or any other hash of your choice)
    result = hashlib.md5(row_str).hexdigest()
    return result

# Apply the function to each row and assign the result to a new column
all_model_cases['checksum'] = all_model_cases.apply(compute_checksum, axis=1)

print(all_model_cases)

       case_enquiry_id             open_dt        sla_target_dt   
0         101004975722 2023-08-09 00:37:00  2023-08-10 04:30:00  \
1         101004975723 2023-08-09 00:47:40  2023-08-10 04:30:00   
2         101004975724 2023-08-09 00:48:58  2023-08-10 04:30:00   
3         101004975726 2023-08-09 01:18:47  2023-08-11 04:30:00   
4         101004975727 2023-08-09 01:19:54  2023-08-10 04:30:00   
...                ...                 ...                  ...   
34567     101005050187 2023-09-12 19:48:23  2023-09-14 04:30:00   
34568     101005050188 2023-09-12 19:49:22  2023-09-14 04:30:00   
34569     101005050189 2023-09-12 19:49:55  2023-09-14 04:30:00   
34570     101005050190 2023-09-12 19:50:09  2023-09-14 04:30:00   
34571     101005050191 2023-09-12 19:56:33  2023-09-14 04:30:00   

                closed_dt on_time case_status   
0     2023-08-09 01:29:44  ONTIME      Closed  \
1     2023-08-09 01:43:02  ONTIME      Closed   
2     2023-08-09 08:53:28  ONTIME      Closed   

In [15]:
all_model_cases.to_csv(my_datetime+'_311_cases.csv', index=False)


In [16]:

all_model_predictions.to_csv(my_datetime+'_311_predictions.csv', index=False)

In [17]:

ml_model_df.to_csv(my_datetime+'_311_ml_models.csv', index=False)

In [18]:
#create datetime _manifest.txt file with one filename per line
with open(my_datetime+'_manifest.txt', 'w') as f:
    f.write(my_datetime+'_311_cases.csv\n')
    f.write(my_datetime+'_311_predictions.csv\n')
    f.write(my_datetime+'_311_ml_models.csv\n')

In [19]:
#create an export folder
EXPORT_FOLDER = '~/Documents/BODC-DEI-site/database/seeders'
#copy the csv files to the export folder
!cp {my_datetime}_311_cases.csv {EXPORT_FOLDER}
!cp {my_datetime}_311_predictions.csv {EXPORT_FOLDER}
!cp {my_datetime}_311_ml_models.csv {EXPORT_FOLDER}
!cp {my_datetime}_manifest.txt {EXPORT_FOLDER}



** Copy the files to the production server **

In [20]:
import os

# Define constants for servers
PROD_USER = 'u353344964'
PROD_HOSTNAME = '195.179.236.61'
PORT_NUMBER = 65002
PROD_EXPORT_FOLDER = '/home/u353344964/domains/bodc-dei.org/laravel/database/seeders'
STAGE_EXPORT_FOLDER = '/home/u353344964/domains/bodc-dei.org/stagelaravel/database/seeders'

def scp_to_server(filename, user=PROD_USER, hostname=PROD_HOSTNAME, port=PORT_NUMBER, export_folder=PROD_EXPORT_FOLDER):
    """Copy a file to the server using scp."""
    command = f"scp -P {port} {filename} {user}@{hostname}:{export_folder}"
    print(f"Executing: {command}")
    os.system(command)

# Use the function to scp files
files_to_copy = [
    f"{my_datetime}_311_cases.csv",
    f"{my_datetime}_311_predictions.csv",
    f"{my_datetime}_311_ml_models.csv",
    f"{my_datetime}_manifest.txt"
]

# Control where to copy
copy_to_prod = True
copy_to_stage = True

for file in files_to_copy:
    if copy_to_prod:
        scp_to_server(file, export_folder=PROD_EXPORT_FOLDER)
    if copy_to_stage:
        scp_to_server(file, export_folder=STAGE_EXPORT_FOLDER)


Executing: scp -P 65002 2023-09-13_10-51-36_311_cases.csv u353344964@195.179.236.61:/home/u353344964/domains/bodc-dei.org/laravel/database/seeders
Executing: scp -P 65002 2023-09-13_10-51-36_311_cases.csv u353344964@195.179.236.61:/home/u353344964/domains/bodc-dei.org/stagelaravel/database/seeders
Executing: scp -P 65002 2023-09-13_10-51-36_311_predictions.csv u353344964@195.179.236.61:/home/u353344964/domains/bodc-dei.org/laravel/database/seeders
Executing: scp -P 65002 2023-09-13_10-51-36_311_predictions.csv u353344964@195.179.236.61:/home/u353344964/domains/bodc-dei.org/stagelaravel/database/seeders
Executing: scp -P 65002 2023-09-13_10-51-36_311_ml_models.csv u353344964@195.179.236.61:/home/u353344964/domains/bodc-dei.org/laravel/database/seeders
Executing: scp -P 65002 2023-09-13_10-51-36_311_ml_models.csv u353344964@195.179.236.61:/home/u353344964/domains/bodc-dei.org/stagelaravel/database/seeders
Executing: scp -P 65002 2023-09-13_10-51-36_manifest.txt u353344964@195.179.236.61: