# Boston 311 Tutorial

This notebook walks you through the basic usage of this package: training 3 models on the Boston 311 data and using them to predict the outcome of cases from the last 30 days.

In [1]:
! pip install ../

Defaulting to user installation because normal site-packages is not writeable
Processing /home/briarmoss/Documents/Boston_311


  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hBuilding wheels for collected packages: boston311
  Building wheel for boston311 (pyproject.toml) ... [?25ldone
[?25h  Created wheel for boston311: filename=boston311-0.2.0-py3-none-any.whl size=24563 sha256=2659d203cbbbc6d97065f86a63e3b14c88c67222a5aa84e4ffe1ae786ebb8c5f
  Stored in directory: /tmp/pip-ephem-wheel-cache-yh3vj_b_/wheels/3d/69/ee/0a6ac96b9c09c948fc0e74f2724a9703aa39749a41fa757c9e
Successfully built boston311
Installing collected packages: boston311
  Attempting uninstall: boston311
    Found existing installation: boston311 0.2.0
    Uninstalling boston311-0.2.0:
      Successfully uninstalled boston311-0.2.0
Successfully installed boston311-0.2.0


## Import the Boston311KerasNN class

In [2]:
import os
import pandas as pd
import numpy as np
import pickle
import re
import sys
import time

In [3]:
from boston311 import Boston311KerasNN


2023-11-02 17:57:54.124904: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-02 17:57:54.172016: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-02 17:57:54.173239: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Using TensorFlow backend


## Load extra features

In [4]:
# Fetch the three date strings from the model class; tomorrow_datestring is used
# below as the end of the prediction date range.
# NOTE(review): presumably these are "YYYY-MM-DD" strings for today, tomorrow and
# thirty days ago — confirm against Boston311KerasNN.get_datestrings().
today_datestring, tomorrow_datestring, thirty_days_ago_datestring = Boston311KerasNN().get_datestrings()

In [5]:
import glob
import os

def get_latest_model_files(model_dir="./daily_models/Boston311KerasNN/"):
    """Return the most recently modified metadata and model files in ``model_dir``.

    The ``*.json`` file and the ``*.h5`` / ``*.keras`` file are each chosen by
    modification time, independently of one another.

    Args:
        model_dir: Directory to search (not recursive).

    Returns:
        A ``(json_path, model_path)`` tuple, or ``(None, None)`` when either
        kind of file is missing.
    """
    def newest(*patterns):
        # Collect every match for the given glob patterns, in pattern order,
        # and pick the one with the largest modification time.
        candidates = []
        for pattern in patterns:
            candidates.extend(glob.glob(os.path.join(model_dir, pattern)))
        if not candidates:
            return None
        return max(candidates, key=os.path.getmtime)

    latest_json = newest("*.json")
    latest_model = newest("*.h5", "*.keras")

    if latest_json is None or latest_model is None:
        return None, None
    return latest_json, latest_model

# Locate the newest saved model files and load them into a Boston311KerasNN instance.
json_file, model_file = get_latest_model_files()
print(json_file)

if not (json_file and model_file):
    # Every later cell uses KerasNN_model, so stop here with a clear message
    # instead of letting a NameError surface further down.
    raise FileNotFoundError(
        "No saved Boston311KerasNN model found: expected a .json file and a "
        ".h5/.keras file in ./daily_models/Boston311KerasNN/"
    )

KerasNN_model = Boston311KerasNN()
KerasNN_model.load(json_file, model_file)
# Set the end of the prediction date range to tomorrow.
KerasNN_model.predict_date_range['end'] = tomorrow_datestring


./daily_models/Boston311KerasNN/2023-11-01_23-02-16_Boston311KerasNN.json


In [6]:


pickle_file = 'dataframe.pkl'
EXTRA_mydata_FILE = './cls_and_pooled_embeddings_with_three_cols.csv'

# Embedding columns that are parsed (when loaded raw) and flattened below.
column_names = ['desc_cls_embedding', 'desc_pooled_embedding', 'name_cls_embedding', 'name_pooled_embedding', 'code_cls_embedding', 'code_pooled_embedding']

# NOTE(review): pkl_load_data presumably reads the pickle when it exists and
# falls back to the CSV otherwise — confirm in Boston311KerasNN.
df = KerasNN_model.pkl_load_data(EXTRA_mydata_FILE, pickle_file)

# A 'service_request_id' column means the frame has not been cleaned yet:
# rename the id column, keep only numeric ids, parse the embeddings and
# write the cleaned frame back to the pickle file.
if 'service_request_id' in df.columns:
    # literal_eval was previously used without being imported (NameError).
    from ast import literal_eval

    df = df.rename(columns={'service_request_id': 'case_enquiry_id'})

    df['case_enquiry_id'] = df['case_enquiry_id'].astype(str)
    is_numeric = df['case_enquiry_id'].str.isnumeric()
    # .copy() so the assignments below act on an independent frame rather than
    # on a slice of the original (avoids SettingWithCopyWarning).
    df = df[is_numeric].copy()
    df['case_enquiry_id'] = df['case_enquiry_id'].astype('int64')
    for col in column_names:
        # Each cell holds the text form of a list; turn it into a NumPy array.
        df[col] = df[col].apply(literal_eval).apply(np.array)

    # Context manager guarantees the pickle file is flushed and closed.
    with open(pickle_file, "wb") as f:
        pickle.dump(df, f)

df = KerasNN_model.flatten_and_replace_columns(df, column_names)



In [7]:
# Load the 311 case records through the model's loader.
# NOTE(review): pkl_load_data presumably prefers the pickle and falls back to
# the CSV — confirm in Boston311KerasNN.
case_data_file = 'case_data.pkl'
case_data_csv = 'all_311_cases.csv'
data = KerasNN_model.pkl_load_data(case_data_csv, case_data_file)

In [8]:

# Run the loaded model on the case data together with the flattened embedding features.
# prediction_data is treated below as row-aligned with predictions (its
# 'case_enquiry_id' column is paired with the prediction rows).
predictions, prediction_data = KerasNN_model.predict(api_data=df, data=data)

columns in data before drop: Index(['case_enquiry_id', 'open_dt', 'sla_target_dt', 'closed_dt', 'on_time',
       'case_status', 'closure_reason', 'case_title', 'subject', 'reason',
       'type', 'queue', 'department', 'submitted_photo', 'closed_photo',
       'location', 'fire_district', 'pwd_district', 'city_council_district',
       'police_district', 'neighborhood', 'neighborhood_services_district',
       'ward', 'precinct', 'location_street_name', 'location_zipcode',
       'latitude', 'longitude', 'geom_4326', 'source', 'survival_time',
       'event', 'ward_number', 'survival_time_hours'],
      dtype='object')
columns to drop: Index(['case_status', 'case_title', 'city_council_district', 'closed_dt',
       'closed_photo', 'closure_reason', 'fire_district', 'geom_4326',
       'latitude', 'location', 'location_street_name', 'location_zipcode',
       'longitude', 'neighborhood', 'neighborhood_services_district',
       'on_time', 'open_dt', 'police_district', 'precinct', 'pwd_

  data = pd.concat([data, fake_df], ignore_index=True)




In [9]:
def array_to_string(arr):
    """Return the elements of ``arr`` as one space-separated string."""
    pieces = [str(value) for value in arr]
    return ' '.join(pieces)

# Convert every row of the 2-D predictions array into one space-separated string.
# np.apply_along_axis is deliberately not used here: it sizes the output dtype
# from the first result, so any later row whose string is longer than the first
# row's string would be silently truncated.
string_predictions = np.array([array_to_string(row) for row in predictions])

# string_predictions is a 1-D NumPy array where each element is a string that
# contains all the elements from the corresponding row of the original 2-D array.
print(string_predictions)

['0.15627643 0.03100016 0.023550667 0.020688528 0.015493173 0.013712872 0.010438199 0.0096393 0.0106205465 0.009338259 0.009138457 0.008840337 0.006205858 0.0072266445 0.0077862125 0.008382641 0.0078073516 0.011605649 0.011434967 0.009867816 0.012219369 0.011233026 0.012490428 0.0113552725 0.011014786 0.012149461 0.008198183 0.014184383 0.008804067 0.008600514 0.010010797 0.008084168 0.010017625 0.010326682 0.0070346775 0.0071070036 0.009006931 0.011108044 0.01124231 0.00708337 0.007313233 0.0074981693 0.006244793 0.005870699 0.0046499884 0.004489269 0.0070927795 0.005728246 0.006262879 0.005694479 0.007364902 0.008647736 0.0044945637 0.0069812047 0.0034549288 0.0047859508 0.0061486913 0.0055349832 0.005438303 0.00658558 0.2893934'
 '0.74046886 0.14088587 0.049202587 0.017766068 0.0074478155 0.0044607567 0.0046350723 0.0042094686 0.0022749554 0.0025527165 0.0010371511 0.0016099628 0.0031829132 0.0031333382 0.0015327712 0.0014159062 0.0041547194 0.0037713738 0.002255414 0.00029505853 0.

In [10]:
# Case identifiers of the predicted rows; paired with the prediction strings in the next cell.
case_enquiry_id = prediction_data['case_enquiry_id']

In [11]:
# Combine the case ids and their prediction strings into a two-column DataFrame.
predictions_df = pd.DataFrame({'case_enquiry_id':case_enquiry_id, 'prediction':string_predictions})

In [12]:
import pandas as pd

# The model name is the metadata file name without directory or extension;
# its first 10 characters are the YYYY-MM-DD date prefix.
model_name = os.path.basename(json_file).split('.')[0]
ml_model_date = model_name[:10]

# Read the raw JSON text of the model metadata so it can be exported with the model row.
with open(json_file, 'r') as f:
    ml_model_json = f.read()

# Build the single-row frame directly. Concatenating onto an empty DataFrame is
# deprecated in recent pandas and adds nothing here.
ml_model_df = pd.DataFrame(
    [{'ml_model_name': model_name,
      'ml_model_type': KerasNN_model.model_type,
      'ml_model_date': ml_model_date,
      'ml_model_json': ml_model_json}],
    columns=['ml_model_name', 'ml_model_type', 'ml_model_date', 'ml_model_json'],
)

print(ml_model_df)

                          ml_model_name     ml_model_type ml_model_date  \
0  2023-11-01_23-02-16_Boston311KerasNN  Boston311KerasNN    2023-11-01   

                                       ml_model_json  
0  {"feature_columns": ["queue", "subject", "reas...  


In [13]:
# Case rows to export: the prediction data without the four columns listed below.
all_model_cases = prediction_data.drop(
    columns=['geom_4326', 'survival_time_hours', 'survival_time', 'event']
).copy()

In [14]:

# Work on a copy so the columns added to all_model_predictions in later cells
# do not also appear on predictions_df (plain assignment only creates an alias).
all_model_predictions = predictions_df.copy()

In [15]:
# Tag every prediction row with the name of the model that was loaded.
all_model_predictions['ml_model_name'] = model_name

In [21]:
# %%
# Timestamp in Boston local time, used to tag the predictions and name the export files.
from datetime import datetime
from pytz import timezone
import pytz
boston = timezone('US/Eastern')
# Read the clock once so the date and time parts come from the same instant.
# Two separate now() calls could straddle midnight and give a mismatched pair.
now = datetime.now(boston)
today_datestring = now.strftime("%Y-%m-%d")
time_string = now.strftime("%H-%M-%S")
# Combined "YYYY-MM-DD_HH-MM-SS" prefix for the export file names.
my_datetime = today_datestring + '_' + time_string

In [22]:
# Stamp every prediction row with today_datestring (the Boston-time date set in the timestamp cell).
all_model_predictions['prediction_date'] = today_datestring

In [23]:


# %%
# Write the three export tables to CSV files prefixed with the run timestamp.
for frame, suffix in (
    (all_model_cases, '_311_cases.csv'),
    (all_model_predictions, '_311_predictions.csv'),
    (ml_model_df, '_311_ml_models.csv'),
):
    frame.to_csv(my_datetime + suffix, index=False)

# %%
# Write <my_datetime>_manifest.txt listing the exported CSV file names, one per line.
manifest_entries = [
    my_datetime + '_311_cases.csv',
    my_datetime + '_311_ml_models.csv',
    my_datetime + '_311_predictions.csv',
]
with open(my_datetime + '_manifest.txt', 'w') as f:
    f.writelines(entry + '\n' for entry in manifest_entries)

# %%
# scp invocation and remote destination used to copy the export files to another host.
# NOTE(review): the private-key path and the destination user/host/path are
# hardcoded for one machine — consider moving them to configuration.

SCP_COMMAND = "scp -i /home/briarmoss/.ssh/id_rsa_new" 
EXPORT_FOLDER = "briarmoss@10.0.0.81:/home/briarmoss/Documents/BODC-DEI-site/database/seeders/"

# Copy the three CSV files and the manifest to EXPORT_FOLDER.
# The exit status of these shell commands is not checked.
! {SCP_COMMAND} {my_datetime}_311_cases.csv {EXPORT_FOLDER}
! {SCP_COMMAND} {my_datetime}_311_predictions.csv {EXPORT_FOLDER}
! {SCP_COMMAND} {my_datetime}_311_ml_models.csv {EXPORT_FOLDER}
! {SCP_COMMAND} {my_datetime}_manifest.txt {EXPORT_FOLDER}



# %% [markdown]
# **Copy the files to the production server**

# %%
# Connection details and remote folders for the production and staging sites.
PROD_USER = 'u353344964'
PROD_HOSTNAME = '195.179.236.61'
PORT_NUMBER = 65002

# Both Laravel installations share one domain folder; the seeders folder sits
# at the same relative path inside each installation.
_DOMAIN_ROOT = '/home/u353344964/domains/bodc-dei.org'
_SEEDERS_SUBDIR = '/database/seeders'

PROD_BASE_FOLDER = _DOMAIN_ROOT + '/laravel'
STAGE_BASE_FOLDER = _DOMAIN_ROOT + '/stagelaravel'
PROD_EXPORT_FOLDER = PROD_BASE_FOLDER + _SEEDERS_SUBDIR
STAGE_EXPORT_FOLDER = STAGE_BASE_FOLDER + _SEEDERS_SUBDIR

# %%







2023-11-02_19-38-06_311_cases.csv             100% 8213KB   1.8MB/s   00:04    
2023-11-02_19-38-06_311_predictions.csv       100%   12MB   1.7MB/s   00:07    
2023-11-02_19-38-06_311_ml_models.csv         100% 9917     1.8MB/s   00:00    
2023-11-02_19-38-06_manifest.txt              100%  112    40.8KB/s   00:00    


In [26]:
import os


def scp_to_server(filename, user=PROD_USER, hostname=PROD_HOSTNAME, port=PORT_NUMBER, export_folder=PROD_EXPORT_FOLDER):
    """Copy a local file to a folder on a remote server using ``scp``.

    The command is run as an argument list (no shell), so file names with
    spaces or shell metacharacters are passed through unchanged.

    Args:
        filename: Path of the local file to copy.
        user: Remote login name.
        hostname: Remote host name or address.
        port: Remote SSH port (passed to ``scp -P``).
        export_folder: Destination folder on the remote host.

    Raises:
        subprocess.CalledProcessError: If ``scp`` exits with a non-zero status.
            Previously a failed copy was ignored and the run carried on.
        FileNotFoundError: If the ``scp`` executable is not available.
    """
    import subprocess  # local import keeps this cell self-contained

    args = ["scp", "-P", str(port), str(filename), f"{user}@{hostname}:{export_folder}"]
    print(f"Executing: {' '.join(args)}")
    subprocess.run(args, check=True)

# Files produced by the export step: three CSVs plus the manifest.
files_to_copy = [
    f"{my_datetime}{suffix}"
    for suffix in ("_311_cases.csv", "_311_ml_models.csv", "_311_predictions.csv", "_manifest.txt")
]

# Control where to copy
copy_to_prod = True
copy_to_stage = False

# Remote seeder folders selected by the flags above, production first.
export_targets = [
    target
    for enabled, target in (
        (copy_to_prod, PROD_EXPORT_FOLDER),
        (copy_to_stage, STAGE_EXPORT_FOLDER),
    )
    if enabled
]

for file in files_to_copy:
    for export_target in export_targets:
        scp_to_server(file, export_folder=export_target)


# %%
# Bare expression: displays the configured SSH port as the cell output.
PORT_NUMBER

Executing: scp -P 65002 2023-11-02_19-38-06_311_cases.csv u353344964@195.179.236.61:/home/u353344964/domains/bodc-dei.org/laravel/database/seeders
Executing: scp -P 65002 2023-11-02_19-38-06_311_ml_models.csv u353344964@195.179.236.61:/home/u353344964/domains/bodc-dei.org/laravel/database/seeders
Executing: scp -P 65002 2023-11-02_19-38-06_311_predictions.csv u353344964@195.179.236.61:/home/u353344964/domains/bodc-dei.org/laravel/database/seeders
Executing: scp -P 65002 2023-11-02_19-38-06_manifest.txt u353344964@195.179.236.61:/home/u353344964/domains/bodc-dei.org/laravel/database/seeders


65002

In [27]:

if copy_to_prod:
    ! ssh -p {PORT_NUMBER} {PROD_USER}@{PROD_HOSTNAME} 'cd {PROD_BASE_FOLDER}; php artisan db:seed --class=ThreeOneOneSeeder'

if copy_to_stage:
    ! ssh -p {PORT_NUMBER} {PROD_USER}@{PROD_HOSTNAME} 'cd {STAGE_BASE_FOLDER}; php artisan db:seed --class=ThreeOneOneSeeder'


   INFO  Seeding database.  


Manifest files:
/home/u353344964/domains/bodc-dei.org/laravel/database/seeders/2023-11-02_19-38-06_manifest.txt

Total records to process: 30919

Processing /home/u353344964/domains/bodc-dei.org/laravel/database/seeders/2023-11-02_19-38-06_311_cases.csv
[6A[K100 App\Models\ThreeOneOneCase records processed.
[KRecords remaining in this file: 15359.
[KTotal records remaining: 30819.
[KTime for last 100 records: 0.02 seconds.
[KEstimated time remaining for this file: 2 seconds.
[KEstimated time for all files: 5 seconds.
[6A[K200 App\Models\ThreeOneOneCase records processed.
[KRecords remaining in this file: 15259.
[KTotal records remaining: 30719.
[KTime for last 100 records: 0.01 seconds.
[KEstimated time remaining for this file: 1 second.
[KEstimated time for all files: 2 seconds.
[6A[K300 App\Models\ThreeOneOneCase records processed.
[KRecords remaining in this file: 15159.
[KTotal records remaining: 30619.
[KTime for last 100 records: