# Boston 311 Tutorial

This notebook walks you through the basic usage of this package: training three models on the Boston 311 data and using them to predict the outcome of cases from the last 30 days.

In [1]:
! pip install ../

Defaulting to user installation because normal site-packages is not writeable
Processing /home/briarmoss/Documents/Boston_311
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: boston311
  Building wheel for boston311 (pyproject.toml) ... [?25ldone
[?25h  Created wheel for boston311: filename=boston311-0.1.0-py3-none-any.whl size=20043 sha256=034a653faffa2e42fceacc1627ce91ef59c15cb98dbe9205ece84f56fd45b519
  Stored in directory: /tmp/pip-ephem-wheel-cache-vgl1uhlg/wheels/3d/69/ee/0a6ac96b9c09c948fc0e74f2724a9703aa39749a41fa757c9e
Successfully built boston311
Installing collected packages: boston311
  Attempting uninstall: boston311
    Found existing installation: boston311 0.1.0
    Uninstalling boston311-0.1.0:
      Successfully uninstalled boston311-0.1.0
Successfully installed boston311-0.1.0


## Import the Boston 311 model classes

In [2]:
import os

In [3]:
from boston311 import Boston311LogReg, Boston311EventDecTree, Boston311SurvDecTree


2023-09-19 18:10:01.088145: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-19 18:10:01.494528: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-19 18:10:01.496286: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Get the latest file URLs and current date ranges

In [4]:
# Ask the package for the current 311 data file URLs; the result is a dict
# keyed by year string (e.g. '2023') whose values are CSV download URLs.
latest_URLS = Boston311LogReg.get311URLs()

In [5]:
# Show the year -> URL mapping that is later passed to the models as files_dict.
print(latest_URLS)

{'2023': 'https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/e6013a93-1321-4f2a-bf91-8d8a02f1e62f/download/tmp4i_c_2hr.csv', '2022': 'https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/81a7b022-f8fc-4da5-80e4-b160058ca207/download/tmpfm8veglw.csv', '2021': 'https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/f53ebccd-bc61-49f9-83db-625f209c95f5/download/tmp88p9g82n.csv', '2020': 'https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/6ff6a6fd-3141-4440-a880-6f60a37fe789/download/tmpcv_10m2s.csv', '2019': 'https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/ea2e4696-4a2d-429c-9807-d02eb92e0222/download/tmpcje3ep_w.csv', '2018': 'https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/2be28d90-3a90-4af1-a3f6-f28c1e25880a/download/tmp7602cia8.csv', '2017': 'https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/300221

In [6]:
from datetime import datetime, timedelta
# Read the clock exactly once and derive every date string from that single
# instant; a second clock read could fall on the other side of midnight and
# make "tomorrow" disagree with "today".
now = datetime.now()
thirty_days = timedelta(days=30)
thirty_days_ago = now - thirty_days
today_datestring = now.strftime("%Y-%m-%d")
# Used both as the end of the training window and the start of the prediction window.
thirty_days_ago_datestring = thirty_days_ago.strftime("%Y-%m-%d")
# Used as the end of the prediction window.
tomorrow_datestring = (now + timedelta(days=1)).strftime('%Y-%m-%d')

print(today_datestring, thirty_days_ago_datestring, tomorrow_datestring)

2023-09-19 2023-08-20 2023-09-20


In [7]:
# Root folder for saved models; save_model_to_dir() joins a per-model
# sub-folder name onto this path.
MODEL_FOLDER = './daily_models'

## Define several models

In [17]:
from typing import List, Dict, Any
from itertools import combinations

def define_iteration_scenario(feature_columns: List[str], time_spans: List[Dict[str, str]]) -> Dict[str, Any]:
    """Bundle the inputs of a model sweep into one scenario dictionary.

    Args:
        feature_columns: Names of the feature columns to combine.
        time_spans: Training windows, each a dict with 'start' and 'end' keys.

    Returns:
        A dict with the keys 'feature_columns' and 'time_spans' holding the
        two arguments unchanged (the same objects, not copies).
    """
    return dict(feature_columns=feature_columns, time_spans=time_spans)

def generate_models(scenario: Dict[str, Any], latest_urls: Dict[str, str], model_class,
                    predict_date_range: Dict[str, str] = None):
    """Instantiate one model per (feature-column pair, training time span).

    Args:
        scenario: Dict with a 'feature_columns' list and a 'time_spans' list
            of {'start': ..., 'end': ...} dicts, as built by
            define_iteration_scenario().
        latest_urls: Passed to every model as ``files_dict``.
        model_class: Class to instantiate. It is called with the keyword
            arguments train_date_range, predict_date_range, feature_columns,
            scenario and files_dict.
        predict_date_range: Optional {'start': ..., 'end': ...} dict used as
            the prediction window of every model. When omitted, the notebook
            globals ``thirty_days_ago_datestring`` and ``tomorrow_datestring``
            are used, so the date cell must have been run first.

    Returns:
        Dict mapping a model name to the model instance. The name is
        '<ClassName>_<feature1>_<feature2>'. When the scenario contains more
        than one time span, '_<start>_<end>' is appended so that models
        trained on different spans do not overwrite each other.
    """
    if predict_date_range is None:
        predict_date_range = {'start': thirty_days_ago_datestring, 'end': tomorrow_datestring}

    time_spans = scenario['time_spans']
    # Only disambiguate by time span when needed, so single-span sweeps keep
    # their short '<ClassName>_<feature1>_<feature2>' keys.
    multiple_spans = len(time_spans) > 1

    generated_models = {}

    # Iterate through all paired combinations of feature columns
    for feature_pair in combinations(scenario['feature_columns'], 2):

        # Iterate through all time spans
        for time_span in time_spans:
            train_start = time_span['start']
            train_end = time_span['end']

            model = model_class(
                train_date_range={'start': train_start, 'end': train_end},
                # Copy so that no two models share one mutable dict.
                predict_date_range=dict(predict_date_range),
                feature_columns=list(feature_pair),
                scenario={'dropColumnValues': {'source': ['City Worker App', 'Employee Generated']}, 'survivalTimeMin': 0},  # Adjust as needed
                files_dict=latest_urls
            )

            # Key: class name + feature pair (+ time span when there are several)
            key = f'{model_class.__name__}_{feature_pair[0]}_{feature_pair[1]}'
            if multiple_spans:
                key = f'{key}_{train_start}_{train_end}'
            generated_models[key] = model

    return generated_models

# Feature columns to sweep over; generate_models() builds one model per pair of them.
# A wider sweep could use: ['type', 'queue', 'source', 'subject', 'reason', 'department', 'ward_number']

feature_columns_to_use = ['type', 'queue', 'ward_number'] 

time_spans_to_use = [
    {'start': '2022-01-01', 'end': thirty_days_ago_datestring},
]  # training window(s): from 2022-01-01 up to thirty_days_ago_datestring

# Define the iteration scenario
iteration_scenario = define_iteration_scenario(feature_columns_to_use, time_spans_to_use)

# Generate the models (swap `Boston311LogReg` for another imported model class to sweep that one instead).
# latest_URLS is the year -> URL dict fetched earlier in the notebook.
generated_models = generate_models(iteration_scenario, latest_urls=latest_URLS, model_class=Boston311LogReg)

# `generated_models` now maps a model name to a model instance for the scenario above.

# Print the model names (dictionary keys)
print(generated_models.keys())

# Last expression of the cell: displays how many models were generated.
len(generated_models) 


dict_keys(['Boston311LogReg_type_queue', 'Boston311LogReg_type_ward_number', 'Boston311LogReg_queue_ward_number'])


3

In [19]:
#get current datetime in Boston timezone as string
from datetime import datetime
from pytz import timezone
import pytz
# Take one timezone-aware reading of the current Boston time and format both
# the date and the time part from it, so the two halves of the stamp can never
# straddle midnight.
boston = timezone('US/Eastern')
now = datetime.now(boston)
today_datestring = now.strftime("%Y-%m-%d")
# Time of day with '-' separators so the string is safe to use in a filename.
time_string = now.strftime("%H-%M-%S")
# Combined stamp of the form YYYY-MM-DD_HH-MM-SS.
my_datetime = today_datestring + '_' + time_string

## Train all the models

In [20]:
# Train every generated model in turn, announcing each one before and after.
for name, model in generated_models.items():
    print(f"training model: {name}")
    model.run_pipeline()
    print(f"Done training model: {name}")
    

training model: Boston311LogReg_type_queue


  df = pd.read_csv(file)
  df = pd.read_csv(file)


Files with different number of columns from File 0:  []
Files with same number of columns as File 0:  [0, 1]
Files with different column order from File 0:  []
Files with same column order as File 0:  [0, 1]
Starting Training at 2023-09-19 18:41:14.423223


2023-09-19 18:41:19.876919: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 91104360 exceeds 10% of free system memory.


Epoch 1/10

2023-09-19 18:42:22.052893: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 22776260 exceeds 10% of free system memory.


Epoch 2/10

KeyboardInterrupt: 

In [None]:
import gc
# Trigger a garbage-collection pass now; as the last expression of the cell,
# the value returned by gc.collect() is displayed.
gc.collect()

In [None]:
# Call predict() on every generated model, announcing each one before and after.
for name, model in generated_models.items():
    print(f"predicting model: {name}")
    model.predict()
    print(f"Done predicting model: {name}")

In [None]:
import datetime

def save_model_to_dir(model, folder_name):
    """Save ``model`` into ``MODEL_FOLDER/<folder_name>`` under a timestamped name.

    Args:
        model: Object exposing a ``model_type`` string attribute and a
            ``save(dir_path, model_name, properties_name)`` method.
        folder_name: Name of the sub-folder of ``MODEL_FOLDER`` to save into.

    The target folder, including ``MODEL_FOLDER`` itself, is created if it is
    missing. The file name is '<YYYYmmdd_HHMMSS>_<model.model_type>' and is
    passed to ``model.save`` as both the model name and the properties name.
    """
    # Imported locally under an alias: elsewhere in the notebook the global
    # name `datetime` is bound to the class in some cells and to the module in
    # others, so the function must not depend on which cell ran last.
    from datetime import datetime as _datetime

    dir_path = os.path.join(MODEL_FOLDER, folder_name)

    # makedirs also creates MODEL_FOLDER on first use, and exist_ok avoids the
    # check-then-create race of os.path.exists() + os.mkdir().
    os.makedirs(dir_path, exist_ok=True)

    timestamp = _datetime.now().strftime('%Y%m%d_%H%M%S')
    model_name = timestamp + "_" + model.model_type
    properties_name = model_name

    model.save(dir_path, model_name, properties_name)

# Save every generated model into its own sub-folder of MODEL_FOLDER, using
# the model's dictionary key (class name + feature pair) as the folder name.
# The previous `models = [linear_tree_model, ...]` list referenced names that
# are never defined in this notebook and was never used, so it has been removed.
for model_name, model in generated_models.items():
    save_model_to_dir(model, model_name)
