# Boston 311 Tutorial

This notebook walks you through the basic usage of this package: training four models on the Boston 311 data and using them to predict the outcome of cases from the last 30 days.

In [None]:
#The lifelines library is only needed for the Cox Regression Model, which is not included in this tutorial
#The keras-tuner library is needed for the neural network model
#The seaborn library is needed for visualization of results at the end
! pip install -q lifelines keras-tuner seaborn

In [None]:
! pip install git+https://github.com/mindfulcoder49/Boston_311.git

#! pip install . 

## Import the Boston311Model classes

In [None]:
import gc

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from keras_tuner import HyperParameters

In [None]:
from boston311 import Boston311LogReg, Boston311EventDecTree, Boston311SurvDecTree, Boston311KerasNN

## Get latest file URLs, Data, and Current Date Ranges

In [None]:
# Fetch the 311 file URLs once; the result is reused as `files_dict` in the model definitions below.
# A throwaway Boston311LogReg instance is created only to reach the get311URLs() helper.
latest_URLS = Boston311LogReg().get311URLs()

In [None]:
# Show what get311URLs() returned.
print(latest_URLS)

In [None]:
# Date strings used below for the train/predict date ranges and for the survivalTimeFill scenario value.
today_datestring, tomorrow_datestring, thirty_days_ago_datestring = Boston311LogReg().get_datestrings()

## Define several models

In [None]:
# Survival decision tree. Despite the variable name, this is a Boston311SurvDecTree.
# train_date_range runs from 2023-01-01 to thirty days ago; predict_date_range covers the last thirty days.
# Scenario: presumably drops rows whose 'source' is one of the two listed values, and uses tomorrow's
# date string as survivalTimeFill (how these keys are applied is defined in boston311 — confirm there).
linear_tree_model = Boston311SurvDecTree(train_date_range={'start':'2023-01-01','end':thirty_days_ago_datestring},
                            predict_date_range={'start':thirty_days_ago_datestring,'end':today_datestring},
                            feature_columns=['type','queue'],
                            scenario={'dropColumnValues': {'source':['City Worker App', 'Employee Generated']},
                                      'survivalTimeMin':0,
                                      'survivalTimeFill':tomorrow_datestring},
                            files_dict=latest_URLS)

# Arguments for generate_time_bins_fixed_interval(); presumably 48-hour-wide bins out to 120 days — confirm.
hour_interval = 48
max_days = 120

# Build the bin edges first, then derive the bin labels from those edges.
linear_tree_model.bin_edges = linear_tree_model.generate_time_bins_fixed_interval(hour_interval, max_days)
linear_tree_model.bin_labels = linear_tree_model.generate_bin_labels(linear_tree_model.bin_edges)

In [None]:
# Logistic regression model with the same date ranges, feature columns and files as the survival tree.
# Its scenario has dropColumnValues and survivalTimeMin but no survivalTimeFill entry.
logistic_model = Boston311LogReg(train_date_range={'start':'2023-01-01','end':thirty_days_ago_datestring},
                            predict_date_range={'start':thirty_days_ago_datestring,'end':today_datestring},
                            feature_columns=['type', 'queue'],
                            scenario={'dropColumnValues': {'source':['City Worker App', 'Employee Generated']},
                                      'survivalTimeMin':0},
                            files_dict=latest_URLS)

In [None]:
# Event decision tree (Boston311EventDecTree), configured exactly like the logistic regression model:
# same date ranges, feature columns, scenario (no survivalTimeFill) and files.
logistic_tree_model = Boston311EventDecTree(train_date_range={'start':'2023-01-01','end':thirty_days_ago_datestring},
                            predict_date_range={'start':thirty_days_ago_datestring,'end':today_datestring},
                            feature_columns=['type', 'queue'],
                            scenario={'dropColumnValues': {'source':['City Worker App', 'Employee Generated']},
                                      'survivalTimeMin':0},
                            files_dict=latest_URLS)

## Load the data once and store it for use on all the models

In [None]:
# Load the data once; the two cells below derive the prediction and training sets from this single load.
# NOTE(review): no files_dict is passed here, unlike the model definitions above — confirm that the
# class default points at the intended files.
all_2023_data = Boston311LogReg(train_date_range={'start':'2023-01-01','end':'2023-12-31'}).load_data()

In [None]:
# Prepare the prediction set from the shared load (train_or_predict='predict').
# The same object is later passed to predict() on every model.
prediction_data_2023 = linear_tree_model.load_data(data=all_2023_data, train_or_predict='predict')    

In [None]:
# Prepare the set later passed to run_pipeline() on every model. train_or_predict is left at the
# method's default (presumably the training mode — confirm).
# NOTE(review): this is prepared by linear_tree_model, whose scenario sets survivalTimeFill, and is then
# reused by models whose scenario omits survivalTimeFill — confirm that sharing it is intended.
case_data_2023 = linear_tree_model.load_data(data=all_2023_data)

## Train several models

In [None]:
# Run the event decision tree's pipeline on the shared case data (the training step of this section).
logistic_tree_model.run_pipeline(data=case_data_2023)

In [None]:
# Predict with the event decision tree on the shared prediction set.
logistic_tree_prediction = logistic_tree_model.predict(data=prediction_data_2023)

In [None]:
# How many cases received each distinct event_prediction value.
logistic_tree_prediction['event_prediction'].value_counts()

In [None]:
# First 100 prediction rows whose 'event' column equals 0.
logistic_tree_prediction.loc[lambda frame: frame['event'] == 0].head(100)

In [None]:
# Save the event decision tree. The arguments are '.' plus two name stems — presumably the target
# directory, the model file name and the properties file name (confirm against boston311's save()).
logistic_tree_model.save('.','logtree','logtreeproperties')

In [None]:
# Run the logistic regression pipeline on the same shared case data.
logistic_model.run_pipeline(data=case_data_2023)

In [None]:
# Save the logistic regression model, with the same argument layout as the event decision tree's save() call.
logistic_model.save('.','logreg','logregproperties')

In [None]:
# Predict with the logistic regression model on the shared prediction set.
logistic_prediction = logistic_model.predict(data=prediction_data_2023)

In [None]:
# Number of cases whose event_prediction is below 0.5.
int((logistic_prediction['event_prediction'] < .5).sum())

In [None]:
def plot_prediction_distribution(df, column):
    """Plot a 20-bin histogram of one prediction column, with the count written above each bar.

    Args:
        df: DataFrame holding the predictions.
        column: Name of the column in ``df`` to plot.

    Raises:
        ValueError: If ``column`` is not one of ``df.columns``.

    The x axis is fixed to [0, 1], so the column is expected to hold probabilities.
    """
    if column not in df.columns:
        raise ValueError(f"Column '{column}' not found in the dataframe.")

    values = df[column]

    plt.figure(figsize=(10, 6))
    counts, _edges, bars = plt.hist(values, bins=20, alpha=0.6, color='b')

    # Label every bar with its count, centred horizontally and resting on top of the bar.
    for bar_count, bar in zip(counts, bars):
        x_center = bar.get_x() + bar.get_width() / 2
        plt.text(x_center, bar.get_height(), f'{int(bar_count)}', ha='center', va='bottom')

    plt.xlim(0, 1)  # predictions are assumed to be probabilities in [0, 1]
    plt.xlabel('Predicted Value')
    plt.ylabel('Count')
    plt.title('Distribution of Predictions')
    plt.show()

# Example usage: histogram of the logistic regression model's event_prediction values.
plot_prediction_distribution(logistic_prediction, 'event_prediction')


In [None]:
# Run the survival decision tree's pipeline on the shared case data.
linear_tree_model.run_pipeline(data=case_data_2023)

In [None]:
# Predict with the survival decision tree on the shared prediction set.
linear_prediction = linear_tree_model.predict(data=prediction_data_2023)

In [None]:
# Preview the first 20 rows of the survival decision tree's predictions.
linear_prediction.head(20)

In [None]:
# Row count of the survival decision tree predictions (compare with the other models below).
len(linear_prediction)

In [None]:
# Row count of the logistic regression predictions.
len(logistic_prediction)

In [None]:
# Row count of the event decision tree predictions.
len(logistic_tree_prediction)

## Join the tables

In [None]:
# Combine the three models' predictions into one table keyed on case_enquiry_id.
# Both sides of the first merge have an 'event_prediction' column, so pandas' default suffixes apply:
# event_prediction_x comes from the event decision tree (left side), event_prediction_y from logistic
# regression (right side). how="outer" keeps cases that appear in any of the frames.
merged_df = logistic_tree_prediction.merge(logistic_prediction[['case_enquiry_id','event_prediction']], on='case_enquiry_id', how="outer").merge(linear_prediction[['case_enquiry_id','survival_prediction']], on='case_enquiry_id', how="outer")

In [None]:
# Row count after the outer merges.
len(merged_df)

In [None]:
# Preview the merged table.
merged_df.head()

## Save the prediction data

In [None]:
# Write the merged predictions to predictions.csv in the working directory, without the index column.
# NOTE(review): this runs before the neural network's prediction columns are merged into merged_df
# further down, so the CSV holds only the three earlier models' predictions — confirm that is intended.
merged_df.to_csv('predictions.csv', index=False)

In [None]:
# Run a full garbage-collection pass before starting the neural network section.
# gc is imported with the other imports at the top of the notebook.
gc.collect()

# Enter the Neural Network #

In [None]:
# Create the neural network model with default arguments; it is configured attribute by attribute below.
KerasNN_model = Boston311KerasNN()

In [None]:
# Configure the network by attribute assignment, with the same date ranges and scenario values as the
# survival decision tree (including survivalTimeFill) and the same two feature columns in a different order.
KerasNN_model.train_date_range={'start':'2023-01-01','end':thirty_days_ago_datestring}

KerasNN_model.predict_date_range={'start':thirty_days_ago_datestring,'end':today_datestring}

KerasNN_model.feature_columns=['queue', 'type']

KerasNN_model.scenario={'dropColumnValues': {'source':['City Worker App', 'Employee Generated']}, 'survivalTimeMin':0, 'survivalTimeFill':tomorrow_datestring}

# Training length and batch size; epochs is presumably kept small so the tutorial runs quickly.
KerasNN_model.epochs = 2
KerasNN_model.batch_size = 32

In [None]:

# Same bin arguments as the survival decision tree cell earlier in the notebook.
hour_interval = 48
max_days = 120

# Disabled alternative: derive the bins from statistics of a dataframe instead of a fixed interval.
#KerasNN_model.bin_edges = KerasNN_model.generate_time_bins_statistics(df, num_intervals=60)
KerasNN_model.bin_edges = KerasNN_model.generate_time_bins_fixed_interval(hour_interval, max_days)
# NOTE(review): bin_labels is not assigned for this model here (the survival tree cell sets it from
# generate_bin_labels), yet KerasNN_model.bin_labels is read later — presumably the class fills it in; confirm.

start_nodes = 256
end_nodes = 128
# Disabled alternative values for the two settings below.
#l2_0 = 0.00001
#learning_rate = 7.5842e-05
l2_0 = 0.001
learning_rate = 0.0001

# Register every value as a Fixed hyperparameter and hand the container to the model as best_hyperparameters.
hp = HyperParameters()
hp.Fixed('start_nodes', start_nodes)
hp.Fixed('end_nodes', end_nodes)
hp.Fixed('l2_0', l2_0)
hp.Fixed('learning_rate', learning_rate)
# Number of bins = number of edges minus one; used as 'final_layer' (presumably the output layer width).
bin_number = len(KerasNN_model.bin_edges) - 1
hp.Fixed('final_layer', bin_number)
hp.Fixed('final_activation', 'softmax')
KerasNN_model.best_hyperparameters = hp

In [None]:
# Run the neural network's pipeline on the same shared case data as the other models.
KerasNN_model.run_pipeline(data=case_data_2023)

In [None]:
# Unlike the other models' predict() calls above, the result is unpacked into two values: the
# predictions and the accompanying case data (both are used in the next cell).
predictions, data = KerasNN_model.predict(data=prediction_data_2023)

In [None]:
def get_top_predictions(bin_labels, predictions, top_n=3):
    """Return the labels of the ``top_n`` highest-scoring bins for every row.

    Args:
        bin_labels: Sequence of labels, one per column of ``predictions``.
        predictions: 2-D scores, one row per case and one column per bin.
            A NumPy array or any array-like (e.g. a list of lists) is accepted.
        top_n: Number of labels to return per row, best first. Defaults to 3.
            If it exceeds the number of columns, all columns are returned.

    Returns:
        NumPy array of labels with one row per input row and up to ``top_n``
        columns, ordered from highest score to lowest.

    Raises:
        ValueError: If ``len(bin_labels)`` differs from the number of columns
            in ``predictions``, or if ``top_n`` is smaller than 1.
    """
    scores = np.asarray(predictions)

    # Check if the number of columns in predictions matches the length of bin_labels
    if len(bin_labels) != scores.shape[1]:
        raise ValueError("The length of bin_labels must be equal to the number of columns in predictions")
    if top_n < 1:
        raise ValueError("top_n must be at least 1")

    # Sorting the negated scores in ascending order ranks the columns from highest score to lowest.
    top_indices = np.argsort(-scores, axis=1)[:, :top_n]

    # Index the labels with the 2-D index array so the result has the same row/rank layout.
    return np.asarray(bin_labels)[top_indices]

def add_prediction_columns(cases_df, top_predictions):
    """Return a copy of ``cases_df`` with the top three predictions as its first three columns.

    The input frame is left untouched, so the calling cell can be re-run on the
    same data without ``insert`` failing because the columns already exist.

    Args:
        cases_df: DataFrame with one row per case.
        top_predictions: 2-D array-like with one row per case and at least three
            columns (1st, 2nd and 3rd prediction), e.g. the output of
            ``get_top_predictions``.

    Returns:
        A new DataFrame whose columns are '1st_prediction', '2nd_prediction',
        '3rd_prediction' followed by the original columns of ``cases_df``.

    Raises:
        ValueError: If the number of rows in ``cases_df`` and ``top_predictions`` differ.
    """
    top_predictions = np.asarray(top_predictions)

    # Check if the number of rows in the DataFrame matches the number of predictions
    if cases_df.shape[0] != top_predictions.shape[0]:
        raise ValueError("The number of rows in cases_df must be equal to the number of predictions")

    # Insert into a copy so the caller's DataFrame is not modified.
    result = cases_df.copy()

    # insert() (rather than plain assignment) places the new columns on the left side of the frame.
    prediction_columns = ('1st_prediction', '2nd_prediction', '3rd_prediction')
    for position, column in enumerate(prediction_columns):
        result.insert(position, column, top_predictions[:, position])

    return result

# Convert the network's per-bin predictions into the three top-ranked bin labels per case,
# then attach those labels to the case data returned by predict().
prediction_labels = get_top_predictions(KerasNN_model.bin_labels, predictions)
df_predictions = add_prediction_columns(data, prediction_labels)

In [None]:
# Row count of the neural network prediction table.
len(df_predictions)

In [None]:
# Preview the first 100 cases with their three top-ranked predictions in the leftmost columns.
df_predictions.head(100)

In [None]:
# Add the three neural network prediction columns to merged_df by merging on case_enquiry_id.
# Prediction columns left over from an earlier run of this cell are dropped first, so re-running the
# cell does not create suffixed duplicates (1st_prediction_x / 1st_prediction_y, ...).
nn_prediction_cols = ['1st_prediction', '2nd_prediction', '3rd_prediction']
merged_df = (
    merged_df
    .drop(columns=nn_prediction_cols, errors='ignore')
    .merge(df_predictions[['case_enquiry_id'] + nn_prediction_cols], on='case_enquiry_id', how="outer")
)

In [None]:
# Select and view the first 20 records of the specified columns.
# event_prediction_x / event_prediction_y are the suffixed 'event_prediction' columns created by the
# first merge: _x is the event decision tree's prediction, _y the logistic regression's.
columns_to_view = ['1st_prediction', '2nd_prediction', '3rd_prediction', 'survival_prediction', 'event_prediction_x', 'event_prediction_y', 'case_enquiry_id','open_dt','closed_dt','survival_time','case_title','type','queue']
merged_df[columns_to_view].head(20)


In [None]:
# Same column selection, restricted to the first 20 rows whose 'event' value is 0.
merged_df.loc[merged_df['event'] == 0, columns_to_view].head(20)

In [None]:


def plot_prediction_distribution(df, prediction_cols):
    """Draw one count plot per column, stacked vertically, with each bar labelled by its count.

    NOTE: this reuses the name of the histogram helper defined earlier in the
    notebook; from this cell onward ``plot_prediction_distribution`` refers to
    this count-plot version.

    Args:
        df: DataFrame containing the columns to plot.
        prediction_cols: A column name, or a list of column names, in ``df``.
            One subplot is drawn per column; bars are ordered by frequency.

    Raises:
        ValueError: If no column is given, or a named column is not in ``df``
            (same message style as the histogram helper).
    """
    # Accept a single column name as well as a list; a bare string would
    # otherwise be iterated character by character.
    if isinstance(prediction_cols, str):
        prediction_cols = [prediction_cols]
    prediction_cols = list(prediction_cols)

    if not prediction_cols:
        raise ValueError("prediction_cols must name at least one column.")
    for col in prediction_cols:
        if col not in df.columns:
            raise ValueError(f"Column '{col}' not found in the dataframe.")

    # Set the aesthetic style of the plots
    sns.set_style("whitegrid")

    # squeeze=False always returns a 2-D array of axes, even for a single subplot.
    fig, axes = plt.subplots(nrows=len(prediction_cols), squeeze=False,
                             figsize=(10, 5 * len(prediction_cols)))

    # Plot the distribution of each prediction
    for ax, col in zip(axes[:, 0], prediction_cols):
        sns.countplot(data=df, x=col, hue=col, palette="viridis",
                      order=df[col].value_counts().index, ax=ax)
        ax.set_title(f'Distribution of {col}')
        ax.set_xlabel('Labels')
        ax.set_ylabel('Frequency')
        for p in ax.patches:
            # Bar heights are floats; format them as whole-number counts.
            ax.annotate(f'{p.get_height():.0f}',
                        (p.get_x() + p.get_width() / 2., p.get_height()),
                        ha='center', va='center', xytext=(0, 10), textcoords='offset points')
        # Rotate this subplot's own tick labels; plt.xticks() only affects the current axes,
        # so with several subplots all but one would keep unrotated labels.
        ax.tick_params(axis='x', labelrotation=45)

    # Adjust layout
    plt.tight_layout()
    plt.show()

In [None]:
# Example usage: count plot of event_prediction_x, the suffixed 'event_prediction' column that
# merged_df inherited from logistic_tree_prediction (the left side of the first merge).
plot_prediction_distribution(merged_df, ['event_prediction_x'])

In [None]:
# How many cases fall under each distinct survival_prediction value.
merged_df['survival_prediction'].value_counts()

In [None]:

# Count plot of the survival decision tree's predictions.
plot_prediction_distribution(merged_df, ['survival_prediction'])

In [None]:

# Count plot of the neural network's top-ranked prediction.
plot_prediction_distribution(merged_df, ['1st_prediction'])

In [None]:

# Count plot of the neural network's second-ranked prediction.
plot_prediction_distribution(merged_df, ['2nd_prediction'])

In [None]:

# Count plot of the neural network's third-ranked prediction.
plot_prediction_distribution(merged_df, ['3rd_prediction'])