<a href="https://colab.research.google.com/github/nkinesis/soen471-project/blob/main/Undersampled_comparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pickle
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [7]:
#loading the dataset
df = pd.read_csv('../../preprocessing/data/london_clean_undersample.csv')
# df.dtypes

In [8]:
# load the data, do train test split (copy from the notebook)
X = df[['DateOfCall', 'PropertyType', 'NumPumpsAttending', 'PumpHoursRoundUp', 'mean_temp']]
y = df[['CostCat']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# print a small sample of train X and y
print(X_train[0:5])
print(y_train[0:5])


        DateOfCall  PropertyType  NumPumpsAttending  PumpHoursRoundUp  \
100629           1             4                  2                 1   
30999            4            28                  2                 1   
123846           4            40                  3                 2   
169930           7            37                  3                 4   
39290           11            65                  3                 1   

        mean_temp  
100629       11.8  
30999         4.1  
123846       10.3  
169930       22.8  
39290         2.7  
        CostCat
100629        1
30999         0
123846        2
169930        4
39290         0


In [9]:
# load the models
pickled_model_dt = pickle.load(open('../../models/DT_undersample_model.pickle', 'rb'))
pickled_model_rf = pickle.load(open('../../models/RF_undersample_model.pickle','rb'))
pickled_model_bt = pickle.load(open('../../models/BT_undersample_model.pickle','rb'))

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [10]:
# generate predictions
y_pred_dt = pickled_model_dt.predict(X_test)
y_pred_rf = pickled_model_rf.predict(X_test)
y_pred_bt = pickled_model_bt.predict(X_test)


In [11]:
# compute the f1 score (if you want to compute other metrics, just change this step, the rest is the same)
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html#sklearn.metrics.f1_score

# f1_score = pickled_model.f1_score(y_test, y_pred)
f1_score_dt = f1_score(y_test, y_pred_dt, average='weighted')
f1_score_dt

f1_score_rf = f1_score(y_test, y_pred_rf, average='weighted')
f1_score_rf

f1_score_bt = f1_score(y_test, y_pred_bt, average='weighted')
f1_score_bt

0.7778623000620029

In [12]:
# write the code to draw your graph
import plotly.express as px

classifiers = ['Decision Tree Classifier','Random Forest Classifier','Boosted Tree Classifier']
metrics = ['F1-Score']

df = pd.DataFrame({'Classifier': [classifiers[0]] + [classifiers[1]] + [classifiers[2]] ,
                   'Metrics': metrics*3,
                   'Score': [f1_score_dt,f1_score_rf,f1_score_bt]})

fig = px.bar(df, x='Classifier', y='Score', color='Metrics', barmode='group',
             height=400, width=600, title="Classifier Performance Metrics")

fig.update_layout(xaxis=dict(categoryorder='category ascending'))
fig.update_traces(customdata=df['Score'],texttemplate='%{customdata}', textposition='auto', textfont=dict(size=8))
fig.update_traces(text=df['Score'], texttemplate='%{text}', textposition='outside', textfont=dict(size=8))
fig.update_traces(hovertemplate='%{x}<br>Score: %{y:.4f}<extra></extra>')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig.update_traces(text=df['Score'].round(2).astype(str), texttemplate='%{text:.4f}', textposition='outside', textfont=dict(size=8))
fig.show()