In [1]:
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

%matplotlib inline

# Print NumPy floats in fixed-point notation instead of scientific notation
np.set_printoptions(formatter={'float_kind':'{:f}'.format})

# Increases the default figure size of sns plots
sns.set(rc={'figure.figsize':(8,6)})

# Datetime lib
from pandas import to_datetime
import itertools
import warnings
import datetime
# NOTE(review): this silences every warning category for the whole session,
# including any warnings scikit-learn would raise during fitting or reporting.
warnings.filterwarnings('ignore')

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, r2_score, classification_report

# for hyperparameter tuning
from sklearn.model_selection import GridSearchCV

In [2]:
# Read the cleaned incident/weather CSV; the path is relative to the working directory
csv_path = './data/london_clean_weather.csv'
data = pd.read_csv(csv_path)

In [3]:
# Preview the loaded frame (the notebook display shows only the first and last rows)
data

Unnamed: 0,DateOfCall,CalYear,HourOfCall,IncidentGroup,PropertyCategory,PropertyType,NumPumpsAttending,PumpHoursRoundUp,Notional Cost (£),Date,CostCat,cloud_cover,sunshine,global_radiation,max_temp,mean_temp,min_temp,precipitation,pressure,snow_depth
0,1,2009,0,0,0,0,2,1,255,01/01/2009,3,8.0,0.0,13.0,3.5,1.5,-0.5,0.0,103010.0,0.0
1,1,2009,0,1,1,1,1,1,255,01/01/2009,3,8.0,0.0,13.0,3.5,1.5,-0.5,0.0,103010.0,0.0
2,1,2009,0,1,1,2,1,1,255,01/01/2009,3,8.0,0.0,13.0,3.5,1.5,-0.5,0.0,103010.0,0.0
3,1,2009,0,1,1,3,2,1,255,01/01/2009,3,8.0,0.0,13.0,3.5,1.5,-0.5,0.0,103010.0,0.0
4,1,2009,0,2,2,4,2,1,255,01/01/2009,3,8.0,0.0,13.0,3.5,1.5,-0.5,0.0,103010.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517,4,2016,10,2,5,33,1,1,326,04/11/2016,4,7.0,1.0,92.0,17.4,11.6,8.8,3.0,100580.0,0.0
518,4,2016,10,2,2,22,2,1,326,04/11/2016,4,7.0,1.0,92.0,17.4,11.6,8.8,3.0,100580.0,0.0
519,4,2016,10,2,5,68,1,1,326,04/11/2016,4,7.0,1.0,92.0,17.4,11.6,8.8,3.0,100580.0,0.0
520,4,2016,10,1,5,70,2,2,652,04/11/2016,7,7.0,1.0,92.0,17.4,11.6,8.8,3.0,100580.0,0.0


In [4]:
# Count the missing values in each column
data.isna().sum()

DateOfCall            0
CalYear               0
HourOfCall            0
IncidentGroup         0
PropertyCategory      0
PropertyType          0
NumPumpsAttending     0
PumpHoursRoundUp      0
Notional Cost (£)     0
Date                  0
CostCat               0
cloud_cover           0
sunshine              0
global_radiation      0
max_temp              0
mean_temp             0
min_temp              0
precipitation         0
pressure              0
snow_depth           62
dtype: int64

In [5]:
# List the available column names before selecting the modelling features
data.columns

Index(['DateOfCall', 'CalYear', 'HourOfCall', 'IncidentGroup',
       'PropertyCategory', 'PropertyType', 'NumPumpsAttending',
       'PumpHoursRoundUp', 'Notional Cost (£)', 'Date', 'CostCat',
       'cloud_cover', 'sunshine', 'global_radiation', 'max_temp', 'mean_temp',
       'min_temp', 'precipitation', 'pressure', 'snow_depth'],
      dtype='object')

In [6]:
# Keep only the columns used for modelling, including the cost target
selected_columns = [
    'DateOfCall',
    'PropertyType',
    'NumPumpsAttending',
    'PumpHoursRoundUp',
    'Notional Cost (£)',
    'mean_temp',
]
data = data.loc[:, selected_columns]

In [34]:
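# Disabled: 'EstimatedPropertyLoss' is not among the columns of `data`, so this pairplot cannot run as written.
# g = sns.pairplot(data, hue = 'EstimatedPropertyLoss', diag_kws={'bw': 0.2})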

In [7]:
# Collect the names of all object-dtype (categorical) columns
object_typed = data.select_dtypes(include=['object'])
cat_columns = object_typed.columns
cat_columns

Index([], dtype='object')

In [8]:
# Separate the feature matrix from the cost target
target_column = 'Notional Cost (£)'
X = data.drop(columns=[target_column]).values  # Input features (attributes)
y = data[target_column].values  # Target vector
print('X shape: {}'.format(X.shape))
print('y shape: {}'.format(y.shape))

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=0
)

X shape: (522, 5)
y shape: (522,)


In [37]:
# Confusion Matrix function

def plot_confusion_matrix(cm, classes=None, title='Confusion matrix'):
    """Draw `cm` as a seaborn heatmap and label the axes.

    The colour scale is pinned to the range [0, 1]. When `classes` is given,
    its values label the ticks on both axes and every cell is annotated;
    otherwise a plain heatmap is drawn. `title` is used as the plot title.
    """
    # Options shared by both variants of the plot
    heatmap_options = {'vmin': 0., 'vmax': 1.}
    if classes is not None:
        heatmap_options.update(
            xticklabels=classes,
            yticklabels=classes,
            annot=True,
            annot_kws={'size': 50},
        )
    sns.heatmap(cm, **heatmap_options)
    plt.title(title)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [38]:
# Baseline random forest that treats each distinct cost value as a class.
# The fixed seed (same value as the tuning sweep) makes the reported scores
# reproducible after a kernel restart.
rf = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=1337)
rf.fit(X_train, y_train)
prediction_test = rf.predict(X=X_test)

# source: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

# Accuracy on Train
print("Training Accuracy is: ", rf.score(X_train, y_train))
# Accuracy on Test
print("Testing Accuracy is: ", rf.score(X_test, y_test))

# Per-class precision / recall / F1 on the held-out test set
print(classification_report(y_test, prediction_test))

# Confusion Matrix
# cm = confusion_matrix(y_test, prediction_test)
# cm_norm = cm/cm.sum(axis=1)[:, np.newaxis]
# plt.figure()
# plot_confusion_matrix(cm_norm, classes=rf.classes_)

Training Accuracy is:  1.0
Testing Accuracy is:  0.9808917197452229
              precision    recall  f1-score   support

         255       1.00      1.00      1.00        22
         260       1.00      1.00      1.00        75
         290       1.00      1.00      1.00         9
         326       1.00      1.00      1.00        28
         510       1.00      1.00      1.00         2
         520       0.80      1.00      0.89         8
         580       1.00      1.00      1.00         2
         652       1.00      1.00      1.00         3
         765       1.00      1.00      1.00         2
         780       0.50      1.00      0.67         1
         978       1.00      1.00      1.00         1
        1300       0.00      0.00      0.00         1
        1530       1.00      1.00      1.00         1
        1560       0.00      0.00      0.00         1
       81380       0.00      0.00      0.00         1

    accuracy                           0.98       157
   macro avg

### Tuning Random Forest

In [39]:
from itertools import product

In [40]:
# Manual sweep: fit one forest per (max_features, max_depth) pair and report
# its accuracy and per-class metrics on the test set.
n_estimators = 100
max_features = [1, 'sqrt', 'log2']
max_depths = [None, 2, 3, 4, 5]
report_header = 'Classification accuracy on test set with max features = {} and max_depth = {}: {:.3f}'
for max_feat, depth in product(max_features, max_depths):  # every combination of the two lists
    forest_options = dict(
        n_estimators=n_estimators,
        criterion='entropy',
        max_features=max_feat,
        max_depth=depth,
        n_jobs=2,
        random_state=1337,
    )
    rf = RandomForestClassifier(**forest_options)
    rf.fit(X_train, y_train)
    prediction_test = rf.predict(X=X_test)
    test_accuracy = accuracy_score(y_test, prediction_test)
    print(report_header.format(max_feat, depth, test_accuracy))
    print(classification_report(y_test, prediction_test))
    # cm = confusion_matrix(y_test, prediction_test)
    # cm_norm = cm/cm.sum(axis=1)[:, np.newaxis]
    # # plt.figure()
    # plot_confusion_matrix(cm_norm, classes=rf.classes_,
    # title=report_header.format(max_feat, depth, test_accuracy))

Classification accuracy on test set with max features = 1 and max_depth = None: 0.968
              precision    recall  f1-score   support

         255       1.00      1.00      1.00        22
         260       1.00      1.00      1.00        75
         290       1.00      1.00      1.00         9
         326       1.00      1.00      1.00        28
         510       0.50      1.00      0.67         2
         520       0.80      1.00      0.89         8
         580       1.00      1.00      1.00         2
         652       1.00      1.00      1.00         3
         765       1.00      0.50      0.67         2
         780       1.00      1.00      1.00         1
         978       1.00      1.00      1.00         1
        1300       0.00      0.00      0.00         1
        1530       0.00      0.00      0.00         1
        1560       0.00      0.00      0.00         1
        2805       0.00      0.00      0.00         0
       81380       0.00      0.00      0.00      

### Hyperparameter Tuning

In [9]:
# Exhaustive 5-fold cross-validated search over the random forest hyperparameters
tree_params = {
    'n_estimators': [50, 100, 150, 200],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10],
    'max_features': ['sqrt', 'log2', None],
}
# Seed the forest so the selected hyperparameters are reproducible between runs
rf_top = GridSearchCV(RandomForestClassifier(random_state=1337), tree_params, cv=5)

# Fit the search on the training split for notional-cost classification.
# fit() returns the same GridSearchCV object, so rf_top2 is an alias of rf_top
# that is kept because a later cell refers to it.
rf_top2 = rf_top.fit(X_train, y_train)

# Display the best hyperparameters
print("Best hyperparameters for random forest classification: ", rf_top.best_params_)


Best hyperparameters for random forest classification:  {'criterion': 'entropy', 'max_depth': 6, 'max_features': None, 'n_estimators': 100}


In [11]:
# Predict the notional-cost class of the test data with the tuned model
rf_top_predict = rf_top2.predict(X=X_test)
# Report the hyperparameters the search actually selected rather than hard-coded
# values, so the message stays correct if the search result changes.
best_params = rf_top2.best_params_
print('Classification accuracy on test set with max features = {} and max_depth = {}: {:.3f}'.format(best_params['max_features'], best_params['max_depth'], accuracy_score(y_test,rf_top_predict)))
print(classification_report(y_test, rf_top_predict))


Classification accuracy on test set with max features = None and max_depth = 6: 0.987
              precision    recall  f1-score   support

         255       1.00      1.00      1.00        22
         260       1.00      1.00      1.00        75
         290       1.00      1.00      1.00         9
         326       1.00      1.00      1.00        28
         510       1.00      1.00      1.00         2
         520       1.00      1.00      1.00         8
         580       1.00      1.00      1.00         2
         652       1.00      1.00      1.00         3
         765       1.00      1.00      1.00         2
         780       1.00      1.00      1.00         1
         978       1.00      1.00      1.00         1
        1300       0.50      1.00      0.67         1
        1530       1.00      1.00      1.00         1
        1560       0.00      0.00      0.00         1
        3640       0.00      0.00      0.00         0
       81380       0.00      0.00      0.00      