In [1]:
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

%matplotlib inline

# To change scientific numbers to float
np.set_printoptions(formatter={'float_kind':'{:f}'.format})

# Increases the size of sns plots
sns.set(rc={'figure.figsize':(8,6)})

# Datetime lib
from pandas import to_datetime
import itertools
import warnings
import datetime
warnings.filterwarnings('ignore')

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, r2_score, classification_report

#for hyper parameter tunning
from sklearn.model_selection import GridSearchCV

In [2]:
data = pd.read_csv('preprocessing/data/london_clean.csv')
data[0:5]

Unnamed: 0,DateOfCall,CalYear,HourOfCall,IncidentGroup,PropertyCategory,PropertyType,NumPumpsAttending,PumpHoursRoundUp,Notional Cost (£),Date,cloud_cover,sunshine,global_radiation,max_temp,mean_temp,min_temp,precipitation,pressure,snow_depth,CostCat
0,1,2009,0,0,0,0,2,1,255,01/01/2009,8.0,0.0,13.0,3.5,1.5,-0.5,0.0,103010.0,0.0,0
1,1,2009,0,1,1,1,1,1,255,01/01/2009,8.0,0.0,13.0,3.5,1.5,-0.5,0.0,103010.0,0.0,0
2,1,2009,0,1,1,2,1,1,255,01/01/2009,8.0,0.0,13.0,3.5,1.5,-0.5,0.0,103010.0,0.0,0
3,1,2009,0,1,1,3,2,1,255,01/01/2009,8.0,0.0,13.0,3.5,1.5,-0.5,0.0,103010.0,0.0,0
4,1,2009,0,2,2,4,2,1,255,01/01/2009,8.0,0.0,13.0,3.5,1.5,-0.5,0.0,103010.0,0.0,0


In [3]:
X = data[['DateOfCall', 'PropertyType', 'NumPumpsAttending', 'PumpHoursRoundUp', 'mean_temp']]
y = data[['CostCat']]

print('X shape: {}'.format(np.shape(X)))
print('y shape: {}'.format(np.shape(y)))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

X shape: (1286617, 5)
y shape: (1286617, 1)


In [4]:
rf = RandomForestClassifier(n_estimators=10, criterion='entropy')
rf.fit(X_train, y_train)
prediction_test = rf.predict(X=X_test)

# source: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

# Accuracy on Test
print("Training Accuracy is: ", rf.score(X_train, y_train))
print("Testing Accuracy is: ", rf.score(X_test, y_test))
print(classification_report(y_test, prediction_test))

Training Accuracy is:  0.8399840841359901
Testing Accuracy is:  0.7721981987074407
              precision    recall  f1-score   support

           0       0.78      0.83      0.81    231390
           1       0.69      0.62      0.66    140418
           2       1.00      1.00      1.00     34444
           3       0.70      0.77      0.73      5421
           4       0.57      0.51      0.54      4962
           5       0.92      0.92      0.92      7949

    accuracy                           0.77    424584
   macro avg       0.78      0.77      0.77    424584
weighted avg       0.77      0.77      0.77    424584



### Hyperparameter Tuning

In [7]:
tree_params = {'n_estimators':[5, 10],'criterion':['gini','entropy', 'log_loss'],'max_depth':[2, 5, 10], 'max_features':['sqrt', 'log2', None]}
rf_top = GridSearchCV( RandomForestClassifier(), tree_params, cv=5)

# Training the model with each combination
rf_top2 = rf_top.fit(X_train, y_train)

# Display the best hyperparameters
print("Best hyperparameters for random forest classification: ", rf_top.best_params_)


KeyboardInterrupt: 

In [None]:
# Predict emotion class of test data
rf_top_predict = rf_top2.predict(X=X_test)
print('Classification accuracy on test set with max features = {} and max_depth = {}: {:.3f}'.format(None, 6, accuracy_score(y_test,rf_top_predict)))
print(classification_report(y_test, rf_top_predict))


Classification accuracy on test set with max features = None and max_depth = 6: 0.987
              precision    recall  f1-score   support

         255       1.00      1.00      1.00        22
         260       1.00      1.00      1.00        75
         290       1.00      1.00      1.00         9
         326       1.00      1.00      1.00        28
         510       1.00      1.00      1.00         2
         520       1.00      1.00      1.00         8
         580       1.00      1.00      1.00         2
         652       1.00      1.00      1.00         3
         765       1.00      1.00      1.00         2
         780       1.00      1.00      1.00         1
         978       1.00      1.00      1.00         1
        1300       0.50      1.00      0.67         1
        1530       1.00      1.00      1.00         1
        1560       0.00      0.00      0.00         1
        3640       0.00      0.00      0.00         0
       81380       0.00      0.00      0.00      

Testing with different train-test split

In [None]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, test_size=0.25, random_state=21)
rf_top_predict1 = rf_top2.predict(X=X_test1)
print('Classification accuracy on test set with max features = {} and max_depth = {}: {:.3f}'.format(None, 6, accuracy_score(y_test1,rf_top_predict1)))
print(classification_report(y_test1, rf_top_predict1))

Classification accuracy on test set with max features = None and max_depth = 6: 1.000
              precision    recall  f1-score   support

         255       1.00      1.00      1.00        26
         260       1.00      1.00      1.00        70
         290       1.00      1.00      1.00        22
         326       1.00      1.00      1.00        22
         510       1.00      1.00      1.00         3
         520       1.00      1.00      1.00         7
         580       1.00      1.00      1.00         2
         652       1.00      1.00      1.00         1
         765       1.00      1.00      1.00         1
         780       1.00      1.00      1.00         1
         978       1.00      1.00      1.00         1
        6120       1.00      1.00      1.00         1

    accuracy                           1.00       157
   macro avg       1.00      1.00      1.00       157
weighted avg       1.00      1.00      1.00       157



In [None]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, train_size = 0.7, test_size=0.3)
rf_top_predict2 = rf_top2.predict(X=X_test2)
print('Classification accuracy on test set with max features = {} and max_depth = {}: {:.3f}'.format(None, 6, accuracy_score(y_test2,rf_top_predict2)))
print(classification_report(y_test2, rf_top_predict2))

Classification accuracy on test set with max features = None and max_depth = 6: 1.000
              precision    recall  f1-score   support

         255       1.00      1.00      1.00        27
         260       1.00      1.00      1.00        67
         290       1.00      1.00      1.00        19
         326       1.00      1.00      1.00        20
         510       1.00      1.00      1.00         1
         520       1.00      1.00      1.00         7
         580       1.00      1.00      1.00         2
         652       1.00      1.00      1.00         5
         765       1.00      1.00      1.00         2
         780       1.00      1.00      1.00         3
        1160       1.00      1.00      1.00         1
        1300       1.00      1.00      1.00         1
        1560       1.00      1.00      1.00         1
        6120       1.00      1.00      1.00         1

    accuracy                           1.00       157
   macro avg       1.00      1.00      1.00     

In [None]:
X_train3, X_test3, y_train3, y_test3 = train_test_split(X, y, train_size = 0.7, test_size=0.3)
rf_top_predict3 = rf_top2.predict(X=X_test3)
print('Classification accuracy on test set with max features = {} and max_depth = {}: {:.3f}'.format(None, 6, accuracy_score(y_test3,rf_top_predict3)))
print(classification_report(y_test3, rf_top_predict3))

Classification accuracy on test set with max features = None and max_depth = 6: 0.987
              precision    recall  f1-score   support

         255       1.00      1.00      1.00        29
         260       1.00      1.00      1.00        75
         290       1.00      1.00      1.00        17
         326       1.00      1.00      1.00        14
         510       1.00      1.00      1.00         4
         520       1.00      1.00      1.00         6
         580       1.00      1.00      1.00         3
         765       1.00      1.00      1.00         1
         978       1.00      1.00      1.00         1
        1300       0.50      1.00      0.67         1
        1450       1.00      1.00      1.00         1
        1560       1.00      0.50      0.67         2
        2805       1.00      1.00      1.00         1
        3640       0.00      0.00      0.00         0
        6120       1.00      1.00      1.00         1
       81380       0.00      0.00      0.00      

In [None]:
X_train4, X_test4, y_train4, y_test4 = train_test_split(X, y, train_size = 0.7, test_size=0.3)
rf_top_predict4 = rf_top2.predict(X=X_test4)
print('Classification accuracy on test set with max features = {} and max_depth = {}: {:.3f}'.format(None, 6, accuracy_score(y_test4,rf_top_predict4)))
print(classification_report(y_test4, rf_top_predict4))

Classification accuracy on test set with max features = None and max_depth = 6: 1.000
              precision    recall  f1-score   support

         255       1.00      1.00      1.00        15
         260       1.00      1.00      1.00        71
         290       1.00      1.00      1.00        19
         326       1.00      1.00      1.00        33
         510       1.00      1.00      1.00         3
         520       1.00      1.00      1.00         5
         652       1.00      1.00      1.00         5
         765       1.00      1.00      1.00         2
         780       1.00      1.00      1.00         1
         978       1.00      1.00      1.00         1
        1450       1.00      1.00      1.00         1
        3640       1.00      1.00      1.00         1

    accuracy                           1.00       157
   macro avg       1.00      1.00      1.00       157
weighted avg       1.00      1.00      1.00       157



In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import std

folds = [5, 10, 15, 20]
for i in folds:
    cross_val = KFold(n_splits=i, random_state=1, shuffle=True)
    scores = cross_val_score(rf_top2, X, y, scoring='accuracy', cv=cross_val, n_jobs=-1)
    print("Testing with {} fold:".format(i))
    print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Testing with 5 fold:
Accuracy: 0.969 (0.013)
Testing with 10 fold:
Accuracy: 0.971 (0.018)
Testing with 15 fold:
Accuracy: 0.971 (0.028)
Testing with 20 fold:
Accuracy: 0.971 (0.024)
