In [1]:
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

%matplotlib inline

# To change scientific numbers to float
np.set_printoptions(formatter={'float_kind':'{:f}'.format})

# Increases the size of sns plots
sns.set(rc={'figure.figsize':(8,6)})

# Datetime lib
from pandas import to_datetime
import itertools
import warnings
import datetime
warnings.filterwarnings('ignore')

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, r2_score, classification_report

#for hyper parameter tunning
from sklearn.model_selection import GridSearchCV

In [2]:
data = pd.read_csv('preprocessing/data/london_clean.csv')
data[0:5]

Unnamed: 0,DateOfCall,CalYear,HourOfCall,IncidentGroup,PropertyCategory,PropertyType,NumPumpsAttending,PumpHoursRoundUp,Notional Cost (£),Date,cloud_cover,sunshine,global_radiation,max_temp,mean_temp,min_temp,precipitation,pressure,snow_depth,CostCat
0,1,2009,0,0,0,0,2,1,255,01/01/2009,8.0,0.0,13.0,3.5,1.5,-0.5,0.0,103010.0,0.0,0
1,1,2009,0,1,1,1,1,1,255,01/01/2009,8.0,0.0,13.0,3.5,1.5,-0.5,0.0,103010.0,0.0,0
2,1,2009,0,1,1,2,1,1,255,01/01/2009,8.0,0.0,13.0,3.5,1.5,-0.5,0.0,103010.0,0.0,0
3,1,2009,0,1,1,3,2,1,255,01/01/2009,8.0,0.0,13.0,3.5,1.5,-0.5,0.0,103010.0,0.0,0
4,1,2009,0,2,2,4,2,1,255,01/01/2009,8.0,0.0,13.0,3.5,1.5,-0.5,0.0,103010.0,0.0,0


In [3]:
X = data[['DateOfCall', 'PropertyType', 'NumPumpsAttending', 'PumpHoursRoundUp', 'mean_temp']]
y = data[['CostCat']]

print('X shape: {}'.format(np.shape(X)))
print('y shape: {}'.format(np.shape(y)))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

X shape: (1286617, 5)
y shape: (1286617, 1)


In [4]:
rf = RandomForestClassifier(n_estimators=10, criterion='entropy')
rf.fit(X_train, y_train)
prediction_test = rf.predict(X=X_test)

# source: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

# Accuracy on Test
print("Training Accuracy is: ", rf.score(X_train, y_train))
print("Testing Accuracy is: ", rf.score(X_test, y_test))
print(classification_report(y_test, prediction_test))

Training Accuracy is:  0.8398437182799267
Testing Accuracy is:  0.7723771974450285
              precision    recall  f1-score   support

           0       0.78      0.83      0.81    231390
           1       0.69      0.62      0.65    140418
           2       1.00      1.00      1.00     34444
           3       0.71      0.78      0.74      5421
           4       0.58      0.51      0.54      4962
           5       0.92      0.92      0.92      7949

    accuracy                           0.77    424584
   macro avg       0.78      0.78      0.78    424584
weighted avg       0.77      0.77      0.77    424584



### Hyperparameter Tuning

In [5]:
tree_params = {'n_estimators':[5, 10],
               'criterion':['gini','entropy', 'log_loss'],
               'max_depth':[5, 7, 10], 
               'max_features':['sqrt', 'log2', None]}
rf_top = GridSearchCV( RandomForestClassifier(), tree_params, cv=5)

# Training the model with each combination
rf_top2 = rf_top.fit(X_train, y_train)

# Display the best hyperparameters
print("Best hyperparameters for random forest classification: ", rf_top.best_params_)


Best hyperparameters for random forest classification:  {'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'n_estimators': 5}


In [6]:
# Predict emotion class of test data
rf_top_predict = rf_top2.predict(X=X_test)
print('Classification accuracy on test set with max features = {} and max_depth = {}: {:.3f}'.format(None, 6, accuracy_score(y_test,rf_top_predict)))
print(classification_report(y_test, rf_top_predict))


Classification accuracy on test set with max features = None and max_depth = 6: 0.689
              precision    recall  f1-score   support

           0       0.65      0.97      0.78    231390
           1       0.74      0.14      0.23    140418
           2       1.00      1.00      1.00     34444
           3       0.66      0.96      0.78      5421
           4       0.68      0.28      0.40      4962
           5       0.90      0.95      0.92      7949

    accuracy                           0.69    424584
   macro avg       0.77      0.72      0.69    424584
weighted avg       0.71      0.69      0.61    424584



Testing with different train-test split

In [7]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, test_size=0.25, random_state=21)
rf_top_predict1 = rf_top2.predict(X=X_test1)
print('Classification accuracy on test set with max features = {} and max_depth = {}: {:.3f}'.format(None, 6, accuracy_score(y_test1,rf_top_predict1)))
print(classification_report(y_test1, rf_top_predict1))

Classification accuracy on test set with max features = None and max_depth = 6: 0.690
              precision    recall  f1-score   support

           0       0.65      0.97      0.78    175432
           1       0.74      0.14      0.24    106038
           2       1.00      1.00      1.00     26144
           3       0.66      0.97      0.78      4090
           4       0.73      0.29      0.42      3791
           5       0.91      0.95      0.93      6160

    accuracy                           0.69    321655
   macro avg       0.78      0.72      0.69    321655
weighted avg       0.71      0.69      0.62    321655



In [8]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, train_size = 0.7, test_size=0.3)
rf_top_predict2 = rf_top2.predict(X=X_test2)
print('Classification accuracy on test set with max features = {} and max_depth = {}: {:.3f}'.format(None, 6, accuracy_score(y_test2,rf_top_predict2)))
print(classification_report(y_test2, rf_top_predict2))

Classification accuracy on test set with max features = None and max_depth = 6: 0.690
              precision    recall  f1-score   support

           0       0.65      0.97      0.78    210415
           1       0.74      0.14      0.24    127444
           2       1.00      1.00      1.00     31446
           3       0.66      0.97      0.78      4834
           4       0.71      0.29      0.41      4510
           5       0.90      0.95      0.92      7337

    accuracy                           0.69    385986
   macro avg       0.78      0.72      0.69    385986
weighted avg       0.72      0.69      0.62    385986



In [9]:
X_train3, X_test3, y_train3, y_test3 = train_test_split(X, y, train_size = 0.7, test_size=0.3)
rf_top_predict3 = rf_top2.predict(X=X_test3)
print('Classification accuracy on test set with max features = {} and max_depth = {}: {:.3f}'.format(None, 6, accuracy_score(y_test3,rf_top_predict3)))
print(classification_report(y_test3, rf_top_predict3))

Classification accuracy on test set with max features = None and max_depth = 6: 0.691
              precision    recall  f1-score   support

           0       0.65      0.97      0.78    210779
           1       0.74      0.14      0.23    127195
           2       1.00      1.00      1.00     31411
           3       0.66      0.97      0.78      4852
           4       0.71      0.29      0.41      4398
           5       0.91      0.95      0.93      7351

    accuracy                           0.69    385986
   macro avg       0.78      0.72      0.69    385986
weighted avg       0.72      0.69      0.62    385986



In [10]:
X_train4, X_test4, y_train4, y_test4 = train_test_split(X, y, train_size = 0.7, test_size=0.3)
rf_top_predict4 = rf_top2.predict(X=X_test4)
print('Classification accuracy on test set with max features = {} and max_depth = {}: {:.3f}'.format(None, 6, accuracy_score(y_test4,rf_top_predict4)))
print(classification_report(y_test4, rf_top_predict4))

Classification accuracy on test set with max features = None and max_depth = 6: 0.689
              precision    recall  f1-score   support

           0       0.65      0.97      0.78    210194
           1       0.74      0.14      0.23    127734
           2       1.00      1.00      1.00     31139
           3       0.66      0.97      0.79      4930
           4       0.71      0.29      0.41      4500
           5       0.91      0.95      0.93      7489

    accuracy                           0.69    385986
   macro avg       0.78      0.72      0.69    385986
weighted avg       0.71      0.69      0.62    385986



In [11]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import std

folds = [5, 10, 15, 20]
for i in folds:
    cross_val = KFold(n_splits=i, random_state=1, shuffle=True)
    scores = cross_val_score(rf_top2, X, y, scoring='accuracy', cv=cross_val, n_jobs=-1)
    print("Testing with {} fold:".format(i))
    print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

KeyboardInterrupt: 