In [47]:
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

%matplotlib inline

# Print NumPy floats in fixed-point notation ('{:f}') rather than scientific notation
np.set_printoptions(formatter={'float_kind':'{:f}'.format})

# Set the default figure size used by seaborn/matplotlib plots to (8, 6)
sns.set(rc={'figure.figsize':(8,6)})

# Date/time helper plus general-purpose standard-library utilities
from pandas import to_datetime
import itertools
import warnings
import datetime
# Suppress every warning for the rest of the session; remove this line when
# debugging, since it also hides warnings raised by the libraries used below
warnings.filterwarnings('ignore')

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, r2_score, classification_report

In [48]:
# Load the incident table; the path is relative, so 'sf_clean.csv' must sit in
# the notebook's working directory (presumably an already-cleaned export — TODO confirm)
data = pd.read_csv('sf_clean.csv')

In [49]:
data

Unnamed: 0,IncidentDate,Zipcode,SuppressionPersonnel,EstimatedPropertyLoss,EstimatedContentsLoss,CivilianFatalities,CivilianInjuries,PropertyUse,AreaofFireOrigin,IgnitionCause,StructureStatus,DetectorsPresent
0,2,94109,22,3000,1000,0,0,1,1,1,0,0
1,10,94117,31,5000,0,0,0,2,2,1,0,1
2,3,94102,42,50,10,0,0,2,3,1,0,2
3,6,94110,32,0,0,0,0,2,4,1,0,2
4,3,94131,30,0,0,0,0,3,5,1,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...
5079,11,94110,28,1000,100,0,0,13,82,11,3,0
5080,11,94114,46,1750000,150000,0,1,8,76,4,3,2
5081,11,94122,33,55000,15000,0,0,8,14,6,3,0
5082,12,94124,15,3000000,1000000,0,0,13,121,11,3,0


In [50]:
data.isnull().sum()

IncidentDate             0
Zipcode                  0
SuppressionPersonnel     0
EstimatedPropertyLoss    0
EstimatedContentsLoss    0
CivilianFatalities       0
CivilianInjuries         0
PropertyUse              0
AreaofFireOrigin         0
IgnitionCause            0
StructureStatus          0
DetectorsPresent         0
dtype: int64

In [51]:
data.columns

Index(['IncidentDate', 'Zipcode', 'SuppressionPersonnel',
       'EstimatedPropertyLoss', 'EstimatedContentsLoss', 'CivilianFatalities',
       'CivilianInjuries', 'PropertyUse', 'AreaofFireOrigin', 'IgnitionCause',
       'StructureStatus', 'DetectorsPresent'],
      dtype='object')

In [52]:
# g = sns.pairplot(data, hue = 'EstimatedPropertyLoss', diag_kws={'bw': 0.2})

In [53]:
#identify all categorical variables
# Names of the columns stored with the 'object' dtype, i.e. the candidates
# for categorical encoding; the bare name below displays the result
cat_columns = data.select_dtypes(include=['object']).columns
cat_columns

Index([], dtype='object')

In [69]:
# Target vector: the estimated property loss of each incident
y = data['EstimatedPropertyLoss'].values
# Input features: every remaining column, as a plain NumPy array
X = data.drop(columns=['EstimatedPropertyLoss']).values
print('X shape: {}'.format(X.shape))
print('y shape: {}'.format(y.shape))

# 70/30 train/test split; the fixed random_state keeps the partition repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=0,
)

X shape: (5084, 11)
y shape: (5084,)


In [70]:
# Confusion Matrix function

def plot_confusion_matrix(cm, classes=None, title='Confusion matrix'):
    """Draw a confusion matrix as a seaborn heatmap on the current figure.

    Parameters
    ----------
    cm : array-like
        Matrix to display. The colour scale is pinned to [0, 1], so a
        normalised matrix is presumably expected (verify against the caller).
    classes : sequence, optional
        Labels for both axes. When given, each cell is also annotated with
        its value; when omitted, a plain un-annotated heatmap is drawn.
    title : str
        Title placed above the plot.
    """
    # Options common to both variants: fixed colour range
    heatmap_options = {'vmin': 0., 'vmax': 1.}
    if classes is not None:
        # Labelled variant: tick labels on both axes plus cell annotations
        heatmap_options.update(
            xticklabels=classes,
            yticklabels=classes,
            annot=True,
            annot_kws={'size': 50},
        )
    sns.heatmap(cm, **heatmap_options)
    plt.title(title)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [71]:
# Baseline random forest trained on the property-loss split.
# random_state is fixed (same seed as the tuning loop) so that re-running the
# cell reproduces the same forest and the same scores.
# NOTE(review): y holds EstimatedPropertyLoss amounts, so each distinct amount
# is treated as its own class — confirm a classifier (rather than a regressor
# or a binned target) is really what is wanted here.
rf = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=1337)
rf.fit(X_train, y_train)
prediction_test = rf.predict(X=X_test)

# source: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

# Accuracy on Train
print("Training Accuracy is: ", rf.score(X_train, y_train))
# Accuracy on Test
print("Testing Accuracy is: ", rf.score(X_test, y_test))

# Per-class precision / recall / F1 on the held-out set
print(classification_report(y_test, prediction_test))

# Confusion Matrix
# cm = confusion_matrix(y_test, prediction_test)
# cm_norm = cm/cm.sum(axis=1)[:, np.newaxis]
# plt.figure()
# plot_confusion_matrix(cm_norm, classes=rf.classes_)

Training Accuracy is:  0.9988757729061271
Testing Accuracy is:  0.20576671035386632
              precision    recall  f1-score   support

           0       0.43      0.83      0.56       192
           1       0.33      0.24      0.28        41
           2       0.00      0.00      0.00         3
           5       0.00      0.00      0.00         4
          10       0.00      0.00      0.00        10
          15       0.00      0.00      0.00         1
          20       0.00      0.00      0.00         2
          25       0.00      0.00      0.00         9
          50       0.12      0.05      0.07        40
         100       0.09      0.08      0.08        63
         125       0.00      0.00      0.00         1
         150       0.00      0.00      0.00         5
         200       0.08      0.04      0.05        28
         250       0.00      0.00      0.00        12
         300       0.00      0.00      0.00        16
         350       0.00      0.00      0.00        

### Tuning Random Forest

In [57]:
from itertools import product

In [58]:
# Search grid: forest size is fixed, while the number of features tried per
# split and the maximum tree depth are varied
n_estimators = 100
max_features = [1, 'sqrt', 'log2']
max_depths = [None, 2, 3, 4, 5]

# Nested loops visit every (max_features, max_depth) pair, depth varying fastest
for f in max_features:
    for d in max_depths:
        rf = RandomForestClassifier(
            n_estimators=n_estimators,
            criterion='entropy',
            max_features=f,
            max_depth=d,
            n_jobs=2,
            random_state=1337,
        )
        rf.fit(X_train, y_train)
        prediction_test = rf.predict(X=X_test)
        # One summary line per combination, followed by the per-class report
        print('Classification accuracy on test set with max features = {} and max_depth = {}: {:.3f}'.format(
            f, d, accuracy_score(y_test, prediction_test)))
        print(classification_report(y_test, prediction_test))
        # cm = confusion_matrix(y_test, prediction_test)
        # cm_norm = cm/cm.sum(axis=1)[:, np.newaxis]
        # # plt.figure()
        # plot_confusion_matrix(cm_norm, classes=rf.classes_,
        # title='Confusion matrix accuracy on test set with max features = {} and max_depth = {}: {:.3f}'.format(f, d, accuracy_score(y_test,prediction_test)))

Classification accuracy on test set with max features = 1 and max_depth = None: 0.179
              precision    recall  f1-score   support

           0       0.41      0.86      0.56        69
           1       0.43      0.23      0.30        13
           5       0.00      0.00      0.00         2
          10       0.00      0.00      0.00         4
          20       0.00      0.00      0.00         1
          25       0.00      0.00      0.00         4
          50       0.00      0.00      0.00         9
         100       0.24      0.21      0.22        19
         150       0.00      0.00      0.00         2
         200       0.00      0.00      0.00         4
         250       0.00      0.00      0.00         3
         300       0.00      0.00      0.00         5
         350       0.00      0.00      0.00         1
         400       0.00      0.00      0.00         1
         500       0.09      0.08      0.08        37
         600       0.00      0.00      0.00      

### Using fatalities as the target

In [79]:
# Reload the table so this section starts from the file contents
data = pd.read_csv('sf_clean.csv')

# Predictor columns for the fatalities model; CivilianInjuries is not in the
# list, so it is not used as an input
features = [
    'IncidentDate',
    'Zipcode',
    'EstimatedPropertyLoss',
    'EstimatedContentsLoss',
    'SuppressionPersonnel',
    'IgnitionCause',
    'AreaofFireOrigin',
    'PropertyUse',
    'StructureStatus',
    'DetectorsPresent',
]

# Target vector: civilian fatalities per incident
y = data['CivilianFatalities'].values
# Input features, kept as a DataFrame restricted to the columns above
X = data[features]
print('X shape: {}'.format(X.shape))
print('y shape: {}'.format(y.shape))

# 70/30 train/test split; the fixed random_state keeps the partition repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=0,
)

X shape: (5084, 10)
y shape: (5084,)


In [80]:
# Baseline random forest trained on the fatalities split.
# random_state is fixed (same seed as the tuning loop) so that re-running the
# cell reproduces the same forest and the same scores.
rf = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=1337)
rf.fit(X_train, y_train)
prediction_test = rf.predict(X=X_test)

# source: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

# Accuracy on Train
print("Training Accuracy is: ", rf.score(X_train, y_train))
# Accuracy on Test
print("Testing Accuracy is: ", rf.score(X_test, y_test))

# NOTE(review): read the per-class rows below rather than the accuracy alone;
# with a target this imbalanced, overall accuracy mostly reflects the majority class.
print(classification_report(y_test, prediction_test))


Training Accuracy is:  1.0
Testing Accuracy is:  0.9967234600262124
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1521
           1       0.00      0.00      0.00         5

    accuracy                           1.00      1526
   macro avg       0.50      0.50      0.50      1526
weighted avg       0.99      1.00      1.00      1526



### Tuning Random Forest

In [81]:
# Search grid: forest size is fixed, while the number of features tried per
# split and the maximum tree depth are varied
n_estimators = 100
max_features = [1, 'sqrt', 'log2']
max_depths = [None, 2, 3, 4, 5]

# Nested loops visit every (max_features, max_depth) pair, depth varying fastest
for f in max_features:
    for d in max_depths:
        rf = RandomForestClassifier(
            n_estimators=n_estimators,
            criterion='entropy',
            max_features=f,
            max_depth=d,
            n_jobs=2,
            random_state=1337,
        )
        rf.fit(X_train, y_train)
        prediction_test = rf.predict(X=X_test)
        # One summary line per combination, followed by the per-class report
        print('Classification accuracy on test set with max features = {} and max_depth = {}: {:.3f}'.format(
            f, d, accuracy_score(y_test, prediction_test)))
        print(classification_report(y_test, prediction_test))

Classification accuracy on test set with max features = 1 and max_depth = None: 0.997
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1521
           1       0.00      0.00      0.00         5

    accuracy                           1.00      1526
   macro avg       0.50      0.50      0.50      1526
weighted avg       0.99      1.00      1.00      1526

Classification accuracy on test set with max features = 1 and max_depth = 2: 0.997
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1521
           1       0.00      0.00      0.00         5

    accuracy                           1.00      1526
   macro avg       0.50      0.50      0.50      1526
weighted avg       0.99      1.00      1.00      1526

Classification accuracy on test set with max features = 1 and max_depth = 3: 0.997
              precision    recall  f1-score   support

           0       1.00      1.00      