In [1]:
import numpy as np
#in case we need to repeat experiment
np.random.seed(255)

import pandas as pd
pd.options.display.max_rows = 22

import matplotlib.pyplot as plt
plt.style.use('classic')


import seaborn as sns
sns.set()

from sklearn.metrics import classification_report, confusion_matrix 

#Enable
enable_grid_search=False


In [2]:
#df = pd.read_csv('NCDB_2016.csv', engine = 'python')
df = pd.read_csv('data01_simple.csv', engine = 'python')
#df

In [3]:
print(df.isnull().sum().sum())

0


In [4]:
print(df[df.index.astype('str').str.contains('[^0-9]')].sum().sum())

0


In [5]:
df_cat = df.astype('category').copy()

In [6]:
df_int = df.astype('int').copy()

In [7]:
print(df.columns)

Index(['C_YEAR', 'C_MNTH', 'C_WDAY', 'C_HOUR', 'C_VEHS', 'C_CONF', 'C_RCFG',
       'C_WTHR', 'C_RSUR', 'C_RALN', 'C_TRAF', 'V_ID', 'V_TYPE', 'V_YEAR',
       'P_ID', 'P_SEX', 'P_AGE', 'P_PSN', 'P_SAFE', 'P_USER', 'P_ISEV'],
      dtype='object')


## Convert Class Variable to Binary

In [8]:
## Convert Class Variable to Binary
### Merge Injury and Fatality as a single class
### we will compare the results.
df_binary_class = df_cat.copy()

#perform the conversion in two steps to avoid any unwanted side effects
df_binary_class['P_ISEV'] = df_binary_class['P_ISEV'].map({1: 'safe', 2: 'injury', 3:'fatal'})
df_binary_class['P_ISEV'] = df_binary_class['P_ISEV'].map({'safe': '0', 'injury': '1', 'fatal':'1'})
print((df_binary_class['P_ISEV']=='0').sum())
print((df_binary_class['P_ISEV']=='1').sum())
print(df_binary_class['P_ISEV'].unique())

1570775
2084559
['1' '0']


In [9]:
df_100k = df_binary_class.sample(n=100000)

## Split Training and Testing for Binary class

In [10]:
#Split between data and class
Y = df_100k[df_binary_class.columns[-1]]
X = df_100k[df_binary_class.columns[0:df_binary_class.columns.size -1]]
#print(Xbinary, Ybinary)

#### Split Test(70%) and Train (30%) for Bianry class 

In [11]:
#sprint into train and test 70/30
#from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

In [12]:
#print(Xbinary_train, Xbinary_test, Ybinary_train, Ybinary_test)

## Write cleaned data to file for future use.

In [13]:
#lets write the datafile for future use
df_binary_class.to_csv('cleansimple_binary.csv', encoding='utf-8', index=False)

## Clustering based on K-Means Clustering

In [14]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3, init='random', n_init=10, max_iter=300, tol=1e-04)
ykm = kmeans.fit(X)

In [15]:
ykm.cluster_centers_

array([[2.00750929e+03, 6.72544020e+00, 3.90723642e+00, 1.36740391e+01,
        2.17583745e+00, 2.49713335e+01, 1.75177153e+00, 1.55934615e+00,
        1.52826390e+00, 1.34987653e+00, 9.95675864e+00, 1.60760683e+00,
        1.81052716e+00, 2.00039521e+03, 1.28245115e+00, 5.38329397e-01,
        5.63702222e+01, 1.20277271e+01, 2.26285699e+00, 1.28202169e+00],
       [2.00661944e+03, 6.67233440e+00, 4.04238989e+00, 1.41690282e+01,
        2.19772110e+00, 3.09138918e+01, 1.94217145e+00, 1.55611642e+00,
        1.50366041e+00, 1.29451529e+00, 5.65946393e+00, 1.57905302e+00,
        1.54616838e+00, 1.99900992e+03, 1.64972252e+00, 5.19541859e-01,
        2.42192998e+01, 1.38276656e+01, 2.26127642e+00, 1.44034124e+00],
       [2.00661629e+03, 6.75640848e+00, 4.16547735e+00, 1.33125606e+01,
        1.88596370e+00, 1.31032631e+01, 1.39410420e+00, 1.69717334e+00,
        1.73881114e+00, 1.58161286e+00, 1.69400028e+01, 1.43265900e+00,
        1.78938617e+00, 1.99907517e+03, 1.65363032e+00, 5.5715

In [16]:
display(ykm.labels_)

array([1, 1, 1, ..., 2, 0, 2])

## Feature selection using Random Forest

### Feature Selection 

In [17]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1)
forest.fit(X, Y)

  from numpy.core.umath_tests import inner1d


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

### Get the inportant features from random forest

In [18]:
importFeatures = forest.feature_importances_

### List the features by importancce

In [19]:
importFeatures

array([0.09690289, 0.08829822, 0.06998701, 0.104866  , 0.03198535,
       0.07023848, 0.02750522, 0.02843908, 0.02799837, 0.02749222,
       0.03320521, 0.0310279 , 0.02038281, 0.1121029 , 0.01870312,
       0.02824148, 0.13820599, 0.02022035, 0.01181928, 0.01237812])

In [20]:
indices = np.argsort(importFeatures)[::-1]
print(indices)
featureLabel = X.columns[0:]
print(featureLabel)
rankedFeature = []
for f in range(X.shape[1]):
    rankedFeature.append(featureLabel[indices[f]])
    print("%2d) %-*s %f" % (f+1, 30,  featureLabel[indices[f]], importFeatures[indices[f]]))
print(rankedFeature)

[16 13  3  0  1  5  2 10  4 11  7 15  8  6  9 12 17 14 19 18]
Index(['C_YEAR', 'C_MNTH', 'C_WDAY', 'C_HOUR', 'C_VEHS', 'C_CONF', 'C_RCFG',
       'C_WTHR', 'C_RSUR', 'C_RALN', 'C_TRAF', 'V_ID', 'V_TYPE', 'V_YEAR',
       'P_ID', 'P_SEX', 'P_AGE', 'P_PSN', 'P_SAFE', 'P_USER'],
      dtype='object')
 1) P_AGE                          0.138206
 2) V_YEAR                         0.112103
 3) C_HOUR                         0.104866
 4) C_YEAR                         0.096903
 5) C_MNTH                         0.088298
 6) C_CONF                         0.070238
 7) C_WDAY                         0.069987
 8) C_TRAF                         0.033205
 9) C_VEHS                         0.031985
10) V_ID                           0.031028
11) C_WTHR                         0.028439
12) P_SEX                          0.028241
13) C_RSUR                         0.027998
14) C_RCFG                         0.027505
15) C_RALN                         0.027492
16) V_TYPE                         0.0203

### Reduce the number of features

In [21]:
#select features that contribute more than 0.05
#[df_cat.columns[0:df_cat.columns.size -1]]
X_Selected = X[rankedFeature[0:10]]
display(X_Selected)
X_Selected.shape

#sprint into train and test 70/30
#from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_Selected, Y, test_size=0.3, random_state=0)

Unnamed: 0,P_AGE,V_YEAR,C_HOUR,C_YEAR,C_MNTH,C_CONF,C_WDAY,C_TRAF,C_VEHS,V_ID
1497736,9,1993,16,2005,7,21,5,3,2,2
3059749,28,2012,8,2013,8,36,5,18,2,1
1908179,19,1991,21,2007,6,31,6,1,3,3
737129,15,1977,18,2002,3,31,4,1,2,2
345811,32,2000,13,2000,7,31,4,3,2,2
3372130,23,2015,14,2015,6,33,2,1,3,1
2647942,19,2003,18,2011,5,35,5,18,2,1
1760000,12,2003,23,2006,10,6,1,18,1,1
1321151,23,2002,13,2004,9,31,6,18,2,2
942120,53,2000,14,2003,1,21,4,18,3,2


### SVM GridSearch for Optimal Parms

In [22]:
#This operation is computationaly expensive.
#Enable as required.
enable_grid_search = False
if enable_grid_search:
    from sklearn.grid_search import GridSearchCV
    from sklearn.svm import SVC
    param_grid = {'C':[0.1, 1, 10, 100, 1000], 'gamma':[1, 0.1, 0.01, 0.001, 0.0001]}
    grid = GridSearchCV(SVC(), param_grid, verbose=3)
    grid.fit(X_train, Y_train)
    print(grid.best_params_)
    #{'C': 1000, 'gamma': 0.001}
    print(grid.best_estimator_)
    #SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
    #  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    #  max_iter=-1, probability=False, random_state=None, shrinking=True,
    #  tol=0.001, verbose=False)
    grid_predictions = grid.predict(X_test)
    from sklearn.metrics import confusion_matrix
    confusion_matrix = confusion_matrix(Y_test, grid_predictions)
    print(confusion_matrix)
    print(classification_report(Y_test,grid_predictions))

## Logistic Regression Model

In [23]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=1, random_state=0)
lr.fit(X_train, Y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [24]:
y_pred = lr.predict(X_test)
#display(y_pred)
print('Accuracy of logistic regression classifier on train set: {:.2f}'.format(lr.score(X_train, Y_train)))
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(lr.score(X_test, Y_test)))

Accuracy of logistic regression classifier on train set: 0.59
Accuracy of logistic regression classifier on test set: 0.59


In [25]:
# print the intercept (Note: one vs rest => 1 vs 2and3, 2 vs 1and3, 3 vs 1and2)
lr.intercept_

array([0.00110126])

In [26]:
# print the coeficients (Note: one vs rest => 1 vs 2and3, 2 vs 1and3, 3 vs 1and2)
lr.coef_

array([[ 0.00606785, -0.01332486, -0.01725665,  0.01364966,  0.00104168,
        -0.00519001,  0.00182394,  0.01871573, -0.37912878,  0.17243438]])

In [27]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(Y_test, y_pred)
print(confusion_matrix)
print(classification_report(Y_test,y_pred))

[[ 3187  9576]
 [ 2730 14507]]
             precision    recall  f1-score   support

          0       0.54      0.25      0.34     12763
          1       0.60      0.84      0.70     17237

avg / total       0.58      0.59      0.55     30000



In [28]:
# with L1 regularization
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(penalty='l1', C=1000, random_state=0)
lr.fit(X_train, Y_train)

LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [29]:
y_pred = lr.predict(X_test)
print('Accuracy of logistic regression classifier on train set: {:.2f}'.format(lr.score(X_train, Y_train)))
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(lr.score(X_test, Y_test)))

from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(Y_test, y_pred)
print(confusion_matrix)
print(classification_report(Y_test,y_pred))

Accuracy of logistic regression classifier on train set: 0.59
Accuracy of logistic regression classifier on test set: 0.59
[[ 3134  9629]
 [ 2658 14579]]
             precision    recall  f1-score   support

          0       0.54      0.25      0.34     12763
          1       0.60      0.85      0.70     17237

avg / total       0.58      0.59      0.55     30000



### Decison Tree

In [30]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(criterion='entropy',max_depth=5, random_state=0)
tree.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [31]:
y_pred = tree.predict(X_test)
print('Accuracy of logistic regression classifier on train set: {:.2f}'.format(lr.score(X_train, Y_train)))
print('Accuracy of Tree classifier on test set: {:.2f}'.format(tree.score(X_test, Y_test)))

Accuracy of logistic regression classifier on train set: 0.59
Accuracy of Tree classifier on test set: 0.64


In [32]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(Y_test, y_pred)
print(confusion_matrix)
print(classification_report(Y_test,y_pred))

[[ 5438  7325]
 [ 3560 13677]]
             precision    recall  f1-score   support

          0       0.60      0.43      0.50     12763
          1       0.65      0.79      0.72     17237

avg / total       0.63      0.64      0.62     30000



### Random forest

In [33]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(criterion='entropy', n_estimators=1000, random_state=0, n_jobs=2)
forest.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [34]:
y_pred = forest.predict(X_test)
print('Accuracy of logistic regression classifier on train set: {:.2f}'.format(lr.score(X_train, Y_train)))
print('Accuracy of RandomForest classifier on test set: {:.2f}'.format(forest.score(X_test, Y_test)))

Accuracy of logistic regression classifier on train set: 0.59
Accuracy of RandomForest classifier on test set: 0.63


In [35]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(Y_test, y_pred)
print(confusion_matrix)
print(classification_report(Y_test,y_pred))

[[ 6070  6693]
 [ 4394 12843]]
             precision    recall  f1-score   support

          0       0.58      0.48      0.52     12763
          1       0.66      0.75      0.70     17237

avg / total       0.62      0.63      0.62     30000



### K-N-N

In [36]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
knn.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [37]:
y_pred = knn.predict(X_test)
print('Accuracy of logistic regression classifier on train set: {:.2f}'.format(lr.score(X_train, Y_train)))
print('Accuracy of RandomForest classifier on test set: {:.2f}'.format(forest.score(X_test, Y_test)))

Accuracy of logistic regression classifier on train set: 0.59
Accuracy of RandomForest classifier on test set: 0.63


In [38]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(Y_test, y_pred)
print(confusion_matrix)
print(classification_report(Y_test,y_pred))

[[ 5555  7208]
 [ 5622 11615]]
             precision    recall  f1-score   support

          0       0.50      0.44      0.46     12763
          1       0.62      0.67      0.64     17237

avg / total       0.57      0.57      0.57     30000



### SVM

In [39]:
from sklearn.svm import SVC
#svm = SVC(C=1, random_state=0, kernel='sigmoid', verbose=True)
#svm = SVC(C=1, random_state=0, kernel='linear', verbose=True, cache_size=200)
svm = SVC(verbose = 3)

In [40]:
svm.fit(X_train, Y_train)

[LibSVM]

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=3)

In [41]:
y_pred = svm.predict(X_test)
print('Accuracy of logistic regression classifier on train set: {:.2f}'.format(lr.score(X_train, Y_train)))
print('Accuracy of SVM regression classifier on test set: {:.2f}'.format(svm.score(X_test, Y_test)))

Accuracy of logistic regression classifier on train set: 0.59
Accuracy of SVM regression classifier on test set: 0.58


In [42]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(Y_test, y_pred)
print(confusion_matrix)
print(classification_report(Y_test,y_pred))

[[ 3892  8871]
 [ 3754 13483]]
             precision    recall  f1-score   support

          0       0.51      0.30      0.38     12763
          1       0.60      0.78      0.68     17237

avg / total       0.56      0.58      0.55     30000



### Performance Tuning using GridSearch

In [43]:
enable_grid_search = False
if enable_grid_search:
    from sklearn.grid_search import GridSearchCV
    from sklearn.svm import SVC
    param_grid = {'C':[0.1, 1, 10, 100, 1000], 'gamma':[1, 0.1, 0.01, 0.001, 0.0001]}
    grid = GridSearchCV(SVC(), param_grid, verbose=3)
    grid.fit(X_train, Y_train)
    grid.best_params_
    #Result: {'C': 1000, 'gamma': 0.001}
    grid.best_estimator_
        #SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
        #decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
        #max_iter=-1, probability=False, random_state=None, shrinking=True,
        #tol=0.001, verbose=False)
    grid_predictions = grid.predict(X_test)
    from sklearn.metrics import confusion_matrix
    confusion_matrix = confusion_matrix(Y_test, grid_predictions)
    print(confusion_matrix)
    print(classification_report(Y_test,grid_predictions))
    
    
#[[4810 1501  697]
# [3549 2112 1362]
# [ 801  905 5036]]
#             precision    recall  f1-score   support
#
#          1       0.53      0.69      0.60      7008
#          2       0.47      0.30      0.37      7023
#          3       0.71      0.75      0.73      6742
#
#avg / total       0.57      0.58      0.56     20773

## ANN - Multilayer Perceptron

In [44]:
from sklearn.neural_network import MLPClassifier
#mlpc = MLPClassifier(alpha=1)
mlpc = MLPClassifier(hidden_layer_sizes=(12, 12, 12), max_iter=1000)

#mlp = multilayer_perceptron(n_hidden =2, activation='logistic', algorithm='sgd', random_state=3)

In [45]:
mlpc.fit(X_train, Y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(12, 12, 12), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [46]:
y_pred = mlpc.predict(X_test)

In [47]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(Y_test, y_pred)
print(confusion_matrix)
print(classification_report(Y_test,y_pred))

[[ 3636  9127]
 [ 2886 14351]]
             precision    recall  f1-score   support

          0       0.56      0.28      0.38     12763
          1       0.61      0.83      0.70     17237

avg / total       0.59      0.60      0.57     30000



In [50]:

#check sigmoid and rbf
#from sklearn.ensemble import BaggingClassifier
#from sklearn.svm import SVC
#clf = BaggingClassifier(SVC(C=1.0,
#        cache_size=200,
#        class_weight=None,
#        coef0=0.0,
#        decision_function_shape=None,
#        degree=3,
#        gamma='auto',
#        kernel='linear',
#        max_iter=-1,
#        probability=False,
#        random_state=None,
#        shrinking=True,
#        tol=0.001,
#        verbose=False,
#        ))

In [49]:
#clf.fit(X_train, Y_train)