In [240]:
import numpy as np
#in case we need to repeat experiment
np.random.seed(255)

import pandas as pd
pd.options.display.max_rows = 22

import matplotlib.pyplot as plt
plt.style.use('classic')


import seaborn as sns
sns.set()

from sklearn.metrics import classification_report, confusion_matrix 

#Enable
enable_grid_search=False
#Use print instead of display when run as python script
pyscript = True

#Classifier verborsity where supported
verbose_level=4

import time

t_start =  time.time()
print(time.localtime( t_start ))
print(time.asctime( time.localtime(t_start) ))


time.struct_time(tm_year=2018, tm_mon=11, tm_mday=10, tm_hour=16, tm_min=33, tm_sec=27, tm_wday=5, tm_yday=314, tm_isdst=0)
Sat Nov 10 16:33:27 2018


In [241]:
#df = pd.read_csv('NCDB_2016.csv', engine = 'python')
#df = pd.read_csv('data01_simple.csv', engine = 'python')
df = pd.read_csv('data01_clean.csv', engine = 'python')
#df

In [242]:
print(df.isnull().sum().sum())

0


In [243]:
print(df[df.index.astype('str').str.contains('[^0-9]')].sum().sum())

0


In [244]:
df_cat = df.astype('category').copy()

In [245]:
df_int = df.astype('int').copy()

In [246]:
print(df.columns)

Index(['C_YEAR', 'C_MNTH', 'C_WDAY', 'C_HOUR', 'C_VEHS', 'C_CONF', 'C_RCFG',
       'C_WTHR', 'C_RSUR', 'C_RALN', 'C_TRAF', 'V_ID', 'V_TYPE', 'V_YEAR',
       'P_ID', 'P_SEX', 'P_AGE', 'P_PSN', 'P_SAFE', 'P_USER', 'P_ISEV'],
      dtype='object')


## Convert Class Variable to Binary

In [247]:
## Convert Class Variable to Binary
### Merge Injury and Fatality as a single class
### we will compare the results.
df_binary_class = df_cat.copy()

#perform the conversion in two steps to avoid any unwanted side effects
df_binary_class['P_ISEV'] = df_binary_class['P_ISEV'].map({1: 'safe', 2: 'injury', 3:'fatal'})
df_binary_class['P_ISEV'] = df_binary_class['P_ISEV'].map({'safe': '0', 'injury': '1', 'fatal':'1'})
print((df_binary_class['P_ISEV']=='0').sum())
print((df_binary_class['P_ISEV']=='1').sum())
print(df_binary_class['P_ISEV'].unique())

2568097
3395175
['0' '1']


In [248]:
df_100k = df_binary_class.sample(n=1000)

## Split Training and Testing for Binary class

In [249]:
#Split between data and class
Y = df_100k[df_binary_class.columns[-1]]
X = df_100k[df_binary_class.columns[0:df_binary_class.columns.size -1]]
#print(Xbinary, Ybinary)

#### Split Test(70%) and Train (30%) for Bianry class 

In [250]:
#sprint into train and test 70/30
#from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

In [251]:
#print(Xbinary_train, Xbinary_test, Ybinary_train, Ybinary_test)

## Write cleaned data to file for future use.

In [252]:
#lets write the datafile for future use
df_binary_class.to_csv('cleansimple_binary.csv', encoding='utf-8', index=False)

## Clustering based on K-Means Clustering

In [253]:
t_start =  time.time()
print(time.asctime( time.localtime(t_start) ))

from sklearn.cluster import KMeans
print("K-Means Clustering: Start")
kmeans = KMeans(n_clusters=3, init='random', n_init=10, max_iter=300, tol=1e-04, verbose= verbose_level)
print("K-Means Clustering: Build")
ykm = kmeans.fit(X)

Sat Nov 10 16:36:04 2018
K-Means Clustering: Start
K-Means Clustering: Build
Initialization complete
start iteration
done sorting
end inner loop
Iteration 0, inertia 1140638.3080342724
start iteration
done sorting
end inner loop
Iteration 1, inertia 1113120.0852103585
start iteration
done sorting
end inner loop
Iteration 2, inertia 1018072.3064360399
start iteration
done sorting
end inner loop
Iteration 3, inertia 560184.2254797958
start iteration
done sorting
end inner loop
Iteration 4, inertia 541542.2962123085
start iteration
done sorting
end inner loop
Iteration 5, inertia 536178.6835934527
start iteration
done sorting
end inner loop
Iteration 6, inertia 536162.872372387
start iteration
done sorting
end inner loop
Iteration 7, inertia 536162.872372387
center shift 0.000000e+00 within tolerance 7.146040e-03
Initialization complete
start iteration
done sorting
end inner loop
Iteration 0, inertia 1100964.9697269332
start iteration
done sorting
end inner loop
Iteration 1, inertia 10884

In [254]:
print(ykm.cluster_centers_)

[[2.00654340e+03 6.62673611e+00 4.10069444e+00 1.41024306e+01
  2.02430556e+00 2.23559028e+01 1.64930556e+00 1.61284722e+00
  1.55208333e+00 1.46180556e+00 1.16666667e+01 1.48437500e+00
  2.35416667e+00 2.00035243e+03 1.61458333e+00 5.59027778e-01
  2.37204861e+01 1.41319444e+01 2.32638889e+00 1.51041667e+00]
 [2.00783896e+03 6.94285714e+00 3.92207792e+00 1.37558442e+01
  1.98961039e+00 2.40597403e+01 1.72987013e+00 1.53506494e+00
  1.58441558e+00 1.42857143e+00 1.05012987e+01 1.54545455e+00
  2.53506494e+00 2.00167532e+03 1.24675325e+00 5.71428571e-01
  5.60285714e+01 1.23194805e+01 2.68311688e+00 1.36363636e+00]
 [2.00766667e+03 6.84615385e+00 4.43589744e+00 1.49230769e+01
  1.05128205e+00 8.53846154e+00 1.69230769e+00 1.51282051e+00
  1.46153846e+00 1.28205128e+00 1.06666667e+01 9.90000000e+01
  2.41025641e+00 2.01700000e+03 1.12820513e+00 4.10256410e-01
  3.77179487e+01 9.90000000e+01 7.23076923e+00 3.00000000e+00]]


In [255]:
if pyscript:
    print(ykm.labels_)
else:
    display(ykm.labels_)

[0 0 0 0 0 0 1 1 1 1 1 0 0 0 1 1 0 0 0 0 1 1 1 0 1 0 0 1 1 1 0 1 1 1 0 0 0
 1 0 0 0 1 1 0 1 0 1 0 0 1 1 1 0 0 1 0 0 0 1 0 1 1 1 0 1 1 2 0 1 0 0 0 1 1
 1 1 0 0 1 0 1 1 0 1 0 1 0 0 1 0 1 0 0 0 0 0 1 2 0 0 0 0 0 0 0 1 1 1 0 1 1
 1 1 0 1 1 0 1 1 0 1 1 1 0 0 0 0 0 0 0 0 1 1 1 0 1 1 0 0 0 0 1 0 1 0 0 1 0
 1 0 1 1 0 0 1 1 1 1 1 0 0 0 1 0 1 1 0 0 1 1 0 0 2 1 2 0 1 0 0 0 1 1 1 0 0
 0 0 1 1 0 0 1 0 0 1 1 0 0 1 0 1 1 0 1 0 0 1 1 0 0 0 1 2 0 1 0 0 1 0 0 0 1
 0 0 2 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 1 0 0 1 1 0 2 1 1 1 1 1 1 0
 0 0 0 1 2 1 1 0 1 0 1 0 0 1 1 0 0 0 1 0 1 0 0 2 0 0 0 1 1 1 0 0 1 1 1 1 0
 0 1 0 1 0 0 0 1 1 1 1 0 1 0 1 1 1 0 1 0 0 0 0 0 1 1 0 0 0 1 0 0 0 2 0 1 2
 1 0 0 0 0 1 1 0 0 0 1 0 1 1 2 1 0 0 0 0 0 1 0 0 0 1 1 0 1 0 0 1 1 1 1 0 1
 1 0 1 0 1 1 0 1 0 1 0 1 0 0 2 0 1 0 0 0 0 1 0 0 1 0 0 1 0 0 0 1 1 1 0 2 1
 0 1 0 1 1 0 1 0 0 1 1 1 0 0 2 1 0 1 0 0 0 1 0 1 1 1 1 0 1 0 0 1 0 0 1 0 0
 0 0 0 0 0 0 1 0 1 0 0 0 1 1 1 0 1 0 0 0 0 1 1 1 0 1 0 1 1 0 0 0 0 0 1 0 0
 0 0 1 0 1 0 0 1 0 0 0 0 

In [256]:
print("K-Means Clustering: End")
t_start =  time.time()
print(time.asctime( time.localtime(t_start) ))

K-Means Clustering: End
Sat Nov 10 16:36:05 2018


## Feature selection using Random Forest

### Feature Selection 

In [257]:
print("Random Forest Feature Selection: Start")
t_start =  time.time()
print(time.asctime( time.localtime(t_start) ))

from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1, verbose=verbose_level)
print("Random Forest Feature Selection: Fit Start")
forest.fit(X, Y)
print("Random Forest Feature Selection: Fit")

Random Forest Feature Selection: Start
Sat Nov 10 16:36:05 2018
Random Forest Feature Selection: Fit Start
building tree 1 of 1000
building tree 2 of 1000
building tree 3 of 1000building tree 4 of 1000
building tree 5 of 1000

building tree 6 of 1000
building tree 7 of 1000building tree 8 of 1000

building tree 9 of 1000
building tree 10 of 1000
building tree 11 of 1000building tree 12 of 1000

building tree 13 of 1000
building tree 14 of 1000building tree 15 of 1000

building tree 16 of 1000
building tree 17 of 1000building tree 18 of 1000

building tree 19 of 1000
building tree 20 of 1000
building tree 21 of 1000
building tree 22 of 1000building tree 23 of 1000

building tree 24 of 1000
building tree 25 of 1000
building tree 26 of 1000
building tree 27 of 1000
building tree 28 of 1000building tree 29 of 1000
building tree 30 of 1000
building tree 31 of 1000

building tree 32 of 1000
building tree 33 of 1000building tree 34 of 1000

building tree 35 of 1000
building tree 36 of 1000
bu

[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 205 tasks      | elapsed:    0.0s


building tree 265 of 1000building tree 266 of 1000building tree 267 of 1000

building tree 268 of 1000

building tree 269 of 1000
building tree 270 of 1000
building tree 271 of 1000
building tree 272 of 1000
building tree 273 of 1000building tree 274 of 1000
building tree 275 of 1000
building tree 276 of 1000

building tree 277 of 1000
building tree 278 of 1000
building tree 279 of 1000
building tree 280 of 1000
building tree 281 of 1000building tree 282 of 1000
building tree 283 of 1000
building tree 284 of 1000

building tree 285 of 1000
building tree 286 of 1000
building tree 287 of 1000building tree 288 of 1000
building tree 289 of 1000

building tree 290 of 1000
building tree 291 of 1000building tree 292 of 1000building tree 293 of 1000


building tree 294 of 1000
building tree 295 of 1000building tree 296 of 1000
building tree 297 of 1000

building tree 298 of 1000
building tree 299 of 1000
building tree 300 of 1000
building tree 301 of 1000
building tree 302 of 1000building tree

[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed:    0.3s


building tree 556 of 1000
building tree 557 of 1000
building tree 558 of 1000
building tree 559 of 1000
building tree 560 of 1000
building tree 561 of 1000
building tree 562 of 1000
building tree 563 of 1000
building tree 564 of 1000
building tree 565 of 1000
building tree 566 of 1000building tree 567 of 1000building tree 568 of 1000
building tree 569 of 1000
building tree 570 of 1000

building tree 571 of 1000building tree 572 of 1000


building tree 573 of 1000
building tree 574 of 1000building tree 575 of 1000

building tree 576 of 1000building tree 577 of 1000building tree 578 of 1000


building tree 579 of 1000
building tree 580 of 1000
building tree 581 of 1000
building tree 582 of 1000
building tree 583 of 1000building tree 584 of 1000

building tree 585 of 1000
building tree 586 of 1000
building tree 587 of 1000building tree 588 of 1000

building tree 589 of 1000
building tree 590 of 1000
building tree 591 of 1000building tree 592 of 1000

building tree 593 of 1000
building tre

[Parallel(n_jobs=-1)]: Done 866 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    0.5s finished


### Get the inportant features from random forest

In [258]:
importFeatures = forest.feature_importances_

### List the features by importancce

In [259]:
print("Random Forest Feature Selection: Feature Importance")
print(importFeatures)

Random Forest Feature Selection: Feature Importance
[0.0868152  0.07894292 0.0606192  0.09276565 0.04001943 0.07210561
 0.03075674 0.02957351 0.03394698 0.03030475 0.03353173 0.03399499
 0.02828146 0.09938225 0.02456707 0.03481421 0.10582149 0.03122756
 0.01680649 0.03572278]


In [260]:
indices = np.argsort(importFeatures)[::-1]
print(indices)
featureLabel = X.columns[0:]
print(featureLabel)
rankedFeature = []
for f in range(X.shape[1]):
    rankedFeature.append(featureLabel[indices[f]])
    print("%2d) %-*s %f" % (f+1, 30,  featureLabel[indices[f]], importFeatures[indices[f]]))
print(rankedFeature)

[16 13  3  0  1  5  2  4 19 15 11  8 10 17  6  9  7 12 14 18]
Index(['C_YEAR', 'C_MNTH', 'C_WDAY', 'C_HOUR', 'C_VEHS', 'C_CONF', 'C_RCFG',
       'C_WTHR', 'C_RSUR', 'C_RALN', 'C_TRAF', 'V_ID', 'V_TYPE', 'V_YEAR',
       'P_ID', 'P_SEX', 'P_AGE', 'P_PSN', 'P_SAFE', 'P_USER'],
      dtype='object')
 1) P_AGE                          0.105821
 2) V_YEAR                         0.099382
 3) C_HOUR                         0.092766
 4) C_YEAR                         0.086815
 5) C_MNTH                         0.078943
 6) C_CONF                         0.072106
 7) C_WDAY                         0.060619
 8) C_VEHS                         0.040019
 9) P_USER                         0.035723
10) P_SEX                          0.034814
11) V_ID                           0.033995
12) C_RSUR                         0.033947
13) C_TRAF                         0.033532
14) P_PSN                          0.031228
15) C_RCFG                         0.030757
16) C_RALN                         0.0303

### Reduce the number of features

In [261]:
#select features that contribute more than 0.05
#[df_cat.columns[0:df_cat.columns.size -1]]
X_Selected = X[rankedFeature[0:10]]

if pyscript:
    print(X_Selected)
else:
    display(X_Selected)

    print(X_Selected.shape)

t_end =  time.time()
print(time.asctime( time.localtime(t_end) ))

print("Random Forest Feature Selection: End")


        P_AGE V_YEAR C_HOUR C_YEAR C_MNTH C_CONF C_WDAY C_VEHS P_USER P_SEX
3564056    25   1993      8   2008     10     21      5      2      2     1
1569277     6   2002      8   2003      1     21      5      2      2     0
3834571    28   1989     19   2009      9      6      3      1      1     1
824407     16   1985     23   2001      2      6      5      1      2     0
3515329    12   2006     20   2008      8     35      7      2      2     1
3441590    31   1998     13   2008      6     35      2      6      1     1
5243339    59   2018     13   2014      6      4      7      1      5     1
4698727    55   2000      7   2012      8     36      2      2      1     1
484740     64   1991     10   2000      4     21      4      2      1     1
2112369    67   1993     11   2004      7      3      6      1      2     0
4270477    68   2018      9   2011      2     36      4      2      1     1
...       ...    ...    ...    ...    ...    ...    ...    ...    ...   ...
5915449    7

In [262]:
print("Split Test and Train based on Selected Features")
#sprint into train and test 70/30
#from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_Selected, Y, test_size=0.3, random_state=0)

Split Test and Train based on Selected Features


### SVM GridSearch for Optimal Parms

In [263]:
#This operation is computationaly expensive.
#Enable as required.
enable_grid_search = False
if enable_grid_search:
    
    t_start =  time.time()
    print(time.asctime( time.localtime(t_start) ))
    
    from sklearn.grid_search import GridSearchCV
    from sklearn.svm import SVC
    param_grid = {'C':[0.1, 1, 10, 100, 1000], 'gamma':[1, 0.1, 0.01, 0.001, 0.0001]}
    grid = GridSearchCV(SVC(), param_grid, verbose=verbose_level)
    grid.fit(X_train, Y_train)
    print(grid.best_params_)
    #{'C': 1000, 'gamma': 0.001}
    print(grid.best_estimator_)
    #SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
    #  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    #  max_iter=-1, probability=False, random_state=None, shrinking=True,
    #  tol=0.001, verbose=False)
    grid_predictions = grid.predict(X_test)
    from sklearn.metrics import confusion_matrix
    confusion_matrix = confusion_matrix(Y_test, grid_predictions)
    print(confusion_matrix)
    print(classification_report(Y_test,grid_predictions))
    
    t_end =  time.time()
    print(time.asctime( time.localtime(t_end) ))

## Logistic Regression Model

In [264]:
print("Logistic Regression: Start")
t_start =  time.time()
print(time.asctime( time.localtime(t_start) ))
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=1, random_state=0, verbose=verbose_level)
print("Logistic Regression: Fit")
lr.fit(X_train, Y_train)

Logistic Regression: Start
Sat Nov 10 16:36:06 2018
Logistic Regression: Fit
[LibLinear]

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=4, warm_start=False)

In [265]:
print("Logistic Regression: Predict")
y_pred = lr.predict(X_test)
#display(y_pred)
print('Accuracy of logistic regression classifier on train set: {:.2f}'.format(lr.score(X_train, Y_train)))
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(lr.score(X_test, Y_test)))

Logistic Regression: Predict
Accuracy of logistic regression classifier on train set: 0.62
Accuracy of logistic regression classifier on test set: 0.64


In [266]:
# print the intercept (Note: one vs rest => 1 vs 2and3, 2 vs 1and3, 3 vs 1and2)
print("Logistic Regression: Intercept")
print(lr.intercept_)

Logistic Regression: Intercept
[-0.00067669]


In [267]:
# print the coeficients (Note: one vs rest => 1 vs 2and3, 2 vs 1and3, 3 vs 1and2)
print("Logistic Regression: Coefficients")
print(lr.coef_)

Logistic Regression: Coefficients
[[ 0.00654502 -0.01637499 -0.03628983  0.01705674 -0.03262581 -0.02070459
  -0.02963865 -0.30113814  0.57214066 -0.53699403]]


In [268]:
from sklearn.metrics import confusion_matrix
print("Logistic Regression: Confusion Matrix")
confusion_matrix = confusion_matrix(Y_test, y_pred)
print(confusion_matrix)
print("Logistic Regression: Classification Report")
print(classification_report(Y_test,y_pred))
t_end =  time.time()
print(time.asctime( time.localtime(t_end) ))
print("Logistic Regression: End")

Logistic Regression: Confusion Matrix
[[ 59  69]
 [ 40 132]]
Logistic Regression: Classification Report
             precision    recall  f1-score   support

          0       0.60      0.46      0.52       128
          1       0.66      0.77      0.71       172

avg / total       0.63      0.64      0.63       300

Sat Nov 10 16:36:06 2018
Logistic Regression: End


In [269]:
# with L1 regularization
print("Logistic Regression with L1 Regularization: Start")
t_start =  time.time()
print(time.asctime( time.localtime(t_start) ))
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(penalty='l1', C=1000, random_state=0, verbose=verbose_level)
print("Logistic Regression with L1 Regularization: Fit")
lr.fit(X_train, Y_train)

Logistic Regression with L1 Regularization: Start
Sat Nov 10 16:36:06 2018
Logistic Regression with L1 Regularization: Fit
[LibLinear]



LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=0, solver='liblinear', tol=0.0001,
          verbose=4, warm_start=False)

In [270]:
print("Logistic Regression with L1 Regularization: Predict")
y_pred = lr.predict(X_test)
print('Accuracy of logistic regression classifier on train set: {:.2f}'.format(lr.score(X_train, Y_train)))
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(lr.score(X_test, Y_test)))
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(Y_test, y_pred)
print("Logistic Regression with L1 Regularization: Confusion Matrix")
print(confusion_matrix)
print(classification_report(Y_test,y_pred))
print("Logistic Regression with L1 Regularization: Classification Report")
t_end =  time.time()
print(time.asctime( time.localtime(t_end) ))
print("Logistic Regression with L1 Regularization: End")

Logistic Regression with L1 Regularization: Predict
Accuracy of logistic regression classifier on train set: 0.63
Accuracy of logistic regression classifier on test set: 0.64
Logistic Regression with L1 Regularization: Confusion Matrix
[[ 59  69]
 [ 38 134]]
             precision    recall  f1-score   support

          0       0.61      0.46      0.52       128
          1       0.66      0.78      0.71       172

avg / total       0.64      0.64      0.63       300

Logistic Regression with L1 Regularization: Classification Report
Sat Nov 10 16:36:06 2018
Logistic Regression with L1 Regularization: End


### Decision Tree

In [271]:
print("Decision Tree: Start")
t_start =  time.time()
print(time.asctime( time.localtime(t_start) ))
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(criterion='entropy',max_depth=100, random_state=0)
print("Decision Tree: Fit")
tree.fit(X_train, Y_train)

Decision Tree: Start
Sat Nov 10 16:36:06 2018
Decision Tree: Fit


DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=100,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [272]:
print("Decision Tree: Predict")
y_pred = tree.predict(X_test)
print('Accuracy of Decision Tree classifier on train set: {:.2f}'.format(tree.score(X_train, Y_train)))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format(tree.score(X_test, Y_test)))

Decision Tree: Predict
Accuracy of Decision Tree classifier on train set: 1.00
Accuracy of Decision Tree classifier on test set: 0.61


In [273]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(Y_test, y_pred)
print("Decision Tree: Confusion Matrix")
print(confusion_matrix)
print("Decision Tree: Classification Report")
print(classification_report(Y_test,y_pred))
t_end =  time.time()
print(time.asctime( time.localtime(t_end) ))
print("Decision Tree: End")

Decision Tree: Confusion Matrix
[[ 65  63]
 [ 55 117]]
Decision Tree: Classification Report
             precision    recall  f1-score   support

          0       0.54      0.51      0.52       128
          1       0.65      0.68      0.66       172

avg / total       0.60      0.61      0.60       300

Sat Nov 10 16:36:06 2018
Decision Tree: End


### Random forest

In [274]:
print("Ensemble (Bagging): Random Forest: Start")
t_start =  time.time()
print(time.asctime( time.localtime(t_start) ))
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(criterion='entropy', n_estimators=1000, random_state=0, n_jobs=2, verbose=verbose_level)
print("Ensemble (Bagging): Random Forest: Fit")
forest.fit(X_train, Y_train)

Ensemble (Bagging): Random Forest: Start
Sat Nov 10 16:36:06 2018
Ensemble (Bagging): Random Forest: Fit
building tree 1 of 1000
building tree 2 of 1000
building tree 3 of 1000
building tree 4 of 1000
building tree 5 of 1000
building tree 6 of 1000
building tree 7 of 1000
building tree 8 of 1000
building tree 9 of 1000
building tree 10 of 1000
building tree 11 of 1000
building tree 12 of 1000
building tree 13 of 1000building tree 14 of 1000

building tree 15 of 1000
building tree 16 of 1000
building tree 17 of 1000
building tree 18 of 1000
building tree 19 of 1000
building tree 20 of 1000
building tree 21 of 1000
building tree 22 of 1000
building tree 23 of 1000
building tree 24 of 1000
building tree 25 of 1000
building tree 26 of 1000
building tree 27 of 1000
building tree 28 of 1000
building tree 29 of 1000building tree 30 of 1000

building tree 31 of 1000
building tree 32 of 1000
building tree 33 of 1000
building tree 34 of 1000
building tree 35 of 1000building tree 36 of 1000

buil

[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done  94 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 217 tasks      | elapsed:    0.1s


building tree 242 of 1000
building tree 243 of 1000
building tree 244 of 1000
building tree 245 of 1000building tree 246 of 1000

building tree 247 of 1000
building tree 248 of 1000
building tree 249 of 1000building tree 250 of 1000

building tree 251 of 1000
building tree 252 of 1000
building tree 253 of 1000
building tree 254 of 1000
building tree 255 of 1000
building tree 256 of 1000
building tree 257 of 1000
building tree 258 of 1000
building tree 259 of 1000
building tree 260 of 1000
building tree 261 of 1000
building tree 262 of 1000
building tree 263 of 1000
building tree 264 of 1000
building tree 265 of 1000
building tree 266 of 1000
building tree 267 of 1000
building tree 268 of 1000
building tree 269 of 1000
building tree 270 of 1000
building tree 271 of 1000
building tree 272 of 1000
building tree 273 of 1000
building tree 274 of 1000
building tree 275 of 1000
building tree 276 of 1000
building tree 277 of 1000
building tree 278 of 1000
building tree 279 of 1000
building tre

[Parallel(n_jobs=2)]: Done 388 tasks      | elapsed:    0.2s
[Parallel(n_jobs=2)]: Done 609 tasks      | elapsed:    0.4s



building tree 502 of 1000
building tree 503 of 1000
building tree 504 of 1000
building tree 505 of 1000
building tree 506 of 1000
building tree 507 of 1000
building tree 508 of 1000
building tree 509 of 1000
building tree 510 of 1000
building tree 511 of 1000
building tree 512 of 1000
building tree 513 of 1000
building tree 514 of 1000
building tree 515 of 1000
building tree 516 of 1000
building tree 517 of 1000
building tree 518 of 1000
building tree 519 of 1000
building tree 520 of 1000
building tree 521 of 1000building tree 522 of 1000

building tree 523 of 1000
building tree 524 of 1000
building tree 525 of 1000building tree 526 of 1000

building tree 527 of 1000
building tree 528 of 1000
building tree 529 of 1000building tree 530 of 1000

building tree 531 of 1000
building tree 532 of 1000
building tree 533 of 1000
building tree 534 of 1000
building tree 535 of 1000
building tree 536 of 1000
building tree 537 of 1000
building tree 538 of 1000
building tree 539 of 1000
building tr

building tree 988 of 1000
building tree 989 of 1000
building tree 990 of 1000
building tree 991 of 1000
building tree 992 of 1000
building tree 993 of 1000
building tree 994 of 1000
building tree 995 of 1000building tree 996 of 1000

building tree 997 of 1000
building tree 998 of 1000
building tree 999 of 1000building tree 1000 of 1000



[Parallel(n_jobs=2)]: Done 878 tasks      | elapsed:    0.6s
[Parallel(n_jobs=2)]: Done 1000 out of 1000 | elapsed:    0.6s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=2,
            oob_score=False, random_state=0, verbose=4, warm_start=False)

In [275]:
print("Ensemble (Bagging): Random Forest: Predict")
y_pred = forest.predict(X_test)
print('Accuracy of RandomForest classifier on train set: {:.2f}'.format(forest.score(X_train, Y_train)))
print('Accuracy of RandomForest classifier on test set: {:.2f}'.format(forest.score(X_test, Y_test)))

[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done  94 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 217 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 388 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 609 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 878 tasks      | elapsed:    0.0s


Ensemble (Bagging): Random Forest: Predict


[Parallel(n_jobs=2)]: Done 1000 out of 1000 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done  94 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 217 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 388 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 609 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 878 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 1000 out of 1000 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done  94 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 217 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 388 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 609 tasks      | elapsed:    0.0s


Accuracy of RandomForest classifier on train set: 1.00
Accuracy of RandomForest classifier on test set: 0.60


[Parallel(n_jobs=2)]: Done 878 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 1000 out of 1000 | elapsed:    0.0s finished


In [276]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(Y_test, y_pred)
print("Ensemble (Bagging): Random Forest: Confusion Matrix")
print(confusion_matrix)
print("Ensemble (Bagging): Random Forest: Classification Report")
print(classification_report(Y_test,y_pred))
t_start =  time.time()
print(time.asctime( time.localtime(t_start) ))
print("Ensemble (Bagging): Random Forest: End")

Ensemble (Bagging): Random Forest: Confusion Matrix
[[ 57  71]
 [ 48 124]]
Ensemble (Bagging): Random Forest: Classification Report
             precision    recall  f1-score   support

          0       0.54      0.45      0.49       128
          1       0.64      0.72      0.68       172

avg / total       0.60      0.60      0.60       300

Sat Nov 10 16:36:08 2018
Ensemble (Bagging): Random Forest: End


### K-N-N

In [277]:
print("KNN: Start")
t_start =  time.time()
print(time.asctime( time.localtime(t_start) ))
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
print("KNN: Fit")
knn.fit(X_train, Y_train)

KNN: Start
Sat Nov 10 16:36:08 2018
KNN: Fit


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [278]:
print("KNN: Predict")
y_pred = knn.predict(X_test)
print('Accuracy of KNN classifier on train set: {:.2f}'.format(knn.score(X_train, Y_train)))
print('Accuracy of KNN classifier on test set: {:.2f}'.format(knn.score(X_test, Y_test)))

KNN: Predict
Accuracy of KNN classifier on train set: 0.73
Accuracy of KNN classifier on test set: 0.56


In [279]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(Y_test, y_pred)
print("KNN: Confusion Matrix")
print(confusion_matrix)
print("KNN: Classification Report")
print(classification_report(Y_test,y_pred))
t_end =  time.time()
print(time.asctime( time.localtime(t_end) ))
print("KNN: End")

KNN: Confusion Matrix
[[ 48  80]
 [ 53 119]]
KNN: Classification Report
             precision    recall  f1-score   support

          0       0.48      0.38      0.42       128
          1       0.60      0.69      0.64       172

avg / total       0.55      0.56      0.55       300

Sat Nov 10 16:36:08 2018
KNN: End


### SVM

In [280]:
print("SVM: Start")
t_start =  time.time()
print(time.asctime( time.localtime(t_start) ))
from sklearn.svm import SVC
#svm = SVC(C=1, random_state=0, kernel='sigmoid', verbose=True)
#svm = SVC(C=1, random_state=0, kernel='linear', verbose=True, cache_size=200)
svm = SVC(verbose = verbose_level)
print("SVM: Fit")
svm.fit(X_train, Y_train)

SVM: Start
Sat Nov 10 16:36:08 2018
SVM: Fit
[LibSVM]

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=4)

In [281]:
print("SVM: Predict")
y_pred = svm.predict(X_test)
print('Accuracy of SVM classifier on train set: {:.2f}'.format(svm.score(X_train, Y_train)))
print('Accuracy of SVM classifier on test set: {:.2f}'.format(svm.score(X_test, Y_test)))

SVM: Predict
Accuracy of SVM classifier on train set: 1.00
Accuracy of SVM classifier on test set: 0.58


In [282]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(Y_test, y_pred)
print("SVM: Confusion Matrix")
print(confusion_matrix)
print("SVM: Classfication Report")
print(classification_report(Y_test,y_pred))
t_end =  time.time()
print(time.asctime( time.localtime(t_end) ))
print("SVM: End")

SVM: Confusion Matrix
[[  3 125]
 [  1 171]]
SVM: Classfication Report
             precision    recall  f1-score   support

          0       0.75      0.02      0.05       128
          1       0.58      0.99      0.73       172

avg / total       0.65      0.58      0.44       300

Sat Nov 10 16:36:08 2018
SVM: End


### Performance Tuning using GridSearch

In [283]:
enable_grid_search = True
if enable_grid_search:
    print("Grid Search (SVM): Start")
    t_start =  time.time()
    print(time.asctime( time.localtime(t_start) ))
    from sklearn.grid_search import GridSearchCV
    from sklearn.svm import SVC
    param_grid = {'C':[0.1, 1, 10, 100, 1000], 'gamma':[1, 0.1, 0.01, 0.001, 0.0001]}
    grid = GridSearchCV(SVC(), param_grid, verbose=verbose_level)
    print("Grid Search (SVM): Fit")
    grid.fit(X_train, Y_train)
    grid.best_params_
    #Result: {'C': 1000, 'gamma': 0.001}
    grid.best_estimator_
        #SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
        #decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
        #max_iter=-1, probability=False, random_state=None, shrinking=True,
        #tol=0.001, verbose=False)
    print("Grid Search (SVM): Predict")
    grid_predictions = grid.predict(X_test)
    from sklearn.metrics import confusion_matrix
    confusion_matrix = confusion_matrix(Y_test, grid_predictions)
    print("Grid Search (SVM): Confusion Matrix")
    print(confusion_matrix)
    print("Grid Search (SVM): Classification Report")
    print(classification_report(Y_test,grid_predictions))
    t_end =  time.time()
    print(time.asctime( time.localtime(t_end) ))
    print("Grid Search (SVM): End")
    
    
#[[4810 1501  697]
# [3549 2112 1362]
# [ 801  905 5036]]
#             precision    recall  f1-score   support
#
#          1       0.53      0.69      0.60      7008
#          2       0.47      0.30      0.37      7023
#          3       0.71      0.75      0.73      6742
#
#avg / total       0.57      0.58      0.56     20773

Grid Search (SVM): Start
Sat Nov 10 16:36:08 2018
Grid Search (SVM): Fit
Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] C=0.1, gamma=1 ..................................................
[CV] ......................... C=0.1, gamma=1, score=0.572650 -   0.0s
[CV] C=0.1, gamma=1 ..................................................
[CV] ......................... C=0.1, gamma=1, score=0.572650 -   0.0s
[CV] C=0.1, gamma=1 ..................................................
[CV] ......................... C=0.1, gamma=1, score=0.573276 -   0.0s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ....................... C=0.1, gamma=0.1, score=0.572650 -   0.0s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ....................... C=0.1, gamma=0.1, score=0.572650 -   0.0s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ....................... C=0.1, gamma=0.1, score=0.573276 -   0.0s
[CV] C=0.1, ga

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s


[CV] ..................... C=0.1, gamma=0.001, score=0.573276 -   0.0s
[CV] C=0.1, gamma=0.0001 .............................................
[CV] .................... C=0.1, gamma=0.0001, score=0.572650 -   0.0s
[CV] C=0.1, gamma=0.0001 .............................................
[CV] .................... C=0.1, gamma=0.0001, score=0.572650 -   0.0s
[CV] C=0.1, gamma=0.0001 .............................................
[CV] .................... C=0.1, gamma=0.0001, score=0.573276 -   0.0s
[CV] C=1, gamma=1 ....................................................
[CV] ........................... C=1, gamma=1, score=0.572650 -   0.0s
[CV] C=1, gamma=1 ....................................................
[CV] ........................... C=1, gamma=1, score=0.572650 -   0.0s
[CV] C=1, gamma=1 ....................................................
[CV] ........................... C=1, gamma=1, score=0.573276 -   0.0s
[CV] C=1, gamma=0.1 ..................................................
[CV] .

[CV] .................... C=1000, gamma=0.001, score=0.512821 -   0.0s
[CV] C=1000, gamma=0.001 .............................................
[CV] .................... C=1000, gamma=0.001, score=0.529915 -   0.0s
[CV] C=1000, gamma=0.001 .............................................
[CV] .................... C=1000, gamma=0.001, score=0.543103 -   0.0s
[CV] C=1000, gamma=0.0001 ............................................
[CV] ................... C=1000, gamma=0.0001, score=0.611111 -   0.0s
[CV] C=1000, gamma=0.0001 ............................................
[CV] ................... C=1000, gamma=0.0001, score=0.581197 -   0.0s
[CV] C=1000, gamma=0.0001 ............................................
[CV] ................... C=1000, gamma=0.0001, score=0.594828 -   0.0s
Grid Search (SVM): Predict
Grid Search (SVM): Confusion Matrix
[[ 70  58]
 [ 49 123]]
Grid Search (SVM): Classification Report
             precision    recall  f1-score   support

          0       0.59      0.55      

[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:    1.5s finished


## ANN - Multilayer Perceptron

In [284]:
print("Multilayer Preceptron: Start")
t_start =  time.time()
print(time.asctime( time.localtime(t_start) ))
from sklearn.neural_network import MLPClassifier
#mlpc = MLPClassifier(alpha=1)
mlpc = MLPClassifier(hidden_layer_sizes=(12, 12, 12), max_iter=1000, verbose=verbose_level)

#mlp = multilayer_perceptron(n_hidden =2, activation='logistic', algorithm='sgd', random_state=3)
print("Multilayer Preceptron: fit")
mlpc.fit(X_train, Y_train)

Multilayer Preceptron: Start
Sat Nov 10 16:36:10 2018
Multilayer Preceptron: fit
Iteration 1, loss = 13.19053433
Iteration 2, loss = 13.19053421
Iteration 3, loss = 13.19053408
Iteration 4, loss = 13.19053394
Training loss did not improve more than tol=0.000100 for two consecutive epochs. Stopping.


MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(12, 12, 12), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=4, warm_start=False)

In [285]:
print("Multilayer Preceptron: Predict")
y_pred = mlpc.predict(X_test)
print('Accuracy of Multilayer Perceptron classifier on train set: {:.2f}'.format(mlpc.score(X_train, Y_train)))
print('Accuracy of Multilayer Perceptron classifier on test set: {:.2f}'.format(mlpc.score(X_test, Y_test)))

Multilayer Preceptron: Predict
Accuracy of Multilayer Perceptron classifier on train set: 0.43
Accuracy of Multilayer Perceptron classifier on test set: 0.43


In [286]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(Y_test, y_pred)
print("Multilayer Preceptron: Confusion Matrix")
print(confusion_matrix)
print("Multilayer Preceptron: Classificiation Report")
print(classification_report(Y_test,y_pred))
t_end =  time.time()
print(time.asctime( time.localtime(t_end) ))
print("Multilayer Preceptron: End")

Multilayer Preceptron: Confusion Matrix
[[128   0]
 [172   0]]
Multilayer Preceptron: Classificiation Report
             precision    recall  f1-score   support

          0       0.43      1.00      0.60       128
          1       0.00      0.00      0.00       172

avg / total       0.18      0.43      0.26       300

Sat Nov 10 16:36:10 2018
Multilayer Preceptron: End


  'precision', 'predicted', average, warn_for)


In [287]:

#check sigmoid and rbf
#from sklearn.ensemble import BaggingClassifier
#from sklearn.svm import SVC
#clf = BaggingClassifier(SVC(C=1.0,
#        cache_size=200,
#        class_weight=None,
#        coef0=0.0,
#        decision_function_shape=None,
#        degree=3,
#        gamma='auto',
#        kernel='linear',
#        max_iter=-1,
#        probability=False,
#        random_state=None,
#        shrinking=True,
#        tol=0.001,
#        verbose=False,
#        ))

In [288]:
#clf.fit(X_train, Y_train)

In [289]:
t_end =  time.time()
print(time.asctime( time.localtime(t_end) ))

Sat Nov 10 16:36:10 2018
