In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

import xgboost as xgb

import os

In [2]:
DATA_DIR = os.path.join(os.getcwd(), '../data/safety/total_df.csv')

cols = ['acceleration_mean', 'acceleration_median', 'acceleration_std',
       'acceleration_spread', 'gyro_pc_mean', 'gyro_pc_median', 'gyro_pc_std',
       'gyro_pc_spread', 'speed_mean', 'speed_median', 'speed_std',
       'speed_spread', 'second_mean', 'second_median', 'second_std',
       'second_spread', 'over_speed', 'over_second', 'over_acceleration_x',
       'over_acceleration_y', 'over_acceleration_z', 'over_gyro_x',
       'over_gyro_y', 'over_gyro_z', 'label']

agg_diff_df = pd.read_csv(DATA_DIR)
agg_diff_df = agg_diff_df[cols]
agg_diff_df.head(n=10)

Unnamed: 0,acceleration_mean,acceleration_median,acceleration_std,acceleration_spread,gyro_pc_mean,gyro_pc_median,gyro_pc_std,gyro_pc_spread,speed_mean,speed_median,...,second_spread,over_speed,over_second,over_acceleration_x,over_acceleration_y,over_acceleration_z,over_gyro_x,over_gyro_y,over_gyro_z,label
0,9.883337,9.852269,0.619492,6.530989,-0.006583,-0.002863,0.099002,1.101352,9.003204,8.503366,...,1589.0,247,687,709,1002,801,570,581,614,0
1,9.865608,9.847932,0.522142,5.819621,-0.006855,-0.003612,0.09077,1.123587,8.019369,7.206634,...,1034.0,157,168,483,270,790,230,404,347,1
2,9.92959,9.877755,0.515173,5.168422,-0.012751,0.001369,0.117109,0.896289,3.157213,2.998761,...,825.0,0,0,102,143,34,67,103,71,1
3,9.813434,9.791035,0.620066,13.349284,0.022429,0.024239,0.112628,1.166471,6.150996,3.31,...,1094.0,122,232,389,1094,1072,531,735,753,1
4,9.91809,9.904142,0.585346,7.280114,0.00048,0.004189,0.106469,1.161481,4.628921,1.936962,...,1094.0,18,232,463,407,257,533,450,561,0
5,9.82647,9.7898,0.916836,8.572037,0.002651,-0.002687,0.072664,0.903745,12.176386,13.017325,...,959.0,365,97,809,262,219,561,490,687,0
6,9.763213,9.646309,0.730155,9.416841,-0.00084,0.00025,0.078446,0.75418,5.38426,3.54,...,462.0,25,0,357,94,36,157,159,82,0
7,9.550677,9.49439,0.833292,9.474737,0.001922,-0.000612,0.110181,0.909695,8.702027,9.58,...,374.0,90,0,300,71,307,90,111,149,0
8,9.948639,9.877962,0.75048,5.686104,-0.004018,-0.003111,0.15198,0.988519,6.659024,5.192059,...,299.0,14,0,129,120,44,147,154,141,0
9,9.873517,9.823053,0.425662,5.916028,-0.002192,0.000388,0.082987,0.767631,4.152211,3.702154,...,555.0,1,0,155,441,10,349,207,374,0


In [3]:
# Checking for missing values
agg_diff_df.isnull().sum()

acceleration_mean      0
acceleration_median    0
acceleration_std       0
acceleration_spread    0
gyro_pc_mean           0
gyro_pc_median         0
gyro_pc_std            0
gyro_pc_spread         0
speed_mean             0
speed_median           0
speed_std              0
speed_spread           0
second_mean            0
second_median          0
second_std             0
second_spread          0
over_speed             0
over_second            0
over_acceleration_x    0
over_acceleration_y    0
over_acceleration_z    0
over_gyro_x            0
over_gyro_y            0
over_gyro_z            0
label                  0
dtype: int64

### Data preparation for modelling on agg_diff_df

In [4]:
seed = 199
np.random.seed(seed)

X_train, X_test, y_train, y_test = train_test_split(agg_diff_df.drop('label', axis='columns'),
                                                   agg_diff_df['label'], test_size=0.2, shuffle=True,
                                                   random_state=seed)

# Checking shapes of new arrays
print("X_train: {}, y_train: {}".format(X_train.shape, y_train.shape))
print("X_test: {}, y_test: {}".format(X_test.shape, y_test.shape))

# Checking distribution of classes in train and test sets
print("Train distribution: \n {}".format(y_train.value_counts()))
print("Test distribution: \n {}".format(y_test.value_counts()))

X_train: (15947, 24), y_train: (15947,)
X_test: (3987, 24), y_test: (3987,)
Train distribution: 
 0    11996
1     3951
Name: label, dtype: int64
Test distribution: 
 0    2976
1    1011
Name: label, dtype: int64


### (failed) attempt at stacking classifiers

In [5]:
model_dict = { 'logistic' : LogisticRegression(), 
              'decision_tree' : DecisionTreeClassifier(criterion='gini'), 
              'random_forest' : RandomForestClassifier(max_features='sqrt'), 
              'adaboost' : AdaBoostClassifier(), 
              'naive_bayes' : GaussianNB()}
#               'svc': SVC(kernel='rbf')}

In [6]:
model_params = {'logistic' : { 'C' : [1e-3, 1e-2, 0.1, 1, 10],
                              'solver' : ['liblinear', 'saga']} , 
                'decision_tree' : { 'max_depth' : [3, 4, 5, 6, 7]} , 
                'random_forest' : { 'n_estimators' : [50, 100, 150, 200],
                                   'max_depth' : [None, 4, 5, 6, 7, 8, 9, 10]} , 
                'adaboost' : { 'n_estimators' : [25, 50, 100, 200], 
                              'learning_rate' : [1e-3, 1e-2, 0.1, 1]} , 
                'naive_bayes' : {'var_smoothing': [1e-11, 1e-10, 1e-09, 1e-08, 1e-7]}}
#                 'svc': { 'gamma': [0.1, 1, 10, 100], 
#                         'C': [0.1, 1, 10, 100, 1000]} }

In [7]:
import datetime
from sklearn.model_selection import RandomizedSearchCV

best_models = {}
start_time = datetime.datetime.now()
num_iter = 5
seed = 199

for model in model_dict.keys():
    print("Model : {}".format(model))
    
    best_match = {}
    
    for i in range(num_iter):
        print('Iteration', i + 1)
        
        # randomized search through the hyperparameters grid
        random_grid = RandomizedSearchCV(estimator = model_dict[model], 
                                         param_distributions=model_params[model], 
                                         scoring='roc_auc', n_iter=num_iter, cv=5, n_jobs=-1,
                                         random_state=seed)
        random_grid.fit(X_train, y_train)
        
        # update the best model's hyperparameters and columns used
        if i == 0:
            best_match['model'] = model
            best_match['hyperparams'] = random_grid.best_params_
            best_match['roc'] = random_grid.best_score_
        elif random_grid.best_score_ > best_match['roc']:
            best_match['model'] = model
            best_match['hyperparams'] = random_grid.best_params_
            best_match['roc'] = random_grid.best_score_
    
    best_models[model] = best_match

end_time = datetime.datetime.now()
print('Total time : {} seconds'.format((end_time - start_time).seconds))

Model : logistic
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Model : decision_tree
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Model : random_forest
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Model : adaboost
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Model : naive_bayes
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Total time : 308 seconds


In [8]:
best_models

{'logistic': {'model': 'logistic',
  'hyperparams': {'solver': 'liblinear', 'C': 10},
  'roc': 0.7191762373790253},
 'decision_tree': {'model': 'decision_tree',
  'hyperparams': {'max_depth': 7},
  'roc': 0.7012450439298445},
 'random_forest': {'model': 'random_forest',
  'hyperparams': {'n_estimators': 150, 'max_depth': 10},
  'roc': 0.7260615978614656},
 'adaboost': {'model': 'adaboost',
  'hyperparams': {'n_estimators': 100, 'learning_rate': 1},
  'roc': 0.716732347985426},
 'naive_bayes': {'model': 'naive_bayes',
  'hyperparams': {'var_smoothing': 1e-07},
  'roc': 0.686161060764147}}

In [9]:
logistic = LogisticRegression(**best_models['logistic']['hyperparams'])
logistic.fit(X_train, y_train)
pred_logistic = logistic.predict_proba(X_test)
print("Test ROC : {}".format(roc_auc_score(y_test, logistic.predict(X_test))))

d_tree = DecisionTreeClassifier(**best_models['decision_tree']['hyperparams'], criterion='gini')
d_tree.fit(X_train, y_train)
pred_d_tree = d_tree.predict_proba(X_test)
print("Test ROC : {}".format(roc_auc_score(y_test, d_tree.predict(X_test))))

r_forest = RandomForestClassifier(**best_models['random_forest']['hyperparams'], max_features='sqrt')
r_forest.fit(X_train, y_train)
pred_r_forest = r_forest.predict_proba(X_test)
print("Test ROC : {}".format(roc_auc_score(y_test, r_forest.predict(X_test))))

adaboost = AdaBoostClassifier(**best_models['adaboost']['hyperparams'])
adaboost.fit(X_train, y_train)
pred_adaboost = adaboost.predict_proba(X_test)
print("Test ROC : {}".format(roc_auc_score(y_test, adaboost.predict(X_test))))

n_bayes = GaussianNB(**best_models['naive_bayes']['hyperparams'])
n_bayes.fit(X_train, y_train)
pred_n_bayes = n_bayes.predict_proba(X_test)
print("Test ROC : {}".format(roc_auc_score(y_test, n_bayes.predict(X_test))))

# svc = SVC(**best_models['svc']['hyperparams'], kernel='rbf')
# svc.fit(X_train, y_train)
# # pred_svc = svc.predict_proba(X_test)
# print("Test ROC : {}".format(roc_auc_score(y_test, svc.predict(X_test))))

Test ROC : 0.5899410915414314
Test ROC : 0.579197875785712
Test ROC : 0.5774753916594875
Test ROC : 0.596237755655531
Test ROC : 0.6250633156249003


In [10]:
pred_array = np.transpose(np.vstack((pred_logistic[:, 1], pred_d_tree[:, 1], 
                                     pred_r_forest[:, 1], pred_adaboost[:, 1], 
                                     pred_n_bayes[:, 1])))

pred_df = pd.DataFrame(pred_array, columns=['logistic', 'd_tree', 
                                            'r_forest', 'adaboost', 
                                            'n_bayes'])

pred_df.corr()

Unnamed: 0,logistic,d_tree,r_forest,adaboost,n_bayes
logistic,1.0,0.80387,0.917892,0.467567,0.694001
d_tree,0.80387,1.0,0.867444,0.425781,0.599486
r_forest,0.917892,0.867444,1.0,0.490787,0.688983
adaboost,0.467567,0.425781,0.490787,1.0,0.277433
n_bayes,0.694001,0.599486,0.688983,0.277433,1.0


### Using tree-based models to determine feature importance

In [11]:
ada_imp = pd.DataFrame({'feature' : X_train.columns,
                            'importance' : adaboost.feature_importances_})
ada_imp = ada_imp.sort_values(by='importance', ascending=False)

ada_imp.head(n=10)

Unnamed: 0,feature,importance
13,second_median,0.11
7,gyro_pc_spread,0.1
15,second_spread,0.09
6,gyro_pc_std,0.08
11,speed_spread,0.07
8,speed_mean,0.07
9,speed_median,0.06
14,second_std,0.05
12,second_mean,0.04
23,over_gyro_z,0.04


In [13]:
import xgboost as xgb

# params : eta, max_depth, max_leaf_nodes, gamma, colsample_bytree, lambda, alpha

xgboost = xgb.XGBClassifier(objective='binary:logistic', eval_metric='auc', seed=199,
                           max_depth=5)
xgboost.fit(X_train, y_train)

xg_pred = xgboost.predict(X_test)
print(roc_auc_score(y_test, xg_pred))

xg_imp = pd.DataFrame({'feature' : X_train.columns,
                            'importance' : xgboost.feature_importances_})
xg_imp = xg_imp.sort_values(by='importance', ascending=False)

print(xg_imp['feature'].values[:10])
xg_imp.head(n=10)

0.5813276738138541


array([[2889,   87],
       [ 817,  194]])

In [17]:
dtree_imp = pd.DataFrame({'feature' : X_train.columns,
                            'importance' : d_tree.feature_importances_})
dtree_imp = dtree_imp.sort_values(by='importance', ascending=False)

dtree_imp.head(n=10)

Unnamed: 0,feature,importance
13,second_median,0.392879
2,acceleration_std,0.168113
15,second_spread,0.101691
9,speed_median,0.052056
8,speed_mean,0.048879
11,speed_spread,0.035852
0,acceleration_mean,0.029126
12,second_mean,0.025718
10,speed_std,0.022676
14,second_std,0.01833


In [18]:
rf_imp = pd.DataFrame({'feature' : X_train.columns,
                            'importance' : r_forest.feature_importances_})
rf_imp = rf_imp.sort_values(by='importance', ascending=False)

rf_imp.head(n=10)

Unnamed: 0,feature,importance
13,second_median,0.099198
15,second_spread,0.093989
14,second_std,0.085533
12,second_mean,0.078323
2,acceleration_std,0.054752
17,over_second,0.050162
8,speed_mean,0.047135
11,speed_spread,0.044649
9,speed_median,0.044575
3,acceleration_spread,0.038738
