In [46]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

import xgboost as xgb

import os

In [10]:
DATA_DIR = os.path.join(os.getcwd(), 'data/safety/kfengtee.csv')

agg_diff_df = pd.read_csv(DATA_DIR)
agg_diff_df = agg_diff_df.drop('bookingid', axis='columns')
agg_diff_df.head(n=10)

Unnamed: 0,acceleration_mean,acceleration_median,acceleration_std,acceleration_spread,gyro_pc_mean,gyro_pc_median,gyro_pc_std,gyro_pc_spread,speed_mean,speed_median,...,over_gyro_x,over_gyro_y,over_gyro_z,label,num_non_speed_outlier,num_speed_outlier,num_non_accel_outlier,num_accel_outlier,num_non_gyro_outlier,num_gyro_outlier
0,9.886164,9.852645,0.624159,6.530989,-0.006603,-0.002878,0.098805,1.101489,8.994822,8.503366,...,572,580,611,0,243,7,237,13,239,11
1,9.862507,9.844748,0.519956,5.819621,-0.00777,-0.003761,0.090758,1.122123,7.881588,6.904588,...,228,413,355,1,189,22,209,2,199,12
2,9.92959,9.877755,0.515173,5.168422,-0.012728,0.001405,0.117067,0.896186,3.157213,2.998761,...,67,103,71,1,47,0,47,0,41,6
3,9.813434,9.791035,0.620066,13.349284,0.022444,0.024355,0.112611,1.166442,6.150996,3.31,...,514,734,750,1,260,11,262,9,245,26
4,9.91809,9.904142,0.585346,7.280114,0.000501,0.004203,0.106436,1.161609,4.628921,1.936962,...,533,443,556,0,263,9,263,9,253,19
5,9.82647,9.7898,0.916836,8.572037,0.00264,-0.002707,0.072364,0.899634,12.176386,13.017325,...,561,490,687,0,221,15,204,32,235,1
6,9.762028,9.646309,0.728514,9.416841,-0.00093,0.000241,0.078281,0.754561,5.351266,3.5,...,157,159,82,0,92,3,89,6,88,7
7,9.550677,9.49439,0.833292,9.474737,0.001928,-0.000594,0.110036,0.908451,8.702027,9.58,...,88,111,149,0,89,3,73,19,82,10
8,9.948639,9.877962,0.75048,5.686104,-0.003997,-0.002959,0.151739,0.986176,6.659024,5.192059,...,147,153,141,0,46,8,50,4,43,11
9,9.85864,9.824785,0.44798,6.110603,-0.003204,0.001636,0.092127,0.947279,4.725448,3.173314,...,485,307,506,0,140,9,146,3,140,9


In [11]:
agg_diff_df.isnull().sum()

acceleration_mean        0
acceleration_median      0
acceleration_std         0
acceleration_spread      0
gyro_pc_mean             0
gyro_pc_median           0
gyro_pc_std              0
gyro_pc_spread           0
speed_mean               0
speed_median             0
speed_std                0
speed_spread             0
second_mean              0
second_median            0
second_std               0
second_spread            0
over_speed               0
over_second              0
over_acceleration_x      0
over_acceleration_y      0
over_acceleration_z      0
over_gyro_x              0
over_gyro_y              0
over_gyro_z              0
label                    0
num_non_speed_outlier    0
num_speed_outlier        0
num_non_accel_outlier    0
num_accel_outlier        0
num_non_gyro_outlier     0
num_gyro_outlier         0
dtype: int64

### Data preparation for modelling on agg_diff_df

In [12]:
seed = 991
np.random.seed(seed)

X_train, X_test, y_train, y_test = train_test_split(agg_diff_df.drop('label', axis='columns'),
                                                   agg_diff_df['label'], test_size=0.2, shuffle=True,
                                                   random_state=seed)

# Checking shapes of new arrays
print("X_train: {}, y_train: {}".format(X_train.shape, y_train.shape))
print("X_test: {}, y_test: {}".format(X_test.shape, y_test.shape))

# Checking distribution of classes in train and test sets
print("Train distribution: \n {}".format(y_train.value_counts()))
print("Test distribution: \n {}".format(y_test.value_counts()))

X_train: (15985, 30), y_train: (15985,)
X_test: (3997, 30), y_test: (3997,)
Train distribution: 
 0    12044
1     3941
Name: label, dtype: int64
Test distribution: 
 0    2955
1    1042
Name: label, dtype: int64


### Tree-based methods on agg_diff_df2

In [51]:
model_dict = { 'logistic' : LogisticRegression(), 
              'decision_tree' : DecisionTreeClassifier(criterion='gini'), 
              'random_forest' : RandomForestClassifier(max_features='sqrt'), 
              'adaboost' : AdaBoostClassifier(), 
              'naive_bayes' : GaussianNB(),
              'svc': SVC(kernel='rbf')}

In [52]:
model_params = {'logistic' : { 'C' : [1e-3, 1e-2, 0.1, 1, 10],
                              'solver' : ['liblinear', 'saga']} , 
                'decision_tree' : { 'max_depth' : [3, 4, 5, 6, 7]} , 
                'random_forest' : { 'n_estimators' : [50, 100, 150, 200],
                                   'max_depth' : [None, 4, 5, 6, 7, 8, 9, 10]} , 
                'adaboost' : { 'n_estimators' : [25, 50, 100, 200], 
                              'learning_rate' : [1e-3, 1e-2, 0.1, 1]} , 
                'naive_bayes' : {'var_smoothing': [1e-11, 1e-10, 1e-09, 1e-08, 1e-7]}, 
                'svc': { 'gamma': [0.1, 1, 10, 100], 
                        'C': [0.1, 1, 10, 100, 1000]} }

In [53]:
import datetime
from sklearn.model_selection import RandomizedSearchCV

best_models = {}
start_time = datetime.datetime.now()
num_iter = 5
seed = 31

for model in model_dict.keys():
    print("Model : {}".format(model))
    
    best_match = {}
    
    for i in range(num_iter):
        print('Iteration', i + 1)
        
        # randomized search through the hyperparameters grid
        random_grid = RandomizedSearchCV(estimator = model_dict[model], 
                                         param_distributions=model_params[model], 
                                         scoring='roc_auc', n_iter=num_iter, cv=5, n_jobs=-1,
                                         random_state=seed)
        random_grid.fit(X_train, y_train)
        
        # update the best model's hyperparameters and columns used
        if i == 0:
            best_match['model'] = model
            best_match['hyperparams'] = random_grid.best_params_
            best_match['roc'] = random_grid.best_score_
        elif random_grid.best_score_ > best_match['roc']:
            best_match['model'] = model
            best_match['hyperparams'] = random_grid.best_params_
            best_match['roc'] = random_grid.best_score_
    
    best_models[model] = best_match

end_time = datetime.datetime.now()
print('Total time : {} seconds'.format((end_time - start_time).seconds))

Model : logistic
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Model : decision_tree
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Model : random_forest
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Model : adaboost
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Model : naive_bayes
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Model : svc
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Total time : 1491 seconds


In [54]:
best_models

{'logistic': {'model': 'logistic',
  'hyperparams': {'solver': 'liblinear', 'C': 0.1},
  'roc': 0.6461768590927846},
 'decision_tree': {'model': 'decision_tree',
  'hyperparams': {'max_depth': 6},
  'roc': 0.6980297863631868},
 'random_forest': {'model': 'random_forest',
  'hyperparams': {'n_estimators': 200, 'max_depth': 9},
  'roc': 0.7254084232154786},
 'adaboost': {'model': 'adaboost',
  'hyperparams': {'n_estimators': 50, 'learning_rate': 1},
  'roc': 0.7139640439031948},
 'naive_bayes': {'model': 'naive_bayes',
  'hyperparams': {'var_smoothing': 1e-11},
  'roc': 0.6663442157139099},
 'svc': {'model': 'svc',
  'hyperparams': {'gamma': 0.1, 'C': 0.1},
  'roc': 0.5001245372502802}}

In [56]:
logistic = LogisticRegression(**best_models['logistic']['hyperparams'])
logistic.fit(X_train, y_train)
pred_logistic = logistic.predict_proba(X_test)
print("Test ROC : {}".format(roc_auc_score(y_test, logistic.predict(X_test))))

d_tree = DecisionTreeClassifier(**best_models['decision_tree']['hyperparams'], criterion='gini')
d_tree.fit(X_train, y_train)
pred_d_tree = d_tree.predict_proba(X_test)
print("Test ROC : {}".format(roc_auc_score(y_test, d_tree.predict(X_test))))

r_forest = RandomForestClassifier(**best_models['random_forest']['hyperparams'], max_features='sqrt')
r_forest.fit(X_train, y_train)
pred_r_forest = r_forest.predict_proba(X_test)
print("Test ROC : {}".format(roc_auc_score(y_test, r_forest.predict(X_test))))

adaboost = AdaBoostClassifier(**best_models['adaboost']['hyperparams'])
adaboost.fit(X_train, y_train)
pred_adaboost = adaboost.predict_proba(X_test)
print("Test ROC : {}".format(roc_auc_score(y_test, adaboost.predict(X_test))))

n_bayes = GaussianNB(**best_models['naive_bayes']['hyperparams'])
n_bayes.fit(X_train, y_train)
pred_n_bayes = n_bayes.predict_proba(X_test)
print("Test ROC : {}".format(roc_auc_score(y_test, n_bayes.predict(X_test))))

svc = SVC(**best_models['svc']['hyperparams'], kernel='rbf')
svc.fit(X_train, y_train)
# pred_svc = svc.predict_proba(X_test)
print("Test ROC : {}".format(roc_auc_score(y_test, svc.predict(X_test))))

Test ROC : 0.5809959696145964
Test ROC : 0.5664791124708114
Test ROC : 0.59397179704525
Test ROC : 0.5980153356002221
Test ROC : 0.49920951183946005
Test ROC : 0.5


In [57]:
pred_array = np.transpose(np.vstack((pred_logistic[:, 1], pred_d_tree[:, 1], 
                                     pred_r_forest[:, 1], pred_adaboost[:, 1], 
                                     pred_n_bayes[:, 1])))

pred_df = pd.DataFrame(pred_array, columns=['logistic', 'd_tree', 
                                            'r_forest', 'adaboost', 
                                            'n_bayes'])

pred_df.corr()

Unnamed: 0,logistic,d_tree,r_forest,adaboost,n_bayes
logistic,1.0,0.553684,0.646254,0.335171,0.048448
d_tree,0.553684,1.0,0.884595,0.425912,-0.055592
r_forest,0.646254,0.884595,1.0,0.496128,-0.058821
adaboost,0.335171,0.425912,0.496128,1.0,0.012101
n_bayes,0.048448,-0.055592,-0.058821,0.012101,1.0


In [65]:
feature_imp = pd.DataFrame({'feature' : X_train.columns,
                            'importance' : adaboost.feature_importances_})
feature_imp = feature_imp.sort_values(by='importance', ascending=False)

feature_imp['feature'].values[:10]

array(['second_mean', 'speed_mean', 'speed_median', 'second_spread',
       'acceleration_std', 'num_non_speed_outlier', 'second_std',
       'second_median', 'speed_spread', 'num_non_gyro_outlier'],
      dtype=object)