In [1]:
# Core python library
import os
import gc
import pandas as pd
import numpy as np
from numpy import median
import math
import copy
import statsmodels.api as sm
import scipy.stats
# Some pandas result display, no impact to what are being stored
pd.set_option('display.max_columns', 500)
pd.set_option('display.float_format', lambda x: '%.4f' % x)
# ignore the warning message
import warnings
warnings.filterwarnings('ignore')
# visualize related
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('fivethirtyeight')
from IPython.display import display, HTML
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff
py.init_notebook_mode(connected=True)
# ML
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
import xgboost
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_auc_score, roc_curve, scorer
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_curve

In [2]:
SEED = 1234
PATH = os.path.dirname(__file__) if "__file__" in locals() else os.getcwd()
os.chdir(PATH)
np.random.seed(SEED)

In [3]:
def list_all_files_with_ext(path: str, 
                            suffix: str='.csv'):
    """
    Function to list all the files with specific extension in the given path
    @Args:
      path (str): the path to files
      suffix (str): the extension of the files to get; default: .csv
    Return: 
      List of files with given extensions in the path
    """
    filenames = os.listdir(path)
    return [path + filename for filename in filenames if filename.endswith(suffix)]


def read_files_(file_to_read: list, 
                show_sample_ind: bool=True):
    """
    Function to read multiple files onto one pandas dataframe
    @Args:
      file_to_read (list): List of path and filename to read
      show_sample_ind (bool): Show the head sample; default is True
    Return:
      full pandas dataframe
    """
    _df = pd.concat([pd.read_csv(f) for f in file_to_read], ignore_index=True)
    if show_sample_ind:
        display(_df.head(5))
        
    return _df

In [4]:
file_to_read = list_all_files_with_ext('../data/safety/features/')
data_df = read_files_(file_to_read)
print('The data frame dimensions: %s' % str(data_df.shape))
print('The unique booking ID: %s' % len(data_df.bookingID.unique()))

Unnamed: 0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,Speed
0,1202590843006,3.0,353.0,1.2289,8.9001,3.987,0.0082,0.0023,-0.01,1362.0,0.0
1,274877907034,9.293,17.0,0.0328,8.6599,4.7373,0.0246,0.004,-0.0109,257.0,0.19
2,884763263056,3.0,189.0,1.1397,9.546,1.9513,-0.0069,-0.0151,0.0011,973.0,0.6671
3,1073741824054,3.9,126.0,3.8715,10.3864,-0.1365,0.0013,-0.3396,-0.018,902.0,7.9133
4,1056561954943,3.9,50.0,-0.1129,10.551,-1.5601,0.1306,-0.0617,0.1615,820.0,20.4194


The data frame dimensions: (16135561, 11)
The unique booking ID: 20000


In [5]:
data_df.sort_values(by=['bookingID', 'second'], ascending=True, inplace=True)
data_df.reset_index(inplace=True, drop=True)
display(data_df.head(5))

Unnamed: 0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,Speed
0,0,12.0,143.2983,0.8181,-9.9415,-2.015,-0.0162,-0.094,0.0707,0.0,3.443
1,0,8.0,143.2983,0.5464,-9.8356,-2.0389,-0.0471,-0.0789,0.0432,1.0,0.2285
2,0,8.0,143.2983,-1.7062,-9.2708,-1.2094,-0.029,-0.0327,0.0154,2.0,0.2285
3,0,8.0,143.2983,-1.4167,-9.548,-1.861,-0.0224,0.005,-0.0258,3.0,0.2285
4,0,8.0,143.2983,-0.5981,-9.8535,-1.3786,-0.0143,-0.0462,0.0219,4.0,0.2285


In [6]:
file_to_read = list_all_files_with_ext('../data/safety/labels/')
target_df = read_files_(file_to_read)
print('The target data shape: %s' % str(target_df.shape))
print('The unique booking ID: %s' % len(target_df.bookingID.unique()))

Unnamed: 0,bookingID,label
0,111669149733,0
1,335007449205,1
2,171798691856,0
3,1520418422900,0
4,798863917116,0


The target data shape: (20018, 2)
The unique booking ID: 20000


Clean Target column

In [7]:
target_df.loc[target_df.duplicated(['bookingID'], keep=False), 'label'] = 1
target_df.drop_duplicates(subset=['bookingID', 'label'], keep='first', inplace=True)
target_df.reset_index(drop=True, inplace=True)
print(target_df.shape)

(20000, 2)


In [8]:
def percentile(n: float):
    """
    Function to compute the percentile based on given n, this will use in `agg()` method in pandas
    @Args:
      n (float): the percentile to compute (i.e. 0.5 for median)
      
    Return:
      the percentile value
    """
    def percentile_(x):
        return x.quantile(n)
    
    percentile_.__name__ = 'percentile_{:2.0f}'.format(n*100)
    return percentile_


def mean_abs_dev():
    """
    Function to compute the mean absolute deviation, which is the average distance between
      each data point and the mean. This helps to understand about the variability in the data.
    This should be 
    @Args:
      None
      
    Return:
      MAD value
    """
    def _mean_abs_dev(x):
        return x.mad()
    
    _mean_abs_dev.__name__ = 'mean_abs_dev'
    return _mean_abs_dev



In [9]:
sensor_data_df = \
data_df.groupby('bookingID').agg({'acceleration_x': [np.min, np.max, np.mean, 
                                                     np.median, np.var, np.std,
                                                     scipy.stats.skew, scipy.stats.kurtosis,
                                                     mean_abs_dev(), percentile(.25), percentile(.75)
                                                    ],
                                  'acceleration_y': [np.min, np.max, np.mean, 
                                                     np.median, np.var, np.std, 
                                                     scipy.stats.skew, scipy.stats.kurtosis,
                                                     mean_abs_dev(), percentile(.25), percentile(.75)
                                                    ],
                                  'acceleration_z': [np.min, np.max, np.mean, 
                                                     np.median, np.var, np.std, 
                                                     scipy.stats.skew, scipy.stats.kurtosis,
                                                     mean_abs_dev(), percentile(.25), percentile(.75)
                                                    ],
                                  'gyro_x': [np.min, np.max, np.mean, 
                                             np.median, np.var, np.std,
                                             scipy.stats.skew, scipy.stats.kurtosis,
                                             mean_abs_dev(), percentile(.25), percentile(.75)
                                            ],
                                  'gyro_y': [np.min, np.max, np.mean, 
                                             np.median, np.var, np.std, 
                                             scipy.stats.skew, scipy.stats.kurtosis,
                                             mean_abs_dev(), percentile(.25), percentile(.75)
                                            ],
                                  'gyro_z': [np.min, np.max, np.mean, 
                                             np.median, np.var, np.std, 
                                             scipy.stats.skew, scipy.stats.kurtosis,
                                             mean_abs_dev(), percentile(.25), percentile(.75)
                                            ],
                                  'Speed': [np.min, np.max, np.mean, 
                                            np.median, np.var, np.std, 
                                            scipy.stats.skew, scipy.stats.kurtosis,
                                            mean_abs_dev(), percentile(.25), percentile(.75)
                                           ]
                                 })

In [10]:
cols = {'acceleration_x_amin': 'min_acc_x', 'acceleration_x_amax': 'max_acc_x',
        'acceleration_x_mean': 'mean_acc_x', 'acceleration_x_median': 'p50_acc_x',
        'acceleration_x_var': 'var_acc_x', 'acceleration_x_std': 'std_acc_x',
        'acceleration_x_skew': 'skew_acc_x', 'acceleration_x_kurtosis': 'kurt_acc_x',
        'acceleration_x_mean_abs_dev': 'mad_acc_x', 'acceleration_x_percentile_25': 'p25_acc_x',
        'acceleration_x_percentile_75': 'p75_acc_x',
        
        'acceleration_y_amin': 'min_acc_y', 'acceleration_y_amax': 'max_acc_y',
        'acceleration_y_mean': 'mean_acc_y', 'acceleration_y_median': 'p50_acc_y',
        'acceleration_y_var': 'var_acc_y', 'acceleration_y_std': 'std_acc_y',
        'acceleration_y_skew': 'skew_acc_y', 'acceleration_y_kurtosis': 'kurt_acc_y',
        'acceleration_y_mean_abs_dev': 'mad_acc_y', 'acceleration_y_percentile_25': 'p25_acc_y',
        'acceleration_y_percentile_75': 'p75_acc_y',
        
        'acceleration_z_amin': 'min_acc_z', 'acceleration_z_amax': 'max_acc_z',
        'acceleration_z_mean': 'mean_acc_z', 'acceleration_z_median': 'p50_acc_z',
        'acceleration_z_var': 'var_acc_z', 'acceleration_z_std': 'std_acc_z',
        'acceleration_z_skew': 'skew_acc_z', 'acceleration_z_kurtosis': 'kurt_acc_z',
        'acceleration_z_mean_abs_dev': 'mad_acc_z', 'acceleration_z_percentile_25': 'p25_acc_z',
        'acceleration_z_percentile_75': 'p75_acc_z',
        
        'gyro_x_amin': 'min_gyro_x', 'gyro_x_amax': 'max_gyro_x',
        'gyro_x_mean': 'mean_gyro_x', 'gyro_x_median': 'p50_gyro_x',
        'gyro_x_var': 'var_gyro_x', 'gyro_x_std': 'std_gyro_x',
        'gyro_x_skew': 'skew_gyro_x', 'gyro_x_kurtosis': 'kurt_gyro_x',
        'gyro_x_mean_abs_dev': 'mad_gyro_x', 'gyro_x_percentile_25': 'p25_gyro_x',
        'gyro_x_percentile_75': 'p75_gyro_x',
        
        'gyro_y_amin': 'min_gyro_y', 'gyro_y_amax': 'max_gyro_y',
        'gyro_y_mean': 'mean_gyro_y', 'gyro_y_median': 'p50_gyro_y',
        'gyro_y_var': 'var_gyro_y', 'gyro_y_std': 'std_gyro_y',
        'gyro_y_skew': 'skew_gyro_y', 'gyro_y_kurtosis': 'kurt_gyro_y',
        'gyro_y_mean_abs_dev': 'mad_gyro_y', 'gyro_y_percentile_25': 'p25_gyro_y',
        'gyro_y_percentile_75': 'p75_gyro_y',
        
        'gyro_z_amin': 'min_gyro_z', 'gyro_z_amax': 'max_gyro_z',
        'gyro_z_mean': 'mean_gyro_z', 'gyro_z_median': 'p50_gyro_z',
        'gyro_z_var': 'var_gyro_z', 'gyro_z_std': 'std_gyro_z',
        'gyro_z_skew': 'skew_gyro_z', 'gyro_z_kurtosis': 'kurt_gyro_z',
        'gyro_z_mean_abs_dev': 'mad_gyro_z', 'gyro_z_percentile_25': 'p25_gyro_z',
        'gyro_z_percentile_75': 'p75_gyro_z',
        
        'Speed_amin': 'min_Speed', 'Speed_amax': 'max_Speed',
        'Speed_mean': 'mean_Speed', 'Speed_median': 'p50_Speed',
        'Speed_var': 'var_Speed', 'Speed_std': 'std_Speed',
        'Speed_skew': 'skew_Speed', 'Speed_kurtosis': 'kurt_Speed',
        'Speed_mean_abs_dev': 'mad_Speed', 'Speed_percentile_25': 'p25_Speed',
        'Speed_percentile_75': 'p75_Speed'
        
       }
       

In [11]:
sensor_data_df.columns = sensor_data_df.columns.map('_'.join).to_series().map(cols)
sensor_data_df.reset_index(drop=False, inplace=True)
display(sensor_data_df.head(5))

Unnamed: 0,bookingID,min_acc_x,max_acc_x,mean_acc_x,p50_acc_x,var_acc_x,std_acc_x,skew_acc_x,kurt_acc_x,mad_acc_x,p25_acc_x,p75_acc_x,min_acc_y,max_acc_y,mean_acc_y,p50_acc_y,var_acc_y,std_acc_y,skew_acc_y,kurt_acc_y,mad_acc_y,p25_acc_y,p75_acc_y,min_acc_z,max_acc_z,mean_acc_z,p50_acc_z,var_acc_z,std_acc_z,skew_acc_z,kurt_acc_z,mad_acc_z,p25_acc_z,p75_acc_z,min_gyro_x,max_gyro_x,mean_gyro_x,p50_gyro_x,var_gyro_x,std_gyro_x,skew_gyro_x,kurt_gyro_x,mad_gyro_x,p25_gyro_x,p75_gyro_x,min_gyro_y,max_gyro_y,mean_gyro_y,p50_gyro_y,var_gyro_y,std_gyro_y,skew_gyro_y,kurt_gyro_y,mad_gyro_y,p25_gyro_y,p75_gyro_y,min_gyro_z,max_gyro_z,mean_gyro_z,p50_gyro_z,var_gyro_z,std_gyro_z,skew_gyro_z,kurt_gyro_z,mad_gyro_z,p25_gyro_z,p75_gyro_z,min_Speed,max_Speed,mean_Speed,p50_Speed,var_Speed,std_Speed,skew_Speed,kurt_Speed,mad_Speed,p25_Speed,p75_Speed
0,0,-4.6923,4.7826,-0.7113,-0.7253,0.8612,0.928,0.3622,3.314,0.6554,-1.1851,-0.2994,-12.7647,-6.1199,-9.6138,-9.6221,0.4095,0.6399,0.1686,3.9453,0.4384,-9.9039,-9.3444,-6.2518,2.3189,-1.6197,-1.6077,1.3025,1.1413,-0.1563,1.2489,0.8311,-2.2504,-1.0337,-0.3925,0.4384,0.0033,0.0026,0.0043,0.066,0.1315,4.5502,0.0456,-0.0273,0.0332,-0.6099,0.4697,-0.0061,-0.0022,0.01,0.1002,-0.473,6.1341,0.0652,-0.0462,0.0328,-0.7319,0.3728,-0.0042,-0.0035,0.0041,0.0637,-0.9916,20.7524,0.0411,-0.0299,0.0209,-1.0,22.9461,8.9948,8.5034,51.8388,7.1999,0.1676,-1.4313,6.5121,1.4903,15.6455
1,1,-5.353,3.8133,-0.5254,-0.5631,0.5538,0.7442,0.5246,9.0311,0.4714,-0.8411,-0.2294,6.6234,12.5362,9.5321,9.538,0.2851,0.5339,-0.0668,3.4013,0.3772,9.2772,9.7968,-5.3555,1.4813,-2.199,-2.1758,0.7298,0.8543,0.0315,1.9065,0.6003,-2.5832,-1.8192,-0.1254,0.1265,-0.0025,-0.0015,0.0008,0.0277,0.0664,2.6489,0.0192,-0.0157,0.0087,-0.6782,0.4708,-0.0075,-0.0035,0.0084,0.0917,-1.85,13.7259,0.0523,-0.0339,0.0229,-0.1213,0.2351,0.0004,-0.0004,0.0011,0.0338,0.8118,5.634,0.0228,-0.0149,0.0133,-1.0,21.8821,7.8816,6.9046,49.8346,7.0594,0.3472,-1.2438,6.2239,0.0,13.7479
2,2,-2.9713,1.9561,0.3068,0.4334,0.5724,0.7566,-0.7394,1.1788,0.592,-0.1987,0.8248,7.9418,13.3337,9.8432,9.8141,0.2557,0.5057,1.6707,13.6517,0.2806,9.6932,9.9614,-3.2826,2.3129,0.1393,0.3663,1.0404,1.02,-0.8523,0.8393,0.7856,-0.3891,0.7745,-0.1552,0.2822,0.0065,0.0012,0.0029,0.0539,1.4238,6.2999,0.032,-0.0129,0.0161,-0.462,0.4317,-0.0129,0.001,0.0138,0.1173,-0.1857,3.3405,0.0746,-0.0445,0.0212,-0.1259,0.2559,0.0026,0.0018,0.0013,0.0362,1.5433,12.9625,0.0215,-0.008,0.0134,0.0,9.3605,3.1572,2.9988,8.397,2.8978,0.4074,-1.0684,2.5171,0.0,5.3
3,4,-2.8665,2.0196,-0.3651,-0.3812,0.278,0.5272,-0.0319,4.4099,0.3431,-0.5665,-0.1489,-18.8478,-7.065,-9.4064,-9.365,0.3576,0.598,-5.02,69.578,0.3378,-9.6037,-9.1785,-9.3749,0.2964,-2.6136,-2.6087,0.6077,0.7795,-0.3945,5.47,0.5719,-3.0425,-2.2178,-0.4205,0.4387,-0.0229,-0.0228,0.0018,0.0423,0.723,26.5683,0.0242,-0.0364,-0.0085,-0.5065,0.6526,0.0232,0.0245,0.0127,0.1126,-0.1355,3.5782,0.075,-0.0181,0.0732,-0.3482,0.5052,-0.0004,-0.0011,0.0043,0.0659,0.547,7.1477,0.0453,-0.0329,0.0315,0.0,19.78,6.151,3.31,31.3141,5.5959,0.8722,-0.6382,4.7609,2.19,10.2225
4,6,-4.3528,5.593,0.4906,0.4765,0.6827,0.8263,0.3431,5.9575,0.5378,0.1449,0.7626,6.4693,13.2499,9.538,9.5627,0.3809,0.6172,0.0958,4.2665,0.4406,9.1844,9.8764,-2.1405,7.9777,2.3551,2.3368,0.8877,0.9422,0.2059,3.0135,0.6519,1.9394,2.7414,-0.2536,0.3622,0.0039,0.0006,0.003,0.0552,0.7104,5.8569,0.0373,-0.022,0.0281,-0.6218,0.5321,0.0004,0.0037,0.0114,0.1068,-0.7834,9.6237,0.0573,-0.0207,0.0257,-0.3483,0.3645,0.0029,0.0029,0.0033,0.0574,-0.0344,6.9847,0.0358,-0.0151,0.0228,0.0,16.3947,4.6289,1.937,28.2476,5.3148,0.7237,-1.0012,4.712,0.0,9.2171


In [12]:
other_data_df = data_df.groupby('bookingID').\
agg({'Accuracy': [np.min, np.median, percentile(.75), np.max],
     'second': ['count', 'max']})
cols = {'Accuracy_amin': 'min_accuracy', 'Accuracy_amax': 'max_accuracy',
        'Accuracy_median': 'median_accuracy', 'Accuracy_percentile_75': 'p75_accuracy',
        'second_count': 'rec_cnt', 'second_max': 'max_sec'
       }
other_data_df.columns = other_data_df.columns.map('_'.join).to_series().map(cols)
other_data_df.reset_index(inplace=True)
other_data_df['max_sec'] = other_data_df['max_sec'] + 1  # add one as second starts from 0.

In [13]:
feature_df = pd.merge(sensor_data_df, other_data_df, how='left', on='bookingID')
display(feature_df.head(5))
print(feature_df.shape)

Unnamed: 0,bookingID,min_acc_x,max_acc_x,mean_acc_x,p50_acc_x,var_acc_x,std_acc_x,skew_acc_x,kurt_acc_x,mad_acc_x,p25_acc_x,p75_acc_x,min_acc_y,max_acc_y,mean_acc_y,p50_acc_y,var_acc_y,std_acc_y,skew_acc_y,kurt_acc_y,mad_acc_y,p25_acc_y,p75_acc_y,min_acc_z,max_acc_z,mean_acc_z,p50_acc_z,var_acc_z,std_acc_z,skew_acc_z,kurt_acc_z,mad_acc_z,p25_acc_z,p75_acc_z,min_gyro_x,max_gyro_x,mean_gyro_x,p50_gyro_x,var_gyro_x,std_gyro_x,skew_gyro_x,kurt_gyro_x,mad_gyro_x,p25_gyro_x,p75_gyro_x,min_gyro_y,max_gyro_y,mean_gyro_y,p50_gyro_y,var_gyro_y,std_gyro_y,skew_gyro_y,kurt_gyro_y,mad_gyro_y,p25_gyro_y,p75_gyro_y,min_gyro_z,max_gyro_z,mean_gyro_z,p50_gyro_z,var_gyro_z,std_gyro_z,skew_gyro_z,kurt_gyro_z,mad_gyro_z,p25_gyro_z,p75_gyro_z,min_Speed,max_Speed,mean_Speed,p50_Speed,var_Speed,std_Speed,skew_Speed,kurt_Speed,mad_Speed,p25_Speed,p75_Speed,min_accuracy,median_accuracy,p75_accuracy,max_accuracy,rec_cnt,max_sec
0,0,-4.6923,4.7826,-0.7113,-0.7253,0.8612,0.928,0.3622,3.314,0.6554,-1.1851,-0.2994,-12.7647,-6.1199,-9.6138,-9.6221,0.4095,0.6399,0.1686,3.9453,0.4384,-9.9039,-9.3444,-6.2518,2.3189,-1.6197,-1.6077,1.3025,1.1413,-0.1563,1.2489,0.8311,-2.2504,-1.0337,-0.3925,0.4384,0.0033,0.0026,0.0043,0.066,0.1315,4.5502,0.0456,-0.0273,0.0332,-0.6099,0.4697,-0.0061,-0.0022,0.01,0.1002,-0.473,6.1341,0.0652,-0.0462,0.0328,-0.7319,0.3728,-0.0042,-0.0035,0.0041,0.0637,-0.9916,20.7524,0.0411,-0.0299,0.0209,-1.0,22.9461,8.9948,8.5034,51.8388,7.1999,0.1676,-1.4313,6.5121,1.4903,15.6455,4.0,8.0,12.0,48.0,1004,1590.0
1,1,-5.353,3.8133,-0.5254,-0.5631,0.5538,0.7442,0.5246,9.0311,0.4714,-0.8411,-0.2294,6.6234,12.5362,9.5321,9.538,0.2851,0.5339,-0.0668,3.4013,0.3772,9.2772,9.7968,-5.3555,1.4813,-2.199,-2.1758,0.7298,0.8543,0.0315,1.9065,0.6003,-2.5832,-1.8192,-0.1254,0.1265,-0.0025,-0.0015,0.0008,0.0277,0.0664,2.6489,0.0192,-0.0157,0.0087,-0.6782,0.4708,-0.0075,-0.0035,0.0084,0.0917,-1.85,13.7259,0.0523,-0.0339,0.0229,-0.1213,0.2351,0.0004,-0.0004,0.0011,0.0338,0.8118,5.634,0.0228,-0.0149,0.0133,-1.0,21.8821,7.8816,6.9046,49.8346,7.0594,0.3472,-1.2438,6.2239,0.0,13.7479,3.0,3.9,4.0,7.709,851,1035.0
2,2,-2.9713,1.9561,0.3068,0.4334,0.5724,0.7566,-0.7394,1.1788,0.592,-0.1987,0.8248,7.9418,13.3337,9.8432,9.8141,0.2557,0.5057,1.6707,13.6517,0.2806,9.6932,9.9614,-3.2826,2.3129,0.1393,0.3663,1.0404,1.02,-0.8523,0.8393,0.7856,-0.3891,0.7745,-0.1552,0.2822,0.0065,0.0012,0.0029,0.0539,1.4238,6.2999,0.032,-0.0129,0.0161,-0.462,0.4317,-0.0129,0.001,0.0138,0.1173,-0.1857,3.3405,0.0746,-0.0445,0.0212,-0.1259,0.2559,0.0026,0.0018,0.0013,0.0362,1.5433,12.9625,0.0215,-0.008,0.0134,0.0,9.3605,3.1572,2.9988,8.397,2.8978,0.4074,-1.0684,2.5171,0.0,5.3,3.0,3.634,4.0,8.0,195,826.0
3,4,-2.8665,2.0196,-0.3651,-0.3812,0.278,0.5272,-0.0319,4.4099,0.3431,-0.5665,-0.1489,-18.8478,-7.065,-9.4064,-9.365,0.3576,0.598,-5.02,69.578,0.3378,-9.6037,-9.1785,-9.3749,0.2964,-2.6136,-2.6087,0.6077,0.7795,-0.3945,5.47,0.5719,-3.0425,-2.2178,-0.4205,0.4387,-0.0229,-0.0228,0.0018,0.0423,0.723,26.5683,0.0242,-0.0364,-0.0085,-0.5065,0.6526,0.0232,0.0245,0.0127,0.1126,-0.1355,3.5782,0.075,-0.0181,0.0732,-0.3482,0.5052,-0.0004,-0.0011,0.0043,0.0659,0.547,7.1477,0.0453,-0.0329,0.0315,0.0,19.78,6.151,3.31,31.3141,5.5959,0.8722,-0.6382,4.7609,2.19,10.2225,10.0,10.0,10.0,10.0,1094,1095.0
4,6,-4.3528,5.593,0.4906,0.4765,0.6827,0.8263,0.3431,5.9575,0.5378,0.1449,0.7626,6.4693,13.2499,9.538,9.5627,0.3809,0.6172,0.0958,4.2665,0.4406,9.1844,9.8764,-2.1405,7.9777,2.3551,2.3368,0.8877,0.9422,0.2059,3.0135,0.6519,1.9394,2.7414,-0.2536,0.3622,0.0039,0.0006,0.003,0.0552,0.7104,5.8569,0.0373,-0.022,0.0281,-0.6218,0.5321,0.0004,0.0037,0.0114,0.1068,-0.7834,9.6237,0.0573,-0.0207,0.0257,-0.3483,0.3645,0.0029,0.0029,0.0033,0.0574,-0.0344,6.9847,0.0358,-0.0151,0.0228,0.0,16.3947,4.6289,1.937,28.2476,5.3148,0.7237,-1.0012,4.712,0.0,9.2171,3.0,4.004,4.9385,12.0,1095,1095.0


(20000, 84)


ML using XGBoost

In [14]:
feature_df.sort_values(by='bookingID', ascending=True, inplace=True)
target_df.sort_values(by='bookingID', ascending=True, inplace=True)
target_df.reset_index(drop=True, inplace=True)
display(target_df.head(5))

Unnamed: 0,bookingID,label
0,0,0
1,1,1
2,2,1
3,4,1
4,6,0


In [15]:
booking_id = feature_df['bookingID']
X = feature_df.drop('bookingID', axis=1, inplace=False)
y = target_df['label']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

__GridSearch on XGBoost__

To find the best performing hyperparameters of the XGBoost model.

In [17]:
xgb_model = xgboost.XGBClassifier()
parameters = {'n_jobs': [6], 
              'objective':['binary:logistic'],
              'learning_rate': [0.05, 0.025],
              'max_depth': [6, 7],
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [500, 1000],
              'min_child_weight': [1, 2, 4],
              'random_state': [SEED]
             }

clf = GridSearchCV(xgb_model, parameters, n_jobs=6, 
                   cv=3,
                   scoring='roc_auc',
                   verbose=1, refit=True)

clf.fit(X_train, y_train)
print('Best score: %.4f' % clf.best_score_)
print('Best param: %s' % clf.best_params_)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed: 26.8min
[Parallel(n_jobs=6)]: Done  72 out of  72 | elapsed: 35.6min finished


Best score: 0.7223
Best param: {'colsample_bytree': 0.7, 'learning_rate': 0.025, 'max_depth': 6, 'min_child_weight': 2, 'n_estimators': 500, 'n_jobs': 6, 'objective': 'binary:logistic', 'random_state': 1234, 'subsample': 0.7}


In [18]:
xgb_model = xgboost.XGBClassifier(**clf.best_params_)
xgb_model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.7, gamma=0, learning_rate=0.025,
       max_delta_step=0, max_depth=6, min_child_weight=2, missing=None,
       n_estimators=500, n_jobs=6, nthread=None,
       objective='binary:logistic', random_state=1234, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=0.7)

In [19]:
y_pred = xgb_model.predict(X_test)

print('The AUC score: %.4f, accuracy score: %.4f, f1 score: %.4f' % \
      (roc_auc_score(y_test, y_pred), 
       accuracy_score(y_test, y_pred),
       f1_score(y_test, y_pred)))

The AUC score: 0.5945, accuracy score: 0.7715, f1 score: 0.3386


Serialized the model using `Pickle`.

In [20]:
import pickle
pickle.dump(xgb_model, open("../model/grab_safety_solution.model", "wb"))

How to load the model

In [21]:
test_load = pickle.load(open("../model/grab_safety_solution.model", "rb"))
tmp_pred = test_load.predict(X_test)

Check on the prediction between the saved and loaded model, the predictions need to be exact.

In [22]:
np.array_equal(tmp_pred, y_pred)

True

End of notebook