In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import tensorflow as tf
%matplotlib inline
tf.__version__

  from ._conv import register_converters as _register_converters


'1.13.1'

In [4]:
# Load the per-reading frame stored under key 'grabai'.
data = pd.read_hdf('all_data.h5', 'grabai')
# Impute missing speeds with the column median. The result is assigned back to the
# column instead of calling fillna(inplace=True) on `data.Speed`: that form is a
# chained in-place update on an intermediate Series, which pandas warns about and
# which leaves `data` unchanged when Copy-on-Write is active.
data['Speed'] = data['Speed'].fillna(data['Speed'].median())
data.head()

Unnamed: 0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,Speed,label
0,1194000908341,4.0,203.0,1.001973,9.745,-0.790087,-0.066046,0.019175,0.044741,246.0,6.48,1
1,1099511627891,3.0,311.0,0.452518,9.273026,3.098191,0.005382,0.002155,-0.001436,66.0,0.340365,0
2,962072674446,10.0,142.807999,0.018542,-8.041156,-4.960114,0.004223,-0.014875,0.007397,1552.0,2.05,0
3,1142461300867,8.0,171.205292,-0.290698,-8.295367,-3.049042,0.033618,0.046224,0.030559,277.0,17.608448,0
4,412316860548,19.379,0.0,0.675613,8.84613,1.852508,-0.008652,0.003693,0.00116,148.0,0.0,0


In [5]:
def add_features(df):
    """Add direction, smoothness and intensity features to every reading.

    The new columns are written onto ``df`` itself (the caller's frame gains
    them too); the returned frame is ``df`` re-ordered so that ``label`` is the
    last column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``acceleration_x``, ``acceleration_y``, ``gyro_x``,
        ``gyro_y``, ``Speed`` and ``label``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the columns ``up``, ``down``, ``right``, ``left``,
        ``rl_smooth``, ``ud_smooth``, ``smoothness``, ``rl_intensity``,
        ``ud_intensity`` and ``intensity`` added, and ``label`` moved last.

    Raises
    ------
    ValueError
        If ``df`` has no ``label`` column.
    """
    # Direction flags, computed on values rounded to 2 decimals. Vectorised
    # comparisons replace the per-row ``apply`` lambdas (same 0/1 result).
    vertical = df.acceleration_y.round(2) + 9.81
    lateral = df.acceleration_x.round(2)
    df['up'] = (vertical < 0).astype('int64')
    df['down'] = (vertical > 0).astype('int64')
    df['right'] = (lateral < 0).astype('int64')
    df['left'] = (lateral > 0).astype('int64')

    # The fallback values used for smoothness and intensity are not technically
    # correct; the original author kept them on purpose as a source of noise
    # intended to help generalisation.
    abs_gyro_y = np.abs(df.gyro_y)
    abs_gyro_x = np.abs(df.gyro_x)
    moving = df.Speed != 0

    # Smoothness: speed divided by |gyro| when both are non-zero, otherwise
    # speed + |gyro| (avoids dividing by zero).
    df['rl_smooth'] = np.where(moving & (df.gyro_y != 0),
                               df.Speed / abs_gyro_y,
                               df.Speed + abs_gyro_y)  # right/left smoothness
    df['ud_smooth'] = np.where(moving & (df.gyro_x != 0),
                               df.Speed / abs_gyro_x,
                               df.Speed + abs_gyro_x)  # up/down smoothness
    df['smoothness'] = np.sqrt(np.square(df.rl_smooth) + np.square(df.ud_smooth))

    # Intensity: smoothness scaled by |acceleration| on the matching axis, or
    # the plain smoothness when that acceleration is exactly zero.
    df['rl_intensity'] = np.where(df.acceleration_x != 0,
                                  df.rl_smooth * np.abs(df.acceleration_x),
                                  df.rl_smooth)
    # Fixed: the guard previously tested acceleration_x while the product used
    # acceleration_y, so the fallback was applied to the wrong rows.
    df['ud_intensity'] = np.where(df.acceleration_y != 0,
                                  df.ud_smooth * np.abs(df.acceleration_y),
                                  df.ud_smooth)
    df['intensity'] = np.sqrt(np.square(df.rl_intensity) + np.square(df.ud_intensity))

    # Keep the target as the final column.
    df_col = list(df)
    df_col.remove('label')
    df_col.append('label')

    return df[df_col]

In [6]:
# Rebind `data` to the frame returned by add_features (engineered columns added,
# `label` moved to the last position), then preview the first rows.
data = add_features(data)
data.head()

Unnamed: 0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,...,down,right,left,rl_smooth,ud_smooth,smoothness,rl_intensity,ud_intensity,intensity,label
0,1194000908341,4.0,203.0,1.001973,9.745,-0.790087,-0.066046,0.019175,0.044741,246.0,...,1,0,1,337.944256,98.112851,351.898354,338.611155,956.109731,1014.299429,1
1,1099511627891,3.0,311.0,0.452518,9.273026,3.098191,0.005382,0.002155,-0.001436,66.0,...,1,0,1,157.932496,63.244702,170.125147,71.467286,586.469771,590.80823,0
2,962072674446,10.0,142.807999,0.018542,-8.041156,-4.960114,0.004223,-0.014875,0.007397,1552.0,...,1,0,1,137.816791,485.470424,504.653347,2.555465,3903.743416,3903.744252,0
3,1142461300867,8.0,171.205292,-0.290698,-8.295367,-3.049042,0.033618,0.046224,0.030559,277.0,...,1,1,0,380.93553,523.786354,647.660422,110.737289,4345.000262,4346.411166,0
4,412316860548,19.379,0.0,0.675613,8.84613,1.852508,-0.008652,0.003693,0.00116,148.0,...,1,0,1,0.003693,0.008652,0.009407,0.002495,0.076534,0.076575,0


In [7]:
from sklearn.preprocessing import Normalizer
# Columns to transform: everything except the target and the bookingID key.
cols = list(data)
cols.remove('label')
cols.remove('bookingID')

# NOTE(review): sklearn's Normalizer rescales each ROW (sample) to unit norm; it
# does not scale columns, and its fit() learns nothing from the data. If per-feature
# scaling was the intent, a column-wise scaler would be needed — confirm before changing,
# since every downstream model depends on these values.
scaler = Normalizer()
scaler = scaler.fit(data[cols])
# Overwrites the feature columns of `data` in place; re-running this cell
# normalises already-normalised rows.
data[cols] = scaler.transform(data[cols])

In [8]:
def grouped_data(df):
    """Collapse per-reading rows into one row of summary statistics per booking.

    Rows are grouped by ``bookingID`` and, for every feature column, the mean,
    median, min, max, std, skew, count and sum are computed. Each statistic
    becomes a ``<column>_<stat>`` column. The mean is taken over every
    non-key column, so ``label_mean`` is produced as well and is moved to the
    last position of the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``bookingID`` and ``label`` columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``bookingID``; column blocks appear in the order
        mean, median, min, max, std, skew, count, sum, then ``label_mean``.
    """
    feature_cols = list(df)
    for excluded in ('bookingID', 'label'):
        feature_cols.remove(excluded)

    per_booking = df.groupby('bookingID')

    # The mean block is not restricted to feature_cols: it is the only block
    # that keeps the label, which yields `label_mean`.
    mean_frame = per_booking.mean()
    mean_frame.columns = [name + '_mean' for name in mean_frame]
    blocks = [mean_frame]

    # Remaining statistics, in the order their column blocks must appear.
    for stat in ('median', 'min', 'max', 'std', 'skew', 'count', 'sum'):
        stat_frame = getattr(per_booking, stat)()[feature_cols]
        stat_frame.columns = [name + '_' + stat for name in stat_frame]
        blocks.append(stat_frame)

    merged = pd.concat(blocks, axis=1)

    # Move the aggregated target to the end.
    ordered = list(merged)
    ordered.append(ordered.pop(ordered.index('label_mean')))
    return merged[ordered]

In [9]:
# Replace the per-reading frame with the per-booking aggregate (indexed by bookingID).
data = grouped_data(data)

In [22]:
# Keep only bookings whose aggregated label is exactly 0 or 1; a fractional
# label_mean means the booking's rows carried both label values.
# .copy() makes the filtered frame independent, so the column assignment in the
# following normalisation cell does not write to a slice (SettingWithCopyWarning).
data = data[(data.label_mean == 1)|(data.label_mean == 0)].copy()
data.head()

Unnamed: 0,bookingID,Accuracy_mean,Bearing_mean,acceleration_x_mean,acceleration_y_mean,acceleration_z_mean,gyro_x_mean,gyro_y_mean,gyro_z_mean,second_mean,...,down_sum,right_sum,left_sum,rl_smooth_sum,ud_smooth_sum,smoothness_sum,rl_intensity_sum,ud_intensity_sum,intensity_sum,label_mean
0,0,2.146581e-06,3e-05,-1.286042e-07,-2e-06,-2.673055e-07,9.873215e-10,-1.026196e-09,-7.358706e-10,7.1e-05,...,0.000129,0.000155,2.2e-05,0.018263,0.01284,0.025304,0.015247,0.12289,0.126841,0.0
1,1,5.916075e-07,2.8e-05,-1.126711e-07,2e-06,-4.193773e-07,-3.315646e-10,-7.972989e-10,-1.538258e-10,7.9e-05,...,0.000157,0.000151,5e-06,0.011684,0.011033,0.01885,0.006603,0.104893,0.10971,1.0
2,2,8.093956e-06,0.000185,-6.749148e-07,2.4e-05,1.53134e-07,1.016852e-08,-5.343612e-08,1.442027e-10,0.000425,...,0.000475,0.000353,0.000122,0.014944,0.009563,0.020538,0.010035,0.093972,0.096805,1.0
3,4,1.278652e-06,2.1e-05,-4.83693e-08,-1e-06,-3.280543e-07,-3.887337e-09,1.35489e-09,-5.033543e-10,5.5e-05,...,0.000129,0.000121,1.8e-05,0.016165,0.013856,0.024666,0.007066,0.129933,0.131252,1.0
4,6,1.336457e-06,4e-05,1.319495e-07,3e-06,7.109463e-07,5.032373e-10,-4.651618e-10,7.028859e-10,0.000104,...,0.000337,3.1e-05,0.000303,0.013689,0.008355,0.018193,0.009579,0.078525,0.081137,0.0


In [11]:
# Every aggregated column except the target.
cols = list(data)
cols.remove('label_mean')

# NOTE(review): Normalizer rescales each row to unit norm (it is not a per-column
# scaler and fit() learns nothing) — confirm this is the intended transform.
scaler2 = Normalizer()
scaler2 = scaler2.fit(data[cols])
data[cols] = scaler2.transform(data[cols])
# bookingID is the index produced by the groupby; turn it back into the first column.
data = data.reset_index()
# `key` is passed by keyword: newer pandas deprecates positional arguments to
# to_hdf other than the path.
data.to_hdf('data_final.h5', key='grabai')
data.head()

Unnamed: 0,bookingID,Accuracy_mean,Bearing_mean,acceleration_x_mean,acceleration_y_mean,acceleration_z_mean,gyro_x_mean,gyro_y_mean,gyro_z_mean,second_mean,...,down_sum,right_sum,left_sum,rl_smooth_sum,ud_smooth_sum,smoothness_sum,rl_intensity_sum,ud_intensity_sum,intensity_sum,label_mean
0,0,2.146581e-06,3e-05,-1.286042e-07,-2e-06,-2.673055e-07,9.873215e-10,-1.026196e-09,-7.358706e-10,7.1e-05,...,0.000129,0.000155,2.2e-05,0.018263,0.01284,0.025304,0.015247,0.12289,0.126841,0.0
1,1,5.916075e-07,2.8e-05,-1.126711e-07,2e-06,-4.193773e-07,-3.315646e-10,-7.972989e-10,-1.538258e-10,7.9e-05,...,0.000157,0.000151,5e-06,0.011684,0.011033,0.01885,0.006603,0.104893,0.10971,1.0
2,2,8.093956e-06,0.000185,-6.749148e-07,2.4e-05,1.53134e-07,1.016852e-08,-5.343612e-08,1.442027e-10,0.000425,...,0.000475,0.000353,0.000122,0.014944,0.009563,0.020538,0.010035,0.093972,0.096805,1.0
3,4,1.278652e-06,2.1e-05,-4.83693e-08,-1e-06,-3.280543e-07,-3.887337e-09,1.35489e-09,-5.033543e-10,5.5e-05,...,0.000129,0.000121,1.8e-05,0.016165,0.013856,0.024666,0.007066,0.129933,0.131252,1.0
4,6,1.336457e-06,4e-05,1.319495e-07,3e-06,7.109463e-07,5.032373e-10,-4.651618e-10,7.028859e-10,0.000104,...,0.000337,3.1e-05,0.000303,0.013689,0.008355,0.018193,0.009579,0.078525,0.081137,0.0


In [31]:
from sklearn.decomposition import PCA
# sklearn.externals.joblib was deprecated and later removed from scikit-learn;
# use the standalone joblib package and fall back only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
import pickle
data = pd.read_hdf('data_final.h5', 'grabai')
# First column is bookingID and the last is label_mean; everything between is a feature.
X = data.iloc[:, 1:-1]
y = data.iloc[:, -1]
# NOTE(review): no random_state is set; if PCA selects a randomized SVD solver for
# this shape, the components can differ slightly between runs.
pca = PCA(n_components=10)
pca.fit(X)
joblib.dump(pca, 'pca_fit.sav')

['pca_fit.sav']

In [32]:
# Reload the fitted PCA from disk and project the features onto its 10 components.
# NOTE: joblib.load unpickles arbitrary objects — only load .sav files you trust.
load_pca = joblib.load('pca_fit.sav')
# X is overwritten with its projection, so this cell is not idempotent: running it
# twice feeds the already-projected array back into transform().
X = load_pca.transform(X)
# Reshape the target to an (n, 1) column; the grid-search cell indexes it as y[:, 0].
y = np.array(y)[:, np.newaxis]
X[0:5, :]

array([[-1.87254209e-02, -5.70841564e-03, -1.24395314e-02,
        -7.62713290e-03, -1.22697542e-02,  6.24851935e-03,
         2.28132479e-03,  2.38000197e-03,  2.85914358e-04,
         6.82139225e-04],
       [-3.81467885e-03, -6.07600138e-03, -8.78387403e-03,
         1.00577916e-02,  2.82535837e-03, -1.84048993e-03,
         3.60559667e-03, -1.79138098e-03, -3.45821133e-03,
        -1.78048820e-04],
       [ 2.06240000e-02,  5.27766238e-04, -6.59130431e-03,
         3.89679634e-03,  1.86151330e-03, -4.19376288e-03,
        -1.49461045e-04,  3.98799574e-05,  2.55864821e-03,
         1.01749384e-03],
       [-3.34384156e-02, -7.39977641e-03, -1.69725500e-02,
        -2.80453859e-03, -4.96515168e-03,  5.79053349e-03,
         4.98922935e-03,  9.84933561e-04, -1.01654525e-03,
        -2.38018581e-04],
       [ 5.84581607e-02, -3.61327721e-03, -9.22870903e-03,
        -1.83901603e-03, -2.06387452e-03, -1.40387182e-03,
         1.46973365e-04, -2.50081835e-04, -8.06460934e-04,
        -6.

In [34]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=10000)
# y is kept as an (n, 1) column elsewhere in the notebook; ravel() passes the 1-D
# target sklearn expects and removes the "column_or_1d" DataConversionWarning.
lr.fit(X, y.ravel())
# Persist the fitted model and reload it so the stacking cell can use the file.
joblib.dump(lr, 'logistic.sav')
lr = joblib.load('logistic.sav')
y_hat_lr = lr.predict(X)
# NOTE(review): predictions are made on the same rows the model was fitted on, so
# this report is an in-sample score, not an estimate of generalisation.
print(classification_report(y, y_hat_lr))

             precision    recall  f1-score   support

        0.0       0.77      0.98      0.86     14999
        1.0       0.68      0.12      0.20      4983

avg / total       0.75      0.77      0.70     19982



  y = column_or_1d(y, warn=True)


In [36]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# Shallow, heavily regularised tree: depth capped at 8, at least 100 samples per leaf.
classifier = DecisionTreeClassifier(max_depth=8, criterion='entropy', min_samples_leaf=100)
classifier.fit(X, y)
# Persist the fitted tree, then reload it from the file the stacking cell reads.
joblib.dump(classifier, 'decision_tree.sav')
classifier = joblib.load('decision_tree.sav')
y_hat_tree = classifier.predict(X)
# NOTE(review): scored on the training rows themselves (in-sample report).
print(classification_report(y, y_hat_tree))

             precision    recall  f1-score   support

        0.0       0.77      0.98      0.86     14999
        1.0       0.66      0.14      0.24      4983

avg / total       0.75      0.77      0.71     19982



In [37]:
# NOTE(review): no random_state is set, so the fitted forest differs between runs.
classifier = RandomForestClassifier(criterion='entropy', n_estimators=10)
# ravel() passes a 1-D target; a column-vector y makes the forest emit a
# DataConversionWarning on every fit.
classifier.fit(X, y.ravel())
joblib.dump(classifier, 'random_forest.sav')
classifier = joblib.load('random_forest.sav')
y_hat_rf = classifier.predict(X)
# NOTE(review): in-sample report — the forest is scored on the rows it was fitted on.
print(classification_report(y, y_hat_rf))

  


             precision    recall  f1-score   support

        0.0       0.97      1.00      0.98     14999
        1.0       1.00      0.91      0.95      4983

avg / total       0.98      0.98      0.98     19982



In [38]:
from sklearn.svm import SVC
# probability=True enables predict_proba, which the stacking helper calls on this model.
classifier = SVC(probability=True)
# ravel() passes a 1-D target and removes the "column_or_1d" DataConversionWarning
# raised for the (n, 1) column vector.
classifier.fit(X, y.ravel())
joblib.dump(classifier, 'svc.sav')
classifier = joblib.load('svc.sav')
y_hat_svm = classifier.predict(X)
# NOTE(review): in-sample report — scored on the rows used for fitting.
print(classification_report(y, y_hat_svm))

  y = column_or_1d(y, warn=True)


             precision    recall  f1-score   support

        0.0       0.75      1.00      0.86     14999
        1.0       0.00      0.00      0.00      4983

avg / total       0.56      0.75      0.64     19982



  'precision', 'predicted', average, warn_for)


In [39]:
from sklearn.ensemble import GradientBoostingClassifier
# NOTE(review): subsample=0.7 makes the fit stochastic and no random_state is set.
classifier = GradientBoostingClassifier(subsample=0.7, min_samples_split=10)
# ravel() passes a 1-D target and removes the "column_or_1d" DataConversionWarning
# raised for the (n, 1) column vector.
classifier.fit(X, y.ravel())
joblib.dump(classifier,'grad_boost.sav')
classifier = joblib.load('grad_boost.sav')
# predict_proba returns one probability column per class, so y_hat_grad holds
# probabilities rather than labels and cannot be passed to classification_report as is.
y_hat_grad = classifier.predict_proba(X)
y_hat_grad

  y = column_or_1d(y, warn=True)


array([[0.73559219, 0.26440781],
       [0.69906063, 0.30093937],
       [0.82165835, 0.17834165],
       ...,
       [0.77754681, 0.22245319],
       [0.7207556 , 0.2792444 ],
       [0.81092697, 0.18907303]])

In [29]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report

# 10-fold shuffled cross-validation with a fixed seed so the folds are repeatable.
folds = KFold(n_splits=10, shuffle=True, random_state=93)
# Candidate neighbourhood sizes for the k-NN search.
hyperparams = {'n_neighbors': [5, 10, 15, 20, 25, 30]}
est = KNeighborsClassifier()
# Three metrics are tracked per candidate; the final estimator is refit on the
# candidate with the best weighted F1.
cv = GridSearchCV(estimator=est,
                  param_grid=hyperparams,
                  scoring=['accuracy', 'f1_weighted', 'precision'],
                  cv=folds,
                  return_train_score=True,
                  refit='f1_weighted')

# y is an (n, 1) column here; take its only column as the 1-D target.
cv.fit(X, y[:, 0])

# Mean held-out scores per candidate value of n_neighbors.
pd.DataFrame(cv.cv_results_)[['param_n_neighbors', 'mean_test_precision', 'mean_test_accuracy', 'mean_test_f1_weighted']]

Unnamed: 0,param_n_neighbors,mean_test_precision,mean_test_accuracy,mean_test_f1_weighted
0,5,0.418097,0.728906,0.697433
1,10,0.57399,0.759433,0.698653
2,15,0.561117,0.758583,0.700465
3,20,0.627001,0.762386,0.694205
4,25,0.621768,0.762436,0.695666
5,30,0.67367,0.764138,0.69186


In [40]:
# n_neighbors=15 is fixed by hand here rather than read from a search result.
classifier = KNeighborsClassifier(n_neighbors=15)
# ravel() passes a 1-D target; a column-vector y makes k-NN emit a
# DataConversionWarning on fit.
classifier.fit(X, y.ravel())
joblib.dump(classifier,'knn.sav')
classifier = joblib.load('knn.sav')
# NOTE(review): the name y_hat_grad is reused from the gradient-boosting cell and now
# holds the k-NN class probabilities; kept unchanged in case later cells read it.
y_hat_grad = classifier.predict_proba(X)
y_hat_grad

  


array([[0.8       , 0.2       ],
       [0.8       , 0.2       ],
       [0.86666667, 0.13333333],
       ...,
       [0.53333333, 0.46666667],
       [0.46666667, 0.53333333],
       [0.8       , 0.2       ]])

In [42]:
# Reload the persisted estimators from disk.
# NOTE: joblib.load unpickles arbitrary objects — only load .sav files you trust.
model1 = joblib.load('logistic.sav')
model2 = joblib.load('decision_tree.sav')
model3 = joblib.load('random_forest.sav')
model4 = joblib.load('svc.sav')
# NOTE(review): model5 (k-NN) is loaded but is not in get_mod's default model list,
# and 'grad_boost.sav' is never loaded — confirm whether that is intentional.
model5 = joblib.load('knn.sav')


def get_mod(array, models = [model1, model2, model3, model4]):
    """Append one probability column per model to the right of ``array``.

    For every estimator in ``models`` (in order), ``predict_proba`` is evaluated
    on the original ``array`` and its first column is kept.

    Parameters
    ----------
    array : 2-D array
        Feature matrix passed to every model's ``predict_proba``.
    models : iterable of fitted estimators
        Defaults to the four models loaded at module level; the default list is
        bound when the function is defined.

    Returns
    -------
    numpy.ndarray
        ``array`` with ``len(models)`` extra columns, or ``array`` itself when
        ``models`` is empty.
    """
    # Each entry is an (n, 1) slice: the first predict_proba column of one model.
    first_class_probs = [fitted.predict_proba(array)[:, 0:1] for fitted in models]
    if not first_class_probs:
        return array
    return np.concatenate([array] + first_class_probs, axis=1)

In [43]:
# Show the stacked matrix: the PCA features followed by one probability column
# per model in get_mod's default list.
print(get_mod(X))

[[-1.87254209e-02 -5.70841564e-03 -1.24395314e-02 ...  7.94183445e-01
   9.00000000e-01  7.74845292e-01]
 [-3.81467885e-03 -6.07600138e-03 -8.78387403e-03 ...  8.10810811e-01
   4.00000000e-01  7.68980374e-01]
 [ 2.06240000e-02  5.27766238e-04 -6.59130431e-03 ...  7.88690476e-01
   2.00000000e-01  7.70524667e-01]
 ...
 [-7.31252940e-02 -3.93767567e-03 -1.94626845e-03 ...  8.19095477e-01
   3.00000000e-01  7.51108824e-01]
 [ 2.67767676e-02 -2.98485731e-03 -1.98550170e-03 ...  7.17998700e-01
   1.00000000e-01  7.68343782e-01]
 [-4.04172468e-02  2.20370674e-03  1.55267955e-02 ...  8.16513761e-01
   1.00000000e-01  7.43363871e-01]]
