# Grab Challenge

In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

### Reading the data

In [None]:
li = []

for i in range(10):
    to_concat_df = pd.read_csv('part-0000{}-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv'.format(i))
    li.append(to_concat_df)

In [None]:
df = pd.concat(li, axis=0, ignore_index=True)
df

In [None]:
labels = pd.read_csv('part-00000-e9445087-aa0a-433b-a7f6-7f4c19d78ad6-c000.csv')
labels

In [None]:
drop_bookingIDs = np.random.choice(non_dangerous_ids, 5000, replace=False)

In [None]:
df = df.drop(df[df['bookingID'].isin(drop_bookingIDs)].index)
labels = labels.drop(labels[labels['bookingID'].isin(drop_bookingIDs)].index)

In [None]:
df.to_hdf('downsampled_features.hdf')
labels.to_feather('downsampled_labels.hdf')

Save data to hdf so that the big dataset can be read faster


In [None]:
df = pd.read_hdf('./drive/MyDrive/Grab Challenge/downsampled_features.hdf')
labels = pd.read_hdf('./drive/MyDrive/Grab Challenge/downsampled_labels.hdf')

In [27]:
# Remove duplicated labels
labels = labels[~labels['bookingID'].duplicated()]
labels

Unnamed: 0,bookingID,label
0,111669149733,0
1,335007449205,1
3,1520418422900,0
4,798863917116,0
5,283467841567,0
...,...,...
20012,558345748558,0
20013,429496729754,0
20014,154618822713,0
20015,979252543488,0


In [5]:
df

Unnamed: 0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,Speed
0,1202590843006,3.000,353.000000,1.228867,8.900100,3.986968,0.008221,0.002269,-0.009966,1362.0,0.000000
1,274877907034,9.293,17.000000,0.032775,8.659933,4.737300,0.024629,0.004028,-0.010858,257.0,0.190000
2,884763263056,3.000,189.000000,1.139675,9.545974,1.951334,-0.006899,-0.015080,0.001122,973.0,0.667059
4,1056561954943,3.900,50.000000,-0.112882,10.550960,-1.560110,0.130568,-0.061697,0.161530,820.0,20.419409
5,1185410973787,3.900,178.000000,0.805649,9.206902,2.954445,-0.057104,-0.043555,0.002334,533.0,19.250000
...,...,...,...,...,...,...,...,...,...,...,...
16135555,1331439861893,9.000,138.065338,-0.820355,-9.246567,-2.435046,-0.069028,-0.013546,0.031578,719.0,15.078815
16135556,592705486872,3.000,59.000000,0.189148,9.670476,1.625713,0.004508,0.100046,0.007831,155.0,0.000000
16135557,1357209665551,7.000,290.154938,-1.576559,-8.460007,-4.211835,0.049078,0.023569,-0.073863,83.0,11.094002
16135558,1520418422900,3.900,253.000000,-0.143478,10.014740,0.629425,0.007156,-0.026550,-0.013199,364.0,21.520000


For this problem, we can aggregate the data for each bookingID, to create features for each trip, which can be used to predict the label. 

Here are the features:

- Average Acceleration magnitude (Average Acceleration Magnitude for each second, sqrt(x^2 + y^2 + z^2) )

- Average gyro x/y/z squared (Gryo x/y/z squared average for each second, squared as so there are no -ve values)

- Average speed (Average speed for each second)

In [10]:
dic = {}
for key, group in df.groupby('bookingID'):
  trip_features = []
  group['acceleration_magnitude'] = np.sqrt(group['acceleration_x']**2 + group['acceleration_y']**2 + group['acceleration_z']**2)
  group['sq_gyro_x'] = group['gyro_x'] ** 2
  group['sq_gyro_y'] = group['gyro_y'] ** 2
  group['sq_gyro_z'] = group['gyro_z'] ** 2
  trip_features.append(group['acceleration_magnitude'].mean())
  trip_features.append(group['sq_gyro_x'].mean())
  trip_features.append(group['sq_gyro_y'].mean())
  trip_features.append(group['sq_gyro_z'].mean())
  trip_features.append(group['Speed'].mean())
  dic[key] = trip_features

In [33]:
agg_df = pd.DataFrame.from_dict(dic, orient='index', columns = ['avg_acceleration_magnitude', 'avg_gyrox_squared', 'avg_gyroy_squared', 'avg_gyroz_squared', 'avg_speed'])
agg_df.reset_index(inplace=True)
agg_df.rename(columns={'index': 'bookingID'}, inplace=True)
agg_df

Unnamed: 0,bookingID,avg_acceleration_magnitude,avg_gyrox_squared,avg_gyroy_squared,avg_gyroz_squared,avg_speed
0,0,9.886164,0.004357,0.010073,0.004069,8.994822
1,1,9.862507,0.000775,0.008456,0.001144,7.881588
2,2,9.929590,0.002932,0.013859,0.001312,3.157213
3,4,9.813434,0.002315,0.013199,0.004343,6.150996
4,6,9.918090,0.003056,0.011399,0.003305,4.628921
...,...,...,...,...,...,...
14995,1709396983957,11.332052,0.051908,0.026310,0.144247,2.305969
14996,1709396983960,9.876478,0.006233,0.007316,0.002076,7.611645
14997,1709396983966,9.723770,0.000561,0.009851,0.001769,12.718705
14998,1709396983971,9.775231,0.004286,0.028969,0.004458,6.245733


Merge together with labels

In [34]:
with_label = agg_df.merge(labels, how='inner', on='bookingID')
with_label

Unnamed: 0,bookingID,avg_acceleration_magnitude,avg_gyrox_squared,avg_gyroy_squared,avg_gyroz_squared,avg_speed,label
0,0,9.886164,0.004357,0.010073,0.004069,8.994822,0
1,1,9.862507,0.000775,0.008456,0.001144,7.881588,1
2,2,9.929590,0.002932,0.013859,0.001312,3.157213,1
3,4,9.813434,0.002315,0.013199,0.004343,6.150996,1
4,6,9.918090,0.003056,0.011399,0.003305,4.628921,0
...,...,...,...,...,...,...,...
14995,1709396983957,11.332052,0.051908,0.026310,0.144247,2.305969,1
14996,1709396983960,9.876478,0.006233,0.007316,0.002076,7.611645,1
14997,1709396983966,9.723770,0.000561,0.009851,0.001769,12.718705,1
14998,1709396983971,9.775231,0.004286,0.028969,0.004458,6.245733,1


Train Various Classifiers

In [36]:
X_train, X_test, y_train, y_test = train_test_split(with_label.drop('label', axis=1), with_label['label'], test_size = 0.2)

In [38]:
xgb_clf = XGBClassifier()
xgb_clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [39]:
score = xgb_clf.score(X_test, y_test)
print(score)

0.7013333333333334


In [46]:
rdf_clf = RandomForestClassifier()
rdf_clf.fit(X_train, y_train)
score = rdf_clf.score(X_test, y_test)
print(score)

0.6936666666666667


In [47]:
gbc_clf = GradientBoostingClassifier()
gbc_clf.fit(X_train, y_train)
score = gbc_clf.score(X_test, y_test)
print(score)

0.7026666666666667


In [49]:
dtc_clf = DecisionTreeClassifier()
dtc_clf.fit(X_train, y_train)
score = dtc_clf.score(X_test, y_test)
print(score)

0.612


Yay, GradientBoostingClassifier is amazing!
