# Safety Category

## 1) Data Exploration

In [1]:
import re
import os
import numpy as np
import pandas as pd
import glob
import dask.dataframe as dd

In [2]:
PROJ_DIR = os.path.abspath(os.curdir)

features_df = dd.read_csv(PROJ_DIR + '/safety/features/*.csv')
labels = pd.read_csv(PROJ_DIR + '/safety/labels/labels.csv')

In [3]:
features_df.describe()

Unnamed: 0_level_0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,Speed
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
,int64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64
,...,...,...,...,...,...,...,...,...,...,...


In [4]:
labels.head()

Unnamed: 0,bookingID,label
0,111669149733,0
1,335007449205,1
2,171798691856,0
3,1520418422900,0
4,798863917116,0


In [5]:
# glob.glob('data*.csv') - returns List[str]
# pd.read_csv(f) - returns pd.DataFrame()
# for f in glob.glob() - returns a List[DataFrames]
# pd.concat() - returns one pd.DataFrame()
dataset = pd.concat([pd.read_csv(f) for f in glob.glob(PROJ_DIR + '/safety/features/*.csv')], ignore_index = True)

In [6]:
dataset.shape

(16135561, 11)

In [7]:
dataset.head()

Unnamed: 0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,Speed
0,1202590843006,3.0,353.0,1.228867,8.9001,3.986968,0.008221,0.002269,-0.009966,1362.0,0.0
1,274877907034,9.293,17.0,0.032775,8.659933,4.7373,0.024629,0.004028,-0.010858,257.0,0.19
2,884763263056,3.0,189.0,1.139675,9.545974,1.951334,-0.006899,-0.01508,0.001122,973.0,0.667059
3,1073741824054,3.9,126.0,3.871543,10.386364,-0.136474,0.001344,-0.339601,-0.017956,902.0,7.913285
4,1056561954943,3.9,50.0,-0.112882,10.55096,-1.56011,0.130568,-0.061697,0.16153,820.0,20.419409


In [8]:
dataset_sorted = dataset.sort_values(by=['bookingID', 'second'])
dataset_sorted.head()

Unnamed: 0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,Speed
10835302,0,12.0,143.298294,0.818112,-9.941461,-2.014999,-0.016245,-0.09404,0.070732,0.0,3.442991
12007854,0,8.0,143.298294,0.546405,-9.83559,-2.038925,-0.047092,-0.078874,0.043187,1.0,0.228454
3394723,0,8.0,143.298294,-1.706207,-9.270792,-1.209448,-0.028965,-0.032652,0.01539,2.0,0.228454
436147,0,8.0,143.298294,-1.416705,-9.548032,-1.860977,-0.022413,0.005049,-0.025753,3.0,0.228454
9490986,0,8.0,143.298294,-0.598145,-9.853534,-1.378574,-0.014297,-0.046206,0.021902,4.0,0.228454


In [9]:
dataset_sorted['bookingID'].value_counts().head(15)
# The left side shows the bookingID, while the right side shows the number of data points for that trip

438086664371     7561
1374389534819    4499
34359738469      4302
1108101562533    3925
747324309632     3674
1486058684448    3611
893353197656     3486
1211180777592    3206
1365799600208    3195
120259084461     3172
412316860458     3061
1116691497104    3055
1262720385148    3026
515396075652     2996
1194000908355    2981
Name: bookingID, dtype: int64

In [10]:
bookingID_list = dataset_sorted['bookingID'].value_counts().keys()
bookingID_list

Int64Index([ 438086664371, 1374389534819,   34359738469, 1108101562533,
             747324309632, 1486058684448,  893353197656, 1211180777592,
            1365799600208,  120259084461,
            ...
             489626271850, 1185410973839,  721554505843,  420906795104,
            1486058684456, 1537598292022,  180388626478,  317827579936,
             472446402608, 1571958030400],
           dtype='int64', length=20000)

From the above, we can see that there are 20000 unique trips in the dataset

In [11]:
labels.shape

(20018, 2)

However, there are 20018 labels. Hence, we will need to remove the duplicate trips later.

## 2) Feature Engineering

For each bookingID, we obtain the the following for each feature
- average
- standard deviation
- maximum
- minimum
- 10th, 30th, 70th, 90th percentile
- length of trip (average speed * time)

In [12]:
mean_df = dataset.groupby('bookingID').mean().reset_index()
max_df = dataset.groupby('bookingID').max().reset_index()
min_df = dataset.groupby('bookingID').min().reset_index()
percentile30_df = dataset.groupby('bookingID').quantile(q=0.3).reset_index()
percentile70_df = dataset.groupby('bookingID').quantile(q=0.7).reset_index()
std_df = dataset.groupby('bookingID').std().reset_index()

In [13]:
time_df = max_df[["bookingID", "second"]].rename(index=str, columns={"second": "time"})
time_df.head()

Unnamed: 0,bookingID,time
0,0,1589.0
1,1,1034.0
2,2,825.0
3,4,1094.0
4,6,1094.0


### Steps: 
1) Rename the dataframes' columns into their respective fields (e.g. Accuracy_avg)  
2) Put the above dataframes into a single dataframe  
3) Remove duplicate trips  
4) Save the single dataframe to a csv

In [14]:
mean_df.head()

Unnamed: 0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,Speed
0,0,10.165339,176.526099,-0.711264,-9.613822,-1.619658,0.003328,-0.006118,-0.004188,903.526892,8.994822
1,1,3.718763,124.19859,-0.525406,9.532086,-2.198999,-0.002467,-0.00754,0.000405,581.175088,7.881588
2,2,3.930626,173.794872,0.306786,9.843183,0.139347,0.006458,-0.012861,0.002597,339.441026,3.157213
3,4,10.0,151.807013,-0.365117,-9.406439,-2.613639,-0.022884,0.023232,-0.000376,547.49543,6.150996
4,6,4.586721,197.812785,0.490616,9.538043,2.355059,0.003877,0.000436,0.00293,547.0,4.628921


In [15]:
mean_df = mean_df.rename(index=str, columns={"Accuracy":"Accuracy_avg", 
                                             "Bearing":"Bearing_avg", 
                                             "acceleration_x":"acceleration_x_avg", 
                                             "acceleration_y":"acceleration_y_avg", 
                                             "acceleration_z":"acceleration_z_avg", 
                                             "gyro_x":"gyro_x_avg", 
                                             "gyro_y":"gyro_y_avg", 
                                             "gyro_z":"gyro_z_avg", 
                                             "Speed":"Speed_avg"
                                            })

max_df = max_df.rename(index=str, columns={"Accuracy":"Accuracy_max", 
                                            "Bearing":"Bearing_max", 
                                            "acceleration_x":"acceleration_x_max", 
                                            "acceleration_y":"acceleration_y_max", 
                                            "acceleration_z":"acceleration_z_max", 
                                            "gyro_x":"gyro_x_max", 
                                            "gyro_y":"gyro_y_max", 
                                            "gyro_z":"gyro_z_max", 
                                            "Speed":"Speed_max"
                                            })

min_df = min_df.rename(index=str, columns={"Accuracy":"Accuracy_min", 
                                            "Bearing":"Bearing_min", 
                                            "acceleration_x":"acceleration_x_min", 
                                            "acceleration_y":"acceleration_y_min", 
                                            "acceleration_z":"acceleration_z_min", 
                                            "gyro_x":"gyro_x_min", 
                                            "gyro_y":"gyro_y_min", 
                                            "gyro_z":"gyro_z_min", 
                                            "Speed":"Speed_min"
                                            })

percentile30_df = percentile30_df.rename(index=str, columns={"Accuracy":"Accuracy_perc30", 
                                                             "Bearing":"Bearing_perc30", 
                                                             "acceleration_x":"acceleration_x_perc30", 
                                                             "acceleration_y":"acceleration_y_perc30", 
                                                             "acceleration_z":"acceleration_z_perc30", 
                                                             "gyro_x":"gyro_x_perc30", 
                                                             "gyro_y":"gyro_y_perc30", 
                                                             "gyro_z":"gyro_z_perc30", 
                                                             "Speed":"Speed_perc30"
                                                             })

percentile70_df = percentile70_df.rename(index=str, columns={"Accuracy":"Accuracy_perc70", 
                                                             "Bearing":"Bearing_perc70", 
                                                             "acceleration_x":"acceleration_x_perc70", 
                                                             "acceleration_y":"acceleration_y_perc70", 
                                                             "acceleration_z":"acceleration_z_perc70", 
                                                             "gyro_x":"gyro_x_perc70", 
                                                             "gyro_y":"gyro_y_perc70", 
                                                             "gyro_z":"gyro_z_perc70", 
                                                             "Speed":"Speed_perc70"
                                                             })
                                                             
std_df = std_df.rename(index=str, columns={"Accuracy":"Accuracy_std", 
                                           "Bearing":"Bearing_std", 
                                           "acceleration_x":"acceleration_x_std", 
                                           "acceleration_y":"acceleration_y_std", 
                                           "acceleration_z":"acceleration_z_std", 
                                           "gyro_x":"gyro_x_std", 
                                           "gyro_y":"gyro_y_std", 
                                           "gyro_z":"gyro_z_std", 
                                           "Speed":"Speed_std"
                                           })

In [24]:
list_of_dataframes = [mean_df, max_df, min_df, percentile30_df, percentile70_df, std_df, time_df]
# list_of_dataframes = [mean_df, time_df] # Only using mean as new features
safety_new_dataset = pd.concat(list_of_dataframes, axis=1)

In [25]:
safety_new_dataset = safety_new_dataset.drop(columns=["second", "bookingID"])
safety_new_dataset.head()

Unnamed: 0,Accuracy_avg,Bearing_avg,acceleration_x_avg,acceleration_y_avg,acceleration_z_avg,gyro_x_avg,gyro_y_avg,gyro_z_avg,Speed_avg,Accuracy_max,...,Accuracy_std,Bearing_std,acceleration_x_std,acceleration_y_std,acceleration_z_std,gyro_x_std,gyro_y_std,gyro_z_std,Speed_std,time
0,10.165339,176.526099,-0.711264,-9.613822,-1.619658,0.003328,-0.006118,-0.004188,8.994822,48.0,...,3.855898,129.231351,0.928022,0.639934,1.141266,0.065954,0.100225,0.063685,7.199919,1589.0
1,3.718763,124.19859,-0.525406,9.532086,-2.198999,-0.002467,-0.00754,0.000405,7.881588,7.709,...,0.597933,89.861236,0.744157,0.533915,0.854271,0.02774,0.091699,0.033838,7.059362,1034.0
2,3.930626,173.794872,0.306786,9.843183,0.139347,0.006458,-0.012861,0.002597,3.157213,8.0,...,1.117354,119.31652,0.756589,0.505693,1.020021,0.053903,0.117321,0.036215,2.897762,825.0
3,10.0,151.807013,-0.365117,-9.406439,-2.613639,-0.022884,0.023232,-0.000376,6.150996,10.0,...,0.0,71.273774,0.52722,0.598023,0.779529,0.042342,0.112567,0.065927,5.595901,1094.0
4,4.586721,197.812785,0.490616,9.538043,2.355059,0.003877,0.000436,0.00293,4.628921,12.0,...,1.329545,111.868249,0.826271,0.61721,0.942163,0.05517,0.106815,0.057438,5.314844,1094.0


In [26]:
bookingID_df = max_df[['bookingID']]
safety_new_dataset = pd.concat([bookingID_df, safety_new_dataset], axis=1)

In [27]:
safety_new_dataset.columns

Index(['bookingID', 'Accuracy_avg', 'Bearing_avg', 'acceleration_x_avg',
       'acceleration_y_avg', 'acceleration_z_avg', 'gyro_x_avg', 'gyro_y_avg',
       'gyro_z_avg', 'Speed_avg', 'Accuracy_max', 'Bearing_max',
       'acceleration_x_max', 'acceleration_y_max', 'acceleration_z_max',
       'gyro_x_max', 'gyro_y_max', 'gyro_z_max', 'Speed_max', 'Accuracy_min',
       'Bearing_min', 'acceleration_x_min', 'acceleration_y_min',
       'acceleration_z_min', 'gyro_x_min', 'gyro_y_min', 'gyro_z_min',
       'Speed_min', 'Accuracy_perc30', 'Bearing_perc30', 'Speed_perc30',
       'acceleration_x_perc30', 'acceleration_y_perc30',
       'acceleration_z_perc30', 'gyro_x_perc30', 'gyro_y_perc30',
       'gyro_z_perc30', 'Accuracy_perc70', 'Bearing_perc70', 'Speed_perc70',
       'acceleration_x_perc70', 'acceleration_y_perc70',
       'acceleration_z_perc70', 'gyro_x_perc70', 'gyro_y_perc70',
       'gyro_z_perc70', 'Accuracy_std', 'Bearing_std', 'acceleration_x_std',
       'acceleration_y

In [28]:
safety_new_dataset.shape

(20000, 56)

In [29]:
safety_new_dataset.head()

Unnamed: 0,bookingID,Accuracy_avg,Bearing_avg,acceleration_x_avg,acceleration_y_avg,acceleration_z_avg,gyro_x_avg,gyro_y_avg,gyro_z_avg,Speed_avg,...,Accuracy_std,Bearing_std,acceleration_x_std,acceleration_y_std,acceleration_z_std,gyro_x_std,gyro_y_std,gyro_z_std,Speed_std,time
0,0,10.165339,176.526099,-0.711264,-9.613822,-1.619658,0.003328,-0.006118,-0.004188,8.994822,...,3.855898,129.231351,0.928022,0.639934,1.141266,0.065954,0.100225,0.063685,7.199919,1589.0
1,1,3.718763,124.19859,-0.525406,9.532086,-2.198999,-0.002467,-0.00754,0.000405,7.881588,...,0.597933,89.861236,0.744157,0.533915,0.854271,0.02774,0.091699,0.033838,7.059362,1034.0
2,2,3.930626,173.794872,0.306786,9.843183,0.139347,0.006458,-0.012861,0.002597,3.157213,...,1.117354,119.31652,0.756589,0.505693,1.020021,0.053903,0.117321,0.036215,2.897762,825.0
3,4,10.0,151.807013,-0.365117,-9.406439,-2.613639,-0.022884,0.023232,-0.000376,6.150996,...,0.0,71.273774,0.52722,0.598023,0.779529,0.042342,0.112567,0.065927,5.595901,1094.0
4,6,4.586721,197.812785,0.490616,9.538043,2.355059,0.003877,0.000436,0.00293,4.628921,...,1.329545,111.868249,0.826271,0.61721,0.942163,0.05517,0.106815,0.057438,5.314844,1094.0


In [30]:
full_df = pd.merge(safety_new_dataset, labels, on="bookingID")
full_df = full_df.drop_duplicates('bookingID', keep='last')
full_df.shape

(20000, 57)

In [31]:
full_df.to_csv("safety_dataset_new.csv", index=False)