In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os

In [2]:
# Reading and merging all csv files into one dataframe

DATA_DIR = 'data/safety/features'

colnames = ['bookingID', 'Accuracy', 'Bearing', 'acceleration_x',
             'acceleration_y', 'acceleration_z', 'gyro_x', 'gyro_y',
             'gyro_z', 'second', 'Speed']
raw_df = pd.DataFrame(columns=colnames)

for filename in os.listdir(DATA_DIR):
    if filename !=  '.DS_Store':
        print(os.path.join(os.getcwd(), DATA_DIR, filename))
        new_df = pd.read_csv(os.path.join(os.getcwd(), DATA_DIR, filename))
        raw_df = pd.concat([raw_df, new_df], axis=0, ignore_index=True)
        print(raw_df.shape)

/Users/itn.muhammad.afif/Documents/notebooks/data/safety/features/part-00001-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
(1613558, 11)
/Users/itn.muhammad.afif/Documents/notebooks/data/safety/features/part-00000-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
(3227112, 11)
/Users/itn.muhammad.afif/Documents/notebooks/data/safety/features/part-00003-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
(4840665, 11)
/Users/itn.muhammad.afif/Documents/notebooks/data/safety/features/part-00002-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
(6454220, 11)
/Users/itn.muhammad.afif/Documents/notebooks/data/safety/features/part-00005-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
(8067771, 11)
/Users/itn.muhammad.afif/Documents/notebooks/data/safety/features/part-00009-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
(9681333, 11)
/Users/itn.muhammad.afif/Documents/notebooks/data/safety/features/part-00004-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
(11294892, 11)
/Users/itn.muhammad.afif/Documents/noteb

In [3]:
# Reading in labels csv

LABEL_DIR = os.path.join(os.getcwd(), 'data/safety/labels', 
                         'part-00000-e9445087-aa0a-433b-a7f6-7f4c19d78ad6-c000.csv')

label_df = pd.read_csv(LABEL_DIR)
label_df.head()

Unnamed: 0,bookingID,label
0,111669149733,0
1,335007449205,1
2,171798691856,0
3,1520418422900,0
4,798863917116,0


In [4]:
# Checking and dropping the 18 duplicate bookingIDs 

label_counts = np.unique(label_df['bookingID'].values, return_counts=True)
label_counts = pd.DataFrame({'bookingID' : label_counts[0], 
                             'counts' : label_counts[1]})

dup = label_counts[label_counts['counts'] > 1]

dup_id = dup['bookingID'].values

# bookingIDs are labelled both 0 and 1, drop these IDs since we have sufficient data

for b in dup_id:
    idx1 = label_df[label_df['bookingID'] == b].index
    idx2 = raw_df[raw_df['bookingID'] == b].index

    label_df = label_df.drop(idx1, axis=0)
    raw_df = raw_df.drop(idx2, axis=0)
    
df = pd.merge(raw_df, label_df)

### Feature Engineering

Adding additional columns \
1) Distance = 'Speed' * 'second' \
2) Acceleration = sqrt('acceleration_x' ** 2, 'acceleration_y' ** 2, 'acceleration_z' ** 2) \
3) Magnitude = sqrt('gyro_x' ** 2, 'gyro_y' ** 2, 'gyro_z' ** 2) \
4) Speed_diff = per second difference in speed for each bookingID \
5) Bearing_diff = per second difference in bearing for each bookingID

In [5]:
# Adding distance, acceleration and magnitude columns

df_add = df.copy(deep=True).sort_values(by=['bookingID', 'second']).reset_index(drop=True)

df_add['distance'] = df_add['Speed'] * df_add['second']
df_add['acceleration'] = np.sqrt(np.power(df_add['acceleration_x'], 2) + 
                                 np.power(df_add['acceleration_y'], 2) + 
                                 np.power(df_add['acceleration_z'], 2))
df_add['magnitude'] = np.sqrt(np.power(df_add['gyro_x'], 2) + 
                              np.power(df_add['gyro_y'], 2) + 
                              np.power(df_add['gyro_z'], 2))

In [10]:
# Function to calculate difference in speed per second
def calc_speed_diff(x):
    return x['Speed'].diff()

# Function to calculate difference in bearing per second
def calc_bearing_diff(x):
    return x['Bearing'].diff()

# sub_df is a dataframe containing differences in speed and bearing
# values per second
# Large change in speed == sudden speeding/braking
# Large change in bearing == sudden change in direction/lane change

df_add['speed_diff'] = df_add.groupby('bookingID').apply(calc_speed_diff).fillna(0).values
df_add['bearing_diff'] = df_add.groupby('bookingID').apply(calc_bearing_diff).fillna(0).values

In [11]:
df_add2 = df_add.groupby('bookingID').aggregate({'Accuracy' : ['min', 'mean', 'max', 'median', 'std'], 
                                                 'Bearing' : ['min', 'mean', 'max', 'median', 'std'], 
                                                 'Speed' : ['min', 'mean', 'max', 'median', 'std'], 
                                                'distance' : ['min', 'mean', 'max', 'median', 'std'],
                                                'acceleration' : ['min', 'mean', 'max', 'median', 'std'],
                                                'magnitude' : ['min', 'mean', 'max', 'median', 'std'],
                                                'speed_diff' : ['min', 'mean', 'max', 'median', 'std'],
                                                'bearing_diff' : ['min', 'mean', 'max', 'median', 'std']})

df_add2.columns = ["_".join(x) for x in df_add2.columns.ravel()]
df_add2 = pd.merge(df_add2, label_df, on='bookingID').drop('bookingID', axis='columns')
df_add2.columns = [col.lower() for col in df_add2.columns]

In [31]:
df_add2.head(n=10)

Unnamed: 0,accuracy_min,accuracy_mean,accuracy_max,accuracy_median,accuracy_std,bearing_min,bearing_mean,bearing_max,bearing_median,bearing_std,...,speed_diff_mean,speed_diff_max,speed_diff_median,speed_diff_std,bearing_diff_min,bearing_diff_mean,bearing_diff_max,bearing_diff_median,bearing_diff_std,label
0,4.0,10.165339,48.0,8.0,3.855898,0.037464,176.526099,359.979767,144.299423,129.231351,...,0.014002,13.102637,0.0,0.910384,-358.965862,-0.141415,358.976575,0.0,29.914419,0
1,3.0,3.718763,7.709,3.9,0.597933,0.0,124.19859,337.0,110.0,89.861236,...,0.007991,20.428312,0.0,2.035444,-332.0,-0.175088,303.0,0.0,30.28525,1
2,3.0,3.930626,8.0,3.634,1.117354,1.0,173.794872,354.0,233.0,119.31652,...,-0.024874,3.29283,0.0,0.858242,-353.0,-1.538462,148.0,0.0,34.231659,1
3,10.0,10.0,10.0,10.0,0.0,2.271227,151.807013,353.855377,134.210114,71.273774,...,-0.005878,4.56,0.0,0.749901,-348.695439,0.213303,346.595214,0.0,25.957784,1
4,3.0,4.586721,12.0,4.004,1.329545,0.0,197.812785,359.0,256.0,111.868249,...,-0.005266,3.202953,0.0,0.672737,-349.0,0.167123,353.0,0.0,26.349126,0
5,3.0,3.681034,3.9,3.9,0.377849,0.0,101.562698,359.0,57.0,106.530782,...,0.006412,4.060644,0.0,0.702634,-357.0,0.145416,358.0,0.0,25.866583,0
6,3.0,7.008253,18.204,6.068,3.153024,0.0,172.803618,348.0,214.0,131.848507,...,-0.008346,5.75,0.0,0.925698,-345.0,-0.173127,309.0,0.0,45.524958,0
7,3.0,3.815,9.0,3.9,0.846416,0.0,120.605333,357.0,112.0,104.26021,...,0.04856,3.01,0.0,0.682591,-345.0,0.312,343.0,0.0,37.640024,0
8,3.0,4.22236,8.0,3.9,1.049047,6.0,140.8,356.0,124.0,98.746202,...,-0.011512,3.854901,0.0,0.945335,-331.0,0.737778,343.0,0.0,33.88607,0
9,16.0,20.283333,54.0,16.0,7.171851,12.221449,202.940358,352.757599,156.271118,86.852411,...,0.022345,12.460685,0.0,1.097135,-340.53615,-0.098409,26.940277,0.0,14.637863,0


In [43]:
# Checking for correlations within aggregated data
# Particularly, check for correlations between different features 
# e.g. strong correlation between accuracy and speed etc

corr_matrix = df_add2.corr()
corr_matrix.where((corr_matrix > 0.5) & (corr_matrix != 1))

Unnamed: 0,accuracy_min,accuracy_mean,accuracy_max,accuracy_median,accuracy_std,bearing_min,bearing_mean,bearing_max,bearing_median,bearing_std,...,speed_diff_mean,speed_diff_max,speed_diff_median,speed_diff_std,bearing_diff_min,bearing_diff_mean,bearing_diff_max,bearing_diff_median,bearing_diff_std,label
accuracy_min,,,,,,,,,,,...,,,,,,,,,,
accuracy_mean,,,,0.58566,0.829352,,,,,,...,,,,,,,,,,
accuracy_max,,,,,0.733153,,,,,,...,,,,,,,,,,
accuracy_median,,0.58566,,,,,,,,,...,,,,,,,,,,
accuracy_std,,0.829352,0.733153,,,,,,,,...,,,,,,,,,,
bearing_min,,,,,,,,,,,...,,,,,0.625162,,,,,
bearing_mean,,,,,,,,,0.909815,,...,,,,,,,,,,
bearing_max,,,,,,,,,,0.521956,...,,,,,,,0.650522,,,
bearing_median,,,,,,,0.909815,,,,...,,,,,,,,,,
bearing_std,,,,,,,,0.521956,,,...,,,,,,,0.566482,,0.674869,


In [None]:
# def plot_corr_heatmap(df, vmax=1.0):
#     corr_matrix = df.corr()
    
#     mask = np.zeros_like(corr_matrix, dtype=np.bool)
#     mask[np.triu_indices_from(mask)] = True

#     # Set up the matplotlib figure
#     f, ax = plt.subplots(figsize=(11, 9))

#     # Generate a custom diverging colormap
#     cmap = sns.diverging_palette(220, 10, as_cmap=True)

#     # Draw the heatmap with the mask and correct aspect ratio
#     sns.heatmap(corr_matrix, mask=mask, cmap=cmap, vmax=vmax, 
#                 square=True, center=0, linewidths=.5)
    
# plot_corr_heatmap(df, vmax=0.6)

In [None]:
# sns.pairplot(df_add2, hue='label')

In [47]:
SAVE_DIR = os.path.join(os.getcwd(), 'data/safety/')

df_add2.to_csv(SAVE_DIR + "massive_df.csv", index=False)