In [208]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os

In [209]:
# Reading and merging all csv files into one dataframe

DATA_DIR = '../data/safety/features'

colnames = ['bookingID', 'Accuracy', 'Bearing', 'acceleration_x',
             'acceleration_y', 'acceleration_z', 'gyro_x', 'gyro_y',
             'gyro_z', 'second', 'Speed']

# Collect every part file first and concatenate once at the end: calling
# pd.concat inside the loop re-copies all previously loaded rows on each
# iteration, and seeding the result with an empty object-dtype frame can
# upcast the numeric columns.
part_frames = []
rows_loaded = 0

for filename in os.listdir(DATA_DIR):
    if filename != '.DS_Store':
        csv_path = os.path.join(os.getcwd(), DATA_DIR, filename)
        print(csv_path)
        new_df = pd.read_csv(csv_path)
        part_frames.append(new_df)
        rows_loaded += len(new_df)
        # Running (rows so far, columns in this file) progress line
        print((rows_loaded, new_df.shape[1]))

if part_frames:
    raw_df = pd.concat(part_frames, axis=0, ignore_index=True)
    # Keep the expected columns first, in the documented order; any extra
    # columns found in the files follow.
    extra_cols = [c for c in raw_df.columns if c not in colnames]
    raw_df = raw_df.reindex(columns=colnames + extra_cols)
else:
    # No data files found: fall back to an empty frame with the expected schema
    raw_df = pd.DataFrame(columns=colnames)

/Users/itn.muhammad.afif/Documents/notebooks/AIforSEA_Safety/../data/safety/features/part-00001-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
(1613558, 11)
/Users/itn.muhammad.afif/Documents/notebooks/AIforSEA_Safety/../data/safety/features/part-00000-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
(3227112, 11)
/Users/itn.muhammad.afif/Documents/notebooks/AIforSEA_Safety/../data/safety/features/part-00003-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
(4840665, 11)
/Users/itn.muhammad.afif/Documents/notebooks/AIforSEA_Safety/../data/safety/features/part-00002-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
(6454220, 11)
/Users/itn.muhammad.afif/Documents/notebooks/AIforSEA_Safety/../data/safety/features/part-00005-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
(8067771, 11)
/Users/itn.muhammad.afif/Documents/notebooks/AIforSEA_Safety/../data/safety/features/part-00009-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
(9681333, 11)
/Users/itn.muhammad.afif/Documents/notebooks/AIforSEA_Safety/../

In [210]:
# Reading in labels csv

# NOTE: despite its name, LABEL_DIR is the full path of the single labels file.
LABEL_DIR = os.path.join(
    os.getcwd(),
    '../data/safety/labels',
    'part-00000-e9445087-aa0a-433b-a7f6-7f4c19d78ad6-c000.csv',
)

label_df = pd.read_csv(LABEL_DIR)
label_df.head()

Unnamed: 0,bookingID,label
0,111669149733,0
1,335007449205,1
2,171798691856,0
3,1520418422900,0
4,798863917116,0


In [211]:
# Checking and dropping the 18 duplicate bookingIDs

label_counts = np.unique(label_df['bookingID'].values, return_counts=True)
label_counts = pd.DataFrame({'bookingID' : label_counts[0],
                             'counts' : label_counts[1]})

dup = label_counts[label_counts['counts'] > 1]

dup_id = dup['bookingID'].values

# bookingIDs are labelled both 0 and 1, drop these IDs since we have sufficient data.
# A single vectorised isin() mask per frame replaces the per-ID loop, which
# rescanned every row of raw_df once for each duplicate ID.
label_df = label_df[~label_df['bookingID'].isin(dup_id)]
raw_df = raw_df[~raw_df['bookingID'].isin(dup_id)]

# Name the join key explicitly instead of letting merge() infer the shared columns
df = pd.merge(raw_df, label_df, on='bookingID')

In [None]:
# Dropping inaccurate readings 
# i.e. rows with accuracy > 30 or speed < 0

# df = df.loc[(df['Accuracy'] <= 30) & (df['Speed'] >= 0)]

### Feature Engineering

Adding additional columns 
1. Distance = 'Speed' * 'second' 
2. Acceleration = sqrt('acceleration_x' ** 2 + 'acceleration_y' ** 2 + 'acceleration_z' ** 2)
3. gyro_pc (magnitude) = PC1 of PCA applied on gyro_x, gyro_y, gyro_z
4. Speed_diff = per second difference in speed for each bookingID
5. Bearing_diff = per second difference in bearing for each bookingID

In [212]:
# sort_values() already returns a new frame, so the extra deep copy of the
# whole merged dataset is not needed; df itself is left untouched either way.
df_add = df.sort_values(by=['bookingID', 'second']).reset_index(drop=True)

# Applying PCA to gyro coordinates

from sklearn.decomposition import PCA

gyro_cols = ['gyro_x', 'gyro_y', 'gyro_z']
gyro_coord = df_add[gyro_cols]
pca = PCA()

# Fit on the three gyro axes; the last expression displays the share of
# variance captured by each principal component.
pca.fit(gyro_coord)
pca.explained_variance_ratio_

array([0.73374334, 0.13878273, 0.12747393])

In [213]:
# Since first PC accounts for > 70% of variance, we only keep 1 PC to explain gyro

# Project the gyro axes onto the fitted components and keep the first one only
gyro_scores = pca.transform(df_add[gyro_cols])
df_add['gyro_pc'] = gyro_scores[:, 0]
df_add.head()

Unnamed: 0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,Speed,label,gyro_pc
0,0,12.0,143.298294,0.818112,-9.941461,-2.014999,-0.016245,-0.09404,0.070732,0.0,3.442991,0,-0.088712
1,0,8.0,143.298294,0.546405,-9.83559,-2.038925,-0.047092,-0.078874,0.043187,1.0,0.228454,0,-0.076362
2,0,8.0,143.298294,-1.706207,-9.270792,-1.209448,-0.028965,-0.032652,0.01539,2.0,0.228454,0,-0.032132
3,0,8.0,143.298294,-1.416705,-9.548032,-1.860977,-0.022413,0.005049,-0.025753,3.0,0.228454,0,0.00233
4,0,8.0,143.298294,-0.598145,-9.853534,-1.378574,-0.014297,-0.046206,0.021902,4.0,0.228454,0,-0.044844


In [214]:
# Adding distance, acceleration and magnitude columns

# NOTE(review): this is Speed multiplied by the row's `second` value, not an
# accumulated path length -- confirm that is the intended "distance".
df_add['distance'] = df_add['Speed'] * df_add['second']

# Magnitude of the acceleration vector across the three axes
accel_sq_sum = (df_add['acceleration_x'] ** 2
                + df_add['acceleration_y'] ** 2
                + df_add['acceleration_z'] ** 2)
df_add['acceleration'] = np.sqrt(accel_sq_sum)

In [215]:
# Function to calculate difference in speed per second
def calc_speed_diff(x):
    """Return the row-to-row change in ``x['Speed']``.

    The first element is NaN because it has no preceding row.
    """
    speed = x['Speed']
    return speed - speed.shift(1)

# Function to calculate difference in bearing per second
def calc_bearing_diff(x):
    """Return the row-to-row change in ``x['Bearing']``.

    The first element is NaN because it has no preceding row. The values are
    differenced as plain numbers (no wrap-around handling).
    """
    bearing = x['Bearing']
    return bearing - bearing.shift(1)

# Per-booking differences in speed and bearing between consecutive rows
# Large change in speed == sudden speeding/braking
# Large change in bearing == sudden change in direction/lane change
# NOTE(review): Bearing is differenced as a plain number; if it is a compass
# heading, crossing the wrap-around point shows up as a very large jump.

for source_col, diff_col in [('Speed', 'speed_diff'), ('Bearing', 'bearing_diff')]:
    # groupby().diff() is aligned with df_add's index, so the result can be
    # assigned directly instead of positionally through .values
    per_booking_diff = df_add.groupby('bookingID')[source_col].diff()

    # The first row of each booking has no predecessor. Back-fill it from the
    # next row of the SAME booking: filling over the concatenated result would
    # copy the following booking's value into single-row bookings.
    per_booking_diff = per_booking_diff.groupby(df_add['bookingID']).bfill()

    # Single-row bookings have nothing to back-fill from; record no change
    df_add[diff_col] = per_booking_diff.fillna(0)

In [216]:
df_add.head(n=10)

Unnamed: 0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,Speed,label,gyro_pc,distance,acceleration,speed_diff,bearing_diff
0,0,12.0,143.298294,0.818112,-9.941461,-2.014999,-0.016245,-0.09404,0.070732,0.0,3.442991,0,-0.088712,0.0,10.176551,-3.214536,0.0
1,0,8.0,143.298294,0.546405,-9.83559,-2.038925,-0.047092,-0.078874,0.043187,1.0,0.228454,0,-0.076362,0.228454,10.059553,-3.214536,0.0
2,0,8.0,143.298294,-1.706207,-9.270792,-1.209448,-0.028965,-0.032652,0.01539,2.0,0.228454,0,-0.032132,0.456909,9.503762,0.0,0.0
3,0,8.0,143.298294,-1.416705,-9.548032,-1.860977,-0.022413,0.005049,-0.025753,3.0,0.228454,0,0.00233,0.685363,9.83032,0.0,0.0
4,0,8.0,143.298294,-0.598145,-9.853534,-1.378574,-0.014297,-0.046206,0.021902,4.0,0.228454,0,-0.044844,0.913818,9.967466,0.0,0.0
5,0,8.0,143.298294,-0.608313,-9.539658,-1.794583,-0.007538,-0.023838,0.018068,5.0,0.228454,0,-0.022721,1.142272,9.726029,0.0,0.0
6,0,8.0,143.298294,-0.867758,-9.698615,-1.615439,0.022728,-0.012178,0.005982,6.0,0.228454,0,-0.011466,1.370727,9.870449,0.0,0.0
7,0,8.0,143.298294,-1.05079,-9.74527,-1.411771,0.027603,0.001841,0.000904,7.0,0.228454,0,0.002201,1.599181,9.902906,0.0,0.0
8,0,8.0,143.298294,-0.721213,-9.960004,-1.202271,0.001864,-0.007702,0.014018,8.0,0.228454,0,-0.006775,1.827636,10.058194,0.0,0.0
9,0,8.0,143.298294,-0.346924,-9.532629,-1.204663,0.014962,-0.050033,0.025118,9.0,0.228454,0,-0.04784,2.05609,9.614707,0.0,0.0


### Creating df_sub1: aggregation of variables 
Taking mean, median, standard deviation, spread of acceleration, gyro_pc, speed and second

In [217]:
def spread(x):
    """Return the range of ``x``: its maximum minus its minimum."""
    lowest, highest = x.min(), x.max()
    return highest - lowest

agg_cols = ['acceleration', 'gyro_pc', 'Speed', 'second']

# Select the columns with a list: indexing a GroupBy with a bare tuple of
# names was deprecated and later removed in pandas.
df_sub1 = df_add.groupby('bookingID')[agg_cols].aggregate([
    'mean', 'median', 'std', spread
])

# Flatten the (column, statistic) MultiIndex into lower-case "column_statistic" names
df_sub1.columns = ["_".join(x).lower() for x in df_sub1.columns]

# Replace undefined statistics (e.g. std of a single-row booking) with 0
df_sub1 = df_sub1.fillna(0)

In [218]:
df_sub1.head(n=10)

Unnamed: 0_level_0,acceleration_mean,acceleration_median,acceleration_std,acceleration_spread,gyro_pc_mean,gyro_pc_median,gyro_pc_std,gyro_pc_spread,speed_mean,speed_median,speed_std,speed_spread,second_mean,second_median,second_std,second_spread
bookingID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,9.886164,9.852645,0.624159,6.530989,-0.006603,-0.002878,0.098805,1.101489,8.994822,8.503366,7.199919,23.946083,903.526892,1087.5,533.745097,1589.0
1,9.862507,9.844748,0.519956,5.819621,-0.00777,-0.003761,0.090758,1.122123,7.881588,6.904588,7.059362,22.882141,581.175088,607.0,289.339367,1034.0
2,9.92959,9.877755,0.515173,5.168422,-0.012728,0.001405,0.117067,0.896186,3.157213,2.998761,2.897762,9.360483,339.441026,97.0,356.319445,825.0
4,9.813434,9.791035,0.620066,13.349284,0.022444,0.024355,0.112611,1.166442,6.150996,3.31,5.595901,19.780001,547.49543,547.5,315.962793,1094.0
6,9.91809,9.904142,0.585346,7.280114,0.000501,0.004203,0.106436,1.161609,4.628921,1.936962,5.314844,16.394695,547.0,547.0,316.243577,1094.0
7,9.82647,9.7898,0.916836,8.572037,0.00264,-0.002707,0.072364,0.899634,12.176386,13.017325,8.680455,25.230654,480.947313,481.0,276.761488,959.0
8,9.762028,9.646309,0.728514,9.416841,-0.00093,0.000241,0.078281,0.754561,5.351266,3.5,5.661732,19.27,256.847545,268.0,130.713942,462.0
10,9.550677,9.49439,0.833292,9.474737,0.001928,-0.000594,0.110036,0.908451,8.702027,9.58,7.002632,20.05,187.0,187.0,108.397417,374.0
11,9.948639,9.877962,0.75048,5.686104,-0.003997,-0.002959,0.151739,0.986176,6.659024,5.192059,6.019429,17.876741,132.0,112.0,93.043769,299.0
14,9.85864,9.824785,0.44798,6.110603,-0.003204,0.001636,0.092127,0.947279,4.725448,3.173314,4.575357,21.780035,299.5,299.5,173.349358,599.0


### Creating df_sub2: detecting outlier values for speed, second, acceleration, gyro

In [219]:
print(df_add.columns)

# Checking 75th percentile for speed and second
# Checking 25th and 75th percentile for acceleration, gyro coordinates

outlier_vals = {'Speed_75' : np.quantile(df_add['Speed'], 0.75),
                'second_75' : np.quantile(df_add['second'], 0.75)}

# Key order: for each sensor, the three axes at the 25th percentile first,
# then the three axes at the 75th percentile.
for sensor in ('acceleration', 'gyro'):
    for pct in (25, 75):
        for axis in ('x', 'y', 'z'):
            axis_col = '{}_{}'.format(sensor, axis)
            outlier_vals['{}_{}'.format(axis_col, pct)] = np.quantile(df_add[axis_col], pct / 100)

outlier_vals

Index(['bookingID', 'Accuracy', 'Bearing', 'acceleration_x', 'acceleration_y',
       'acceleration_z', 'gyro_x', 'gyro_y', 'gyro_z', 'second', 'Speed',
       'label', 'gyro_pc', 'distance', 'acceleration', 'speed_diff',
       'bearing_diff'],
      dtype='object')


{'Speed_75': 15.48,
 'second_75': 862.0,
 'acceleration_x_25': -0.5084228515625,
 'acceleration_y_25': -1.9327545166015627,
 'acceleration_z_25': -0.9298957000000001,
 'acceleration_x_75': 0.6347808837890625,
 'acceleration_y_75': 9.709925,
 'acceleration_z_75': 2.751175,
 'gyro_x_25': -0.026791619720576535,
 'gyro_y_25': -0.02993423,
 'gyro_z_25': -0.018769693046769464,
 'gyro_x_75': 0.023315709027534718,
 'gyro_y_75': 0.03144551,
 'gyro_z_75': 0.018234253}

In [220]:
def quantile_check(x, col, q, thresholds=None):
    """Count the values of ``x[col]`` that lie beyond a precomputed percentile cut-off.

    Parameters
    ----------
    x : DataFrame-like
        Object indexable by ``col``.
    col : str
        Column to inspect.
    q : int
        25 counts values strictly below the cut-off; 75 counts values
        strictly above it.
    thresholds : mapping, optional
        Cut-offs keyed as ``"<col>_<q>"``. Defaults to the notebook-level
        ``outlier_vals`` dictionary.

    Returns
    -------
    Number of values of ``x[col]`` beyond the cut-off.

    Raises
    ------
    KeyError
        If ``thresholds`` has no ``"<col>_<q>"`` entry.
    ValueError
        If ``q`` is neither 25 nor 75 (previously this silently returned None).
    """
    if thresholds is None:
        thresholds = outlier_vals

    val = thresholds['{}_{}'.format(col, q)]

    if q == 25:
        return np.sum(x[col] < val)
    if q == 75:
        return np.sum(x[col] > val)

    raise ValueError("q must be 25 or 75, got {!r}".format(q))
    

# NOTE: this rebinds colnames (the loading cell uses the same name for the raw
# column list) to the six sensor axes checked below.
colnames = ['acceleration_x', 'acceleration_y', 'acceleration_z',
           'gyro_x', 'gyro_y', 'gyro_z']

# Flag every row once with vectorised comparisons, then sum the flags per
# booking. This replaces one Python-level groupby.apply pass per feature and
# avoids stitching results together positionally through .values.
over_flags = pd.DataFrame(index=df_add.index)
over_flags['over_Speed'] = (df_add['Speed'] > outlier_vals['Speed_75']).astype(int)
over_flags['over_second'] = (df_add['second'] > outlier_vals['second_75']).astype(int)

for col in colnames:
    print(col)
    below = df_add[col] < outlier_vals['{}_25'.format(col)]
    above = df_add[col] > outlier_vals['{}_75'.format(col)]
    # A value cannot be both below the 25th and above the 75th percentile
    # cut-off, so OR-ing the masks equals adding the two separate counts.
    over_flags['over_{}'.format(col)] = (below | above).astype(int)

# One row per bookingID (index), one count column per feature
df_sub2 = over_flags.groupby(df_add['bookingID']).sum()

acceleration_x
acceleration_y
acceleration_z
gyro_x
gyro_y
gyro_z


In [221]:
# Join the per-booking aggregates with the threshold counts, then attach the labels
df_add2 = (
    df_sub1
    .merge(df_sub2, on='bookingID')
    .merge(label_df, on='bookingID')
)
df_add2.columns = [name.lower() for name in df_add2.columns]

### Creating df_sub3: sliding window + IsolationForest

In [222]:
# Group the three windowed features by booking; this is a GroupBy object
# (not a DataFrame) that the rolling aggregation and count cells operate on.
window_features = df_add[['bookingID', 'Speed', 'acceleration', 'gyro_pc']].groupby('bookingID')
# Disabled: restrict to a random sample of 20 bookings for quick experiments
# ids = np.random.choice(window_features['bookingID'].unique(), size=20)
# window_features2 = window_features[window_features['bookingID'].isin(ids)].groupby('bookingID')

In [223]:
def sliding_window(df, step=8, overlap=4):
    """Aggregate ``df`` over fixed-size sliding windows.

    Parameters
    ----------
    df : DataFrame
        Rows of one booking, in time order. Every column is aggregated.
    step : int
        Number of rows in each window.
    overlap : int
        Number of rows the window start advances between consecutive windows.

    Returns
    -------
    DataFrame with one row per complete window and one ``<column>_<stat>``
    column per input column and statistic (mean, median, std). The Speed /
    acceleration / gyro_pc columns come first; columns derived from any other
    input column follow. Empty (columns only) when ``df`` has fewer than
    ``step`` rows.

    Raises
    ------
    ValueError
        If ``step`` or ``overlap`` is not a positive integer.
    """
    if step < 1 or overlap < 1:
        raise ValueError("step and overlap must be positive integers")

    expected_cols = ['Speed_mean', 'acceleration_mean', 'gyro_pc_mean',
                     'Speed_median', 'acceleration_median', 'gyro_pc_median',
                     'Speed_std', 'acceleration_std', 'gyro_pc_std']

    df = df.reset_index(drop=True)
    n_rows = len(df)

    # Collect one record per window and build the frame once at the end
    # instead of growing it with pd.concat inside the loop.
    records = []

    # The last valid start is n_rows - step, inclusive. The previous bound
    # compared against the last index and so dropped the final full window.
    for start in range(0, n_rows - step + 1, overlap):
        stats = df.iloc[start:start + step].aggregate(['mean', 'median', 'std'])
        records.append({'{}_{}'.format(col, stat): stats.loc[stat, col]
                        for stat in stats.index
                        for col in stats.columns})

    if not records:
        return pd.DataFrame(columns=expected_cols)

    booking_df = pd.DataFrame(records)
    extra_cols = [c for c in booking_df.columns if c not in expected_cols]
    return booking_df.reindex(columns=expected_cols + extra_cols)
    

    
import datetime
start_time = datetime.datetime.now()
print("Start time: {}".format(start_time))

# Rolling 8-row statistics per booking; incomplete windows are dropped.
# NOTE(review): [::4] strides over the concatenated output of all bookings,
# so it keeps every 4th row overall rather than every 4th window counted from
# the start of each booking.
window_features2 = window_features.rolling(8).agg(['mean', 'median', 'std']).dropna()[::4]

end_time = datetime.datetime.now()
print("End time: {}".format(end_time))

# timedelta.seconds omits the days component; total_seconds() is the full elapsed time
print("Total time taken: {} seconds".format(int((end_time - start_time).total_seconds())))

Start time: 2020-01-27 12:02:44.520037
End time: 2020-01-27 12:07:01.495672
Total time taken: 256 seconds


In [224]:
# Row counts per booking; bookings with at most 10 rows are collected in missing_ids
x = window_features.count()
speed_counts = x['Speed']
missing_ids = speed_counts[speed_counts <= 10].index.values

In [225]:
# Only flatten while the columns are still the (column, statistic) MultiIndex,
# so re-running this cell is a no-op instead of failing on the drop or
# mangling names that were already flattened.
if isinstance(window_features2.columns, pd.MultiIndex):
    # errors='ignore': tolerate the bookingID statistics being absent
    # (presumably depends on whether the rolling output kept the group column)
    window_features2 = window_features2.drop('bookingID', axis=1, errors='ignore')
    window_features2.columns = ['_'.join(col) for col in window_features2.columns]

In [226]:
from sklearn.ensemble import IsolationForest

columns=['Speed_mean', 'acceleration_mean', 'gyro_pc_mean',
                                       'Speed_median', 'acceleration_median', 'gyro_pc_median',
                                       'Speed_std', 'acceleration_std', 'gyro_pc_std']

speed_cols = ['Speed_mean', 'Speed_median', 'Speed_std']
accel_cols = ['acceleration_mean', 'acceleration_median', 'acceleration_std']
# Deliberately not named gyro_cols: the PCA cells use that name for the raw
# gyro axes, and rebinding it here would break them on a re-run.
gyro_pc_cols = ['gyro_pc_mean', 'gyro_pc_median', 'gyro_pc_std']

# IsolationForest builds randomised trees; fix the seed so the outlier flags
# (and every feature derived from them) are reproducible between runs.
ISO_SEED = 42

# One model per feature family; fit_predict returns 1 for inliers, -1 for outliers
iso1 = IsolationForest(random_state=ISO_SEED)
iso_speed = iso1.fit_predict(window_features2[speed_cols])

iso2 = IsolationForest(random_state=ISO_SEED)
iso_accel = iso2.fit_predict(window_features2[accel_cols])

iso3 = IsolationForest(random_state=ISO_SEED)
iso_gyro = iso3.fit_predict(window_features2[gyro_pc_cols])



In [23]:
# pred_outliers = iso.predict(window_features2)



In [227]:
# Attach the three per-window outlier flag arrays to the window features
for flag_col, flags in (('speed_outlier', iso_speed),
                        ('accel_outlier', iso_accel),
                        ('gyro_outlier', iso_gyro)):
    window_features2[flag_col] = flags

# Drop the second index level and turn the remaining index into a column
window_features3 = window_features2.droplevel(1).reset_index()

def count_outliers(x):
    """Tally the outlier flags of one group of windows.

    For each of the ``speed_outlier``, ``accel_outlier`` and ``gyro_outlier``
    columns, counts the entries equal to 1 and the entries equal to -1.

    Returns three ``(count_of_1, count_of_-1)`` pairs, in the order
    speed, accel, gyro.
    """
    def tally(flag_col):
        flags = x[flag_col]
        return np.sum(flags == 1), np.sum(flags == -1)

    return tally('speed_outlier'), tally('accel_outlier'), tally('gyro_outlier')

# One entry per bookingID: the three (count of 1, count of -1) pairs returned
# by count_outliers for that booking's windows.
outlier_df = window_features3.groupby('bookingID').apply(count_outliers)
outlier_df

bookingID
0                ((243, 7), (237, 13), (239, 11))
1                ((189, 22), (209, 2), (199, 12))
2                     ((47, 0), (47, 0), (41, 6))
4                ((260, 11), (262, 9), (245, 26))
6                 ((263, 9), (263, 9), (253, 19))
                               ...               
1709396983957    ((244, 5), (49, 200), (233, 16))
1709396983960    ((163, 18), (179, 2), (170, 11))
1709396983966    ((181, 13), (194, 0), (166, 28))
1709396983971    ((263, 4), (257, 10), (240, 27))
1709396983975      ((89, 18), (104, 3), (92, 15))
Length: 19982, dtype: object

In [228]:
# Flatten each booking's three (count of 1, count of -1) pairs into six
# columns; rows stay in the same order as outlier_df.
outlier_count_cols = ['num_non_speed_outlier', 'num_speed_outlier',
                      'num_non_accel_outlier', 'num_accel_outlier',
                      'num_non_gyro_outlier', 'num_gyro_outlier']

outlier_df2 = pd.DataFrame(
    [[count for pair in pairs for count in pair] for pairs in outlier_df],
    columns=outlier_count_cols,
)

In [230]:
# Attach the outlier counts by booking id instead of by row position: a
# positional concat only lines up when both frames contain exactly the same
# bookings in exactly the same order.
# outlier_df2 rows follow the order of outlier_df, whose index holds the ids.
outlier_keyed = outlier_df2.copy()
outlier_keyed.insert(0, 'bookingid', outlier_df.index.values)

df_add3 = pd.merge(df_add2, outlier_keyed, on='bookingid', how='left')

# A booking with no outlier row (no complete window) gets zero counts
outlier_count_names = list(outlier_df2.columns)
df_add3[outlier_count_names] = df_add3[outlier_count_names].fillna(0).astype(int)

In [231]:
df_add3.isnull().sum()

bookingid                0
acceleration_mean        0
acceleration_median      0
acceleration_std         0
acceleration_spread      0
gyro_pc_mean             0
gyro_pc_median           0
gyro_pc_std              0
gyro_pc_spread           0
speed_mean               0
speed_median             0
speed_std                0
speed_spread             0
second_mean              0
second_median            0
second_std               0
second_spread            0
over_speed               0
over_second              0
over_acceleration_x      0
over_acceleration_y      0
over_acceleration_z      0
over_gyro_x              0
over_gyro_y              0
over_gyro_z              0
label                    0
num_non_speed_outlier    0
num_speed_outlier        0
num_non_accel_outlier    0
num_accel_outlier        0
num_non_gyro_outlier     0
num_gyro_outlier         0
dtype: int64

### Creating df_sub4 (not dropping inaccurate rows and negative speeds)
1. Calculate acceleration using Euclidean distance
2. Sliding window of size 8 to aggregate on mean, median, std per bookingID
3. IsolationForest to identify outliers per bookingID

THIS TAKES FOREVER (~1.5 HOURS) TO RUN, DON'T USE

In [164]:
accel_df = df_add[['bookingID', 'acceleration', 'second', 'label']]

# 8-row rolling statistics of acceleration within each booking
rolling_accel = accel_df.groupby('bookingID')['acceleration'].rolling(8)
accel_agg = rolling_accel.aggregate(['mean', 'median', 'std'])

# Drop incomplete windows, discard the second index level and turn bookingID into a column
accel_agg = accel_agg.dropna().droplevel(1).reset_index()

In [175]:
import inspect

from sklearn.ensemble import IsolationForest

# The 'behaviour' argument only exists in some scikit-learn releases; passing
# it where it has been removed raises TypeError, so include it only when the
# installed constructor still declares it.
iso_kwargs = {'contamination': 'auto', 'random_state': 42}
if 'behaviour' in inspect.signature(IsolationForest.__init__).parameters:
    iso_kwargs['behaviour'] = 'new'

iso = IsolationForest(**iso_kwargs)

def detect_outliers(x):
    """Fit the shared IsolationForest on one booking's rows and flag each row.

    Parameters
    ----------
    x : DataFrame
        The rows of a single booking, as passed in by ``groupby.apply``.

    Returns
    -------
    A copy of ``x`` with an added ``outlier`` column holding the
    ``fit_predict`` output. ``x`` itself is not modified.
    """
    pred = iso.fit_predict(x)
    # assign() returns a new frame rather than writing into the group slice
    return x.assign(outlier=pred)

accel_agg2 = accel_agg.groupby('bookingID').apply(detect_outliers)

In [179]:
# SAVE_DIR is otherwise only assigned in the notebook's final cell, so define
# it here to keep this cell runnable in top-to-bottom order.
SAVE_DIR = os.path.join(os.getcwd(), '../data/safety/')
accel_agg2.to_csv(SAVE_DIR + "accel_agg.csv", index=False)

### Older (unused) pre-processing/aggregation

In [None]:
# from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()
# window_features5 = pd.DataFrame(scaler.fit_transform(window_features4), columns=window_features4.columns)
# window_features5.describe()

In [None]:
# from sklearn.cluster import KMeans 
# from sklearn import metrics 
# from scipy.spatial.distance import cdist

# distortions = [] 
# inertias = [] 
# mapping1 = {} 
# mapping2 = {} 
# K = range(1,10) 
  
# for k in K: 
#     print(k)
    
#     #Building and fitting the model 
#     kmeanModel = KMeans(n_clusters=k)
#     kmeanModel.fit(window_features4)     
      
#     distortions.append(sum(np.min(cdist(window_features4, kmeanModel.cluster_centers_, 
#                       'euclidean'),axis=1)) / window_features4.shape[0]) 
#     inertias.append(kmeanModel.inertia_) 
  
#     mapping1[k] = sum(np.min(cdist(window_features4, kmeanModel.cluster_centers_, 
#                  'euclidean'),axis=1)) / window_features4.shape[0] 
#     mapping2[k] = kmeanModel.inertia_ 

In [None]:
# for key,val in mapping1.items(): 
# 	print(str(key)+' : '+str(val)) 


In [None]:
# plt.plot(K, distortions, 'bx-') 
# plt.xlabel('Values of K') 
# plt.ylabel('Distortion') 
# plt.title('The Elbow Method using Distortion') 
# plt.show() # Should keep 5 clusters

In [None]:
# plt.plot(K, inertias, 'bx-') 
# plt.xlabel('Values of K') 
# plt.ylabel('Inertia') 
# plt.title('The Elbow Method using Inertia') 
# plt.show() 

In [None]:
# kmeans = KMeans(n_clusters=5)
# kmeans.fit(window_features5)

In [None]:
# window_features5['cluster'] = kmeans.predict(window_features5)
# window_features5['cluster'] += 1
# window_features5['cluster'].value_counts()

In [None]:
# booking_ids = []

# for idx in window_features3.index:
#     booking_ids.append(idx[0])
    
# len(booking_ids)

In [None]:
# window_features5['bookingID'] = booking_ids

In [None]:
# df_add2 = df_add.groupby('bookingID').aggregate({'second' : ['min', 'mean', 'max', 'median', 'std', spread], 
#                                                  'Speed' : ['min', 'mean', 'max', 'median', 'std', spread], 
#                                                 'distance' : ['min', 'mean', 'max', 'median', 'std', spread],
#                                                 'acceleration' : ['min', 'mean', 'max', 'median', 'std', spread],
#                                                 'gyro_pc' : ['min', 'mean', 'max', 'median', 'std', spread],
#                                                 'speed_diff' : ['min', 'mean', 'max', 'median', 'std', spread],
#                                                 'bearing_diff' : ['min', 'mean', 'max', 'median', 'std', spread]})

# df_add2.columns = ["_".join(x) for x in df_add2.columns.ravel()]
# df_add2 = pd.merge(df_add2, label_df, on='bookingID').drop('bookingID', axis='columns')
# df_add2.columns = [col.lower() for col in df_add2.columns]

In [None]:
# # Checking for correlations within aggregated data
# # Particularly, check for correlations between different features 
# # e.g. strong correlation between accuracy and speed etc

# corr_matrix = df_add2.corr()
# corr_matrix.where((corr_matrix > 0.5) & (corr_matrix != 1))

In [None]:
# def plot_corr_heatmap(df, vmax=1.0):
#     corr_matrix = df.corr()
    
#     mask = np.zeros_like(corr_matrix, dtype=np.bool)
#     mask[np.triu_indices_from(mask)] = True

#     # Set up the matplotlib figure
#     f, ax = plt.subplots(figsize=(11, 9))

#     # Generate a custom diverging colormap
#     cmap = sns.diverging_palette(220, 10, as_cmap=True)

#     # Draw the heatmap with the mask and correct aspect ratio
#     sns.heatmap(corr_matrix, mask=mask, cmap=cmap, vmax=vmax, 
#                 square=True, center=0, linewidths=.5)
    
# plot_corr_heatmap(df, vmax=0.6)

In [None]:
# sns.pairplot(df_add2, hue='label')

### Generating CSV for use in modelling.ipynb

In [232]:
# Output folder (relative to the notebook's working directory), with trailing separator
SAVE_DIR = os.path.join(os.getcwd(), '../data/safety/')

# Write the final per-booking feature table without the index column
features_csv = SAVE_DIR + "kfengtee.csv"
df_add3.to_csv(features_csv, index=False)