In [1]:
ref='nb61-' #Note to matt

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
import seaborn as sns
import copy


import random

import matplotlib

plt.style.use('seaborn-whitegrid')
matplotlib.rcParams.update({
    'font.family': 'serif',
    'axes.titlesize':16,
    'axes.labelsize':16,
    'axes.xmargin':0.1,
    'axes.ymargin':0.1,
    'legend.fontsize':16,
    'xtick.labelsize' : 16,
    'ytick.labelsize' : 16,
    'lines.markersize': 10,
    'lines.linewidth' : 3,
    'font.size': 16
})



In [2]:
f = open("../reports/{}explore.txt".format(ref), "w")
f.write("Data Exploration File \n")
f.write("===================== \n")

23

# Importing Data

In [3]:

raw_data=pd.read_csv("../data/processed/gathered_w_rms.csv",index_col=False )
raw_data.head()

Unnamed: 0,stamp,SubjectId,RunId,x,y,z,speed,walk,rms
0,1527700000.0,52.0,1.0,-426.0,-102.0,124.0,0.133031,52-1,262.84089
1,1527700000.0,52.0,1.0,-413.0,-81.0,124.0,0.133031,52-1,253.31535
2,1527700000.0,52.0,1.0,-394.0,-69.0,126.0,0.133031,52-1,242.124624
3,1527700000.0,52.0,1.0,-367.0,-52.0,122.0,0.133031,52-1,225.297581
4,1527700000.0,52.0,1.0,-340.0,-36.0,118.0,0.133031,52-1,208.822093


In [4]:
# Extract individual walks
subject_id=raw_data['SubjectId'].values
walks=list(raw_data.drop_duplicates(subset=['walk'])['walk'].values)
#print('No. of available walks (time series) in the dataset is {} performed by {} subjects'.format(len(walks),len(set(subject_id))))
f.write('No. of available walks (time series) in the dataset is {} performed by {} subjects \n\n'.format(len(walks),len(set(subject_id))))

85

# Identify time discontinuity in walks

In [5]:
lags=[]
for walk in walks:
    walk_data=raw_data[raw_data['walk']==walk]
    stamps=walk_data['stamp'].values
    diff=np.diff(stamps)
    time_lag=np.where(diff<0.015,0,diff )
    lag_idx=np.nonzero(time_lag)
    lags.append(lag_idx)
    if time_lag.sum()>0:
        # print('{} discontinuity detected in walk: {} at indices: {} with lags {} sec'.format(len(lag_idx[0]),walk, list(lag_idx[0]), list(time_lag[lag_idx[0]])))
        f.write('{} discontinuity detected in walk: {} at indices: {} with lags {} sec \n'.format(len(lag_idx[0]),walk, list(lag_idx[0]), list(time_lag[lag_idx[0]])))
        # Drop data beyond the first discontinuity 
        walk_idx=np.array(walk_data.index)
        sliced_walk=walk_idx[lag_idx[0][0]:] #sliced from the first discontinuity to the end 
        raw_data=raw_data.drop(sliced_walk) #drop the sliced data from the raw data
        #print('{} samples dropped to eliminate discontinuity in walk {}!'.format(len(sliced_walk),walk))
        f.write('{} samples dropped to eliminate discontinuity in walk {}! \n'.format(len(sliced_walk),walk))
        f.write('\n')
         



# Filtering

In [6]:
#Group data by speeds
data_speeds=raw_data.drop_duplicates(subset=['speed'])
f.write('Total number of samples in the dataset is {} and includes {} distinct speeds \n'.format(len(raw_data), len(data_speeds)))
print('Total number of samples in the dataset is {} and includes {} distinct speeds \n'.format(len(raw_data), len(data_speeds)))
data_speeds=list(data_speeds['speed'].values)

lower_perc=0.25 #lower  percentile
higher_perc=0.75 #higher  percentile
data_qs=[] #holds lower and higher quartiles for each speed per acceleration
#data_filters=[] #holds boolean filters for each speed
data_naned_accls=[]
#data_accls=[]
data_ids=[]
for speed in data_speeds:
    speed_accls=raw_data[raw_data['speed']==speed]
    speed_ids=speed_accls[['stamp','SubjectId','RunId','walk','speed']]
    data_ids.append(speed_ids)
    speed_accls=speed_accls[['x','y','z','rms']]
    #data_accls.append(speed_accls)
    speed_qs=speed_accls.quantile([lower_perc,higher_perc])
    data_qs.append(speed_qs)
    speed_iqrs=speed_qs.iloc[1]-speed_qs.iloc[0]
    lower_cutoff=speed_qs.iloc[0]-1.5*speed_iqrs
    higher_cutoff=speed_qs.iloc[1]+1.5*speed_iqrs
    filter=(speed_accls >= lower_cutoff) & (speed_accls <=higher_cutoff)
    #data_filters.append(filter)
    speed_naned_accls=filter.applymap(lambda x: 1 if x else np.nan) #replace each false value by nan and the rest by 1
    speed_naned_accls=speed_accls*speed_naned_accls
    data_naned_accls.append(speed_naned_accls)
    

data_ids=pd.concat(data_ids)
#data_accls=pd.concat(data_accls)
data_naned_accls=pd.concat(data_naned_accls)
#resorted_raw_data=pd.concat([data_ids,data_accls],axis=1) #Constructs Dataframe of input data (which is raw data resorted due to grouping by speeds)
naned_data=pd.concat([data_ids,data_naned_accls],axis=1) #Constructs Dataframe of output (filtered) data  

no_x_outliers=naned_data['x'].isnull().sum()
no_y_outliers=naned_data['y'].isnull().sum()
no_z_outliers=naned_data['z'].isnull().sum()

f.write('No of outliers:\n X = {}\n Y = {}\n Z = {}\n\n'.format(no_x_outliers, no_y_outliers,no_z_outliers))
print('No of outliers:\n X = {}\n Y = {}\n Z = {}\n\n'.format(no_x_outliers, no_y_outliers,no_z_outliers))


Total number of samples in the dataset is 2041820 and includes 12256 distinct speeds 

No of outliers:
 X = 161094
 Y = 109799
 Z = 107514




# Interpolating 

In [7]:
indexed_data=naned_data.sort_index()# reindex to bring data back into time series
cleaned_data=[]
for walk in walks: # group data by walks
    walk_data=indexed_data[raw_data['walk']==walk]
    walk_cleaned_data=walk_data.interpolate(axis=0) #this will result in linear interpolation (i.e. sequence is equally spaced )
    cleaned_data.append(walk_cleaned_data)

cleaned_data=pd.concat(cleaned_data)
cleaned_data=cleaned_data.reset_index(drop=True)

# Recalculate RMS column after cleaning x, y and z
cleaned_data['rms']=(((cleaned_data.x)**2+(cleaned_data.y)**2+(cleaned_data.z)**2)/3)**(1/2)

no_x_nan=cleaned_data['x'].isnull().sum()
no_y_nan=cleaned_data['y'].isnull().sum()
no_z_nan=cleaned_data['z'].isnull().sum()
no_rms_nan=cleaned_data['rms'].isnull().sum()

f.write('After interpolation, No of leading NaNs:\n X = {}\n Y = {}\n Z = {}\n RMS = {}\n\n'.format(no_x_nan, no_y_nan,no_z_nan, no_rms_nan))
print('After interpolation, No of leading NaNs:\n X = {}\n Y = {}\n Z = {}\n RMS = {}\n\n'.format(no_x_nan, no_y_nan,no_z_nan,no_rms_nan))


After interpolation, No of leading NaNs:
 X = 504
 Y = 161
 Z = 375
 RMS = 616




In [8]:
# Find rows with nan (rows with nan values after interpolation are exported. Typically, these are the leading nans in every walks as they can not be interpolated)
nan_rows=cleaned_data.isnull().any(axis=1)
first_nan_idx=list(nan_rows[nan_rows==True].index.values)[0]
rows_with_nan=cleaned_data.iloc[first_nan_idx-10:] #take 10 rows before the first row with nan
rows_with_nan.to_csv("../data/processed/rows_with_nan_no_rounding.csv")
rows_with_nan.head(20)

Unnamed: 0,stamp,SubjectId,RunId,walk,speed,x,y,z,rms
2041810,1528365000.0,988.0,2.0,988-2,0.056256,334.0,6.0,2.0,192.869559
2041811,1528365000.0,988.0,2.0,988-2,0.056256,332.0,6.0,9.0,191.781994
2041812,1528365000.0,988.0,2.0,988-2,0.056256,338.0,6.0,8.0,195.229779
2041813,1528365000.0,988.0,2.0,988-2,0.056256,336.0,6.0,2.0,194.024053
2041814,1528365000.0,988.0,2.0,988-2,0.056256,337.0,6.0,0.0,194.597876
2041815,1528365000.0,988.0,2.0,988-2,0.056256,336.0,6.0,0.0,194.020617
2041816,1528365000.0,988.0,2.0,988-2,0.056256,332.0,6.0,2.0,191.715066
2041817,1528365000.0,988.0,2.0,988-2,0.056256,333.333333,6.0,6.0,192.512433
2041818,1528365000.0,988.0,2.0,988-2,0.056256,334.666667,6.0,0.0,193.25094
2041819,1528365000.0,988.0,2.0,988-2,0.056256,336.0,6.0,2.0,194.024053


In [9]:
cleaned_data=cleaned_data.dropna() #We dropped leading nans in every walk as it is very erattic if we extrapolate them
cleaned_data.to_csv("../data/processed/cleaned_w_interp_no_rounding.csv", index=False) #Data cleaned using interplation 

no_x_nan=cleaned_data['x'].isnull().sum()
no_y_nan=cleaned_data['y'].isnull().sum()
no_z_nan=cleaned_data['z'].isnull().sum()
no_rms_nan=cleaned_data['rms'].isnull().sum()

f.write('After dropping nans, No of NaNs:\n X = {}\n Y = {}\n Z = {}\n RMS = {}\n\n'.format(no_x_nan, no_y_nan,no_z_nan, no_rms_nan))
print('After dropping nans, No of NaNs:\n X = {}\n Y = {}\n Z = {}\n RMS = {}\n\n'.format(no_x_nan, no_y_nan,no_z_nan,no_rms_nan))

After dropping nans, No of NaNs:
 X = 0
 Y = 0
 Z = 0
 RMS = 0




In [10]:
f.close()