In [54]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import datetime
import tensorflow as tf 

# Data Preprocessing

In [55]:
def get_time_value(x):
    ''' 
    Input : x - array of time stamps 
    Output: returns time as microseconds
    '''
    t = datetime.datetime.fromtimestamp(x/1000.0)
    t = t.replace(microsecond = 0)
    return int(t.timestamp())


### Read accelerometer data 
acc_data = pd.read_csv('../data/all_accelerometer_data_pids_13.csv')

### create microsecond timestamps 
acc_data['window10'] = acc_data['time'].apply(get_time_value)
acc_data = acc_data.drop(columns="time")
acc_data = acc_data.rename(columns = {"window10": "time"})

# Check 01
acc_data.head()

Unnamed: 0,pid,x,y,z,time
0,JB3156,0.0,0.0,0.0,0
1,CC6740,0.0,0.0,0.0,0
2,SA0297,0.0758,0.0273,-0.0102,1493733882
3,SA0297,-0.0359,0.0794,0.0037,1493733882
4,SA0297,-0.2427,-0.0861,-0.0163,1493733882


In [56]:
# Check that there are 13 unique PIDs
print("No: of IDs = ", len(acc_data['pid'].unique()))
print(acc_data['pid'].unique())

No: of IDs =  13
['JB3156' 'CC6740' 'SA0297' 'PC6771' 'BK7610' 'DC6359' 'MC7070' 'MJ8002'
 'BU4707' 'JR8022' 'HV0618' 'SF3079' 'DK3500']


In [57]:
pids = acc_data['pid'].unique()
clean_tac_data = pd.DataFrame()

### Create dataframe with clean tac data for all 13 pids 
for pid in pids:
    temp = pd.read_csv('../data/clean_tac/' + pid + '_clean_TAC.csv')
    temp['pid'] = pid
    clean_tac_data = pd.concat([clean_tac_data, temp])

### Set binary label based on TAC reading threshold
clean_tac_data["tac"] = np.where(clean_tac_data["TAC_Reading"] > 0.08, 1, 0)
clean_tac_data = clean_tac_data.drop(columns="TAC_Reading")
clean_tac_data = clean_tac_data.rename(columns={"tac": "TAC_Reading"})
clean_tac_data.describe() 


Unnamed: 0,timestamp,TAC_Reading
count,715.0,715.0
mean,1493758000.0,0.243357
std,27079.92,0.429409
min,1493717000.0,0.0
25%,1493732000.0,0.0
50%,1493757000.0,0.0
75%,1493782000.0,0.0
max,1493811000.0,1.0


In [60]:
for pid in pids:
    print("PID "+pid+"=",clean_tac_data[clean_tac_data['pid']==pid]['timestamp'].is_monotonic_increasing)

PID JB3156= True
PID CC6740= True
PID SA0297= True
PID PC6771= True
PID BK7610= True
PID DC6359= True
PID MC7070= True
PID MJ8002= True
PID BU4707= True
PID JR8022= True
PID HV0618= True
PID SF3079= True
PID DK3500= True


In [70]:
## Random check to see if two pids can be taken together based on monotonically increasing time stamps 
clean_tac_data[(clean_tac_data['pid']=='JB3156')&(clean_tac_data['pid']=='BK7610')]['timestamp'].is_monotonic_increasing

True

In [62]:
clean_tac_data['timestamp'].is_monotonic_increasing 

False

In [None]:
# clean_tac_data.to_csv("clean_tac_data.csv")

In [72]:
clean_tac_data = clean_tac_data[clean_tac_data['pid']=="BK7610"]
clean_tac_data.shape

(57, 3)

In [73]:
# Up sampling tac data to match acc data

clean_ts = clean_tac_data['timestamp'] 
acc_ts = acc_data['time']
all_labels = list()
offset_tac, offset_acc = 0, 0

while offset_tac < len(clean_ts) and offset_acc < len(acc_ts):
  
  while acc_ts.iloc[offset_acc] < clean_ts.iloc[offset_tac]:
    all_labels.append([clean_tac_data.iloc[offset_tac]['TAC_Reading'], acc_ts.iloc[offset_acc]])
    offset_acc += 1
    if offset_acc >= len(acc_ts):
      break

  offset_tac += 1

all_labels

[[0, 0],
 [0, 0],
 [0, 1493733882],
 [0, 1493733882],
 [0, 1493733882],
 [0, 1493733883],
 [0, 1493733883],
 [0, 1493733883],
 [0, 1493733883],
 [0, 1493733883],
 [0, 1493733883],
 [0, 1493733883],
 [0, 1493733883],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 1493733884],
 [0, 14937

In [77]:
len(all_labels)

11738136

In [75]:
all_labels_df = pd.DataFrame(all_labels, columns = ["tac", "time"])
all_labels_df.shape, acc_data[acc_data['pid']==''].shape 

((11738136, 2), (0, 5))

In [69]:
all_labels_df.shape

(226433, 2)