In [1]:
import pandas as pd
import h5py
import numpy as np
import datetime
import copy
import os

In [2]:
# --- Derived tables -----------------------------------------------------------
# Each CSV under ../data/derived_data is read into its own DataFrame.
annotation = pd.read_csv("../data/derived_data/annotation.csv")
raw_meta = pd.read_csv("../data/derived_data/raw-meta.csv")
section_speed = pd.read_csv("../data/derived_data/section-speed.csv")
steps = pd.read_csv("../data/derived_data/steps.csv")
wheel = pd.read_csv("../data/derived_data/wheel.csv")

# --- Raw inputs ---------------------------------------------------------------
# sync.csv is a table; raw.h5 is opened read-only and the handle is kept in `raw`
# so later cells can look datasets up by path.
# NOTE(review): no matching raw.close() is visible in this part of the notebook.
sync = pd.read_csv("../data/raw_data/sync.csv")
raw_file = "../data/raw_data/raw.h5"
raw = h5py.File(raw_file, "r")

In [3]:
def convert_cols_to_time(dataframe, columns_to_convert):
    """Convert the listed columns of ``dataframe`` to epoch seconds, in place.

    Every value in each listed column is parsed with ``pd.Timestamp`` and
    replaced by ``Timestamp.timestamp()`` (a float POSIX timestamp).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify; the listed columns are overwritten on this object.
    columns_to_convert : iterable of str
        Names of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object that was passed in.

    Notes
    -----
    * Values that parse to ``NaT`` (``None``, ``NaN``, ...) become ``NaN``;
      calling ``.timestamp()`` on ``NaT`` would raise.
    * Columns that already have a float dtype are left untouched. Floats are
      what this function produces, and feeding them back through
      ``pd.Timestamp`` would reinterpret the seconds as nanoseconds, silently
      corrupting the data when a notebook cell is re-run.
    """
    def _to_epoch_seconds(value):
        stamp = pd.Timestamp(value)
        # NaT has no usable .timestamp(); propagate the gap as NaN instead.
        return float("nan") if stamp is pd.NaT else stamp.timestamp()

    for col in columns_to_convert:
        # Already converted (or entirely missing): converting again is unsafe.
        if pd.api.types.is_float_dtype(dataframe[col]):
            continue
        dataframe[col] = dataframe[col].apply(_to_epoch_seconds)
    return dataframe



In [4]:
# Replace the timestamp columns of both tables with epoch-time values.
sync = convert_cols_to_time(
    sync,
    ['subject.start', 'subject.end', 'wheel.start', 'wheel.end', 'video.start'],
)
wheel = convert_cols_to_time(wheel, ['start', 'end'])

# 'mid' is each wheel row's start time shifted by 0.5.
# NOTE(review): this is the interval midpoint only if every wheel row spans
# exactly one time unit -- confirm, or derive it from 'start' and 'end'.
wheel['mid'] = 0.5 + wheel['start']



In [5]:
# Build one long table of subject sensor samples, each labelled with the speed of
# the nearest wheel row, and write it to ../data/processed/gathered.csv.
data_columns = ["stamp", "SubjectId", "RunId", "x", "y", "z", "speed"]


def _nearest_speeds(stamps, mids, speeds, max_cells=4000000):
    """Return, for every stamp, the speed of the wheel row whose 'mid' is closest.

    Equivalent to calling ``np.argmin(np.abs(mids - stamp))`` once per stamp
    (ties resolve to the earliest wheel row), but evaluated in vectorised
    chunks. ``max_cells`` caps the size of the temporary (chunk, len(mids))
    distance matrix so memory use stays bounded.
    """
    nearest = np.zeros_like(stamps)
    chunk_size = max(1, max_cells // max(1, len(mids)))
    for lo in range(0, len(stamps), chunk_size):
        hi = lo + chunk_size
        distances = np.abs(mids[np.newaxis, :] - stamps[lo:hi, np.newaxis])
        nearest[lo:hi] = speeds[np.argmin(distances, axis=1)]
    return nearest


run_tables = []
for idx, row in sync.iterrows():
    subject_id = row['SubjectId']
    run_id = row['RunId']
    subject_start = row['subject.start']
    # NOTE(review): presumably 'subject.delta' is an offset in samples at
    # 100 samples per second, making this seconds -- confirm.
    delta = row['subject.delta'] / 100

    # Read the whole subject dataset in one call; indexing `raw` raises KeyError
    # for an unknown path. Transposed so that rows are samples.
    subject_data = raw[row["subject.path"]][()].T

    # Corrected stamps: one per sample, 0.01 apart, starting at the
    # delta-corrected subject start. (arange keeps the spacing at exactly 0.01,
    # unlike the previously tried linspace over len(subject_data) points.)
    stamps = 0.01 * np.arange(len(subject_data)) + subject_start - delta

    # Wheel rows belonging to this subject/run provide the speed labels.
    run_wheel_data = wheel.loc[(wheel['SubjectId'] == subject_id) & (wheel['RunId'] == run_id)]
    if len(run_wheel_data) == 0:
        raise ValueError(
            "no wheel rows for SubjectId={}, RunId={}".format(subject_id, run_id)
        )
    first_speed_stamp = run_wheel_data["start"].iloc[0]
    last_speed_stamp = run_wheel_data["end"].iloc[-1]

    # Keep only samples strictly inside the span covered by the wheel rows.
    # Filtering before the lookup avoids computing speeds that would be dropped.
    in_window = (stamps > first_speed_stamp) & (stamps < last_speed_stamp)
    stamps = stamps[in_window]
    subject_data = subject_data[in_window]

    speeds = _nearest_speeds(
        stamps,
        run_wheel_data['mid'].values,
        run_wheel_data['speed'].values,
    )

    # Discard samples whose speed is +inf (this introduces discontinuities in
    # the collection of time series).
    # NOTE(review): -inf and NaN speeds are kept, as before -- confirm intended.
    keep = speeds != np.inf
    stamps = stamps[keep]
    subject_data = subject_data[keep]
    speeds = speeds[keep]

    subjects = subject_id * np.ones_like(stamps)
    runs = run_id * np.ones_like(stamps)

    # Columns: stamp, SubjectId, RunId, all subject columns except the last one
    # (KSS, per the original comment), speed.
    run_gathered_data = np.concatenate(
        [
            stamps.reshape(-1, 1),
            subjects.reshape(-1, 1),
            runs.reshape(-1, 1),
            subject_data[:, 0:-1],
            speeds.reshape(-1, 1),
        ],
        axis=1,
    )
    run_tables.append(run_gathered_data)

    # The wheel raw dataset (row["wheel.path"]) is not loaded here.


# exist_ok avoids the check-then-create race and is a no-op if the folder exists.
os.makedirs("../data/processed", exist_ok=True)

data = pd.DataFrame(np.concatenate(run_tables), columns=data_columns)
data.to_csv("../data/processed/gathered.csv", index=False)