In [54]:
import datetime as dt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

data_folder = "" # ROOT FOLDER CHANGE THIS

hr_file = data_folder + "Smartwatch_HeartRateDatum.csv"
acc_file = data_folder + "Sensus_Accelerometer.csv"
loc_file = data_folder + "Sensus_Location.csv"

def read_file(filename):
    df = pd.read_csv(filename, header = 0)
    df['Timestamp'] = pd.to_datetime(df['Timestamp'],
                                    origin="unix", 
                                    errors="coerce")

    return df

In [93]:
hr_df = read_file(hr_file)[["participantid","HR","Timestamp"]]
hr_df.columns = ['PID', 'HR', "Timestamp"] # reset column name
hr_df[['HR','PID']]=  hr_df[['HR','PID']].apply(pd.to_numeric,errors='coerce', downcast = "float") # change variable type
hr_df.set_index(['Timestamp'], inplace=True) # set timestamp as index 
hr_df = hr_df.resample("S").mean().reset_index() # resample by second
print(hr_df.shape)
print(hr_df[:5])

(310916351, 3)
            Timestamp  PID    HR
0 2009-01-01 08:01:10  4.0  82.0
1 2009-01-01 08:01:11  4.0  82.0
2 2009-01-01 08:01:12  4.0  83.0
3 2009-01-01 08:01:13  4.0  84.0
4 2009-01-01 08:01:14  4.0  84.0


In [87]:
acc_df = read_file(acc_file)[["ParticipantId","X","Y","Z","Timestamp"]]
acc_df.columns = ["PID","X","Y","Z","Timestamp"] # reset column name
acc_df[['PID','X','Y','Z']] = acc_df[['PID','X','Y','Z']].apply(pd.to_numeric,
                                                errors='coerce', downcast = "float") # change variable type
acc_df.set_index(['Timestamp'], inplace=True) # set timestamp as index 
acc_df = acc_df.resample("S").mean().reset_index() # resample by second
print(acc_df.shape)
print(acc_df[:5])

(356933, 5)
            Timestamp  PID         X         Y         Z
0 2018-11-01 22:10:32  9.0  0.392494  0.767774  0.398849
1 2018-11-01 22:10:33  9.0  0.323512  0.944885  0.306236
2 2018-11-01 22:10:34  9.0  0.298798  0.884627  0.274478
3 2018-11-01 22:10:35  9.0  0.423917  0.867023  0.362699
4 2018-11-01 22:10:36  9.0  0.353010  0.877254  0.351543


In [99]:
loc_df = read_file(loc_file)[["ParticipantId","Latitude","Longitude","Timestamp"]]
loc_df.columns = ["PID","Latitude","Longitude","Timestamp"] # reset column name
loc_df[['PID','Latitude', 'Longitude']] = loc_df[['PID','Latitude', 'Longitude']].apply(pd.to_numeric,
                                                            errors='coerce',
                                                            downcast="float") # change variable type
loc_df.set_index(['Timestamp'], inplace=True) # set timestamp as index 
loc_df = loc_df.resample("S").mean().reset_index() # resample by second
print(loc_df.shape)
print(loc_df[:5])

(362164, 4)
            Timestamp  PID  Latitude  Longitude
0 2018-11-01 20:43:21  9.0  38.03196 -78.510094
1 2018-11-01 20:43:22  9.0  38.03196 -78.510094
2 2018-11-01 20:43:23  9.0  38.03196 -78.510094
3 2018-11-01 20:43:24  NaN       NaN        NaN
4 2018-11-01 20:43:25  NaN       NaN        NaN


In [96]:
hr_acc_df = pd.merge(hr_df, acc_df, how="inner", on=["Timestamp", "PID"])
print(hr_acc_df.shape)
hr_acc_df = hr_acc_df.dropna()
print(hr_acc_df.shape)

(61468, 6)
(1075, 6)


In [97]:
combined_df = pd.merge(hr_acc_df, loc_df, how="inner", on=["Timestamp", "PID"])
print(combined_df.shape)
combined_df = combined_df.dropna()
print(combined_df.shape)

(43, 8)
(43, 8)


In [98]:
print(combined_df[:5])

            Timestamp  PID    HR         X         Y         Z   Latitude  \
0 2018-11-02 04:00:30  9.0  68.0 -0.009509 -0.060902 -0.986722  38.015347   
1 2018-11-02 04:02:28  9.0  67.0  0.041428 -0.024105 -0.982643  38.015347   
2 2018-11-02 04:05:06  9.0  67.0 -0.016170 -0.046767 -0.987743  38.015347   
3 2018-11-02 04:10:36  9.0  75.0 -0.011375 -0.031504 -0.987657  38.015347   
4 2018-11-02 04:14:40  9.0  97.0 -0.008729 -0.193506 -1.023444  38.015347   

   Longitude  
0 -78.522087  
1 -78.522087  
2 -78.522087  
3 -78.522087  
4 -78.522087  
