### AJ Data Pre-Processing

In [23]:
import os
import pandas as pd
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import re
import csv
import json
import numpy as np
import tcxparser
import glob
import time

#### Relevant parameters of interest (user-specific)

In [2]:
# Root folders for each data source, keyed by source name.
# NOTE(review): the two paths use different casing ('./Data' vs './data') — confirm
# this is intended on case-sensitive file systems.
paths_to_data = {'fit':'./Data/Juan/Run/Fit', 'arduino':'./data/arduino/Run'}
# Folder holding the per-session CSV files, built from the current working directory.
data_dir = os.getcwd()+'/Data/Juan/Run/'

# Load a single session and convert its 'timestamp' column from epoch milliseconds
# (the value is divided by 1000 before being passed to datetime.fromtimestamp) to a
# local-time string truncated to millisecond precision.
test = pd.read_csv(data_dir+'Run_0205.csv')
test['timestamp'] = test.apply(lambda x: datetime.fromtimestamp(x['timestamp'] / 1000).strftime('%Y-%m-%d %H:%M:%S.%f')[:-3] ,axis=1)

# Glob pattern, relative to paths_to_data['fit'], selecting the TCX activity files.
path_to_tcx = 'Actividades/*_Correr.tcx'

# Sorted list of the TCX file paths matching the pattern above; used as the default
# value of google_files in generate_full_time_series_AJ.
file_names = sorted(glob.iglob(os.path.join(paths_to_data['fit'], path_to_tcx)))

In [3]:
def missing_data_treatment(data, method='interpolation'):
    """
    Fill the missing data points produced by the arduino app.

    :param data: (pd.DataFrame) Session data containing missing values. It is not modified.
    :param method: (str) The methods we implement are:
        - 'constant': Will adopt the value of the previous observation (pandas 'zero' interpolation)
        - 'interpolation': Interpolate the data since we are in a continuous domain (pandas 'linear')
        - 'nearest': Adopts the value of the nearest value (pandas 'nearest')
    :return: (pd.DataFrame) A new DataFrame with the missing values filled in
    :raises ValueError: If method is not one of the names listed above
    """
    # Map the user-facing method names to the pandas interpolation methods.
    pandas_methods = {
        'interpolation': 'linear',
        'constant': 'zero',
        'nearest': 'nearest',
    }
    # Validate before doing any work so a typo fails fast.
    if method not in pandas_methods:
        raise ValueError('The method specified is not defined. Please review function missing_data_treatment')
    # interpolate() returns a new object, so the caller's frame stays untouched
    # without needing an explicit copy or inplace=True.
    return data.interpolate(method=pandas_methods[method])

In [4]:
def session_start(data, method='inference', value=None):
    """
    Sessions don't start in the moment in which Arduino starts recording data. There are a few residual seconds before
    we actually start running (e.g. setting up device). This function removes that initial part of the session.

    :param data: (pd.DataFrame) Session data. For method 'fixed' its index must be comparable with value
                 (e.g. a DatetimeIndex). It is not modified.
    :param method: (str)
        - 'inference': Look at the data and try to guess when we are actually starting to run. Not implemented yet.
        - 'percentage': Remove the initial fraction of the observations given by value (try to be conservative,
                        better to remove part of the session than keep irrelevant data)
        - 'fixed': Keep only the rows whose index is strictly greater than value (not recommended but can be
                   useful for testing purposes)
    :param value: (depends) If the method is percentage provide a float between 0 and 1 (e.g. 0.02 to remove 2% of
                  the initial data set), if the method is fixed provide a string with date format
                  (e.g. '2021-05-02 18:01:00')
    :return: (pd.DataFrame) A new DataFrame without the initial observations
    :raises ValueError: If the method is unknown or not implemented, or if value is missing / out of range
    """
    if method == 'inference':
        raise ValueError('Method inference from session_start function is still under construction...come back later')

    if method == 'percentage':
        # A missing value would fail with an opaque TypeError, and a negative one
        # would silently keep only the END of the session; reject both explicitly.
        if value is None or not 0 <= value <= 1:
            raise ValueError('Method percentage from session_start function requires a value between 0 and 1')
        rows_to_skip = int(len(data) * value)
        return data.iloc[rows_to_skip:].copy()

    if method == 'fixed':
        if value is None:
            raise ValueError('Method fixed from session_start function requires a starting point as value')
        # Filter ROWS by comparing the index with the starting point. Comparing the
        # whole frame (data > value) would test every cell against the date instead.
        return data.loc[data.index > value].copy()

    raise ValueError('The method specified is not defined. Please review function session_start')

In [202]:
def generate_full_time_series_AJ(path, file_identifier='Run_', plot=None, google_files=file_names):
    """
    We have decided to generate a full time series as dataset (i.e. put all the sessions together, making several time series
    into a single one). For this purpose we will simply do individual pre-processing steps on each of the csv files and then
    merge everything together.

    For every session file the steps are: convert the 'timestamp' column (epoch milliseconds) to local datetimes and use
    it as index, fill missing values, drop the first 2% of the rows, merge with the TCX rows recorded during the session,
    interpolate 'altitude' and 'distance' onto the sensor rows and finally discard the TCX rows themselves.

    :param path: (str) Folder in which csv files can be found
    :param file_identifier: (str) Regular expression searched in each file name to select the session files
    :param plot: (str or list) Column(s) of the final DataFrame to plot; nothing is plotted when falsy
    :param google_files: (list) TCX files passed to load_tcx. The default is bound to the module-level file_names
                         at the moment this function is defined.
    :return: (pd.DataFrame) All the sessions together, sorted by timestamp (empty DataFrame if no file matched)
    """
    session_files = [name for name in os.listdir(path) if re.search(file_identifier, name)]
    google_df = load_tcx(google_files)

    # Collect the per-session frames and concatenate once at the end; growing a
    # DataFrame with concat inside the loop copies all previous rows every time.
    session_frames = []

    for session in session_files:
        # os.path.join works whether or not `path` ends with a separator.
        df = pd.read_csv(os.path.join(path, session))
        df['timestamp'] = _epoch_ms_to_local_datetime(df['timestamp'])
        df = df.set_index('timestamp')
        df_missing = missing_data_treatment(df, method='interpolation')
        df_start = session_start(df_missing, method='percentage', value=.02)

        # Raw string: '\d' in a normal string literal is an invalid escape sequence.
        session_digits = re.findall(r'\d+', session)
        print(session_digits[0] if session_digits else session)
        session_begin = df_start.index.min()
        session_end = df_start.index.max()
        print('Session Start: ' + str(session_begin))
        print('Session End: ' + str(session_end))

        # TCX rows are selected on the untrimmed session bounds (df_missing), so
        # points recorded during the trimmed first 2% still take part in the
        # interpolation below.
        recorded_in_session = ((google_df.timestamp > df_missing.index.min()) &
                               (google_df.timestamp < df_missing.index.max()))
        reduce_tcx_df = google_df[recorded_in_session].set_index('timestamp')

        merged = df_start.merge(reduce_tcx_df, how='outer', sort=True, on='timestamp')
        # NOTE(review): method='linear' treats rows as equally spaced and ignores the
        # timestamps; method='time' may be more appropriate — confirm before changing.
        merged['altitude'] = merged['altitude'].interpolate(method='linear')
        merged['distance'] = merged['distance'].interpolate(method='linear')

        # Rows coming from the TCX files carry google_fit == True; keep the others.
        # drop() returns a new frame, which avoids mutating a filtered slice in place.
        sensor_rows = merged[~(merged['google_fit'] == True)].drop(columns=['google_fit'])
        inside_session = (sensor_rows.index > session_begin) & (sensor_rows.index < session_end)
        session_frames.append(sensor_rows[inside_session])
        print('+++++++++++++++')

    print('Finished pre-processing individual time series')

    # Without any matching session file there is nothing to concatenate, sort or plot.
    if not session_frames:
        return pd.DataFrame()

    full_df = pd.concat(session_frames).sort_values(by=['timestamp'])

    if plot:
        plt.figure(figsize=(20, 12))
        plt.plot(full_df[plot])
        plt.show()

    return full_df


def _epoch_ms_to_local_datetime(epoch_ms):
    """Convert a Series of epoch milliseconds to local-time datetimes truncated to milliseconds."""
    # datetime.fromtimestamp expects seconds and returns local time. Flooring to 'ms'
    # gives the same result as formatting with '%f' and cutting the last three digits.
    local_datetimes = epoch_ms.map(lambda ms: datetime.fromtimestamp(ms / 1000))
    return pd.to_datetime(local_datetimes).dt.floor('ms')
    

In [203]:
def load_tcx(file_list, utc_offset_hours=2):
    """
    Load tcx files into pandas DataFrame

    Known issue:
    the lists altitude, timestamp and distance have not the same length;
    we don't know the position of the missing data (potential values' shift);
    the last N values of each list are kept, N being the length of the shortest list.

    :param file_list: (iterable of str) Paths of the tcx files to load
    :param utc_offset_hours: (int or float) Hours added to the parsed timestamps. The default of 2 keeps the
                             previous hard-coded shift.
    :return: (pd.DataFrame) Columns 'timestamp', 'altitude', 'distance' and 'google_fit' (always True),
             sorted by timestamp
    """
    columns = ['timestamp', 'altitude', 'distance', 'google_fit']
    frames = []

    for file in file_list:
        tcx_obj = tcxparser.TCXParser(file)
        # Query each series once instead of once for the length and once for the data.
        times = tcx_obj.time_values()
        altitudes = tcx_obj.altitude_points()
        distances = tcx_obj.distance_values()
        default_len = min(len(times), len(altitudes), len(distances))

        # With default_len == 0 the slice x[-0:] would return the WHOLE list and the
        # DataFrame constructor would fail on mismatched lengths, so skip such files.
        if default_len > 0:
            frames.append(pd.DataFrame({'timestamp': times[-default_len:],
                                        'altitude': altitudes[-default_len:],
                                        'distance': distances[-default_len:],
                                        'google_fit': [True] * default_len}))
        print('======================')

    # Concatenate once; fall back to an empty frame with the expected columns.
    if frames:
        df = pd.concat(frames, ignore_index=True)
    else:
        df = pd.DataFrame(columns=columns)

    df["timestamp"] = pd.to_datetime(df["timestamp"], format="%Y-%m-%dT%H:%M:%S.%fZ")
    df['timestamp'] = df['timestamp'] + timedelta(hours=utc_offset_hours)
    # Going through str lets to_numeric handle values that are not plain floats.
    df['distance'] = pd.to_numeric(df['distance'].astype(str))

    return df.sort_values(by='timestamp')

In [204]:
# Build the merged dataset from every session CSV found in the folder. The TCX
# files are not passed here, so the function's default google_files is used.
full = generate_full_time_series_AJ('Data/Juan/Run/')

0205
Session Start: 2021-05-02 18:00:16.629000
Session End: 2021-05-02 18:06:20.428000
+++++++++++++++
0405
Session Start: 2021-05-04 18:58:39.120000
Session End: 2021-05-04 19:04:34.166000
+++++++++++++++
0705
Session Start: 2021-05-07 18:34:00.739000
Session End: 2021-05-07 18:39:30.192000
+++++++++++++++
1304
Session Start: 2021-04-13 18:50:38.821000
Session End: 2021-04-13 18:54:21.910000
+++++++++++++++
1504
Session Start: 2021-04-15 17:24:57.860000
Session End: 2021-04-15 17:32:12.599000
+++++++++++++++
1604
Session Start: 2021-04-16 19:17:10.348000
Session End: 2021-04-16 19:23:54.668000
+++++++++++++++
2104
Session Start: 2021-04-21 19:00:43.597000
Session End: 2021-04-21 19:09:09.684000
+++++++++++++++
2204
Session Start: 2021-04-22 19:14:33.233000
Session End: 2021-04-22 19:20:17.773000
+++++++++++++++
2404
Session Start: 2021-04-24 19:17:57.120000
Session End: 2021-04-24 19:24:35.061000
+++++++++++++++
2704
Session Start: 2021-04-27 18:49:04.895000
Session End: 2021-04-27 18

In [205]:
# Show only the rows that have an altitude value.
full.loc[full['altitude'].notna()]

Unnamed: 0_level_0,AmbientLightSensor,DecibelSource,PitchSensor,LinearAccelerometerSensor,AccX,AccY,AccZ,CompassSensor,MagneticRotationSensor,altitude,distance
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2021-04-15 17:30:01.085,103.0,45.965257,104.306641,16.211143,3.610840,26.871185,20.745987,211.437990,45.331361,57.551846,15.929916
2021-04-15 17:30:01.096,103.0,46.413841,104.306641,17.597163,5.139847,25.471375,20.068817,209.723653,45.357118,57.557832,15.929916
2021-04-15 17:30:01.104,103.0,46.862424,104.306641,18.983184,7.463287,22.877563,16.431717,204.528857,45.466799,57.563818,15.929916
2021-04-15 17:30:01.113,103.0,47.311008,104.306641,20.369204,8.020813,21.358124,10.160141,199.828581,45.576481,57.569804,15.929916
2021-04-15 17:30:01.114,103.0,47.759592,104.306641,21.755225,7.735270,19.838684,7.929230,196.602313,45.686162,57.575790,15.929916
...,...,...,...,...,...,...,...,...,...,...,...
2021-05-07 18:39:30.163,2.0,55.245296,92.265381,26.632775,-1.940506,27.378464,10.076385,115.962077,46.758833,67.452400,1028.699445
2021-05-07 18:39:30.172,2.0,55.245296,92.265381,26.632775,0.361389,22.497101,10.346771,108.241990,46.811786,67.452400,1028.699445
2021-05-07 18:39:30.173,2.0,55.245296,92.265381,26.632775,1.764778,21.571083,7.890541,105.130463,46.864740,67.452400,1028.699445
2021-05-07 18:39:30.182,2.0,55.245296,92.265381,26.632775,3.168167,20.645065,5.434311,89.060530,46.864740,67.452400,1028.699445


In [206]:
# Display the complete merged DataFrame as the cell output.
full

Unnamed: 0_level_0,AmbientLightSensor,DecibelSource,PitchSensor,LinearAccelerometerSensor,AccX,AccY,AccZ,CompassSensor,MagneticRotationSensor,altitude,distance
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2021-04-13 18:50:38.822,3.0,2.697640,0.000000,0.086087,-0.543106,1.861404,9.138397,289.753512,53.079867,,
2021-04-13 18:50:38.828,3.0,2.662151,0.000000,0.078798,-0.557461,1.861404,9.140793,292.111225,53.084956,,
2021-04-13 18:50:38.829,3.0,2.626662,0.000000,0.071508,-0.571815,1.861404,9.143188,292.275061,53.090045,,
2021-04-13 18:50:38.830,3.0,2.591174,0.000000,0.064219,-0.586170,1.861404,9.145584,292.438897,52.825756,,
2021-04-13 18:50:38.831,3.0,2.555685,0.000000,0.076216,-0.600525,1.878151,9.213776,292.206712,52.561466,,
...,...,...,...,...,...,...,...,...,...,...,...
2021-05-07 18:39:30.163,2.0,55.245296,92.265381,26.632775,-1.940506,27.378464,10.076385,115.962077,46.758833,67.4524,1028.699445
2021-05-07 18:39:30.172,2.0,55.245296,92.265381,26.632775,0.361389,22.497101,10.346771,108.241990,46.811786,67.4524,1028.699445
2021-05-07 18:39:30.173,2.0,55.245296,92.265381,26.632775,1.764778,21.571083,7.890541,105.130463,46.864740,67.4524,1028.699445
2021-05-07 18:39:30.182,2.0,55.245296,92.265381,26.632775,3.168167,20.645065,5.434311,89.060530,46.864740,67.4524,1028.699445


In [208]:
# Number of rows with a non-missing AccX value, followed by the total number of rows.
print(full['AccX'].notna().sum())
print(full.shape[0])

868580
868580
