In [None]:
# default_exp load_data

# load_data

> The objective is to load and clean the data of all steady-state training sessions.

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#hide
import json
import os
import pandas as pd
import re

# from pandas.io.json import json_normalize

In [None]:
# Directory holding the exported session files. The POLAR_DATAPATH environment
# variable overrides the default, so the notebook also runs on machines where
# the export lives somewhere else.
DATAPATH: str = os.environ.get("POLAR_DATAPATH", "/data/polar-user-data-export_20200306/")
# fail early, and say which directory was expected, when the data is missing
assert os.path.isdir(DATAPATH), f"data directory not found: {DATAPATH}"

### Prepare and test regular expression
All files with steady-state training sessions have the string "training-session" in their file name. There are also other files, which are not of interest for now.

In [None]:
# Regular expression selecting the training-session files. It is anchored on
# both ends so that only JSON files whose name starts with "training-session-"
# are accepted. (A trailing "-*" would merely mean "zero or more hyphens" and
# would accept any name that contains "training-session".)
regex = r"^training-session-.*\.json$"

# assert the regular expression accepts a real file name ...
sample_name = "training-session-2019-07-05-123456789-123456d9-8f4b-458e-b50d-3e2039226034.json"
assert re.search(pattern=regex, string=sample_name)
# ... and rejects other files and non-JSON files
assert not re.search(pattern=regex, string="training-target-2019-07-05.json")
assert not re.search(pattern=regex, string="training-session-2019-07-05.txt")

del sample_name

### Get and test files list
Create iterator with all file names in the DATAPATH

In [None]:
# Collect all directory entries of the datapath in a list. os.scandir returns a
# one-shot iterator: testing it for emptiness would consume entries, which the
# parsing loop further down would then never see. A list can be checked and
# iterated as often as needed, and the with-block closes the scandir handle.
with os.scandir(DATAPATH) as entries:
    # sorted by name so the processing order is the same on every run
    session_files = sorted(entries, key=lambda entry: entry.name)

# check that the directory is not empty
assert session_files, f"no files found in {DATAPATH}"

### Initialize empty dataframe with the provided auto-laps
The autolaps are a distinct section of the provided JSON file in which the raw heart rate and speed data have already been converted into useful statistics for each kilometer.
For now, the raw data is ignored.

In [None]:
#hide
# column layout of the autolaps table; the table starts out without any rows
autolap_columns = [
    'PK', 'lapNumber', 'duration', 'splitTime', 'distance',
    'HR_min', 'HR_avg', 'HR_max', 'speed_avg', 'speed_max',
]
maf_running_autolaps = pd.DataFrame(columns=autolap_columns)


### Some helper functions are needed

In [None]:
def is_running_maf_session(session_info: pd.DataFrame) -> bool:
    """Tell whether a parsed training session is a MAF running session.

    The first row of ``session_info`` is inspected: the session counts as a
    MAF run when its ``target.name`` is 'MAF 90 min' or 'MAF running' and its
    ``sport`` is 'RUNNING'.

    Sessions without a ``target.name`` column are reported as ``False``. A few
    of those are MAF sessions that were recorded without a profile; they are
    not dealt with here yet.
    """
    # no target profile recorded -> not recognised as a MAF session
    if 'target.name' not in session_info.columns:
        return False

    # the sport is only looked up once the target name has matched
    target_matches = session_info['target.name'][0] in ('MAF 90 min', 'MAF running')
    return bool(target_matches and session_info['sport'][0] == 'RUNNING')



In [None]:
def get_running_autolaps(df_session_info: pd.DataFrame) -> pd.DataFrame:
    """Build the autolaps table of a single session (RUNNING profile only).

    The ``autoLaps`` entry of the first row is turned into one row per lap.
    The session's ``startTime`` is placed in front as the ``PK`` column, and
    the nested ``heartRate`` and ``speed`` dictionaries are flattened into
    ``HR_``- and ``speed_``-prefixed columns that replace the originals.
    """
    # the start time of the session serves as its key
    start_time: str = df_session_info['startTime'][0]

    # one row per autolap, with the session key as first column
    laps: pd.DataFrame = pd.DataFrame(df_session_info['autoLaps'][0])
    laps.insert(loc=0, column='PK', value=start_time)

    # unpack the per-lap dictionaries into flat, prefixed columns
    flattened = [
        pd.json_normalize(laps[nested]).add_prefix(prefix)
        for nested, prefix in (('heartRate', 'HR_'), ('speed', 'speed_'))
    ]

    # the flat columns take the place of the nested ones
    return pd.concat([laps.drop(columns=['heartRate', 'speed']), *flattened], axis=1)


### Parse all steady state training files

In [None]:
# parse all relevant files
# Every MAF running session contributes one autolaps frame. The frames are
# collected first and combined once: DataFrame.append was removed in pandas 2.0,
# and appending inside the loop copies the growing table on every iteration.
session_frames = []
for file in session_files:
    # only files named like a training session are of interest
    if not re.search(pattern=regex, string=file.name):
        continue
    # read the file and close it again before the content is processed
    with open(file.path, 'r', encoding='utf-8') as open_file:
        json_content: dict = json.load(open_file)
    # parse the json dictionary to a dataframe
    df_session_info: pd.DataFrame = pd.json_normalize(json_content['exercises'])
    if is_running_maf_session(df_session_info):
        session_frames.append(get_running_autolaps(df_session_info))

# without any matching session the initial (empty) table is left as it is
if session_frames:
    combined = pd.concat(session_frames, ignore_index=True)
    # keep the columns of the initial table, in their original order, and add
    # any extra columns the sessions brought along
    all_columns = maf_running_autolaps.columns.union(combined.columns, sort=False)
    maf_running_autolaps = combined.reindex(columns=all_columns)


In [None]:
# every row of the table is one autolap, so the row count is a number of laps,
# not a number of files
print(f"number of autolaps: {maf_running_autolaps.shape[0]}")


### Fix data types

In [None]:
# list the current dtype of every column as the starting point for the fixes
maf_running_autolaps.dtypes

PK should be the date_time column with datetime64 as dtype.

In [None]:
# PK holds the start time of the session as text; parse it into a new
# date_time column using the exact layout of that text
start_times = maf_running_autolaps['PK']
maf_running_autolaps['date_time'] = pd.to_datetime(start_times, format='%Y-%m-%dT%H:%M:%S.%f')

In [None]:
# split the timestamp into a calendar date and a time of day
timestamps = pd.to_datetime(maf_running_autolaps['date_time'])
maf_running_autolaps.loc[:, 'date'] = timestamps.dt.date
maf_running_autolaps.loc[:, 'start_time'] = timestamps.dt.time

Replace the date_time PK with a simple integer PK.

In [None]:
# the running row number (starting at 0) becomes the new primary key
row_count = len(maf_running_autolaps)
maf_running_autolaps['PK'] = range(row_count)

Convert several variables to integer

In [None]:
# On a top-to-bottom run the lap counter is still called 'lapNumber' here: the
# rename to 'lap_number' only happens in a later cell, so asking for
# 'lap_number' at this point raises a KeyError. Use whichever of the two names
# is present, so the cell also works when it is re-run after the rename.
lap_column = 'lapNumber' if 'lapNumber' in maf_running_autolaps.columns else 'lap_number'

# cast the counter, the heart-rate statistics and the distance to integers
maf_running_autolaps = maf_running_autolaps.astype({lap_column: int,
                                                    'HR_avg': int,
                                                    'HR_min': int,
                                                    'HR_max': int,
                                                    'distance': int})

Convert the variables that are supposed to be _seconds_ from an 'object' to a time object.
But first, remove the PT at the start and the S at the end.

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.extract.html
https://www.tutorialspoint.com/regular-expression-in-python-with-examples


In [None]:
# show the first rows of the table to eyeball the values
maf_running_autolaps.head()

In [None]:
# re-check the dtypes; the result is only displayed, nothing is changed
maf_running_autolaps.dtypes

In [None]:
# Adds the 'duration' values stored under the index labels 0 and 1.
# NOTE(review): the text above describes these values as 'object' entries with a
# PT prefix and an S suffix; if they are strings, + concatenates them instead of
# summing the seconds — confirm.
maf_running_autolaps['duration'][0]+maf_running_autolaps['duration'][1]

#### Rename some column names and reorder the columns

In [None]:
# switch the camelCase column names to snake_case
snake_case_names = {"lapNumber": "lap_number", "splitTime": "split_time"}
maf_running_autolaps = maf_running_autolaps.rename(columns=snake_case_names)

In [None]:
# desired column order of the final table; columns not listed here are dropped
column_order = [
    'PK', 'date', "lap_number", 'split_time', 'duration',
    'speed_avg', 'HR_avg', 'HR_min', 'HR_max', 'speed_max',
    'distance', 'start_time', 'date_time',
]
maf_running_autolaps = maf_running_autolaps[column_order]

In [None]:
# order the rows by date_time and, within the same date_time, by lap_number
# (ascending); the index is renumbered to follow the new order
maf_running_autolaps = maf_running_autolaps.sort_values(
    by=['date_time', 'lap_number'],
    ignore_index=True,
)

In [None]:
# first ten rows of the table
maf_running_autolaps.head(10)

In [None]:
# last ten rows of the table
maf_running_autolaps.tail(10)

In [None]:
# dtypes of the reordered and sorted table
maf_running_autolaps.dtypes

lap_number, distance and HR_* should all be integers.

In [None]:
# independent copy to experiment on, so maf_running_autolaps itself stays untouched
tmp = maf_running_autolaps.copy()

In [None]:
# trial conversion: the converted Series is only displayed, it is not stored in tmp
tmp['HR_avg'].astype(int)