In [4]:
import pandas as pd
import numpy as np
import os
from datetime import datetime, timedelta
from pathlib import Path

In [5]:
Z3_min = 135
Z5_min = 173

In [6]:
def create_emptydf(start_date,end_date):
    """
    Creates empty DataFrame with date range
    Args:
        start_date (str): Start date in 'yyyy-mm-dd' format
        end_date (str): End date in 'yyyy-mm-dd' format
        
    Returns:
        empty (df): Eempty df ready for population
    """
    start = datetime.strptime(start_date, '%Y-%m-%d')
    end = datetime.strptime(end_date, '%Y-%m-%d')
    date_range = pd.date_range(start, end)

    df = pd.DataFrame({'Date': date_range})
    
    df['Date'] = df['Date'].dt.strftime('%Y-%m-%d')
    df['nr.sessions'] = 0
    df['total km'] = 0.0
    df['km Z3-4'] = 0.0
    df['km Z5-T1-T2'] = 0.0
    df['hours alternative'] = 0.0
    return df

In [7]:
empty = create_emptydf('2024-08-01','2024-09-30')
empty.head(10)

Unnamed: 0,Date,nr.sessions,total km,km Z3-4,km Z5-T1-T2,hours alternative
0,2024-08-01,0,0.0,0.0,0.0,0.0
1,2024-08-02,0,0.0,0.0,0.0,0.0
2,2024-08-03,0,0.0,0.0,0.0,0.0
3,2024-08-04,0,0.0,0.0,0.0,0.0
4,2024-08-05,0,0.0,0.0,0.0,0.0
5,2024-08-06,0,0.0,0.0,0.0,0.0
6,2024-08-07,0,0.0,0.0,0.0,0.0
7,2024-08-08,0,0.0,0.0,0.0,0.0
8,2024-08-09,0,0.0,0.0,0.0,0.0
9,2024-08-10,0,0.0,0.0,0.0,0.0


In [8]:
def readfiles(file_path="../data/external"):
    '''
    Creates lists of all csv files in directory

    Args: 
        file_path (str): the relative path for the folder that 
        contains all the activity files

    Returns:
        run_activities (list):
    '''
    fpath = Path(file_path)

    run_activities = list(fpath.glob(f'*Running_*.csv'))
    all_activities = list(fpath.glob(f'*.csv'))
    set_run = set(run_activities)
    set_all = set(all_activities)
    other_activities = list(set_all-set_run)

    return run_activities,other_activities

In [9]:
def readrun(file):
    df = pd.read_csv(file)
    return df

In [10]:
def populatebydate(emptydf,run_activities,other_activities):
    
    for i in emptydf['Date']:
        for file in run_activities:
            filedate =   datetime.strptime(str(file).split('_')[1], '%d-%m-%Y').strftime('%Y-%m-%d')
            if filedate == i:
                emptydf.loc[emptydf['Date'] == filedate,'nr.sessions'] += 1
                populateone(emptydf,str(file))


        for file in other_activities:
            filedate =  datetime.strptime(str(file).split('_')[1], '%d-%m-%Y').strftime('%Y-%m-%d')
            if filedate == i:
                temp_df= readrun(file)
                time_str = temp_df['Time'].iloc[-1]
                time_obj = datetime.strptime(time_str, '%H:%M:%S.%f').time()
                time_delta = timedelta(hours=time_obj.hour, minutes=time_obj.minute, seconds=time_obj.second, microseconds=time_obj.microsecond)
        
                hours_alternative = round(time_delta.total_seconds() / 3600, 2)

                emptydf.loc[emptydf['Date'] == filedate, 'hours alternative'] = hours_alternative

    df = emptydf
   
    return df

In [11]:
def populateone(df_prepop,filename):
    """
    Populates the empty DataFrame with the data from the file
    Args:
        df_prepop (df): DataFrame to be populated
        filename (str): Name of the file to be read
    Returns:
        df_postpop (df): Populated DataFrame
    """
    
    filedate =  datetime.strptime(filename.split('_')[1], '%d-%m-%Y').strftime('%Y-%m-%d')
    file_df = readrun(filename)
    df_prepop.loc[df_prepop['Date'] == filedate,'total km'] += file_df['Distance'].iloc[-1]
    for idx, row in file_df.iterrows():
        hr = row['Avg HR']
        distance = row['Distance']
        if Z3_min <= hr <= Z5_min:
            df_prepop.loc[df_prepop['Date'] == filedate, 'km Z3-4'] += distance
        elif hr > Z5_min:
            df_prepop.loc[df_prepop['Date'] == filedate, 'km Z5-T1-T2'] += distance
    
    df_postpop = df_prepop
    return df_postpop 
   

In [12]:
empty = create_emptydf('2024-08-01','2024-09-30')
r,o =readfiles()
df_full = populatebydate(empty,r,o)
df_full.tail(20)


Unnamed: 0,Date,nr.sessions,total km,km Z3-4,km Z5-T1-T2,hours alternative
41,2024-09-11,0,0.0,0.0,0.0,0.0
42,2024-09-12,0,0.0,0.0,0.0,0.0
43,2024-09-13,0,0.0,0.0,0.0,0.0
44,2024-09-14,1,2.37,0.0,0.0,0.0
45,2024-09-15,0,0.0,0.0,0.0,0.74
46,2024-09-16,0,0.0,0.0,0.0,0.0
47,2024-09-17,0,0.0,0.0,0.0,1.26
48,2024-09-18,1,2.43,0.0,0.0,0.75
49,2024-09-19,0,0.0,0.0,0.0,0.85
50,2024-09-20,0,0.0,0.0,0.0,0.0


In [13]:
def convert_to_day_approach(df):
    """
    Converts the DataFrame to a day approach format.
    
    Args:
        df (DataFrame): The DataFrame to convert.
        
    Returns:
        DataFrame: The converted DataFrame into a format with 7 lagging days 
        before each date in the format 

    """
    feature_cols = ['nr.sessions', 'total km', 'km Z3-4', 'km Z5-T1-T2', 'hours alternative']
    df_converted = pd.DataFrame()
    for i in range(0,7):
        for col in feature_cols:
            df_converted[f'{col}.{i}'] = df[col].shift(i)  
    df_converted['Date'] = df['Date']
    # drop rows with NaN values using dropna() with index as the row
    df_converted = df_converted.dropna()

    # replace the name of the column with the name of the column without the last 2 characters
    df_converted = df_converted.rename(columns={col: col[:-2] for col in df_converted.columns if col.endswith('.0')})


    # return df_lagged
    return df_converted          


In [14]:
dfday_user = convert_to_day_approach(df_full)
dfday_user

Unnamed: 0,nr.sessions,total km,km Z3-4,km Z5-T1-T2,hours alternative,nr.sessions.1,total km.1,km Z3-4.1,km Z5-T1-T2.1,hours alternative.1,...,total km.5,km Z3-4.5,km Z5-T1-T2.5,hours alternative.5,nr.sessions.6,total km.6,km Z3-4.6,km Z5-T1-T2.6,hours alternative.6,Date
6,0,0.0,0.0,0.0,0.0,1.0,9.84,14.84,2.84,0.0,...,0.0,0.0,0.0,0.0,1.0,6.38,1.0,0.0,0.0,2024-08-07
7,1,6.3,10.6,0.0,0.79,0.0,0.0,0.0,0.0,0.0,...,6.18,11.36,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024-08-08
8,0,0.0,0.0,0.0,1.48,1.0,6.3,10.6,0.0,0.79,...,12.53,24.06,0.0,0.0,1.0,6.18,11.36,0.0,0.0,2024-08-09
9,1,3.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.48,...,0.0,0.0,0.0,0.0,1.0,12.53,24.06,0.0,0.0,2024-08-10
10,2,12.71,15.1,5.1,0.0,1.0,3.25,0.0,0.0,0.0,...,9.84,14.84,2.84,0.0,0.0,0.0,0.0,0.0,0.0,2024-08-11
11,1,7.29,13.58,0.0,0.0,2.0,12.71,15.1,5.1,0.0,...,0.0,0.0,0.0,0.0,1.0,9.84,14.84,2.84,0.0,2024-08-12
12,0,0.0,0.0,0.0,0.0,1.0,7.29,13.58,0.0,0.0,...,6.3,10.6,0.0,0.79,0.0,0.0,0.0,0.0,0.0,2024-08-13
13,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.48,1.0,6.3,10.6,0.0,0.79,2024-08-14
14,1,10.12,19.24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.48,2024-08-15
15,1,12.0,20.0,1.0,0.0,1.0,10.12,19.24,0.0,0.0,...,12.71,15.1,5.1,0.0,1.0,3.25,0.0,0.0,0.0,2024-08-16


In [15]:
dfday_user

Unnamed: 0,nr.sessions,total km,km Z3-4,km Z5-T1-T2,hours alternative,nr.sessions.1,total km.1,km Z3-4.1,km Z5-T1-T2.1,hours alternative.1,...,total km.5,km Z3-4.5,km Z5-T1-T2.5,hours alternative.5,nr.sessions.6,total km.6,km Z3-4.6,km Z5-T1-T2.6,hours alternative.6,Date
6,0,0.0,0.0,0.0,0.0,1.0,9.84,14.84,2.84,0.0,...,0.0,0.0,0.0,0.0,1.0,6.38,1.0,0.0,0.0,2024-08-07
7,1,6.3,10.6,0.0,0.79,0.0,0.0,0.0,0.0,0.0,...,6.18,11.36,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024-08-08
8,0,0.0,0.0,0.0,1.48,1.0,6.3,10.6,0.0,0.79,...,12.53,24.06,0.0,0.0,1.0,6.18,11.36,0.0,0.0,2024-08-09
9,1,3.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.48,...,0.0,0.0,0.0,0.0,1.0,12.53,24.06,0.0,0.0,2024-08-10
10,2,12.71,15.1,5.1,0.0,1.0,3.25,0.0,0.0,0.0,...,9.84,14.84,2.84,0.0,0.0,0.0,0.0,0.0,0.0,2024-08-11
11,1,7.29,13.58,0.0,0.0,2.0,12.71,15.1,5.1,0.0,...,0.0,0.0,0.0,0.0,1.0,9.84,14.84,2.84,0.0,2024-08-12
12,0,0.0,0.0,0.0,0.0,1.0,7.29,13.58,0.0,0.0,...,6.3,10.6,0.0,0.79,0.0,0.0,0.0,0.0,0.0,2024-08-13
13,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.48,1.0,6.3,10.6,0.0,0.79,2024-08-14
14,1,10.12,19.24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.48,2024-08-15
15,1,12.0,20.0,1.0,0.0,1.0,10.12,19.24,0.0,0.0,...,12.71,15.1,5.1,0.0,1.0,3.25,0.0,0.0,0.0,2024-08-16


In [18]:
dfday_user.to_csv('../data/processed/df_full.csv', index=False)