In [61]:
import pandas as pd
import numpy as np
import os
from datetime import datetime, timedelta
from pathlib import Path

In [62]:
# Heart-rate thresholds used by populateone() to bin lap distance:
#   Z3_min <= Avg HR < Z5_min  -> counted in 'km Z3-4'
#   Avg HR >= Z5_min           -> counted in 'km Z5-T1-T2'
# NOTE(review): presumably beats per minute and specific to one athlete - confirm.
Z3_min = 135
Z5_min = 173

In [63]:
def create_emptydf(start_date,end_date):
    """
    Build a zero-filled daily training log covering an inclusive date range.

    Args:
        start_date (str): First day, formatted 'yyyy-mm-dd'
        end_date (str): Last day (inclusive), formatted 'yyyy-mm-dd'

    Returns:
        frame (df): One row per calendar day; 'Date' holds 'yyyy-mm-dd'
        strings and every metric column starts at zero
    """
    iso_format = '%Y-%m-%d'
    first_day = datetime.strptime(start_date, iso_format)
    last_day = datetime.strptime(end_date, iso_format)

    # Dates are kept as plain strings so they can be compared directly with
    # the dates parsed from activity file names.
    days = pd.Series(pd.date_range(first_day, last_day))
    frame = pd.DataFrame({'Date': days.dt.strftime(iso_format)})

    # The session counter is an integer; all other metrics are floats.
    metric_defaults = {
        'nr. sessions': 0,
        'total km': 0.0,
        'km Z3-4': 0.0,
        'km Z5-T1-T2': 0.0,
        'hours alternative': 0.0,
    }
    for column, zero in metric_defaults.items():
        frame[column] = zero
    return frame

In [64]:
# Quick check: build a zero-filled frame for 2024-08-01 .. 2024-09-30 and preview it.
empty = create_emptydf('2024-08-01','2024-09-30')
empty.head(10)

Unnamed: 0,Date,nr. sessions,total km,km Z3-4,km Z5-T1-T2,hours alternative
0,2024-08-01,0,0.0,0.0,0.0,0.0
1,2024-08-02,0,0.0,0.0,0.0,0.0
2,2024-08-03,0,0.0,0.0,0.0,0.0
3,2024-08-04,0,0.0,0.0,0.0,0.0
4,2024-08-05,0,0.0,0.0,0.0,0.0
5,2024-08-06,0,0.0,0.0,0.0,0.0
6,2024-08-07,0,0.0,0.0,0.0,0.0
7,2024-08-08,0,0.0,0.0,0.0,0.0
8,2024-08-09,0,0.0,0.0,0.0,0.0
9,2024-08-10,0,0.0,0.0,0.0,0.0


In [65]:
def readfiles(file_path="../data/external"):
    """
    Collect the activity CSV files in a folder, split into runs and others.

    A file counts as a running activity when its name matches
    '*Running_*.csv'; every other '*.csv' file in the folder is returned as
    an "other" activity. Only the folder itself is searched (no recursion).

    Both lists are returned sorted, so callers that process the files in
    order get the same result on every run and on every platform (glob order
    and set order are otherwise arbitrary).

    Args:
        file_path (str): the relative path for the folder that
        contains all the activity files

    Returns:
        run_activities (list): sorted Path objects of the running activities
        other_activities (list): sorted Path objects of all remaining CSV files
    """
    fpath = Path(file_path)

    run_activities = sorted(fpath.glob('*Running_*.csv'))
    run_set = set(run_activities)
    other_activities = sorted(
        path for path in fpath.glob('*.csv') if path not in run_set
    )

    return run_activities, other_activities

In [66]:
def readrun(file):
    """
    Load one activity CSV file.

    Args:
        file (str or Path): Location of the CSV file

    Returns:
        (df): The file contents as parsed by pandas with default settings
    """
    return pd.read_csv(file)

In [67]:
def populatebydate(emptydf,run_activities,other_activities):
    
    for i in emptydf['Date']:
        for file in run_activities:
            filedate =   datetime.strptime(str(file).split('_')[1], '%d-%m-%Y').strftime('%Y-%m-%d')
            if filedate == i:
                emptydf.loc[emptydf['Date'] == filedate,'nr. sessions'] += 1
                populateone(emptydf,str(file))


        for file in other_activities:
            filedate =  datetime.strptime(str(file).split('_')[1], '%d-%m-%Y').strftime('%Y-%m-%d')
            if filedate == i:
                temp_df= readrun(file)
                time_str = temp_df['Time'].iloc[-1]
                time_obj = datetime.strptime(time_str, '%H:%M:%S.%f').time()
                time_delta = timedelta(hours=time_obj.hour, minutes=time_obj.minute, seconds=time_obj.second, microseconds=time_obj.microsecond)
        
                hours_alternative = round(time_delta.total_seconds() / 3600, 2)

                emptydf.loc[emptydf['Date'] == filedate, 'hours alternative'] = hours_alternative

    df = emptydf
   
    return df

In [80]:
def populateone(df_prepop,filename):
    """
    Adds the distances of one running activity file to its day in the DataFrame.

    The last row of the file is taken as the activity total and its
    'Distance' is added to 'total km'. Every preceding row is binned by its
    'Avg HR': Z3_min <= HR < Z5_min adds the row's 'Distance' to 'km Z3-4',
    HR >= Z5_min adds it to 'km Z5-T1-T2'; rows below Z3_min (or without a
    comparable HR) count towards neither zone. A file without data rows
    leaves the DataFrame untouched.

    Args:
        df_prepop (df): DataFrame to be populated (modified in place)
        filename (str): Path of the file to be read; the date is taken from
            the second '_'-separated token of the file name ('dd-mm-yyyy')
    Returns:
        df_postpop (df): The same DataFrame object, now populated
    """
    # Use the file name only, so underscores in directory names cannot shift
    # the position of the date token.
    date_token = Path(filename).name.split('_')[1]
    filedate = datetime.strptime(date_token, '%d-%m-%Y').strftime('%Y-%m-%d')

    file_df = readrun(filename)
    if file_df.empty:
        return df_prepop

    # Row selector for this activity's day, computed once and reused.
    day_rows = df_prepop['Date'] == filedate

    df_prepop.loc[day_rows, 'total km'] += file_df['Distance'].iloc[-1]

    # Sum the per-zone distance locally and write each column a single time.
    laps = file_df.iloc[:-1]
    km_z34 = 0.0
    km_z5 = 0.0
    for hr, distance in zip(laps['Avg HR'], laps['Distance']):
        if Z3_min <= hr < Z5_min:
            km_z34 += distance
        elif hr >= Z5_min:
            km_z5 += distance

    df_prepop.loc[day_rows, 'km Z3-4'] += km_z34
    df_prepop.loc[day_rows, 'km Z5-T1-T2'] += km_z5

    return df_prepop
   

In [101]:
# Build the daily training log for 2024-06-17 .. 2024-09-15 from the activity
# CSV files in readfiles()'s default folder, then preview the first rows.
empty = create_emptydf('2024-06-17','2024-09-15')
r,o =readfiles()
df_full = populatebydate(empty,r,o)
df_full.head(20)


Unnamed: 0,Date,nr. sessions,total km,km Z3-4,km Z5-T1-T2,hours alternative
0,2024-06-17,1,4.78,3.78,0.0,0.0
1,2024-06-18,0,0.0,0.0,0.0,0.0
2,2024-06-19,0,0.0,0.0,0.0,0.0
3,2024-06-20,1,17.44,15.44,0.0,0.0
4,2024-06-21,1,7.11,7.11,0.0,0.0
5,2024-06-22,1,5.01,3.0,2.01,0.0
6,2024-06-23,1,8.12,7.12,0.0,0.0
7,2024-06-24,0,0.0,0.0,0.0,1.31
8,2024-06-25,0,0.0,0.0,0.0,0.0
9,2024-06-26,1,5.0,4.0,0.0,0.0


In [102]:
def convert_to_day_approach(df, n_days=7, feature_cols=None):
    """
    Converts the DataFrame to a day approach format.

    Each output row describes a window of `n_days` consecutive days that ends
    on the row's own date: the un-suffixed feature columns hold the values of
    that date and '<feature>.k' holds the value from k rows (days) earlier,
    for k = 1 .. n_days - 1. Rows containing any NaN are dropped, which
    removes the first n_days - 1 rows because they lack a complete history.

    Args:
        df (DataFrame): The DataFrame to convert, one row per day in
            chronological order, with a 'Date' column and the feature columns.
        n_days (int): Window length including the current day. Defaults to 7.
        feature_cols (list): Columns to lag. Defaults to the five training
            metrics created by create_emptydf.

    Returns:
        DataFrame: The windowed DataFrame with columns ordered by lag
        (all features at lag 0, then lag 1, ...) followed by 'Date'. The
        original index labels of the retained rows are kept.

    Raises:
        ValueError: If n_days is smaller than 1.
    """
    if n_days < 1:
        raise ValueError("n_days must be at least 1")
    if feature_cols is None:
        feature_cols = ['nr. sessions', 'total km', 'km Z3-4', 'km Z5-T1-T2', 'hours alternative']

    lagged = {}
    for lag in range(n_days):
        for col in feature_cols:
            # Lag 0 keeps the plain feature name; older days get a '.k' suffix.
            name = col if lag == 0 else f'{col}.{lag}'
            lagged[name] = df[col].shift(lag)
    lagged['Date'] = df['Date']

    df_converted = pd.DataFrame(lagged)

    # Shifting leaves NaN where no earlier day exists; drop those rows.
    return df_converted.dropna()


In [103]:
# Turn the daily log into the lagged "day approach" layout and inspect its columns/dtypes.
dfday_user = convert_to_day_approach(df_full)
dfday_user.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 85 entries, 6 to 90
Data columns (total 36 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   nr. sessions         85 non-null     int64  
 1   total km             85 non-null     float64
 2   km Z3-4              85 non-null     float64
 3   km Z5-T1-T2          85 non-null     float64
 4   hours alternative    85 non-null     float64
 5   nr. sessions.1       85 non-null     float64
 6   total km.1           85 non-null     float64
 7   km Z3-4.1            85 non-null     float64
 8   km Z5-T1-T2.1        85 non-null     float64
 9   hours alternative.1  85 non-null     float64
 10  nr. sessions.2       85 non-null     float64
 11  total km.2           85 non-null     float64
 12  km Z3-4.2            85 non-null     float64
 13  km Z5-T1-T2.2        85 non-null     float64
 14  hours alternative.2  85 non-null     float64
 15  nr. sessions.3       85 non-null     float

In [104]:
# Sanity check: the session count of a day should move one lag column to the right on each following row.
dfday_user[['Date','nr. sessions','nr. sessions.1','nr. sessions.2','nr. sessions.3','nr. sessions.4','nr. sessions.5','nr. sessions.6']].tail(20)


Unnamed: 0,Date,nr. sessions,nr. sessions.1,nr. sessions.2,nr. sessions.3,nr. sessions.4,nr. sessions.5,nr. sessions.6
71,2024-08-27,1,1.0,1.0,0.0,1.0,1.0,0.0
72,2024-08-28,0,1.0,1.0,1.0,0.0,1.0,1.0
73,2024-08-29,1,0.0,1.0,1.0,1.0,0.0,1.0
74,2024-08-30,1,1.0,0.0,1.0,1.0,1.0,0.0
75,2024-08-31,0,1.0,1.0,0.0,1.0,1.0,1.0
76,2024-09-01,4,0.0,1.0,1.0,0.0,1.0,1.0
77,2024-09-02,0,4.0,0.0,1.0,1.0,0.0,1.0
78,2024-09-03,1,0.0,4.0,0.0,1.0,1.0,0.0
79,2024-09-04,1,1.0,0.0,4.0,0.0,1.0,1.0
80,2024-09-05,0,1.0,1.0,0.0,4.0,0.0,1.0


In [105]:
# Preview the first rows of the lagged frame.
dfday_user.head(20)

Unnamed: 0,nr. sessions,total km,km Z3-4,km Z5-T1-T2,hours alternative,nr. sessions.1,total km.1,km Z3-4.1,km Z5-T1-T2.1,hours alternative.1,...,total km.5,km Z3-4.5,km Z5-T1-T2.5,hours alternative.5,nr. sessions.6,total km.6,km Z3-4.6,km Z5-T1-T2.6,hours alternative.6,Date
6,1,8.12,7.12,0.0,0.0,1.0,5.01,3.0,2.01,0.0,...,0.0,0.0,0.0,0.0,1.0,4.78,3.78,0.0,0.0,2024-06-23
7,0,0.0,0.0,0.0,1.31,1.0,8.12,7.12,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024-06-24
8,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.31,...,17.44,15.44,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024-06-25
9,1,5.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7.11,7.11,0.0,0.0,1.0,17.44,15.44,0.0,0.0,2024-06-26
10,0,0.0,0.0,0.0,0.0,1.0,5.0,4.0,0.0,0.0,...,5.01,3.0,2.01,0.0,1.0,7.11,7.11,0.0,0.0,2024-06-27
11,1,5.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.12,7.12,0.0,0.0,1.0,5.01,3.0,2.01,0.0,2024-06-28
12,0,0.0,0.0,0.0,0.0,1.0,5.08,0.0,0.0,0.0,...,0.0,0.0,0.0,1.31,1.0,8.12,7.12,0.0,0.0,2024-06-29
13,1,9.08,5.08,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.31,2024-06-30
14,1,12.66,9.66,0.0,0.0,1.0,9.08,5.08,2.0,0.0,...,5.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024-07-01
15,0,0.0,0.0,0.0,0.0,1.0,12.66,9.66,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,5.0,4.0,0.0,0.0,2024-07-02


In [106]:
# Save the lagged frame as CSV; index=False leaves the row labels out of the file.
dfday_user.to_csv('../data/processed/df_full.csv', index=False)