In [48]:
import pandas as pd
import numpy as np
import os
from datetime import datetime, timedelta
from pathlib import Path

In [49]:
Z3_min = 135
Z5_min = 173

In [50]:
def create_emptydf(start_date,end_date):
    """
    Creates empty DataFrame with date range
    Args:
        start_date (str): Start date in 'yyyy-mm-dd' format
        end_date (str): End date in 'yyyy-mm-dd' format
        
    Returns:
        empty (df): Eempty df ready for population
    """
    start = datetime.strptime(start_date, '%Y-%m-%d')
    end = datetime.strptime(end_date, '%Y-%m-%d')
    date_range = pd.date_range(start, end)

    df = pd.DataFrame({'Date': date_range})
    
    df['Date'] = df['Date'].dt.strftime('%Y-%m-%d')
    df['nr.sessions'] = 0
    df['total km'] = 0.0
    df['km Z3-4'] = 0.0
    df['km Z5-T1-T2'] = 0.0
    df['hours alternative'] = 0.0
    return df

In [51]:
empty = create_emptydf('2024-08-01','2024-09-30')
empty.head(10)

Unnamed: 0,Date,nr.sessions,total km,km Z3-4,km Z5-T1-T2,hours alternative
0,2024-08-01,0,0.0,0.0,0.0,0.0
1,2024-08-02,0,0.0,0.0,0.0,0.0
2,2024-08-03,0,0.0,0.0,0.0,0.0
3,2024-08-04,0,0.0,0.0,0.0,0.0
4,2024-08-05,0,0.0,0.0,0.0,0.0
5,2024-08-06,0,0.0,0.0,0.0,0.0
6,2024-08-07,0,0.0,0.0,0.0,0.0
7,2024-08-08,0,0.0,0.0,0.0,0.0
8,2024-08-09,0,0.0,0.0,0.0,0.0
9,2024-08-10,0,0.0,0.0,0.0,0.0


In [57]:
def readfiles(file_path="../data/external"):
    '''
    Creates lists of all csv files in directory

    Args: 
        file_path (str): the relative path for the folder that 
        contains all the activity files

    Returns:
        run_activities (list):
    '''
    fpath = Path(file_path)

    run_activities = list(fpath.glob(f'*Running_*.csv'))
    all_activities = list(fpath.glob(f'*.csv'))
    set_run = set(run_activities)
    set_all = set(all_activities)
    other_activities = list(set_all-set_run)

    return run_activities,other_activities

In [58]:
def readrun(file):
    df = pd.read_csv(file)
    return df

In [59]:
def populatebydate(emptydf,run_activities,other_activities):
    
    for i in emptydf['Date']:
        for file in run_activities:
            filedate =   datetime.strptime(str(file).split('_')[1], '%d-%m-%Y').strftime('%Y-%m-%d')
            if filedate == i:
                emptydf.loc[emptydf['Date'] == filedate,'nr.sessions'] += 1
                populateone(emptydf,str(file))


        for file in other_activities:
            filedate =  datetime.strptime(str(file).split('_')[1], '%d-%m-%Y').strftime('%Y-%m-%d')
            if filedate == i:
                temp_df= readrun(file)
                time_str = temp_df['Time'].iloc[-1]
                time_obj = datetime.strptime(time_str, '%H:%M:%S.%f').time()
                time_delta = timedelta(hours=time_obj.hour, minutes=time_obj.minute, seconds=time_obj.second, microseconds=time_obj.microsecond)
        
                hours_alternative = round(time_delta.total_seconds() / 3600, 2)

                emptydf.loc[emptydf['Date'] == filedate, 'hours alternative'] = hours_alternative

    df = emptydf
   
    return df

In [None]:
def populateone(df_prepop,filename):
    """
    Populates the empty DataFrame with the data from the file
    Args:
        df_prepop (df): DataFrame to be populated
        filename (str): Name of the file to be read
    Returns:
        df_postpop (df): Populated DataFrame
    """
    
    filedate =  datetime.strptime(filename.split('_')[1], '%d-%m-%Y').strftime('%Y-%m-%d')
    file_df = readrun(filename)
    df_prepop.loc[df_prepop['Date'] == filedate,'total km'] += file_df['Distance'].iloc[-1]
    for idx, row in file_df.iterrows():
        hr = row['Avg HR']
        distance = row['Distance']
        if Z3_min <= hr <= Z5_min:
            df_prepop.loc[df_prepop['Date'] == filedate, 'km Z3-4'] += distance
        elif hr > Z5_min:
            df_prepop.loc[df_prepop['Date'] == filedate, 'km Z5-T1-T2'] += distance
    
    df_postpop = df_prepop
    return df_postpop 
   

In [63]:
empty = create_emptydf('2024-08-01','2024-09-30')
r,o =readfiles()
df_full = populatebydate(empty,r,o)
df_full.tail(20)


Unnamed: 0,Date,nr.sessions,total km,km Z3-4,km Z5-T1-T2,hours alternative
41,2024-09-11,0,0.0,0.0,0.0,0.0
42,2024-09-12,0,0.0,0.0,0.0,0.0
43,2024-09-13,0,0.0,0.0,0.0,0.0
44,2024-09-14,1,2.37,0.0,0.0,0.0
45,2024-09-15,0,0.0,0.0,0.0,0.74
46,2024-09-16,0,0.0,0.0,0.0,0.0
47,2024-09-17,0,0.0,0.0,0.0,1.26
48,2024-09-18,1,2.43,0.0,0.0,0.75
49,2024-09-19,0,0.0,0.0,0.0,1.08
50,2024-09-20,0,0.0,0.0,0.0,0.0


In [None]:
def convert_to_day_approach(df):
    """
    Converts the DataFrame to a day approach format.
    
    Args:
        df (DataFrame): The DataFrame to convert.
        
    Returns:
        DataFrame: The converted DataFrame into a format with 7 lagging days 
        before each date in the format 

    """
    # create a new column for the date in format 1 to number of days, call it date_int
    # the first date in the df should be 1, the second 2 and so on
    df['date_int'] = range(1, len(df) + 1)
    # create a new DataFrame with the same columns as the original df
    new_df = pd.DataFrame()

    for i in df['date_int']:
        try:
            # create a single row in the new_df with all columns for each of the proceeding 7 days
            # so Date	nr.sessions	total km	km Z3-4	km Z5-T1-T2	hours alternative, 
            # nr.sessions.1	total km.1	km Z3-4.1	km Z5-T1-T2.1	hours alternative.1,
            # nr.sessions.2	total km.2	km Z3-4.2	km Z5-T1-T2.2	hours alternative.2,
            # and so on up to 7 days so zero through 6
            #
            
            for j in range(0, 7):
                new_df.loc[new_df['date_int'] == i, 'Date'] = df['Date'].iloc[i - j - 1]
                new_df.loc[new_df['nr.sessions.' + str(j)] == i, 'nr.sessions'] = df['nr.sessions'].iloc[i - j - 1]
                new_df.loc[new_df['total km.' + str(j)] == i, 'total km'] = df['total km'].iloc[i - j - 1]
                new_df.loc[new_df['km Z3-4.' + str(j)] == i, 'km Z3-4'] = df['km Z3-4'].iloc[i - j - 1]
                new_df.loc[new_df['km Z5-T1-T2.' + str(j)] == i, 'km Z5-T1-T2'] = df['km Z5-T1-T2'].iloc[i - j - 1]
                new_df.loc[new_df['hours alternative.' + str(j)] == i, 'hours alternative'] = df['hours alternative'].iloc[i - j - 1]
            
           
           # exceept in the case of the first 7 days where there are no previous days to shift
        except KeyError:
            pass
    # remove the first 7 days from the df as they have no previous days to shift
    df = df.iloc[7:]
    # reset the index to be sequential and remove the old index
    df = df.reset_index(drop=True)
    
    return df

In [69]:
dfday_user = convert_to_day_approach(df_full)
print(dfday_user.head(20))

          Date  nr.sessions  total km  km Z3-4  km Z5-T1-T2  \
0   2024-08-08            1      6.30    10.60         0.00   
1   2024-08-09            0      0.00     0.00         0.00   
2   2024-08-10            1      3.25     0.00         0.00   
3   2024-08-11            2     12.71    15.10         5.10   
4   2024-08-12            1      7.29    13.58         0.00   
5   2024-08-13            0      0.00     0.00         0.00   
6   2024-08-14            0      0.00     0.00         0.00   
7   2024-08-15            1     10.12    19.24         0.00   
8   2024-08-16            1     12.00    20.00         1.00   
9   2024-08-17            1     10.02    19.02         0.02   
10  2024-08-18            0      0.00     0.00         0.00   
11  2024-08-19            0      0.00     0.00         0.00   
12  2024-08-20            1     11.78    21.56         0.00   
13  2024-08-21            0      0.00     0.00         0.00   
14  2024-08-22            1      9.32     2.32         