In [50]:
import pandas as pd
import numpy as np
import os
from datetime import datetime, timedelta
from pathlib import Path

In [51]:
def create_emptydf(start_date,end_date):
    """
    Creates empty DataFrame with date range
    Args:
        start_date (str): Start date in 'yyyy-mm-dd' format
        end_date (str): End date in 'yyyy-mm-dd' format
        
    Returns:
        empty (df): Eempty df ready for population
    """
    start = datetime.strptime(start_date, '%Y-%m-%d')
    end = datetime.strptime(end_date, '%Y-%m-%d')
    date_range = pd.date_range(start, end)

    df = pd.DataFrame({'Date': date_range})
    
    df['Date'] = df['Date'].dt.strftime('%Y-%m-%d')
    df['nr. sessions'] = 0
    df['total km'] = 0.00
    df['km Z3-4'] = 0.00
    df['km Z5-T1-T2'] = 0.00
    df['hours alternative'] = 0.00
    return df

In [52]:
def readfiles(file_path="../data/external"):
    '''
    Creates lists of all csv files in directory

    Args: 
        file_path (str): the relative path for the folder that 
        contains all the activity files

    Returns:
        run_activities (list):
    '''
    fpath = Path(file_path)

    run_activities = list(fpath.glob(f'*Running_*.csv'))
    all_activities = list(fpath.glob(f'*.csv'))
    set_run = set(run_activities)
    set_all = set(all_activities)
    other_activities = list(set_all-set_run)

    return run_activities,other_activities

In [53]:
def readrun(file):
    df = pd.read_csv(file)
    return df

In [54]:
def populatebydate(emptydf,run_activities,other_activities,Z3_min, Z5_min):
    
    for i in emptydf['Date']:
        for file in run_activities:
            filedate =   datetime.strptime(str(file).split('_')[1], '%d-%m-%Y').strftime('%Y-%m-%d')
            if filedate == i:
                emptydf.loc[emptydf['Date'] == filedate,'nr. sessions'] += 1
                populateone(emptydf,str(file),Z3_min, Z5_min)


        for file in other_activities:
            filedate =  datetime.strptime(str(file).split('_')[1], '%d-%m-%Y').strftime('%Y-%m-%d')
            if filedate == i:
                temp_df= readrun(file)
                time_str = temp_df['Time'].iloc[-1]
                time_obj = datetime.strptime(time_str, '%H:%M:%S.%f').time()
                time_delta = timedelta(hours=time_obj.hour, minutes=time_obj.minute, seconds=time_obj.second, microseconds=time_obj.microsecond)
        
                hours_alternative = round(time_delta.total_seconds() / 3600, 2)

                emptydf.loc[emptydf['Date'] == filedate, 'hours alternative'] = hours_alternative

    df = emptydf
   
    return df

In [55]:
def populateone(df_prepop,filename, Z3_min, Z5_min):
    """
    Populates the empty DataFrame with the data from the file
    Args:
        df_prepop (df): DataFrame to be populated
        filename (str): Name of the file to be read
    Returns:
        df_postpop (df): Populated DataFrame
    """
    
    filedate =  datetime.strptime(filename.split('_')[1], '%d-%m-%Y').strftime('%Y-%m-%d')
    file_df = readrun(filename)
    
    file_df['Distance'] = pd.to_numeric(file_df['Distance'], errors='coerce')

    
    # df_prepop.loc[df_prepop['Date'] == filedate, 'total km'] = current_total_km + file_df['Distance'].iloc[-1]
    df_prepop.loc[df_prepop['Date'] == filedate,'total km'] += file_df['Distance'].iloc[-1]
    
    for idx, row in file_df.iloc[:-1].iterrows():
        hr = row['Avg HR']
        distance = row['Distance']
        if Z3_min <= hr < Z5_min:
            df_prepop.loc[df_prepop['Date'] == filedate, 'km Z3-4'] += distance
        elif hr >= Z5_min:
            df_prepop.loc[df_prepop['Date'] == filedate, 'km Z5-T1-T2'] += distance
    
    df_postpop = df_prepop
    return df_postpop 
   

In [56]:
def convert_to_day_approach(df):
    """
    Converts the DataFrame to a day approach format.
    
    Args:
        df (DataFrame): The DataFrame to convert.
        
    Returns:
        DataFrame: The converted DataFrame into a format with 7 lagging days 
        before each date in the format 

    """
    feature_cols = ['nr. sessions', 'total km', 'km Z3-4', 'km Z5-T1-T2', 'hours alternative']
    df_converted = pd.DataFrame()
    for i in range(0,7):
        for col in feature_cols:
            df_converted[f'{col}.{i}'] = df[col].shift(i)  
    df_converted['Date'] = df['Date']
    # drop rows with NaN values using dropna() with index as the row
    df_converted = df_converted.dropna()

    # replace the name of the column with the name of the column without the last 2 characters
    df_converted = df_converted.rename(columns={col: col[:-2] for col in df_converted.columns if col.endswith('.0')})


    # return df_lagged
    return df_converted          


In [57]:
date_start = '2024-03-03'
date_end = '2024-09-15'
Z3_min = 135
Z5_min = 173

In [58]:
def main_extract_transform(date_start, date_end, Z3_min = 135, Z5_min = 173):   
    """
    Main function to extract and transform data.
    """
    while True:
        # the z3_min and Z5 min need to be inputted by the user here
        Z3_min = input("Enter the minimum heart rate for your Z3 according to garmin: ") 
        Z5_min = input("Enter the minimim heart rate for your Z5 according to garmin: ")
        # wrap the input in a try except block to check if the input is a number
        try:
            Z3_min = int(Z3_min)
            Z5_min = int(Z5_min)
            break
        except ValueError:
            print("Please enter valid numbers for heart rate zone thresholds.")


    # Create an empty DataFrame for the specified date range
    empty = create_emptydf(date_start, date_end)
    
    # Read files and populate the DataFrame
    r, o = readfiles()
    df_full = populatebydate(empty, r, o, Z3_min, Z5_min)
    
    # Convert to day approach format
    dfday_user = convert_to_day_approach(df_full)
    
    return dfday_user



In [59]:
main_extract_transform(date_start, date_end, Z3_min, Z5_min)

Unnamed: 0,nr. sessions,total km,km Z3-4,km Z5-T1-T2,hours alternative,nr. sessions.1,total km.1,km Z3-4.1,km Z5-T1-T2.1,hours alternative.1,...,total km.5,km Z3-4.5,km Z5-T1-T2.5,hours alternative.5,nr. sessions.6,total km.6,km Z3-4.6,km Z5-T1-T2.6,hours alternative.6,Date
6,1,0.00,0.00,0.0,0.00,0.0,0.00,0.00,0.0,0.0,...,0.00,0.00,0.0,0.0,1.0,5.51,3.51,2.0,0.0,2024-03-09
7,1,6.05,5.05,0.0,0.00,1.0,0.00,0.00,0.0,0.0,...,13.20,11.19,0.0,0.0,0.0,0.00,0.00,0.0,0.0,2024-03-10
8,0,0.00,0.00,0.0,0.00,1.0,6.05,5.05,0.0,0.0,...,0.00,0.00,0.0,0.0,1.0,13.20,11.19,0.0,0.0,2024-03-11
9,1,5.04,3.04,0.0,0.00,0.0,0.00,0.00,0.0,0.0,...,7.27,6.27,0.0,0.0,0.0,0.00,0.00,0.0,0.0,2024-03-12
10,0,0.00,0.00,0.0,0.00,1.0,5.04,3.04,0.0,0.0,...,0.00,0.00,0.0,0.0,1.0,7.27,6.27,0.0,0.0,2024-03-13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192,0,0.00,0.00,0.0,0.00,0.0,0.00,0.00,0.0,0.0,...,0.00,0.00,0.0,0.0,0.0,0.00,0.00,0.0,0.0,2024-09-11
193,0,0.00,0.00,0.0,0.00,0.0,0.00,0.00,0.0,0.0,...,0.00,0.00,0.0,1.0,0.0,0.00,0.00,0.0,0.0,2024-09-12
194,0,0.00,0.00,0.0,0.00,0.0,0.00,0.00,0.0,0.0,...,5.27,2.27,0.0,0.0,0.0,0.00,0.00,0.0,1.0,2024-09-13
195,1,2.37,0.00,0.0,0.00,0.0,0.00,0.00,0.0,0.0,...,0.00,0.00,0.0,0.0,1.0,5.27,2.27,0.0,0.0,2024-09-14


In [None]:
# dfday_user.to_csv('../data/processed/df_full.csv', index=False)

NameError: name 'dfday_user' is not defined