# Prep

In [301]:
import pandas as pd
import numpy as np
import pprint
import re

In [302]:
# Root of the project checkout. Every path below is derived from it, so moving the
# project only requires editing this one line.
PROJECT_ROOT = r'C:\Users\User\Documents\GitHub\ap_workouts'

# Bronze layer: raw input files.
WORKOUTS_FILE = PROJECT_ROOT + r'\b1-bronze\workouts.csv'
PLANS_FILE = PROJECT_ROOT + r'\b1-bronze\plans.csv'  # not referenced by the cells visible in this notebook
CUSTOMEXERCISES_FILE = PROJECT_ROOT + r'\b1-bronze\custom_exercises.csv'

# Silver / gold layers: output folders (only GOLD_FOLDER is written to by the visible cells).
SILVER_FOLDER = PROJECT_ROOT + r'\b2-silver'
GOLD_FOLDER = PROJECT_ROOT + r'\b3-gold'

## Functions

### Extract

In [303]:
def get_raw_workouts(fp):
    """Read the workouts export and split it into one list of lines per workout.

    Every line is whitespace-stripped and blank lines are dropped. The first
    remaining line is then discarded, and the rest are grouped: a line that
    contains 'Week' or 'Standalone' starts a new workout, and all following
    lines belong to that workout until the next such line.

    Parameters
    ----------
    fp : str or path-like
        Path of the UTF-8 encoded text file to read.

    Returns
    -------
    list[list[str]]
        One inner list per workout, header line first. Empty when the file
        contains no workout header line at all.
    """
    with open(fp, encoding='utf-8') as f:
        stripped_lines = [line.strip() for line in f]
    content_lines = [line for line in stripped_lines if line]

    # Drop the first non-blank line (the original author described it as an
    # "empty" first row -- presumably a header/separator row; TODO confirm).
    # Slicing, unlike pop(0), does not raise on an empty file.
    content_lines = content_lines[1:]

    all_workouts = []
    single_workout = []
    seen_header = False
    for line in content_lines:
        if 'Week' in line or 'Standalone' in line:
            if seen_header:
                all_workouts.append(single_workout)
                single_workout = []
            seen_header = True
        single_workout.append(line)

    # Flush the workout still being collected when the file ends; without this
    # the last workout of the export would be silently lost.
    if seen_header:
        all_workouts.append(single_workout)

    return all_workouts

### Transform

#### Flatten - Big Function

In [304]:
# Lines longer than this are treated as exercise headers, shorter ones as set data.
# Heuristic inherited from the original code -- presumably chosen because set rows
# such as 'SET;KG;REPS' or '1;60;8' are short; TODO confirm against the export format.
_MAX_SET_LINE_LENGTH = 13


def _parse_workout_description(header_line):
    """Turn a workout header line into the flat 'description' record.

    The line is split on ';' into (info, start time, duration). The info field
    is split on '·' into day name, day number and -- depending on the workout
    kind -- week number, plan name and plan function.
    """
    fields = header_line.split(';')
    info = [item.strip() for item in fields[0].strip('"').split('·')]
    start_time = fields[1].strip('"')

    description = {
        # The id is the start time reduced to its digits
        'id': re.sub(r'\D', '', start_time),
        'day_name': info[0],
        'day_number': info[1],
        'week_number': '',
        'plan_name': '',
        'plan_function': '',
        'start_time': start_time,
        'duration': fields[2].strip('"'),
    }

    # Standalone workouts carry no week/plan fields, so the optional plan
    # function sits at a different position than in planned workouts.
    if 'Standalone' in header_line:
        description['week_number'] = '0'
        description['plan_name'] = 'Standalone'
        description['plan_function'] = info[2] if len(info) > 2 else ''
    else:
        description['week_number'] = info[2]
        description['plan_name'] = info[3]
        description['plan_function'] = info[4] if len(info) > 4 else ''

    return description


def _group_exercise_lines(lines):
    """Group a workout's body lines into {'exercise': header, 'data': [...]} dicts.

    A line longer than _MAX_SET_LINE_LENGTH starts a new exercise; shorter
    lines are attached to the current exercise. Short lines that appear before
    any exercise header are ignored.
    """
    groups = []
    current = None
    for line in lines:
        if len(line) > _MAX_SET_LINE_LENGTH:
            if current is not None:
                groups.append(current)
            current = {'exercise': line, 'data': []}
        elif current is not None:
            current['data'].append(line)
    # The last exercise is only complete once the lines run out
    if current is not None:
        groups.append(current)
    return groups


def _parse_exercise_sets(raw_exercise, workout_id):
    """Expand one grouped exercise into a list of per-set row dicts.

    The first data line names the set columns; every following data line is
    one set. An exercise without any data line yields no rows.
    """
    header = raw_exercise['exercise']
    if ';' in header:
        split_header = header.split(';')
        info = [item.strip() for item in split_header[0].strip('"').split('·')]
        comments = split_header[1]
    else:
        info = [item.strip() for item in header.strip('"').split('·')]
        comments = ''
    order_and_name = info[0].split(". ")

    base_row = {
        'workout_id': workout_id,
        'exercise_number': order_and_name[0],
        'exercise_name': order_and_name[1],
        'equipment': info[1],
        'goal': info[2] if len(info) > 2 else np.nan,
    }

    data_lines = raw_exercise['data']
    if not data_lines:
        return []

    set_columns = data_lines[0].split(';')
    rows = []
    for set_line in data_lines[1:]:
        set_info = set_line.split(';')
        row = dict(base_row)
        row['set_number'] = set_info[0]

        if 'KG' in set_columns:
            # NOTE(review): weighted sets are always labelled 'reps', even if the
            # second column were SECS/MINS -- behaviour kept from the original.
            row['weight'] = set_info[1]
            row['quantity'] = set_info[2]
            row['measure'] = 'reps'
        elif 'REPS' in set_columns:
            row['weight'] = ''
            row['quantity'] = set_info[1]
            row['measure'] = 'reps'
        elif 'SECS' in set_columns:
            row['weight'] = ''
            row['quantity'] = set_info[1]
            row['measure'] = 'secs'
        elif 'MINS' in set_columns:
            row['weight'] = ''
            row['quantity'] = set_info[1]
            row['measure'] = 'mins'
        else:
            # Unknown column layout: emit sentinel values
            row['weight'] = 'error'
            row['quantity'] = '-1'
            row['measure'] = 'error'

        row['comments'] = comments
        rows.append(row)

    return rows


def get_workout_dict(raw_workout_list):
    """Parse raw workouts into description records and flat per-set rows.

    Parameters
    ----------
    raw_workout_list : list[list[str]]
        One list of lines per workout, with the workout header line first
        (the shape returned by get_raw_workouts).

    Returns
    -------
    list[dict]
        One dict per workout with two keys: 'description' (a flat dict with
        id, day_name, day_number, week_number, plan_name, plan_function,
        start_time, duration) and 'exercises' (a list with one dict per set,
        empty when the workout has no exercise lines).

    Raises
    ------
    IndexError
        If a header or set line has fewer fields than the layout requires.
    """
    workout_listdict = []
    for workout in raw_workout_list:
        description = _parse_workout_description(workout[0])

        exercises = []
        for raw_exercise in _group_exercise_lines(workout[1:]):
            exercises.extend(_parse_exercise_sets(raw_exercise, description['id']))

        workout_listdict.append({
            'description': description,
            'exercises': exercises,
        })

    return workout_listdict

#### Others

In [305]:
def extract_day_or_week(value):
    """Extract the trailing number from a label such as 'Day 3' or 'Week 12'.

    Parameters
    ----------
    value : str
        Day or week label taken from a workout header.

    Returns
    -------
    int or None
        0 for the literal 'Standalone workout'; the integer value of the last
        whitespace-separated token when that token is all digits; None
        otherwise (including for empty or whitespace-only input).
    """
    if value == "Standalone workout":
        return 0

    tokens = value.split()
    # Guard against empty input: indexing [-1] on an empty token list would raise
    if tokens and tokens[-1].isdigit():
        return int(tokens[-1])
    return None

In [306]:
def convert_to_minutes(duration):
    """Convert a duration label to whole minutes.

    Recognised forms: 'H:MM hr', 'H hr' and 'M min'. Any string containing
    neither 'hr' nor 'min' yields None.
    """
    if "hr" in duration:
        without_unit = duration.replace(" hr", "")
        if ':' not in duration:
            # Whole hours only
            return int(without_unit) * 60
        hours_part, minutes_part = without_unit.split(":")
        return int(hours_part) * 60 + int(minutes_part)

    if "min" in duration:
        return int(duration.replace(" min", ""))

    return None

In [307]:
def convert_to_seconds(duration):
    """Convert a value expressed in minutes ('M' or 'M:SS') to seconds.

    Missing values (NaN/None), the empty string and 0 all map to 0.
    """
    is_blank = pd.isna(duration) or duration == '' or duration == 0
    if is_blank:
        return 0

    if ':' not in duration:
        # Plain minute count
        return int(duration) * 60

    minutes_part, seconds_part = duration.split(":")
    return int(minutes_part) * 60 + int(seconds_part)

In [308]:
def strip_measure(value):
    """Return the first whitespace-separated token of value, or 0 when it is missing.

    Missing means NaN/None or the empty string. The token is returned as a
    string (e.g. '8 reps' -> '8'); the missing case returns the integer 0.
    """
    is_missing = pd.isna(value) or value == ''
    return 0 if is_missing else value.split()[0]

# Extract

In [309]:
# Read the bronze workouts export and split it into one list of raw lines per workout
raw_workouts = get_raw_workouts(WORKOUTS_FILE)

# Transform

## Flattening

In [310]:
# Parse every raw workout into a dict holding a 'description' record and a flat
# list of per-set 'exercises' rows
normalized_workouts_dict = get_workout_dict(raw_workouts)

In [311]:
#pprint.pprint(normalized_workouts_dict[1], sort_dicts=False) #['exercises']

## Dataframes

In [312]:
# One record per workout (its description) ...
workouts_list = [parsed['description'] for parsed in normalized_workouts_dict]

# ... and one record per logged set, flattened across all workouts
exercises_log_list = [
    set_row
    for parsed in normalized_workouts_dict
    for set_row in parsed['exercises']
]

# Untyped frames: every value is still the raw string taken from the export
df_workouts_orig = pd.DataFrame(workouts_list)
df_exerciseslog_orig = pd.DataFrame(exercises_log_list)


## Exercises Ref

### Custom Exercises

In [313]:
# Load the custom-exercise definitions; the file is semicolon-delimited
df_customexercises_raw = pd.read_csv(CUSTOMEXERCISES_FILE, sep=';')

In [314]:
# Source column name -> reference-table column name
custom_exercise_column_names = {
    'NAME': 'exercise_name',
    'EQUIPMENT': 'equipment',
    'MUSCLE': 'main_muscle',
    'AUXILIARY MUSCLES': 'auxiliary_muscles',
    'TYPE': 'size',
    'METRICS': 'metrics',
    'BODYWEIGHT': 'bodyweight',
    'NOTES': 'notes',
}

# Rename to the reference schema, flag every row as custom, add the two grouping
# columns as blanks, and order alphabetically by exercise name
df_customexercises_clean = df_customexercises_raw.rename(columns=custom_exercise_column_names)
df_customexercises_clean = df_customexercises_clean.assign(
    custom_exercise=True,
    muscle_group='',
    exercise_group='',
)
df_customexercises_clean = (
    df_customexercises_clean
    .sort_values(by='exercise_name')
    .reset_index(drop=True)
)

In [315]:
# Column order of the exercises reference table
custom_ref_columns = [
    'custom_exercise',
    'exercise_name',
    'equipment',
    'muscle_group',
    'exercise_group',
    'main_muscle',
    'auxiliary_muscles',
    'size',
    'metrics',
    'bodyweight',
    'notes',
]
df_customexercises_clean = df_customexercises_clean[custom_ref_columns]

In [316]:
#df_customexercises_clean

### Standard Exercises

In [317]:
# Distinct (exercise_name, equipment) pairs that appear in the log; the count is
# only a vehicle for the group-by and is dropped again
logged_exercise_pairs = (
    df_exerciseslog_orig[['exercise_name', 'equipment', 'workout_id']]
    .groupby(['exercise_name', 'equipment'])
    .count()
    .reset_index()
    .drop(columns=['workout_id'])
)

# Left-join the custom flag by name: pairs without a custom definition get NaN
custom_flags = df_customexercises_clean[['exercise_name', 'custom_exercise']]
df_all_exercises = logged_exercise_pairs.merge(custom_flags, how="left", on='exercise_name')

In [318]:
# Descriptive columns that have no source for standard exercises and stay blank
blank_standard_columns = [
    'muscle_group',
    'exercise_group',
    'main_muscle',
    'auxiliary_muscles',
    'size',
    'metrics',
    'bodyweight',
    'notes',
]

# Keep every logged exercise that is not flagged as custom (the flag is NaN for
# those rows, hence the "!= True" comparison) and rebuild the flag as False
df_standard_exercises = df_all_exercises.query(" custom_exercise != True ")
df_standard_exercises = df_standard_exercises.drop(columns='custom_exercise')
df_standard_exercises = df_standard_exercises.assign(
    custom_exercise=False,
    **{column: '' for column in blank_standard_columns},
)

In [319]:
#df_standard_exercises

In [320]:
# Same column order as the custom-exercises frame so the two can be stacked
standard_ref_columns = [
    'custom_exercise',
    'exercise_name',
    'equipment',
    'muscle_group',
    'exercise_group',
    'main_muscle',
    'auxiliary_muscles',
    'size',
    'metrics',
    'bodyweight',
    'notes',
]
df_standard_exercises = df_standard_exercises[standard_ref_columns]

### Join

In [321]:
# Stack custom exercises on top of standard ones into a single reference table,
# renumbering the index from 0
list_exercise_dfs = [df_customexercises_clean, df_standard_exercises]
df_exercises_ref = pd.concat(list_exercise_dfs, ignore_index=True)

In [322]:
#df_exercises_ref

## Workouts

In [323]:
# Work on a copy so df_workouts_orig keeps the raw string values
df_workouts_formatted = df_workouts_orig.copy()

In [324]:
# Typed versions of the raw workout columns; end_time is derived from the
# converted start_time and duration (minutes)
df_workouts_formatted = df_workouts_formatted.assign(
    day_number=df_workouts_orig['day_number'].apply(extract_day_or_week).astype('string'),
    week_number=df_workouts_orig['week_number'].apply(extract_day_or_week).astype('string'),
    start_time=pd.to_datetime(df_workouts_orig['start_time'], format="%Y-%m-%d %H:%M h"),
    duration=df_workouts_orig['duration'].apply(convert_to_minutes),
    end_time=lambda frame: frame['start_time'] + pd.to_timedelta(frame['duration'], unit='m'),
)

In [325]:
#df_workouts_formatted.iloc[:20]

## Exercises Log

In [326]:
# Work on a copy so df_exerciseslog_orig keeps the raw string values
df_exerciseslog_formatted = df_exerciseslog_orig.copy()

In [327]:
# convert values in minutes to seconds
mask_mins = (df_exerciseslog_formatted["measure"] == "mins")

df_exerciseslog_formatted['goal'] = df_exerciseslog_formatted['goal'].apply(strip_measure)
df_exerciseslog_formatted.loc[mask_mins, "goal"] = df_exerciseslog_formatted.loc[mask_mins]['goal'].apply(convert_to_seconds)
df_exerciseslog_formatted.loc[mask_mins, "quantity"] = df_exerciseslog_orig.loc[mask_mins, "quantity"].apply(convert_to_seconds)
df_exerciseslog_formatted.loc[df_exerciseslog_orig["measure"] == "mins", "measure"] = "secs"

In [328]:
# Cast the cleaned text columns to numeric dtypes; missing values and the '-'
# placeholder become 0
df_exerciseslog_formatted['goal'] = df_exerciseslog_formatted['goal'].fillna(0).astype('int64')
df_exerciseslog_formatted['quantity'] = df_exerciseslog_formatted['quantity'].replace('-', 0).astype('int64')

# Weights may carry a leading '+' and use a decimal comma. regex=False is explicit
# because '+' is a regex metacharacter and the default of `regex` differs between
# pandas versions.
df_exerciseslog_formatted['weight'] = (
    df_exerciseslog_formatted['weight']
    .str.replace("+", "", regex=False)
    .str.replace(",", ".", regex=False)
    .replace('-', 0)
    .replace('', 0)
    .fillna(0)
    .astype('float64')
)

In [329]:
# Per-set volume: weight multiplied by quantity (uses 'weight' as logged, not the
# doubled 'total_weight' that is derived for dumbbells in a later cell)
df_exerciseslog_formatted['volume'] = df_exerciseslog_formatted['weight'] * df_exerciseslog_formatted['quantity']

In [330]:
# when an exercise uses dumbbells, multiply its weight by 2 to yield total weight

# total_weight equals weight, except for dumbbell rows where it is doubled
mask_dumbbells = df_exerciseslog_formatted["equipment"].eq("Dumbbells")
df_exerciseslog_formatted['total_weight'] = df_exerciseslog_formatted['weight'].mask(
    mask_dumbbells,
    df_exerciseslog_formatted['weight'] * 2,
)

In [331]:
# Final column order of the exercises log; columns not listed here are dropped
exercises_log_columns = [
    'workout_id',
    'exercise_number',
    'exercise_name',
    'equipment',
    'goal',
    'measure',
    'set_number',
    'weight',
    'total_weight',
    'quantity',
    'volume',
]
df_exerciseslog_formatted = df_exerciseslog_formatted[exercises_log_columns]

# Save

In [332]:
# Destination path -> frame to write. All files share the same CSV dialect:
# ';' as field separator, ',' as decimal mark, no index column.
gold_outputs = {
    f"{GOLD_FOLDER}/workouts.csv": df_workouts_formatted,
    f"{GOLD_FOLDER}/exercises_log.csv": df_exerciseslog_formatted,
    f"{GOLD_FOLDER}/exercises_ref.csv": df_exercises_ref,
}
for output_path, frame in gold_outputs.items():
    frame.to_csv(output_path, sep=";", decimal=',', index=False)

# Tests

In [64]:
#df_exercises_log

In [65]:
#raw_workouts[2]

In [66]:
#pprint.pprint(workouts_dict[2], sort_dicts=False)