# Prep

In [226]:
import pandas as pd
import numpy as np
import pprint
import re

In [227]:
# Input: the raw workouts export that get_raw_workouts() reads.
# NOTE(review): both paths are absolute and machine-specific (Windows user folder),
# so the notebook only runs unchanged on this machine — consider deriving them
# from a configurable base directory.
WORKOUTS_FILE = r'C:\Users\User\Documents\GitHub\ap_workouts\raw_data\uptodate_workouts.csv'
# Output folder: the Save section writes workouts.csv and exercises_log.csv here.
SILVER_FOLDER = r'C:\Users\User\Documents\GitHub\ap_workouts\silver_tables'

## Functions

### Extract

In [228]:
def get_raw_workouts(fp):
  """Read a workouts export and group its lines into one list per workout.

  Every line is stripped and blank lines are discarded. The first remaining
  line is dropped as well, and the rest is split into workouts: a new workout
  starts at each line containing 'Week' or 'Standalone' (the workout header).

  Args:
    fp: Path of the UTF-8 encoded export file.

  Returns:
    A list of workouts, each a list of stripped, non-empty lines whose first
    element is the workout header line. Any lines that appear before the
    first header are kept at the start of the first workout. Returns an empty
    list when the file contains no header line at all.
  """
  with open(fp, encoding='utf-8') as f:
    stripped_lines = [line.strip() for line in f]

  content_lines = [line for line in stripped_lines if line]
  # Drop the first non-empty line. Slicing (instead of pop) keeps an empty or
  # blank-only file from raising IndexError.
  # NOTE(review): presumably this is a BOM-only or title line of the export — confirm.
  content_lines = content_lines[1:]

  all_workouts = []
  single_workout = []
  header_seen = False
  for line in content_lines:
    if 'Week' in line or 'Standalone' in line:
      # A header closes the workout collected so far (if one was started).
      if header_seen:
        all_workouts.append(single_workout)
        single_workout = []
      header_seen = True
    single_workout.append(line)

  # The last workout has no following header to close it, so flush it here;
  # without this the final workout of the file is silently lost.
  if header_seen:
    all_workouts.append(single_workout)

  return all_workouts

### Transform

In [229]:
def _parse_workout_description(header_line):
    """Build the 'description' dict of a workout from its header line.

    The header is ';'-separated: a '·'-separated label, the start time and the
    duration (each optionally wrapped in double quotes).
    """
    header_fields = header_line.split(';')
    description_info = [item.strip() for item in header_fields[0].strip('"').split('·')]
    start_time = header_fields[1].strip('"')

    description = {
        # The id is the start time reduced to its digits.
        'id': re.sub(r'\D', '', start_time),
        'day_name': description_info[0],
        'day_number': description_info[1],
        'week_number': '',
        'plan_name': '',
        'plan_function': '',
        'start_time': start_time,
        'duration': header_fields[2].strip('"'),
    }

    # Standalone workouts follow a different structure and need specific treatment
    if 'Standalone' in header_line:
        description['week_number'] = '0'
        description['plan_name'] = 'Standalone'
        description['plan_function'] = description_info[2]
    else:
        description['week_number'] = description_info[2]
        description['plan_name'] = description_info[3]
        description['plan_function'] = description_info[4]

    return description


def _group_exercise_lines(lines, header_min_length=13):
    """Group a workout's body lines into {'exercise': header, 'data': [...]} dicts.

    A line longer than `header_min_length` characters is treated as an exercise
    header; shorter lines are data lines (set header / set rows) of the current
    exercise. Data lines that precede the first exercise header are ignored.
    """
    grouped = []
    current = None
    for line in lines:
        if len(line) > header_min_length:
            if current is not None:
                grouped.append(current)
            current = {'exercise': line, 'data': []}
        elif current is not None:
            current['data'].append(line)
    # The last exercise has no following header to close it. Only flush a real
    # exercise: a workout without any exercise line must yield no groups.
    if current is not None:
        grouped.append(current)
    return grouped


def _parse_exercise_sets(raw_exercise, workout_id):
    """Turn one grouped exercise into a list of flat per-set dicts."""
    raw_header = raw_exercise['exercise']
    if ';' in raw_header:
        split_header = raw_header.split(';')
        name_part = split_header[0]
        exercise_comments = split_header[1]
    else:
        name_part = raw_header
        # NOTE(review): the "no comment" placeholder is a list while real
        # comments are strings; kept as-is so the exported values do not change.
        exercise_comments = []
    exercise_info = [item.strip() for item in name_part.strip('"').split('·')]

    # maxsplit=1: only the first ". " separates the order number from the name,
    # so names that contain ". " themselves are not truncated.
    order_and_name = exercise_info[0].split('. ', 1)

    base_fields = {
        'workout_id': workout_id,
        'exercise_number': order_and_name[0],
        'exercise_name': order_and_name[1],
        'equipment': exercise_info[1],
        'reps_goal': exercise_info[2] if len(exercise_info) > 2 else '',
    }

    data_lines = raw_exercise['data']
    if not data_lines:
        # Exercise logged without a set header or set rows: nothing to emit.
        return []

    # The first data line names the columns of the set rows; it is the same for
    # every set of the exercise, so classify it once.
    set_header = data_lines[0].split(';')
    has_weight = 'KG' in set_header
    if has_weight or 'REPS' in set_header:
        measure = 'reps'
    elif 'SECS' in set_header:
        measure = 'secs'
    elif 'MINS' in set_header:
        measure = 'mins'
    else:
        measure = 'error'

    exercise_sets = []
    for set_line in data_lines[1:]:
        set_info = set_line.split(';')
        complete_set = dict(base_fields)
        complete_set['set_number'] = set_info[0]

        if measure == 'error':
            # Unknown set header: emit sentinel values instead of guessing columns.
            complete_set['weigth'] = 'error'
            complete_set['quantity'] = '-1'
        elif has_weight:
            complete_set['weigth'] = set_info[1]
            complete_set['quantity'] = set_info[2]
        else:
            complete_set['weigth'] = ''
            complete_set['quantity'] = set_info[1]
        complete_set['measure'] = measure
        complete_set['comments'] = exercise_comments

        exercise_sets.append(complete_set)

    return exercise_sets


def get_workout_dict(raw_workout_list):
    """Parse raw workouts into description / per-set exercise dictionaries.

    Args:
        raw_workout_list: List of workouts, each a list of text lines whose
            first element is the workout header line and whose remaining
            elements are exercise headers and their data lines.

    Returns:
        A list with one dict per workout:
            'description': id, day_name, day_number, week_number, plan_name,
                plan_function, start_time, duration (all strings).
            'exercises': a flat list with one dict per logged set, holding
                workout_id, exercise_number, exercise_name, equipment,
                reps_goal, set_number, weigth, quantity, measure, comments.
        Workouts without exercises, and exercises without data lines,
        contribute no set rows.

    Raises:
        IndexError: If a workout or exercise header has fewer fields than the
            expected layout, or a set row has fewer columns than its header
            implies.
    """
    workout_listdict = []

    for workout in raw_workout_list:
        description = _parse_workout_description(workout[0])

        exercises = []
        for raw_exercise in _group_exercise_lines(workout[1:]):
            exercises.extend(_parse_exercise_sets(raw_exercise, description['id']))

        workout_listdict.append({
            'description': description,
            'exercises': exercises,
        })

    return workout_listdict

# Extract

In [230]:
# Read the export file and group its non-empty lines into one list of lines per workout.
raw_workouts = get_raw_workouts(WORKOUTS_FILE)

In [231]:
#raw_workouts[1]

# Transform

## Flattening

In [232]:
# Parse every raw workout into {'description': {...}, 'exercises': [one dict per set]}.
normalized_workouts_dict = get_workout_dict(raw_workouts)

In [233]:
#pprint.pprint(normalized_workouts_dict[1], sort_dicts=False) #['exercises']

## Dataframes

In [234]:
# One record per workout: its description dict.
workouts_list = [workout['description'] for workout in normalized_workouts_dict]

# One record per logged set, flattened across all workouts.
exercises_log_list = [
    set_row
    for workout in normalized_workouts_dict
    for set_row in workout['exercises']
]

# Build both tables in one shot from the lists of records.
df_workouts = pd.DataFrame(workouts_list)
df_exercises_log = pd.DataFrame(exercises_log_list)


# Save

In [239]:
# Persist both tables as ';'-separated CSV files (',' as decimal mark, no index column).
# NOTE(review): presumably SILVER_FOLDER must already exist — to_csv does not create
# missing directories; confirm before running on a fresh checkout.
df_workouts.to_csv(f"{SILVER_FOLDER}/workouts.csv", sep=";", decimal=',', index=False)
df_exercises_log.to_csv(f"{SILVER_FOLDER}/exercises_log.csv", sep=";", decimal=',', index=False)

# Tests

In [236]:
#df_exercises_log

In [237]:
#raw_workouts[2]

In [238]:
#pprint.pprint(normalized_workouts_dict[2], sort_dicts=False)