Now that we've covered body composition, let's get into the swim analysis. First, we'll load all of our Apple Health swimming workout data and take a look at it.

In [91]:
import pandas as pd
import plotly.graph_objects as go
from IPython.display import display, HTML

# --- Load and clean the workout data (HKWorkoutActivityTypeSwimming) ---
# NOTE(review): the "2024-04-99" stamp in this filename is not a valid calendar
# date -- confirm it matches the exported file on disk.
workout_data_path = '/Users/ngirmay/Documents/GitHub/ironman_retrospective/apple_health/health_data_exported/HKWorkoutActivityTypeSwimming_2024-04-99_18-31-37_SimpleHealthExportCSV.csv'
# The first line of the export is skipped, so the second line supplies the column names.
workout_data = pd.read_csv(workout_data_path, skiprows=1)

# Keep only the columns used below. `.copy()` makes this an independent frame so
# the column assignments that follow cannot trigger SettingWithCopyWarning.
workout_data = workout_data[['startDate', 'totalEnergyBurned', 'totalDistance', 'totalSwimmingStrokeCount', 'duration']].copy()

# Unparseable dates become NaT and are dropped by the filter further down
workout_data['startDate'] = pd.to_datetime(workout_data['startDate'], errors='coerce')

# Strip the unit text from each measurement and convert it to a number.
# `.astype(str)` keeps this working even when pandas already parsed a column as
# numeric (the `.str` accessor raises on non-string columns); missing values turn
# into the text 'nan', which `to_numeric` reads back as NaN.
unit_suffixes = {
    'totalEnergyBurned': ' kcal',
    'totalDistance': ' m',
    'totalSwimmingStrokeCount': ' count',
}
for column_name, unit_suffix in unit_suffixes.items():
    workout_data[column_name] = pd.to_numeric(
        workout_data[column_name].astype(str).str.replace(unit_suffix, '', regex=False),
        errors='coerce',
    )
workout_data['duration'] = pd.to_numeric(workout_data['duration'], errors='coerce')

# Convert duration from seconds to minutes
workout_data['duration_min'] = workout_data['duration'] / 60

# Average strokes per minute. Non-positive durations are masked to NaN first so a
# zero-length workout produces NaN rather than an infinite rate that would distort
# the plot axis and the summary statistics.
positive_duration_min = workout_data['duration_min'].where(workout_data['duration_min'] > 0)
workout_data['avgStrokeCountPerMin'] = workout_data['totalSwimmingStrokeCount'] / positive_duration_min

# Drop rows missing any key column, then order them chronologically so the
# 'lines+markers' traces connect workouts in date order (stable sort keeps the
# file order for workouts sharing a timestamp).
filtered_workout_data = (
    workout_data
    .dropna(subset=['startDate', 'totalEnergyBurned', 'totalDistance', 'totalSwimmingStrokeCount', 'duration'])
    .sort_values('startDate', kind='mergesort')
)

# Milestone dates marked on the charts
training_start_date = pd.to_datetime("2022-12-01")
ironman_event_date = pd.to_datetime("2023-07-23")

# --- Overlay 1: Calories Burned and Total Distance Swum Over Time ---
fig1 = go.Figure()

# Calories burned on the primary (left) y-axis
fig1.add_trace(go.Scatter(
    x=filtered_workout_data['startDate'],
    y=filtered_workout_data['totalEnergyBurned'],
    mode='lines+markers',
    name='Calories Burned (Kcal)',
    line=dict(color='#e63946'),
))

# Total distance on the secondary (right) y-axis, which is configured in the layout below
fig1.add_trace(go.Scatter(
    x=filtered_workout_data['startDate'],
    y=filtered_workout_data['totalDistance'],
    mode='lines+markers',
    name='Total Distance (m)',
    line=dict(color='#457b9d'),
    yaxis='y2',
))

# Dashed green vertical line plus a label for each milestone. The lines run from
# 0 up to the largest calorie value, measured on the primary y-axis.
peak_calories = filtered_workout_data['totalEnergyBurned'].max()
for milestone_date, milestone_label in (
    (training_start_date, "Training Start"),
    (ironman_event_date, "Ironman Event"),
):
    fig1.add_shape(type='line', x0=milestone_date, x1=milestone_date, y0=0, y1=peak_calories,
                   line=dict(color='green', dash='dash'))
    fig1.add_annotation(x=milestone_date, y=peak_calories,
                        text=milestone_label, showarrow=False, yshift=10, font=dict(color="black"))

# Axis titles use the nested `title=dict(text=..., font=...)` form: the flat
# `titlefont` shortcut is deprecated and is rejected by newer plotly releases.
fig1.update_layout(
    title="Calories Burned and Total Distance Over Time",
    xaxis=dict(title='Date'),
    yaxis=dict(title=dict(text='Calories Burned (Kcal)', font=dict(color='#e63946'))),
    yaxis2=dict(title=dict(text='Total Distance (m)', font=dict(color='#457b9d')), overlaying='y', side='right'),
    plot_bgcolor='#f1faee'
)

# --- Overlay 2: Duration (Minutes) and Average Stroke Count Per Minute Over Time ---
fig2 = go.Figure()

# Workout duration on the primary (left) y-axis
fig2.add_trace(go.Scatter(
    x=filtered_workout_data['startDate'],
    y=filtered_workout_data['duration_min'],
    mode='lines+markers',
    name='Duration (minutes)',
    line=dict(color='#457b9d'),
))

# Stroke rate on the secondary (right) y-axis, which is configured in the layout below
fig2.add_trace(go.Scatter(
    x=filtered_workout_data['startDate'],
    y=filtered_workout_data['avgStrokeCountPerMin'],
    mode='lines+markers',
    name='Avg Stroke Count/Min',
    line=dict(color='#e63946'),
    yaxis='y2',
))

# Dashed green vertical line plus a label for each milestone. The lines run from
# 0 up to the longest workout duration, measured on the primary y-axis.
peak_duration_min = filtered_workout_data['duration_min'].max()
for milestone_date, milestone_label in (
    (training_start_date, "Training Start"),
    (ironman_event_date, "Ironman Event"),
):
    fig2.add_shape(type='line', x0=milestone_date, x1=milestone_date, y0=0, y1=peak_duration_min,
                   line=dict(color='green', dash='dash'))
    fig2.add_annotation(x=milestone_date, y=peak_duration_min,
                        text=milestone_label, showarrow=False, yshift=10, font=dict(color="black"))

# Axis titles use the nested `title=dict(text=..., font=...)` form: the flat
# `titlefont` shortcut is deprecated and is rejected by newer plotly releases.
fig2.update_layout(
    title="Duration and Average Stroke Count Per Minute Over Time",
    xaxis=dict(title='Date'),
    yaxis=dict(title=dict(text='Duration (minutes)', font=dict(color='#457b9d'))),
    yaxis2=dict(title=dict(text='Avg Stroke Count/Min', font=dict(color='#e63946')), overlaying='y', side='right'),
    plot_bgcolor='#f1faee'
)

# Render both figures, in the order they were built
for figure in (fig1, fig2):
    figure.show()

# --- Summary Statistics ---
# Metrics to summarise, and the display labels for the eight statistics that
# `describe()` returns for numeric columns (applied by position).
stat_columns = ['totalEnergyBurned', 'totalDistance', 'totalSwimmingStrokeCount', 'duration_min', 'avgStrokeCountPerMin']
stat_labels = ['Count', 'Mean', 'Std Dev', 'Min', '25%', '50%', '75%', 'Max']

# One row per metric, one column per statistic, rounded to two decimal places
summary_stats = (
    filtered_workout_data[stat_columns]
    .describe()
    .T
    .set_axis(stat_labels, axis=1)
    .round(2)
)

# Render the table as HTML under a heading
summary_html = summary_stats.to_html(classes="table table-striped", border=0)
display(HTML(f"<h3>Summary Statistics</h3>{summary_html}"))


Unnamed: 0,Count,Mean,Std Dev,Min,25%,50%,75%,Max
totalEnergyBurned,39.0,415.44,259.82,24.51,271.42,395.65,490.94,1303.37
totalDistance,39.0,1442.89,948.1,54.86,914.4,1316.74,1645.92,4397.84
totalSwimmingStrokeCount,39.0,855.15,566.13,28.0,538.0,792.0,1045.0,2611.99
duration_min,39.0,43.39,23.3,2.65,29.2,45.24,52.9,99.05
avgStrokeCountPerMin,39.0,18.9,3.84,5.44,16.99,19.12,21.02,26.37


In [68]:
import os
import pandas as pd
import io

def read_swim_file(filepath):
    """Split one swim CSV file into its metadata rows and its swim-data table.

    The file is read as two stacked tables: lines 1-2 are metadata, line 3 is
    ignored, line 4 is the swim-data header row and every later line is a
    swim-data record.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file to read.

    Returns
    -------
    metadata : pandas.DataFrame
        Lines 1-2 parsed with ``header=None``, so both lines are data rows and
        the columns carry integer labels.
    swim_data : pandas.DataFrame
        The records that follow line 4. The header text on line 4 is consumed
        but not kept: columns are labelled ``0 .. max_cols - 1`` instead, where
        ``max_cols`` is the comma-separated width of the widest line.

    Raises
    ------
    ValueError
        If the file has fewer than four lines, i.e. there is no swim-data
        section to parse.
    """
    # Read the whole file so it can be sliced by line number
    with open(filepath, 'r') as file:
        lines = file.readlines()

    metadata_lines = lines[:2]
    swim_data_lines = lines[3:]

    # Fail with a message that names the file; otherwise the max() below would
    # raise the opaque "max() arg is an empty sequence".
    if not swim_data_lines:
        raise ValueError(
            f"{filepath}: no swim-data section found "
            f"(expected a header on line 4, file has {len(lines)} line(s))"
        )

    # Parse metadata
    metadata = pd.read_csv(io.StringIO(''.join(metadata_lines)), header=None)

    # Widest line in the swim-data section, counted by raw commas; this fixes the
    # number of integer column labels handed to the parser.
    max_cols = max(len(line.split(',')) for line in swim_data_lines)
    swim_data = pd.read_csv(io.StringIO(''.join(swim_data_lines)),
                            header=0,  # line 4 is the header row; `names` replaces its labels
                            names=range(max_cols),
                            skip_blank_lines=True)

    return metadata, swim_data

# Per-file frames collected here, then concatenated once after the loop
all_metadata = []
all_swim_data = []

# Set path to your folder containing all files
swim_data_path = '/Users/ngirmay/Documents/GitHub/ironman_retrospective/form_swimming/form_swim_data/'

# os.listdir returns entries in arbitrary order, so sort the CSV names to get the
# same row order in the combined frames on every run.
all_files = sorted(file for file in os.listdir(swim_data_path) if file.endswith('.csv'))

# Read each file. A file that fails to parse is reported and skipped so one bad
# export does not stop the rest from loading.
for file in all_files:
    try:
        metadata, swim_data = read_swim_file(os.path.join(swim_data_path, file))
        metadata['Source File'] = file  # Tag every row with the file it came from
        swim_data['Source File'] = file

        all_metadata.append(metadata)
        all_swim_data.append(swim_data)
    except Exception as e:
        print(f"Error parsing {file}: {e}")

# Only combine data if files were successfully read. The two lists are appended
# to together, so they are either both empty or both populated.
if all_swim_data:
    combined_swim_data = pd.concat(all_swim_data, ignore_index=True)
    print("Combined Swim Data:")
    print(combined_swim_data.head())

if all_metadata:
    combined_metadata = pd.concat(all_metadata, ignore_index=True)
    print("\nCombined Metadata:")
    print(combined_metadata.head())
else:
    print("No valid data was loaded.")


Combined Swim Data:
            0           1  2      3    4   5     6        7        8        9  \
0  05/02/2023  05:29:26PM  1   2x20   20  20    FR  0:20.71  0:00.00  0:20.71   
1  05/02/2023  05:29:46PM  1   2x20   20   0  REST  0:00.00  0:04.55  0:25.26   
2  05/02/2023  05:29:51PM  1   2x20   20  20    FR  0:21.04  0:00.00  0:46.30   
3  05/02/2023  05:30:12PM  1   2x20   20   0  REST  0:00.00  0:04.77  0:51.07   
4  05/02/2023  05:30:17PM  2  1x120  120  20    FR  0:21.36  0:00.00  1:12.44   

   ...    12    13       14       15  16  17  18  19  20  \
0  ...  20.0  0.50  1:43.56  0:51.78  39  59  18  18   4   
1  ...  20.0  0.00  0:00.00  0:00.00   0   0   0   0   0   
2  ...  40.0  0.95  1:45.23  0:52.61  38  56  17  17   4   
3  ...  40.0  0.00  0:00.00  0:00.00   0   0   0   0   0   
4  ...  60.0  0.97  1:46.82  0:53.41  38  58  17  17   4   

                  Source File  
0  FORM_2023-05-02_172926.csv  
1  FORM_2023-05-02_172926.csv  
2  FORM_2023-05-02_172926.csv  
3  F