In [2]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Locations of the two CSV exports compared in this cell.
wahoo_data_path = '/Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/wahoo/wahoo_data.csv'
apple_health_cycling_path = '/Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/apple_health/health_data_exported/HKWorkoutActivityTypeCycling_2024-04-99_18-31-36_SimpleHealthExportCSV.csv'

# Load both exports; the first row of the Apple Health file is skipped.
wahoo_data = pd.read_csv(wahoo_data_path)
apple_health_data = pd.read_csv(apple_health_cycling_path, skiprows=1)

# Wahoo: parse 'Date' and count the distinct calendar days present.
wahoo_data = wahoo_data.assign(Date=pd.to_datetime(wahoo_data['Date']))
wahoo_days = wahoo_data['Date'].dt.date
wahoo_workouts = wahoo_days.nunique()

# Apple Health: same procedure on the 'startDate' column.
apple_health_data = apple_health_data.assign(
    startDate=pd.to_datetime(apple_health_data['startDate'])
)
apple_health_days = apple_health_data['startDate'].dt.date
apple_health_workouts = apple_health_days.nunique()

# One row per source, ready for plotting.
workout_data = pd.DataFrame(
    [('Wahoo', wahoo_workouts), ('Apple Health', apple_health_workouts)],
    columns=['Source', 'Workout Count'],
)

# Bar chart of the per-source counts, labelled with the count itself.
bar_options = {
    'x': 'Source',
    'y': 'Workout Count',
    'title': 'Number of Cycling Workouts by Source',
    'text': 'Workout Count',
    'color': 'Source',
}
fig = px.bar(workout_data, **bar_options)

# Render the figure.
fig.show()


FileNotFoundError: [Errno 2] No such file or directory: '/Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/wahoo/wahoo_data.csv'

In [3]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from IPython.display import display, HTML

# Define the file paths
wahoo_data_path = '/Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/wahoo/wahoo_data.csv'
apple_health_cycling_path = '/Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/apple_health/health_data_exported/HKWorkoutActivityTypeCycling_2024-04-99_18-31-36_SimpleHealthExportCSV.csv'

# Zone label -> hex colour; reused as the palette for every figure in this cell
color_mapping = {'Zone 1 (50%-60%)': '#e63946', 'Zone 2 (60%-70%)': '#a8dadc', 
                 'Zone 3 (70%-80%)': '#90EE90', 'Zone 4 (80%-90%)': '#457b9d', 'Zone 5 (90%-100%)': '#1d3557'}

# Read the data (the first row of the Apple Health export is skipped)
wahoo_data = pd.read_csv(wahoo_data_path)
apple_health_data = pd.read_csv(apple_health_cycling_path, skiprows=1)

# Convert necessary columns to datetime
wahoo_data['Date'] = pd.to_datetime(wahoo_data['Date'])
apple_health_data['startDate'] = pd.to_datetime(apple_health_data['startDate'])

# --- Clean the Apple Health data ---
# Convert 'totalEnergyBurned' to numeric, coercing errors to NaN
apple_health_data['totalEnergyBurned'] = pd.to_numeric(apple_health_data['totalEnergyBurned'], errors='coerce')

# --- Summary Statistics ---

# Parse the Wahoo 'Duration' column once, as a whole Series, and express it in hours.
wahoo_duration_hours = pd.to_timedelta(wahoo_data['Duration']).dt.total_seconds() / 3600

# Wahoo Summary
wahoo_summary = {
    # Counted per calendar date so the figure is comparable with the
    # Apple Health "Total Workouts" value shown in the same table row.
    "Total Workouts": wahoo_data['Date'].dt.date.nunique(),
    "Total Duration (hours)": wahoo_duration_hours.sum(),
    "Average Power (W)": wahoo_data['Avg Power'].mean(),
    "Average Cadence (RPM)": wahoo_data['Avg Cadence'].mean(),
    "Average Speed (km/h)": wahoo_data['Avg Speed'].mean(),
}

# Apple Health Summary
apple_health_summary = {
    "Total Workouts": apple_health_data['startDate'].dt.date.nunique(),
    "Total Calories Burned (kcal)": apple_health_data['totalEnergyBurned'].sum(),
    # NOTE(review): dividing by 3600 assumes 'duration' is numeric seconds - confirm against the export.
    "Total Duration (hours)": apple_health_data['duration'].sum() / 3600,
}

# Turn each summary dict into a two-column (Metric, value) table
wahoo_summary_df = pd.DataFrame(list(wahoo_summary.items()), columns=["Metric", "Wahoo Stats"])
apple_health_summary_df = pd.DataFrame(list(apple_health_summary.items()), columns=["Metric", "Apple Health Stats"])

# Outer merge keeps metrics that exist for only one source (the other side is NaN)
summary_df = pd.merge(wahoo_summary_df, apple_health_summary_df, on="Metric", how="outer")

# Display the summary stats using IPython display
display(HTML(summary_df.to_html(index=False)))

# --- Visualization 1: Workouts Over Time ---
# Number of rows per calendar month in each source
wahoo_workouts_per_month = wahoo_data.groupby(wahoo_data['Date'].dt.to_period('M')).size()
apple_health_workouts_per_month = apple_health_data.groupby(apple_health_data['startDate'].dt.to_period('M')).size()

# Use every month seen in EITHER source. Indexing on the Wahoo months alone
# would silently drop months in which only Apple Health recorded workouts.
all_months = wahoo_workouts_per_month.index.union(apple_health_workouts_per_month.index).sort_values()

# Create a combined dataframe for plotting; a month missing from one source counts as 0
workouts_per_month = pd.DataFrame({
    'Month': all_months.astype(str),  # Convert to string for plotting
    'Wahoo Workouts': wahoo_workouts_per_month.reindex(all_months, fill_value=0).values,
    'Apple Health Workouts': apple_health_workouts_per_month.reindex(all_months, fill_value=0).values
})

# Plot both series as lines, coloured with the first two palette entries
fig_workouts_time = px.line(workouts_per_month, x='Month', y=['Wahoo Workouts', 'Apple Health Workouts'],
                            title='Workouts Performed Over Time', markers=True,
                            color_discrete_sequence=[color_mapping['Zone 1 (50%-60%)'], color_mapping['Zone 2 (60%-70%)']])

# Update axis labels and titles
fig_workouts_time.update_layout(
    xaxis_title="Month",
    yaxis_title="Number of Workouts",
    legend_title_text='Workout Source',
    title_x=0.5,
    template="simple_white"
)

# --- Visualization 2: Distribution of Power, Speed, and Cadence ---
# (source column, legend label, palette key) for each overlaid histogram,
# listed in the order the traces are added.
distribution_specs = (
    ('Avg Power', 'Power (W)', 'Zone 5 (90%-100%)'),
    ('Avg Speed', 'Speed (km/h)', 'Zone 4 (80%-90%)'),
    ('Avg Cadence', 'Cadence (RPM)', 'Zone 3 (70%-80%)'),
)

fig_distribution = go.Figure()
for metric_column, legend_label, zone_key in distribution_specs:
    histogram_trace = go.Histogram(
        x=wahoo_data[metric_column],
        name=legend_label,
        opacity=0.75,
        marker={'color': color_mapping[zone_key]},
    )
    fig_distribution.add_trace(histogram_trace)

# Overlay the three histograms on shared axes and label the figure.
distribution_layout = {
    'barmode': 'overlay',
    'title_text': 'Distribution of Power, Speed, and Cadence',
    'xaxis_title': "Metric Value",
    'yaxis_title': "Count",
    'template': "simple_white",
}
fig_distribution.update_layout(**distribution_layout)

# --- Visualization 3: Heatmap for Workout Intensity (based on Avg Power) ---
# Workout duration in hours, used as the Y-axis of the heatmap
wahoo_data['Duration (hrs)'] = pd.to_timedelta(wahoo_data['Duration']).dt.total_seconds() / 3600

# Create a heatmap with Workout Duration on the Y-axis.
# histfunc='avg' is required for the cells to show AVERAGE power as the title
# states: when `z` is given, density_heatmap otherwise sums z within each bin,
# so a bin containing several workouts would show their summed power instead.
fig_heatmap = px.density_heatmap(wahoo_data, x='Date', y='Duration (hrs)', z='Avg Power',
                                 histfunc='avg',
                                 title="Workout Intensity Heatmap (Average Power vs Duration)",
                                 labels={'Duration (hrs)': 'Workout Duration (hrs)', 'Avg Power': 'Avg Power (Watts)'},
                                 color_continuous_scale=[color_mapping['Zone 5 (90%-100%)'], 
                                                         color_mapping['Zone 4 (80%-90%)'], 
                                                         color_mapping['Zone 3 (70%-80%)'], 
                                                         color_mapping['Zone 2 (60%-70%)'], 
                                                         color_mapping['Zone 1 (50%-60%)']])

# Update layout for clarity
fig_heatmap.update_layout(
    xaxis_title="Date",
    yaxis_title="Workout Duration (hrs)",
    title_x=0.5,
    template="simple_white"
)

# --- Visualization 4: Compare Average Power Across Months ---
# Tag each row with its calendar month (as text) and average 'Avg Power' per month.
wahoo_data['Month'] = wahoo_data['Date'].dt.to_period('M').astype(str)
monthly_avg_power = wahoo_data.groupby('Month', as_index=False)['Avg Power'].mean()

# Single line with markers, drawn in the Zone 4 palette colour.
avg_power_line_options = {
    'x': 'Month',
    'y': 'Avg Power',
    'title': 'Average Power Performed Each Month',
    'markers': True,
    'line_shape': 'linear',
    'color_discrete_sequence': [color_mapping['Zone 4 (80%-90%)']],
}
fig_avg_power = px.line(monthly_avg_power, **avg_power_line_options)

# Axis titles, centred figure title and template for the average power plot.
fig_avg_power.update_layout(
    template="simple_white",
    title_x=0.5,
    yaxis_title="Average Power (W)",
    xaxis_title="Month",
)

# Render the four figures of this cell, in the order they were built.
for figure in (fig_workouts_time, fig_distribution, fig_heatmap, fig_avg_power):
    figure.show()

FileNotFoundError: [Errno 2] No such file or directory: '/Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/wahoo/wahoo_data.csv'

In [32]:
import pandas as pd
import plotly.express as px

# Read the Wahoo cycling export.
wahoo_data_path = '/Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/wahoo/wahoo_data.csv'
cycling_data = pd.read_csv(wahoo_data_path)

# 1. Bar Graph: how many rows carry each 'Activity Name'.
activity_counts = cycling_data['Activity Name'].value_counts()

# Bars are placed at the activity names with heights equal to the counts.
activity_bar_options = {
    'x': activity_counts.index,
    'y': activity_counts.values,
    'title': 'Count of Different Cycling Activities',
    'labels': {'x': 'Activity Name', 'y': 'Count'},
}
fig1 = px.bar(activity_counts, **activity_bar_options)

# Rotate the x tick labels to vertical.
fig1.update_layout(xaxis={'tickangle': -90})

# Render the first plot.
fig1.show()

# Derive TSS only when the export does not already provide a 'TSS' column.
if 'TSS' not in cycling_data.columns:
    # Assuming 'Duration' and 'Normalized Power' exist to calculate TSS (using FTP of 250W as an example)
    cycling_data['Duration (hours)'] = pd.to_timedelta(cycling_data['Duration']).dt.total_seconds() / 3600
    FTP = 250  # Adjust this value based on your actual FTP
    # Intensity Factor: normalized power relative to FTP
    cycling_data['IF'] = cycling_data['Normalized Power'] / FTP
    # TSS = (seconds * NP * IF) / (FTP * 3600) * 100
    #     = hours * NP * IF / FTP * 100
    # so one hour ridden exactly at FTP scores 100. Dividing by (FTP * 100)
    # instead would make every score 10,000 times too small.
    cycling_data['TSS'] = (cycling_data['Duration (hours)'] * cycling_data['Normalized Power'] * cycling_data['IF']) / FTP * 100

# 2. Bar Graph: one bar per row of the data, with the TSS value printed on it.
tss_bar_options = {
    'y': 'TSS',
    'title': 'TSS Values for Each Activity',
    'labels': {'index': 'Activity', 'TSS': 'Training Stress Score'},
    'text': 'TSS',
}
fig2 = px.bar(cycling_data, **tss_bar_options)

# Axis titles; the legend is hidden.
fig2.update_layout(
    showlegend=False,
    yaxis={'title': 'Training Stress Score'},
    xaxis={'title': 'Activity Index'},
)

# Render the second plot.
fig2.show()

# 3. Summary statistics of the TSS column, printed under a heading line.
summary_stats = cycling_data['TSS'].describe()
print("Summary Statistics for TSS:", summary_stats, sep="\n")

# 4. Histogram of TSS (nbins=10).
tss_histogram_options = {
    'x': 'TSS',
    'title': 'Distribution of Training Stress Score (TSS)',
    'labels': {'TSS': 'Training Stress Score'},
    'nbins': 10,
}
fig3 = px.histogram(cycling_data, **tss_histogram_options)

# Render the distribution plot.
fig3.show()

# 5. Scatter of TSS against IF with an OLS trendline; skipped when there is no 'IF' column.
has_intensity_factor = 'IF' in cycling_data.columns
if has_intensity_factor:
    tss_vs_if_options = {
        'x': 'IF',
        'y': 'TSS',
        'title': 'TSS vs Intensity Factor (IF)',
        'labels': {'IF': 'Intensity Factor', 'TSS': 'Training Stress Score'},
        'trendline': 'ols',
    }
    fig4 = px.scatter(cycling_data, **tss_vs_if_options)
    fig4.show()


Summary Statistics for TSS:
count     38.000000
mean      44.526316
std       38.694044
min        0.000000
25%       14.000000
50%       40.000000
75%       61.750000
max      156.000000
Name: TSS, dtype: float64


In [30]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from scipy import stats
from statsmodels.formula.api import ols, mixedlm
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from tabulate import tabulate

# Zone label -> hex colour palette used by the plots in this cell.
color_mapping = dict(zip(
    ['Zone 1 (50%-60%)', 'Zone 2 (60%-70%)', 'Zone 3 (70%-80%)', 'Zone 4 (80%-90%)', 'Zone 5 (90%-100%)'],
    ['#e63946', '#a8dadc', '#90EE90', '#457b9d', '#1d3557'],
))

# Load the Wahoo export and prepare it in one pipeline:
#   1. parse 'Duration' into timedeltas,
#   2. add the duration in hours,
#   3. keep only rows lasting at least five minutes,
#   4. rename the columns whose names contain spaces.
cycling_data = (
    pd.read_csv(
        '/Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/wahoo/wahoo_data.csv',
        parse_dates=['Date'],
    )
    .assign(Duration=lambda rides: pd.to_timedelta(rides['Duration']))
    .assign(Duration_hours=lambda rides: rides['Duration'].dt.total_seconds() / 3600)
    .loc[lambda rides: rides['Duration_hours'] >= 5/60]
    .rename(columns={
        'Normalized Power': 'Normalized_Power',
        'Avg Power': 'Avg_Power',
        'Activity Name': 'Activity_Name',
    })
)

# Round every numeric column to two decimal places.
numeric_columns = cycling_data.select_dtypes(include=[np.number]).columns
cycling_data[numeric_columns] = cycling_data[numeric_columns].round(2)

# Columns summarised and correlated below.
key_metrics = ['TSS', 'Avg_Power', 'Normalized_Power', 'IF', 'Duration_hours']

# 1. Descriptive statistics of the key metrics, printed as a text table.
print("Descriptive Statistics:")
desc_stats = cycling_data[key_metrics].describe().round(2)
print(tabulate(desc_stats, headers='keys', tablefmt='pretty'))

# 2. Correlation matrix of the same metrics, drawn as an annotated image.
corr_matrix = cycling_data[key_metrics].corr().round(2)
palette_scale = list(color_mapping.values())
fig = px.imshow(corr_matrix, text_auto=True, color_continuous_scale=palette_scale)
fig.update_layout(title='Correlation Matrix of Key Metrics')
fig.show()

# 3. One-way ANOVA of TSS, one sample per activity name.
activities = cycling_data['Activity_Name'].unique()
tss_by_activity = [
    activity_tss.values
    for _, activity_tss in cycling_data.groupby('Activity_Name')['TSS']
]
f_statistic, p_value = stats.f_oneway(*tss_by_activity)
print("\nOne-way ANOVA Results for TSS across activities:")
anova_results = [
    ["F-statistic", f"{f_statistic:.2f}"],
    ["p-value", f"{p_value:.4f}"],
]
print(tabulate(anova_results, headers=["Metric", "Value"], tablefmt='pretty'))

# 4. Tukey's HSD pairwise comparison of TSS between activity names.
tukey_results = pairwise_tukeyhsd(cycling_data['TSS'], cycling_data['Activity_Name'])
print("\nTukey's HSD Test Results:")
# The first row of the summary table holds the column headers.
tukey_header, *tukey_rows = tukey_results.summary().data
print(tabulate(tukey_rows, headers=tukey_header, tablefmt='pretty'))

# 5. Multiple regression of TSS on duration, normalized power and IF.
# Any failure is reported as a message rather than stopping the cell.
regression_formula = 'TSS ~ Duration_hours + Normalized_Power + IF'
try:
    model = ols(regression_formula, data=cycling_data).fit()
    print("\nMultiple Regression Results:")
    regression_report = model.summary().as_text()
    print(regression_report)
except Exception as regression_error:
    print(f"Error in Multiple Regression: {regression_error}")

# 6. Linear mixed model of TSS on duration and normalized power,
# grouped by activity name. Failures are reported the same way.
mixed_model_formula = 'TSS ~ Duration_hours + Normalized_Power'
try:
    lmm = mixedlm(
        mixed_model_formula,
        cycling_data,
        groups=cycling_data['Activity_Name'],
    )
    lmm_results = lmm.fit()
    print("\nLinear Mixed Model Results:")
    mixed_model_report = lmm_results.summary().as_text()
    print(mixed_model_report)
except Exception as mixed_model_error:
    print(f"Error in Linear Mixed Model: {mixed_model_error}")

# 7. Independent two-sample t-test (comparing TSS between two specific activity types)
# stats.ttest_ind treats the two groups as independent samples; it is not a
# paired test, so the printed labels say "independent" to match what is computed.
activity1 = 'Endurance 1'
activity2 = 'Endurance 2'
tss_activity1 = cycling_data[cycling_data['Activity_Name'] == activity1]['TSS']
tss_activity2 = cycling_data[cycling_data['Activity_Name'] == activity2]['TSS']
# Each group needs at least two observations: with a single value the sample
# variance is undefined and the test statistic comes back as NaN.
if len(tss_activity1) >= 2 and len(tss_activity2) >= 2:
    t_stat, t_p_value = stats.ttest_ind(tss_activity1, tss_activity2)
    print(f"\nIndependent two-sample t-test results ({activity1} vs {activity2}):")
    t_test_results = [["t-statistic", f"{t_stat:.2f}"], ["p-value", f"{t_p_value:.4f}"]]
    print(tabulate(t_test_results, headers=["Metric", "Value"], tablefmt='pretty'))
else:
    print(f"\nNot enough data for independent t-test between {activity1} and {activity2}")

# 8. Scatter of TSS against duration, points coloured by IF.
scatter_options = {
    'x': 'Duration_hours',
    'y': 'TSS',
    'color': 'IF',
    'color_continuous_scale': 'viridis',
    'title': 'Training Stress Score vs Duration',
}
fig = px.scatter(cycling_data, **scatter_options)

# Faint background bands: (start hour, end hour, label, fill colour).
duration_zones = [
    (0, 1, 'Short', 'rgba(255,0,0,0.05)'),
    (1, 2, 'Medium', 'rgba(0,255,0,0.05)'),
    (2, 3, 'Long', 'rgba(0,0,255,0.05)')
]

for zone_start, zone_end, zone_label, zone_fill in duration_zones:
    fig.add_vrect(
        x0=zone_start,
        x1=zone_end,
        fillcolor=zone_fill,
        layer="below",
        line_width=0,
        annotation_text=zone_label,
        annotation_position="top",
        annotation={'font_size': 10, 'font_color': "gray"},
    )

# Dotted horizontal line at the mean TSS, labelled with its value.
avg_tss = cycling_data['TSS'].mean()
fig.add_hline(
    y=avg_tss,
    line_dash="dot",
    line_color="rgba(0,128,0,0.5)",
    annotation_text=f"Avg TSS: {avg_tss:.2f}",
)

# Arrow annotation on the row with the largest TSS.
max_tss_idx = cycling_data['TSS'].idxmax()
max_tss_row = cycling_data.loc[max_tss_idx]
fig.add_annotation(
    x=max_tss_row['Duration_hours'],
    y=max_tss_row['TSS'],
    text="Max TSS",
    showarrow=True,
    arrowhead=1,
    arrowcolor="rgba(0,0,0,0.5)",
)

# Degree-1 least-squares fit of TSS on duration, sampled at 100 points
# across the observed duration range. Built here, added to the figure last.
trend_coefficients = np.polyfit(cycling_data['Duration_hours'], cycling_data['TSS'], 1)
duration_low = cycling_data['Duration_hours'].min()
duration_high = cycling_data['Duration_hours'].max()
x_trend = np.linspace(duration_low, duration_high, 100)
trend_trace = go.Scatter(
    x=x_trend,
    y=np.polyval(trend_coefficients, x_trend),
    mode='lines',
    name='Trend',
    line={'color': 'rgba(255,0,0,0.5)', 'dash': 'dash'},
)

# Titles, background, colour-bar title, font and margins.
fig.update_layout(
    xaxis_title='Duration (hours)',
    yaxis_title='TSS',
    plot_bgcolor='rgba(240,240,240,0.5)',
    legend_title_text='Intensity Factor',
    coloraxis_colorbar={'title': 'IF'},
    font={'family': "Arial", 'size': 12},
    margin={'l': 50, 'r': 50, 't': 50, 'b': 50},
)

# Same light grid on both axes (x first, then y).
grid_style = {'showgrid': True, 'gridwidth': 1, 'gridcolor': 'rgba(200,200,200,0.5)'}
for update_axes in (fig.update_xaxes, fig.update_yaxes):
    update_axes(**grid_style)

# The trend line goes in last so it is drawn above the scatter points.
fig.add_trace(trend_trace)

fig.show()

Descriptive Statistics:
+-------+-------+-----------+------------------+------+----------------+
|       |  TSS  | Avg_Power | Normalized_Power |  IF  | Duration_hours |
+-------+-------+-----------+------------------+------+----------------+
| count | 33.0  |   33.0    |       33.0       | 33.0 |      33.0      |
| mean  | 51.12 |  124.64   |      140.64      | 0.78 |      0.78      |
|  std  | 37.28 |   12.66   |      22.19       | 0.15 |      0.53      |
|  min  |  9.0  |   100.0   |      102.0       | 0.57 |      0.19      |
|  25%  | 17.0  |   112.0   |      124.0       | 0.65 |      0.42      |
|  50%  | 42.0  |   127.0   |      143.0       | 0.77 |      0.76      |
|  75%  | 66.0  |   134.0   |      156.0       | 0.91 |      0.98      |
|  max  | 156.0 |   149.0   |      188.0       | 1.12 |      2.72      |
+-------+-------+-----------+------------------+------+----------------+



One-way ANOVA Results for TSS across activities:
+-------------+--------+
|   Metric    | Value  |
+-------------+--------+
| F-statistic |  1.91  |
|   p-value   | 0.1451 |
+-------------+--------+

Tukey's HSD Test Results:
+-------------------------------------------+-------------------------------------------+-----------+--------+-----------+----------+--------+
|                  group1                   |                  group2                   | meandiff  | p-adj  |   lower   |  upper   | reject |
+-------------------------------------------+-------------------------------------------+-----------+--------+-----------+----------+--------+
|             A Very Dark Place             |                 Alla Vita                 |   -10.0   |  1.0   | -204.0911 | 184.0911 | False  |
|             A Very Dark Place             |    Big Gear Tempo/Sub-Threshold: 4 x 8    |   10.0    |  1.0   | -184.0911 | 204.0911 | False  |
|             A Very Dark Place             |             

In [36]:
import pandas as pd
import numpy as np
from scipy import stats
from statsmodels.formula.api import ols
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import datetime

# Zone label -> hex colour, listed from Zone 1 to Zone 5.
color_mapping = dict([
    ('Zone 1 (50%-60%)', '#e63946'),
    ('Zone 2 (60%-70%)', '#a8dadc'),
    ('Zone 3 (70%-80%)', '#90EE90'),
    ('Zone 4 (80%-90%)', '#457b9d'),
    ('Zone 5 (90%-100%)', '#1d3557'),
])

# The hex colours alone, in the same order, for use as a discrete colour sequence.
color_list = [*color_mapping.values()]

# Read the TrainingPeaks workouts export; 'WorkoutDay' is parsed as a date column.
file_path = '/Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/training_peaks/workouts.csv'
df = pd.read_csv(filepath_or_buffer=file_path, parse_dates=['WorkoutDay'])

# Function to convert duration string to hours
def duration_to_hours(duration_str):
    """Convert one duration value to a number of hours.

    Parameters
    ----------
    duration_str : float, int, str or missing value
        A number (or numeric string) taken to already be in hours, or a
        clock-style string such as ``"1:30:00"``.

    Returns
    -------
    float or int
        ``0`` for a missing value, otherwise the duration in hours.

    Raises
    ------
    ValueError
        If the value is neither numeric nor parseable by ``pd.to_timedelta``.
    """
    if pd.isna(duration_str):
        return 0
    try:
        return float(duration_str)
    except ValueError:
        # Not a plain number. Retrying float() here would only raise the same
        # error again, so parse the text as a timedelta and convert to hours.
        return pd.to_timedelta(duration_str).total_seconds() / 3600

# Convert PlannedDuration to hours (missing values become 0)
df['Duration'] = df['PlannedDuration'].apply(duration_to_hours)

# Calculate training period: whole days between the first and last workout day
training_start = df['WorkoutDay'].min()
training_end = df['WorkoutDay'].max()
training_period = (training_end - training_start).days

# Coerce TSS and IF to numbers; anything unparseable becomes NaN
df['TSS'] = pd.to_numeric(df['TSS'], errors='coerce')
df['IF'] = pd.to_numeric(df['IF'], errors='coerce')

# Group workouts by type
workout_types = df['WorkoutType'].value_counts()

# Calculate total training volume (sum of the planned durations, in hours)
total_volume = df['Duration'].sum()

# Calculate average TSS and IF (NaN entries are skipped by mean())
avg_tss = df['TSS'].mean()
avg_if = df['IF'].mean()

# Time series analysis: daily totals and the running total of TSS.
# numeric_only=True restricts the daily sum to numeric columns, so text
# columns are not aggregated (summing them is meaningless for this analysis).
df_ts = df.set_index('WorkoutDay')
df_ts = df_ts.resample('D').sum(numeric_only=True)
df_ts['CumulativeTSS'] = df_ts['TSS'].cumsum()

# Print basic statistics
print(f"Training period: {training_period} days")
print(f"Total training volume: {total_volume:.2f} hours")
print(f"Average TSS: {avg_tss:.2f}")
print(f"Average IF: {avg_if:.2f}")
print("\nWorkout type distribution:")
print(workout_types)

# Visualizations
# Box plot of duration for each workout type.
fig = px.box(
    df,
    x='WorkoutType',
    y='Duration',
    color='WorkoutType',
    color_discrete_sequence=color_list,
    title='Duration Distribution by Workout Type',
).update_layout(xaxis_title='Workout Type', yaxis_title='Duration (hours)')
fig.show()

# Scatter of TSS against duration, coloured by workout type.
fig = px.scatter(
    df,
    x='Duration',
    y='TSS',
    color='WorkoutType',
    color_discrete_sequence=color_list,
    title='TSS vs Duration by Workout Type',
).update_layout(xaxis_title='Duration (hours)', yaxis_title='TSS')
fig.show()

# Running total of TSS against the daily index.
fig = px.line(
    df_ts,
    x=df_ts.index,
    y='CumulativeTSS',
    color_discrete_sequence=[color_mapping['Zone 3 (70%-80%)']],
    title='Cumulative TSS over Time',
).update_layout(xaxis_title='Date', yaxis_title='Cumulative TSS')
fig.show()

# IF over time for bike and run workouts, one marker trace per discipline.
bike_data = df.loc[df['WorkoutType'].eq('Bike')]
run_data = df.loc[df['WorkoutType'].eq('Run')]

fig = go.Figure()
if_traces = (
    (bike_data, 'Bike IF', 'Zone 2 (60%-70%)'),
    (run_data, 'Run IF', 'Zone 4 (80%-90%)'),
)
for discipline_data, trace_label, zone_key in if_traces:
    fig.add_trace(go.Scatter(
        x=discipline_data['WorkoutDay'],
        y=discipline_data['IF'],
        mode='markers',
        name=trace_label,
        marker={'color': color_mapping[zone_key]},
    ))
fig.update_layout(title='Intensity Factor (IF) Progression', xaxis_title='Date', yaxis_title='IF')
fig.show()

# Taper window: every workout dated within the 14 days up to the last workout day.
taper_start = training_end - pd.Timedelta(weeks=2)
in_taper = df['WorkoutDay'] >= taper_start
taper_data = df.loc[in_taper]

taper_type_counts = taper_data['WorkoutType'].value_counts()
taper_mean_tss = taper_data['TSS'].mean()
taper_mean_if = taper_data['IF'].mean()

print("\nTaper period analysis:")
print(taper_type_counts)
print(f"Average TSS during taper: {taper_mean_tss:.2f}")
print(f"Average IF during taper: {taper_mean_if:.2f}")

# Weekly training load: label each workout with its week and total the TSS per week.
df['Week'] = df['WorkoutDay'].dt.to_period('W').astype(str)
weekly_load = df.groupby('Week', as_index=False)['TSS'].sum()

fig = px.line(
    weekly_load,
    x='Week',
    y='TSS',
    color_discrete_sequence=[color_mapping['Zone 5 (90%-100%)']],
    title='Weekly Training Load (TSS)',
).update_layout(xaxis_title='Week', yaxis_title='TSS')
fig.show()

# Pie chart of how many workouts there are of each type.
pie_options = {
    'values': workout_types.values,
    'names': workout_types.index,
    'color_discrete_sequence': color_list,
    'title': 'Workout Type Distribution',
}
fig = px.pie(workout_types, **pie_options)
fig.show()

# TSS (left axis) and IF (right axis) plotted against the workout day.
fig = make_subplots(specs=[[{"secondary_y": True}]])
dual_axis_series = (
    ('TSS', 'Zone 1 (50%-60%)', False),
    ('IF', 'Zone 5 (90%-100%)', True),
)
for metric_name, zone_key, use_secondary_axis in dual_axis_series:
    metric_trace = go.Scatter(
        x=df['WorkoutDay'],
        y=df[metric_name],
        mode='markers',
        name=metric_name,
        marker={'color': color_mapping[zone_key]},
    )
    fig.add_trace(metric_trace, secondary_y=use_secondary_axis)
fig.update_layout(title='TSS and IF over Time', xaxis_title='Date')
for metric_name, _, use_secondary_axis in dual_axis_series:
    fig.update_yaxes(title_text=metric_name, secondary_y=use_secondary_axis)
fig.show()

print("\nAnalysis complete. Please check the generated plots and statistics.")

Training period: 148 days
Total training volume: 241.92 hours
Average TSS: 44.90
Average IF: 0.56

Workout type distribution:
WorkoutType
Run         88
Bike        75
Swim        72
Other       51
Strength    21
Day Off     20
Brick        2
Name: count, dtype: int64



Taper period analysis:
WorkoutType
Run        10
Bike       10
Swim        9
Other       6
Day Off     3
Brick       1
Name: count, dtype: int64
Average TSS during taper: 86.20
Average IF during taper: 0.61



Analysis complete. Please check the generated plots and statistics.
