In [12]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Locations of the two exports being compared
wahoo_data_path = '/Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/wahoo/wahoo_data.csv'
apple_health_cycling_path = '/Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/apple_health/health_data_exported/HKWorkoutActivityTypeCycling_2024-04-99_18-31-36_SimpleHealthExportCSV.csv'

# Load the Wahoo export and parse its 'Date' column to datetime in one chained step
wahoo_data = pd.read_csv(wahoo_data_path).assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)

# Load the Apple Health export; skiprows=1 drops the first line of the file
# (a metadata row) so the real header is read, then 'startDate' is parsed
apple_health_data = pd.read_csv(apple_health_cycling_path, skiprows=1).assign(
    startDate=lambda frame: pd.to_datetime(frame['startDate'])
)

# Workout counts are the number of distinct calendar days present in each source
wahoo_workouts = wahoo_data['Date'].dt.date.nunique()
apple_health_workouts = apple_health_data['startDate'].dt.date.nunique()

# One row per data source, in the order they should appear on the chart
workout_data = pd.DataFrame(
    [
        ('Wahoo', wahoo_workouts),
        ('Apple Health', apple_health_workouts),
    ],
    columns=['Source', 'Workout Count'],
)

# Bar chart of the counts; each bar is labelled with its value and coloured by source
bar_options = dict(
    x='Source',
    y='Workout Count',
    title='Number of Cycling Workouts by Source',
    text='Workout Count',
    color='Source',
)
fig = px.bar(workout_data, **bar_options)

# Render the figure
fig.show()


In [22]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from IPython.display import display, HTML

# Input files: the Wahoo ride log and the Apple Health cycling export
wahoo_data_path = '/Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/wahoo/wahoo_data.csv'
apple_health_cycling_path = '/Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/apple_health/health_data_exported/HKWorkoutActivityTypeCycling_2024-04-99_18-31-36_SimpleHealthExportCSV.csv'

# Palette shared by every figure in this cell, keyed by zone label
color_mapping = {
    'Zone 1 (50%-60%)': '#e63946',
    'Zone 2 (60%-70%)': '#a8dadc',
    'Zone 3 (70%-80%)': '#90EE90',
    'Zone 4 (80%-90%)': '#457b9d',
    'Zone 5 (90%-100%)': '#1d3557',
}

# Wahoo: read the CSV and parse 'Date' to datetime
wahoo_data = pd.read_csv(wahoo_data_path).assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)

# Apple Health: skip the first line of the file, parse 'startDate', and force
# 'totalEnergyBurned' to numeric (values that cannot be parsed become NaN)
apple_health_data = pd.read_csv(apple_health_cycling_path, skiprows=1).assign(
    startDate=lambda frame: pd.to_datetime(frame['startDate']),
    totalEnergyBurned=lambda frame: pd.to_numeric(frame['totalEnergyBurned'], errors='coerce'),
)

# --- Summary Statistics ---
# Builds one table comparing headline totals/averages from both sources.

# Convert the whole Wahoo 'Duration' column to hours in one vectorised step
wahoo_duration_hours = pd.to_timedelta(wahoo_data['Duration']).dt.total_seconds() / 3600

# Wahoo Summary
wahoo_summary = {
    # Count distinct calendar days, the same definition used for Apple Health
    # below, so the two "Total Workouts" figures are directly comparable.
    "Total Workouts": wahoo_data['Date'].dt.date.nunique(),
    "Total Duration (hours)": wahoo_duration_hours.sum(),
    "Average Power (W)": wahoo_data['Avg Power'].mean(),
    "Average Cadence (RPM)": wahoo_data['Avg Cadence'].mean(),
    "Average Speed (km/h)": wahoo_data['Avg Speed'].mean(),
}

# Apple Health Summary
apple_health_summary = {
    "Total Workouts": apple_health_data['startDate'].dt.date.nunique(),
    # min_count=1 yields NaN (shown as missing) when no calorie value is
    # numeric, instead of a misleading total of 0.0 kcal.
    "Total Calories Burned (kcal)": apple_health_data['totalEnergyBurned'].sum(min_count=1),
    # NOTE(review): dividing by 3600 assumes 'duration' is in seconds -- confirm against the export.
    "Total Duration (hours)": apple_health_data['duration'].sum() / 3600,
}

# Turn each summary dict into a two-column (Metric, value) table
wahoo_summary_df = pd.DataFrame(list(wahoo_summary.items()), columns=["Metric", "Wahoo Stats"])
apple_health_summary_df = pd.DataFrame(list(apple_health_summary.items()), columns=["Metric", "Apple Health Stats"])

# Outer merge keeps metrics that exist for only one source (the other side is NaN)
summary_df = pd.merge(wahoo_summary_df, apple_health_summary_df, on="Metric", how="outer")

# Render the combined table as HTML in the notebook
display(HTML(summary_df.to_html(index=False)))

# --- Visualization 1: Workouts Over Time ---
# Bucket each source by calendar month. The timezone is removed first (keeping
# local wall time): to_period() would discard it anyway and emit
# "Converting to PeriodArray/Index representation will drop timezone information."
wahoo_months = wahoo_data['Date'].dt.tz_localize(None).dt.to_period('M')
apple_health_months = apple_health_data['startDate'].dt.tz_localize(None).dt.to_period('M')

# Number of rows (workouts) recorded per month in each source
wahoo_workouts_per_month = wahoo_data.groupby(wahoo_months).size()
apple_health_workouts_per_month = apple_health_data.groupby(apple_health_months).size()

# Use the union of months from BOTH sources so months that only appear in
# Apple Health are not dropped; a source with no workouts in a month gets 0.
all_months = wahoo_workouts_per_month.index.union(apple_health_workouts_per_month.index)

# Create a combined dataframe for plotting
workouts_per_month = pd.DataFrame({
    'Month': all_months.astype(str),  # Convert to string for plotting
    'Wahoo Workouts': wahoo_workouts_per_month.reindex(all_months, fill_value=0).values,
    'Apple Health Workouts': apple_health_workouts_per_month.reindex(all_months, fill_value=0).values
})

# One line per source, coloured with the first two palette entries
fig_workouts_time = px.line(workouts_per_month, x='Month', y=['Wahoo Workouts', 'Apple Health Workouts'],
                            title='Workouts Performed Over Time', markers=True,
                            color_discrete_sequence=[color_mapping['Zone 1 (50%-60%)'], color_mapping['Zone 2 (60%-70%)']])

# Update axis labels and titles
fig_workouts_time.update_layout(
    xaxis_title="Month",
    yaxis_title="Number of Workouts",
    legend_title_text='Workout Source',
    title_x=0.5,
    template="simple_white"
)

# --- Visualization 2: Distribution of Power, Speed, and Cadence ---
fig_distribution = go.Figure()

# (column in wahoo_data, legend label, palette key) for each overlaid histogram,
# listed in the order the traces are added
histogram_specs = [
    ('Avg Power', 'Power (W)', 'Zone 5 (90%-100%)'),
    ('Avg Speed', 'Speed (km/h)', 'Zone 4 (80%-90%)'),
    ('Avg Cadence', 'Cadence (RPM)', 'Zone 3 (70%-80%)'),
]

for column, trace_name, zone in histogram_specs:
    fig_distribution.add_trace(
        go.Histogram(
            x=wahoo_data[column],
            name=trace_name,
            opacity=0.75,  # partially transparent so overlapping bars stay visible
            marker={'color': color_mapping[zone]},
        )
    )

# Overlay the three histograms on shared axes and label them
fig_distribution.update_layout(
    template="simple_white",
    barmode='overlay',
    title_text='Distribution of Power, Speed, and Cadence',
    xaxis_title="Metric Value",
    yaxis_title="Count",
)

# --- Visualization 3: Heatmap for Workout Intensity (Average Power by Date and Duration) ---
# Workout duration in hours, converted for the whole column at once; used as the Y-axis
wahoo_data['Duration (hrs)'] = pd.to_timedelta(wahoo_data['Duration']).dt.total_seconds() / 3600

# Palette ordered from Zone 5 (lowest values) to Zone 1 (highest values)
heatmap_colorscale = [
    color_mapping['Zone 5 (90%-100%)'],
    color_mapping['Zone 4 (80%-90%)'],
    color_mapping['Zone 3 (70%-80%)'],
    color_mapping['Zone 2 (60%-70%)'],
    color_mapping['Zone 1 (50%-60%)'],
]

# histfunc='avg' makes each cell show the MEAN of 'Avg Power' for the workouts
# in that (date, duration) bin. When z is supplied, density_heatmap otherwise
# sums z, so bins with several workouts would look artificially intense and
# contradict the "Average Power" title.
fig_heatmap = px.density_heatmap(wahoo_data, x='Date', y='Duration (hrs)', z='Avg Power',
                                 histfunc='avg',
                                 title="Workout Intensity Heatmap (Average Power vs Duration)",
                                 labels={'Duration (hrs)': 'Workout Duration (hrs)', 'Avg Power': 'Avg Power (Watts)'},
                                 color_continuous_scale=heatmap_colorscale)

# Update layout for clarity
fig_heatmap.update_layout(
    xaxis_title="Date",
    yaxis_title="Workout Duration (hrs)",
    title_x=0.5,
    template="simple_white"
)

# --- Visualization 4: Compare Average Power Across Months ---
# Tag every Wahoo row with its year-month as a string (e.g. for use as a plot axis)
wahoo_data['Month'] = wahoo_data['Date'].dt.to_period('M').astype(str)

# Mean of 'Avg Power' per month, returned as a two-column frame (Month, Avg Power)
monthly_avg_power = wahoo_data.groupby('Month', as_index=False)['Avg Power'].mean()

# Single line with markers in the Zone 4 colour, with axis labels and a centred title
fig_avg_power = px.line(
    monthly_avg_power,
    x='Month',
    y='Avg Power',
    title='Average Power Performed Each Month',
    markers=True,
    line_shape='linear',
    color_discrete_sequence=[color_mapping['Zone 4 (80%-90%)']],
).update_layout(
    template="simple_white",
    title_x=0.5,
    xaxis_title="Month",
    yaxis_title="Average Power (W)",
)

# Render the four figures built above, in the order they were created
for figure in (fig_workouts_time, fig_distribution, fig_heatmap, fig_avg_power):
    figure.show()

Metric,Wahoo Stats,Apple Health Stats
Average Cadence (RPM),75.625,
Average Power (W),121.081081,
Average Speed (km/h),15.107895,
Total Calories Burned (kcal),,0.0
Total Duration (hours),25.931389,73.0598
Total Workouts,38.0,34.0



Converting to PeriodArray/Index representation will drop timezone information.



In [32]:
import pandas as pd
import plotly.express as px

# Read the Wahoo cycling export
wahoo_data_path = '/Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/wahoo/wahoo_data.csv'
cycling_data = pd.read_csv(wahoo_data_path)

# 1. Bar Graph: Count of Different Cycling Activities
# value_counts() returns one entry per distinct 'Activity Name', most frequent first
activity_counts = cycling_data.loc[:, 'Activity Name'].value_counts()

# Bars use the activity names as x and their counts as y; the x tick labels are
# rotated to vertical via xaxis_tickangle=-90
fig1 = px.bar(
    activity_counts,
    x=activity_counts.index,
    y=activity_counts.values,
    title='Count of Different Cycling Activities',
    labels={'x': 'Activity Name', 'y': 'Count'},
).update_layout(xaxis_tickangle=-90)

# Render the first plot
fig1.show()

# Derive TSS only when the export does not already provide a 'TSS' column
if 'TSS' not in cycling_data.columns:
    # Requires 'Duration' and 'Normalized Power' columns.
    FTP = 250  # Example functional threshold power (W); adjust to your actual FTP
    cycling_data['Duration (hours)'] = pd.to_timedelta(cycling_data['Duration']).dt.total_seconds() / 3600
    # Intensity Factor: normalized power relative to FTP
    cycling_data['IF'] = cycling_data['Normalized Power'] / FTP
    # TSS = (seconds * NP * IF) / (FTP * 3600) * 100
    #     = hours * NP * IF / FTP * 100
    # The factor of 100 multiplies the ratio (one hour at FTP scores 100);
    # dividing by it would shrink every score by a factor of 10,000.
    cycling_data['TSS'] = (cycling_data['Duration (hours)'] * cycling_data['Normalized Power'] * cycling_data['IF']) / FTP * 100

# 2. Bar Graph: one bar per row of cycling_data, with height and bar label taken from 'TSS'.
# No x column is given, so the bars are placed by the frame's index.
fig2 = px.bar(
    cycling_data,
    y='TSS',
    text='TSS',
    title='TSS Values for Each Activity',
    labels=dict([('index', 'Activity'), ('TSS', 'Training Stress Score')]),
).update_layout(
    showlegend=False,
    xaxis=dict(title='Activity Index'),
    yaxis=dict(title='Training Stress Score'),
)

# Render the second plot
fig2.show()

# 3. Descriptive statistics of the 'TSS' column, as produced by Series.describe()
summary_stats = cycling_data['TSS'].describe()

# Print a heading followed by the statistics on the next line
print("Summary Statistics for TSS:", summary_stats, sep="\n")

# 4. Histogram of 'TSS', with nbins=10 and the axis relabelled
histogram_options = dict(
    x='TSS',
    nbins=10,
    title='Distribution of Training Stress Score (TSS)',
    labels={'TSS': 'Training Stress Score'},
)
fig3 = px.histogram(cycling_data, **histogram_options)

# Render the distribution plot
fig3.show()

# 5. Scatter of TSS against IF with an OLS trendline; skipped when there is no 'IF' column
if 'IF' in cycling_data:
    scatter_options = dict(
        x='IF',
        y='TSS',
        trendline='ols',
        title='TSS vs Intensity Factor (IF)',
        labels={'IF': 'Intensity Factor', 'TSS': 'Training Stress Score'},
    )
    fig4 = px.scatter(cycling_data, **scatter_options)
    fig4.show()


Summary Statistics for TSS:
count     38.000000
mean      44.526316
std       38.694044
min        0.000000
25%       14.000000
50%       40.000000
75%       61.750000
max      156.000000
Name: TSS, dtype: float64


In [35]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Read the Wahoo cycling export and parse its 'Date' column to datetime
wahoo_data_path = '/Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/wahoo/wahoo_data.csv'
cycling_data = pd.read_csv(wahoo_data_path).assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)

# 1. Correlation matrix of key metrics, drawn with the custom palette
# Candidate metrics; only those actually present as columns are used
columns_to_check = ['TSS', 'IF', 'Avg Power', 'Normalized Power']
available_columns = [name for name in columns_to_check if name in cycling_data.columns]

# Zone palette, declared in Zone 1 -> Zone 5 order
color_mapping = {
    'Zone 1 (50%-60%)': '#e63946',
    'Zone 2 (60%-70%)': '#a8dadc',
    'Zone 3 (70%-80%)': '#90EE90',
    'Zone 4 (80%-90%)': '#457b9d',
    'Zone 5 (90%-100%)': '#1d3557',
}

# Dicts preserve insertion order, so this is the Zone 1 -> Zone 5 colour list
colorscale = list(color_mapping.values())

if not available_columns:
    print("None of the specified columns for correlation analysis are available.")
else:
    # Pairwise correlations between the available metrics
    correlation_matrix = cycling_data[available_columns].corr()

    # Print a heading followed by the matrix
    print("Correlation Matrix:", correlation_matrix, sep="\n")

    # Show the matrix as an image coloured with the custom scale
    fig_corr = px.imshow(
        correlation_matrix,
        color_continuous_scale=colorscale,
        title='Correlation Matrix of Key Metrics',
    )
    fig_corr.show()

# 2. Improved Duration of Workouts Over Time Plot
# Convert 'Duration' to hours
cycling_data['Duration (hours)'] = pd.to_timedelta(cycling_data['Duration']).dt.total_seconds() / 3600

fig_duration = go.Figure()

# One trace per activity name. sort=False keeps the legend in order of first
# appearance in the data; rows without an activity name are not plotted.
for activity, activity_data in cycling_data.groupby('Activity Name', sort=False):
    # Plot chronologically so a line never doubles back on itself
    activity_data = activity_data.sort_values('Date')
    # 'lines+markers' so an activity with a single workout is still visible
    # (a lines-only trace with one point draws nothing)
    fig_duration.add_trace(go.Scatter(x=activity_data['Date'], y=activity_data['Duration (hours)'],
                                      mode='lines+markers', name=activity, opacity=0.6))

# Label the axes and leave 0.5 h of headroom above the longest workout
fig_duration.update_layout(title='Workout Duration Over Time',
                           xaxis_title='Date',
                           yaxis_title='Duration (hours)',
                           yaxis_range=[0, cycling_data['Duration (hours)'].max() + 0.5],
                           legend_title_text='Activity Name')

fig_duration.show()


Correlation Matrix:
                       TSS        IF  Avg Power  Normalized Power
TSS               1.000000  0.647737   0.558440          0.608565
IF                0.647737  1.000000   0.762464          0.958397
Avg Power         0.558440  0.762464   1.000000          0.876093
Normalized Power  0.608565  0.958397   0.876093          1.000000
