In [2]:
import pandas as pd
import plotly.graph_objects as go
import os
import statsmodels.api as sm
from tabulate import tabulate

# Set the file paths
myfitnesspal_path = '/Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/myfitnesspal/'
base_path = '/Users/ngirmay/Documents/GitHub/ironman_retrospective/apple_health/health_data_exported/'
body_mass_file = os.path.join(base_path, 'HKQuantityTypeIdentifierBodyMass_2024-04-99_18-30-56_SimpleHealthExportCSV.csv')

# Read the CSV files
nutrition_df = pd.read_csv(os.path.join(myfitnesspal_path, 'Nutrition-Summary-2016-03-13-to-2024-01-19.csv'))
exercise_df = pd.read_csv(os.path.join(myfitnesspal_path, 'Exercise-Summary-2016-03-13-to-2024-01-19.csv'))
body_mass_df = pd.read_csv(body_mass_file, skiprows=1)  # Skip the first row as it contains 'sep=,'

# Convert date columns to datetime and ensure UTC timezone
nutrition_df['Date'] = pd.to_datetime(nutrition_df['Date'], utc=True)
exercise_df['Date'] = pd.to_datetime(exercise_df['Date'], utc=True)
body_mass_df['startDate'] = pd.to_datetime(body_mass_df['startDate'], utc=True)

# Rename columns and select relevant ones for body mass data
body_mass_df = body_mass_df[['startDate', 'value']].rename(columns={'startDate': 'Date', 'value': 'Weight'})

# Group nutrition data by date and sum the values
nutrition_daily = nutrition_df.groupby('Date').agg({
    'Calories': 'sum',
    'Protein (g)': 'sum',
    'Carbohydrates (g)': 'sum',
    'Fat (g)': 'sum'
}).reset_index()

# Merge nutrition data with body mass data
combined_df = pd.merge(body_mass_df, nutrition_daily, on='Date', how='outer')

# Merge exercise data
exercise_daily = exercise_df.groupby('Date').agg({
    'Exercise Calories': 'sum',
    'Exercise Minutes': 'sum'
}).reset_index()

combined_df = pd.merge(combined_df, exercise_daily, on='Date', how='outer')

# Sort by date
combined_df = combined_df.sort_values('Date')

# Fill NaN values with forward fill method
combined_df = combined_df.ffill()

# Calculate net calories (intake - exercise)
combined_df['Net Calories'] = combined_df['Calories'] - combined_df['Exercise Calories'].fillna(0)

# Filter data for Ironman training period
start_date = '2022-12-01'
end_date = '2023-07-23'
ironman_df = combined_df[(combined_df['Date'] >= start_date) & (combined_df['Date'] <= end_date)]

# Function to save and display plotly figure
def save_and_display_plot(fig, filename):
    html_path = os.path.join(myfitnesspal_path, filename)
    fig.write_html(html_path)
    print(f"Plot saved as {html_path}")
    fig.show()

# 1. Weight and Net Calories Over Time
fig = go.Figure()

# Add Weight trace
fig.add_trace(go.Scatter(
    x=ironman_df['Date'],
    y=ironman_df['Weight'],
    mode='lines+markers',
    name='Weight',
    line=dict(color='#457b9d')
))

# Add Net Calories trace on secondary y-axis
fig.add_trace(go.Scatter(
    x=ironman_df['Date'],
    y=ironman_df['Net Calories'],
    mode='lines+markers',
    name='Net Calories',
    line=dict(color='#e63946'),
    yaxis='y2'
))

# Update layout
fig.update_layout(
    title="Weight and Net Calories During Ironman Training",
    xaxis=dict(title='Date'),
    yaxis=dict(
        title='Weight (lbs)',
        titlefont=dict(color='#457b9d'),
        tickfont=dict(color='#457b9d')
    ),
    yaxis2=dict(
        title='Net Calories',
        titlefont=dict(color='#e63946'),
        tickfont=dict(color='#e63946'),
        overlaying='y',
        side='right'
    ),
    legend=dict(x=1.1, y=1),
    plot_bgcolor='#f1faee'
)

save_and_display_plot(fig, 'ironman_weight_and_calories_over_time.html')

# 2. Macronutrient Distribution
color_mapping = {
    'Protein': '#e63946',
    'Carbohydrates': '#a8dadc',
    'Fat': '#457b9d'
}

macronutrient_data = [
    ('Protein', ironman_df['Protein (g)'].mean()),
    ('Carbohydrates', ironman_df['Carbohydrates (g)'].mean()),
    ('Fat', ironman_df['Fat (g)'].mean())
]

fig = go.Figure(data=[go.Pie(
    labels=[item[0] for item in macronutrient_data],
    values=[item[1] for item in macronutrient_data],
    marker=dict(colors=[color_mapping[item[0]] for item in macronutrient_data]),
    textinfo='label+percent',
    hole=.3
)])

fig.update_layout(
    title="Average Macronutrient Distribution During Ironman Training",
    plot_bgcolor='#f1faee'
)

save_and_display_plot(fig, 'ironman_macronutrient_distribution.html')

# Prepare summary statistics
summary_stats = pd.DataFrame({
    'Metric': ['Total days', 'Average daily calorie intake', 'Average daily net calories',
               'Average daily exercise calories', 'Average daily exercise minutes',
               'Starting weight', 'Ending weight', 'Total weight change',
               'Average daily protein intake', 'Average daily carbohydrate intake',
               'Average daily fat intake'],
    'Value': [
        len(ironman_df),
        f"{ironman_df['Calories'].mean():.2f}",
        f"{ironman_df['Net Calories'].mean():.2f}",
        f"{ironman_df['Exercise Calories'].mean():.2f}",
        f"{ironman_df['Exercise Minutes'].mean():.2f}",
        f"{ironman_df['Weight'].iloc[0]:.2f} lbs",
        f"{ironman_df['Weight'].iloc[-1]:.2f} lbs",
        f"{ironman_df['Weight'].iloc[-1] - ironman_df['Weight'].iloc[0]:.2f} lbs",
        f"{ironman_df['Protein (g)'].mean():.2f} g",
        f"{ironman_df['Carbohydrates (g)'].mean():.2f} g",
        f"{ironman_df['Fat (g)'].mean():.2f} g"
    ]
})

exercise_days = ironman_df['Exercise Calories'].notna().sum()
total_days = len(ironman_df)
exercise_percentage = (exercise_days / total_days) * 100

# Add the exercise percentage to the summary stats using concat instead of append
summary_stats = pd.concat([summary_stats, pd.DataFrame({
    'Metric': ['Percentage of days with recorded exercise'],
    'Value': [f"{exercise_percentage:.2f}%"]
})], ignore_index=True)

print("\nIronman Training Period Analysis (Dec 1, 2022 - Jul 23, 2023):")
print(tabulate(summary_stats, headers='keys', tablefmt='pretty', showindex=False))

# Calculate correlations
correlations = ironman_df[['Weight', 'Net Calories', 'Protein (g)', 'Carbohydrates (g)', 'Fat (g)', 'Exercise Calories']].corr()
correlation_table = correlations['Weight'].sort_values(ascending=False).reset_index()
correlation_table.columns = ['Variable', 'Correlation with Weight']

print("\nCorrelations with Weight:")
print(tabulate(correlation_table, headers='keys', tablefmt='pretty', showindex=False, floatfmt='.4f'))

# Statistical Analysis: Multiple Linear Regression
print("\nMultiple Linear Regression Analysis:")

# Prepare the data
X = ironman_df[['Net Calories', 'Protein (g)', 'Carbohydrates (g)', 'Fat (g)', 'Exercise Calories']]
y = ironman_df['Weight']

# Add a constant term to the independent variables
X = sm.add_constant(X)

# Fit the model
model = sm.OLS(y, X).fit()

# Prepare regression results table
regression_results = pd.DataFrame({
    'Variable': model.model.exog_names,
    'Coefficient': model.params,
    'P-value': model.pvalues,
    'Std Error': model.bse
})

print("\nRegression Results:")
print(tabulate(regression_results, headers='keys', tablefmt='pretty', showindex=False, floatfmt='.4f'))

print(f"\nR-squared: {model.rsquared:.4f}")
print(f"Adjusted R-squared: {model.rsquared_adj:.4f}")

print("\nInterpretation of the Multiple Linear Regression Results:")
print("1. R-squared value indicates the proportion of the variance in Weight that is predictable from the independent variables.")
print("2. Variables with p-values < 0.05 are considered statistically significant.")
print("3. Coefficients represent the change in Weight for a one-unit change in the respective variable, holding other variables constant.")
print("\nNote: This analysis assumes a linear relationship and independence of observations, which may not hold perfectly for time-series data.")

Plot saved as /Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/myfitnesspal/ironman_weight_and_calories_over_time.html


Plot saved as /Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/myfitnesspal/ironman_macronutrient_distribution.html



Ironman Training Period Analysis (Dec 1, 2022 - Jul 23, 2023):
+-------------------------------------------+------------+
|                  Metric                   |   Value    |
+-------------------------------------------+------------+
|                Total days                 |    111     |
|       Average daily calorie intake        |  2191.40   |
|        Average daily net calories         |  2046.40   |
|      Average daily exercise calories      |   145.00   |
|      Average daily exercise minutes       |   45.00    |
|              Starting weight              | 167.20 lbs |
|               Ending weight               | 156.60 lbs |
|            Total weight change            | -10.60 lbs |
|       Average daily protein intake        |  152.17 g  |
|     Average daily carbohydrate intake     |  269.74 g  |
|         Average daily fat intake          |  56.55 g   |
| Percentage of days with recorded exercise |  100.00%   |
+-------------------------------------------+------

In [30]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
from scipy import stats
from scipy.stats import f_oneway
from tabulate import tabulate

# Load the sleep dataset
file_path = '/Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/diet_and_sleep/sleepdata.csv'
sleep_data = pd.read_csv(file_path)

# Data preparation
sleep_data['Start'] = pd.to_datetime(sleep_data['Start'])
sleep_data['End'] = pd.to_datetime(sleep_data['End'])
sleep_data['Sleep Duration (hours)'] = (sleep_data['End'] - sleep_data['Start']).dt.total_seconds() / 3600
sleep_data_clean = sleep_data[(sleep_data['Sleep Duration (hours)'] >= 3) & (sleep_data['Sleep Duration (hours)'] <= 20)]
sleep_data_clean['Sleep Quality'] = sleep_data_clean['Sleep Quality'].str.rstrip('%').astype('float') / 100
sleep_data_clean['Regularity'] = sleep_data_clean['Regularity'].str.rstrip('%').astype('float') / 100
sleep_data_clean['Movements per Hour'] = sleep_data_clean['Steps'] / sleep_data_clean['Sleep Duration (hours)']
sleep_data_clean = sleep_data_clean.sort_values('Start')
sleep_data_clean['Sleep Quality 7-day Avg'] = sleep_data_clean['Sleep Quality'].rolling(window=7).mean()
sleep_data_clean['Sleep Duration 7-day Avg'] = sleep_data_clean['Sleep Duration (hours)'].rolling(window=7).mean()

# Modified bedtime calculation
sleep_data_clean['Bedtime'] = sleep_data_clean['Start'].dt.hour + sleep_data_clean['Start'].dt.minute / 60
sleep_data_clean.loc[sleep_data_clean['Bedtime'] < 12, 'Bedtime'] += 24  # Adjust times after midnight

# Handling missing data in 'Snore time'
sleep_data_clean['Snore time'] = sleep_data_clean['Snore time'].fillna(0)

# Create Duration Range column
def categorize_duration(hours):
    if hours < 6:
        return 'Short (<6h)'
    elif 6 <= hours < 7:
        return 'Normal (6-7h)'
    elif 7 <= hours < 8:
        return 'Good (7-8h)'
    else:
        return 'Long (>8h)'

sleep_data_clean['Duration Range'] = sleep_data_clean['Sleep Duration (hours)'].apply(categorize_duration)

# Color mapping
color_mapping = {
    'Zone 1 (50%-60%)': '#e63946',
    'Zone 2 (60%-70%)': '#a8dadc',
    'Zone 3 (70%-80%)': '#90EE90',
    'Zone 4 (80%-90%)': '#457b9d',
    'Zone 5 (90%-100%)': '#1d3557'
}

# Create subplots
fig = make_subplots(rows=3, cols=2, subplot_titles=(
    "Sleep Quality vs. Duration",
    "Sleep Patterns Over Time",
    "Movements per Hour vs. Sleep Duration",
    "Snore Time vs. Sleep Duration",
    "Sleep Quality Distribution",
    "Bedtime vs Sleep Quality"
), vertical_spacing=0.1, horizontal_spacing=0.1)

# 1. Sleep Quality vs. Duration with trendline
x = sleep_data_clean['Sleep Duration (hours)']
y = sleep_data_clean['Sleep Quality']
z = np.polyfit(x, y, 1)
p = np.poly1d(z)

fig.add_trace(go.Scatter(
    x=x, y=y, 
    mode='markers',
    marker=dict(size=8, color=color_mapping['Zone 4 (80%-90%)'], opacity=0.6),
    name="Sleep Quality"
), row=1, col=1)

fig.add_trace(go.Scatter(
    x=x, y=p(x),
    mode='lines',
    line=dict(color=color_mapping['Zone 1 (50%-60%)'], width=2),
    name="Trendline"
), row=1, col=1)

# 2. Sleep Patterns Over Time (Enhanced)
fig.add_trace(go.Scatter(
    x=sleep_data_clean['Start'], 
    y=sleep_data_clean['Sleep Duration 7-day Avg'],
    mode='lines',
    line=dict(color=color_mapping['Zone 3 (70%-80%)'], width=2),
    name="Sleep Duration (7-day Avg)"
), row=1, col=2)

fig.add_trace(go.Scatter(
    x=sleep_data_clean['Start'], 
    y=sleep_data_clean['Sleep Quality 7-day Avg'],
    mode='lines',
    line=dict(color=color_mapping['Zone 5 (90%-100%)'], width=2),
    name="Sleep Quality (7-day Avg)",
    yaxis='y2'
), row=1, col=2)

# 3. Movements per Hour vs. Sleep Duration
fig.add_trace(go.Scatter(
    x=sleep_data_clean['Sleep Duration (hours)'], 
    y=sleep_data_clean['Movements per Hour'], 
    mode='markers',
    marker=dict(size=8, color=color_mapping['Zone 2 (60%-70%)'], opacity=0.6),
    name="Movements per Hour"
), row=2, col=1)

# 4. Snore Time vs. Sleep Duration
fig.add_trace(go.Scatter(
    x=sleep_data_clean['Sleep Duration (hours)'], 
    y=sleep_data_clean['Snore time'], 
    mode='markers',
    marker=dict(size=8, color=color_mapping['Zone 1 (50%-60%)'], opacity=0.6),
    name="Snore Time"
), row=2, col=2)

# 5. Sleep Quality Distribution
fig.add_trace(go.Histogram(
    x=sleep_data_clean['Sleep Quality'],
    nbinsx=20,
    marker_color=color_mapping['Zone 4 (80%-90%)'],
    name="Sleep Quality Distribution"
), row=3, col=1)

# 6. Bedtime vs Sleep Quality (Updated)
fig.add_trace(go.Scatter(
    x=sleep_data_clean['Bedtime'],
    y=sleep_data_clean['Sleep Quality'],
    mode='markers',
    marker=dict(
        size=8,
        color=sleep_data_clean['Sleep Quality'],
        colorscale='Viridis',
        showscale=True,
        colorbar=dict(title="Sleep Quality", y=0.15, len=0.3)
    ),
    name="Bedtime vs Sleep Quality"
), row=3, col=2)

# Update layout
fig.update_layout(
    height=2100,
    width=1200, 
    title_text="Enhanced Sleep Data Analysis",
    title_y=0.98,
    template="plotly_white",
    showlegend=True,
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="center",
        x=0.5,
        bgcolor="rgba(255,255,255,0.8)",
        bordercolor="Black",
        borderwidth=1
    ),
    margin=dict(t=150)
)

# Update axes
fig.update_xaxes(title_text="Sleep Duration (hours)", row=1, col=1, range=[3, 12], gridcolor='lightgray')
fig.update_yaxes(title_text="Sleep Quality", row=1, col=1, range=[0, 1.2], gridcolor='lightgray')
fig.update_xaxes(title_text="Date", row=1, col=2, gridcolor='lightgray')
fig.update_yaxes(title_text="Sleep Duration (hours)", row=1, col=2, gridcolor='lightgray', range=[0, 10])
fig.update_yaxes(title_text="Sleep Quality", secondary_y=True, row=1, col=2, gridcolor='lightgray', range=[0, 1], overlaying='y', side='right')
fig.update_xaxes(title_text="Sleep Duration (hours)", row=2, col=1, range=[3, 12], gridcolor='lightgray')
fig.update_yaxes(title_text="Movements per Hour", row=2, col=1, gridcolor='lightgray')
fig.update_xaxes(title_text="Sleep Duration (hours)", row=2, col=2, range=[3, 12], gridcolor='lightgray')
fig.update_yaxes(title_text="Snore Time (seconds)", row=2, col=2, gridcolor='lightgray')
fig.update_xaxes(title_text="Sleep Quality", row=3, col=1, gridcolor='lightgray')
fig.update_yaxes(title_text="Frequency", row=3, col=1, gridcolor='lightgray')
fig.update_xaxes(
    title_text="Bedtime (hour of day)", 
    row=3, col=2, 
    gridcolor='lightgray', 
    range=[18, 30],  # This will show from 6 PM to 6 AM
    tickvals=[18, 20, 22, 24, 26, 28, 30], 
    ticktext=['6PM', '8PM', '10PM', '12AM', '2AM', '4AM', '6AM']
)
fig.update_yaxes(title_text="Sleep Quality", row=3, col=2, gridcolor='lightgray', range=[0, 1])

# Show the figure
fig.show()

# Function to format float values to 2 decimal places
def format_float(value):
    return f"{value:.2f}"

# Calculate summary statistics
summary_stats = sleep_data_clean[['Sleep Duration (hours)', 'Sleep Quality', 'Regularity', 'Movements per Hour', 'Snore time']].describe()
summary_stats = summary_stats.applymap(format_float)

# Create a formatted table for summary statistics
print("\nSummary Statistics:")
print(tabulate(summary_stats, headers='keys', tablefmt='pretty'))

# Additional statistical analysis
correlations = pd.DataFrame({
    'Correlation': ['Sleep Duration vs. Sleep Quality', 'Sleep Quality vs. Movements per Hour'],
    'Value': [
        format_float(sleep_data_clean['Sleep Duration (hours)'].corr(sleep_data_clean['Sleep Quality'])),
        format_float(sleep_data_clean['Sleep Quality'].corr(sleep_data_clean['Movements per Hour']))
    ]
})

print("\nCorrelations:")
print(tabulate(correlations, headers='keys', tablefmt='pretty', showindex=False))

# Make sure 'Duration Range' column exists
if 'Duration Range' in sleep_data_clean.columns:
    avg_sleep_quality = sleep_data_clean.groupby('Duration Range')['Sleep Quality'].mean().reset_index()
    avg_sleep_quality.columns = ['Duration Range', 'Average Sleep Quality']
    avg_sleep_quality['Average Sleep Quality'] = avg_sleep_quality['Average Sleep Quality'].apply(format_float)

    print("\nAverage Sleep Quality for Different Sleep Duration Ranges:")
    print(tabulate(avg_sleep_quality, headers='keys', tablefmt='pretty', showindex=False))

    # ANOVA test: Sleep Quality across different Sleep Duration ranges
    groups = sleep_data_clean.groupby('Duration Range')['Sleep Quality']
    anova_result = f_oneway(*[group for name, group in groups])

    anova_table = pd.DataFrame({
        'Statistic': ['F-statistic', 'p-value'],
        'Value': [format_float(anova_result.statistic), format_float(anova_result.pvalue)]
    })

    print("\nANOVA Result for Sleep Quality across Sleep Duration Ranges:")
    print(tabulate(anova_table, headers='keys', tablefmt='pretty', showindex=False))
else:
    print("\nWarning: 'Duration Range' column not found. Skipping related analyses.")

# Identify nights with best and worst sleep quality
best_sleep = sleep_data_clean.loc[sleep_data_clean['Sleep Quality'].idxmax()]
worst_sleep = sleep_data_clean.loc[sleep_data_clean['Sleep Quality'].idxmin()]

best_worst_sleep = pd.DataFrame({
    'Metric': ['Date', 'Sleep Quality', 'Sleep Duration (hours)', 'Movements per Hour', 'Snore Time (seconds)'],
    'Best Night': [best_sleep['Start'].date(), format_float(best_sleep['Sleep Quality']), format_float(best_sleep['Sleep Duration (hours)']),
                   format_float(best_sleep['Movements per Hour']), format_float(best_sleep['Snore time'])],
    'Worst Night': [worst_sleep['Start'].date(), format_float(worst_sleep['Sleep Quality']), format_float(worst_sleep['Sleep Duration (hours)']),
                    format_float(worst_sleep['Movements per Hour']), format_float(worst_sleep['Snore time'])]
})

print("\nBest and Worst Sleep Nights:")
print(tabulate(best_worst_sleep, headers='keys', tablefmt='pretty', showindex=False))

print("\nInterpretation of the Sleep Data Analysis:")
print("1. The summary statistics provide an overview of the central tendencies and variability in sleep-related metrics.")
print("2. Correlations between sleep duration, quality, and movements per hour indicate potential relationships between these factors.")
print("3. The average sleep quality for different duration ranges shows how sleep quality varies with sleep duration.")
print("4. The ANOVA test results indicate whether there are significant differences in sleep quality across different sleep duration ranges.")
print("5. The best and worst sleep nights provide insight into the extremes of sleep quality and associated factors.")
print("\nNote: This analysis assumes certain statistical properties and may not account for all factors affecting sleep quality.")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy




Summary Statistics:
+-------+------------------------+---------------+------------+--------------------+------------+
|       | Sleep Duration (hours) | Sleep Quality | Regularity | Movements per Hour | Snore time |
+-------+------------------------+---------------+------------+--------------------+------------+
| count |        1241.00         |    1241.00    |  1241.00   |      1241.00       |  1241.00   |
| mean  |          8.15          |     0.83      |    0.67    |        1.61        |   71.74    |
|  std  |          1.28          |     0.12      |    0.34    |       31.99        |   294.52   |
|  min  |          3.05          |     0.26      |   -1.04    |        0.00        |    0.00    |
|  25%  |          7.61          |     0.79      |    0.61    |        0.00        |    0.00    |
|  50%  |          8.35          |     0.87      |    0.81    |        0.00        |    0.00    |
|  75%  |          8.86          |     0.92      |    0.89    |        0.00        |    0.00    |


DataFrame.applymap has been deprecated. Use DataFrame.map instead.

