# Fitness Data Analysis - Exploratory Data Analysis

This notebook explores the Workout & Fitness Tracker dataset to understand its structure, feature distributions, and relationships between variables.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Add parent directory to path for imports
sys.path.append(os.path.abspath('..'))

# Import custom modules
from src.data_loader import load_fitness_data
from src.preprocessing import identify_column_types, handle_missing_values
from src.visualization import (
    plot_correlation_matrix,
    plot_categorical_distribution,
    plot_numeric_features_distribution,
    plot_target_vs_feature
)

# Set plot style. set_theme() is seaborn's preferred entry point; sns.set()
# is only an alias for it. The notebook already needs seaborn >= 0.11
# because it calls sns.histplot further down.
sns.set_theme(style="whitegrid")
# Default figure size for cells that do not pass an explicit figsize
plt.rcParams['figure.figsize'] = (12, 8)

# Display all DataFrame columns (None removes the column-count limit)
pd.set_option('display.max_columns', None)

## Loading and Exploring the Dataset

In [None]:
# Load the dataset through the project loader. It is called with no
# arguments, so the data source is whatever load_fitness_data() defaults to.
data = load_fitness_data()

# Display basic information about the dataset
print("Dataset shape:", data.shape)
# Keep this as the last expression of the cell so the notebook renders the
# first rows as the cell output.
data.head()

In [None]:
# Summary statistics for numeric columns.
# Rendered as the cell output because it is the last expression in the cell.
data.describe()

In [None]:
# Information about data types and missing values.
# info() prints its report directly (dtype and non-null count per column).
data.info()

## Data Preprocessing

In [None]:
# Split the columns of the raw frame into numeric and categorical names
numeric_cols, categorical_cols = identify_column_types(data)

# Report each group under its own heading
for heading, column_names in (
    ("\nNumeric columns:", numeric_cols),
    ("\nCategorical columns:", categorical_cols),
):
    print(heading)
    print(column_names)

In [None]:
# Count nulls per column and keep only the columns that actually have some
null_counts = data.isnull().sum()
missing_data = null_counts[null_counts > 0]

if missing_data.empty:
    print("No missing values found in the dataset.")
else:
    print("Columns with missing values:")
    print(missing_data)

In [None]:
# Handle any missing values using the project helper.
# NOTE(review): later cells add columns to data_clean in place. Whether
# handle_missing_values() returns a copy or the same object as `data` is not
# visible here -- confirm, otherwise those columns also appear on `data`.
data_clean = handle_missing_values(data)

## Distribution of Key Features

In [None]:
# Plot distribution of numeric features.
# Only the first 12 entries of numeric_cols are passed; any further numeric
# columns are not plotted by this cell. numeric_cols was derived from `data`,
# while the plot reads from data_clean.
fig = plot_numeric_features_distribution(data_clean, columns=numeric_cols[:12])
plt.tight_layout()

In [None]:
# Plot distribution of categorical features, one figure per column.
# categorical_cols was derived from `data`, while the plots read from
# data_clean.
for col in categorical_cols:
    fig = plot_categorical_distribution(data_clean, col)
    plt.tight_layout()
    # Show inside the loop so each column's figure is rendered separately
    plt.show()

## Feature Relationships and Correlations

In [None]:
# Plot correlation matrix for the cleaned data. The explicit figsize
# overrides the notebook-wide (12, 8) default set via rcParams.
fig = plot_correlation_matrix(data_clean, figsize=(14, 12))
plt.tight_layout()

In [None]:
# Create workout efficiency feature (calories burned per minute; assumes
# Workout_Duration is recorded in minutes -- TODO confirm against the loader).
# A zero duration would make the ratio +/-inf (or NaN for 0/0), which
# distorts the histogram/KDE below and any quantile thresholds computed from
# this score. Treat zero durations as missing so those rows get NaN instead.
valid_duration = data_clean['Workout_Duration'].replace(0, np.nan)
data_clean['Workout_Efficiency_Score'] = data_clean['Calories_Burned'] / valid_duration

# Plot histogram of workout efficiency (rows with a NaN score are left out)
plt.figure(figsize=(10, 6))
sns.histplot(data_clean['Workout_Efficiency_Score'].dropna(), kde=True)
plt.title('Distribution of Workout Efficiency (Calories Burned per Minute)')
plt.xlabel('Calories Burned per Minute')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Create workout efficiency categories (Low, Medium, High) by cutting the
# score at its 33rd and 66th percentiles. The outer bin edges are infinite so
# every finite score falls into one of the three bins.
efficiency_thresholds = data_clean['Workout_Efficiency_Score'].quantile([0.33, 0.66])
data_clean['Workout_Efficiency'] = pd.cut(
    data_clean['Workout_Efficiency_Score'],
    bins=[-float('inf'), efficiency_thresholds.iloc[0], efficiency_thresholds.iloc[1], float('inf')],
    labels=['Low', 'Medium', 'High']
)

# Display the distribution of efficiency categories
plt.figure(figsize=(8, 6))
# sort=False keeps the counts in category order (Low, Medium, High) instead of
# descending frequency. seaborn draws categorical bars in category order, so
# frequency-sorted counts would put the text labels below on the wrong bars.
counts = data_clean['Workout_Efficiency'].value_counts(sort=False)
bar_order = list(counts.index)
# Pass the order explicitly so bar positions and label positions cannot drift
ax = sns.barplot(x=counts.index, y=counts.values, order=bar_order)
plt.title('Distribution of Workout Efficiency Categories')
plt.xlabel('Efficiency Category')
plt.ylabel('Count')

# Add count and percentage labels to bars (percentage of all rows)
total = len(data_clean)
for i, count in enumerate(counts):
    pct = 100 * count / total
    # Centered vertically inside the bar, hence the white bold text
    ax.text(i, count/2, f'{count}\n({pct:.1f}%)', ha='center', va='center', color='white', fontweight='bold')

plt.show()

## Exploring Relationships with Key Lifestyle Factors

In [None]:
# Lifestyle and health columns to compare against the efficiency score
key_factors = [
    'Sleep_Hours',
    'Water_Intake',
    'Body_Fat_Pct',
    'Resting_Heart_Rate',
    'Daily_Calories',
]

# One figure per factor; factors absent from the frame are skipped silently
for factor in key_factors:
    if factor not in data_clean.columns:
        continue
    fig = plot_target_vs_feature(data_clean, 'Workout_Efficiency_Score', factor)
    plt.show()

In [None]:
# Boxplot of numeric factors by efficiency category, laid out on a 2x3 grid.
# The subplot slot is taken from the factor's position in key_factors, so a
# factor missing from the frame leaves its slot empty rather than shifting
# the remaining plots.
plt.figure(figsize=(15, 10))
for i, factor in enumerate(key_factors, 1):
    if factor in data_clean.columns:
        plt.subplot(2, 3, i)
        sns.boxplot(x='Workout_Efficiency', y=factor, data=data_clean)
        plt.title(f'{factor} by Efficiency Category')
# Compute the layout once, after every subplot exists, instead of
# recomputing it for each subplot inside the loop.
plt.tight_layout()
plt.show()

## Exploring Demographic Factors

In [None]:
# Compare the efficiency score across genders (skipped if there is no
# Gender column)
if 'Gender' in data_clean.columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(data=data_clean, x='Gender', y='Workout_Efficiency_Score', ax=ax)
    ax.set_title('Workout Efficiency by Gender')
    ax.set_xlabel('Gender')
    ax.set_ylabel('Calories Burned per Minute')
    plt.show()

In [None]:
# Explore efficiency by age
if 'Age' in data_clean.columns:
    # Create age groups.
    # right=False makes the bins left-closed ([0, 20), [20, 30), ...), which
    # is what the labels describe. With pd.cut's default right-closed bins,
    # age 20 was labelled '<20', 30 was labelled '20-29', and so on.
    # The last edge is infinite so '60+' has no upper cap (ages above 100
    # previously became NaN).
    data_clean['Age_Group'] = pd.cut(
        data_clean['Age'],
        bins=[0, 20, 30, 40, 50, 60, float('inf')],
        labels=['<20', '20-29', '30-39', '40-49', '50-59', '60+'],
        right=False)
    
    plt.figure(figsize=(12, 6))
    sns.boxplot(x='Age_Group', y='Workout_Efficiency_Score', data=data_clean)
    plt.title('Workout Efficiency by Age Group')
    plt.xlabel('Age Group')
    plt.ylabel('Calories Burned per Minute')
    plt.show()

## Exploring Workout Types

In [None]:
# Rank workout types by their mean efficiency score (skipped if there is no
# Workout_Type column)
if 'Workout_Type' in data_clean.columns:
    # Mean score per workout type, highest first
    workout_efficiency = (
        data_clean.groupby('Workout_Type')['Workout_Efficiency_Score']
        .mean()
        .sort_values(ascending=False)
    )

    fig, ax = plt.subplots(figsize=(14, 6))
    sns.barplot(x=workout_efficiency.index, y=workout_efficiency.values, ax=ax)
    ax.set_title('Average Workout Efficiency by Workout Type')
    ax.set_xlabel('Workout Type')
    ax.set_ylabel('Average Calories Burned per Minute')
    plt.xticks(rotation=45, ha='right')

    # Print each mean just above its bar
    for position, mean_score in enumerate(workout_efficiency.values):
        ax.text(position, mean_score, f'{mean_score:.2f}', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

## Analyzing Mood Effects

In [None]:
# Rank moods by their mean efficiency score (skipped if there is no Mood
# column)
if 'Mood' in data_clean.columns:
    # Mean score per mood, highest first
    mean_by_mood = data_clean.groupby('Mood')['Workout_Efficiency_Score'].mean()
    mood_efficiency = mean_by_mood.sort_values(ascending=False)

    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(x=mood_efficiency.index, y=mood_efficiency.values, ax=ax)
    ax.set_title('Average Workout Efficiency by Mood')
    ax.set_xlabel('Mood')
    ax.set_ylabel('Average Calories Burned per Minute')

    # Print each mean just above its bar
    for position, mean_score in enumerate(mood_efficiency.values):
        ax.text(position, mean_score, f'{mean_score:.2f}', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

## Summary of Findings

Main observations from the exploratory data analysis:

1. The dataset contains a comprehensive set of fitness and lifestyle variables
2. Key relationships have been identified between workout efficiency and lifestyle factors
3. Demographic factors like age and gender show distinct patterns in workout efficiency
4. Different workout types have varying levels of calorie-burning efficiency
5. There appears to be a relationship between mood and workout efficiency

These insights will inform our feature engineering and modeling approaches in subsequent steps.