# Fitness Data Analysis - Modeling and Evaluation

This notebook focuses on building and evaluating machine learning models for predicting workout efficiency and calories burned.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error, r2_score

# Put the parent directory on the import path so the `src` package below can be imported
sys.path.append(os.path.abspath(os.pardir))

# Import custom modules
from src.data_loader import load_fitness_data, split_data
from src.preprocessing import identify_column_types, handle_missing_values, create_preprocessing_pipeline, create_workout_efficiency_category, encode_categorical_target
from src.feature_engineering import create_all_features
from src.models import create_classification_model, create_regression_model, train_model, evaluate_classification_model, evaluate_regression_model, save_model
from src.visualization import plot_feature_importance, plot_confusion_matrix

# Plot defaults: seaborn "whitegrid" style and a 12x8 default figure size
sns.set(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 8)})

# Lift the pandas column display cap so wide DataFrames show every column
pd.options.display.max_columns = None

## Data Loading and Preprocessing

In [None]:
# Read the fitness dataset through the project loader (called with its defaults)
data = load_fitness_data()

# Report the dataset's shape, then preview the first rows as the cell output
dataset_shape = data.shape
print("Dataset shape:", dataset_shape)
data.head(5)

In [None]:
# Handle missing values.
# The strategy lives in src.preprocessing.handle_missing_values and is not visible here;
# the result is kept under a new name, `data_clean`, which the feature-engineering step consumes.
data_clean = handle_missing_values(data)

In [None]:
# Derive the engineered feature set from the cleaned data
data_engineered = create_all_features(data_clean)

# Report the engineered dataset's shape, then preview its first rows as the cell output
engineered_shape = data_engineered.shape
print("Engineered dataset shape:", engineered_shape)
data_engineered.head(5)

## Classification Task: Predicting Workout Efficiency

In [None]:
# Encode the categorical efficiency target; the fitted encoder is kept so that
# integer codes can be translated back to category names later on
data_encoded, label_encoder = encode_categorical_target(data_engineered)

# Show which integer code stands for which category (code = position in classes_)
mapping = dict(enumerate(label_encoder.classes_))
print("Target encoding mapping:")
print(mapping)

In [None]:
# Candidate features = everything except the columns whose name mentions the efficiency target
efficiency_cols = [col for col in data_encoded.columns if 'Workout_Efficiency' in col]
feature_data = data_encoded.drop(columns=efficiency_cols)

# Partition the feature columns by type and build the matching preprocessing pipeline
numeric_cols, categorical_cols = identify_column_types(feature_data)
preprocessor = create_preprocessing_pipeline(numeric_cols, categorical_cols)

In [None]:
# Split data for classification.
# Only 'Workout_Efficiency_Encoded' is the target. Every other 'Workout_Efficiency*' column
# (such as an un-encoded label column, if present) is removed before splitting so it can
# never reach a model as a feature. This is the same name filter that was applied when the
# preprocessor's column lists were derived.
target_col = 'Workout_Efficiency_Encoded'
leaky_cols = [c for c in data_encoded.columns if 'Workout_Efficiency' in c and c != target_col]
X_train, X_val, X_test, y_train, y_val, y_test = split_data(
    data_encoded.drop(columns=leaky_cols), target_col
)

### Logistic Regression Model

In [None]:
# Create and train Logistic Regression model.
# random_state is pinned to 42, the same seed used for the other seeded models in this notebook.
lr_model = create_classification_model('lr', random_state=42)
# train_model receives the estimator, the shared preprocessor and the training split
lr_pipeline = train_model(lr_model, preprocessor, X_train, y_train)

# Evaluate on validation set; the returned results are read later through
# their 'y_pred' and 'accuracy' entries
print("Evaluating on validation set:")
lr_val_results = evaluate_classification_model(lr_pipeline, X_val, y_val)

In [None]:
# Confusion matrix of the Logistic Regression predictions on the validation split,
# with axes labelled by the original category names
lr_val_pred = lr_val_results['y_pred']
lr_cm_fig = plot_confusion_matrix(y_val, lr_val_pred, labels=label_encoder.classes_)
plt.title('Logistic Regression - Validation Set Confusion Matrix')
plt.show()

### Random Forest Model

In [None]:
# Create and train Random Forest model.
# random_state is pinned to 42, the same seed used for the other seeded models in this notebook.
rf_model = create_classification_model('rf', random_state=42)
# train_model receives the estimator, the shared preprocessor and the training split
rf_pipeline = train_model(rf_model, preprocessor, X_train, y_train)

# Evaluate on validation set; the returned results are read later through
# their 'y_pred' and 'accuracy' entries
print("Evaluating on validation set:")
rf_val_results = evaluate_classification_model(rf_pipeline, X_val, y_val)

In [None]:
# Confusion matrix of the Random Forest predictions on the validation split,
# with axes labelled by the original category names
rf_val_pred = rf_val_results['y_pred']
rf_cm_fig = plot_confusion_matrix(y_val, rf_val_pred, labels=label_encoder.classes_)
plt.title('Random Forest - Validation Set Confusion Matrix')
plt.show()

In [None]:
# Feature importance for Random Forest
try:
    model_instance = rf_pipeline.named_steps['model']
    try:
        # Prefer the column names the fitted preprocessing steps actually emit: an encoder
        # may expand or reorder columns, in which case the raw input column list no longer
        # lines up one-to-one with the model's importances.
        # Assumes 'model' is the final pipeline step, so [:-1] is the preprocessing part.
        feature_names = list(rf_pipeline[:-1].get_feature_names_out())
    except (AttributeError, TypeError, ValueError, NotImplementedError):
        # Pipeline or transformer without get_feature_names_out() support:
        # fall back to the raw input column names.
        feature_names = numeric_cols + categorical_cols
    importance_fig = plot_feature_importance(model_instance, feature_names)
    plt.title('Random Forest - Feature Importance')
    plt.show()
except Exception as e:
    # Best-effort plot: report the problem instead of aborting the notebook run
    print(f"Could not plot feature importance: {e}")

### XGBoost Model

In [None]:
# Create and train XGBoost model.
# random_state is pinned to 42, the same seed used for the other seeded models in this notebook.
xgb_model = create_classification_model('xgb', random_state=42)
# train_model receives the estimator, the shared preprocessor and the training split
xgb_pipeline = train_model(xgb_model, preprocessor, X_train, y_train)

# Evaluate on validation set; the returned results are read later through
# their 'y_pred' and 'accuracy' entries
print("Evaluating on validation set:")
xgb_val_results = evaluate_classification_model(xgb_pipeline, X_val, y_val)

In [None]:
# Confusion matrix of the XGBoost predictions on the validation split,
# with axes labelled by the original category names
xgb_val_pred = xgb_val_results['y_pred']
xgb_cm_fig = plot_confusion_matrix(y_val, xgb_val_pred, labels=label_encoder.classes_)
plt.title('XGBoost - Validation Set Confusion Matrix')
plt.show()

In [None]:
# Feature importance for XGBoost
try:
    model_instance = xgb_pipeline.named_steps['model']
    try:
        # Prefer the column names the fitted preprocessing steps actually emit: an encoder
        # may expand or reorder columns, in which case the raw input column list no longer
        # lines up one-to-one with the model's importances.
        # Assumes 'model' is the final pipeline step, so [:-1] is the preprocessing part.
        feature_names = list(xgb_pipeline[:-1].get_feature_names_out())
    except (AttributeError, TypeError, ValueError, NotImplementedError):
        # Pipeline or transformer without get_feature_names_out() support:
        # fall back to the raw input column names.
        feature_names = numeric_cols + categorical_cols
    importance_fig = plot_feature_importance(model_instance, feature_names)
    plt.title('XGBoost - Feature Importance')
    plt.show()
except Exception as e:
    # Best-effort plot: report the problem instead of aborting the notebook run
    print(f"Could not plot feature importance: {e}")

### Model Comparison and Final Evaluation

In [None]:
# Gather each classifier's validation accuracy under its display name
validation_runs = (
    ('Logistic Regression', lr_val_results),
    ('Random Forest', rf_val_results),
    ('XGBoost', xgb_val_results),
)
model_results = {label: run['accuracy'] for label, run in validation_runs}

# Bar chart of the accuracies on a fixed 0-1 axis
plt.figure(figsize=(10, 6))
bars = plt.bar(model_results.keys(), model_results.values())
plt.title('Model Accuracy Comparison (Validation Set)')
plt.ylabel('Accuracy')
plt.ylim(0, 1)

# Print each accuracy just above the centre of its bar
for rect in bars:
    accuracy = rect.get_height()
    x_center = rect.get_x() + rect.get_width()/2.
    plt.text(x_center, accuracy, f'{accuracy:.4f}', ha='center', va='bottom')

plt.show()

In [None]:
# Pick the classifier with the highest validation accuracy (the first one wins a tie)
best_model_name, best_val_accuracy = max(model_results.items(), key=lambda item: item[1])
print(f"Best classification model: {best_model_name} with validation accuracy: {best_val_accuracy:.4f}")

# Map the winning name to its fitted pipeline; any other name resolves to XGBoost
classifier_pipelines = {
    'Logistic Regression': lr_pipeline,
    'Random Forest': rf_pipeline,
}
best_model_pipeline = classifier_pipelines.get(best_model_name, xgb_pipeline)

In [None]:
# Evaluate best model on test set.
# Model selection above used validation accuracy only, so this is the first time the
# test split is used in the classification task.
print(f"Evaluating {best_model_name} on test set:")
test_results = evaluate_classification_model(best_model_pipeline, X_test, y_test)

In [None]:
# Confusion matrix of the selected classifier's predictions on the test split,
# with axes labelled by the original category names
test_pred = test_results['y_pred']
test_cm_fig = plot_confusion_matrix(y_test, test_pred, labels=label_encoder.classes_)
plt.title(f'{best_model_name} - Test Set Confusion Matrix')
plt.show()

In [None]:
# Persist the selected classifier; the artifact name embeds the model's display name
# lower-cased with spaces turned into underscores
classifier_slug = best_model_name.lower().replace(' ', '_')
best_model_path = save_model(best_model_pipeline, f"efficiency_classifier_{classifier_slug}")
print(f"Best model saved to: {best_model_path}")

## Regression Task: Predicting Calories Burned

In [None]:
# Start the regression task from the engineered data, minus every column whose
# name mentions the efficiency target
cols_to_drop = [name for name in data_engineered.columns if 'Workout_Efficiency' in name]
data_reg = data_engineered.drop(columns=cols_to_drop)

In [None]:
# Candidate features = every regression column except the target
feature_data = data_reg.drop(columns=['Calories_Burned'])

# Partition the feature columns by type and build the matching preprocessing pipeline
numeric_cols, categorical_cols = identify_column_types(feature_data)
preprocessor = create_preprocessing_pipeline(numeric_cols, categorical_cols)

In [None]:
# Split data for regression.
# This rebinds X_train/X_val/X_test and y_train/y_val/y_test, the same names the
# classification cells use; re-run the classification split before re-running those cells.
# NOTE(review): confirm that none of the engineered features are derived from
# 'Calories_Burned', otherwise they would leak the target into the features.
X_train, X_val, X_test, y_train, y_val, y_test = split_data(data_reg, 'Calories_Burned')

### Linear Regression Model

In [None]:
# Create and train Linear Regression model (the only model here created without a random_state)
lr_reg_model = create_regression_model('lr')
# train_model receives the estimator, the regression preprocessor and the training split
lr_reg_pipeline = train_model(lr_reg_model, preprocessor, X_train, y_train)

# Evaluate on validation set; the returned results are read later through
# their 'y_pred', 'r2' and 'rmse' entries
print("Evaluating on validation set:")
lr_reg_val_results = evaluate_regression_model(lr_reg_pipeline, X_val, y_val)

In [None]:
# Scatter the Linear Regression validation predictions against the true values
lr_reg_val_pred = lr_reg_val_results['y_pred']
plt.figure(figsize=(10, 6))
plt.scatter(y_val, lr_reg_val_pred, alpha=0.5)

# Dashed diagonal over the observed target range marks perfect predictions
diag_lo, diag_hi = y_val.min(), y_val.max()
plt.plot([diag_lo, diag_hi], [diag_lo, diag_hi], 'k--', lw=2)

plt.xlabel('Actual Calories Burned')
plt.ylabel('Predicted Calories Burned')
plt.title('Linear Regression: Actual vs Predicted')
plt.show()

### Random Forest Regression Model

In [None]:
# Create and train Random Forest Regression model.
# random_state is pinned to 42, the same seed used for the other seeded models in this notebook.
rf_reg_model = create_regression_model('rf', random_state=42)
# train_model receives the estimator, the regression preprocessor and the training split
rf_reg_pipeline = train_model(rf_reg_model, preprocessor, X_train, y_train)

# Evaluate on validation set; the returned results are read later through
# their 'y_pred', 'r2' and 'rmse' entries
print("Evaluating on validation set:")
rf_reg_val_results = evaluate_regression_model(rf_reg_pipeline, X_val, y_val)

In [None]:
# Scatter the Random Forest validation predictions against the true values
rf_reg_val_pred = rf_reg_val_results['y_pred']
plt.figure(figsize=(10, 6))
plt.scatter(y_val, rf_reg_val_pred, alpha=0.5)

# Dashed diagonal over the observed target range marks perfect predictions
diag_lo, diag_hi = y_val.min(), y_val.max()
plt.plot([diag_lo, diag_hi], [diag_lo, diag_hi], 'k--', lw=2)

plt.xlabel('Actual Calories Burned')
plt.ylabel('Predicted Calories Burned')
plt.title('Random Forest: Actual vs Predicted')
plt.show()

In [None]:
# Feature importance for Random Forest Regression
try:
    model_instance = rf_reg_pipeline.named_steps['model']
    try:
        # Prefer the column names the fitted preprocessing steps actually emit: an encoder
        # may expand or reorder columns, in which case the raw input column list no longer
        # lines up one-to-one with the model's importances.
        # Assumes 'model' is the final pipeline step, so [:-1] is the preprocessing part.
        feature_names = list(rf_reg_pipeline[:-1].get_feature_names_out())
    except (AttributeError, TypeError, ValueError, NotImplementedError):
        # Pipeline or transformer without get_feature_names_out() support:
        # fall back to the raw input column names.
        feature_names = numeric_cols + categorical_cols
    importance_fig = plot_feature_importance(model_instance, feature_names)
    plt.title('Random Forest Regression - Feature Importance')
    plt.show()
except Exception as e:
    # Best-effort plot: report the problem instead of aborting the notebook run
    print(f"Could not plot feature importance: {e}")

### XGBoost Regression Model

In [None]:
# Create and train XGBoost Regression model.
# random_state is pinned to 42, the same seed used for the other seeded models in this notebook.
xgb_reg_model = create_regression_model('xgb', random_state=42)
# train_model receives the estimator, the regression preprocessor and the training split
xgb_reg_pipeline = train_model(xgb_reg_model, preprocessor, X_train, y_train)

# Evaluate on validation set; the returned results are read later through
# their 'y_pred', 'r2' and 'rmse' entries
print("Evaluating on validation set:")
xgb_reg_val_results = evaluate_regression_model(xgb_reg_pipeline, X_val, y_val)

In [None]:
# Scatter the XGBoost validation predictions against the true values
xgb_reg_val_pred = xgb_reg_val_results['y_pred']
plt.figure(figsize=(10, 6))
plt.scatter(y_val, xgb_reg_val_pred, alpha=0.5)

# Dashed diagonal over the observed target range marks perfect predictions
diag_lo, diag_hi = y_val.min(), y_val.max()
plt.plot([diag_lo, diag_hi], [diag_lo, diag_hi], 'k--', lw=2)

plt.xlabel('Actual Calories Burned')
plt.ylabel('Predicted Calories Burned')
plt.title('XGBoost: Actual vs Predicted')
plt.show()

In [None]:
# Feature importance for XGBoost Regression
try:
    model_instance = xgb_reg_pipeline.named_steps['model']
    try:
        # Prefer the column names the fitted preprocessing steps actually emit: an encoder
        # may expand or reorder columns, in which case the raw input column list no longer
        # lines up one-to-one with the model's importances.
        # Assumes 'model' is the final pipeline step, so [:-1] is the preprocessing part.
        feature_names = list(xgb_reg_pipeline[:-1].get_feature_names_out())
    except (AttributeError, TypeError, ValueError, NotImplementedError):
        # Pipeline or transformer without get_feature_names_out() support:
        # fall back to the raw input column names.
        feature_names = numeric_cols + categorical_cols
    importance_fig = plot_feature_importance(model_instance, feature_names)
    plt.title('XGBoost Regression - Feature Importance')
    plt.show()
except Exception as e:
    # Best-effort plot: report the problem instead of aborting the notebook run
    print(f"Could not plot feature importance: {e}")

### Model Comparison and Final Evaluation

In [None]:
# Collect validation R² for comparison
reg_model_results = {
    'Linear Regression': lr_reg_val_results['r2'],
    'Random Forest': rf_reg_val_results['r2'],
    'XGBoost': xgb_reg_val_results['r2']
}

# Plot comparison
plt.figure(figsize=(10, 6))
bars = plt.bar(reg_model_results.keys(), reg_model_results.values())
plt.title('Model R² Comparison (Validation Set)')
plt.ylabel('R² Score')
# R² is negative for a model that fits worse than always predicting the mean. Extend the
# lower axis limit (with a little headroom) in that case so such bars are not clipped;
# when every score is non-negative the axis stays at the usual 0-1 range.
lowest_r2 = min(reg_model_results.values())
plt.ylim(min(0, 1.1 * lowest_r2), 1)

# Add R² values at the tip of each bar: above positive bars, below negative ones
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height,
             f'{height:.4f}',
             ha='center', va='bottom' if height >= 0 else 'top')

plt.show()

In [None]:
# Gather each regressor's validation RMSE under its display name
regression_runs = (
    ('Linear Regression', lr_reg_val_results),
    ('Random Forest', rf_reg_val_results),
    ('XGBoost', xgb_reg_val_results),
)
reg_model_rmse = {label: run['rmse'] for label, run in regression_runs}

# Bar chart of the RMSE values (y-axis left to autoscale)
plt.figure(figsize=(10, 6))
bars = plt.bar(reg_model_rmse.keys(), reg_model_rmse.values())
plt.title('Model RMSE Comparison (Validation Set)')
plt.ylabel('RMSE')

# Print each RMSE just above the centre of its bar
for rect in bars:
    rmse_value = rect.get_height()
    x_center = rect.get_x() + rect.get_width()/2.
    plt.text(x_center, rmse_value, f'{rmse_value:.2f}', ha='center', va='bottom')

plt.show()

In [None]:
# Pick the regressor with the highest validation R² (the first one wins a tie)
best_reg_model_name, best_val_r2 = max(reg_model_results.items(), key=lambda item: item[1])
print(f"Best regression model: {best_reg_model_name} with validation R²: {best_val_r2:.4f}")

# Map the winning name to its fitted pipeline; any other name resolves to XGBoost
regressor_pipelines = {
    'Linear Regression': lr_reg_pipeline,
    'Random Forest': rf_reg_pipeline,
}
best_reg_pipeline = regressor_pipelines.get(best_reg_model_name, xgb_reg_pipeline)

In [None]:
# Evaluate best model on test set.
# Model selection above used validation R² only, so this is the first time the
# test split is used in the regression task.
print(f"Evaluating {best_reg_model_name} on test set:")
reg_test_results = evaluate_regression_model(best_reg_pipeline, X_test, y_test)

In [None]:
# Scatter the selected regressor's test predictions against the true values
reg_test_pred = reg_test_results['y_pred']
plt.figure(figsize=(10, 6))
plt.scatter(y_test, reg_test_pred, alpha=0.5)

# Dashed diagonal over the observed target range marks perfect predictions
diag_lo, diag_hi = y_test.min(), y_test.max()
plt.plot([diag_lo, diag_hi], [diag_lo, diag_hi], 'k--', lw=2)

plt.xlabel('Actual Calories Burned')
plt.ylabel('Predicted Calories Burned')
plt.title(f'{best_reg_model_name}: Actual vs Predicted (Test Set)')

# Show the test metrics in a boxed note near the top-left corner of the axes
r2 = reg_test_results['r2']
rmse = reg_test_results['rmse']
metrics_note = f'R² = {r2:.4f}\nRMSE = {rmse:.2f}'
note_box = {'boxstyle': "round,pad=0.3", 'fc': "white", 'ec': "gray", 'alpha': 0.8}
plt.annotate(metrics_note, xy=(0.05, 0.95), xycoords='axes fraction', bbox=note_box)

plt.show()

In [None]:
# Persist the selected regressor; the artifact name embeds the model's display name
# lower-cased with spaces turned into underscores
regressor_slug = best_reg_model_name.lower().replace(' ', '_')
best_reg_model_path = save_model(best_reg_pipeline, f"calories_burned_regressor_{regressor_slug}")
print(f"Best regression model saved to: {best_reg_model_path}")

## Summary and Conclusions

This notebook has demonstrated the development and evaluation of machine learning models for the fitness data analysis project. We've successfully built models for two prediction tasks:

### 1. Classification Task: Predicting Workout Efficiency
- We've trained and compared Logistic Regression, Random Forest, and XGBoost models
- The best model (selected by validation accuracy) achieved good accuracy on the held-out test set in classifying workout efficiency into Low, Medium, and High categories
- Key predictive features were identified through feature importance analysis

### 2. Regression Task: Predicting Calories Burned
- We've trained and compared Linear Regression, Random Forest, and XGBoost regression models
- The best model (selected by validation R²) achieved a strong R² score on the held-out test set in predicting calories burned
- Feature importance analysis revealed the most influential factors in calorie expenditure

These models provide valuable insights into fitness performance prediction and can help individuals understand which factors most significantly impact their workout results.