In [26]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib


In [27]:
df = pd.read_csv("../data/gym_dataset.csv")
df.head()

Unnamed: 0,Age,Gender,Weight (kg),Height (m),Max_BPM,Avg_BPM,Resting_BPM,Session_Duration (hours),Calories_Burned,Workout_Type,Fat_Percentage,Water_Intake (liters),Workout_Frequency (days/week),Experience_Level,BMI
0,56,Male,88.3,1.71,180,157,60,1.69,1313.0,Yoga,12.6,3.5,4,3,30.2
1,46,Female,74.9,1.53,179,151,66,1.3,883.0,HIIT,33.9,2.1,4,2,32.0
2,32,Female,68.1,1.66,167,122,54,1.11,677.0,Cardio,33.4,2.3,4,2,24.71
3,25,Male,53.2,1.7,190,164,56,0.59,532.0,Strength,28.8,2.1,3,1,18.41
4,38,Male,46.1,1.79,188,158,68,0.64,556.0,Strength,29.2,2.8,3,1,14.39


In [28]:
print(df.isnull().sum())
print(df.info())

Age                              0
Gender                           0
Weight (kg)                      0
Height (m)                       0
Max_BPM                          0
Avg_BPM                          0
Resting_BPM                      0
Session_Duration (hours)         0
Calories_Burned                  0
Workout_Type                     0
Fat_Percentage                   0
Water_Intake (liters)            0
Workout_Frequency (days/week)    0
Experience_Level                 0
BMI                              0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 973 entries, 0 to 972
Data columns (total 15 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Age                            973 non-null    int64  
 1   Gender                         973 non-null    object 
 2   Weight (kg)                    973 non-null    float64
 3   Height (m)                     973 non-null    float

In [29]:
df.columns = [col.strip().lower()
                .replace(" ", "_")
                .replace("(kg)", "_kg")
                .replace("(m)", "_m")
                .replace("(hours)", "_hours")
                .replace("(liters)", "_liters")
                .replace("(days/week)", "_days/week") for col in df.columns]

In [30]:
df['gender'] = df['gender'].map({'male': 0, 'female': 1})

In [31]:
df = pd.get_dummies(df, columns=['workout_type', 'experience_level'], drop_first=True)

In [32]:
X = df.drop('calories_burned', axis=1)
y = df['calories_burned']

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [34]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
joblib.dump(scaler, "../models/scaler.pkl")

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


['../models/scaler.pkl']

In [35]:
import joblib
from sklearn.linear_model import Ridge
from sklearn.impute import SimpleImputer
import os

# Đảm bảo thư mục models tồn tại
os.makedirs("../models", exist_ok=True)

# Xử lý NaN trong dữ liệu
imputer = SimpleImputer(strategy='mean')  # Thay thế NaN bằng giá trị trung bình
X_train_scaled = imputer.fit_transform(X_train_scaled)
X_test_scaled = imputer.transform(X_test_scaled)  # Áp dụng trên X_test_scaled

# Huấn luyện mô hình Ridge
ridge = Ridge(alpha=1.0)
ridge.fit(X_train_scaled, y_train)

# Lưu mô hình Ridge
joblib.dump(ridge, "../models/ridge_model.pkl")




['../models/ridge_model.pkl']

In [36]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
joblib.dump(rf, "../models/rf_model.pkl")

['../models/rf_model.pkl']

In [37]:
def evaluate_model(model, X, y, model_name):
    y_pred = model.predict(X)
    print(f"🔍 {model_name}")
    print("MAE:", mean_absolute_error(y, y_pred))
    print("MSE:", mean_squared_error(y, y_pred))
    print("R²:", r2_score(y, y_pred))
    print("-" * 30)

evaluate_model(ridge, X_test_scaled, y_test, "Ridge Regression")
evaluate_model(rf, X_test, y_test, "Random Forest Regression")

🔍 Ridge Regression
MAE: 35.87885411931417
MSE: 2144.2564590009265
R²: 0.9742972034767172
------------------------------
🔍 Random Forest Regression
MAE: 39.09153846153846
MSE: 2541.8109102564104
R²: 0.969531793478929
------------------------------


In [39]:
print(X_train.columns.tolist())

['age', 'gender', 'weight__kg', 'height__m', 'max_bpm', 'avg_bpm', 'resting_bpm', 'session_duration__hours', 'fat_percentage', 'water_intake__liters', 'workout_frequency__days/week', 'bmi', 'workout_type_HIIT', 'workout_type_Strength', 'workout_type_Yoga', 'experience_level_2', 'experience_level_3']
