## Data Preprocessing

In [4]:
import pandas as pd
import numpy as np

In [5]:
# Load the raw gym exercise records from the CSV one directory above the notebook.
data = pd.read_csv(filepath_or_buffer="../gym_exercise_data.csv")

In [6]:
# Preview the first five rows of the raw frame.
data.head(n=5)

Unnamed: 0,Age,Gender,Weight (kg),Height (m),Max_BPM,Avg_BPM,Resting_BPM,Session_Duration (hours),Calories_Burned,Workout_Type,Fat_Percentage,Water_Intake (liters),Workout_Frequency (days/week),Experience_Level,BMI
0,34.0,Female,86.7,1.86,174,152.0,74.0,1.12,712.0,Strength,12.8,2.4,5.0,2.0,14.31
1,26.0,Female,84.7,1.83,166,156.0,73.0,1.0,833.0,Strength,27.9,2.8,5.0,2.0,33.49
2,22.0,Male,64.8,1.85,187,166.0,64.0,1.24,1678.0,Cardio,28.7,1.9,3.0,2.0,12.73
3,54.0,Female,75.3,1.82,187,169.0,58.0,1.45,628.0,Cardio,31.8,2.4,4.0,1.0,20.37
4,34.0,Female,52.8,1.74,177,169.0,66.0,1.6,1286.0,Strength,26.4,3.2,4.0,2.0,20.83


In [11]:
# Clean `data`: drop rows without a usable target, coerce the numeric features,
# and impute the gaps that remain.

# 1. Remove rows where target is missing
#    The target is coerced to numeric first so that text entries count as
#    missing too (the features get the same treatment in step 3). `.copy()`
#    makes the filtered result an independent frame before its columns are
#    reassigned below.
target_col = 'Calories_Burned'
data = data.assign(**{target_col: pd.to_numeric(data[target_col], errors='coerce')})
data = data.dropna(subset=[target_col]).copy()

# 2. Define numeric and categorical columns
num_cols = ['Age', 'Weight (kg)', 'Height (m)', 'Max_BPM', 'Avg_BPM',
            'Resting_BPM', 'Session_Duration (hours)', 'Fat_Percentage',
            'Water_Intake (liters)', 'Workout_Frequency (days/week)',
            'Experience_Level', 'BMI']
cat_cols = ['Gender', 'Workout_Type']

# 3. Convert numeric columns to numeric (fix text issues)
# 4. Fill missing numeric values with median
#    Both steps are done per column; unparseable text becomes NaN via
#    errors='coerce' and is then replaced by that column's median.
for col in num_cols:
    numeric_values = pd.to_numeric(data[col], errors='coerce')
    data[col] = numeric_values.fillna(numeric_values.median())

# 5. Fill missing categorical values with mode
#    mode() returns an empty Series when a column has no non-missing values,
#    so only fill when a most-frequent value actually exists.
for col in cat_cols:
    modes = data[col].mode()
    if not modes.empty:
        data[col] = data[col].fillna(modes.iloc[0])

# 6. Verify
print("✅ Missing values after cleaning:")
print(data.isnull().sum())
print("\nData types after conversion:")
# .head() limits the dtype listing to the first five columns.
print(data.dtypes.head())


✅ Missing values after cleaning:
Age                              0
Gender                           0
Weight (kg)                      0
Height (m)                       0
Max_BPM                          0
Avg_BPM                          0
Resting_BPM                      0
Session_Duration (hours)         0
Calories_Burned                  0
Workout_Type                     0
Fat_Percentage                   0
Water_Intake (liters)            0
Workout_Frequency (days/week)    0
Experience_Level                 0
BMI                              0
dtype: int64

Data types after conversion:
Age            float64
Gender          object
Weight (kg)    float64
Height (m)     float64
Max_BPM        float64
dtype: object


In [13]:
# Preview the first five rows of the frame at this point in the notebook.
data.head(n=5)

Unnamed: 0,Age,Gender,Weight (kg),Height (m),Max_BPM,Avg_BPM,Resting_BPM,Session_Duration (hours),Calories_Burned,Workout_Type,Fat_Percentage,Water_Intake (liters),Workout_Frequency (days/week),Experience_Level,BMI
0,34.0,Female,86.7,1.86,174.0,152.0,74.0,1.12,712.0,Strength,12.8,2.4,5.0,2.0,14.31
1,26.0,Female,84.7,1.83,166.0,156.0,73.0,1.0,833.0,Strength,27.9,2.8,5.0,2.0,33.49
2,22.0,Male,64.8,1.85,187.0,166.0,64.0,1.24,1678.0,Cardio,28.7,1.9,3.0,2.0,12.73
3,54.0,Female,75.3,1.82,187.0,169.0,58.0,1.45,628.0,Cardio,31.8,2.4,4.0,1.0,20.37
4,34.0,Female,52.8,1.74,177.0,169.0,66.0,1.6,1286.0,Strength,26.4,3.2,4.0,2.0,20.83


In [14]:
# Write the frame to a CSV one directory above the notebook, without the index column.
data.to_csv(path_or_buf="../cleaned_data.csv", index=False)