In [17]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, r2_score, mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

## Prepare data for Training

In [2]:
df_engineered = pd.read_csv('../data/3-features/engineered_features.csv')
df_engineered.columns

Index(['Name', 'Date', 'Sex', 'Age', 'BodyweightKg', 'Best3SquatKg',
       'Best3BenchKg', 'Best3DeadliftKg', 'TotalKg', 'Dots', 'prev_squat',
       'prev_bench', 'prev_deadlift', 'prev_total', 'avg_total',
       'max_total_ever', 'min_total_ever', 'avg_squat', 'avg_bench',
       'avg_deadlift', 'total_gain_per_meet', 'squat_gain_per_meet',
       'bench_gain_per_meet', 'deadlift_gain_per_meet'],
      dtype='object')

In [8]:
feature_columns = [
    # previous performance
    'prev_squat', 'prev_bench', 'prev_deadlift', 'prev_total',
    
    # average/historical performance
    'avg_squat', 'avg_bench', 'avg_deadlift', 'avg_total',
    'max_total_ever', 'min_total_ever', 
    
    # improvement per meet
    'squat_gain_per_meet', 'bench_gain_per_meet', 'deadlift_gain_per_meet', 'total_gain_per_meet',

    # demographics
    'Age', 'BodyweightKg'
]

df_engineered['SexEncoded'] = df_engineered['Sex'].map({'M': 1, 'F': 0})
feature_columns.append('SexEncoded')

target = 'TotalKg'

print("missing values:")
df_engineered[feature_columns] = df_engineered[feature_columns].fillna(0)
print(df_engineered[feature_columns].isnull().sum())


missing values:
prev_squat                0
prev_bench                0
prev_deadlift             0
prev_total                0
avg_squat                 0
avg_bench                 0
avg_deadlift              0
avg_total                 0
max_total_ever            0
min_total_ever            0
squat_gain_per_meet       0
bench_gain_per_meet       0
deadlift_gain_per_meet    0
total_gain_per_meet       0
Age                       0
BodyweightKg              0
SexEncoded                0
dtype: int64


## Time-based train-test split

In [9]:
df_engineered['Date'] = pd.to_datetime(df_engineered['Date'])
df_engineered = df_engineered.sort_values('Date').reset_index(drop=True)


split_date = df_engineered['Date'].quantile(0.8)
train_df = df_engineered[df_engineered['Date'] < split_date]
test_df = df_engineered[df_engineered['Date'] >= split_date]


print(f"data split::")
print(f"Training date range: {train_df['Date'].min()} to {train_df['Date'].max()}")
print(f"Testing date range: {test_df['Date'].min()} to {test_df['Date'].max()}")
print(f"Training examples: {len(train_df)}")
print(f"Testing examples: {len(test_df)}")

X_train = train_df[feature_columns]
X_test = test_df[feature_columns]
y_train = train_df['TotalKg']
y_test = test_df['TotalKg']

data split::
Training date range: 1967-02-26 00:00:00 to 2023-11-02 00:00:00
Testing date range: 2023-11-04 00:00:00 to 2025-09-21 00:00:00
Training examples: 83093
Testing examples: 20797


In [13]:
models = {}
predictions = {}

lr = LinearRegression()
lr.fit(X_train, y_train)
lr_y_pred = lr.predict(X_test)
models['Linear Regression'] = lr
predictions['Linear Regression'] = lr_y_pred

rdr = RandomForestRegressor()
rdr.fit(X_train, y_train)
rdr_y_pred = rdr.predict(X_test)
models['Random Forest'] = rdr
predictions['Random Forest'] = rdr_y_pred

gbr = GradientBoostingRegressor()
gbr.fit(X_train, y_train)
gbr_y_pred = gbr.predict(X_test)
models['Gradient Boosting'] = gbr
predictions['Gradient Boosting'] = gbr_y_pred

In [20]:
def evaluate_models(y_true, y_pred, model_name):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print(f'\n==={model_name}===')
    print(f'Mean Absolute Error: {mae:.4f}')
    print(f'Mean Squared Error: {mse:.4f}')
    print(f'Root Mean Squared Error: {rmse:.4f}')
    print(f'R2 Score: {r2:.4f}')

    return {'MAE': mae, 'MSE': mse, 'RSME': rmse, 'R2': r2}

results = {}
for model_name, y_pred in predictions.items():
    results[model_name] = evaluate_models(y_test, y_pred, model_name)

print('\n==Comparison==')
comparison = pd.DataFrame(results)
print(comparison.round(2))



===Linear Regression===
Mean Absolute Error: 23.2658
Mean Squared Error: 1866.4955
Root Mean Squared Error: 43.2030
R2 Score: 0.9273

===Random Forest===
Mean Absolute Error: 23.4170
Mean Squared Error: 1768.0306
Root Mean Squared Error: 42.0480
R2 Score: 0.9311

===Gradient Boosting===
Mean Absolute Error: 22.7038
Mean Squared Error: 1675.3691
Root Mean Squared Error: 40.9313
R2 Score: 0.9347

==Comparison==
      Linear Regression  Random Forest  Gradient Boosting
MAE               23.27          23.42              22.70
MSE             1866.50        1768.03            1675.37
RSME              43.20          42.05              40.93
R2                 0.93           0.93               0.93
