# Load dataset

In [52]:
import pandas as pd
df = pd.read_csv('../data/athletes.csv')

### Create Response Variable

In [53]:
# There are Na values in some columns. Fill with 0
df['total_lift'] = df['candj'].fillna(0) + df['snatch'].fillna(0) + df['deadlift'].fillna(0) + df['backsq'].fillna(0)

### Split and Model

In [54]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import accuracy_score

vars = ['region','gender', 'eat', 'background', 'experience', 'schedule', 'howlong','age','height','weight','candj','snatch','deadlift','backsq']
cats = ['region','gender', 'eat', 'background', 'experience', 'schedule', 'howlong'] 

x = df[vars]
x = pd.get_dummies(x, columns=cats) # encode categorical variables
y = df['total_lift']

# train test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [55]:
hist_regressor = HistGradientBoostingRegressor(random_state=42)
hist_regressor.fit(x_train, y_train)

### Compute Metrics

In [59]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error

# Make predictions on the test set
y_pred = hist_regressor.predict(x_test)

# Compute regression metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)  # Calculate RMSE

# Display the metrics summary
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

Mean Absolute Error (MAE): 927.76
Mean Squared Error (MSE): 17458723521.14
Root Mean Squared Error (RMSE): 132131.46
