# Load dataset

In [35]:
import pandas as pd
df = pd.read_csv('../data/athletes.csv')

In [30]:
%store -r df

### Create Response Variable

In [32]:
# There are Na values in some columns. Fill with 0
df['total_lift'] = df['candj'].fillna(0) + df['snatch'].fillna(0) + df['deadlift'].fillna(0) + df['backsq'].fillna(0)

### Split and Model

In [33]:
from sklearn.model_selection import train_test_split

vars = ['region','gender', 'eat', 'background', 'experience', 'schedule', 'howlong','age','height','weight','candj','snatch','deadlift','backsq']
cats = ['region','gender', 'eat', 'background', 'experience', 'schedule', 'howlong']
numcs = ['age','height','weight']

# x = pd.get_dummies(x, columns=cats) # encode categorical variables

x = df[numcs].fillna(0) # NAs in numeric columns, fill 0 if any

y = df['total_lift']

# train test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


In [34]:
import tensorflow as tf

# model hyperparameters
epochs = 10
batch_size = 32

# Define the model architecture
model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=(x_train.shape[1],)),   # Input layer for the number of features
    tf.keras.layers.Dense(64, activation='relu'),       # Hidden layer 1
    tf.keras.layers.Dense(32, activation='relu'),       # Hidden layer 2
    tf.keras.layers.Dense(1)                            # Output layer (for regression)
])


In [35]:
# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit(x_train, y_train, epochs=5, batch_size=batch_size)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x1eb40881e90>

### Compute Metrics

In [36]:
# Evaluate the model
test_loss = model.evaluate(x_test, y_test)
print(f'Test loss: {test_loss}')

# Make predictions
y_pred = model.predict(x_test)

Test loss: 42058.66796875


In [37]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Compute regression metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)  # Calculate RMSE

# Display the metrics summary
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

Mean Absolute Error (MAE): 161.45
Mean Squared Error (MSE): 42058.67
Root Mean Squared Error (RMSE): 205.08
