# 1. Load and Clean Data



In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
import mlflow.sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import os

# Location of the raw athletes CSV. The path is relative, so it is resolved
# against the kernel's current working directory.
file_path = 'data/athletes.csv'

# Read the whole CSV into the notebook-global DataFrame `df`, which the
# cleaning and preprocessing cells below consume.
df = pd.read_csv(file_path)

# Only reached when read_csv did not raise.
print("Data loaded successfully! ")

ModuleNotFoundError: No module named 'mlflow'

In [15]:
!pip install mlflow

Collecting mlflow
  Using cached mlflow-3.1.1-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==3.1.1 (from mlflow)
  Using cached mlflow_skinny-3.1.1-py3-none-any.whl.metadata (30 kB)
Collecting Flask<4 (from mlflow)
  Using cached flask-3.1.1-py3-none-any.whl.metadata (3.0 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Using cached alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Using cached docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Using cached graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Using cached gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting matplotlib<4 (from mlflow)
  Using cached matplotlib-3.10.3-cp312-cp312-macosx_10_13_x86_64.whl.metadata (11 kB)
Collecting numpy<3 (from mlflow)
  Using cached numpy-2.3.1-cp312-cp312-macosx_10_13_x86_64.whl.metadata (62 kB)
Collecting pandas<3 (from mlflow)
  Using cached pa

In [6]:
import numpy as np

# Clean the dataset.
#
# NOTE: `df` is reassigned step by step, and the column drop below removes
# 'train' (among others). Re-running this cell on an already-cleaned `df`
# therefore raises KeyError in the first dropna; reload the CSV first.

# Drop rows that are missing any field required downstream. 'train' must be
# present on a row for it to be kept, even though the column itself is removed
# right after. (Repeated entries from the original list were removed; pandas
# treats the subset as a set of labels, so the result is the same.)
required_columns = ['region', 'age', 'weight', 'height', 'howlong', 'gender', 'eat',
                    'train', 'background', 'experience', 'schedule',
                    'deadlift', 'candj', 'snatch', 'backsq']
df = df.dropna(subset=required_columns)

# Remove identifier columns and benchmark columns that are not used below.
df = df.drop(columns=['affiliate', 'team', 'name', 'athlete_id', 'fran', 'helen', 'grace',
                      'filthy50', 'fgonebad', 'run400', 'run5k', 'pullups', 'train'])

# Remove Outliers

df = df[df['weight'] < 1500]
df = df[df['gender'] != '--']          # '--' is the placeholder for an unspecified gender
df = df[df['age'] >= 18]
df = df[(df['height'] < 96) & (df['height'] > 48)]

# Deadlift: must be positive and no larger than a gender-specific cap
# (636 for 'Female', 1105 for everyone else).
#
# The previous expression was `A & B | (C & D)`. Because `&` binds tighter than
# `|`, it evaluated as `(deadlift in (0, 1105]) | (Female & deadlift <= 636)`.
# The second clause could only ADD rows with deadlift <= 0 for women, and the
# 636 cap was never enforced. The explicit grouping below applies the cap that
# the original condition was evidently meant to express.
is_female = df['gender'] == 'Female'
deadlift_within_cap = (is_female & (df['deadlift'] <= 636)) | (~is_female & (df['deadlift'] <= 1105))
df = df[(df['deadlift'] > 0) & deadlift_within_cap]

df = df[(df['candj'] > 0) & (df['candj'] <= 395)]
df = df[(df['snatch'] > 0) & (df['snatch'] <= 496)]
df = df[(df['backsq'] > 0) & (df['backsq'] <= 1069)]

# Clean Survey Data

# Treat an explicit "Decline to answer|" cell as missing, then drop rows where
# any survey field is missing.
decline_dict = {'Decline to answer|': np.nan}
df = df.replace(decline_dict)
df = df.dropna(subset=['background', 'experience', 'schedule', 'howlong', 'eat'])

df.head()


Unnamed: 0,region,gender,age,height,weight,candj,snatch,deadlift,backsq,eat,background,experience,schedule,howlong
21,Southern California,Male,30.0,71.0,200.0,235.0,175.0,385.0,315.0,I eat whatever is convenient|,I played youth or high school level sports|I p...,I began CrossFit by trying it alone (without a...,I do multiple workouts in a day 1x a week|I ty...,1-2 years|
22,Africa,Male,28.0,70.0,176.0,187.0,134.0,335.0,254.0,I eat 1-3 full cheat meals per week|,I have no athletic background besides CrossFit|,I began CrossFit with a coach (e.g. at an affi...,I do multiple workouts in a day 1x a week|,2-4 years|
27,North East,Male,35.0,68.0,225.0,285.0,205.0,440.0,405.0,I eat quality foods but don't measure the amount|,I played youth or high school level sports|,I began CrossFit with a coach (e.g. at an affi...,I typically rest 4 or more days per month|,2-4 years|
50,North Central,Male,36.0,71.0,199.0,267.0,212.0,485.0,390.0,I eat quality foods but don't measure the amount|,I played youth or high school level sports|I p...,I began CrossFit with a coach (e.g. at an affi...,I do multiple workouts in a day 3+ times a wee...,1-2 years|
60,North East,Male,36.0,64.0,155.0,245.0,180.0,415.0,385.0,I eat strict Paleo|,I played youth or high school level sports|I p...,I began CrossFit by trying it alone (without a...,I do multiple workouts in a day 2x a week|I st...,4+ years|


# 2. Data Preprocessing

In [8]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# --- 1. Target and feature selection ---
# The regression target is the row-wise sum of the four lift columns; it is
# written back onto `df` as a new 'total_lift' column.
target_column = 'total_lift'
lift_columns = ['candj', 'snatch', 'deadlift', 'backsq']
df[target_column] = df[lift_columns].sum(axis=1)

feature_columns = ['gender', 'age', 'height', 'weight', 'candj', 'snatch', 'deadlift', 'backsq', 'region']

# Independent copy restricted to the selected columns plus the target, so the
# encoding step below does not touch `df`.
df_processed = df[[*feature_columns, target_column]].copy()


# --- 2. Integer-encode the categorical columns ---
# A fresh LabelEncoder is fitted for each column and its output is stored next
# to the original under '<column>_encoded'. Only the last fitted encoder
# remains bound to `encoder` after the loop.
for col in ('gender', 'region'):
    encoder = LabelEncoder()
    df_processed[f'{col}_encoded'] = encoder.fit_transform(df_processed[col])

# --- 3. Final frame for the next step ---
# Remove the raw categorical columns, then move the target to the last position.
df_final = df_processed.drop(columns=['gender', 'region'])
df_final = df_final[[name for name in df_final.columns if name != target_column] + [target_column]]


print("\nPreprocessing for regression task complete.")
df_final.head()


Preprocessing for regression task complete.


Unnamed: 0,age,height,weight,candj,snatch,deadlift,backsq,gender_encoded,region_encoded,total_lift
21,30.0,71.0,200.0,235.0,175.0,385.0,315.0,1,16,1110.0
22,28.0,70.0,176.0,187.0,134.0,335.0,254.0,1,0,910.0
27,35.0,68.0,225.0,285.0,205.0,440.0,405.0,1,10,1335.0
50,36.0,71.0,199.0,267.0,212.0,485.0,390.0,1,9,1354.0
60,36.0,64.0,155.0,245.0,180.0,415.0,385.0,1,10,1225.0


# 3. Create 2 Versions of Features

In [10]:
# Feature Version 1: Basic features
def get_feature_version_1(data):
    """Build the baseline feature set and the regression target.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'age', 'height', 'weight', 'gender_encoded',
        'region_encoded' and 'total_lift'.

    Returns
    -------
    tuple
        ``(features, target)`` where ``features`` is an independent copy of the
        five basic columns and ``target`` is the 'total_lift' column of ``data``.
    """
    basic_columns = ['age', 'height', 'weight', 'gender_encoded', 'region_encoded']
    # .copy() so later edits to the feature frame never write through to `data`.
    basic_features = data[basic_columns].copy()
    return basic_features, data['total_lift']

# Feature Version 2: Engineered features (with BMI)
def get_feature_version_2(data):
    """Build the engineered feature set (BMI replaces height/weight) and the target.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'age', 'height', 'weight', 'gender_encoded',
        'region_encoded' and 'total_lift'.

    Returns
    -------
    tuple
        ``(features, target)`` where ``features`` has the columns
        'age', 'gender_encoded', 'region_encoded', 'bmi' (in that order) and
        ``target`` is the 'total_lift' column of ``data``.
    """
    base = data[['age', 'height', 'weight', 'gender_encoded', 'region_encoded']]

    # Scale weight by 0.453592 and height by 0.0254 (the pound->kilogram and
    # inch->metre factors) before applying BMI = mass / height^2.
    mass_kg = base['weight'] * 0.453592
    stature_m = base['height'] * 0.0254

    # assign() returns a new frame with 'bmi' appended; the raw height and
    # weight are then removed so this version is distinct from version 1.
    engineered = base.assign(bmi=mass_kg / (stature_m ** 2)).drop(columns=['height', 'weight'])

    return engineered, data['total_lift']

# 4. Model Training Component