## Imports ##

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [None]:
# Load the meet data CSV from the '2-preprocessed' data folder (path is
# relative to the notebook's working directory).
df = pd.read_csv('../data/2-preprocessed/cleanIPF_minimal.csv')

# Column subset; in the visible code it is only referenced by the
# commented-out filtering lines that follow.
cols = ['Sex', 'Age', 'BodyweightKg', 'TotalKg']

# df = df[cols]
# df = df.dropna()

# example_lifter = df[df['Name'] == 'Imani Martinez'].copy()
# print(example_lifter)


## Feature engineering ##

In [None]:
def create_features(current_meet, previous_meet):
    """Return a copy of ``current_meet`` extended with history-based features.

    Args:
        current_meet: The meet being described (a row of the meet table).
            It is copied, never modified.
        previous_meet: The lifter's earlier meets, oldest first. Must hold at
            least one row and the columns ``Best3SquatKg``, ``Best3BenchKg``,
            ``Best3DeadliftKg`` and ``TotalKg``.

    Returns:
        The copied row with ``prev_*``, ``avg_*``, ``max_total_ever``,
        ``min_total_ever`` and ``*_gain_per_meet`` entries appended.
    """
    lift_columns = (
        ('squat', 'Best3SquatKg'),
        ('bench', 'Best3BenchKg'),
        ('deadlift', 'Best3DeadliftKg'),
    )
    totals = previous_meet['TotalKg']
    row = current_meet.copy()

    # 1. most recent results before this meet
    for lift, column in lift_columns:
        row[f'prev_{lift}'] = previous_meet[column].iloc[-1]
    row['prev_total'] = totals.iloc[-1]

    # 2. summary statistics over the whole history
    row['avg_total'] = totals.mean()
    row['max_total_ever'] = totals.max()
    row['min_total_ever'] = totals.min()
    for lift, column in lift_columns:
        row[f'avg_{lift}'] = previous_meet[column].mean()

    # 3. average change per meet between the first and the latest prior meet
    history_length = len(previous_meet)
    if history_length <= 1:
        # A single prior meet gives no interval to measure, so gains are 0.
        row['total_gain_per_meet'] = 0
        for lift, _ in lift_columns:
            row[f'{lift}_gain_per_meet'] = 0
        return row

    intervals = history_length - 1  # n meets -> n - 1 intervals
    row['total_gain_per_meet'] = (totals.iloc[-1] - totals.iloc[0]) / intervals
    for lift, column in lift_columns:
        lift_history = previous_meet[column]
        row[f'{lift}_gain_per_meet'] = (
            lift_history.iloc[-1] - lift_history.iloc[0]) / intervals

    return row

# TODO: try engineering more features

    

## Handle one lifter and all lifters

In [None]:
def process_single_lifter(lifter_data):
    """Build one feature row per meet of a single lifter, skipping the first.

    For the meet at position ``pos`` (1, 2, ...), the rows before ``pos`` are
    passed to ``create_features`` as that meet's history.

    Args:
        lifter_data: All meets of one lifter; rows are taken in their
            existing order, so the caller is expected to sort them.

    Returns:
        A DataFrame with ``len(lifter_data) - 1`` rows, or an empty DataFrame
        when there are fewer than two meets.
    """
    feature_rows = [
        create_features(lifter_data.iloc[pos], lifter_data.iloc[:pos])
        for pos in range(1, len(lifter_data))
    ]
    return pd.DataFrame(feature_rows)

# issue with <x>_gain_per_meet resulting in a 0 value (only one previous meet)
# could change to require 3 meets?
def engineer_features(df, meets=2):
    """Create training examples for every lifter with enough meet history.

    The table is sorted by ``Name`` and ``Date``, grouped per lifter, and
    each qualifying lifter is passed to ``process_single_lifter``. Progress
    and summary lines are printed.

    Args:
        df: Meet table with at least ``Name`` and ``Date`` columns, plus
            whatever columns ``process_single_lifter`` needs.
        meets: Minimum number of meets a lifter must have to be included.
            Values below 2 behave like 2, because a lifter with a single
            meet has no earlier meet to build features from.

    Returns:
        The concatenated per-lifter feature rows. When no lifter qualifies,
        an empty DataFrame with the input's columns is returned instead of
        letting ``pd.concat`` raise on an empty list.
    """
    print(f'Total lifters: {df["Name"].nunique()}')

    # sort again for now, might need to modify preprocessing later
    df = df.sort_values(['Name', 'Date']).reset_index(drop=True)

    # At least two meets are needed to produce a single training example.
    required_meets = max(meets, 2)

    all_lifting_data = []
    for _, lifter_data in df.groupby('Name'):  # each lifter's competition history
        if len(lifter_data) < required_meets:
            continue
        all_lifting_data.append(process_single_lifter(lifter_data))

    if all_lifting_data:
        result = pd.concat(all_lifting_data)
    else:
        # pd.concat raises ValueError on an empty list; return an empty table
        # that still has the input columns so the summary below works.
        result = pd.DataFrame(columns=df.columns)

    print("\nFeature engineering complete!")
    print(f"Training examples created: {len(result)}")
    print(f"{result['Name'].nunique()} lifters with {required_meets}+ meets")
    return result

# Build the feature table from the loaded meet data (default meets=2) and
# write it to the '3-features' folder without the index column.
df_with_features = engineer_features(df)
df_with_features.to_csv('../data/3-features/engineered_features.csv', index=False)



In [None]:
# Sanity check on a single lifter. The rows are sorted by Date first because
# process_single_lifter treats row order as meet order (engineer_features
# sorts its own copy, the module-level df is left as loaded).
example_lifter = df[df['Name'] == 'Imani Martinez'].sort_values('Date')
print(f"Imani Martinez has {len(example_lifter)} meets")

imani_features = process_single_lifter(example_lifter)
print(f"Created {len(imani_features)} training examples (excludes first meet)")
print("\nFeature columns created:")
print(list(imani_features.columns))
print(imani_features[['Age', 'BodyweightKg', 'Best3SquatKg', 'Best3BenchKg', 'Best3DeadliftKg', 'avg_squat', 'avg_bench', 'avg_deadlift']])

In [None]:
# r2_score is provided by sklearn.metrics (sklearn.base does not export it).
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Model inputs: only history-derived features and demographics.
feature_columns = [
    # previous performance
    'prev_squat', 'prev_bench', 'prev_deadlift', 'prev_total',
    # average/historical performance
    'avg_squat', 'avg_bench', 'avg_deadlift', 'avg_total',
    'max_total_ever', 'min_total_ever',
    # improvement per meet
    'squat_gain_per_meet', 'bench_gain_per_meet', 'deadlift_gain_per_meet', 'total_gain_per_meet',
    # demographics
    'Age', 'BodyweightKg'
]

df_with_features['Date'] = pd.to_datetime(df_with_features['Date'])
df_with_features = df_with_features.sort_values('Date').reset_index(drop=True)

# Chronological split: rows dated before the 80% date quantile are used for
# training, the rest for testing.
split_date = df_with_features['Date'].quantile(0.8)
train_df = df_with_features[df_with_features['Date'] < split_date]
test_df = df_with_features[df_with_features['Date'] >= split_date]

X_train = train_df[feature_columns]
X_test = test_df[feature_columns]
y_train = train_df['TotalKg']
y_test = test_df['TotalKg']

gbr = GradientBoostingRegressor()
gbr.fit(X_train, y_train)
gbr_y_pred = gbr.predict(X_test)

# Evaluation on the held-out rows; RMSE is derived from the single MSE value.
mae = mean_absolute_error(y_test, gbr_y_pred)
mse = mean_squared_error(y_test, gbr_y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, gbr_y_pred)