## Imports ##

In [2]:
import pandas as pd
import numpy as np

In [4]:
# Load the preprocessed meet results; later cells read this frame as `df`.
df = pd.read_csv('../data/2-preprocessed/cleanIPF_minimal.csv')

# Candidate column subset. In this cell it is only referenced by the
# commented-out lines below, so it currently has no effect on `df`.
cols = ['Sex', 'Age', 'BodyweightKg', 'TotalKg']

# Disabled: restrict `df` to `cols` and drop rows with missing values.
# df = df[cols]
# df = df.dropna()

# Every row whose Name is 'Imani Martinez', copied so that changes to
# `example_lifter` do not touch `df`.
example_lifter = df[df['Name'] == 'Imani Martinez'].copy()
print(example_lifter)


                 Name        Date Sex   Age  BodyweightKg  Best3SquatKg  \
72955  Imani Martinez  2017-01-21   M  23.0         73.60         122.5   
72956  Imani Martinez  2017-03-26   M  23.0         71.34         122.5   
72957  Imani Martinez  2017-08-05   M  23.0         72.50         142.5   
72958  Imani Martinez  2018-01-13   M  24.0         73.50         147.5   
72959  Imani Martinez  2018-05-12   M  24.0         72.60         152.5   
72960  Imani Martinez  2018-09-29   M  24.0         73.85         160.0   
72961  Imani Martinez  2019-01-26   M  25.0         73.80         160.0   
72962  Imani Martinez  2019-04-06   M  25.0         73.50         162.5   
72963  Imani Martinez  2019-11-10   M  25.0         73.50         162.5   
72964  Imani Martinez  2020-01-18   M  26.0         79.00         170.0   
72965  Imani Martinez  2020-07-05   M  26.0         82.34         182.5   
72966  Imani Martinez  2021-04-10   M  27.0         73.03         175.0   

       Best3BenchKg  Bes

## Feature engineering ##

In [5]:
def create_features(current_meet, previous_meet):
    """Extend one meet record with features computed from the lifter's earlier meets.

    Parameters
    ----------
    current_meet
        The meet being described. It is copied, never modified; the new
        feature entries are appended to the copy.
    previous_meet
        The lifter's earlier meets. Must be non-empty and provide the columns
        'Best3SquatKg', 'Best3BenchKg', 'Best3DeadliftKg' and 'TotalKg'.
        The first row is treated as the oldest meet and the last row as the
        most recent one, so the caller is responsible for the ordering.

    Returns
    -------
    A copy of ``current_meet`` with, in this order: the previous result per
    lift and total (``prev_*``), the mean/max/min of the earlier totals, the
    mean of each earlier lift (``avg_*``), and the average change per meet
    interval (``*_gain_per_meet``). The gains are 0 when there is only one
    earlier meet.
    """
    # (source column, previous-value key, average key, gain key) for each lift
    lift_table = (
        ('Best3SquatKg', 'prev_squat', 'avg_squat', 'squat_gain_per_meet'),
        ('Best3BenchKg', 'prev_bench', 'avg_bench', 'bench_gain_per_meet'),
        ('Best3DeadliftKg', 'prev_deadlift', 'avg_deadlift', 'deadlift_gain_per_meet'),
    )

    enriched = current_meet.copy()

    # 1. result at the most recent earlier meet
    for column, prev_key, _, _ in lift_table:
        enriched[prev_key] = previous_meet[column].iloc[-1]
    totals = previous_meet['TotalKg']
    enriched['prev_total'] = totals.iloc[-1]

    # 2. aggregates over the whole earlier history
    enriched['avg_total'] = totals.mean()
    enriched['max_total_ever'] = totals.max()
    enriched['min_total_ever'] = totals.min()
    for column, _, avg_key, _ in lift_table:
        enriched[avg_key] = previous_meet[column].mean()

    # 3. average change per interval between the first and last earlier meet
    intervals = len(previous_meet) - 1  # n meets -> n - 1 intervals
    if intervals < 1:
        # a single earlier meet has no interval to measure: all gains are 0
        enriched['total_gain_per_meet'] = 0
        for _, _, _, gain_key in lift_table:
            enriched[gain_key] = 0
        return enriched

    enriched['total_gain_per_meet'] = (totals.iloc[-1] - totals.iloc[0]) / intervals
    for column, _, _, gain_key in lift_table:
        history = previous_meet[column]
        enriched[gain_key] = (history.iloc[-1] - history.iloc[0]) / intervals

    return enriched

    

## Handle one lifter

In [6]:
def process_single_lifter(lifter_data):
    """Turn one lifter's meet history into one feature row per meet after the first.

    For each meet from the second row onward, ``create_features`` is called
    with that row as the current meet and all rows before it as the history.
    Rows are taken in their existing order, so ``lifter_data`` should already
    be ordered oldest-first.

    Returns a DataFrame with one row per processed meet; it is empty when
    ``lifter_data`` has fewer than two rows.
    """
    feature_rows = [
        create_features(lifter_data.iloc[position], lifter_data.iloc[:position])
        for position in range(1, len(lifter_data))
    ]
    return pd.DataFrame(feature_rows)


In [7]:
# Sanity check on a single lifter: pull every row recorded under this name.
example_lifter = df.loc[df['Name'] == 'Imani Martinez']
print(f"Imani Martinez has {len(example_lifter)} meets")

# One feature row is produced per meet after the first.
imani_features = process_single_lifter(example_lifter)
print(f"Created {len(imani_features)} training examples (excludes first meet)")

print("\nFeature columns created:")
print(list(imani_features.columns))

# Current-meet values next to the running averages of the earlier meets.
print(imani_features[[
    'Age', 'BodyweightKg',
    'Best3SquatKg', 'Best3BenchKg', 'Best3DeadliftKg',
    'avg_squat', 'avg_bench', 'avg_deadlift',
]])

Imani Martinez has 12 meets
Created 11 training examples (excludes first meet)

Feature columns created:
['Name', 'Date', 'Sex', 'Age', 'BodyweightKg', 'Best3SquatKg', 'Best3BenchKg', 'Best3DeadliftKg', 'TotalKg', 'Dots', 'prev_squat', 'prev_bench', 'prev_deadlift', 'prev_total', 'avg_total', 'max_total_ever', 'min_total_ever', 'avg_squat', 'avg_bench', 'avg_deadlift', 'total_gain_per_meet', 'squat_gain_per_meet', 'bench_gain_per_meet', 'deadlift_gain_per_meet']
        Age  BodyweightKg  Best3SquatKg  Best3BenchKg  Best3DeadliftKg  \
72956  23.0         71.34         122.5          97.5            150.0   
72957  23.0         72.50         142.5         102.5            172.5   
72958  24.0         73.50         147.5         102.5            177.5   
72959  24.0         72.60         152.5         102.5            197.5   
72960  24.0         73.85         160.0         115.0            200.0   
72961  25.0         73.80         160.0         115.0            202.5   
72962  25.0    

## Handle all lifters

In [None]:
# Issue: the <x>_gain_per_meet features come out as 0 (e.g. when a lifter has only one previous meet).
# Could change this to require at least 3 meets per lifter?


def engineer_features(df, meets=2):
    """Build the engineered-feature table for every lifter with enough meets.

    The input is sorted by 'Name' then 'Date' on a re-indexed copy (the
    caller's frame is not modified) and grouped by 'Name'. Each group with at
    least ``meets`` rows is passed to ``process_single_lifter`` and the
    per-lifter results are concatenated.

    Parameters
    ----------
    df : pandas.DataFrame
        Meet results. Must contain 'Name' and 'Date' plus whatever columns
        ``process_single_lifter`` needs.
    meets : int, default 2
        Minimum number of meets a lifter must have to be included. This only
        filters lifters: ``process_single_lifter`` still emits a row for every
        meet after a lifter's first one.

    Returns
    -------
    pandas.DataFrame
        One row per training example, keeping the row labels of the sorted
        frame. When no lifter yields any example, an empty frame with the
        input columns is returned (the engineered columns are absent in that
        case) instead of raising.
    """
    print(f'Total lifters: {df["Name"].nunique()}')

    # sort again for now, might need to modify preprocessing later
    ordered = df.sort_values(['Name', 'Date']).reset_index(drop=True)

    all_lifting_data = []
    for _, lifter_data in ordered.groupby('Name'):  # each lifter's competition history
        if len(lifter_data) < meets:  # not enough history for this lifter
            continue
        features = process_single_lifter(lifter_data)
        # A lifter with a single meet produces no rows; skip the empty frame
        # so it cannot influence the concatenated result.
        if not features.empty:
            all_lifting_data.append(features)

    if all_lifting_data:
        result = pd.concat(all_lifting_data)
    else:
        # pd.concat raises ValueError on an empty list; return an empty frame
        # that still has a 'Name' column so the summary below works.
        result = ordered.iloc[0:0].copy()

    print("\nFeature engineering complete!")
    print(f"Training examples created: {len(result)}")
    # Report the threshold that was actually applied, not a hardcoded 2.
    print(f"{result['Name'].nunique()} lifters with {meets}+ meets")
    return result

# Build the feature table for the whole dataset using the default threshold (meets=2).
df_with_features = engineer_features(df)
# Persist the result for later stages; index=False keeps the row index out of the CSV.
df_with_features.to_csv('../data/3-features/engineered_features.csv', index=False)
