## Imports ##

In [1]:
import pandas as pd
import numpy as np

In [11]:
# Load the preprocessed IPF meet results (path is relative to the notebook's folder).
df = pd.read_csv('../data/2-preprocessed/cleanIPF.csv')

# Candidate column subset; currently unused because the filter below is disabled.
cols = ['Sex', 'Age', 'BodyweightKg', 'TotalKg']

# Left disabled: the feature-engineering cells below read 'Name', 'Date' and the
# per-lift 'Best3*Kg' columns, none of which are in `cols`.
# df = df[cols]
# df = df.dropna()

# Pull one lifter's meet history as an independent copy and show the key lifts.
example_lifter = df.loc[df['Name'] == 'Imani Martinez'].copy()
print(example_lifter[[
    'Date',
    'Best3SquatKg',
    'Best3BenchKg',
    'Best3DeadliftKg',
    'TotalKg',
    'BodyweightKg',
]])


             Date  Best3SquatKg  Best3BenchKg  Best3DeadliftKg  TotalKg  \
77740  2017-01-21         122.5         100.0            137.5    360.0   
77741  2017-03-26         122.5          97.5            150.0    370.0   
77742  2017-08-05         142.5         102.5            172.5    417.5   
77743  2018-01-13         147.5         102.5            177.5    427.5   
77744  2018-05-12         152.5         102.5            197.5    452.5   
77745  2018-09-29         160.0         115.0            200.0    475.0   
77746  2019-01-26         160.0         115.0            202.5    477.5   
77747  2019-04-06         162.5         117.5            207.5    487.5   
77748  2019-11-10         162.5         120.0            217.5    500.0   
77749  2020-01-18         170.0         112.5            217.5    500.0   
77750  2020-07-05         182.5         125.0            220.0    527.5   
77751  2021-04-10         175.0         117.5            217.5    510.0   

       BodyweightKg  
77

## Feature engineering ##

In [12]:
def create_features(current_meet, previous_meet):
    """Extend one meet record with features derived from earlier meets.

    Parameters
    ----------
    current_meet
        Record of the meet being described. It is copied, never modified;
        in this notebook the caller passes a pandas Series (one row).
    previous_meet
        DataFrame of the earlier meets, oldest first. Must contain the columns
        'Best3SquatKg', 'Best3BenchKg', 'Best3DeadliftKg' and 'TotalKg', and
        at least one row (the ``.iloc[-1]`` lookups fail on an empty frame).

    Returns
    -------
    A copy of ``current_meet`` with these entries appended, in order:
    ``prev_*`` (values from the last earlier meet), ``avg_total``,
    ``max_total_ever``, ``min_total_ever``, ``avg_squat``, ``avg_bench``,
    ``avg_deadlift`` and the four ``*_gain_per_meet`` rates.
    """
    features = current_meet.copy()

    # Look each history column up once and reuse it below.
    squats = previous_meet['Best3SquatKg']
    benches = previous_meet['Best3BenchKg']
    deadlifts = previous_meet['Best3DeadliftKg']
    totals = previous_meet['TotalKg']

    # 1. Results from the most recent earlier meet.
    features['prev_squat'] = squats.iloc[-1]
    features['prev_bench'] = benches.iloc[-1]
    features['prev_deadlift'] = deadlifts.iloc[-1]
    features['prev_total'] = totals.iloc[-1]

    # 2. Summary statistics over every earlier meet.
    features['avg_total'] = totals.mean()
    features['max_total_ever'] = totals.max()
    features['min_total_ever'] = totals.min()

    features['avg_squat'] = squats.mean()
    features['avg_bench'] = benches.mean()
    features['avg_deadlift'] = deadlifts.mean()

    # 3. Average change per meet between the first and last earlier meets.
    #    n meets span n - 1 intervals; with a single earlier meet there is no
    #    interval to measure, so every rate falls back to 0.
    intervals = len(previous_meet) - 1
    gain_sources = (
        ('total_gain_per_meet', totals),
        ('squat_gain_per_meet', squats),
        ('bench_gain_per_meet', benches),
        ('deadlift_gain_per_meet', deadlifts),
    )
    for key, history in gain_sources:
        if intervals > 0:
            features[key] = (history.iloc[-1] - history.iloc[0]) / intervals
        else:
            features[key] = 0

    return features

    

## Handle one lifter

In [9]:
def process_single_lifter(lifter_data):
    """Build one feature row per meet, starting from the lifter's second meet.

    Each meet at position ``i >= 1`` is paired with all rows before it
    (``lifter_data.iloc[:i]``) and passed to ``create_features``. The first
    meet has no history, so it yields no row.

    Parameters
    ----------
    lifter_data
        DataFrame of one lifter's meets. Rows are taken in their existing
        order, so the caller is expected to have sorted them chronologically.

    Returns
    -------
    pandas.DataFrame
        ``len(lifter_data) - 1`` rows, or an empty frame when there are fewer
        than two meets.
    """
    rows = [
        create_features(lifter_data.iloc[pos], lifter_data.iloc[:pos])
        for pos in range(1, len(lifter_data))
    ]
    return pd.DataFrame(rows)


In [13]:
# Sanity check on a single lifter before running the whole dataset.
example_lifter = df.loc[df['Name'] == 'Imani Martinez']
print(f"Imani Martinez has {len(example_lifter)} meets")

# Each meet after the first becomes one training example.
imani_features = process_single_lifter(example_lifter)
print(f"Created {len(imani_features)} training examples (excludes first meet)")

# List the original columns plus the engineered ones.
print("\nFeature columns created:")
print(list(imani_features.columns))

Imani Martinez has 12 meets
Created 11 training examples (excludes first meet)

Feature columns created:
['Name', 'Sex', 'Event', 'Equipment', 'Age', 'AgeClass', 'BirthYearClass', 'Division', 'BodyweightKg', 'WeightClassKg', 'Squat1Kg', 'Squat2Kg', 'Squat3Kg', 'Squat4Kg', 'Best3SquatKg', 'Bench1Kg', 'Bench2Kg', 'Bench3Kg', 'Bench4Kg', 'Best3BenchKg', 'Deadlift1Kg', 'Deadlift2Kg', 'Deadlift3Kg', 'Deadlift4Kg', 'Best3DeadliftKg', 'TotalKg', 'Place', 'Dots', 'Wilks', 'Glossbrenner', 'Goodlift', 'Tested', 'Country', 'State', 'Federation', 'ParentFederation', 'Date', 'MeetCountry', 'MeetState', 'MeetTown', 'MeetName', 'Sanctioned', 'prev_squat', 'prev_bench', 'prev_deadlift', 'prev_total', 'avg_total', 'max_total_ever', 'min_total_ever', 'avg_squat', 'avg_bench', 'avg_deadlift', 'total_gain_per_meet', 'squat_gain_per_meet', 'bench_gain_per_meet', 'deadlift_gain_per_meet']


## Handle all lifters

In [None]:
def engineer_features(df):
    """Build history-based training rows for every lifter in ``df``.

    Rows are sorted by ``['Name', 'Date']``, grouped by 'Name', and each group
    with at least two meets is passed to ``process_single_lifter``. Lifters
    with a single meet are skipped because they have no earlier meet to derive
    features from.

    Parameters
    ----------
    df : pandas.DataFrame
        Meet results with at least 'Name' and 'Date' columns, plus whatever
        columns ``process_single_lifter`` needs. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        The per-lifter feature frames concatenated together. If no lifter has
        two or more meets (including an empty ``df``), an empty frame with the
        input's columns is returned instead of letting ``pd.concat`` raise
        ``ValueError`` on an empty list; the engineered columns are absent in
        that case because no feature row was ever built.
    """
    # Sort here rather than relying on preprocessing order: the per-lifter
    # step treats row position as chronological order.
    # TODO: drop this sort once preprocessing guarantees the ordering.
    ordered = df.sort_values(['Name', 'Date']).reset_index(drop=True)

    per_lifter = [
        process_single_lifter(history)
        for _, history in ordered.groupby('Name')
        if len(history) >= 2  # need at least one earlier meet to predict from
    ]

    if not per_lifter:
        # pd.concat([]) raises "No objects to concatenate"; return an empty
        # result so callers can handle "nothing to train on" uniformly.
        return pd.DataFrame(columns=df.columns)

    return pd.concat(per_lifter)