In [2]:
# Feature Engineering - Part 1: Weekly Aggregations

import pandas as pd
import numpy as np

# 1. Load the cleaned data
data = pd.read_csv('../data/cleaned-data.csv')
data['timestamp'] = pd.to_datetime(data['timestamp'])

In [4]:
# Sort data by athlete_id and date
data = data.sort_values(by=['athlete', 'timestamp'])

# Make a week column since it is faster to resample by week number
data['week'] = data['timestamp'].dt.to_period('W')

# Group by athlete and week
weekly_data = data.groupby(['athlete', 'week']).agg({
    'distance (m)': 'sum',
    'distance_miles': 'sum',
    'elapsed time (s)': 'sum',
    'pace_min_per_km': 'mean',
    'pace_min_per_mile': 'mean',
    'elevation gain (m)': 'sum',
    'average heart rate (bpm)': 'mean',
    'timestamp': 'count'  # Count runs
}).rename(columns={'timestamp': 'training_days'})

# rest index
weekly_data.reset_index(inplace=True)
weekly_data['week'] = weekly_data['week'].dt.to_timestamp()

# rename columns
weekly_data.rename(columns={
    'distance (m)': 'weekly_distance_m',
    'distance_miles': 'weekly_mileage',
    'elapsed time (s)': 'weekly_time_s',
    'pace_min_per_km': 'avg_weekly_pace_km',
    'pace_min_per_mile': 'avg_weekly_pace_mile',
    'elevation gain (m)': 'weekly_elevation_m',
    'week': 'timestamp'
}, inplace=True)

print(f"Weekly aggregated data: {len(weekly_data)} rows")
weekly_data.head(10)

Weekly aggregated data: 14736 rows


Unnamed: 0,athlete,timestamp,weekly_distance_m,weekly_mileage,weekly_time_s,avg_weekly_pace_km,avg_weekly_pace_mile,weekly_elevation_m,average heart rate (bpm),training_days
0,771514,2015-09-14,3927.8,2.440621,3484,14.783509,23.791759,29.2,,1
1,771514,2015-10-26,1391.1,0.864389,644,7.715717,12.417246,32.1,,1
2,771514,2016-07-11,2116.8,1.315318,779,6.133472,9.870869,0.0,,1
3,771514,2017-02-13,5355.4,3.32769,3371,10.490969,16.883583,81.6,,1
4,771514,2017-06-26,6243.5,3.87953,2137,5.628635,9.058413,37.2,,2
5,771514,2017-07-03,6630.2,4.119814,2306,5.796708,9.3289,95.6,162.1,1
6,771514,2017-07-10,16955.9,10.535905,6086,5.96464,9.59916,177.8,154.2,2
7,771514,2017-07-17,6191.1,3.84697,2216,5.965553,9.600629,57.3,145.0,1
8,771514,2017-07-24,7843.2,4.873537,2709,5.756579,9.264319,40.2,160.1,1
9,771514,2017-07-31,4564.9,2.836496,3313,12.09592,19.466503,29.3,,1


In [None]:
# Feature Engineering - Part 2: Advanced Features

# 7. Week-over-week mileage change (per athlete)
weekly_data['weekly_mileage_change'] = weekly_data.groupby('athlete')['weekly_mileage'].diff()

# 8. Consistency index (4-week rolling std per athlete)
weekly_data['consistency_index'] = weekly_data.groupby('athlete')['weekly_mileage'].transform(
    lambda x: x.rolling(window=4, min_periods=2).std()
)

# 9. Recovery ratio - FIX: Cap training_days at 7 max
weekly_data['actual_training_days'] = weekly_data['training_days'].clip(upper=7)
weekly_data['rest_days'] = 7 - weekly_data['actual_training_days']
weekly_data['recovery_ratio'] = weekly_data['rest_days'] / weekly_data['actual_training_days'].replace(0, 1)
weekly_data.loc[weekly_data['recovery_ratio'] == 0, 'recovery_ratio'] = 0.1

# 10. Fatigue index
weekly_data['fatigue_index'] = weekly_data['weekly_mileage'] / weekly_data['recovery_ratio']

# 11. Cumulative mileage (per athlete)
weekly_data['cumulative_mileage'] = weekly_data.groupby('athlete')['weekly_mileage'].cumsum()

# 12. Training intensity
weekly_data['training_intensity'] = 1 / weekly_data['avg_weekly_pace_km']

print(f"\nAdvanced features added!")
rows_before = len(weekly_data)

# 13. Filter for recreational runners only
weekly_data = weekly_data[weekly_data['weekly_mileage'] <= 70]
weekly_data = weekly_data[
    (weekly_data['weekly_mileage_change'].isna()) | 
    (weekly_data['weekly_mileage_change'].abs() <= 40)
]
weekly_data = weekly_data[weekly_data['weekly_mileage'] >= 3]

rows_after = len(weekly_data)
print(f"Rows removed: {rows_before - rows_after}")
print(f"Final dataset: {rows_after} rows, {len(weekly_data.columns)} columns")
weekly_data.describe()



Advanced features added!
Rows removed: 1
Final dataset: 14234 rows, 18 columns


Unnamed: 0,athlete,timestamp,weekly_distance_m,weekly_mileage,weekly_time_s,avg_weekly_pace_km,avg_weekly_pace_mile,weekly_elevation_m,average heart rate (bpm),training_days,weekly_mileage_change,consistency_index,rest_days,recovery_ratio,fatigue_index,cumulative_mileage,training_intensity,actual_training_days
count,14234.0,14234,14234.0,14234.0,14234.0,14234.0,14234.0,14234.0,9034.0,14234.0,14118.0,14118.0,14234.0,14234.0,14234.0,14234.0,14234.0,14234.0
mean,14997690.0,2018-01-25 02:57:32.803147264,30539.924266,18.976623,11176.543417,6.078378,9.782204,640.000597,149.38986,2.787691,0.025047,7.035015,4.23753,2.470082,34.709545,1570.990517,0.170554,2.76247
min,771514.0,2009-12-28 00:00:00,4831.2,3.001968,1039.0,2.049231,3.297919,0.0,62.0,1.0,-39.763146,0.000352,0.0,0.1,0.500328,3.206088,0.066667,1.0
25%,6583234.0,2017-04-10 00:00:00,15380.175,9.556795,5449.0,5.274124,8.487882,136.0,141.527083,2.0,-6.509203,4.168288,3.0,0.75,3.400273,467.254603,0.154173,2.0
50%,13892790.0,2018-04-30 00:00:00,27190.25,16.895233,9636.5,5.78568,9.311152,337.0,149.333333,3.0,0.095909,6.404202,4.0,1.333333,9.778863,1119.294938,0.172841,3.0
75%,21299970.0,2019-03-04 00:00:00,41270.375,25.644214,15189.0,6.486239,10.438593,850.15,157.55,4.0,6.66352,9.227906,5.0,2.5,27.023836,2206.689388,0.189605,4.0
max,46817580.0,2020-01-06 00:00:00,112180.6,69.705772,64573.0,15.0,24.140167,6713.9,209.8,17.0,39.832864,29.450093,6.0,6.0,697.057716,9432.021932,0.487988,7.0
std,8922115.0,,18921.375768,11.757194,7395.320759,1.308215,2.105368,768.99504,13.396644,1.582114,11.151172,3.887111,1.488047,2.027294,80.60589,1514.177403,0.029646,1.488047


In [17]:
# Investigate extreme fatigue values
extreme_fatigue = weekly_data[weekly_data['fatigue_index'] > 200].sort_values('fatigue_index', ascending=False)
print(f"Weeks with extreme fatigue (>200): {len(extreme_fatigue)}")
print("\nTop 10 highest fatigue weeks:")
print(extreme_fatigue[['athlete', 'weekly_mileage', 'recovery_ratio', 'fatigue_index', 'actual_training_days', 'rest_days']].head(10))

Weeks with extreme fatigue (>200): 659

Top 10 highest fatigue weeks:
        athlete  weekly_mileage  recovery_ratio  fatigue_index  \
6785   13435425       69.705772             0.1     697.057716   
11256  22538702       69.583237             0.1     695.832372   
6858   13435425       69.466606             0.1     694.666059   
10577  20181492       69.374084             0.1     693.740838   
13663  27655563       69.147905             0.1     691.479047   
11227  22538702       68.906875             0.1     689.068749   
6887   13435425       68.792418             0.1     687.924184   
10623  20181492       68.573882             0.1     685.738822   
6872   13435425       68.440101             0.1     684.401010   
6923   13435425       67.957979             0.1     679.579793   

       actual_training_days  rest_days  
6785                      7          0  
11256                     7          0  
6858                      7          0  
10577                     7          0 

In [18]:
# 14. Save the featured dataset
weekly_data.to_csv('../data/featured-data.csv', index=False)
print("Featured data saved to '../data/featured-data.csv'")

Featured data saved to '../data/featured-data.csv'
