In [1]:
import pandas as pd

### Read and clean CSVs

# Variables
cutoff_date = pd.to_datetime('2023-01-01')

# Read CSVs
activities_df = pd.read_csv('activities.csv')
ftp_df = pd.read_csv('ftp.csv')

# Drop unnecessary columns.
# errors='ignore' skips any listed column that this particular export does not
# contain, instead of aborting the whole cell with a KeyError.
unused_columns = [
    'Activity Description', 'Commute', 'Activity Private Note', 'Activity Gear', 'Filename', 'Athlete Weight',
    'Bike Weight', 'Weather Observation Time', 'Weather Condition', 'Weather Temperature', 'Apparent Temperature',
    'Dewpoint', 'Humidity', 'Weather Pressure', 'Wind Speed', 'Wind Gust', 'Wind Bearing', 'Precipitation Intensity',
    'Sunrise Time', 'Sunset Time', 'Moon Phase', 'Gear', 'Precipitation Probability', 'Precipitation Type',
    'Cloud Cover', 'Weather Visibility', 'UV Index', 'Weather Ozone', 'Jump Count', 'Total Grit', 'Average Flow',
    'Flagged', 'Dirt Distance', 'Newly Explored Distance', 'Newly Explored Dirt Distance', 'Total Steps',
    'Carbon Saved', 'Pool Length', 'Timer Time', 'Media', 'Total Weight Lifted', 'From Upload', 'Commute.1',
    'Average Positive Grade', 'Average Negative Grade', 'Average Grade Adjusted Pace', 'Total Cycles',
    'Number of Runs', 'Uphill Time', 'Downhill Time', 'Other Time',
]
activities_df = activities_df.drop(columns=unused_columns, errors='ignore')

# Only keep Rides.
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the original (which triggers SettingWithCopyWarning).
activities_df = activities_df[activities_df['Activity Type'] == 'Ride'].copy()

# Convert to datetimes and numeric; unparseable values become NaT / NaN
activities_df['Activity Date'] = pd.to_datetime(activities_df['Activity Date'], errors='coerce')
ftp_df['FTP Test Date'] = pd.to_datetime(ftp_df['FTP Test Date'], errors='coerce')
activities_df['Distance'] = pd.to_numeric(activities_df['Distance'], errors='coerce')

# Sort both dataframes by date
ftp_df = ftp_df.sort_values(by='FTP Test Date')
activities_df = activities_df.sort_values(by='Activity Date')

# Only look at data starting at cutoff_date.
# Rows whose date failed to parse (NaT) compare False and are dropped here too.
ftp_df = ftp_df[ftp_df['FTP Test Date'] >= cutoff_date]
activities_df = activities_df[activities_df['Activity Date'] >= cutoff_date]

ModuleNotFoundError: No module named 'pandas'

In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.impute import SimpleImputer

### Create training blocks
#
# A training block is the span between two consecutive FTP tests. For each
# block the rides inside it are summarised into one feature row, and the
# FTP difference between the two tests is the target ('FTP Change').

# Define thresholds for Zone 2 and high-intensity rides
max_hr = 196  # Replace with your actual max heart rate if available
high_intensity_min_hr = max_hr * 0.95

# Initialize an empty list to store training blocks
training_blocks = []

# Iterate through the FTP test data to create training blocks
for i in range(len(ftp_df) - 1):
    # The block runs from this FTP test up to (but excluding) the next one
    block_start_test = ftp_df.iloc[i]
    block_end_test = ftp_df.iloc[i + 1]
    start_date = block_start_test['FTP Test Date']
    end_date = block_end_test['FTP Test Date']

    # Filter activities within the training block (half-open interval [start, end))
    block_activities = activities_df[(activities_df['Activity Date'] >= start_date) &
                                     (activities_df['Activity Date'] < end_date)]

    # Calculate block duration in weeks (.days drops any partial day)
    block_duration_weeks = (end_date - start_date).days / 7

    # # Calculate aggregated metrics for the block
    # avg_relative_effort = block_activities['Relative Effort'].mean()
    # avg_weighted_watts = block_activities['Weighted Average Power'].mean()

    # Calculate longest ride time.
    # .max() of an empty (or all-NaN) column is NaN, which would put a NaN into
    # the feature matrix; fall back to 0 like the other "no rides" defaults below.
    longest_ride_duration = block_activities['Moving Time'].max()
    if pd.isna(longest_ride_duration):
        longest_ride_duration = 0

    # Classify rides by their max heart rate relative to the threshold.
    # Rides with a missing 'Max Heart Rate' match neither comparison, so the
    # two ratios below do not necessarily sum to 1.
    zone_2_rides = block_activities[block_activities['Max Heart Rate'] < high_intensity_min_hr]
    high_intensity_rides = block_activities[block_activities['Max Heart Rate'] >= high_intensity_min_hr]

    # Calculate the ratio of Zone 2 and high-intensity rides
    total_rides = len(block_activities)
    zone_2_ratio = len(zone_2_rides) / total_rides if total_rides > 0 else 0
    high_intensity_ratio = len(high_intensity_rides) / total_rides if total_rides > 0 else 0

    # Calculate rides per week
    rides_per_week = total_rides / block_duration_weeks if block_duration_weeks > 0 else 0

    # Calculate distance per week.
    # NOTE(review): for a zero-length block this falls back to the raw distance
    # sum, whereas rides_per_week falls back to 0 - confirm this is intended.
    total_distance = block_activities['Distance'].sum()
    km_per_week = total_distance / block_duration_weeks if block_duration_weeks > 0 else total_distance

    # # Calculate Variability Index: ratio of normalized power to average power within the block (try to find interval training)
    # avg_power = block_activities['Average Watts'].mean()
    # norm_power = block_activities['Weighted Average Power'].mean()
    # variability_index = norm_power / avg_power if avg_power > 0 else 0

    # Calculate FTP change during the block
    ftp_change = block_end_test['FTP'] - block_start_test['FTP']

    # Store block information
    training_blocks.append({
        'Block Duration (weeks)': block_duration_weeks,
        'Rides Per Week': rides_per_week,
        'KM Per Week': km_per_week,
        # 'Average Relative Effort': avg_relative_effort,
        # 'Average Weighted Watts': avg_weighted_watts,
        # 'Variability Index': variability_index,
        'Zone 2 Ratio': zone_2_ratio,
        'High Intensity Ratio': high_intensity_ratio,
        'Log Longest Ride': np.log1p(longest_ride_duration),
        'FTP Change': ftp_change
    })

# Convert training blocks to a DataFrame (one row per block)
training_blocks_df = pd.DataFrame(training_blocks)


In [None]:
## Visualize the training blocks
# A bare last expression makes Jupyter render the DataFrame as the cell output
# (one row per training block, features plus the 'FTP Change' target).

training_blocks_df

In [4]:
### Prepare for training

# Target: FTP change over each block; features: every remaining column
y = training_blocks_df['FTP Change']
X = training_blocks_df.drop('FTP Change', axis='columns')

# Hold out 30% of the blocks for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [5]:
# ### Try hyperparameter tuning with an exhaustive grid search (GridSearchCV)

# from sklearn.model_selection import GridSearchCV
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_squared_error, r2_score

# # Define the parameter grid for RandomForestRegressor
# param_grid = {
#     'n_estimators': [100, 200, 300],       # Number of trees in the forest
#     'max_depth': [5, 10, 15, None],        # Maximum depth of the tree
#     'min_samples_split': [2, 5, 10],       # Minimum number of samples required to split an internal node
#     'min_samples_leaf': [1, 2, 4],         # Minimum number of samples required to be at a leaf node
#     'max_features': [1.0, 'sqrt', 'log2']  # Number of features to consider when looking for the best split
#                                            # ('auto' was removed in scikit-learn 1.3; 1.0 is its equivalent for regressors)
# }

# # Initialize the RandomForestRegressor
# rf_model = RandomForestRegressor(random_state=42)

# # Set up the GridSearchCV
# grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, 
#                            cv=5, scoring='r2', n_jobs=-1, verbose=1)

# # Fit GridSearchCV to the training data
# grid_search.fit(X_train, y_train)

# # Get the best hyperparameters
# best_params = grid_search.best_params_
# print(f"Best Hyperparameters: {best_params}")

# # Fit the best model to the training data
# best_rf_model = grid_search.best_estimator_
# y_pred = best_rf_model.predict(X_test)

# # Evaluate the best model's performance
# mse = mean_squared_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)

# print(f"Mean Squared Error: {mse}")
# print(f"R-squared: {r2}")

In [None]:
### Train a Random Forest Regressor

# Build a 100-tree forest and fit it on the training blocks
# (fit() returns the fitted estimator, so construction and training chain)
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out blocks and score the predictions
y_pred = model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(" ------ Random Forest Regressor  ------ ")

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

# Feature importance, most important first; the sorted Series stays in
# `feature_importance` so the plotting cell can iterate over it
print("\nFeature Importance:")
feature_importance = pd.Series(
    model.feature_importances_,
    index=X.columns,
).sort_values(ascending=False)
print(feature_importance)

In [None]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, r2_score

# LightGBM gradient-boosted trees: 100 boosting rounds at learning rate 0.1
model = lgb.LGBMRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)
model.fit(X_train, y_train)

# Predict the held-out blocks and score the predictions
y_pred = model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(" ------ LightGBM Regressor  ------ ")

print(f"LightGBM Mean Squared Error: {mse}")
print(f"LightGBM R-squared: {r2}")

In [None]:
from xgboost import XGBRegressor

# XGBoost gradient-boosted trees: 100 rounds, learning rate 0.1, depth capped at 5
model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
)
model.fit(X_train, y_train)

# Predict the held-out blocks and score the predictions
y_pred = model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(" ------ XGBoost Regressor  ------ ")

print(f"XGBoost Mean Squared Error: {mse}")
print(f"XGBoost R-squared: {r2}")

In [9]:
# from catboost import CatBoostRegressor

# # Initialize and train a CatBoost Regressor
# model = CatBoostRegressor(n_estimators=100, learning_rate=0.1, random_state=42, verbose=0)
# model.fit(X_train, y_train)

# # Make predictions on the test set
# y_pred = model.predict(X_test)

# # Evaluate the model
# mse = mean_squared_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)

# print(f" ------ CatBoost Regressor  ------ ")

# print(f"CatBoost Mean Squared Error: {mse}")
# print(f"CatBoost R-squared: {r2}")

In [None]:
from sklearn.linear_model import ElasticNet

# Linear model with a combined L1/L2 penalty (alpha=0.1, equal L1/L2 mix)
model = ElasticNet(
    alpha=0.1,
    l1_ratio=0.5,
    random_state=42,
)
model.fit(X_train, y_train)

# Predict the held-out blocks and score the predictions
y_pred = model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(" ------ ElasticNet Regressor  ------ ")


print(f"ElasticNet Mean Squared Error: {mse}")
print(f"ElasticNet R-squared: {r2}")

In [None]:
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler

# Standardize the features: the scaler is fitted on the training split only
# and then applied unchanged to both splits. The scaled arrays are kept as
# X_train_scaled / X_test_scaled so later cells can reuse them.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# RBF-kernel Support Vector Regressor trained on the scaled features
model = SVR(kernel='rbf', C=1.0, epsilon=0.1)
model.fit(X_train_scaled, y_train)

# Predict the (scaled) held-out blocks and score the predictions
y_pred = model.predict(X_test_scaled)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(" ------ Support Vector Regressor  ------ ")

print(f"SVR Mean Squared Error: {mse}")
print(f"SVR R-squared: {r2}")

In [None]:
from sklearn.neural_network import MLPRegressor

# Multi-layer perceptron with two hidden layers of 50 units, up to 500 iterations.
# Uses X_train_scaled / X_test_scaled produced by the Support Vector Regressor
# cell, so that cell must have been run first.
model = MLPRegressor(
    hidden_layer_sizes=(50, 50),
    max_iter=500,
    random_state=42,
)
model.fit(X_train_scaled, y_train)

# Predict the (scaled) held-out blocks and score the predictions
y_pred = model.predict(X_test_scaled)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(" ------ MLP Regressor  ------ ")

print(f"MLP Mean Squared Error: {mse}")
print(f"MLP R-squared: {r2}")

In [None]:
import matplotlib.pyplot as plt

### Plot charts

# FTP over time: one point per FTP test, connected in date order
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(ftp_df['FTP Test Date'], ftp_df['FTP'], marker='o', linestyle='-', color='b')
ax.set_title('FTP Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('FTP (Watts)')
ax.grid(True)
fig.tight_layout()
plt.show()

# One chart per feature, in the order of `feature_importance`
# (sorted most-important-first by the Random Forest cell)
for feature, importance in feature_importance.items():
    print(f"Importance of {feature}: {importance}")
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(
        training_blocks_df.index,
        training_blocks_df[feature],
        marker='o',
        linestyle='-',
        color='purple',
    )
    ax.set_title(f"{feature} by Training Block")
    ax.set_xlabel('Training Block')
    ax.set_ylabel(feature)
    ax.grid(True)
    fig.tight_layout()
    plt.show()
