In [29]:
import pandas as pd
import time
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error, make_scorer

import xgboost as xgb
from xgboost import XGBRegressor

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances

In [30]:
# Load the cleaned parkrun + weather table and convert 'Date' to datetimes
df = pd.read_csv('data/clean/parkrun_weather_v2.csv').assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)


In [31]:
# Display the column names of the loaded frame
df.columns

Index(['Date', 'Position', 'Position_score', 'Name', 'Runner_id',
       'Parkrun_count', 'Gender', 'Age_group', 'Time_in_minutes',
       'Total_Appearances', 'Appearance_Instance', 'Appearance/Total',
       'Appearance_Category', 'Total_event_runners', 'Days_since_last_parkrun',
       'PB_mins', 'ave_mins', 'prev_PB', 'avg_prev_run_time', 'temperature',
       'windspeed', 'precipitation'],
      dtype='object')

In [32]:
# Represent every 'Age_group' band by one numeric age. The five-year bands
# ('20-24' ... '70-74') use their middle year; the two-year '18-19' band uses 19.
age_group_map = {'18-19': 19}
age_group_map.update(
    {f'{start}-{start + 4}': start + 2 for start in range(20, 75, 5)}
)

# Any Age_group value that is not a key of the map becomes NaN here
df['Age_group_numeric'] = df['Age_group'].map(age_group_map)

In [33]:
# Per runner: date of their earliest recorded run, and the number of whole
# days between that date and each row's run
df = df.assign(
    first_parkrun_date=lambda frame: frame.groupby('Runner_id')['Date'].transform('min'),
    Days_since_first_parkrun=lambda frame: frame['Date'].sub(frame['first_parkrun_date']).dt.days,
)

In [34]:
# Binary indicator: 1 where Gender is exactly 'Male', 0 for every other value
df['Male'] = (df['Gender'] == 'Male').astype('int64')

In [35]:
# Each runner's finishing time from their preceding run (NaN for their first
# recorded run). The shift is computed on a chronologically sorted view and
# written back by index label, so the frame itself keeps its row order.
df['Previous_time_mins'] = (
    df.sort_values(by=['Runner_id', 'Date'])
      .groupby('Runner_id')['Time_in_minutes']
      .shift(1)
)

In [36]:
# Keep only rows with no missing value in any column (this removes, among
# others, rows whose Previous_time_mins is NaN), then summarise what is left
df = df.dropna(axis=0, how='any')
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 157970 entries, 9 to 165553
Data columns (total 27 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   Date                      157970 non-null  datetime64[ns]
 1   Position                  157970 non-null  int64         
 2   Position_score            157970 non-null  float64       
 3   Name                      157970 non-null  object        
 4   Runner_id                 157970 non-null  int64         
 5   Parkrun_count             157970 non-null  int64         
 6   Gender                    157970 non-null  object        
 7   Age_group                 157970 non-null  object        
 8   Time_in_minutes           157970 non-null  float64       
 9   Total_Appearances         157970 non-null  int64         
 10  Appearance_Instance       157970 non-null  int64         
 11  Appearance/Total          157970 non-null  float64       
 12  Appeara

In [37]:
# Feature matrix: every column that is not listed below is used as a predictor.
# The list removes the target itself together with the other columns that are
# not wanted as model inputs.
X = df.drop(
    [
        'Time_in_minutes',
        'Parkrun_count',
        'Gender',
        'PB_mins',
        'ave_mins',
        'Position_score',
        'Age_group',
        'Date',
        'Name',
        'Position',
        'Total_Appearances',
        'Appearance/Total',
        'first_parkrun_date',
        'Runner_id',
        'Appearance_Category',
    ],
    axis=1,
)

# Target: finishing time in minutes
y = df.loc[:, 'Time_in_minutes']


In [38]:
# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Learn the min/max of each feature from the training rows only, then apply
# that same scaling to both partitions
scaler = MinMaxScaler().fit(X_train)
X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)

# Wrap the scaled arrays back into DataFrames carrying the feature names
X_train_norm_df, X_test_norm_df = (
    pd.DataFrame(scaled, columns=X.columns)
    for scaled in (X_train_norm, X_test_norm)
)

In [39]:
def plot_feature_importance(model, feature_names):
    """
    Draw a horizontal bar chart of a fitted model's feature importances.

    Parameters:
    model: Fitted estimator exposing a `feature_importances_` attribute
    feature_names: Bar labels, one per importance value

    Returns:
    The model's `feature_importances_`, unchanged
    """
    importances = model.feature_importances_

    plt.figure(figsize=(10, 6))
    # Horizontal bars leave room for the feature names on the y axis
    plt.barh(feature_names, importances)
    plt.title('Feature Importances')
    plt.xlabel('Feature Importance')
    plt.show()

    return importances

def plot_predicted_vs_actual(y_test, y_pred):
    """
    Scatter predicted values against actual values, with a dashed diagonal
    marking where a prediction would equal the actual value.

    Parameters:
    y_test: Actual target values
    y_pred: Predicted target values

    Returns:
    None
    """
    plt.figure(figsize=(8, 6))
    plt.scatter(y_test, y_pred, color='blue', alpha=0.5)

    # The reference line spans the range of the actual values only
    lowest, highest = min(y_test), max(y_test)
    plt.plot([lowest, highest], [lowest, highest], color='red', linestyle='--')

    plt.title('Scatter Plot: Predicted vs Actual Run Time')
    plt.xlabel('Actual Run Time (minutes)')
    plt.ylabel('Predicted Run Time (minutes)')
    plt.show()

In [40]:
# Random Forest baseline on the min-max scaled features.
# n_jobs=-1 grows the trees on all available cores (the saved output of this
# cell is a KeyboardInterrupt); the fixed random_state keeps the fit repeatable.
rf = RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=-1)
rf.fit(X_train_norm_df, y_train)

# Predict on test set
y_pred_rf = rf.predict(X_test_norm_df)

# Evaluate the model
rmse = root_mean_squared_error(y_test, y_pred_rf)
r2 = r2_score(y_test, y_pred_rf)

# Format with ':.3f' instead of calling .round() on the metric: a built-in
# float has no .round method, whereas the format spec works for both built-in
# floats and NumPy scalars.
print(f'Root Mean Squared Error: {rmse:.3f}')
print(f'R-squared: {r2:.3f}')

# Visualise feature importance and predicted vs actual
rf_features = plot_feature_importance(rf, X_train_norm_df.columns)
plot_predicted_vs_actual(y_test, y_pred_rf)

KeyboardInterrupt: 

In [None]:
# Linear Regression baseline on the min-max scaled features
lr = LinearRegression()
lr.fit(X_train_norm_df, y_train)

# Predict on test set
y_pred_lr = lr.predict(X_test_norm_df)

# Evaluate the model
rmse_lr = root_mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

# ':.3f' works whether the metric is a built-in float or a NumPy scalar;
# calling .round() on it only works for the latter.
print(f'Linear Regression - RMSE: {rmse_lr:.3f}')
print(f'Linear Regression - R-squared: {r2_lr:.3f}')

# Visualize results
plot_predicted_vs_actual(y_test, y_pred_lr)

In [None]:
# Gradient Boosting baseline on the min-max scaled features
gb = GradientBoostingRegressor(n_estimators=100, random_state=0)
gb.fit(X_train_norm_df, y_train)

# Predict on test set
y_pred_gb = gb.predict(X_test_norm_df)

# Evaluate the model
rmse_gb = root_mean_squared_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)

# ':.3f' works whether the metric is a built-in float or a NumPy scalar;
# calling .round() on it only works for the latter.
print(f'Gradient Boosting - RMSE: {rmse_gb:.3f}')
print(f'Gradient Boosting - R-squared: {r2_gb:.3f}')

# Visualize results
gb_features = plot_feature_importance(gb, X_train_norm_df.columns)
plot_predicted_vs_actual(y_test, y_pred_gb)

In [None]:
# XGBoost baseline on the min-max scaled features.
# Uses the directly imported XGBRegressor rather than `xgb.XGBRegressor`: the
# grid-search cell further down rebinds the name `xgb` to an estimator, after
# which `xgb.XGBRegressor` would fail if this cell were run again.
xg = XGBRegressor(n_estimators=100, random_state=0)
xg.fit(X_train_norm_df, y_train)

# Predict on test set
y_pred_xg = xg.predict(X_test_norm_df)

# Evaluate the model
rmse_xg = root_mean_squared_error(y_test, y_pred_xg)
r2_xg = r2_score(y_test, y_pred_xg)

# ':.3f' works whether the metric is a built-in float or a NumPy scalar;
# calling .round() on it only works for the latter.
print(f'XGBoost - RMSE: {rmse_xg:.3f}')
print(f'XGBoost - R-squared: {r2_xg:.3f}')

# Visualize results
xg_features = plot_feature_importance(xg, X_train_norm_df.columns)
plot_predicted_vs_actual(y_test, y_pred_xg)

In [None]:
# Support Vector Regression (RBF kernel) baseline on the min-max scaled features.
# NOTE(review): kernel SVR training time grows faster than quadratically with
# the number of rows, so this cell may take very long on the full training
# split — consider fitting on a subsample if it does not finish.
svr = SVR(kernel='rbf')
svr.fit(X_train_norm_df, y_train)

# Predict on test set
y_pred_svr = svr.predict(X_test_norm_df)

# Evaluate the model
rmse_svr = root_mean_squared_error(y_test, y_pred_svr)
r2_svr = r2_score(y_test, y_pred_svr)

# ':.3f' works whether the metric is a built-in float or a NumPy scalar;
# calling .round() on it only works for the latter.
print(f'Support Vector Regression - RMSE: {rmse_svr:.3f}')
print(f'Support Vector Regression - R-squared: {r2_svr:.3f}')

# Visualize results
plot_predicted_vs_actual(y_test, y_pred_svr)

In [None]:
# K-Nearest-Neighbours baseline (k=5) on the min-max scaled features
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train_norm_df, y_train)

# Predict on test set
y_pred_knn = knn.predict(X_test_norm_df)

# Evaluate the model
rmse_knn = root_mean_squared_error(y_test, y_pred_knn)
r2_knn = r2_score(y_test, y_pred_knn)

# ':.3f' works whether the metric is a built-in float or a NumPy scalar;
# calling .round() on it only works for the latter.
print(f'KNN - RMSE: {rmse_knn:.3f}')
print(f'KNN - R-squared: {r2_knn:.3f}')

# Visualize results
plot_predicted_vs_actual(y_test, y_pred_knn)

In [None]:
# Recap of the baseline test-set metrics, one (label, RMSE, R-squared) entry
# per model, printed in the same order as before.
model_scores = [
    ('KNN', rmse_knn, r2_knn),
    ('Support Vector Regression', rmse_svr, r2_svr),
    ('XGBoost', rmse_xg, r2_xg),
    ('Gradient Boosting', rmse_gb, r2_gb),
    ('Linear Regression', rmse_lr, r2_lr),
    ('Random Forest', rmse, r2),
]

# ':.3f' works whether a metric is a built-in float or a NumPy scalar;
# calling .round() on it only works for the latter.
for label, model_rmse, model_r2 in model_scores:
    print(f'{label} - RMSE: {model_rmse:.3f}')
    print(f'{label} - R-squared: {model_r2:.3f}')

In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

# Exhaustive grid: 3 * 3 * 3 * 2 * 2 = 108 parameter combinations, each
# evaluated with 5-fold cross-validation.
params = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Deliberately not named `xgb`: that name is the alias of the xgboost module
# (`import xgboost as xgb`), and rebinding it to an estimator breaks any cell
# that calls `xgb.XGBRegressor(...)` when it is run again.
xgb_estimator = XGBRegressor(random_state=0)
grid_search = GridSearchCV(estimator=xgb_estimator, param_grid=params, cv=5, scoring='neg_root_mean_squared_error')
grid_search.fit(X_train_norm_df, y_train)

print("Best Parameters:", grid_search.best_params_)

In [None]:
# Best parameters recorded from an earlier run of the grid search:
# 'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'subsample': 1.0

# Retrieve the best parameters
best_params = grid_search.best_params_

# Train the model with the best parameters
best_model = XGBRegressor(**best_params, random_state=0)
best_model.fit(X_train_norm_df, y_train)

# Predict on the test set
y_pred_xg = best_model.predict(X_test_norm_df)

# Evaluate the model
rmse_xg = root_mean_squared_error(y_test, y_pred_xg)
r2_xg = r2_score(y_test, y_pred_xg)

print(f"Test RMSE: {rmse_xg:.3f}")
print(f"Test R-squared: {r2_xg:.3f}")

# Plot the tuned XGBoost predictions evaluated above (this previously plotted
# the KNN predictions, which did not match the printed metrics)
plot_predicted_vs_actual(y_test, y_pred_xg)

In [None]:
import optuna
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_error
import numpy as np

def objective(trial):
    """
    Optuna objective: 5-fold cross-validated score of an XGBoost regressor.

    Parameters:
    trial: Optuna trial used to sample one hyperparameter configuration

    Returns:
    Mean of the fold scores. The scorer negates the mean squared error
    (greater_is_better=False), so the value is negative MSE: values closer
    to zero mean a better model, and a study must maximise it to find the
    lowest-error configuration.
    """
    # Sample one configuration; the suggest_* calls define the search space
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 300),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        subsample=trial.suggest_float('subsample', 0.6, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.6, 1.0),
        gamma=trial.suggest_float('gamma', 0, 5),
        reg_alpha=trial.suggest_float('reg_alpha', 0, 10),
        reg_lambda=trial.suggest_float('reg_lambda', 0, 10),
    )

    candidate = XGBRegressor(random_state=0, **search_space)
    negated_mse = make_scorer(mean_squared_error, greater_is_better=False)
    fold_scores = cross_val_score(
        candidate,
        X_train_norm_df,
        y_train,
        cv=5,
        scoring=negated_mse,
    )

    return np.mean(fold_scores)

In [None]:
# `objective` returns the mean *negated* MSE (its scorer is built with
# greater_is_better=False), so the best model has the HIGHEST objective value.
# Minimising that value would select the configuration with the largest error,
# hence direction='maximize'. The sampler is seeded so the search is repeatable.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=0),
)
study.optimize(objective, n_trials=50)  # Adjust n_trials as needed

# Best parameters and score; best_value is negative MSE, so negate before the root
print("Best Parameters:", study.best_params)
print("Best RMSE:", np.sqrt(-study.best_value))

In [None]:
# Refit an XGBoost model on the full training split using the study's best parameters
best_params = study.best_params
best_model = XGBRegressor(**best_params, random_state=0)
best_model.fit(X_train_norm_df, y_train)

# Evaluate on test data.
# The metrics get their own names: `rmse` and `r2` hold the Random Forest
# results that the summary cell prints, so they must not be overwritten here.
# root_mean_squared_error matches the metric helper used by the other cells.
y_pred = best_model.predict(X_test_norm_df)
rmse_optuna = root_mean_squared_error(y_test, y_pred)
r2_optuna = r2_score(y_test, y_pred)
print(f"Test RMSE: {rmse_optuna:.3f}")
print(f"Test R-squared: {r2_optuna:.3f}")

In [None]:
# Objective value per trial over the course of the study
plot_optimization_history(study=study).show()

# Relative importance of each searched hyperparameter
plot_param_importances(study=study).show()

In [None]:

# Hard-coded parameters from an earlier Optuna run.
# NOTE(review): n_estimators=50 is the lower bound of the range searched in
# `objective`, and learning_rate sits close to its 0.01 floor — confirm these
# values came from a study that actually minimised the error.
best_params = {
    'n_estimators': 50,
    'max_depth': 7,
    'learning_rate': 0.012243304825496931,
    'subsample': 0.865760407475865,
    'colsample_bytree': 0.8515210990372176,
    'gamma': 1.7316033622064344,
    'reg_alpha': 2.1905117158413243,
    'reg_lambda': 6.278342959785368
}

# Train the model with these parameters
opt_xgb = XGBRegressor(**best_params, random_state=0)
opt_xgb.fit(X_train_norm_df, y_train)

# Predict and evaluate on the train set. The train metrics have their own
# names so that the test metrics below do not overwrite them.
y_pred_tr_opt_x = opt_xgb.predict(X_train_norm_df)
rmse_tr_opt_x = root_mean_squared_error(y_train, y_pred_tr_opt_x)
r2_tr_opt_x = r2_score(y_train, y_pred_tr_opt_x)

# Predict and evaluate on the test set
y_pred_opt_x = opt_xgb.predict(X_test_norm_df)
rmse_opt_x = root_mean_squared_error(y_test, y_pred_opt_x)
r2_opt_x = r2_score(y_test, y_pred_opt_x)

# Report train and test side by side to expose over- or under-fitting
print(f"Train RMSE: {rmse_tr_opt_x:.3f}")
print(f"Train R-Squared: {r2_tr_opt_x:.3f}")

print("")
print(f"Test RMSE: {rmse_opt_x:.3f}")
print(f"Test R-squared: {r2_opt_x:.3f}")

# Plot the results
plot_predicted_vs_actual(y_test, y_pred_opt_x)