In [None]:
!pip install joblib
!pip install torch gpytorch



In [None]:
import matplotlib.pyplot as plt
import xgboost as xgb
import pandas as pd
import numpy as np
import joblib
import gc
import io
import os
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, DotProduct
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.impute import IterativeImputer
from google.colab import files, drive
from joblib import Parallel, delayed
from sklearn.utils import shuffle

In [None]:
# Mount Google Drive so the preprocessed CSV files are reachable from the runtime
drive.mount('/content/drive')

# Folder within Google Drive that holds the preprocessed inputs
folder_path = '/content/drive/My Drive/CFB_Model/PreProcessed Data/'

# Load the per-team stats table and the game schedule
df_all_stats = pd.read_csv(os.path.join(folder_path, 'all_stats_2005_2024.csv'))
df_schedule = pd.read_csv(os.path.join(folder_path, 'schedule_2005_2024.csv'))

# Build two relabelled copies of the full stats table (no rows are filtered):
# renaming 'Team' lets each copy join against the matching schedule column.
home_stats = df_all_stats.rename(columns={'Team': 'Home Team'})
home_stats['Team Type'] = 'Home'

away_stats = df_all_stats.rename(columns={'Team': 'Away Team'})
away_stats['Team Type'] = 'Away'

# Attach home-team stats, then away-team stats, to every scheduled game.
# In the second merge the stat columns collide, so pandas tags the ones already
# present with '_home' and the newly joined ones with '_away'.
df_schedule_with_home_stats = df_schedule.merge(home_stats, on=['Home Team', 'Year'], how='left')
df_schedule_with_full_stats = df_schedule_with_home_stats.merge(
    away_stats, on=['Away Team', 'Year'], suffixes=('_home', '_away'), how='left'
)

# Drop the helper/label columns produced by the merges
df_schedule_with_full_stats = df_schedule_with_full_stats.drop(
    columns=['Team Type_home', 'Team Type_away', 'Conference_home', 'Conference_away']
)

# Identifier columns that are not used downstream; dropped only if present
columns_to_drop = ['Week', 'Home Team', 'Home Conference', 'Away Team', 'Away Conference']
df_schedule_with_full_stats = df_schedule_with_full_stats.drop(
    columns=[col for col in columns_to_drop if col in df_schedule_with_full_stats.columns]
)

# The two score columns are the prediction targets
target_columns = ['Home Points', 'Away Points']

# Every non-target column must be present for a row to be usable
columns_to_check = [col for col in df_schedule_with_full_stats.columns if col not in target_columns]

# Drop rows with a missing value in any non-target column. Targets are left out
# of this check so that games without a recorded score are kept here
# (presumably unplayed 2024 games -- confirm against the schedule data).
df_schedule_with_full_stats_cleaned = df_schedule_with_full_stats.dropna(subset=columns_to_check)

# Split by season: 2024 is held out, everything else is used for training.
# 'Year' is guaranteed non-null at this point because it is in columns_to_check.
is_holdout_season = df_schedule_with_full_stats_cleaned['Year'] == 2024

# Training rows additionally need both targets: a NaN score cannot be fitted and
# would make the regressor's fit() raise on the whole training set.
df_train = df_schedule_with_full_stats_cleaned[~is_holdout_season].dropna(subset=target_columns)
df_test = df_schedule_with_full_stats_cleaned[is_holdout_season]

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Step 1: Prepare features and the two regression targets.
# 'Year' and both score columns are removed from the feature matrix.
X = df_train.drop(columns=['Year', 'Home Points', 'Away Points'])
y_home = df_train['Home Points']
y_away = df_train['Away Points']

# Split features and BOTH targets in a single call so the three stay row-aligned
# by construction (instead of relying on two separate calls sharing a seed and
# overwriting X_train / X_test the second time).
X_train, X_test, y_home_train, y_home_test, y_away_train, y_away_test = train_test_split(
    X, y_home, y_away, test_size=0.2, random_state=42
)

# Step 2: Scale the features (fit on the training split only to avoid leakage)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 3: Candidate kernels.
# Each hyperparameter is searched within these bounds during fitting; the grid
# below only chooses where the optimizer STARTS. Starting values are taken from
# the bounds themselves (2 per hyperparameter -> 2*2*2 = 8 kernels) so that no
# candidate begins outside the region the optimizer is allowed to explore.
length_scale_bounds = (1e-1, 1.0)
sigma_0_bounds = (0.1, 1.0)
noise_level_bounds = (1e-5, 1.0)

param_grid = {
    'kernel': [
        RBF(length_scale=length_scale, length_scale_bounds=length_scale_bounds) +
        DotProduct(sigma_0=sigma_0, sigma_0_bounds=sigma_0_bounds) +
        WhiteKernel(noise_level=noise_level, noise_level_bounds=noise_level_bounds)
        for length_scale in np.geomspace(*length_scale_bounds, 2)  # 2 starting values
        for sigma_0 in np.geomspace(*sigma_0_bounds, 2)            # 2 starting values
        for noise_level in np.geomspace(*noise_level_bounds, 2)    # 2 starting values
    ]
}

# Step 4: Initialize the models.
# normalize_y=True standardizes the targets internally (predictions are returned
# in the original units). Without it the GP assumes a zero-mean prior on raw
# scores and the noise variance cap of 1.0 above would be in squared points.
gpr_home = GaussianProcessRegressor(normalize_y=True, random_state=42)
gpr_away = GaussianProcessRegressor(normalize_y=True, random_state=42)

# Randomized search over the kernel candidates; n_jobs=1 keeps execution
# single-threaded (one GP fit at a time).
random_search_home = RandomizedSearchCV(gpr_home, param_distributions=param_grid, n_iter=5, cv=2, random_state=42, n_jobs=1)
random_search_away = RandomizedSearchCV(gpr_away, param_distributions=param_grid, n_iter=5, cv=2, random_state=42, n_jobs=1)

# Fit the models with hyperparameter tuning, collecting garbage between the two
# searches to release memory from the first before starting the second.
random_search_home.fit(X_train_scaled, y_home_train)
gc.collect()
random_search_away.fit(X_train_scaled, y_away_train)
gc.collect()

# Best estimators, refit on the full training split by RandomizedSearchCV
best_gpr_home = random_search_home.best_estimator_
best_gpr_away = random_search_away.best_estimator_

# Evaluate on the held-out split (regressor .score() is R^2)
home_score = best_gpr_home.score(X_test_scaled, y_home_test)
away_score = best_gpr_away.score(X_test_scaled, y_away_test)

print(f'Best Home Model Score: {home_score}')
print(f'Best Away Model Score: {away_score}')



In [1]:
# NOTE: this cell uses best_gpr_home, best_gpr_away, scaler, X_test_scaled and the
# y_*_test series created by the training cell; run that cell first in the same
# kernel session, otherwise these names are undefined.

# Step 5: Make predictions on the test set (mean and per-sample standard deviation)
y_home_pred, y_home_std = best_gpr_home.predict(X_test_scaled, return_std=True)
y_away_pred, y_away_std = best_gpr_away.predict(X_test_scaled, return_std=True)

# Step 6: Evaluate the models.
# RMSE is computed as sqrt(MSE) explicitly: the `squared=False` keyword of
# mean_squared_error is deprecated and absent from newer scikit-learn releases.
home_rmse = np.sqrt(mean_squared_error(y_home_test, y_home_pred))
away_rmse = np.sqrt(mean_squared_error(y_away_test, y_away_pred))

print(f"Home Points RMSE: {home_rmse}")
print(f"Away Points RMSE: {away_rmse}")

# Optionally, print mean and std values for debugging
print("Home Mean Predictions:", y_home_pred)
print("Home Std Predictions:", y_home_std)
print("Away Mean Predictions:", y_away_pred)
print("Away Std Predictions:", y_away_std)

# Step 7: Save models and scaler.
# Create the output folder first so joblib.dump does not fail on a fresh Drive.
model_dir = '/content/drive/MyDrive/CFB_Model/Model Training'
os.makedirs(model_dir, exist_ok=True)

joblib.dump(best_gpr_home, os.path.join(model_dir, 'gpr_home_model_v2.pkl'))
joblib.dump(best_gpr_away, os.path.join(model_dir, 'gpr_away_model_v2.pkl'))
joblib.dump(scaler, os.path.join(model_dir, 'scaler_v2.pkl'))

NameError: name 'best_gpr_home' is not defined