In [1]:
!pip install joblib
!pip install torch gpytorch

Collecting gpytorch
  Downloading gpytorch-1.12-py3-none-any.whl.metadata (8.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cur

In [2]:
import pandas as pd
import numpy as np
import joblib
import gc
import torch
import warnings
warnings.filterwarnings("ignore", message="X does not have valid feature names")
import gpytorch
import io
import os
import xgboost as xgb
import matplotlib.pyplot as plt
from IPython.display import clear_output
from sklearn.utils import shuffle
from joblib import Parallel, delayed
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.experimental import enable_iterative_imputer  # To enable IterativeImputer
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C, WhiteKernel
from sklearn.impute import IterativeImputer
from google.colab import files, drive
import scipy.stats

In [3]:
# Mount Google Drive
drive.mount('/content/drive')

# Load the schedule and stats dataframes
df_schedule = pd.read_csv('/content/drive/MyDrive/CFB_Model/PreProcessed Data/schedule_2005_2024.csv')
df_all_stats = pd.read_csv('/content/drive/MyDrive/CFB_Model/PreProcessed Data/all_stats_2005_2024.csv')
df_2024_team_info = pd.read_csv('/content/drive/MyDrive/CFB_Model/Raw Data/2024_team_info.csv')

# Load the models and scaler
gpr_home = joblib.load('/content/drive/MyDrive/CFB_Model/Model Training/gpr_home_model_v1.pkl')
gpr_away = joblib.load('/content/drive/MyDrive/CFB_Model/Model Training/gpr_away_model_v1.pkl')
scaler = joblib.load('/content/drive/MyDrive/CFB_Model/Model Training/scaler_v1.pkl')

Mounted at /content/drive


In [4]:
def initial_data_prep(df_schedule, df_all_stats, year):
    # Check for duplicate matchups in df_schedule
    duplicate_matchups = df_schedule[df_schedule.duplicated(subset=['Home Team','Away Team', 'Week', 'Year'], keep=False)]

    # Assuming df_schedule and df_all_stats are already loaded

    # Filter df_all_stats for home team stats
    home_stats = df_all_stats.rename(columns={'Team': 'Home Team'})
    home_stats['Team Type'] = 'Home'

    # Filter df_all_stats for away team stats
    away_stats = df_all_stats.rename(columns={'Team': 'Away Team'})
    away_stats['Team Type'] = 'Away'

    # Merge df_schedule with home_stats and away_stats
    df_schedule_with_home_stats = df_schedule.merge(home_stats, left_on=['Home Team', 'Year'], right_on=['Home Team', 'Year'], how='left')
    df_schedule_with_full_stats = df_schedule_with_home_stats.merge(away_stats, left_on=['Away Team', 'Year'], right_on=['Away Team', 'Year'], suffixes=('_home', '_away'), how='left')

    # Drop unnecessary columns
    df_schedule_with_full_stats = df_schedule_with_full_stats.drop(columns=['Team Type_home', 'Team Type_away', 'Conference_home', 'Conference_away'])

    # Remove duplicates, keeping only unique games
    df_schedule_with_full_stats = df_schedule_with_full_stats.drop_duplicates(subset=['Home Team', 'Away Team', 'Year'], keep='first')

    # Separate features and target variables
    # Identify columns to check for missing values (excluding 'Home Points' and 'Away Points')
    columns_to_check = [col for col in df_schedule_with_full_stats.columns if col not in ['Home Points', 'Away Points']]

    # Drop rows with missing values in any of the identified columns
    df_schedule_with_full_stats = df_schedule_with_full_stats.dropna(subset=columns_to_check)

    # Create training dataframe excluding 2024
    df_train = df_schedule_with_full_stats[df_schedule_with_full_stats['Year'] != year]

    # Create testing dataframe for 2024
    df_test = df_schedule_with_full_stats[df_schedule_with_full_stats['Year'] == year]
    return df_test

def predict_regular_season_outcomes(year, df, home_model, away_model, scaler, df_2024_team_info):
    results = []
    placeholders = []

    # Filter schedule for the specified year
    df_year_schedule = df[df['Year'] == year]

    # Track number of games per team
    team_game_count = {}

    for idx, game in df_year_schedule.iterrows():
        # Home and away teams
        home_team = game['Home Team']
        away_team = game['Away Team']

        # Initialize game count for each team if not present
        if home_team not in team_game_count:
            team_game_count[home_team] = 0
        if away_team not in team_game_count:
            team_game_count[away_team] = 0

        # Increase the game count for each team
        team_game_count[home_team] += 1
        team_game_count[away_team] += 1

        # Drop unnecessary columns and reshape the input for the model
        X_game = game.drop(labels=['Year', 'Home Points', 'Away Points', 'Week', 'Home Team', 'Home Conference', 'Away Team', 'Away Conference']).values.reshape(1, -1)

        # Scale the input features
        X_game_scaled = scaler.transform(X_game)

        # Suppress warnings during prediction
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            # Make predictions
            home_mean_array, home_std_array = home_model.predict(X_game_scaled, return_std=True)
            away_mean_array, away_std_array = away_model.predict(X_game_scaled, return_std=True)

            home_mean = home_mean_array[0]
            home_std = home_std_array[0]
            away_mean = away_mean_array[0]
            away_std = away_std_array[0]

        # Sample from the distributions to simulate the game outcome
        home_sampled_score = np.random.normal(home_mean, home_std)
        away_sampled_score = np.random.normal(away_mean, away_std)

        # Round the sampled scores up
        final_home_score = np.ceil(home_sampled_score)
        final_away_score = np.ceil(away_sampled_score)

        # Handle tie-breaking by checking original means
        if final_home_score == final_away_score:
            if home_mean > away_mean:
                final_home_score += 1
            else:
                final_away_score += 1

        # Store the results
        results.append({
            'Year': game['Year'],
            'Home Team': home_team,
            'Home Conference': game['Home Conference'],
            'Away Team': away_team,
            'Away Conference': game['Away Conference'],
            'Week': game['Week'],
            'Home Score': final_home_score,
            'Away Score': final_away_score,
        })

    # Add placeholder games to ensure each team has 12 games
    for team, games_played in team_game_count.items():
        while games_played < 12:
            # Placeholder stats for a generic FCS team
            placeholder_stats = {
                'home_mean': 45,
                'home_std': 1.5,
                'away_mean': 14,
                'away_std': 1.5,
                'team': 'FCS Placeholder'
            }

            # Sample from the distributions to simulate the game outcome
            home_sampled_score = np.random.normal(placeholder_stats['home_mean'], placeholder_stats['home_std'])
            away_sampled_score = np.random.normal(placeholder_stats['away_mean'], placeholder_stats['away_std'])

            # Round the sampled scores up
            final_home_score = np.ceil(home_sampled_score)
            final_away_score = np.ceil(away_sampled_score)

            # Handle tie-breaking by checking original means
            if final_home_score == final_away_score:
                if home_mean > away_mean:
                    final_home_score += 1
                else:
                    final_away_score += 1

            # Add a home placeholder game
            placeholders.append({
                'Year': year,
                'Home Team': team,
                'Home Conference': 'N/A',
                'Away Team': placeholder_stats['team'],
                'Away Conference': 'N/A',
                'Week': 15,
                'Home Score': final_home_score,
                'Away Score': final_away_score,
            })

            games_played += 1

    # Convert lists to DataFrame
    df_results = pd.DataFrame(results)
    df_fcs_placeholders = pd.DataFrame(placeholders)

    # Combine regular games and placeholders
    df_final_results = pd.concat([df_results, df_fcs_placeholders], ignore_index=True)

    return df_final_results

def predict_regular_season_distributions(year, df, home_model, away_model, scaler, df_2024_team_info):
    distributions = []
    placeholders = []

    # Filter schedule for the specified year
    df_year_schedule = df[df['Year'] == year]

    # Track number of games per team
    team_game_count = {}

    for idx, game in df_year_schedule.iterrows():
        # Home and away teams
        home_team = game['Home Team']
        away_team = game['Away Team']

        # Initialize game count for each team if not present
        if home_team not in team_game_count:
            team_game_count[home_team] = 0
        if away_team not in team_game_count:
            team_game_count[away_team] = 0

        # Increase the game count for each team
        team_game_count[home_team] += 1
        team_game_count[away_team] += 1

        # Drop unnecessary columns and reshape the input for the model
        X_game = game.drop(labels=['Year', 'Home Points', 'Away Points', 'Week', 'Home Team', 'Home Conference', 'Away Team', 'Away Conference']).values.reshape(1, -1)

        # Scale the input features
        X_game_scaled = scaler.transform(X_game)

        # Suppress warnings during prediction
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            # Make predictions
            home_mean_array, home_std_array = home_model.predict(X_game_scaled, return_std=True)
            away_mean_array, away_std_array = away_model.predict(X_game_scaled, return_std=True)

            home_mean = home_mean_array[0]
            home_std = home_std_array[0]
            away_mean = away_mean_array[0]
            away_std = away_std_array[0]

        # Store the results
        distributions.append({
            'Year': game['Year'],
            'Home Team': home_team,
            'Home Conference': game['Home Conference'],
            'Away Team': away_team,
            'Away Conference': game['Away Conference'],
            'Week': game['Week'],
            'Home Mean Score': home_mean,
            'Home Std Score': home_std,
            'Away Mean Score': away_mean,
            'Away Std Score': away_std,
        })

    # Add placeholder games to ensure each team has 12 games
    for team, games_played in team_game_count.items():
        while games_played < 12:
            # Placeholder stats for a generic FCS team
            placeholder_stats = {
                'home_mean': 35,
                'home_std': 1.5,
                'away_mean': 10,
                'away_std': 1.5,
                'team': 'FCS Placeholder'
            }

            # Add a home placeholder game
            placeholders.append({
                'Year': year,
                'Home Team': team,
                'Home Conference': 'FBS',
                'Away Team': placeholder_stats['team'],
                'Away Conference': 'FCS',
                'Week': 15,
                'Home Mean Score': home_mean,
                'Home Std Score': home_std,
                'Away Mean Score': placeholder_stats['away_mean'],
                'Away Std Score': placeholder_stats['away_std']
            })

            games_played += 1

    # Convert lists to DataFrame
    df_distributions = pd.DataFrame(distributions)
    df_fcs_placeholders = pd.DataFrame(placeholders)

    # Combine regular games and placeholders
    df_score_distributions = pd.concat([df_distributions, df_fcs_placeholders], ignore_index=True)

    return df_score_distributions

def sample_from_regular_season_distributions(df_score_distributions):
    results = []

    for idx, game in df_score_distributions.iterrows():

        # Home and away teams
        home_team = game['Home Team']
        away_team = game['Away Team']

        home_mean = game['Home Mean Score']
        home_std = game['Home Std Score']
        away_mean = game['Away Mean Score']
        away_std = game['Away Std Score']



        # Sample from the distributions to simulate the game outcome
        home_sampled_score = np.random.normal(home_mean, home_std)
        away_sampled_score = np.random.normal(away_mean, away_std)

        # Round the sampled scores up
        final_home_score = np.ceil(home_sampled_score)
        final_away_score = np.ceil(away_sampled_score)

        # Handle tie-breaking by checking original means
        if final_home_score == final_away_score:
            if home_mean > away_mean:
                final_home_score += 1
            else:
                final_away_score += 1

        # Store the results
        results.append({
            'Year': game['Year'],
            'Home Team': home_team,
            'Home Conference': game['Home Conference'],
            'Away Team': away_team,
            'Away Conference': game['Away Conference'],
            'Week': game['Week'],
            'Home Score': final_home_score,
            'Away Score': final_away_score,
        })

    # Convert lists to DataFrame
    df_results = pd.DataFrame(results)
    return df_results

# Function to calculate expected wins for each game based on a top-25 team
def calculate_expected_wins(df_regular_season_results):
    # Calculate expected wins based on whether the home team won
    df_regular_season_results['Expected_Wins_Home'] = df_regular_season_results['Home Team Win'].astype(int)
    df_regular_season_results['Expected_Wins_Away'] = 1 - df_regular_season_results['Expected_Wins_Home']

    return df_regular_season_results

# Function to calculate SOR for a given team
def calculate_sor_for_team(team_name, df_results):
    # Filter the games where the team is playing either as home or away
    team_games_home = df_results[df_results['Home Team'] == team_name]
    team_games_away = df_results[df_results['Away Team'] == team_name]

    # Sum the expected wins from all games the team is involved in
    total_expected_wins = team_games_home['Expected_Wins_Home'].sum() + team_games_away['Expected_Wins_Away'].sum()

    # Calculate actual wins (count of games where the team won)
    actual_wins = team_games_home['Home Team Win'].sum() + team_games_away['Home Team Win'].apply(lambda x: not x).sum()

    # Calculate SOR: expected wins for a top-25 team - actual wins by the team
    sor = total_expected_wins - actual_wins

    return sor

def rank_teams_in_conference(team_records, df_results_2024):
    # Filter only conference games in df_results_2024
    df_conference_games = df_results_2024[df_results_2024['Conference Game'] == True]

    # Initialize a list to store rankings
    rankings = []

    # Group by conference
    conferences = team_records['Conference'].unique()

    for conference in conferences:
        # Filter for teams in the current conference
        df_conference_teams = team_records[team_records['Conference'] == conference]

        # Sort teams by 'Conference Wins Prob' initially (descending)
        df_conference_teams = df_conference_teams.sort_values(by='Conference Wins', ascending=False).reset_index(drop=True)

        # Head-to-head tie-breaker logic
        for i in range(len(df_conference_teams) - 1):
            # Check if the teams are tied in conference wins probability
            if df_conference_teams.loc[i, 'Conference Wins'] == df_conference_teams.loc[i+1, 'Conference Wins']:
                team_1 = df_conference_teams.loc[i, 'Team']
                team_2 = df_conference_teams.loc[i+1, 'Team']

                # Check if they played a head-to-head game in df_results_2024
                head_to_head_game = df_conference_games[((df_conference_games['Home Team'] == team_1) & (df_conference_games['Away Team'] == team_2)) |
                                                        ((df_conference_games['Home Team'] == team_2) & (df_conference_games['Away Team'] == team_1))]

                if not head_to_head_game.empty:
                    # Determine the winner of the head-to-head game
                    if head_to_head_game.iloc[0]['Home Team'] == team_1 and head_to_head_game.iloc[0]['Home Team Win']:
                        winner = team_1
                    elif head_to_head_game.iloc[0]['Away Team'] == team_1 and not head_to_head_game.iloc[0]['Home Team Win']:
                        winner = team_1
                    else:
                        winner = team_2

                    # Adjust ranking based on head-to-head winner
                    if winner == team_2:
                        # Swap the positions of the two teams
                        df_conference_teams.iloc[i], df_conference_teams.iloc[i+1] = df_conference_teams.iloc[i+1].copy(), df_conference_teams.iloc[i].copy()

        # Sort again by SOR for final tie-breaker
        df_conference_teams = df_conference_teams.sort_values(by=['Conference Wins', 'SOR'], ascending=[False, False]).reset_index(drop=True)

        # Add a 'Rank' column
        df_conference_teams['Rank'] = range(1, len(df_conference_teams) + 1)
        df_conference_teams['Conference'] = conference

        # Append to the final rankings
        rankings.append(df_conference_teams)

    # Concatenate all conference rankings into a single DataFrame
    df_final_rankings = pd.concat(rankings, ignore_index=True)

    return df_final_rankings

def set_up_conference_championships(df_final_rankings):
    # Conferences with championship games
    conferences_with_championships = ['ACC', 'Mountain West', 'Conference USA', 'Sun Belt',
                                      'Big Ten', 'American Athletic', 'SEC', 'Mid-American', 'Big 12']

    # Initialize a list to store championship games
    championship_games = []

    # Loop through the conferences
    for conference in conferences_with_championships:
        # Filter teams by conference
        conference_teams = df_final_rankings[df_final_rankings['Conference'] == conference]

        # Special case for Sun Belt (handle East and West divisions)
        if conference == 'Sun Belt':
            # Assume we have a 'Division' column to split the teams
            east_teams = conference_teams[conference_teams['Division'] == 'East']
            west_teams = conference_teams[conference_teams['Division'] == 'West']

            # Get the highest-ranked team from each division
            top_east_team = east_teams.sort_values(by='Rank').iloc[0]
            top_west_team = west_teams.sort_values(by='Rank').iloc[0]

            # Determine home and away team based on overall ranking
            if top_east_team['Rank'] < top_west_team['Rank']:
                home_team = top_east_team
                away_team = top_west_team
            else:
                home_team = top_west_team
                away_team = top_east_team

            # Append the championship game details to the list
            championship_games.append({
                'Conference': conference,
                'Home Team': home_team['Team'],
                'Away Team': away_team['Team'],
            })

        else:
            # Get the top two ranked teams in the conference
            top_two_teams = conference_teams.sort_values(by='Rank').head(2)

            # Home team is the highest-ranked team, away team is the second-highest-ranked team
            home_team = top_two_teams.iloc[0]
            away_team = top_two_teams.iloc[1]

            # Append the championship game details to the list
            championship_games.append({
                'Conference': conference,
                'Home Team': home_team['Team'],
                'Away Team': away_team['Team'],
            })

    # Convert the championship games to a DataFrame
    df_championship_games = pd.DataFrame(championship_games)

    return df_championship_games

def set_up_championship_game_stats(df_championship_games, df_all_stats):
    # Rename columns in df_all_stats for merging with home and away teams
    home_stats = df_all_stats.rename(columns={'Team': 'Home Team'})
    away_stats = df_all_stats.rename(columns={'Team': 'Away Team'})

    # Merge df_championship_games with home_stats and away_stats
    df_championship_with_home_stats = df_championship_games.merge(
        home_stats, left_on=['Home Team'], right_on=['Home Team'], how='left'
    )
    df_championship_with_full_stats = df_championship_with_home_stats.merge(
        away_stats, left_on=['Away Team'], right_on=['Away Team'], suffixes=('_home', '_away'), how='left'
    )

    # Remove duplicates
    df_championship_with_full_stats = df_championship_with_full_stats.drop_duplicates(subset=['Home Team', 'Away Team'])

    return df_championship_with_full_stats

def prep_team_records(df_regular_season_results):
    # Calculate 'Home Team Win' column
    df_regular_season_results['Home Team Win'] = df_regular_season_results['Home Score'] > df_regular_season_results['Away Score']
    # Calculate 'Conference Game' column
    df_regular_season_results['Conference Game'] = df_regular_season_results['Home Conference'] == df_regular_season_results['Away Conference']

    # Initialize a dictionary to track team stats
    team_stats = {}

    # Iterate through each game and update team stats
    for _, game in df_regular_season_results.iterrows():
        home_team = game['Home Team']
        away_team = game['Away Team']
        is_conference_game = game['Conference Game']

        # Initialize stats for the home team if not already present
        if home_team not in team_stats:
            team_stats[home_team] = {
                'Games Played': 0, 'Wins': 0, 'Losses': 0, 'Conference Wins': 0, 'Conference Losses': 0,
                'Total Points Scored': 0, 'Total Points Allowed': 0
            }

        # Initialize stats for the away team if not already present
        if away_team not in team_stats:
            team_stats[away_team] = {
                'Games Played': 0, 'Wins': 0, 'Losses': 0, 'Conference Wins': 0, 'Conference Losses': 0,
                'Total Points Scored': 0, 'Total Points Allowed': 0
            }

        # Update stats for the home team
        team_stats[home_team]['Games Played'] += 1
        team_stats[home_team]['Total Points Scored'] += game['Home Score']
        team_stats[home_team]['Total Points Allowed'] += game['Away Score']

        # Update stats for the away team
        team_stats[away_team]['Games Played'] += 1
        team_stats[away_team]['Total Points Scored'] += game['Away Score']
        team_stats[away_team]['Total Points Allowed'] += game['Home Score']

        # Determine and update actual wins/losses
        if game['Home Score'] > game['Away Score']:
            team_stats[home_team]['Wins'] += 1
            team_stats[away_team]['Losses'] += 1
            if is_conference_game:
                team_stats[home_team]['Conference Wins'] += 1
                team_stats[away_team]['Conference Losses'] += 1
        else:
            team_stats[away_team]['Wins'] += 1
            team_stats[home_team]['Losses'] += 1
            if is_conference_game:
                team_stats[away_team]['Conference Wins'] += 1
                team_stats[home_team]['Conference Losses'] += 1

    # Convert the dictionary to a DataFrame for easier analysis
    team_records = pd.DataFrame.from_dict(team_stats, orient='index').reset_index()
    team_records.rename(columns={'index': 'Team'}, inplace=True)

    # Add additional statistics like average points, point differential, etc.
    team_records['Average Points Scored'] = team_records['Total Points Scored'] / team_records['Games Played']
    team_records['Average Points Allowed'] = team_records['Total Points Allowed'] / team_records['Games Played']
    team_records['Point Differential'] = team_records['Average Points Scored'] - team_records['Average Points Allowed']

    # Merge the 'Conference' and 'Division' columns from df_2024_team_info into team_records
    team_records = pd.merge(team_records, df_2024_team_info[['Team', 'Conference', 'Division']], on='Team', how='left')

    # Apply the expected wins calculation
    df_regular_season_results = calculate_expected_wins(df_regular_season_results)

    # Example usage: Calculate SOR for all teams in the dataset
    teams = pd.concat([df_regular_season_results['Home Team'], df_regular_season_results['Away Team']]).unique()

    sor_results = []

    for team in teams:
        sor_value = calculate_sor_for_team(team, df_regular_season_results)
        sor_results.append({'Team': team, 'SOR': sor_value})

    # Create a DataFrame to store the SOR results
    df_sor = pd.DataFrame(sor_results)

    # Perform an inner join to merge the Strength of Record (SOR) into the team_records dataset
    team_records = team_records.merge(df_sor, on='Team', how='left')

    return team_records

def prep_final_rankings(team_records, df_regular_season_results):

    df_final_rankings = rank_teams_in_conference(team_records, df_regular_season_results)

    return df_final_rankings

def prep_championship_data(df_final_rankings, df_all_stats, df_schedule_with_full_stats):
    df_championship_games = set_up_conference_championships(df_final_rankings)

    df_championship_with_full_stats = set_up_championship_game_stats(df_championship_games, df_all_stats)

    # Step 1: Compare column names in both dataframes
    schedule_columns = df_schedule_with_full_stats.columns.tolist()
    championship_columns = df_championship_with_full_stats.columns.tolist()

    # Check if columns are identical
    if schedule_columns == championship_columns:
        print("Columns are identical and in the same order.")
    else:

        # Step 2: Find differences
        missing_in_championship = [col for col in schedule_columns if col not in championship_columns]
        missing_in_schedule = [col for col in championship_columns if col not in schedule_columns]

        #print(f"Missing in championship dataset: {missing_in_championship}")
        #print(f"Missing in schedule dataset: {missing_in_schedule}")

        # Step 3: Reorder championship columns to match schedule columns (ignoring missing columns for now)
        common_columns = [col for col in schedule_columns if col in championship_columns]
        df_championship_with_full_stats = df_championship_with_full_stats[common_columns]

        # Step 4: Add missing columns to df_championship_with_full_stats with default values
        for col in missing_in_championship:
            df_championship_with_full_stats[col] = None  # or appropriate default value

        # Step 5: Recheck and reorder columns
        df_championship_with_full_stats = df_championship_with_full_stats[schedule_columns]

    # Step 1: Remove/rename unnecessary columns in df_championship_with_stats
    df_championship_with_full_stats = df_championship_with_full_stats.drop(columns=['Conference_x', 'Year_home', 'Conference_y', 'Year_away', 'Conference'], errors='ignore')

    # Step 2: Add missing columns to df_championship_with_full_stats with default values
    missing_columns = ['Year', 'Week', 'Neutral Site', 'Conference Game', 'Home Conference', 'Home Points', 'Away Conference', 'Away Points']
    df_championship_with_full_stats['Year'] = 2024
    df_championship_with_full_stats['Week'] = "Conf Champ"
    for col in missing_columns:
        if col not in df_championship_with_full_stats.columns:
            df_championship_with_full_stats[col] = None  # Default value; adjust based on your data

    # Step 3: Reorder columns in df_championship_with_stats to match df_schedule_with_full_stats
    df_championship_with_full_stats = df_championship_with_full_stats[df_schedule_with_full_stats.columns]

    # Set 'Conference Game' and 'Neutral Site' to True for championship games
    df_championship_with_full_stats['Conference Game'] = False
    df_championship_with_full_stats['Neutral Site'] = True

    return df_championship_with_full_stats

def predict_championship_outcomes(df_championship, home_model, away_model, scaler):
    results = []

    for idx, game in df_championship.iterrows():
        # Home and away teams
        home_team = game['Home Team']
        away_team = game['Away Team']
        game['Home Coming Off Bye'] = False
        game['Away Coming Off Bye'] = False
        game['Home Consecutive Games'] = 0
        game['Away Consecutive Games'] = 0
        game['Home Consecutive Away Games'] = 0
        game['Away Consecutive Away Games'] = 0

        # Drop unnecessary columns and reshape the input for the model
        X_game = game.drop(labels=['Year', 'Home Points', 'Away Points', 'Week', 'Home Team', 'Home Conference', 'Away Team', 'Away Conference']).values.reshape(1, -1)

        # Scale the input features
        X_game_scaled = scaler.transform(X_game)

        # Suppress warnings during prediction
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            # Make predictions
            home_mean_array, home_std_array = home_model.predict(X_game_scaled, return_std=True)
            away_mean_array, away_std_array = away_model.predict(X_game_scaled, return_std=True)

            home_mean = home_mean_array[0]
            home_std = home_std_array[0]
            away_mean = away_mean_array[0]
            away_std = away_std_array[0]

        # Sample from the distributions to simulate the game outcome
        home_sampled_score = np.random.normal(home_mean, home_std)
        away_sampled_score = np.random.normal(away_mean, away_std)

        # Round the sampled scores up
        final_home_score = np.ceil(home_sampled_score)
        final_away_score = np.ceil(away_sampled_score)

        # Handle tie-breaking by checking original means
        if final_home_score == final_away_score:
            if home_mean > away_mean:
                final_home_score += 1
            else:
                final_away_score += 1

        home_team_win = final_home_score > final_away_score

        # Store the results
        results.append({
            'Year': game['Year'],
            'Home Team': home_team,
            'Home Conference': game['Home Conference'],
            'Away Team': away_team,
            'Away Conference': game['Away Conference'],
            'Week': game['Week'],
            'Home Score': final_home_score,
            'Away Score': final_away_score,
            'Home Team Win': home_team_win,
            'Conference Game': game['Conference Game'],
            'Expected_Wins_Home': home_team_win.astype(int),
            'Expected_Wins_Away': 1 - home_team_win.astype(int)
        })

    # Convert results to DataFrame
    df_results = pd.DataFrame(results)

    return df_results

def update_rankings_with_championship(df_final_rankings, df_championship_results):
    # Add new columns for Conference Champion and Conference Runner Up
    df_final_rankings['Conference Champion'] = None
    df_final_rankings['Conference Runner Up'] = None

    # Update total wins, losses, win probabilities, and loss probabilities based on championship results
    for idx, game in df_championship_results.iterrows():
        home_team = game['Home Team']
        away_team = game['Away Team']
        home_score = game['Home Score']
        away_score = game['Away Score']

        # Determine winner and runner up based on win probabilities
        if home_score >= away_score:
            champion = home_team
            runner_up = away_team
        else:
            champion = away_team
            runner_up = home_team

        # Update Conference Champion and Runner Up in df_final_rankings
        df_final_rankings.loc[df_final_rankings['Team'] == champion, 'Conference Champion'] = True
        df_final_rankings.loc[df_final_rankings['Team'] == runner_up, 'Conference Runner Up'] = True

        # Update total wins, losses, and probabilities for both teams
        df_final_rankings.loc[df_final_rankings['Team'] == champion, 'Wins'] += 1
        df_final_rankings.loc[df_final_rankings['Team'] == runner_up, 'Losses'] += 1

        df_final_rankings.loc[df_final_rankings['Team'] == champion, 'Games Played'] += 1
        df_final_rankings.loc[df_final_rankings['Team'] == runner_up, 'Games Played'] += 1

    # Ensure that the columns 'Conference Champion' and 'Conference Runner Up' are properly filled
    df_final_rankings['Conference Champion'] = df_final_rankings['Conference Champion'].fillna(False)
    df_final_rankings['Conference Runner Up'] = df_final_rankings['Conference Runner Up'].fillna(False)

    return df_final_rankings

def calculate_sos(df_results, df_team_info):
    team_sos = {}

    for team in df_team_info['Team']:
        # Get all games involving this team (both home and away)
        team_games = df_results[(df_results['Home Team'] == team) | (df_results['Away Team'] == team)]

        opponent_win_percs = []

        for idx, game in team_games.iterrows():
            if game['Home Team'] == team:
                opponent = game['Away Team']
            else:
                opponent = game['Home Team']

            # Get opponent's win percentage from the final rankings
            opponent_data = df_team_info[df_team_info['Team'] == opponent]
            if not opponent_data.empty:
                opponent_wins = opponent_data['Wins'].values[0]
                opponent_losses = opponent_data['Losses'].values[0]
                if opponent_wins + opponent_losses > 0:
                    opponent_win_percentage = opponent_wins / (opponent_wins + opponent_losses)
                else:
                    opponent_win_percentage = 0  # To avoid division by zero if the opponent has no games
                opponent_win_percs.append(opponent_win_percentage)

        # Strength of schedule is the average win percentage of all opponents
        if opponent_win_percs:
            team_sos[team] = np.mean(opponent_win_percs)
        else:
            team_sos[team] = 0  # In case there are no games for a team (which shouldn't happen)

    # Convert to DataFrame and merge with team info
    df_sos = pd.DataFrame(list(team_sos.items()), columns=['Team', 'SOS'])
    df_team_info_with_sos = pd.merge(df_team_info, df_sos, on='Team')

    return df_team_info_with_sos


def calculate_adjusted_wins(df_team_info_with_sos):
    # Adjust wins based on strength of schedule (higher SOS means more credit for wins)
    df_team_info_with_sos['Adjusted Wins'] = df_team_info_with_sos['Wins'] * df_team_info_with_sos['SOS']
    df_team_info_with_sos['Adjusted Losses'] = df_team_info_with_sos['Losses'] / df_team_info_with_sos['SOS']

    return df_team_info_with_sos

def calculate_point_differential(df_results, df_team_info):
    point_differentials = {}

    for team in df_team_info['Team']:
        # Get all games involving this team
        team_games = df_results[(df_results['Home Team'] == team) | (df_results['Away Team'] == team)]
        total_point_diff = 0

        for idx, game in team_games.iterrows():
            if game['Home Team'] == team:
                point_diff = game['Home Score'] - game['Away Score']
            else:
                point_diff = game['Away Score'] - game['Home Score']

            total_point_diff += point_diff

        point_differentials[team] = total_point_diff

    # Convert to DataFrame and merge with team info
    df_point_diff = pd.DataFrame(list(point_differentials.items()), columns=['Team', 'Point Differential'])

    # Ensure the column names are correctly merged
    df_team_info_with_point_diff = pd.merge(df_team_info, df_point_diff, on='Team', how='left')

    # Rename column if necessary to avoid clashes
    df_team_info_with_point_diff = df_team_info_with_point_diff.rename(columns={'Point Differential_y': 'Point Differential'})

    return df_team_info_with_point_diff

def rank_teams(df_team_info_with_metrics):
    # Ensure 'Point Differential' exists
    if 'Point Differential' not in df_team_info_with_metrics.columns:
        raise KeyError("'Point Differential' column is missing in DataFrame")

    # Define a ranking formula combining adjusted wins, SOS, and point differential
    df_team_info_with_metrics['Rank Score'] = (
        df_team_info_with_metrics['Adjusted Wins'] * 0.6 +  # Weight adjusted wins the highest
        df_team_info_with_metrics['SOS'] * 0.3 +  # Give weight to strength of schedule
        df_team_info_with_metrics['Point Differential'] * 0.1  # Include point differential
    )

    # Sort teams by the final rank score
    df_team_info_with_metrics = df_team_info_with_metrics.sort_values(by='Rank Score', ascending=False).reset_index(drop=True)

    return df_team_info_with_metrics

def prep_playoff_game_stats(df_pre_playoff_results, df_final_rankings_updated):
    # Calculate SOS
    df_team_info_with_sos = calculate_sos(df_pre_playoff_results, df_final_rankings_updated)

    # Calculate Adjusted Wins
    df_team_info_with_adjusted_wins = calculate_adjusted_wins(df_team_info_with_sos)

    # Calculate Point Differential
    df_team_info_with_metrics = calculate_point_differential(df_pre_playoff_results, df_team_info_with_adjusted_wins)

    # Rank Teams
    df_ranked_teams = rank_teams(df_team_info_with_metrics)

    # Rename the existing 'Rank' column to 'Conference Rank'
    df_ranked_teams = df_ranked_teams.rename(columns={'Rank': 'Conference Rank'})

    # Add the 'FBS Rank' column, which ranks teams among all teams
    df_ranked_teams['FBS Rank'] = df_ranked_teams['Rank Score'].rank(ascending=False, method='min').astype(int)

    # Step 1: Select the Top 4 Conference Champions
    conference_champions = df_ranked_teams[df_ranked_teams['Conference Champion'] == True]
    top_conference_champions = conference_champions.sort_values(by='FBS Rank').head(4)

    # Step 2: Select Seeds 5-12 from the remaining highest-ranked teams
    remaining_teams = df_ranked_teams[~df_ranked_teams['Team'].isin(top_conference_champions['Team'])]
    next_highest_teams = remaining_teams.sort_values(by='FBS Rank').head(8)

    # Combine top conference champions and the next highest-ranked teams
    cfp_bracket = pd.concat([top_conference_champions, next_highest_teams])

    # Ensure at least 5 Conference Champions
    # Check if the bracket already has at least 5 conference champions
    current_champions_count = df_ranked_teams[df_ranked_teams['Team'].isin(cfp_bracket['Team'])]['Conference Champion'].sum()

    if current_champions_count < 5:
        # Find additional conference champions to add to the bracket
        additional_champions_needed = 5 - current_champions_count
        additional_champions = conference_champions[
            ~conference_champions['Team'].isin(cfp_bracket['Team'])
        ].sort_values(by='FBS Rank').head(additional_champions_needed)

        # Find the team(s) to remove if necessary
        if additional_champions.shape[0] > 0:
            # Remove the team with the lowest FBS Rank from current bracket
            lowest_seed_team = cfp_bracket.sort_values(by='FBS Rank').tail(1)
            cfp_bracket = cfp_bracket[~cfp_bracket['Team'].isin(lowest_seed_team['Team'])]

            # Add additional champions to the final bracket
            cfp_bracket = pd.concat([cfp_bracket, additional_champions])

    # Add seed numbers: Seeds 1-4 are for conference champions
    cfp_bracket['Seed'] = list(range(1, min(5, len(cfp_bracket) + 1))) + list(range(5, len(cfp_bracket) + 1))

    # Step 2: Update CFP Seed for teams that made the playoffs
    for index, row in cfp_bracket.iterrows():
        team_name = row['Team']
        seed = row['Seed']
        df_ranked_teams.loc[df_ranked_teams['Team'] == team_name, 'CFP Seed'] = seed

    return df_ranked_teams

def set_up_playoff_game_stats(df_playoff_games, df_all_stats, df_schedule_with_full_stats):
    # Rename columns in df_all_stats for merging with home and away teams
    home_stats = df_all_stats.rename(columns={'Team': 'Home Team'})
    away_stats = df_all_stats.rename(columns={'Team': 'Away Team'})

    # Merge df_playoff_games with home_stats and away_stats
    df_playoff_with_home_stats = df_playoff_games.merge(
        home_stats, left_on=['Home Team'], right_on=['Home Team'], how='left'
    )
    df_playoff_with_full_stats = df_playoff_with_home_stats.merge(
        away_stats, left_on=['Away Team'], right_on=['Away Team'], suffixes=('_home', '_away'), how='left'
    )

    # Remove duplicates
    df_playoff_with_full_stats = df_playoff_with_full_stats.drop_duplicates(subset=['Home Team', 'Away Team'])

    # Adjust columns to match the model's input
    schedule_columns = df_schedule_with_full_stats.columns.tolist()
    playoff_columns = df_playoff_with_full_stats.columns.tolist()

    if schedule_columns != playoff_columns:
        missing_in_playoff = [col for col in schedule_columns if col not in playoff_columns]
        missing_in_schedule = [col for col in playoff_columns if col not in schedule_columns]

        # Add missing columns with default values
        for col in missing_in_playoff:
            df_playoff_with_full_stats[col] = None

        # Reorder columns
        df_playoff_with_full_stats = df_playoff_with_full_stats[schedule_columns]

    # Remove/rename unnecessary columns
    df_playoff_with_full_stats = df_playoff_with_full_stats.drop(columns=['Conference_x', 'Year_home', 'Conference_y', 'Year_away', 'Conference'], errors='ignore')
    df_playoff_with_full_stats['Year'] = 2024
    df_playoff_with_full_stats['Week'] = "Playoffs"
    # Add missing columns with default values
    missing_columns = ['Year', 'Week', 'Neutral Site', 'Conference Game', 'Home Conference', 'Home Points', 'Away Conference', 'Away Points']
    for col in missing_columns:
        if col not in df_playoff_with_full_stats.columns:
            df_playoff_with_full_stats[col] = None

    # Final column order
    df_playoff_with_full_stats = df_playoff_with_full_stats[schedule_columns]

    # Set 'Conference Game' and 'Neutral Site' to True for playoff games
    df_playoff_with_full_stats['Conference Game'] = False
    df_playoff_with_full_stats['Neutral Site'] = True

    return df_playoff_with_full_stats

def simulate_playoff_round(df_playoff_games, home_model, away_model, scaler, round_name):
    # Simulate the outcomes for each game in the given round
    results = []

    for idx, game in df_playoff_games.iterrows():
        # Home and away teams
        home_team = game['Home Team']
        away_team = game['Away Team']
        game['Home Coming Off Bye'] = False
        game['Away Coming Off Bye'] = False
        game['Home Consecutive Games'] = 0
        game['Away Consecutive Games'] = 0
        game['Home Consecutive Away Games'] = 0
        game['Away Consecutive Away Games'] = 0

        # Drop unnecessary columns and reshape the input for the model
        X_game = game.drop(labels=['Year', 'Home Points', 'Away Points', 'Week', 'Home Team', 'Home Conference', 'Away Team', 'Away Conference']).values.reshape(1, -1)

        # Scale the input features
        X_game_scaled = scaler.transform(X_game)

        # Suppress warnings during prediction
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            # Make predictions
            home_mean_array, home_std_array = home_model.predict(X_game_scaled, return_std=True)
            away_mean_array, away_std_array = away_model.predict(X_game_scaled, return_std=True)

            home_mean = home_mean_array[0]
            home_std = home_std_array[0]
            away_mean = away_mean_array[0]
            away_std = away_std_array[0]

        # Sample from the distributions to simulate the game outcome
        home_sampled_score = np.random.normal(home_mean, home_std)
        away_sampled_score = np.random.normal(away_mean, away_std)

        # Round the sampled scores up
        final_home_score = np.ceil(home_sampled_score)
        final_away_score = np.ceil(away_sampled_score)

        # Handle tie-breaking by checking original means
        if final_home_score == final_away_score:
            if home_mean > away_mean:
                final_home_score += 1
            else:
                final_away_score += 1

        # Determine the winner based on win percentage
        winner = home_team if final_home_score > final_away_score else away_team

        home_team_win = final_home_score > final_away_score

        # Store the results
        results.append({
            'Year': game['Year'],
            'Home Team': home_team,
            'Home Conference': game['Home Conference'],
            'Away Team': away_team,
            'Away Conference': game['Away Conference'],
            'Week': game['Week'],
            'Home Score': final_home_score,
            'Away Score': final_away_score,
            'Home Team Win': home_team_win,
            'Conference Game': game['Conference Game'],
            'Expected_Wins_Home': home_team_win.astype(int),
            'Expected_Wins_Away': 1 - home_team_win.astype(int),
            'Round': round_name,
            'Winner': winner
        })

    # Convert results to DataFrame
    df_results = pd.DataFrame(results)

    return df_results

def simulate_playoff_bracket(df_ranked_teams, home_model, away_model, scaler, df_schedule_with_full_stats):
    # Define Round 1 matchups
    round_1_matchups = [
        ('5', '12'),
        ('6', '11'),
        ('7', '10'),
        ('8', '9')
    ]

    # Prepare DataFrame for Round 1
    df_round_1 = pd.DataFrame(columns=['Home Team', 'Away Team'])
    for home_seed, away_seed in round_1_matchups:
        home_team = df_ranked_teams[df_ranked_teams['CFP Seed'] == int(home_seed)]['Team'].values[0]
        away_team = df_ranked_teams[df_ranked_teams['CFP Seed'] == int(away_seed)]['Team'].values[0]
        df_round_1 = pd.concat([df_round_1, pd.DataFrame({
            'Home Team': [home_team],
            'Away Team': [away_team]
        })], ignore_index=True)

    # Set up stats for Round 1
    df_round_1_with_stats = set_up_playoff_game_stats(df_round_1, df_all_stats, df_schedule_with_full_stats)

    # Simulate Round 1
    df_round_1_results = simulate_playoff_round(df_round_1_with_stats, home_model, away_model, scaler, 'Round 1')

    # Determine Quarterfinals matchups
    round_1_winners = df_round_1_results[['Winner']].rename(columns={'Winner': 'Team'})
    seeds = [1, 2, 3, 4]
    quarterfinals_matchups = [(df_ranked_teams[df_ranked_teams['CFP Seed'] == seed]['Team'].values[0], round_1_winners.iloc[i]['Team']) for i, seed in enumerate(seeds)]

    # Prepare DataFrame for Quarterfinals
    df_quarterfinals = pd.DataFrame(columns=['Home Team', 'Away Team'])
    for home_team, away_team in quarterfinals_matchups:
        df_quarterfinals = pd.concat([df_quarterfinals, pd.DataFrame({
            'Home Team': [home_team],
            'Away Team': [away_team]
        })], ignore_index=True)

    # Set up stats for Quarterfinals
    df_quarterfinals_with_stats = set_up_playoff_game_stats(df_quarterfinals, df_all_stats, df_schedule_with_full_stats)

    # Simulate Quarterfinals
    df_quarterfinals_results = simulate_playoff_round(df_quarterfinals_with_stats, home_model, away_model, scaler, 'Quarterfinals')

    # Determine Semifinals matchups
    semifinal_winners = df_quarterfinals_results[['Winner']].rename(columns={'Winner': 'Team'})
    semifinal_matchups = [(semifinal_winners.iloc[i]['Team'], semifinal_winners.iloc[i+1]['Team']) for i in range(0, len(semifinal_winners), 2)]

    # Prepare DataFrame for Semifinals
    df_semifinals = pd.DataFrame(columns=['Home Team', 'Away Team'])
    for home_team, away_team in semifinal_matchups:
        df_semifinals = pd.concat([df_semifinals, pd.DataFrame({
            'Home Team': [home_team],
            'Away Team': [away_team]
        })], ignore_index=True)

    # Set up stats for Semifinals
    df_semifinals_with_stats = set_up_playoff_game_stats(df_semifinals, df_all_stats, df_schedule_with_full_stats)

    # Simulate Semifinals
    df_semifinals_results = simulate_playoff_round(df_semifinals_with_stats, home_model, away_model, scaler, 'Semifinals')

    # Determine Championship matchup
    championship_matchup = df_semifinals_results[['Winner']].rename(columns={'Winner': 'Team'}).values.tolist()
    df_championship = pd.DataFrame({
        'Home Team': [championship_matchup[0][0]],
        'Away Team': [championship_matchup[1][0]]
    })

    # Set up stats for Championship
    df_championship_with_stats = set_up_playoff_game_stats(df_championship, df_all_stats, df_schedule_with_full_stats)

    # Simulate Championship
    df_championship_results = simulate_playoff_round(df_championship_with_stats, home_model, away_model, scaler, 'Championship')

    # Combine all results
    all_results = pd.concat([
        df_round_1_results,
        df_quarterfinals_results,
        df_semifinals_results,
        df_championship_results
    ], ignore_index=True)

    return all_results

def update_playoff_wins(df_ranked_teams, playoff_results):
    # Initialize 'Playoff Wins' column to 0
    df_ranked_teams['Playoff Wins'] = 0

    # Count playoff wins for each team
    # Include all rounds for calculating playoff wins
    playoff_wins_count = playoff_results.groupby('Winner').size()

    # Update the 'Playoff Wins' column
    for team, wins in playoff_wins_count.items():
        if team in df_ranked_teams['Team'].values:
            df_ranked_teams.loc[df_ranked_teams['Team'] == team, 'Playoff Wins'] = wins

    return df_ranked_teams

def update_final_rankings_with_championship(df_ranked_teams):
    # Define the desired order of columns
    desired_order = ['FBS Rank', 'Team', 'Conference', 'Games Played', 'Wins', 'Losses', 'CFP Seed', 'Playoff Wins', 'Conference Champion', 'Conference Runner Up', 'Conference Rank']

    # Get the remaining columns
    remaining_columns = [col for col in df_ranked_teams.columns if col not in desired_order]

    # Concatenate the desired order and remaining columns
    final_order = desired_order + remaining_columns

#    Reorder the DataFrame
    df_ranked_teams = df_ranked_teams[final_order]

    return df_ranked_teams

def calculate_betting_odds(percentage):
    """Convert a percentage to American betting odds."""
    if percentage == 0:
        return np.inf  # Infinite odds if the event never happens
    if percentage == 100:
        return -np.inf  # Impossible odds if the event always happens

    probability = percentage / 100

    if probability >= 0.5:
        american_odds = -100 * (probability / (1 - probability))
    else:
        american_odds = 100 * ((1 - probability) / probability)

    return round(american_odds, 2)

def calculate_team_statistics(season_results_list):
    # Combine all simulated season results into a single DataFrame
    combined_results = pd.concat(season_results_list, ignore_index=True)

    # Ensure columns are of the correct type
    numeric_columns = ['CFP Seed', 'Wins', 'Losses', 'Conference Wins', 'Conference Losses',
                       'Total Points Scored', 'Total Points Allowed', 'Average Points Scored',
                       'Average Points Allowed', 'Point Differential_x', 'SOR', 'SOS',
                       'Adjusted Wins', 'Adjusted Losses', 'Point Differential', 'Rank Score']

    for col in numeric_columns:
        combined_results[col] = pd.to_numeric(combined_results[col], errors='coerce')

    # Handle categorical columns
    combined_results['Conference Champion'] = combined_results['Conference Champion'].astype(bool)
    combined_results['Conference Runner Up'] = combined_results['Conference Runner Up'].astype(bool)

    # Reset index to ensure 'Team' is a column
    combined_results = combined_results.reset_index(drop=True)

    # Define the function to adjust wins and losses based on conference championship outcomes
    def adjust_wins_losses_for_conference_game(group):
        group = group.copy()

        # Adjust wins and losses based on conference championship outcomes
        group.loc[group['Conference Champion'], 'Wins'] -= 1
        group.loc[group['Conference Runner Up'], 'Losses'] -= 1

        return group

    # Apply the adjustment function to each group of data (team)
    adjusted_results = combined_results.groupby('Team').apply(adjust_wins_losses_for_conference_game).reset_index(drop=True)

    # Initialize the stats DataFrame
    stats = pd.DataFrame()

    # Recalculate the average regular season wins and losses after adjustments
    stats['Avg_Regular_Season_Wins'] = adjusted_results.groupby('Team')['Wins'].mean()
    stats['Avg_Regular_Season_Losses'] = adjusted_results.groupby('Team')['Losses'].mean()

    # Similarly, calculate the average regular season conference wins and losses
    stats['Avg_Regular_Season_Conf_Wins'] = adjusted_results.groupby('Team')['Conference Wins'].mean()
    stats['Avg_Regular_Season_Conf_Losses'] = adjusted_results.groupby('Team')['Conference Losses'].mean()

    # Calculate average total season wins and losses (including conference championship games and playoffs)
    stats['Avg_Total_Season_Wins'] = adjusted_results.groupby('Team')['Wins'].mean() + adjusted_results.groupby('Team')['Playoff Wins'].mean()
    stats['Avg_Total_Season_Losses'] = adjusted_results.groupby('Team')['Losses'].mean()

    # Best and Worst Recorded Records
    stats['Best_Record_Seen'] = adjusted_results.groupby('Team').apply(lambda x: f"{x['Wins'].max()}-{x['Losses'].min()}")
    stats['Worst_Record_Seen'] = adjusted_results.groupby('Team').apply(lambda x: f"{x['Wins'].min()}-{x['Losses'].max()}")

    # 95th and 5th Percentile Records
    stats['95th_Record_High'] = adjusted_results.groupby('Team').apply(lambda x: f"{int(x['Wins'].quantile(0.95))}-{int(x['Losses'].quantile(0.05))}")
    stats['95th_Record_Low'] = adjusted_results.groupby('Team').apply(lambda x: f"{int(x['Wins'].quantile(0.05))}-{int(x['Losses'].quantile(0.95))}")

    # Calculate percentages
    stats['%_Bowling'] = adjusted_results.groupby('Team').apply(lambda x: (x['Wins'] >= 6).mean() * 100)
    stats['%_Conf_Champ_Appearance'] = adjusted_results.groupby('Team').apply(lambda x: ((x['Conference Champion']) | (x['Conference Runner Up'])).mean() * 100)
    stats['%_Conf_Champion'] = adjusted_results.groupby('Team')['Conference Champion'].mean() * 100
    stats['%_Makes_Playoffs'] = adjusted_results.groupby('Team').apply(lambda x: (x['CFP Seed'] <= 12).mean() * 100)
    stats['%_Playoff_Bye'] = adjusted_results.groupby('Team').apply(lambda x: (x['CFP Seed'] <= 4).mean() * 100)
    stats['%_Playoff_Quarterfinals'] = adjusted_results.groupby('Team').apply(lambda x: (((x['CFP Seed'] <= 4) & (x['Playoff Wins'] >= 1)) | ((x['CFP Seed'] > 4) & (x['Playoff Wins'] >= 1))).mean() * 100)
    stats['%_Playoff_Semifinals'] = adjusted_results.groupby('Team').apply(lambda x: (((x['CFP Seed'] <= 4) & (x['Playoff Wins'] >= 1)) | ((x['CFP Seed'] > 4) & (x['Playoff Wins'] >= 2))).mean() * 100)
    stats['%_Championship_Game'] = adjusted_results.groupby('Team').apply(lambda x: (((x['CFP Seed'] <= 4) & (x['Playoff Wins'] >= 2)) | ((x['CFP Seed'] > 4) & (x['Playoff Wins'] >= 3))).mean() * 100)
    stats['%_National_Champion'] = adjusted_results.groupby('Team').apply(lambda x: (((x['CFP Seed'] <= 4) & (x['Playoff Wins'] >= 3)) | ((x['CFP Seed'] > 4) & (x['Playoff Wins'] >= 4))).mean() * 100)

    # Calculate Over/Under percentages
    stats['%_Over'] = adjusted_results.groupby('Team').apply(lambda x: (x['Wins'] > stats.loc[x.name, 'Avg_Regular_Season_Wins']).mean() * 100)
    stats['%_Under'] = 100 - stats['%_Over']  # The complement of the Over percentage

    # Add betting odds columns based on the calculated percentages
    stats['Odds_Conf_Champ_Appearance'] = stats['%_Conf_Champ_Appearance'].apply(calculate_betting_odds)
    stats['Odds_Conf_Champion'] = stats['%_Conf_Champion'].apply(calculate_betting_odds)
    stats['Odds_Makes_Playoffs'] = stats['%_Makes_Playoffs'].apply(calculate_betting_odds)
    stats['Odds_Playoff_Bye'] = stats['%_Playoff_Bye'].apply(calculate_betting_odds)
    stats['Odds_Playoff_Quarterfinals'] = stats['%_Playoff_Quarterfinals'].apply(calculate_betting_odds)
    stats['Odds_Playoff_Semifinals'] = stats['%_Playoff_Semifinals'].apply(calculate_betting_odds)
    stats['Odds_Championship_Game'] = stats['%_Championship_Game'].apply(calculate_betting_odds)
    stats['Odds_National_Champion'] = stats['%_National_Champion'].apply(calculate_betting_odds)

    # Add Over/Under betting odds
    stats['Odds_Over'] = stats['%_Over'].apply(calculate_betting_odds)
    stats['Odds_Under'] = stats['%_Under'].apply(calculate_betting_odds)

    # Calculate averages for additional stats (excluding categorical columns)
    additional_stats = ['Total Points Scored', 'Total Points Allowed',
                        'Average Points Scored', 'Average Points Allowed', 'Point Differential_x',
                        'SOR', 'SOS', 'Adjusted Wins', 'Adjusted Losses',
                        'Point Differential', 'Rank Score']

    for stat in additional_stats:
        stats[f'Avg_{stat}'] = adjusted_results.groupby('Team')[stat].mean()

    # Round all numerical values to 3 decimal places
    stats = stats.round(3)

    return stats.reset_index()


In [5]:
def simulate_seasons(n_simulations, year, df_schedule, home_model, away_model, scaler, df_2024_team_info):
    season_results = []
    games_results = []
    # Gererate game_score_prediction_distributions
    df_regular_season_distributions = predict_regular_season_distributions(year, df_schedule_with_full_stats, home_model, away_model, scaler, df_2024_team_info)
    # Loop to simulate the season n_simulations times
    for sim in range(n_simulations):
        gc.collect()  # Garbage collection to free up memory
        clear_output(wait=True)
        print(f"Simulating season {sim + 1} of {n_simulations}")
        # Simulate one season
        df_regular_season_result = sample_from_regular_season_distributions(df_regular_season_distributions)
        team_records = prep_team_records(df_regular_season_result)
        df_final_rankings = prep_final_rankings(team_records, df_regular_season_result)
        df_championship = prep_championship_data(df_final_rankings, df_all_stats, df_schedule_with_full_stats)
        df_conference_championships_result = predict_championship_outcomes(df_championship, home_model, away_model, scaler)
        df_pre_playoff_results = pd.concat([df_regular_season_result, df_conference_championships_result], ignore_index=True)
        df_final_rankings_updated = update_rankings_with_championship(df_final_rankings, df_conference_championships_result)
        df_ranked_teams = prep_playoff_game_stats(df_pre_playoff_results, df_final_rankings_updated)
        df_playoff_results = simulate_playoff_bracket(df_ranked_teams, home_model, away_model, scaler, df_schedule_with_full_stats)
        df_ranked_teams = update_playoff_wins(df_ranked_teams, df_playoff_results)
        df_full_season_result = update_final_rankings_with_championship(df_ranked_teams)
        df_full_games_list = pd.concat([df_pre_playoff_results, df_playoff_results], ignore_index=True) #########
        # Store the simulated season result dataframe
        season_results.append(df_full_season_result)
        games_results.append(df_full_games_list)

        # Clean up to free memory
        del df_regular_season_result
        del team_records
        del df_final_rankings
        del df_championship
        del df_conference_championships_result
        del df_ranked_teams
        del df_playoff_results
        del df_full_season_result
        del df_full_games_list

    return season_results, games_results

# Example usage:
year = 2024  # Example year
n_simulations = 3000  # Number of times to simulate the season
df_schedule_with_full_stats = initial_data_prep(df_schedule, df_all_stats, year)
season_results_list, full_games_list = simulate_seasons(n_simulations, year, df_schedule_with_full_stats, gpr_home, gpr_away, scaler, df_2024_team_info) #########

# Calculate all the statistics based on the simulation results
team_stats = calculate_team_statistics(season_results_list)

pd.concat(season_results_list).to_csv("/content/drive/MyDrive/CFB_Model/season_results_list_3000_sims.csv", index=False)
pd.concat(full_games_list).to_csv("/content/drive/MyDrive/CFB_Model/full_games_list_3000_sims.csv", index=False)
team_stats.to_csv("/content/drive/MyDrive/CFB_Model/2024_monte_carlo_3000_sims.csv", index=False)

Simulating season 3000 of 3000


In [6]:
# Concatenate the list of DataFrames into a single DataFrame
df_full_games = pd.concat(full_games_list, ignore_index=True)

# Filter for regular-season games (where Week isn't blank)
df_regular_season = df_full_games[df_full_games['Week'].notna()]

# Group by Year, Week, Home Team, and Away Team to aggregate statistics for each game
grouped = df_regular_season.groupby(['Year', 'Week', 'Home Team', 'Away Team'])

# Calculate the required statistics
game_stats = grouped.agg(
    Avg_Home_Score=('Home Score', 'mean'),
    Avg_Away_Score=('Away Score', 'mean'),
    Home_Win_Percentage=('Home Team Win', 'mean')
).reset_index()

# Calculate the away team win percentage (complement of home win percentage)
game_stats['Away_Win_Percentage'] = 1 - game_stats['Home_Win_Percentage']

# Calculate the game spread
game_stats['Game_Spread'] = game_stats.apply(
    lambda row: f"{row['Home Team'] if row['Avg_Home_Score'] > row['Avg_Away_Score'] else row['Away Team']} - {abs(row['Avg_Home_Score'] - row['Avg_Away_Score']):.2f}",
    axis=1
)

# Calculate the Over/Under
game_stats['Over_Under'] = game_stats.apply(
    lambda row: f"O/U: {row['Avg_Home_Score'] + row['Avg_Away_Score']:.2f}",
    axis=1
)

# Display or save the resulting DataFrame
print(game_stats)

# Optionally, save the results to a CSV file
game_stats.to_csv("/content/drive/MyDrive/CFB_Model/full_game_results_list.csv", index=False)

      Year      Week       Home Team         Away Team  Avg_Home_Score  \
0     2024         0    Georgia Tech     Florida State       18.982667   
1     2024         0          Nevada               SMU       15.969000   
2     2024         1         Alabama  Western Kentucky       47.165333   
3     2024         1         Arizona        New Mexico       46.817667   
4     2024         1   Arizona State           Wyoming       29.659333   
...    ...       ...             ...               ...             ...   
1355  2024  Playoffs            Troy            Oregon       23.000000   
1356  2024  Playoffs            Troy      Oregon State       23.000000   
1357  2024  Playoffs            Troy        Penn State       12.000000   
1358  2024  Playoffs            Troy             Texas       14.000000   
1359  2024  Playoffs  UT San Antonio           Alabama       20.000000   

      Avg_Away_Score  Home_Win_Percentage  Away_Win_Percentage  \
0          35.685000             0.000000    