In [1]:
!pip install joblib
!pip install torch gpytorch

Collecting gpytorch
  Downloading gpytorch-1.12-py3-none-any.whl.metadata (8.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cur

In [2]:
import pandas as pd
import numpy as np
import joblib
import torch
import warnings
import gpytorch
import io
import os
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from joblib import Parallel, delayed
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.experimental import enable_iterative_imputer  # To enable IterativeImputer
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C, WhiteKernel
from sklearn.impute import IterativeImputer
from google.colab import files, drive
import scipy.stats

In [3]:
# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Load the schedule and stats dataframes
df_schedule = pd.read_csv('/content/drive/MyDrive/CFB_Model/PreProcessed Data/schedule_2005_2024.csv')
df_all_stats = pd.read_csv('/content/drive/MyDrive/CFB_Model/PreProcessed Data/all_stats_2005_2024.csv')
df_2024_team_info = pd.read_csv('/content/drive/MyDrive/CFB_Model/Raw Data/2024_team_info.csv')

In [5]:
# prompt: check for duplicate matchups in df_schedule

# Check for duplicate matchups in df_schedule
duplicate_matchups = df_schedule[df_schedule.duplicated(subset=['Home Team','Away Team', 'Week', 'Year'], keep=False)]
print(duplicate_matchups)


      Year  Week  Neutral Site  Conference Game Home Team Home Conference  \
2232  2008     1         False            False       LSU             SEC   
2233  2008     1         False            False       LSU             SEC   

      Home Points          Away Team Away Conference  Away Points  
2232         41.0  Appalachian State        Southern         13.0  
2233         41.0  Appalachian State        Southern         13.0  


In [6]:
# Assuming df_schedule and df_all_stats are already loaded

# Filter df_all_stats for home team stats
home_stats = df_all_stats.rename(columns={'Team': 'Home Team'})
home_stats['Team Type'] = 'Home'

# Filter df_all_stats for away team stats
away_stats = df_all_stats.rename(columns={'Team': 'Away Team'})
away_stats['Team Type'] = 'Away'

# Merge df_schedule with home_stats and away_stats
df_schedule_with_home_stats = df_schedule.merge(home_stats, left_on=['Home Team', 'Year'], right_on=['Home Team', 'Year'], how='left')
df_schedule_with_full_stats = df_schedule_with_home_stats.merge(away_stats, left_on=['Away Team', 'Year'], right_on=['Away Team', 'Year'], suffixes=('_home', '_away'), how='left')

# Drop unnecessary columns
df_schedule_with_full_stats = df_schedule_with_full_stats.drop(columns=['Team Type_home', 'Team Type_away', 'Conference_home', 'Conference_away'])

# Remove duplicates, keeping only unique games
df_schedule_with_full_stats = df_schedule_with_full_stats.drop_duplicates(subset=['Home Team', 'Away Team', 'Year'], keep='first')

print(df_schedule_with_full_stats.head())
print(df_schedule_with_full_stats.tail())

   Year  Week  Neutral Site  Conference Game    Home Team Home Conference  \
0  2005     1         False            False     Marshall  Conference USA   
1  2005     1         False            False       Toledo    Mid-American   
2  2005     1         False            False      Houston  Conference USA   
3  2005     1         False            False   Cincinnati        Big East   
4  2005     1         False            False  Wake Forest             ACC   

   Home Points         Away Team Away Conference  Away Points  ...  \
0         36.0    William & Mary     Atlantic 10         24.0  ...   
1         62.0  Western Illinois            MVFC         14.0  ...   
2         24.0            Oregon          Pac-10         38.0  ...   
3         28.0  Eastern Michigan    Mid-American         26.0  ...   
4         20.0        Vanderbilt             SEC         24.0  ...   

   Defense Havoc Total_away  Defense Havoc FrontSeven_away  \
0                       NaN                           

In [7]:
# Separate features and target variables
# Identify columns to check for missing values (excluding 'Home Points' and 'Away Points')
columns_to_check = [col for col in df_schedule_with_full_stats.columns if col not in ['Home Points', 'Away Points']]

# Drop rows with missing values in any of the identified columns
df_schedule_with_full_stats = df_schedule_with_full_stats.dropna(subset=columns_to_check)

# Create training dataframe excluding 2024
df_train = df_schedule_with_full_stats[df_schedule_with_full_stats['Year'] != 2024]

# Create testing dataframe for 2024
df_test = df_schedule_with_full_stats[df_schedule_with_full_stats['Year'] == 2024]
print(df_test)

       Year  Week  Neutral Site  Conference Game           Home Team  \
16517  2024     0          True             True        Georgia Tech   
16519  2024     0         False            False              Nevada   
16533  2024     1         False            False  Jacksonville State   
16536  2024     1         False            False           Minnesota   
16544  2024     1         False            False      Michigan State   
...     ...   ...           ...              ...                 ...   
17425  2024    14         False             True      Arkansas State   
17426  2024    14         False            False    Washington State   
17427  2024    14         False             True                UNLV   
17428  2024    14         False             True             Hawai'i   
17429  2024    16          True             True                Army   

         Home Conference  Home Points         Away Team    Away Conference  \
16517                ACC          NaN     Florida State  

In [8]:
# Load the models and scaler
gpr_home = joblib.load('/content/drive/MyDrive/CFB_Model/Model Training/gpr_home_model.pkl')
gpr_away = joblib.load('/content/drive/MyDrive/CFB_Model/Model Training/gpr_away_model.pkl')
scaler = joblib.load('/content/drive/MyDrive/CFB_Model/Model Training/scaler.pkl')

In [9]:
import pandas as pd
import numpy as np
import warnings
import scipy.stats

def predict_regular_season_outcomes(year, df, home_model, away_model, scaler, df_2024_team_info):
    results = []
    placeholders = []

    # Filter schedule for the specified year
    df_year_schedule = df[df['Year'] == year]

    # Track number of games per team
    team_game_count = {}

    for idx, game in df_year_schedule.iterrows():
        # Home and away teams
        home_team = game['Home Team']
        away_team = game['Away Team']

        # Initialize game count for each team if not present
        if home_team not in team_game_count:
            team_game_count[home_team] = 0
        if away_team not in team_game_count:
            team_game_count[away_team] = 0

        # Increase the game count for each team
        team_game_count[home_team] += 1
        team_game_count[away_team] += 1

        # Drop unnecessary columns and reshape the input for the model
        X_game = game.drop(labels=['Year', 'Home Points', 'Away Points', 'Week', 'Home Team', 'Home Conference', 'Away Team', 'Away Conference']).values.reshape(1, -1)

        # Scale the input features
        X_game_scaled = scaler.transform(X_game)

        # Suppress warnings during prediction
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            # Make predictions
            home_mean_array, home_std_array = home_model.predict(X_game_scaled, return_std=True)
            away_mean_array, away_std_array = away_model.predict(X_game_scaled, return_std=True)

            home_mean = home_mean_array[0]
            home_std = home_std_array[0]
            away_mean = away_mean_array[0]
            away_std = away_std_array[0]

        # Calculate spread, win probabilities, and total points
        spread_mean = home_mean - away_mean
        spread_std = np.sqrt(home_std**2 + away_std**2)
        home_win_percentage = 1 - scipy.stats.norm.cdf(0, loc=spread_mean, scale=spread_std)
        away_win_percentage = scipy.stats.norm.cdf(0, loc=spread_mean, scale=spread_std)
        total_points_mean = home_mean + away_mean
        total_points_std = np.sqrt(home_std**2 + away_std**2)

        # Store the results
        results.append({
            'Year': game['Year'],
            'Home Team': home_team,
            'Home Conference': game['Home Conference'],
            'Away Team': away_team,
            'Away Conference': game['Away Conference'],
            'Week': game['Week'],
            'Home Mean Score': home_mean,
            'Home Score Std Dev': home_std,
            'Away Mean Score': away_mean,
            'Away Score Std Dev': away_std,
            'Home Spread Mean': spread_mean,
            'Home Spread Std Dev': spread_std,
            'Home Win Percentage': home_win_percentage,
            'Away Win Percentage': away_win_percentage,
            'Total Points Mean': total_points_mean,
            'Total Points Std Dev': total_points_std
        })

    # Add placeholder games to ensure each team has 12 games
    for team, games_played in team_game_count.items():
        while games_played < 12:
            # Placeholder stats for a generic FCS team
            placeholder_stats = {
                'home_mean': 45,
                'home_std': 1.5,
                'away_mean': 14,
                'away_std': 1.5,
                'team': 'FCS Placeholder'
            }

            # Add a home placeholder game
            placeholders.append({
                'Year': year,
                'Home Team': team,
                'Home Conference': 'N/A',
                'Away Team': placeholder_stats['team'],
                'Away Conference': 'N/A',
                'Week': 15,  # Placeholder week
                'Home Mean Score': placeholder_stats['home_mean'],
                'Home Score Std Dev': placeholder_stats['home_std'],
                'Away Mean Score': placeholder_stats['away_mean'],
                'Away Score Std Dev': placeholder_stats['away_std'],
                'Home Spread Mean': placeholder_stats['home_mean'] - placeholder_stats['away_mean'],
                'Home Spread Std Dev': np.sqrt(placeholder_stats['home_std']**2 + placeholder_stats['away_std']**2),
                'Home Win Percentage': scipy.stats.norm.cdf((placeholder_stats['home_mean'] - placeholder_stats['away_mean']) / np.sqrt(placeholder_stats['home_std']**2 + placeholder_stats['away_std']**2)),
                'Away Win Percentage': 1 - scipy.stats.norm.cdf((placeholder_stats['home_mean'] - placeholder_stats['away_mean']) / np.sqrt(placeholder_stats['home_std']**2 + placeholder_stats['away_std']**2)),
                'Total Points Mean': placeholder_stats['home_mean'] + placeholder_stats['away_mean'],
                'Total Points Std Dev': np.sqrt(placeholder_stats['home_std']**2 + placeholder_stats['away_std']**2)
            })

            games_played += 1

    # Convert lists to DataFrame
    df_results = pd.DataFrame(results)
    df_fcs_placeholders = pd.DataFrame(placeholders)

    # Combine regular games and placeholders
    df_final_results = pd.concat([df_results, df_fcs_placeholders], ignore_index=True)

    return df_final_results


In [10]:
# Define the year for which you want to predict the outcomes
year = 2024

# Call the function to predict the outcomes for the 2024 season
df_results_2024 = predict_regular_season_outcomes(
    year=year,
    df=df_test,
    home_model=gpr_home,
    away_model=gpr_away,
    scaler=scaler,
    df_2024_team_info=df_2024_team_info
)

# Save the predictions to a CSV file
df_results_2024.to_csv('/content/drive/MyDrive/CFB_Model/2024_season_predictions.csv', index=False)

# Optionally display the first few rows of the results
print(df_results_2024.head())



   Year           Home Team Home Conference         Away Team  \
0  2024        Georgia Tech             ACC     Florida State   
1  2024              Nevada   Mountain West               SMU   
2  2024  Jacksonville State  Conference USA  Coastal Carolina   
3  2024           Minnesota         Big Ten    North Carolina   
4  2024      Michigan State         Big Ten  Florida Atlantic   

     Away Conference  Week  Home Mean Score  Home Score Std Dev  \
0                ACC     0        24.176730            3.192362   
1                ACC     0        15.859303            3.188923   
2           Sun Belt     1        25.724221            3.206657   
3                ACC     1        28.466843            3.175149   
4  American Athletic     1        24.607167            3.185717   

   Away Mean Score  Away Score Std Dev  Home Spread Mean  Home Spread Std Dev  \
0        36.588533            3.192362        -12.411803             4.514681   
1        40.294036            3.188923      

In [11]:
# prompt: add a column called Home Team Win that is a boolean for if the home team won the game. and a a column called Conference Game that is true if the Home Conference matches the Away Conference

# Calculate 'Home Team Win' column
df_results_2024['Home Team Win'] = df_results_2024['Home Win Percentage'] > 0.5

# Calculate 'Conference Game' column
df_results_2024['Conference Game'] = df_results_2024['Home Conference'] == df_results_2024['Away Conference']

# Display the updated DataFrame
print(df_results_2024.head())


   Year           Home Team Home Conference         Away Team  \
0  2024        Georgia Tech             ACC     Florida State   
1  2024              Nevada   Mountain West               SMU   
2  2024  Jacksonville State  Conference USA  Coastal Carolina   
3  2024           Minnesota         Big Ten    North Carolina   
4  2024      Michigan State         Big Ten  Florida Atlantic   

     Away Conference  Week  Home Mean Score  Home Score Std Dev  \
0                ACC     0        24.176730            3.192362   
1                ACC     0        15.859303            3.188923   
2           Sun Belt     1        25.724221            3.206657   
3                ACC     1        28.466843            3.175149   
4  American Athletic     1        24.607167            3.185717   

   Away Mean Score  Away Score Std Dev  Home Spread Mean  Home Spread Std Dev  \
0        36.588533            3.192362        -12.411803             4.514681   
1        40.294036            3.188923      

In [12]:
# Initialize a dictionary to track team stats
team_stats = {}

# Iterate through each game and update team stats
for _, game in df_results_2024.iterrows():
    home_team = game['Home Team']
    away_team = game['Away Team']
    is_conference_game = game['Conference Game']

    home_win_prob = game['Home Win Percentage']
    away_win_prob = game['Away Win Percentage']

    # Initialize stats for the home team if not already present
    if home_team not in team_stats:
        team_stats[home_team] = {
            'Games Played': 0, 'Wins': 0, 'Losses': 0, 'Conference Wins': 0, 'Conference Losses': 0,
            'Total Points Scored': 0, 'Total Points Allowed': 0, 'Total Wins Prob': 0.0, 'Conference Wins Prob': 0.0
        }

    # Initialize stats for the away team if not already present
    if away_team not in team_stats:
        team_stats[away_team] = {
            'Games Played': 0, 'Wins': 0, 'Losses': 0, 'Conference Wins': 0, 'Conference Losses': 0,
            'Total Points Scored': 0, 'Total Points Allowed': 0, 'Total Wins Prob': 0.0, 'Conference Wins Prob': 0.0
        }

    # Update stats for the home team
    team_stats[home_team]['Games Played'] += 1
    team_stats[home_team]['Total Points Scored'] += game['Home Mean Score']
    team_stats[home_team]['Total Points Allowed'] += game['Away Mean Score']
    team_stats[home_team]['Total Wins Prob'] += home_win_prob
    if is_conference_game:
        team_stats[home_team]['Conference Wins Prob'] += home_win_prob

    # Update stats for the away team
    team_stats[away_team]['Games Played'] += 1
    team_stats[away_team]['Total Points Scored'] += game['Away Mean Score']
    team_stats[away_team]['Total Points Allowed'] += game['Home Mean Score']
    team_stats[away_team]['Total Wins Prob'] += away_win_prob
    if is_conference_game:
        team_stats[away_team]['Conference Wins Prob'] += away_win_prob

    # Determine and update actual wins/losses
    if game['Home Mean Score'] > game['Away Mean Score']:
        team_stats[home_team]['Wins'] += 1
        team_stats[away_team]['Losses'] += 1
        if is_conference_game:
            team_stats[home_team]['Conference Wins'] += 1
            team_stats[away_team]['Conference Losses'] += 1
    else:
        team_stats[away_team]['Wins'] += 1
        team_stats[home_team]['Losses'] += 1
        if is_conference_game:
            team_stats[away_team]['Conference Wins'] += 1
            team_stats[home_team]['Conference Losses'] += 1

# Convert the dictionary to a DataFrame for easier analysis
team_records = pd.DataFrame.from_dict(team_stats, orient='index').reset_index()
team_records.rename(columns={'index': 'Team'}, inplace=True)

# Add additional statistics like average points, point differential, etc.
team_records['Average Points Scored'] = team_records['Total Points Scored'] / team_records['Games Played']
team_records['Average Points Allowed'] = team_records['Total Points Allowed'] / team_records['Games Played']
team_records['Point Differential'] = team_records['Average Points Scored'] - team_records['Average Points Allowed']

# Display the results
print(team_records)

                   Team  Games Played  Wins  Losses  Conference Wins  \
0          Georgia Tech            12     2      10                1   
1         Florida State            12    11       1                9   
2                Nevada            12     0      12                0   
3                   SMU            12    10       2                7   
4    Jacksonville State            12     9       3                6   
..                  ...           ...   ...     ...              ...   
129      Louisiana Tech            12     4       8                4   
130              Purdue            12     4       8                4   
131          Ball State            12     2      10                2   
132           Louisiana            12     8       4                8   
133     FCS Placeholder           130     0     130                0   

     Conference Losses  Total Points Scored  Total Points Allowed  \
0                    8           335.889732            381.817322 

In [13]:
# Merge the 'Conference' and 'Division' columns from df_2024_team_info into team_records
team_records = pd.merge(team_records, df_2024_team_info[['Team', 'Conference', 'Division']], on='Team', how='left')

# Display the updated team_records DataFrame
print(team_records)

                   Team  Games Played  Wins  Losses  Conference Wins  \
0          Georgia Tech            12     2      10                1   
1         Florida State            12    11       1                9   
2                Nevada            12     0      12                0   
3                   SMU            12    10       2                7   
4    Jacksonville State            12     9       3                6   
..                  ...           ...   ...     ...              ...   
129      Louisiana Tech            12     4       8                4   
130              Purdue            12     4       8                4   
131          Ball State            12     2      10                2   
132           Louisiana            12     8       4                8   
133     FCS Placeholder           130     0     130                0   

     Conference Losses  Total Points Scored  Total Points Allowed  \
0                    8           335.889732            381.817322 

In [14]:
# Function to calculate expected wins for each game based on a top-25 team
def calculate_expected_wins(df_results):
    # Expected wins for top-25 team assuming the top-25 team plays each game
    df_results['Expected_Wins_Home'] = df_results['Home Win Percentage']
    df_results['Expected_Wins_Away'] = df_results['Away Win Percentage']

    return df_results

# Apply the expected wins calculation
df_results_2024 = calculate_expected_wins(df_results_2024)

# Function to calculate SOR for a given team
def calculate_sor_for_team(team_name, df_results):
    # Filter the games where the team is playing either as home or away
    team_games_home = df_results[df_results['Home Team'] == team_name]
    team_games_away = df_results[df_results['Away Team'] == team_name]

    # Sum the expected wins from all games the team is involved in
    total_expected_wins = team_games_home['Expected_Wins_Home'].sum() + team_games_away['Expected_Wins_Away'].sum()

    # Calculate actual wins (count of games where the team won)
    actual_wins = team_games_home['Home Team Win'].sum() + team_games_away['Home Team Win'].apply(lambda x: not x).sum()

    # Calculate SOR: expected wins for a top-25 team - actual wins by the team
    sor = total_expected_wins - actual_wins

    return sor

# Example usage: Calculate SOR for all teams in the dataset
teams = pd.concat([df_results_2024['Home Team'], df_results_2024['Away Team']]).unique()

sor_results = []

for team in teams:
    sor_value = calculate_sor_for_team(team, df_results_2024)
    sor_results.append({'Team': team, 'SOR': sor_value})

# Create a DataFrame to store the SOR results
df_sor = pd.DataFrame(sor_results)

# Display SOR results
print(df_sor)

# Perform an inner join to merge the Strength of Record (SOR) into the team_records dataset
team_records = team_records.merge(df_sor, on='Team', how='left')

# Check the merged dataset
print(team_records.head())



                   Team       SOR
0          Georgia Tech  1.329085
1                Nevada  0.641440
2    Jacksonville State -0.503698
3             Minnesota -0.446000
4        Michigan State  0.895231
..                  ...       ...
129               Akron  0.831384
130      San Jose State -0.472020
131    Western Michigan -0.217687
132      UT San Antonio -0.547503
133     FCS Placeholder  0.000000

[134 rows x 2 columns]
                 Team  Games Played  Wins  Losses  Conference Wins  \
0        Georgia Tech            12     2      10                1   
1       Florida State            12    11       1                9   
2              Nevada            12     0      12                0   
3                 SMU            12    10       2                7   
4  Jacksonville State            12     9       3                6   

   Conference Losses  Total Points Scored  Total Points Allowed  \
0                  8           335.889732            381.817322   
1            

In [15]:
def rank_teams_in_conference(team_records, df_results_2024):
    # Filter only conference games in df_results_2024
    df_conference_games = df_results_2024[df_results_2024['Conference Game'] == True]

    # Initialize a list to store rankings
    rankings = []

    # Group by conference
    conferences = team_records['Conference'].unique()

    for conference in conferences:
        # Filter for teams in the current conference
        df_conference_teams = team_records[team_records['Conference'] == conference]

        # Sort teams by 'Conference Wins Prob' initially (descending)
        df_conference_teams = df_conference_teams.sort_values(by='Conference Wins Prob', ascending=False).reset_index(drop=True)

        # Head-to-head tie-breaker logic
        for i in range(len(df_conference_teams) - 1):
            # Check if the teams are tied in conference wins probability
            if df_conference_teams.loc[i, 'Conference Wins Prob'] == df_conference_teams.loc[i+1, 'Conference Wins Prob']:
                team_1 = df_conference_teams.loc[i, 'Team']
                team_2 = df_conference_teams.loc[i+1, 'Team']

                # Check if they played a head-to-head game in df_results_2024
                head_to_head_game = df_conference_games[((df_conference_games['Home Team'] == team_1) & (df_conference_games['Away Team'] == team_2)) |
                                                        ((df_conference_games['Home Team'] == team_2) & (df_conference_games['Away Team'] == team_1))]

                if not head_to_head_game.empty:
                    # Determine the winner of the head-to-head game
                    if head_to_head_game.iloc[0]['Home Team'] == team_1 and head_to_head_game.iloc[0]['Home Team Win']:
                        winner = team_1
                    elif head_to_head_game.iloc[0]['Away Team'] == team_1 and not head_to_head_game.iloc[0]['Home Team Win']:
                        winner = team_1
                    else:
                        winner = team_2

                    # Adjust ranking based on head-to-head winner
                    if winner == team_2:
                        # Swap the positions of the two teams
                        df_conference_teams.iloc[i], df_conference_teams.iloc[i+1] = df_conference_teams.iloc[i+1].copy(), df_conference_teams.iloc[i].copy()

        # Sort again by SOR for final tie-breaker
        df_conference_teams = df_conference_teams.sort_values(by=['Conference Wins Prob', 'SOR'], ascending=[False, False]).reset_index(drop=True)

        # Add a 'Rank' column
        df_conference_teams['Rank'] = range(1, len(df_conference_teams) + 1)
        df_conference_teams['Conference'] = conference

        # Append to the final rankings
        rankings.append(df_conference_teams)

    # Concatenate all conference rankings into a single DataFrame
    df_final_rankings = pd.concat(rankings, ignore_index=True)

    return df_final_rankings


In [16]:
df_final_rankings = rank_teams_in_conference(team_records, df_results_2024)
print(df_final_rankings)

                 Team  Games Played  Wins  Losses  Conference Wins  \
0       Florida State            12    11       1                9   
1             Clemson            12    10       2                8   
2          Louisville            12    10       2                8   
3            NC State            12    10       2                8   
4                 SMU            12    10       2                7   
..                ...           ...   ...     ...              ...   
128             UMass            12     2      10                2   
129       Connecticut            12     4       8                2   
130        Notre Dame            12    11       1                0   
131      Oregon State            12    10       2                2   
132  Washington State            12     8       4                1   

     Conference Losses  Total Points Scored  Total Points Allowed  \
0                    0           436.439176            269.665395   
1                    

In [17]:
def set_up_conference_championships(df_final_rankings):
    # Conferences with championship games
    conferences_with_championships = ['ACC', 'Mountain West', 'Conference USA', 'Sun Belt',
                                      'Big Ten', 'American Athletic', 'SEC', 'Mid-American', 'Big 12']

    # Initialize a list to store championship games
    championship_games = []

    # Loop through the conferences
    for conference in conferences_with_championships:
        # Filter teams by conference
        conference_teams = df_final_rankings[df_final_rankings['Conference'] == conference]

        # Special case for Sun Belt (handle East and West divisions)
        if conference == 'Sun Belt':
            # Assume we have a 'Division' column to split the teams
            east_teams = conference_teams[conference_teams['Division'] == 'East']
            west_teams = conference_teams[conference_teams['Division'] == 'West']

            # Get the highest-ranked team from each division
            top_east_team = east_teams.sort_values(by='Rank').iloc[0]
            top_west_team = west_teams.sort_values(by='Rank').iloc[0]

            # Determine home and away team based on overall ranking
            if top_east_team['Rank'] < top_west_team['Rank']:
                home_team = top_east_team
                away_team = top_west_team
            else:
                home_team = top_west_team
                away_team = top_east_team

            # Append the championship game details to the list
            championship_games.append({
                'Conference': conference,
                'Home Team': home_team['Team'],
                'Away Team': away_team['Team'],
            })

        else:
            # Get the top two ranked teams in the conference
            top_two_teams = conference_teams.sort_values(by='Rank').head(2)

            # Home team is the highest-ranked team, away team is the second-highest-ranked team
            home_team = top_two_teams.iloc[0]
            away_team = top_two_teams.iloc[1]

            # Append the championship game details to the list
            championship_games.append({
                'Conference': conference,
                'Home Team': home_team['Team'],
                'Away Team': away_team['Team'],
            })

    # Convert the championship games to a DataFrame
    df_championship_games = pd.DataFrame(championship_games)

    return df_championship_games

In [18]:
df_championship_games = set_up_conference_championships(df_final_rankings)
print(df_championship_games)

          Conference       Home Team         Away Team
0                ACC   Florida State           Clemson
1      Mountain West     Boise State      Fresno State
2     Conference USA         Liberty  Western Kentucky
3           Sun Belt            Troy     James Madison
4            Big Ten          Oregon        Penn State
5  American Athletic  UT San Antonio            Tulane
6                SEC         Alabama           Georgia
7       Mid-American          Toledo              Ohio
8             Big 12    Kansas State              Utah


In [19]:
def set_up_championship_game_stats(df_championship_games, df_all_stats):
    # Rename columns in df_all_stats for merging with home and away teams
    home_stats = df_all_stats.rename(columns={'Team': 'Home Team'})
    away_stats = df_all_stats.rename(columns={'Team': 'Away Team'})

    # Merge df_championship_games with home_stats and away_stats
    df_championship_with_home_stats = df_championship_games.merge(
        home_stats, left_on=['Home Team'], right_on=['Home Team'], how='left'
    )
    df_championship_with_full_stats = df_championship_with_home_stats.merge(
        away_stats, left_on=['Away Team'], right_on=['Away Team'], suffixes=('_home', '_away'), how='left'
    )

    # Remove duplicates
    df_championship_with_full_stats = df_championship_with_full_stats.drop_duplicates(subset=['Home Team', 'Away Team'])

    return df_championship_with_full_stats

# Call the function
df_championship_with_full_stats = set_up_championship_game_stats(df_championship_games, df_all_stats)

# Check the results
print(df_championship_with_full_stats.head())

        Conference_x      Home Team         Away Team  Year_home  \
0                ACC  Florida State           Clemson       2024   
462    Mountain West    Boise State      Fresno State       2024   
924   Conference USA        Liberty  Western Kentucky       2024   
1068        Sun Belt           Troy     James Madison       2024   
1160         Big Ten         Oregon        Penn State       2024   

      Coach Win Percentage_home  Coach Tenure_home  Rank_home  Points_home  \
0                      0.657624                  5         12       271.76   
462                    0.750000                  2         64       181.19   
924                    0.684676                  2        105       142.69   
1068                   0.000000                  2         89       159.41   
1160                   0.813187                  3          3       293.22   

      Composite Rating_home  Average Rank_home  ... Defense Havoc Total_away  \
0                    953.34              1

In [20]:
# Step 1: Compare column names in both dataframes
schedule_columns = df_schedule_with_full_stats.columns.tolist()
championship_columns = df_championship_with_full_stats.columns.tolist()

# Check if columns are identical
if schedule_columns == championship_columns:
    print("Columns are identical and in the same order.")
else:
    print("Columns are not identical or not in the same order.")

    # Step 2: Find differences
    missing_in_championship = [col for col in schedule_columns if col not in championship_columns]
    missing_in_schedule = [col for col in championship_columns if col not in schedule_columns]

    print(f"Missing in championship dataset: {missing_in_championship}")
    print(f"Missing in schedule dataset: {missing_in_schedule}")

    # Step 3: Reorder championship columns to match schedule columns (ignoring missing columns for now)
    common_columns = [col for col in schedule_columns if col in championship_columns]
    df_championship_with_full_stats = df_championship_with_full_stats[common_columns]

    # Step 4: Add missing columns to df_championship_with_full_stats with default values
    for col in missing_in_championship:
        df_championship_with_full_stats[col] = None  # or appropriate default value

    # Step 5: Recheck and reorder columns
    df_championship_with_full_stats = df_championship_with_full_stats[schedule_columns]

# Final check of column alignment
print("Final column alignment check:")
print(df_schedule_with_full_stats.columns)
print(df_championship_with_full_stats.columns)

Columns are not identical or not in the same order.
Missing in championship dataset: ['Year', 'Week', 'Neutral Site', 'Conference Game', 'Home Conference', 'Home Points', 'Away Conference', 'Away Points']
Missing in schedule dataset: ['Conference_x', 'Year_home', 'Conference_y', 'Year_away', 'Conference']
Final column alignment check:
Index(['Year', 'Week', 'Neutral Site', 'Conference Game', 'Home Team',
       'Home Conference', 'Home Points', 'Away Team', 'Away Conference',
       'Away Points', 'Coach Win Percentage_home', 'Coach Tenure_home',
       'Rank_home', 'Points_home', 'Composite Rating_home',
       'Average Rank_home', 'Rating_home', 'Ranking_home',
       'SecondOrderWins_home', 'Sos_home', 'Offense Ranking_home',
       'Offense Rating_home', 'Offense Success_home',
       'Offense Explosiveness_home', 'Offense Rushing_home',
       'Offense Passing_home', 'Offense StandardDowns_home',
       'Offense PassingDowns_home', 'Offense RunRate_home',
       'Offense Pace_home

In [21]:
# Step 1: Remove/rename unnecessary columns in df_championship_with_stats
df_championship_with_full_stats = df_championship_with_full_stats.drop(columns=['Conference_x', 'Year_home', 'Conference_y', 'Year_away', 'Conference'], errors='ignore')

# Step 2: Add missing columns to df_championship_with_full_stats with default values
missing_columns = ['Year', 'Week', 'Neutral Site', 'Conference Game', 'Home Conference', 'Home Points', 'Away Conference', 'Away Points']
for col in missing_columns:
    if col not in df_championship_with_full_stats.columns:
        df_championship_with_full_stats[col] = None  # Default value; adjust based on your data

# Step 3: Reorder columns in df_championship_with_stats to match df_schedule_with_full_stats
df_championship_with_full_stats = df_championship_with_full_stats[df_schedule_with_full_stats.columns]

# Step 4: Final check to ensure columns are now aligned
print("Final column alignment check:")
print(df_schedule_with_full_stats.columns)
print(df_championship_with_full_stats.columns)


Final column alignment check:
Index(['Year', 'Week', 'Neutral Site', 'Conference Game', 'Home Team',
       'Home Conference', 'Home Points', 'Away Team', 'Away Conference',
       'Away Points', 'Coach Win Percentage_home', 'Coach Tenure_home',
       'Rank_home', 'Points_home', 'Composite Rating_home',
       'Average Rank_home', 'Rating_home', 'Ranking_home',
       'SecondOrderWins_home', 'Sos_home', 'Offense Ranking_home',
       'Offense Rating_home', 'Offense Success_home',
       'Offense Explosiveness_home', 'Offense Rushing_home',
       'Offense Passing_home', 'Offense StandardDowns_home',
       'Offense PassingDowns_home', 'Offense RunRate_home',
       'Offense Pace_home', 'Defense Ranking_home', 'Defense Rating_home',
       'Defense Success_home', 'Defense Explosiveness_home',
       'Defense Rushing_home', 'Defense Passing_home',
       'Defense StandardDowns_home', 'Defense PassingDowns_home',
       'Defense Havoc Total_home', 'Defense Havoc FrontSeven_home',
       

In [22]:
# prompt: for each entry in df_championship_with_full_stats set the Conference Game to true and the Neutral Site to true

# Set 'Conference Game' and 'Neutral Site' to True for championship games
df_championship_with_full_stats['Conference Game'] = True
df_championship_with_full_stats['Neutral Site'] = True


In [23]:
def predict_championship_outcomes(df_championship, home_model, away_model, scaler):
    results = []

    for idx, game in df_championship.iterrows():
        # Home and away teams
        home_team = game['Home Team']
        away_team = game['Away Team']

        # Drop unnecessary columns and reshape the input for the model
        X_game = game.drop(labels=['Year', 'Home Points', 'Away Points', 'Week', 'Home Team', 'Home Conference', 'Away Team', 'Away Conference']).values.reshape(1, -1)

        # Scale the input features
        X_game_scaled = scaler.transform(X_game)

        # Suppress warnings during prediction
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            # Make predictions
            home_mean_array, home_std_array = home_model.predict(X_game_scaled, return_std=True)
            away_mean_array, away_std_array = away_model.predict(X_game_scaled, return_std=True)

            home_mean = home_mean_array[0]
            home_std = home_std_array[0]
            away_mean = away_mean_array[0]
            away_std = away_std_array[0]

        # Calculate spread, win probabilities, and total points
        spread_mean = home_mean - away_mean
        spread_std = np.sqrt(home_std**2 + away_std**2)
        home_win_percentage = 1 - scipy.stats.norm.cdf(0, loc=spread_mean, scale=spread_std)
        away_win_percentage = scipy.stats.norm.cdf(0, loc=spread_mean, scale=spread_std)
        total_points_mean = home_mean + away_mean
        total_points_std = np.sqrt(home_std**2 + away_std**2)

        # Store the results
        results.append({
            'Year': game['Year'],
            'Home Team': home_team,
            'Home Conference': game['Home Conference'],
            'Away Team': away_team,
            'Away Conference': game['Away Conference'],
            'Week': game['Week'],
            'Home Mean Score': home_mean,
            'Home Score Std Dev': home_std,
            'Away Mean Score': away_mean,
            'Away Score Std Dev': away_std,
            'Home Spread Mean': spread_mean,
            'Home Spread Std Dev': spread_std,
            'Home Win Percentage': home_win_percentage,
            'Away Win Percentage': away_win_percentage,
            'Total Points Mean': total_points_mean,
            'Total Points Std Dev': total_points_std
        })

    # Convert results to DataFrame
    df_results = pd.DataFrame(results)

    return df_results

# Example usage for predicting championship games
df_championship_results = predict_championship_outcomes(
    df_championship=df_championship_with_full_stats,
    home_model=gpr_home,
    away_model=gpr_away,
    scaler=scaler
)

# Save the predictions to a CSV file
df_championship_results.to_csv('/content/drive/MyDrive/CFB_Model/championship_predictions.csv', index=False)

# Optionally display the first few rows of the results
print(df_championship_results)



   Year       Home Team Home Conference         Away Team Away Conference  \
0  None   Florida State            None           Clemson            None   
1  None     Boise State            None      Fresno State            None   
2  None         Liberty            None  Western Kentucky            None   
3  None            Troy            None     James Madison            None   
4  None          Oregon            None        Penn State            None   
5  None  UT San Antonio            None            Tulane            None   
6  None         Alabama            None           Georgia            None   
7  None          Toledo            None              Ohio            None   
8  None    Kansas State            None              Utah            None   

   Week  Home Mean Score  Home Score Std Dev  Away Mean Score  \
0  None        29.919558            3.191094        27.069551   
1  None        37.280178            3.190673        28.102112   
2  None        30.742299          

In [24]:
def update_final_rankings_with_championship(df_final_rankings, df_championship_results):
    # Add new columns for Conference Champion and Conference Runner Up
    df_final_rankings['Conference Champion'] = None
    df_final_rankings['Conference Runner Up'] = None

    # Update total wins, losses, win probabilities, and loss probabilities based on championship results
    for idx, game in df_championship_results.iterrows():
        home_team = game['Home Team']
        away_team = game['Away Team']
        home_win_prob = game['Home Win Percentage']
        away_win_prob = game['Away Win Percentage']

        # Determine winner and runner up based on win probabilities
        if home_win_prob >= away_win_prob:
            champion = home_team
            runner_up = away_team
        else:
            champion = away_team
            runner_up = home_team

        # Update Conference Champion and Runner Up in df_final_rankings
        df_final_rankings.loc[df_final_rankings['Team'] == champion, 'Conference Champion'] = True
        df_final_rankings.loc[df_final_rankings['Team'] == runner_up, 'Conference Runner Up'] = True

        # Update total wins, losses, and probabilities for both teams
        df_final_rankings.loc[df_final_rankings['Team'] == champion, 'Wins'] += 1
        df_final_rankings.loc[df_final_rankings['Team'] == runner_up, 'Losses'] += 1

        df_final_rankings.loc[df_final_rankings['Team'] == champion, 'Games Played'] += 1
        df_final_rankings.loc[df_final_rankings['Team'] == runner_up, 'Games Played'] += 1

        # Update total wins, losses, and probabilities for both teams
        df_final_rankings.loc[df_final_rankings['Team'] == home_team, 'Total Wins Prob'] += home_win_prob

        df_final_rankings.loc[df_final_rankings['Team'] == away_team, 'Total Wins Prob'] += away_win_prob

    # Ensure that the columns 'Conference Champion' and 'Conference Runner Up' are properly filled
    df_final_rankings['Conference Champion'] = df_final_rankings['Conference Champion'].fillna(False)
    df_final_rankings['Conference Runner Up'] = df_final_rankings['Conference Runner Up'].fillna(False)

    return df_final_rankings

# Example usage to update final rankings
df_final_rankings_updated = update_final_rankings_with_championship(df_final_rankings, df_championship_results)

# Save the updated rankings to a CSV file
df_final_rankings_updated.to_csv('/content/drive/MyDrive/CFB_Model/final_rankings_with_championship.csv', index=False)

# Optionally display the first few rows of the updated rankings
print(df_final_rankings_updated.head())


            Team  Games Played  Wins  Losses  Conference Wins  \
0  Florida State            13    12       1                9   
1        Clemson            13    10       3                8   
2     Louisville            12    10       2                8   
3       NC State            12    10       2                8   
4            SMU            12    10       2                7   

   Conference Losses  Total Points Scored  Total Points Allowed  \
0                  0           436.439176            269.665395   
1                  1           399.954786            240.985088   
2                  1           371.190268            259.949646   
3                  1           371.905734            250.007076   
4                  2           403.858222            278.213163   

   Total Wins Prob  Conference Wins Prob  Average Points Scored  \
0        11.579019              8.604658              36.369931   
1        10.487369              8.209364              33.329565   
2    

In [25]:
def calculate_sos(df_results, df_team_info):
    team_sos = {}

    for team in df_team_info['Team']:
        # Get all games involving this team (both home and away)
        team_games = df_results[(df_results['Home Team'] == team) | (df_results['Away Team'] == team)]

        opponent_win_percs = []

        for idx, game in team_games.iterrows():
            if game['Home Team'] == team:
                opponent = game['Away Team']
            else:
                opponent = game['Home Team']

            # Get opponent's win percentage from the final rankings
            opponent_data = df_team_info[df_team_info['Team'] == opponent]
            if not opponent_data.empty:
                opponent_wins = opponent_data['Wins'].values[0]
                opponent_losses = opponent_data['Losses'].values[0]
                if opponent_wins + opponent_losses > 0:
                    opponent_win_percentage = opponent_wins / (opponent_wins + opponent_losses)
                else:
                    opponent_win_percentage = 0  # To avoid division by zero if the opponent has no games
                opponent_win_percs.append(opponent_win_percentage)

        # Strength of schedule is the average win percentage of all opponents
        if opponent_win_percs:
            team_sos[team] = np.mean(opponent_win_percs)
        else:
            team_sos[team] = 0  # In case there are no games for a team (which shouldn't happen)

    # Convert to DataFrame and merge with team info
    df_sos = pd.DataFrame(list(team_sos.items()), columns=['Team', 'SOS'])
    df_team_info_with_sos = pd.merge(df_team_info, df_sos, on='Team')

    return df_team_info_with_sos


def calculate_adjusted_wins(df_team_info_with_sos):
    # Adjust wins based on strength of schedule (higher SOS means more credit for wins)
    df_team_info_with_sos['Adjusted Wins'] = df_team_info_with_sos['Wins'] * df_team_info_with_sos['SOS']
    df_team_info_with_sos['Adjusted Losses'] = df_team_info_with_sos['Losses'] / df_team_info_with_sos['SOS']

    return df_team_info_with_sos

def calculate_point_differential(df_results, df_team_info):
    point_differentials = {}

    for team in df_team_info['Team']:
        # Get all games involving this team
        team_games = df_results[(df_results['Home Team'] == team) | (df_results['Away Team'] == team)]
        total_point_diff = 0

        for idx, game in team_games.iterrows():
            if game['Home Team'] == team:
                point_diff = game['Home Mean Score'] - game['Away Mean Score']
            else:
                point_diff = game['Away Mean Score'] - game['Home Mean Score']

            total_point_diff += point_diff

        point_differentials[team] = total_point_diff

    # Convert to DataFrame and merge with team info
    df_point_diff = pd.DataFrame(list(point_differentials.items()), columns=['Team', 'Point Differential'])

    # Ensure the column names are correctly merged
    df_team_info_with_point_diff = pd.merge(df_team_info, df_point_diff, on='Team', how='left')

    # Rename column if necessary to avoid clashes
    df_team_info_with_point_diff = df_team_info_with_point_diff.rename(columns={'Point Differential_y': 'Point Differential'})

    return df_team_info_with_point_diff

def rank_teams(df_team_info_with_metrics):
    # Ensure 'Point Differential' exists
    if 'Point Differential' not in df_team_info_with_metrics.columns:
        raise KeyError("'Point Differential' column is missing in DataFrame")

    # Define a ranking formula combining adjusted wins, SOS, and point differential
    df_team_info_with_metrics['Rank Score'] = (
        df_team_info_with_metrics['Adjusted Wins'] * 0.6 +  # Weight adjusted wins the highest
        df_team_info_with_metrics['SOS'] * 0.3 +  # Give weight to strength of schedule
        df_team_info_with_metrics['Point Differential'] * 0.1  # Include point differential
    )

    # Sort teams by the final rank score
    df_team_info_with_metrics = df_team_info_with_metrics.sort_values(by='Rank Score', ascending=False).reset_index(drop=True)

    return df_team_info_with_metrics

# Calculate SOS
df_team_info_with_sos = calculate_sos(df_results_2024, df_final_rankings)

# Calculate Adjusted Wins
df_team_info_with_adjusted_wins = calculate_adjusted_wins(df_team_info_with_sos)

# Calculate Point Differential
df_team_info_with_metrics = calculate_point_differential(df_results_2024, df_team_info_with_adjusted_wins)

# Rank Teams
df_ranked_teams = rank_teams(df_team_info_with_metrics)

# Rename the existing 'Rank' column to 'Conference Rank'
df_ranked_teams = df_ranked_teams.rename(columns={'Rank': 'Conference Rank'})

# Add the 'FBS Rank' column, which ranks teams among all teams
df_ranked_teams['FBS Rank'] = df_ranked_teams['Rank Score'].rank(ascending=False, method='min').astype(int)

# Save the updated ranked teams to a CSV file
df_ranked_teams.to_csv('/content/drive/MyDrive/CFB_Model/final_team_rankings_with_fbs_rank.csv', index=False)

# Display the top teams
print(df_ranked_teams.head())




         Team  Games Played  Wins  Losses  Conference Wins  Conference Losses  \
0  Ohio State            12    11       1                8                  1   
1     Alabama            13    13       0                9                  0   
2     Georgia            13    11       2                8                  1   
3  Penn State            13    11       2                8                  1   
4      Oregon            13    12       1                9                  1   

   Total Points Scored  Total Points Allowed  Total Wins Prob  \
0           431.983324            171.707464        10.822712   
1           445.196996            222.543786        12.448572   
2           449.633035            236.061258        11.019181   
3           418.962769            207.619030        11.468557   
4           467.582915            273.812374        11.707229   

   Conference Wins Prob  ...       SOR  Conference Rank  Conference Champion  \
0              7.822712  ... -0.177288    

In [26]:
# Step 1: Select the Top 4 Conference Champions
conference_champions = df_ranked_teams[df_ranked_teams['Conference Champion'] == True]
top_conference_champions = conference_champions.sort_values(by='FBS Rank').head(4)

# Step 2: Select Seeds 5-12 from the remaining highest-ranked teams
remaining_teams = df_ranked_teams[~df_ranked_teams['Team'].isin(top_conference_champions['Team'])]
next_highest_teams = remaining_teams.sort_values(by='FBS Rank').head(8)

# Combine top conference champions and the next highest-ranked teams
cfp_bracket = pd.concat([top_conference_champions, next_highest_teams])

# Ensure at least 5 Conference Champions
# Check if the bracket already has at least 5 conference champions
current_champions_count = df_ranked_teams[df_ranked_teams['Team'].isin(cfp_bracket['Team'])]['Conference Champion'].sum()

if current_champions_count < 5:
    # Find additional conference champions to add to the bracket
    additional_champions_needed = 5 - current_champions_count
    additional_champions = conference_champions[
        ~conference_champions['Team'].isin(cfp_bracket['Team'])
    ].sort_values(by='FBS Rank').head(additional_champions_needed)

    # Find the team(s) to remove if necessary
    if additional_champions.shape[0] > 0:
        # Remove the team with the lowest FBS Rank from current bracket
        lowest_seed_team = cfp_bracket.sort_values(by='FBS Rank').tail(1)
        cfp_bracket = cfp_bracket[~cfp_bracket['Team'].isin(lowest_seed_team['Team'])]

        # Add additional champions to the final bracket
        cfp_bracket = pd.concat([cfp_bracket, additional_champions])

# Add seed numbers: Seeds 1-4 are for conference champions
cfp_bracket['Seed'] = list(range(1, min(5, len(cfp_bracket) + 1))) + list(range(5, len(cfp_bracket) + 1))

# Save the CFP bracket to a CSV file
cfp_bracket.to_csv('/content/drive/MyDrive/CFB_Model/cfp_bracket.csv', index=False)

# Display the CFP bracket
print(cfp_bracket[['Seed', 'Team', 'Conference Champion', 'FBS Rank']])

    Seed           Team  Conference Champion  FBS Rank
1      1        Alabama                 True         2
4      2         Oregon                 True         5
8      3  Florida State                 True         9
9      4        Liberty                 True        10
0      5     Ohio State                False         1
2      6        Georgia                False         3
3      7     Penn State                False         4
5      8     Notre Dame                False         6
6      9       Michigan                False         7
7     10          Texas                False         8
10    11   Kansas State                 True        11
11    12       Missouri                False        12


In [27]:
# Step 2: Update CFP Seed for teams that made the playoffs
for index, row in cfp_bracket.iterrows():
    team_name = row['Team']
    seed = row['Seed']
    df_ranked_teams.loc[df_ranked_teams['Team'] == team_name, 'CFP Seed'] = seed

# Save the updated DataFrame to a CSV file
df_ranked_teams.to_csv('/content/drive/MyDrive/CFB_Model/df_ranked_teams_with_cfp_seeds.csv', index=False)

# Display the updated DataFrame
print(df_ranked_teams)

                 Team  Games Played  Wins  Losses  Conference Wins  \
0          Ohio State            12    11       1                8   
1             Alabama            13    13       0                9   
2             Georgia            13    11       2                8   
3          Penn State            13    11       2                8   
4              Oregon            13    12       1                9   
..                ...           ...   ...     ...              ...   
128        Kent State            12     2      10                2   
129        Vanderbilt            12     3       9                1   
130            Temple            12     0      12                0   
131  Louisiana Monroe            12     1      11                1   
132            Nevada            12     0      12                0   

     Conference Losses  Total Points Scored  Total Points Allowed  \
0                    1           431.983324            171.707464   
1                    

In [28]:
import pandas as pd
import numpy as np
import warnings
import scipy.stats

def set_up_playoff_game_stats(df_playoff_games, df_all_stats):
    # Rename columns in df_all_stats for merging with home and away teams
    home_stats = df_all_stats.rename(columns={'Team': 'Home Team'})
    away_stats = df_all_stats.rename(columns={'Team': 'Away Team'})

    # Merge df_playoff_games with home_stats and away_stats
    df_playoff_with_home_stats = df_playoff_games.merge(
        home_stats, left_on=['Home Team'], right_on=['Home Team'], how='left'
    )
    df_playoff_with_full_stats = df_playoff_with_home_stats.merge(
        away_stats, left_on=['Away Team'], right_on=['Away Team'], suffixes=('_home', '_away'), how='left'
    )

    # Remove duplicates
    df_playoff_with_full_stats = df_playoff_with_full_stats.drop_duplicates(subset=['Home Team', 'Away Team'])

    # Adjust columns to match the model's input
    schedule_columns = df_schedule_with_full_stats.columns.tolist()
    playoff_columns = df_playoff_with_full_stats.columns.tolist()

    if schedule_columns != playoff_columns:
        missing_in_playoff = [col for col in schedule_columns if col not in playoff_columns]
        missing_in_schedule = [col for col in playoff_columns if col not in schedule_columns]

        # Add missing columns with default values
        for col in missing_in_playoff:
            df_playoff_with_full_stats[col] = None

        # Reorder columns
        df_playoff_with_full_stats = df_playoff_with_full_stats[schedule_columns]

    # Remove/rename unnecessary columns
    df_playoff_with_full_stats = df_playoff_with_full_stats.drop(columns=['Conference_x', 'Year_home', 'Conference_y', 'Year_away', 'Conference'], errors='ignore')

    # Add missing columns with default values
    missing_columns = ['Year', 'Week', 'Neutral Site', 'Conference Game', 'Home Conference', 'Home Points', 'Away Conference', 'Away Points']
    for col in missing_columns:
        if col not in df_playoff_with_full_stats.columns:
            df_playoff_with_full_stats[col] = None

    # Final column order
    df_playoff_with_full_stats = df_playoff_with_full_stats[schedule_columns]

    # Set 'Conference Game' and 'Neutral Site' to True for playoff games
    df_playoff_with_full_stats['Conference Game'] = True
    df_playoff_with_full_stats['Neutral Site'] = True

    return df_playoff_with_full_stats

def simulate_playoff_round(df_playoff_games, home_model, away_model, scaler, round_name):
    # Simulate the outcomes for each game in the given round
    results = []

    for idx, game in df_playoff_games.iterrows():
        # Home and away teams
        home_team = game['Home Team']
        away_team = game['Away Team']

        # Drop unnecessary columns and reshape the input for the model
        X_game = game.drop(labels=['Year', 'Home Points', 'Away Points', 'Week', 'Home Team', 'Home Conference', 'Away Team', 'Away Conference']).values.reshape(1, -1)

        # Scale the input features
        X_game_scaled = scaler.transform(X_game)

        # Suppress warnings during prediction
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            # Make predictions
            home_mean_array, home_std_array = home_model.predict(X_game_scaled, return_std=True)
            away_mean_array, away_std_array = away_model.predict(X_game_scaled, return_std=True)

            home_mean = home_mean_array[0]
            home_std = home_std_array[0]
            away_mean = away_mean_array[0]
            away_std = away_std_array[0]

        # Calculate spread, win probabilities, and total points
        spread_mean = home_mean - away_mean
        spread_std = np.sqrt(home_std**2 + away_std**2)
        home_win_percentage = 1 - scipy.stats.norm.cdf(0, loc=spread_mean, scale=spread_std)
        away_win_percentage = scipy.stats.norm.cdf(0, loc=spread_mean, scale=spread_std)

        # Determine the winner based on win percentage
        winner = home_team if np.random.rand() < home_win_percentage else away_team

        # Store the results
        results.append({
            'Round': round_name,
            'Home Team': home_team,
            'Away Team': away_team,
            'Winner': winner,
            'Home Mean Score': home_mean,
            'Home Score Std Dev': home_std,
            'Away Mean Score': away_mean,
            'Away Score Std Dev': away_std,
            'Home Spread Mean': spread_mean,
            'Home Spread Std Dev': spread_std,
            'Home Win Percentage': home_win_percentage,
            'Away Win Percentage': away_win_percentage
        })

    # Convert results to DataFrame
    df_results = pd.DataFrame(results)

    return df_results

def simulate_playoff_bracket(df_ranked_teams, home_model, away_model, scaler):
    # Define Round 1 matchups
    round_1_matchups = [
        ('5', '12'),
        ('6', '11'),
        ('7', '10'),
        ('8', '9')
    ]

    # Prepare DataFrame for Round 1
    df_round_1 = pd.DataFrame(columns=['Home Team', 'Away Team'])
    for home_seed, away_seed in round_1_matchups:
        home_team = df_ranked_teams[df_ranked_teams['CFP Seed'] == int(home_seed)]['Team'].values[0]
        away_team = df_ranked_teams[df_ranked_teams['CFP Seed'] == int(away_seed)]['Team'].values[0]
        df_round_1 = pd.concat([df_round_1, pd.DataFrame({
            'Home Team': [home_team],
            'Away Team': [away_team]
        })], ignore_index=True)

    # Set up stats for Round 1
    df_round_1_with_stats = set_up_playoff_game_stats(df_round_1, df_all_stats)

    # Simulate Round 1
    df_round_1_results = simulate_playoff_round(df_round_1_with_stats, home_model, away_model, scaler, 'Round 1')

    # Determine Quarterfinals matchups
    round_1_winners = df_round_1_results[['Winner']].rename(columns={'Winner': 'Team'})
    seeds = [1, 2, 3, 4]
    quarterfinals_matchups = [(df_ranked_teams[df_ranked_teams['CFP Seed'] == seed]['Team'].values[0], round_1_winners.iloc[i]['Team']) for i, seed in enumerate(seeds)]

    # Prepare DataFrame for Quarterfinals
    df_quarterfinals = pd.DataFrame(columns=['Home Team', 'Away Team'])
    for home_team, away_team in quarterfinals_matchups:
        df_quarterfinals = pd.concat([df_quarterfinals, pd.DataFrame({
            'Home Team': [home_team],
            'Away Team': [away_team]
        })], ignore_index=True)

    # Set up stats for Quarterfinals
    df_quarterfinals_with_stats = set_up_playoff_game_stats(df_quarterfinals, df_all_stats)

    # Simulate Quarterfinals
    df_quarterfinals_results = simulate_playoff_round(df_quarterfinals_with_stats, home_model, away_model, scaler, 'Quarterfinals')

    # Determine Semifinals matchups
    semifinal_winners = df_quarterfinals_results[['Winner']].rename(columns={'Winner': 'Team'})
    semifinal_matchups = [(semifinal_winners.iloc[i]['Team'], semifinal_winners.iloc[i+1]['Team']) for i in range(0, len(semifinal_winners), 2)]

    # Prepare DataFrame for Semifinals
    df_semifinals = pd.DataFrame(columns=['Home Team', 'Away Team'])
    for home_team, away_team in semifinal_matchups:
        df_semifinals = pd.concat([df_semifinals, pd.DataFrame({
            'Home Team': [home_team],
            'Away Team': [away_team]
        })], ignore_index=True)

    # Set up stats for Semifinals
    df_semifinals_with_stats = set_up_playoff_game_stats(df_semifinals, df_all_stats)

    # Simulate Semifinals
    df_semifinals_results = simulate_playoff_round(df_semifinals_with_stats, home_model, away_model, scaler, 'Semifinals')

    # Determine Championship matchup
    championship_matchup = df_semifinals_results[['Winner']].rename(columns={'Winner': 'Team'}).values.tolist()
    df_championship = pd.DataFrame({
        'Home Team': [championship_matchup[0][0]],
        'Away Team': [championship_matchup[1][0]]
    })

    # Set up stats for Championship
    df_championship_with_stats = set_up_playoff_game_stats(df_championship, df_all_stats)

    # Simulate Championship
    df_championship_results = simulate_playoff_round(df_championship_with_stats, home_model, away_model, scaler, 'Championship')

    # Combine all results
    all_results = pd.concat([
        df_round_1_results,
        df_quarterfinals_results,
        df_semifinals_results,
        df_championship_results
    ], ignore_index=True)

    return all_results

# Simulate the playoff bracket
playoff_results = simulate_playoff_bracket(df_ranked_teams, gpr_home, gpr_away, scaler)
print(playoff_results)






            Round      Home Team     Away Team      Winner  Home Mean Score  \
0         Round 1     Ohio State      Missouri  Ohio State        36.886829   
1         Round 1        Georgia  Kansas State     Georgia        36.926761   
2         Round 1     Penn State         Texas  Penn State        29.324870   
3         Round 1     Notre Dame      Michigan  Notre Dame        27.627110   
4   Quarterfinals        Alabama    Ohio State     Alabama        29.409424   
5   Quarterfinals         Oregon       Georgia      Oregon        31.965325   
6   Quarterfinals  Florida State    Penn State  Penn State        27.750133   
7   Quarterfinals        Liberty    Notre Dame  Notre Dame        21.426874   
8      Semifinals        Alabama        Oregon     Alabama        34.114175   
9      Semifinals     Penn State    Notre Dame  Penn State        30.623624   
10   Championship        Alabama    Penn State     Alabama        30.125072   

    Home Score Std Dev  Away Mean Score  Away Score



In [29]:
def update_playoff_wins(df_ranked_teams, playoff_results):
    # Initialize 'Playoff Wins' column to 0
    df_ranked_teams['Playoff Wins'] = 0

    # Count playoff wins for each team
    # Include all rounds for calculating playoff wins
    playoff_wins_count = playoff_results.groupby('Winner').size()

    # Update the 'Playoff Wins' column
    for team, wins in playoff_wins_count.items():
        if team in df_ranked_teams['Team'].values:
            df_ranked_teams.loc[df_ranked_teams['Team'] == team, 'Playoff Wins'] = wins

    return df_ranked_teams

# Example usage
# df_ranked_teams = ... # Your ranked teams DataFrame
# playoff_results = ... # The results from the simulate_playoff_bracket function

# Update the df_ranked_teams with playoff wins
df_ranked_teams_updated = update_playoff_wins(df_ranked_teams, playoff_results)

# Display the updated DataFrame
print(df_ranked_teams_updated.head())

         Team  Games Played  Wins  Losses  Conference Wins  Conference Losses  \
0  Ohio State            12    11       1                8                  1   
1     Alabama            13    13       0                9                  0   
2     Georgia            13    11       2                8                  1   
3  Penn State            13    11       2                8                  1   
4      Oregon            13    12       1                9                  1   

   Total Points Scored  Total Points Allowed  Total Wins Prob  \
0           431.983324            171.707464        10.822712   
1           445.196996            222.543786        12.448572   
2           449.633035            236.061258        11.019181   
3           418.962769            207.619030        11.468557   
4           467.582915            273.812374        11.707229   

   Conference Wins Prob  ...  Conference Champion  Conference Runner Up  \
0              7.822712  ...                Fal

In [30]:
# prompt: reorder the df_ranked_teams_updated columns so that the make more sense. I want these columns first 'FBS Rank', 'Team', 'Conference', 'Wins', 'Losses', 'CFP Seed', 'Playoff Wins', 'Conference Champion', then fill in the rest of the columns

# Define the desired order of columns
desired_order = ['FBS Rank', 'Team', 'Conference', 'Games Played', 'Wins', 'Losses', 'CFP Seed', 'Playoff Wins', 'Conference Champion', 'Conference Runner Up', 'Conference Rank']

# Get the remaining columns
remaining_columns = [col for col in df_ranked_teams_updated.columns if col not in desired_order]

# Concatenate the desired order and remaining columns
final_order = desired_order + remaining_columns

# Reorder the DataFrame
df_ranked_teams_updated = df_ranked_teams_updated[final_order]

# Display the reordered DataFrame
print(df_ranked_teams_updated.head())


   FBS Rank        Team Conference  Games Played  Wins  Losses  CFP Seed  \
0         1  Ohio State    Big Ten            12    11       1       5.0   
1         2     Alabama        SEC            13    13       0       1.0   
2         3     Georgia        SEC            13    11       2       6.0   
3         4  Penn State    Big Ten            13    11       2       7.0   
4         5      Oregon    Big Ten            13    12       1       2.0   

   Playoff Wins  Conference Champion  Conference Runner Up  ...  \
0             1                False                 False  ...   
1             3                 True                 False  ...   
2             1                False                  True  ...   
3             3                False                  True  ...   
4             1                 True                 False  ...   

   Average Points Scored  Average Points Allowed  Point Differential_x  \
0              35.998610               14.308955             21.68

In [31]:
# prompt: save df_final_rankings to my drive

from google.colab import drive
drive.mount('/content/drive')
df_ranked_teams_updated.to_csv('/content/drive/MyDrive/CFB_Model/2024_season_predictions.csv', index=False)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
