In [158]:
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, ConfusionMatrixDisplay, classification_report

In [159]:
power_5 = ['Big Ten', 'Big 12', 'SEC', 'ACC', 'Pac-12']
power_4 = ['Big Ten', 'Big 12', 'SEC', 'ACC']

dfs = []

for yr in range(2014, 2025 + 1):
    standings_raw = pd.read_csv(f"sports-reference-data/sr-standings-{yr}.csv")

    year_df = pd.DataFrame({
        'Year': yr,
        'Team': standings_raw['School'],
        'Conference': standings_raw['Conf'],
        'Wins': standings_raw['W'],
        'Losses': standings_raw['L'],
        'SoS': standings_raw['SOS'],
    })

    year_df['Conference'] = year_df['Conference'].str.replace(r'\s*\(.*\)', '', regex=True)
    year_df['Games'] = year_df['Wins'] + year_df['Losses']
    year_df['Power_Conf'] = year_df['Conference'].isin(
        power_5 if yr >= 2024 else power_4
    ).astype(int)
    
    possible_rank_cols = ['AP Rank', 'AP Curr']

    rank_series = next(
        (standings_raw[c] for c in possible_rank_cols if c in standings_raw.columns),
        None
    )

    year_df['Ranked'] = rank_series.notna().astype(int) if rank_series is not None else 0


    dfs.append(year_df)

df = pd.concat(dfs, ignore_index=True)
df.head()




Unnamed: 0,Year,Team,Conference,Wins,Losses,SoS,Games,Power_Conf,Ranked
0,2014,Florida State,ACC,13,1,5.13,14,1,1
1,2014,Clemson,ACC,10,3,2.86,13,1,1
2,2014,Louisville,ACC,9,4,3.22,13,1,1
3,2014,Boston College,ACC,7,6,2.35,13,1,0
4,2014,North Carolina State,ACC,8,5,1.25,13,1,0


In [160]:
df['Final_4'] = 0
df['National_Champ'] = 0

df['Conf_Champ'] = 0
df['Conf_Finalist'] = 0

df['Final_12'] = 0

df.head()

Unnamed: 0,Year,Team,Conference,Wins,Losses,SoS,Games,Power_Conf,Ranked,Final_4,National_Champ,Conf_Champ,Conf_Finalist,Final_12
0,2014,Florida State,ACC,13,1,5.13,14,1,1,0,0,0,0,0
1,2014,Clemson,ACC,10,3,2.86,13,1,1,0,0,0,0,0
2,2014,Louisville,ACC,9,4,3.22,13,1,1,0,0,0,0,0
3,2014,Boston College,ACC,7,6,2.35,13,1,0,0,0,0,0,0
4,2014,North Carolina State,ACC,8,5,1.25,13,1,0,0,0,0,0,0


In [161]:
# ============================================================================
# College Football Playoff Data Processing
# Handles both 4-team (2014-2023) and 12-team (2025+) playoff formats
# ============================================================================

def remove_playoff_games(df, year, playoff_results):
    """
    Remove playoff games from team statistics.
    
    Args:
        df: DataFrame with team statistics
        year: Season year
        playoff_results: Dict mapping team -> {'Games': int, 'Wins': int, 'Losses': int, 'Round': str}
    """
    if year not in playoff_results:
        return
    
    for team, results in playoff_results[year].items():
        mask = (df['Year'] == year) & (df['Team'] == team)
        df.loc[mask, 'Games'] -= results['Games']
        df.loc[mask, 'Wins'] -= results['Wins']
        df.loc[mask, 'Losses'] -= results['Losses']


def assign_targets(df, year, playoff_results, ny6_bowls=None):
    """
    Assign all playoff-related target variables based on playoff results.
    
    Target variables:
        - Final_12: Made NY6 bowl (2014-2023) or 12-team playoff (2025+)
        - Final_4: Made semifinals or better (final 4 teams)
        - National_Champ: Won the championship
    
    Args:
        ny6_bowls: Optional dict of NY6 participants (for 2014-2023)
    """
    if year not in playoff_results:
        return
    
    teams = playoff_results[year]
    
    # Final_12 assignment
    if ny6_bowls and year in ny6_bowls:
        # 2014-2023: Use NY6 bowl participants list
        df.loc[
            (df['Year'] == year) & (df['Team'].isin(ny6_bowls[year])),
            'Final_12'
        ] = 1
    else:
        # 2024+: All playoff teams
        df.loc[
            (df['Year'] == year) & (df['Team'].isin(teams.keys())),
            'Final_12'
        ] = 1
    
    # Teams that reached semifinals or better get Final_4
    semifinalists = [
        team for team, r in teams.items()
        if r['Round'] in {'Semifinal', 'Final', 'Champion'}
    ]
    df.loc[
        (df['Year'] == year) & (df['Team'].isin(semifinalists)),
        'Final_4'
    ] = 1
    
    # Champion
    champions = [team for team, r in teams.items() if r['Round'] == 'Champion']
    if champions:
        df.loc[
            (df['Year'] == year) & (df['Team'] == champions[0]),
            'National_Champ'
        ] = 1



def assign_conf_champs(df, year, conf_champs, conf_finalists):
    """
    Assign conference championship target variables.
    
    Args:
        conf_champs: List of conference champions
        conf_finalists: List of conference finalists (includes champs)
    """
    # FIXED: Remove extra brackets - conf_champs is already a list
    df.loc[
        (df['Year'] == year) & (df['Team'].isin(conf_champs)),
        'Conf_Champ'
    ] = 1
    
    df.loc[
        (df['Year'] == year) & (df['Team'].isin(conf_finalists)),
        'Conf_Finalist'
    ] = 1


# ============================================================================
# NY6 BOWL PARTICIPANTS (2014-2023 only)
# For 2024+, all 12 playoff teams count as Final_12
# ============================================================================

ny6_bowls = {
    2014: ['Boise State', 'Arizona', 'TCU', 'Ole Miss', 'Georgia Tech', 'Mississippi State',
           'Michigan State', 'Baylor', 'Oregon', 'Florida State', 'Ohio State', 'Alabama'],
    
    2015: ['Houston', 'Florida State', 'Clemson', 'Oklahoma', 'Alabama', 'Michigan State',
           'Ohio State', 'Notre Dame', 'Stanford', 'Iowa', 'Ole Miss', 'Oklahoma State'],
    
    2016: ['Florida State', 'Michigan', 'Alabama', 'Washington', 'Clemson', 'Ohio State',
           'Wisconsin', 'Western Michigan', 'USC', 'Penn State', 'Oklahoma', 'Auburn'],
    
    2017: ['Ohio State', 'USC', 'Penn State', 'Washington', 'Wisconsin', 'Miami',
           'UCF', 'Auburn', 'Georgia', 'Oklahoma', 'Alabama', 'Clemson'],
    
    2018: ['Florida', 'Michigan', 'Clemson', 'Notre Dame', 'Alabama', 'Oklahoma',
           'LSU', 'UCF', 'Ohio State', 'Washington', 'Texas', 'Georgia'],
    
    2019: ['Penn State', 'Memphis', 'LSU', 'Oklahoma', 'Clemson', 'Ohio State',
           'Florida', 'Virginia', 'Oregon', 'Wisconsin', 'Georgia', 'Baylor'],
    
    2020: ['Oklahoma', 'Florida', 'Georgia', 'Cincinnati', 'Alabama', 'Notre Dame',
           'Ohio State', 'Clemson', 'Texas A&M', 'North Carolina', 'Iowa State', 'Oregon'],
    
    2021: ['Michigan State', 'Pitt', 'Alabama', 'Cincinnati', 'Georgia', 'Michigan',
           'Oklahoma State', 'Notre Dame', 'Ohio State', 'Utah', 'Baylor', 'Ole Miss'],
    
    2022: ['Tennessee', 'Clemson', 'Alabama', 'Kansas State', 'TCU', 'Michigan',
           'Georgia', 'Ohio State', 'Tulane', 'USC', 'Penn State', 'Utah'],
    
    2023: ['Missouri', 'Ohio State', 'Ole Miss', 'Penn State', 'Georgia', 'Florida State',
           'Oregon', 'Liberty', 'Michigan', 'Alabama', 'Washington', 'Texas'],
}


# ============================================================================
# PLAYOFF RESULTS - Authoritative source of truth
# ============================================================================

playoff_results = {
    # 2014-2015 Season (4-team playoff)
    2014: {
        'Ohio State':    {'Games': 2, 'Wins': 2, 'Losses': 0, 'Round': 'Champion'},
        'Oregon':        {'Games': 2, 'Wins': 1, 'Losses': 1, 'Round': 'Final'},
        'Alabama':       {'Games': 1, 'Wins': 0, 'Losses': 1, 'Round': 'Semifinal'},
        'Florida State': {'Games': 1, 'Wins': 0, 'Losses': 1, 'Round': 'Semifinal'},
    },
    
    # 2015-2016 Season
    2015: {
        'Alabama':        {'Games': 2, 'Wins': 2, 'Losses': 0, 'Round': 'Champion'},
        'Clemson':        {'Games': 2, 'Wins': 1, 'Losses': 1, 'Round': 'Final'},
        'Oklahoma':       {'Games': 1, 'Wins': 0, 'Losses': 1, 'Round': 'Semifinal'},
        'Michigan State': {'Games': 1, 'Wins': 0, 'Losses': 1, 'Round': 'Semifinal'},
    },
    
    # 2016-2017 Season
    2016: {
        'Clemson':    {'Games': 2, 'Wins': 2, 'Losses': 0, 'Round': 'Champion'},
        'Alabama':    {'Games': 2, 'Wins': 1, 'Losses': 1, 'Round': 'Final'},
        'Ohio State': {'Games': 1, 'Wins': 0, 'Losses': 1, 'Round': 'Semifinal'},
        'Washington': {'Games': 1, 'Wins': 0, 'Losses': 1, 'Round': 'Semifinal'},
    },
    
    # 2017-2018 Season
    2017: {
        'Alabama': {'Games': 2, 'Wins': 2, 'Losses': 0, 'Round': 'Champion'},
        'Georgia': {'Games': 2, 'Wins': 1, 'Losses': 1, 'Round': 'Final'},
        'Oklahoma': {'Games': 1, 'Wins': 0, 'Losses': 1, 'Round': 'Semifinal'},
        'Clemson':  {'Games': 1, 'Wins': 0, 'Losses': 1, 'Round': 'Semifinal'},
    },
    
    # 2018-2019 Season
    2018: {
        'Clemson':    {'Games': 2, 'Wins': 2, 'Losses': 0, 'Round': 'Champion'},
        'Alabama':    {'Games': 2, 'Wins': 1, 'Losses': 1, 'Round': 'Final'},
        'Notre Dame': {'Games': 1, 'Wins': 0, 'Losses': 1, 'Round': 'Semifinal'},
        'Oklahoma':   {'Games': 1, 'Wins': 0, 'Losses': 1, 'Round': 'Semifinal'},
    },
    
    # 2019-2020 Season
    2019: {
        'LSU':       {'Games': 2, 'Wins': 2, 'Losses': 0, 'Round': 'Champion'},
        'Clemson':   {'Games': 2, 'Wins': 1, 'Losses': 1, 'Round': 'Final'},
        'Ohio State': {'Games': 1, 'Wins': 0, 'Losses': 1, 'Round': 'Semifinal'},
        'Oklahoma':   {'Games': 1, 'Wins': 0, 'Losses': 1, 'Round': 'Semifinal'},
    },
    
    # 2020-2021 Season
    2020: {
        'Alabama':    {'Games': 2, 'Wins': 2, 'Losses': 0, 'Round': 'Champion'},
        'Ohio State': {'Games': 2, 'Wins': 1, 'Losses': 1, 'Round': 'Final'},
        'Clemson':    {'Games': 1, 'Wins': 0, 'Losses': 1, 'Round': 'Semifinal'},
        'Notre Dame': {'Games': 1, 'Wins': 0, 'Losses': 1, 'Round': 'Semifinal'},
    },
    
    # 2021-2022 Season
    2021: {
        'Georgia':   {'Games': 2, 'Wins': 2, 'Losses': 0, 'Round': 'Champion'},
        'Alabama':   {'Games': 2, 'Wins': 1, 'Losses': 1, 'Round': 'Final'},
        'Michigan':  {'Games': 1, 'Wins': 0, 'Losses': 1, 'Round': 'Semifinal'},
        'Cincinnati': {'Games': 1, 'Wins': 0, 'Losses': 1, 'Round': 'Semifinal'},
    },
    
    # 2022-2023 Season
    2022: {
        'Georgia':   {'Games': 2, 'Wins': 2, 'Losses': 0, 'Round': 'Champion'},
        'TCU':       {'Games': 2, 'Wins': 1, 'Losses': 1, 'Round': 'Final'},
        'Michigan':  {'Games': 1, 'Wins': 0, 'Losses': 1, 'Round': 'Semifinal'},
        'Ohio State': {'Games': 1, 'Wins': 0, 'Losses': 1, 'Round': 'Semifinal'},
    },
    
    # 2023-2024 Season
    2023: {
        'Michigan':  {'Games': 2, 'Wins': 2, 'Losses': 0, 'Round': 'Champion'},
        'Washington': {'Games': 2, 'Wins': 1, 'Losses': 1, 'Round': 'Final'},
        'Alabama':   {'Games': 1, 'Wins': 0, 'Losses': 1, 'Round': 'Semifinal'},
        'Texas':     {'Games': 1, 'Wins': 0, 'Losses': 1, 'Round': 'Semifinal'},
    },
    
    # 2024-2025 Season (12-team playoff)
    2025: {
        # Champion (4 games)
        'Ohio State':   {'Games': 4, 'Wins': 4, 'Losses': 0, 'Round': 'Champion'},
        
        # Runner-up (4 games)
        'Notre Dame':   {'Games': 4, 'Wins': 3, 'Losses': 1, 'Round': 'Final'},
        
        # Semifinal losers (3 games)
        'Texas':        {'Games': 3, 'Wins': 2, 'Losses': 1, 'Round': 'Semifinal'},
        'Penn State':   {'Games': 3, 'Wins': 1, 'Losses': 2, 'Round': 'Semifinal'},
        
        # Quarterfinal losers with byes (2 games, 0 wins)
        'Georgia':      {'Games': 2, 'Wins': 0, 'Losses': 2, 'Round': 'Quarterfinal'},
        'Oregon':       {'Games': 2, 'Wins': 1, 'Losses': 1, 'Round': 'Quarterfinal'},
        'Boise State':  {'Games': 2, 'Wins': 1, 'Losses': 1, 'Round': 'Quarterfinal'},
        'Arizona State': {'Games': 2, 'Wins': 1, 'Losses': 1, 'Round': 'Quarterfinal'},
        
        # First round losers (1 game)
        'Tennessee':    {'Games': 1, 'Wins': 0, 'Losses': 1, 'Round': 'First Round'},
        'Indiana':      {'Games': 1, 'Wins': 0, 'Losses': 1, 'Round': 'First Round'},
        'SMU':          {'Games': 1, 'Wins': 0, 'Losses': 1, 'Round': 'First Round'},
        'Clemson':      {'Games': 1, 'Wins': 0, 'Losses': 1, 'Round': 'First Round'},
    },
}


# ============================================================================
# CONFERENCE CHAMPIONSHIP DATA
# ============================================================================

conf_data = {
    2014: {
        'finalists': [
            'Florida State', 'Georgia Tech',  # ACC
            'Ohio State', 'Wisconsin',        # Big Ten
            'Alabama', 'Missouri',            # SEC
            'Oregon', 'Arizona',              # Pac-12
        ],
        'champs': [
            'Florida State',  # ACC
            'Ohio State',     # Big Ten
            'Alabama',        # SEC
            'Baylor', 'TCU',  # Big 12 (co-champs)
            'Oregon',         # Pac-12
        ],
    },
    
    2015: {
        'finalists': [
            'North Carolina', 'Clemson',      # ACC
            'Michigan State', 'Iowa',         # Big Ten
            'Alabama', 'Florida',             # SEC
            'Stanford', 'USC',                # Pac-12
        ],
        'champs': [
            'Clemson',        # ACC
            'Michigan State', # Big Ten
            'Alabama',        # SEC
            'Oklahoma',       # Big 12
            'Stanford',       # Pac-12
        ],
    },
    
    2016: {
        'finalists': [
            'Clemson', 'Virginia Tech',       # ACC
            'Penn State', 'Wisconsin',        # Big Ten
            'Alabama', 'Florida',             # SEC
            'Washington', 'Colorado',         # Pac-12
        ],
        'champs': [
            'Clemson',    # ACC
            'Penn State', # Big Ten
            'Alabama',    # SEC
            'Oklahoma',   # Big 12
            'Washington', # Pac-12
        ],
    },
    
    2017: {
        'finalists': [
            'Clemson', 'Miami',               # ACC
            'Ohio State', 'Wisconsin',        # Big Ten
            'Georgia', 'Auburn',              # SEC
            'Oklahoma', 'TCU',                # Big 12
            'USC', 'Stanford',                # Pac-12
        ],
        'champs': [
            'Clemson',    # ACC
            'Ohio State', # Big Ten
            'Georgia',    # SEC
            'TCU',        # Big 12
            'USC',        # Pac-12
        ],
    },
    
    2018: {
        'finalists': [
            'Clemson', 'Pittsburgh',          # ACC
            'Ohio State', 'Northwestern',     # Big Ten
            'Alabama', 'Georgia',             # SEC
            'Oklahoma', 'Texas',              # Big 12
            'Washington', 'Utah',             # Pac-12
        ],
        'champs': [
            'Clemson',    # ACC
            'Ohio State', # Big Ten
            'Alabama',    # SEC
            'Oklahoma',   # Big 12
            'Washington', # Pac-12
        ],
    },
    
    2019: {
        'finalists': [
            'Clemson', 'Virginia',            # ACC
            'Ohio State', 'Wisconsin',        # Big Ten
            'LSU', 'Georgia',                 # SEC
            'Oklahoma', 'Baylor',             # Big 12
            'Oregon', 'Utah',                 # Pac-12
        ],
        'champs': [
            'Clemson',    # ACC
            'Ohio State', # Big Ten
            'LSU',        # SEC
            'Oklahoma',   # Big 12
            'Oregon',     # Pac-12
        ],
    },
    
    2020: {
        'finalists': [
            'Clemson', 'Notre Dame',          # ACC
            'Northwestern', 'Ohio State',     # Big Ten
            'Alabama', 'Florida',             # SEC
            'Oklahoma', 'Iowa State',         # Big 12
            'Oregon', 'USC',                  # Pac-12
        ],
        'champs': [
            'Clemson',    # ACC
            'Ohio State', # Big Ten
            'Alabama',    # SEC
            'Oklahoma',   # Big 12
            'Oregon',     # Pac-12
        ],
    },
    
    2021: {
        'finalists': [
            'Pittsburgh', 'Wake Forest',      # ACC
            'Michigan', 'Iowa',               # Big Ten
            'Georgia', 'Alabama',             # SEC
            'Baylor', 'Oklahoma State',       # Big 12
            'Utah', 'Oregon',                 # Pac-12
        ],
        'champs': [
            'Pittsburgh', # ACC
            'Michigan',   # Big Ten
            'Alabama',    # SEC
            'Baylor',     # Big 12
            'Utah',       # Pac-12
        ],
    },
    
    2022: {
        'finalists': [
            'Clemson', 'North Carolina',      # ACC
            'Michigan', 'Purdue',             # Big Ten
            'Georgia', 'LSU',                 # SEC
            'Kansas State', 'TCU',            # Big 12
            'Utah', 'USC',                    # Pac-12
        ],
        'champs': [
            'Clemson',      # ACC
            'Michigan',     # Big Ten
            'Georgia',      # SEC
            'Kansas State', # Big 12
            'Utah',         # Pac-12
        ],
    },
    
    2023: {
        'finalists': [
            'Louisville', 'Florida State',    # ACC
            'Michigan', 'Iowa',               # Big Ten
            'Georgia', 'Alabama',             # SEC
            'Oklahoma State', 'Texas',        # Big 12
            'Washington', 'Oregon',           # Pac-12
            
            
        ],
        'champs': [
            'Florida State', # ACC
            'Michigan',      # Big Ten
            'Georgia',       # SEC
            'Texas',         # Big 12
            'Washington',    # Pac-12
        ],
    },
    
    2025: {
        'finalists': [
            'Clemson', 'SMU',                 # ACC
            'Oregon', 'Penn State',           # Big Ten
            'Georgia', 'Texas',               # SEC
            'Arizona State', 'Iowa State',    # Big 12
        ],
        'champs': [
            'Clemson', 
            'Oregon',              # Big Ten
            'Georgia',             # SEC
            'Arizona State',       # Big 12
        ],
    },
    
    2025: {
        'finalists': [
            'Virginia', 'Duke',                    # ACC
            'Indiana', 'Ohio State',               # Big Ten
            'Georgia', 'Alabama',                  # SEC
            'Texas Tech', 'BYU',                   # Big 12
            'Ohio', 'Miami (OH)',                  # MAC
            'Boise State', 'UNLV',                 # Mountain West
            'Tulane', 'North Texas',               # AAC
            'James Madison', 'Troy',               # Sun Belt
            'Kennesaw State', 'Jacksonville State',# CUSA
        ],
        'champs': [
            'Duke',           # ACC
            'Indiana',        # Big Ten
            'Georgia',        # SEC
            'Texas Tech',     # Big 12
            'Ohio',           # MAC
            'Boise State',    # Mountain West
            'Tulane',         # AAC
            'James Madison',  # Sun Belt
            'Kennesaw State', # CUSA
        ],
    },
}


# ============================================================================
# APPLY ALL ADJUSTMENTS
# ============================================================================

def process_all_years(df):
    """Apply all playoff and conference championship adjustments to dataframe."""
    
    # Process playoff results
    for year in playoff_results.keys():
        remove_playoff_games(df, year, playoff_results)
        assign_targets(df, year, playoff_results, ny6_bowls)
    
    # Process conference championships
    for year, data in conf_data.items():
        assign_conf_champs(df, year, data['champs'], data['finalists'])
    
    return df


# ============================================================================
# USAGE EXAMPLE
# ============================================================================

if __name__ == '__main__':
    import pandas as pd
    
    # Example usage:
    # df = pd.read_csv('your_data.csv')
    # df = process_all_years(df)
    # df.to_csv('processed_data.csv', index=False)
    
    print("Playoff processing module loaded successfully.")
    print(f"Configured for years: {sorted(playoff_results.keys())}")
    print(f"Conference data for years: {sorted(conf_data.keys())}")

Playoff processing module loaded successfully.
Configured for years: [2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
Conference data for years: [2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025]


In [162]:
process_all_years(df)

df['Win_Pct'] = df['Wins'] / df['Games']
df.to_csv("processed-data-cfb.csv", index=False)
df.head()


Unnamed: 0,Year,Team,Conference,Wins,Losses,SoS,Games,Power_Conf,Ranked,Final_4,National_Champ,Conf_Champ,Conf_Finalist,Final_12,Win_Pct
0,2014,Florida State,ACC,13,0,5.13,13,1,1,1,0,1,1,1,1.0
1,2014,Clemson,ACC,10,3,2.86,13,1,1,0,0,0,0,0,0.769231
2,2014,Louisville,ACC,9,4,3.22,13,1,1,0,0,0,0,0,0.692308
3,2014,Boston College,ACC,7,6,2.35,13,1,0,0,0,0,0,0,0.538462
4,2014,North Carolina State,ACC,8,5,1.25,13,1,0,0,0,0,0,0,0.615385


In [163]:
df.isnull().sum()


Year              0
Team              0
Conference        0
Wins              0
Losses            0
SoS               0
Games             0
Power_Conf        0
Ranked            0
Final_4           0
National_Champ    0
Conf_Champ        0
Conf_Finalist     0
Final_12          0
Win_Pct           0
dtype: int64

In [164]:
def prediction(df, yr, target, model_name, head=12, max=15):
    # Separate year data as holdout set
    df_train = df[df['Year'] < yr].copy()
    df_pred_year = df[df['Year'] == yr].copy()


    # Split features and target from training data (2014-2024)
    X = df_train[['Wins', 'Losses', 'SoS', 'Power_Conf', 'Ranked', 
                'Conf_Champ', 'Conf_Finalist', 'Win_Pct']].values
    
    y = df_train[target]

    # Split into training and testing sets (using historical data only)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, 
        test_size=0.20,
        random_state=0,
        stratify=y
    )

    if (model_name == 'RandomForest'): 
        model = RandomForestClassifier(n_estimators=400, 
                                    n_jobs=-1,
                                    random_state=42,
                                    criterion='gini', # speed
                                    class_weight='balanced' # {0:, .5, 1:3} (weights dict)
                                    ) 
    elif (model_name == 'XGBoost'):
        scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
        
        model = xgb.XGBClassifier(
            n_estimators=400,
            learning_rate=0.1,
            max_depth=6,
            min_child_weight=1,
            gamma=0,
            subsample=0.8,
            colsample_bytree=0.8,
            scale_pos_weight=scale_pos_weight,  # Handle class imbalance
            objective='binary:logistic',
            random_state=42,
            n_jobs=-1,
            eval_metric='logloss'
        )
    else:
        print(f"Model '{model_name} not found")

    # Train/fit
    model.fit(X_train, y_train)

    # Predictions on test set
    y_pred_train = model.predict(X_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]


    # accuracy
    # train_acc = accuracy_score(y_train, y_pred_train)
    # print(f"Train Accuracy: {train_acc}")
    # accuracy = accuracy_score(y_test, y_pred)
    # print(f"Test Accuracy: {accuracy}")
    # print(classification_report(y_test, y_pred))

    # Prepare 2025 data
    X_pred_year = df_pred_year[['Wins', 'Losses', 'SoS', 'Power_Conf', 'Ranked', 
                            'Conf_Champ', 'Conf_Finalist', 'Win_Pct']].values

    # Predictions 
    y_pred_year = model.predict(X_pred_year)
    y_pred_proba_year = model.predict_proba(X_pred_year)[:, 1]

    # Add predictions back to dataframe
    df_pred_year['Prediction'] = y_pred_year
    df_pred_year['Prediction'] = df_pred_year['Prediction'].map({0:"no", 1:"yes"})  
    df_pred_year['Probability'] = y_pred_proba_year

    # # Sort by probability to see top candidates
    # print(f"{yr} Top {head} Predicted {target} ({model_name}):")
    # print(df_pred_year[['Team', 'Probability']]
    #     .sort_values('Probability', ascending=False).head(head))

    # # Outside the bubble
    # print(df_pred_year[['Team', 'Probability']]
    #     .sort_values('Probability', ascending=False).iloc[head:max])
    
    df_pred_year = df_pred_year[['Team', 'Probability']].sort_values('Probability', 
                                                                     ascending=False)

    return df_pred_year


In [166]:
# # extract feature importances and plot
# feature_importances = pd.Series(model.feature_importances_, 
#                                 index=X_vars.columns).sort_values(ascending=False)
# feature_importances.plot.bar()
# plt.show()


In [239]:
rf_12_2024 = prediction(df, 2024, "Final_12", "RandomForest")

xg_12_2024 = prediction(df, 2024, "Final_12", "XGBoost")
real_12_2024 = pd.DataFrame({
    "Rank": [
        1, 2, 3, 4, 5,
        6, 7, 8, 9, 10,
        11, 12, 13, 14, 
        15, 16, 17, 18
    ],
    "Team": [
        "Oregon",
        "Georgia",
        "Texas",
        "Penn State",
        
        "Notre Dame",
        "Ohio State",
        "Tennessee",
        "Indiana",
        "Boise State",
        "SMU",
        "Arizona State",
        "Clemson",
        
        "Alabama",
        "Miami (FL)",
        "Ole Miss",
        "South Carolina",
        "BYU",
        "Iowa State",
    ]
})

# Start fresh with actual rankings
comp_12_2024 = real_12_2024.copy()
comp_12_2024['Rank'] = comp_12_2024['Rank']

# Get RF predictions
comp_12_2024['RF Pred'] = rf_12_2024['Team'].iloc[:18].values

# Calculate RF accuracy: Find actual seed of predicted team
comp_12_2024['RF acc'] = comp_12_2024.apply(
    lambda row: real_12_2024[real_12_2024['Team'] == row['RF Pred']]['Rank'].values[0] - row['Rank']
    if row['RF Pred'] in real_12_2024['Team'].values else None,
    axis=1
)

# Format RF accuracy
comp_12_2024['RF acc'] = comp_12_2024['RF acc'].apply(
    lambda x: f"+{int(x)}" if x > 0 else (f"{int(x)}" if x < 0 else "0") if pd.notna(x) else "N/A"
)

# Get XGB predictions
comp_12_2024['XGB Pred'] = xg_12_2024['Team'].iloc[:18].values

# Calculate XGB accuracy: Find actual seed of predicted team
comp_12_2024['XGB acc'] = comp_12_2024.apply(
    lambda row: real_12_2024[real_12_2024['Team'] == row['XGB Pred']]['Rank'].values[0] - row['Rank']
    if row['XGB Pred'] in real_12_2024['Team'].values else None,
    axis=1
)

# Format XGB accuracy
comp_12_2024['XGB acc'] = comp_12_2024['XGB acc'].apply(
    lambda x: f"+{int(x)}" if x > 0 else (f"{int(x)}" if x < 0 else "0") if pd.notna(x) else "N/A"
)

# Select and reorder columns
comp_12_2024 = comp_12_2024[['Team', 'Rank', 'RF Pred', 'RF acc', 'XGB Pred', 'XGB acc']]

comp_12_2024.head(15)

Unnamed: 0,Team,Rank,RF Pred,RF acc,XGB Pred,XGB acc
0,Oregon,1,Oregon,0.0,Oregon,0.0
1,Georgia,2,Penn State,2.0,Penn State,2.0
2,Texas,3,Georgia,-1.0,Georgia,-1.0
3,Penn State,4,Texas,-1.0,Arizona State,7.0
4,Notre Dame,5,Arizona State,6.0,Miami (FL),9.0
5,Ohio State,6,SMU,4.0,SMU,4.0
6,Tennessee,7,Clemson,5.0,Texas,-4.0
7,Indiana,8,Notre Dame,-3.0,Clemson,4.0
8,Boise State,9,Indiana,-1.0,Indiana,-1.0
9,SMU,10,Miami (FL),4.0,Ohio State,-4.0


In [247]:
# Convert accuracy back to numeric for calculations
rf_acc_numeric = comp_12_2024['RF acc'].apply(
    lambda x: int(x) if x not in ['N/A', None] else None
)
xg_acc_numeric = comp_12_2024['XGB acc'].apply(
    lambda x: int(x) if x not in ['N/A', None] else None
)

# RF Statistics
rf_avg_off = rf_acc_numeric.abs().mean()
rf_most_overrated_idx = rf_acc_numeric.idxmin()  # Changed: min is overrated (negative = ranked too high)
rf_most_overrated_team = comp_12_2024.loc[rf_most_overrated_idx, 'RF Pred']
rf_most_overrated_acc = comp_12_2024.loc[rf_most_overrated_idx, 'RF acc']
rf_most_underrated_idx = rf_acc_numeric.idxmax()  # Changed: max is underrated (positive = ranked too low)
rf_most_underrated_team = comp_12_2024.loc[rf_most_underrated_idx, 'RF Pred']
rf_most_underrated_acc = comp_12_2024.loc[rf_most_underrated_idx, 'RF acc']

# Count RF playoff teams (top 12 actual teams that appear in top 12 
# s)
rf_top12_predicted = set(rf_12_2024['Team'].iloc[:12].values)
actual_top12 = set(real_12_2024['Team'].iloc[:12].values)
rf_playoff_correct = len(rf_top12_predicted.intersection(actual_top12))

# XGB Statistics
xg_avg_off = xg_acc_numeric.abs().mean()
xg_most_overrated_idx = xg_acc_numeric.idxmin()  # Changed: min is overrated (negative = ranked too high)
xg_most_overrated_team = comp_12_2024.loc[xg_most_overrated_idx, 'XGB Pred']
xg_most_overrated_acc = comp_12_2024.loc[xg_most_overrated_idx, 'XGB acc']
xg_most_underrated_idx = xg_acc_numeric.idxmax()  # Changed: max is underrated (positive = ranked too low)
xg_most_underrated_team = comp_12_2024.loc[xg_most_underrated_idx, 'XGB Pred']
xg_most_underrated_acc = comp_12_2024.loc[xg_most_underrated_idx, 'XGB acc']

# Count XGB playoff teams
xg_top12_predicted = set(xg_12_2024['Team'].iloc[:12].values)
xg_playoff_correct = len(xg_top12_predicted.intersection(actual_top12))

# Print Statistics
print(f"RF Average Off-By: {rf_avg_off:.2f}")
print(f"Most Overrated: {rf_most_overrated_team} ({abs(int(rf_most_overrated_acc))} spots lower)")
print(f"Most Underrated: {rf_most_underrated_team} ({abs(int(rf_most_underrated_acc))} spots higher)")
print(f"Playoff Teams Predicted: {rf_playoff_correct}/12")

print(f"\nXGB Average Off-By: {xg_avg_off:.2f}")
print(f"Most Overrated: {xg_most_overrated_team} ({abs(int(xg_most_overrated_acc))} spots lower)")
print(f"Most Underrated: {xg_most_underrated_team} ({abs(int(xg_most_underrated_acc))} spots higher)")
print(f"Playoff Teams Predicted: {xg_playoff_correct}/12")

RF Average Off-By: 3.25
Most Overrated: Tennessee (8 spots lower)
Most Underrated: Arizona State (6 spots higher)
Playoff Teams Predicted: 10/12

XGB Average Off-By: 3.87
Most Overrated: Tennessee (8 spots lower)
Most Underrated: Miami (FL) (9 spots higher)
Playoff Teams Predicted: 10/12


In [252]:
rf_12_2025 = prediction(df, 2025, "Final_12", "RandomForest")
xg_12_2025 = prediction(df, 2025, "Final_12", "XGBoost")

real_12_2025 = pd.DataFrame({
    "Rank": [
        1, 2, 3, 4, 5,
        6, 7, 8, 9, 10,
        11, 12, 13, 14, 
        15, 16, 17
    ],
    "Team": [
        "Indiana",
        "Ohio State",
        "Georgia",
        "Texas Tech",
        
        "Oregon",
        "Ole Miss",
        "Texas A&M",
        "Oklahoma",
        "Alabama",
        "Miami (FL)",
        "Tulane",
        "James Madison",
        
        "Notre Dame",
        "BYU",
        "Texas",
        "Vanderbilt",
        "Utah",
    ]
})

# Start fresh with actual rankings
comp_12_2025 = real_12_2025.copy()
comp_12_2025['Rank'] = comp_12_2025['Rank']

# Get RF predictions
comp_12_2025['RF Pred'] = rf_12_2025['Team'].iloc[:17].values

# Calculate RF accuracy: Find actual seed of predicted team
comp_12_2025['RF acc'] = comp_12_2025.apply(
    lambda row: real_12_2025[real_12_2025['Team'] == row['RF Pred']]['Rank'].values[0] - row['Rank']
    if row['RF Pred'] in real_12_2025['Team'].values else None,
    axis=1
)

# Format RF accuracy
comp_12_2025['RF acc'] = comp_12_2025['RF acc'].apply(
    lambda x: f"+{int(x)}" if x > 0 else (f"{int(x)}" if x < 0 else "0") if pd.notna(x) else "N/A"
)

# Get XGB predictions
comp_12_2025['XGB Pred'] = xg_12_2025['Team'].iloc[:17].values

# Calculate XGB accuracy: Find actual seed of predicted team
comp_12_2025['XGB acc'] = comp_12_2025.apply(
    lambda row: real_12_2025[real_12_2025['Team'] == row['XGB Pred']]['Rank'].values[0] - row['Rank']
    if row['XGB Pred'] in real_12_2025['Team'].values else None,
    axis=1
)

# Format XGB accuracy
comp_12_2025['XGB acc'] = comp_12_2025['XGB acc'].apply(
    lambda x: f"+{int(x)}" if x > 0 else (f"{int(x)}" if x < 0 else "0") if pd.notna(x) else "N/A"
)

# Select and reorder columns
comp_12_2025 = comp_12_2025[['Team', 'Rank', 'RF Pred', 'RF acc', 'XGB Pred', 'XGB acc']]

comp_12_2025.head(15)


Unnamed: 0,Team,Rank,RF Pred,RF acc,XGB Pred,XGB acc
0,Indiana,1,Georgia,2,Georgia,2.0
1,Ohio State,2,BYU,12,Ohio State,0.0
2,Georgia,3,Indiana,-2,BYU,11.0
3,Texas Tech,4,Texas Tech,0,Texas Tech,0.0
4,Oregon,5,Texas A&M,2,Oregon,0.0
5,Ole Miss,6,Oregon,-1,Indiana,-5.0
6,Texas A&M,7,Ohio State,-5,Miami (FL),3.0
7,Oklahoma,8,Miami (FL),2,Notre Dame,5.0
8,Alabama,9,Ole Miss,-3,Oklahoma,-1.0
9,Miami (FL),10,Oklahoma,-2,Texas A&M,-3.0


In [253]:
# Convert accuracy back to numeric for calculations
rf_acc_numeric = comp_12_2025['RF acc'].apply(
    lambda x: int(x) if x not in ['N/A', None] else None
)
xg_acc_numeric = comp_12_2025['XGB acc'].apply(
    lambda x: int(x) if x not in ['N/A', None] else None
)

# RF Statistics
rf_avg_off = rf_acc_numeric.abs().mean()
rf_most_overrated_idx = rf_acc_numeric.idxmin()  # min is overrated (negative = ranked too high)
rf_most_overrated_team = comp_12_2025.loc[rf_most_overrated_idx, 'RF Pred']
rf_most_overrated_acc = comp_12_2025.loc[rf_most_overrated_idx, 'RF acc']
rf_most_underrated_idx = rf_acc_numeric.idxmax()  # max is underrated (positive = ranked too low)
rf_most_underrated_team = comp_12_2025.loc[rf_most_underrated_idx, 'RF Pred']
rf_most_underrated_acc = comp_12_2025.loc[rf_most_underrated_idx, 'RF acc']

# Count RF playoff teams (top 12 actual teams that appear in top 12 predictions)
rf_top12_predicted = set(rf_12_2025['Team'].iloc[:12].values)
actual_top12 = set(real_12_2025['Team'].iloc[:12].values)
rf_playoff_correct = len(rf_top12_predicted.intersection(actual_top12))

# XGB Statistics
xg_avg_off = xg_acc_numeric.abs().mean()
xg_most_overrated_idx = xg_acc_numeric.idxmin()  # min is overrated (negative = ranked too high)
xg_most_overrated_team = comp_12_2025.loc[xg_most_overrated_idx, 'XGB Pred']
xg_most_overrated_acc = comp_12_2025.loc[xg_most_overrated_idx, 'XGB acc']
xg_most_underrated_idx = xg_acc_numeric.idxmax()  # max is underrated (positive = ranked too low)
xg_most_underrated_team = comp_12_2025.loc[xg_most_underrated_idx, 'XGB Pred']
xg_most_underrated_acc = comp_12_2025.loc[xg_most_underrated_idx, 'XGB acc']

# Count XGB playoff teams
xg_top12_predicted = set(xg_12_2025['Team'].iloc[:12].values)
xg_playoff_correct = len(xg_top12_predicted.intersection(actual_top12))

# Print Statistics
print(f"RF Average Off-By: {rf_avg_off:.2f}")
print(f"Most Overrated: {rf_most_overrated_team} ({abs(int(rf_most_overrated_acc))} spots lower)")
print(f"Most Underrated: {rf_most_underrated_team} ({abs(int(rf_most_underrated_acc))} spots higher)")
print(f"Playoff Teams Predicted: {rf_playoff_correct}/12")

print(f"\nXGB Average Off-By: {xg_avg_off:.2f}")
print(f"Most Overrated: {xg_most_overrated_team} ({abs(int(xg_most_overrated_acc))} spots lower)")
print(f"Most Underrated: {xg_most_underrated_team} ({abs(int(xg_most_underrated_acc))} spots higher)")
print(f"Playoff Teams Predicted: {xg_playoff_correct}/12")


RF Average Off-By: 2.62
Most Overrated: Ohio State (5 spots lower)
Most Underrated: BYU (12 spots higher)
Playoff Teams Predicted: 11/12

XGB Average Off-By: 3.07
Most Overrated: Ole Miss (6 spots lower)
Most Underrated: BYU (11 spots higher)
Playoff Teams Predicted: 10/12


In [261]:
# Final_4

rf_12_2025 = prediction(df, 2025, "Final_4", "RandomForest")
xg_12_2025 = prediction(df, 2025, "Final_4", "XGBoost")

real_12_2025 = pd.DataFrame({
    "Rank": [
        1, 2, 3, 4, 5,
        6, 7, 8, 9, 10,
        11, 12
    ],
    "Team": [
        "Indiana",
        "Ohio State",
        "Georgia",
        "Texas Tech",
        
        "Oregon",
        "Ole Miss",
        "Texas A&M",
        "Oklahoma",
        "Alabama",
        "Miami (FL)",
        "Tulane",
        "James Madison"
    ]
})

# Start fresh with actual rankings
comp_12_2025 = real_12_2025.copy()
comp_12_2025['Rank'] = comp_12_2025['Rank']

# Get RF predictions
comp_12_2025['RF Pred'] = rf_12_2025['Team'].iloc[:12].values

# Calculate RF accuracy: Find actual seed of predicted team
comp_12_2025['RF acc'] = comp_12_2025.apply(
    lambda row: real_12_2025[real_12_2025['Team'] == row['RF Pred']]['Rank'].values[0] - row['Rank']
    if row['RF Pred'] in real_12_2025['Team'].values else None,
    axis=1
)

# Format RF accuracy
comp_12_2025['RF acc'] = comp_12_2025['RF acc'].apply(
    lambda x: f"+{int(x)}" if x > 0 else (f"{int(x)}" if x < 0 else "0") if pd.notna(x) else "N/A"
)

# Get XGB predictions
comp_12_2025['XGB Pred'] = xg_12_2025['Team'].iloc[:12].values

# Calculate XGB accuracy: Find actual seed of predicted team
comp_12_2025['XGB acc'] = comp_12_2025.apply(
    lambda row: real_12_2025[real_12_2025['Team'] == row['XGB Pred']]['Rank'].values[0] - row['Rank']
    if row['XGB Pred'] in real_12_2025['Team'].values else None,
    axis=1
)

# Format XGB accuracy
comp_12_2025['XGB acc'] = comp_12_2025['XGB acc'].apply(
    lambda x: f"+{int(x)}" if x > 0 else (f"{int(x)}" if x < 0 else "0") if pd.notna(x) else "N/A"
)

# Select and reorder columns
comp_12_2025 = comp_12_2025[['Team', 'Rank', 'RF Pred', 'RF acc', 'XGB Pred', 'XGB acc']]

comp_12_2025.head(6)


Unnamed: 0,Team,Rank,RF Pred,RF acc,XGB Pred,XGB acc
0,Indiana,1,Indiana,0,Indiana,0.0
1,Ohio State,2,Ohio State,0,Ohio State,0.0
2,Georgia,3,Texas A&M,4,Georgia,0.0
3,Texas Tech,4,Georgia,-1,Texas Tech,0.0
4,Oregon,5,Texas Tech,-1,Texas A&M,2.0
5,Ole Miss,6,Oregon,-1,BYU,


In [262]:
# Convert accuracy back to numeric for calculations
rf_acc_numeric = comp_12_2025['RF acc'].apply(
    lambda x: int(x) if x not in ['N/A', None] else None
)
xg_acc_numeric = comp_12_2025['XGB acc'].apply(
    lambda x: int(x) if x not in ['N/A', None] else None
)

# RF Statistics
rf_avg_off = rf_acc_numeric.abs().mean()
rf_most_overrated_idx = rf_acc_numeric.idxmin()  # min is overrated (negative = ranked too high)
rf_most_overrated_team = comp_12_2025.loc[rf_most_overrated_idx, 'RF Pred']
rf_most_overrated_acc = comp_12_2025.loc[rf_most_overrated_idx, 'RF acc']
rf_most_underrated_idx = rf_acc_numeric.idxmax()  # max is underrated (positive = ranked too low)
rf_most_underrated_team = comp_12_2025.loc[rf_most_underrated_idx, 'RF Pred']
rf_most_underrated_acc = comp_12_2025.loc[rf_most_underrated_idx, 'RF acc']

# Count RF playoff teams (top 12 actual teams that appear in top 12 predictions)
rf_top12_predicted = set(rf_12_2025['Team'].iloc[:6].values)
actual_top12 = set(real_12_2025['Team'].iloc[:6].values)
rf_playoff_correct = len(rf_top12_predicted.intersection(actual_top12))

# XGB Statistics
xg_avg_off = xg_acc_numeric.abs().mean()
xg_most_overrated_idx = xg_acc_numeric.idxmin()  # min is overrated (negative = ranked too high)
xg_most_overrated_team = comp_12_2025.loc[xg_most_overrated_idx, 'XGB Pred']
xg_most_overrated_acc = comp_12_2025.loc[xg_most_overrated_idx, 'XGB acc']
xg_most_underrated_idx = xg_acc_numeric.idxmax()  # max is underrated (positive = ranked too low)
xg_most_underrated_team = comp_12_2025.loc[xg_most_underrated_idx, 'XGB Pred']
xg_most_underrated_acc = comp_12_2025.loc[xg_most_underrated_idx, 'XGB acc']

# Count XGB playoff teams
xg_top12_predicted = set(xg_12_2025['Team'].iloc[:12].values)
xg_playoff_correct = len(xg_top12_predicted.intersection(actual_top12))

# Print Statistics
print(f"RF Average Off-By: {rf_avg_off:.2f}")
print(f"Most Overrated: {rf_most_overrated_team} ({abs(int(rf_most_overrated_acc))} spots lower)")
print(f"Most Underrated: {rf_most_underrated_team} ({abs(int(rf_most_underrated_acc))} spots higher)")
print(f"Playoff Teams Predicted: {rf_playoff_correct}/6")

print(f"\nXGB Average Off-By: {xg_avg_off:.2f}")
print(f"Most Overrated: {xg_most_overrated_team} ({abs(int(xg_most_overrated_acc))} spots lower)")
print(f"Most Underrated: {xg_most_underrated_team} ({abs(int(xg_most_underrated_acc))} spots higher)")
print(f"Playoff Teams Predicted: {xg_playoff_correct}/6")


RF Average Off-By: 1.67
Most Overrated: Ole Miss (3 spots lower)
Most Underrated: James Madison (5 spots higher)
Playoff Teams Predicted: 5/6

XGB Average Off-By: 2.00
Most Overrated: Oregon (4 spots lower)
Most Underrated: James Madison (5 spots higher)
Playoff Teams Predicted: 6/6


In [263]:
rf_4_2024 = prediction(df, 2024, "Final_4", "RandomForest")

xg_4_2024 = prediction(df, 2024, "Final_4", "XGBoost")
real_4_2024 = pd.DataFrame({
    "Seed": [
        1, 2, 3, 4
    ],
    "Team": [
        "Ohio State",
        "Notre Dame",
        "Texas",
        "Penn State",
    ]
})

comp_4_2024 = pd.DataFrame()

comp_4_2024['Rank'] = real_4_2024['Seed']
comp_4_2024['Actual'] = real_4_2024['Team']

comp_4_2024['RF %'] = (rf_4_2024['Probability'].iloc[:4].values * 100).round(0).astype(int)
comp_4_2024['RF Pred'] = (
    rf_4_2024['Team'].iloc[:4].values
    + " ("
    + comp_4_2024['RF %'].astype(str)
    + "%)"
)

comp_4_2024['XGB %'] = (xg_4_2024['Probability'].iloc[:4].values * 100).round(0).astype(int)
comp_4_2024['XGB Pred'] = (
    xg_4_2024['Team'].iloc[:4].values
    + " ("
    + comp_4_2024['XGB %'].astype(str)
    + "%)"
)


comp_4_2024 = comp_4_2024[['Rank', 'Actual', 'RF Pred', 'XGB Pred']]


comp_4_2024.head()


Unnamed: 0,Rank,Actual,RF Pred,XGB Pred
0,1,Ohio State,Oregon (100%),Oregon (100%)
1,2,Notre Dame,Georgia (96%),Georgia (100%)
2,3,Texas,Penn State (90%),Penn State (100%)
3,4,Penn State,Indiana (66%),Notre Dame (98%)
