# Complicated Elo Predictor

This notebook is meant to replicate the Elo model that Nate Silver built, following his methodology here: https://www.natesilver.net/p/sbcb-methodology


In [1]:
import pandas as pd
import yaml
import duckdb as db
import warnings

# Read the project settings from config.yaml in the working directory.
with open('config.yaml', 'r') as file:
    config_file = yaml.safe_load(file)
# Assuming the YAML top level is a mapping, .get() yields None for an absent key,
# so a missing entry only surfaces later when these values are used to build paths.
data_dir = config_file.get("data_dir")
output_dir = config_file.get("output_dir")

In [2]:
# Load the sample submission file from the Kaggle folder under data_dir.
submission_df = pd.read_csv(f'{data_dir}/Kaggle/SampleSubmissionStage2.csv')

def extract_game_info(id_str):
    """Split an underscore-separated ID and return its first three fields as ints.

    :param id_str: str, e.g. ``"2025_1101_1102"`` (year, first team, second team)
    :return: tuple ``(year, teamID1, teamID2)`` of ints
    """
    tokens = id_str.split('_')
    # Convert field by field, left to right; any fields past the third are ignored.
    return tuple(int(tokens[position]) for position in range(3))

# Parse every ID with extract_game_info and store the three results as the
# Season, TeamID1 and TeamID2 columns.
submission_df[['Season', 'TeamID1', 'TeamID2']] = submission_df['ID'].apply(extract_game_info).tolist()

In [3]:
# Load the men's and women's team tables with DuckDB.
# Some men's teams leave D1, so we *should* filter them out, but doing so breaks the basic model, so it is skipped here. The mean is still 1500.
# Filtered alternative, kept for reference:
# mensids = db.sql('FROM "./SourceData/Kaggle/MTeams.csv" WHERE LastD1Season = 2025').to_df()
# NOTE(review): these paths are hard-coded relative to the working directory rather than
# built from data_dir like the CSV loads elsewhere in this notebook — confirm they match.
mensids = db.sql('FROM "./SourceData/Kaggle/MTeams.csv"').to_df()

womensids = db.sql('FROM "./SourceData/Kaggle/WTeams.csv" ').to_df()

In [8]:
# Load the women's and men's regular-season compact results from the Kaggle folder under data_dir.
womens_results = pd.read_csv(f'{data_dir}/Kaggle/WRegularSeasonCompactResults.csv')
mens_results = pd.read_csv(f'{data_dir}/Kaggle/MRegularSeasonCompactResults.csv')

In [None]:
# Point 1: Margin of victory: 
# Specifically, the margin of victory factor is calculated as (3 + s) ^ .85, where s is the scoring differential.
# I assume this should be added to the change in elo for the winner/loser

def point_differential_scaler(row, scaler=.85):
    """Return the margin-of-victory factor ``(3 + s) ** scaler`` for one game.

    :param row: mapping-like game record exposing ``'WScore'`` and ``'LScore'``
    :param scaler: float, exponent applied to the shifted margin
    :return: the factor, where ``s`` is ``WScore - LScore``
    """
    margin = row['WScore'] - row['LScore']
    return (3 + margin) ** scaler

In [None]:
# This will be a 10 point scaler to ELO
# This could be improved by also looking at the losing teams' scores; as it stands, only the winning team's score and location are used.
# For calculating margins of victory, one point in a basketball game equals approximately 27 Elo points.

def home_field_advantage_calculator(teams_df, season_cutoff = 2000, values_cutoff = 5):
    """Estimate, per team, how many more points it scores in home wins than in other wins.

    Only winning scores are used: games are grouped by ``WTeamID`` and split by
    ``WLoc`` into home (``'H'``) and away-or-neutral (``'A'``/``'N'``) wins.

    :param teams_df: DataFrame with ``Season``, ``WTeamID``, ``WScore`` and ``WLoc`` columns
    :param season_cutoff: int, earliest season (inclusive) to include
    :param values_cutoff: number, the advantage is clipped to ``[-values_cutoff, values_cutoff]``
    :return: DataFrame with ``TeamID`` and ``HomeAdvantage`` columns, containing only
        teams that have at least one win in both location groups
    """
    recent_games = teams_df[teams_df['Season'] >= season_cutoff]

    def _mean_winning_score(games, label):
        # Average winning score per team, with columns renamed for the merge below.
        averages = games.groupby('WTeamID')['WScore'].mean().reset_index()
        averages.columns = ['TeamID', label]
        return averages

    won_at_home = recent_games['WLoc'] == 'H'
    won_elsewhere = recent_games['WLoc'].isin(['A', 'N'])
    home_points = _mean_winning_score(recent_games[won_at_home], 'HomePoints')
    away_points = _mean_winning_score(recent_games[won_elsewhere], 'AwayPoints')

    # The inner join drops teams that lack wins in either location group.
    points_comparison = home_points.merge(away_points, on='TeamID', how='inner')

    raw_advantage = points_comparison['HomePoints'] - points_comparison['AwayPoints']
    points_comparison['HomeAdvantage'] = raw_advantage.clip(lower=-values_cutoff, upper=values_cutoff)

    return points_comparison[['TeamID', 'HomeAdvantage']]

# Per-team home advantage for the men's results, using the default season and clipping cutoffs.
HFA_df = home_field_advantage_calculator(mens_results)

In [None]:
# Travel distance: 8 * m^(⅓) added to the home court advantage

In [None]:
# Mean reversion:
# Empirically, the degree of mean reversion from year to year is growing — in other words, 
# teams are less likely to sustain their success — probably because the best players typically
# leave for the NBA after one or two years in college; even elite programs now rarely maintain
# dominance with the same core of talent. Currently, a team’s rating is reverted by 30-35 percent
# toward the mean at the start of each new season.
# Should revert to the mean of the conference ratings
# The Bayesian model uses pre-season ratings, but this would be a bit harder to implement
# "partly on preseason rankings in the AP (media) and Coaches Polls."


In [None]:
# K factor: Specifically, we use a k-factor of 38; this number has no intrinsic meaning and is derived empirically.
# However, the k-factor is up to 50 percent higher (so, up to a k-factor of 56) for early-season games, 
# with this diminishing linearly to a k-factor of 38 until a team plays roughly the 20th game of its season.

def k_factor_calculator(game_number, k_factor_start=56, k_factor_end = 38):
    """Return the k-factor for a team's ``game_number``-th game of the season.

    The k-factor starts at ``k_factor_start`` for the first game and drops by one
    point per game played until it reaches ``k_factor_end``, where it stays. With
    the defaults that is 56 for game 1 and 38 from game 19 onwards.

    :param game_number: int, 1-based number of the game within the team's season
    :param k_factor_start: int, the k-factor for the first game
    :param k_factor_end: int, the floor the k-factor settles at
    :return: the k-factor for the game
    """
    # Game numbers below 1 are treated like the first game so the result never
    # exceeds k_factor_start.
    games_elapsed = max(game_number - 1, 0)
    return max(k_factor_start - games_elapsed, k_factor_end)

In [None]:
# NCAA tournament games also receive an additional multiple of 1.25x, tantamount to a k-factor of 47.5
# Not sure if I want to look at NCAA games as well, but maybe
# An additional multiplier of 1.07x is applied to the Elo ratings
# difference between the teams in forecasting margins of victory and win probabilities in the tournament. 

In [None]:
# WOMEN:
# less mean-reversion from season to season
# Home court advantage tends to be slightly less in the women’s game
# the ratio of Elo rating point differences to the point spread is about 25:1 for women as opposed to 27:1 for men

In [None]:
# Composite with https://kenpom.com/? (1.5x)
# Composite with https://sonnymoorepowerratings.com/m-basket.htm?
# https://www.espn.com/mens-college-basketball/bpi? 
# Massey ratings: https://masseyratings.com/cb2024/ncaad1/ratings

# Womens:
# https://herhoopstats.com/stats/ncaa/research/team_single_seasons/?min_season=2025&max_season=2025&division=1&games=all&criteria0=hhs_net_rtg&comp0=ge&threshold0=-100&stats_to_show=summary_advanced&submit=true

SyntaxError: invalid syntax (1495644420.py, line 6)

In [None]:
k_set = 30  # K-factor for Elo rating
# K-factor determines how much the Elo rating changes after each game
# Higher K-factor means more volatility in Elo ratings, lower K-factor means more stable Elo ratings
# The K-factor is usually set between 10 and 40, 30 being the standard
initial_elo_set = 1500  # Initial Elo rating for all teams
# Initial Elo rating is usually set to 1500, but can be set to any value
mean_reversion = .25  # Mean reversion ratio for Elo rating
# Mean reversion ratio determines how much the Elo rating reverts to the initial/mean Elo rating after each season
# Used to reflect the turnover in a sports team, higher mean reversion means more turnover in the team
# 25% of the Elo returning to the mean is a starting point


def update_elo(winner_elo, loser_elo, k=k_set):
    """Return the post-game ratings ``(new_winner_elo, new_loser_elo)``.

    :param winner_elo: rating of the winning team before the game
    :param loser_elo: rating of the losing team before the game
    :param k: k-factor scaling the size of the update
    :return: tuple of the winner's and loser's new ratings; the winner gains
        exactly what the loser gives up
    """
    expected_win = 1 / (1 + 10**((loser_elo - winner_elo) / 400))
    rating_change = k * (1 - expected_win)
    return winner_elo + rating_change, loser_elo - rating_change


def run_basic_elo(season_results_df, ids_df, home_advantage=100):
    """Replay all games season by season and return each team's resulting Elo rating.

    The home side's rating is raised by ``home_advantage`` only while the update
    is computed; the bonus is removed again before the rating is stored, so
    stored ratings never include it. After every season (including the last one
    in the data) each rating is pulled ``mean_reversion`` of the way back to
    ``initial_elo_set``.

    :param season_results_df: DataFrame with ``Season``, ``WTeamID``, ``LTeamID``
        and ``WLoc`` columns; seasons are processed in ascending order and games
        in row order within a season
    :param ids_df: DataFrame whose ``TeamID`` column lists every team to rate
    :param home_advantage: Elo points credited to the home team; ``WLoc == 'H'``
        credits the winner, ``WLoc == 'A'`` credits the loser, anything else
        credits nobody
    :return: DataFrame with ``TeamID`` and ``Elo`` columns, one row per team in
        ``ids_df``; all teams sit at the initial rating when there are no games
    :raises KeyError: if a game references a team that is not in ``ids_df``
    """
    initial_elo = initial_elo_set
    elo_ratings = {team_id: initial_elo for team_id in ids_df['TeamID'].unique()}

    # groupby yields the seasons in ascending order and keeps row order inside each one.
    for _, results_season in season_results_df.groupby('Season'):
        games = zip(
            results_season['WTeamID'],
            results_season['LTeamID'],
            results_season['WLoc'],
        )
        for winner, loser, location in games:
            winner_bonus = home_advantage if location == 'H' else 0
            loser_bonus = home_advantage if location == 'A' else 0
            new_winner_elo, new_loser_elo = update_elo(
                elo_ratings[winner] + winner_bonus,
                elo_ratings[loser] + loser_bonus,
            )
            # Strip the bonus again so only the rating change is persisted.
            elo_ratings[winner] = new_winner_elo - winner_bonus
            elo_ratings[loser] = new_loser_elo - loser_bonus
        # NOTE(review): reversion also runs after the final season, so the returned
        # ratings are already shrunk toward the mean — confirm this is intended.
        elo_ratings = {
            team_id: (1 - mean_reversion) * elo + mean_reversion * initial_elo
            for team_id, elo in elo_ratings.items()
        }

    return pd.DataFrame(list(elo_ratings.items()), columns=['TeamID', 'Elo'])

# Run the Elo model separately on the men's and women's regular-season results.
mens_elo = run_basic_elo(mens_results, mensids)
womens_elo = run_basic_elo(womens_results, womensids)

In [4]:
# Stack the men's and women's ratings into one table.
# NOTE(review): the lookup below assumes TeamIDs never collide between the two tables — confirm.
All_elo = pd.concat([mens_elo, womens_elo], ignore_index=True)

# NOTE(review): this silences every warning for the rest of the session; kept so
# later cells behave as before.
warnings.filterwarnings('ignore')
# Create a dictionary for quick lookup of ELO ratings by TeamID
elo_dict = All_elo.set_index('TeamID')['Elo'].to_dict()

# Map the ELO ratings onto both team columns of submission_df; IDs with no
# rating come back as NaN.
submission_df['TeamID1_Elo'] = submission_df['TeamID1'].map(elo_dict)
submission_df['TeamID2_Elo'] = submission_df['TeamID2'].map(elo_dict)

# Missing values would be teams that aren't in the ratings table or mismatches in IDs.
# Check for NaN directly instead of filling a sentinel through
# `df[col].fillna(..., inplace=True)`: that chained call works on a temporary
# object under pandas Copy-on-Write, so the sentinel check could pass vacuously.
missing_elo = submission_df[['TeamID1_Elo', 'TeamID2_Elo']].isna().any(axis=1)
if missing_elo.any():
    raise ValueError("There are teams with missing ELO ratings")

NameError: name 'mens_elo' is not defined

In [None]:
# Basic ELO win probability calculation
def calc_elo_win(A, B):
    """Return the Elo win probability of the side rated ``A`` against the side rated ``B``.

    :param A: Elo rating of the first side
    :param B: Elo rating of the second side
    :return: ``1 / (1 + 10 ** ((B - A) / 400))``
    """
    scaled_gap = (B - A) / 400
    return 1 / (1 + 10**scaled_gap)
# Row by row, compute the probability that TeamID1 beats TeamID2 from the two mapped Elo columns.
submission_df['Team1_win_prob'] = submission_df.apply(lambda x: calc_elo_win(x['TeamID1_Elo'], x['TeamID2_Elo']), axis=1)

In [None]:
# Keep only the ID and the win probability (renamed to Pred), then write the
# result to output_dir without the index column.
Output = submission_df[['ID', 'Team1_win_prob']].rename(columns={'Team1_win_prob': 'Pred'})
Output.to_csv(f'{output_dir}/BasicEloProbs.csv', index=False)