# Nate Silver Elo Predictions

Uses Nate Silver's Elo values and predicts directly from them, with no input of my own.

Predictions to act as a benchmark.

In [1]:
import pandas as pd
import yaml
import duckdb as db
import warnings
from fuzzywuzzy import process
warnings.filterwarnings('ignore')

# Read the YAML configuration and pull out the two directories used below.
with open('config.yaml', 'r') as file:
    config_file = yaml.safe_load(file)

# `.get` yields None for a key that is absent from the config.
data_dir, output_dir = map(config_file.get, ("data_dir", "output_dir"))



In [2]:
# Load the sample submission; its 'ID' column is parsed into season and team IDs below.
submission_df = pd.read_csv(f'{data_dir}/Kaggle/SampleSubmissionStage2.csv')

def extract_game_info(id_str):
    """Parse a submission ID into its integer components.

    The ID is split on underscores and the first three pieces are converted
    to integers, returned as ``(year, teamID1, teamID2)``.
    """
    fields = id_str.split('_')
    return int(fields[0]), int(fields[1]), int(fields[2])

# Parse every submission ID and store the pieces as three new columns.
submission_df[['Season', 'TeamID1', 'TeamID2']] = [
    extract_game_info(game_id) for game_id in submission_df['ID']
]

In [31]:
# Alternative inputs (disabled): the non-combined Nate Silver rating files.
# nate_men = pd.read_csv(f'{data_dir}/Nate/Mens.csv', index_col=0)
# nate_women = pd.read_csv(f'{data_dir}/Nate/Womens.csv', index_col=0)

# Nate Silver rating tables for the men's and women's teams.
nate_men = pd.read_csv(f'{data_dir}/Nate/Mens_Combined.csv')
nate_women = pd.read_csv(f'{data_dir}/Nate/Womens_Combined.csv')

# Kaggle spelling tables; used below to map rating-table names to TeamIDs.
MTeamSpellings = pd.read_csv(f"{data_dir}/Kaggle/MTeamSpellings.csv")
WTeamSpellings = pd.read_csv(f"{data_dir}/Kaggle/WTeamSpellings.csv")

In [32]:
def name_match(nate_df, names_df):
    """Attach the best fuzzy-matched Kaggle spelling to each rated team.

    Args:
        nate_df: DataFrame with a ``Team`` column holding the names to match.
            It is modified in place.
        names_df: DataFrame with a ``TeamNameSpelling`` column holding the
            candidate spellings.

    Returns:
        ``nate_df`` with two added columns: ``kaggle_name`` (the chosen
        spelling) and ``fuzz_score`` (the score reported for that match).
    """
    choices = list(names_df["TeamNameSpelling"])
    # Each lookup scans every candidate spelling, so run it once per team and
    # reuse the (name, score) pair for both output columns.
    best_matches = [process.extractOne(team, choices) for team in nate_df['Team']]
    nate_df['kaggle_name'] = [match[0] for match in best_matches]
    nate_df['fuzz_score'] = [match[1] for match in best_matches]
    return nate_df

In [33]:
nate_men = name_match(nate_men, MTeamSpellings)
# Review the weakest fuzzy matches first; wrong pairings surface at the top.
display(nate_men[['kaggle_name', 'Team', 'fuzz_score']].sort_values('fuzz_score'))
# Manual corrections for the mismatches seen in this data:
#   * Miami University (OH) was matched to 'uni'
#   * Nate's Saint Francis is the PA school, but it was matched to the NY one
nate_men['kaggle_name'] = nate_men['kaggle_name'].replace({
    'uni': 'miami (oh)',
    'saint francis (ny)': 'saint francis (pa)',
})

Unnamed: 0,kaggle_name,Team,fuzz_score
284,texas rio grande valley,UT Rio Grande Valley,88
148,uni,Miami University (OH),90
223,queens (nc),Queens,90
209,umass,UMass (Amherst),90
363,mississippi valley state,Mississippi Valley St.,93
...,...,...,...
117,bryant,Bryant,100
116,wofford,Wofford,100
115,cal state northridge,Cal State Northridge,100
113,notre dame,Notre Dame,100


In [34]:
nate_women = name_match(nate_women, WTeamSpellings)
# Review the weakest fuzzy matches first; wrong pairings surface at the top.
display(nate_women[['kaggle_name', 'Team', 'fuzz_score']].sort_values('fuzz_score'))
# Manual correction: as with the men, Miami University (OH) was matched to 'uni'.
nate_women['kaggle_name'] = nate_women['kaggle_name'].replace({'uni': 'miami (oh)'})

Unnamed: 0,kaggle_name,Team,fuzz_score
257,texas rio grande valley,UT Rio Grande Valley,88
322,queens (nc),Queens,90
154,umass,UMass (Amherst),90
157,uni,Miami University (OH),90
334,southeast missouri state,Southeast Missouri St.,93
...,...,...,...
116,lamar,Lamar,100
115,illinois st,Illinois St.,100
114,vermont,Vermont,100
134,albany,Albany,100


In [35]:
# Attach Kaggle TeamIDs by joining each ratings table to its spelling table,
# tagging the rows with the competition they belong to.
M_merged = nate_men.merge(
    MTeamSpellings, how='left', left_on='kaggle_name', right_on='TeamNameSpelling'
).assign(gender='M')
W_merged = nate_women.merge(
    WTeamSpellings, how='left', left_on='kaggle_name', right_on='TeamNameSpelling'
).assign(gender='W')
# Stack men and women into one table covering all TeamIDs.
All_merged = pd.concat([M_merged, W_merged], ignore_index=True)

In [36]:
# Expose the 'SBCB (Bayesian)' column as 'Current Elo', the name the rating
# lookup below reads.
# NOTE(review): presumably only the combined files carry this column — confirm.
All_merged.rename(columns={'SBCB (Bayesian)': 'Current Elo'}, inplace=True)

In [37]:
# Create a dictionary for quick lookup of Elo ratings by TeamID
elo_dict = All_merged.set_index('TeamID')['Current Elo'].to_dict()

# Map the Elo ratings onto both team columns of the submission
submission_df['TeamID1_Elo'] = submission_df['TeamID1'].map(elo_dict)
submission_df['TeamID2_Elo'] = submission_df['TeamID2'].map(elo_dict)

# Fill missing values with 9999 - these would be teams that aren't in the Nate
# data or whose names were mismatched. The result is assigned back rather than
# using fillna(inplace=True) on a column selection: under pandas Copy-on-Write
# that form leaves submission_df untouched, so the check below would pass even
# though ratings are missing.
submission_df['TeamID1_Elo'] = submission_df['TeamID1_Elo'].fillna(9999)
submission_df['TeamID2_Elo'] = submission_df['TeamID2_Elo'].fillna(9999)

# Every team must have a rating, so no row may carry the 9999 sentinel.
# Raised explicitly so the check also runs when assertions are disabled.
if len(submission_df.query('TeamID1_Elo == 9999 or TeamID2_Elo == 9999')) != 0:
    raise AssertionError("There are teams with missing ELO ratings")

In [38]:
# This should be replaced with goto conversion thing
def calc_elo_win(A, B):
    """Return the probability that the side rated ``A`` beats the side rated ``B``.

    Logistic Elo curve on a 400-point scale: equal ratings give 0.5.
    """
    rating_gap = (B - A) / 400
    return 1 / (1 + 10 ** rating_gap)

# differences in team strength are typically more apparent in the tournament, 
# and the model accounts for this, too. An additional multiplier of 1.07x is applied 
# to the Elo ratings difference between the teams in forecasting margins of 
# victory and win probabilities in the tournament.
def calc_elo_win_tourney(A, B, boost=1.07):
    """Return the win probability for ``A`` over ``B`` with a scaled rating gap.

    Same logistic curve as the plain Elo formula, except the rating
    difference is multiplied by ``boost`` before being placed on the
    400-point scale.
    """
    scaled_gap = (B - A) * boost / 400
    return 1 / (1 + 10 ** scaled_gap)

# Tournament win probability of TeamID1 over TeamID2 for every matchup.
# The formula is plain arithmetic, so whole columns are passed in at once
# instead of calling it row by row through DataFrame.apply.
submission_df['Team1_win_prob'] = calc_elo_win_tourney(
    submission_df['TeamID1_Elo'], submission_df['TeamID2_Elo']
)

In [39]:
finalsub = submission_df[['ID', 'Team1_win_prob']]
# Work on an explicit copy so that adding and dropping columns operates on an
# independent frame rather than on a slice of submission_df.
readable_finalsub = submission_df[['ID', 'TeamID1', 'TeamID2', 'TeamID1_Elo', 'TeamID2_Elo', 'Team1_win_prob']].copy()
# Build the TeamID -> spelling lookup once and reuse it for both teams.
team_names = All_merged.set_index('TeamID')['TeamNameSpelling'].to_dict()
readable_finalsub['TeamName1'] = readable_finalsub['TeamID1'].map(team_names)
readable_finalsub['TeamName2'] = readable_finalsub['TeamID2'].map(team_names)
readable_finalsub = readable_finalsub.drop(columns=['TeamID1', 'TeamID2'])
# Last expression of the cell: preview the first rows with names in front.
readable_finalsub[['ID', 'TeamName1', 'TeamName2', 'TeamID1_Elo', 'TeamID2_Elo', 'Team1_win_prob']].head()

Unnamed: 0,ID,TeamName1,TeamName2,TeamID1_Elo,TeamID2_Elo,Team1_win_prob
0,2025_1101_1102,abilene christian,air force,1467.915,1266.23,0.775957
1,2025_1101_1103,abilene christian,akron,1467.915,1651.156,0.244409
2,2025_1101_1104,abilene christian,alabama,1467.915,2035.864,0.029363
3,2025_1101_1105,abilene christian,alabama a&m,1467.915,1078.902,0.916529
4,2025_1101_1106,abilene christian,alabama st,1467.915,1353.811,0.668811


In [40]:
# Submission file: the matchup ID plus the Team1 win probability as 'Pred'.
Output = pd.DataFrame({
    'ID': submission_df['ID'],
    'Pred': submission_df['Team1_win_prob'],
})
Output.to_csv(f'{output_dir}/NateEloProbs_SCBC.csv', index=False)

In [54]:
top3seedsM = [1120, 1277, 1235, 1181, 1104, 1458, 1222, 1397, 1246, 1196, 1385, 1403]
# Listed team in TeamID1 against an unlisted TeamID2: force probability 1.
submission_df.loc[
    submission_df['TeamID1'].isin(top3seedsM) & ~submission_df['TeamID2'].isin(top3seedsM),
    'Team1_win_prob',
] = 1
# Listed team in TeamID2 against an unlisted TeamID1: force probability 0.
submission_df.loc[
    submission_df['TeamID2'].isin(top3seedsM) & ~submission_df['TeamID1'].isin(top3seedsM),
    'Team1_win_prob',
] = 0

In [55]:
top3seedsW = womens_team_ids = [3417, 3301, 3261, 3376, 3181, 3314, 3400, 3395, 3323, 3425, 3328, 3163]
# Listed team in TeamID1 against an unlisted TeamID2: force probability 1.
submission_df.loc[
    submission_df['TeamID1'].isin(top3seedsW) & ~submission_df['TeamID2'].isin(top3seedsW),
    'Team1_win_prob',
] = 1
# Listed team in TeamID2 against an unlisted TeamID1: force probability 0.
submission_df.loc[
    submission_df['TeamID2'].isin(top3seedsW) & ~submission_df['TeamID1'].isin(top3seedsW),
    'Team1_win_prob',
] = 0

In [59]:
# Submission file after the top-seed overrides: ID plus probability as 'Pred'.
Output = pd.DataFrame({
    'ID': submission_df['ID'],
    'Pred': submission_df['Team1_win_prob'],
})
Output.to_csv(f'{output_dir}/NateEloProbs_SCBC_top3win.csv', index=False)