# Nate Silver Elo Predictions

Uses Nate Silver's Elo values and predicts from them alone, with no input of my own.

Predictions to act as a benchmark.

In [7]:
import pandas as pd
import yaml
import duckdb as db
import warnings
from fuzzywuzzy import process
warnings.filterwarnings('ignore')

# Read the notebook settings from config.yaml in the working directory.
with open('config.yaml', 'r') as file:
    config_file = yaml.safe_load(file)

# Both directories come from the config; .get() yields None for an absent key.
data_dir, output_dir = (config_file.get(key) for key in ("data_dir", "output_dir"))

In [5]:
# Load the Kaggle stage-2 sample submission; its 'ID' column is parsed below
# into season and team ids.
submission_df = pd.read_csv(f'{data_dir}/Kaggle/SampleSubmissionStage2.csv')

def extract_game_info(id_str):
    """Split a submission ID into its integer components.

    Args:
        id_str: Underscore-separated ID such as ``"2025_1101_1102"``.

    Returns:
        A ``(year, teamID1, teamID2)`` tuple of ints taken from the first
        three underscore-separated fields; any further fields are ignored.
    """
    fields = id_str.split('_')
    return int(fields[0]), int(fields[1]), int(fields[2])

# Expand every submission ID into separate Season / TeamID1 / TeamID2 columns.
submission_df[['Season', 'TeamID1', 'TeamID2']] = [
    extract_game_info(game_id) for game_id in submission_df['ID']
]

In [8]:
# Nate Silver's rating tables (men's and women's); the first CSV column is
# used as the DataFrame index.
nate_men = pd.read_csv(f'{data_dir}/Nate/Mens.csv', index_col=0)
nate_women = pd.read_csv(f'{data_dir}/Nate/Womens.csv', index_col=0)

# Kaggle's spelling tables; the 'TeamNameSpelling' and 'TeamID' columns are
# used further down to link a team name to its TeamID.
MTeamSpellings = pd.read_csv(f"{data_dir}/Kaggle/MTeamSpellings.csv")
WTeamSpellings = pd.read_csv(f"{data_dir}/Kaggle/WTeamSpellings.csv")

In [9]:
def name_match(nate_df, names_df):
    """Attach the closest Kaggle team spelling to every team in Nate's table.

    Args:
        nate_df: DataFrame of Nate's ratings with a 'Team' column. It is
            modified in place.
        names_df: DataFrame (not a path) of Kaggle spellings with a
            'TeamNameSpelling' column.

    Returns:
        The same ``nate_df`` object with two added columns: 'kaggle_name'
        (the best fuzzy match among the spellings) and 'fuzz_score' (the
        score fuzzywuzzy reported for that match).
    """
    choices = list(names_df["TeamNameSpelling"])
    # The fuzzy search is the expensive step: run it once per team and reuse
    # the (match, score) result for both columns instead of searching twice.
    best_matches = [process.extractOne(team, choices) for team in nate_df['Team']]
    nate_df['kaggle_name'] = [match[0] for match in best_matches]
    nate_df['fuzz_score'] = [match[1] for match in best_matches]
    return nate_df

In [13]:
nate_men = name_match(nate_men, MTeamSpellings)
# Review the matches from lowest to highest score to spot wrong pairings
display(nate_men[['kaggle_name','Team','fuzz_score']].sort_values('fuzz_score'))
# Manual corrections found during that review:
#   - Miami University (OH) was matched to 'uni'
#   - Saint Francis in Nate's table is the PA school but was matched to the NY one
nate_men['kaggle_name'] = nate_men['kaggle_name'].replace({
    'uni': 'miami (oh)',
    'saint francis (ny)': 'saint francis (pa)',
})

Unnamed: 0,kaggle_name,Team,fuzz_score
300,texas rio grande valley,UT Rio Grande Valley,88
211,umass,UMass (Amherst),90
167,uni,Miami University (OH),90
215,queens (nc),Queens,90
364,mississippi valley state,Mississippi Valley St.,93
...,...,...,...
118,middle tennessee,Middle Tennessee,100
117,tcu,TCU,100
116,uab,UAB,100
114,furman,Furman,100


In [10]:
nate_women = name_match(nate_women, WTeamSpellings)
# Review the matches from lowest to highest score to spot wrong pairings
display(nate_women[['kaggle_name','Team','fuzz_score']].sort_values('fuzz_score'))
# Manual correction found during that review: Miami University (OH) was
# matched to 'uni', the same mistake as in the men's table
nate_women['kaggle_name'] = nate_women['kaggle_name'].replace({'uni': 'miami (oh)'})

Unnamed: 0,kaggle_name,Team,fuzz_score
267,texas rio grande valley,UT Rio Grande Valley,88
154,uni,Miami University (OH),90
153,umass,UMass (Amherst),90
317,queens (nc),Queens,90
334,southeast missouri state,Southeast Missouri St.,93
...,...,...,...
117,bowling green,Bowling Green,100
116,byu,BYU,100
115,rhode island,Rhode Island,100
135,georgetown,Georgetown,100


In [14]:
# Join each ratings table to its Kaggle spelling table (left join, so every
# rated team is kept) to pick up the TeamID, and tag the rows with the gender.
M_merged = nate_men.merge(
    MTeamSpellings, left_on='kaggle_name', right_on='TeamNameSpelling', how='left'
).assign(gender='M')
W_merged = nate_women.merge(
    WTeamSpellings, left_on='kaggle_name', right_on='TeamNameSpelling', how='left'
).assign(gender='W')
# Stack men's and women's rows into a single table holding all team_ids
All_merged = pd.concat([M_merged, W_merged], ignore_index=True)

In [15]:
# Create a dictionary for quick lookup of ELO ratings by TeamID
elo_dict = All_merged.set_index('TeamID')['Current Elo'].to_dict()

# Map the ELO ratings onto both team columns of the submission
submission_df['TeamID1_Elo'] = submission_df['TeamID1'].map(elo_dict)
submission_df['TeamID2_Elo'] = submission_df['TeamID2'].map(elo_dict)

# Fill missing values with 9999 - these would be teams that aren't in the nate
# database or whose names were mismatched. The result is assigned back instead
# of calling fillna(inplace=True) on a column selection: that chained in-place
# call may act on a temporary object (it does under pandas Copy-on-Write),
# which would leave the NaNs in place and let the check below pass vacuously.
submission_df['TeamID1_Elo'] = submission_df['TeamID1_Elo'].fillna(9999)
submission_df['TeamID2_Elo'] = submission_df['TeamID2_Elo'].fillna(9999)

# Check the result: no row may carry the 9999 sentinel. Raised explicitly so
# the check still runs when Python is started with -O (which strips `assert`).
missing_elo = submission_df.query('TeamID1_Elo == 9999 or TeamID2_Elo == 9999')
if len(missing_elo) != 0:
    raise AssertionError("There are teams with missing ELO ratings")

In [16]:
# This should be replaced with goto conversion thing
def calc_elo_win(A, B):
    """Return the win probability of the side rated ``A`` against the side rated ``B``.

    Uses the logistic Elo curve ``1 / (1 + 10 ** ((B - A) / 400))``, so equal
    ratings give 0.5 and a higher ``A`` gives a value closer to 1.
    """
    scaled_gap = (B - A) / 400
    return 1 / (1 + 10 ** scaled_gap)
# calc_elo_win is plain arithmetic, so it can be applied to the two rating
# columns as whole Series in one vectorised call instead of a Python-level
# function call per row via apply(axis=1).
submission_df['Team1_win_prob'] = calc_elo_win(
    submission_df['TeamID1_Elo'], submission_df['TeamID2_Elo']
)

In [17]:
finalsub = submission_df[['ID', 'Team1_win_prob']]

# TeamID -> spelling lookup, built once and reused for both team columns
team_names = All_merged.set_index('TeamID')['TeamNameSpelling'].to_dict()

# Take an explicit copy so the new columns are added to an independent frame
# rather than to a slice of submission_df (the SettingWithCopy pattern).
readable_finalsub = submission_df[['ID', 'TeamID1_Elo', 'TeamID2_Elo', 'Team1_win_prob']].copy()
# The team ids are read straight from submission_df (same index), so they never
# have to be added to the readable table and dropped again.
readable_finalsub['TeamName1'] = submission_df['TeamID1'].map(team_names)
readable_finalsub['TeamName2'] = submission_df['TeamID2'].map(team_names)
readable_finalsub[['ID', 'TeamName1', 'TeamName2', 'TeamID1_Elo', 'TeamID2_Elo', 'Team1_win_prob']].head()

Unnamed: 0,ID,TeamName1,TeamName2,TeamID1_Elo,TeamID2_Elo,Team1_win_prob
0,2025_1101_1102,abilene christian,air force,1482.911,1267.39,0.775675
1,2025_1101_1103,abilene christian,akron,1482.911,1635.409,0.293624
2,2025_1101_1104,abilene christian,alabama,1482.911,2031.764,0.04072
3,2025_1101_1105,abilene christian,alabama a&m,1482.911,1109.851,0.895435
4,2025_1101_1106,abilene christian,alabama st,1482.911,1321.198,0.717257


In [19]:
# Two-column submission frame: the game ID and the Team1 win probability
# under the column name 'Pred'.
Output = pd.DataFrame({
    'ID': submission_df['ID'],
    'Pred': submission_df['Team1_win_prob'],
})
Output.to_csv(f'{output_dir}/NateEloProbs.csv', index=False)