# March Madness 2025

In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.utils.data import TensorDataset, DataLoader
import os
from sklearn.model_selection import train_test_split
import random
from data import Data, STATS_COLUMNS
from model import *

torch.manual_seed(20250222)
random.seed(20250222)

device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
print(f"Using {device} device")

Using cuda device


## Hypothesis
Each team can be modeled by x hidden features. In each game, these hidden features interact in a nonlinear fashion to determine the outcome of the game

## Preparing the data
Load the data

In [2]:
dataset = Data()

dataset.games.describe()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,NumOT,WFGM,WFGA,WFGM3,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
count,198374.0,198374.0,198374.0,198374.0,198374.0,198374.0,198374.0,198374.0,198374.0,198374.0,...,198374.0,198374.0,198374.0,198374.0,198374.0,198374.0,198374.0,198374.0,198374.0,198374.0
mean,2015.470621,69.843291,2099.847868,74.183169,2097.450588,61.187026,0.061787,26.176339,57.063405,6.912005,...,19.248818,11.436922,16.826656,10.826832,21.949363,11.217125,15.21463,6.453946,2.848942,18.853504
std,6.024751,35.933736,986.382716,11.406085,989.676138,11.373007,0.287403,4.811306,7.828931,3.16658,...,6.325219,5.239163,6.987616,4.418293,4.708807,3.765042,5.028571,2.985335,2.037092,4.587468
min,2003.0,0.0,1101.0,30.0,1101.0,11.0,0.0,9.0,26.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0
25%,2011.0,37.0,1260.0,66.0,1253.0,53.0,0.0,23.0,52.0,5.0,...,15.0,8.0,12.0,8.0,19.0,9.0,12.0,4.0,1.0,16.0
50%,2016.0,73.0,1413.0,74.0,1407.0,61.0,0.0,26.0,57.0,7.0,...,19.0,11.0,16.0,10.0,22.0,11.0,15.0,6.0,3.0,19.0
75%,2020.0,101.0,3244.0,81.0,3245.0,69.0,0.0,29.0,62.0,9.0,...,23.0,15.0,21.0,14.0,25.0,14.0,18.0,8.0,4.0,22.0
max,2025.0,132.0,3480.0,149.0,3480.0,144.0,6.0,58.0,113.0,30.0,...,80.0,48.0,65.0,38.0,53.0,34.0,49.0,26.0,21.0,47.0


The x's will be the indexes of two team IDs and program IDs, the y's will include and indicator of who won at the game stats

In [3]:
train_loader, validation_loader = dataset.train_test_data()

Loading cached data


## The Model
Define the model. Combine the embeddings for the two teams, go to a hidden layer, and then output to a prediction if the first team won

In [4]:
model = Model(embedding_sizes=[32, 512], model_sizes=(128,128), dropout=0.1, dataset=dataset).to(device)

## Training the model

Train the model

In [5]:
train(train_loader, validation_loader, model, learning_rate=0.001)

Initial: Accuracy: 50.00%, Stats loss: 900.542225 Result loss: 0.491335
Epoch 0
Train: Accuracy: 59.68%, Stats loss: 43.871659 Result loss: 0.238777
Test: Accuracy: 59.79%, Stats loss: 43.645016 Result loss: 0.238606
Epoch 1
Train: Accuracy: 66.47%, Stats loss: 48.879215 Result loss: 0.207571
Test: Accuracy: 66.76%, Stats loss: 48.705380 Result loss: 0.207083
Epoch 2
Train: Accuracy: 65.82%, Stats loss: 49.180272 Result loss: 0.207492
Test: Accuracy: 66.00%, Stats loss: 49.269932 Result loss: 0.207627
Epoch 3
Train: Accuracy: 67.89%, Stats loss: 49.363735 Result loss: 0.201123
Test: Accuracy: 67.77%, Stats loss: 49.558864 Result loss: 0.201325
Epoch 4
Train: Accuracy: 68.69%, Stats loss: 54.810863 Result loss: 0.197570
Test: Accuracy: 68.56%, Stats loss: 55.146440 Result loss: 0.198501
Epoch 5
Train: Accuracy: 69.60%, Stats loss: 50.945645 Result loss: 0.192993
Test: Accuracy: 69.42%, Stats loss: 51.367047 Result loss: 0.195058
Epoch 6
Train: Accuracy: 70.52%, Stats loss: 50.317189 Res

Fine tune with only the result

In [6]:
train(train_loader, validation_loader, model, learning_rate=0.001, full_loss=False)

Initial: Accuracy: 74.01%, Stats loss: 39.610087 Result loss: 0.171235
Epoch 0
Train: Accuracy: 76.31%, Stats loss: 403.544225 Result loss: 0.157939
Test: Accuracy: 73.70%, Stats loss: 402.612701 Result loss: 0.174861
Epoch 1
Train: Accuracy: 77.47%, Stats loss: 434.315869 Result loss: 0.153397
Test: Accuracy: 73.92%, Stats loss: 433.725209 Result loss: 0.172661
Epoch 2
Train: Accuracy: 77.92%, Stats loss: 467.682951 Result loss: 0.151341
Test: Accuracy: 73.89%, Stats loss: 466.704220 Result loss: 0.172733
Epoch 3
Train: Accuracy: 78.17%, Stats loss: 469.347803 Result loss: 0.150020
Test: Accuracy: 73.86%, Stats loss: 468.621429 Result loss: 0.173113
Epoch 4
Train: Accuracy: 78.30%, Stats loss: 476.897721 Result loss: 0.149150
Test: Accuracy: 73.50%, Stats loss: 475.995043 Result loss: 0.175208
Best Loss: 0.171235


With this model we can predict the output of about three quarters of regular season games.

## Load the tourney data to test with

In [7]:
tourney_dataset = dataset.gen_dataset(dataset.tourney)
tourney_loader = DataLoader(tourney_dataset, batch_size=500, shuffle=True)

In [8]:
test(tourney_loader, model, device, label="Tourney")

Tourney: Accuracy: 73.62%, Stats loss: 39.430088 Result loss: 0.170269


0.17026904279620309

When it comes to tournament results we get a little worse. The lower result is likely due to teams having increased pairity.

Train with early torney data

In [9]:
tourney_df = dataset.tourney[dataset.tourney.Season < 2021]

tourney_train_df, tourney_validation_df = train_test_split(tourney_df, train_size=0.8)
tourney_train_data = dataset.gen_dataset(tourney_train_df)
tourney_validation_data = dataset.gen_dataset(tourney_validation_df)

tourney_train_loader = DataLoader(tourney_train_data, batch_size=500)
tourney_validation_loader = DataLoader(tourney_validation_data, batch_size=500)

In [10]:
for param in model.team_embedding.parameters():
    param.requires_grad=False
for param in model.program_embedding.parameters():
    param.requires_grad=False

In [11]:
train(tourney_train_loader, tourney_validation_loader, model, learning_rate=0.001, full_loss=False)

Initial: Accuracy: 73.50%, Stats loss: 41.984778 Result loss: 0.168976
Epoch 0
Train: Accuracy: 57.59%, Stats loss: 58.302074 Result loss: 0.328471
Test: Accuracy: 57.74%, Stats loss: 61.795610 Result loss: 0.326404
Epoch 1
Train: Accuracy: 71.81%, Stats loss: 98.962744 Result loss: 0.181236
Test: Accuracy: 71.06%, Stats loss: 104.226268 Result loss: 0.182257
Epoch 2
Train: Accuracy: 66.58%, Stats loss: 141.380307 Result loss: 0.220946
Test: Accuracy: 67.19%, Stats loss: 147.537067 Result loss: 0.220095
Epoch 3
Train: Accuracy: 72.89%, Stats loss: 176.158320 Result loss: 0.169897
Test: Accuracy: 72.35%, Stats loss: 183.206390 Result loss: 0.173002
Epoch 4
Train: Accuracy: 75.14%, Stats loss: 198.926360 Result loss: 0.165202
Test: Accuracy: 73.35%, Stats loss: 206.084670 Result loss: 0.169739
Best Loss: 0.168976


### Performance by year


In [12]:
for season in dataset.tourney.Season.unique():
    loader = dataset.tourney_data(year=season)
    test(loader, model, device, label=f"{season} Tournament")

2003 Tournament: Accuracy: 68.75%, Stats loss: 37.168748 Result loss: 0.181188
2004 Tournament: Accuracy: 70.31%, Stats loss: 38.417288 Result loss: 0.180705
2005 Tournament: Accuracy: 74.22%, Stats loss: 40.892869 Result loss: 0.163052
2006 Tournament: Accuracy: 67.19%, Stats loss: 39.711059 Result loss: 0.205839
2007 Tournament: Accuracy: 75.00%, Stats loss: 41.874363 Result loss: 0.162287
2008 Tournament: Accuracy: 78.91%, Stats loss: 41.898771 Result loss: 0.161820
2009 Tournament: Accuracy: 69.53%, Stats loss: 39.143316 Result loss: 0.172674
2010 Tournament: Accuracy: 73.23%, Stats loss: 39.925324 Result loss: 0.173990
2011 Tournament: Accuracy: 73.85%, Stats loss: 37.580139 Result loss: 0.172666
2012 Tournament: Accuracy: 77.69%, Stats loss: 38.135547 Result loss: 0.157112
2013 Tournament: Accuracy: 73.46%, Stats loss: 40.224741 Result loss: 0.178893
2014 Tournament: Accuracy: 72.31%, Stats loss: 37.432570 Result loss: 0.167395
2015 Tournament: Accuracy: 78.08%, Stats loss: 39.81

In [13]:
stage1_loader = dataset.tourney_data(after=2021)
test(stage1_loader, model, device=device, label=f"Stage 1")

Stage 1: Accuracy: 72.13%, Stats loss: 37.648741 Result loss: 0.179146


0.17914557941092962

Breaking out by league

In [14]:
for season in dataset.tourney.Season.unique():
    for league in dataset.tourney[dataset.tourney.Season == season].League.unique():
        loader = dataset.tourney_data(year=season, league=league)
        test(loader, model, device, label=f"{season} {league} Tournament")

2003 M Tournament: Accuracy: 68.75%, Stats loss: 37.168748 Result loss: 0.181188
2004 M Tournament: Accuracy: 70.31%, Stats loss: 38.417288 Result loss: 0.180705
2005 M Tournament: Accuracy: 74.22%, Stats loss: 40.892869 Result loss: 0.163052
2006 M Tournament: Accuracy: 67.19%, Stats loss: 39.711059 Result loss: 0.205839
2007 M Tournament: Accuracy: 75.00%, Stats loss: 41.874363 Result loss: 0.162287
2008 M Tournament: Accuracy: 78.91%, Stats loss: 41.898771 Result loss: 0.161820
2009 M Tournament: Accuracy: 69.53%, Stats loss: 39.143316 Result loss: 0.172674
2010 M Tournament: Accuracy: 69.53%, Stats loss: 38.580878 Result loss: 0.202173
2010 W Tournament: Accuracy: 76.98%, Stats loss: 41.291110 Result loss: 0.145359
2011 M Tournament: Accuracy: 68.66%, Stats loss: 35.315611 Result loss: 0.222710
2011 W Tournament: Accuracy: 79.37%, Stats loss: 39.988447 Result loss: 0.119445
2012 M Tournament: Accuracy: 70.15%, Stats loss: 31.975261 Result loss: 0.192931
2012 W Tournament: Accuracy:

## Inspect the model
First what are the sizes of the smallest input and output weights

In [15]:
print(f"Program embedding min: {model.program_embedding.state_dict()['weight'].abs().max(axis=0).values.min().item():>8f}")
print(f"Team embedding min: {model.team_embedding.state_dict()['weight'].abs().max(axis=0).values.min().item():>8f}")
print(f"FC min: {model.result_fc.state_dict()['weight'].abs().max(axis=0).values.min().item():>8f}")

Program embedding min: 2.575979
Team embedding min: 3.709198
FC min: 0.000062


Calculate the average gradient for each input feature

In [16]:
for param in model.team_embedding.parameters():
    param.requires_grad=True
for param in model.program_embedding.parameters():
    param.requires_grad=True

In [17]:
program_weights, team_weights, stats_weights = feature_eval(model, tourney_loader)

In [18]:
program_weights.abs().sum().item(), team_weights.abs().sum().item()

(0.015867117792367935, 0.011381457559764385)

In [19]:
print(f"Year:\t{stats_weights[0]:>4f}")
print(f"Game:\t{stats_weights[1]:>4f}")
print(f"League:\t{stats_weights[2]:>4f}")

Year:	0.055380
Game:	-0.025323
League:	0.211779


## Generating the submission file
### Phase 2

Write the results

In [20]:
odds = model_odds(dataset, 2025, 'M', model)

In [21]:
gen_submission(model, dataset)

## Save the model

In [22]:
torch.save(model.state_dict(), 'model.pth')

## Moderated model

Moderate a model by pushing it towards 0.5

In [23]:
moderated = ModeratedModel(model, 0.75)

In [24]:
for season in dataset.tourney.Season.unique():
    loader = dataset.tourney_data(season)
    test(loader, moderated, label=f"{season} Tournament")

2003 Tournament: Accuracy: 68.75%, Stats loss: 117.639871 Result loss: 0.181188
2004 Tournament: Accuracy: 70.31%, Stats loss: 117.118838 Result loss: 0.180705
2005 Tournament: Accuracy: 74.22%, Stats loss: 120.296368 Result loss: 0.163052
2006 Tournament: Accuracy: 67.19%, Stats loss: 114.069967 Result loss: 0.205839
2007 Tournament: Accuracy: 75.00%, Stats loss: 124.222712 Result loss: 0.162287
2008 Tournament: Accuracy: 78.91%, Stats loss: 123.015729 Result loss: 0.161820
2009 Tournament: Accuracy: 69.53%, Stats loss: 121.336625 Result loss: 0.172674
2010 Tournament: Accuracy: 73.23%, Stats loss: 117.896776 Result loss: 0.173990
2011 Tournament: Accuracy: 73.85%, Stats loss: 113.407064 Result loss: 0.172666
2012 Tournament: Accuracy: 77.69%, Stats loss: 113.280889 Result loss: 0.157112
2013 Tournament: Accuracy: 73.46%, Stats loss: 116.745498 Result loss: 0.178893
2014 Tournament: Accuracy: 72.31%, Stats loss: 114.201846 Result loss: 0.167395
2015 Tournament: Accuracy: 78.08%, Stats

## Dig into 2023 results

In [25]:
loader = dataset.tourney_data(2023)

x, y = loader.dataset.tensors

preds = model(x.to(device))

In [26]:
t_2023 = pd.DataFrame({'winner_name': [dataset.all_teams.loc[dataset.programs.loc[i].TeamID].TeamName for i in x[:,0].tolist()],
                       'loser_name': [dataset.all_teams.loc[dataset.programs.loc[i].TeamID].TeamName for i in x[:,2].tolist()],
                       'winner': [dataset.programs.loc[i].TeamID for i in x[:,0].tolist()],
                       'loser': [dataset.programs.loc[i].TeamID for i in x[:,2].tolist()],
                       'actual': y[:,0].reshape([-1]),
                       'predicted': np.array(preds[0].tolist()).reshape([-1])}).iloc[:67]

In [27]:
t_2023[t_2023.predicted < 0.5].sort_values('predicted')

Unnamed: 0,winner_name,loser_name,winner,loser,actual,predicted
23,F Dickinson,Purdue,1192,1345,1.0,0.01118
15,Princeton,Arizona,1343,1112,1.0,0.078412
8,Furman,Virginia,1202,1438,1.0,0.182489
57,Miami FL,Houston,1274,1222,1.0,0.23325
53,FL Atlantic,Tennessee,1194,1397,1.0,0.256479
58,San Diego St,Alabama,1361,1104,1.0,0.267418
63,San Diego St,Creighton,1361,1166,1.0,0.267589
50,Michigan St,Marquette,1277,1266,1.0,0.295179
39,Princeton,Missouri,1343,1281,1.0,0.299903
54,Gonzaga,UCLA,1211,1417,1.0,0.300744


The biggest thing in this season were the huge upsets in the first round. Purdue was a number one seed and lost which I only gave a .4% chance to happen. Arizona and Virginia were number 2 seeds and lost which I gave 7% and 15% chances of happening respectively.

In [28]:
t_2023['Upset'] = [dataset.upset(2023, winner, loser) for (winner, loser) in zip(t_2023['winner'], t_2023['loser'])]

In [29]:
t_2023[t_2023.Upset].predicted.mean()

np.float64(0.33145307625109366)

On average the upsets had a 32% chance of happening

In [30]:
t_2023[t_2023.Upset & (t_2023.predicted >= 0.5)].sort_values('predicted', ascending=False)

Unnamed: 0,winner_name,loser_name,winner,loser,actual,predicted,Upset
45,Creighton,Baylor,1166,1124,1.0,0.586516,True
60,Connecticut,Gonzaga,1163,1211,1.0,0.522066,True


I correctly predicted 2 upsets, though all were closely ranked

In [31]:
t_2023[~t_2023.Upset & (t_2023.predicted < 0.5)].sort_values('predicted')

Unnamed: 0,winner_name,loser_name,winner,loser,actual,predicted,Upset
63,San Diego St,Creighton,1361,1166,1.0,0.267589,False
12,Missouri,Utah St,1281,1429,1.0,0.3924,False
28,Kentucky,Providence,1246,1344,1.0,0.393311,False
0,Pittsburgh,Mississippi St,1338,1280,1.0,0.396287,False
5,Arkansas,Illinois,1116,1228,1.0,0.411795,False
48,Kansas St,Kentucky,1243,1246,1.0,0.419272,False
65,San Diego St,FL Atlantic,1361,1194,1.0,0.425434,False
59,Texas,Xavier,1400,1462,1.0,0.442409,False
55,Kansas St,Michigan St,1243,1277,1.0,0.446159,False
2,Arizona St,Nevada,1113,1305,1.0,0.449551,False


I also incorrectly predicted 4 upsets

Looking at all the tourneys

In [32]:
x, y = tourney_loader.dataset.tensors
preds = model(x.to(device))
tourney_df = pd.DataFrame({'season': x[:,4].tolist(),
                           'winner_name': [dataset.all_teams.loc[dataset.programs.loc[i].TeamID].TeamName for i in x[:,0].tolist()],
                           'loser_name': [dataset.all_teams.loc[dataset.programs.loc[i].TeamID].TeamName for i in x[:,2].tolist()],
                           'winner': [dataset.programs.loc[i].TeamID for i in x[:,0].tolist()],
                           'loser': [dataset.programs.loc[i].TeamID for i in x[:,2].tolist()],
                           'actual': y[:,0].reshape([-1]),
                           'predicted': np.array(preds[0].tolist()).reshape([-1])})
tourney_df = tourney_df[tourney_df.actual == 1.0]
tourney_df['Upset'] = [dataset.upset(season, winner, loser) for (winner, loser, season)
                       in zip(tourney_df['winner'], tourney_df['loser'], tourney_df['season'])]

In [33]:
len(tourney_df[tourney_df.Upset & (tourney_df.predicted >= 0.5) & (tourney_df.season > 2020)].sort_values('predicted', ascending=False))

17

In [34]:
len(tourney_df[~tourney_df.Upset & (tourney_df.predicted < 0.5) & (tourney_df.season > 2020)].sort_values('predicted'))

62

Overall I predicted 19 upsets correctly, and 30 incorrectly

## Predicting by seeds
What if I predict just using the seeds?

In [35]:
odds = dataset.odds_by_seed_diff(before=2021)

In [36]:
dataset.tourney_df(after=2021).SeedDiff.map(lambda x: odds[x]**2).mean()

np.float64(0.18527460145235355)

This results in a test Brier score of about 0.185.

## Hybrid Model
Building a model using the neural net and seeds

In [37]:
seed_model = SeedModel(dataset, after=2021)
test(stage1_loader, seed_model, label=f"Seeds")

Seeds: Accuracy: 72.22%, Stats loss: 821.501009 Result loss: 0.182851


0.1828505115610905

In [38]:
test(stage1_loader, model, label="NN")

NN: Accuracy: 72.13%, Stats loss: 37.648741 Result loss: 0.179146


0.17914557941092962

In [39]:
hybrid = HybridModel([model, seed_model], [0.8, 0.2])

In [40]:
test(stage1_loader, hybrid, label=f"Hybrid")

Hybrid: Accuracy: 73.35%, Stats loss: 821.501009 Result loss: 0.176930


0.17693026790728852

They hybrid model outperforms both individual models

In [41]:
for season in range(2021, 2025):
    for league in ['M', 'W']:
        loader = dataset.tourney_data(season, league)
        test(loader, hybrid, label=f"{season} {league} Tournament")

2021 M Tournament: Accuracy: 72.09%, Stats loss: 799.078627 Result loss: 0.181126
2021 W Tournament: Accuracy: 72.09%, Stats loss: 799.078627 Result loss: 0.181126
2022 M Tournament: Accuracy: 73.51%, Stats loss: 813.262527 Result loss: 0.179342
2022 W Tournament: Accuracy: 73.51%, Stats loss: 813.262527 Result loss: 0.179342
2023 M Tournament: Accuracy: 71.64%, Stats loss: 815.193230 Result loss: 0.187688
2023 W Tournament: Accuracy: 71.64%, Stats loss: 815.193230 Result loss: 0.187688
2024 M Tournament: Accuracy: 76.12%, Stats loss: 857.632996 Result loss: 0.159722
2024 W Tournament: Accuracy: 76.12%, Stats loss: 857.632996 Result loss: 0.159722


## Generate a bracket

In [None]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None): 
    print(gen_bracket(dataset, 2024, 'M', hybrid).join(dataset.all_teams, on='Winner')[['Winner', 'TeamName']])