# March Madness 2025

In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.utils.data import TensorDataset, DataLoader
import os
from sklearn.model_selection import train_test_split
import random
from data import Data, STATS_COLUMNS
import model

torch.manual_seed(20250222)
random.seed(20250222)

device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
print(f"Using {device} device")

Using cuda device


## Hypothesis
Each team can be modeled by x hidden features. In each game, these hidden features interact in a nonlinear fashion to determine the outcome of the game

## Preparing the data
Load the data

In [2]:
dataset = Data()

dataset.games.describe()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,NumOT,WFGM,WFGA,WFGM3,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
count,198374.0,198374.0,198374.0,198374.0,198374.0,198374.0,198374.0,198374.0,198374.0,198374.0,...,198374.0,198374.0,198374.0,198374.0,198374.0,198374.0,198374.0,198374.0,198374.0,198374.0
mean,2015.470621,69.843291,2099.847868,74.183169,2097.450588,61.187026,0.061787,26.176339,57.063405,6.912005,...,19.248818,11.436922,16.826656,10.826832,21.949363,11.217125,15.21463,6.453946,2.848942,18.853504
std,6.024751,35.933736,986.382716,11.406085,989.676138,11.373007,0.287403,4.811306,7.828931,3.16658,...,6.325219,5.239163,6.987616,4.418293,4.708807,3.765042,5.028571,2.985335,2.037092,4.587468
min,2003.0,0.0,1101.0,30.0,1101.0,11.0,0.0,9.0,26.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0
25%,2011.0,37.0,1260.0,66.0,1253.0,53.0,0.0,23.0,52.0,5.0,...,15.0,8.0,12.0,8.0,19.0,9.0,12.0,4.0,1.0,16.0
50%,2016.0,73.0,1413.0,74.0,1407.0,61.0,0.0,26.0,57.0,7.0,...,19.0,11.0,16.0,10.0,22.0,11.0,15.0,6.0,3.0,19.0
75%,2020.0,101.0,3244.0,81.0,3245.0,69.0,0.0,29.0,62.0,9.0,...,23.0,15.0,21.0,14.0,25.0,14.0,18.0,8.0,4.0,22.0
max,2025.0,132.0,3480.0,149.0,3480.0,144.0,6.0,58.0,113.0,30.0,...,80.0,48.0,65.0,38.0,53.0,34.0,49.0,26.0,21.0,47.0


The x's will be the indexes of two team IDs and program IDs, the y's will include and indicator of who won at the game stats

## Predicting Stats
The first model we build will predict statistics. This will allow us to build up embeddings for the teams that can later be used to predict outcomes

In [3]:
stats_train_loader, stats_validation_loader = dataset.train_test_data(output_stats=True, cache="stats")

Loading cached data


In [4]:
stats_model = model.StatsModel(program_embedding=512, team_embedding=64,
                               num_programs=len(dataset.programs), num_teams=len(dataset.teams),
                               model_sizes=(256,256), dropout=0.5).to(device)

In [5]:
model.train(stats_train_loader, stats_validation_loader, stats_model, name="stats_model", learning_rate=0.001, streak=10, use_cache=True)

Loading from cache


## The Model
Now we use the embeddings developed in the first model to build a model to predict if the first team won

In [6]:
result_train_loader, result_validation_loader = dataset.train_test_data(output_stats=False, cache="result")

Loading cached data


In [7]:
result_model = model.Model(matchup=stats_model.matchup, model_sizes=(128,128), dropout=0.5).to(device)

## Training the model

Train the model

In [8]:
model.train(result_train_loader, result_validation_loader, result_model, name="result_model", learning_rate=0.0001, streak=10)

Epoch   0: Train Loss=0.17416289, Test Loss=0.17399674
Epoch   1: Train Loss=0.18298500, Test Loss=0.18309844
Epoch   2: Train Loss=0.17056150, Test Loss=0.17109695
Epoch   3: Train Loss=0.16769204, Test Loss=0.16861580
Epoch   4: Train Loss=0.16473803, Test Loss=0.16611724
Epoch   5: Train Loss=0.16384807, Test Loss=0.16556161
Epoch   6: Train Loss=0.16467655, Test Loss=0.16659407
Epoch   7: Train Loss=0.16303603, Test Loss=0.16541864
Epoch   8: Train Loss=0.16168486, Test Loss=0.16450060
Epoch   9: Train Loss=0.16138623, Test Loss=0.16457177
Epoch  10: Train Loss=0.16175516, Test Loss=0.16508303
Epoch  11: Train Loss=0.16049947, Test Loss=0.16429859
Epoch  12: Train Loss=0.16060611, Test Loss=0.16469360
Epoch  13: Train Loss=0.16215174, Test Loss=0.16624779
Epoch  14: Train Loss=0.16392777, Test Loss=0.16798995
Epoch  15: Train Loss=0.16185694, Test Loss=0.16642702
Epoch  16: Train Loss=0.16137794, Test Loss=0.16628523
Epoch  17: Train Loss=0.16255178, Test Loss=0.16743262
Epoch  18:

In [9]:
model.test_accuracy(result_validation_loader, result_model)

0.753629471307013

With this model we can predict the output of about three quarters of regular season games.

## Load the tourney data to test with

In [10]:
tourney_dataset = dataset.gen_dataset(dataset.tourney)
tourney_loader = DataLoader(tourney_dataset, batch_size=500, shuffle=True)

In [11]:
model.test_accuracy(tourney_loader, result_model)

0.7451669595782073

When it comes to tournament results we get a little worse. The lower result is likely due to teams having increased pairity.

Train with early torney data

In [12]:
tourney_df = dataset.tourney[dataset.tourney.Season < 2021]

tourney_train_df, tourney_validation_df = train_test_split(tourney_df, train_size=0.8)
tourney_train_data = dataset.gen_dataset(tourney_train_df)
tourney_validation_data = dataset.gen_dataset(tourney_validation_df)

tourney_train_loader = DataLoader(tourney_train_data, batch_size=500)
tourney_validation_loader = DataLoader(tourney_validation_data, batch_size=500)

In [13]:
result_model.matchup.freeze()

In [14]:
model.train(tourney_train_loader, tourney_validation_loader, result_model, name="tuned_model", learning_rate=0.0001)

Epoch   0: Train Loss=0.17159439, Test Loss=0.17480975
Epoch   1: Train Loss=0.16836352, Test Loss=0.17151254
Epoch   2: Train Loss=0.16773524, Test Loss=0.17069791
Epoch   3: Train Loss=0.16864450, Test Loss=0.17119237
Epoch   4: Train Loss=0.16928691, Test Loss=0.17167473
Best Loss: 0.170050


### Performance by year


In [15]:
for season in dataset.tourney.Season.unique():
    loader = dataset.tourney_data(year=season)
    model.print_results(loader, result_model, label=f"{season} Tournament")

2003 Tournament: Accuracy=74.22, Loss=0.173521
2004 Tournament: Accuracy=74.22, Loss=0.179849
2005 Tournament: Accuracy=71.88, Loss=0.181795
2006 Tournament: Accuracy=73.44, Loss=0.207049
2007 Tournament: Accuracy=79.69, Loss=0.168712
2008 Tournament: Accuracy=78.91, Loss=0.165924
2009 Tournament: Accuracy=80.47, Loss=0.159149
2010 Tournament: Accuracy=75.20, Loss=0.167792
2011 Tournament: Accuracy=72.31, Loss=0.171724
2012 Tournament: Accuracy=80.00, Loss=0.154173
2013 Tournament: Accuracy=69.62, Loss=0.176480
2014 Tournament: Accuracy=73.85, Loss=0.161705
2015 Tournament: Accuracy=79.23, Loss=0.145956
2016 Tournament: Accuracy=77.69, Loss=0.171147
2017 Tournament: Accuracy=78.08, Loss=0.157052
2018 Tournament: Accuracy=70.00, Loss=0.174825
2019 Tournament: Accuracy=71.92, Loss=0.152619
2021 Tournament: Accuracy=69.77, Loss=0.182787
2022 Tournament: Accuracy=74.25, Loss=0.174591
2023 Tournament: Accuracy=70.52, Loss=0.192326
2024 Tournament: Accuracy=75.37, Loss=0.161786


In [16]:
stage1_loader = dataset.tourney_data(after=2021)
model.print_results(stage1_loader, result_model, label=f"Stage 1")

Stage 1: Accuracy=72.50, Loss=0.177826


Breaking out by league

In [17]:
for season in dataset.tourney.Season.unique():
    for league in dataset.tourney[dataset.tourney.Season == season].League.unique():
        loader = dataset.tourney_data(year=season, league=league)
        model.print_results(loader, result_model, label=f"{season} {league} Tournament")

2003 M Tournament: Accuracy=74.22, Loss=0.173521
2004 M Tournament: Accuracy=74.22, Loss=0.179849
2005 M Tournament: Accuracy=71.88, Loss=0.181795
2006 M Tournament: Accuracy=73.44, Loss=0.207049
2007 M Tournament: Accuracy=79.69, Loss=0.168712
2008 M Tournament: Accuracy=78.91, Loss=0.165924
2009 M Tournament: Accuracy=80.47, Loss=0.159149
2010 M Tournament: Accuracy=72.66, Loss=0.197992
2010 W Tournament: Accuracy=77.78, Loss=0.137113
2011 M Tournament: Accuracy=65.67, Loss=0.214979
2011 W Tournament: Accuracy=79.37, Loss=0.125723
2012 M Tournament: Accuracy=71.64, Loss=0.186713
2012 W Tournament: Accuracy=88.89, Loss=0.119566
2013 M Tournament: Accuracy=62.69, Loss=0.207005
2013 W Tournament: Accuracy=76.98, Loss=0.144016
2014 M Tournament: Accuracy=67.91, Loss=0.195535
2014 W Tournament: Accuracy=80.16, Loss=0.125727
2015 M Tournament: Accuracy=76.87, Loss=0.166086
2015 W Tournament: Accuracy=81.75, Loss=0.124548
2016 M Tournament: Accuracy=76.12, Loss=0.184937
2016 W Tournament: A

## Inspect the model
First what are the sizes of the smallest input and output weights

In [18]:
print(f"Program embedding min: {result_model.matchup.program_embedding.state_dict()['weight'].abs().max(axis=0).values.min().item():>8f}")
print(f"Team embedding min: {result_model.matchup.team_embedding.state_dict()['weight'].abs().max(axis=0).values.min().item():>8f}")
print(f"FC min: {result_model.fc3.state_dict()['weight'].abs().max(axis=0).values.min().item():>8f}")

Program embedding min: 2.537224
Team embedding min: 3.735119
FC min: 0.003115


Calculate the average gradient for each input feature

In [19]:
result_model.matchup.unfreeze()

In [20]:
program_weights, team_weights, stats_weights = model.feature_eval(result_model, tourney_loader)

In [21]:
program_weights.abs().sum().item(), team_weights.abs().sum().item()

(0.00030536422855220735, 3.5212542570661753e-05)

In [22]:
print(f"Year:\t{stats_weights[0]:>4f}")
print(f"Game:\t{stats_weights[1]:>4f}")
print(f"League:\t{stats_weights[2]:>4f}")

Year:	-0.000048
Game:	-0.000101
League:	0.000589


## Generating the submission file
### Phase 2

Write the results

In [23]:
odds = model.model_odds(dataset, 2025, 'M', result_model)

In [24]:
model.gen_submission(result_model, dataset)

## Save the model

In [25]:
torch.save(result_model.state_dict(), 'model.pth')

## Moderated model

Moderate a model by pushing it towards 0.5

In [26]:
moderated = model.ModeratedModel(result_model, 0.75)

In [27]:
for season in dataset.tourney.Season.unique():
    loader = dataset.tourney_data(season)
    model.print_results(loader, moderated, label=f"{season} Tournament")

2003 Tournament: Accuracy=74.22, Loss=0.179495
2004 Tournament: Accuracy=74.22, Loss=0.186399
2005 Tournament: Accuracy=71.88, Loss=0.185481
2006 Tournament: Accuracy=73.44, Loss=0.205928
2007 Tournament: Accuracy=79.69, Loss=0.178531
2008 Tournament: Accuracy=78.91, Loss=0.175410
2009 Tournament: Accuracy=80.47, Loss=0.170452
2010 Tournament: Accuracy=75.20, Loss=0.174236
2011 Tournament: Accuracy=72.31, Loss=0.176522
2012 Tournament: Accuracy=80.00, Loss=0.164660
2013 Tournament: Accuracy=69.62, Loss=0.181334
2014 Tournament: Accuracy=73.85, Loss=0.169469
2015 Tournament: Accuracy=79.23, Loss=0.156552
2016 Tournament: Accuracy=77.69, Loss=0.175631
2017 Tournament: Accuracy=78.08, Loss=0.164811
2018 Tournament: Accuracy=70.00, Loss=0.177822
2019 Tournament: Accuracy=71.92, Loss=0.161673
2021 Tournament: Accuracy=69.77, Loss=0.184600
2022 Tournament: Accuracy=74.25, Loss=0.179702
2023 Tournament: Accuracy=70.52, Loss=0.193584
2024 Tournament: Accuracy=75.37, Loss=0.169081


## Dig into 2023 results

In [28]:
loader = dataset.tourney_data(2023)

x, y = loader.dataset.tensors

preds = result_model(x.to(device))

In [29]:
t_2023 = pd.DataFrame({'winner_name': [dataset.all_teams.loc[dataset.programs.loc[i].TeamID].TeamName for i in x[:,0].tolist()],
                       'loser_name': [dataset.all_teams.loc[dataset.programs.loc[i].TeamID].TeamName for i in x[:,2].tolist()],
                       'winner': [dataset.programs.loc[i].TeamID for i in x[:,0].tolist()],
                       'loser': [dataset.programs.loc[i].TeamID for i in x[:,2].tolist()],
                       'actual': y.reshape(-1),
                       'predicted': preds.cpu().detach().numpy().reshape(-1)}).iloc[:67]

In [30]:
t_2023[t_2023.predicted < 0.5].sort_values('predicted')

Unnamed: 0,winner_name,loser_name,winner,loser,actual,predicted
23,F Dickinson,Purdue,1192,1345,1.0,0.016843
15,Princeton,Arizona,1343,1112,1.0,0.101849
8,Furman,Virginia,1202,1438,1.0,0.119518
53,FL Atlantic,Tennessee,1194,1397,1.0,0.229568
39,Princeton,Missouri,1343,1281,1.0,0.249924
37,Arkansas,Kansas,1116,1242,1.0,0.327663
61,FL Atlantic,Kansas St,1194,1243,1.0,0.355363
57,Miami FL,Houston,1274,1222,1.0,0.35701
24,FL Atlantic,Memphis,1194,1272,1.0,0.36331
14,Penn St,Texas A&M,1336,1401,1.0,0.371114


The biggest thing in this season were the huge upsets in the first round. Purdue was a number one seed and lost which I only gave a .4% chance to happen. Arizona and Virginia were number 2 seeds and lost which I gave 7% and 15% chances of happening respectively.

In [31]:
t_2023['Upset'] = [dataset.upset(2023, winner, loser) for (winner, loser) in zip(t_2023['winner'], t_2023['loser'])]

In [32]:
t_2023[t_2023.Upset].predicted.mean()

np.float64(0.3478138957087081)

On average the upsets had a 32% chance of happening

In [33]:
t_2023[t_2023.Upset & (t_2023.predicted >= 0.5)].sort_values('predicted', ascending=False)

Unnamed: 0,winner_name,loser_name,winner,loser,actual,predicted,Upset
45,Creighton,Baylor,1166,1124,1.0,0.544255,True
54,Gonzaga,UCLA,1211,1417,1.0,0.508251,True


I correctly predicted 2 upsets, though all were closely ranked

In [34]:
t_2023[~t_2023.Upset & (t_2023.predicted < 0.5)].sort_values('predicted')

Unnamed: 0,winner_name,loser_name,winner,loser,actual,predicted,Upset
63,San Diego St,Creighton,1361,1166,1.0,0.40904,False
0,Pittsburgh,Mississippi St,1338,1280,1.0,0.428284,False
48,Kansas St,Kentucky,1243,1246,1.0,0.446092,False
34,TCU,Arizona St,1395,1113,1.0,0.45736,False
5,Arkansas,Illinois,1116,1228,1.0,0.472131,False
12,Missouri,Utah St,1281,1429,1.0,0.482999,False
59,Texas,Xavier,1400,1462,1.0,0.486456,False


I also incorrectly predicted 4 upsets

Looking at all the tourneys

In [35]:
x, y = tourney_loader.dataset.tensors
preds = result_model(x.to(device))
tourney_df = pd.DataFrame({'season': x[:,4].tolist(),
                           'winner_name': [dataset.all_teams.loc[dataset.programs.loc[i].TeamID].TeamName for i in x[:,0].tolist()],
                           'loser_name': [dataset.all_teams.loc[dataset.programs.loc[i].TeamID].TeamName for i in x[:,2].tolist()],
                           'winner': [dataset.programs.loc[i].TeamID for i in x[:,0].tolist()],
                           'loser': [dataset.programs.loc[i].TeamID for i in x[:,2].tolist()],
                           'actual': y.reshape([-1]),
                           'predicted': np.array(preds.tolist()).reshape([-1])})
tourney_df = tourney_df[tourney_df.actual == 1.0]
tourney_df['Upset'] = [dataset.upset(season, winner, loser) for (winner, loser, season)
                       in zip(tourney_df['winner'], tourney_df['loser'], tourney_df['season'])]

In [36]:
len(tourney_df[tourney_df.Upset & (tourney_df.predicted >= 0.5) & (tourney_df.season > 2020)].sort_values('predicted', ascending=False))

21

In [37]:
len(tourney_df[~tourney_df.Upset & (tourney_df.predicted < 0.5) & (tourney_df.season > 2020)].sort_values('predicted'))

40

Overall I predicted 19 upsets correctly, and 30 incorrectly

## Predicting by seeds
What if I predict just using the seeds?

In [38]:
odds = dataset.odds_by_seed_diff(before=2021)

In [39]:
dataset.tourney_df(after=2021).SeedDiff.map(lambda x: odds[x]**2).mean()

np.float64(0.18527460145235355)

This results in a test Brier score of about 0.185.

## Hybrid Model
Building a model using the neural net and seeds

In [40]:
seed_model = model.SeedModel(dataset, after=2021)
model.test(stage1_loader, seed_model)

0.1828505115610905

In [41]:
model.test(stage1_loader, result_model)

0.17782618509477946

In [42]:
hybrid = model.HybridModel([result_model, seed_model], [0.8, 0.2])

In [43]:
model.test(stage1_loader, hybrid)

0.17653593524947686

They hybrid model outperforms both individual models

In [44]:
for season in range(2021, 2025):
    for league in ['M', 'W']:
        loader = dataset.tourney_data(season, league)
        model.print_results(loader, hybrid, label=f"{season} Tournament")

2021 Tournament: Accuracy=69.38, Loss=0.182805
2021 Tournament: Accuracy=69.38, Loss=0.182805
2022 Tournament: Accuracy=72.76, Loss=0.174390
2022 Tournament: Accuracy=72.76, Loss=0.174390
2023 Tournament: Accuracy=71.27, Loss=0.189143
2023 Tournament: Accuracy=71.27, Loss=0.189143
2024 Tournament: Accuracy=76.87, Loss=0.160040
2024 Tournament: Accuracy=76.87, Loss=0.160040


## Generate a bracket

In [45]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None): 
    print(model.gen_bracket(dataset, 2024, 'M', hybrid).join(dataset.all_teams, on='Winner')[['Winner', 'TeamName']])

      Winner        TeamName
Slot                        
R1W1    1163     Connecticut
R1W2    1235         Iowa St
R1W3    1228        Illinois
R1W4    1120          Auburn
R1W5    1361    San Diego St
R1W6    1140             BYU
R1W7    1450   Washington St
R1W8    1194     FL Atlantic
R1X1    1314  North Carolina
R1X2    1112         Arizona
R1X3    1124          Baylor
R1X4    1104         Alabama
R1X5    1388    St Mary's CA
R1X6    1307      New Mexico
R1X7    1173          Dayton
R1X8    1277     Michigan St
R1Y1    1345          Purdue
R1Y2    1397       Tennessee
R1Y3    1166       Creighton
R1Y4    1242          Kansas
R1Y5    1211         Gonzaga
R1Y6    1376  South Carolina
R1Y7    1400           Texas
R1Y8    1429         Utah St
R1Z1    1222         Houston
R1Z2    1266       Marquette
R1Z3    1246        Kentucky
R1Z4    1181            Duke
R1Z5    1458       Wisconsin
R1Z6    1403      Texas Tech
R1Z7    1196         Florida
R1Z8    1401       Texas A&M
R2W1    1163  