# March Madness 2025

In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.utils.data import TensorDataset, DataLoader
import os
from sklearn.model_selection import train_test_split
import random
from data import Data, STATS_COLUMNS
import model

# One seed for every RNG the notebook touches. NumPy is included because
# scikit-learn's train_test_split draws from NumPy's global RNG when it is not
# given a random_state.
SEED = 20250222
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

# Prefer the generic accelerator API; fall back to a plain CUDA check on
# PyTorch builds that do not ship torch.accelerator, and to the CPU otherwise.
if hasattr(torch, "accelerator") and torch.accelerator.is_available():
    device = torch.accelerator.current_accelerator().type
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda device


## Hypothesis
Each team can be modeled by x hidden features. In each game, these hidden features interact in a nonlinear fashion to determine the outcome of the game

## Preparing the data
Load the data

In [2]:
# Build the project Data object; later cells read its games, tourney,
# programs and teams attributes.
dataset = Data()

# Summary statistics for the numeric columns of dataset.games.
dataset.games.describe()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,NumOT,WFGM,WFGA,WFGM3,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
count,198374.0,198374.0,198374.0,198374.0,198374.0,198374.0,198374.0,198374.0,198374.0,198374.0,...,198374.0,198374.0,198374.0,198374.0,198374.0,198374.0,198374.0,198374.0,198374.0,198374.0
mean,2015.470621,69.843291,2099.847868,74.183169,2097.450588,61.187026,0.061787,26.176339,57.063405,6.912005,...,19.248818,11.436922,16.826656,10.826832,21.949363,11.217125,15.21463,6.453946,2.848942,18.853504
std,6.024751,35.933736,986.382716,11.406085,989.676138,11.373007,0.287403,4.811306,7.828931,3.16658,...,6.325219,5.239163,6.987616,4.418293,4.708807,3.765042,5.028571,2.985335,2.037092,4.587468
min,2003.0,0.0,1101.0,30.0,1101.0,11.0,0.0,9.0,26.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0
25%,2011.0,37.0,1260.0,66.0,1253.0,53.0,0.0,23.0,52.0,5.0,...,15.0,8.0,12.0,8.0,19.0,9.0,12.0,4.0,1.0,16.0
50%,2016.0,73.0,1413.0,74.0,1407.0,61.0,0.0,26.0,57.0,7.0,...,19.0,11.0,16.0,10.0,22.0,11.0,15.0,6.0,3.0,19.0
75%,2020.0,101.0,3244.0,81.0,3245.0,69.0,0.0,29.0,62.0,9.0,...,23.0,15.0,21.0,14.0,25.0,14.0,18.0,8.0,4.0,22.0
max,2025.0,132.0,3480.0,149.0,3480.0,144.0,6.0,58.0,113.0,30.0,...,80.0,48.0,65.0,38.0,53.0,34.0,49.0,26.0,21.0,47.0


The x's will be the indexes of the two teams' team IDs and program IDs; the y's will include an indicator of who won and the game stats.

## Predicting Stats
The first model we build will predict statistics. This will allow us to build up embeddings for the teams that can later be used to predict outcomes

In [3]:
# Train/validation loaders built with output_stats=True. cache="stats" names
# the cache entry — presumably lets the data module reuse a previous build;
# confirm in data.py.
stats_train_loader, stats_validation_loader = dataset.train_test_data(output_stats=True, cache="stats")

Loading cached data


In [4]:
# Hyperparameters of the stats-prediction network, gathered in one place so
# they are easy to scan and tweak.
stats_model_kwargs = dict(
    program_embedding=1024,
    team_embedding=32,
    num_programs=len(dataset.programs),
    num_teams=len(dataset.teams),
    model_sizes=(256, 256),
    dropout=0.5,
)
stats_model = model.StatsModel(**stats_model_kwargs).to(device)

In [5]:
# Fit the stats model; the per-epoch train/test losses are printed below.
model.train(stats_train_loader, stats_validation_loader, stats_model, name="stats_model", learning_rate=0.001)

Epoch   0: Train Loss=234.40420201, Test Loss=234.83222016
Epoch   1: Train Loss=357.50759419, Test Loss=357.97455997
Epoch   2: Train Loss=311.49441827, Test Loss=311.88463313
Epoch   3: Train Loss=299.67260964, Test Loss=300.05408691
Epoch   4: Train Loss=230.49741919, Test Loss=230.83408734
Epoch   5: Train Loss=173.02145131, Test Loss=173.48125809
Epoch   6: Train Loss=62.09819710, Test Loss=62.41048917
Epoch   7: Train Loss=36.03215100, Test Loss=36.26891135
Epoch   8: Train Loss=35.74479896, Test Loss=36.00596516
Epoch   9: Train Loss=34.45330733, Test Loss=34.77445466
Epoch  10: Train Loss=33.34903043, Test Loss=33.73162851
Epoch  11: Train Loss=32.70654354, Test Loss=33.18703016
Epoch  12: Train Loss=32.65190676, Test Loss=33.12551234
Epoch  13: Train Loss=31.89696096, Test Loss=32.46369276
Epoch  14: Train Loss=32.39400129, Test Loss=32.90866038
Epoch  15: Train Loss=32.59397002, Test Loss=33.08471464
Epoch  16: Train Loss=32.08852939, Test Loss=32.63543571
Epoch  17: Train Lo

## The Model
Define the model. Combine the embeddings for the two teams, pass them through hidden layers, and then output a prediction of whether the first team won.

In [6]:
# Same loader construction as for the stats model, but with output_stats=False
# and its own cache entry ("result").
result_train_loader, result_validation_loader = dataset.train_test_data(output_stats=False, cache="result")

Loading cached data


In [7]:
# The outcome model is handed the matchup module of the already-trained stats
# model (stats_model.matchup), so both models share that module object.
result_model = model.Model(matchup=stats_model.matchup, model_sizes=(64,64), dropout=0.1).to(device)

## Training the model

Train the model

In [None]:
# Fit the outcome model on the regular-season loaders; progress is printed below.
model.train(result_train_loader, result_validation_loader, result_model, name="result_model", learning_rate=0.001)

Epoch   0: Train Loss=0.17766324, Test Loss=0.18592530
loss: 0.175948 [ 50500/297560]

In [None]:
# Accuracy of the outcome model on the held-out regular-season validation loader.
model.test_accuracy(result_validation_loader, result_model)

With this model we can correctly predict the outcome of about three quarters of regular-season games.

## Load the tourney data to test with

In [None]:
# Dataset built from every row of dataset.tourney, served in batches of 500.
tourney_dataset = dataset.gen_dataset(dataset.tourney)
# NOTE(review): shuffle=True is not needed for evaluation; later cells read
# tourney_loader.dataset.tensors directly, which shuffling does not affect.
tourney_loader = DataLoader(tourney_dataset, batch_size=500, shuffle=True)

In [None]:
# Accuracy of the regular-season-trained model on all tournament games.
model.test_accuracy(tourney_loader, result_model)

When it comes to tournament results we do a little worse. The lower accuracy is likely due to tournament teams being more evenly matched (greater parity).

Train with early tournament data

In [None]:
# Fine-tuning data: tournament games from seasons before 2021 only.
tourney_df = dataset.tourney[dataset.tourney.Season < 2021]

# Pin random_state so the 80/20 split is identical on every Restart & Run All.
# Without it train_test_split draws from NumPy's global RNG, so the split would
# depend on whatever consumed that RNG earlier in the session.
tourney_train_df, tourney_validation_df = train_test_split(
    tourney_df, train_size=0.8, random_state=20250222
)
tourney_train_data = dataset.gen_dataset(tourney_train_df)
tourney_validation_data = dataset.gen_dataset(tourney_validation_df)

# Batches of 500 for both the fine-tuning and the validation split.
tourney_train_loader = DataLoader(tourney_train_data, batch_size=500)
tourney_validation_loader = DataLoader(tourney_validation_data, batch_size=500)

In [None]:
# Freeze the shared matchup module before tournament fine-tuning — presumably
# this stops its parameters from being updated; confirm against freeze() in model.py.
result_model.matchup.freeze()

In [None]:
# Fine-tune on the pre-2021 tournament split with a 10x smaller learning rate
# (0.0001) than the 0.001 used for the regular-season training runs.
model.train(tourney_train_loader, tourney_validation_loader, result_model, name="tuned_model", learning_rate=0.0001)

### Performance by year


In [None]:
# One results line per tournament season, in order of first appearance.
tourney_seasons = dataset.tourney.Season.unique()
for season in tourney_seasons:
    season_loader = dataset.tourney_data(year=season)
    model.print_results(
        season_loader,
        result_model,
        label=f"{season} Tournament",
    )

In [None]:
# Evaluation set selected with after=2021; reused by the seed and hybrid
# comparisons further down.
stage1_loader = dataset.tourney_data(after=2021)
model.print_results(
    stage1_loader,
    result_model,
    label="Stage 1",
)

Breaking out by league

In [None]:
# Same per-season report, split further by the leagues present in each season.
tourney_games = dataset.tourney
for season in tourney_games.Season.unique():
    season_games = tourney_games[tourney_games.Season == season]
    for league in season_games.League.unique():
        league_loader = dataset.tourney_data(year=season, league=league)
        model.print_results(
            league_loader,
            result_model,
            label=f"{season} {league} Tournament",
        )

## Inspect the model
First, what are the sizes of the smallest input and output weights?

In [None]:
def _smallest_peak_weight(layer):
    """Return the minimum, across columns, of each column's largest absolute weight."""
    column_peaks = layer.state_dict()['weight'].abs().max(axis=0).values
    return column_peaks.min().item()


print(f"Program embedding min: {_smallest_peak_weight(result_model.matchup.program_embedding):>8f}")
print(f"Team embedding min: {_smallest_peak_weight(result_model.matchup.team_embedding):>8f}")
print(f"FC min: {_smallest_peak_weight(result_model.fc3):>8f}")

Calculate the average gradient for each input feature

In [None]:
# Undo the earlier freeze() on the matchup module before the feature
# evaluation below — presumably so it takes part in gradient computation
# again; confirm against unfreeze() in model.py.
result_model.matchup.unfreeze()

In [None]:
# feature_eval returns three values, unpacked as program, team and stats
# weights; they are summarized in the next cells.
program_weights, team_weights, stats_weights = model.feature_eval(result_model, tourney_loader)

In [None]:
# Sum of absolute values of each weight tensor, shown as a (program, team) pair.
tuple(
    weights.abs().sum().item()
    for weights in (program_weights, team_weights)
)

In [None]:
# The first three entries of stats_weights, reported under the labels
# Year, Game and League.
year_weight = stats_weights[0]
game_weight = stats_weights[1]
league_weight = stats_weights[2]

print(f"Year:\t{year_weight:>4f}")
print(f"Game:\t{game_weight:>4f}")
print(f"League:\t{league_weight:>4f}")

## Generating the submission file
### Phase 2

Write the results

In [None]:
# Odds from result_model for the 2025 'M' league.
# NOTE(review): no later cell shown here reads this value before `odds` is
# reassigned in the "Predicting by seeds" section — confirm it is still needed.
odds = model.model_odds(dataset, 2025, 'M', result_model)

In [None]:
# Produce the submission from result_model; output location and format are
# decided inside model.gen_submission.
model.gen_submission(result_model, dataset)

## Save the model

In [None]:
# Persist only the weights (state_dict) to model.pth, relative to the current
# working directory.
torch.save(result_model.state_dict(), 'model.pth')

## Moderated model

Moderate a model by pushing it towards 0.5

In [None]:
# Wrap result_model in a ModeratedModel. 0.75 is presumably the moderation
# strength — confirm its meaning against model.ModeratedModel.
moderated = model.ModeratedModel(result_model, 0.75)

In [None]:
# Per-season results again, this time for the moderated wrapper.
tourney_seasons = dataset.tourney.Season.unique()
for season in tourney_seasons:
    season_loader = dataset.tourney_data(season)
    model.print_results(
        season_loader,
        moderated,
        label=f"{season} Tournament",
    )

## Dig into 2023 results

In [None]:
# 2023 tournament games as raw tensors: inputs x and outcomes y.
loader = dataset.tourney_data(2023)

x, y = loader.dataset.tensors

# Inference only. eval() puts the module in evaluation mode (the model was
# built with dropout=0.1) and no_grad() skips autograd bookkeeping, so the
# predictions are deterministic and no graph is kept alive for the whole batch.
result_model.eval()
with torch.no_grad():
    preds = result_model(x.to(device))

In [None]:
# Columns 0 and 2 of x are program indexes; map each to its TeamID once and
# reuse the IDs for the name lookups.
winner_ids = [dataset.programs.loc[i].TeamID for i in x[:, 0].tolist()]
loser_ids = [dataset.programs.loc[i].TeamID for i in x[:, 2].tolist()]
winner_names = [dataset.all_teams.loc[team_id].TeamName for team_id in winner_ids]
loser_names = [dataset.all_teams.loc[team_id].TeamName for team_id in loser_ids]

all_rows_2023 = pd.DataFrame({
    'winner_name': winner_names,
    'loser_name': loser_names,
    'winner': winner_ids,
    'loser': loser_ids,
    'actual': y.reshape(-1),
    'predicted': preds.cpu().detach().numpy().reshape(-1),
})
# Keep only the first 67 rows — presumably one row per game in winner-first
# order; TODO confirm against the row layout produced by Data.tourney_data.
t_2023 = all_rows_2023.iloc[:67]

In [None]:
# Games where the listed winner was given less than a 50% chance, least
# likely first.
below_even = t_2023.predicted < 0.5
t_2023.loc[below_even].sort_values(by='predicted')

The biggest story of this season was the huge upsets in the first round. Purdue was a number one seed and lost, which I gave only a 0.4% chance of happening. Arizona and Virginia were number 2 seeds and lost, which I gave 7% and 15% chances of happening respectively.

In [None]:
# Flag each 2023 game via dataset.upset(season, winner, loser).
upset_flags = [
    dataset.upset(2023, winner_id, loser_id)
    for winner_id, loser_id in zip(t_2023['winner'], t_2023['loser'])
]
t_2023['Upset'] = upset_flags

In [None]:
# Average predicted probability across the games flagged as upsets.
upset_games = t_2023[t_2023.Upset]
upset_games.predicted.mean()

On average the upsets had a 32% chance of happening

In [None]:
# Upsets the model favoured (predicted >= 0.5), most confident first.
called_upsets = t_2023.Upset & (t_2023.predicted >= 0.5)
t_2023.loc[called_upsets].sort_values(by='predicted', ascending=False)

I correctly predicted 2 upsets, though both were between closely ranked teams.

In [None]:
# Non-upsets where the model gave the eventual winner under 50%, i.e. upsets
# it called that did not happen; least likely first.
false_upset_calls = ~t_2023.Upset & (t_2023.predicted < 0.5)
t_2023.loc[false_upset_calls].sort_values(by='predicted')

I also incorrectly predicted 4 upsets

Looking at all the tourneys

In [None]:
x, y = tourney_loader.dataset.tensors

# Inference only: evaluation mode (the model was built with dropout=0.1) and
# no autograd graph for this one large batch.
result_model.eval()
with torch.no_grad():
    preds = result_model(x.to(device))

# Columns 0 and 2 of x are program indexes and column 4 is read as the season;
# map each program index to its TeamID once and reuse it for the name lookups.
winner_ids = [dataset.programs.loc[i].TeamID for i in x[:, 0].tolist()]
loser_ids = [dataset.programs.loc[i].TeamID for i in x[:, 2].tolist()]

tourney_df = pd.DataFrame({'season': x[:,4].tolist(),
                           'winner_name': [dataset.all_teams.loc[team_id].TeamName for team_id in winner_ids],
                           'loser_name': [dataset.all_teams.loc[team_id].TeamName for team_id in loser_ids],
                           'winner': winner_ids,
                           'loser': loser_ids,
                           'actual': y.reshape([-1]),
                           'predicted': np.array(preds.tolist()).reshape([-1])})

# Keep only rows labelled actual == 1.0. The explicit copy makes the column
# assignment below write to an independent frame rather than to a slice of the
# unfiltered one, which would trigger pandas' SettingWithCopyWarning.
tourney_df = tourney_df[tourney_df.actual == 1.0].copy()
tourney_df['Upset'] = [dataset.upset(season, winner, loser) for (winner, loser, season)
                       in zip(tourney_df['winner'], tourney_df['loser'], tourney_df['season'])]

In [None]:
# Number of post-2020 upsets that the model favoured (predicted >= 0.5).
called_upsets = (
    tourney_df.Upset
    & (tourney_df.predicted >= 0.5)
    & (tourney_df.season > 2020)
)
len(tourney_df[called_upsets])

In [None]:
# Number of post-2020 non-upsets where the model gave the winner under 50%.
false_upset_calls = (
    ~tourney_df.Upset
    & (tourney_df.predicted < 0.5)
    & (tourney_df.season > 2020)
)
len(tourney_df[false_upset_calls])

Overall I predicted 19 upsets correctly, and 30 incorrectly

## Predicting by seeds
What if I predict just using the seeds?

In [None]:
# Odds keyed by seed difference (indexed as odds[SeedDiff] in the next cell),
# built with before=2021.
odds = dataset.odds_by_seed_diff(before=2021)

In [None]:
# Mean of the squared seed-difference odds over the games selected with after=2021.
# NOTE(review): squaring odds[diff] is a Brier score only if odds[diff] is the
# probability assigned to the outcome that did NOT happen — confirm against
# Data.odds_by_seed_diff and the sign convention of SeedDiff.
stage1_seed_diffs = dataset.tourney_df(after=2021).SeedDiff
stage1_seed_diffs.map(lambda seed_diff: odds[seed_diff] ** 2).mean()

This results in a test Brier score of about 0.185.

## Hybrid Model
Building a model using the neural net and seeds

In [None]:
# Seed-only baseline, evaluated on the same stage1_loader as the neural net.
# NOTE(review): after=2021 is the same selection used to build stage1_loader;
# if SeedModel fits its odds on that window this evaluation is in-sample —
# confirm against model.SeedModel (the seed odds above were built with before=2021).
seed_model = model.SeedModel(dataset, after=2021)
model.test(stage1_loader, seed_model)

In [None]:
# Same test on the neural network alone, for comparison with the seed baseline.
model.test(stage1_loader, result_model)

In [None]:
# Combine the two models. The second list is presumably per-model weights
# (0.8 for the neural net, 0.2 for the seed model) — confirm against model.HybridModel.
hybrid = model.HybridModel([result_model, seed_model], [0.8, 0.2])

In [None]:
# Same test on the combined model, on the same stage1_loader.
model.test(stage1_loader, hybrid)

The hybrid model outperforms both individual models.

In [None]:
# Hybrid results for each season from 2021 through 2024, separately for the
# 'M' and 'W' leagues.
for season in range(2021, 2025):
    for league in ['M', 'W']:
        loader = dataset.tourney_data(season, league)
        # Include the league in the label; otherwise the two lines printed for
        # a season are indistinguishable.
        model.print_results(loader, hybrid, label=f"{season} {league} Tournament")

## Generate a bracket

In [None]:
# Build the 2024 'M' bracket from the hybrid model and attach team names by
# joining on the Winner column.
bracket = model.gen_bracket(dataset, 2024, 'M', hybrid)
bracket_with_names = bracket.join(dataset.all_teams, on='Winner')[['Winner', 'TeamName']]

# Lift pandas' row/column display limits so the whole bracket is printed.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(bracket_with_names)