# March Madness 2025

In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.utils.data import TensorDataset, DataLoader

device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
print(f"Using {device} device")

Using cuda device


## Hypothesis
Each team can be modeled by x hidden features. In each game, these hidden features interact in a nonlinear fashion to determine the outcome of the game

## Preparing the data
Load the data

In [2]:
mens = pd.read_csv('data/MRegularSeasonDetailedResults.csv')
mens['League'] = 'M'
mens.describe()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,NumOT,WFGM,WFGA,WFGM3,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
count,116723.0,116723.0,116723.0,116723.0,116723.0,116723.0,116723.0,116723.0,116723.0,116723.0,...,116723.0,116723.0,116723.0,116723.0,116723.0,116723.0,116723.0,116723.0,116723.0,116723.0
mean,2014.051044,70.072462,1288.243422,75.859651,1283.044987,63.857732,0.068658,26.392099,55.746305,7.339085,...,20.138276,12.072488,17.736907,10.480668,21.632934,11.405867,13.73713,5.901031,3.144239,19.324709
std,6.464256,35.845605,105.298971,11.007412,104.76416,10.85121,0.305052,4.68348,7.461328,3.116574,...,6.064958,5.34529,7.085348,4.221941,4.519345,3.724047,4.536147,2.778302,2.628125,4.551727
min,2003.0,0.0,1101.0,34.0,1101.0,20.0,0.0,10.0,26.0,0.0,...,1.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,4.0
25%,2009.0,38.0,1199.0,68.0,1192.0,57.0,0.0,23.0,51.0,5.0,...,16.0,8.0,13.0,7.0,19.0,9.0,11.0,4.0,1.0,16.0
50%,2014.0,73.0,1287.0,75.0,1282.0,64.0,0.0,26.0,55.0,7.0,...,20.0,12.0,17.0,10.0,21.0,11.0,13.0,6.0,3.0,19.0
75%,2020.0,101.0,1381.0,83.0,1374.0,71.0,0.0,29.0,60.0,9.0,...,24.0,15.0,22.0,13.0,25.0,14.0,17.0,8.0,4.0,22.0
max,2025.0,132.0,1480.0,149.0,1480.0,144.0,6.0,57.0,103.0,26.0,...,59.0,48.0,65.0,36.0,49.0,31.0,41.0,22.0,33.0,45.0


In [3]:
womens = pd.read_csv('data/WRegularSeasonDetailedResults.csv')
womens['League'] = 'W'
womens.describe()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,NumOT,WFGM,WFGA,WFGM3,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
count,79639.0,79639.0,79639.0,79639.0,79639.0,79639.0,79639.0,79639.0,79639.0,79639.0,...,79639.0,79639.0,79639.0,79639.0,79639.0,79639.0,79639.0,79639.0,79639.0,79639.0
mean,2017.310476,68.860759,3285.050867,71.711963,3286.594658,57.23437,0.051583,25.847537,58.98001,6.268876,...,17.913974,10.511119,15.515175,11.395447,22.441116,10.933688,16.745024,6.923831,3.43495,18.204184
std,4.531798,36.258086,104.022507,11.547894,105.457243,10.964583,0.258755,4.982451,7.975729,3.125925,...,6.469817,4.938106,6.632564,4.639725,4.939763,3.805204,5.597689,3.279905,3.666537,4.557235
min,2010.0,0.0,3101.0,30.0,3101.0,11.0,0.0,9.0,30.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0
25%,2013.0,35.0,3196.0,64.0,3195.0,50.0,0.0,22.0,53.0,4.0,...,13.0,7.0,11.0,8.0,19.0,8.0,13.0,5.0,1.0,15.0
50%,2017.0,72.0,3283.0,71.0,3287.0,57.0,0.0,25.0,59.0,6.0,...,17.0,10.0,15.0,11.0,22.0,11.0,16.0,7.0,3.0,18.0
75%,2021.0,101.0,3376.0,79.0,3377.0,64.0,0.0,29.0,64.0,8.0,...,22.0,14.0,20.0,14.0,26.0,13.0,20.0,9.0,4.0,21.0
max,2025.0,132.0,3480.0,140.0,3480.0,130.0,5.0,58.0,113.0,30.0,...,80.0,37.0,52.0,38.0,53.0,34.0,49.0,26.0,42.0,47.0


The IDs are definitely distinct so we can combine into a single dataframe

In [4]:
all_teams = pd.concat([mens, womens])

Get the distinct team/Years

In [5]:
teams = pd.concat([all_teams[['WTeamID', 'Season', 'League']].rename(columns={'WTeamID': 'TeamID'}),
                   all_teams[['LTeamID', 'Season', 'League']].rename(columns={'LTeamID': 'TeamID'})]).drop_duplicates().reset_index()

Define the training data. The x's will be the indexes of two team IDs, the y's will be 1 if the first team won, 0 otherwise.

In [6]:
teamMapping = {(x.TeamID, x.Season): x.Index for x in teams.itertuples()}

In [7]:
all_teams

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,League
0,2003,10,1104,68,1328,62,N,0,27,58,...,16,22,10,22,8,18,9,2,20,M
1,2003,10,1272,70,1393,63,N,0,26,62,...,9,20,20,25,7,12,8,6,16,M
2,2003,11,1266,73,1437,61,N,0,24,58,...,14,23,31,22,9,12,2,5,23,M
3,2003,11,1296,56,1457,50,N,0,18,38,...,8,15,17,20,9,19,4,3,23,M
4,2003,11,1400,77,1208,71,N,0,30,61,...,17,27,21,15,12,10,7,1,14,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79634,2025,84,3450,65,3333,57,A,0,24,58,...,7,9,11,23,10,8,5,10,14,W
79635,2025,85,3129,89,3307,80,H,0,33,73,...,7,16,4,25,16,4,2,13,13,W
79636,2025,85,3234,85,3321,80,H,0,29,60,...,13,20,6,25,18,8,2,13,21,W
79637,2025,85,3337,55,3258,43,H,0,21,50,...,14,17,11,20,6,8,2,10,13,W


In [8]:
winners = all_teams.apply(lambda x: teamMapping[(x.WTeamID, x.Season)], axis=1)
losers = all_teams.apply(lambda x: teamMapping[(x.LTeamID, x.Season)], axis=1)

Try modeling the score differences

In [9]:
score_diffs = all_teams.WScore - all_teams.LScore

In [10]:
x_tensor = torch.from_numpy(np.concatenate([np.stack([winners, losers], axis=1), np.stack([losers, winners], axis=1)]))
y_tensor = torch.from_numpy(np.concatenate([score_diffs, -score_diffs])).reshape([-1,1]).double()
dataset = TensorDataset(x_tensor, y_tensor)

Generate the train/validation split

In [11]:
batch_size=32

generator = torch.Generator().manual_seed(20250217)
train_data, validation_data = torch.utils.data.random_split(dataset, [0.9, 0.1], generator=generator)
train_loader = DataLoader(train_data, batch_size=batch_size)
validation_loader = DataLoader(validation_data, batch_size=batch_size)

## The Model
Define the model. Combine the embeddings for the two teams, go to a hidden layer, and then output to a prediction if the first team won

In [12]:
class Model(nn.Module):
    def __init__(self, embedding_size=64, model_size=16, dropout=0.1):
        super(Model, self).__init__()
        self.embedding = nn.Embedding(len(teams), embedding_size)
        self.dropout1 = nn.Dropout(dropout)
        self.fc1 = nn.Linear(2*embedding_size, model_size)
        self.dropout2 = nn.Dropout(dropout)
        self.score_fc = nn.Linear(model_size, 1)
        self.result_fc = nn.Linear(model_size, 1)
        self.double()

    def forward(self, x):
        team = self.embedding(x[:,0])
        opponent = self.embedding(x[:,1])
        matchup = self.dropout1(torch.cat([team, opponent], axis=1))
        hidden = self.dropout2(F.relu(self.fc1(matchup)))
        score = self.score_fc(hidden)
        result = self.result_fc(hidden)
        return score, result 
        

In [13]:
model = Model(embedding_size=128, model_size=32).to(device)

## Training the model

Define the training function

In [14]:
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

def train(data, model, loss_fn, optimizer, log=False):
    size = len(data.dataset)
    model.train()
    for batch, (x, y) in enumerate(data):
        x = x.to(device)
        y = y.to(device)
        pred_score, pred_result = model(x)
        score_loss = loss_fn(pred_score, y)
        result_loss = loss_fn(pred_result, (y > 0).double())

        (score_loss + result_loss).backward()
        optimizer.step()
        optimizer.zero_grad()

        if log and batch % 1000 == 0:
            score_loss, result_loss, current = score_loss.item(), result_loss.item(), (batch + 1) * len(x)
            print(f"score loss: {score_loss:>7f}, result loss: {result_loss:>7f} [{current:>6d}/{size:>6d}]")

Define the testing function

In [15]:
def test(data, model, loss_fn, label="Test"):
    size = len(data.dataset)
    num_batches = len(data)
    model.eval()
    score_loss, result_loss, correct = 0, 0, 0
    with torch.no_grad():
        for x, y in data:
            x = x.to(device)
            y = y.to(device)
            score_pred, result_pred = model(x)
            score_loss += loss_fn(score_pred, y).item()
            result_loss += loss_fn(result_pred, (y > 0).float())
            correct += ((result_pred >= 0.5) == (y > 0)).type(torch.float).sum().item()
    score_loss /= num_batches
    result_loss /= num_batches
    correct /= size
    print(f"{label} Error: Accuracy: {(100*correct):>0.1f}%, Avg score loss: {score_loss:>8f}, Avg result loss: {result_loss:>8f}")

Train the model

In [16]:
n_epochs = 40
for i in range(n_epochs):
    print(f"Epoch {i}")
    train(train_loader, model, loss_fn, optimizer, log=(i==0))
    test(train_loader, model, loss_fn, label="Train")
    test(validation_loader, model, loss_fn, label="Validation")

Epoch 0
score loss: 334.471988, result loss: 0.476096 [    32/353452]
score loss: 302.613749, result loss: 0.391279 [ 32032/353452]
score loss: 288.009242, result loss: 0.261286 [ 64032/353452]
score loss: 240.909307, result loss: 0.300321 [ 96032/353452]
score loss: 281.345915, result loss: 0.281104 [128032/353452]
score loss: 224.945068, result loss: 0.266802 [160032/353452]
score loss: 194.466044, result loss: 0.291128 [192032/353452]
score loss: 318.072640, result loss: 0.266847 [224032/353452]
score loss: 279.972306, result loss: 0.250939 [256032/353452]
score loss: 287.624735, result loss: 0.246050 [288032/353452]
score loss: 229.493105, result loss: 0.233772 [320032/353452]
score loss: 233.766236, result loss: 0.256219 [352032/353452]
Train Error: Accuracy: 53.7%, Avg score loss: 266.830870, Avg result loss: 0.251377
Validation Error: Accuracy: 53.4%, Avg score loss: 265.878485, Avg result loss: 0.252369
Epoch 1
Train Error: Accuracy: 61.0%, Avg score loss: 242.551075, Avg resul

With this model we can predict the output of about three quarters of regular season games.

## Load the tourney data to test with

In [17]:
mens_tourney = pd.read_csv('data/MNCAATourneyDetailedResults.csv')
womens_tourney = pd.read_csv('data/WNCAATourneyDetailedResults.csv')
tourney = pd.concat([mens_tourney, womens_tourney])
tourney_winners = tourney.apply(lambda x: teamMapping[(x.WTeamID, x.Season)], axis=1)
tourney_losers = tourney.apply(lambda x: teamMapping[(x.LTeamID, x.Season)], axis=1)
x_tourney_tensor = torch.from_numpy(np.concatenate([np.stack([tourney_winners, tourney_losers], axis=1), 
                                                    np.stack([tourney_losers, tourney_winners], axis=1)]))
y_tourney_tensor = torch.from_numpy(np.concatenate([np.ones(len(tourney)), np.zeros(len(tourney))])).reshape([-1,1])
tourney_dataset = TensorDataset(x_tourney_tensor, y_tourney_tensor)
tourney_loader = DataLoader(tourney_dataset, batch_size=batch_size)

In [18]:
test(tourney_loader, model, loss_fn, label="Tourney")

Tourney Error: Accuracy: 74.2%, Avg score loss: 170.450061, Avg result loss: 0.178833


When it comes to tournament results we get about 7 out of 10 results. The lower result is likely due to teams having increased pairity.

## Generating the submission file
### Phase 1

Write the results

In [19]:
with open('submission.csv', 'w') as f:
    f.write("ID,Pred\n")
    for season in range(2021, 2025):
        for league in ('M', 'W'):
            teams_to_test = sorted(teams[(teams.Season==season) & (teams.League==league)].TeamID.values)
            matchups = [(t1, t2) for t1 in teams_to_test for t2 in teams_to_test if t1 < t2]
            matchups_tensor = torch.Tensor([(teamMapping[(t1, season)], teamMapping[(t2, season)])
                                     for (t1, t2) in matchups]).int().to(device)
            _, predictions = model(matchups_tensor)
            for (t1, t2), pred in zip(matchups, predictions):
                f.write(f"{season}_{t1.item()}_{t2.item()},{pred.item()}\n")

Two teams canceled their 2021 season due to covid but are still in the sample submission. Add in their results

In [20]:
with open('submission.csv', 'a') as f:
    for missing_team in [3169, 3197]:
        for opponent in teams[(teams.Season==2021) & (teams.League=='W')].TeamID.values:
            if opponent > missing_team:
                f.write(f"2021_{missing_team}_{opponent},0\n")
            else:
                f.write(f"2021_{opponent}_{missing_team},1\n")
    f.write(f"2021_3169_3197,0.5\n")