## Using rankings
Can we use rankings? We only have this data for Men's

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
from torch.utils.data import TensorDataset, DataLoader


device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
print(f"Using {device} device")
torch.manual_seed(20250222)
random.seed(20250222)

Using cuda device


In [2]:
rankings = pd.read_csv('data/MMasseyOrdinals.csv')
mens = pd.read_csv('data/MRegularSeasonDetailedResults.csv')

In [4]:
rankings_by_team = rankings.pivot_table(index=["Season", "TeamID"],
                                        columns="SystemName",
                                        values="OrdinalRank",
                                        aggfunc="last").fillna(365)

In [5]:
games = mens[['Season', 'DayNum', 'WTeamID', 'LTeamID']]
winner_rankings = games.join(rankings_by_team, on=['Season', 'WTeamID']).set_index(['Season', 'DayNum', 'WTeamID', 'LTeamID'])
loser_rankings = games.join(rankings_by_team, on=['Season', 'LTeamID']).set_index(['Season', 'DayNum', 'WTeamID', 'LTeamID'])

In [6]:
x = torch.from_numpy(np.concatenate([winner_rankings - loser_rankings, loser_rankings - winner_rankings])).double()
y = torch.from_numpy(np.concatenate([mens.WScore - mens.LScore, mens.LScore - mens.WScore]).reshape((-1,1))).double()

In [7]:
dataset = TensorDataset(x, y)

batch_size=200

generator = torch.Generator().manual_seed(20250217)
train_data, validation_data = torch.utils.data.random_split(dataset, [0.95, 0.05], generator=generator)
train_loader = DataLoader(train_data, batch_size=batch_size)
validation_loader = DataLoader(validation_data, batch_size=batch_size)

In [8]:
class Model(nn.Module):
    def __init__(self, model_sizes, dropout, ranking_size=192):
        super(Model, self).__init__()
        hid1, hid2 = model_sizes
        self.fc1 = nn.Linear(ranking_size, hid1).double()
        self.fc2 = nn.Linear(hid1, hid2).double()
        self.score_fc = nn.Linear(hid2, 1).double()
        self.result_fc = nn.Linear(hid2, 1).double()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        hid1 = F.relu(self.dropout(self.fc1(x)))
        hid2 = F.relu(self.dropout(self.fc2(hid1)))
        score = self.score_fc(hid2)
        result = F.sigmoid(self.result_fc(hid2))
        return score, result

In [9]:
model = Model([512, 256], 0.25).to(device)

In [10]:
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0002)

def train(data, model, loss_fn, optimizer, full_loss=True):
    size = len(data.dataset)
    model.train()
    for batch, (x, y) in enumerate(data):
        x = x.to(device)
        y = y.to(device)
        pred_score, pred_result = model(x)
        actual_result = (y > 0).double()
        score_loss = loss_fn(pred_score, y)
        result_loss = loss_fn(pred_result, actual_result)
        if full_loss:
            (score_loss + 10 * result_loss).backward()
        else:
            result_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            score_loss, result_loss, current = score_loss.item(), result_loss.item(), (batch + 1) * len(x)
            print(f"result loss: {result_loss:>7f} [{current:>6d}/{size:>6d}]", end="\r")

In [11]:
def test(data, model, loss_fn, label="Test"):
    size = len(data.dataset)
    num_batches = len(data)
    model.eval()
    score_loss, result_loss, correct = 0, 0, 0
    with torch.no_grad():
        for x, y in data:
            x = x.to(device)
            y = y.to(device)
            score_pred, result_pred = model(x)
            actual_result = (y > 0).double()
            score_loss += loss_fn(score_pred, y).item()
            result_loss += loss_fn(result_pred, actual_result).item()
            correct += ((result_pred >= 0.5) == (actual_result == 1)).type(torch.float).sum().item()
    score_loss /= num_batches
    result_loss /= num_batches
    correct /= size
    print(f"{label}: Accuracy: {(100*correct):>0.2f}%, Result loss: {result_loss:>8f}")

In [12]:
n_epochs = 27
for i in range(n_epochs):
    print(f"Epoch {i}")
    train(train_loader, model, loss_fn, optimizer)
    test(train_loader, model, loss_fn, label="Train")
    test(validation_loader, model, loss_fn, label="Validation")

Epoch 0
Train: Accuracy: 76.07%, Result loss: 0.163663
Validation: Accuracy: 75.35%, Result loss: 0.165617
Epoch 1
Train: Accuracy: 76.17%, Result loss: 0.163323
Validation: Accuracy: 75.36%, Result loss: 0.165511
Epoch 2
Train: Accuracy: 76.25%, Result loss: 0.162539
Validation: Accuracy: 75.31%, Result loss: 0.164978
Epoch 3
Train: Accuracy: 76.26%, Result loss: 0.162552
Validation: Accuracy: 75.41%, Result loss: 0.164919
Epoch 4
Train: Accuracy: 76.28%, Result loss: 0.162556
Validation: Accuracy: 75.50%, Result loss: 0.164901
Epoch 5
Train: Accuracy: 76.35%, Result loss: 0.162359
Validation: Accuracy: 75.56%, Result loss: 0.164769
Epoch 6
Train: Accuracy: 76.37%, Result loss: 0.162071
Validation: Accuracy: 75.58%, Result loss: 0.164636
Epoch 7
Train: Accuracy: 76.39%, Result loss: 0.161590
Validation: Accuracy: 75.70%, Result loss: 0.164061
Epoch 8
Train: Accuracy: 76.44%, Result loss: 0.161691
Validation: Accuracy: 75.72%, Result loss: 0.164182
Epoch 9
Train: Accuracy: 76.46%, Resu

In [13]:
for i in range(8):
    print(f"Epoch {i}")
    train(train_loader, model, loss_fn, optimizer, full_loss=False)
    test(train_loader, model, loss_fn, label="Train")
    test(validation_loader, model, loss_fn, label="Validation")

Epoch 0
Train: Accuracy: 76.97%, Result loss: 0.158516
Validation: Accuracy: 75.90%, Result loss: 0.162508
Epoch 1
Train: Accuracy: 77.00%, Result loss: 0.158486
Validation: Accuracy: 75.90%, Result loss: 0.162521
Epoch 2
Train: Accuracy: 77.03%, Result loss: 0.158873
Validation: Accuracy: 75.90%, Result loss: 0.162961
Epoch 3
Train: Accuracy: 77.03%, Result loss: 0.158778
Validation: Accuracy: 75.76%, Result loss: 0.163221
Epoch 4
Train: Accuracy: 77.05%, Result loss: 0.158570
Validation: Accuracy: 75.90%, Result loss: 0.163215
Epoch 5
Train: Accuracy: 77.04%, Result loss: 0.158313
Validation: Accuracy: 75.90%, Result loss: 0.162907
Epoch 6
Train: Accuracy: 77.06%, Result loss: 0.158262
Validation: Accuracy: 75.87%, Result loss: 0.163148
Epoch 7
Train: Accuracy: 77.08%, Result loss: 0.157822
Validation: Accuracy: 76.00%, Result loss: 0.162794


In [14]:
mens_tourney = pd.read_csv('data/MNCAATourneyDetailedResults.csv')
tourney_games = mens_tourney[['Season', 'DayNum', 'WTeamID', 'LTeamID']]
tourney_winner_rankings = tourney_games.join(rankings_by_team, on=['Season', 'WTeamID']).set_index(['Season', 'DayNum', 'WTeamID', 'LTeamID'])
tourney_loser_rankings = tourney_games.join(rankings_by_team, on=['Season', 'LTeamID']).set_index(['Season', 'DayNum', 'WTeamID', 'LTeamID'])
tourney_x = torch.from_numpy(np.concatenate([tourney_winner_rankings - tourney_loser_rankings,
                                             tourney_loser_rankings - tourney_winner_rankings])).double()
tourney_y = torch.from_numpy(np.concatenate([mens_tourney.WScore - mens_tourney.LScore,
                                             mens_tourney.LScore - mens_tourney.WScore]).reshape((-1,1))).double()
tourney_dataset = TensorDataset(tourney_x, tourney_y)
tourney_loader = DataLoader(tourney_dataset, batch_size=batch_size)

In [15]:
test(tourney_loader, model, loss_fn, label="Tourney")

Tourney: Accuracy: 69.46%, Result loss: 0.196700
