In [199]:
import os
import pandas as pd
import numpy as np
from datetime import datetime

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix


# change the display width to see all columns
pd.set_option('display.width', 500)

games_df = pd.read_csv('./data/games.csv', low_memory=False)
games_details_df = pd.read_csv('./data/games_details.csv', low_memory=False)
# players_df = pd.read_csv('./data/players.csv')
ranking_df = pd.read_csv('./data/ranking.csv')
teams_df = pd.read_csv('./data/teams.csv')

Clean the game dataframe and group by season

In [200]:
games_df = games_df.drop(columns=['GAME_STATUS_TEXT'])

games_df.groupby(['SEASON'])
games_df = games_df.sort_values('GAME_DATE_EST')

# games_df = games_df.dropna()

print(games_df.head(200))

      GAME_DATE_EST   GAME_ID  HOME_TEAM_ID  VISITOR_TEAM_ID  SEASON  TEAM_ID_home  PTS_home  FG_PCT_home  FT_PCT_home  FG3_PCT_home  AST_home  REB_home  TEAM_ID_away  PTS_away  FG_PCT_away  FT_PCT_away  FG3_PCT_away  AST_away  REB_away  HOME_TEAM_WINS
19288    2003-10-05  10300001    1610612762       1610612742    2003    1610612762      90.0        0.457        0.735         0.143      23.0      41.0    1610612742      85.0        0.447        0.500         0.250      20.0      38.0               1
19287    2003-10-06  10300002    1610612763       1610612749    2003    1610612763     105.0        0.494        0.618         0.267      25.0      48.0    1610612749      94.0        0.427        0.700         0.154      20.0      43.0               1
19280    2003-10-07  10300010    1610612764       1610612752    2003    1610612764     104.0        0.506        0.677         0.455      26.0      45.0    1610612752      86.0        0.380        0.852         0.188      19.0      37.0     

Below we will create the ELO rating for each team, as a default the team will start with 1500 points. Each time a team wins or loses a game, the ELO rating will be updated. The ELO rating will be updated based on the following formula:



Updated Team ELO = Team ELO + k * (Team Expected Outcome - Team Actual Outcome)

In [201]:
initial_elo = 1500
k_factor = 15
team_elos = {team_id: initial_elo for team_id in pd.concat([games_df['HOME_TEAM_ID'], games_df['VISITOR_TEAM_ID']]).unique()}


def expected_outcome(home_elo, away_elo):
    return 1 / (1 + 10 ** ((away_elo - home_elo) / 400))

def update_elo(home_elo, visitor_elo, home_win, k_factor):
    expected_home_win = expected_outcome(home_elo, visitor_elo)
    actual_home_win = 1 if home_win else 0
    new_home_elo = home_elo + k_factor * (actual_home_win - expected_home_win)
    new_visitor_elo = visitor_elo + k_factor * ((1 - actual_home_win) - (1 - expected_home_win))

    return new_home_elo, new_visitor_elo


After iterating, team_elos will have the updated Elo ratings for each team

In [202]:
games_df = games_df.sort_values('GAME_DATE_EST', ascending=False)
games_df['ELO_home'] = 0
games_df['ELO_away'] = 0

for index, row in games_df.iterrows():
    home_team, away_team = row['HOME_TEAM_ID'], row['VISITOR_TEAM_ID']
    home_elo, away_elo = team_elos[home_team], team_elos[away_team]
    home_win = row['HOME_TEAM_WINS']
    new_home_elo, new_away_elo = update_elo(home_elo, away_elo, home_win, k_factor)
    games_df.at[index, 'ELO_home'] = round(new_home_elo)
    games_df.at[index, 'ELO_away'] = round(new_away_elo)
    
    team_elos[home_team], team_elos[away_team] = new_home_elo, new_away_elo
    

Group each Season into its own dataframe, this will help for calculating the overall ELO rating for each team in a season. I can then also track the ELO rating for each team over time, and see how it changes over the course of the dataset.

In [203]:
seasons_df = games_df.groupby('SEASON')
seasons_dict = {}
for season, season_df in seasons_df:
    seasons_dict[season] = season_df

# Iterate over the dictionary and calculate the percentage of missing values
for season, season_df in seasons_dict.items():
    missing_values_percent = season_df.isnull().sum() * 100 / len(season_df)
    # print(f"Percentage of missing values for season {season}:")
    print(missing_values_percent)

GAME_DATE_EST      0.000000
GAME_ID            0.000000
HOME_TEAM_ID       0.000000
VISITOR_TEAM_ID    0.000000
SEASON             0.000000
TEAM_ID_home       0.000000
PTS_home           7.148014
FG_PCT_home        7.148014
FT_PCT_home        7.148014
FG3_PCT_home       7.148014
AST_home           7.148014
REB_home           7.148014
TEAM_ID_away       0.000000
PTS_away           7.148014
FG_PCT_away        7.148014
FT_PCT_away        7.148014
FG3_PCT_away       7.148014
AST_away           7.148014
REB_away           7.148014
HOME_TEAM_WINS     0.000000
ELO_home           0.000000
ELO_away           0.000000
dtype: float64
GAME_DATE_EST      0.0
GAME_ID            0.0
HOME_TEAM_ID       0.0
VISITOR_TEAM_ID    0.0
SEASON             0.0
TEAM_ID_home       0.0
PTS_home           0.0
FG_PCT_home        0.0
FT_PCT_home        0.0
FG3_PCT_home       0.0
AST_home           0.0
REB_home           0.0
TEAM_ID_away       0.0
PTS_away           0.0
FG_PCT_away        0.0
FT_PCT_away        0.0
F

Here each team's ELO will be saved in a dataframe then merged with the team dataframe to get the ELO rating for each team in the dataset. 

In [204]:
team_elos_df = pd.DataFrame.from_dict(team_elos, orient="index", columns=["ELO"])
team_elos_df = team_elos_df.merge(teams_df, left_index=True, right_on="TEAM_ID")
team_elos_df = team_elos_df.sort_values("ELO", ascending=False)

In [205]:
# Create margin of victory column
games_df["MOV"] = games_df["PTS_home"] - games_df["PTS_away"]
close_games = games_df[(games_df["MOV"] > -5) & (games_df["MOV"] < 5)]
print("Number of close games:", close_games.shape[0])

close_game_prob = close_games["MOV"].count() / games_df["MOV"].count() * 100
print("Probability of a close game:", close_game_prob)

Number of close games: 5631
Probability of a close game: 21.20744200060259


In [206]:
# Create high scoring game column
games_df["total_score"] = games_df["PTS_home"] + games_df["PTS_away"]
high_scoring_games = games_df[games_df["total_score"] > 220]
print("Number of high scoring games:", high_scoring_games.shape[0])

high_scoring_game_prob = len(high_scoring_games) / len(games_df) * 100
print("Probability of a high scoring game:", high_scoring_game_prob)

games_df = games_df.dropna(ignore_index=True)


Number of high scoring games: 6258
Probability of a high scoring game: 23.48129526096582


In [207]:
def save_model(model, losses):
    average_loss = round(np.average(losses), 3)
    model_to_string = model.__class__.__name__
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    if not os.path.exists(f'./models/{model_to_string}'):
        os.makedirs(f'./models/{model_to_string}')
    torch.save(model.state_dict(), f=f'./models/{model_to_string}/{average_loss}_on_{timestamp}_model.pth')

This class will stop training runs if the training loss is not decreasing anymore. This is done through comparing the current loss with the previous loss. If the loss is not decreasing, the training will stop.

In [208]:
class EarlyStopping:
    def __init__(self, patience=5, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = None
        self.early_stop = False

    def __call__(self, val_loss):
        if self.best_loss is None:
            self.best_loss = val_loss
        elif val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True


## Initial Model Creation

The model is created, experimenting with the different model architecture. The model will be trained on the first 80% of the data, and tested on the last 20% of the data. The model will be evaluated based on the accuracy of the predictions. 

The model will be trained on the ELO rating of each team, and the difference in ELO rating between the two teams. The model will predict the outcome of the game based on the ELO rating of each team.

In [209]:
skip_cell = True
if not skip_cell:
    features_df = games_df[
            [
                # "PTS_home",
                # "PTS_away",
                # "FG_PCT_home",
                # "FT_PCT_home",
                # "FG3_PCT_home",
                # "AST_home",
                # "REB_home",
                # "FG_PCT_away",
                # "FT_PCT_away",
                # "FG3_PCT_away",
                # "AST_away",
                # "REB_away",
                "SEASON",
                # "MOV",
                "total_score",
                "ELO_home",
                "ELO_away",
                "HOME_TEAM_ID",
                "VISITOR_TEAM_ID",]
        ]


    class BinaryClassifier(nn.Module):
        def __init__(self):
            super(BinaryClassifier, self).__init__()
            self.fc1 = nn.Linear(in_features=6, out_features=50)
            self.fc2 = nn.Linear(50, 1)

        def forward(self, x):
            x = F.prelu(self.fc1(x), torch.tensor(0.75))
            x = torch.sigmoid(self.fc2(x))
            return x


    features = torch.tensor(
        features_df.values,
        dtype=torch.float32,
    )
    labels = torch.tensor(games_df["HOME_TEAM_WINS"])

    dataset = TensorDataset(features, labels)

    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size

    train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

    train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)
    test_dataloader = DataLoader(test_dataset, batch_size=128)

    model = BinaryClassifier()
    criterion = nn.BCELoss(weight=torch.tensor(0.75))
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

    num_epochs = 10
    losses = []
    for epoch in range(num_epochs):
        for inputs, targets in train_dataloader:
            optimizer.zero_grad()
            outputs = model(inputs)
            outputs = outputs.squeeze()
            loss = criterion(outputs, targets.float())
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
        print(f"Epoch {epoch+1}, Loss: {loss.item()}")
else: 
    print("Cell skipped")        

Cell skipped


## Model v2 - relu activation function and larger hidden layers
Here I have created a model with larger hidden layers, and more hidden layers, to see if the model can learn more complex patterns in the data. The learning rate is lowered and a scheduler is added to help the model converge to a better solution. The model is trained for 100 epochs, and the loss rate is assessed to see if the model is learning at a better rate than the previous model.

In [210]:
skip_cell = False
if not skip_cell:
    features_df = games_df[
            [
                "SEASON",
                "total_score",
                "ELO_home",
                "ELO_away",
                "HOME_TEAM_ID",
                "VISITOR_TEAM_ID",]
        ]

    class BinaryClassifierV2(nn.Module):
        def __init__(self):
            super(BinaryClassifierV2, self).__init__()
            self.fc1 = nn.Linear(in_features=6, out_features=1000)
            self.fc2 = nn.Linear(1000, 50)
            self.fc3 = nn.Linear(50, 1)

        def forward(self, x):
            x = F.relu(self.fc1(x))
            x = F.relu(self.fc2(x))
            x = torch.sigmoid(self.fc3(x))
            return x


    features = torch.tensor(
        features_df.values,
        dtype=torch.float32,
    )
    labels = torch.tensor(games_df["HOME_TEAM_WINS"])

    dataset = TensorDataset(features, labels)

    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size

    train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

    train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)
    test_dataloader = DataLoader(test_dataset, batch_size=128)

    model = BinaryClassifierV2()
    criterion = nn.BCELoss(weight=torch.tensor(0.75))
    optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', cooldown=5, patience=2)

    num_epochs = 100
    losses = []
    for epoch in range(num_epochs):
        for inputs, targets in train_dataloader:
            optimizer.zero_grad()
            outputs = model(inputs)
            outputs = outputs.squeeze()
            loss = criterion(outputs, targets.float())
            loss.backward()
            optimizer.step()
            scheduler.step(loss)
            losses.append(loss.item())
        print(f"Epoch {epoch+1}, Loss: {loss.item()}")
        
    save_model(model, losses)

else: 
    print("Cell skipped")

Epoch 1, Loss: 26.033058166503906
Epoch 2, Loss: 32.85123825073242
Epoch 3, Loss: 35.95041275024414
Epoch 4, Loss: 30.991735458374023
Epoch 5, Loss: 31.611570358276367
Epoch 6, Loss: 27.272727966308594
Epoch 7, Loss: 29.132230758666992
Epoch 8, Loss: 31.611570358276367
Epoch 9, Loss: 34.71074295043945
Epoch 10, Loss: 31.611570358276367
Epoch 11, Loss: 22.933883666992188
Epoch 12, Loss: 30.37190055847168
Epoch 13, Loss: 26.033058166503906
Epoch 14, Loss: 27.272727966308594
Epoch 15, Loss: 29.752065658569336
Epoch 16, Loss: 29.132230758666992
Epoch 17, Loss: 29.132230758666992
Epoch 18, Loss: 33.471073150634766
Epoch 19, Loss: 28.51239585876465
Epoch 20, Loss: 29.132230758666992
Epoch 21, Loss: 29.752065658569336
Epoch 22, Loss: 24.173553466796875
Epoch 23, Loss: 32.85123825073242
Epoch 24, Loss: 37.80991744995117
Epoch 25, Loss: 35.3305778503418
Epoch 26, Loss: 32.85123825073242
Epoch 27, Loss: 38.429752349853516
Epoch 28, Loss: 29.132230758666992
Epoch 29, Loss: 28.51239585876465
Epoch

## Model v3.1 - More model information
The results from the model now include things like the F1 score, precision, recall, and the confusion matrix. This will help to better understand the performance of the model, and see where the model is making mistakes.

In [211]:
skip_cell = True
if not skip_cell:
    class BinaryClassifierV3(nn.Module):
        def __init__(self):
            super(BinaryClassifierV3, self).__init__()
            self.fc1 = nn.Linear(in_features=6, out_features=1000)
            self.fc2 = nn.Linear(1000, 50)
            self.fc3 = nn.Linear(50, 1)

        def forward(self, x):
            x = F.relu(self.fc1(x))
            x = F.relu(self.fc2(x))
            x = torch.sigmoid(self.fc3(x))
            return x


    features_df = games_df[
        [
            "SEASON",
            "total_score",
            "ELO_home",
            "ELO_away",
            "HOME_TEAM_ID",
            "VISITOR_TEAM_ID",
        ]
    ]

    features = torch.tensor(features_df.values, dtype=torch.float32)
    labels = torch.tensor(games_df["HOME_TEAM_WINS"].values, dtype=torch.float32)

    dataset = TensorDataset(features, labels)

    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size

    train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

    train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)
    test_dataloader = DataLoader(test_dataset, batch_size=128)

    model = BinaryClassifierV3()
    criterion = nn.BCELoss(weight=torch.tensor(0.75))
    optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', cooldown=5, patience=2)

    num_epochs = 100
    losses = []

    for epoch in range(num_epochs):
        model.train()
        train_targets = []
        train_predictions = []
        
        for inputs, targets in train_dataloader:
            optimizer.zero_grad()
            outputs = model(inputs)
            outputs = outputs.squeeze()
            loss = criterion(outputs, targets.float())
            loss.backward()
            optimizer.step()
            scheduler.step(loss)
            losses.append(loss.item())
            
            # Collect targets and predictions for metrics
            train_targets.extend(targets.numpy())
            train_predictions.extend(outputs.detach().numpy())
        
        # Calculate metrics for training data
        train_predictions_binary = [1 if pred > 0.5 else 0 for pred in train_predictions]
        train_accuracy = accuracy_score(train_targets, train_predictions_binary)
        train_precision = precision_score(train_targets, train_predictions_binary)
        train_recall = recall_score(train_targets, train_predictions_binary)
        train_f1 = f1_score(train_targets, train_predictions_binary)
        train_roc_auc = roc_auc_score(train_targets, train_predictions)
        train_conf_matrix = confusion_matrix(train_targets, train_predictions_binary)
        
        print(f"Epoch {epoch+1}, Loss: {loss.item()}")
        print(f"Train Accuracy: {train_accuracy}")
        print(f"Train Precision: {train_precision}")
        print(f"Train Recall: {train_recall}")
        print(f"Train F1-Score: {train_f1}")
        print(f"Train ROC-AUC: {train_roc_auc}")
        print(f"Train Confusion Matrix:\n{train_conf_matrix}")
        
        # Validation loop
        model.eval()
        val_targets = []
        val_predictions = []
        
        with torch.no_grad():
            for inputs, targets in test_dataloader:
                outputs = model(inputs)
                outputs = outputs.squeeze()
                
                # Collect targets and predictions for metrics
                val_targets.extend(targets.numpy())
                val_predictions.extend(outputs.numpy())
        
        # Calculate metrics for validation data
        val_predictions_binary = [1 if pred > 0.5 else 0 for pred in val_predictions]
        val_accuracy = accuracy_score(val_targets, val_predictions_binary)
        val_precision = precision_score(val_targets, val_predictions_binary)
        val_recall = recall_score(val_targets, val_predictions_binary)
        val_f1 = f1_score(val_targets, val_predictions_binary)
        val_roc_auc = roc_auc_score(val_targets, val_predictions)
        val_conf_matrix = confusion_matrix(val_targets, val_predictions_binary)
        
        print(f"Validation Accuracy: {val_accuracy}")
        print(f"Validation Precision: {val_precision}")
        print(f"Validation Recall: {val_recall}")
        print(f"Validation F1-Score: {val_f1}")
        print(f"Validation ROC-AUC: {val_roc_auc}")
        print(f"Validation Confusion Matrix:\n{val_conf_matrix}")

    save_model()
else:
    print("Cell skipped")    

Cell skipped


## Third Model - Adding Dropout Layers

Here I have added dropout layers to the model to help prevent overfitting. The dropout rate is set to 0.5, increasing layer count to 7 and increasing the number of neurons in each layer. The model is evaluated based on the loss rate, and the accuracy of the predictions.

In [212]:
skip_cell = True
if not skip_cell:
    features_df = games_df[
            [
                "SEASON",
                "total_score",
                "ELO_home",
                "ELO_away",
                "HOME_TEAM_ID",
                "VISITOR_TEAM_ID",]
        ]

    # Calculate class weights
    class_counts = games_df["HOME_TEAM_WINS"].value_counts()
    class_weights = 1. / class_counts
    sample_weights = class_weights[games_df["HOME_TEAM_WINS"]]

    # Use weighted loss function

    class BinaryClassifierDropout(nn.Module):
        def __init__(self):
            super(BinaryClassifierDropout, self).__init__()
            self.fc1 = nn.Linear(in_features=6, out_features=1000)
            self.dropout1 = nn.Dropout(p=0.5)
            self.fc2 = nn.Linear(1000, 1000)
            self.dropout2 = nn.Dropout(p=0.5)
            self.fc3 = nn.Linear(1000, 1000)
            self.dropout3 = nn.Dropout(p=0.5)
            self.fc4 = nn.Linear(1000, 1000)
            self.dropout4 = nn.Dropout(p=0.5)
            self.fc5 = nn.Linear(1000, 1000)
            self.dropout5 = nn.Dropout(p=0.5)
            self.fc6 = nn.Linear(1000, 500)
            self.dropout6 = nn.Dropout(p=0.5)
            self.fc7 = nn.Linear(500, 1)

        def forward(self, x):
            x = F.relu(self.fc1(x))
            x = self.dropout1(x)
            x = F.relu(self.fc2(x))
            x = self.dropout2(x)
            x = F.relu(self.fc3(x))
            x = self.dropout3(x)
            x = F.relu(self.fc4(x))
            x = self.dropout4(x)
            x = F.relu(self.fc5(x))
            x = self.dropout5(x)
            x = F.relu(self.fc6(x))
            x = self.dropout6(x)
            x = torch.sigmoid(self.fc7(x))
            return x

    features = torch.tensor(
        features_df.values,
        dtype=torch.float32,
    )
    labels = torch.tensor(games_df["HOME_TEAM_WINS"])

    dataset = TensorDataset(features, labels)

    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size

    train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

    train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)
    test_dataloader = DataLoader(test_dataset, batch_size=128)

    model = BinaryClassifierDropout()
    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([class_weights[1]]))
    optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', cooldown=5, patience=2)

    num_epochs = 100
    losses = []
    early_stopping = EarlyStopping(patience=10, min_delta=0.001)

    for epoch in range(num_epochs):
        model.train()
        train_targets = []
        train_predictions = []
        
        for inputs, targets in train_dataloader:
            optimizer.zero_grad()
            outputs = model(inputs)
            outputs = outputs.squeeze() 
            loss = criterion(outputs, targets.float())
            loss.backward()
            optimizer.step()
            scheduler.step(loss)
            losses.append(loss.item())
            
            # Collect targets and predictions for metrics
            train_targets.extend(targets.numpy())
            train_predictions.extend(outputs.detach().numpy())
        
        # Calculate metrics for training data
        train_predictions_binary = [1 if pred > 0.5 else 0 for pred in train_predictions]
        train_accuracy = accuracy_score(train_targets, train_predictions_binary)
        train_precision = precision_score(train_targets, train_predictions_binary, zero_division=0.0)
        train_recall = recall_score(train_targets, train_predictions_binary)
        train_f1 = f1_score(train_targets, train_predictions_binary)
        train_roc_auc = roc_auc_score(train_targets, train_predictions)
        train_conf_matrix = confusion_matrix(train_targets, train_predictions_binary)
        if (epoch+1) % 10 == 0:
            print(f"Epoch {epoch+1}, Loss: {loss.item()}")
            print(f"Train Accuracy: {train_accuracy}")
            print(f"Train Precision: {train_precision}")
            print(f"Train Recall: {train_recall}")
            print(f"Train F1-Score: {train_f1}")
            print(f"Train ROC-AUC: {train_roc_auc}")
            print(f"Train Confusion Matrix:\n{train_conf_matrix}")
        
        # Validation loop
        model.eval()
        val_targets = []
        val_predictions = []
        
        with torch.no_grad():
            for inputs, targets in test_dataloader:
                outputs = model(inputs)
                outputs = outputs.squeeze()
                
                # Collect targets and predictions for metrics
                val_targets.extend(targets.numpy())
                val_predictions.extend(outputs.numpy())
        
        # Calculate metrics for validation data
        val_predictions_binary = [1 if pred > 0.5 else 0 for pred in val_predictions]
        val_accuracy = accuracy_score(val_targets, val_predictions_binary)
        val_precision = precision_score(val_targets, val_predictions_binary, zero_division=0.0)
        val_recall = recall_score(val_targets, val_predictions_binary)
        val_f1 = f1_score(val_targets, val_predictions_binary)
        val_roc_auc = roc_auc_score(val_targets, val_predictions)
        val_conf_matrix = confusion_matrix(val_targets, val_predictions_binary)
        if (epoch+1) % 10 == 0:
            print(f"Validation Accuracy: {val_accuracy}")
            print(f"Validation Precision: {val_precision}")
            print(f"Validation Recall: {val_recall}")
            print(f"Validation F1-Score: {val_f1}")
            print(f"Validation ROC-AUC: {val_roc_auc}")
            print(f"Validation Confusion Matrix:\n{val_conf_matrix}")
            
        val_loss = criterion(torch.tensor(val_predictions), torch.tensor(val_targets).float()).item()
        early_stopping(val_loss)
        if early_stopping.early_stop:
            print(f"Early stopping at epoch {epoch+1}")
            break
        
    save_model(model, losses)
else:
    print("Cell skipped")

Cell skipped


Below is a rough outline of the final recommender system that I will be building in this notebook. It takes in the schedule for upcoming games, and outputs the recommended games that will be the closest and most exciting to watch based on the historical data from NBA games.

In [213]:
# from sklearn.neighbors import NearestNeighbors
# from sklearn.metrics import f1_score

# # Load the trained model
# model = NearestNeighbors(n_neighbors=5, metric='cosine')
# model.load('nba_game_recommender.pkl')

# # Load the schedule for the upcoming night's games
# schedule = pd.read_csv('nba_schedule.csv')

# # Extract relevant features for each game
# features = []
# for index, row in schedule.iterrows():
#     game_id = row['Game_ID']
#     home_team = row['Home_Team']
#     away_team = row['Away_Team']
#     # Extract features from historical data
#     feature_vector = get_features(game_id, home_team, away_team)
#     features.append(feature_vector)

# # Use the model to predict the most exciting game(s) to watch
# distances, indices = model.kneighbors(features)
# recommended_games = []
# for i, dist in enumerate(distances):
#     if dist < 0.5:  # Arbitrarily set a threshold