In [1]:
import os
import pandas as pd
import numpy as np
from datetime import datetime

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import (
    DataLoader,
    WeightedRandomSampler,
    TensorDataset,
    random_split,
)
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# change the display width to see all columns
pd.set_option("display.width", 500)

games_df = pd.read_csv("./data/games.csv", low_memory=False)
games_details_df = pd.read_csv("./data/games_details.csv", low_memory=False)
# players_df = pd.read_csv('./data/players.csv')
ranking_df = pd.read_csv("./data/ranking.csv")
teams_df = pd.read_csv("./data/teams.csv")

Clean the game dataframe and group by season, create two new columns in the games_df one for the home teams previous 10 games and one for the away teams previous 10 games. The values are up to 10, 1 for each win, and 0 for each loss. The games are sorted by date so the games are in order.

In [2]:
games_df = games_df.drop(columns=["GAME_STATUS_TEXT"])

games_df.groupby(["SEASON"])
games_df = games_df.sort_values(by=["HOME_TEAM_ID", "GAME_DATE_EST"])
home_games = games_df[
    ["GAME_ID", "HOME_TEAM_ID", "GAME_DATE_EST", "HOME_TEAM_WINS"]
].rename(columns={"HOME_TEAM_ID": "team_id", "HOME_TEAM_WINS": "win"})
away_games = games_df[
    ["GAME_ID", "VISITOR_TEAM_ID", "GAME_DATE_EST", "HOME_TEAM_WINS"]
].rename(columns={"VISITOR_TEAM_ID": "team_id", "HOME_TEAM_WINS": "win"})
away_games["win"] = 1 - away_games["win"]
all_games = pd.concat([home_games, away_games]).sort_values(
    by=["team_id", "GAME_DATE_EST"]
)
all_games["rolling_wins"] = (
    all_games.groupby("team_id")["win"]
    .rolling(window=10, min_periods=1)
    .sum()
    .reset_index(level=0, drop=True)
)
all_games["prev_10_game_record"] = all_games.groupby("team_id")["rolling_wins"].shift(1)
all_games["prev_10_game_record"] = all_games["prev_10_game_record"].fillna(0)
home_records = all_games[
    all_games["GAME_ID"].isin(games_df["GAME_ID"])
    & (all_games["team_id"].isin(games_df["HOME_TEAM_ID"]))
]
away_records = all_games[
    all_games["GAME_ID"].isin(games_df["GAME_ID"])
    & (all_games["team_id"].isin(games_df["VISITOR_TEAM_ID"]))
]
games_df = games_df.merge(
    home_records[["GAME_ID", "prev_10_game_record"]],
    left_on="GAME_ID",
    right_on="GAME_ID",
    how="left",
).rename(columns={"prev_10_game_record": "HOME_TEAM_L10"})
games_df = games_df.merge(
    away_records[["GAME_ID", "prev_10_game_record"]],
    left_on="GAME_ID",
    right_on="GAME_ID",
    how="left",
).rename(columns={"prev_10_game_record": "AWAY_TEAM_L10"})

print(games_df["HOME_TEAM_WINS"].value_counts())

HOME_TEAM_WINS
1    62988
0    44312
Name: count, dtype: int64


Below we will create the ELO rating for each team, as a default the team will start with 1500 points. Each time a team wins or loses a game, the ELO rating will be updated. The ELO rating will be updated based on the following formula:



Updated Team ELO = Team ELO + k * (Team Expected Outcome - Team Actual Outcome)

In [3]:
initial_elo = 1500
k_factor = 15
team_elos = {
    team_id: initial_elo
    for team_id in pd.concat(
        [games_df["HOME_TEAM_ID"], games_df["VISITOR_TEAM_ID"]]
    ).unique()
}


def expected_outcome(home_elo, away_elo):
    return 1 / (1 + 10 ** ((away_elo - home_elo) / 400))


def update_elo(home_elo, visitor_elo, home_win, k_factor):
    expected_home_win = expected_outcome(home_elo, visitor_elo)
    actual_home_win = 1 if home_win else 0
    new_home_elo = home_elo + k_factor * (actual_home_win - expected_home_win)
    new_visitor_elo = visitor_elo + k_factor * (
        (1 - actual_home_win) - (1 - expected_home_win)
    )

    return new_home_elo, new_visitor_elo

After iterating, team_elos will have the updated Elo ratings for each team

In [4]:
games_df = games_df.sort_values("GAME_DATE_EST", ascending=False)
games_df["ELO_home"] = 0
games_df["ELO_away"] = 0

for index, row in games_df.iterrows():
    home_team, away_team = row["HOME_TEAM_ID"], row["VISITOR_TEAM_ID"]
    home_elo, away_elo = team_elos[home_team], team_elos[away_team]
    home_win = row["HOME_TEAM_WINS"]
    new_home_elo, new_away_elo = update_elo(home_elo, away_elo, home_win, k_factor)
    games_df.at[index, "ELO_home"] = round(new_home_elo)
    games_df.at[index, "ELO_away"] = round(new_away_elo)

    team_elos[home_team], team_elos[away_team] = new_home_elo, new_away_elo

Group each Season into its own dataframe, this will help for calculating the overall ELO rating for each team in a season. I can then also track the ELO rating for each team over time, and see how it changes over the course of the dataset.

In [5]:
seasons_df = games_df.groupby("SEASON")
seasons_dict = {}
for season, season_df in seasons_df:
    seasons_dict[season] = season_df

# Iterate over the dictionary and calculate the percentage of missing values
for season, season_df in seasons_dict.items():
    missing_values_percent = season_df.isnull().sum() * 100 / len(season_df)
    # print(f"Percentage of missing values for season {season}:")
    # print(missing_values_percent)

Here each team's ELO will be saved in a dataframe then merged with the team dataframe to get the ELO rating for each team in the dataset. 

In [6]:
team_elos_df = pd.DataFrame.from_dict(team_elos, orient="index", columns=["ELO"])
team_elos_df = team_elos_df.merge(teams_df, left_index=True, right_on="TEAM_ID")
team_elos_df = team_elos_df.sort_values("ELO", ascending=False)

In [7]:
# Create margin of victory column
games_df["MOV"] = games_df["PTS_home"] - games_df["PTS_away"]
close_games = games_df[(games_df["MOV"] > -5) & (games_df["MOV"] < 5)]
print("Number of close games:", close_games.shape[0])

close_game_prob = close_games["MOV"].count() / games_df["MOV"].count() * 100
print("Probability of a close game:", close_game_prob)

Number of close games: 22764
Probability of a close game: 21.293871136720796


In [8]:
# Create high scoring game column
games_df["total_score"] = games_df["PTS_home"] + games_df["PTS_away"]
high_scoring_games = games_df[games_df["total_score"] > 220]
print("Number of high scoring games:", high_scoring_games.shape[0])

high_scoring_game_prob = len(high_scoring_games) / len(games_df) * 100
print("Probability of a high scoring game:", high_scoring_game_prob)

games_df = games_df.dropna(ignore_index=True)

Number of high scoring games: 25344
Probability of a high scoring game: 23.619757688723205


Saving the model each time the model is trained 

In [9]:
def save_model(model, losses):
    average_loss = round(np.average(losses), 3)
    model_to_string = model.__class__.__name__
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    if not os.path.exists(f"./models/{model_to_string}"):
        os.makedirs(f"./models/{model_to_string}")
    torch.save(
        {"model_state_dict": model.state_dict(), "losses": losses},
        f=f"./models/{model_to_string}/{average_loss}_on_{timestamp}_model.pth",
    )

In [10]:
def calculate_metrics(targets, predictions):
    predictions_binary = [1 if pred > 0.5 else 0 for pred in predictions]
    accuracy = accuracy_score(targets, predictions_binary)
    precision = precision_score(targets, predictions_binary, zero_division=0.0)
    recall = recall_score(targets, predictions_binary)
    f1 = f1_score(targets, predictions_binary)
    roc_auc = roc_auc_score(targets, predictions)
    conf_matrix = confusion_matrix(targets, predictions_binary)
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "roc_auc": roc_auc,
        "conf_matrix": conf_matrix,
    }


def print_metrics(phase, metrics):
    print(f"{phase} Accuracy: {metrics['accuracy']}")
    print(f"{phase} Precision: {metrics['precision']}")
    print(f"{phase} Recall: {metrics['recall']}")
    print(f"{phase} F1-Score: {metrics['f1']}")
    print(f"{phase} ROC-AUC: {metrics['roc_auc']}")
    print(f"{phase} Confusion Matrix:\n{metrics['conf_matrix']}")

This class will stop training runs if the training loss is not decreasing anymore. This is done through comparing the current loss with the previous loss. If the loss is not decreasing, the training will stop.

In [11]:
class EarlyStopping:
    def __init__(self, patience, min_delta):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = None
        self.early_stop = False

    def __call__(self, val_loss):
        if self.best_loss is None:
            self.best_loss = val_loss
        elif val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

In [12]:
def predict_with_threshold(outputs, threshold=0.5):
    return (outputs > threshold).float()

Model Training function, this function will train the model on the training data and return the model. The model will be trained using the training data and the labels. The model will be trained for a certain number of epochs and the loss will be calculated after each epoch. The loss will be used to determine if the model is improving or not. If the loss is not decreasing, the training will stop.

Calculating and printing the metrics for the model, we will use the following metrics: accuracy, precision, recall, f1 score, and confusion matrix. These metrics will help us understand how well the model is performing.

In [13]:
def train_model(
    model,
    train_dataloader,
    test_dataloader,
    validation_dataloader,
    criterion,
    optimizer,
    scheduler,
    early_stopping,
    num_epochs=100,
):
    losses = []

    for epoch in range(num_epochs):
        model.train()
        train_targets = []
        train_predictions = []

        # Training loop
        for inputs, targets in train_dataloader:
            optimizer.zero_grad()
            outputs = model(inputs)
            outputs = outputs.float()
            # Ensure targets are the correct shape
            if isinstance(criterion, nn.BCEWithLogitsLoss):
                targets = targets.float()
            elif isinstance(criterion, nn.CrossEntropyLoss):
                targets = targets.float()
            elif isinstance(criterion, nn.BCELoss):
                targets = targets.float()

            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            scheduler.step(loss)
            losses.append(loss.item())

            train_targets.extend(targets.numpy())
            train_predictions.extend(outputs.detach().numpy())

        train_metrics = calculate_metrics(train_targets, train_predictions)
        if (epoch + 1) % 5 == 0:
            print(f"Epoch {epoch + 1}, Loss: {loss.item()}")
            print_metrics("Train", train_metrics)

        # Validation loop
        model.eval()
        val_targets = []
        val_predictions = []

        with torch.no_grad():
            for inputs, targets in validation_dataloader:
                outputs = model(inputs)
                outputs = outputs.squeeze()

                # Ensure targets are the correct shape
                if isinstance(criterion, torch.nn.BCEWithLogitsLoss):
                    targets = targets.float()
                elif isinstance(criterion, torch.nn.CrossEntropyLoss):
                    targets = targets.float()
                elif isinstance(criterion, nn.BCELoss):
                    targets = targets.float()

                val_targets.extend(targets.numpy())
                val_predictions.extend(outputs.numpy())
        val_predictions_np = np.array(val_predictions)
        val_targets_np = np.array(val_targets)
        val_metrics = calculate_metrics(val_targets, val_predictions)
        if (epoch + 1) % 5 == 0:
            print_metrics("Validation", val_metrics)

        val_loss = criterion(
            torch.tensor(val_predictions_np),
            torch.tensor(val_targets_np).float().squeeze(),
        ).item()
        early_stopping(val_loss)
        if early_stopping.early_stop:
            print(f"Early stopping at epoch {epoch + 1}")
            break

    save_model(model, losses)

Data Preprocessing function, this function will preprocess the data before training the model. The data will be split into training and testing data. The data will be normalized and the labels will be one-hot encoded. The data will be split into training and testing data using the train_test_split function from sklearn.

In [14]:
def balance_dataset(labels):
    # Calculate the number of samples for each class
    class_sample_counts = labels.value_counts().sort_index().tolist()
    # Calculate class weights: inverse of the number of samples for each class
    class_weights = 1.0 / torch.tensor(class_sample_counts, dtype=torch.float)
    # Assign a weight to each sample based on its class
    sample_weights = torch.tensor(
        [class_weights[label] for label in labels], dtype=torch.float
    )
    print(f"Sample weights: {sample_weights[:5]}")

    # Create the WeightedRandomSampler
    sampler = WeightedRandomSampler(sample_weights, len(sample_weights))

    return sampler


def smote_fn(features_df, games_df):
    # Split the data
    X_train, X_val, y_train, y_val = train_test_split(
        features_df,
        games_df["HOME_TEAM_WINS"],
        test_size=0.2,
        stratify=games_df["HOME_TEAM_WINS"],
    )

    # Apply SMOTE
    smote = SMOTE()
    X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

    # Convert to tensors
    features = torch.tensor(X_train_res.values, dtype=torch.float32)
    labels = torch.tensor(y_train_res.values, dtype=torch.long).view(-1, 1)

    return features, labels, X_val, y_val


def prepare_data(games_df):
    # Select relevant features
    features_df = games_df[
        [
            "SEASON",
            "total_score",
            "ELO_home",
            "ELO_away",
            "HOME_TEAM_ID",
            "VISITOR_TEAM_ID",
            "HOME_TEAM_L10",
            "AWAY_TEAM_L10",
        ]
    ]

    # Apply SMOTE and split data
    features, labels, X_val, y_val = smote_fn(features_df, games_df)
    dataset = TensorDataset(features, labels)

    # Split dataset into training and test sets
    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

    # Extract labels from the train dataset for balancing
    train_labels = pd.Series([label.item() for _, label in train_dataset])
    sampler = balance_dataset(train_labels)

    # Create DataLoaders
    train_dataloader = DataLoader(train_dataset, batch_size=128, sampler=sampler)
    test_dataloader = DataLoader(test_dataset, batch_size=128, shuffle=True)

    # Create DataLoader for validation set
    val_features = torch.tensor(X_val.values, dtype=torch.float32)
    val_labels = torch.tensor(y_val.values, dtype=torch.float32).view(-1, 1)
    val_dataset = TensorDataset(val_features, val_labels)
    val_dataloader = DataLoader(val_dataset, batch_size=128, shuffle=False)

    return train_dataloader, test_dataloader, val_dataloader

## Initial Model Creation

The model is created, experimenting with the different model architecture. The model will be trained on the first 80% of the data, and tested on the last 20% of the data. The model will be evaluated based on the accuracy of the predictions. 

The model will be trained on the ELO rating of each team, and the difference in ELO rating between the two teams. The model will predict the outcome of the game based on the ELO rating of each team.

In [15]:
skip_cell = True

if not skip_cell:

    class BinaryClassifier(nn.Module):
        def __init__(self):
            super(BinaryClassifier, self).__init__()
            self.fc1 = nn.Linear(in_features=8, out_features=50)
            self.fc2 = nn.Linear(50, 1)

        def forward(self, x):
            x = F.prelu(self.fc1(x), torch.tensor(0.75))
            x = torch.sigmoid(self.fc2(x))
            return x

    train_dataloader, test_dataloader, validation_dataloader = prepare_data(games_df)
    model = BinaryClassifier()
    criterion = nn.BCELoss(weight=torch.tensor(0.75))
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, "min", cooldown=5, patience=2
    )

    train_model(
        model,
        train_dataloader,
        test_dataloader,
        validation_dataloader,
        criterion,
        optimizer,
        scheduler,
        EarlyStopping(),
        num_epochs=200,
    )

else:
    print("Cell skipped")

Cell skipped


## Model v2 - relu activation function and larger hidden layers
Here I have created a model with larger hidden layers, and more hidden layers, to see if the model can learn more complex patterns in the data. The learning rate is lowered and a scheduler is added to help the model converge to a better solution. The model is trained for 100 epochs, and the loss rate is assessed to see if the model is learning at a better rate than the previous model.

In [16]:
skip_cell = True
if not skip_cell:

    class BinaryClassifierV2(nn.Module):
        def __init__(self):
            super(BinaryClassifierV2, self).__init__()
            self.fc1 = nn.Linear(in_features=8, out_features=1000)
            self.fc2 = nn.Linear(1000, 50)
            self.fc3 = nn.Linear(50, 1)

        def forward(self, x):
            x = F.relu(self.fc1(x))
            x = F.relu(self.fc2(x))
            x = torch.sigmoid(self.fc3(x))
            return x

    model = BinaryClassifierV2()
    train_dataloader, test_dataloader, validation_dataloader = prepare_data(games_df)
    criterion = nn.BCELoss(weight=torch.tensor(0.75))
    optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, "min", cooldown=5, patience=2
    )
    train_model(
        model,
        train_dataloader,
        test_dataloader,
        validation_dataloader,
        criterion,
        optimizer,
        scheduler,
        EarlyStopping(),
        num_epochs=200,
    )
else:
    print("Cell skipped")

Cell skipped


## Model v3.1 - More model information

In [17]:
skip_cell = True
if not skip_cell:

    class BinaryClassifierV3(nn.Module):
        def __init__(self):
            super(BinaryClassifierV3, self).__init__()
            self.fc1 = nn.Linear(in_features=8, out_features=1000)
            self.fc2 = nn.Linear(1000, 50)
            self.fc3 = nn.Linear(50, 1)

        def forward(self, x):
            x = F.relu(self.fc1(x))
            x = F.relu(self.fc2(x))
            x = torch.sigmoid(self.fc3(x))
            return x

    model = BinaryClassifierV3()
    train_dataloader, test_dataloader, validation_dataloader = prepare_data(games_df)
    criterion = nn.BCELoss(weight=torch.tensor(0.75))
    optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, "min", cooldown=5, patience=2
    )
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, "min", cooldown=5, patience=2
    )

    train_model(
        model,
        train_dataloader,
        test_dataloader,
        validation_dataloader,
        criterion,
        optimizer,
        scheduler,
        EarlyStopping(),
        num_epochs=200,
    )
else:
    print("Cell skipped")

Cell skipped


## Third Model - Adding Dropout Layers

Here I have added dropout layers to the model to help prevent overfitting. The dropout rate is set to 0.5, increasing layer count to 7 and increasing the number of neurons in each layer. The model is evaluated based on the loss rate, and the accuracy of the predictions.

In [18]:
skip_cell = True
if not skip_cell:

    class BinaryClassifierDropout(nn.Module):
        def __init__(self):
            super(BinaryClassifierDropout, self).__init__()
            self.fc1 = nn.Linear(in_features=8, out_features=1000)
            self.dropout1 = nn.Dropout(p=0.5)
            self.fc2 = nn.Linear(1000, 1000)
            self.dropout2 = nn.Dropout(p=0.5)
            self.fc3 = nn.Linear(1000, 1000)
            self.dropout3 = nn.Dropout(p=0.5)
            self.fc4 = nn.Linear(1000, 1000)
            self.dropout4 = nn.Dropout(p=0.5)
            self.fc5 = nn.Linear(1000, 1000)
            self.dropout5 = nn.Dropout(p=0.5)
            self.fc6 = nn.Linear(1000, 500)
            self.dropout6 = nn.Dropout(p=0.5)
            self.fc7 = nn.Linear(500, 1)

        def forward(self, x):
            x = F.relu(self.fc1(x))
            x = self.dropout1(x)
            x = F.relu(self.fc2(x))
            x = self.dropout2(x)
            x = F.relu(self.fc3(x))
            x = self.dropout3(x)
            x = F.relu(self.fc4(x))
            x = self.dropout4(x)
            x = F.relu(self.fc5(x))
            x = self.dropout5(x)
            x = F.relu(self.fc6(x))
            x = self.dropout6(x)
            x = torch.sigmoid(self.fc7(x))
            return x

    train_dataloader, test_dataloader, validation_dataloader = prepare_data(games_df)
    class_weights = [1.0, 1.0]
    model = BinaryClassifierDropout()
    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([class_weights[1]]))
    optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, "min", cooldown=5, patience=2
    )

    train_model(
        model,
        train_dataloader,
        test_dataloader,
        validation_dataloader,
        criterion,
        optimizer,
        scheduler,
        EarlyStopping(),
        num_epochs=200,
    )
else:
    print("Cell skipped")

Cell skipped


## Fourth Model - Adding Batch Normalization 


In [19]:
skip_cell = True

if not skip_cell:

    class BinaryClassifierDropout(nn.Module):
        def __init__(self):
            super(BinaryClassifierDropout, self).__init__()
            self.fc1 = nn.Linear(in_features=8, out_features=1000)
            self.dropout1 = nn.Dropout(p=0.5)
            self.fc2 = nn.Linear(1000, 500)
            self.dropout2 = nn.Dropout(p=0.2)
            self.fc3 = nn.Linear(500, 500)
            self.dropout3 = nn.Dropout(p=0.5)
            self.fc4 = nn.Linear(500, 500)
            self.dropout4 = nn.Dropout(p=0.7)
            self.fc5 = nn.Linear(500, 500)
            self.dropout5 = nn.Dropout(p=0.2)
            self.fc6 = nn.Linear(500, 500)
            self.dropout6 = nn.Dropout(p=0.5)
            self.fc7 = nn.Linear(500, 1)

        def forward(self, x):
            x = F.leaky_relu(self.fc1(x))
            x = self.dropout1(x)
            x = F.leaky_relu(self.fc2(x))
            x = self.dropout2(x)
            x = F.leaky_relu(self.fc3(x))
            x = self.dropout3(x)
            x = F.leaky_relu(self.fc4(x))
            x = self.dropout4(x)
            x = F.leaky_relu(self.fc5(x))
            x = self.dropout5(x)
            x = F.leaky_relu(self.fc6(x))
            x = self.dropout6(x)
            x = torch.sigmoid(self.fc7(x))
            return x

    model = BinaryClassifierDropout()
    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([class_weights[1]]))
    optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, "min", cooldown=5, patience=2
    )

    train_model(
        model,
        train_dataloader,
        test_dataloader,
        criterion,
        optimizer,
        EarlyStopping(),
        num_epochs=200,
    )
else:
    print("Cell skipped")

Cell skipped


## Drop out model v2 - Altering dropout layers with different dropout rates

In [20]:
skip_cell = True

if not skip_cell:

    class BinaryClassifierDropoutV2(nn.Module):
        def __init__(self):
            super(BinaryClassifierDropoutV2, self).__init__()
            self.fc1 = nn.Linear(in_features=8, out_features=1000)
            self.dropout1 = nn.Dropout(p=0.5)
            self.fc2 = nn.Linear(1000, 500)
            self.dropout2 = nn.Dropout(p=0.2)
            self.fc3 = nn.Linear(500, 500)
            self.dropout3 = nn.Dropout(p=0.5)
            self.fc4 = nn.Linear(500, 500)
            self.dropout4 = nn.Dropout(p=0.7)
            self.fc5 = nn.Linear(500, 500)
            self.dropout5 = nn.Dropout(p=0.2)
            self.fc6 = nn.Linear(500, 500)
            self.dropout6 = nn.Dropout(p=0.5)
            self.fc7 = nn.Linear(500, 1)

        def forward(self, x):
            x = self.fc1(x)
            x = self.dropout1(x)
            x = F.leaky_relu(self.fc2(x))
            x = self.dropout2(x)
            x = F.leaky_relu(self.fc3(x))
            x = self.dropout3(x)
            x = F.leaky_relu(self.fc4(x))
            x = self.dropout4(x)
            x = F.leaky_relu(self.fc5(x))
            x = self.dropout5(x)
            x = F.leaky_relu(self.fc6(x))
            x = self.dropout6(x)
            x = torch.sigmoid(self.fc7(x))
            return x

    model = BinaryClassifierDropoutV2()
    train_dataloader, test_dataloader, validation_dataloader = prepare_data(games_df)
    class_weights = [1.0, 1.0]
    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([class_weights[1]]))
    optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)

    # Initialize early stopping
    early_stopping = EarlyStopping(patience=10, min_delta=0.001)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, "min", cooldown=5, patience=2
    )

    train_model(
        model,
        train_dataloader,
        test_dataloader,
        validation_dataloader,
        criterion,
        optimizer,
        scheduler,
        EarlyStopping(),
        num_epochs=200,
    )
else:
    print("Cell skipped")

Cell skipped


In [21]:
skip_cell = True

if not skip_cell:

    class BinaryClassifierDropoutV2(nn.Module):
        def __init__(self):
            super(BinaryClassifierDropoutV2, self).__init__()
            self.fc1 = nn.Linear(in_features=8, out_features=500)
            self.dropout1 = nn.Dropout(p=0.5)
            self.fc2 = nn.Linear(500, 64)
            self.dropout2 = nn.Dropout(p=0.2)
            self.fc3 = nn.Linear(64, 128)
            self.dropout3 = nn.Dropout(p=0.5)
            self.fc4 = nn.Linear(64, 128)
            self.dropout4 = nn.Dropout(p=0.7)
            self.fc5 = nn.Linear(64, 128)
            self.dropout5 = nn.Dropout(p=0.2)
            self.fc6 = nn.Linear(64, 128)
            self.dropout6 = nn.Dropout(p=0.5)
            self.fc7 = nn.Linear(64, 1)

        def forward(self, x):
            x = F.relu(self.fc1(x))
            # print(f"FC1{x.shape}")
            x = self.dropout1(x)
            # print(f"DO1{x.shape}")
            x = F.relu(self.fc2(x))
            # print(f"FC2{x.shape}")
            x = self.dropout2(x)
            # print(f"DO2{x.shape}")
            x = F.glu(self.fc3(x))
            # print(f"FC3 {x.shape}")
            x = self.dropout3(x)
            # print(f"DO3 {x.shape}")
            x = F.glu(self.fc4(x))
            # print(f"FC4 {x.shape}")
            x = self.dropout4(x)
            # print(f"DO4 {x.shape}")
            x = F.glu(self.fc5(x))
            # print(f"FC5 {x.shape}")
            x = self.dropout5(x)
            # print(f"DO5 {x.shape}")
            x = F.glu(self.fc6(x))
            # print(f"FC6 {x.shape}")
            x = self.dropout6(x)
            # print(f"DO6 {x.shape}")
            x = torch.sigmoid(self.fc7(x))
            return x

    model = BinaryClassifierDropoutV2()
    train_dataloader, test_dataloader, validation_dataloader = prepare_data(games_df)
    class_weights = [1.0, 1.0]
    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([class_weights[1]]))
    optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)

    # Initialize early stopping
    early_stopping = EarlyStopping(patience=10, min_delta=0.001)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, "min", cooldown=5, patience=2
    )

    train_model(
        model,
        train_dataloader,
        test_dataloader,
        validation_dataloader,
        criterion,
        optimizer,
        scheduler,
        EarlyStopping(50, 0.001),
        num_epochs=200,
    )
else:
    print("Cell skipped")

Cell skipped


In [22]:
skip_cell = True

if not skip_cell:

    class BinaryClassifierDropoutV2(nn.Module):
        def __init__(self):
            super(BinaryClassifierDropoutV2, self).__init__()
            self.fc1 = nn.Linear(in_features=8, out_features=64)
            self.dropout1 = nn.Dropout(p=0.5)
            self.fc2 = nn.Linear(64, 32)
            self.dropout2 = nn.Dropout(p=0.2)
            self.fc3 = nn.Linear(32, 64)
            self.dropout3 = nn.Dropout(p=0.5)
            self.fc4 = nn.Linear(32, 64)
            self.dropout4 = nn.Dropout(p=0.7)
            self.fc5 = nn.Linear(32, 64)
            self.dropout5 = nn.Dropout(p=0.2)
            self.fc6 = nn.Linear(32, 64)
            self.dropout6 = nn.Dropout(p=0.5)
            self.fc7 = nn.Linear(32, 64)
            self.dropout7 = nn.Dropout(p=0.7)
            self.fc8 = nn.Linear(32, 64)
            self.dropout8 = nn.Dropout(p=0.2)
            self.fc9 = nn.Linear(32, 64)
            self.dropout9 = nn.Dropout(p=0.5)
            self.fc10 = nn.Linear(32, 1)

        def forward(self, x):
            x = F.relu(self.fc1(x))
            # print(f"FC1{x.shape}")
            x = self.dropout1(x)
            # print(f"DO1{x.shape}")
            x = F.relu(self.fc2(x))
            # print(f"FC2{x.shape}")
            x = self.dropout2(x)
            # print(f"DO2{x.shape}")
            x = F.glu(self.fc3(x))
            # print(f"FC3 {x.shape}")
            x = self.dropout3(x)
            # print(f"DO3 {x.shape}")
            x = F.glu(self.fc4(x))
            # print(f"FC4 {x.shape}")
            x = self.dropout4(x)
            # print(f"DO4 {x.shape}")
            x = F.glu(self.fc5(x))
            # print(f"FC5 {x.shape}")
            x = self.dropout5(x)
            # print(f"DO5 {x.shape}")
            x = F.glu(self.fc6(x))
            # print(f"FC6 {x.shape}")
            x = self.dropout6(x)
            # print(f"DO6 {x.shape}")
            x = F.glu(self.fc7(x))
            # print(f"FC7 {x.shape}")
            x = self.dropout7(x)
            # print(f"DO7 {x.shape}")
            x = F.glu(self.fc8(x))
            # print(f"FC8 {x.shape}")
            x = self.dropout8(x)
            # print(f"DO8 {x.shape}")
            x = F.glu(self.fc9(x))
            # print(f"FC9 {x.shape}")
            x = self.dropout9(x)
            # print(f"DO9 {x.shape}")
            x = torch.sigmoid(self.fc10(x))
            return x

    model = BinaryClassifierDropoutV2()
    train_dataloader, test_dataloader, validation_dataloader = prepare_data(games_df)
    class_weights = [1.0, 1.0]
    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([class_weights[1]]))
    optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)

    # Initialize early stopping
    early_stopping = EarlyStopping(patience=10, min_delta=0.001)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, "min", cooldown=5, patience=2
    )

    train_model(
        model,
        train_dataloader,
        test_dataloader,
        validation_dataloader,
        criterion,
        optimizer,
        scheduler,
        EarlyStopping(50, 0.001),
        num_epochs=200,
    )
else:
    print("Cell skipped")

Sample weights: tensor([2.4817e-05, 2.4817e-05, 2.4817e-05, 2.4817e-05, 2.4817e-05])
Epoch 5, Loss: 0.7716470956802368
Train Accuracy: 0.5003596943838063
Train Precision: 0.4939476171601061
Train Recall: 0.40615754901469187
Train F1-Score: 0.44577136331741946
Train ROC-AUC: 0.4984946302371942
Train Confusion Matrix:
[[24141 16597]
 [23686 16200]]
Validation Accuracy: 0.5892147233525092
Validation Precision: 0.5892147233525092
Validation Recall: 1.0
Validation F1-Score: 0.7415168192118662
Validation ROC-AUC: 0.5
Validation Confusion Matrix:
[[    0  8783]
 [    0 12598]]
Epoch 10, Loss: 0.7959917783737183
Train Accuracy: 0.49967751538003574
Train Precision: 0.4985958090300281
Train Recall: 0.4014212239421572
Train F1-Score: 0.44476256022023397
Train ROC-AUC: 0.49960484614780154
Train Confusion Matrix:
[[24130 16247]
 [24091 16156]]
Validation Accuracy: 0.5892147233525092
Validation Precision: 0.5892147233525092
Validation Recall: 1.0
Validation F1-Score: 0.7415168192118662
Validation RO

In [None]:
skip_cell = False

if not skip_cell:

    class BinaryClassifierDropoutV2(nn.Module):
        def __init__(self):
            super(BinaryClassifierDropoutV2, self).__init__()
            self.fc1 = nn.Linear(in_features=8, out_features=64)
            self.dropout1 = nn.Dropout(p=0.5)
            self.fc2 = nn.Linear(64, 32)
            self.dropout2 = nn.Dropout(p=0.2)
            self.fc3 = nn.Linear(32, 64)
            self.dropout3 = nn.Dropout(p=0.5)
            self.fc4 = nn.Linear(32, 64)
            self.dropout4 = nn.Dropout(p=0.7)
            self.fc5 = nn.Linear(32, 64)
            self.dropout5 = nn.Dropout(p=0.2)
            self.fc6 = nn.Linear(32, 64)
            self.dropout6 = nn.Dropout(p=0.5)
            self.fc7 = nn.Linear(32, 64)
            self.dropout7 = nn.Dropout(p=0.7)
            self.fc8 = nn.Linear(32, 16)
            self.dropout8 = nn.Dropout(p=0.2)
            self.fc9 = nn.Linear(16, 8)
            self.dropout9 = nn.Dropout(p=0.5)
            self.fc10 = nn.Linear(8, 1)

        def forward(self, x):
            x = F.relu(self.fc1(x))
            # print(f"FC1{x.shape}")
            x = self.dropout1(x)
            # print(f"DO1{x.shape}")
            x = F.relu(self.fc2(x))
            # print(f"FC2{x.shape}")
            x = self.dropout2(x)
            # print(f"DO2{x.shape}")
            x = F.glu(self.fc3(x))
            # print(f"FC3 {x.shape}")
            x = self.dropout3(x)
            # print(f"DO3 {x.shape}")
            x = F.glu(self.fc4(x))
            # print(f"FC4 {x.shape}")
            x = self.dropout4(x)
            # print(f"DO4 {x.shape}")
            x = F.glu(self.fc5(x))
            # print(f"FC5 {x.shape}")
            x = self.dropout5(x)
            # print(f"DO5 {x.shape}")
            x = F.glu(self.fc6(x))
            # print(f"FC6 {x.shape}")
            x = self.dropout6(x)
            # print(f"DO6 {x.shape}")
            x = F.glu(self.fc7(x))
            # print(f"FC7 {x.shape}")
            x = self.dropout7(x)
            # print(f"DO7 {x.shape}")
            x = F.relu(self.fc8(x))
            # print(f"FC8 {x.shape}")
            x = self.dropout8(x)
            # print(f"DO8 {x.shape}")
            x = F.relu(self.fc9(x))
            # print(f"FC9 {x.shape}")
            x = self.dropout9(x)
            # print(f"DO9 {x.shape}")
            x = torch.sigmoid(self.fc10(x))
            return x

    model = BinaryClassifierDropoutV2()
    train_dataloader, test_dataloader, validation_dataloader = prepare_data(games_df)
    class_weights = [0.75, 1.0]
    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([class_weights[1]]))
    optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)

    # Initialize early stopping
    early_stopping = EarlyStopping(patience=10, min_delta=0.001)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, "min", cooldown=5, patience=2
    )

    train_model(
        model,
        train_dataloader,
        test_dataloader,
        validation_dataloader,
        criterion,
        optimizer,
        scheduler,
        EarlyStopping(50, 0.001),
        num_epochs=200,
    )
else:
    print("Cell skipped")

Below is a rough outline of the final recommender system that I will be building in this notebook. It takes in the schedule for upcoming games, and outputs the recommended games that will be the closest and most exciting to watch based on the historical data from NBA games.

In [23]:
# from sklearn.neighbors import NearestNeighbors
# from sklearn.metrics import f1_score

# # Load the trained model
# model = NearestNeighbors(n_neighbors=5, metric='cosine')
# model.load('nba_game_recommender.pkl')

# # Load the schedule for the upcoming night's games
# schedule = pd.read_csv('nba_schedule.csv')

# # Extract relevant features for each game
# features = []
# for index, row in schedule.iterrows():
#     game_id = row['Game_ID']
#     home_team = row['Home_Team']
#     away_team = row['Away_Team']
#     # Extract features from historical data
#     feature_vector = get_features(game_id, home_team, away_team)
#     features.append(feature_vector)

# # Use the model to predict the most exciting game(s) to watch
# distances, indices = model.kneighbors(features)
# recommended_games = []
# for i, dist in enumerate(distances):
#     if dist < 0.5:  # Arbitrarily set a threshold