In [289]:
import pandas as pd
import numpy as np
import os
from datetime import datetime


# change the display width to see all columns
pd.set_option('display.width', 500)

games_df = pd.read_csv('./data/games.csv', low_memory=False)
games_details_df = pd.read_csv('./data/games_details.csv', low_memory=False)
# players_df = pd.read_csv('./data/players.csv')
ranking_df = pd.read_csv('./data/ranking.csv')
teams_df = pd.read_csv('./data/teams.csv')

Clean the game dataframe and group by season

In [290]:
games_df = games_df.drop(columns=['GAME_STATUS_TEXT'])

games_df.groupby(['SEASON'])
games_df = games_df.sort_values('GAME_DATE_EST')

# games_df = games_df.dropna()

print(games_df.head(200))

      GAME_DATE_EST   GAME_ID  HOME_TEAM_ID  VISITOR_TEAM_ID  SEASON  TEAM_ID_home  PTS_home  FG_PCT_home  FT_PCT_home  FG3_PCT_home  AST_home  REB_home  TEAM_ID_away  PTS_away  FG_PCT_away  FT_PCT_away  FG3_PCT_away  AST_away  REB_away  HOME_TEAM_WINS
19288    2003-10-05  10300001    1610612762       1610612742    2003    1610612762      90.0        0.457        0.735         0.143      23.0      41.0    1610612742      85.0        0.447        0.500         0.250      20.0      38.0               1
19287    2003-10-06  10300002    1610612763       1610612749    2003    1610612763     105.0        0.494        0.618         0.267      25.0      48.0    1610612749      94.0        0.427        0.700         0.154      20.0      43.0               1
19280    2003-10-07  10300010    1610612764       1610612752    2003    1610612764     104.0        0.506        0.677         0.455      26.0      45.0    1610612752      86.0        0.380        0.852         0.188      19.0      37.0     

Below we will create the ELO rating for each team, as a default the team will start with 1500 points. Each time a team wins or loses a game, the ELO rating will be updated. The ELO rating will be updated based on the following formula:



Updated Team ELO = Team ELO + k * (Team Expected Outcome - Team Actual Outcome)

In [291]:
initial_elo = 1500
k_factor = 15
team_elos = {team_id: initial_elo for team_id in pd.concat([games_df['HOME_TEAM_ID'], games_df['VISITOR_TEAM_ID']]).unique()}


def expected_outcome(home_elo, away_elo):
    return 1 / (1 + 10 ** ((away_elo - home_elo) / 400))

def update_elo(home_elo, visitor_elo, home_win, k_factor):
    expected_home_win = expected_outcome(home_elo, visitor_elo)
    actual_home_win = 1 if home_win else 0
    new_home_elo = home_elo + k_factor * (actual_home_win - expected_home_win)
    new_visitor_elo = visitor_elo + k_factor * ((1 - actual_home_win) - (1 - expected_home_win))
    # new_home_elo = new_home_elo.astype(float)
    # new_visitor_elo = new_visitor_elo.astype(float)
    return new_home_elo, new_visitor_elo


After iterating, team_elos will have the updated Elo ratings for each team

In [292]:
games_df = games_df.sort_values('GAME_DATE_EST', ascending=False)
games_df['ELO_home'] = 0
games_df['ELO_away'] = 0

for index, row in games_df.iterrows():
    home_team, away_team = row['HOME_TEAM_ID'], row['VISITOR_TEAM_ID']
    home_elo, away_elo = team_elos[home_team], team_elos[away_team]
    home_win = row['HOME_TEAM_WINS']
    new_home_elo, new_away_elo = update_elo(home_elo, away_elo, home_win, k_factor)
    games_df.at[index, 'ELO_home'] = round(new_home_elo)
    games_df.at[index, 'ELO_away'] = round(new_away_elo)
    
    team_elos[home_team], team_elos[away_team] = new_home_elo, new_away_elo
    

  games_df.at[index, 'ELO_home'] = new_home_elo
  games_df.at[index, 'ELO_away'] = new_away_elo


Group each Season into its own dataframe, this will help for calculating the overall ELO rating for each team in a season. I can then also track the ELO rating for each team over time, and see how it changes over the course of the dataset.

In [293]:
seasons_df = games_df.groupby('SEASON')
seasons_dict = {}
for season, season_df in seasons_df:
    seasons_dict[season] = season_df

# Iterate over the dictionary and calculate the percentage of missing values
for season, season_df in seasons_dict.items():
    missing_values_percent = season_df.isnull().sum() * 100 / len(season_df)
    # print(f"Percentage of missing values for season {season}:")
    print(missing_values_percent)

GAME_DATE_EST      0.000000
GAME_ID            0.000000
HOME_TEAM_ID       0.000000
VISITOR_TEAM_ID    0.000000
SEASON             0.000000
TEAM_ID_home       0.000000
PTS_home           7.148014
FG_PCT_home        7.148014
FT_PCT_home        7.148014
FG3_PCT_home       7.148014
AST_home           7.148014
REB_home           7.148014
TEAM_ID_away       0.000000
PTS_away           7.148014
FG_PCT_away        7.148014
FT_PCT_away        7.148014
FG3_PCT_away       7.148014
AST_away           7.148014
REB_away           7.148014
HOME_TEAM_WINS     0.000000
ELO_home           0.000000
ELO_away           0.000000
dtype: float64
GAME_DATE_EST      0.0
GAME_ID            0.0
HOME_TEAM_ID       0.0
VISITOR_TEAM_ID    0.0
SEASON             0.0
TEAM_ID_home       0.0
PTS_home           0.0
FG_PCT_home        0.0
FT_PCT_home        0.0
FG3_PCT_home       0.0
AST_home           0.0
REB_home           0.0
TEAM_ID_away       0.0
PTS_away           0.0
FG_PCT_away        0.0
FT_PCT_away        0.0
F

Here each team's ELO will be saved in a dataframe then merged with the team dataframe to get the ELO rating for each team in the dataset. 

In [294]:
team_elos_df = pd.DataFrame.from_dict(team_elos, orient="index", columns=["ELO"])
team_elos_df = team_elos_df.merge(teams_df, left_index=True, right_on="TEAM_ID")
team_elos_df = team_elos_df.sort_values("ELO", ascending=False)

In [295]:
# Create margin of victory column
games_df["MOV"] = games_df["PTS_home"] - games_df["PTS_away"]
close_games = games_df[(games_df["MOV"] > -5) & (games_df["MOV"] < 5)]
print("Number of close games:", close_games.shape[0])

close_game_prob = close_games["MOV"].count() / games_df["MOV"].count() * 100
print("Probability of a close game:", close_game_prob)

Number of close games: 5631
Probability of a close game: 21.20744200060259


In [296]:
# Create high scoring game column
games_df["total_score"] = games_df["PTS_home"] + games_df["PTS_away"]
high_scoring_games = games_df[games_df["total_score"] > 220]
print("Number of high scoring games:", high_scoring_games.shape[0])

high_scoring_game_prob = len(high_scoring_games) / len(games_df) * 100
print("Probability of a high scoring game:", high_scoring_game_prob)

Number of high scoring games: 6258
Probability of a high scoring game: 23.48129526096582


The model is created, experimenting with the different model architecture. The model will be trained on the first 80% of the data, and tested on the last 20% of the data. The model will be evaluated based on the accuracy of the predictions. 

The model will be trained on the ELO rating of each team, and the difference in ELO rating between the two teams. The model will predict the outcome of the game based on the ELO rating of each team.

In [297]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, random_split


games_df = games_df.dropna(ignore_index=True)
features_df = games_df[
        [
            # "PTS_home",
            # "PTS_away",
            # "FG_PCT_home",
            # "FT_PCT_home",
            # "FG3_PCT_home",
            # "AST_home",
            # "REB_home",
            # "FG_PCT_away",
            # "FT_PCT_away",
            # "FG3_PCT_away",
            # "AST_away",
            # "REB_away",
            "SEASON",
            # "MOV",
            "total_score",
            "ELO_home",
            "ELO_away",
            "HOME_TEAM_ID",
            "VISITOR_TEAM_ID",]
    ]


class BinaryClassifier(nn.Module):
    def __init__(self):
        super(BinaryClassifier, self).__init__()
        self.fc1 = nn.Linear(in_features=6, out_features=50)
        self.fc2 = nn.Linear(50, 1)

    def forward(self, x):
        x = F.prelu(self.fc1(x), torch.tensor(0.75))
        x = torch.sigmoid(self.fc2(x))
        return x


features = torch.tensor(
    features_df.values,
    dtype=torch.float32,
)
labels = torch.tensor(games_df["HOME_TEAM_WINS"])

dataset = TensorDataset(features, labels)

train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=128)

model = BinaryClassifier()
criterion = nn.BCELoss(weight=torch.tensor(0.75))
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

num_epochs = 10
losses = []
for epoch in range(num_epochs):
    for inputs, targets in train_dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        outputs = outputs.squeeze()
        loss = criterion(outputs, targets.float())
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

Epoch 1, Loss: 30.991735458374023
Epoch 2, Loss: 24.173553466796875
Epoch 3, Loss: 32.23140335083008
Epoch 4, Loss: 29.752065658569336
Epoch 5, Loss: 26.033058166503906
Epoch 6, Loss: 29.752065658569336
Epoch 7, Loss: 29.752065658569336
Epoch 8, Loss: 30.991735458374023
Epoch 9, Loss: 27.892562866210938
Epoch 10, Loss: 25.413223266601562


Here I have created a model with larger hidden layers, and more hidden layers, to see if the model can learn more complex patterns in the data. The learning rate is lowered and a scheduler is added to help the model converge to a better solution. The model is trained for 100 epochs, and the loss rate is assessed to see if the model is learning at a better rate than the previous model.

In [298]:
from numpy import average


features_df = games_df[
        [
            # "PTS_home",
            # "PTS_away",
            # "FG_PCT_home",
            # "FT_PCT_home",
            # "FG3_PCT_home",
            # "AST_home",
            # "REB_home",
            # "FG_PCT_away",
            # "FT_PCT_away",
            # "FG3_PCT_away",
            # "AST_away",
            # "REB_away",
            "SEASON",
            # "MOV",
            "total_score",
            "ELO_home",
            "ELO_away",
            "HOME_TEAM_ID",
            "VISITOR_TEAM_ID",]
    ]

class BinaryClassifier(nn.Module):
    def __init__(self):
        super(BinaryClassifier, self).__init__()
        self.fc1 = nn.Linear(in_features=6, out_features=1000)
        self.fc2 = nn.Linear(1000, 50)
        self.fc3 = nn.Linear(50, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        return x


features = torch.tensor(
    features_df.values,
    dtype=torch.float32,
)
labels = torch.tensor(games_df["HOME_TEAM_WINS"])

dataset = TensorDataset(features, labels)

train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=128)

model = BinaryClassifier()
criterion = nn.BCELoss(weight=torch.tensor(0.75))
optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', cooldown=5, patience=2)

num_epochs = 100
losses = []
for epoch in range(num_epochs):
    for inputs, targets in train_dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        outputs = outputs.squeeze()
        loss = criterion(outputs, targets.float())
        loss.backward()
        optimizer.step()
        scheduler.step(loss)
        losses.append(loss.item())
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")
    
# copy to each cell for saving the model to a file
average_loss = round(average(losses), 3)
model_to_string = (model.__class__.__name__)
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
if os.path.exists(f'./models/{model_to_string}') is False:
    os.makedirs(f'./models/{model_to_string}')
torch.save(model.state_dict(), f=f'./models/{model_to_string}/{average_loss}_on_{timestamp}_model.pth')

Epoch 1, Loss: 45.867767333984375
Epoch 2, Loss: 47.10743713378906
Epoch 3, Loss: 47.727272033691406
Epoch 4, Loss: 47.10743713378906
Epoch 5, Loss: 46.48760223388672
Epoch 6, Loss: 46.48760223388672
Epoch 7, Loss: 48.966941833496094
Epoch 8, Loss: 44.008262634277344
Epoch 9, Loss: 44.62809753417969
Epoch 10, Loss: 40.90909194946289
Epoch 11, Loss: 46.48760223388672
Epoch 12, Loss: 46.48760223388672
Epoch 13, Loss: 43.388431549072266
Epoch 14, Loss: 46.48760223388672
Epoch 15, Loss: 42.76859664916992
Epoch 16, Loss: 44.62809753417969
Epoch 17, Loss: 47.10743713378906
Epoch 18, Loss: 45.24793243408203
Epoch 19, Loss: 44.008262634277344
Epoch 20, Loss: 43.388431549072266
Epoch 21, Loss: 41.528926849365234
Epoch 22, Loss: 46.48760223388672
Epoch 23, Loss: 49.58677673339844
Epoch 24, Loss: 46.48760223388672
Epoch 25, Loss: 45.867767333984375
Epoch 26, Loss: 46.48760223388672
Epoch 27, Loss: 45.867767333984375
Epoch 28, Loss: 49.58677673339844
Epoch 29, Loss: 42.14876174926758
Epoch 30, Los

In [299]:
from numpy import average


features_df = games_df[
        [
            "SEASON",
            "total_score",
            "ELO_home",
            "ELO_away",
            "HOME_TEAM_ID",
            "VISITOR_TEAM_ID",]
    ]

class BinaryClassifier(nn.Module):
    def __init__(self):
        super(BinaryClassifier, self).__init__()
        self.fc1 = nn.Linear(in_features=6, out_features=1000)
        self.fc2 = nn.Linear(1000, 50)
        self.fc3 = nn.Linear(50, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        return x


features = torch.tensor(
    features_df.values,
    dtype=torch.float32,
)
labels = torch.tensor(games_df["HOME_TEAM_WINS"])

dataset = TensorDataset(features, labels)

train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=128)

model = BinaryClassifier()
criterion = nn.BCELoss(weight=torch.tensor(0.75))
optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', cooldown=5, patience=2)

num_epochs = 100
losses = []
for epoch in range(num_epochs):
    for inputs, targets in train_dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        outputs = outputs.squeeze()
        loss = criterion(outputs, targets.float())
        loss.backward()
        optimizer.step()
        scheduler.step(loss)
        losses.append(loss.item())
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")
    
# copy to each cell for saving the model to a file
average_loss = round(average(losses), 3)
model_to_string = (model.__class__.__name__)
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
if os.path.exists(f'./models/{model_to_string}') is False:
    os.makedirs(f'./models/{model_to_string}')
torch.save(model.state_dict(), f=f'./models/{model_to_string}/{average_loss}_on_{timestamp}_model.pth')

Epoch 1, Loss: 29.752065658569336
Epoch 2, Loss: 29.752065658569336
Epoch 3, Loss: 31.611570358276367
Epoch 4, Loss: 34.71074295043945
Epoch 5, Loss: 32.23140335083008
Epoch 6, Loss: 28.51239585876465
Epoch 7, Loss: 27.892562866210938
Epoch 8, Loss: 30.991735458374023
Epoch 9, Loss: 33.471073150634766
Epoch 10, Loss: 30.37190055847168
Epoch 11, Loss: 34.09090805053711
Epoch 12, Loss: 27.892562866210938
Epoch 13, Loss: 35.3305778503418
Epoch 14, Loss: 30.37190055847168
Epoch 15, Loss: 32.85123825073242
Epoch 16, Loss: 39.6694221496582
Epoch 17, Loss: 34.71074295043945
Epoch 18, Loss: 29.752065658569336
Epoch 19, Loss: 36.570247650146484
Epoch 20, Loss: 34.09090805053711
Epoch 21, Loss: 31.611570358276367
Epoch 22, Loss: 32.23140335083008
Epoch 23, Loss: 26.65289306640625
Epoch 24, Loss: 27.892562866210938
Epoch 25, Loss: 26.65289306640625
Epoch 26, Loss: 24.173553466796875
Epoch 27, Loss: 35.3305778503418
Epoch 28, Loss: 34.71074295043945
Epoch 29, Loss: 30.991735458374023
Epoch 30, Los

In [300]:
# import seaborn as sns
# import matplotlib.pyplot as plt

# fig = plt.figure()
# fig, ax = plt.subplots()
# fig, axs = plt.subplots(2, 1)
# fig, axs = plt.subplot_mosaic([['close_games', 'high_scoring_games']])
# high_scoring_games = np.arange(high_scoring_game_prob)
# close_games = np.arange(close_game_prob)
# axs.plot(high_scoring_games, close_games, label='High Scoring Games')


Below is a rough outline of the final recommender system that I will be building in this notebook. It takes in the schedule for upcoming games, and outputs the recommended games that will be the closest and most exciting to watch based on the historical data from NBA games.

In [301]:
# from sklearn.neighbors import NearestNeighbors
# from sklearn.metrics import f1_score

# # Load the trained model
# model = NearestNeighbors(n_neighbors=5, metric='cosine')
# model.load('nba_game_recommender.pkl')

# # Load the schedule for the upcoming night's games
# schedule = pd.read_csv('nba_schedule.csv')

# # Extract relevant features for each game
# features = []
# for index, row in schedule.iterrows():
#     game_id = row['Game_ID']
#     home_team = row['Home_Team']
#     away_team = row['Away_Team']
#     # Extract features from historical data
#     feature_vector = get_features(game_id, home_team, away_team)
#     features.append(feature_vector)

# # Use the model to predict the most exciting game(s) to watch
# distances, indices = model.kneighbors(features)
# recommended_games = []
# for i, dist in enumerate(distances):
#     if dist < 0.5:  # Arbitrarily set a threshold