In [71]:
#import dependencies
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import LSTM, Dense, InputLayer, Dropout, BatchNormalization
from keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score
from keras.regularizers import l2

# Problem Introduction
  
Predict outcomes of NBA games

In [8]:
#create df of preprocessed data - game data from 2004-2024
df = pd.read_csv('nbaDatabasePreProcessed2004-2024.csv')
df

Unnamed: 0,date,homeTeam,homeTeam_id,homeTeam_win%,homeTeam_wins,homeTeam_losses,homeTeam_points_total,homeTeam_points_q1,homeTeam_points_q2,homeTeam_points_q3,...,awayTeam_D_Reb,awayTeam_Total_Reb,awayTeam_Ast,awayTeam_Stl,awayTeam_Blk,awayTeam_TO,awayTeam_PF,winner,season,isPlayoffGame
0,2024-06-17,Boston Celtics,BOS,0.797980,79,20,106,28,39,19,...,28,35,18,4,4,13,20,Boston Celtics,2023-2024,True
1,2024-06-14,Dallas Mavericks,DAL,0.613861,62,39,122,34,27,31,...,27,31,18,2,5,13,19,Dallas Mavericks,2023-2024,True
2,2024-06-12,Dallas Mavericks,DAL,0.610000,61,39,99,31,20,19,...,30,36,26,4,6,9,19,Boston Celtics,2023-2024,True
3,2024-06-09,Boston Celtics,BOS,0.802083,77,19,105,25,29,29,...,34,43,21,5,3,15,17,Boston Celtics,2023-2024,True
4,2024-06-06,Boston Celtics,BOS,0.800000,76,19,107,37,26,23,...,33,43,9,8,1,11,16,Boston Celtics,2023-2024,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25726,2004-11-03,Cleveland Cavaliers,CLE,0.000000,0,0,104,21,19,27,...,36,47,24,9,5,13,28,Indiana Pacers,2004-2005,False
25727,2004-11-03,Boston Celtics,BOS,0.000000,0,0,95,22,31,30,...,27,41,20,9,2,16,26,Philadelphia 76ers,2004-2005,False
25728,2004-11-02,Los Angeles Lakers,LAL,0.000000,0,0,89,24,26,22,...,32,48,17,10,8,16,21,Los Angeles Lakers,2004-2005,False
25729,2004-11-02,Detroit Pistons,DET,0.000000,0,0,87,19,18,22,...,27,36,8,4,5,16,24,Detroit Pistons,2004-2005,False


# Proposed Solution

Use a conditional time series forecasting LSTM TensorFlow model to predict NBA games

## Experimental Setup

Create a baseline model that predicts games based on who has more wins going into the game  
Create conditional time series forecasting model to predict game outcomes  
Refer to PreProcess and Scrapes folders for gathering data

In [9]:
#predict the team who has more wins going into the game
df['predicted_winner'] = df.apply(lambda row: row['homeTeam'] if row['homeTeam_wins'] > row['awayTeam_wins'] else row['awayTeam'], axis=1)
df

Unnamed: 0,date,homeTeam,homeTeam_id,homeTeam_win%,homeTeam_wins,homeTeam_losses,homeTeam_points_total,homeTeam_points_q1,homeTeam_points_q2,homeTeam_points_q3,...,awayTeam_Total_Reb,awayTeam_Ast,awayTeam_Stl,awayTeam_Blk,awayTeam_TO,awayTeam_PF,winner,season,isPlayoffGame,predicted_winner
0,2024-06-17,Boston Celtics,BOS,0.797980,79,20,106,28,39,19,...,35,18,4,4,13,20,Boston Celtics,2023-2024,True,Boston Celtics
1,2024-06-14,Dallas Mavericks,DAL,0.613861,62,39,122,34,27,31,...,31,18,2,5,13,19,Dallas Mavericks,2023-2024,True,Boston Celtics
2,2024-06-12,Dallas Mavericks,DAL,0.610000,61,39,99,31,20,19,...,36,26,4,6,9,19,Boston Celtics,2023-2024,True,Boston Celtics
3,2024-06-09,Boston Celtics,BOS,0.802083,77,19,105,25,29,29,...,43,21,5,3,15,17,Boston Celtics,2023-2024,True,Boston Celtics
4,2024-06-06,Boston Celtics,BOS,0.800000,76,19,107,37,26,23,...,43,9,8,1,11,16,Boston Celtics,2023-2024,True,Boston Celtics
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25726,2004-11-03,Cleveland Cavaliers,CLE,0.000000,0,0,104,21,19,27,...,47,24,9,5,13,28,Indiana Pacers,2004-2005,False,Indiana Pacers
25727,2004-11-03,Boston Celtics,BOS,0.000000,0,0,95,22,31,30,...,41,20,9,2,16,26,Philadelphia 76ers,2004-2005,False,Philadelphia 76ers
25728,2004-11-02,Los Angeles Lakers,LAL,0.000000,0,0,89,24,26,22,...,48,17,10,8,16,21,Los Angeles Lakers,2004-2005,False,Denver Nuggets
25729,2004-11-02,Detroit Pistons,DET,0.000000,0,0,87,19,18,22,...,36,8,4,5,16,24,Detroit Pistons,2004-2005,False,Houston Rockets


# Baseline 

Just comparing how many wins each team has going into the game, NBA game outcomes are predicted with 65.5% success

In [10]:
#evaluate baseline model
print(accuracy_score(df['winner'], df['predicted_winner']))

0.6852434806264817


In [11]:
#maps franchises to subjectID
subjectID_dict = {
    "ATL": 1,
    "BOS": 2,
    "BKN": 3,
    "CHA": 4,
    "CHI": 5,
    "CLE": 6,
    "DAL": 7,
    "DEN": 8,
    "DET": 9,
    "GSW": 10,
    "HOU": 11,
    "IND": 12,
    "LAC": 13,
    "LAL": 14,
    "MEM": 15,
    "MIA": 16,
    "MIL": 17,
    "MIN": 18,
    "NOP": 19,
    "NYK": 20,
    "OKC": 21,
    "ORL": 22,
    "PHI": 23,
    "PHX": 24,
    "POR": 25,
    "SAC": 26,
    "SAS": 27,
    "TOR": 28,
    "UTA": 29,
    "WAS": 30,
    "NOH": 19, #New Orleans Hornets is same franchise as NOP- New Orleans Pelicans
    "NJN": 3, #New Jersey Nets relocated to Brooklyn
    "SEA": 21, #Seattle Supersonics Relocated to OKC
    "NOK": 19, #New Orleans Hornets relocated to OKC 2005-2007 - Eventually became New Orleans Pelicans
    
}

In [12]:
#map the subject IDs to a number to handle franchise moves
df["homeTeamSubject_id"] = df["homeTeam_id"].map(subjectID_dict)
df["awayTeamSubject_id"] = df["awayTeam_id"].map(subjectID_dict)

cols = df.columns.tolist()
cols.insert(cols.index("homeTeam_id") + 1, cols.pop(cols.index("homeTeamSubject_id")))
cols.insert(cols.index("awayTeam_id") + 1, cols.pop(cols.index("awayTeamSubject_id")))
df = df[cols]

df

Unnamed: 0,date,homeTeam,homeTeam_id,homeTeamSubject_id,homeTeam_win%,homeTeam_wins,homeTeam_losses,homeTeam_points_total,homeTeam_points_q1,homeTeam_points_q2,...,awayTeam_Total_Reb,awayTeam_Ast,awayTeam_Stl,awayTeam_Blk,awayTeam_TO,awayTeam_PF,winner,season,isPlayoffGame,predicted_winner
0,2024-06-17,Boston Celtics,BOS,2,0.797980,79,20,106,28,39,...,35,18,4,4,13,20,Boston Celtics,2023-2024,True,Boston Celtics
1,2024-06-14,Dallas Mavericks,DAL,7,0.613861,62,39,122,34,27,...,31,18,2,5,13,19,Dallas Mavericks,2023-2024,True,Boston Celtics
2,2024-06-12,Dallas Mavericks,DAL,7,0.610000,61,39,99,31,20,...,36,26,4,6,9,19,Boston Celtics,2023-2024,True,Boston Celtics
3,2024-06-09,Boston Celtics,BOS,2,0.802083,77,19,105,25,29,...,43,21,5,3,15,17,Boston Celtics,2023-2024,True,Boston Celtics
4,2024-06-06,Boston Celtics,BOS,2,0.800000,76,19,107,37,26,...,43,9,8,1,11,16,Boston Celtics,2023-2024,True,Boston Celtics
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25726,2004-11-03,Cleveland Cavaliers,CLE,6,0.000000,0,0,104,21,19,...,47,24,9,5,13,28,Indiana Pacers,2004-2005,False,Indiana Pacers
25727,2004-11-03,Boston Celtics,BOS,2,0.000000,0,0,95,22,31,...,41,20,9,2,16,26,Philadelphia 76ers,2004-2005,False,Philadelphia 76ers
25728,2004-11-02,Los Angeles Lakers,LAL,14,0.000000,0,0,89,24,26,...,48,17,10,8,16,21,Los Angeles Lakers,2004-2005,False,Denver Nuggets
25729,2004-11-02,Detroit Pistons,DET,9,0.000000,0,0,87,19,18,...,36,8,4,5,16,24,Detroit Pistons,2004-2005,False,Houston Rockets


In [13]:
#generate a binary winner column
df['winner_binary'] = (df['winner'] == df['awayTeam']).astype(int)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['winner_binary'] = (df['winner'] == df['awayTeam']).astype(int)


Unnamed: 0,date,homeTeam,homeTeam_id,homeTeamSubject_id,homeTeam_win%,homeTeam_wins,homeTeam_losses,homeTeam_points_total,homeTeam_points_q1,homeTeam_points_q2,...,awayTeam_Ast,awayTeam_Stl,awayTeam_Blk,awayTeam_TO,awayTeam_PF,winner,season,isPlayoffGame,predicted_winner,winner_binary
0,2024-06-17,Boston Celtics,BOS,2,0.797980,79,20,106,28,39,...,18,4,4,13,20,Boston Celtics,2023-2024,True,Boston Celtics,0
1,2024-06-14,Dallas Mavericks,DAL,7,0.613861,62,39,122,34,27,...,18,2,5,13,19,Dallas Mavericks,2023-2024,True,Boston Celtics,0
2,2024-06-12,Dallas Mavericks,DAL,7,0.610000,61,39,99,31,20,...,26,4,6,9,19,Boston Celtics,2023-2024,True,Boston Celtics,1
3,2024-06-09,Boston Celtics,BOS,2,0.802083,77,19,105,25,29,...,21,5,3,15,17,Boston Celtics,2023-2024,True,Boston Celtics,0
4,2024-06-06,Boston Celtics,BOS,2,0.800000,76,19,107,37,26,...,9,8,1,11,16,Boston Celtics,2023-2024,True,Boston Celtics,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25726,2004-11-03,Cleveland Cavaliers,CLE,6,0.000000,0,0,104,21,19,...,24,9,5,13,28,Indiana Pacers,2004-2005,False,Indiana Pacers,1
25727,2004-11-03,Boston Celtics,BOS,2,0.000000,0,0,95,22,31,...,20,9,2,16,26,Philadelphia 76ers,2004-2005,False,Philadelphia 76ers,1
25728,2004-11-02,Los Angeles Lakers,LAL,14,0.000000,0,0,89,24,26,...,17,10,8,16,21,Los Angeles Lakers,2004-2005,False,Denver Nuggets,0
25729,2004-11-02,Detroit Pistons,DET,9,0.000000,0,0,87,19,18,...,8,4,5,16,24,Detroit Pistons,2004-2005,False,Houston Rockets,0


## Conditional Time Series Forecasting - Part 1

Populate a dataframe that holds all team performances  
The df is twice the size of the original df as it holds performances by both teams from each game

In [14]:
#populate dfIDtoStat - should be double the size of df
homedf = df[['date', 'homeTeamSubject_id', 'homeTeam_points_total']]
homedf.columns = ['date', 'team_id', 'team_points_total']
awaydf = df[['date', 'awayTeamSubject_id', 'awayTeam_points_total']]
awaydf.columns = ['date', 'team_id', 'team_points_total']

dfIDtoStat = pd.concat([homedf, awaydf], ignore_index=True)
dfIDtoStat

Unnamed: 0,date,team_id,team_points_total
0,2024-06-17,2,106
1,2024-06-14,7,122
2,2024-06-12,7,99
3,2024-06-09,2,105
4,2024-06-06,2,107
...,...,...,...
51457,2004-11-03,12,109
51458,2004-11-03,23,98
51459,2004-11-02,8,78
51460,2004-11-02,11,79


## Conditional Time Series Forecasting - Part 2

Function df_to_X_y iterates through all games held in df - the database game details from 2004-2024  
Limits the window to include only games that have been played before the game to be predicted  
Confirms that window_size number of games have been played by each team  
Limits the window to the df to games played by each team before the date of the game to be predicted  
Creates X and y matrices of the last window_size performances by each team
Drops non pre processed columns  
X holds window_size previous game stats  
y holds result of the game - 0 if homeTeam won, 1 if awayTeam won  
Handles data regardless of whether it is in order or not


In [15]:
#generate the X Y matrices
def df_to_X_y(df, dfWindow, window_size):
    X = []
    y = []
    #each team must have played window_size games before data can be extracted
    for index, row in df.iterrows():
        homeTeam_id = row['homeTeamSubject_id']
        awayTeam_id = row['awayTeamSubject_id']
        date = row['date']
        #limit dfWindow to include rows that occured before date of row we are on
        dfWindow_before_date = dfWindow[dfWindow['date']< date]
        homeTeam_occurrences = dfWindow_before_date[dfWindow_before_date['team_id'] == homeTeam_id].shape[0]
        awayTeam_occurrences = dfWindow_before_date[dfWindow_before_date['team_id'] == awayTeam_id].shape[0]
        if homeTeam_occurrences > window_size and awayTeam_occurrences > window_size:
            homeTeam_window = (dfWindow_before_date[dfWindow_before_date['team_id'] == homeTeam_id].sort_values(by='date', ascending=False).head(window_size).drop(columns=['date', 'team_id']))
            awayTeam_window = (dfWindow_before_date[dfWindow_before_date['team_id'] == awayTeam_id].sort_values(by='date', ascending=False).head(window_size).drop(columns=['date', 'team_id']))
            homeTeam_window = homeTeam_window.to_numpy()
            awayTeam_window = awayTeam_window.to_numpy()
            combined_window = np.hstack((homeTeam_window, awayTeam_window))
            X.append(combined_window)
            y.append(row['winner_binary'])
    
    return np.array(X), np.array(y)

### Example of the first iteration of only one variable
Prints the first game where both teams have 5 previous performances  
Prints homeTeam's window  
Prints awayTeam's window  
Prints simple example of X matrix  
Prints simple example of y matrix  

In [16]:
#showcase of the first iteration
window_size =5 
exX =[]
exY = []
for index, row in df.iterrows():
        homeTeam_id = row['homeTeamSubject_id']
        awayTeam_id = row['awayTeamSubject_id']
        date = row['date']
        #limit dfWindow to include rows that occured before date of row we are on
        dfWindow_before_date = dfIDtoStat[dfIDtoStat['date']< date]
        homeTeam_occurrences = dfWindow_before_date[dfWindow_before_date['team_id'] == homeTeam_id].shape[0]
        awayTeam_occurrences = dfWindow_before_date[dfWindow_before_date['team_id'] == awayTeam_id].shape[0]
        if homeTeam_occurrences > window_size and awayTeam_occurrences > window_size:
                homeTeam_window = (dfWindow_before_date[dfWindow_before_date['team_id'] == homeTeam_id].sort_values(by='date', ascending=False).head(window_size))
                awayTeam_window = (dfWindow_before_date[dfWindow_before_date['team_id'] == awayTeam_id].sort_values(by='date', ascending=False).head(window_size))
                print(date)
                print(homeTeam_window)
                print(awayTeam_window)
                print(type(homeTeam_window))
                homeTeam_window = homeTeam_window.drop(columns=['date', 'team_id'])
                awayTeam_window = awayTeam_window.drop(columns=['date', 'team_id'])
                homeTeam_window = homeTeam_window.to_numpy()
                awayTeam_window = awayTeam_window.to_numpy()
                combined_window = np.hstack((homeTeam_window, awayTeam_window))
                exX.append(combined_window)
                exY.append(row['winner_binary'])
                print(exX)
                print(exY)
                break
                        

2024-06-17
             date  team_id  team_points_total
25732  2024-06-14        2                 84
25733  2024-06-12        2                106
3      2024-06-09        2                105
4      2024-06-06        2                107
25738  2024-05-27        2                105
             date  team_id  team_points_total
1      2024-06-14        7                122
2      2024-06-12        7                 99
25734  2024-06-09        7                 98
25735  2024-06-06        7                 89
25736  2024-05-30        7                124
<class 'pandas.core.frame.DataFrame'>
[array([[ 84, 122],
       [106,  99],
       [105,  98],
       [107,  89],
       [105, 124]], dtype=int64)]
[0]


### Call df_to_X_y function
Output indicated number of entries, window size, and number of variables  
2 variables - one for homeTeam and one for awayTeam  
Window size set to 5 for now

In [17]:
#X1.Shape is num data, window size, variables
X, y = df_to_X_y(df, dfIDtoStat, 5)
X.shape, y.shape

((25634, 5, 2), (25634,))

In [18]:
#80-10-10 split
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

((20507, 5, 2), (20507,), (2563, 5, 2), (2563,), (2564, 5, 2), (2564,))

### Simple forecasting model

Input layer specifies window size and num input variables  
Sigmoid activation for final layer as this is binary output

In [19]:
model = Sequential()
model.add(InputLayer((5, 2)))
model.add(LSTM(64, activation='tanh', return_sequences=True))
model.add(LSTM(32, activation='tanh'))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 5, 64)             17152     
                                                                 
 lstm_1 (LSTM)               (None, 32)                12416     
                                                                 
 dense (Dense)               (None, 16)                528       
                                                                 
 dense_1 (Dense)             (None, 8)                 136       
                                                                 
 dense_2 (Dense)             (None, 1)                 9         
                                                                 
Total params: 30241 (118.13 KB)
Trainable params: 30241 (118.13 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [20]:
#early stopping if model does not improve performance for 50 epochs
earlystopping = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [21]:
#simple model fit
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, callbacks=[earlystopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x180c8801610>

Simple time series forecasting yields 59.4% accuracy

In [22]:
#accuracy score for simple model
y_pred = model.predict(X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
print(accuracy_score(y_test, y_pred_binary))

0.59399375975039


### Adding the rest of the input variables

In [23]:
#maps franchises to subjectID
subjectID_dict = {
    "ATL": 1,
    "BOS": 2,
    "BKN": 3,
    "CHA": 4,
    "CHI": 5,
    "CLE": 6,
    "DAL": 7,
    "DEN": 8,
    "DET": 9,
    "GSW": 10,
    "HOU": 11,
    "IND": 12,
    "LAC": 13,
    "LAL": 14,
    "MEM": 15,
    "MIA": 16,
    "MIL": 17,
    "MIN": 18,
    "NOP": 19,
    "NYK": 20,
    "OKC": 21,
    "ORL": 22,
    "PHI": 23,
    "PHX": 24,
    "POR": 25,
    "SAC": 26,
    "SAS": 27,
    "TOR": 28,
    "UTA": 29,
    "WAS": 30,
    "NOH": 19, #New Orleans Hornets is same franchise as NOP- New Orleans Pelicans
    "NJN": 3, #New Jersey Nets relocated to Brooklyn
    "SEA": 21, #Seattle Supersonics Relocated to OKC
    "NOK": 19, #New Orleans Hornets relocated to OKC 2005-2007 - Eventually became New Orleans Pelicans
    
}

In [24]:
#map the subject IDs to a number to handle franchise moves
df["homeTeamSubject_id"] = df["homeTeam_id"].map(subjectID_dict)
df["awayTeamSubject_id"] = df["awayTeam_id"].map(subjectID_dict)

cols = df.columns.tolist()
cols.insert(cols.index("homeTeam_id") + 1, cols.pop(cols.index("homeTeamSubject_id")))
cols.insert(cols.index("awayTeam_id") + 1, cols.pop(cols.index("awayTeamSubject_id")))
df = df[cols]

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["homeTeamSubject_id"] = df["homeTeam_id"].map(subjectID_dict)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["awayTeamSubject_id"] = df["awayTeam_id"].map(subjectID_dict)


Unnamed: 0,date,homeTeam,homeTeam_id,homeTeamSubject_id,homeTeam_win%,homeTeam_wins,homeTeam_losses,homeTeam_points_total,homeTeam_points_q1,homeTeam_points_q2,...,awayTeam_Ast,awayTeam_Stl,awayTeam_Blk,awayTeam_TO,awayTeam_PF,winner,season,isPlayoffGame,predicted_winner,winner_binary
0,2024-06-17,Boston Celtics,BOS,2,0.797980,79,20,106,28,39,...,18,4,4,13,20,Boston Celtics,2023-2024,True,Boston Celtics,0
1,2024-06-14,Dallas Mavericks,DAL,7,0.613861,62,39,122,34,27,...,18,2,5,13,19,Dallas Mavericks,2023-2024,True,Boston Celtics,0
2,2024-06-12,Dallas Mavericks,DAL,7,0.610000,61,39,99,31,20,...,26,4,6,9,19,Boston Celtics,2023-2024,True,Boston Celtics,1
3,2024-06-09,Boston Celtics,BOS,2,0.802083,77,19,105,25,29,...,21,5,3,15,17,Boston Celtics,2023-2024,True,Boston Celtics,0
4,2024-06-06,Boston Celtics,BOS,2,0.800000,76,19,107,37,26,...,9,8,1,11,16,Boston Celtics,2023-2024,True,Boston Celtics,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25726,2004-11-03,Cleveland Cavaliers,CLE,6,0.000000,0,0,104,21,19,...,24,9,5,13,28,Indiana Pacers,2004-2005,False,Indiana Pacers,1
25727,2004-11-03,Boston Celtics,BOS,2,0.000000,0,0,95,22,31,...,20,9,2,16,26,Philadelphia 76ers,2004-2005,False,Philadelphia 76ers,1
25728,2004-11-02,Los Angeles Lakers,LAL,14,0.000000,0,0,89,24,26,...,17,10,8,16,21,Los Angeles Lakers,2004-2005,False,Denver Nuggets,0
25729,2004-11-02,Detroit Pistons,DET,9,0.000000,0,0,87,19,18,...,8,4,5,16,24,Detroit Pistons,2004-2005,False,Houston Rockets,0


In [25]:
#replace instances of true/false with 1,0
df = df.replace({True: 1, False: 0})
df

Unnamed: 0,date,homeTeam,homeTeam_id,homeTeamSubject_id,homeTeam_win%,homeTeam_wins,homeTeam_losses,homeTeam_points_total,homeTeam_points_q1,homeTeam_points_q2,...,awayTeam_Ast,awayTeam_Stl,awayTeam_Blk,awayTeam_TO,awayTeam_PF,winner,season,isPlayoffGame,predicted_winner,winner_binary
0,2024-06-17,Boston Celtics,BOS,2,0.797980,79,20,106,28,39,...,18,4,4,13,20,Boston Celtics,2023-2024,1,Boston Celtics,0
1,2024-06-14,Dallas Mavericks,DAL,7,0.613861,62,39,122,34,27,...,18,2,5,13,19,Dallas Mavericks,2023-2024,1,Boston Celtics,0
2,2024-06-12,Dallas Mavericks,DAL,7,0.610000,61,39,99,31,20,...,26,4,6,9,19,Boston Celtics,2023-2024,1,Boston Celtics,1
3,2024-06-09,Boston Celtics,BOS,2,0.802083,77,19,105,25,29,...,21,5,3,15,17,Boston Celtics,2023-2024,1,Boston Celtics,0
4,2024-06-06,Boston Celtics,BOS,2,0.800000,76,19,107,37,26,...,9,8,1,11,16,Boston Celtics,2023-2024,1,Boston Celtics,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25726,2004-11-03,Cleveland Cavaliers,CLE,6,0.000000,0,0,104,21,19,...,24,9,5,13,28,Indiana Pacers,2004-2005,0,Indiana Pacers,1
25727,2004-11-03,Boston Celtics,BOS,2,0.000000,0,0,95,22,31,...,20,9,2,16,26,Philadelphia 76ers,2004-2005,0,Philadelphia 76ers,1
25728,2004-11-02,Los Angeles Lakers,LAL,14,0.000000,0,0,89,24,26,...,17,10,8,16,21,Los Angeles Lakers,2004-2005,0,Denver Nuggets,0
25729,2004-11-02,Detroit Pistons,DET,9,0.000000,0,0,87,19,18,...,8,4,5,16,24,Detroit Pistons,2004-2005,0,Houston Rockets,0


In [27]:
#populate dfIDtoStat - should be double the size of df
homedf = df[['date', 'homeTeamSubject_id', 'isPlayoffGame', 'homeTeam_win%', 'homeTeam_wins', 'homeTeam_losses', 'homeTeam_points_total', 'homeTeam_FG', 'homeTeam_FG_made', 'homeTeam_3P', 'homeTeam_3P_made', 'homeTeam_O_Reb', 'homeTeam_D_Reb', 'homeTeam_Total_Reb', 'homeTeam_Ast', 'homeTeam_Stl', 'homeTeam_Blk', 'homeTeam_TO', 'homeTeam_PF']]
homedf.columns = ['date', 'team_id', 'isPlayoffGame', 'team_win%', 'team_wins', 'team_losses', 'team_points_total', 'team_FG', 'team_FG_made', 'team_3P', 'team_3P_made', 'team_O_Reb', 'team_D_Reb', 'team_Total_Reb', 'team_Ast', 'team_Stl', 'team_Blk', 'team_TO', 'team_PF']
awaydf = df[['date', 'awayTeamSubject_id', 'isPlayoffGame', 'awayTeam_win%', 'awayTeam_wins', 'awayTeam_losses', 'awayTeam_points_total', 'awayTeam_FG', 'awayTeam_FG_made', 'awayTeam_3P', 'awayTeam_3P_made', 'awayTeam_O_Reb', 'awayTeam_D_Reb', 'awayTeam_Total_Reb', 'awayTeam_Ast', 'awayTeam_Stl', 'awayTeam_Blk', 'awayTeam_TO', 'awayTeam_PF']]
awaydf.columns = ['date', 'team_id', 'isPlayoffGame', 'team_win%', 'team_wins', 'team_losses', 'team_points_total', 'team_FG', 'team_FG_made', 'team_3P', 'team_3P_made', 'team_O_Reb', 'team_D_Reb', 'team_Total_Reb', 'team_Ast', 'team_Stl', 'team_Blk', 'team_TO', 'team_PF']

dfIDtoStat = pd.concat([homedf, awaydf], ignore_index=True)
dfIDtoStat

Unnamed: 0,date,team_id,isPlayoffGame,team_win%,team_wins,team_losses,team_points_total,team_FG,team_FG_made,team_3P,team_3P_made,team_O_Reb,team_D_Reb,team_Total_Reb,team_Ast,team_Stl,team_Blk,team_TO,team_PF
0,2024-06-17,2,1,0.797980,79,20,106,0.426966,38,0.333333,13,15,36,51,25,9,2,7,15
1,2024-06-14,7,1,0.613861,62,39,122,0.505495,46,0.405405,15,13,39,52,21,7,2,8,17
2,2024-06-12,7,1,0.610000,61,39,99,0.441860,38,0.360000,9,7,36,43,15,5,1,8,17
3,2024-06-09,2,1,0.802083,77,19,105,0.452381,38,0.256410,10,10,31,41,29,10,5,10,15
4,2024-06-06,2,1,0.800000,76,19,107,0.475610,39,0.380952,16,10,37,47,23,6,9,12,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51457,2004-11-03,12,0,0.000000,0,0,109,0.462366,43,0.235294,4,11,36,47,24,9,5,13,28
51458,2004-11-03,23,0,0.000000,0,0,98,0.444444,36,0.375000,9,14,27,41,20,9,2,16,26
51459,2004-11-02,8,0,0.000000,0,0,78,0.340909,30,0.250000,3,16,32,48,17,10,8,16,21
51460,2004-11-02,11,0,0.000000,0,0,79,0.394366,28,0.461538,6,9,27,36,8,4,5,16,24


In [28]:
#X.Shape is num data, window size, variables
X, y = df_to_X_y(df, dfIDtoStat, 5)
X.shape, y.shape

((25634, 5, 34), (25634,))

In [29]:
#80-10-10 split
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

((20507, 5, 34), (20507,), (2563, 5, 34), (2563,), (2564, 5, 34), (2564,))

In [30]:
model = Sequential()
model.add(InputLayer((5, 34)))
model.add(LSTM(64, activation='tanh', return_sequences=True))
model.add(LSTM(32, activation='tanh'))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 5, 64)             25344     
                                                                 
 lstm_3 (LSTM)               (None, 32)                12416     
                                                                 
 dense_3 (Dense)             (None, 16)                528       
                                                                 
 dense_4 (Dense)             (None, 8)                 136       
                                                                 
 dense_5 (Dense)             (None, 1)                 9         
                                                                 
Total params: 38433 (150.13 KB)
Trainable params: 38433 (150.13 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [31]:
earlystopping = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [32]:
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, callbacks=[earlystopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x180db266ca0>

### Accuracy Score With All Input Variables

Accuracy has increased to 65.9%, still less than baseline model's 68.5%

In [33]:
#get accuracy score
y_pred = model.predict(X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
print(accuracy_score(y_test, y_pred_binary))

0.6591263650546022


## Test to Find Optimized Window Size

In [34]:
#Increase window size to 10
X, y = df_to_X_y(df, dfIDtoStat, 10)
X.shape, y.shape

((25556, 10, 34), (25556,))

In [35]:
#80-10-10 split
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

((20444, 10, 34), (20444,), (2556, 10, 34), (2556,), (2556, 10, 34), (2556,))

In [37]:
#Increased input layer to window size 10
model = Sequential()
model.add(InputLayer((10, 34)))
model.add(LSTM(64, activation='tanh', return_sequences=True))
model.add(LSTM(32, activation='tanh'))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_4 (LSTM)               (None, 10, 64)            25344     
                                                                 
 lstm_5 (LSTM)               (None, 32)                12416     
                                                                 
 dense_6 (Dense)             (None, 16)                528       
                                                                 
 dense_7 (Dense)             (None, 8)                 136       
                                                                 
 dense_8 (Dense)             (None, 1)                 9         
                                                                 
Total params: 38433 (150.13 KB)
Trainable params: 38433 (150.13 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [38]:
#reducing patience to 25
earlystopping = EarlyStopping(monitor='val_loss', patience=25, restore_best_weights=True)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [39]:
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, callbacks=[earlystopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100


<keras.src.callbacks.History at 0x180db22c190>

Accuracy still 65.9%

In [40]:
#get accuracy score
y_pred = model.predict(X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
print(accuracy_score(y_test, y_pred_binary))

0.6592331768388107


### Test window size 15

In [41]:
#Increase window size to 15
X, y = df_to_X_y(df, dfIDtoStat, 15)
X.shape, y.shape

((25479, 15, 34), (25479,))

In [42]:
#80-10-10 split
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

((20383, 15, 34), (20383,), (2548, 15, 34), (2548,), (2548, 15, 34), (2548,))

In [43]:
#Increased input layer to window size 15
model = Sequential()
model.add(InputLayer((15, 34)))
model.add(LSTM(64, activation='tanh', return_sequences=True))
model.add(LSTM(32, activation='tanh'))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_6 (LSTM)               (None, 15, 64)            25344     
                                                                 
 lstm_7 (LSTM)               (None, 32)                12416     
                                                                 
 dense_9 (Dense)             (None, 16)                528       
                                                                 
 dense_10 (Dense)            (None, 8)                 136       
                                                                 
 dense_11 (Dense)            (None, 1)                 9         
                                                                 
Total params: 38433 (150.13 KB)
Trainable params: 38433 (150.13 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [44]:
#reducing patience to 25
earlystopping = EarlyStopping(monitor='val_loss', patience=25, restore_best_weights=True)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [45]:
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, callbacks=[earlystopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100


<keras.src.callbacks.History at 0x180e1336940>

Accuracy increased to 66.4% for window size 15

In [46]:
#get accuracy score
y_pred = model.predict(X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
print(accuracy_score(y_test, y_pred_binary))

0.6640502354788069


### Test Window Size 20

In [47]:
#Increase window size to 20
X, y = df_to_X_y(df, dfIDtoStat, 20)
X.shape, y.shape

((25406, 20, 34), (25406,))

In [48]:
#80-10-10 split
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

((20324, 20, 34), (20324,), (2541, 20, 34), (2541,), (2541, 20, 34), (2541,))

In [49]:
#Increased input layer to window size 20
model = Sequential()
model.add(InputLayer((20, 34)))
model.add(LSTM(64, activation='tanh', return_sequences=True))
model.add(LSTM(32, activation='tanh'))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_8 (LSTM)               (None, 20, 64)            25344     
                                                                 
 lstm_9 (LSTM)               (None, 32)                12416     
                                                                 
 dense_12 (Dense)            (None, 16)                528       
                                                                 
 dense_13 (Dense)            (None, 8)                 136       
                                                                 
 dense_14 (Dense)            (None, 1)                 9         
                                                                 
Total params: 38433 (150.13 KB)
Trainable params: 38433 (150.13 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [50]:
#reducing patience to 20
earlystopping = EarlyStopping(monitor='val_loss', patience=25, restore_best_weights=True)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [51]:
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, callbacks=[earlystopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100


<keras.src.callbacks.History at 0x180db3badf0>

Accuracy increased to 67.1% for window size 20

In [52]:
#get accuracy score
y_pred = model.predict(X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
print(accuracy_score(y_test, y_pred_binary))

0.6713892168437623


### Test Window Size 25

In [53]:
#Increase window size to 25
X, y = df_to_X_y(df, dfIDtoStat, 25)
X.shape, y.shape

((25328, 25, 34), (25328,))

In [59]:
#80-10-10 split
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

((20262, 25, 34), (20262,), (2533, 25, 34), (2533,), (2533, 25, 34), (2533,))

In [60]:
#Increased input layer to window size 25
model = Sequential()
model.add(InputLayer((25, 34)))
model.add(LSTM(64, activation='tanh', return_sequences=True))
model.add(LSTM(32, activation='tanh'))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_12 (LSTM)              (None, 25, 64)            25344     
                                                                 
 lstm_13 (LSTM)              (None, 32)                12416     
                                                                 
 dense_18 (Dense)            (None, 16)                528       
                                                                 
 dense_19 (Dense)            (None, 8)                 136       
                                                                 
 dense_20 (Dense)            (None, 1)                 9         
                                                                 
Total params: 38433 (150.13 KB)
Trainable params: 38433 (150.13 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [61]:
#reducing patience to 20
earlystopping = EarlyStopping(monitor='val_loss', patience=25, restore_best_weights=True)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [62]:
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, callbacks=[earlystopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100


<keras.src.callbacks.History at 0x180e04a9250>

Accuracy did not improve - Using window size 20 for rest of project
The optimized window size probably lies between 15-25

In [63]:
#get accuracy score
y_pred = model.predict(X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
print(accuracy_score(y_test, y_pred_binary))

0.6608764311093565


## Test Dropout To Prevent Overfitting

In [64]:
#Keeping window size 20
X, y = df_to_X_y(df, dfIDtoStat, 20)
X.shape, y.shape

((25406, 20, 34), (25406,))

In [65]:
#80-10-10 split
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

((20324, 20, 34), (20324,), (2541, 20, 34), (2541,), (2541, 20, 34), (2541,))

In [66]:
model = Sequential()
model.add(InputLayer((20, 34)))
model.add(LSTM(64, activation='tanh', return_sequences=True))
model.add(Dropout(0.2)) 
model.add(LSTM(32, activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_14 (LSTM)              (None, 20, 64)            25344     
                                                                 
 dropout (Dropout)           (None, 20, 64)            0         
                                                                 
 lstm_15 (LSTM)              (None, 32)                12416     
                                                                 
 dropout_1 (Dropout)         (None, 32)                0         
                                                                 
 dense_21 (Dense)            (None, 16)                528       
                                                                 
 dropout_2 (Dropout)         (None, 16)                0         
                                                                 
 dense_22 (Dense)            (None, 8)                

In [67]:
earlystopping = EarlyStopping(monitor='val_loss', patience=25, restore_best_weights=True)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [68]:
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, callbacks=[earlystopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100


<keras.src.callbacks.History at 0x180ecca20a0>

Accuracy improved from 67.1% to 67.2%

In [69]:
#get accuracy score
y_pred = model.predict(X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
print(accuracy_score(y_test, y_pred_binary))

0.6729634002361276


### Test Batch Normalization

In [None]:
#added batch normalization
model = Sequential()
model.add(InputLayer((20, 34)))
model.add(LSTM(64, activation='tanh', return_sequences=True))
model.add(BatchNormalization())
model.add(Dropout(0.2)) 
model.add(LSTM(32, activation='tanh'))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(16, activation='relu'))
model.add(BatchNormalization()) 
model.add(Dropout(0.2))
model.add(Dense(8, activation='relu'))
model.add(BatchNormalization())
model.add(Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_17 (LSTM)              (None, 20, 64)            25344     
                                                                 
 batch_normalization (Batch  (None, 20, 64)            256       
 Normalization)                                                  
                                                                 
 dropout_3 (Dropout)         (None, 20, 64)            0         
                                                                 
 lstm_18 (LSTM)              (None, 32)                12416     
                                                                 
 batch_normalization_1 (Bat  (None, 32)                128       
 chNormalization)                                                
                                                                 
 dropout_4 (Dropout)         (None, 32)               

In [73]:
earlystopping = EarlyStopping(monitor='val_loss', patience=25, restore_best_weights=True)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [74]:
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, callbacks=[earlystopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100


<keras.src.callbacks.History at 0x180ec75e940>

Accuracy improved from 67.2% to 67.3%

In [75]:
#get accuracy score
y_pred = model.predict(X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
print(accuracy_score(y_test, y_pred_binary))

0.6733569460842188


### Test L2 Regularization

In [76]:
#added l2 Regularization
model = Sequential()
model.add(InputLayer((20, 34)))
model.add(LSTM(64, activation='tanh', return_sequences=True, kernel_regularizer=l2(0.01)))
model.add(BatchNormalization())
model.add(Dropout(0.2)) 
model.add(LSTM(32, activation='tanh', kernel_regularizer=l2(0.01)))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(16, activation='relu'))
model.add(BatchNormalization()) 
model.add(Dropout(0.2))
model.add(Dense(8, activation='relu'))
model.add(BatchNormalization())
model.add(Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_19 (LSTM)              (None, 20, 64)            25344     
                                                                 
 batch_normalization_4 (Bat  (None, 20, 64)            256       
 chNormalization)                                                
                                                                 
 dropout_6 (Dropout)         (None, 20, 64)            0         
                                                                 
 lstm_20 (LSTM)              (None, 32)                12416     
                                                                 
 batch_normalization_5 (Bat  (None, 32)                128       
 chNormalization)                                                
                                                                 
 dropout_7 (Dropout)         (None, 32)              

In [77]:
earlystopping = EarlyStopping(monitor='val_loss', patience=25, restore_best_weights=True)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [78]:
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, callbacks=[earlystopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100


<keras.src.callbacks.History at 0x180dd95f700>

Accuracy decreased from 67.3% to 66.8%

In [79]:
#get accuracy score
y_pred = model.predict(X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
print(accuracy_score(y_test, y_pred_binary))

0.6682408500590319


### Test smaller L2 Regularization - penalize weights less

In [80]:
#Decreased l2 regularization to 0.001
model = Sequential()
model.add(InputLayer((20, 34)))
model.add(LSTM(64, activation='tanh', return_sequences=True, kernel_regularizer=l2(0.001)))
model.add(BatchNormalization())
model.add(Dropout(0.2)) 
model.add(LSTM(32, activation='tanh', kernel_regularizer=l2(0.001)))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(16, activation='relu'))
model.add(BatchNormalization()) 
model.add(Dropout(0.2))
model.add(Dense(8, activation='relu'))
model.add(BatchNormalization())
model.add(Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_21 (LSTM)              (None, 20, 64)            25344     
                                                                 
 batch_normalization_8 (Bat  (None, 20, 64)            256       
 chNormalization)                                                
                                                                 
 dropout_9 (Dropout)         (None, 20, 64)            0         
                                                                 
 lstm_22 (LSTM)              (None, 32)                12416     
                                                                 
 batch_normalization_9 (Bat  (None, 32)                128       
 chNormalization)                                                
                                                                 
 dropout_10 (Dropout)        (None, 32)              

In [82]:
earlystopping = EarlyStopping(monitor='val_loss', patience=25, restore_best_weights=True)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [83]:
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, callbacks=[earlystopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100


<keras.src.callbacks.History at 0x180e0399e50>

Accuracy decreased again to 66.1% - removing l2 regularization

In [84]:
#get accuracy score
y_pred = model.predict(X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
print(accuracy_score(y_test, y_pred_binary))

0.6611570247933884


### Test Increased Model Complexity

In [85]:
#Increased complexity first layer now 128
#Removed l2 regularization
model = Sequential()
model.add(InputLayer((20, 34)))
model.add(LSTM(128, activation='tanh', return_sequences=True))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(LSTM(64, activation='tanh', return_sequences=True))
model.add(BatchNormalization())
model.add(Dropout(0.2)) 
model.add(LSTM(32, activation='tanh'))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(16, activation='relu'))
model.add(BatchNormalization()) 
model.add(Dropout(0.2))
model.add(Dense(8, activation='relu'))
model.add(BatchNormalization())
model.add(Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_23 (LSTM)              (None, 20, 128)           83456     
                                                                 
 batch_normalization_12 (Ba  (None, 20, 128)           512       
 tchNormalization)                                               
                                                                 
 dropout_12 (Dropout)        (None, 20, 128)           0         
                                                                 
 lstm_24 (LSTM)              (None, 20, 64)            49408     
                                                                 
 batch_normalization_13 (Ba  (None, 20, 64)            256       
 tchNormalization)                                               
                                                                 
 dropout_13 (Dropout)        (None, 20, 64)          

In [86]:
earlystopping = EarlyStopping(monitor='val_loss', patience=25, restore_best_weights=True)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [87]:
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, callbacks=[earlystopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100


<keras.src.callbacks.History at 0x1811cba0be0>

Accuracy did not improve from 67.3%

In [88]:
#get accuracy score
y_pred = model.predict(X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
print(accuracy_score(y_test, y_pred_binary))

0.6654860291223927


### Test One-hot encoding of subjectID

In [90]:
#one hot encode franchises - without dropping subject_id as it is needed for X_y function
df = pd.get_dummies(df, columns=['homeTeamSubject_id'], prefix='homeTeam', drop_first=False)
df = pd.get_dummies(df, columns=['awayTeamSubject_id'], prefix='awayTeam', drop_first=False)
df

Unnamed: 0,date,homeTeam,homeTeam_id,homeTeam_win%,homeTeam_wins,homeTeam_losses,homeTeam_points_total,homeTeam_points_q1,homeTeam_points_q2,homeTeam_points_q3,...,awayTeam_21,awayTeam_22,awayTeam_23,awayTeam_24,awayTeam_25,awayTeam_26,awayTeam_27,awayTeam_28,awayTeam_29,awayTeam_30
0,2024-06-17,Boston Celtics,BOS,0.797980,79,20,106,28,39,19,...,False,False,False,False,False,False,False,False,False,False
1,2024-06-14,Dallas Mavericks,DAL,0.613861,62,39,122,34,27,31,...,False,False,False,False,False,False,False,False,False,False
2,2024-06-12,Dallas Mavericks,DAL,0.610000,61,39,99,31,20,19,...,False,False,False,False,False,False,False,False,False,False
3,2024-06-09,Boston Celtics,BOS,0.802083,77,19,105,25,29,29,...,False,False,False,False,False,False,False,False,False,False
4,2024-06-06,Boston Celtics,BOS,0.800000,76,19,107,37,26,23,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25726,2004-11-03,Cleveland Cavaliers,CLE,0.000000,0,0,104,21,19,27,...,False,False,False,False,False,False,False,False,False,False
25727,2004-11-03,Boston Celtics,BOS,0.000000,0,0,95,22,31,30,...,False,False,True,False,False,False,False,False,False,False
25728,2004-11-02,Los Angeles Lakers,LAL,0.000000,0,0,89,24,26,22,...,False,False,False,False,False,False,False,False,False,False
25729,2004-11-02,Detroit Pistons,DET,0.000000,0,0,87,19,18,22,...,False,False,False,False,False,False,False,False,False,False


In [91]:
#bring back subject_id as it is needed for X_y function
df["homeTeamSubject_id"] = df["homeTeam_id"].map(subjectID_dict)
df["awayTeamSubject_id"] = df["awayTeam_id"].map(subjectID_dict)

cols = df.columns.tolist()
cols.insert(cols.index("homeTeam_id") + 1, cols.pop(cols.index("homeTeamSubject_id")))
cols.insert(cols.index("awayTeam_id") + 1, cols.pop(cols.index("awayTeamSubject_id")))
df = df[cols]

df

Unnamed: 0,date,homeTeam,homeTeam_id,homeTeamSubject_id,homeTeam_win%,homeTeam_wins,homeTeam_losses,homeTeam_points_total,homeTeam_points_q1,homeTeam_points_q2,...,awayTeam_21,awayTeam_22,awayTeam_23,awayTeam_24,awayTeam_25,awayTeam_26,awayTeam_27,awayTeam_28,awayTeam_29,awayTeam_30
0,2024-06-17,Boston Celtics,BOS,2,0.797980,79,20,106,28,39,...,False,False,False,False,False,False,False,False,False,False
1,2024-06-14,Dallas Mavericks,DAL,7,0.613861,62,39,122,34,27,...,False,False,False,False,False,False,False,False,False,False
2,2024-06-12,Dallas Mavericks,DAL,7,0.610000,61,39,99,31,20,...,False,False,False,False,False,False,False,False,False,False
3,2024-06-09,Boston Celtics,BOS,2,0.802083,77,19,105,25,29,...,False,False,False,False,False,False,False,False,False,False
4,2024-06-06,Boston Celtics,BOS,2,0.800000,76,19,107,37,26,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25726,2004-11-03,Cleveland Cavaliers,CLE,6,0.000000,0,0,104,21,19,...,False,False,False,False,False,False,False,False,False,False
25727,2004-11-03,Boston Celtics,BOS,2,0.000000,0,0,95,22,31,...,False,False,True,False,False,False,False,False,False,False
25728,2004-11-02,Los Angeles Lakers,LAL,14,0.000000,0,0,89,24,26,...,False,False,False,False,False,False,False,False,False,False
25729,2004-11-02,Detroit Pistons,DET,9,0.000000,0,0,87,19,18,...,False,False,False,False,False,False,False,False,False,False


In [92]:
#replace instances of true/false with 1,0
df = df.replace({True: 1, False: 0})
df

Unnamed: 0,date,homeTeam,homeTeam_id,homeTeamSubject_id,homeTeam_win%,homeTeam_wins,homeTeam_losses,homeTeam_points_total,homeTeam_points_q1,homeTeam_points_q2,...,awayTeam_21,awayTeam_22,awayTeam_23,awayTeam_24,awayTeam_25,awayTeam_26,awayTeam_27,awayTeam_28,awayTeam_29,awayTeam_30
0,2024-06-17,Boston Celtics,BOS,2,0.797980,79,20,106,28,39,...,0,0,0,0,0,0,0,0,0,0
1,2024-06-14,Dallas Mavericks,DAL,7,0.613861,62,39,122,34,27,...,0,0,0,0,0,0,0,0,0,0
2,2024-06-12,Dallas Mavericks,DAL,7,0.610000,61,39,99,31,20,...,0,0,0,0,0,0,0,0,0,0
3,2024-06-09,Boston Celtics,BOS,2,0.802083,77,19,105,25,29,...,0,0,0,0,0,0,0,0,0,0
4,2024-06-06,Boston Celtics,BOS,2,0.800000,76,19,107,37,26,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25726,2004-11-03,Cleveland Cavaliers,CLE,6,0.000000,0,0,104,21,19,...,0,0,0,0,0,0,0,0,0,0
25727,2004-11-03,Boston Celtics,BOS,2,0.000000,0,0,95,22,31,...,0,0,1,0,0,0,0,0,0,0
25728,2004-11-02,Los Angeles Lakers,LAL,14,0.000000,0,0,89,24,26,...,0,0,0,0,0,0,0,0,0,0
25729,2004-11-02,Detroit Pistons,DET,9,0.000000,0,0,87,19,18,...,0,0,0,0,0,0,0,0,0,0


In [93]:
#populate dfIDtoStat - should be double the size of df
homedf = df[['date', 'homeTeamSubject_id', 'isPlayoffGame', 'homeTeam_win%', 'homeTeam_wins', 'homeTeam_losses', 'homeTeam_points_total', 'homeTeam_FG', 'homeTeam_FG_made', 'homeTeam_3P', 'homeTeam_3P_made', 'homeTeam_O_Reb', 'homeTeam_D_Reb', 'homeTeam_Total_Reb', 'homeTeam_Ast', 'homeTeam_Stl', 'homeTeam_Blk', 'homeTeam_TO', 'homeTeam_PF', 'homeTeam_1', 'homeTeam_2', 'homeTeam_3', 'homeTeam_4', 'homeTeam_5', 'homeTeam_6', 'homeTeam_7', 'homeTeam_8', 'homeTeam_9', 'homeTeam_10', 'homeTeam_11', 'homeTeam_12', 'homeTeam_13', 'homeTeam_14', 'homeTeam_15', 'homeTeam_16', 'homeTeam_17', 'homeTeam_18', 'homeTeam_19', 'homeTeam_20', 'homeTeam_21', 'homeTeam_22', 'homeTeam_23', 'homeTeam_24', 'homeTeam_25', 'homeTeam_26', 'homeTeam_27', 'homeTeam_28', 'homeTeam_29', 'homeTeam_30']]
homedf.columns = ['date', 'team_id', 'isPlayoffGame', 'team_win%', 'team_wins', 'team_losses', 'team_points_total', 'team_FG', 'team_FG_made', 'team_3P', 'team_3P_made', 'team_O_Reb', 'team_D_Reb', 'team_Total_Reb', 'team_Ast', 'team_Stl', 'team_Blk', 'team_TO', 'team_PF', 'team_1', 'team_2', 'team_3', 'team_4', 'team_5', 'team_6', 'team_7', 'team_8', 'team_9', 'team_10', 'team_11', 'team_12', 'team_13', 'team_14', 'team_15', 'team_16', 'team_17', 'team_18', 'team_19', 'team_20', 'team_21', 'team_22', 'team_23', 'team_24', 'team_25', 'team_26', 'team_27', 'team_28', 'team_29', 'team_30']
awaydf = df[['date', 'awayTeamSubject_id', 'isPlayoffGame', 'awayTeam_win%', 'awayTeam_wins', 'awayTeam_losses', 'awayTeam_points_total', 'awayTeam_FG', 'awayTeam_FG_made', 'awayTeam_3P', 'awayTeam_3P_made', 'awayTeam_O_Reb', 'awayTeam_D_Reb', 'awayTeam_Total_Reb', 'awayTeam_Ast', 'awayTeam_Stl', 'awayTeam_Blk', 'awayTeam_TO', 'awayTeam_PF', 'awayTeam_1', 'awayTeam_2', 'awayTeam_3', 'awayTeam_4', 'awayTeam_5', 'awayTeam_6', 'awayTeam_7', 'awayTeam_8', 'awayTeam_9', 'awayTeam_10', 'awayTeam_11', 'awayTeam_12', 'awayTeam_13', 'awayTeam_14', 'awayTeam_15', 'awayTeam_16', 'awayTeam_17', 'awayTeam_18', 'awayTeam_19', 'awayTeam_20', 'awayTeam_21', 'awayTeam_22', 'awayTeam_23', 'awayTeam_24', 'awayTeam_25', 'awayTeam_26', 'awayTeam_27', 'awayTeam_28', 'awayTeam_29', 'awayTeam_30']]
awaydf.columns = ['date', 'team_id', 'isPlayoffGame', 'team_win%', 'team_wins', 'team_losses', 'team_points_total', 'team_FG', 'team_FG_made', 'team_3P', 'team_3P_made', 'team_O_Reb', 'team_D_Reb', 'team_Total_Reb', 'team_Ast', 'team_Stl', 'team_Blk', 'team_TO', 'team_PF', 'team_1', 'team_2', 'team_3', 'team_4', 'team_5', 'team_6', 'team_7', 'team_8', 'team_9', 'team_10', 'team_11', 'team_12', 'team_13', 'team_14', 'team_15', 'team_16', 'team_17', 'team_18', 'team_19', 'team_20', 'team_21', 'team_22', 'team_23', 'team_24', 'team_25', 'team_26', 'team_27', 'team_28', 'team_29', 'team_30']

dfIDtoStat = pd.concat([homedf, awaydf], ignore_index=True)
dfIDtoStat

Unnamed: 0,date,team_id,isPlayoffGame,team_win%,team_wins,team_losses,team_points_total,team_FG,team_FG_made,team_3P,...,team_21,team_22,team_23,team_24,team_25,team_26,team_27,team_28,team_29,team_30
0,2024-06-17,2,1,0.797980,79,20,106,0.426966,38,0.333333,...,0,0,0,0,0,0,0,0,0,0
1,2024-06-14,7,1,0.613861,62,39,122,0.505495,46,0.405405,...,0,0,0,0,0,0,0,0,0,0
2,2024-06-12,7,1,0.610000,61,39,99,0.441860,38,0.360000,...,0,0,0,0,0,0,0,0,0,0
3,2024-06-09,2,1,0.802083,77,19,105,0.452381,38,0.256410,...,0,0,0,0,0,0,0,0,0,0
4,2024-06-06,2,1,0.800000,76,19,107,0.475610,39,0.380952,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51457,2004-11-03,12,0,0.000000,0,0,109,0.462366,43,0.235294,...,0,0,0,0,0,0,0,0,0,0
51458,2004-11-03,23,0,0.000000,0,0,98,0.444444,36,0.375000,...,0,0,1,0,0,0,0,0,0,0
51459,2004-11-02,8,0,0.000000,0,0,78,0.340909,30,0.250000,...,0,0,0,0,0,0,0,0,0,0
51460,2004-11-02,11,0,0.000000,0,0,79,0.394366,28,0.461538,...,0,0,0,0,0,0,0,0,0,0


In [94]:
#X.Shape is num data, window size, variables
X, y = df_to_X_y(df, dfIDtoStat, 20)
X.shape, y.shape

((25406, 20, 94), (25406,))

In [95]:
#80-10-10 split
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

((20324, 20, 94), (20324,), (2541, 20, 94), (2541,), (2541, 20, 94), (2541,))

In [96]:
model = Sequential()
model.add(InputLayer((20, 94)))
model.add(LSTM(64, activation='tanh', return_sequences=True))
model.add(BatchNormalization())
model.add(Dropout(0.2)) 
model.add(LSTM(32, activation='tanh'))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(16, activation='relu'))
model.add(BatchNormalization()) 
model.add(Dropout(0.2))
model.add(Dense(8, activation='relu'))
model.add(BatchNormalization())
model.add(Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_26 (LSTM)              (None, 20, 64)            40704     
                                                                 
 batch_normalization_17 (Ba  (None, 20, 64)            256       
 tchNormalization)                                               
                                                                 
 dropout_16 (Dropout)        (None, 20, 64)            0         
                                                                 
 lstm_27 (LSTM)              (None, 32)                12416     
                                                                 
 batch_normalization_18 (Ba  (None, 32)                128       
 tchNormalization)                                               
                                                                 
 dropout_17 (Dropout)        (None, 32)              

In [97]:
earlystopping = EarlyStopping(monitor='val_loss', patience=25, restore_best_weights=True)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [98]:
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, callbacks=[earlystopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100


<keras.src.callbacks.History at 0x1810df97220>

Accuracy did not improve

In [99]:
#get accuracy score
y_pred = model.predict(X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
print(accuracy_score(y_test, y_pred_binary))

0.6666666666666666


# Results
Baseline model yielded 68.5% accuracy  
Simple time series forecasting model yielded 59.4% accuracy  
Adding all 34 variables into the time series model increased accuracy to 65.9%  
Increasing the window size from 5 to 10 kept accuracy at 65.9%  
Increasing the window size to 15 increased accuracy to 66.4%  
Increasing the window size to 20 increased accuracy to 67.1%  
Increasing the window size to 25 decreased the accuracy by over 1%  
Adding dropout of 0.2 increased accuracy to 67.2%  
Adding Batch Normalization increased accuracy to 67.3%  
L2 regularization of 0.01 decreased accuracy to 66.8%  
L2 regularization of 0.001 decreased accuracy further to 66.1%  
Removed L2 regularization based on results  
Increased model complexity by adding another LSTM layer with 128 neurons - yielded 66.5% accuracy  
One-hot encoded franchises yielded 66.66% accuracy


# Conclusion

Unfortunately I was not able to compile a model that surpassed the baseline model  
I still believe in this conditional time series forecasting model so I will be returning to the project  
Possible improvements: 
Testing could be structured better  
Window size can still be optimized  
Optimize window size further  
Test more regularization techniques  
Possibly try predicting something other than game outcome  
Capture more relevant features - win streaks, loss streaks, etc.  
Exhaustively search all input features to find optimized input variables  
Try different model architecture  
Change the sliding window to use home performances if the team is the home team instead of all performances regardless of whether they are home or away  
Include player statistics and rosters  

# Work Cited

John Watson Rooney - https://www.youtube.com/@JohnWatsonRooney  
Used his concurrent features technique for the proxy scraper  
landofbasketball.com - Scraped data from this website  
https://www.tensorflow.org/tutorials/structured_data/time_series - time series forecasting example  
https://www.youtube.com/watch?v=kGdbPnMCdOg&t=1692s - used this tutorial to create the sliding window  
Inhereted the function below from the youtube tutorial and modeled the conditional sliding window function from this function.



In [100]:
def df_to_x_y(df, window_size=5):
    df_as_np = df.to_numpy()
    X = []
    y = []
    for i in range(len(df_as_np)-window_size):
        row = [[a] for a in df_as_np[i:i+5]]
        X.append(row)
        label = df_as_np[i+5]
        y.append(label)
    return np.array(X), np.array(y)  