In [136]:
import tensorflow as tf
import os
import pandas as pd 
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense, InputLayer
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score

In [114]:
df = pd.read_csv('nbaDatabase.csv')
df

Unnamed: 0,date,homeTeam,homeTeam_id,homeTeam_points_total,homeTeam_points_q1,homeTeam_points_q2,homeTeam_points_q3,homeTeam_points_q4,homeTeam_points_1OT,homeTeam_points_2OT,...,awayTeam_3P,awayTeam_O_Reb,awayTeam_D_Reb,awayTeam_Total_Reb,awayTeam_Ast,awayTeam_Stl,awayTeam_Blk,awayTeam_TO,awayTeam_PF,winner
0,"June 17, 2024",Boston Celtics,BOS,106,28,39,19,20,0,0,...,11-37,7,28,35,18,4,4,13,20,Boston Celtics
1,"June 14, 2024",Dallas Mavericks,DAL,122,34,27,31,30,0,0,...,14-41,4,27,31,18,2,5,13,19,Dallas Mavericks
2,"June 12, 2024",Dallas Mavericks,DAL,99,31,20,19,29,0,0,...,17-46,6,30,36,26,4,6,9,19,Boston Celtics
3,"June 09, 2024",Boston Celtics,BOS,105,25,29,29,22,0,0,...,6-26,9,34,43,21,5,3,15,17,Boston Celtics
4,"June 06, 2024",Boston Celtics,BOS,107,37,26,23,21,0,0,...,7-27,10,33,43,9,8,1,11,16,Boston Celtics
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12879,"April 30, 2014",Houston Rockets,HOU,108,30,26,26,26,0,0,...,9-25,9,25,34,14,8,14,10,21,Houston Rockets
12880,"April 30, 2014",San Antonio Spurs,SAS,109,27,31,21,30,0,0,...,12-28,9,38,47,18,2,8,8,21,San Antonio Spurs
12881,"April 29, 2014",Chicago Bulls,CHI,69,15,26,11,17,0,0,...,2-9,13,36,49,16,9,11,12,19,Washington Wizards
12882,"April 29, 2014",Oklahoma City Thunder,OKC,99,25,18,27,20,9,0,...,7-19,13,37,50,21,11,3,9,21,Memphis Grizzlies


In [115]:
subjectID_dict = {
    "ATL": 1,
    "BOS": 2,
    "BKN": 3,
    "CHA": 4,
    "CHI": 5,
    "CLE": 6,
    "DAL": 7,
    "DEN": 8,
    "DET": 9,
    "GSW": 10,
    "HOU": 11,
    "IND": 12,
    "LAC": 13,
    "LAL": 14,
    "MEM": 15,
    "MIA": 16,
    "MIL": 17,
    "MIN": 18,
    "NOP": 19,
    "NYK": 20,
    "OKC": 21,
    "ORL": 22,
    "PHI": 23,
    "PHX": 24,
    "POR": 25,
    "SAC": 26,
    "SAS": 27,
    "TOR": 28,
    "UTA": 29,
    "WAS": 30
}

In [116]:
df["homeTeamSubject_id"] = df["homeTeam_id"].map(subjectID_dict)
df["awayTeamSubject_id"] = df["awayTeam_id"].map(subjectID_dict)

cols = df.columns.tolist()
cols.insert(cols.index("homeTeam_id") + 1, cols.pop(cols.index("homeTeamSubject_id")))
cols.insert(cols.index("awayTeam_id") + 1, cols.pop(cols.index("awayTeamSubject_id")))
df = df[cols]

df

Unnamed: 0,date,homeTeam,homeTeam_id,homeTeamSubject_id,homeTeam_points_total,homeTeam_points_q1,homeTeam_points_q2,homeTeam_points_q3,homeTeam_points_q4,homeTeam_points_1OT,...,awayTeam_3P,awayTeam_O_Reb,awayTeam_D_Reb,awayTeam_Total_Reb,awayTeam_Ast,awayTeam_Stl,awayTeam_Blk,awayTeam_TO,awayTeam_PF,winner
0,"June 17, 2024",Boston Celtics,BOS,2,106,28,39,19,20,0,...,11-37,7,28,35,18,4,4,13,20,Boston Celtics
1,"June 14, 2024",Dallas Mavericks,DAL,7,122,34,27,31,30,0,...,14-41,4,27,31,18,2,5,13,19,Dallas Mavericks
2,"June 12, 2024",Dallas Mavericks,DAL,7,99,31,20,19,29,0,...,17-46,6,30,36,26,4,6,9,19,Boston Celtics
3,"June 09, 2024",Boston Celtics,BOS,2,105,25,29,29,22,0,...,6-26,9,34,43,21,5,3,15,17,Boston Celtics
4,"June 06, 2024",Boston Celtics,BOS,2,107,37,26,23,21,0,...,7-27,10,33,43,9,8,1,11,16,Boston Celtics
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12879,"April 30, 2014",Houston Rockets,HOU,11,108,30,26,26,26,0,...,9-25,9,25,34,14,8,14,10,21,Houston Rockets
12880,"April 30, 2014",San Antonio Spurs,SAS,27,109,27,31,21,30,0,...,12-28,9,38,47,18,2,8,8,21,San Antonio Spurs
12881,"April 29, 2014",Chicago Bulls,CHI,5,69,15,26,11,17,0,...,2-9,13,36,49,16,9,11,12,19,Washington Wizards
12882,"April 29, 2014",Oklahoma City Thunder,OKC,21,99,25,18,27,20,9,...,7-19,13,37,50,21,11,3,9,21,Memphis Grizzlies


In [117]:
#limit df to few variables
dfLim = df[[
    "date",
    "homeTeam",
    "homeTeamSubject_id", 
    "homeTeam_points_total", 
    "awayTeam",
    "awayTeamSubject_id", 
    "awayTeam_points_total",
    "winner"
]]
dfLim

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,awayTeam,awayTeamSubject_id,awayTeam_points_total,winner
0,"June 17, 2024",Boston Celtics,2,106,Dallas Mavericks,7,88,Boston Celtics
1,"June 14, 2024",Dallas Mavericks,7,122,Boston Celtics,2,84,Dallas Mavericks
2,"June 12, 2024",Dallas Mavericks,7,99,Boston Celtics,2,106,Boston Celtics
3,"June 09, 2024",Boston Celtics,2,105,Dallas Mavericks,7,98,Boston Celtics
4,"June 06, 2024",Boston Celtics,2,107,Dallas Mavericks,7,89,Boston Celtics
...,...,...,...,...,...,...,...,...
12879,"April 30, 2014",Houston Rockets,11,108,Portland Trail Blazers,25,98,Houston Rockets
12880,"April 30, 2014",San Antonio Spurs,27,109,Dallas Mavericks,7,103,San Antonio Spurs
12881,"April 29, 2014",Chicago Bulls,5,69,Washington Wizards,30,75,Washington Wizards
12882,"April 29, 2014",Oklahoma City Thunder,21,99,Memphis Grizzlies,15,100,Memphis Grizzlies


In [118]:
#reformat the dates so they can be compared
dfLim['date'] = dfLim['date'].apply(lambda x: datetime.strptime(x, "%B %d, %Y"))
dfLim


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim['date'] = dfLim['date'].apply(lambda x: datetime.strptime(x, "%B %d, %Y"))


Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,awayTeam,awayTeamSubject_id,awayTeam_points_total,winner
0,2024-06-17,Boston Celtics,2,106,Dallas Mavericks,7,88,Boston Celtics
1,2024-06-14,Dallas Mavericks,7,122,Boston Celtics,2,84,Dallas Mavericks
2,2024-06-12,Dallas Mavericks,7,99,Boston Celtics,2,106,Boston Celtics
3,2024-06-09,Boston Celtics,2,105,Dallas Mavericks,7,98,Boston Celtics
4,2024-06-06,Boston Celtics,2,107,Dallas Mavericks,7,89,Boston Celtics
...,...,...,...,...,...,...,...,...
12879,2014-04-30,Houston Rockets,11,108,Portland Trail Blazers,25,98,Houston Rockets
12880,2014-04-30,San Antonio Spurs,27,109,Dallas Mavericks,7,103,San Antonio Spurs
12881,2014-04-29,Chicago Bulls,5,69,Washington Wizards,30,75,Washington Wizards
12882,2014-04-29,Oklahoma City Thunder,21,99,Memphis Grizzlies,15,100,Memphis Grizzlies


In [119]:
dfLim['winner_binary'] = (dfLim['winner'] == df['awayTeam']).astype(int)
dfLim

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim['winner_binary'] = (dfLim['winner'] == df['awayTeam']).astype(int)


Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,awayTeam,awayTeamSubject_id,awayTeam_points_total,winner,winner_binary
0,2024-06-17,Boston Celtics,2,106,Dallas Mavericks,7,88,Boston Celtics,0
1,2024-06-14,Dallas Mavericks,7,122,Boston Celtics,2,84,Dallas Mavericks,0
2,2024-06-12,Dallas Mavericks,7,99,Boston Celtics,2,106,Boston Celtics,1
3,2024-06-09,Boston Celtics,2,105,Dallas Mavericks,7,98,Boston Celtics,0
4,2024-06-06,Boston Celtics,2,107,Dallas Mavericks,7,89,Boston Celtics,0
...,...,...,...,...,...,...,...,...,...
12879,2014-04-30,Houston Rockets,11,108,Portland Trail Blazers,25,98,Houston Rockets,0
12880,2014-04-30,San Antonio Spurs,27,109,Dallas Mavericks,7,103,San Antonio Spurs,0
12881,2014-04-29,Chicago Bulls,5,69,Washington Wizards,30,75,Washington Wizards,1
12882,2014-04-29,Oklahoma City Thunder,21,99,Memphis Grizzlies,15,100,Memphis Grizzlies,1


In [120]:
dfIDtoStat = pd.DataFrame(columns=['date', 'team_id', 'team_points_total'])
dfIDtoStat

Unnamed: 0,date,team_id,team_points_total


In [121]:
for index, row in dfLim.iterrows():
    date = row['date']
    homeTeam_id = row['homeTeamSubject_id']
    homeTeam_points_total = row['homeTeam_points_total']
    awayTeam_id = row['awayTeamSubject_id']
    awayTeam_points_total = row['awayTeam_points_total']
    dfIDtoStat.loc[len(dfIDtoStat)] = {"date": date, 'team_id': homeTeam_id, 'team_points_total':homeTeam_points_total}
    dfIDtoStat.loc[len(dfIDtoStat)] = {"date": date, 'team_id': awayTeam_id, 'team_points_total':awayTeam_points_total}
dfIDtoStat

Unnamed: 0,date,team_id,team_points_total
0,2024-06-17,2,106
1,2024-06-17,7,88
2,2024-06-14,7,122
3,2024-06-14,2,84
4,2024-06-12,7,99
...,...,...,...
25763,2014-04-29,30,75
25764,2014-04-29,21,99
25765,2014-04-29,15,100
25766,2014-04-29,13,113


In [122]:
dfIDtoStat.head()

Unnamed: 0,date,team_id,team_points_total
0,2024-06-17,2,106
1,2024-06-17,7,88
2,2024-06-14,7,122
3,2024-06-14,2,84
4,2024-06-12,7,99


In [123]:
dfLim = dfLim.iloc[::-1].reset_index(drop=True)
dfLim

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,awayTeam,awayTeamSubject_id,awayTeam_points_total,winner,winner_binary
0,2014-04-29,Los Angeles Clippers,13,113,Golden State Warriors,10,103,Los Angeles Clippers,0
1,2014-04-29,Oklahoma City Thunder,21,99,Memphis Grizzlies,15,100,Memphis Grizzlies,1
2,2014-04-29,Chicago Bulls,5,69,Washington Wizards,30,75,Washington Wizards,1
3,2014-04-30,San Antonio Spurs,27,109,Dallas Mavericks,7,103,San Antonio Spurs,0
4,2014-04-30,Houston Rockets,11,108,Portland Trail Blazers,25,98,Houston Rockets,0
...,...,...,...,...,...,...,...,...,...
12879,2024-06-06,Boston Celtics,2,107,Dallas Mavericks,7,89,Boston Celtics,0
12880,2024-06-09,Boston Celtics,2,105,Dallas Mavericks,7,98,Boston Celtics,0
12881,2024-06-12,Dallas Mavericks,7,99,Boston Celtics,2,106,Boston Celtics,1
12882,2024-06-14,Dallas Mavericks,7,122,Boston Celtics,2,84,Dallas Mavericks,0


In [124]:
dfIDtoStat = dfIDtoStat.iloc[::-1].reset_index(drop=True)
dfIDtoStat

Unnamed: 0,date,team_id,team_points_total
0,2014-04-29,10,103
1,2014-04-29,13,113
2,2014-04-29,15,100
3,2014-04-29,21,99
4,2014-04-29,30,75
...,...,...,...
25763,2024-06-12,7,99
25764,2024-06-14,2,84
25765,2024-06-14,7,122
25766,2024-06-17,7,88


In [125]:
'''def df_to_X_y_Simple(df, window_size=5):
    df_as_np = df.to_numpy()
    X = []
    y = []
    for i in range(len(df_as_np)-window_size):
        row = [r for r in df_as_np[i:i+window_size]]
        X.append(row)
        label = df_as_np[i+window_size][0]
        y.append(label)
    return np.array(X), np.array(y)'''

def df_to_X_y(df, dfWindow, window_size):
    X = []
    y = []
    #each team must have played window_size games before data can be extracted
    for index, row in df.iterrows():
        homeTeam_id = row['homeTeamSubject_id']
        awayTeam_id = row['awayTeamSubject_id']
        date = row['date']
        #limit dfWindow to include rows that occured before date of row we are on
        dfWindow_before_date = dfWindow[dfWindow['date']< date]
        homeTeam_occurrences = dfWindow_before_date[dfWindow_before_date['team_id'] == homeTeam_id].shape[0]
        awayTeam_occurrences = dfWindow_before_date[dfWindow_before_date['team_id'] == awayTeam_id].shape[0]
        if homeTeam_occurrences > window_size and awayTeam_occurrences > window_size:
            homeTeam_window = (dfWindow_before_date[dfWindow_before_date['team_id'] == homeTeam_id].sort_values(by='date', ascending=False).head(window_size).drop(columns=['date', 'team_id']))
            awayTeam_window = (dfWindow_before_date[dfWindow_before_date['team_id'] == awayTeam_id].sort_values(by='date', ascending=False).head(window_size).drop(columns=['date', 'team_id']))
            homeTeam_window = homeTeam_window.to_numpy()
            awayTeam_window = awayTeam_window.to_numpy()
            combined_window = np.hstack((homeTeam_window, awayTeam_window))
            X.append(combined_window)
            y.append(row['winner_binary'])
    
    return np.array(X), np.array(y)   



In [112]:
window_size =5 
exX =[]
exY = []
for index, row in dfLim.iterrows():
        homeTeam_id = row['homeTeamSubject_id']
        awayTeam_id = row['awayTeamSubject_id']
        date = row['date']
        #limit dfWindow to include rows that occured before date of row we are on
        dfWindow_before_date = dfIDtoStat[dfIDtoStat['date']< date]
        homeTeam_occurrences = dfWindow_before_date[dfWindow_before_date['team_id'] == homeTeam_id].shape[0]
        awayTeam_occurrences = dfWindow_before_date[dfWindow_before_date['team_id'] == awayTeam_id].shape[0]
        if homeTeam_occurrences > window_size and awayTeam_occurrences > window_size:
                homeTeam_window = (dfWindow_before_date[dfWindow_before_date['team_id'] == homeTeam_id].sort_values(by='date', ascending=False).head(window_size))
                awayTeam_window = (dfWindow_before_date[dfWindow_before_date['team_id'] == awayTeam_id].sort_values(by='date', ascending=False).head(window_size))
                print(date)
                print(homeTeam_window)
                print(awayTeam_window)
                print(type(homeTeam_window))
                homeTeam_window = homeTeam_window.drop(columns=['date', 'team_id'])
                awayTeam_window = awayTeam_window.drop(columns=['date', 'team_id'])
                homeTeam_window = homeTeam_window.to_numpy()
                awayTeam_window = awayTeam_window.to_numpy()
                combined_window = np.hstack((homeTeam_window, awayTeam_window))
                exX.append(combined_window)
                exY.append(row['winner_binary'])
                print(exX)
                print(exY)
                break
                        

                

2014-05-11 00:00:00
         date  team_id  team_points_total
51 2014-05-09       13                112
42 2014-05-07       13                101
34 2014-05-05       13                122
25 2014-05-03       13                126
12 2014-05-01       13                 99
         date  team_id  team_points_total
50 2014-05-09       21                118
43 2014-05-07       21                112
35 2014-05-05       21                105
27 2014-05-03       21                120
14 2014-05-01       21                104
<class 'pandas.core.frame.DataFrame'>
[array([[112, 118],
       [101, 112],
       [122, 105],
       [126, 120],
       [ 99, 104]], dtype=int64)]
[0]


In [126]:
#X1.Shape is num data, window size, variables
X1, y1 = df_to_X_y(dfLim, dfIDtoStat, 5)
X1.shape, y1.shape

((12773, 5, 2), (12773,))

In [None]:
#70-15-15 split
X1_train, X1_temp, y1_train, y1_temp = train_test_split(X1, y1, test_size=0.3, random_state=42)
X1_val, X1_test, y1_val, y1_test = train_test_split(X1_temp, y1_temp, test_size=0.5, random_state=42)
X1_train.shape, y1_train.shape, X1_val.shape, y1_val.shape, X1_test.shape, y1_test.shape

((8941, 5, 2), (8941,), (1916, 5, 2), (1916,), (1916, 5, 2), (1916,))

In [132]:
model1 = Sequential()
model1.add(InputLayer((5, 2)))
model1.add(LSTM(64, activation='tanh', return_sequences=True))
model1.add(LSTM(32, activation='tanh'))
model1.add(Dense(16, activation='relu'))
model1.add(Dense(8, activation='relu'))
model1.add(Dense(1, activation='sigmoid'))

model1.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 5, 64)             17152     
                                                                 
 lstm_3 (LSTM)               (None, 32)                12416     
                                                                 
 dense (Dense)               (None, 16)                528       
                                                                 
 dense_1 (Dense)             (None, 8)                 136       
                                                                 
 dense_2 (Dense)             (None, 1)                 9         
                                                                 
Total params: 30241 (118.13 KB)
Trainable params: 30241 (118.13 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [134]:
earlystopping = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)
model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [135]:
model1.fit(X1_train, y1_train, validation_data=(X1_val, y1_val), epochs=100, callbacks=[earlystopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100


<keras.src.callbacks.History at 0x208267a6940>

In [137]:
y_pred = model1.predict(X1_test)
y_pred_binary = (y_pred > 0.5).astype(int)
print(accuracy_score(y1_test, y_pred_binary))


0.5793319415448852


In [150]:
#Now adding 3 point percentages
dfLim2 = df[[
    "date",
    "homeTeam",
    "homeTeamSubject_id", 
    "homeTeam_points_total", 
    #added homeTeam3P
    "homeTeam_3P",
    "awayTeam",
    "awayTeamSubject_id", 
    "awayTeam_points_total",
    #added awayTeam3P
    "awayTeam_3P",
    "winner"
]]
dfLim2

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,winner
0,"June 17, 2024",Boston Celtics,2,106,13-39,Dallas Mavericks,7,88,11-37,Boston Celtics
1,"June 14, 2024",Dallas Mavericks,7,122,15-37,Boston Celtics,2,84,14-41,Dallas Mavericks
2,"June 12, 2024",Dallas Mavericks,7,99,9-25,Boston Celtics,2,106,17-46,Boston Celtics
3,"June 09, 2024",Boston Celtics,2,105,10-39,Dallas Mavericks,7,98,6-26,Boston Celtics
4,"June 06, 2024",Boston Celtics,2,107,16-42,Dallas Mavericks,7,89,7-27,Boston Celtics
...,...,...,...,...,...,...,...,...,...,...
12879,"April 30, 2014",Houston Rockets,11,108,8-25,Portland Trail Blazers,25,98,9-25,Houston Rockets
12880,"April 30, 2014",San Antonio Spurs,27,109,8-16,Dallas Mavericks,7,103,12-28,San Antonio Spurs
12881,"April 29, 2014",Chicago Bulls,5,69,6-16,Washington Wizards,30,75,2-9,Washington Wizards
12882,"April 29, 2014",Oklahoma City Thunder,21,99,12-31,Memphis Grizzlies,15,100,7-19,Memphis Grizzlies


In [151]:
#reformat the dates so they can be compared
dfLim2['date'] = dfLim2['date'].apply(lambda x: datetime.strptime(x, "%B %d, %Y"))
dfLim2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim2['date'] = dfLim2['date'].apply(lambda x: datetime.strptime(x, "%B %d, %Y"))


Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,winner
0,2024-06-17,Boston Celtics,2,106,13-39,Dallas Mavericks,7,88,11-37,Boston Celtics
1,2024-06-14,Dallas Mavericks,7,122,15-37,Boston Celtics,2,84,14-41,Dallas Mavericks
2,2024-06-12,Dallas Mavericks,7,99,9-25,Boston Celtics,2,106,17-46,Boston Celtics
3,2024-06-09,Boston Celtics,2,105,10-39,Dallas Mavericks,7,98,6-26,Boston Celtics
4,2024-06-06,Boston Celtics,2,107,16-42,Dallas Mavericks,7,89,7-27,Boston Celtics
...,...,...,...,...,...,...,...,...,...,...
12879,2014-04-30,Houston Rockets,11,108,8-25,Portland Trail Blazers,25,98,9-25,Houston Rockets
12880,2014-04-30,San Antonio Spurs,27,109,8-16,Dallas Mavericks,7,103,12-28,San Antonio Spurs
12881,2014-04-29,Chicago Bulls,5,69,6-16,Washington Wizards,30,75,2-9,Washington Wizards
12882,2014-04-29,Oklahoma City Thunder,21,99,12-31,Memphis Grizzlies,15,100,7-19,Memphis Grizzlies


In [152]:
dfLim2['winner_binary'] = (dfLim2['winner'] == df['awayTeam']).astype(int)
dfLim2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim2['winner_binary'] = (dfLim2['winner'] == df['awayTeam']).astype(int)


Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,winner,winner_binary
0,2024-06-17,Boston Celtics,2,106,13-39,Dallas Mavericks,7,88,11-37,Boston Celtics,0
1,2024-06-14,Dallas Mavericks,7,122,15-37,Boston Celtics,2,84,14-41,Dallas Mavericks,0
2,2024-06-12,Dallas Mavericks,7,99,9-25,Boston Celtics,2,106,17-46,Boston Celtics,1
3,2024-06-09,Boston Celtics,2,105,10-39,Dallas Mavericks,7,98,6-26,Boston Celtics,0
4,2024-06-06,Boston Celtics,2,107,16-42,Dallas Mavericks,7,89,7-27,Boston Celtics,0
...,...,...,...,...,...,...,...,...,...,...,...
12879,2014-04-30,Houston Rockets,11,108,8-25,Portland Trail Blazers,25,98,9-25,Houston Rockets,0
12880,2014-04-30,San Antonio Spurs,27,109,8-16,Dallas Mavericks,7,103,12-28,San Antonio Spurs,0
12881,2014-04-29,Chicago Bulls,5,69,6-16,Washington Wizards,30,75,2-9,Washington Wizards,1
12882,2014-04-29,Oklahoma City Thunder,21,99,12-31,Memphis Grizzlies,15,100,7-19,Memphis Grizzlies,1


In [153]:
#Format as percentage
dfLim2['homeTeam_3P'] = dfLim2['homeTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim2['awayTeam_3P'] = dfLim2['awayTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim2['homeTeam_3P'] = dfLim2['homeTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim2['awayTeam_3P'] = dfLim2['awayTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)


Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,winner,winner_binary
0,2024-06-17,Boston Celtics,2,106,0.333333,Dallas Mavericks,7,88,0.297297,Boston Celtics,0
1,2024-06-14,Dallas Mavericks,7,122,0.405405,Boston Celtics,2,84,0.341463,Dallas Mavericks,0
2,2024-06-12,Dallas Mavericks,7,99,0.360000,Boston Celtics,2,106,0.369565,Boston Celtics,1
3,2024-06-09,Boston Celtics,2,105,0.256410,Dallas Mavericks,7,98,0.230769,Boston Celtics,0
4,2024-06-06,Boston Celtics,2,107,0.380952,Dallas Mavericks,7,89,0.259259,Boston Celtics,0
...,...,...,...,...,...,...,...,...,...,...,...
12879,2014-04-30,Houston Rockets,11,108,0.320000,Portland Trail Blazers,25,98,0.360000,Houston Rockets,0
12880,2014-04-30,San Antonio Spurs,27,109,0.500000,Dallas Mavericks,7,103,0.428571,San Antonio Spurs,0
12881,2014-04-29,Chicago Bulls,5,69,0.375000,Washington Wizards,30,75,0.222222,Washington Wizards,1
12882,2014-04-29,Oklahoma City Thunder,21,99,0.387097,Memphis Grizzlies,15,100,0.368421,Memphis Grizzlies,1


In [154]:
dfIDtoStat2 = pd.DataFrame(columns=['date', 'team_id', 'team_points_total', '3P%'])
dfIDtoStat2

Unnamed: 0,date,team_id,team_points_total,3P%


In [155]:
#populate dfIDtoStat2
for index, row in dfLim2.iterrows():
    date = row['date']
    homeTeam_id = row['homeTeamSubject_id']
    homeTeam_points_total = row['homeTeam_points_total']
    #added homeTeam3P
    homeTeam_3P = row['homeTeam_3P']
    awayTeam_id = row['awayTeamSubject_id']
    awayTeam_points_total = row['awayTeam_points_total']
    #added awayTeam3P
    awayTeam_3P = row['homeTeam_3P']
    dfIDtoStat2.loc[len(dfIDtoStat2)] = {"date": date, 'team_id': homeTeam_id, 'team_points_total':homeTeam_points_total, '3P%':homeTeam_3P}
    dfIDtoStat2.loc[len(dfIDtoStat2)] = {"date": date, 'team_id': awayTeam_id, 'team_points_total':awayTeam_points_total, '3P%':awayTeam_3P}
dfIDtoStat2

Unnamed: 0,date,team_id,team_points_total,3P%
0,2024-06-17,2,106,0.333333
1,2024-06-17,7,88,0.333333
2,2024-06-14,7,122,0.405405
3,2024-06-14,2,84,0.405405
4,2024-06-12,7,99,0.360000
...,...,...,...,...
25763,2014-04-29,30,75,0.375000
25764,2014-04-29,21,99,0.387097
25765,2014-04-29,15,100,0.387097
25766,2014-04-29,13,113,0.400000


In [None]:
#reverse df order - should now start with 2014
dfLim2 = dfLim2.iloc[::-1].reset_index(drop=True)
dfLim2

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,winner,winner_binary
0,2014-04-29,Los Angeles Clippers,13,113,0.400000,Golden State Warriors,10,103,0.384615,Los Angeles Clippers,0
1,2014-04-29,Oklahoma City Thunder,21,99,0.387097,Memphis Grizzlies,15,100,0.368421,Memphis Grizzlies,1
2,2014-04-29,Chicago Bulls,5,69,0.375000,Washington Wizards,30,75,0.222222,Washington Wizards,1
3,2014-04-30,San Antonio Spurs,27,109,0.500000,Dallas Mavericks,7,103,0.428571,San Antonio Spurs,0
4,2014-04-30,Houston Rockets,11,108,0.320000,Portland Trail Blazers,25,98,0.360000,Houston Rockets,0
...,...,...,...,...,...,...,...,...,...,...,...
12879,2024-06-06,Boston Celtics,2,107,0.380952,Dallas Mavericks,7,89,0.259259,Boston Celtics,0
12880,2024-06-09,Boston Celtics,2,105,0.256410,Dallas Mavericks,7,98,0.230769,Boston Celtics,0
12881,2024-06-12,Dallas Mavericks,7,99,0.360000,Boston Celtics,2,106,0.369565,Boston Celtics,1
12882,2024-06-14,Dallas Mavericks,7,122,0.405405,Boston Celtics,2,84,0.341463,Dallas Mavericks,0


In [None]:
#reverse order - should now start with 2014
dfIDtoStat2 = dfIDtoStat2.iloc[::-1].reset_index(drop=True)
dfIDtoStat2

Unnamed: 0,date,team_id,team_points_total,3P%
0,2014-04-29,10,103,0.400000
1,2014-04-29,13,113,0.400000
2,2014-04-29,15,100,0.387097
3,2014-04-29,21,99,0.387097
4,2014-04-29,30,75,0.375000
...,...,...,...,...
25763,2024-06-12,7,99,0.360000
25764,2024-06-14,2,84,0.405405
25765,2024-06-14,7,122,0.405405
25766,2024-06-17,7,88,0.333333


In [158]:
#X1.Shape is num data, window size, variables
X2, y2 = df_to_X_y(dfLim2, dfIDtoStat2, 5)
X2.shape, y2.shape

((12773, 5, 4), (12773,))

In [159]:
#70-15-15 split
X2_train, X2_temp, y2_train, y2_temp = train_test_split(X2, y2, test_size=0.3, random_state=42)
X2_val, X2_test, y2_val, y2_test = train_test_split(X2_temp, y2_temp, test_size=0.5, random_state=42)
X2_train.shape, y2_train.shape, X2_val.shape, y2_val.shape, X2_test.shape, y2_test.shape

((8941, 5, 4), (8941,), (1916, 5, 4), (1916,), (1916, 5, 4), (1916,))

In [160]:
model2 = Sequential()
#You need to change the input layer to reflect the amount of variables. first number is window size, second is variables
model2.add(InputLayer((5, 4)))
model2.add(LSTM(64, activation='tanh', return_sequences=True))
model2.add(LSTM(32, activation='tanh'))
model2.add(Dense(16, activation='relu'))
model2.add(Dense(8, activation='relu'))
model2.add(Dense(1, activation='sigmoid'))

model2.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_4 (LSTM)               (None, 5, 64)             17664     
                                                                 
 lstm_5 (LSTM)               (None, 32)                12416     
                                                                 
 dense_3 (Dense)             (None, 16)                528       
                                                                 
 dense_4 (Dense)             (None, 8)                 136       
                                                                 
 dense_5 (Dense)             (None, 1)                 9         
                                                                 
Total params: 30753 (120.13 KB)
Trainable params: 30753 (120.13 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [161]:
earlystopping = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [162]:
model2.fit(X2_train, y2_train, validation_data=(X2_val, y2_val), epochs=100, callbacks=[earlystopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100


<keras.src.callbacks.History at 0x2083258dd60>

In [163]:
y_pred = model2.predict(X2_test)
y_pred_binary = (y_pred > 0.5).astype(int)
print(accuracy_score(y2_test, y_pred_binary))

0.587160751565762
