In [29]:
import tensorflow as tf
import os
import pandas as pd 
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense, InputLayer
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score

In [30]:
df = pd.read_csv('CopyOfNBADatawithWinandPlayoff.csv')
df

Unnamed: 0,date,homeTeam,homeTeam_id,homeTeam_points_total,homeTeam_points_q1,homeTeam_points_q2,homeTeam_points_q3,homeTeam_points_q4,homeTeam_points_1OT,homeTeam_points_2OT,...,awayTeam_Stl,awayTeam_Blk,awayTeam_TO,awayTeam_PF,winner,season,homeWinPct,awayWinPct,gameNumber,isPlayoffGame
0,2014-04-29,Chicago Bulls,CHI,69,15,26,11,17,0,0,...,9,11,12,19,Washington Wizards,2013-2014,0.000000,100.000000,1,False
1,2014-04-29,Oklahoma City Thunder,OKC,99,25,18,27,20,9,0,...,11,3,9,21,Memphis Grizzlies,2013-2014,0.000000,100.000000,2,False
2,2014-04-29,Los Angeles Clippers,LAC,113,31,24,22,36,0,0,...,6,1,13,28,Los Angeles Clippers,2013-2014,100.000000,0.000000,3,False
3,2014-04-30,Toronto Raptors,TOR,115,28,34,29,24,0,0,...,8,4,14,27,Toronto Raptors,2013-2014,100.000000,0.000000,4,False
4,2014-04-30,Houston Rockets,HOU,108,30,26,26,26,0,0,...,8,14,10,21,Houston Rockets,2013-2014,100.000000,0.000000,5,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12879,2024-06-06,Boston Celtics,BOS,107,37,26,23,21,0,0,...,8,1,11,16,Boston Celtics,2023-2024,79.381443,62.000000,1315,True
12880,2024-06-09,Boston Celtics,BOS,105,25,29,29,22,0,0,...,5,3,15,17,Boston Celtics,2023-2024,79.591837,61.386139,1316,True
12881,2024-06-12,Dallas Mavericks,DAL,99,31,20,19,29,0,0,...,4,6,9,19,Boston Celtics,2023-2024,60.784314,79.797980,1317,True
12882,2024-06-14,Dallas Mavericks,DAL,122,34,27,31,30,0,0,...,2,5,13,19,Dallas Mavericks,2023-2024,61.165049,79.000000,1318,True


In [31]:
subjectID_dict = {
    "ATL": 1,
    "BOS": 2,
    "BKN": 3,
    "CHA": 4,
    "CHI": 5,
    "CLE": 6,
    "DAL": 7,
    "DEN": 8,
    "DET": 9,
    "GSW": 10,
    "HOU": 11,
    "IND": 12,
    "LAC": 13,
    "LAL": 14,
    "MEM": 15,
    "MIA": 16,
    "MIL": 17,
    "MIN": 18,
    "NOP": 19,
    "NYK": 20,
    "OKC": 21,
    "ORL": 22,
    "PHI": 23,
    "PHX": 24,
    "POR": 25,
    "SAC": 26,
    "SAS": 27,
    "TOR": 28,
    "UTA": 29,
    "WAS": 30
}

In [32]:
df["homeTeamSubject_id"] = df["homeTeam_id"].map(subjectID_dict)
df["awayTeamSubject_id"] = df["awayTeam_id"].map(subjectID_dict)

cols = df.columns.tolist()
cols.insert(cols.index("homeTeam_id") + 1, cols.pop(cols.index("homeTeamSubject_id")))
cols.insert(cols.index("awayTeam_id") + 1, cols.pop(cols.index("awayTeamSubject_id")))
df = df[cols]

df

Unnamed: 0,date,homeTeam,homeTeam_id,homeTeamSubject_id,homeTeam_points_total,homeTeam_points_q1,homeTeam_points_q2,homeTeam_points_q3,homeTeam_points_q4,homeTeam_points_1OT,...,awayTeam_Stl,awayTeam_Blk,awayTeam_TO,awayTeam_PF,winner,season,homeWinPct,awayWinPct,gameNumber,isPlayoffGame
0,2014-04-29,Chicago Bulls,CHI,5,69,15,26,11,17,0,...,9,11,12,19,Washington Wizards,2013-2014,0.000000,100.000000,1,False
1,2014-04-29,Oklahoma City Thunder,OKC,21,99,25,18,27,20,9,...,11,3,9,21,Memphis Grizzlies,2013-2014,0.000000,100.000000,2,False
2,2014-04-29,Los Angeles Clippers,LAC,13,113,31,24,22,36,0,...,6,1,13,28,Los Angeles Clippers,2013-2014,100.000000,0.000000,3,False
3,2014-04-30,Toronto Raptors,TOR,28,115,28,34,29,24,0,...,8,4,14,27,Toronto Raptors,2013-2014,100.000000,0.000000,4,False
4,2014-04-30,Houston Rockets,HOU,11,108,30,26,26,26,0,...,8,14,10,21,Houston Rockets,2013-2014,100.000000,0.000000,5,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12879,2024-06-06,Boston Celtics,BOS,2,107,37,26,23,21,0,...,8,1,11,16,Boston Celtics,2023-2024,79.381443,62.000000,1315,True
12880,2024-06-09,Boston Celtics,BOS,2,105,25,29,29,22,0,...,5,3,15,17,Boston Celtics,2023-2024,79.591837,61.386139,1316,True
12881,2024-06-12,Dallas Mavericks,DAL,7,99,31,20,19,29,0,...,4,6,9,19,Boston Celtics,2023-2024,60.784314,79.797980,1317,True
12882,2024-06-14,Dallas Mavericks,DAL,7,122,34,27,31,30,0,...,2,5,13,19,Dallas Mavericks,2023-2024,61.165049,79.000000,1318,True


In [5]:
#limit df to few variables
dfLim = df[[
    "date",
    "homeTeam",
    "homeTeamSubject_id", 
    "homeTeam_points_total", 
    "awayTeam",
    "awayTeamSubject_id", 
    "awayTeam_points_total",
    "winner"
]]
dfLim

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,awayTeam,awayTeamSubject_id,awayTeam_points_total,winner
0,"June 17, 2024",Boston Celtics,2,106,Dallas Mavericks,7,88,Boston Celtics
1,"June 14, 2024",Dallas Mavericks,7,122,Boston Celtics,2,84,Dallas Mavericks
2,"June 12, 2024",Dallas Mavericks,7,99,Boston Celtics,2,106,Boston Celtics
3,"June 09, 2024",Boston Celtics,2,105,Dallas Mavericks,7,98,Boston Celtics
4,"June 06, 2024",Boston Celtics,2,107,Dallas Mavericks,7,89,Boston Celtics
...,...,...,...,...,...,...,...,...
12879,"April 30, 2014",Houston Rockets,11,108,Portland Trail Blazers,25,98,Houston Rockets
12880,"April 30, 2014",San Antonio Spurs,27,109,Dallas Mavericks,7,103,San Antonio Spurs
12881,"April 29, 2014",Chicago Bulls,5,69,Washington Wizards,30,75,Washington Wizards
12882,"April 29, 2014",Oklahoma City Thunder,21,99,Memphis Grizzlies,15,100,Memphis Grizzlies


In [6]:
#reformat the dates so they can be compared
dfLim['date'] = dfLim['date'].apply(lambda x: datetime.strptime(x, "%B %d, %Y"))
dfLim


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim['date'] = dfLim['date'].apply(lambda x: datetime.strptime(x, "%B %d, %Y"))


Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,awayTeam,awayTeamSubject_id,awayTeam_points_total,winner
0,2024-06-17,Boston Celtics,2,106,Dallas Mavericks,7,88,Boston Celtics
1,2024-06-14,Dallas Mavericks,7,122,Boston Celtics,2,84,Dallas Mavericks
2,2024-06-12,Dallas Mavericks,7,99,Boston Celtics,2,106,Boston Celtics
3,2024-06-09,Boston Celtics,2,105,Dallas Mavericks,7,98,Boston Celtics
4,2024-06-06,Boston Celtics,2,107,Dallas Mavericks,7,89,Boston Celtics
...,...,...,...,...,...,...,...,...
12879,2014-04-30,Houston Rockets,11,108,Portland Trail Blazers,25,98,Houston Rockets
12880,2014-04-30,San Antonio Spurs,27,109,Dallas Mavericks,7,103,San Antonio Spurs
12881,2014-04-29,Chicago Bulls,5,69,Washington Wizards,30,75,Washington Wizards
12882,2014-04-29,Oklahoma City Thunder,21,99,Memphis Grizzlies,15,100,Memphis Grizzlies


In [7]:
dfLim['winner_binary'] = (dfLim['winner'] == df['awayTeam']).astype(int)
dfLim

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim['winner_binary'] = (dfLim['winner'] == df['awayTeam']).astype(int)


Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,awayTeam,awayTeamSubject_id,awayTeam_points_total,winner,winner_binary
0,2024-06-17,Boston Celtics,2,106,Dallas Mavericks,7,88,Boston Celtics,0
1,2024-06-14,Dallas Mavericks,7,122,Boston Celtics,2,84,Dallas Mavericks,0
2,2024-06-12,Dallas Mavericks,7,99,Boston Celtics,2,106,Boston Celtics,1
3,2024-06-09,Boston Celtics,2,105,Dallas Mavericks,7,98,Boston Celtics,0
4,2024-06-06,Boston Celtics,2,107,Dallas Mavericks,7,89,Boston Celtics,0
...,...,...,...,...,...,...,...,...,...
12879,2014-04-30,Houston Rockets,11,108,Portland Trail Blazers,25,98,Houston Rockets,0
12880,2014-04-30,San Antonio Spurs,27,109,Dallas Mavericks,7,103,San Antonio Spurs,0
12881,2014-04-29,Chicago Bulls,5,69,Washington Wizards,30,75,Washington Wizards,1
12882,2014-04-29,Oklahoma City Thunder,21,99,Memphis Grizzlies,15,100,Memphis Grizzlies,1


In [8]:
dfIDtoStat = pd.DataFrame(columns=['date', 'team_id', 'team_points_total'])
dfIDtoStat

Unnamed: 0,date,team_id,team_points_total


In [9]:
for index, row in dfLim.iterrows():
    date = row['date']
    homeTeam_id = row['homeTeamSubject_id']
    homeTeam_points_total = row['homeTeam_points_total']
    awayTeam_id = row['awayTeamSubject_id']
    awayTeam_points_total = row['awayTeam_points_total']
    dfIDtoStat.loc[len(dfIDtoStat)] = {"date": date, 'team_id': homeTeam_id, 'team_points_total':homeTeam_points_total}
    dfIDtoStat.loc[len(dfIDtoStat)] = {"date": date, 'team_id': awayTeam_id, 'team_points_total':awayTeam_points_total}
dfIDtoStat

Unnamed: 0,date,team_id,team_points_total
0,2024-06-17,2,106
1,2024-06-17,7,88
2,2024-06-14,7,122
3,2024-06-14,2,84
4,2024-06-12,7,99
...,...,...,...
25763,2014-04-29,30,75
25764,2014-04-29,21,99
25765,2014-04-29,15,100
25766,2014-04-29,13,113


In [10]:
dfIDtoStat.head()

Unnamed: 0,date,team_id,team_points_total
0,2024-06-17,2,106
1,2024-06-17,7,88
2,2024-06-14,7,122
3,2024-06-14,2,84
4,2024-06-12,7,99


In [11]:
dfLim = dfLim.iloc[::-1].reset_index(drop=True)
dfLim

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,awayTeam,awayTeamSubject_id,awayTeam_points_total,winner,winner_binary
0,2014-04-29,Los Angeles Clippers,13,113,Golden State Warriors,10,103,Los Angeles Clippers,0
1,2014-04-29,Oklahoma City Thunder,21,99,Memphis Grizzlies,15,100,Memphis Grizzlies,1
2,2014-04-29,Chicago Bulls,5,69,Washington Wizards,30,75,Washington Wizards,1
3,2014-04-30,San Antonio Spurs,27,109,Dallas Mavericks,7,103,San Antonio Spurs,0
4,2014-04-30,Houston Rockets,11,108,Portland Trail Blazers,25,98,Houston Rockets,0
...,...,...,...,...,...,...,...,...,...
12879,2024-06-06,Boston Celtics,2,107,Dallas Mavericks,7,89,Boston Celtics,0
12880,2024-06-09,Boston Celtics,2,105,Dallas Mavericks,7,98,Boston Celtics,0
12881,2024-06-12,Dallas Mavericks,7,99,Boston Celtics,2,106,Boston Celtics,1
12882,2024-06-14,Dallas Mavericks,7,122,Boston Celtics,2,84,Dallas Mavericks,0


In [12]:
dfIDtoStat = dfIDtoStat.iloc[::-1].reset_index(drop=True)
dfIDtoStat

Unnamed: 0,date,team_id,team_points_total
0,2014-04-29,10,103
1,2014-04-29,13,113
2,2014-04-29,15,100
3,2014-04-29,21,99
4,2014-04-29,30,75
...,...,...,...
25763,2024-06-12,7,99
25764,2024-06-14,2,84
25765,2024-06-14,7,122
25766,2024-06-17,7,88


In [13]:
'''def df_to_X_y_Simple(df, window_size=5):
    df_as_np = df.to_numpy()
    X = []
    y = []
    for i in range(len(df_as_np)-window_size):
        row = [r for r in df_as_np[i:i+window_size]]
        X.append(row)
        label = df_as_np[i+window_size][0]
        y.append(label)
    return np.array(X), np.array(y)'''

def df_to_X_y(df, dfWindow, window_size):
    X = []
    y = []
    #each team must have played window_size games before data can be extracted
    for index, row in df.iterrows():
        homeTeam_id = row['homeTeamSubject_id']
        awayTeam_id = row['awayTeamSubject_id']
        date = row['date']
        #limit dfWindow to include rows that occured before date of row we are on
        dfWindow_before_date = dfWindow[dfWindow['date']< date]
        homeTeam_occurrences = dfWindow_before_date[dfWindow_before_date['team_id'] == homeTeam_id].shape[0]
        awayTeam_occurrences = dfWindow_before_date[dfWindow_before_date['team_id'] == awayTeam_id].shape[0]
        if homeTeam_occurrences > window_size and awayTeam_occurrences > window_size:
            homeTeam_window = (dfWindow_before_date[dfWindow_before_date['team_id'] == homeTeam_id].sort_values(by='date', ascending=False).head(window_size).drop(columns=['date', 'team_id']))
            awayTeam_window = (dfWindow_before_date[dfWindow_before_date['team_id'] == awayTeam_id].sort_values(by='date', ascending=False).head(window_size).drop(columns=['date', 'team_id']))
            homeTeam_window = homeTeam_window.to_numpy()
            awayTeam_window = awayTeam_window.to_numpy()
            combined_window = np.hstack((homeTeam_window, awayTeam_window))
            X.append(combined_window)
            y.append(row['winner_binary'])
    
    return np.array(X), np.array(y)   



In [14]:
window_size =5 
exX =[]
exY = []
for index, row in dfLim.iterrows():
        homeTeam_id = row['homeTeamSubject_id']
        awayTeam_id = row['awayTeamSubject_id']
        date = row['date']
        #limit dfWindow to include rows that occured before date of row we are on
        dfWindow_before_date = dfIDtoStat[dfIDtoStat['date']< date]
        homeTeam_occurrences = dfWindow_before_date[dfWindow_before_date['team_id'] == homeTeam_id].shape[0]
        awayTeam_occurrences = dfWindow_before_date[dfWindow_before_date['team_id'] == awayTeam_id].shape[0]
        if homeTeam_occurrences > window_size and awayTeam_occurrences > window_size:
                homeTeam_window = (dfWindow_before_date[dfWindow_before_date['team_id'] == homeTeam_id].sort_values(by='date', ascending=False).head(window_size))
                awayTeam_window = (dfWindow_before_date[dfWindow_before_date['team_id'] == awayTeam_id].sort_values(by='date', ascending=False).head(window_size))
                print(date)
                print(homeTeam_window)
                print(awayTeam_window)
                print(type(homeTeam_window))
                homeTeam_window = homeTeam_window.drop(columns=['date', 'team_id'])
                awayTeam_window = awayTeam_window.drop(columns=['date', 'team_id'])
                homeTeam_window = homeTeam_window.to_numpy()
                awayTeam_window = awayTeam_window.to_numpy()
                combined_window = np.hstack((homeTeam_window, awayTeam_window))
                exX.append(combined_window)
                exY.append(row['winner_binary'])
                print(exX)
                print(exY)
                break
                        

                

2014-05-11 00:00:00
         date  team_id  team_points_total
51 2014-05-09       13                112
42 2014-05-07       13                101
34 2014-05-05       13                122
25 2014-05-03       13                126
12 2014-05-01       13                 99
         date  team_id  team_points_total
50 2014-05-09       21                118
43 2014-05-07       21                112
35 2014-05-05       21                105
27 2014-05-03       21                120
14 2014-05-01       21                104
<class 'pandas.core.frame.DataFrame'>
[array([[112, 118],
       [101, 112],
       [122, 105],
       [126, 120],
       [ 99, 104]])]
[0]


In [15]:
#X1.Shape is num data, window size, variables
X1, y1 = df_to_X_y(dfLim, dfIDtoStat, 5)
X1.shape, y1.shape

((12773, 5, 2), (12773,))

In [16]:
#70-15-15 split
X1_train, X1_temp, y1_train, y1_temp = train_test_split(X1, y1, test_size=0.3, random_state=42)
X1_val, X1_test, y1_val, y1_test = train_test_split(X1_temp, y1_temp, test_size=0.5, random_state=42)
X1_train.shape, y1_train.shape, X1_val.shape, y1_val.shape, X1_test.shape, y1_test.shape

((8941, 5, 2), (8941,), (1916, 5, 2), (1916,), (1916, 5, 2), (1916,))

In [17]:
model1 = Sequential()
model1.add(InputLayer((5, 2)))
model1.add(LSTM(64, activation='tanh', return_sequences=True))
model1.add(LSTM(32, activation='tanh'))
model1.add(Dense(16, activation='relu'))
model1.add(Dense(8, activation='relu'))
model1.add(Dense(1, activation='sigmoid'))

model1.summary()

In [18]:
earlystopping = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)
model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [19]:
model1.fit(X1_train, y1_train, validation_data=(X1_val, y1_val), epochs=100, callbacks=[earlystopping])

Epoch 1/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5723 - loss: 0.6803 - val_accuracy: 0.5694 - val_loss: 0.6760
Epoch 2/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5717 - loss: 0.6747 - val_accuracy: 0.5694 - val_loss: 0.6779
Epoch 3/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5760 - loss: 0.6732 - val_accuracy: 0.5872 - val_loss: 0.6753
Epoch 4/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5839 - loss: 0.6748 - val_accuracy: 0.5919 - val_loss: 0.6753
Epoch 5/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5805 - loss: 0.6761 - val_accuracy: 0.5783 - val_loss: 0.6757
Epoch 6/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5844 - loss: 0.6724 - val_accuracy: 0.5877 - val_loss: 0.6768
Epoch 7/100
[1m280/28

<keras.src.callbacks.history.History at 0x34f93cf80>

In [20]:
y_pred = model1.predict(X1_test)
y_pred_binary = (y_pred > 0.5).astype(int)
print(accuracy_score(y1_test, y_pred_binary))


[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
0.5788100208768268


In [21]:
#Now adding 3 point percentages
dfLim2 = df[[
    "date",
    "homeTeam",
    "homeTeamSubject_id", 
    "homeTeam_points_total", 
    #added homeTeam3P
    "homeTeam_3P",
    "awayTeam",
    "awayTeamSubject_id", 
    "awayTeam_points_total",
    #added awayTeam3P
    "awayTeam_3P",
    "winner"
]]
dfLim2

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,winner
0,"June 17, 2024",Boston Celtics,2,106,13-39,Dallas Mavericks,7,88,11-37,Boston Celtics
1,"June 14, 2024",Dallas Mavericks,7,122,15-37,Boston Celtics,2,84,14-41,Dallas Mavericks
2,"June 12, 2024",Dallas Mavericks,7,99,9-25,Boston Celtics,2,106,17-46,Boston Celtics
3,"June 09, 2024",Boston Celtics,2,105,10-39,Dallas Mavericks,7,98,6-26,Boston Celtics
4,"June 06, 2024",Boston Celtics,2,107,16-42,Dallas Mavericks,7,89,7-27,Boston Celtics
...,...,...,...,...,...,...,...,...,...,...
12879,"April 30, 2014",Houston Rockets,11,108,8-25,Portland Trail Blazers,25,98,9-25,Houston Rockets
12880,"April 30, 2014",San Antonio Spurs,27,109,8-16,Dallas Mavericks,7,103,12-28,San Antonio Spurs
12881,"April 29, 2014",Chicago Bulls,5,69,6-16,Washington Wizards,30,75,2-9,Washington Wizards
12882,"April 29, 2014",Oklahoma City Thunder,21,99,12-31,Memphis Grizzlies,15,100,7-19,Memphis Grizzlies


In [22]:
#reformat the dates so they can be compared
dfLim2['date'] = dfLim2['date'].apply(lambda x: datetime.strptime(x, "%B %d, %Y"))
dfLim2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim2['date'] = dfLim2['date'].apply(lambda x: datetime.strptime(x, "%B %d, %Y"))


Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,winner
0,2024-06-17,Boston Celtics,2,106,13-39,Dallas Mavericks,7,88,11-37,Boston Celtics
1,2024-06-14,Dallas Mavericks,7,122,15-37,Boston Celtics,2,84,14-41,Dallas Mavericks
2,2024-06-12,Dallas Mavericks,7,99,9-25,Boston Celtics,2,106,17-46,Boston Celtics
3,2024-06-09,Boston Celtics,2,105,10-39,Dallas Mavericks,7,98,6-26,Boston Celtics
4,2024-06-06,Boston Celtics,2,107,16-42,Dallas Mavericks,7,89,7-27,Boston Celtics
...,...,...,...,...,...,...,...,...,...,...
12879,2014-04-30,Houston Rockets,11,108,8-25,Portland Trail Blazers,25,98,9-25,Houston Rockets
12880,2014-04-30,San Antonio Spurs,27,109,8-16,Dallas Mavericks,7,103,12-28,San Antonio Spurs
12881,2014-04-29,Chicago Bulls,5,69,6-16,Washington Wizards,30,75,2-9,Washington Wizards
12882,2014-04-29,Oklahoma City Thunder,21,99,12-31,Memphis Grizzlies,15,100,7-19,Memphis Grizzlies


In [23]:
dfLim2['winner_binary'] = (dfLim2['winner'] == df['awayTeam']).astype(int)
dfLim2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim2['winner_binary'] = (dfLim2['winner'] == df['awayTeam']).astype(int)


Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,winner,winner_binary
0,2024-06-17,Boston Celtics,2,106,13-39,Dallas Mavericks,7,88,11-37,Boston Celtics,0
1,2024-06-14,Dallas Mavericks,7,122,15-37,Boston Celtics,2,84,14-41,Dallas Mavericks,0
2,2024-06-12,Dallas Mavericks,7,99,9-25,Boston Celtics,2,106,17-46,Boston Celtics,1
3,2024-06-09,Boston Celtics,2,105,10-39,Dallas Mavericks,7,98,6-26,Boston Celtics,0
4,2024-06-06,Boston Celtics,2,107,16-42,Dallas Mavericks,7,89,7-27,Boston Celtics,0
...,...,...,...,...,...,...,...,...,...,...,...
12879,2014-04-30,Houston Rockets,11,108,8-25,Portland Trail Blazers,25,98,9-25,Houston Rockets,0
12880,2014-04-30,San Antonio Spurs,27,109,8-16,Dallas Mavericks,7,103,12-28,San Antonio Spurs,0
12881,2014-04-29,Chicago Bulls,5,69,6-16,Washington Wizards,30,75,2-9,Washington Wizards,1
12882,2014-04-29,Oklahoma City Thunder,21,99,12-31,Memphis Grizzlies,15,100,7-19,Memphis Grizzlies,1


In [24]:
#Format as percentage
dfLim2['homeTeam_3P'] = dfLim2['homeTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim2['awayTeam_3P'] = dfLim2['awayTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim2['homeTeam_3P'] = dfLim2['homeTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim2['awayTeam_3P'] = dfLim2['awayTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)


Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,winner,winner_binary
0,2024-06-17,Boston Celtics,2,106,0.333333,Dallas Mavericks,7,88,0.297297,Boston Celtics,0
1,2024-06-14,Dallas Mavericks,7,122,0.405405,Boston Celtics,2,84,0.341463,Dallas Mavericks,0
2,2024-06-12,Dallas Mavericks,7,99,0.360000,Boston Celtics,2,106,0.369565,Boston Celtics,1
3,2024-06-09,Boston Celtics,2,105,0.256410,Dallas Mavericks,7,98,0.230769,Boston Celtics,0
4,2024-06-06,Boston Celtics,2,107,0.380952,Dallas Mavericks,7,89,0.259259,Boston Celtics,0
...,...,...,...,...,...,...,...,...,...,...,...
12879,2014-04-30,Houston Rockets,11,108,0.320000,Portland Trail Blazers,25,98,0.360000,Houston Rockets,0
12880,2014-04-30,San Antonio Spurs,27,109,0.500000,Dallas Mavericks,7,103,0.428571,San Antonio Spurs,0
12881,2014-04-29,Chicago Bulls,5,69,0.375000,Washington Wizards,30,75,0.222222,Washington Wizards,1
12882,2014-04-29,Oklahoma City Thunder,21,99,0.387097,Memphis Grizzlies,15,100,0.368421,Memphis Grizzlies,1


In [25]:
dfIDtoStat2 = pd.DataFrame(columns=['date', 'team_id', 'team_points_total', '3P%'])
dfIDtoStat2

Unnamed: 0,date,team_id,team_points_total,3P%


In [26]:
#populate dfIDtoStat2
for index, row in dfLim2.iterrows():
    date = row['date']
    homeTeam_id = row['homeTeamSubject_id']
    homeTeam_points_total = row['homeTeam_points_total']
    #added homeTeam3P
    homeTeam_3P = row['homeTeam_3P']
    awayTeam_id = row['awayTeamSubject_id']
    awayTeam_points_total = row['awayTeam_points_total']
    #added awayTeam3P
    awayTeam_3P = row['homeTeam_3P']
    dfIDtoStat2.loc[len(dfIDtoStat2)] = {"date": date, 'team_id': homeTeam_id, 'team_points_total':homeTeam_points_total, '3P%':homeTeam_3P}
    dfIDtoStat2.loc[len(dfIDtoStat2)] = {"date": date, 'team_id': awayTeam_id, 'team_points_total':awayTeam_points_total, '3P%':awayTeam_3P}
dfIDtoStat2

Unnamed: 0,date,team_id,team_points_total,3P%
0,2024-06-17,2,106,0.333333
1,2024-06-17,7,88,0.333333
2,2024-06-14,7,122,0.405405
3,2024-06-14,2,84,0.405405
4,2024-06-12,7,99,0.360000
...,...,...,...,...
25763,2014-04-29,30,75,0.375000
25764,2014-04-29,21,99,0.387097
25765,2014-04-29,15,100,0.387097
25766,2014-04-29,13,113,0.400000


In [27]:
#reverse df order - should now start with 2014
dfLim2 = dfLim2.iloc[::-1].reset_index(drop=True)
dfLim2

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,winner,winner_binary
0,2014-04-29,Los Angeles Clippers,13,113,0.400000,Golden State Warriors,10,103,0.384615,Los Angeles Clippers,0
1,2014-04-29,Oklahoma City Thunder,21,99,0.387097,Memphis Grizzlies,15,100,0.368421,Memphis Grizzlies,1
2,2014-04-29,Chicago Bulls,5,69,0.375000,Washington Wizards,30,75,0.222222,Washington Wizards,1
3,2014-04-30,San Antonio Spurs,27,109,0.500000,Dallas Mavericks,7,103,0.428571,San Antonio Spurs,0
4,2014-04-30,Houston Rockets,11,108,0.320000,Portland Trail Blazers,25,98,0.360000,Houston Rockets,0
...,...,...,...,...,...,...,...,...,...,...,...
12879,2024-06-06,Boston Celtics,2,107,0.380952,Dallas Mavericks,7,89,0.259259,Boston Celtics,0
12880,2024-06-09,Boston Celtics,2,105,0.256410,Dallas Mavericks,7,98,0.230769,Boston Celtics,0
12881,2024-06-12,Dallas Mavericks,7,99,0.360000,Boston Celtics,2,106,0.369565,Boston Celtics,1
12882,2024-06-14,Dallas Mavericks,7,122,0.405405,Boston Celtics,2,84,0.341463,Dallas Mavericks,0


In [28]:
#reverse order - should now start with 2014
dfIDtoStat2 = dfIDtoStat2.iloc[::-1].reset_index(drop=True)
dfIDtoStat2

Unnamed: 0,date,team_id,team_points_total,3P%
0,2014-04-29,10,103,0.400000
1,2014-04-29,13,113,0.400000
2,2014-04-29,15,100,0.387097
3,2014-04-29,21,99,0.387097
4,2014-04-29,30,75,0.375000
...,...,...,...,...
25763,2024-06-12,7,99,0.360000
25764,2024-06-14,2,84,0.405405
25765,2024-06-14,7,122,0.405405
25766,2024-06-17,7,88,0.333333


In [29]:
#X1.Shape is num data, window size, variables
X2, y2 = df_to_X_y(dfLim2, dfIDtoStat2, 5)
X2.shape, y2.shape

((12773, 5, 4), (12773,))

In [30]:
#70-15-15 split
X2_train, X2_temp, y2_train, y2_temp = train_test_split(X2, y2, test_size=0.3, random_state=42)
X2_val, X2_test, y2_val, y2_test = train_test_split(X2_temp, y2_temp, test_size=0.5, random_state=42)
X2_train.shape, y2_train.shape, X2_val.shape, y2_val.shape, X2_test.shape, y2_test.shape

((8941, 5, 4), (8941,), (1916, 5, 4), (1916,), (1916, 5, 4), (1916,))

In [31]:
model2 = Sequential()
#You need to change the input layer to reflect the amount of variables. first number is window size, second is variables
model2.add(InputLayer((5, 4)))
model2.add(LSTM(64, activation='tanh', return_sequences=True))
model2.add(LSTM(32, activation='tanh'))
model2.add(Dense(16, activation='relu'))
model2.add(Dense(8, activation='relu'))
model2.add(Dense(1, activation='sigmoid'))

model2.summary()

In [32]:
earlystopping = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [33]:
model2.fit(X2_train, y2_train, validation_data=(X2_val, y2_val), epochs=100, callbacks=[earlystopping])

Epoch 1/100


[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5767 - loss: 0.6789 - val_accuracy: 0.5694 - val_loss: 0.6766
Epoch 2/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5830 - loss: 0.6726 - val_accuracy: 0.5694 - val_loss: 0.6818
Epoch 3/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5864 - loss: 0.6712 - val_accuracy: 0.5913 - val_loss: 0.6761
Epoch 4/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5937 - loss: 0.6708 - val_accuracy: 0.5929 - val_loss: 0.6777
Epoch 5/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5770 - loss: 0.6769 - val_accuracy: 0.5835 - val_loss: 0.6769
Epoch 6/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5779 - loss: 0.6753 - val_accuracy: 0.5924 - val_loss: 0.6761
Epoch 7/100
[1m280/280[0m [32m━

<keras.src.callbacks.history.History at 0x35157d730>

In [34]:
y_pred = model2.predict(X2_test)
y_pred_binary = (y_pred > 0.5).astype(int)
print(accuracy_score(y2_test, y_pred_binary))

[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
0.5788100208768268


In [35]:
#Not adding anything new because I am gonna pull out 3 pointers made
dfLim3 = df[[
    "date",
    "homeTeam",
    "homeTeamSubject_id", 
    "homeTeam_points_total", 
    #added homeTeam3P
    "homeTeam_3P",
    "awayTeam",
    "awayTeamSubject_id", 
    "awayTeam_points_total",
    #added awayTeam3P
    "awayTeam_3P",
    "winner"
]]
dfLim3

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,winner
0,"June 17, 2024",Boston Celtics,2,106,13-39,Dallas Mavericks,7,88,11-37,Boston Celtics
1,"June 14, 2024",Dallas Mavericks,7,122,15-37,Boston Celtics,2,84,14-41,Dallas Mavericks
2,"June 12, 2024",Dallas Mavericks,7,99,9-25,Boston Celtics,2,106,17-46,Boston Celtics
3,"June 09, 2024",Boston Celtics,2,105,10-39,Dallas Mavericks,7,98,6-26,Boston Celtics
4,"June 06, 2024",Boston Celtics,2,107,16-42,Dallas Mavericks,7,89,7-27,Boston Celtics
...,...,...,...,...,...,...,...,...,...,...
12879,"April 30, 2014",Houston Rockets,11,108,8-25,Portland Trail Blazers,25,98,9-25,Houston Rockets
12880,"April 30, 2014",San Antonio Spurs,27,109,8-16,Dallas Mavericks,7,103,12-28,San Antonio Spurs
12881,"April 29, 2014",Chicago Bulls,5,69,6-16,Washington Wizards,30,75,2-9,Washington Wizards
12882,"April 29, 2014",Oklahoma City Thunder,21,99,12-31,Memphis Grizzlies,15,100,7-19,Memphis Grizzlies


In [36]:
#reformat the dates so they can be compared
dfLim3['date'] = dfLim3['date'].apply(lambda x: datetime.strptime(x, "%B %d, %Y"))
dfLim3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim3['date'] = dfLim3['date'].apply(lambda x: datetime.strptime(x, "%B %d, %Y"))


Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,winner
0,2024-06-17,Boston Celtics,2,106,13-39,Dallas Mavericks,7,88,11-37,Boston Celtics
1,2024-06-14,Dallas Mavericks,7,122,15-37,Boston Celtics,2,84,14-41,Dallas Mavericks
2,2024-06-12,Dallas Mavericks,7,99,9-25,Boston Celtics,2,106,17-46,Boston Celtics
3,2024-06-09,Boston Celtics,2,105,10-39,Dallas Mavericks,7,98,6-26,Boston Celtics
4,2024-06-06,Boston Celtics,2,107,16-42,Dallas Mavericks,7,89,7-27,Boston Celtics
...,...,...,...,...,...,...,...,...,...,...
12879,2014-04-30,Houston Rockets,11,108,8-25,Portland Trail Blazers,25,98,9-25,Houston Rockets
12880,2014-04-30,San Antonio Spurs,27,109,8-16,Dallas Mavericks,7,103,12-28,San Antonio Spurs
12881,2014-04-29,Chicago Bulls,5,69,6-16,Washington Wizards,30,75,2-9,Washington Wizards
12882,2014-04-29,Oklahoma City Thunder,21,99,12-31,Memphis Grizzlies,15,100,7-19,Memphis Grizzlies


In [37]:
dfLim3['winner_binary'] = (dfLim3['winner'] == df['awayTeam']).astype(int)
dfLim3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim3['winner_binary'] = (dfLim3['winner'] == df['awayTeam']).astype(int)


Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,winner,winner_binary
0,2024-06-17,Boston Celtics,2,106,13-39,Dallas Mavericks,7,88,11-37,Boston Celtics,0
1,2024-06-14,Dallas Mavericks,7,122,15-37,Boston Celtics,2,84,14-41,Dallas Mavericks,0
2,2024-06-12,Dallas Mavericks,7,99,9-25,Boston Celtics,2,106,17-46,Boston Celtics,1
3,2024-06-09,Boston Celtics,2,105,10-39,Dallas Mavericks,7,98,6-26,Boston Celtics,0
4,2024-06-06,Boston Celtics,2,107,16-42,Dallas Mavericks,7,89,7-27,Boston Celtics,0
...,...,...,...,...,...,...,...,...,...,...,...
12879,2014-04-30,Houston Rockets,11,108,8-25,Portland Trail Blazers,25,98,9-25,Houston Rockets,0
12880,2014-04-30,San Antonio Spurs,27,109,8-16,Dallas Mavericks,7,103,12-28,San Antonio Spurs,0
12881,2014-04-29,Chicago Bulls,5,69,6-16,Washington Wizards,30,75,2-9,Washington Wizards,1
12882,2014-04-29,Oklahoma City Thunder,21,99,12-31,Memphis Grizzlies,15,100,7-19,Memphis Grizzlies,1


In [38]:
dfLim3.insert(dfLim3.columns.get_loc('homeTeam_3P') + 1, 'homeTeam_3P_made', dfLim3['homeTeam_3P'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))
dfLim3.insert(dfLim3.columns.get_loc('awayTeam_3P') + 1, 'awayTeam_3P_made', dfLim3['awayTeam_3P'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))
dfLim3

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,homeTeam_3P_made,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,awayTeam_3P_made,winner,winner_binary
0,2024-06-17,Boston Celtics,2,106,13-39,13,Dallas Mavericks,7,88,11-37,11,Boston Celtics,0
1,2024-06-14,Dallas Mavericks,7,122,15-37,15,Boston Celtics,2,84,14-41,14,Dallas Mavericks,0
2,2024-06-12,Dallas Mavericks,7,99,9-25,9,Boston Celtics,2,106,17-46,17,Boston Celtics,1
3,2024-06-09,Boston Celtics,2,105,10-39,10,Dallas Mavericks,7,98,6-26,6,Boston Celtics,0
4,2024-06-06,Boston Celtics,2,107,16-42,16,Dallas Mavericks,7,89,7-27,7,Boston Celtics,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12879,2014-04-30,Houston Rockets,11,108,8-25,8,Portland Trail Blazers,25,98,9-25,9,Houston Rockets,0
12880,2014-04-30,San Antonio Spurs,27,109,8-16,8,Dallas Mavericks,7,103,12-28,12,San Antonio Spurs,0
12881,2014-04-29,Chicago Bulls,5,69,6-16,6,Washington Wizards,30,75,2-9,2,Washington Wizards,1
12882,2014-04-29,Oklahoma City Thunder,21,99,12-31,12,Memphis Grizzlies,15,100,7-19,7,Memphis Grizzlies,1


In [39]:
#Format as percentage
dfLim3['homeTeam_3P'] = dfLim3['homeTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim3['awayTeam_3P'] = dfLim3['awayTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim3['homeTeam_3P'] = dfLim3['homeTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim3['awayTeam_3P'] = dfLim3['awayTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)


Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,homeTeam_3P_made,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,awayTeam_3P_made,winner,winner_binary
0,2024-06-17,Boston Celtics,2,106,0.333333,13,Dallas Mavericks,7,88,0.297297,11,Boston Celtics,0
1,2024-06-14,Dallas Mavericks,7,122,0.405405,15,Boston Celtics,2,84,0.341463,14,Dallas Mavericks,0
2,2024-06-12,Dallas Mavericks,7,99,0.360000,9,Boston Celtics,2,106,0.369565,17,Boston Celtics,1
3,2024-06-09,Boston Celtics,2,105,0.256410,10,Dallas Mavericks,7,98,0.230769,6,Boston Celtics,0
4,2024-06-06,Boston Celtics,2,107,0.380952,16,Dallas Mavericks,7,89,0.259259,7,Boston Celtics,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12879,2014-04-30,Houston Rockets,11,108,0.320000,8,Portland Trail Blazers,25,98,0.360000,9,Houston Rockets,0
12880,2014-04-30,San Antonio Spurs,27,109,0.500000,8,Dallas Mavericks,7,103,0.428571,12,San Antonio Spurs,0
12881,2014-04-29,Chicago Bulls,5,69,0.375000,6,Washington Wizards,30,75,0.222222,2,Washington Wizards,1
12882,2014-04-29,Oklahoma City Thunder,21,99,0.387097,12,Memphis Grizzlies,15,100,0.368421,7,Memphis Grizzlies,1


In [40]:
dfIDtoStat3 = pd.DataFrame(columns=['date', 'team_id', 'team_points_total', '3P%', '3P_made'])
dfIDtoStat3

Unnamed: 0,date,team_id,team_points_total,3P%,3P_made


In [41]:
#populate dfIDtoStat3
for index, row in dfLim3.iterrows():
    date = row['date']
    homeTeam_id = row['homeTeamSubject_id']
    homeTeam_points_total = row['homeTeam_points_total']
    homeTeam_3P = row['homeTeam_3P']
    #added homeTeam3P_made
    homeTeam_3P_made = row['homeTeam_3P_made']
    awayTeam_id = row['awayTeamSubject_id']
    awayTeam_points_total = row['awayTeam_points_total']
    awayTeam_3P = row['homeTeam_3P']
    #added awayTeam3P_made
    awayTeam_3P_made = row['awayTeam_3P_made']
    #added these new variables into this
    dfIDtoStat3.loc[len(dfIDtoStat3)] = {"date": date, 'team_id': homeTeam_id, 'team_points_total':homeTeam_points_total, '3P%':homeTeam_3P, '3P_made':homeTeam_3P_made}
    dfIDtoStat3.loc[len(dfIDtoStat3)] = {"date": date, 'team_id': awayTeam_id, 'team_points_total':awayTeam_points_total, '3P%':awayTeam_3P, '3P_made':awayTeam_3P_made}
dfIDtoStat3

Unnamed: 0,date,team_id,team_points_total,3P%,3P_made
0,2024-06-17,2,106,0.333333,13
1,2024-06-17,7,88,0.333333,11
2,2024-06-14,7,122,0.405405,15
3,2024-06-14,2,84,0.405405,14
4,2024-06-12,7,99,0.360000,9
...,...,...,...,...,...
25763,2014-04-29,30,75,0.375000,2
25764,2014-04-29,21,99,0.387097,12
25765,2014-04-29,15,100,0.387097,7
25766,2014-04-29,13,113,0.400000,8


In [42]:
#reverse df order - should now start with 2014
dfLim3 = dfLim3.iloc[::-1].reset_index(drop=True)
dfLim3

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,homeTeam_3P_made,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,awayTeam_3P_made,winner,winner_binary
0,2014-04-29,Los Angeles Clippers,13,113,0.400000,8,Golden State Warriors,10,103,0.384615,10,Los Angeles Clippers,0
1,2014-04-29,Oklahoma City Thunder,21,99,0.387097,12,Memphis Grizzlies,15,100,0.368421,7,Memphis Grizzlies,1
2,2014-04-29,Chicago Bulls,5,69,0.375000,6,Washington Wizards,30,75,0.222222,2,Washington Wizards,1
3,2014-04-30,San Antonio Spurs,27,109,0.500000,8,Dallas Mavericks,7,103,0.428571,12,San Antonio Spurs,0
4,2014-04-30,Houston Rockets,11,108,0.320000,8,Portland Trail Blazers,25,98,0.360000,9,Houston Rockets,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12879,2024-06-06,Boston Celtics,2,107,0.380952,16,Dallas Mavericks,7,89,0.259259,7,Boston Celtics,0
12880,2024-06-09,Boston Celtics,2,105,0.256410,10,Dallas Mavericks,7,98,0.230769,6,Boston Celtics,0
12881,2024-06-12,Dallas Mavericks,7,99,0.360000,9,Boston Celtics,2,106,0.369565,17,Boston Celtics,1
12882,2024-06-14,Dallas Mavericks,7,122,0.405405,15,Boston Celtics,2,84,0.341463,14,Dallas Mavericks,0


In [43]:
#reverse order - should now start with 2014
dfIDtoStat3 = dfIDtoStat3.iloc[::-1].reset_index(drop=True)
dfIDtoStat3

Unnamed: 0,date,team_id,team_points_total,3P%,3P_made
0,2014-04-29,10,103,0.400000,10
1,2014-04-29,13,113,0.400000,8
2,2014-04-29,15,100,0.387097,7
3,2014-04-29,21,99,0.387097,12
4,2014-04-29,30,75,0.375000,2
...,...,...,...,...,...
25763,2024-06-12,7,99,0.360000,9
25764,2024-06-14,2,84,0.405405,14
25765,2024-06-14,7,122,0.405405,15
25766,2024-06-17,7,88,0.333333,11


In [44]:
#X1.Shape is num data, window size, variables
X3, y3 = df_to_X_y(dfLim3, dfIDtoStat3, 5)
X3.shape, y3.shape

((12773, 5, 6), (12773,))

In [45]:
#70-15-15 split
X3_train, X3_temp, y3_train, y3_temp = train_test_split(X3, y3, test_size=0.3, random_state=42)
X3_val, X3_test, y3_val, y3_test = train_test_split(X3_temp, y3_temp, test_size=0.5, random_state=42)
X3_train.shape, y3_train.shape, X3_val.shape, y3_val.shape, X3_test.shape, y3_test.shape

((8941, 5, 6), (8941,), (1916, 5, 6), (1916,), (1916, 5, 6), (1916,))

In [46]:
model3 = Sequential()
#You need to change the input layer to reflect the amount of variables. first number is window size, second is variables
model3.add(InputLayer((5, 6)))
model3.add(LSTM(64, activation='tanh', return_sequences=True))
model3.add(LSTM(32, activation='tanh'))
model3.add(Dense(16, activation='relu'))
model3.add(Dense(8, activation='relu'))
model3.add(Dense(1, activation='sigmoid'))

model3.summary()

In [47]:
earlystopping = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)
model3.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [48]:
model3.fit(X3_train, y3_train, validation_data=(X3_val, y3_val), epochs=100, callbacks=[earlystopping])


Epoch 1/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.5731 - loss: 0.6791 - val_accuracy: 0.5757 - val_loss: 0.6767
Epoch 2/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5815 - loss: 0.6742 - val_accuracy: 0.5652 - val_loss: 0.6784
Epoch 3/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5874 - loss: 0.6714 - val_accuracy: 0.5825 - val_loss: 0.6727
Epoch 4/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5701 - loss: 0.6763 - val_accuracy: 0.5877 - val_loss: 0.6721
Epoch 5/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5915 - loss: 0.6721 - val_accuracy: 0.5908 - val_loss: 0.6728
Epoch 6/100
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5919 - loss: 0.6683 - val_accuracy: 0.5903 - val_loss: 0.6777
Epoch 7/100
[1m280/28

<keras.src.callbacks.history.History at 0x35131c4d0>

In [49]:
y_pred = model3.predict(X3_test)
y_pred_binary = (y_pred > 0.5).astype(int)
print(accuracy_score(y3_test, y_pred_binary))

[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
0.581419624217119


In [62]:
dfLim4 = dfLim3.copy()  # Copy the original DataFrame

# Get the position of 'homeTeam_3P'
position = dfLim4.columns.get_loc('homeTeam_3P')

# Insert 'homeTeam_FG' after 'shomeTeam_3P'
dfLim4.insert(position + 1, 'homeTeam_FG', df['homeTeam_FG'])

position2 = dfLim4.columns.get_loc('awayTeam_3P')
# Insert 'awayTeam_FG' right after 'awayTeam_3P'
dfLim4.insert(position2 + 1, 'awayTeam_FG', df['awayTeam_FG'])

dfLim4

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,homeTeam_FG,homeTeam_3P_made,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,awayTeam_FG,awayTeam_3P_made,winner,winner_binary
0,2014-04-29,Los Angeles Clippers,13,113,0.400000,38-89,8,Golden State Warriors,10,103,0.384615,35-78,10,Los Angeles Clippers,0
1,2014-04-29,Oklahoma City Thunder,21,99,0.387097,46-91,12,Memphis Grizzlies,15,100,0.368421,29-80,7,Memphis Grizzlies,1
2,2014-04-29,Chicago Bulls,5,69,0.375000,38-86,6,Washington Wizards,30,75,0.222222,38-82,2,Washington Wizards,1
3,2014-04-30,San Antonio Spurs,27,109,0.500000,38-84,8,Dallas Mavericks,7,103,0.428571,38-80,12,San Antonio Spurs,0
4,2014-04-30,Houston Rockets,11,108,0.320000,39-82,8,Portland Trail Blazers,25,98,0.360000,35-84,9,Houston Rockets,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12879,2024-06-06,Boston Celtics,2,107,0.380952,43-91,16,Dallas Mavericks,7,89,0.259259,36-83,7,Boston Celtics,0
12880,2024-06-09,Boston Celtics,2,105,0.256410,41-88,10,Dallas Mavericks,7,98,0.230769,39-89,6,Boston Celtics,0
12881,2024-06-12,Dallas Mavericks,7,99,0.360000,25-75,9,Boston Celtics,2,106,0.369565,30-74,17,Boston Celtics,1
12882,2024-06-14,Dallas Mavericks,7,122,0.405405,36-92,15,Boston Celtics,2,84,0.341463,37-87,14,Dallas Mavericks,0


In [63]:
dfLim4.insert(dfLim4.columns.get_loc('homeTeam_FG') + 1, 'homeTeam_FG_made', dfLim4['homeTeam_FG'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))
dfLim4.insert(dfLim4.columns.get_loc('awayTeam_FG') + 1, 'awayTeam_FG_made', dfLim4['awayTeam_FG'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))
dfLim4

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,homeTeam_FG,homeTeam_FG_made,homeTeam_3P_made,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,awayTeam_FG,awayTeam_FG_made,awayTeam_3P_made,winner,winner_binary
0,2014-04-29,Los Angeles Clippers,13,113,0.400000,38-89,38,8,Golden State Warriors,10,103,0.384615,35-78,35,10,Los Angeles Clippers,0
1,2014-04-29,Oklahoma City Thunder,21,99,0.387097,46-91,46,12,Memphis Grizzlies,15,100,0.368421,29-80,29,7,Memphis Grizzlies,1
2,2014-04-29,Chicago Bulls,5,69,0.375000,38-86,38,6,Washington Wizards,30,75,0.222222,38-82,38,2,Washington Wizards,1
3,2014-04-30,San Antonio Spurs,27,109,0.500000,38-84,38,8,Dallas Mavericks,7,103,0.428571,38-80,38,12,San Antonio Spurs,0
4,2014-04-30,Houston Rockets,11,108,0.320000,39-82,39,8,Portland Trail Blazers,25,98,0.360000,35-84,35,9,Houston Rockets,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12879,2024-06-06,Boston Celtics,2,107,0.380952,43-91,43,16,Dallas Mavericks,7,89,0.259259,36-83,36,7,Boston Celtics,0
12880,2024-06-09,Boston Celtics,2,105,0.256410,41-88,41,10,Dallas Mavericks,7,98,0.230769,39-89,39,6,Boston Celtics,0
12881,2024-06-12,Dallas Mavericks,7,99,0.360000,25-75,25,9,Boston Celtics,2,106,0.369565,30-74,30,17,Boston Celtics,1
12882,2024-06-14,Dallas Mavericks,7,122,0.405405,36-92,36,15,Boston Celtics,2,84,0.341463,37-87,37,14,Dallas Mavericks,0


In [64]:
#Format as percentage
dfLim4['homeTeam_FG'] = dfLim4['homeTeam_FG'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim4['awayTeam_FG'] = dfLim4['awayTeam_FG'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim4

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,homeTeam_FG,homeTeam_FG_made,homeTeam_3P_made,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,awayTeam_FG,awayTeam_FG_made,awayTeam_3P_made,winner,winner_binary
0,2014-04-29,Los Angeles Clippers,13,113,0.400000,0.426966,38,8,Golden State Warriors,10,103,0.384615,0.448718,35,10,Los Angeles Clippers,0
1,2014-04-29,Oklahoma City Thunder,21,99,0.387097,0.505495,46,12,Memphis Grizzlies,15,100,0.368421,0.362500,29,7,Memphis Grizzlies,1
2,2014-04-29,Chicago Bulls,5,69,0.375000,0.441860,38,6,Washington Wizards,30,75,0.222222,0.463415,38,2,Washington Wizards,1
3,2014-04-30,San Antonio Spurs,27,109,0.500000,0.452381,38,8,Dallas Mavericks,7,103,0.428571,0.475000,38,12,San Antonio Spurs,0
4,2014-04-30,Houston Rockets,11,108,0.320000,0.475610,39,8,Portland Trail Blazers,25,98,0.360000,0.416667,35,9,Houston Rockets,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12879,2024-06-06,Boston Celtics,2,107,0.380952,0.472527,43,16,Dallas Mavericks,7,89,0.259259,0.433735,36,7,Boston Celtics,0
12880,2024-06-09,Boston Celtics,2,105,0.256410,0.465909,41,10,Dallas Mavericks,7,98,0.230769,0.438202,39,6,Boston Celtics,0
12881,2024-06-12,Dallas Mavericks,7,99,0.360000,0.333333,25,9,Boston Celtics,2,106,0.369565,0.405405,30,17,Boston Celtics,1
12882,2024-06-14,Dallas Mavericks,7,122,0.405405,0.391304,36,15,Boston Celtics,2,84,0.341463,0.425287,37,14,Dallas Mavericks,0


In [65]:
dfIDtoStat4 = pd.DataFrame(columns=['date', 'team_id', 'team_points_total', '3P%', '3P_made', 'FG%', 'FG_made' ])
dfIDtoStat4

Unnamed: 0,date,team_id,team_points_total,3P%,3P_made,FG%,FG_made


In [66]:
for index, row in dfLim4.iterrows():
    date = row['date']
    homeTeam_id = row['homeTeamSubject_id']
    homeTeam_points_total = row['homeTeam_points_total']
    homeTeam_3P = row['homeTeam_3P']
    homeTeam_3P_made = row['homeTeam_3P_made']
     #added homeTeamFG_made
    homeTeam_FG = row['homeTeam_FG']
    homeTeam_FG_made = row['homeTeam_FG_made']
    awayTeam_id = row['awayTeamSubject_id']
    awayTeam_points_total = row['awayTeam_points_total']
    awayTeam_3P = row['homeTeam_3P']
    awayTeam_3P_made = row['awayTeam_3P_made']
    #added awayTeamFG
    awayTeam_FG = row['awayTeam_FG']
    awayTeam_FG_made = row['awayTeam_FG_made']
    #added these new variables into this
    dfIDtoStat4.loc[len(dfIDtoStat4)] = {"date": date, 'team_id': homeTeam_id, 'team_points_total':homeTeam_points_total, '3P%':homeTeam_3P, '3P_made':homeTeam_3P_made, 'FG%':homeTeam_FG, 'FG_made':homeTeam_FG_made}
    dfIDtoStat4.loc[len(dfIDtoStat4)] = {"date": date, 'team_id': awayTeam_id, 'team_points_total':awayTeam_points_total, '3P%':awayTeam_3P, '3P_made':awayTeam_3P_made, 'FG%':awayTeam_FG, 'FG_made':awayTeam_FG_made}
dfIDtoStat4

Unnamed: 0,date,team_id,team_points_total,3P%,3P_made,FG%,FG_made
0,2014-04-29,13,113,0.400000,8,0.426966,38
1,2014-04-29,10,103,0.400000,10,0.448718,35
2,2014-04-29,21,99,0.387097,12,0.505495,46
3,2014-04-29,15,100,0.387097,7,0.362500,29
4,2014-04-29,5,69,0.375000,6,0.441860,38
...,...,...,...,...,...,...,...
25763,2024-06-12,2,106,0.360000,17,0.405405,30
25764,2024-06-14,7,122,0.405405,15,0.391304,36
25765,2024-06-14,2,84,0.405405,14,0.425287,37
25766,2024-06-17,2,106,0.333333,13,0.486842,37


In [68]:
#X1.Shape is num data, window size, variables
X4, y4 = df_to_X_y(dfLim4, dfIDtoStat4, 6)
X4.shape, y4.shape

((12753, 6, 10), (12753,))

In [69]:
#70-15-15 split
X4_train, X4_temp, y4_train, y4_temp = train_test_split(X4, y4, test_size=0.3, random_state=42)
X4_val, X4_test, y4_val, y4_test = train_test_split(X4_temp, y4_temp, test_size=0.5, random_state=42)
X4_train.shape, y4_train.shape, X4_val.shape, y4_val.shape, X4_test.shape, y4_test.shape

((8927, 6, 10), (8927,), (1913, 6, 10), (1913,), (1913, 6, 10), (1913,))

In [83]:
model4 = Sequential()
#You need to change the input layer to reflect the amount of variables. first number is window size, second is variables
model4.add(InputLayer((5, 10)))
model4.add(LSTM(64, activation='tanh', return_sequences=True))
model4.add(LSTM(32, activation='tanh'))
model4.add(Dense(16, activation='relu'))
model4.add(Dense(8, activation='relu'))
model4.add(Dense(1, activation='sigmoid'))

model4.summary()

In [84]:
earlystopping = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)
model4.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [86]:
model4.fit(X4_train, y4_train, validation_data=(X4_val, y4_val), epochs=100, callbacks=[earlystopping])

Epoch 1/100
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5816 - loss: 0.6731 - val_accuracy: 0.5975 - val_loss: 0.6693
Epoch 2/100
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5873 - loss: 0.6730 - val_accuracy: 0.5876 - val_loss: 0.6709
Epoch 3/100
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5798 - loss: 0.6727 - val_accuracy: 0.5902 - val_loss: 0.6706
Epoch 4/100
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5948 - loss: 0.6644 - val_accuracy: 0.5860 - val_loss: 0.6720
Epoch 5/100
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5844 - loss: 0.6717 - val_accuracy: 0.5970 - val_loss: 0.6742
Epoch 6/100
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5920 - loss: 0.6695 - val_accuracy: 0.5933 - val_loss: 0.6700
Epoch 7/100
[1m279/27

<keras.src.callbacks.history.History at 0x30aa41010>

In [89]:
y_pred = model4.predict(X4_test)
y_pred_binary = (y_pred > 0.5).astype(int)
print(accuracy_score(y4_test, y_pred_binary))

[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 775us/step
0.6006272869837951


In [33]:
#Adding ast, reb, and winning percentages
dfLim5 = df[[
    "date",
    "homeTeam",
    "homeTeamSubject_id", 
    "homeTeam_points_total", 
    "homeTeam_3P",
    "homeTeam_FG",
    #added homeTeamReb and homeTeamAst
    "homeTeam_Total_Reb",
    "homeTeam_Ast",
    "homeWinPct",
    "awayTeam",
    "awayTeamSubject_id", 
    "awayTeam_points_total",
    "awayTeam_3P",
    "awayTeam_FG",
    #added awayTeamReb and awayTeamAst
    "awayTeam_Total_Reb",
    "awayTeam_Ast",
    "awayWinPct",
    "winner"
]]
dfLim5

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,homeTeam_FG,homeTeam_Total_Reb,homeTeam_Ast,homeWinPct,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,awayTeam_FG,awayTeam_Total_Reb,awayTeam_Ast,awayWinPct,winner
0,2014-04-29,Chicago Bulls,5,69,6-16,25-75,43,19,0.000000,Washington Wizards,30,75,2-9,30-74,49,16,100.000000,Washington Wizards
1,2014-04-29,Oklahoma City Thunder,21,99,12-31,36-92,51,24,0.000000,Memphis Grizzlies,15,100,7-19,37-87,50,21,100.000000,Memphis Grizzlies
2,2014-04-29,Los Angeles Clippers,13,113,8-20,37-76,41,20,100.000000,Golden State Warriors,10,103,10-26,40-85,42,24,0.000000,Los Angeles Clippers
3,2014-04-30,Toronto Raptors,28,115,12-26,37-77,37,21,100.000000,Brooklyn Nets,3,113,11-23,40-75,34,22,0.000000,Toronto Raptors
4,2014-04-30,Houston Rockets,11,108,8-25,43-91,48,23,100.000000,Portland Trail Blazers,25,98,9-25,36-83,34,14,0.000000,Houston Rockets
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12879,2024-06-06,Boston Celtics,2,107,16-42,39-82,47,23,79.381443,Dallas Mavericks,7,89,7-27,35-84,43,9,62.000000,Boston Celtics
12880,2024-06-09,Boston Celtics,2,105,10-39,38-84,41,29,79.591837,Dallas Mavericks,7,98,6-26,38-80,43,21,61.386139,Boston Celtics
12881,2024-06-12,Dallas Mavericks,7,99,9-25,38-86,43,15,60.784314,Boston Celtics,2,106,17-46,38-82,36,26,79.797980,Boston Celtics
12882,2024-06-14,Dallas Mavericks,7,122,15-37,46-91,52,21,61.165049,Boston Celtics,2,84,14-41,29-80,31,18,79.000000,Dallas Mavericks


In [None]:
#No need to reformat dates with new dataset

In [34]:
dfLim5['winner_binary'] = (dfLim5['winner'] == df['awayTeam']).astype(int)
dfLim5

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim5['winner_binary'] = (dfLim5['winner'] == df['awayTeam']).astype(int)


Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,homeTeam_FG,homeTeam_Total_Reb,homeTeam_Ast,homeWinPct,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,awayTeam_FG,awayTeam_Total_Reb,awayTeam_Ast,awayWinPct,winner,winner_binary
0,2014-04-29,Chicago Bulls,5,69,6-16,25-75,43,19,0.000000,Washington Wizards,30,75,2-9,30-74,49,16,100.000000,Washington Wizards,1
1,2014-04-29,Oklahoma City Thunder,21,99,12-31,36-92,51,24,0.000000,Memphis Grizzlies,15,100,7-19,37-87,50,21,100.000000,Memphis Grizzlies,1
2,2014-04-29,Los Angeles Clippers,13,113,8-20,37-76,41,20,100.000000,Golden State Warriors,10,103,10-26,40-85,42,24,0.000000,Los Angeles Clippers,0
3,2014-04-30,Toronto Raptors,28,115,12-26,37-77,37,21,100.000000,Brooklyn Nets,3,113,11-23,40-75,34,22,0.000000,Toronto Raptors,0
4,2014-04-30,Houston Rockets,11,108,8-25,43-91,48,23,100.000000,Portland Trail Blazers,25,98,9-25,36-83,34,14,0.000000,Houston Rockets,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12879,2024-06-06,Boston Celtics,2,107,16-42,39-82,47,23,79.381443,Dallas Mavericks,7,89,7-27,35-84,43,9,62.000000,Boston Celtics,0
12880,2024-06-09,Boston Celtics,2,105,10-39,38-84,41,29,79.591837,Dallas Mavericks,7,98,6-26,38-80,43,21,61.386139,Boston Celtics,0
12881,2024-06-12,Dallas Mavericks,7,99,9-25,38-86,43,15,60.784314,Boston Celtics,2,106,17-46,38-82,36,26,79.797980,Boston Celtics,1
12882,2024-06-14,Dallas Mavericks,7,122,15-37,46-91,52,21,61.165049,Boston Celtics,2,84,14-41,29-80,31,18,79.000000,Dallas Mavericks,0


In [35]:
#adding 3P made
dfLim5.insert(dfLim5.columns.get_loc('homeTeam_3P') + 1, 'homeTeam_3P_made', dfLim5['homeTeam_3P'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))
dfLim5.insert(dfLim5.columns.get_loc('awayTeam_3P') + 1, 'awayTeam_3P_made', dfLim5['awayTeam_3P'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))
dfLim5

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,homeTeam_3P_made,homeTeam_FG,homeTeam_Total_Reb,homeTeam_Ast,homeWinPct,...,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,awayTeam_3P_made,awayTeam_FG,awayTeam_Total_Reb,awayTeam_Ast,awayWinPct,winner,winner_binary
0,2014-04-29,Chicago Bulls,5,69,6-16,6,25-75,43,19,0.000000,...,30,75,2-9,2,30-74,49,16,100.000000,Washington Wizards,1
1,2014-04-29,Oklahoma City Thunder,21,99,12-31,12,36-92,51,24,0.000000,...,15,100,7-19,7,37-87,50,21,100.000000,Memphis Grizzlies,1
2,2014-04-29,Los Angeles Clippers,13,113,8-20,8,37-76,41,20,100.000000,...,10,103,10-26,10,40-85,42,24,0.000000,Los Angeles Clippers,0
3,2014-04-30,Toronto Raptors,28,115,12-26,12,37-77,37,21,100.000000,...,3,113,11-23,11,40-75,34,22,0.000000,Toronto Raptors,0
4,2014-04-30,Houston Rockets,11,108,8-25,8,43-91,48,23,100.000000,...,25,98,9-25,9,36-83,34,14,0.000000,Houston Rockets,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12879,2024-06-06,Boston Celtics,2,107,16-42,16,39-82,47,23,79.381443,...,7,89,7-27,7,35-84,43,9,62.000000,Boston Celtics,0
12880,2024-06-09,Boston Celtics,2,105,10-39,10,38-84,41,29,79.591837,...,7,98,6-26,6,38-80,43,21,61.386139,Boston Celtics,0
12881,2024-06-12,Dallas Mavericks,7,99,9-25,9,38-86,43,15,60.784314,...,2,106,17-46,17,38-82,36,26,79.797980,Boston Celtics,1
12882,2024-06-14,Dallas Mavericks,7,122,15-37,15,46-91,52,21,61.165049,...,2,84,14-41,14,29-80,31,18,79.000000,Dallas Mavericks,0


In [36]:
#adding FG made
dfLim5.insert(dfLim5.columns.get_loc('homeTeam_FG') + 1, 'homeTeam_FG_made', dfLim5['homeTeam_FG'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))
dfLim5.insert(dfLim5.columns.get_loc('awayTeam_FG') + 1, 'awayTeam_FG_made', dfLim5['awayTeam_FG'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))
dfLim5

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,homeTeam_3P_made,homeTeam_FG,homeTeam_FG_made,homeTeam_Total_Reb,homeTeam_Ast,...,awayTeam_points_total,awayTeam_3P,awayTeam_3P_made,awayTeam_FG,awayTeam_FG_made,awayTeam_Total_Reb,awayTeam_Ast,awayWinPct,winner,winner_binary
0,2014-04-29,Chicago Bulls,5,69,6-16,6,25-75,25,43,19,...,75,2-9,2,30-74,30,49,16,100.000000,Washington Wizards,1
1,2014-04-29,Oklahoma City Thunder,21,99,12-31,12,36-92,36,51,24,...,100,7-19,7,37-87,37,50,21,100.000000,Memphis Grizzlies,1
2,2014-04-29,Los Angeles Clippers,13,113,8-20,8,37-76,37,41,20,...,103,10-26,10,40-85,40,42,24,0.000000,Los Angeles Clippers,0
3,2014-04-30,Toronto Raptors,28,115,12-26,12,37-77,37,37,21,...,113,11-23,11,40-75,40,34,22,0.000000,Toronto Raptors,0
4,2014-04-30,Houston Rockets,11,108,8-25,8,43-91,43,48,23,...,98,9-25,9,36-83,36,34,14,0.000000,Houston Rockets,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12879,2024-06-06,Boston Celtics,2,107,16-42,16,39-82,39,47,23,...,89,7-27,7,35-84,35,43,9,62.000000,Boston Celtics,0
12880,2024-06-09,Boston Celtics,2,105,10-39,10,38-84,38,41,29,...,98,6-26,6,38-80,38,43,21,61.386139,Boston Celtics,0
12881,2024-06-12,Dallas Mavericks,7,99,9-25,9,38-86,38,43,15,...,106,17-46,17,38-82,38,36,26,79.797980,Boston Celtics,1
12882,2024-06-14,Dallas Mavericks,7,122,15-37,15,46-91,46,52,21,...,84,14-41,14,29-80,29,31,18,79.000000,Dallas Mavericks,0


In [37]:
#Format as percentage
dfLim5['homeTeam_3P'] = dfLim5['homeTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim5['awayTeam_3P'] = dfLim5['awayTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim5['homeTeam_FG'] = dfLim5['homeTeam_FG'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim5['awayTeam_FG'] = dfLim5['awayTeam_FG'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)

dfLim5

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim5['homeTeam_3P'] = dfLim5['homeTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim5['awayTeam_3P'] = dfLim5['awayTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https:

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,homeTeam_3P_made,homeTeam_FG,homeTeam_FG_made,homeTeam_Total_Reb,homeTeam_Ast,...,awayTeam_points_total,awayTeam_3P,awayTeam_3P_made,awayTeam_FG,awayTeam_FG_made,awayTeam_Total_Reb,awayTeam_Ast,awayWinPct,winner,winner_binary
0,2014-04-29,Chicago Bulls,5,69,0.375000,6,0.333333,25,43,19,...,75,0.222222,2,0.405405,30,49,16,100.000000,Washington Wizards,1
1,2014-04-29,Oklahoma City Thunder,21,99,0.387097,12,0.391304,36,51,24,...,100,0.368421,7,0.425287,37,50,21,100.000000,Memphis Grizzlies,1
2,2014-04-29,Los Angeles Clippers,13,113,0.400000,8,0.486842,37,41,20,...,103,0.384615,10,0.470588,40,42,24,0.000000,Los Angeles Clippers,0
3,2014-04-30,Toronto Raptors,28,115,0.461538,12,0.480519,37,37,21,...,113,0.478261,11,0.533333,40,34,22,0.000000,Toronto Raptors,0
4,2014-04-30,Houston Rockets,11,108,0.320000,8,0.472527,43,48,23,...,98,0.360000,9,0.433735,36,34,14,0.000000,Houston Rockets,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12879,2024-06-06,Boston Celtics,2,107,0.380952,16,0.475610,39,47,23,...,89,0.259259,7,0.416667,35,43,9,62.000000,Boston Celtics,0
12880,2024-06-09,Boston Celtics,2,105,0.256410,10,0.452381,38,41,29,...,98,0.230769,6,0.475000,38,43,21,61.386139,Boston Celtics,0
12881,2024-06-12,Dallas Mavericks,7,99,0.360000,9,0.441860,38,43,15,...,106,0.369565,17,0.463415,38,36,26,79.797980,Boston Celtics,1
12882,2024-06-14,Dallas Mavericks,7,122,0.405405,15,0.505495,46,52,21,...,84,0.341463,14,0.362500,29,31,18,79.000000,Dallas Mavericks,0


In [38]:
dfIDtoStat5 = pd.DataFrame(columns=['date', 'team_id', 'team_points_total', '3P%', '3P_made', 'FG%', 'FG_made', 'Total_Reb', 'team_Ast', 'WinPct'])
dfIDtoStat5

Unnamed: 0,date,team_id,team_points_total,3P%,3P_made,FG%,FG_made,Total_Reb,team_Ast,WinPct


In [39]:
# Create DataFrame for home team statistics
home_df = dfLim5[['date', 'homeTeamSubject_id', 'homeTeam_points_total', 'homeTeam_3P', 'homeTeam_3P_made', 'homeTeam_FG', 'homeTeam_FG_made', 'homeTeam_Total_Reb', 'homeTeam_Ast', 'homeWinPct']].copy()
home_df.columns = ['date', 'team_id', 'team_points_total', '3P%', '3P_made', 'FG%', 'FG_made', 'Total_Reb', 'team_Ast', 'WinPct']

# Create DataFrame for away team statistics
away_df = dfLim5[['date', 'awayTeamSubject_id', 'awayTeam_points_total', 'awayTeam_3P', 'awayTeam_3P_made', 'awayTeam_FG', 'awayTeam_FG_made', 'awayTeam_Total_Reb', 'awayTeam_Ast', 'awayWinPct']].copy()
away_df.columns = ['date', 'team_id', 'team_points_total', '3P%', '3P_made', 'FG%', 'FG_made', 'Total_Reb', 'team_Ast', 'WinPct']

# Combine both DataFrames
dfIDtoStat5 = pd.concat([home_df, away_df], ignore_index=True)

# Display the resulting DataFrame
dfIDtoStat5

Unnamed: 0,date,team_id,team_points_total,3P%,3P_made,FG%,FG_made,Total_Reb,team_Ast,WinPct
0,2014-04-29,5,69,0.375000,6,0.333333,25,43,19,0.000000
1,2014-04-29,21,99,0.387097,12,0.391304,36,51,24,0.000000
2,2014-04-29,13,113,0.400000,8,0.486842,37,41,20,100.000000
3,2014-04-30,28,115,0.461538,12,0.480519,37,37,21,100.000000
4,2014-04-30,11,108,0.320000,8,0.472527,43,48,23,100.000000
...,...,...,...,...,...,...,...,...,...,...
25763,2024-06-06,7,89,0.259259,7,0.416667,35,43,9,62.000000
25764,2024-06-09,7,98,0.230769,6,0.475000,38,43,21,61.386139
25765,2024-06-12,2,106,0.369565,17,0.463415,38,36,26,79.797980
25766,2024-06-14,2,84,0.341463,14,0.362500,29,31,18,79.000000


In [40]:
#No need to reverse df becasue new dataset is already reversed

In [41]:
#X1.Shape is num data, window size, variables
X5, y5 = df_to_X_y(dfLim5, dfIDtoStat5, 10)
X5.shape, y5.shape

((12681, 10, 16), (12681,))

In [42]:
#70-15-15 split
X5_train, X5_temp, y5_train, y5_temp = train_test_split(X5, y5, test_size=0.3, random_state=42)
X5_val, X5_test, y5_val, y5_test = train_test_split(X5_temp, y5_temp, test_size=0.5, random_state=42)
X5_train.shape, y5_train.shape, X5_val.shape, y5_val.shape, X5_test.shape, y5_test.shape

((8876, 10, 16), (8876,), (1902, 10, 16), (1902,), (1903, 10, 16), (1903,))

In [43]:
model5 = Sequential()
#You need to change the input layer to reflect the amount of variables. first number is window size, second is variables
model5.add(InputLayer((10, 16)))
model5.add(LSTM(64, activation='tanh', return_sequences=True))
model5.add(LSTM(32, activation='tanh'))
model5.add(Dense(16, activation='relu'))
model5.add(Dense(8, activation='relu'))
model5.add(Dense(1, activation='sigmoid'))

model5.summary()

In [44]:
earlystopping = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)
model5.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model5.fit(X5_train, y5_train, validation_data=(X5_val, y5_val), epochs=100, callbacks=[earlystopping])

Epoch 1/100
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.6150 - loss: 0.6591 - val_accuracy: 0.6472 - val_loss: 0.6245
Epoch 2/100
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.6288 - loss: 0.6427 - val_accuracy: 0.6509 - val_loss: 0.6240
Epoch 3/100
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.6440 - loss: 0.6321 - val_accuracy: 0.6393 - val_loss: 0.6324
Epoch 4/100
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.6467 - loss: 0.6390 - val_accuracy: 0.6351 - val_loss: 0.6307
Epoch 5/100
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.6387 - loss: 0.6355 - val_accuracy: 0.6609 - val_loss: 0.6204
Epoch 6/100
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.6489 - loss: 0.6287 - val_accuracy: 0.6462 - val_loss: 0.6236
Epoch 7/100
[1m278/27

<keras.src.callbacks.history.History at 0x177dee3c0>

In [45]:
y_pred = model5.predict(X5_test)
y_pred_binary = (y_pred > 0.5).astype(int)
print(accuracy_score(y5_test, y_pred_binary))

[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step
0.6426694692590647
