In [43]:
import tensorflow as tf
import os
import pandas as pd 
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense, InputLayer, Dropout, BatchNormalization
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import accuracy_score

In [44]:
df = pd.read_csv('nbsDataAllScrapedInfo.csv')
df

Unnamed: 0,date,homeTeam,homeTeam_id,homeTeam_points_total,homeTeam_points_q1,homeTeam_points_q2,homeTeam_points_q3,homeTeam_points_q4,homeTeam_points_1OT,homeTeam_points_2OT,...,winner,season,homeWinPct,awayWinPct,gameNumber,isPlayoffGame,homeWins,homeLosses,awayWins,awayLosses
0,2007-04-18,Utah Jazz,UTA,101,20,30,25,26,0,0,...,Utah Jazz,2006-2007,100.000000,0.000000,1,False,1,0,0,1
1,2007-04-18,Toronto Raptors,TOR,119,28,29,29,33,0,0,...,Philadelphia 76ers,2006-2007,0.000000,100.000000,2,False,0,1,1,0
2,2007-04-18,Seattle Supersonics,SEA,75,16,21,22,16,0,0,...,Dallas Mavericks,2006-2007,0.000000,100.000000,3,False,0,1,1,0
3,2007-04-18,San Antonio Spurs,SAS,77,16,24,22,15,0,0,...,Denver Nuggets,2006-2007,0.000000,100.000000,4,False,0,1,1,0
4,2007-04-18,Sacramento Kings,SAC,106,30,27,23,26,0,0,...,Los Angeles Lakers,2006-2007,0.000000,100.000000,5,False,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21872,2024-06-06,Boston Celtics,BOS,107,37,26,23,21,0,0,...,Boston Celtics,2023-2024,79.381443,62.000000,1315,True,77,20,62,38
21873,2024-06-09,Boston Celtics,BOS,105,25,29,29,22,0,0,...,Boston Celtics,2023-2024,79.591837,61.386139,1316,True,78,20,62,39
21874,2024-06-12,Dallas Mavericks,DAL,99,31,20,19,29,0,0,...,Boston Celtics,2023-2024,60.784314,79.797980,1317,True,62,40,79,20
21875,2024-06-14,Dallas Mavericks,DAL,122,34,27,31,30,0,0,...,Dallas Mavericks,2023-2024,61.165049,79.000000,1318,True,63,40,79,21


In [45]:
subjectID_dict = {
    "ATL": 1,
    "BOS": 2,
    "BKN": 3,
    "CHA": 4,
    "CHI": 5,
    "CLE": 6,
    "DAL": 7,
    "DEN": 8,
    "DET": 9,
    "GSW": 10,
    "HOU": 11,
    "IND": 12,
    "LAC": 13,
    "LAL": 14,
    "MEM": 15,
    "MIA": 16,
    "MIL": 17,
    "MIN": 18,
    "NOP": 19,
    "NYK": 20,
    "OKC": 21,
    "ORL": 22,
    "PHI": 23,
    "PHX": 24,
    "POR": 25,
    "SAC": 26,
    "SAS": 27,
    "TOR": 28,
    "UTA": 29,
    "WAS": 30,
    "NOH": 19,
    "NJN": 3,
    "SEA": 21
}

In [46]:
df["homeTeamSubject_id"] = df["homeTeam_id"].map(subjectID_dict)
df["awayTeamSubject_id"] = df["awayTeam_id"].map(subjectID_dict)

cols = df.columns.tolist()
cols.insert(cols.index("homeTeam_id") + 1, cols.pop(cols.index("homeTeamSubject_id")))
cols.insert(cols.index("awayTeam_id") + 1, cols.pop(cols.index("awayTeamSubject_id")))
df = df[cols]

df

Unnamed: 0,date,homeTeam,homeTeam_id,homeTeamSubject_id,homeTeam_points_total,homeTeam_points_q1,homeTeam_points_q2,homeTeam_points_q3,homeTeam_points_q4,homeTeam_points_1OT,...,winner,season,homeWinPct,awayWinPct,gameNumber,isPlayoffGame,homeWins,homeLosses,awayWins,awayLosses
0,2007-04-18,Utah Jazz,UTA,29,101,20,30,25,26,0,...,Utah Jazz,2006-2007,100.000000,0.000000,1,False,1,0,0,1
1,2007-04-18,Toronto Raptors,TOR,28,119,28,29,29,33,0,...,Philadelphia 76ers,2006-2007,0.000000,100.000000,2,False,0,1,1,0
2,2007-04-18,Seattle Supersonics,SEA,21,75,16,21,22,16,0,...,Dallas Mavericks,2006-2007,0.000000,100.000000,3,False,0,1,1,0
3,2007-04-18,San Antonio Spurs,SAS,27,77,16,24,22,15,0,...,Denver Nuggets,2006-2007,0.000000,100.000000,4,False,0,1,1,0
4,2007-04-18,Sacramento Kings,SAC,26,106,30,27,23,26,0,...,Los Angeles Lakers,2006-2007,0.000000,100.000000,5,False,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21872,2024-06-06,Boston Celtics,BOS,2,107,37,26,23,21,0,...,Boston Celtics,2023-2024,79.381443,62.000000,1315,True,77,20,62,38
21873,2024-06-09,Boston Celtics,BOS,2,105,25,29,29,22,0,...,Boston Celtics,2023-2024,79.591837,61.386139,1316,True,78,20,62,39
21874,2024-06-12,Dallas Mavericks,DAL,7,99,31,20,19,29,0,...,Boston Celtics,2023-2024,60.784314,79.797980,1317,True,62,40,79,20
21875,2024-06-14,Dallas Mavericks,DAL,7,122,34,27,31,30,0,...,Dallas Mavericks,2023-2024,61.165049,79.000000,1318,True,63,40,79,21


In [47]:
#limit df to few variables
dfLim = df[[
    "date",
    "homeTeam",
    "homeTeamSubject_id", 
    "homeTeam_points_total", 
    "awayTeam",
    "awayTeamSubject_id", 
    "awayTeam_points_total",
    "winner"
]]
dfLim

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,awayTeam,awayTeamSubject_id,awayTeam_points_total,winner
0,2007-04-18,Utah Jazz,29,101,Houston Rockets,11,91,Utah Jazz
1,2007-04-18,Toronto Raptors,28,119,Philadelphia 76ers,23,122,Philadelphia 76ers
2,2007-04-18,Seattle Supersonics,21,75,Dallas Mavericks,7,106,Dallas Mavericks
3,2007-04-18,San Antonio Spurs,27,77,Denver Nuggets,8,100,Denver Nuggets
4,2007-04-18,Sacramento Kings,26,106,Los Angeles Lakers,14,117,Los Angeles Lakers
...,...,...,...,...,...,...,...,...
21872,2024-06-06,Boston Celtics,2,107,Dallas Mavericks,7,89,Boston Celtics
21873,2024-06-09,Boston Celtics,2,105,Dallas Mavericks,7,98,Boston Celtics
21874,2024-06-12,Dallas Mavericks,7,99,Boston Celtics,2,106,Boston Celtics
21875,2024-06-14,Dallas Mavericks,7,122,Boston Celtics,2,84,Dallas Mavericks


In [48]:
#reformat the dates so they can be compared
dfLim['date'] = dfLim['date'].apply(lambda x: datetime.strptime(x, "%Y-%m-%d"))
dfLim


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim['date'] = dfLim['date'].apply(lambda x: datetime.strptime(x, "%Y-%m-%d"))


Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,awayTeam,awayTeamSubject_id,awayTeam_points_total,winner
0,2007-04-18,Utah Jazz,29,101,Houston Rockets,11,91,Utah Jazz
1,2007-04-18,Toronto Raptors,28,119,Philadelphia 76ers,23,122,Philadelphia 76ers
2,2007-04-18,Seattle Supersonics,21,75,Dallas Mavericks,7,106,Dallas Mavericks
3,2007-04-18,San Antonio Spurs,27,77,Denver Nuggets,8,100,Denver Nuggets
4,2007-04-18,Sacramento Kings,26,106,Los Angeles Lakers,14,117,Los Angeles Lakers
...,...,...,...,...,...,...,...,...
21872,2024-06-06,Boston Celtics,2,107,Dallas Mavericks,7,89,Boston Celtics
21873,2024-06-09,Boston Celtics,2,105,Dallas Mavericks,7,98,Boston Celtics
21874,2024-06-12,Dallas Mavericks,7,99,Boston Celtics,2,106,Boston Celtics
21875,2024-06-14,Dallas Mavericks,7,122,Boston Celtics,2,84,Dallas Mavericks


In [49]:
dfLim['winner_binary'] = (dfLim['winner'] == df['awayTeam']).astype(int)
dfLim

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim['winner_binary'] = (dfLim['winner'] == df['awayTeam']).astype(int)


Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,awayTeam,awayTeamSubject_id,awayTeam_points_total,winner,winner_binary
0,2007-04-18,Utah Jazz,29,101,Houston Rockets,11,91,Utah Jazz,0
1,2007-04-18,Toronto Raptors,28,119,Philadelphia 76ers,23,122,Philadelphia 76ers,1
2,2007-04-18,Seattle Supersonics,21,75,Dallas Mavericks,7,106,Dallas Mavericks,1
3,2007-04-18,San Antonio Spurs,27,77,Denver Nuggets,8,100,Denver Nuggets,1
4,2007-04-18,Sacramento Kings,26,106,Los Angeles Lakers,14,117,Los Angeles Lakers,1
...,...,...,...,...,...,...,...,...,...
21872,2024-06-06,Boston Celtics,2,107,Dallas Mavericks,7,89,Boston Celtics,0
21873,2024-06-09,Boston Celtics,2,105,Dallas Mavericks,7,98,Boston Celtics,0
21874,2024-06-12,Dallas Mavericks,7,99,Boston Celtics,2,106,Boston Celtics,1
21875,2024-06-14,Dallas Mavericks,7,122,Boston Celtics,2,84,Dallas Mavericks,0


In [50]:
dfIDtoStat = pd.DataFrame(columns=['date', 'team_id', 'team_points_total'])
dfIDtoStat

Unnamed: 0,date,team_id,team_points_total


In [51]:
for index, row in dfLim.iterrows():
    date = row['date']
    homeTeam_id = row['homeTeamSubject_id']
    homeTeam_points_total = row['homeTeam_points_total']
    awayTeam_id = row['awayTeamSubject_id']
    awayTeam_points_total = row['awayTeam_points_total']
    dfIDtoStat.loc[len(dfIDtoStat)] = {"date": date, 'team_id': homeTeam_id, 'team_points_total':homeTeam_points_total}
    dfIDtoStat.loc[len(dfIDtoStat)] = {"date": date, 'team_id': awayTeam_id, 'team_points_total':awayTeam_points_total}
dfIDtoStat

Unnamed: 0,date,team_id,team_points_total
0,2007-04-18,29,101
1,2007-04-18,11,91
2,2007-04-18,28,119
3,2007-04-18,23,122
4,2007-04-18,21,75
...,...,...,...
43749,2024-06-12,2,106
43750,2024-06-14,7,122
43751,2024-06-14,2,84
43752,2024-06-17,2,106


In [52]:
dfIDtoStat.head()

Unnamed: 0,date,team_id,team_points_total
0,2007-04-18,29,101
1,2007-04-18,11,91
2,2007-04-18,28,119
3,2007-04-18,23,122
4,2007-04-18,21,75


In [53]:

dfLim

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,awayTeam,awayTeamSubject_id,awayTeam_points_total,winner,winner_binary
0,2007-04-18,Utah Jazz,29,101,Houston Rockets,11,91,Utah Jazz,0
1,2007-04-18,Toronto Raptors,28,119,Philadelphia 76ers,23,122,Philadelphia 76ers,1
2,2007-04-18,Seattle Supersonics,21,75,Dallas Mavericks,7,106,Dallas Mavericks,1
3,2007-04-18,San Antonio Spurs,27,77,Denver Nuggets,8,100,Denver Nuggets,1
4,2007-04-18,Sacramento Kings,26,106,Los Angeles Lakers,14,117,Los Angeles Lakers,1
...,...,...,...,...,...,...,...,...,...
21872,2024-06-06,Boston Celtics,2,107,Dallas Mavericks,7,89,Boston Celtics,0
21873,2024-06-09,Boston Celtics,2,105,Dallas Mavericks,7,98,Boston Celtics,0
21874,2024-06-12,Dallas Mavericks,7,99,Boston Celtics,2,106,Boston Celtics,1
21875,2024-06-14,Dallas Mavericks,7,122,Boston Celtics,2,84,Dallas Mavericks,0


In [54]:

dfIDtoStat

Unnamed: 0,date,team_id,team_points_total
0,2007-04-18,29,101
1,2007-04-18,11,91
2,2007-04-18,28,119
3,2007-04-18,23,122
4,2007-04-18,21,75
...,...,...,...
43749,2024-06-12,2,106
43750,2024-06-14,7,122
43751,2024-06-14,2,84
43752,2024-06-17,2,106


In [55]:
'''def df_to_X_y_Simple(df, window_size=5):
    df_as_np = df.to_numpy()
    X = []
    y = []
    for i in range(len(df_as_np)-window_size):
        row = [r for r in df_as_np[i:i+window_size]]
        X.append(row)
        label = df_as_np[i+window_size][0]
        y.append(label)
    return np.array(X), np.array(y)'''

def df_to_X_y(df, dfWindow, window_size):
    X = []
    y = []
    #each team must have played window_size games before data can be extracted
    for index, row in df.iterrows():
        homeTeam_id = row['homeTeamSubject_id']
        awayTeam_id = row['awayTeamSubject_id']
        date = row['date']
        #limit dfWindow to include rows that occured before date of row we are on
        dfWindow_before_date = dfWindow[dfWindow['date']< date]
        homeTeam_occurrences = dfWindow_before_date[dfWindow_before_date['team_id'] == homeTeam_id].shape[0]
        awayTeam_occurrences = dfWindow_before_date[dfWindow_before_date['team_id'] == awayTeam_id].shape[0]
        if homeTeam_occurrences > window_size and awayTeam_occurrences > window_size:
            homeTeam_window = (dfWindow_before_date[dfWindow_before_date['team_id'] == homeTeam_id].sort_values(by='date', ascending=False).head(window_size).drop(columns=['date', 'team_id']))
            awayTeam_window = (dfWindow_before_date[dfWindow_before_date['team_id'] == awayTeam_id].sort_values(by='date', ascending=False).head(window_size).drop(columns=['date', 'team_id']))
            homeTeam_window = homeTeam_window.to_numpy()
            awayTeam_window = awayTeam_window.to_numpy()
            combined_window = np.hstack((homeTeam_window, awayTeam_window))
            X.append(combined_window)
            y.append(row['winner_binary'])
    
    return np.array(X), np.array(y)   



In [None]:
dfIDtoStat['team_points_total'] = pd.to_numeric(dfIDtoStat['team_points_total'], errors='coerce')
window_size =5 
exX =[]
exY = []
for index, row in dfLim.iterrows():
        homeTeam_id = row['homeTeamSubject_id']
        awayTeam_id = row['awayTeamSubject_id']
        date = row['date']
        #limit dfWindow to include rows that occured before date of row we are on
        dfWindow_before_date = dfIDtoStat[dfIDtoStat['date']< date]
        homeTeam_occurrences = dfWindow_before_date[dfWindow_before_date['team_id'] == homeTeam_id].shape[0]
        awayTeam_occurrences = dfWindow_before_date[dfWindow_before_date['team_id'] == awayTeam_id].shape[0]
        if homeTeam_occurrences > window_size and awayTeam_occurrences > window_size:
                homeTeam_window = (dfWindow_before_date[dfWindow_before_date['team_id'] == homeTeam_id].sort_values(by='date', ascending=False).head(window_size))
                awayTeam_window = (dfWindow_before_date[dfWindow_before_date['team_id'] == awayTeam_id].sort_values(by='date', ascending=False).head(window_size))
                print(date)
                print(homeTeam_window)
                print(awayTeam_window)
                print(type(homeTeam_window))
                homeTeam_window = homeTeam_window.drop(columns=['date', 'team_id'])
                awayTeam_window = awayTeam_window.drop(columns=['date', 'team_id'])
                homeTeam_window = homeTeam_window.to_numpy()
                awayTeam_window = awayTeam_window.to_numpy()
                combined_window = np.hstack((homeTeam_window, awayTeam_window))
                exX.append(combined_window)
                exY.append(row['winner_binary'])
                print(exX)
                print(exY)
                break
                        

                

2007-05-03 00:00:00
         date  team_id  team_points_total
81 2007-04-30       29                 92
64 2007-04-28       29                 98
50 2007-04-26       29                 81
35 2007-04-23       29                 90
21 2007-04-21       29                 75
         date  team_id  team_points_total
80 2007-04-30       11                 96
65 2007-04-28       11                 85
51 2007-04-26       11                 67
34 2007-04-23       11                 98
20 2007-04-21       11                 84
<class 'pandas.core.frame.DataFrame'>
[array([[92, 96],
       [98, 85],
       [81, 67],
       [90, 98],
       [75, 84]], dtype=int64)]
[0]


In [57]:
#X1.Shape is num data, window size, variables
X1, y1 = df_to_X_y(dfLim, dfIDtoStat, 5)
X1.shape, y1.shape

((21761, 5, 2), (21761,))

In [58]:
#70-15-15 split
X1_train, X1_temp, y1_train, y1_temp = train_test_split(X1, y1, test_size=0.3, random_state=42)
X1_val, X1_test, y1_val, y1_test = train_test_split(X1_temp, y1_temp, test_size=0.5, random_state=42)
X1_train.shape, y1_train.shape, X1_val.shape, y1_val.shape, X1_test.shape, y1_test.shape

((15232, 5, 2), (15232,), (3264, 5, 2), (3264,), (3265, 5, 2), (3265,))

In [59]:
model1 = Sequential()
model1.add(InputLayer((5, 2)))
model1.add(LSTM(64, activation='tanh', return_sequences=True))
model1.add(LSTM(32, activation='tanh'))
model1.add(Dense(16, activation='relu'))
model1.add(Dense(8, activation='relu'))
model1.add(Dense(1, activation='sigmoid'))

model1.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 5, 64)             17152     
                                                                 
 lstm_3 (LSTM)               (None, 32)                12416     
                                                                 
 dense_3 (Dense)             (None, 16)                528       
                                                                 
 dense_4 (Dense)             (None, 8)                 136       
                                                                 
 dense_5 (Dense)             (None, 1)                 9         
                                                                 
Total params: 30241 (118.13 KB)
Trainable params: 30241 (118.13 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [60]:
earlystopping = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)
model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [61]:
model1.fit(X1_train, y1_train, validation_data=(X1_val, y1_val), epochs=100, callbacks=[earlystopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100


<keras.src.callbacks.History at 0x1c751f940d0>

In [62]:
y_pred = model1.predict(X1_test)
y_pred_binary = (y_pred > 0.5).astype(int)
print(accuracy_score(y1_test, y_pred_binary))


0.5914241960183767


In [69]:
#Now adding 3 point percentages
dfLim2 = df[[
    "date",
    "homeTeam",
    "homeTeamSubject_id", 
    "homeTeam_points_total", 
    #added homeTeam3P
    "homeTeam_3P",
    "awayTeam",
    "awayTeamSubject_id", 
    "awayTeam_points_total",
    #added awayTeam3P
    "awayTeam_3P",
    "winner"
]]
dfLim2

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,winner
0,2007-04-18,Utah Jazz,29,101,4-11,Houston Rockets,11,91,6-25,Utah Jazz
1,2007-04-18,Toronto Raptors,28,119,7-19,Philadelphia 76ers,23,122,5-9,Philadelphia 76ers
2,2007-04-18,Seattle Supersonics,21,75,1-7,Dallas Mavericks,7,106,4-13,Dallas Mavericks
3,2007-04-18,San Antonio Spurs,27,77,5-20,Denver Nuggets,8,100,11-26,Denver Nuggets
4,2007-04-18,Sacramento Kings,26,106,7-25,Los Angeles Lakers,14,117,2-14,Los Angeles Lakers
...,...,...,...,...,...,...,...,...,...,...
21872,2024-06-06,Boston Celtics,2,107,16-42,Dallas Mavericks,7,89,7-27,Boston Celtics
21873,2024-06-09,Boston Celtics,2,105,10-39,Dallas Mavericks,7,98,6-26,Boston Celtics
21874,2024-06-12,Dallas Mavericks,7,99,9-25,Boston Celtics,2,106,17-46,Boston Celtics
21875,2024-06-14,Dallas Mavericks,7,122,15-37,Boston Celtics,2,84,14-41,Dallas Mavericks


In [70]:
#reformat the dates so they can be compared
dfLim2

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,winner
0,2007-04-18,Utah Jazz,29,101,4-11,Houston Rockets,11,91,6-25,Utah Jazz
1,2007-04-18,Toronto Raptors,28,119,7-19,Philadelphia 76ers,23,122,5-9,Philadelphia 76ers
2,2007-04-18,Seattle Supersonics,21,75,1-7,Dallas Mavericks,7,106,4-13,Dallas Mavericks
3,2007-04-18,San Antonio Spurs,27,77,5-20,Denver Nuggets,8,100,11-26,Denver Nuggets
4,2007-04-18,Sacramento Kings,26,106,7-25,Los Angeles Lakers,14,117,2-14,Los Angeles Lakers
...,...,...,...,...,...,...,...,...,...,...
21872,2024-06-06,Boston Celtics,2,107,16-42,Dallas Mavericks,7,89,7-27,Boston Celtics
21873,2024-06-09,Boston Celtics,2,105,10-39,Dallas Mavericks,7,98,6-26,Boston Celtics
21874,2024-06-12,Dallas Mavericks,7,99,9-25,Boston Celtics,2,106,17-46,Boston Celtics
21875,2024-06-14,Dallas Mavericks,7,122,15-37,Boston Celtics,2,84,14-41,Dallas Mavericks


In [71]:
dfLim2['winner_binary'] = (dfLim2['winner'] == df['awayTeam']).astype(int)
dfLim2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim2['winner_binary'] = (dfLim2['winner'] == df['awayTeam']).astype(int)


Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,winner,winner_binary
0,2007-04-18,Utah Jazz,29,101,4-11,Houston Rockets,11,91,6-25,Utah Jazz,0
1,2007-04-18,Toronto Raptors,28,119,7-19,Philadelphia 76ers,23,122,5-9,Philadelphia 76ers,1
2,2007-04-18,Seattle Supersonics,21,75,1-7,Dallas Mavericks,7,106,4-13,Dallas Mavericks,1
3,2007-04-18,San Antonio Spurs,27,77,5-20,Denver Nuggets,8,100,11-26,Denver Nuggets,1
4,2007-04-18,Sacramento Kings,26,106,7-25,Los Angeles Lakers,14,117,2-14,Los Angeles Lakers,1
...,...,...,...,...,...,...,...,...,...,...,...
21872,2024-06-06,Boston Celtics,2,107,16-42,Dallas Mavericks,7,89,7-27,Boston Celtics,0
21873,2024-06-09,Boston Celtics,2,105,10-39,Dallas Mavericks,7,98,6-26,Boston Celtics,0
21874,2024-06-12,Dallas Mavericks,7,99,9-25,Boston Celtics,2,106,17-46,Boston Celtics,1
21875,2024-06-14,Dallas Mavericks,7,122,15-37,Boston Celtics,2,84,14-41,Dallas Mavericks,0


In [72]:
#Format as percentage
dfLim2['homeTeam_3P'] = dfLim2['homeTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim2['awayTeam_3P'] = dfLim2['awayTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim2['homeTeam_3P'] = dfLim2['homeTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim2['awayTeam_3P'] = dfLim2['awayTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)


Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,winner,winner_binary
0,2007-04-18,Utah Jazz,29,101,0.363636,Houston Rockets,11,91,0.240000,Utah Jazz,0
1,2007-04-18,Toronto Raptors,28,119,0.368421,Philadelphia 76ers,23,122,0.555556,Philadelphia 76ers,1
2,2007-04-18,Seattle Supersonics,21,75,0.142857,Dallas Mavericks,7,106,0.307692,Dallas Mavericks,1
3,2007-04-18,San Antonio Spurs,27,77,0.250000,Denver Nuggets,8,100,0.423077,Denver Nuggets,1
4,2007-04-18,Sacramento Kings,26,106,0.280000,Los Angeles Lakers,14,117,0.142857,Los Angeles Lakers,1
...,...,...,...,...,...,...,...,...,...,...,...
21872,2024-06-06,Boston Celtics,2,107,0.380952,Dallas Mavericks,7,89,0.259259,Boston Celtics,0
21873,2024-06-09,Boston Celtics,2,105,0.256410,Dallas Mavericks,7,98,0.230769,Boston Celtics,0
21874,2024-06-12,Dallas Mavericks,7,99,0.360000,Boston Celtics,2,106,0.369565,Boston Celtics,1
21875,2024-06-14,Dallas Mavericks,7,122,0.405405,Boston Celtics,2,84,0.341463,Dallas Mavericks,0


In [73]:
dfIDtoStat2 = pd.DataFrame(columns=['date', 'team_id', 'team_points_total', '3P%'])
dfIDtoStat['team_points_total'] = pd.to_numeric(dfIDtoStat['team_points_total'], errors='coerce')
dfIDtoStat2

Unnamed: 0,date,team_id,team_points_total,3P%


In [74]:
#populate dfIDtoStat2
dfIDtoStat['team_points_total'] = pd.to_numeric(dfIDtoStat['team_points_total'], errors='coerce')
for index, row in dfLim2.iterrows():
    date = row['date']
    homeTeam_id = row['homeTeamSubject_id']
    homeTeam_points_total = row['homeTeam_points_total']
    #added homeTeam3P
    homeTeam_3P = row['homeTeam_3P']
    awayTeam_id = row['awayTeamSubject_id']
    awayTeam_points_total = row['awayTeam_points_total']
    #added awayTeam3P
    awayTeam_3P = row['homeTeam_3P']
    dfIDtoStat2.loc[len(dfIDtoStat2)] = {"date": date, 'team_id': homeTeam_id, 'team_points_total':homeTeam_points_total, '3P%':homeTeam_3P}
    dfIDtoStat2.loc[len(dfIDtoStat2)] = {"date": date, 'team_id': awayTeam_id, 'team_points_total':awayTeam_points_total, '3P%':awayTeam_3P}
dfIDtoStat2

Unnamed: 0,date,team_id,team_points_total,3P%
0,2007-04-18,29,101,0.363636
1,2007-04-18,11,91,0.363636
2,2007-04-18,28,119,0.368421
3,2007-04-18,23,122,0.368421
4,2007-04-18,21,75,0.142857
...,...,...,...,...
43749,2024-06-12,2,106,0.360000
43750,2024-06-14,7,122,0.405405
43751,2024-06-14,2,84,0.405405
43752,2024-06-17,2,106,0.333333


In [75]:
#reverse df order - should now start with 2014
dfLim2

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,winner,winner_binary
0,2007-04-18,Utah Jazz,29,101,0.363636,Houston Rockets,11,91,0.240000,Utah Jazz,0
1,2007-04-18,Toronto Raptors,28,119,0.368421,Philadelphia 76ers,23,122,0.555556,Philadelphia 76ers,1
2,2007-04-18,Seattle Supersonics,21,75,0.142857,Dallas Mavericks,7,106,0.307692,Dallas Mavericks,1
3,2007-04-18,San Antonio Spurs,27,77,0.250000,Denver Nuggets,8,100,0.423077,Denver Nuggets,1
4,2007-04-18,Sacramento Kings,26,106,0.280000,Los Angeles Lakers,14,117,0.142857,Los Angeles Lakers,1
...,...,...,...,...,...,...,...,...,...,...,...
21872,2024-06-06,Boston Celtics,2,107,0.380952,Dallas Mavericks,7,89,0.259259,Boston Celtics,0
21873,2024-06-09,Boston Celtics,2,105,0.256410,Dallas Mavericks,7,98,0.230769,Boston Celtics,0
21874,2024-06-12,Dallas Mavericks,7,99,0.360000,Boston Celtics,2,106,0.369565,Boston Celtics,1
21875,2024-06-14,Dallas Mavericks,7,122,0.405405,Boston Celtics,2,84,0.341463,Dallas Mavericks,0


In [78]:
#reverse order - should now start with 2014
dfIDtoStat2

Unnamed: 0,date,team_id,team_points_total,3P%
0,2007-04-18,29,101,0.363636
1,2007-04-18,11,91,0.363636
2,2007-04-18,28,119,0.368421
3,2007-04-18,23,122,0.368421
4,2007-04-18,21,75,0.142857
...,...,...,...,...
43749,2024-06-12,2,106,0.360000
43750,2024-06-14,7,122,0.405405
43751,2024-06-14,2,84,0.405405
43752,2024-06-17,2,106,0.333333


In [79]:
#X1.Shape is num data, window size, variables
X2, y2 = df_to_X_y(dfLim2, dfIDtoStat2, 5)
X2.shape, y2.shape

((21761, 5, 4), (21761,))

In [80]:
#70-15-15 split
X2_train, X2_temp, y2_train, y2_temp = train_test_split(X2, y2, test_size=0.3, random_state=42)
X2_val, X2_test, y2_val, y2_test = train_test_split(X2_temp, y2_temp, test_size=0.5, random_state=42)
X2_train.shape, y2_train.shape, X2_val.shape, y2_val.shape, X2_test.shape, y2_test.shape

((15232, 5, 4), (15232,), (3264, 5, 4), (3264,), (3265, 5, 4), (3265,))

In [81]:
model2 = Sequential()
#You need to change the input layer to reflect the amount of variables. first number is window size, second is variables
model2.add(InputLayer((5, 4)))
model2.add(LSTM(64, activation='tanh', return_sequences=True))
model2.add(LSTM(32, activation='tanh'))
model2.add(Dense(16, activation='relu'))
model2.add(Dense(8, activation='relu'))
model2.add(Dense(1, activation='sigmoid'))

model2.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_4 (LSTM)               (None, 5, 64)             17664     
                                                                 
 lstm_5 (LSTM)               (None, 32)                12416     
                                                                 
 dense_6 (Dense)             (None, 16)                528       
                                                                 
 dense_7 (Dense)             (None, 8)                 136       
                                                                 
 dense_8 (Dense)             (None, 1)                 9         
                                                                 
Total params: 30753 (120.13 KB)
Trainable params: 30753 (120.13 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [82]:
earlystopping = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [83]:
from datetime import datetime

def convert_date_to_numeric(date_str):
    try:
        return datetime.strptime(date_str, '%Y-%m-%d').timestamp()
    except ValueError:
        return date_str  # Keep as is if it's not a valid date format

In [84]:

model2.fit(X2_train, y2_train, validation_data=(X2_val, y2_val), epochs=100, callbacks=[earlystopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100


<keras.src.callbacks.History at 0x1c75abaf340>

In [85]:
y_pred = model2.predict(X2_test)
y_pred_binary = (y_pred > 0.5).astype(int)
print(accuracy_score(y2_test, y_pred_binary))

0.5898928024502297


In [86]:
#Not adding anything new because I am gonna pull out 3 pointers made
dfLim3 = df[[
    "date",
    "homeTeam",
    "homeTeamSubject_id", 
    "homeTeam_points_total", 
    #added homeTeam3P
    "homeTeam_3P",
    "awayTeam",
    "awayTeamSubject_id", 
    "awayTeam_points_total",
    #added awayTeam3P
    "awayTeam_3P",
    "winner"
]]
dfLim3

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,winner
0,2007-04-18,Utah Jazz,29,101,4-11,Houston Rockets,11,91,6-25,Utah Jazz
1,2007-04-18,Toronto Raptors,28,119,7-19,Philadelphia 76ers,23,122,5-9,Philadelphia 76ers
2,2007-04-18,Seattle Supersonics,21,75,1-7,Dallas Mavericks,7,106,4-13,Dallas Mavericks
3,2007-04-18,San Antonio Spurs,27,77,5-20,Denver Nuggets,8,100,11-26,Denver Nuggets
4,2007-04-18,Sacramento Kings,26,106,7-25,Los Angeles Lakers,14,117,2-14,Los Angeles Lakers
...,...,...,...,...,...,...,...,...,...,...
21872,2024-06-06,Boston Celtics,2,107,16-42,Dallas Mavericks,7,89,7-27,Boston Celtics
21873,2024-06-09,Boston Celtics,2,105,10-39,Dallas Mavericks,7,98,6-26,Boston Celtics
21874,2024-06-12,Dallas Mavericks,7,99,9-25,Boston Celtics,2,106,17-46,Boston Celtics
21875,2024-06-14,Dallas Mavericks,7,122,15-37,Boston Celtics,2,84,14-41,Dallas Mavericks


In [88]:
#reformat the dates so they can be compared
#dfLim3['date'] = dfLim3['date'].apply(lambda x: datetime.strptime(x, "%B %d, %Y"))
dfLim3

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,winner
0,2007-04-18,Utah Jazz,29,101,4-11,Houston Rockets,11,91,6-25,Utah Jazz
1,2007-04-18,Toronto Raptors,28,119,7-19,Philadelphia 76ers,23,122,5-9,Philadelphia 76ers
2,2007-04-18,Seattle Supersonics,21,75,1-7,Dallas Mavericks,7,106,4-13,Dallas Mavericks
3,2007-04-18,San Antonio Spurs,27,77,5-20,Denver Nuggets,8,100,11-26,Denver Nuggets
4,2007-04-18,Sacramento Kings,26,106,7-25,Los Angeles Lakers,14,117,2-14,Los Angeles Lakers
...,...,...,...,...,...,...,...,...,...,...
21872,2024-06-06,Boston Celtics,2,107,16-42,Dallas Mavericks,7,89,7-27,Boston Celtics
21873,2024-06-09,Boston Celtics,2,105,10-39,Dallas Mavericks,7,98,6-26,Boston Celtics
21874,2024-06-12,Dallas Mavericks,7,99,9-25,Boston Celtics,2,106,17-46,Boston Celtics
21875,2024-06-14,Dallas Mavericks,7,122,15-37,Boston Celtics,2,84,14-41,Dallas Mavericks


In [89]:
dfLim3['winner_binary'] = (dfLim3['winner'] == df['awayTeam']).astype(int)
dfLim3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim3['winner_binary'] = (dfLim3['winner'] == df['awayTeam']).astype(int)


Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,winner,winner_binary
0,2007-04-18,Utah Jazz,29,101,4-11,Houston Rockets,11,91,6-25,Utah Jazz,0
1,2007-04-18,Toronto Raptors,28,119,7-19,Philadelphia 76ers,23,122,5-9,Philadelphia 76ers,1
2,2007-04-18,Seattle Supersonics,21,75,1-7,Dallas Mavericks,7,106,4-13,Dallas Mavericks,1
3,2007-04-18,San Antonio Spurs,27,77,5-20,Denver Nuggets,8,100,11-26,Denver Nuggets,1
4,2007-04-18,Sacramento Kings,26,106,7-25,Los Angeles Lakers,14,117,2-14,Los Angeles Lakers,1
...,...,...,...,...,...,...,...,...,...,...,...
21872,2024-06-06,Boston Celtics,2,107,16-42,Dallas Mavericks,7,89,7-27,Boston Celtics,0
21873,2024-06-09,Boston Celtics,2,105,10-39,Dallas Mavericks,7,98,6-26,Boston Celtics,0
21874,2024-06-12,Dallas Mavericks,7,99,9-25,Boston Celtics,2,106,17-46,Boston Celtics,1
21875,2024-06-14,Dallas Mavericks,7,122,15-37,Boston Celtics,2,84,14-41,Dallas Mavericks,0


In [90]:
dfLim3.insert(dfLim3.columns.get_loc('homeTeam_3P') + 1, 'homeTeam_3P_made', dfLim3['homeTeam_3P'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))
dfLim3.insert(dfLim3.columns.get_loc('awayTeam_3P') + 1, 'awayTeam_3P_made', dfLim3['awayTeam_3P'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))
dfLim3

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,homeTeam_3P_made,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,awayTeam_3P_made,winner,winner_binary
0,2007-04-18,Utah Jazz,29,101,4-11,4,Houston Rockets,11,91,6-25,6,Utah Jazz,0
1,2007-04-18,Toronto Raptors,28,119,7-19,7,Philadelphia 76ers,23,122,5-9,5,Philadelphia 76ers,1
2,2007-04-18,Seattle Supersonics,21,75,1-7,1,Dallas Mavericks,7,106,4-13,4,Dallas Mavericks,1
3,2007-04-18,San Antonio Spurs,27,77,5-20,5,Denver Nuggets,8,100,11-26,11,Denver Nuggets,1
4,2007-04-18,Sacramento Kings,26,106,7-25,7,Los Angeles Lakers,14,117,2-14,2,Los Angeles Lakers,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
21872,2024-06-06,Boston Celtics,2,107,16-42,16,Dallas Mavericks,7,89,7-27,7,Boston Celtics,0
21873,2024-06-09,Boston Celtics,2,105,10-39,10,Dallas Mavericks,7,98,6-26,6,Boston Celtics,0
21874,2024-06-12,Dallas Mavericks,7,99,9-25,9,Boston Celtics,2,106,17-46,17,Boston Celtics,1
21875,2024-06-14,Dallas Mavericks,7,122,15-37,15,Boston Celtics,2,84,14-41,14,Dallas Mavericks,0


In [91]:
#Format as percentage
dfLim3['homeTeam_3P'] = dfLim3['homeTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim3['awayTeam_3P'] = dfLim3['awayTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim3['homeTeam_3P'] = dfLim3['homeTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim3['awayTeam_3P'] = dfLim3['awayTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)


Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,homeTeam_3P_made,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,awayTeam_3P_made,winner,winner_binary
0,2007-04-18,Utah Jazz,29,101,0.363636,4,Houston Rockets,11,91,0.240000,6,Utah Jazz,0
1,2007-04-18,Toronto Raptors,28,119,0.368421,7,Philadelphia 76ers,23,122,0.555556,5,Philadelphia 76ers,1
2,2007-04-18,Seattle Supersonics,21,75,0.142857,1,Dallas Mavericks,7,106,0.307692,4,Dallas Mavericks,1
3,2007-04-18,San Antonio Spurs,27,77,0.250000,5,Denver Nuggets,8,100,0.423077,11,Denver Nuggets,1
4,2007-04-18,Sacramento Kings,26,106,0.280000,7,Los Angeles Lakers,14,117,0.142857,2,Los Angeles Lakers,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
21872,2024-06-06,Boston Celtics,2,107,0.380952,16,Dallas Mavericks,7,89,0.259259,7,Boston Celtics,0
21873,2024-06-09,Boston Celtics,2,105,0.256410,10,Dallas Mavericks,7,98,0.230769,6,Boston Celtics,0
21874,2024-06-12,Dallas Mavericks,7,99,0.360000,9,Boston Celtics,2,106,0.369565,17,Boston Celtics,1
21875,2024-06-14,Dallas Mavericks,7,122,0.405405,15,Boston Celtics,2,84,0.341463,14,Dallas Mavericks,0


In [92]:
dfIDtoStat3 = pd.DataFrame(columns=['date', 'team_id', 'team_points_total', '3P%', '3P_made'])
dfIDtoStat3

Unnamed: 0,date,team_id,team_points_total,3P%,3P_made


In [93]:
#populate dfIDtoStat3
for index, row in dfLim3.iterrows():
    date = row['date']
    homeTeam_id = row['homeTeamSubject_id']
    homeTeam_points_total = row['homeTeam_points_total']
    homeTeam_3P = row['homeTeam_3P']
    #added homeTeam3P_made
    homeTeam_3P_made = row['homeTeam_3P_made']
    awayTeam_id = row['awayTeamSubject_id']
    awayTeam_points_total = row['awayTeam_points_total']
    awayTeam_3P = row['homeTeam_3P']
    #added awayTeam3P_made
    awayTeam_3P_made = row['awayTeam_3P_made']
    #added these new variables into this
    dfIDtoStat3.loc[len(dfIDtoStat3)] = {"date": date, 'team_id': homeTeam_id, 'team_points_total':homeTeam_points_total, '3P%':homeTeam_3P, '3P_made':homeTeam_3P_made}
    dfIDtoStat3.loc[len(dfIDtoStat3)] = {"date": date, 'team_id': awayTeam_id, 'team_points_total':awayTeam_points_total, '3P%':awayTeam_3P, '3P_made':awayTeam_3P_made}
dfIDtoStat3

Unnamed: 0,date,team_id,team_points_total,3P%,3P_made
0,2007-04-18,29,101,0.363636,4
1,2007-04-18,11,91,0.363636,6
2,2007-04-18,28,119,0.368421,7
3,2007-04-18,23,122,0.368421,5
4,2007-04-18,21,75,0.142857,1
...,...,...,...,...,...
43749,2024-06-12,2,106,0.360000,17
43750,2024-06-14,7,122,0.405405,15
43751,2024-06-14,2,84,0.405405,14
43752,2024-06-17,2,106,0.333333,13


In [95]:
#reverse df order - should now start with 2014
dfLim3

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,homeTeam_3P_made,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,awayTeam_3P_made,winner,winner_binary
0,2007-04-18,Utah Jazz,29,101,0.363636,4,Houston Rockets,11,91,0.240000,6,Utah Jazz,0
1,2007-04-18,Toronto Raptors,28,119,0.368421,7,Philadelphia 76ers,23,122,0.555556,5,Philadelphia 76ers,1
2,2007-04-18,Seattle Supersonics,21,75,0.142857,1,Dallas Mavericks,7,106,0.307692,4,Dallas Mavericks,1
3,2007-04-18,San Antonio Spurs,27,77,0.250000,5,Denver Nuggets,8,100,0.423077,11,Denver Nuggets,1
4,2007-04-18,Sacramento Kings,26,106,0.280000,7,Los Angeles Lakers,14,117,0.142857,2,Los Angeles Lakers,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
21872,2024-06-06,Boston Celtics,2,107,0.380952,16,Dallas Mavericks,7,89,0.259259,7,Boston Celtics,0
21873,2024-06-09,Boston Celtics,2,105,0.256410,10,Dallas Mavericks,7,98,0.230769,6,Boston Celtics,0
21874,2024-06-12,Dallas Mavericks,7,99,0.360000,9,Boston Celtics,2,106,0.369565,17,Boston Celtics,1
21875,2024-06-14,Dallas Mavericks,7,122,0.405405,15,Boston Celtics,2,84,0.341463,14,Dallas Mavericks,0


In [96]:
#reverse order - should now start with 2014
#dfIDtoStat3 = dfIDtoStat3.iloc[::-1].reset_index(drop=True)
dfIDtoStat3

Unnamed: 0,date,team_id,team_points_total,3P%,3P_made
0,2007-04-18,29,101,0.363636,4
1,2007-04-18,11,91,0.363636,6
2,2007-04-18,28,119,0.368421,7
3,2007-04-18,23,122,0.368421,5
4,2007-04-18,21,75,0.142857,1
...,...,...,...,...,...
43749,2024-06-12,2,106,0.360000,17
43750,2024-06-14,7,122,0.405405,15
43751,2024-06-14,2,84,0.405405,14
43752,2024-06-17,2,106,0.333333,13


In [97]:
#X1.Shape is num data, window size, variables
X3, y3 = df_to_X_y(dfLim3, dfIDtoStat3, 5)
X3.shape, y3.shape

((21761, 5, 6), (21761,))

In [98]:
#70-15-15 split
X3_train, X3_temp, y3_train, y3_temp = train_test_split(X3, y3, test_size=0.3, random_state=42)
X3_val, X3_test, y3_val, y3_test = train_test_split(X3_temp, y3_temp, test_size=0.5, random_state=42)
X3_train.shape, y3_train.shape, X3_val.shape, y3_val.shape, X3_test.shape, y3_test.shape

((15232, 5, 6), (15232,), (3264, 5, 6), (3264,), (3265, 5, 6), (3265,))

In [99]:
model3 = Sequential()
#You need to change the input layer to reflect the amount of variables. first number is window size, second is variables
model3.add(InputLayer((5, 6)))
model3.add(LSTM(64, activation='tanh', return_sequences=True))
model3.add(LSTM(32, activation='tanh'))
model3.add(Dense(16, activation='relu'))
model3.add(Dense(8, activation='relu'))
model3.add(Dense(1, activation='sigmoid'))

model3.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_6 (LSTM)               (None, 5, 64)             18176     
                                                                 
 lstm_7 (LSTM)               (None, 32)                12416     
                                                                 
 dense_9 (Dense)             (None, 16)                528       
                                                                 
 dense_10 (Dense)            (None, 8)                 136       
                                                                 
 dense_11 (Dense)            (None, 1)                 9         
                                                                 
Total params: 31265 (122.13 KB)
Trainable params: 31265 (122.13 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [100]:
earlystopping = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)
model3.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [101]:
model3.fit(X3_train, y3_train, validation_data=(X3_val, y3_val), epochs=100, callbacks=[earlystopping])


Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100


<keras.src.callbacks.History at 0x1c76b0ff490>

In [102]:
y_pred = model3.predict(X3_test)
y_pred_binary = (y_pred > 0.5).astype(int)
print(accuracy_score(y3_test, y_pred_binary))

0.5947932618683002


In [103]:
dfLim4 = dfLim3.copy()  # Copy the original DataFrame

# Get the position of 'homeTeam_3P'
position = dfLim4.columns.get_loc('homeTeam_3P')

# Insert 'homeTeam_FG' after 'shomeTeam_3P'
dfLim4.insert(position + 1, 'homeTeam_FG', df['homeTeam_FG'])

position2 = dfLim4.columns.get_loc('awayTeam_3P')
# Insert 'awayTeam_FG' right after 'awayTeam_3P'
dfLim4.insert(position2 + 1, 'awayTeam_FG', df['awayTeam_FG'])

dfLim4

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,homeTeam_FG,homeTeam_3P_made,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,awayTeam_FG,awayTeam_3P_made,winner,winner_binary
0,2007-04-18,Utah Jazz,29,101,0.363636,36-73,4,Houston Rockets,11,91,0.240000,31-74,6,Utah Jazz,0
1,2007-04-18,Toronto Raptors,28,119,0.368421,46-86,7,Philadelphia 76ers,23,122,0.555556,44-83,5,Philadelphia 76ers,1
2,2007-04-18,Seattle Supersonics,21,75,0.142857,29-69,1,Dallas Mavericks,7,106,0.307692,43-81,4,Dallas Mavericks,1
3,2007-04-18,San Antonio Spurs,27,77,0.250000,29-81,5,Denver Nuggets,8,100,0.423077,42-85,11,Denver Nuggets,1
4,2007-04-18,Sacramento Kings,26,106,0.280000,40-83,7,Los Angeles Lakers,14,117,0.142857,45-79,2,Los Angeles Lakers,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21872,2024-06-06,Boston Celtics,2,107,0.380952,39-82,16,Dallas Mavericks,7,89,0.259259,35-84,7,Boston Celtics,0
21873,2024-06-09,Boston Celtics,2,105,0.256410,38-84,10,Dallas Mavericks,7,98,0.230769,38-80,6,Boston Celtics,0
21874,2024-06-12,Dallas Mavericks,7,99,0.360000,38-86,9,Boston Celtics,2,106,0.369565,38-82,17,Boston Celtics,1
21875,2024-06-14,Dallas Mavericks,7,122,0.405405,46-91,15,Boston Celtics,2,84,0.341463,29-80,14,Dallas Mavericks,0


In [104]:
dfLim4.insert(dfLim4.columns.get_loc('homeTeam_FG') + 1, 'homeTeam_FG_made', dfLim4['homeTeam_FG'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))
dfLim4.insert(dfLim4.columns.get_loc('awayTeam_FG') + 1, 'awayTeam_FG_made', dfLim4['awayTeam_FG'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))
dfLim4

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,homeTeam_FG,homeTeam_FG_made,homeTeam_3P_made,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,awayTeam_FG,awayTeam_FG_made,awayTeam_3P_made,winner,winner_binary
0,2007-04-18,Utah Jazz,29,101,0.363636,36-73,36,4,Houston Rockets,11,91,0.240000,31-74,31,6,Utah Jazz,0
1,2007-04-18,Toronto Raptors,28,119,0.368421,46-86,46,7,Philadelphia 76ers,23,122,0.555556,44-83,44,5,Philadelphia 76ers,1
2,2007-04-18,Seattle Supersonics,21,75,0.142857,29-69,29,1,Dallas Mavericks,7,106,0.307692,43-81,43,4,Dallas Mavericks,1
3,2007-04-18,San Antonio Spurs,27,77,0.250000,29-81,29,5,Denver Nuggets,8,100,0.423077,42-85,42,11,Denver Nuggets,1
4,2007-04-18,Sacramento Kings,26,106,0.280000,40-83,40,7,Los Angeles Lakers,14,117,0.142857,45-79,45,2,Los Angeles Lakers,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21872,2024-06-06,Boston Celtics,2,107,0.380952,39-82,39,16,Dallas Mavericks,7,89,0.259259,35-84,35,7,Boston Celtics,0
21873,2024-06-09,Boston Celtics,2,105,0.256410,38-84,38,10,Dallas Mavericks,7,98,0.230769,38-80,38,6,Boston Celtics,0
21874,2024-06-12,Dallas Mavericks,7,99,0.360000,38-86,38,9,Boston Celtics,2,106,0.369565,38-82,38,17,Boston Celtics,1
21875,2024-06-14,Dallas Mavericks,7,122,0.405405,46-91,46,15,Boston Celtics,2,84,0.341463,29-80,29,14,Dallas Mavericks,0


In [105]:
#Format as percentage
dfLim4['homeTeam_FG'] = dfLim4['homeTeam_FG'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim4['awayTeam_FG'] = dfLim4['awayTeam_FG'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim4

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,homeTeam_FG,homeTeam_FG_made,homeTeam_3P_made,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,awayTeam_FG,awayTeam_FG_made,awayTeam_3P_made,winner,winner_binary
0,2007-04-18,Utah Jazz,29,101,0.363636,0.493151,36,4,Houston Rockets,11,91,0.240000,0.418919,31,6,Utah Jazz,0
1,2007-04-18,Toronto Raptors,28,119,0.368421,0.534884,46,7,Philadelphia 76ers,23,122,0.555556,0.530120,44,5,Philadelphia 76ers,1
2,2007-04-18,Seattle Supersonics,21,75,0.142857,0.420290,29,1,Dallas Mavericks,7,106,0.307692,0.530864,43,4,Dallas Mavericks,1
3,2007-04-18,San Antonio Spurs,27,77,0.250000,0.358025,29,5,Denver Nuggets,8,100,0.423077,0.494118,42,11,Denver Nuggets,1
4,2007-04-18,Sacramento Kings,26,106,0.280000,0.481928,40,7,Los Angeles Lakers,14,117,0.142857,0.569620,45,2,Los Angeles Lakers,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21872,2024-06-06,Boston Celtics,2,107,0.380952,0.475610,39,16,Dallas Mavericks,7,89,0.259259,0.416667,35,7,Boston Celtics,0
21873,2024-06-09,Boston Celtics,2,105,0.256410,0.452381,38,10,Dallas Mavericks,7,98,0.230769,0.475000,38,6,Boston Celtics,0
21874,2024-06-12,Dallas Mavericks,7,99,0.360000,0.441860,38,9,Boston Celtics,2,106,0.369565,0.463415,38,17,Boston Celtics,1
21875,2024-06-14,Dallas Mavericks,7,122,0.405405,0.505495,46,15,Boston Celtics,2,84,0.341463,0.362500,29,14,Dallas Mavericks,0


In [106]:
dfIDtoStat4 = pd.DataFrame(columns=['date', 'team_id', 'team_points_total', '3P%', '3P_made', 'FG%', 'FG_made' ])
dfIDtoStat4

Unnamed: 0,date,team_id,team_points_total,3P%,3P_made,FG%,FG_made


In [107]:
for index, row in dfLim4.iterrows():
    date = row['date']
    homeTeam_id = row['homeTeamSubject_id']
    homeTeam_points_total = row['homeTeam_points_total']
    homeTeam_3P = row['homeTeam_3P']
    homeTeam_3P_made = row['homeTeam_3P_made']
     #added homeTeamFG_made
    homeTeam_FG = row['homeTeam_FG']
    homeTeam_FG_made = row['homeTeam_FG_made']
    awayTeam_id = row['awayTeamSubject_id']
    awayTeam_points_total = row['awayTeam_points_total']
    awayTeam_3P = row['homeTeam_3P']
    awayTeam_3P_made = row['awayTeam_3P_made']
    #added awayTeamFG
    awayTeam_FG = row['awayTeam_FG']
    awayTeam_FG_made = row['awayTeam_FG_made']
    #added these new variables into this
    dfIDtoStat4.loc[len(dfIDtoStat4)] = {"date": date, 'team_id': homeTeam_id, 'team_points_total':homeTeam_points_total, '3P%':homeTeam_3P, '3P_made':homeTeam_3P_made, 'FG%':homeTeam_FG, 'FG_made':homeTeam_FG_made}
    dfIDtoStat4.loc[len(dfIDtoStat4)] = {"date": date, 'team_id': awayTeam_id, 'team_points_total':awayTeam_points_total, '3P%':awayTeam_3P, '3P_made':awayTeam_3P_made, 'FG%':awayTeam_FG, 'FG_made':awayTeam_FG_made}
dfIDtoStat4

Unnamed: 0,date,team_id,team_points_total,3P%,3P_made,FG%,FG_made
0,2007-04-18,29,101,0.363636,4,0.493151,36
1,2007-04-18,11,91,0.363636,6,0.418919,31
2,2007-04-18,28,119,0.368421,7,0.534884,46
3,2007-04-18,23,122,0.368421,5,0.530120,44
4,2007-04-18,21,75,0.142857,1,0.420290,29
...,...,...,...,...,...,...,...
43749,2024-06-12,2,106,0.360000,17,0.463415,38
43750,2024-06-14,7,122,0.405405,15,0.505495,46
43751,2024-06-14,2,84,0.405405,14,0.362500,29
43752,2024-06-17,2,106,0.333333,13,0.426966,38


In [108]:
#X1.Shape is num data, window size, variables
X4, y4 = df_to_X_y(dfLim4, dfIDtoStat4, 6)
X4.shape, y4.shape

((21742, 6, 10), (21742,))

In [109]:
#70-15-15 split
X4_train, X4_temp, y4_train, y4_temp = train_test_split(X4, y4, test_size=0.3, random_state=42)
X4_val, X4_test, y4_val, y4_test = train_test_split(X4_temp, y4_temp, test_size=0.5, random_state=42)
X4_train.shape, y4_train.shape, X4_val.shape, y4_val.shape, X4_test.shape, y4_test.shape

((15219, 6, 10), (15219,), (3261, 6, 10), (3261,), (3262, 6, 10), (3262,))

In [114]:
model4 = Sequential()
#You need to change the input layer to reflect the amount of variables. first number is window size, second is variables
model4.add(InputLayer((6, 10)))
model4.add(LSTM(64, activation='tanh', return_sequences=True))
model4.add(LSTM(32, activation='tanh'))
model4.add(Dense(16, activation='relu'))
model4.add(Dense(8, activation='relu'))
model4.add(Dense(1, activation='sigmoid'))

model4.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_10 (LSTM)              (None, 6, 64)             19200     
                                                                 
 lstm_11 (LSTM)              (None, 32)                12416     
                                                                 
 dense_15 (Dense)            (None, 16)                528       
                                                                 
 dense_16 (Dense)            (None, 8)                 136       
                                                                 
 dense_17 (Dense)            (None, 1)                 9         
                                                                 
Total params: 32289 (126.13 KB)
Trainable params: 32289 (126.13 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [115]:
earlystopping = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)
model4.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [116]:
model4.fit(X4_train, y4_train, validation_data=(X4_val, y4_val), epochs=100, callbacks=[earlystopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x1c771e9acd0>

In [117]:
y_pred = model4.predict(X4_test)
y_pred_binary = (y_pred > 0.5).astype(int)
print(accuracy_score(y4_test, y_pred_binary))

0.5944206008583691


In [118]:
#Adding ast, reb, and winning percentages
dfLim5 = df[[
    "date",
    "homeTeam",
    "homeTeamSubject_id", 
    "homeTeam_points_total", 
    "homeTeam_3P",
    "homeTeam_FG",
    #added homeTeamReb and homeTeamAst
    "homeTeam_Total_Reb",
    "homeTeam_Ast",
    "homeWinPct",
    "awayTeam",
    "awayTeamSubject_id", 
    "awayTeam_points_total",
    "awayTeam_3P",
    "awayTeam_FG",
    #added awayTeamReb and awayTeamAst
    "awayTeam_Total_Reb",
    "awayTeam_Ast",
    "awayWinPct",
    "winner"
]]
dfLim5

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,homeTeam_FG,homeTeam_Total_Reb,homeTeam_Ast,homeWinPct,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,awayTeam_FG,awayTeam_Total_Reb,awayTeam_Ast,awayWinPct,winner
0,2007-04-18,Utah Jazz,29,101,4-11,36-73,45,23,100.000000,Houston Rockets,11,91,6-25,31-74,25,17,0.000000,Utah Jazz
1,2007-04-18,Toronto Raptors,28,119,7-19,46-86,33,32,0.000000,Philadelphia 76ers,23,122,5-9,44-83,41,22,100.000000,Philadelphia 76ers
2,2007-04-18,Seattle Supersonics,21,75,1-7,29-69,31,13,0.000000,Dallas Mavericks,7,106,4-13,43-81,38,26,100.000000,Dallas Mavericks
3,2007-04-18,San Antonio Spurs,27,77,5-20,29-81,39,14,0.000000,Denver Nuggets,8,100,11-26,42-85,55,28,100.000000,Denver Nuggets
4,2007-04-18,Sacramento Kings,26,106,7-25,40-83,30,21,0.000000,Los Angeles Lakers,14,117,2-14,45-79,47,19,100.000000,Los Angeles Lakers
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21872,2024-06-06,Boston Celtics,2,107,16-42,39-82,47,23,79.381443,Dallas Mavericks,7,89,7-27,35-84,43,9,62.000000,Boston Celtics
21873,2024-06-09,Boston Celtics,2,105,10-39,38-84,41,29,79.591837,Dallas Mavericks,7,98,6-26,38-80,43,21,61.386139,Boston Celtics
21874,2024-06-12,Dallas Mavericks,7,99,9-25,38-86,43,15,60.784314,Boston Celtics,2,106,17-46,38-82,36,26,79.797980,Boston Celtics
21875,2024-06-14,Dallas Mavericks,7,122,15-37,46-91,52,21,61.165049,Boston Celtics,2,84,14-41,29-80,31,18,79.000000,Dallas Mavericks


In [243]:
#No need to reformat dates with new dataset

In [119]:
dfLim5['winner_binary'] = (dfLim5['winner'] == df['awayTeam']).astype(int)
dfLim5

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim5['winner_binary'] = (dfLim5['winner'] == df['awayTeam']).astype(int)


Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,homeTeam_FG,homeTeam_Total_Reb,homeTeam_Ast,homeWinPct,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,awayTeam_FG,awayTeam_Total_Reb,awayTeam_Ast,awayWinPct,winner,winner_binary
0,2007-04-18,Utah Jazz,29,101,4-11,36-73,45,23,100.000000,Houston Rockets,11,91,6-25,31-74,25,17,0.000000,Utah Jazz,0
1,2007-04-18,Toronto Raptors,28,119,7-19,46-86,33,32,0.000000,Philadelphia 76ers,23,122,5-9,44-83,41,22,100.000000,Philadelphia 76ers,1
2,2007-04-18,Seattle Supersonics,21,75,1-7,29-69,31,13,0.000000,Dallas Mavericks,7,106,4-13,43-81,38,26,100.000000,Dallas Mavericks,1
3,2007-04-18,San Antonio Spurs,27,77,5-20,29-81,39,14,0.000000,Denver Nuggets,8,100,11-26,42-85,55,28,100.000000,Denver Nuggets,1
4,2007-04-18,Sacramento Kings,26,106,7-25,40-83,30,21,0.000000,Los Angeles Lakers,14,117,2-14,45-79,47,19,100.000000,Los Angeles Lakers,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21872,2024-06-06,Boston Celtics,2,107,16-42,39-82,47,23,79.381443,Dallas Mavericks,7,89,7-27,35-84,43,9,62.000000,Boston Celtics,0
21873,2024-06-09,Boston Celtics,2,105,10-39,38-84,41,29,79.591837,Dallas Mavericks,7,98,6-26,38-80,43,21,61.386139,Boston Celtics,0
21874,2024-06-12,Dallas Mavericks,7,99,9-25,38-86,43,15,60.784314,Boston Celtics,2,106,17-46,38-82,36,26,79.797980,Boston Celtics,1
21875,2024-06-14,Dallas Mavericks,7,122,15-37,46-91,52,21,61.165049,Boston Celtics,2,84,14-41,29-80,31,18,79.000000,Dallas Mavericks,0


In [120]:
#adding 3P made
dfLim5.insert(dfLim5.columns.get_loc('homeTeam_3P') + 1, 'homeTeam_3P_made', dfLim5['homeTeam_3P'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))
dfLim5.insert(dfLim5.columns.get_loc('awayTeam_3P') + 1, 'awayTeam_3P_made', dfLim5['awayTeam_3P'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))
dfLim5

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,homeTeam_3P_made,homeTeam_FG,homeTeam_Total_Reb,homeTeam_Ast,homeWinPct,...,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,awayTeam_3P_made,awayTeam_FG,awayTeam_Total_Reb,awayTeam_Ast,awayWinPct,winner,winner_binary
0,2007-04-18,Utah Jazz,29,101,4-11,4,36-73,45,23,100.000000,...,11,91,6-25,6,31-74,25,17,0.000000,Utah Jazz,0
1,2007-04-18,Toronto Raptors,28,119,7-19,7,46-86,33,32,0.000000,...,23,122,5-9,5,44-83,41,22,100.000000,Philadelphia 76ers,1
2,2007-04-18,Seattle Supersonics,21,75,1-7,1,29-69,31,13,0.000000,...,7,106,4-13,4,43-81,38,26,100.000000,Dallas Mavericks,1
3,2007-04-18,San Antonio Spurs,27,77,5-20,5,29-81,39,14,0.000000,...,8,100,11-26,11,42-85,55,28,100.000000,Denver Nuggets,1
4,2007-04-18,Sacramento Kings,26,106,7-25,7,40-83,30,21,0.000000,...,14,117,2-14,2,45-79,47,19,100.000000,Los Angeles Lakers,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21872,2024-06-06,Boston Celtics,2,107,16-42,16,39-82,47,23,79.381443,...,7,89,7-27,7,35-84,43,9,62.000000,Boston Celtics,0
21873,2024-06-09,Boston Celtics,2,105,10-39,10,38-84,41,29,79.591837,...,7,98,6-26,6,38-80,43,21,61.386139,Boston Celtics,0
21874,2024-06-12,Dallas Mavericks,7,99,9-25,9,38-86,43,15,60.784314,...,2,106,17-46,17,38-82,36,26,79.797980,Boston Celtics,1
21875,2024-06-14,Dallas Mavericks,7,122,15-37,15,46-91,52,21,61.165049,...,2,84,14-41,14,29-80,31,18,79.000000,Dallas Mavericks,0


In [121]:
#adding FG made
dfLim5.insert(dfLim5.columns.get_loc('homeTeam_FG') + 1, 'homeTeam_FG_made', dfLim5['homeTeam_FG'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))
dfLim5.insert(dfLim5.columns.get_loc('awayTeam_FG') + 1, 'awayTeam_FG_made', dfLim5['awayTeam_FG'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))
dfLim5

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,homeTeam_3P_made,homeTeam_FG,homeTeam_FG_made,homeTeam_Total_Reb,homeTeam_Ast,...,awayTeam_points_total,awayTeam_3P,awayTeam_3P_made,awayTeam_FG,awayTeam_FG_made,awayTeam_Total_Reb,awayTeam_Ast,awayWinPct,winner,winner_binary
0,2007-04-18,Utah Jazz,29,101,4-11,4,36-73,36,45,23,...,91,6-25,6,31-74,31,25,17,0.000000,Utah Jazz,0
1,2007-04-18,Toronto Raptors,28,119,7-19,7,46-86,46,33,32,...,122,5-9,5,44-83,44,41,22,100.000000,Philadelphia 76ers,1
2,2007-04-18,Seattle Supersonics,21,75,1-7,1,29-69,29,31,13,...,106,4-13,4,43-81,43,38,26,100.000000,Dallas Mavericks,1
3,2007-04-18,San Antonio Spurs,27,77,5-20,5,29-81,29,39,14,...,100,11-26,11,42-85,42,55,28,100.000000,Denver Nuggets,1
4,2007-04-18,Sacramento Kings,26,106,7-25,7,40-83,40,30,21,...,117,2-14,2,45-79,45,47,19,100.000000,Los Angeles Lakers,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21872,2024-06-06,Boston Celtics,2,107,16-42,16,39-82,39,47,23,...,89,7-27,7,35-84,35,43,9,62.000000,Boston Celtics,0
21873,2024-06-09,Boston Celtics,2,105,10-39,10,38-84,38,41,29,...,98,6-26,6,38-80,38,43,21,61.386139,Boston Celtics,0
21874,2024-06-12,Dallas Mavericks,7,99,9-25,9,38-86,38,43,15,...,106,17-46,17,38-82,38,36,26,79.797980,Boston Celtics,1
21875,2024-06-14,Dallas Mavericks,7,122,15-37,15,46-91,46,52,21,...,84,14-41,14,29-80,29,31,18,79.000000,Dallas Mavericks,0


In [122]:
#Format as percentage
dfLim5['homeTeam_3P'] = dfLim5['homeTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim5['awayTeam_3P'] = dfLim5['awayTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim5['homeTeam_FG'] = dfLim5['homeTeam_FG'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim5['awayTeam_FG'] = dfLim5['awayTeam_FG'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)

dfLim5

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim5['homeTeam_3P'] = dfLim5['homeTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim5['awayTeam_3P'] = dfLim5['awayTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https:

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,homeTeam_3P_made,homeTeam_FG,homeTeam_FG_made,homeTeam_Total_Reb,homeTeam_Ast,...,awayTeam_points_total,awayTeam_3P,awayTeam_3P_made,awayTeam_FG,awayTeam_FG_made,awayTeam_Total_Reb,awayTeam_Ast,awayWinPct,winner,winner_binary
0,2007-04-18,Utah Jazz,29,101,0.363636,4,0.493151,36,45,23,...,91,0.240000,6,0.418919,31,25,17,0.000000,Utah Jazz,0
1,2007-04-18,Toronto Raptors,28,119,0.368421,7,0.534884,46,33,32,...,122,0.555556,5,0.530120,44,41,22,100.000000,Philadelphia 76ers,1
2,2007-04-18,Seattle Supersonics,21,75,0.142857,1,0.420290,29,31,13,...,106,0.307692,4,0.530864,43,38,26,100.000000,Dallas Mavericks,1
3,2007-04-18,San Antonio Spurs,27,77,0.250000,5,0.358025,29,39,14,...,100,0.423077,11,0.494118,42,55,28,100.000000,Denver Nuggets,1
4,2007-04-18,Sacramento Kings,26,106,0.280000,7,0.481928,40,30,21,...,117,0.142857,2,0.569620,45,47,19,100.000000,Los Angeles Lakers,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21872,2024-06-06,Boston Celtics,2,107,0.380952,16,0.475610,39,47,23,...,89,0.259259,7,0.416667,35,43,9,62.000000,Boston Celtics,0
21873,2024-06-09,Boston Celtics,2,105,0.256410,10,0.452381,38,41,29,...,98,0.230769,6,0.475000,38,43,21,61.386139,Boston Celtics,0
21874,2024-06-12,Dallas Mavericks,7,99,0.360000,9,0.441860,38,43,15,...,106,0.369565,17,0.463415,38,36,26,79.797980,Boston Celtics,1
21875,2024-06-14,Dallas Mavericks,7,122,0.405405,15,0.505495,46,52,21,...,84,0.341463,14,0.362500,29,31,18,79.000000,Dallas Mavericks,0


In [123]:
dfIDtoStat5 = pd.DataFrame(columns=['date', 'team_id', 'team_points_total', '3P%', '3P_made', 'FG%', 'FG_made', 'Total_Reb', 'team_Ast', 'WinPct'])
dfIDtoStat5

Unnamed: 0,date,team_id,team_points_total,3P%,3P_made,FG%,FG_made,Total_Reb,team_Ast,WinPct


In [128]:
# Create DataFrame for home team statistics
home_df = dfLim5[['date', 'homeTeamSubject_id', 'homeTeam_points_total', 'homeTeam_3P', 'homeTeam_3P_made', 'homeTeam_FG', 'homeTeam_FG_made', 'homeTeam_Total_Reb', 'homeTeam_Ast', 'homeWinPct']].copy()
home_df.columns = ['date', 'team_id', 'team_points_total', '3P%', '3P_made', 'FG%', 'FG_made', 'Total_Reb', 'team_Ast', 'WinPct']

# Create DataFrame for away team statistics
away_df = dfLim5[['date', 'awayTeamSubject_id', 'awayTeam_points_total', 'awayTeam_3P', 'awayTeam_3P_made', 'awayTeam_FG', 'awayTeam_FG_made', 'awayTeam_Total_Reb', 'awayTeam_Ast', 'awayWinPct']].copy()
away_df.columns = ['date', 'team_id', 'team_points_total', '3P%', '3P_made', 'FG%', 'FG_made', 'Total_Reb', 'team_Ast', 'WinPct']

# Combine both DataFrames
dfIDtoStat5 = pd.concat([home_df, away_df], ignore_index=True)

# Display the resulting DataFrame
dfIDtoStat5

Unnamed: 0,date,team_id,team_points_total,3P%,3P_made,FG%,FG_made,Total_Reb,team_Ast,WinPct
0,2007-04-18,29,101,0.363636,4,0.493151,36,45,23,100.000000
1,2007-04-18,28,119,0.368421,7,0.534884,46,33,32,0.000000
2,2007-04-18,21,75,0.142857,1,0.420290,29,31,13,0.000000
3,2007-04-18,27,77,0.250000,5,0.358025,29,39,14,0.000000
4,2007-04-18,26,106,0.280000,7,0.481928,40,30,21,0.000000
...,...,...,...,...,...,...,...,...,...,...
43749,2024-06-06,7,89,0.259259,7,0.416667,35,43,9,62.000000
43750,2024-06-09,7,98,0.230769,6,0.475000,38,43,21,61.386139
43751,2024-06-12,2,106,0.369565,17,0.463415,38,36,26,79.797980
43752,2024-06-14,2,84,0.341463,14,0.362500,29,31,18,79.000000


In [250]:
#No need to reverse df becasue new dataset is already reversed

In [131]:
#X1.Shape is num data, window size, variables
X5, y5 = df_to_X_y(dfLim5, dfIDtoStat5, 10)
X5.shape, y5.shape

((21669, 10, 16), (21669,))

In [132]:
#70-15-15 split
X5_train, X5_temp, y5_train, y5_temp = train_test_split(X5, y5, test_size=0.3, random_state=42)
X5_val, X5_test, y5_val, y5_test = train_test_split(X5_temp, y5_temp, test_size=0.5, random_state=42)
X5_train.shape, y5_train.shape, X5_val.shape, y5_val.shape, X5_test.shape, y5_test.shape

((15168, 10, 16), (15168,), (3250, 10, 16), (3250,), (3251, 10, 16), (3251,))

In [133]:
model5 = Sequential()
#You need to change the input layer to reflect the amount of variables. first number is window size, second is variables
model5.add(InputLayer((10, 16)))
model5.add(LSTM(64, activation='tanh', return_sequences=True))
model5.add(LSTM(32, activation='tanh'))
model5.add(Dense(16, activation='relu'))
model5.add(Dense(8, activation='relu'))
model5.add(Dense(1, activation='sigmoid'))

model5.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_12 (LSTM)              (None, 10, 64)            20736     
                                                                 
 lstm_13 (LSTM)              (None, 32)                12416     
                                                                 
 dense_18 (Dense)            (None, 16)                528       
                                                                 
 dense_19 (Dense)            (None, 8)                 136       
                                                                 
 dense_20 (Dense)            (None, 1)                 9         
                                                                 
Total params: 33825 (132.13 KB)
Trainable params: 33825 (132.13 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [134]:
earlystopping = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)
model5.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model5.fit(X5_train, y5_train, validation_data=(X5_val, y5_val), epochs=100, callbacks=[earlystopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x1c771c80fd0>

In [135]:
y_pred = model5.predict(X5_test)
y_pred_binary = (y_pred > 0.5).astype(int)
print(accuracy_score(y5_test, y_pred_binary))

0.6637957551522609


In [136]:
#Adding ast, reb, and winning percentages
dfLim6 = df[[
    "date",
    "homeTeam",
    "homeTeamSubject_id", 
    "homeTeam_points_total", 
    #Added Home steal and block
    "homeTeam_Stl",
    "homeTeam_Blk",
    "homeTeam_3P",
    "homeTeam_FG",
    "homeTeam_Total_Reb",
    "homeTeam_Ast",
    "homeWinPct",
    "awayTeam",
    "awayTeamSubject_id", 
    "awayTeam_points_total",
    #added away steal and block
    "awayTeam_Stl",
    "awayTeam_Blk",
    "awayTeam_3P",
    "awayTeam_FG",
    "awayTeam_Total_Reb",
    "awayTeam_Ast",
    "awayWinPct",
    "winner"
]]
dfLim6

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_Stl,homeTeam_Blk,homeTeam_3P,homeTeam_FG,homeTeam_Total_Reb,homeTeam_Ast,...,awayTeamSubject_id,awayTeam_points_total,awayTeam_Stl,awayTeam_Blk,awayTeam_3P,awayTeam_FG,awayTeam_Total_Reb,awayTeam_Ast,awayWinPct,winner
0,2007-04-18,Utah Jazz,29,101,5,0,4-11,36-73,45,23,...,11,91,8,2,6-25,31-74,25,17,0.000000,Utah Jazz
1,2007-04-18,Toronto Raptors,28,119,8,3,7-19,46-86,33,32,...,23,122,4,5,5-9,44-83,41,22,100.000000,Philadelphia 76ers
2,2007-04-18,Seattle Supersonics,21,75,5,3,1-7,29-69,31,13,...,7,106,7,5,4-13,43-81,38,26,100.000000,Dallas Mavericks
3,2007-04-18,San Antonio Spurs,27,77,7,3,5-20,29-81,39,14,...,8,100,5,3,11-26,42-85,55,28,100.000000,Denver Nuggets
4,2007-04-18,Sacramento Kings,26,106,5,3,7-25,40-83,30,21,...,14,117,6,6,2-14,45-79,47,19,100.000000,Los Angeles Lakers
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21872,2024-06-06,Boston Celtics,2,107,6,9,16-42,39-82,47,23,...,7,89,8,1,7-27,35-84,43,9,62.000000,Boston Celtics
21873,2024-06-09,Boston Celtics,2,105,10,5,10-39,38-84,41,29,...,7,98,5,3,6-26,38-80,43,21,61.386139,Boston Celtics
21874,2024-06-12,Dallas Mavericks,7,99,5,1,9-25,38-86,43,15,...,2,106,4,6,17-46,38-82,36,26,79.797980,Boston Celtics
21875,2024-06-14,Dallas Mavericks,7,122,7,2,15-37,46-91,52,21,...,2,84,2,5,14-41,29-80,31,18,79.000000,Dallas Mavericks


In [137]:
dfLim6['winner_binary'] = (dfLim6['winner'] == df['awayTeam']).astype(int)
dfLim6

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim6['winner_binary'] = (dfLim6['winner'] == df['awayTeam']).astype(int)


Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_Stl,homeTeam_Blk,homeTeam_3P,homeTeam_FG,homeTeam_Total_Reb,homeTeam_Ast,...,awayTeam_points_total,awayTeam_Stl,awayTeam_Blk,awayTeam_3P,awayTeam_FG,awayTeam_Total_Reb,awayTeam_Ast,awayWinPct,winner,winner_binary
0,2007-04-18,Utah Jazz,29,101,5,0,4-11,36-73,45,23,...,91,8,2,6-25,31-74,25,17,0.000000,Utah Jazz,0
1,2007-04-18,Toronto Raptors,28,119,8,3,7-19,46-86,33,32,...,122,4,5,5-9,44-83,41,22,100.000000,Philadelphia 76ers,1
2,2007-04-18,Seattle Supersonics,21,75,5,3,1-7,29-69,31,13,...,106,7,5,4-13,43-81,38,26,100.000000,Dallas Mavericks,1
3,2007-04-18,San Antonio Spurs,27,77,7,3,5-20,29-81,39,14,...,100,5,3,11-26,42-85,55,28,100.000000,Denver Nuggets,1
4,2007-04-18,Sacramento Kings,26,106,5,3,7-25,40-83,30,21,...,117,6,6,2-14,45-79,47,19,100.000000,Los Angeles Lakers,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21872,2024-06-06,Boston Celtics,2,107,6,9,16-42,39-82,47,23,...,89,8,1,7-27,35-84,43,9,62.000000,Boston Celtics,0
21873,2024-06-09,Boston Celtics,2,105,10,5,10-39,38-84,41,29,...,98,5,3,6-26,38-80,43,21,61.386139,Boston Celtics,0
21874,2024-06-12,Dallas Mavericks,7,99,5,1,9-25,38-86,43,15,...,106,4,6,17-46,38-82,36,26,79.797980,Boston Celtics,1
21875,2024-06-14,Dallas Mavericks,7,122,7,2,15-37,46-91,52,21,...,84,2,5,14-41,29-80,31,18,79.000000,Dallas Mavericks,0


In [138]:
dfLim6.insert(dfLim6.columns.get_loc('homeTeam_3P') + 1, 'homeTeam_3P_made', dfLim6['homeTeam_3P'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))
dfLim6.insert(dfLim6.columns.get_loc('awayTeam_3P') + 1, 'awayTeam_3P_made', dfLim6['awayTeam_3P'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))

#adding FG made
dfLim6.insert(dfLim6.columns.get_loc('homeTeam_FG') + 1, 'homeTeam_FG_made', dfLim6['homeTeam_FG'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))
dfLim6.insert(dfLim6.columns.get_loc('awayTeam_FG') + 1, 'awayTeam_FG_made', dfLim6['awayTeam_FG'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))
dfLim6

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_Stl,homeTeam_Blk,homeTeam_3P,homeTeam_3P_made,homeTeam_FG,homeTeam_FG_made,...,awayTeam_Blk,awayTeam_3P,awayTeam_3P_made,awayTeam_FG,awayTeam_FG_made,awayTeam_Total_Reb,awayTeam_Ast,awayWinPct,winner,winner_binary
0,2007-04-18,Utah Jazz,29,101,5,0,4-11,4,36-73,36,...,2,6-25,6,31-74,31,25,17,0.000000,Utah Jazz,0
1,2007-04-18,Toronto Raptors,28,119,8,3,7-19,7,46-86,46,...,5,5-9,5,44-83,44,41,22,100.000000,Philadelphia 76ers,1
2,2007-04-18,Seattle Supersonics,21,75,5,3,1-7,1,29-69,29,...,5,4-13,4,43-81,43,38,26,100.000000,Dallas Mavericks,1
3,2007-04-18,San Antonio Spurs,27,77,7,3,5-20,5,29-81,29,...,3,11-26,11,42-85,42,55,28,100.000000,Denver Nuggets,1
4,2007-04-18,Sacramento Kings,26,106,5,3,7-25,7,40-83,40,...,6,2-14,2,45-79,45,47,19,100.000000,Los Angeles Lakers,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21872,2024-06-06,Boston Celtics,2,107,6,9,16-42,16,39-82,39,...,1,7-27,7,35-84,35,43,9,62.000000,Boston Celtics,0
21873,2024-06-09,Boston Celtics,2,105,10,5,10-39,10,38-84,38,...,3,6-26,6,38-80,38,43,21,61.386139,Boston Celtics,0
21874,2024-06-12,Dallas Mavericks,7,99,5,1,9-25,9,38-86,38,...,6,17-46,17,38-82,38,36,26,79.797980,Boston Celtics,1
21875,2024-06-14,Dallas Mavericks,7,122,7,2,15-37,15,46-91,46,...,5,14-41,14,29-80,29,31,18,79.000000,Dallas Mavericks,0


In [139]:
#Format as percentage
dfLim6['homeTeam_3P'] = dfLim6['homeTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim6['awayTeam_3P'] = dfLim6['awayTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim6['homeTeam_FG'] = dfLim6['homeTeam_FG'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim6['awayTeam_FG'] = dfLim6['awayTeam_FG'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)

dfLim6

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim6['homeTeam_3P'] = dfLim6['homeTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim6['awayTeam_3P'] = dfLim6['awayTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https:

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_Stl,homeTeam_Blk,homeTeam_3P,homeTeam_3P_made,homeTeam_FG,homeTeam_FG_made,...,awayTeam_Blk,awayTeam_3P,awayTeam_3P_made,awayTeam_FG,awayTeam_FG_made,awayTeam_Total_Reb,awayTeam_Ast,awayWinPct,winner,winner_binary
0,2007-04-18,Utah Jazz,29,101,5,0,0.363636,4,0.493151,36,...,2,0.240000,6,0.418919,31,25,17,0.000000,Utah Jazz,0
1,2007-04-18,Toronto Raptors,28,119,8,3,0.368421,7,0.534884,46,...,5,0.555556,5,0.530120,44,41,22,100.000000,Philadelphia 76ers,1
2,2007-04-18,Seattle Supersonics,21,75,5,3,0.142857,1,0.420290,29,...,5,0.307692,4,0.530864,43,38,26,100.000000,Dallas Mavericks,1
3,2007-04-18,San Antonio Spurs,27,77,7,3,0.250000,5,0.358025,29,...,3,0.423077,11,0.494118,42,55,28,100.000000,Denver Nuggets,1
4,2007-04-18,Sacramento Kings,26,106,5,3,0.280000,7,0.481928,40,...,6,0.142857,2,0.569620,45,47,19,100.000000,Los Angeles Lakers,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21872,2024-06-06,Boston Celtics,2,107,6,9,0.380952,16,0.475610,39,...,1,0.259259,7,0.416667,35,43,9,62.000000,Boston Celtics,0
21873,2024-06-09,Boston Celtics,2,105,10,5,0.256410,10,0.452381,38,...,3,0.230769,6,0.475000,38,43,21,61.386139,Boston Celtics,0
21874,2024-06-12,Dallas Mavericks,7,99,5,1,0.360000,9,0.441860,38,...,6,0.369565,17,0.463415,38,36,26,79.797980,Boston Celtics,1
21875,2024-06-14,Dallas Mavericks,7,122,7,2,0.405405,15,0.505495,46,...,5,0.341463,14,0.362500,29,31,18,79.000000,Dallas Mavericks,0


In [140]:
dfIDtoStat6 = pd.DataFrame(columns=['date', 'team_id', 'team_points_total', '3P%', '3P_made', 'FG%', 'FG_made', 'Total_Reb', 'team_Ast', 'team_Stl', 'team_Blk', 'WinPct'])
dfIDtoStat6

Unnamed: 0,date,team_id,team_points_total,3P%,3P_made,FG%,FG_made,Total_Reb,team_Ast,team_Stl,team_Blk,WinPct


In [141]:
# Create DataFrame for home team statistics
home_df6 = dfLim6[['date', 'homeTeamSubject_id', 'homeTeam_points_total', 'homeTeam_3P', 'homeTeam_3P_made', 'homeTeam_FG', 'homeTeam_FG_made', 'homeTeam_Total_Reb', 'homeTeam_Ast', 'homeTeam_Stl', 'homeTeam_Blk','homeWinPct']].copy()
home_df6.columns = ['date', 'team_id', 'team_points_total', '3P%', '3P_made', 'FG%', 'FG_made', 'Total_Reb', 'team_Ast', 'team_Stl', 'team_Blk', 'WinPct']

# Create DataFrame for away team statistics
away_df6 = dfLim6[['date', 'awayTeamSubject_id', 'awayTeam_points_total', 'awayTeam_3P', 'awayTeam_3P_made', 'awayTeam_FG', 'awayTeam_FG_made', 'awayTeam_Total_Reb', 'awayTeam_Ast', 'awayTeam_Stl', 'homeTeam_Blk','awayWinPct']].copy()
away_df6.columns = ['date', 'team_id', 'team_points_total', '3P%', '3P_made', 'FG%', 'FG_made', 'Total_Reb', 'team_Ast','team_Stl', 'team_Blk', 'WinPct']

# Combine both DataFrames
dfIDtoStat6 = pd.concat([home_df, away_df], ignore_index=True)

# Display the resulting DataFrame
dfIDtoStat6

Unnamed: 0,date,team_id,team_points_total,3P%,3P_made,FG%,FG_made,Total_Reb,team_Ast,WinPct
0,2007-04-18,29,101,0.363636,4,0.493151,36,45,23,100.000000
1,2007-04-18,28,119,0.368421,7,0.534884,46,33,32,0.000000
2,2007-04-18,21,75,0.142857,1,0.420290,29,31,13,0.000000
3,2007-04-18,27,77,0.250000,5,0.358025,29,39,14,0.000000
4,2007-04-18,26,106,0.280000,7,0.481928,40,30,21,0.000000
...,...,...,...,...,...,...,...,...,...,...
43749,2024-06-06,7,89,0.259259,7,0.416667,35,43,9,62.000000
43750,2024-06-09,7,98,0.230769,6,0.475000,38,43,21,61.386139
43751,2024-06-12,2,106,0.369565,17,0.463415,38,36,26,79.797980
43752,2024-06-14,2,84,0.341463,14,0.362500,29,31,18,79.000000


In [142]:
#X1.Shape is num data, window size, variables
X6, y6 = df_to_X_y(dfLim6, dfIDtoStat6, 14)
X6.shape, y6.shape

((21599, 14, 16), (21599,))

In [143]:
#70-15-15 split
X6_train, X6_temp, y6_train, y6_temp = train_test_split(X6, y6, test_size=0.3, random_state=42)
X6_val, X6_test, y6_val, y6_test = train_test_split(X6_temp, y6_temp, test_size=0.5, random_state=42)
X6_train.shape, y6_train.shape, X6_val.shape, y6_val.shape, X6_test.shape, y6_test.shape

((15119, 14, 16), (15119,), (3240, 14, 16), (3240,), (3240, 14, 16), (3240,))

In [144]:
model6 = Sequential()
#You need to change the input layer to reflect the amount of variables. first number is window size, second is variables
model6.add(InputLayer((14, 16)))
model6.add(LSTM(64, activation='tanh', return_sequences=True))
model6.add(LSTM(32, activation='tanh'))
model6.add(Dense(16, activation='relu'))
model6.add(Dense(8, activation='relu'))
model6.add(Dense(1, activation='sigmoid'))

model6.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_14 (LSTM)              (None, 14, 64)            20736     
                                                                 
 lstm_15 (LSTM)              (None, 32)                12416     
                                                                 
 dense_21 (Dense)            (None, 16)                528       
                                                                 
 dense_22 (Dense)            (None, 8)                 136       
                                                                 
 dense_23 (Dense)            (None, 1)                 9         
                                                                 
Total params: 33825 (132.13 KB)
Trainable params: 33825 (132.13 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [145]:
earlystopping = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)
model6.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model6.fit(X6_train, y6_train, validation_data=(X6_val, y6_val), epochs=100, callbacks=[earlystopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x1c771e90af0>

In [146]:
y_pred = model6.predict(X6_test)
y_pred_binary = (y_pred > 0.5).astype(int)
print(accuracy_score(y6_test, y_pred_binary))

0.6459876543209877


In [147]:
#Adding quarter point data
dfLim7 = df[[
    "date",
    "homeTeam",
    "homeTeamSubject_id", 
    "homeTeam_points_total", 
    "homeTeam_Stl",
    "homeTeam_Blk",
    #added home team quarter scores
    "homeTeam_points_q1",
    "homeTeam_points_q2",
    "homeTeam_points_q3",
    "homeTeam_points_q4",
    "homeTeam_3P",
    "homeTeam_FG",
    "homeTeam_Total_Reb",
    "homeTeam_Ast",
    "homeWinPct",
    "awayTeam",
    "awayTeamSubject_id", 
    "awayTeam_points_total",
    "awayTeam_Stl",
    "awayTeam_Blk",
     #added away team quarter scores
    "awayTeam_points_q1",
    "awayTeam_points_q2",
    "awayTeam_points_q3",
    "awayTeam_points_q4",
    "awayTeam_3P",
    "awayTeam_FG",
    "awayTeam_Total_Reb",
    "awayTeam_Ast",
    "awayWinPct",
    "winner"
]]
dfLim7

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_Stl,homeTeam_Blk,homeTeam_points_q1,homeTeam_points_q2,homeTeam_points_q3,homeTeam_points_q4,...,awayTeam_points_q1,awayTeam_points_q2,awayTeam_points_q3,awayTeam_points_q4,awayTeam_3P,awayTeam_FG,awayTeam_Total_Reb,awayTeam_Ast,awayWinPct,winner
0,2007-04-18,Utah Jazz,29,101,5,0,20,30,25,26,...,32,15,24,20,6-25,31-74,25,17,0.000000,Utah Jazz
1,2007-04-18,Toronto Raptors,28,119,8,3,28,29,29,33,...,25,39,22,36,5-9,44-83,41,22,100.000000,Philadelphia 76ers
2,2007-04-18,Seattle Supersonics,21,75,5,3,16,21,22,16,...,33,28,28,17,4-13,43-81,38,26,100.000000,Dallas Mavericks
3,2007-04-18,San Antonio Spurs,27,77,7,3,16,24,22,15,...,25,21,33,21,11-26,42-85,55,28,100.000000,Denver Nuggets
4,2007-04-18,Sacramento Kings,26,106,5,3,30,27,23,26,...,28,25,35,29,2-14,45-79,47,19,100.000000,Los Angeles Lakers
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21872,2024-06-06,Boston Celtics,2,107,6,9,37,26,23,21,...,20,22,24,23,7-27,35-84,43,9,62.000000,Boston Celtics
21873,2024-06-09,Boston Celtics,2,105,10,5,25,29,29,22,...,28,23,23,24,6-26,38-80,43,21,61.386139,Boston Celtics
21874,2024-06-12,Dallas Mavericks,7,99,5,1,31,20,19,29,...,30,20,35,21,17-46,38-82,36,26,79.797980,Boston Celtics
21875,2024-06-14,Dallas Mavericks,7,122,7,2,34,27,31,30,...,21,14,25,24,14-41,29-80,31,18,79.000000,Dallas Mavericks


In [148]:
dfLim7['winner_binary'] = (dfLim7['winner'] == df['awayTeam']).astype(int)
dfLim7

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim7['winner_binary'] = (dfLim7['winner'] == df['awayTeam']).astype(int)


Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_Stl,homeTeam_Blk,homeTeam_points_q1,homeTeam_points_q2,homeTeam_points_q3,homeTeam_points_q4,...,awayTeam_points_q2,awayTeam_points_q3,awayTeam_points_q4,awayTeam_3P,awayTeam_FG,awayTeam_Total_Reb,awayTeam_Ast,awayWinPct,winner,winner_binary
0,2007-04-18,Utah Jazz,29,101,5,0,20,30,25,26,...,15,24,20,6-25,31-74,25,17,0.000000,Utah Jazz,0
1,2007-04-18,Toronto Raptors,28,119,8,3,28,29,29,33,...,39,22,36,5-9,44-83,41,22,100.000000,Philadelphia 76ers,1
2,2007-04-18,Seattle Supersonics,21,75,5,3,16,21,22,16,...,28,28,17,4-13,43-81,38,26,100.000000,Dallas Mavericks,1
3,2007-04-18,San Antonio Spurs,27,77,7,3,16,24,22,15,...,21,33,21,11-26,42-85,55,28,100.000000,Denver Nuggets,1
4,2007-04-18,Sacramento Kings,26,106,5,3,30,27,23,26,...,25,35,29,2-14,45-79,47,19,100.000000,Los Angeles Lakers,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21872,2024-06-06,Boston Celtics,2,107,6,9,37,26,23,21,...,22,24,23,7-27,35-84,43,9,62.000000,Boston Celtics,0
21873,2024-06-09,Boston Celtics,2,105,10,5,25,29,29,22,...,23,23,24,6-26,38-80,43,21,61.386139,Boston Celtics,0
21874,2024-06-12,Dallas Mavericks,7,99,5,1,31,20,19,29,...,20,35,21,17-46,38-82,36,26,79.797980,Boston Celtics,1
21875,2024-06-14,Dallas Mavericks,7,122,7,2,34,27,31,30,...,14,25,24,14-41,29-80,31,18,79.000000,Dallas Mavericks,0


In [149]:
dfLim7.insert(dfLim7.columns.get_loc('homeTeam_3P') + 1, 'homeTeam_3P_made', dfLim7['homeTeam_3P'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))
dfLim7.insert(dfLim7.columns.get_loc('awayTeam_3P') + 1, 'awayTeam_3P_made', dfLim7['awayTeam_3P'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))

#adding FG made
dfLim7.insert(dfLim7.columns.get_loc('homeTeam_FG') + 1, 'homeTeam_FG_made', dfLim7['homeTeam_FG'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))
dfLim7.insert(dfLim7.columns.get_loc('awayTeam_FG') + 1, 'awayTeam_FG_made', dfLim7['awayTeam_FG'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))
dfLim7

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_Stl,homeTeam_Blk,homeTeam_points_q1,homeTeam_points_q2,homeTeam_points_q3,homeTeam_points_q4,...,awayTeam_points_q4,awayTeam_3P,awayTeam_3P_made,awayTeam_FG,awayTeam_FG_made,awayTeam_Total_Reb,awayTeam_Ast,awayWinPct,winner,winner_binary
0,2007-04-18,Utah Jazz,29,101,5,0,20,30,25,26,...,20,6-25,6,31-74,31,25,17,0.000000,Utah Jazz,0
1,2007-04-18,Toronto Raptors,28,119,8,3,28,29,29,33,...,36,5-9,5,44-83,44,41,22,100.000000,Philadelphia 76ers,1
2,2007-04-18,Seattle Supersonics,21,75,5,3,16,21,22,16,...,17,4-13,4,43-81,43,38,26,100.000000,Dallas Mavericks,1
3,2007-04-18,San Antonio Spurs,27,77,7,3,16,24,22,15,...,21,11-26,11,42-85,42,55,28,100.000000,Denver Nuggets,1
4,2007-04-18,Sacramento Kings,26,106,5,3,30,27,23,26,...,29,2-14,2,45-79,45,47,19,100.000000,Los Angeles Lakers,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21872,2024-06-06,Boston Celtics,2,107,6,9,37,26,23,21,...,23,7-27,7,35-84,35,43,9,62.000000,Boston Celtics,0
21873,2024-06-09,Boston Celtics,2,105,10,5,25,29,29,22,...,24,6-26,6,38-80,38,43,21,61.386139,Boston Celtics,0
21874,2024-06-12,Dallas Mavericks,7,99,5,1,31,20,19,29,...,21,17-46,17,38-82,38,36,26,79.797980,Boston Celtics,1
21875,2024-06-14,Dallas Mavericks,7,122,7,2,34,27,31,30,...,24,14-41,14,29-80,29,31,18,79.000000,Dallas Mavericks,0


In [150]:
#Format as percentage
dfLim7['homeTeam_3P'] = dfLim7['homeTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim7['awayTeam_3P'] = dfLim7['awayTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim7['homeTeam_FG'] = dfLim7['homeTeam_FG'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim7['awayTeam_FG'] = dfLim7['awayTeam_FG'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)

dfLim7

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim7['homeTeam_3P'] = dfLim7['homeTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim7['awayTeam_3P'] = dfLim7['awayTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https:

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_Stl,homeTeam_Blk,homeTeam_points_q1,homeTeam_points_q2,homeTeam_points_q3,homeTeam_points_q4,...,awayTeam_points_q4,awayTeam_3P,awayTeam_3P_made,awayTeam_FG,awayTeam_FG_made,awayTeam_Total_Reb,awayTeam_Ast,awayWinPct,winner,winner_binary
0,2007-04-18,Utah Jazz,29,101,5,0,20,30,25,26,...,20,0.240000,6,0.418919,31,25,17,0.000000,Utah Jazz,0
1,2007-04-18,Toronto Raptors,28,119,8,3,28,29,29,33,...,36,0.555556,5,0.530120,44,41,22,100.000000,Philadelphia 76ers,1
2,2007-04-18,Seattle Supersonics,21,75,5,3,16,21,22,16,...,17,0.307692,4,0.530864,43,38,26,100.000000,Dallas Mavericks,1
3,2007-04-18,San Antonio Spurs,27,77,7,3,16,24,22,15,...,21,0.423077,11,0.494118,42,55,28,100.000000,Denver Nuggets,1
4,2007-04-18,Sacramento Kings,26,106,5,3,30,27,23,26,...,29,0.142857,2,0.569620,45,47,19,100.000000,Los Angeles Lakers,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21872,2024-06-06,Boston Celtics,2,107,6,9,37,26,23,21,...,23,0.259259,7,0.416667,35,43,9,62.000000,Boston Celtics,0
21873,2024-06-09,Boston Celtics,2,105,10,5,25,29,29,22,...,24,0.230769,6,0.475000,38,43,21,61.386139,Boston Celtics,0
21874,2024-06-12,Dallas Mavericks,7,99,5,1,31,20,19,29,...,21,0.369565,17,0.463415,38,36,26,79.797980,Boston Celtics,1
21875,2024-06-14,Dallas Mavericks,7,122,7,2,34,27,31,30,...,24,0.341463,14,0.362500,29,31,18,79.000000,Dallas Mavericks,0


In [151]:
dfIDtoStat7 = pd.DataFrame(columns=['date', 'team_id', 'team_points_total', '3P%', '3P_made', 'FG%', 'FG_made', 'Total_Reb', 'team_Ast', 'team_Stl', 'team_Blk', 'team_points_q1', 'team_points_q2', 'team_points_q3', 'team_points_q4', 'WinPct'])
dfIDtoStat7

Unnamed: 0,date,team_id,team_points_total,3P%,3P_made,FG%,FG_made,Total_Reb,team_Ast,team_Stl,team_Blk,team_points_q1,team_points_q2,team_points_q3,team_points_q4,WinPct


In [152]:
# Create DataFrame for home team statistics
home_df7 = dfLim7[['date', 'homeTeamSubject_id', 'homeTeam_points_total', 'homeTeam_3P', 'homeTeam_3P_made', 'homeTeam_FG', 'homeTeam_FG_made', 'homeTeam_Total_Reb', 'homeTeam_Ast', 'homeTeam_Stl', 'homeTeam_Blk', 'homeTeam_points_q1', 'homeTeam_points_q2', 'homeTeam_points_q3', 'homeTeam_points_q4', 'homeWinPct']].copy()
home_df7.columns = ['date', 'team_id', 'team_points_total', '3P%', '3P_made', 'FG%', 'FG_made', 'Total_Reb', 'team_Ast', 'team_Stl', 'team_Blk', 'team_points_q1', 'team_points_q2', 'team_points_q3', 'team_points_q4', 'WinPct']

# Create DataFrame for away team statistics
away_df7 = dfLim7[['date', 'awayTeamSubject_id', 'awayTeam_points_total', 'awayTeam_3P', 'awayTeam_3P_made', 'awayTeam_FG', 'awayTeam_FG_made', 'awayTeam_Total_Reb', 'awayTeam_Ast', 'awayTeam_Stl', 'homeTeam_Blk','awayTeam_points_q1', 'awayTeam_points_q2', 'awayTeam_points_q3', 'awayTeam_points_q4','awayWinPct']].copy()
away_df7.columns = ['date', 'team_id', 'team_points_total', '3P%', '3P_made', 'FG%', 'FG_made', 'Total_Reb', 'team_Ast', 'team_Stl', 'team_Blk', 'team_points_q1', 'team_points_q2', 'team_points_q3', 'team_points_q4', 'WinPct']

# Combine both DataFrames
dfIDtoStat7 = pd.concat([home_df7, away_df7], ignore_index=True)

# Display the resulting DataFrame
dfIDtoStat7

Unnamed: 0,date,team_id,team_points_total,3P%,3P_made,FG%,FG_made,Total_Reb,team_Ast,team_Stl,team_Blk,team_points_q1,team_points_q2,team_points_q3,team_points_q4,WinPct
0,2007-04-18,29,101,0.363636,4,0.493151,36,45,23,5,0,20,30,25,26,100.000000
1,2007-04-18,28,119,0.368421,7,0.534884,46,33,32,8,3,28,29,29,33,0.000000
2,2007-04-18,21,75,0.142857,1,0.420290,29,31,13,5,3,16,21,22,16,0.000000
3,2007-04-18,27,77,0.250000,5,0.358025,29,39,14,7,3,16,24,22,15,0.000000
4,2007-04-18,26,106,0.280000,7,0.481928,40,30,21,5,3,30,27,23,26,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43749,2024-06-06,7,89,0.259259,7,0.416667,35,43,9,8,9,20,22,24,23,62.000000
43750,2024-06-09,7,98,0.230769,6,0.475000,38,43,21,5,5,28,23,23,24,61.386139
43751,2024-06-12,2,106,0.369565,17,0.463415,38,36,26,4,1,30,20,35,21,79.797980
43752,2024-06-14,2,84,0.341463,14,0.362500,29,31,18,2,2,21,14,25,24,79.000000


In [153]:
#X1.Shape is num data, window size, variables
X7, y7 = df_to_X_y(dfLim7, dfIDtoStat7, 20)
X7.shape, y7.shape

((21506, 20, 28), (21506,))

In [154]:
#70-15-15 split
X7_train, X7_temp, y7_train, y7_temp = train_test_split(X7, y7, test_size=0.3, random_state=42)
X7_val, X7_test, y7_val, y7_test = train_test_split(X7_temp, y7_temp, test_size=0.5, random_state=42)
X7_train.shape, y7_train.shape, X7_val.shape, y7_val.shape, X7_test.shape, y7_test.shape

((15054, 20, 28), (15054,), (3226, 20, 28), (3226,), (3226, 20, 28), (3226,))

In [155]:
model7 = Sequential()
#You need to change the input layer to reflect the amount of variables. first number is window size, second is variables
model7.add(InputLayer((20, 28)))
model7.add(LSTM(64, activation='tanh', return_sequences=True))
model7.add(Dropout(0.2))
model7.add(LSTM(32, activation='tanh'))
model7.add(Dense(16, activation='relu'))
model7.add(BatchNormalization())
model7.add(Dense(8, activation='relu'))
model7.add(Dense(1, activation='sigmoid'))

model7.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_16 (LSTM)              (None, 20, 64)            23808     
                                                                 
 dropout (Dropout)           (None, 20, 64)            0         
                                                                 
 lstm_17 (LSTM)              (None, 32)                12416     
                                                                 
 dense_24 (Dense)            (None, 16)                528       
                                                                 
 batch_normalization (Batch  (None, 16)                64        
 Normalization)                                                  
                                                                 
 dense_25 (Dense)            (None, 8)                 136       
                                                      

In [156]:
earlystopping = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)
#reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=1e-5, verbose=1)
model7.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model7.fit(X7_train, y7_train, validation_data=(X7_val, y7_val), epochs=200, callbacks=[earlystopping])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.src.callbacks.History at 0x1c768c16850>

In [157]:
y_pred = model7.predict(X7_test)
y_pred_binary = (y_pred > 0.5).astype(int)
print(accuracy_score(y7_test, y_pred_binary))

0.6630502169869807
