In [73]:
import tensorflow as tf
import os
import pandas as pd 
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense, InputLayer, Dropout, BatchNormalization
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import accuracy_score

In [48]:
df = pd.read_csv('CopyOfNBADatawithWinandPlayoff.csv')
df

Unnamed: 0,date,homeTeam,homeTeam_id,homeTeam_points_total,homeTeam_points_q1,homeTeam_points_q2,homeTeam_points_q3,homeTeam_points_q4,homeTeam_points_1OT,homeTeam_points_2OT,...,winner,season,homeWinPct,awayWinPct,gameNumber,isPlayoffGame,homeWins,homeLosses,awayWins,awayLosses
0,2014-06-15,San Antonio Spurs,SAS,104,22.0,25.0,30.0,27.0,0,0,...,San Antonio Spurs,2013-2014,74.193548,64.406780,1375,True,92,32,76,42
1,2014-10-28,Los Angeles Lakers,LAL,90,19.0,26.0,24.0,21.0,0,0,...,Houston Rockets,2014-2015,0.000000,100.000000,1,False,0,1,1,0
2,2014-10-28,San Antonio Spurs,SAS,101,26.0,19.0,31.0,25.0,0,0,...,San Antonio Spurs,2014-2015,100.000000,0.000000,2,False,1,0,0,1
3,2014-10-28,New Orleans Pelicans,NOP,101,24.0,24.0,30.0,23.0,0,0,...,New Orleans Pelicans,2014-2015,100.000000,0.000000,3,False,1,0,0,1
4,2014-10-29,Portland Trail Blazers,POR,106,29.0,20.0,26.0,31.0,0,0,...,Portland Trail Blazers,2014-2015,100.000000,0.000000,4,False,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12824,2024-06-06,Boston Celtics,BOS,107,37.0,26.0,23.0,21.0,0,0,...,Boston Celtics,2023-2024,79.381443,62.000000,1315,True,77,20,62,38
12825,2024-06-09,Boston Celtics,BOS,105,25.0,29.0,29.0,22.0,0,0,...,Boston Celtics,2023-2024,79.591837,61.386139,1316,True,78,20,62,39
12826,2024-06-12,Dallas Mavericks,DAL,99,31.0,20.0,19.0,29.0,0,0,...,Boston Celtics,2023-2024,60.784314,79.797980,1317,True,62,40,79,20
12827,2024-06-14,Dallas Mavericks,DAL,122,34.0,27.0,31.0,30.0,0,0,...,Dallas Mavericks,2023-2024,61.165049,79.000000,1318,True,63,40,79,21


In [49]:
subjectID_dict = {
    "ATL": 1,
    "BOS": 2,
    "BKN": 3,
    "CHA": 4,
    "CHI": 5,
    "CLE": 6,
    "DAL": 7,
    "DEN": 8,
    "DET": 9,
    "GSW": 10,
    "HOU": 11,
    "IND": 12,
    "LAC": 13,
    "LAL": 14,
    "MEM": 15,
    "MIA": 16,
    "MIL": 17,
    "MIN": 18,
    "NOP": 19,
    "NYK": 20,
    "OKC": 21,
    "ORL": 22,
    "PHI": 23,
    "PHX": 24,
    "POR": 25,
    "SAC": 26,
    "SAS": 27,
    "TOR": 28,
    "UTA": 29,
    "WAS": 30
}

In [50]:
df["homeTeamSubject_id"] = df["homeTeam_id"].map(subjectID_dict)
df["awayTeamSubject_id"] = df["awayTeam_id"].map(subjectID_dict)

cols = df.columns.tolist()
cols.insert(cols.index("homeTeam_id") + 1, cols.pop(cols.index("homeTeamSubject_id")))
cols.insert(cols.index("awayTeam_id") + 1, cols.pop(cols.index("awayTeamSubject_id")))
df = df[cols]

df

Unnamed: 0,date,homeTeam,homeTeam_id,homeTeamSubject_id,homeTeam_points_total,homeTeam_points_q1,homeTeam_points_q2,homeTeam_points_q3,homeTeam_points_q4,homeTeam_points_1OT,...,winner,season,homeWinPct,awayWinPct,gameNumber,isPlayoffGame,homeWins,homeLosses,awayWins,awayLosses
0,2014-06-15,San Antonio Spurs,SAS,27,104,22.0,25.0,30.0,27.0,0,...,San Antonio Spurs,2013-2014,74.193548,64.406780,1375,True,92,32,76,42
1,2014-10-28,Los Angeles Lakers,LAL,14,90,19.0,26.0,24.0,21.0,0,...,Houston Rockets,2014-2015,0.000000,100.000000,1,False,0,1,1,0
2,2014-10-28,San Antonio Spurs,SAS,27,101,26.0,19.0,31.0,25.0,0,...,San Antonio Spurs,2014-2015,100.000000,0.000000,2,False,1,0,0,1
3,2014-10-28,New Orleans Pelicans,NOP,19,101,24.0,24.0,30.0,23.0,0,...,New Orleans Pelicans,2014-2015,100.000000,0.000000,3,False,1,0,0,1
4,2014-10-29,Portland Trail Blazers,POR,25,106,29.0,20.0,26.0,31.0,0,...,Portland Trail Blazers,2014-2015,100.000000,0.000000,4,False,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12824,2024-06-06,Boston Celtics,BOS,2,107,37.0,26.0,23.0,21.0,0,...,Boston Celtics,2023-2024,79.381443,62.000000,1315,True,77,20,62,38
12825,2024-06-09,Boston Celtics,BOS,2,105,25.0,29.0,29.0,22.0,0,...,Boston Celtics,2023-2024,79.591837,61.386139,1316,True,78,20,62,39
12826,2024-06-12,Dallas Mavericks,DAL,7,99,31.0,20.0,19.0,29.0,0,...,Boston Celtics,2023-2024,60.784314,79.797980,1317,True,62,40,79,20
12827,2024-06-14,Dallas Mavericks,DAL,7,122,34.0,27.0,31.0,30.0,0,...,Dallas Mavericks,2023-2024,61.165049,79.000000,1318,True,63,40,79,21


In [185]:
#limit df to few variables
dfLim = df[[
    "date",
    "homeTeam",
    "homeTeamSubject_id", 
    "homeTeam_points_total", 
    "awayTeam",
    "awayTeamSubject_id", 
    "awayTeam_points_total",
    "winner"
]]
dfLim

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,awayTeam,awayTeamSubject_id,awayTeam_points_total,winner
0,2014-06-15,San Antonio Spurs,27,104,Miami Heat,16,87,San Antonio Spurs
1,2014-10-28,Los Angeles Lakers,14,90,Houston Rockets,11,108,Houston Rockets
2,2014-10-28,San Antonio Spurs,27,101,Dallas Mavericks,7,100,San Antonio Spurs
3,2014-10-28,New Orleans Pelicans,19,101,Orlando Magic,22,84,New Orleans Pelicans
4,2014-10-29,Portland Trail Blazers,25,106,Oklahoma City Thunder,21,89,Portland Trail Blazers
...,...,...,...,...,...,...,...,...
12824,2024-06-06,Boston Celtics,2,107,Dallas Mavericks,7,89,Boston Celtics
12825,2024-06-09,Boston Celtics,2,105,Dallas Mavericks,7,98,Boston Celtics
12826,2024-06-12,Dallas Mavericks,7,99,Boston Celtics,2,106,Boston Celtics
12827,2024-06-14,Dallas Mavericks,7,122,Boston Celtics,2,84,Dallas Mavericks


In [186]:
#reformat the dates so they can be compared
dfLim['date'] = dfLim['date'].apply(lambda x: datetime.strptime(x, "%Y-%m-%d"))
dfLim


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim['date'] = dfLim['date'].apply(lambda x: datetime.strptime(x, "%Y-%m-%d"))


Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,awayTeam,awayTeamSubject_id,awayTeam_points_total,winner
0,2014-06-15,San Antonio Spurs,27,104,Miami Heat,16,87,San Antonio Spurs
1,2014-10-28,Los Angeles Lakers,14,90,Houston Rockets,11,108,Houston Rockets
2,2014-10-28,San Antonio Spurs,27,101,Dallas Mavericks,7,100,San Antonio Spurs
3,2014-10-28,New Orleans Pelicans,19,101,Orlando Magic,22,84,New Orleans Pelicans
4,2014-10-29,Portland Trail Blazers,25,106,Oklahoma City Thunder,21,89,Portland Trail Blazers
...,...,...,...,...,...,...,...,...
12824,2024-06-06,Boston Celtics,2,107,Dallas Mavericks,7,89,Boston Celtics
12825,2024-06-09,Boston Celtics,2,105,Dallas Mavericks,7,98,Boston Celtics
12826,2024-06-12,Dallas Mavericks,7,99,Boston Celtics,2,106,Boston Celtics
12827,2024-06-14,Dallas Mavericks,7,122,Boston Celtics,2,84,Dallas Mavericks


In [187]:
dfLim['winner_binary'] = (dfLim['winner'] == df['awayTeam']).astype(int)
dfLim

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim['winner_binary'] = (dfLim['winner'] == df['awayTeam']).astype(int)


Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,awayTeam,awayTeamSubject_id,awayTeam_points_total,winner,winner_binary
0,2014-06-15,San Antonio Spurs,27,104,Miami Heat,16,87,San Antonio Spurs,0
1,2014-10-28,Los Angeles Lakers,14,90,Houston Rockets,11,108,Houston Rockets,1
2,2014-10-28,San Antonio Spurs,27,101,Dallas Mavericks,7,100,San Antonio Spurs,0
3,2014-10-28,New Orleans Pelicans,19,101,Orlando Magic,22,84,New Orleans Pelicans,0
4,2014-10-29,Portland Trail Blazers,25,106,Oklahoma City Thunder,21,89,Portland Trail Blazers,0
...,...,...,...,...,...,...,...,...,...
12824,2024-06-06,Boston Celtics,2,107,Dallas Mavericks,7,89,Boston Celtics,0
12825,2024-06-09,Boston Celtics,2,105,Dallas Mavericks,7,98,Boston Celtics,0
12826,2024-06-12,Dallas Mavericks,7,99,Boston Celtics,2,106,Boston Celtics,1
12827,2024-06-14,Dallas Mavericks,7,122,Boston Celtics,2,84,Dallas Mavericks,0


In [188]:
dfIDtoStat = pd.DataFrame(columns=['date', 'team_id', 'team_points_total'])
dfIDtoStat

Unnamed: 0,date,team_id,team_points_total


In [189]:
for index, row in dfLim.iterrows():
    date = row['date']
    homeTeam_id = row['homeTeamSubject_id']
    homeTeam_points_total = row['homeTeam_points_total']
    awayTeam_id = row['awayTeamSubject_id']
    awayTeam_points_total = row['awayTeam_points_total']
    dfIDtoStat.loc[len(dfIDtoStat)] = {"date": date, 'team_id': homeTeam_id, 'team_points_total':homeTeam_points_total}
    dfIDtoStat.loc[len(dfIDtoStat)] = {"date": date, 'team_id': awayTeam_id, 'team_points_total':awayTeam_points_total}
dfIDtoStat

Unnamed: 0,date,team_id,team_points_total
0,2014-06-15,27,104
1,2014-06-15,16,87
2,2014-10-28,14,90
3,2014-10-28,11,108
4,2014-10-28,27,101
...,...,...,...
25653,2024-06-12,2,106
25654,2024-06-14,7,122
25655,2024-06-14,2,84
25656,2024-06-17,2,106


In [190]:
dfIDtoStat.head()

Unnamed: 0,date,team_id,team_points_total
0,2014-06-15,27,104
1,2014-06-15,16,87
2,2014-10-28,14,90
3,2014-10-28,11,108
4,2014-10-28,27,101


In [191]:
dfLim = dfLim.iloc[::-1].reset_index(drop=True)
dfLim

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,awayTeam,awayTeamSubject_id,awayTeam_points_total,winner,winner_binary
0,2024-06-17,Boston Celtics,2,106,Dallas Mavericks,7,88,Boston Celtics,0
1,2024-06-14,Dallas Mavericks,7,122,Boston Celtics,2,84,Dallas Mavericks,0
2,2024-06-12,Dallas Mavericks,7,99,Boston Celtics,2,106,Boston Celtics,1
3,2024-06-09,Boston Celtics,2,105,Dallas Mavericks,7,98,Boston Celtics,0
4,2024-06-06,Boston Celtics,2,107,Dallas Mavericks,7,89,Boston Celtics,0
...,...,...,...,...,...,...,...,...,...
12824,2014-10-29,Portland Trail Blazers,25,106,Oklahoma City Thunder,21,89,Portland Trail Blazers,0
12825,2014-10-28,New Orleans Pelicans,19,101,Orlando Magic,22,84,New Orleans Pelicans,0
12826,2014-10-28,San Antonio Spurs,27,101,Dallas Mavericks,7,100,San Antonio Spurs,0
12827,2014-10-28,Los Angeles Lakers,14,90,Houston Rockets,11,108,Houston Rockets,1


In [192]:
dfIDtoStat = dfIDtoStat.iloc[::-1].reset_index(drop=True)
dfIDtoStat

Unnamed: 0,date,team_id,team_points_total
0,2024-06-17,7,88
1,2024-06-17,2,106
2,2024-06-14,2,84
3,2024-06-14,7,122
4,2024-06-12,2,106
...,...,...,...
25653,2014-10-28,27,101
25654,2014-10-28,11,108
25655,2014-10-28,14,90
25656,2014-06-15,16,87


In [193]:
'''def df_to_X_y_Simple(df, window_size=5):
    df_as_np = df.to_numpy()
    X = []
    y = []
    for i in range(len(df_as_np)-window_size):
        row = [r for r in df_as_np[i:i+window_size]]
        X.append(row)
        label = df_as_np[i+window_size][0]
        y.append(label)
    return np.array(X), np.array(y)'''

def df_to_X_y(df, dfWindow, window_size):
    X = []
    y = []
    #each team must have played window_size games before data can be extracted
    for index, row in df.iterrows():
        homeTeam_id = row['homeTeamSubject_id']
        awayTeam_id = row['awayTeamSubject_id']
        date = row['date']
        #limit dfWindow to include rows that occured before date of row we are on
        dfWindow_before_date = dfWindow[dfWindow['date']< date]
        homeTeam_occurrences = dfWindow_before_date[dfWindow_before_date['team_id'] == homeTeam_id].shape[0]
        awayTeam_occurrences = dfWindow_before_date[dfWindow_before_date['team_id'] == awayTeam_id].shape[0]
        if homeTeam_occurrences > window_size and awayTeam_occurrences > window_size:
            homeTeam_window = (dfWindow_before_date[dfWindow_before_date['team_id'] == homeTeam_id].sort_values(by='date', ascending=False).head(window_size).drop(columns=['date', 'team_id']))
            awayTeam_window = (dfWindow_before_date[dfWindow_before_date['team_id'] == awayTeam_id].sort_values(by='date', ascending=False).head(window_size).drop(columns=['date', 'team_id']))
            homeTeam_window = homeTeam_window.to_numpy()
            awayTeam_window = awayTeam_window.to_numpy()
            combined_window = np.hstack((homeTeam_window, awayTeam_window))
            X.append(combined_window)
            y.append(row['winner_binary'])
    
    return np.array(X), np.array(y)   



In [194]:
dfIDtoStat['team_points_total'] = pd.to_numeric(dfIDtoStat['team_points_total'], errors='coerce')
window_size =5 
exX =[]
exY = []
for index, row in dfLim.iterrows():
        homeTeam_id = row['homeTeamSubject_id']
        awayTeam_id = row['awayTeamSubject_id']
        date = row['date']
        #limit dfWindow to include rows that occured before date of row we are on
        dfWindow_before_date = dfIDtoStat[dfIDtoStat['date']< date]
        homeTeam_occurrences = dfWindow_before_date[dfWindow_before_date['team_id'] == homeTeam_id].shape[0]
        awayTeam_occurrences = dfWindow_before_date[dfWindow_before_date['team_id'] == awayTeam_id].shape[0]
        if homeTeam_occurrences > window_size and awayTeam_occurrences > window_size:
                homeTeam_window = (dfWindow_before_date[dfWindow_before_date['team_id'] == homeTeam_id].sort_values(by='date', ascending=False).head(window_size))
                awayTeam_window = (dfWindow_before_date[dfWindow_before_date['team_id'] == awayTeam_id].sort_values(by='date', ascending=False).head(window_size))
                print(date)
                print(homeTeam_window)
                print(awayTeam_window)
                print(type(homeTeam_window))
                homeTeam_window = homeTeam_window.drop(columns=['date', 'team_id'])
                awayTeam_window = awayTeam_window.drop(columns=['date', 'team_id'])
                homeTeam_window = homeTeam_window.to_numpy()
                awayTeam_window = awayTeam_window.to_numpy()
                combined_window = np.hstack((homeTeam_window, awayTeam_window))
                exX.append(combined_window)
                exY.append(row['winner_binary'])
                print(exX)
                print(exY)
                break
                        

                

2024-06-17 00:00:00
         date  team_id  team_points_total
2  2024-06-14        2                 84
4  2024-06-12        2                106
7  2024-06-09        2                105
9  2024-06-06        2                107
14 2024-05-27        2                105
         date  team_id  team_points_total
3  2024-06-14        7                122
5  2024-06-12        7                 99
6  2024-06-09        7                 98
8  2024-06-06        7                 89
10 2024-05-30        7                124
<class 'pandas.core.frame.DataFrame'>
[array([[ 84, 122],
       [106,  99],
       [105,  98],
       [107,  89],
       [105, 124]])]
[0]


In [195]:
#X1.Shape is num data, window size, variables
X1, y1 = df_to_X_y(dfLim, dfIDtoStat, 5)
X1.shape, y1.shape

((12734, 5, 2), (12734,))

In [196]:
#70-15-15 split
X1_train, X1_temp, y1_train, y1_temp = train_test_split(X1, y1, test_size=0.3, random_state=42)
X1_val, X1_test, y1_val, y1_test = train_test_split(X1_temp, y1_temp, test_size=0.5, random_state=42)
X1_train.shape, y1_train.shape, X1_val.shape, y1_val.shape, X1_test.shape, y1_test.shape

((8913, 5, 2), (8913,), (1910, 5, 2), (1910,), (1911, 5, 2), (1911,))

In [197]:
model1 = Sequential()
model1.add(InputLayer((5, 2)))
model1.add(LSTM(64, activation='tanh', return_sequences=True))
model1.add(LSTM(32, activation='tanh'))
model1.add(Dense(16, activation='relu'))
model1.add(Dense(8, activation='relu'))
model1.add(Dense(1, activation='sigmoid'))

model1.summary()

In [198]:
earlystopping = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)
model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [199]:
model1.fit(X1_train, y1_train, validation_data=(X1_val, y1_val), epochs=100, callbacks=[earlystopping])

Epoch 1/100
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5729 - loss: 0.6786 - val_accuracy: 0.5780 - val_loss: 0.6775
Epoch 2/100
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5683 - loss: 0.6790 - val_accuracy: 0.5749 - val_loss: 0.6778
Epoch 3/100
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5885 - loss: 0.6738 - val_accuracy: 0.5743 - val_loss: 0.6799
Epoch 4/100
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5941 - loss: 0.6734 - val_accuracy: 0.5733 - val_loss: 0.6771
Epoch 5/100
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5849 - loss: 0.6726 - val_accuracy: 0.5754 - val_loss: 0.6776
Epoch 6/100
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5831 - loss: 0.6741 - val_accuracy: 0.5817 - val_loss: 0.6757
Epoch 7/100
[1m279/27

<keras.src.callbacks.history.History at 0x364cbb230>

In [200]:
y_pred = model1.predict(X1_test)
y_pred_binary = (y_pred > 0.5).astype(int)
print(accuracy_score(y1_test, y_pred_binary))


[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step
0.5902668759811617


In [201]:
#Now adding 3 point percentages
dfLim2 = df[[
    "date",
    "homeTeam",
    "homeTeamSubject_id", 
    "homeTeam_points_total", 
    #added homeTeam3P
    "homeTeam_3P",
    "awayTeam",
    "awayTeamSubject_id", 
    "awayTeam_points_total",
    #added awayTeam3P
    "awayTeam_3P",
    "winner"
]]
dfLim2

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,winner
0,2014-06-15,San Antonio Spurs,27,104,12-26,Miami Heat,16,87,7-25,San Antonio Spurs
1,2014-10-28,Los Angeles Lakers,14,90,3-10,Houston Rockets,11,108,12-29,Houston Rockets
2,2014-10-28,San Antonio Spurs,27,101,14-28,Dallas Mavericks,7,100,8-21,San Antonio Spurs
3,2014-10-28,New Orleans Pelicans,19,101,4-17,Orlando Magic,22,84,4-11,New Orleans Pelicans
4,2014-10-29,Portland Trail Blazers,25,106,11-29,Oklahoma City Thunder,21,89,2-16,Portland Trail Blazers
...,...,...,...,...,...,...,...,...,...,...
12824,2024-06-06,Boston Celtics,2,107,16-42,Dallas Mavericks,7,89,7-27,Boston Celtics
12825,2024-06-09,Boston Celtics,2,105,10-39,Dallas Mavericks,7,98,6-26,Boston Celtics
12826,2024-06-12,Dallas Mavericks,7,99,9-25,Boston Celtics,2,106,17-46,Boston Celtics
12827,2024-06-14,Dallas Mavericks,7,122,15-37,Boston Celtics,2,84,14-41,Dallas Mavericks


In [202]:
#reformat the dates so they can be compared
dfLim2['date'] = dfLim2['date'].apply(lambda x: datetime.strptime(x, "%Y-%m-%d"))
dfLim2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim2['date'] = dfLim2['date'].apply(lambda x: datetime.strptime(x, "%Y-%m-%d"))


Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,winner
0,2014-06-15,San Antonio Spurs,27,104,12-26,Miami Heat,16,87,7-25,San Antonio Spurs
1,2014-10-28,Los Angeles Lakers,14,90,3-10,Houston Rockets,11,108,12-29,Houston Rockets
2,2014-10-28,San Antonio Spurs,27,101,14-28,Dallas Mavericks,7,100,8-21,San Antonio Spurs
3,2014-10-28,New Orleans Pelicans,19,101,4-17,Orlando Magic,22,84,4-11,New Orleans Pelicans
4,2014-10-29,Portland Trail Blazers,25,106,11-29,Oklahoma City Thunder,21,89,2-16,Portland Trail Blazers
...,...,...,...,...,...,...,...,...,...,...
12824,2024-06-06,Boston Celtics,2,107,16-42,Dallas Mavericks,7,89,7-27,Boston Celtics
12825,2024-06-09,Boston Celtics,2,105,10-39,Dallas Mavericks,7,98,6-26,Boston Celtics
12826,2024-06-12,Dallas Mavericks,7,99,9-25,Boston Celtics,2,106,17-46,Boston Celtics
12827,2024-06-14,Dallas Mavericks,7,122,15-37,Boston Celtics,2,84,14-41,Dallas Mavericks


In [203]:
dfLim2['winner_binary'] = (dfLim2['winner'] == df['awayTeam']).astype(int)
dfLim2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim2['winner_binary'] = (dfLim2['winner'] == df['awayTeam']).astype(int)


Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,winner,winner_binary
0,2014-06-15,San Antonio Spurs,27,104,12-26,Miami Heat,16,87,7-25,San Antonio Spurs,0
1,2014-10-28,Los Angeles Lakers,14,90,3-10,Houston Rockets,11,108,12-29,Houston Rockets,1
2,2014-10-28,San Antonio Spurs,27,101,14-28,Dallas Mavericks,7,100,8-21,San Antonio Spurs,0
3,2014-10-28,New Orleans Pelicans,19,101,4-17,Orlando Magic,22,84,4-11,New Orleans Pelicans,0
4,2014-10-29,Portland Trail Blazers,25,106,11-29,Oklahoma City Thunder,21,89,2-16,Portland Trail Blazers,0
...,...,...,...,...,...,...,...,...,...,...,...
12824,2024-06-06,Boston Celtics,2,107,16-42,Dallas Mavericks,7,89,7-27,Boston Celtics,0
12825,2024-06-09,Boston Celtics,2,105,10-39,Dallas Mavericks,7,98,6-26,Boston Celtics,0
12826,2024-06-12,Dallas Mavericks,7,99,9-25,Boston Celtics,2,106,17-46,Boston Celtics,1
12827,2024-06-14,Dallas Mavericks,7,122,15-37,Boston Celtics,2,84,14-41,Dallas Mavericks,0


In [204]:
#Format as percentage
dfLim2['homeTeam_3P'] = dfLim2['homeTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim2['awayTeam_3P'] = dfLim2['awayTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim2['homeTeam_3P'] = dfLim2['homeTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim2['awayTeam_3P'] = dfLim2['awayTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)


Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,winner,winner_binary
0,2014-06-15,San Antonio Spurs,27,104,0.461538,Miami Heat,16,87,0.280000,San Antonio Spurs,0
1,2014-10-28,Los Angeles Lakers,14,90,0.300000,Houston Rockets,11,108,0.413793,Houston Rockets,1
2,2014-10-28,San Antonio Spurs,27,101,0.500000,Dallas Mavericks,7,100,0.380952,San Antonio Spurs,0
3,2014-10-28,New Orleans Pelicans,19,101,0.235294,Orlando Magic,22,84,0.363636,New Orleans Pelicans,0
4,2014-10-29,Portland Trail Blazers,25,106,0.379310,Oklahoma City Thunder,21,89,0.125000,Portland Trail Blazers,0
...,...,...,...,...,...,...,...,...,...,...,...
12824,2024-06-06,Boston Celtics,2,107,0.380952,Dallas Mavericks,7,89,0.259259,Boston Celtics,0
12825,2024-06-09,Boston Celtics,2,105,0.256410,Dallas Mavericks,7,98,0.230769,Boston Celtics,0
12826,2024-06-12,Dallas Mavericks,7,99,0.360000,Boston Celtics,2,106,0.369565,Boston Celtics,1
12827,2024-06-14,Dallas Mavericks,7,122,0.405405,Boston Celtics,2,84,0.341463,Dallas Mavericks,0


In [205]:
dfIDtoStat2 = pd.DataFrame(columns=['date', 'team_id', 'team_points_total', '3P%'])
dfIDtoStat['team_points_total'] = pd.to_numeric(dfIDtoStat['team_points_total'], errors='coerce')
dfIDtoStat2

Unnamed: 0,date,team_id,team_points_total,3P%


In [206]:
#populate dfIDtoStat2
dfIDtoStat['team_points_total'] = pd.to_numeric(dfIDtoStat['team_points_total'], errors='coerce')
for index, row in dfLim2.iterrows():
    date = row['date']
    homeTeam_id = row['homeTeamSubject_id']
    homeTeam_points_total = row['homeTeam_points_total']
    #added homeTeam3P
    homeTeam_3P = row['homeTeam_3P']
    awayTeam_id = row['awayTeamSubject_id']
    awayTeam_points_total = row['awayTeam_points_total']
    #added awayTeam3P
    awayTeam_3P = row['homeTeam_3P']
    dfIDtoStat2.loc[len(dfIDtoStat2)] = {"date": date, 'team_id': homeTeam_id, 'team_points_total':homeTeam_points_total, '3P%':homeTeam_3P}
    dfIDtoStat2.loc[len(dfIDtoStat2)] = {"date": date, 'team_id': awayTeam_id, 'team_points_total':awayTeam_points_total, '3P%':awayTeam_3P}
dfIDtoStat2

Unnamed: 0,date,team_id,team_points_total,3P%
0,2014-06-15,27,104,0.461538
1,2014-06-15,16,87,0.461538
2,2014-10-28,14,90,0.300000
3,2014-10-28,11,108,0.300000
4,2014-10-28,27,101,0.500000
...,...,...,...,...
25653,2024-06-12,2,106,0.360000
25654,2024-06-14,7,122,0.405405
25655,2024-06-14,2,84,0.405405
25656,2024-06-17,2,106,0.333333


In [207]:
#reverse df order - should now start with 2014
dfLim2 = dfLim2.iloc[::-1].reset_index(drop=True)
dfLim2

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,winner,winner_binary
0,2024-06-17,Boston Celtics,2,106,0.333333,Dallas Mavericks,7,88,0.297297,Boston Celtics,0
1,2024-06-14,Dallas Mavericks,7,122,0.405405,Boston Celtics,2,84,0.341463,Dallas Mavericks,0
2,2024-06-12,Dallas Mavericks,7,99,0.360000,Boston Celtics,2,106,0.369565,Boston Celtics,1
3,2024-06-09,Boston Celtics,2,105,0.256410,Dallas Mavericks,7,98,0.230769,Boston Celtics,0
4,2024-06-06,Boston Celtics,2,107,0.380952,Dallas Mavericks,7,89,0.259259,Boston Celtics,0
...,...,...,...,...,...,...,...,...,...,...,...
12824,2014-10-29,Portland Trail Blazers,25,106,0.379310,Oklahoma City Thunder,21,89,0.125000,Portland Trail Blazers,0
12825,2014-10-28,New Orleans Pelicans,19,101,0.235294,Orlando Magic,22,84,0.363636,New Orleans Pelicans,0
12826,2014-10-28,San Antonio Spurs,27,101,0.500000,Dallas Mavericks,7,100,0.380952,San Antonio Spurs,0
12827,2014-10-28,Los Angeles Lakers,14,90,0.300000,Houston Rockets,11,108,0.413793,Houston Rockets,1


In [208]:
#reverse order - should now start with 2014
dfIDtoStat2 = dfIDtoStat2.iloc[::-1].reset_index(drop=True)
dfIDtoStat2

Unnamed: 0,date,team_id,team_points_total,3P%
0,2024-06-17,7,88,0.333333
1,2024-06-17,2,106,0.333333
2,2024-06-14,2,84,0.405405
3,2024-06-14,7,122,0.405405
4,2024-06-12,2,106,0.360000
...,...,...,...,...
25653,2014-10-28,27,101,0.500000
25654,2014-10-28,11,108,0.300000
25655,2014-10-28,14,90,0.300000
25656,2014-06-15,16,87,0.461538


In [209]:
#X1.Shape is num data, window size, variables
X2, y2 = df_to_X_y(dfLim2, dfIDtoStat2, 5)
X2.shape, y2.shape

((12734, 5, 4), (12734,))

In [210]:
#70-15-15 split
X2_train, X2_temp, y2_train, y2_temp = train_test_split(X2, y2, test_size=0.3, random_state=42)
X2_val, X2_test, y2_val, y2_test = train_test_split(X2_temp, y2_temp, test_size=0.5, random_state=42)
X2_train.shape, y2_train.shape, X2_val.shape, y2_val.shape, X2_test.shape, y2_test.shape

((8913, 5, 4), (8913,), (1910, 5, 4), (1910,), (1911, 5, 4), (1911,))

In [211]:
model2 = Sequential()
#You need to change the input layer to reflect the amount of variables. first number is window size, second is variables
model2.add(InputLayer((5, 4)))
model2.add(LSTM(64, activation='tanh', return_sequences=True))
model2.add(LSTM(32, activation='tanh'))
model2.add(Dense(16, activation='relu'))
model2.add(Dense(8, activation='relu'))
model2.add(Dense(1, activation='sigmoid'))

model2.summary()

In [212]:
earlystopping = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [213]:
from datetime import datetime

def convert_date_to_numeric(date_str):
    try:
        return datetime.strptime(date_str, '%Y-%m-%d').timestamp()
    except ValueError:
        return date_str  # Keep as is if it's not a valid date format

In [214]:

model2.fit(X2_train, y2_train, validation_data=(X2_val, y2_val), epochs=100, callbacks=[earlystopping])

Epoch 1/100
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5589 - loss: 0.6826 - val_accuracy: 0.5728 - val_loss: 0.6812
Epoch 2/100
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5854 - loss: 0.6739 - val_accuracy: 0.5743 - val_loss: 0.6789
Epoch 3/100
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5859 - loss: 0.6742 - val_accuracy: 0.5759 - val_loss: 0.6785
Epoch 4/100
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5867 - loss: 0.6737 - val_accuracy: 0.5707 - val_loss: 0.6778
Epoch 5/100
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5894 - loss: 0.6705 - val_accuracy: 0.5759 - val_loss: 0.6776
Epoch 6/100
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5760 - loss: 0.6750 - val_accuracy: 0.5812 - val_loss: 0.6749
Epoch 7/100
[1m279/27

<keras.src.callbacks.history.History at 0x359a06e10>

In [215]:
y_pred = model2.predict(X2_test)
y_pred_binary = (y_pred > 0.5).astype(int)
print(accuracy_score(y2_test, y_pred_binary))

[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
0.5928833071690215


In [216]:
#Not adding anything new because I am gonna pull out 3 pointers made
dfLim3 = df[[
    "date",
    "homeTeam",
    "homeTeamSubject_id", 
    "homeTeam_points_total", 
    #added homeTeam3P
    "homeTeam_3P",
    "awayTeam",
    "awayTeamSubject_id", 
    "awayTeam_points_total",
    #added awayTeam3P
    "awayTeam_3P",
    "winner"
]]
dfLim3

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,winner
0,2014-06-15,San Antonio Spurs,27,104,12-26,Miami Heat,16,87,7-25,San Antonio Spurs
1,2014-10-28,Los Angeles Lakers,14,90,3-10,Houston Rockets,11,108,12-29,Houston Rockets
2,2014-10-28,San Antonio Spurs,27,101,14-28,Dallas Mavericks,7,100,8-21,San Antonio Spurs
3,2014-10-28,New Orleans Pelicans,19,101,4-17,Orlando Magic,22,84,4-11,New Orleans Pelicans
4,2014-10-29,Portland Trail Blazers,25,106,11-29,Oklahoma City Thunder,21,89,2-16,Portland Trail Blazers
...,...,...,...,...,...,...,...,...,...,...
12824,2024-06-06,Boston Celtics,2,107,16-42,Dallas Mavericks,7,89,7-27,Boston Celtics
12825,2024-06-09,Boston Celtics,2,105,10-39,Dallas Mavericks,7,98,6-26,Boston Celtics
12826,2024-06-12,Dallas Mavericks,7,99,9-25,Boston Celtics,2,106,17-46,Boston Celtics
12827,2024-06-14,Dallas Mavericks,7,122,15-37,Boston Celtics,2,84,14-41,Dallas Mavericks


In [217]:
#reformat the dates so they can be compared
#dfLim3['date'] = dfLim3['date'].apply(lambda x: datetime.strptime(x, "%B %d, %Y"))
dfLim3

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,winner
0,2014-06-15,San Antonio Spurs,27,104,12-26,Miami Heat,16,87,7-25,San Antonio Spurs
1,2014-10-28,Los Angeles Lakers,14,90,3-10,Houston Rockets,11,108,12-29,Houston Rockets
2,2014-10-28,San Antonio Spurs,27,101,14-28,Dallas Mavericks,7,100,8-21,San Antonio Spurs
3,2014-10-28,New Orleans Pelicans,19,101,4-17,Orlando Magic,22,84,4-11,New Orleans Pelicans
4,2014-10-29,Portland Trail Blazers,25,106,11-29,Oklahoma City Thunder,21,89,2-16,Portland Trail Blazers
...,...,...,...,...,...,...,...,...,...,...
12824,2024-06-06,Boston Celtics,2,107,16-42,Dallas Mavericks,7,89,7-27,Boston Celtics
12825,2024-06-09,Boston Celtics,2,105,10-39,Dallas Mavericks,7,98,6-26,Boston Celtics
12826,2024-06-12,Dallas Mavericks,7,99,9-25,Boston Celtics,2,106,17-46,Boston Celtics
12827,2024-06-14,Dallas Mavericks,7,122,15-37,Boston Celtics,2,84,14-41,Dallas Mavericks


In [218]:
dfLim3['winner_binary'] = (dfLim3['winner'] == df['awayTeam']).astype(int)
dfLim3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim3['winner_binary'] = (dfLim3['winner'] == df['awayTeam']).astype(int)


Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,winner,winner_binary
0,2014-06-15,San Antonio Spurs,27,104,12-26,Miami Heat,16,87,7-25,San Antonio Spurs,0
1,2014-10-28,Los Angeles Lakers,14,90,3-10,Houston Rockets,11,108,12-29,Houston Rockets,1
2,2014-10-28,San Antonio Spurs,27,101,14-28,Dallas Mavericks,7,100,8-21,San Antonio Spurs,0
3,2014-10-28,New Orleans Pelicans,19,101,4-17,Orlando Magic,22,84,4-11,New Orleans Pelicans,0
4,2014-10-29,Portland Trail Blazers,25,106,11-29,Oklahoma City Thunder,21,89,2-16,Portland Trail Blazers,0
...,...,...,...,...,...,...,...,...,...,...,...
12824,2024-06-06,Boston Celtics,2,107,16-42,Dallas Mavericks,7,89,7-27,Boston Celtics,0
12825,2024-06-09,Boston Celtics,2,105,10-39,Dallas Mavericks,7,98,6-26,Boston Celtics,0
12826,2024-06-12,Dallas Mavericks,7,99,9-25,Boston Celtics,2,106,17-46,Boston Celtics,1
12827,2024-06-14,Dallas Mavericks,7,122,15-37,Boston Celtics,2,84,14-41,Dallas Mavericks,0


In [219]:
dfLim3.insert(dfLim3.columns.get_loc('homeTeam_3P') + 1, 'homeTeam_3P_made', dfLim3['homeTeam_3P'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))
dfLim3.insert(dfLim3.columns.get_loc('awayTeam_3P') + 1, 'awayTeam_3P_made', dfLim3['awayTeam_3P'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))
dfLim3

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,homeTeam_3P_made,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,awayTeam_3P_made,winner,winner_binary
0,2014-06-15,San Antonio Spurs,27,104,12-26,12,Miami Heat,16,87,7-25,7,San Antonio Spurs,0
1,2014-10-28,Los Angeles Lakers,14,90,3-10,3,Houston Rockets,11,108,12-29,12,Houston Rockets,1
2,2014-10-28,San Antonio Spurs,27,101,14-28,14,Dallas Mavericks,7,100,8-21,8,San Antonio Spurs,0
3,2014-10-28,New Orleans Pelicans,19,101,4-17,4,Orlando Magic,22,84,4-11,4,New Orleans Pelicans,0
4,2014-10-29,Portland Trail Blazers,25,106,11-29,11,Oklahoma City Thunder,21,89,2-16,2,Portland Trail Blazers,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12824,2024-06-06,Boston Celtics,2,107,16-42,16,Dallas Mavericks,7,89,7-27,7,Boston Celtics,0
12825,2024-06-09,Boston Celtics,2,105,10-39,10,Dallas Mavericks,7,98,6-26,6,Boston Celtics,0
12826,2024-06-12,Dallas Mavericks,7,99,9-25,9,Boston Celtics,2,106,17-46,17,Boston Celtics,1
12827,2024-06-14,Dallas Mavericks,7,122,15-37,15,Boston Celtics,2,84,14-41,14,Dallas Mavericks,0


In [220]:
#Format as percentage
dfLim3['homeTeam_3P'] = dfLim3['homeTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim3['awayTeam_3P'] = dfLim3['awayTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim3['homeTeam_3P'] = dfLim3['homeTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim3['awayTeam_3P'] = dfLim3['awayTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)


Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,homeTeam_3P_made,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,awayTeam_3P_made,winner,winner_binary
0,2014-06-15,San Antonio Spurs,27,104,0.461538,12,Miami Heat,16,87,0.280000,7,San Antonio Spurs,0
1,2014-10-28,Los Angeles Lakers,14,90,0.300000,3,Houston Rockets,11,108,0.413793,12,Houston Rockets,1
2,2014-10-28,San Antonio Spurs,27,101,0.500000,14,Dallas Mavericks,7,100,0.380952,8,San Antonio Spurs,0
3,2014-10-28,New Orleans Pelicans,19,101,0.235294,4,Orlando Magic,22,84,0.363636,4,New Orleans Pelicans,0
4,2014-10-29,Portland Trail Blazers,25,106,0.379310,11,Oklahoma City Thunder,21,89,0.125000,2,Portland Trail Blazers,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12824,2024-06-06,Boston Celtics,2,107,0.380952,16,Dallas Mavericks,7,89,0.259259,7,Boston Celtics,0
12825,2024-06-09,Boston Celtics,2,105,0.256410,10,Dallas Mavericks,7,98,0.230769,6,Boston Celtics,0
12826,2024-06-12,Dallas Mavericks,7,99,0.360000,9,Boston Celtics,2,106,0.369565,17,Boston Celtics,1
12827,2024-06-14,Dallas Mavericks,7,122,0.405405,15,Boston Celtics,2,84,0.341463,14,Dallas Mavericks,0


In [221]:
dfIDtoStat3 = pd.DataFrame(columns=['date', 'team_id', 'team_points_total', '3P%', '3P_made'])
dfIDtoStat3

Unnamed: 0,date,team_id,team_points_total,3P%,3P_made


In [222]:
#populate dfIDtoStat3
for index, row in dfLim3.iterrows():
    date = row['date']
    homeTeam_id = row['homeTeamSubject_id']
    homeTeam_points_total = row['homeTeam_points_total']
    homeTeam_3P = row['homeTeam_3P']
    #added homeTeam3P_made
    homeTeam_3P_made = row['homeTeam_3P_made']
    awayTeam_id = row['awayTeamSubject_id']
    awayTeam_points_total = row['awayTeam_points_total']
    awayTeam_3P = row['homeTeam_3P']
    #added awayTeam3P_made
    awayTeam_3P_made = row['awayTeam_3P_made']
    #added these new variables into this
    dfIDtoStat3.loc[len(dfIDtoStat3)] = {"date": date, 'team_id': homeTeam_id, 'team_points_total':homeTeam_points_total, '3P%':homeTeam_3P, '3P_made':homeTeam_3P_made}
    dfIDtoStat3.loc[len(dfIDtoStat3)] = {"date": date, 'team_id': awayTeam_id, 'team_points_total':awayTeam_points_total, '3P%':awayTeam_3P, '3P_made':awayTeam_3P_made}
dfIDtoStat3

Unnamed: 0,date,team_id,team_points_total,3P%,3P_made
0,2014-06-15,27,104,0.461538,12
1,2014-06-15,16,87,0.461538,7
2,2014-10-28,14,90,0.300000,3
3,2014-10-28,11,108,0.300000,12
4,2014-10-28,27,101,0.500000,14
...,...,...,...,...,...
25653,2024-06-12,2,106,0.360000,17
25654,2024-06-14,7,122,0.405405,15
25655,2024-06-14,2,84,0.405405,14
25656,2024-06-17,2,106,0.333333,13


In [223]:
#reverse df order - should now start with 2014
dfLim3 = dfLim3.iloc[::-1].reset_index(drop=True)
dfLim3

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,homeTeam_3P_made,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,awayTeam_3P_made,winner,winner_binary
0,2024-06-17,Boston Celtics,2,106,0.333333,13,Dallas Mavericks,7,88,0.297297,11,Boston Celtics,0
1,2024-06-14,Dallas Mavericks,7,122,0.405405,15,Boston Celtics,2,84,0.341463,14,Dallas Mavericks,0
2,2024-06-12,Dallas Mavericks,7,99,0.360000,9,Boston Celtics,2,106,0.369565,17,Boston Celtics,1
3,2024-06-09,Boston Celtics,2,105,0.256410,10,Dallas Mavericks,7,98,0.230769,6,Boston Celtics,0
4,2024-06-06,Boston Celtics,2,107,0.380952,16,Dallas Mavericks,7,89,0.259259,7,Boston Celtics,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12824,2014-10-29,Portland Trail Blazers,25,106,0.379310,11,Oklahoma City Thunder,21,89,0.125000,2,Portland Trail Blazers,0
12825,2014-10-28,New Orleans Pelicans,19,101,0.235294,4,Orlando Magic,22,84,0.363636,4,New Orleans Pelicans,0
12826,2014-10-28,San Antonio Spurs,27,101,0.500000,14,Dallas Mavericks,7,100,0.380952,8,San Antonio Spurs,0
12827,2014-10-28,Los Angeles Lakers,14,90,0.300000,3,Houston Rockets,11,108,0.413793,12,Houston Rockets,1


In [224]:
#reverse order - should now start with 2014
dfIDtoStat3 = dfIDtoStat3.iloc[::-1].reset_index(drop=True)
dfIDtoStat3

Unnamed: 0,date,team_id,team_points_total,3P%,3P_made
0,2024-06-17,7,88,0.333333,11
1,2024-06-17,2,106,0.333333,13
2,2024-06-14,2,84,0.405405,14
3,2024-06-14,7,122,0.405405,15
4,2024-06-12,2,106,0.360000,17
...,...,...,...,...,...
25653,2014-10-28,27,101,0.500000,14
25654,2014-10-28,11,108,0.300000,12
25655,2014-10-28,14,90,0.300000,3
25656,2014-06-15,16,87,0.461538,7


In [225]:
#X1.Shape is num data, window size, variables
X3, y3 = df_to_X_y(dfLim3, dfIDtoStat3, 5)
X3.shape, y3.shape

((12734, 5, 6), (12734,))

In [226]:
#70-15-15 split
X3_train, X3_temp, y3_train, y3_temp = train_test_split(X3, y3, test_size=0.3, random_state=42)
X3_val, X3_test, y3_val, y3_test = train_test_split(X3_temp, y3_temp, test_size=0.5, random_state=42)
X3_train.shape, y3_train.shape, X3_val.shape, y3_val.shape, X3_test.shape, y3_test.shape

((8913, 5, 6), (8913,), (1910, 5, 6), (1910,), (1911, 5, 6), (1911,))

In [227]:
model3 = Sequential()
#You need to change the input layer to reflect the amount of variables. first number is window size, second is variables
model3.add(InputLayer((5, 6)))
model3.add(LSTM(64, activation='tanh', return_sequences=True))
model3.add(LSTM(32, activation='tanh'))
model3.add(Dense(16, activation='relu'))
model3.add(Dense(8, activation='relu'))
model3.add(Dense(1, activation='sigmoid'))

model3.summary()

In [228]:
earlystopping = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)
model3.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [229]:
model3.fit(X3_train, y3_train, validation_data=(X3_val, y3_val), epochs=100, callbacks=[earlystopping])


Epoch 1/100
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5841 - loss: 0.6756 - val_accuracy: 0.5775 - val_loss: 0.6811
Epoch 2/100
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5845 - loss: 0.6744 - val_accuracy: 0.5832 - val_loss: 0.6748
Epoch 3/100
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5849 - loss: 0.6710 - val_accuracy: 0.5723 - val_loss: 0.6802
Epoch 4/100
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5746 - loss: 0.6768 - val_accuracy: 0.5749 - val_loss: 0.7016
Epoch 5/100
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5853 - loss: 0.6744 - val_accuracy: 0.5827 - val_loss: 0.6761
Epoch 6/100
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5768 - loss: 0.6751 - val_accuracy: 0.5822 - val_loss: 0.6766
Epoch 7/100
[1m279/27

<keras.src.callbacks.history.History at 0x380aa7d40>

In [230]:
y_pred = model3.predict(X3_test)
y_pred_binary = (y_pred > 0.5).astype(int)
print(accuracy_score(y3_test, y_pred_binary))

[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
0.5934065934065934


In [231]:
dfLim4 = dfLim3.copy()  # Copy the original DataFrame

# Get the position of 'homeTeam_3P'
position = dfLim4.columns.get_loc('homeTeam_3P')

# Insert 'homeTeam_FG' after 'shomeTeam_3P'
dfLim4.insert(position + 1, 'homeTeam_FG', df['homeTeam_FG'])

position2 = dfLim4.columns.get_loc('awayTeam_3P')
# Insert 'awayTeam_FG' right after 'awayTeam_3P'
dfLim4.insert(position2 + 1, 'awayTeam_FG', df['awayTeam_FG'])

dfLim4

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,homeTeam_FG,homeTeam_3P_made,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,awayTeam_FG,awayTeam_3P_made,winner,winner_binary
0,2024-06-17,Boston Celtics,2,106,0.333333,37-78,13,Dallas Mavericks,7,88,0.297297,30-75,11,Boston Celtics,0
1,2024-06-14,Dallas Mavericks,7,122,0.405405,28-79,15,Boston Celtics,2,84,0.341463,31-73,14,Dallas Mavericks,0
2,2024-06-12,Dallas Mavericks,7,99,0.360000,37-70,9,Boston Celtics,2,106,0.369565,38-78,17,Boston Celtics,1
3,2024-06-09,Boston Celtics,2,105,0.256410,41-101,10,Dallas Mavericks,7,98,0.230769,32-84,6,Boston Celtics,0
4,2024-06-06,Boston Celtics,2,107,0.380952,39-87,16,Dallas Mavericks,7,89,0.259259,33-81,7,Boston Celtics,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12824,2014-10-29,Portland Trail Blazers,25,106,0.379310,39-82,11,Oklahoma City Thunder,21,89,0.125000,35-84,2,Portland Trail Blazers,0
12825,2014-10-28,New Orleans Pelicans,19,101,0.235294,38-84,4,Orlando Magic,22,84,0.363636,38-80,4,New Orleans Pelicans,0
12826,2014-10-28,San Antonio Spurs,27,101,0.500000,38-86,14,Dallas Mavericks,7,100,0.380952,38-82,8,San Antonio Spurs,0
12827,2014-10-28,Los Angeles Lakers,14,90,0.300000,46-91,3,Houston Rockets,11,108,0.413793,29-80,12,Houston Rockets,1


In [232]:
dfLim4.insert(dfLim4.columns.get_loc('homeTeam_FG') + 1, 'homeTeam_FG_made', dfLim4['homeTeam_FG'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))
dfLim4.insert(dfLim4.columns.get_loc('awayTeam_FG') + 1, 'awayTeam_FG_made', dfLim4['awayTeam_FG'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))
dfLim4

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,homeTeam_FG,homeTeam_FG_made,homeTeam_3P_made,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,awayTeam_FG,awayTeam_FG_made,awayTeam_3P_made,winner,winner_binary
0,2024-06-17,Boston Celtics,2,106,0.333333,37-78,37,13,Dallas Mavericks,7,88,0.297297,30-75,30,11,Boston Celtics,0
1,2024-06-14,Dallas Mavericks,7,122,0.405405,28-79,28,15,Boston Celtics,2,84,0.341463,31-73,31,14,Dallas Mavericks,0
2,2024-06-12,Dallas Mavericks,7,99,0.360000,37-70,37,9,Boston Celtics,2,106,0.369565,38-78,38,17,Boston Celtics,1
3,2024-06-09,Boston Celtics,2,105,0.256410,41-101,41,10,Dallas Mavericks,7,98,0.230769,32-84,32,6,Boston Celtics,0
4,2024-06-06,Boston Celtics,2,107,0.380952,39-87,39,16,Dallas Mavericks,7,89,0.259259,33-81,33,7,Boston Celtics,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12824,2014-10-29,Portland Trail Blazers,25,106,0.379310,39-82,39,11,Oklahoma City Thunder,21,89,0.125000,35-84,35,2,Portland Trail Blazers,0
12825,2014-10-28,New Orleans Pelicans,19,101,0.235294,38-84,38,4,Orlando Magic,22,84,0.363636,38-80,38,4,New Orleans Pelicans,0
12826,2014-10-28,San Antonio Spurs,27,101,0.500000,38-86,38,14,Dallas Mavericks,7,100,0.380952,38-82,38,8,San Antonio Spurs,0
12827,2014-10-28,Los Angeles Lakers,14,90,0.300000,46-91,46,3,Houston Rockets,11,108,0.413793,29-80,29,12,Houston Rockets,1


In [233]:
#Format as percentage
dfLim4['homeTeam_FG'] = dfLim4['homeTeam_FG'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim4['awayTeam_FG'] = dfLim4['awayTeam_FG'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim4

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,homeTeam_FG,homeTeam_FG_made,homeTeam_3P_made,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,awayTeam_FG,awayTeam_FG_made,awayTeam_3P_made,winner,winner_binary
0,2024-06-17,Boston Celtics,2,106,0.333333,0.474359,37,13,Dallas Mavericks,7,88,0.297297,0.400000,30,11,Boston Celtics,0
1,2024-06-14,Dallas Mavericks,7,122,0.405405,0.354430,28,15,Boston Celtics,2,84,0.341463,0.424658,31,14,Dallas Mavericks,0
2,2024-06-12,Dallas Mavericks,7,99,0.360000,0.528571,37,9,Boston Celtics,2,106,0.369565,0.487179,38,17,Boston Celtics,1
3,2024-06-09,Boston Celtics,2,105,0.256410,0.405941,41,10,Dallas Mavericks,7,98,0.230769,0.380952,32,6,Boston Celtics,0
4,2024-06-06,Boston Celtics,2,107,0.380952,0.448276,39,16,Dallas Mavericks,7,89,0.259259,0.407407,33,7,Boston Celtics,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12824,2014-10-29,Portland Trail Blazers,25,106,0.379310,0.475610,39,11,Oklahoma City Thunder,21,89,0.125000,0.416667,35,2,Portland Trail Blazers,0
12825,2014-10-28,New Orleans Pelicans,19,101,0.235294,0.452381,38,4,Orlando Magic,22,84,0.363636,0.475000,38,4,New Orleans Pelicans,0
12826,2014-10-28,San Antonio Spurs,27,101,0.500000,0.441860,38,14,Dallas Mavericks,7,100,0.380952,0.463415,38,8,San Antonio Spurs,0
12827,2014-10-28,Los Angeles Lakers,14,90,0.300000,0.505495,46,3,Houston Rockets,11,108,0.413793,0.362500,29,12,Houston Rockets,1


In [234]:
dfIDtoStat4 = pd.DataFrame(columns=['date', 'team_id', 'team_points_total', '3P%', '3P_made', 'FG%', 'FG_made' ])
dfIDtoStat4

Unnamed: 0,date,team_id,team_points_total,3P%,3P_made,FG%,FG_made


In [235]:
for index, row in dfLim4.iterrows():
    date = row['date']
    homeTeam_id = row['homeTeamSubject_id']
    homeTeam_points_total = row['homeTeam_points_total']
    homeTeam_3P = row['homeTeam_3P']
    homeTeam_3P_made = row['homeTeam_3P_made']
     #added homeTeamFG_made
    homeTeam_FG = row['homeTeam_FG']
    homeTeam_FG_made = row['homeTeam_FG_made']
    awayTeam_id = row['awayTeamSubject_id']
    awayTeam_points_total = row['awayTeam_points_total']
    awayTeam_3P = row['homeTeam_3P']
    awayTeam_3P_made = row['awayTeam_3P_made']
    #added awayTeamFG
    awayTeam_FG = row['awayTeam_FG']
    awayTeam_FG_made = row['awayTeam_FG_made']
    #added these new variables into this
    dfIDtoStat4.loc[len(dfIDtoStat4)] = {"date": date, 'team_id': homeTeam_id, 'team_points_total':homeTeam_points_total, '3P%':homeTeam_3P, '3P_made':homeTeam_3P_made, 'FG%':homeTeam_FG, 'FG_made':homeTeam_FG_made}
    dfIDtoStat4.loc[len(dfIDtoStat4)] = {"date": date, 'team_id': awayTeam_id, 'team_points_total':awayTeam_points_total, '3P%':awayTeam_3P, '3P_made':awayTeam_3P_made, 'FG%':awayTeam_FG, 'FG_made':awayTeam_FG_made}
dfIDtoStat4

Unnamed: 0,date,team_id,team_points_total,3P%,3P_made,FG%,FG_made
0,2024-06-17,2,106,0.333333,13,0.474359,37
1,2024-06-17,7,88,0.333333,11,0.400000,30
2,2024-06-14,7,122,0.405405,15,0.354430,28
3,2024-06-14,2,84,0.405405,14,0.424658,31
4,2024-06-12,7,99,0.360000,9,0.528571,37
...,...,...,...,...,...,...,...
25653,2014-10-28,7,100,0.500000,8,0.463415,38
25654,2014-10-28,14,90,0.300000,3,0.505495,46
25655,2014-10-28,11,108,0.300000,12,0.362500,29
25656,2014-06-15,27,104,0.461538,12,0.426966,38


In [236]:
#X1.Shape is num data, window size, variables
X4, y4 = df_to_X_y(dfLim4, dfIDtoStat4, 6)
X4.shape, y4.shape

((12718, 6, 10), (12718,))

In [237]:
#70-15-15 split
X4_train, X4_temp, y4_train, y4_temp = train_test_split(X4, y4, test_size=0.3, random_state=42)
X4_val, X4_test, y4_val, y4_test = train_test_split(X4_temp, y4_temp, test_size=0.5, random_state=42)
X4_train.shape, y4_train.shape, X4_val.shape, y4_val.shape, X4_test.shape, y4_test.shape

((8902, 6, 10), (8902,), (1908, 6, 10), (1908,), (1908, 6, 10), (1908,))

In [238]:
model4 = Sequential()
#You need to change the input layer to reflect the amount of variables. first number is window size, second is variables
model4.add(InputLayer((5, 10)))
model4.add(LSTM(64, activation='tanh', return_sequences=True))
model4.add(LSTM(32, activation='tanh'))
model4.add(Dense(16, activation='relu'))
model4.add(Dense(8, activation='relu'))
model4.add(Dense(1, activation='sigmoid'))

model4.summary()

In [239]:
earlystopping = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)
model4.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [240]:
model4.fit(X4_train, y4_train, validation_data=(X4_val, y4_val), epochs=100, callbacks=[earlystopping])

Epoch 1/100
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.5753 - loss: 0.6797 - val_accuracy: 0.5723 - val_loss: 0.6789
Epoch 2/100
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5908 - loss: 0.6723 - val_accuracy: 0.5781 - val_loss: 0.6774
Epoch 3/100
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5771 - loss: 0.6737 - val_accuracy: 0.5781 - val_loss: 0.6774
Epoch 4/100
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5769 - loss: 0.6731 - val_accuracy: 0.5744 - val_loss: 0.6766
Epoch 5/100
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5852 - loss: 0.6711 - val_accuracy: 0.5718 - val_loss: 0.6774
Epoch 6/100
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5848 - loss: 0.6718 - val_accuracy: 0.5729 - val_loss: 0.6783
Epoch 7/100
[1m279/27

<keras.src.callbacks.history.History at 0x3815230e0>

In [241]:
y_pred = model4.predict(X4_test)
y_pred_binary = (y_pred > 0.5).astype(int)
print(accuracy_score(y4_test, y_pred_binary))

[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
0.6032494758909853


In [242]:
#Adding ast, reb, and winning percentages
dfLim5 = df[[
    "date",
    "homeTeam",
    "homeTeamSubject_id", 
    "homeTeam_points_total", 
    "homeTeam_3P",
    "homeTeam_FG",
    #added homeTeamReb and homeTeamAst
    "homeTeam_Total_Reb",
    "homeTeam_Ast",
    "homeWinPct",
    "awayTeam",
    "awayTeamSubject_id", 
    "awayTeam_points_total",
    "awayTeam_3P",
    "awayTeam_FG",
    #added awayTeamReb and awayTeamAst
    "awayTeam_Total_Reb",
    "awayTeam_Ast",
    "awayWinPct",
    "winner"
]]
dfLim5

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,homeTeam_FG,homeTeam_Total_Reb,homeTeam_Ast,homeWinPct,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,awayTeam_FG,awayTeam_Total_Reb,awayTeam_Ast,awayWinPct,winner
0,2014-06-15,San Antonio Spurs,27,104,12-26,37-78,40,25,74.193548,Miami Heat,16,87,7-25,30-75,41,14,64.406780,San Antonio Spurs
1,2014-10-28,Los Angeles Lakers,14,90,3-10,28-79,36,16,0.000000,Houston Rockets,11,108,12-29,31-73,47,22,100.000000,Houston Rockets
2,2014-10-28,San Antonio Spurs,27,101,14-28,37-70,38,23,100.000000,Dallas Mavericks,7,100,8-21,38-78,33,17,0.000000,San Antonio Spurs
3,2014-10-28,New Orleans Pelicans,19,101,4-17,41-101,62,20,100.000000,Orlando Magic,22,84,4-11,32-84,56,17,0.000000,New Orleans Pelicans
4,2014-10-29,Portland Trail Blazers,25,106,11-29,39-87,42,23,100.000000,Oklahoma City Thunder,21,89,2-16,33-81,43,19,0.000000,Portland Trail Blazers
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12824,2024-06-06,Boston Celtics,2,107,16-42,39-82,47,23,79.381443,Dallas Mavericks,7,89,7-27,35-84,43,9,62.000000,Boston Celtics
12825,2024-06-09,Boston Celtics,2,105,10-39,38-84,41,29,79.591837,Dallas Mavericks,7,98,6-26,38-80,43,21,61.386139,Boston Celtics
12826,2024-06-12,Dallas Mavericks,7,99,9-25,38-86,43,15,60.784314,Boston Celtics,2,106,17-46,38-82,36,26,79.797980,Boston Celtics
12827,2024-06-14,Dallas Mavericks,7,122,15-37,46-91,52,21,61.165049,Boston Celtics,2,84,14-41,29-80,31,18,79.000000,Dallas Mavericks


In [243]:
#No need to reformat dates with new dataset

In [244]:
dfLim5['winner_binary'] = (dfLim5['winner'] == df['awayTeam']).astype(int)
dfLim5

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim5['winner_binary'] = (dfLim5['winner'] == df['awayTeam']).astype(int)


Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,homeTeam_FG,homeTeam_Total_Reb,homeTeam_Ast,homeWinPct,awayTeam,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,awayTeam_FG,awayTeam_Total_Reb,awayTeam_Ast,awayWinPct,winner,winner_binary
0,2014-06-15,San Antonio Spurs,27,104,12-26,37-78,40,25,74.193548,Miami Heat,16,87,7-25,30-75,41,14,64.406780,San Antonio Spurs,0
1,2014-10-28,Los Angeles Lakers,14,90,3-10,28-79,36,16,0.000000,Houston Rockets,11,108,12-29,31-73,47,22,100.000000,Houston Rockets,1
2,2014-10-28,San Antonio Spurs,27,101,14-28,37-70,38,23,100.000000,Dallas Mavericks,7,100,8-21,38-78,33,17,0.000000,San Antonio Spurs,0
3,2014-10-28,New Orleans Pelicans,19,101,4-17,41-101,62,20,100.000000,Orlando Magic,22,84,4-11,32-84,56,17,0.000000,New Orleans Pelicans,0
4,2014-10-29,Portland Trail Blazers,25,106,11-29,39-87,42,23,100.000000,Oklahoma City Thunder,21,89,2-16,33-81,43,19,0.000000,Portland Trail Blazers,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12824,2024-06-06,Boston Celtics,2,107,16-42,39-82,47,23,79.381443,Dallas Mavericks,7,89,7-27,35-84,43,9,62.000000,Boston Celtics,0
12825,2024-06-09,Boston Celtics,2,105,10-39,38-84,41,29,79.591837,Dallas Mavericks,7,98,6-26,38-80,43,21,61.386139,Boston Celtics,0
12826,2024-06-12,Dallas Mavericks,7,99,9-25,38-86,43,15,60.784314,Boston Celtics,2,106,17-46,38-82,36,26,79.797980,Boston Celtics,1
12827,2024-06-14,Dallas Mavericks,7,122,15-37,46-91,52,21,61.165049,Boston Celtics,2,84,14-41,29-80,31,18,79.000000,Dallas Mavericks,0


In [245]:
#adding 3P made
dfLim5.insert(dfLim5.columns.get_loc('homeTeam_3P') + 1, 'homeTeam_3P_made', dfLim5['homeTeam_3P'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))
dfLim5.insert(dfLim5.columns.get_loc('awayTeam_3P') + 1, 'awayTeam_3P_made', dfLim5['awayTeam_3P'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))
dfLim5

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,homeTeam_3P_made,homeTeam_FG,homeTeam_Total_Reb,homeTeam_Ast,homeWinPct,...,awayTeamSubject_id,awayTeam_points_total,awayTeam_3P,awayTeam_3P_made,awayTeam_FG,awayTeam_Total_Reb,awayTeam_Ast,awayWinPct,winner,winner_binary
0,2014-06-15,San Antonio Spurs,27,104,12-26,12,37-78,40,25,74.193548,...,16,87,7-25,7,30-75,41,14,64.406780,San Antonio Spurs,0
1,2014-10-28,Los Angeles Lakers,14,90,3-10,3,28-79,36,16,0.000000,...,11,108,12-29,12,31-73,47,22,100.000000,Houston Rockets,1
2,2014-10-28,San Antonio Spurs,27,101,14-28,14,37-70,38,23,100.000000,...,7,100,8-21,8,38-78,33,17,0.000000,San Antonio Spurs,0
3,2014-10-28,New Orleans Pelicans,19,101,4-17,4,41-101,62,20,100.000000,...,22,84,4-11,4,32-84,56,17,0.000000,New Orleans Pelicans,0
4,2014-10-29,Portland Trail Blazers,25,106,11-29,11,39-87,42,23,100.000000,...,21,89,2-16,2,33-81,43,19,0.000000,Portland Trail Blazers,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12824,2024-06-06,Boston Celtics,2,107,16-42,16,39-82,47,23,79.381443,...,7,89,7-27,7,35-84,43,9,62.000000,Boston Celtics,0
12825,2024-06-09,Boston Celtics,2,105,10-39,10,38-84,41,29,79.591837,...,7,98,6-26,6,38-80,43,21,61.386139,Boston Celtics,0
12826,2024-06-12,Dallas Mavericks,7,99,9-25,9,38-86,43,15,60.784314,...,2,106,17-46,17,38-82,36,26,79.797980,Boston Celtics,1
12827,2024-06-14,Dallas Mavericks,7,122,15-37,15,46-91,52,21,61.165049,...,2,84,14-41,14,29-80,31,18,79.000000,Dallas Mavericks,0


In [246]:
#adding FG made
dfLim5.insert(dfLim5.columns.get_loc('homeTeam_FG') + 1, 'homeTeam_FG_made', dfLim5['homeTeam_FG'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))
dfLim5.insert(dfLim5.columns.get_loc('awayTeam_FG') + 1, 'awayTeam_FG_made', dfLim5['awayTeam_FG'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))
dfLim5

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,homeTeam_3P_made,homeTeam_FG,homeTeam_FG_made,homeTeam_Total_Reb,homeTeam_Ast,...,awayTeam_points_total,awayTeam_3P,awayTeam_3P_made,awayTeam_FG,awayTeam_FG_made,awayTeam_Total_Reb,awayTeam_Ast,awayWinPct,winner,winner_binary
0,2014-06-15,San Antonio Spurs,27,104,12-26,12,37-78,37,40,25,...,87,7-25,7,30-75,30,41,14,64.406780,San Antonio Spurs,0
1,2014-10-28,Los Angeles Lakers,14,90,3-10,3,28-79,28,36,16,...,108,12-29,12,31-73,31,47,22,100.000000,Houston Rockets,1
2,2014-10-28,San Antonio Spurs,27,101,14-28,14,37-70,37,38,23,...,100,8-21,8,38-78,38,33,17,0.000000,San Antonio Spurs,0
3,2014-10-28,New Orleans Pelicans,19,101,4-17,4,41-101,41,62,20,...,84,4-11,4,32-84,32,56,17,0.000000,New Orleans Pelicans,0
4,2014-10-29,Portland Trail Blazers,25,106,11-29,11,39-87,39,42,23,...,89,2-16,2,33-81,33,43,19,0.000000,Portland Trail Blazers,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12824,2024-06-06,Boston Celtics,2,107,16-42,16,39-82,39,47,23,...,89,7-27,7,35-84,35,43,9,62.000000,Boston Celtics,0
12825,2024-06-09,Boston Celtics,2,105,10-39,10,38-84,38,41,29,...,98,6-26,6,38-80,38,43,21,61.386139,Boston Celtics,0
12826,2024-06-12,Dallas Mavericks,7,99,9-25,9,38-86,38,43,15,...,106,17-46,17,38-82,38,36,26,79.797980,Boston Celtics,1
12827,2024-06-14,Dallas Mavericks,7,122,15-37,15,46-91,46,52,21,...,84,14-41,14,29-80,29,31,18,79.000000,Dallas Mavericks,0


In [247]:
#Format as percentage
dfLim5['homeTeam_3P'] = dfLim5['homeTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim5['awayTeam_3P'] = dfLim5['awayTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim5['homeTeam_FG'] = dfLim5['homeTeam_FG'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim5['awayTeam_FG'] = dfLim5['awayTeam_FG'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)

dfLim5

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim5['homeTeam_3P'] = dfLim5['homeTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim5['awayTeam_3P'] = dfLim5['awayTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https:

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_3P,homeTeam_3P_made,homeTeam_FG,homeTeam_FG_made,homeTeam_Total_Reb,homeTeam_Ast,...,awayTeam_points_total,awayTeam_3P,awayTeam_3P_made,awayTeam_FG,awayTeam_FG_made,awayTeam_Total_Reb,awayTeam_Ast,awayWinPct,winner,winner_binary
0,2014-06-15,San Antonio Spurs,27,104,0.461538,12,0.474359,37,40,25,...,87,0.280000,7,0.400000,30,41,14,64.406780,San Antonio Spurs,0
1,2014-10-28,Los Angeles Lakers,14,90,0.300000,3,0.354430,28,36,16,...,108,0.413793,12,0.424658,31,47,22,100.000000,Houston Rockets,1
2,2014-10-28,San Antonio Spurs,27,101,0.500000,14,0.528571,37,38,23,...,100,0.380952,8,0.487179,38,33,17,0.000000,San Antonio Spurs,0
3,2014-10-28,New Orleans Pelicans,19,101,0.235294,4,0.405941,41,62,20,...,84,0.363636,4,0.380952,32,56,17,0.000000,New Orleans Pelicans,0
4,2014-10-29,Portland Trail Blazers,25,106,0.379310,11,0.448276,39,42,23,...,89,0.125000,2,0.407407,33,43,19,0.000000,Portland Trail Blazers,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12824,2024-06-06,Boston Celtics,2,107,0.380952,16,0.475610,39,47,23,...,89,0.259259,7,0.416667,35,43,9,62.000000,Boston Celtics,0
12825,2024-06-09,Boston Celtics,2,105,0.256410,10,0.452381,38,41,29,...,98,0.230769,6,0.475000,38,43,21,61.386139,Boston Celtics,0
12826,2024-06-12,Dallas Mavericks,7,99,0.360000,9,0.441860,38,43,15,...,106,0.369565,17,0.463415,38,36,26,79.797980,Boston Celtics,1
12827,2024-06-14,Dallas Mavericks,7,122,0.405405,15,0.505495,46,52,21,...,84,0.341463,14,0.362500,29,31,18,79.000000,Dallas Mavericks,0


In [248]:
dfIDtoStat5 = pd.DataFrame(columns=['date', 'team_id', 'team_points_total', '3P%', '3P_made', 'FG%', 'FG_made', 'Total_Reb', 'team_Ast', 'WinPct'])
dfIDtoStat5

Unnamed: 0,date,team_id,team_points_total,3P%,3P_made,FG%,FG_made,Total_Reb,team_Ast,WinPct


In [249]:
# Create DataFrame for home team statistics
home_df = dfLim5[['date', 'homeTeamSubject_id', 'homeTeam_points_total', 'homeTeam_3P', 'homeTeam_3P_made', 'homeTeam_FG', 'homeTeam_FG_made', 'homeTeam_Total_Reb', 'homeTeam_Ast', 'homeWinPct']].copy()
home_df.columns = ['date', 'team_id', 'team_points_total', '3P%', '3P_made', 'FG%', 'FG_made', 'Total_Reb', 'team_Ast', 'WinPct']

# Create DataFrame for away team statistics
away_df = dfLim5[['date', 'awayTeamSubject_id', 'awayTeam_points_total', 'awayTeam_3P', 'awayTeam_3P_made', 'awayTeam_FG', 'awayTeam_FG_made', 'awayTeam_Total_Reb', 'awayTeam_Ast', 'awayWinPct']].copy()
away_df.columns = ['date', 'team_id', 'team_points_total', '3P%', '3P_made', 'FG%', 'FG_made', 'Total_Reb', 'team_Ast', 'WinPct']

# Combine both DataFrames
dfIDtoStat5 = pd.concat([home_df, away_df], ignore_index=True)

# Display the resulting DataFrame
dfIDtoStat5

Unnamed: 0,date,team_id,team_points_total,3P%,3P_made,FG%,FG_made,Total_Reb,team_Ast,WinPct
0,2014-06-15,27,104,0.461538,12,0.474359,37,40,25,74.193548
1,2014-10-28,14,90,0.300000,3,0.354430,28,36,16,0.000000
2,2014-10-28,27,101,0.500000,14,0.528571,37,38,23,100.000000
3,2014-10-28,19,101,0.235294,4,0.405941,41,62,20,100.000000
4,2014-10-29,25,106,0.379310,11,0.448276,39,42,23,100.000000
...,...,...,...,...,...,...,...,...,...,...
25653,2024-06-06,7,89,0.259259,7,0.416667,35,43,9,62.000000
25654,2024-06-09,7,98,0.230769,6,0.475000,38,43,21,61.386139
25655,2024-06-12,2,106,0.369565,17,0.463415,38,36,26,79.797980
25656,2024-06-14,2,84,0.341463,14,0.362500,29,31,18,79.000000


In [250]:
#No need to reverse df becasue new dataset is already reversed

In [251]:
#X1.Shape is num data, window size, variables
X5, y5 = df_to_X_y(dfLim5, dfIDtoStat5, 10)
X5.shape, y5.shape

((12656, 10, 16), (12656,))

In [252]:
#70-15-15 split
X5_train, X5_temp, y5_train, y5_temp = train_test_split(X5, y5, test_size=0.3, random_state=42)
X5_val, X5_test, y5_val, y5_test = train_test_split(X5_temp, y5_temp, test_size=0.5, random_state=42)
X5_train.shape, y5_train.shape, X5_val.shape, y5_val.shape, X5_test.shape, y5_test.shape

((8859, 10, 16), (8859,), (1898, 10, 16), (1898,), (1899, 10, 16), (1899,))

In [253]:
model5 = Sequential()
#You need to change the input layer to reflect the amount of variables. first number is window size, second is variables
model5.add(InputLayer((10, 16)))
model5.add(LSTM(64, activation='tanh', return_sequences=True))
model5.add(LSTM(32, activation='tanh'))
model5.add(Dense(16, activation='relu'))
model5.add(Dense(8, activation='relu'))
model5.add(Dense(1, activation='sigmoid'))

model5.summary()

In [254]:
earlystopping = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)
model5.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model5.fit(X5_train, y5_train, validation_data=(X5_val, y5_val), epochs=100, callbacks=[earlystopping])

Epoch 1/100
[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.6220 - loss: 0.6528 - val_accuracy: 0.6122 - val_loss: 0.6684
Epoch 2/100
[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6497 - loss: 0.6308 - val_accuracy: 0.6043 - val_loss: 0.6570
Epoch 3/100
[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6480 - loss: 0.6276 - val_accuracy: 0.6249 - val_loss: 0.6578
Epoch 4/100
[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6437 - loss: 0.6339 - val_accuracy: 0.6043 - val_loss: 0.6617
Epoch 5/100
[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6482 - loss: 0.6279 - val_accuracy: 0.6001 - val_loss: 0.6603
Epoch 6/100
[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6424 - loss: 0.6324 - val_accuracy: 0.6143 - val_loss: 0.6563
Epoch 7/100
[1m277/27

<keras.src.callbacks.history.History at 0x38156e990>

In [255]:
y_pred = model5.predict(X5_test)
y_pred_binary = (y_pred > 0.5).astype(int)
print(accuracy_score(y5_test, y_pred_binary))

[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
0.65086887835703


In [51]:
#Adding ast, reb, and winning percentages
dfLim6 = df[[
    "date",
    "homeTeam",
    "homeTeamSubject_id", 
    "homeTeam_points_total", 
    #Added Home steal and block
    "homeTeam_Stl",
    "homeTeam_Blk",
    "homeTeam_3P",
    "homeTeam_FG",
    "homeTeam_Total_Reb",
    "homeTeam_Ast",
    "homeWinPct",
    "awayTeam",
    "awayTeamSubject_id", 
    "awayTeam_points_total",
    #added away steal and block
    "awayTeam_Stl",
    "awayTeam_Blk",
    "awayTeam_3P",
    "awayTeam_FG",
    "awayTeam_Total_Reb",
    "awayTeam_Ast",
    "awayWinPct",
    "winner"
]]
dfLim6

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_Stl,homeTeam_Blk,homeTeam_3P,homeTeam_FG,homeTeam_Total_Reb,homeTeam_Ast,...,awayTeamSubject_id,awayTeam_points_total,awayTeam_Stl,awayTeam_Blk,awayTeam_3P,awayTeam_FG,awayTeam_Total_Reb,awayTeam_Ast,awayWinPct,winner
0,2014-06-15,San Antonio Spurs,27,104,5,4,12-26,37-78,40,25,...,16,87,5,4,7-25,30-75,41,14,64.406780,San Antonio Spurs
1,2014-10-28,Los Angeles Lakers,14,90,7,3,3-10,28-79,36,16,...,11,108,7,3,12-29,31-73,47,22,100.000000,Houston Rockets
2,2014-10-28,San Antonio Spurs,27,101,5,3,14-28,37-70,38,23,...,7,100,9,3,8-21,38-78,33,17,0.000000,San Antonio Spurs
3,2014-10-28,New Orleans Pelicans,19,101,10,17,4-17,41-101,62,20,...,22,84,5,9,4-11,32-84,56,17,0.000000,New Orleans Pelicans
4,2014-10-29,Portland Trail Blazers,25,106,4,9,11-29,39-87,42,23,...,21,89,9,5,2-16,33-81,43,19,0.000000,Portland Trail Blazers
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12824,2024-06-06,Boston Celtics,2,107,6,9,16-42,39-82,47,23,...,7,89,8,1,7-27,35-84,43,9,62.000000,Boston Celtics
12825,2024-06-09,Boston Celtics,2,105,10,5,10-39,38-84,41,29,...,7,98,5,3,6-26,38-80,43,21,61.386139,Boston Celtics
12826,2024-06-12,Dallas Mavericks,7,99,5,1,9-25,38-86,43,15,...,2,106,4,6,17-46,38-82,36,26,79.797980,Boston Celtics
12827,2024-06-14,Dallas Mavericks,7,122,7,2,15-37,46-91,52,21,...,2,84,2,5,14-41,29-80,31,18,79.000000,Dallas Mavericks


In [52]:
dfLim6['winner_binary'] = (dfLim6['winner'] == df['awayTeam']).astype(int)
dfLim6

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim6['winner_binary'] = (dfLim6['winner'] == df['awayTeam']).astype(int)


Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_Stl,homeTeam_Blk,homeTeam_3P,homeTeam_FG,homeTeam_Total_Reb,homeTeam_Ast,...,awayTeam_points_total,awayTeam_Stl,awayTeam_Blk,awayTeam_3P,awayTeam_FG,awayTeam_Total_Reb,awayTeam_Ast,awayWinPct,winner,winner_binary
0,2014-06-15,San Antonio Spurs,27,104,5,4,12-26,37-78,40,25,...,87,5,4,7-25,30-75,41,14,64.406780,San Antonio Spurs,0
1,2014-10-28,Los Angeles Lakers,14,90,7,3,3-10,28-79,36,16,...,108,7,3,12-29,31-73,47,22,100.000000,Houston Rockets,1
2,2014-10-28,San Antonio Spurs,27,101,5,3,14-28,37-70,38,23,...,100,9,3,8-21,38-78,33,17,0.000000,San Antonio Spurs,0
3,2014-10-28,New Orleans Pelicans,19,101,10,17,4-17,41-101,62,20,...,84,5,9,4-11,32-84,56,17,0.000000,New Orleans Pelicans,0
4,2014-10-29,Portland Trail Blazers,25,106,4,9,11-29,39-87,42,23,...,89,9,5,2-16,33-81,43,19,0.000000,Portland Trail Blazers,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12824,2024-06-06,Boston Celtics,2,107,6,9,16-42,39-82,47,23,...,89,8,1,7-27,35-84,43,9,62.000000,Boston Celtics,0
12825,2024-06-09,Boston Celtics,2,105,10,5,10-39,38-84,41,29,...,98,5,3,6-26,38-80,43,21,61.386139,Boston Celtics,0
12826,2024-06-12,Dallas Mavericks,7,99,5,1,9-25,38-86,43,15,...,106,4,6,17-46,38-82,36,26,79.797980,Boston Celtics,1
12827,2024-06-14,Dallas Mavericks,7,122,7,2,15-37,46-91,52,21,...,84,2,5,14-41,29-80,31,18,79.000000,Dallas Mavericks,0


In [53]:
dfLim6.insert(dfLim6.columns.get_loc('homeTeam_3P') + 1, 'homeTeam_3P_made', dfLim6['homeTeam_3P'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))
dfLim6.insert(dfLim6.columns.get_loc('awayTeam_3P') + 1, 'awayTeam_3P_made', dfLim6['awayTeam_3P'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))

#adding FG made
dfLim6.insert(dfLim6.columns.get_loc('homeTeam_FG') + 1, 'homeTeam_FG_made', dfLim6['homeTeam_FG'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))
dfLim6.insert(dfLim6.columns.get_loc('awayTeam_FG') + 1, 'awayTeam_FG_made', dfLim6['awayTeam_FG'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))
dfLim6

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_Stl,homeTeam_Blk,homeTeam_3P,homeTeam_3P_made,homeTeam_FG,homeTeam_FG_made,...,awayTeam_Blk,awayTeam_3P,awayTeam_3P_made,awayTeam_FG,awayTeam_FG_made,awayTeam_Total_Reb,awayTeam_Ast,awayWinPct,winner,winner_binary
0,2014-06-15,San Antonio Spurs,27,104,5,4,12-26,12,37-78,37,...,4,7-25,7,30-75,30,41,14,64.406780,San Antonio Spurs,0
1,2014-10-28,Los Angeles Lakers,14,90,7,3,3-10,3,28-79,28,...,3,12-29,12,31-73,31,47,22,100.000000,Houston Rockets,1
2,2014-10-28,San Antonio Spurs,27,101,5,3,14-28,14,37-70,37,...,3,8-21,8,38-78,38,33,17,0.000000,San Antonio Spurs,0
3,2014-10-28,New Orleans Pelicans,19,101,10,17,4-17,4,41-101,41,...,9,4-11,4,32-84,32,56,17,0.000000,New Orleans Pelicans,0
4,2014-10-29,Portland Trail Blazers,25,106,4,9,11-29,11,39-87,39,...,5,2-16,2,33-81,33,43,19,0.000000,Portland Trail Blazers,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12824,2024-06-06,Boston Celtics,2,107,6,9,16-42,16,39-82,39,...,1,7-27,7,35-84,35,43,9,62.000000,Boston Celtics,0
12825,2024-06-09,Boston Celtics,2,105,10,5,10-39,10,38-84,38,...,3,6-26,6,38-80,38,43,21,61.386139,Boston Celtics,0
12826,2024-06-12,Dallas Mavericks,7,99,5,1,9-25,9,38-86,38,...,6,17-46,17,38-82,38,36,26,79.797980,Boston Celtics,1
12827,2024-06-14,Dallas Mavericks,7,122,7,2,15-37,15,46-91,46,...,5,14-41,14,29-80,29,31,18,79.000000,Dallas Mavericks,0


In [54]:
#Format as percentage
dfLim6['homeTeam_3P'] = dfLim6['homeTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim6['awayTeam_3P'] = dfLim6['awayTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim6['homeTeam_FG'] = dfLim6['homeTeam_FG'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim6['awayTeam_FG'] = dfLim6['awayTeam_FG'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)

dfLim6

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim6['homeTeam_3P'] = dfLim6['homeTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim6['awayTeam_3P'] = dfLim6['awayTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https:

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_Stl,homeTeam_Blk,homeTeam_3P,homeTeam_3P_made,homeTeam_FG,homeTeam_FG_made,...,awayTeam_Blk,awayTeam_3P,awayTeam_3P_made,awayTeam_FG,awayTeam_FG_made,awayTeam_Total_Reb,awayTeam_Ast,awayWinPct,winner,winner_binary
0,2014-06-15,San Antonio Spurs,27,104,5,4,0.461538,12,0.474359,37,...,4,0.280000,7,0.400000,30,41,14,64.406780,San Antonio Spurs,0
1,2014-10-28,Los Angeles Lakers,14,90,7,3,0.300000,3,0.354430,28,...,3,0.413793,12,0.424658,31,47,22,100.000000,Houston Rockets,1
2,2014-10-28,San Antonio Spurs,27,101,5,3,0.500000,14,0.528571,37,...,3,0.380952,8,0.487179,38,33,17,0.000000,San Antonio Spurs,0
3,2014-10-28,New Orleans Pelicans,19,101,10,17,0.235294,4,0.405941,41,...,9,0.363636,4,0.380952,32,56,17,0.000000,New Orleans Pelicans,0
4,2014-10-29,Portland Trail Blazers,25,106,4,9,0.379310,11,0.448276,39,...,5,0.125000,2,0.407407,33,43,19,0.000000,Portland Trail Blazers,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12824,2024-06-06,Boston Celtics,2,107,6,9,0.380952,16,0.475610,39,...,1,0.259259,7,0.416667,35,43,9,62.000000,Boston Celtics,0
12825,2024-06-09,Boston Celtics,2,105,10,5,0.256410,10,0.452381,38,...,3,0.230769,6,0.475000,38,43,21,61.386139,Boston Celtics,0
12826,2024-06-12,Dallas Mavericks,7,99,5,1,0.360000,9,0.441860,38,...,6,0.369565,17,0.463415,38,36,26,79.797980,Boston Celtics,1
12827,2024-06-14,Dallas Mavericks,7,122,7,2,0.405405,15,0.505495,46,...,5,0.341463,14,0.362500,29,31,18,79.000000,Dallas Mavericks,0


In [55]:
dfIDtoStat6 = pd.DataFrame(columns=['date', 'team_id', 'team_points_total', '3P%', '3P_made', 'FG%', 'FG_made', 'Total_Reb', 'team_Ast', 'team_Stl', 'team_Blk', 'WinPct'])
dfIDtoStat6

Unnamed: 0,date,team_id,team_points_total,3P%,3P_made,FG%,FG_made,Total_Reb,team_Ast,team_Stl,team_Blk,WinPct


In [56]:
# Create DataFrame for home team statistics
home_df6 = dfLim6[['date', 'homeTeamSubject_id', 'homeTeam_points_total', 'homeTeam_3P', 'homeTeam_3P_made', 'homeTeam_FG', 'homeTeam_FG_made', 'homeTeam_Total_Reb', 'homeTeam_Ast', 'homeTeam_Stl', 'homeTeam_Blk','homeWinPct']].copy()
home_df6.columns = ['date', 'team_id', 'team_points_total', '3P%', '3P_made', 'FG%', 'FG_made', 'Total_Reb', 'team_Ast', 'team_Stl', 'team_Blk', 'WinPct']

# Create DataFrame for away team statistics
away_df6 = dfLim6[['date', 'awayTeamSubject_id', 'awayTeam_points_total', 'awayTeam_3P', 'awayTeam_3P_made', 'awayTeam_FG', 'awayTeam_FG_made', 'awayTeam_Total_Reb', 'awayTeam_Ast', 'awayTeam_Stl', 'homeTeam_Blk','awayWinPct']].copy()
away_df6.columns = ['date', 'team_id', 'team_points_total', '3P%', '3P_made', 'FG%', 'FG_made', 'Total_Reb', 'team_Ast','team_Stl', 'team_Blk', 'WinPct']

# Combine both DataFrames
dfIDtoStat6 = pd.concat([home_df, away_df], ignore_index=True)

# Display the resulting DataFrame
dfIDtoStat6

Unnamed: 0,date,team_id,team_points_total,3P%,3P_made,FG%,FG_made,Total_Reb,team_Ast,WinPct
0,2014-04-29,5,69,0.375000,6,0.333333,25,43,19,0.000000
1,2014-04-29,21,99,0.387097,12,0.391304,36,51,24,0.000000
2,2014-04-29,13,113,0.400000,8,0.486842,37,41,20,100.000000
3,2014-04-30,28,115,0.461538,12,0.480519,37,37,21,100.000000
4,2014-04-30,11,108,0.320000,8,0.472527,43,48,23,100.000000
...,...,...,...,...,...,...,...,...,...,...
25763,2024-06-06,7,89,0.259259,7,0.416667,35,43,9,62.000000
25764,2024-06-09,7,98,0.230769,6,0.475000,38,43,21,61.386139
25765,2024-06-12,2,106,0.369565,17,0.463415,38,36,26,79.797980
25766,2024-06-14,2,84,0.341463,14,0.362500,29,31,18,79.000000


In [57]:
#X1.Shape is num data, window size, variables
X6, y6 = df_to_X_y(dfLim6, dfIDtoStat6, 14)
X6.shape, y6.shape

((12614, 14, 16), (12614,))

In [58]:
#70-15-15 split
X6_train, X6_temp, y6_train, y6_temp = train_test_split(X6, y6, test_size=0.3, random_state=42)
X6_val, X6_test, y6_val, y6_test = train_test_split(X6_temp, y6_temp, test_size=0.5, random_state=42)
X6_train.shape, y6_train.shape, X6_val.shape, y6_val.shape, X6_test.shape, y6_test.shape

((8829, 14, 16), (8829,), (1892, 14, 16), (1892,), (1893, 14, 16), (1893,))

In [59]:
model6 = Sequential()
#You need to change the input layer to reflect the amount of variables. first number is window size, second is variables
model6.add(InputLayer((14, 16)))
model6.add(LSTM(64, activation='tanh', return_sequences=True))
model6.add(LSTM(32, activation='tanh'))
model6.add(Dense(16, activation='relu'))
model6.add(Dense(8, activation='relu'))
model6.add(Dense(1, activation='sigmoid'))

model6.summary()

In [60]:
earlystopping = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)
model6.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model6.fit(X6_train, y6_train, validation_data=(X6_val, y6_val), epochs=100, callbacks=[earlystopping])

Epoch 1/100
[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.5914 - loss: 0.6655 - val_accuracy: 0.6411 - val_loss: 0.6325
Epoch 2/100
[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.6313 - loss: 0.6374 - val_accuracy: 0.6411 - val_loss: 0.6339
Epoch 3/100
[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.6386 - loss: 0.6343 - val_accuracy: 0.6364 - val_loss: 0.6382
Epoch 4/100
[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.6336 - loss: 0.6344 - val_accuracy: 0.6459 - val_loss: 0.6353
Epoch 5/100
[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.6279 - loss: 0.6380 - val_accuracy: 0.6390 - val_loss: 0.6359
Epoch 6/100
[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.6403 - loss: 0.6349 - val_accuracy: 0.6284 - val_loss: 0.6386
Epoch 7/100
[1m276/27

<keras.src.callbacks.history.History at 0x28914d760>

In [61]:
y_pred = model6.predict(X6_test)
y_pred_binary = (y_pred > 0.5).astype(int)
print(accuracy_score(y6_test, y_pred_binary))

[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
0.6619123085050185


In [62]:
#Adding quarter point data
dfLim7 = df[[
    "date",
    "homeTeam",
    "homeTeamSubject_id", 
    "homeTeam_points_total", 
    "homeTeam_Stl",
    "homeTeam_Blk",
    #added home team quarter scores
    "homeTeam_points_q1",
    "homeTeam_points_q2",
    "homeTeam_points_q3",
    "homeTeam_points_q4",
    "homeTeam_3P",
    "homeTeam_FG",
    "homeTeam_Total_Reb",
    "homeTeam_Ast",
    "homeWinPct",
    "awayTeam",
    "awayTeamSubject_id", 
    "awayTeam_points_total",
    "awayTeam_Stl",
    "awayTeam_Blk",
     #added away team quarter scores
    "awayTeam_points_q1",
    "awayTeam_points_q2",
    "awayTeam_points_q3",
    "awayTeam_points_q4",
    "awayTeam_3P",
    "awayTeam_FG",
    "awayTeam_Total_Reb",
    "awayTeam_Ast",
    "awayWinPct",
    "winner"
]]
dfLim7

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_Stl,homeTeam_Blk,homeTeam_points_q1,homeTeam_points_q2,homeTeam_points_q3,homeTeam_points_q4,...,awayTeam_points_q1,awayTeam_points_q2,awayTeam_points_q3,awayTeam_points_q4,awayTeam_3P,awayTeam_FG,awayTeam_Total_Reb,awayTeam_Ast,awayWinPct,winner
0,2014-06-15,San Antonio Spurs,27,104,5,4,22.0,25.0,30.0,27.0,...,29.0,11.0,18.0,29.0,7-25,30-75,41,14,64.406780,San Antonio Spurs
1,2014-10-28,Los Angeles Lakers,14,90,7,3,19.0,26.0,24.0,21.0,...,31.0,31.0,23.0,23.0,12-29,31-73,47,22,100.000000,Houston Rockets
2,2014-10-28,San Antonio Spurs,27,101,5,3,26.0,19.0,31.0,25.0,...,24.0,29.0,20.0,27.0,8-21,38-78,33,17,0.000000,San Antonio Spurs
3,2014-10-28,New Orleans Pelicans,19,101,10,17,24.0,24.0,30.0,23.0,...,25.0,16.0,23.0,20.0,4-11,32-84,56,17,0.000000,New Orleans Pelicans
4,2014-10-29,Portland Trail Blazers,25,106,4,9,29.0,20.0,26.0,31.0,...,34.0,20.0,23.0,12.0,2-16,33-81,43,19,0.000000,Portland Trail Blazers
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12824,2024-06-06,Boston Celtics,2,107,6,9,37.0,26.0,23.0,21.0,...,20.0,22.0,24.0,23.0,7-27,35-84,43,9,62.000000,Boston Celtics
12825,2024-06-09,Boston Celtics,2,105,10,5,25.0,29.0,29.0,22.0,...,28.0,23.0,23.0,24.0,6-26,38-80,43,21,61.386139,Boston Celtics
12826,2024-06-12,Dallas Mavericks,7,99,5,1,31.0,20.0,19.0,29.0,...,30.0,20.0,35.0,21.0,17-46,38-82,36,26,79.797980,Boston Celtics
12827,2024-06-14,Dallas Mavericks,7,122,7,2,34.0,27.0,31.0,30.0,...,21.0,14.0,25.0,24.0,14-41,29-80,31,18,79.000000,Dallas Mavericks


In [63]:
dfLim7['winner_binary'] = (dfLim7['winner'] == df['awayTeam']).astype(int)
dfLim7

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim7['winner_binary'] = (dfLim7['winner'] == df['awayTeam']).astype(int)


Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_Stl,homeTeam_Blk,homeTeam_points_q1,homeTeam_points_q2,homeTeam_points_q3,homeTeam_points_q4,...,awayTeam_points_q2,awayTeam_points_q3,awayTeam_points_q4,awayTeam_3P,awayTeam_FG,awayTeam_Total_Reb,awayTeam_Ast,awayWinPct,winner,winner_binary
0,2014-06-15,San Antonio Spurs,27,104,5,4,22.0,25.0,30.0,27.0,...,11.0,18.0,29.0,7-25,30-75,41,14,64.406780,San Antonio Spurs,0
1,2014-10-28,Los Angeles Lakers,14,90,7,3,19.0,26.0,24.0,21.0,...,31.0,23.0,23.0,12-29,31-73,47,22,100.000000,Houston Rockets,1
2,2014-10-28,San Antonio Spurs,27,101,5,3,26.0,19.0,31.0,25.0,...,29.0,20.0,27.0,8-21,38-78,33,17,0.000000,San Antonio Spurs,0
3,2014-10-28,New Orleans Pelicans,19,101,10,17,24.0,24.0,30.0,23.0,...,16.0,23.0,20.0,4-11,32-84,56,17,0.000000,New Orleans Pelicans,0
4,2014-10-29,Portland Trail Blazers,25,106,4,9,29.0,20.0,26.0,31.0,...,20.0,23.0,12.0,2-16,33-81,43,19,0.000000,Portland Trail Blazers,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12824,2024-06-06,Boston Celtics,2,107,6,9,37.0,26.0,23.0,21.0,...,22.0,24.0,23.0,7-27,35-84,43,9,62.000000,Boston Celtics,0
12825,2024-06-09,Boston Celtics,2,105,10,5,25.0,29.0,29.0,22.0,...,23.0,23.0,24.0,6-26,38-80,43,21,61.386139,Boston Celtics,0
12826,2024-06-12,Dallas Mavericks,7,99,5,1,31.0,20.0,19.0,29.0,...,20.0,35.0,21.0,17-46,38-82,36,26,79.797980,Boston Celtics,1
12827,2024-06-14,Dallas Mavericks,7,122,7,2,34.0,27.0,31.0,30.0,...,14.0,25.0,24.0,14-41,29-80,31,18,79.000000,Dallas Mavericks,0


In [64]:
dfLim7.insert(dfLim7.columns.get_loc('homeTeam_3P') + 1, 'homeTeam_3P_made', dfLim7['homeTeam_3P'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))
dfLim7.insert(dfLim7.columns.get_loc('awayTeam_3P') + 1, 'awayTeam_3P_made', dfLim7['awayTeam_3P'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))

#adding FG made
dfLim7.insert(dfLim7.columns.get_loc('homeTeam_FG') + 1, 'homeTeam_FG_made', dfLim7['homeTeam_FG'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))
dfLim7.insert(dfLim7.columns.get_loc('awayTeam_FG') + 1, 'awayTeam_FG_made', dfLim7['awayTeam_FG'].str.split('-').apply(lambda x: int(x[0]) if isinstance(x, list) and len(x) == 2 else np.nan))
dfLim7

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_Stl,homeTeam_Blk,homeTeam_points_q1,homeTeam_points_q2,homeTeam_points_q3,homeTeam_points_q4,...,awayTeam_points_q4,awayTeam_3P,awayTeam_3P_made,awayTeam_FG,awayTeam_FG_made,awayTeam_Total_Reb,awayTeam_Ast,awayWinPct,winner,winner_binary
0,2014-06-15,San Antonio Spurs,27,104,5,4,22.0,25.0,30.0,27.0,...,29.0,7-25,7,30-75,30,41,14,64.406780,San Antonio Spurs,0
1,2014-10-28,Los Angeles Lakers,14,90,7,3,19.0,26.0,24.0,21.0,...,23.0,12-29,12,31-73,31,47,22,100.000000,Houston Rockets,1
2,2014-10-28,San Antonio Spurs,27,101,5,3,26.0,19.0,31.0,25.0,...,27.0,8-21,8,38-78,38,33,17,0.000000,San Antonio Spurs,0
3,2014-10-28,New Orleans Pelicans,19,101,10,17,24.0,24.0,30.0,23.0,...,20.0,4-11,4,32-84,32,56,17,0.000000,New Orleans Pelicans,0
4,2014-10-29,Portland Trail Blazers,25,106,4,9,29.0,20.0,26.0,31.0,...,12.0,2-16,2,33-81,33,43,19,0.000000,Portland Trail Blazers,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12824,2024-06-06,Boston Celtics,2,107,6,9,37.0,26.0,23.0,21.0,...,23.0,7-27,7,35-84,35,43,9,62.000000,Boston Celtics,0
12825,2024-06-09,Boston Celtics,2,105,10,5,25.0,29.0,29.0,22.0,...,24.0,6-26,6,38-80,38,43,21,61.386139,Boston Celtics,0
12826,2024-06-12,Dallas Mavericks,7,99,5,1,31.0,20.0,19.0,29.0,...,21.0,17-46,17,38-82,38,36,26,79.797980,Boston Celtics,1
12827,2024-06-14,Dallas Mavericks,7,122,7,2,34.0,27.0,31.0,30.0,...,24.0,14-41,14,29-80,29,31,18,79.000000,Dallas Mavericks,0


In [65]:
#Format as percentage
dfLim7['homeTeam_3P'] = dfLim7['homeTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim7['awayTeam_3P'] = dfLim7['awayTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim7['homeTeam_FG'] = dfLim7['homeTeam_FG'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
dfLim7['awayTeam_FG'] = dfLim7['awayTeam_FG'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)

dfLim7

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim7['homeTeam_3P'] = dfLim7['homeTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfLim7['awayTeam_3P'] = dfLim7['awayTeam_3P'].str.split('-').apply(lambda x: int(x[0]) / int(x[1]) if isinstance(x, list) and len(x) == 2 else np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https:

Unnamed: 0,date,homeTeam,homeTeamSubject_id,homeTeam_points_total,homeTeam_Stl,homeTeam_Blk,homeTeam_points_q1,homeTeam_points_q2,homeTeam_points_q3,homeTeam_points_q4,...,awayTeam_points_q4,awayTeam_3P,awayTeam_3P_made,awayTeam_FG,awayTeam_FG_made,awayTeam_Total_Reb,awayTeam_Ast,awayWinPct,winner,winner_binary
0,2014-06-15,San Antonio Spurs,27,104,5,4,22.0,25.0,30.0,27.0,...,29.0,0.280000,7,0.400000,30,41,14,64.406780,San Antonio Spurs,0
1,2014-10-28,Los Angeles Lakers,14,90,7,3,19.0,26.0,24.0,21.0,...,23.0,0.413793,12,0.424658,31,47,22,100.000000,Houston Rockets,1
2,2014-10-28,San Antonio Spurs,27,101,5,3,26.0,19.0,31.0,25.0,...,27.0,0.380952,8,0.487179,38,33,17,0.000000,San Antonio Spurs,0
3,2014-10-28,New Orleans Pelicans,19,101,10,17,24.0,24.0,30.0,23.0,...,20.0,0.363636,4,0.380952,32,56,17,0.000000,New Orleans Pelicans,0
4,2014-10-29,Portland Trail Blazers,25,106,4,9,29.0,20.0,26.0,31.0,...,12.0,0.125000,2,0.407407,33,43,19,0.000000,Portland Trail Blazers,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12824,2024-06-06,Boston Celtics,2,107,6,9,37.0,26.0,23.0,21.0,...,23.0,0.259259,7,0.416667,35,43,9,62.000000,Boston Celtics,0
12825,2024-06-09,Boston Celtics,2,105,10,5,25.0,29.0,29.0,22.0,...,24.0,0.230769,6,0.475000,38,43,21,61.386139,Boston Celtics,0
12826,2024-06-12,Dallas Mavericks,7,99,5,1,31.0,20.0,19.0,29.0,...,21.0,0.369565,17,0.463415,38,36,26,79.797980,Boston Celtics,1
12827,2024-06-14,Dallas Mavericks,7,122,7,2,34.0,27.0,31.0,30.0,...,24.0,0.341463,14,0.362500,29,31,18,79.000000,Dallas Mavericks,0


In [84]:
dfIDtoStat7 = pd.DataFrame(columns=['date', 'team_id', 'team_points_total', '3P%', '3P_made', 'FG%', 'FG_made', 'Total_Reb', 'team_Ast', 'team_Stl', 'team_Blk', 'team_points_q1', 'team_points_q2', 'team_points_q3', 'team_points_q4', 'WinPct'])
dfIDtoStat7

Unnamed: 0,date,team_id,team_points_total,3P%,3P_made,FG%,FG_made,Total_Reb,team_Ast,team_Stl,team_Blk,team_points_q1,team_points_q2,team_points_q3,team_points_q4,WinPct


In [86]:
# Create DataFrame for home team statistics
home_df7 = dfLim7[['date', 'homeTeamSubject_id', 'homeTeam_points_total', 'homeTeam_3P', 'homeTeam_3P_made', 'homeTeam_FG', 'homeTeam_FG_made', 'homeTeam_Total_Reb', 'homeTeam_Ast', 'homeTeam_Stl', 'homeTeam_Blk', 'homeTeam_points_q1', 'homeTeam_points_q2', 'homeTeam_points_q3', 'homeTeam_points_q4', 'homeWinPct']].copy()
home_df7.columns = ['date', 'team_id', 'team_points_total', '3P%', '3P_made', 'FG%', 'FG_made', 'Total_Reb', 'team_Ast', 'team_Stl', 'team_Blk', 'team_points_q1', 'team_points_q2', 'team_points_q3', 'team_points_q4', 'WinPct']

# Create DataFrame for away team statistics
away_df7 = dfLim7[['date', 'awayTeamSubject_id', 'awayTeam_points_total', 'awayTeam_3P', 'awayTeam_3P_made', 'awayTeam_FG', 'awayTeam_FG_made', 'awayTeam_Total_Reb', 'awayTeam_Ast', 'awayTeam_Stl', 'homeTeam_Blk','awayTeam_points_q1', 'awayTeam_points_q2', 'awayTeam_points_q3', 'awayTeam_points_q4','awayWinPct']].copy()
away_df7.columns = ['date', 'team_id', 'team_points_total', '3P%', '3P_made', 'FG%', 'FG_made', 'Total_Reb', 'team_Ast', 'team_Stl', 'team_Blk', 'team_points_q1', 'team_points_q2', 'team_points_q3', 'team_points_q4', 'WinPct']

# Combine both DataFrames
dfIDtoStat7 = pd.concat([home_df7, away_df7], ignore_index=True)

# Display the resulting DataFrame
dfIDtoStat7

Unnamed: 0,date,team_id,team_points_total,3P%,3P_made,FG%,FG_made,Total_Reb,team_Ast,team_Stl,team_Blk,team_points_q1,team_points_q2,team_points_q3,team_points_q4,WinPct
0,2014-06-15,27,104,0.461538,12,0.474359,37,40,25,5,4,22.0,25.0,30.0,27.0,74.193548
1,2014-10-28,14,90,0.300000,3,0.354430,28,36,16,7,3,19.0,26.0,24.0,21.0,0.000000
2,2014-10-28,27,101,0.500000,14,0.528571,37,38,23,5,3,26.0,19.0,31.0,25.0,100.000000
3,2014-10-28,19,101,0.235294,4,0.405941,41,62,20,10,17,24.0,24.0,30.0,23.0,100.000000
4,2014-10-29,25,106,0.379310,11,0.448276,39,42,23,4,9,29.0,20.0,26.0,31.0,100.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25653,2024-06-06,7,89,0.259259,7,0.416667,35,43,9,8,9,20.0,22.0,24.0,23.0,62.000000
25654,2024-06-09,7,98,0.230769,6,0.475000,38,43,21,5,5,28.0,23.0,23.0,24.0,61.386139
25655,2024-06-12,2,106,0.369565,17,0.463415,38,36,26,4,1,30.0,20.0,35.0,21.0,79.797980
25656,2024-06-14,2,84,0.341463,14,0.362500,29,31,18,2,2,21.0,14.0,25.0,24.0,79.000000


In [None]:
#X1.Shape is num data, window size, variables
X7, y7 = df_to_X_y(dfLim7, dfIDtoStat7, 20)
X7.shape, y7.shape

((12734, 5, 28), (12734,))

In [123]:
#70-15-15 split
X7_train, X7_temp, y7_train, y7_temp = train_test_split(X7, y7, test_size=0.3, random_state=42)
X7_val, X7_test, y7_val, y7_test = train_test_split(X7_temp, y7_temp, test_size=0.5, random_state=42)
X7_train.shape, y7_train.shape, X7_val.shape, y7_val.shape, X7_test.shape, y7_test.shape

((8913, 5, 28), (8913,), (1910, 5, 28), (1910,), (1911, 5, 28), (1911,))

In [None]:
model7 = Sequential()
#You need to change the input layer to reflect the amount of variables. first number is window size, second is variables
model7.add(InputLayer((20, 28)))
model7.add(LSTM(64, activation='tanh', return_sequences=True))
model7.add(Dropout(0.2))
model7.add(LSTM(32, activation='tanh'))
model7.add(Dense(16, activation='relu'))
model7.add(BatchNormalization())
model7.add(Dense(8, activation='relu'))
model7.add(Dense(1, activation='sigmoid'))

model7.summary()

In [125]:
earlystopping = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)
#reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=1e-5, verbose=1)
model7.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model7.fit(X7_train, y7_train, validation_data=(X7_val, y7_val), epochs=200, callbacks=[earlystopping])

Epoch 1/200
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.6030 - loss: 0.6696 - val_accuracy: 0.6068 - val_loss: 0.6494
Epoch 2/200
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.6321 - loss: 0.6416 - val_accuracy: 0.6147 - val_loss: 0.6426
Epoch 3/200
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.6229 - loss: 0.6458 - val_accuracy: 0.6330 - val_loss: 0.6334
Epoch 4/200
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.6422 - loss: 0.6352 - val_accuracy: 0.6393 - val_loss: 0.6419
Epoch 5/200
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.6433 - loss: 0.6374 - val_accuracy: 0.6330 - val_loss: 0.6427
Epoch 6/200
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.6421 - loss: 0.6362 - val_accuracy: 0.6277 - val_loss: 0.6346
Epoch 7/200
[1m279/27

<keras.src.callbacks.history.History at 0x2f6c0ffb0>

In [126]:
y_pred = model7.predict(X7_test)
y_pred_binary = (y_pred > 0.5).astype(int)
print(accuracy_score(y7_test, y_pred_binary))

[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
0.6425954997383568
