# Obtaining Data

In [1]:
import tensorflow as tf
from tensorflow.keras import utils, models, layers
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from gui import create_model_gui

In [30]:
# Build the model via the helper imported from the local `gui` module. It returns
# the Keras model plus a dict of the chosen settings (the recorded output shows the
# keys 'Season' and 'loss function').
# NOTE(review): later cells read params_dict['Season'], so results depend on what
# this helper returns — presumably an interactive choice; confirm and record it.
model, params_dict = create_model_gui()
params_dict

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_18 (Dense)            (None, 128)               3456      
                                                                 
 dense_19 (Dense)            (None, 64)                8256      
                                                                 
 dense_20 (Dense)            (None, 32)                2080      
                                                                 
 dense_21 (Dense)            (None, 1)                 33        
                                                                 
Total params: 13,825
Trainable params: 13,825
Non-trainable params: 0
_________________________________________________________________


{'Season': '2000-2001', 'loss function': 'Cross Entropy'}

In [13]:
# Print the layer-by-layer architecture and parameter counts of `model`.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               3456      
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 64)                4160      
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
Total params: 15,937
Trainable params: 15,937
Non-trainable params: 0
_________________________________________________________________


In [3]:
from datetime import date

# Start and end dates for each NBA season, keyed by the 'YYYY-YYYY' label that
# params_dict['Season'] carries. Each value is {'start': date, 'end': date}.
# NOTE(review): the two entries marked "Adjusted end date" run later than the
# other seasons' April end dates — confirm they are meant to bound the same
# kind of window as the rest.
nba_seasons = {
    '2000-2001': {'start': date(2000, 10, 31), 'end': date(2001, 4, 18)},
    '2001-2002': {'start': date(2001, 10, 30), 'end': date(2002, 4, 17)},
    '2002-2003': {'start': date(2002, 10, 29), 'end': date(2003, 4, 16)},
    # Fixed: the end year was written as 4 instead of 2004, which placed the
    # end of the season roughly two millennia before its start.
    '2003-2004': {'start': date(2003, 10, 28), 'end': date(2004, 4, 14)},
    '2004-2005': {'start': date(2004, 11, 2), 'end': date(2005, 4, 20)},
    '2005-2006': {'start': date(2005, 11, 1), 'end': date(2006, 4, 19)},
    '2006-2007': {'start': date(2006, 10, 31), 'end': date(2007, 4, 18)},
    '2007-2008': {'start': date(2007, 10, 30), 'end': date(2008, 4, 16)},
    '2008-2009': {'start': date(2008, 10, 28), 'end': date(2009, 4, 15)},
    '2009-2010': {'start': date(2009, 10, 27), 'end': date(2010, 4, 14)},
    '2010-2011': {'start': date(2010, 10, 26), 'end': date(2011, 4, 13)},
    '2011-2012': {'start': date(2011, 12, 25), 'end': date(2012, 4, 26)},
    '2012-2013': {'start': date(2012, 10, 30), 'end': date(2013, 4, 17)},
    '2013-2014': {'start': date(2013, 10, 29), 'end': date(2014, 4, 16)},
    '2014-2015': {'start': date(2014, 10, 28), 'end': date(2015, 4, 15)},
    '2015-2016': {'start': date(2015, 10, 27), 'end': date(2016, 4, 13)},
    '2016-2017': {'start': date(2016, 10, 25), 'end': date(2017, 4, 12)},
    '2017-2018': {'start': date(2017, 10, 17), 'end': date(2018, 4, 11)},
    '2018-2019': {'start': date(2018, 10, 16), 'end': date(2019, 4, 10)},
    '2019-2020': {'start': date(2019, 10, 22), 'end': date(2020, 9, 28)},  # Adjusted end date
    '2020-2021': {'start': date(2020, 12, 22), 'end': date(2021, 7, 22)},  # Adjusted end date
    '2021-2022': {'start': date(2021, 10, 19), 'end': date(2022, 4, 10)},
    '2022-2023': {'start': date(2022, 10, 18), 'end': date(2023, 4, 9)}
}


Our first step is to create our training and testing data from all the data we downloaded.

In [4]:
from game_data_ingest import *

# Date window of the season selected in params_dict.
season_window = nba_seasons[params_dict['Season']]
season_start = season_window['start']
season_end = season_window['end']

# Pass every date of the window to put_dates_in_db (the recorded output
# suggests it scrapes whichever dates are still missing from the database).
some_dates = date_range(season_start, season_end)
put_dates_in_db(some_dates)

# Report how many rows the `games` table currently holds.
with sqlite3.connect("games.db") as conn:
    cursor = conn.cursor()
    cursor.execute("SELECT COUNT(*) FROM games;")
    print("Total count of games in the database:", cursor.fetchone()[0])

# Build the training frame for the selected season.
# sqlite3's `with` block does not close the connection, so `conn` remains
# usable afterwards; later cells pass it to get_matchup_df.
with sqlite3.connect("games.db") as conn:
    df = generate_training_data_for_season(conn, season_start, season_end)
    print("---- Training Data ----")
    print(df)

5 dates have to be scraped
Starting crawl
Finished crawl
Total count of games in the database: 1574
---- Training Data ----
      hometeam_fieldGoalsMade  hometeam_fieldGoalsAttempted  \
0                   35.524590                     78.786885   
1                   37.642857                     82.875000   
2                   38.338983                     81.864407   
3                   36.482759                     77.810345   
4                   34.983607                     79.229508   
...                       ...                           ...   
1168                34.844156                     80.909091   
1169                33.227848                     78.455696   
1170                33.833333                     79.166667   
1171                34.233766                     78.792208   
1172                35.571429                     83.597403   

      hometeam_threePointersMade  hometeam_threePointersAttempted  \
0                       6.098361                  

In [7]:
df.head()

Unnamed: 0,hometeam_fieldGoalsMade,hometeam_fieldGoalsAttempted,hometeam_threePointersMade,hometeam_threePointersAttempted,hometeam_freeThrowsMade,hometeam_freeThrowsAttempted,hometeam_reboundsOffensive,hometeam_reboundsDefensive,hometeam_assists,hometeam_steals,...,awayteam_freeThrowsMade,awayteam_freeThrowsAttempted,awayteam_reboundsOffensive,awayteam_reboundsDefensive,awayteam_assists,awayteam_steals,awayteam_blocks,awayteam_foulsPersonal,awayteam_points,winner_is_home_team
0,35.52459,78.786885,6.098361,17.016393,19.196721,25.311475,11.327869,30.42623,19.409836,7.360656,...,17.126984,24.936508,12.222222,31.142857,18.873016,6.063492,6.301587,21.952381,91.555556,1
1,37.642857,82.875,6.910714,17.75,18.089286,23.178571,11.964286,30.267857,22.125,8.178571,...,19.103448,25.189655,11.293103,30.431034,19.37931,7.310345,4.689655,20.913793,96.034483,0
2,38.338983,81.864407,5.305085,15.661017,19.033898,28.305085,13.271186,31.491525,22.881356,6.508475,...,18.836066,25.081967,13.868852,30.983607,24.032787,7.098361,5.754098,21.754098,97.622951,1
3,36.482759,77.810345,4.465517,12.586207,17.965517,23.310345,11.62069,30.362069,23.810345,8.551724,...,17.15,25.083333,12.383333,31.316667,19.083333,6.033333,6.35,22.266667,91.666667,1
4,34.983607,79.229508,4.344262,13.114754,17.180328,25.098361,12.262295,31.262295,19.04918,6.016393,...,18.767857,24.517857,12.232143,31.964286,22.875,9.785714,5.678571,19.535714,101.107143,1


The data frame has 27 columns, so we have 27 - 1 = 26 input parameters. The remaining column, `winner_is_home_team`, is what we are trying to predict: whether the home team won.

In [9]:
# Make arrays for the X (features) and y (label) data: every column except
# 'winner_is_home_team' is an input, and that column is the target.
X = df.drop('winner_is_home_team', axis=1).values
y = df['winner_is_home_team'].values
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

# Fixed seed so the train/validation/test partitions are identical on every
# run; without it the reported accuracies are not reproducible.
split_seed = 42

# train is now 75% of the entire data set; the other 25% is held out
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=1 - train_ratio,
    shuffle=True,
    random_state=split_seed,
)

# Split the held-out 25% again:
# test is now 10% of the initial data set
# validation is now 15% of the initial data set
X_val, X_test, y_val, y_test = train_test_split(
    X_test, y_test,
    test_size=test_ratio / (test_ratio + validation_ratio),
    shuffle=True,
    random_state=split_seed,
)

Now we have our training, test, and validation data!

# Creating The Model

Before building our own model, we first measure the predictive performance of a simple logistic regression model as a baseline. We expect its accuracy to be around 50%, i.e. roughly equivalent to flipping a coin on which team wins the game.

In [11]:
# Baseline: logistic regression, i.e. a single sigmoid unit on the 26 inputs
# trained with binary cross-entropy. The previous version used two
# independent sigmoid units with sparse categorical cross-entropy, which is
# not a logistic regression and did not match the loss/activation pairing
# used by the other binary classifiers in this notebook.
model_logistic_regression = models.Sequential([
    layers.InputLayer(input_shape=(26,)),
    layers.Dense(units=1, activation='sigmoid')  # outputs P(home team wins)
])

model_logistic_regression.compile(optimizer='adam', # optimizer type
              loss=tf.keras.losses.binary_crossentropy, # loss function for a 0/1 label
              metrics=['accuracy'])

# Train for 20 epochs, tracking loss/accuracy on the validation split.
history_logistic_regression = model_logistic_regression.fit(x=X_train,y=y_train,
                     epochs=20,
                     validation_data=(X_val,y_val))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


Looks like we hover around 50% as we predicted.

Now we try our own Neural Network model.

In [14]:
# Feed-forward classifier on the 26 input features:
# 128 units (no activation) -> 64 (ReLU) -> 64 (ReLU) -> 1 (sigmoid).
model1_layers = [
    layers.InputLayer(input_shape=(26,)),
    layers.Dense(128, use_bias=True),                    # no activation given, so linear
    layers.Dense(64, use_bias=True, activation='relu'),  # first non-linear layer
    layers.Dense(64, activation='relu'),                 # second non-linear layer
    layers.Dense(1, activation='sigmoid'),               # single output in (0, 1)
]
model1 = models.Sequential(model1_layers)

model1.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_5 (Dense)             (None, 128)               3456      
                                                                 
 dense_6 (Dense)             (None, 64)                8256      
                                                                 
 dense_7 (Dense)             (None, 64)                4160      
                                                                 
 dense_8 (Dense)             (None, 1)                 65        
                                                                 
Total params: 15,937
Trainable params: 15,937
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Train model1 as a binary classifier: Adam optimizer, binary cross-entropy
# loss, accuracy reported alongside the loss.
model1.compile(
    loss=tf.keras.losses.binary_crossentropy,
    optimizer='adam',
    metrics=['accuracy'],
)

# The early-stopping callback is created but not passed to fit(), so all
# 20 epochs always run.
stop_here_please = EarlyStopping(patience=3)

history_1 = model1.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=20,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [19]:
# Score model1 on the held-out test split; the result is [loss, accuracy].
model1.evaluate(x=X_test,y=y_test)



[0.63176429271698, 0.6016949415206909]

In [106]:
# Show the architecture of the model returned by create_model_gui() before training it.
model.summary() # 3 Hidden Layers, Relu, Linear, Relu

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_18 (Dense)            (None, 128)               3456      
                                                                 
 dense_19 (Dense)            (None, 64)                8256      
                                                                 
 dense_20 (Dense)            (None, 32)                2080      
                                                                 
 dense_21 (Dense)            (None, 1)                 33        
                                                                 
Total params: 13,825
Trainable params: 13,825
Non-trainable params: 0
_________________________________________________________________


In [77]:
# Train the model returned by create_model_gui() with the same setup as
# model1 (Adam, binary cross-entropy, accuracy metric), but for 30 epochs.
model.compile(
    loss=tf.keras.losses.binary_crossentropy,
    optimizer='adam',
    metrics=['accuracy'],
)

# The early-stopping callback is created but not passed to fit(), so all
# 30 epochs always run.
stop_here_please = EarlyStopping(patience=3)

# NOTE: this rebinds `history_1`, replacing the history recorded for model1.
history_1 = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=30,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [78]:
# Score the GUI-built model on the held-out test split; the result is [loss, accuracy].
model.evaluate(x=X_test,y=y_test)



[0.6234367489814758, 0.6610169410705566]

# Predicting the Playoffs

Now that we have the model and the season, we can test the model on each matchup in the postseason and see how it performs!

In [79]:
def get_matchup_df(conn, team1, team2):
    '''
    Build one feature frame describing a matchup between two teams.

    Each team's averaged stats are fetched for the season selected in
    params_dict, the 'gamesCount' column is removed, and the columns are
    prefixed with "hometeam_" (team1) and "awayteam_" (team2) before the two
    frames are joined side by side.

    conn  -- open database connection handed to get_average_stats_over_period
    team1 -- team treated as the home team; write the name in lowercase
    team2 -- team treated as the away team; write the name in lowercase

    Returns the concatenated DataFrame.
    '''
    season = nba_seasons[params_dict['Season']]
    first_day = season['start']
    last_day = season['end']

    # Fetch both teams before transforming either frame.
    home_stats = get_average_stats_over_period(conn, team1, first_day, last_day)
    away_stats = get_average_stats_over_period(conn, team2, first_day, last_day)

    home_features = home_stats.drop('gamesCount', axis=1).add_prefix("hometeam_")
    away_features = away_stats.drop('gamesCount', axis=1).add_prefix("awayteam_")

    return pd.concat([home_features, away_features], axis=1)


## Function To Predict Playoffs

We now need a function to evaluate a full playoff bracket!

# 2000-2001 Playoff Prediction

In [80]:
# First-round pairings, each written as (team1 / "home", team2 / "away").
first_round_pairs = [
    ("lakers", "blazers"),
    ("mavericks", "jazz"),
    ("kings", "suns"),
    ("spurs", "timberwolves"),
    ("hornets", "heat"),
    ("bucks", "magic"),
    ("sixers", "pacers"),
    ("raptors", "knicks"),
]

# One matchup frame per pairing, built in the order listed above.
(
    lakers_vs_blazers,
    mavs_vs_jazz,
    kings_vs_suns,
    spurs_vs_wolves,
    hornets_vs_heat,
    bucks_vs_magic,
    sixers_vs_pacers,
    raptors_vs_knicks,
) = [get_matchup_df(conn, home, away) for home, away in first_round_pairs]



# Playoff Simulation

## Round 1:

In [81]:
# model1's sigmoid output for Lakers (hometeam_ columns) vs. Blazers (awayteam_ columns).
# The model was trained on winner_is_home_team, so values above 0.5 favor the first-listed team.
model1.predict(lakers_vs_blazers)



array([[0.6373418]], dtype=float32)

In [82]:
# GUI-built model's output for Lakers vs. Blazers; above 0.5 favors the first-listed team (Lakers).
model.predict(lakers_vs_blazers) # Actual result: Lakers W 3-0



array([[0.66255695]], dtype=float32)

In [83]:
# Output above 0.5 favors the first-listed team (Mavericks) over the Jazz.
model.predict(mavs_vs_jazz) # Actual result: Mavs W 3-2



array([[0.6447607]], dtype=float32)

In [84]:
# Output above 0.5 favors the first-listed team (Kings) over the Suns.
model.predict(kings_vs_suns) # Actual result: Kings W 3-1



array([[0.8448413]], dtype=float32)

In [85]:
# Output above 0.5 favors the first-listed team (Spurs) over the Timberwolves.
model.predict(spurs_vs_wolves) # Actual result: Spurs W 3-1



array([[0.76108074]], dtype=float32)

In [86]:
# Output above 0.5 favors the first-listed team (Hornets) over the Heat.
model.predict(hornets_vs_heat) # Actual result: Hornets W 3-0



array([[0.6066924]], dtype=float32)

In [87]:
# Output above 0.5 favors the first-listed team (Bucks) over the Magic.
model.predict(bucks_vs_magic) # Actual result: Bucks W 3-1



array([[0.6545932]], dtype=float32)

In [88]:
# Output above 0.5 favors the first-listed team (Sixers) over the Pacers.
model.predict(sixers_vs_pacers) # Actual result: Sixers W 3-1



array([[0.58582693]], dtype=float32)

In [89]:
# Output above 0.5 favors the first-listed team (Raptors) over the Knicks.
model.predict(raptors_vs_knicks) # Actual result: Raptors W 3-2



array([[0.83017826]], dtype=float32)

## Conference Semifinals

In [95]:
# Conference-semifinal pairings, each written as (team1 / "home", team2 / "away").
semifinal_pairs = [
    ("lakers", "kings"),
    ("spurs", "mavericks"),
    ("sixers", "raptors"),
    ("bucks", "hornets"),
]

# One matchup frame per pairing, built in the order listed above.
(
    lakers_vs_kings,
    spurs_vs_mavs,
    sixers_vs_raptors,
    bucks_vs_hornets,
) = [get_matchup_df(conn, home, away) for home, away in semifinal_pairs]

In [91]:
# Output above 0.5 favors the first-listed team (Lakers) over the Kings.
model.predict(lakers_vs_kings) # Actual result: Lakers W 4-0



array([[0.57875705]], dtype=float32)

In [93]:
# Output above 0.5 favors the first-listed team (Spurs) over the Mavericks.
model.predict(spurs_vs_mavs) # Actual result: Spurs W 4-1



array([[0.6902155]], dtype=float32)

In [96]:
# Output above 0.5 favors the first-listed team (Sixers) over the Raptors.
model.predict(sixers_vs_raptors) # Actual result: Sixers W 4-3



array([[0.5131549]], dtype=float32)

In [97]:
# Output above 0.5 favors the first-listed team (Bucks) over the Hornets.
model.predict(bucks_vs_hornets) # Actual result: Bucks W 4-3



array([[0.7862208]], dtype=float32)

## Conference Finals

In [98]:
# Conference-finals matchup frames; the first-listed team fills the
# hometeam_ columns and the second the awayteam_ columns.
lakers_vs_spurs, sixers_vs_bucks = (
    get_matchup_df(conn, "lakers", "spurs"),
    get_matchup_df(conn, "sixers", "bucks"),
)

In [99]:
# Output above 0.5 favors the first-listed team (Lakers) over the Spurs.
model.predict(lakers_vs_spurs) # Actual result: Lakers W 4-0



array([[0.5604064]], dtype=float32)

In [100]:
# Output above 0.5 favors the first-listed team (Sixers) over the Bucks.
# Actual result: Sixers W 4-3. First incorrect prediction :( — in the recorded run the
# output is just below 0.5, i.e. the model leans towards the Bucks.
model.predict(sixers_vs_bucks)



array([[0.49856293]], dtype=float32)

## NBA Finals

In [101]:
# Finals of the simulated bracket: Lakers vs. Bucks, because the recorded run's
# conference-finals output favored the Bucks over the Sixers.
# Actual result: the Lakers beat the Sixers 4-1 in the real Finals.
model.predict(get_matchup_df(conn, "lakers", "bucks"))



array([[0.6102966]], dtype=float32)

In [102]:
# Close the SQLite connection that was opened in the data-loading cell and reused
# by every get_matchup_df call (sqlite3's `with` block does not close it).
conn.close()