In [14]:
import tensorflow as tf
from tensorflow.keras import utils, models, layers
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from gui import create_model_gui

In [None]:
model, params_dict = create_model_gui()
params_dict

In [2]:
from datetime import date
#date time objects for each nba season

nba_seasons = {
    '2000-2001': {'start': date(2000, 10, 31), 'end': date(2001, 4, 18)},
    '2001-2002': {'start': date(2001, 10, 30), 'end': date(2002, 4, 17)},
    '2002-2003': {'start': date(2002, 10, 29), 'end': date(2003, 4, 16)},
    '2003-2004': {'start': date(2003, 10, 28), 'end': date(4, 4, 14)},
    '2004-2005': {'start': date(2004, 11, 2), 'end': date(2005, 4, 20)},
    '2005-2006': {'start': date(2005, 11, 1), 'end': date(2006, 4, 19)},
    '2006-2007': {'start': date(2006, 10, 31), 'end': date(2007, 4, 18)},
    '2007-2008': {'start': date(2007, 10, 30), 'end': date(2008, 4, 16)},
    '2008-2009': {'start': date(2008, 10, 28), 'end': date(2009, 4, 15)},
    '2009-2010': {'start': date(2009, 10, 27), 'end': date(2010, 4, 14)},
    '2010-2011': {'start': date(2010, 10, 26), 'end': date(2011, 4, 13)},
    '2011-2012': {'start': date(2011, 12, 25), 'end': date(2012, 4, 26)},
    '2012-2013': {'start': date(2012, 10, 30), 'end': date(2013, 4, 17)},
    '2013-2014': {'start': date(2013, 10, 29), 'end': date(2014, 4, 16)},
    '2014-2015': {'start': date(2014, 10, 28), 'end': date(2015, 4, 15)},
    '2015-2016': {'start': date(2015, 10, 27), 'end': date(2016, 4, 13)},
    '2016-2017': {'start': date(2016, 10, 25), 'end': date(2017, 4, 12)},
    '2017-2018': {'start': date(2017, 10, 17), 'end': date(2018, 4, 11)},
    '2018-2019': {'start': date(2018, 10, 16), 'end': date(2019, 4, 10)},
    '2019-2020': {'start': date(2019, 10, 22), 'end': date(2020, 9, 28)},  # Adjusted end date
    '2020-2021': {'start': date(2020, 12, 22), 'end': date(2021, 7, 22)},  # Adjusted end date
    '2021-2022': {'start': date(2021, 10, 19), 'end': date(2022, 4, 10)},
    '2022-2023': {'start': date(2022, 10, 18), 'end': date(2023, 4, 9)}
}


Our first step is to create our training and testing data from all the data we downloaded.

In [3]:
from game_data_ingest import *

some_dates = date_range(nba_seasons[params_dict['Season']]['start'], nba_seasons[params_dict['Season']]['end'])
put_dates_in_db(some_dates)

with sqlite3.connect("games.db") as conn:
    c = conn.cursor()
    q = f"SELECT COUNT(*) FROM games;"
    c.execute(q)
    print("Total count of games in the database:", c.fetchone()[0])

with sqlite3.connect("games.db") as conn:
    df = generate_training_data_for_season(
        conn, nba_seasons[params_dict['Season']]['start'], nba_seasons[params_dict['Season']]['end']
    )
    print("---- Training Data ----")
    print(df)

142 dates have to be scraped
Starting crawl
Finished crawl
Total count of games in the database: 1574
---- Training Data ----
      hometeam_fieldGoalsMade  hometeam_fieldGoalsAttempted  \
0                   35.524590                     78.786885   
1                   37.642857                     82.875000   
2                   38.338983                     81.864407   
3                   36.482759                     77.810345   
4                   34.983607                     79.229508   
...                       ...                           ...   
1168                34.844156                     80.909091   
1169                33.227848                     78.455696   
1170                33.833333                     79.166667   
1171                34.233766                     78.792208   
1172                35.571429                     83.597403   

      hometeam_threePointersMade  hometeam_threePointersAttempted  \
0                       6.098361                

In [7]:
df.head()

Unnamed: 0,hometeam_fieldGoalsMade,hometeam_fieldGoalsAttempted,hometeam_threePointersMade,hometeam_threePointersAttempted,hometeam_freeThrowsMade,hometeam_freeThrowsAttempted,hometeam_reboundsOffensive,hometeam_reboundsDefensive,hometeam_assists,hometeam_steals,...,awayteam_freeThrowsMade,awayteam_freeThrowsAttempted,awayteam_reboundsOffensive,awayteam_reboundsDefensive,awayteam_assists,awayteam_steals,awayteam_blocks,awayteam_foulsPersonal,awayteam_points,winner_is_home_team
0,35.52459,78.786885,6.098361,17.016393,19.196721,25.311475,11.327869,30.42623,19.409836,7.360656,...,17.126984,24.936508,12.222222,31.142857,18.873016,6.063492,6.301587,21.952381,91.555556,1
1,37.642857,82.875,6.910714,17.75,18.089286,23.178571,11.964286,30.267857,22.125,8.178571,...,19.103448,25.189655,11.293103,30.431034,19.37931,7.310345,4.689655,20.913793,96.034483,0
2,38.338983,81.864407,5.305085,15.661017,19.033898,28.305085,13.271186,31.491525,22.881356,6.508475,...,18.836066,25.081967,13.868852,30.983607,24.032787,7.098361,5.754098,21.754098,97.622951,1
3,36.482759,77.810345,4.465517,12.586207,17.965517,23.310345,11.62069,30.362069,23.810345,8.551724,...,17.15,25.083333,12.383333,31.316667,19.083333,6.033333,6.35,22.266667,91.666667,1
4,34.983607,79.229508,4.344262,13.114754,17.180328,25.098361,12.262295,31.262295,19.04918,6.016393,...,18.767857,24.517857,12.232143,31.964286,22.875,9.785714,5.678571,19.535714,101.107143,1


We have 27 - 1 = 26 input parameters. The remaining value is what we are trying to predict: whether the home team won.

In [8]:
#make arrays for X and Y data
X = df.drop('winner_is_home_team', axis=1).values
y = df['winner_is_home_team'].values

In [17]:
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

# train is now 75% of the entire data set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1 - train_ratio, shuffle=True)

# test is now 10% of the initial data set
# validation is now 15% of the initial data set
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=test_ratio/(test_ratio + validation_ratio), shuffle=True) 

Before we create our model, we test the predictive performance of a simple Logistic Regression model first.

In [18]:
model_logistic_regression = models.Sequential([
    # Logistic Regression
    layers.InputLayer(input_shape=(26,)),
    layers.Dense(units=2, activation='sigmoid')
])

model_logistic_regression.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_10 (Dense)            (None, 2)                 54        
                                                                 
Total params: 54 (216.00 Byte)
Trainable params: 54 (216.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [19]:
model_logistic_regression.compile(optimizer='adam', # optimizer type
              loss=tf.keras.losses.sparse_categorical_crossentropy, # loss function
              metrics=['accuracy'])
history_logistic_regression = model_logistic_regression.fit(x=X_train,y=y_train,
                     epochs=20,
                     validation_data=(X_val,y_val))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


Now we try our own Neural Network model.

In [22]:
model1 = models.Sequential([
    # NN
    layers.InputLayer(input_shape=(26,)),
    layers.Dense(units=128, use_bias=True), # Linear layer
    layers.Dense(units=64, use_bias=True, activation='relu'), # Non-Linear layer
    #layers.Flatten(),
    layers.Dense(units=64, activation='relu'), # An inner product of previous parameters with weights
    layers.Dense(units=2)
    
])

model1.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_18 (Dense)            (None, 128)               3456      
                                                                 
 dense_19 (Dense)            (None, 64)                8256      
                                                                 
 dense_20 (Dense)            (None, 64)                4160      
                                                                 
 dense_21 (Dense)            (None, 2)                 130       
                                                                 
Total params: 16002 (62.51 KB)
Trainable params: 16002 (62.51 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [23]:
model1.compile(optimizer='adam', # optimizer type
              loss=tf.keras.losses.sparse_categorical_crossentropy, # loss function
              metrics=['accuracy'])

stop_here_please = EarlyStopping(patience=3)

history_1 = model1.fit(x=X_train,y=y_train,
                     epochs=20,
                     validation_data=(X_val,y_val),
                     callbacks=[stop_here_please])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20


In [26]:
model1.evaluate(x=X_test,y=y_test)



[9.698176383972168, 0.39830508828163147]

In [None]:
model1.predict()