# Deep Learning
In this notebook, we will be using neural networks in order to try and predict individual games. We will be using the data that was collected from the sportsreference API from the summer of 2020.

# Imports
Import the necessary modules we will be utilizing

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

from sklearn.model_selection import train_test_split
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

pd.set_option("display.max_rows", None, "display.max_columns", None)

# Getting Data Ready

In [2]:
# Load data.csv
dataframe = pd.read_csv('../assets/data/data.csv')
dataframe.head()

Unnamed: 0,away_assist_percentage,away_assists,away_block_percentage,away_blocks,away_defensive_rating,away_defensive_rebound_percentage,away_defensive_rebounds,away_effective_field_goal_percentage,away_field_goal_attempts,away_field_goal_percentage,away_field_goals,away_free_throw_attempt_rate,away_free_throw_attempts,away_free_throw_percentage,away_free_throws,away_losses,away_minutes_played,away_offensive_rating,away_offensive_rebound_percentage,away_offensive_rebounds,away_personal_fouls,away_points,away_ranking,away_steal_percentage,away_steals,away_three_point_attempt_rate,away_three_point_field_goal_attempts,away_three_point_field_goal_percentage,away_three_point_field_goals,away_total_rebound_percentage,away_total_rebounds,away_true_shooting_percentage,away_turnover_percentage,away_turnovers,away_two_point_field_goal_attempts,away_two_point_field_goal_percentage,away_two_point_field_goals,away_win_percentage,away_wins,date,home_assist_percentage,home_assists,home_block_percentage,home_blocks,home_defensive_rating,home_defensive_rebound_percentage,home_defensive_rebounds,home_effective_field_goal_percentage,home_field_goal_attempts,home_field_goal_percentage,home_field_goals,home_free_throw_attempt_rate,home_free_throw_attempts,home_free_throw_percentage,home_free_throws,home_losses,home_minutes_played,home_offensive_rating,home_offensive_rebound_percentage,home_offensive_rebounds,home_personal_fouls,home_points,home_ranking,home_steal_percentage,home_steals,home_three_point_attempt_rate,home_three_point_field_goal_attempts,home_three_point_field_goal_percentage,home_three_point_field_goals,home_total_rebound_percentage,home_total_rebounds,home_true_shooting_percentage,home_turnover_percentage,home_turnovers,home_two_point_field_goal_attempts,home_two_point_field_goal_percentage,home_two_point_field_goals,home_win_percentage,home_wins,location,losing_abbr,losing_name,pace,winner,winning_abbr,winning_name
0,25.0,3,6.3,3,115.4,69.2,27,0.286,49,0.245,12,0.367,18,0.611,11,0,200,50.0,18.9,7,17,39,,3.8,3,0.429,21,0.19,4,44.7,34,0.339,33.0,28,28,0.286,8,0.0,0,"November 5, 2019",55.6,20,7.1,2,50.0,81.1,30,0.52,75,0.48,36,0.227,17,0.706,12,0,200,115.4,30.8,12,21,90,,20.5,16,0.36,27,0.222,6,55.3,42,0.542,7.8,7,48,0.625,30,1.0,1,"Moody Coliseum , Abilene, Texas",Arlington Baptist\r\n\t\t\t,Arlington Baptist\r\n\t\t\t,78.3,Home,ABILENE-CHRISTIAN,Abilene Christian
1,43.3,13,6.3,2,107.5,65.2,15,0.531,65,0.462,30,0.308,20,0.7,14,1,225,103.8,8.8,3,26,83,,15.0,12,0.385,25,0.36,9,31.6,18,0.557,9.8,8,40,0.525,21,0.5,1,"November 10, 2019",55.6,15,12.5,5,103.8,91.2,31,0.596,52,0.519,27,0.615,32,0.75,24,1,225,107.5,34.8,8,17,86,,5.0,4,0.385,20,0.4,8,68.4,39,0.64,25.0,22,32,0.594,19,0.667,2,"Daskalakis Athletic Center, Philadelphia, Penn...",ABILENE-CHRISTIAN,Abilene Christian,71.4,Home,DREXEL,Drexel
2,39.1,9,3.1,1,94.5,70.0,21,0.52,51,0.451,23,0.451,23,0.87,20,1,200,100.0,18.5,5,22,73,,12.3,9,0.412,21,0.333,7,45.6,26,0.589,21.8,17,30,0.533,16,0.75,3,"November 16, 2019",45.0,9,0.0,0,100.0,81.5,22,0.45,50,0.4,20,0.66,33,0.727,24,2,200,94.5,30.0,9,20,69,,13.7,10,0.36,18,0.278,5,54.4,31,0.525,18.9,15,32,0.469,15,0.333,1,"Moody Coliseum , Abilene, Texas",ABILENE-CHRISTIAN,Abilene Christian,72.8,Away,PEPPERDINE,Pepperdine
3,72.7,16,3.6,1,112.5,69.6,16,0.432,59,0.373,22,0.153,9,0.778,7,3,200,90.6,29.0,9,17,58,,9.4,6,0.441,26,0.269,7,46.3,25,0.458,14.9,11,33,0.455,15,0.25,1,"November 18, 2019",51.9,14,12.1,4,90.6,71.0,22,0.6,50,0.54,27,0.3,15,0.8,12,3,200,112.5,30.4,7,13,72,,6.3,4,0.44,22,0.273,6,53.7,29,0.63,17.5,12,28,0.75,21,0.4,2,"Thomas & Mack Center, Las Vegas, Nevada",ABILENE-CHRISTIAN,Abilene Christian,63.7,Home,NEVADA-LAS-VEGAS,UNLV
4,43.8,7,2.4,1,121.6,55.6,20,0.433,45,0.356,16,0.467,21,0.905,19,0,200,78.4,17.2,5,23,58,,9.5,7,0.356,16,0.438,7,38.5,25,0.528,29.8,23,29,0.31,9,0.0,0,"November 21, 2019",66.7,22,10.3,3,78.4,82.8,24,0.545,67,0.493,33,0.343,23,0.739,17,3,200,121.6,44.4,16,17,90,,8.1,6,0.373,25,0.28,7,61.5,40,0.577,14.4,13,42,0.619,26,0.4,2,"Moody Coliseum , Abilene, Texas",Champion Christian\r\n\t\t\t,Champion Christian\r\n\t\t\t,74.0,Home,ABILENE-CHRISTIAN,Abilene Christian


In [3]:
# These fields aren't necessary to 
FIELDS_TO_DROP = ['date',
                  'location',
                  'away_defensive_rating',
                  'home_defensive_rating',
                  'away_defensive_rebound_percentage',
                  'home_defensive_rebound_percentage',
                  'losing_abbr',
                  'winner',
                  'winning_abbr',
                  'home_ranking',
                  'away_ranking',
                  'winning_name',
                  'losing_name',
                  'away_points',
                  'home_points']
data = dataframe.copy()
data['target'] = np.where(dataframe['winner'] == "Home", 0, 1)
data.drop(columns=FIELDS_TO_DROP, inplace=True)
data.dropna(inplace=True)
data.head()

Unnamed: 0,away_assist_percentage,away_assists,away_block_percentage,away_blocks,away_defensive_rebounds,away_effective_field_goal_percentage,away_field_goal_attempts,away_field_goal_percentage,away_field_goals,away_free_throw_attempt_rate,away_free_throw_attempts,away_free_throw_percentage,away_free_throws,away_losses,away_minutes_played,away_offensive_rating,away_offensive_rebound_percentage,away_offensive_rebounds,away_personal_fouls,away_steal_percentage,away_steals,away_three_point_attempt_rate,away_three_point_field_goal_attempts,away_three_point_field_goal_percentage,away_three_point_field_goals,away_total_rebound_percentage,away_total_rebounds,away_true_shooting_percentage,away_turnover_percentage,away_turnovers,away_two_point_field_goal_attempts,away_two_point_field_goal_percentage,away_two_point_field_goals,away_win_percentage,away_wins,home_assist_percentage,home_assists,home_block_percentage,home_blocks,home_defensive_rebounds,home_effective_field_goal_percentage,home_field_goal_attempts,home_field_goal_percentage,home_field_goals,home_free_throw_attempt_rate,home_free_throw_attempts,home_free_throw_percentage,home_free_throws,home_losses,home_minutes_played,home_offensive_rating,home_offensive_rebound_percentage,home_offensive_rebounds,home_personal_fouls,home_steal_percentage,home_steals,home_three_point_attempt_rate,home_three_point_field_goal_attempts,home_three_point_field_goal_percentage,home_three_point_field_goals,home_total_rebound_percentage,home_total_rebounds,home_true_shooting_percentage,home_turnover_percentage,home_turnovers,home_two_point_field_goal_attempts,home_two_point_field_goal_percentage,home_two_point_field_goals,home_win_percentage,home_wins,pace,target
0,25.0,3,6.3,3,27,0.286,49,0.245,12,0.367,18,0.611,11,0,200,50.0,18.9,7,17,3.8,3,0.429,21,0.19,4,44.7,34,0.339,33.0,28,28,0.286,8,0.0,0,55.6,20,7.1,2,30,0.52,75,0.48,36,0.227,17,0.706,12,0,200,115.4,30.8,12,21,20.5,16,0.36,27,0.222,6,55.3,42,0.542,7.8,7,48,0.625,30,1.0,1,78.3,0
1,43.3,13,6.3,2,15,0.531,65,0.462,30,0.308,20,0.7,14,1,225,103.8,8.8,3,26,15.0,12,0.385,25,0.36,9,31.6,18,0.557,9.8,8,40,0.525,21,0.5,1,55.6,15,12.5,5,31,0.596,52,0.519,27,0.615,32,0.75,24,1,225,107.5,34.8,8,17,5.0,4,0.385,20,0.4,8,68.4,39,0.64,25.0,22,32,0.594,19,0.667,2,71.4,0
2,39.1,9,3.1,1,21,0.52,51,0.451,23,0.451,23,0.87,20,1,200,100.0,18.5,5,22,12.3,9,0.412,21,0.333,7,45.6,26,0.589,21.8,17,30,0.533,16,0.75,3,45.0,9,0.0,0,22,0.45,50,0.4,20,0.66,33,0.727,24,2,200,94.5,30.0,9,20,13.7,10,0.36,18,0.278,5,54.4,31,0.525,18.9,15,32,0.469,15,0.333,1,72.8,1
3,72.7,16,3.6,1,16,0.432,59,0.373,22,0.153,9,0.778,7,3,200,90.6,29.0,9,17,9.4,6,0.441,26,0.269,7,46.3,25,0.458,14.9,11,33,0.455,15,0.25,1,51.9,14,12.1,4,22,0.6,50,0.54,27,0.3,15,0.8,12,3,200,112.5,30.4,7,13,6.3,4,0.44,22,0.273,6,53.7,29,0.63,17.5,12,28,0.75,21,0.4,2,63.7,0
4,43.8,7,2.4,1,20,0.433,45,0.356,16,0.467,21,0.905,19,0,200,78.4,17.2,5,23,9.5,7,0.356,16,0.438,7,38.5,25,0.528,29.8,23,29,0.31,9,0.0,0,66.7,22,10.3,3,24,0.545,67,0.493,33,0.343,23,0.739,17,3,200,121.6,44.4,16,17,8.1,6,0.373,25,0.28,7,61.5,40,0.577,14.4,13,42,0.619,26,0.4,2,74.0,0


Now, we have to split the data into training, validation, and test sets

In [4]:
train, test = train_test_split(data, test_size=0.2)
train, val = train_test_split(train, test_size=0.2)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

7075 train examples
1769 validation examples
2212 test examples


Next, we need wrap the dataframes with `tf.data`, in order to shuffle and batch the data.

In [5]:
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    dataframe = dataframe.copy()
    labels = dataframe.pop('target')
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    ds = ds.prefetch(batch_size)
    return ds

For each of the Numeric feature, we will use a Normalization() layer to make sure the mean of each feature is 0 and its standard deviation is 1.

In [6]:
def get_normalization_layer(name, dataset):
    # Create a Normalization layer for our feature.
    normalizer = preprocessing.Normalization()

    # Prepare a Dataset that only yields our feature.
    feature_ds = dataset.map(lambda x, y: x[name])

    # Learn the statistics of the data.
    normalizer.adapt(feature_ds)

    return normalizer

Create new input pipeline and make numeric columns for all of the features

In [7]:
batch_size = 32
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

In [8]:
all_inputs = []
encoded_features = []

# Just to get the training features
[(train_features, label_batch)] = train_ds.take(1)

# Numeric features.
for header in list(train_features.keys()):
    numeric_col = tf.keras.Input(shape=(1,), name=header)
    normalization_layer = get_normalization_layer(header, train_ds)
    encoded_numeric_col = normalization_layer(numeric_col)
    all_inputs.append(numeric_col)
    encoded_features.append(encoded_numeric_col)

# Compile the Neural Network

In [9]:
all_features = tf.keras.layers.concatenate(encoded_features)
x = tf.keras.layers.Dense(16, activation="relu")(all_features)
x = tf.keras.layers.Dense(16, activation="relu")(x)
x = tf.keras.layers.Dropout(0.5)(x)
output = tf.keras.layers.Dense(1)(x)
model = tf.keras.Model(all_inputs, output)
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=["accuracy"])

# Train the Network

In [10]:
model.fit(train_ds, epochs=5, validation_data=val_ds)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x20605eff7c0>

In [11]:
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)

Accuracy 0.982820987701416


# Analysis
The results here, while they may look spectacular, are very misleading. The way that the current data is setup, we are only looking at the game stats and determining the winner based on that. If we have the number of 2PM, 3PM, and freethrows for both teams in the current game, it's trivial to find out who won the game, we can easily just calculate the number of points each team had. To make this better, we should restructure our data. In particular, for each sample (row), it should contain the **averages** for both teams **going into** the match up and then have a label of which team won after they played. This way, we don't know the game stats, but might be able to predict what they could be given the teams' averages.