In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer

from sklearn.model_selection import train_test_split
import joblib

import keras
from keras.models import Sequential
from keras.layers import Dense

import tensorflow as tf

Using TensorFlow backend.


In [2]:
games_path = "/data/nfl/armchairanalysis/nfl_00-14/csv/GAME.csv"
plays_path = "/data/nfl/armchairanalysis/nfl_00-14/csv/PLAY.csv"

In [3]:
plays = pd.read_csv(plays_path)
print(plays.columns)
plays = plays[['gid', 'off', 'def', 'qtr', 'min', 'sec', 'ptso', 'ptsd']]

# Convert game clock to total seconds elapsed in the game. OT=5
plays['time_elapsed_sec'] = 15*60*(plays['qtr'] - 1) + (15*60 - (60*plays['min'] + plays['sec']))

plays = plays.drop(['qtr', 'min', 'sec'], axis=1)  # Drop fields no longer needed
plays.sample(5)

Index(['gid', 'pid', 'off', 'def', 'type', 'dseq', 'len', 'qtr', 'min', 'sec',
       'ptso', 'ptsd', 'timo', 'timd', 'dwn', 'ytg', 'yfog', 'zone', 'fd',
       'sg', 'nh', 'pts', 'tck', 'sk', 'pen', 'ints', 'fum', 'saf', 'blk',
       'olid'],
      dtype='object')


Unnamed: 0,gid,off,def,ptso,ptsd,time_elapsed_sec
418833,2581,NO,ATL,3,6,927
357203,2198,TB,CAR,17,3,2007
212857,1305,NYJ,STL,26,29,3586
425543,2622,BAL,PIT,10,13,1607
200203,1228,GB,STL,31,17,3343


In [4]:
games = pd.read_csv(games_path)
print(games.columns)
games = games[['gid', 'seas', 'v', 'h', 'sprv', 'ptsv', 'ptsh', 'ou']]
games['home_dog'] = games['sprv'] < 0  # Spread is relative to the visiting team

# which team is the favorite and which is the underdog?
games['favorite'] = np.where(games['home_dog'], games['v'], games['h'])
games['underdog'] = np.where(games['home_dog'], games['h'], games['v'])
games['final_points_favorite'] = np.where(games['home_dog'], games['ptsv'], games['ptsh'])
games['final_points_underdog'] = np.where(games['home_dog'], games['ptsh'], games['ptsv'])

# Determine whether the favorite covered
games['underdog_with_spread'] = np.where(games['home_dog'], games['final_points_underdog'] - games['sprv'], games['final_points_underdog'] + games['sprv'])
games['did_favorite_cover'] = games['underdog_with_spread'] < games['final_points_favorite']  # Favors the underdog on a push

# Drop columns that we no longer need
games = games.drop(['underdog_with_spread', 'ptsv', 'ptsh', 'h', 'v'], axis=1)

print("Num games: ", len(games.index))
print("Num home dogs: ", np.sum(games['home_dog']))
print("Num of favorites covering: ", np.sum(games['did_favorite_cover']))
games.head(10)

Index(['gid', 'seas', 'wk', 'day', 'v', 'h', 'stad', 'temp', 'humd', 'wspd',
       'wdir', 'cond', 'surf', 'ou', 'sprv', 'ptsv', 'ptsh'],
      dtype='object')
Num games:  3989
Num home dogs:  1268
Num of favorites covering:  1890


Unnamed: 0,gid,seas,sprv,ou,home_dog,favorite,underdog,final_points_favorite,final_points_underdog,did_favorite_cover
0,1,2000,7.0,42.5,False,ATL,SF,36,28,True
1,2,2000,-10.0,38.0,True,JAC,CLE,27,7,True
2,3,2000,6.0,40.0,False,DAL,PHI,14,41,False
3,4,2000,2.5,36.0,False,GB,NYJ,16,20,False
4,5,2000,-3.0,44.0,True,IND,KC,27,14,True
5,6,2000,3.0,36.0,False,MIA,SEA,23,0,True
6,7,2000,4.5,47.0,False,MIN,CHI,30,27,False
7,8,2000,-3.0,35.5,True,TB,NE,21,16,True
8,9,2000,1.0,39.5,False,NO,DET,10,14,False
9,10,2000,7.0,40.0,False,NYG,ARI,21,16,False


In [5]:
joined = plays.merge(games, on="gid",)  # Join game data on play data

# Play lengths are highly inaccurate in the 2000 and 2001 season's due to sporadic recording of play clock times.
joined = joined[joined['seas'] > 2001]

# Compute underdog/favorite points at this point in the game
joined['current_points_favorite'] = np.where(joined['off'] == joined['favorite'], joined['ptso'], joined['ptsd'])
joined['current_points_underdog'] = np.where(joined['off'] == joined['favorite'], joined['ptsd'], joined['ptso'])

joined['home_dog'] =joined.home_dog.astype(int)  # convert to numeric
joined['did_favorite_cover'] = joined.did_favorite_cover.astype(int)  # Convert to numeric

joined.head(10)

Unnamed: 0,gid,off,def,ptso,ptsd,time_elapsed_sec,seas,sprv,ou,home_dog,favorite,underdog,final_points_favorite,final_points_underdog,did_favorite_cover,current_points_favorite,current_points_underdog
84060,519,NYG,SF,0,0,0,2002,-3.5,38.5,1,SF,NYG,16,13,0,0,0
84061,519,NYG,SF,0,0,0,2002,-3.5,38.5,1,SF,NYG,16,13,0,0,0
84062,519,NYG,SF,0,0,35,2002,-3.5,38.5,1,SF,NYG,16,13,0,0,0
84063,519,NYG,SF,0,0,40,2002,-3.5,38.5,1,SF,NYG,16,13,0,0,0
84064,519,NYG,SF,0,0,40,2002,-3.5,38.5,1,SF,NYG,16,13,0,0,0
84065,519,SF,NYG,0,0,51,2002,-3.5,38.5,1,SF,NYG,16,13,0,0,0
84066,519,SF,NYG,0,0,82,2002,-3.5,38.5,1,SF,NYG,16,13,0,0,0
84067,519,SF,NYG,0,0,85,2002,-3.5,38.5,1,SF,NYG,16,13,0,0,0
84068,519,SF,NYG,0,0,133,2002,-3.5,38.5,1,SF,NYG,16,13,0,0,0
84069,519,SF,NYG,0,0,140,2002,-3.5,38.5,1,SF,NYG,16,13,0,0,0


In [6]:
# Generate X and Y dataframes for training
y = joined['did_favorite_cover'].values
x = joined.drop(
    ['gid', 'seas', 'off', 'def', 'favorite', 'underdog', 'final_points_favorite', 
     'final_points_underdog', 'ptso', 'ptsd', 'ou', "did_favorite_cover"]
    , axis=1)
x.head(5)

Unnamed: 0,time_elapsed_sec,sprv,home_dog,current_points_favorite,current_points_underdog
84060,0,-3.5,1,0,0
84061,0,-3.5,1,0,0
84062,35,-3.5,1,0,0
84063,40,-3.5,1,0,0
84064,40,-3.5,1,0,0


In [7]:
# Generate train/test splits
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

# Fit scaler on the training data, then apply to the test data
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.transform(X_test)

print(x.shape)
print(y.shape)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
pd.DataFrame(X_train_scaled).head(5)

(568306, 5)
(568306,)
(454644, 5)
(113662, 5)
(454644,)
(113662,)


Unnamed: 0,0,1,2,3,4
0,-1.655779,-0.952832,1.461786,-1.16769,-1.049051
1,1.196066,0.762698,-0.684095,1.028249,0.876862
2,-0.441402,0.076486,-0.684095,-1.16769,0.536995
3,-0.517123,0.076486,-0.684095,-0.594836,-1.049051
4,-0.467905,-0.781279,1.461786,-0.881263,-1.049051


In [8]:
# Define the network
model = Sequential()
model.add(Dense(128, activation=tf.nn.relu, input_dim=X_train_scaled.shape[1]))
model.add(keras.layers.Dense(128, activation=tf.nn.relu))
model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])






Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [9]:
# Train
model.fit(X_train_scaled, y_train, batch_size = 100, epochs = 25, shuffle=True)


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x138e88dd8>

In [10]:
y_pred = model.predict(X_test_scaled)  # Generate probabilites for y test
y_pred = (y_pred > 0.5)  # Convert to boolean


from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred) # How often can we correctly predict the outcome?

array([[48338, 11653],
       [16290, 37381]])

In [11]:
# Get random plays, print the play scenario, and generate a prediction
# Used for validating the outcome of the model
for i in range(10):
    row = joined.sample(1)
    print("favorite is ", row['favorite'].values[0])
    print("underdog is ", row['underdog'].values[0])
    print("spread is ", row['sprv'].values[0])
    print("time_elapsed_min is ", (row['time_elapsed_sec'].values[0]/60))
    print("game is {0:.2f}% over".format(100*(row['time_elapsed_sec'].values[0]/60)/60))
    print("current points_favorite is ", row['current_points_favorite'].values[0])
    print("current points_underdog is ", row['current_points_underdog'].values[0])
    print("did_favorite_cover: {}".format(row['did_favorite_cover'].values[0]))

    test = row.drop(
        ['gid', 'seas', 'off', 'def', 'favorite', 'underdog', 'final_points_favorite', 
         'final_points_underdog', 'ptso', 'ptsd', 'ou', 'did_favorite_cover'
        ], axis=1)
    row_scaled = sc.transform(test)

    pred = model.predict(row_scaled)
    print("PREDICTION FOR FAVORITE COVERING: {}".format(pred[0]))
    print("*********************")

favorite is  CLE
underdog is  KC
spread is  1.0
time_elapsed_min is  48.28333333333333
game is 80.47% over
current points_favorite is  14
current points_underdog is  13
did_favorite_cover: 0
PREDICTION FOR FAVORITE COVERING: [0.22844863]
*********************
favorite is  PIT
underdog is  HOU
spread is  -5.5
time_elapsed_min is  16.016666666666666
game is 26.69% over
current points_favorite is  17
current points_underdog is  0
did_favorite_cover: 1
PREDICTION FOR FAVORITE COVERING: [0.9071907]
*********************
favorite is  MIN
underdog is  CAR
spread is  -9.0
time_elapsed_min is  18.383333333333333
game is 30.64% over
current points_favorite is  0
current points_underdog is  0
did_favorite_cover: 0
PREDICTION FOR FAVORITE COVERING: [0.5278629]
*********************
favorite is  JAC
underdog is  HOU
spread is  6.5
time_elapsed_min is  12.1
game is 20.17% over
current points_favorite is  0
current points_underdog is  3
did_favorite_cover: 1
PREDICTION FOR FAVORITE COVERING: [0.46944

In [23]:
# Test specific in game scenarios
test = row = joined.sample(1)
test['time_elapsed_sec'] = 1500
test['current_points_favorite'] = 18
test['current_points_underdog'] = 3
test['sprv'] = 1.5
test['home_dog'] = int(test['sprv'] < 0)
test = row.drop(
    ['gid', 'seas', 'off', 'def', 'favorite', 'underdog', 'final_points_favorite', 
     'final_points_underdog', 'ptso', 'ptsd', 'ou', 'did_favorite_cover'
    ], axis=1)
print(test)
row_scaled = sc.transform(test)
pred = model.predict(row_scaled)
pred
row_scaled

        time_elapsed_sec  sprv  home_dog  current_points_favorite  \
104082              1500   1.5         0                       18   

        current_points_underdog  
104082                        3  


array([[-0.34296494, -0.18084354, -0.68409485,  0.55087108, -0.70918415]])

In [21]:
# Save the trained model to disk for reuse
model.save("model/model.h5")

# save the scaler to disk
joblib.dump(sc, "model/scaler.pkl") 

['model/scaler.pkl']

In [40]:
%%time
import inference
inference.load_model('model/model.h5')

CPU times: user 1.05 s, sys: 21.8 ms, total: 1.07 s
Wall time: 1.07 s


<keras.engine.sequential.Sequential at 0x13faf6278>