In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer

from sklearn.model_selection import train_test_split
import joblib

import keras
from keras.models import Sequential
from keras.layers import Dense

import tensorflow as tf

Using TensorFlow backend.


In [2]:
games_path = "/data/nfl/armchairanalysis/nfl_00-14/csv/GAME.csv"
plays_path = "/data/nfl/armchairanalysis/nfl_00-14/csv/PLAY.csv"

In [3]:
plays = pd.read_csv(plays_path)
print(plays.columns)
plays = plays[['gid', 'off', 'def', 'qtr', 'min', 'sec', 'ptso', 'ptsd']]

# Convert game clock to total seconds elapsed in the game. OT=5
plays['time_elapsed_sec'] = 15*60*(plays['qtr'] - 1) + (15*60 - (60*plays['min'] + plays['sec']))

plays = plays.drop(['qtr', 'min', 'sec'], axis=1)  # Drop fields no longer needed
plays.sample(5)

Index(['gid', 'pid', 'off', 'def', 'type', 'dseq', 'len', 'qtr', 'min', 'sec',
       'ptso', 'ptsd', 'timo', 'timd', 'dwn', 'ytg', 'yfog', 'zone', 'fd',
       'sg', 'nh', 'pts', 'tck', 'sk', 'pen', 'ints', 'fum', 'saf', 'blk',
       'olid'],
      dtype='object')


Unnamed: 0,gid,off,def,ptso,ptsd,time_elapsed_sec
87141,537,CAR,DET,31,7,3262
646483,3954,KC,PIT,0,3,524
626760,3835,NYJ,BUF,7,24,1430
636638,3894,MIA,DEN,21,17,1788
345322,2124,KC,NE,3,7,1844


In [4]:
games = pd.read_csv(games_path)
print(games.columns)
games = games[['gid', 'seas', 'v', 'h', 'sprv', 'ptsv', 'ptsh', 'ou']]
games['home_dog'] = games['sprv'] < 0  # Spread is relative to the visiting team

# which team is the favorite and which is the underdog?
games['favorite'] = np.where(games['home_dog'], games['v'], games['h'])
games['underdog'] = np.where(games['home_dog'], games['h'], games['v'])
games['final_points_favorite'] = np.where(games['home_dog'], games['ptsv'], games['ptsh'])
games['final_points_underdog'] = np.where(games['home_dog'], games['ptsh'], games['ptsv'])

# Determine whether the favorite covered
games['underdog_with_spread'] = np.where(games['home_dog'], games['final_points_underdog'] - games['sprv'], games['final_points_underdog'] + games['sprv'])
games['did_favorite_cover'] = games['underdog_with_spread'] < games['final_points_favorite']  # Favors the underdog on a push

# Drop columns that we no longer need
games = games.drop(['underdog_with_spread', 'ptsv', 'ptsh', 'h', 'v'], axis=1)

print("Num games: ", len(games.index))
print("Num home dogs: ", np.sum(games['home_dog']))
print("Num of favorites covering: ", np.sum(games['did_favorite_cover']))
games.head(10)

Index(['gid', 'seas', 'wk', 'day', 'v', 'h', 'stad', 'temp', 'humd', 'wspd',
       'wdir', 'cond', 'surf', 'ou', 'sprv', 'ptsv', 'ptsh'],
      dtype='object')
Num games:  3989
Num home dogs:  1268
Num of favorites covering:  1890


Unnamed: 0,gid,seas,sprv,ou,home_dog,favorite,underdog,final_points_favorite,final_points_underdog,did_favorite_cover
0,1,2000,7.0,42.5,False,ATL,SF,36,28,True
1,2,2000,-10.0,38.0,True,JAC,CLE,27,7,True
2,3,2000,6.0,40.0,False,DAL,PHI,14,41,False
3,4,2000,2.5,36.0,False,GB,NYJ,16,20,False
4,5,2000,-3.0,44.0,True,IND,KC,27,14,True
5,6,2000,3.0,36.0,False,MIA,SEA,23,0,True
6,7,2000,4.5,47.0,False,MIN,CHI,30,27,False
7,8,2000,-3.0,35.5,True,TB,NE,21,16,True
8,9,2000,1.0,39.5,False,NO,DET,10,14,False
9,10,2000,7.0,40.0,False,NYG,ARI,21,16,False


In [58]:
joined = plays.merge(games, on="gid",)  # Join game data on play data

# Play lengths are highly inaccurate in the 2000 and 2001 season's due to sporadic recording of play clock times.
joined = joined[joined['seas'] > 2001]

# Compute underdog/favorite points at this point in the game
joined['current_points_favorite'] = np.where(joined['off'] == joined['favorite'], joined['ptso'], joined['ptsd'])
joined['current_points_underdog'] = np.where(joined['off'] == joined['favorite'], joined['ptsd'], joined['ptso'])

joined['current_points_underdog_with_spread'] = np.where(joined['home_dog'], joined['current_points_underdog'] - joined['sprv'], joined['current_points_underdog'] + joined['sprv'])
joined['current_spread_difference'] = joined['current_points_favorite'] - joined['current_points_underdog_with_spread']

joined['home_dog'] =joined.home_dog.astype(int)  # convert to numeric
joined['did_favorite_cover'] = joined.did_favorite_cover.astype(int)  # Convert to numeric

joined.sample(10)

Unnamed: 0,gid,off,def,ptso,ptsd,time_elapsed_sec,seas,sprv,ou,home_dog,favorite,underdog,final_points_favorite,final_points_underdog,did_favorite_cover,current_points_favorite,current_points_underdog,current_points_underdog_with_spread,current_spread_difference
483440,2976,PHI,SF,23,10,2260,2011,9.0,44.0,0,PHI,SF,23,24,0,23,10,19.0,4.0
643646,3937,KC,OAK,7,0,839,2014,10.0,41.5,0,KC,OAK,31,13,1,7,0,10.0,-3.0
192790,1183,ARI,NYG,17,14,2631,2004,-2.0,38.5,1,NYG,ARI,14,17,0,14,17,19.0,-5.0
364573,2244,TEN,GB,13,10,1806,2008,4.5,40.5,0,TEN,GB,19,16,0,13,10,14.5,-1.5
410609,2530,NE,IND,24,14,1934,2009,3.0,49.5,0,IND,NE,35,34,0,14,24,27.0,-13.0
375332,2311,CHI,MIN,7,0,584,2008,3.5,41.5,0,MIN,CHI,34,14,1,0,7,10.5,-10.5
587332,3599,ARI,HOU,7,7,932,2013,3.0,41.5,0,ARI,HOU,27,24,0,7,7,10.0,-3.0
354430,2180,PIT,BAL,20,20,3826,2008,5.5,33.5,0,PIT,BAL,23,20,0,20,20,25.5,-5.5
201873,1238,PHI,GB,44,3,2673,2004,6.0,48.0,0,PHI,GB,47,17,1,44,3,9.0,35.0
630304,3856,NYG,IND,0,10,1292,2014,-3.0,51.0,1,IND,NYG,40,24,1,10,0,3.0,7.0


In [60]:
# Generate X and Y dataframes for training
y = joined['did_favorite_cover'].values
x = joined.drop(
    ['gid', 'seas', 'off', 'def', 'favorite', 'underdog', 'final_points_favorite', 
     'final_points_underdog', 'ptso', 'ptsd', 'ou', "did_favorite_cover",
     'current_points_underdog_with_spread'
    ]
    , axis=1)
x.head(5)

Unnamed: 0,time_elapsed_sec,sprv,home_dog,current_points_favorite,current_points_underdog,current_spread_difference
84060,0,-3.5,1,0,0,-3.5
84061,0,-3.5,1,0,0,-3.5
84062,35,-3.5,1,0,0,-3.5
84063,40,-3.5,1,0,0,-3.5
84064,40,-3.5,1,0,0,-3.5


In [61]:
# Generate train/test splits
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

# Fit scaler on the training data, then apply to the test data
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.transform(X_test)

print(x.shape)
print(y.shape)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
pd.DataFrame(X_train_scaled).head(5)

(568306, 6)
(568306,)
(454644, 6)
(113662, 6)
(454644,)
(113662,)


Unnamed: 0,0,1,2,3,4,5
0,-1.655779,-0.952832,1.461786,-1.16769,-1.049051,-0.055554
1,1.196066,0.762698,-0.684095,1.028249,0.876862,0.134774
2,-0.441402,0.076486,-0.684095,-1.16769,0.536995,-1.387846
3,-0.517123,0.076486,-0.684095,-0.594836,-1.049051,0.515429
4,-0.467905,-0.781279,1.461786,-0.881263,-1.049051,0.325101


In [78]:
# Initialising the ANN
model = Sequential()
model.add(Dense(32, activation=tf.nn.relu, input_dim=X_train_scaled.shape[1]))
model.add(Dense(16, activation=tf.nn.relu))
model.add(Dense(8, activation=tf.nn.relu))
model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [79]:
# Train
model.fit(X_train_scaled, y_train, batch_size = 128, epochs = 20, shuffle=True)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x13987fc18>

In [72]:
y_pred = model.predict(X_test_scaled)  # Generate probabilites for y test
y_pred = (y_pred > 0.5)  # Convert to boolean


from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred) # How often can we correctly predict the outcome?

array([[48481, 11510],
       [17636, 36035]])

In [82]:
# Get random plays, print the play scenario, and generate a prediction
# Used for validating the outcome of the model
for i in range(10):
    row = joined.sample(1)
#     print("favorite is ", row['favorite'].values[0])
#     print("underdog is ", row['underdog'].values[0])
    print("spread is ", row['sprv'].values[0])
    print("time_elapsed_min is ", (row['time_elapsed_sec'].values[0]/60))
    print("game is {0:.2f}% over".format(100*(row['time_elapsed_sec'].values[0]/60)/60))
    print("current points_favorite is ", row['current_points_favorite'].values[0])
    print("current points_underdog is ", row['current_points_underdog'].values[0])
    print("did_favorite_cover: {}".format(row['did_favorite_cover'].values[0]))

    test = row.drop(
        ['gid', 'seas', 'off', 'def', 'favorite', 'underdog', 'final_points_favorite', 
         'final_points_underdog', 'ptso', 'ptsd', 'ou', 'did_favorite_cover',
         'current_points_underdog_with_spread'
        ], axis=1)
    row_scaled = sc.transform(test)

    pred = model.predict(row_scaled)
    print("PREDICTION FOR FAVORITE COVERING: {}".format(pred[0]))
    print("*********************")

spread is  9.0
time_elapsed_min is  34.61666666666667
game is 57.69% over
current points_favorite is  7
current points_underdog is  14
did_favorite_cover: 1
PREDICTION FOR FAVORITE COVERING: [0.11206017]
*********************
spread is  1.0
time_elapsed_min is  20.883333333333333
game is 34.81% over
current points_favorite is  0
current points_underdog is  20
did_favorite_cover: 0
PREDICTION FOR FAVORITE COVERING: [0.09300562]
*********************
spread is  -3.0
time_elapsed_min is  46.333333333333336
game is 77.22% over
current points_favorite is  17
current points_underdog is  21
did_favorite_cover: 0
PREDICTION FOR FAVORITE COVERING: [0.28201386]
*********************
spread is  -3.0
time_elapsed_min is  50.11666666666667
game is 83.53% over
current points_favorite is  31
current points_underdog is  21
did_favorite_cover: 1
PREDICTION FOR FAVORITE COVERING: [0.7995033]
*********************
spread is  -7.5
time_elapsed_min is  12.483333333333333
game is 20.81% over
current points_

In [83]:
# Save the trained model to disk for reuse
model.save("model/model.h5")

# save the scaler to disk
joblib.dump(sc, "model/scaler.pkl") 

['model/scaler.pkl']

In [20]:
%%time
import inference
inference.load_model('model/model.h5')

CPU times: user 878 ms, sys: 20.3 ms, total: 899 ms
Wall time: 895 ms


<keras.engine.sequential.Sequential at 0x1356a3198>