In [4]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer

from sklearn.model_selection import train_test_split
import joblib

import keras
from keras.models import Sequential
from keras.layers import Dense

import tensorflow as tf

In [5]:
games_path = "/data/nfl/armchairanalysis/nfl_00-14/csv/GAME.csv"
plays_path = "/data/nfl/armchairanalysis/nfl_00-14/csv/PLAY.csv"

In [6]:
plays = pd.read_csv(plays_path)
print(plays.columns)
plays = plays[['gid', 'off', 'def', 'qtr', 'min', 'sec', 'ptso', 'ptsd']]

# Convert game clock to total seconds elapsed in the game. OT=5
plays['time_elapsed_sec'] = 15*60*(plays['qtr'] - 1) + (15*60 - (60*plays['min'] + plays['sec']))

plays = plays.drop(['qtr', 'min', 'sec'], axis=1)  # Drop fields no longer needed
plays.sample(5)

Index(['gid', 'pid', 'off', 'def', 'type', 'dseq', 'len', 'qtr', 'min', 'sec',
       'ptso', 'ptsd', 'timo', 'timd', 'dwn', 'ytg', 'yfog', 'zone', 'fd',
       'sg', 'nh', 'pts', 'tck', 'sk', 'pen', 'ints', 'fum', 'saf', 'blk',
       'olid'],
      dtype='object')


Unnamed: 0,gid,off,def,ptso,ptsd,time_elapsed_sec
126737,778,NYG,SF,0,0,160
402688,2482,MIN,PIT,0,3,1048
401855,2476,CHI,ATL,14,21,3480
483363,2976,PHI,SF,0,0,643
2799,18,CIN,CLE,0,0,403


In [7]:
games = pd.read_csv(games_path)
print(games.columns)
games = games[['gid', 'seas', 'v', 'h', 'sprv', 'ptsv', 'ptsh', 'ou']]
games['home_dog'] = games['sprv'] < 0  # Spread is relative to the visiting team

# which team is the favorite and which is the underdog?
games['favorite'] = np.where(games['home_dog'], games['v'], games['h'])
games['underdog'] = np.where(games['home_dog'], games['h'], games['v'])
games['final_points_favorite'] = np.where(games['home_dog'], games['ptsv'], games['ptsh'])
games['final_points_underdog'] = np.where(games['home_dog'], games['ptsh'], games['ptsv'])

# Determine whether the favorite covered
games['underdog_with_spread'] = np.where(games['home_dog'], games['final_points_underdog'] - games['sprv'], games['final_points_underdog'] + games['sprv'])
games['did_favorite_cover'] = games['underdog_with_spread'] < games['final_points_favorite']  # Favors the underdog on a push

# Drop columns that we no longer need
games = games.drop(['underdog_with_spread', 'ptsv', 'ptsh', 'h', 'v'], axis=1)

print("Num games: ", len(games.index))
print("Num home dogs: ", np.sum(games['home_dog']))
print("Num of favorites covering: ", np.sum(games['did_favorite_cover']))
games.head(10)

Index(['gid', 'seas', 'wk', 'day', 'v', 'h', 'stad', 'temp', 'humd', 'wspd',
       'wdir', 'cond', 'surf', 'ou', 'sprv', 'ptsv', 'ptsh'],
      dtype='object')
Num games:  3989
Num home dogs:  1268
Num of favorites covering:  1890


Unnamed: 0,gid,seas,sprv,ou,home_dog,favorite,underdog,final_points_favorite,final_points_underdog,did_favorite_cover
0,1,2000,7.0,42.5,False,ATL,SF,36,28,True
1,2,2000,-10.0,38.0,True,JAC,CLE,27,7,True
2,3,2000,6.0,40.0,False,DAL,PHI,14,41,False
3,4,2000,2.5,36.0,False,GB,NYJ,16,20,False
4,5,2000,-3.0,44.0,True,IND,KC,27,14,True
5,6,2000,3.0,36.0,False,MIA,SEA,23,0,True
6,7,2000,4.5,47.0,False,MIN,CHI,30,27,False
7,8,2000,-3.0,35.5,True,TB,NE,21,16,True
8,9,2000,1.0,39.5,False,NO,DET,10,14,False
9,10,2000,7.0,40.0,False,NYG,ARI,21,16,False


In [9]:
joined = plays.merge(games, on="gid",)  # Join game data on play data

# Play lengths are highly inaccurate in the 2000 and 2001 season's due to sporadic recording of play clock times.
joined = joined[joined['seas'] > 2001]

# Compute underdog/favorite points at this point in the game
joined['current_points_favorite'] = np.where(joined['off'] == joined['favorite'], joined['ptso'], joined['ptsd'])
joined['current_points_underdog'] = np.where(joined['off'] == joined['favorite'], joined['ptsd'], joined['ptso'])

joined['current_points_underdog_with_spread'] = np.where(joined['home_dog'], joined['current_points_underdog'] - joined['sprv'], joined['current_points_underdog'] + joined['sprv'])
joined['current_spread_difference'] = joined['current_points_favorite'] - joined['current_points_underdog_with_spread']

joined['home_dog'] =joined.home_dog.astype(int)  # convert to numeric
joined['did_favorite_cover'] = joined.did_favorite_cover.astype(int)  # Convert to numeric

joined.to_parquet("/data/nfl/armchairanalysis/crewpool/joined.parquet")

joined.sample(10)

ImportError: Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.
pyarrow or fastparquet is required for parquet support

In [6]:
# Generate X and Y dataframes for training
y = joined['did_favorite_cover'].values
x = joined.drop(
    ['gid', 'seas', 'off', 'def', 'favorite', 'underdog', 'final_points_favorite', 
     'final_points_underdog', 'ptso', 'ptsd', 'ou', "did_favorite_cover",
     'current_points_underdog_with_spread'
    ]
    , axis=1)
x.head(5)

Unnamed: 0,time_elapsed_sec,sprv,home_dog,current_points_favorite,current_points_underdog,current_spread_difference
84060,0,-3.5,1,0,0,-3.5
84061,0,-3.5,1,0,0,-3.5
84062,35,-3.5,1,0,0,-3.5
84063,40,-3.5,1,0,0,-3.5
84064,40,-3.5,1,0,0,-3.5


In [7]:
# Generate train/test splits
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

# Fit scaler on the training data, then apply to the test data
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.transform(X_test)

print(x.shape)
print(y.shape)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
pd.DataFrame(X_train_scaled).head(5)

(568306, 6)
(568306,)
(454644, 6)
(113662, 6)
(454644,)
(113662,)


Unnamed: 0,0,1,2,3,4,5
0,-1.655779,-0.952832,1.461786,-1.16769,-1.049051,-0.055554
1,1.196066,0.762698,-0.684095,1.028249,0.876862,0.134774
2,-0.441402,0.076486,-0.684095,-1.16769,0.536995,-1.387846
3,-0.517123,0.076486,-0.684095,-0.594836,-1.049051,0.515429
4,-0.467905,-0.781279,1.461786,-0.881263,-1.049051,0.325101


In [8]:
# Initialising the ANN
model = Sequential()
model.add(Dense(32, activation=tf.nn.relu, input_dim=X_train_scaled.shape[1]))
model.add(Dense(16, activation=tf.nn.relu))
model.add(Dense(8, activation=tf.nn.relu))
model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])







Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [9]:
# Train
model.fit(X_train_scaled, y_train, batch_size = 128, epochs = 64, shuffle=True)


Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64
Epoch 36/64
Epoch 37/64
Epoch 38/64
Epoch 39/64
Epoch 40/64
Epoch 41/64
Epoch 42/64
Epoch 43/64
Epoch 44/64
Epoch 45/64
Epoch 46/64
Epoch 47/64
Epoch 48/64
Epoch 49/64
Epoch 50/64
Epoch 51/64
Epoch 52/64
Epoch 53/64
Epoch 54/64
Epoch 55/64
Epoch 56/64
Epoch 57/64
Epoch 58/64
Epoch 59/64
Epoch 60/64
Epoch 61/64
Epoch 62/64
Epoch 63/64
Epoch 64/64


<keras.callbacks.History at 0x13dcd1c18>

In [10]:
y_pred = model.predict(X_test_scaled)  # Generate probabilites for y test
y_pred = (y_pred > 0.5)  # Convert to boolean


from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred) # How often can we correctly predict the outcome?

array([[49985, 10006],
       [18954, 34717]])

In [11]:
# Get random plays, print the play scenario, and generate a prediction
# Used for validating the outcome of the model
for i in range(10):
    row = joined.sample(1)
#     print("favorite is ", row['favorite'].values[0])
#     print("underdog is ", row['underdog'].values[0])
    print("spread is ", row['sprv'].values[0])
    print("time_elapsed_min is ", (row['time_elapsed_sec'].values[0]/60))
    print("game is {0:.2f}% over".format(100*(row['time_elapsed_sec'].values[0]/60)/60))
    print("current points_favorite is ", row['current_points_favorite'].values[0])
    print("current points_underdog is ", row['current_points_underdog'].values[0])
    print("did_favorite_cover: {}".format(row['did_favorite_cover'].values[0]))

    test = row.drop(
        ['gid', 'seas', 'off', 'def', 'favorite', 'underdog', 'final_points_favorite', 
         'final_points_underdog', 'ptso', 'ptsd', 'ou', 'did_favorite_cover',
         'current_points_underdog_with_spread'
        ], axis=1)
    row_scaled = sc.transform(test)

    pred = model.predict(row_scaled)
    print("PREDICTION FOR FAVORITE COVERING: {}".format(pred[0]))
    print("*********************")

spread is  -2.0
time_elapsed_min is  19.25
game is 32.08% over
current points_favorite is  0
current points_underdog is  13
did_favorite_cover: 0
PREDICTION FOR FAVORITE COVERING: [0.27864107]
*********************
spread is  4.0
time_elapsed_min is  40.93333333333333
game is 68.22% over
current points_favorite is  17
current points_underdog is  19
did_favorite_cover: 1
PREDICTION FOR FAVORITE COVERING: [0.2141943]
*********************
spread is  -4.0
time_elapsed_min is  28.816666666666666
game is 48.03% over
current points_favorite is  17
current points_underdog is  24
did_favorite_cover: 0
PREDICTION FOR FAVORITE COVERING: [0.24420935]
*********************
spread is  7.5
time_elapsed_min is  36.7
game is 61.17% over
current points_favorite is  21
current points_underdog is  17
did_favorite_cover: 0
PREDICTION FOR FAVORITE COVERING: [0.391829]
*********************
spread is  5.5
time_elapsed_min is  24.766666666666666
game is 41.28% over
current points_favorite is  7
current point

In [12]:
# Save the trained model to disk for reuse
model.save("model/model.h5")

# save the scaler to disk
joblib.dump(sc, "model/scaler.pkl") 

['model/scaler.pkl']

In [20]:
%%time
import inference
inference.load_model('model/model.h5')

CPU times: user 878 ms, sys: 20.3 ms, total: 899 ms
Wall time: 895 ms


<keras.engine.sequential.Sequential at 0x1356a3198>