In [132]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import zipfile
from kaggle.api.kaggle_api_extended import KaggleApi
# from kaggle.competitions import nflrush
from nimbusml import Pipeline
from nimbusml.ensemble import LightGbmRegressor
from nimbusml.ensemble.booster import Dart
from nimbusml.feature_extraction.categorical import OneHotVectorizer, OneHotHashVectorizer
from nimbusml.linear_model import FastLinearRegressor, OrdinaryLeastSquaresRegressor
from scipy.stats import norm

In [108]:
# Before running this cell, make sure you have the Kaggle API token.
# Go to Kaggle >> My Account and click "Create New API Token". This
# will download "kaggle.json" to your downloads folder. Copy this
# file to "C:\Users\<YOUR_USER_NAME\.kaggle\>".

# NOTE: Creating a new API token expires your previous token. If you
# are using the API on multiple machines, all machines should have
# the current API token.

COMPETITION_NAME = 'nfl-big-data-bowl-2020'
TRAIN_CSV = 'train.csv'
TRAIN_ZIP = TRAIN_CSV + '.zip'

api = KaggleApi()
api.authenticate()
api.competition_download_file(COMPETITION_NAME, TRAIN_CSV)
with zipfile.ZipFile(TRAIN_ZIP, 'r') as f:
    f.extractall()
os.remove(TRAIN_ZIP)

train.csv.zip: Skipping, found more recently modified local copy (use --force to force download)


In [133]:
raw_data = pd.read_csv("train.csv", low_memory=False)

In [134]:
pd.set_option('display.max_columns', 199)
raw_data.head()

Unnamed: 0,GameId,PlayId,Team,X,Y,S,A,Dis,Orientation,Dir,NflId,DisplayName,JerseyNumber,Season,YardLine,Quarter,GameClock,PossessionTeam,Down,Distance,FieldPosition,HomeScoreBeforePlay,VisitorScoreBeforePlay,NflIdRusher,OffenseFormation,OffensePersonnel,DefendersInTheBox,DefensePersonnel,PlayDirection,TimeHandoff,TimeSnap,Yards,PlayerHeight,PlayerWeight,PlayerBirthDate,PlayerCollegeName,Position,HomeTeamAbbr,VisitorTeamAbbr,Week,Stadium,Location,StadiumType,Turf,GameWeather,Temperature,Humidity,WindSpeed,WindDirection
0,2017090700,20170907000118,away,73.91,34.84,1.69,1.13,0.4,81.99,177.18,496723,Eric Berry,29,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-0,212,12/29/1988,Tennessee,SS,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
1,2017090700,20170907000118,away,74.67,32.64,0.42,1.35,0.01,27.61,198.7,2495116,Allen Bailey,97,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-3,288,03/25/1989,Miami,DE,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
2,2017090700,20170907000118,away,74.0,33.2,1.22,0.59,0.31,3.01,202.73,2495493,Justin Houston,50,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-3,270,01/21/1989,Georgia,DE,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
3,2017090700,20170907000118,away,71.46,27.7,0.42,0.54,0.02,359.77,105.64,2506353,Derrick Johnson,56,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-3,245,11/22/1982,Texas,ILB,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
4,2017090700,20170907000118,away,69.32,35.42,1.82,2.43,0.16,12.63,164.31,2530794,Ron Parker,38,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-0,206,08/17/1987,Newberry,FS,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW


In [135]:
%%capture
def preprocess_data(data):
    # For each play, we have 1 row per player, i.e. 22
    # rows. For simplicity, we will focus on only those
    # rows corresponding to the player who is rushing.
    #
    # TODO: Augment our solution by adding information
    #       on the other players later.
    data = data[data.NflId == data.NflIdRusher]
    
    data.loc[data.VisitorTeamAbbr == "ARI",'VisitorTeamAbbr'] = "ARZ"
    data.loc[data.HomeTeamAbbr == "ARI",'HomeTeamAbbr'] = "ARZ"

    data.loc[data.VisitorTeamAbbr == "BAL",'VisitorTeamAbbr'] = "BLT"
    data.loc[data.HomeTeamAbbr == "BAL",'HomeTeamAbbr'] = "BLT"

    data.loc[data.VisitorTeamAbbr == "CLE",'VisitorTeamAbbr'] = "CLV"
    data.loc[data.HomeTeamAbbr == "CLE",'HomeTeamAbbr'] = "CLV"

    data.loc[data.VisitorTeamAbbr == "HOU",'VisitorTeamAbbr'] = "HST"
    data.loc[data.HomeTeamAbbr == "HOU",'HomeTeamAbbr'] = "HST"
    
    data['ToLeft'] = data['PlayDirection'] == 'left'
    data['TeamOnOffense'] = data.apply(lambda x: 'home' if x.PossessionTeam == x.HomeTeamAbbr else 'away', axis=1)
    data['IsOnOffence'] = data['Team'] == data['TeamOnOffense']
    
    def yards_from_goal(yard_line, field_position, possession_team):
        if yard_line == 50:
            return 50
        elif field_position == possession_team:
            return yard_line
        else:
            return 100 - yard_line
    data['YardsFromGoal'] = data.apply(lambda x: yards_from_goal(x.YardLine, x.FieldPosition, x.PossessionTeam), axis=1)
    
    data['X_std'] = data.apply(lambda x: 120 - x.X if x.ToLeft else x.X, axis=1)
    data['Y_std'] = data.apply(lambda x: 160/3 - x.Y if x.ToLeft else x.Y, axis=1)
    
    data['GameDate'] = data.GameId.map(lambda x:pd.to_datetime(str(x)[:8]))
    data['PlayerAge'] = (data.GameDate.map(pd.to_datetime) - data.PlayerBirthDate.map(pd.to_datetime)).map(lambda x:x.days)/365
    data['PlayerHeightInches'] = data['PlayerHeight'].apply(lambda x: 12*int(x.split('-')[0]) + int(x.split('-')[1]))

    return data

data = preprocess_data(raw_data)

In [136]:
data.head()

Unnamed: 0,GameId,PlayId,Team,X,Y,S,A,Dis,Orientation,Dir,NflId,DisplayName,JerseyNumber,Season,YardLine,Quarter,GameClock,PossessionTeam,Down,Distance,FieldPosition,HomeScoreBeforePlay,VisitorScoreBeforePlay,NflIdRusher,OffenseFormation,OffensePersonnel,DefendersInTheBox,DefensePersonnel,PlayDirection,TimeHandoff,TimeSnap,Yards,PlayerHeight,PlayerWeight,PlayerBirthDate,PlayerCollegeName,Position,HomeTeamAbbr,VisitorTeamAbbr,Week,Stadium,Location,StadiumType,Turf,GameWeather,Temperature,Humidity,WindSpeed,WindDirection,ToLeft,TeamOnOffense,IsOnOffence,YardsFromGoal,X_std,Y_std,GameDate,PlayerAge,PlayerHeightInches
18,2017090700,20170907000118,home,78.75,30.53,3.63,3.35,0.38,161.98,245.74,2543773,James White,28,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,5-10,205,02/03/1992,Wisconsin,RB,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW,True,home,True,35,41.25,22.803333,2017-09-07,25.610959,70
40,2017090700,20170907000139,home,71.07,27.16,3.06,2.41,0.34,210.7,312.2,2543773,James White,28,2017,43,1,13:52:00,NE,1,10,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:27.000Z,2017-09-08T00:44:26.000Z,3,5-10,205,02/03/1992,Wisconsin,RB,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW,True,home,True,43,48.93,26.173333,2017-09-07,25.610959,70
62,2017090700,20170907000189,home,48.66,19.11,5.77,2.42,0.6,140.82,221.96,2543773,James White,28,2017,35,1,13:02:00,NE,1,10,KC,0,0,2543773,SINGLEBACK,"1 RB, 1 TE, 3 WR",7.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:45:17.000Z,2017-09-08T00:45:15.000Z,5,5-10,205,02/03/1992,Wisconsin,RB,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW,True,home,True,65,71.34,34.223333,2017-09-07,25.610959,70
84,2017090700,20170907000345,home,15.53,25.36,4.45,3.2,0.46,186.22,275.44,2539663,Mike Gillislee,35,2017,2,1,12:12:00,NE,2,2,KC,0,0,2539663,JUMBO,"6 OL, 2 RB, 2 TE, 0 WR",9.0,"4 DL, 4 LB, 3 DB",left,2017-09-08T00:48:41.000Z,2017-09-08T00:48:39.000Z,2,5-11,210,11/01/1990,Florida,RB,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW,True,home,True,98,104.47,27.973333,2017-09-07,26.868493,71
98,2017090700,20170907000395,away,29.99,27.12,3.9,2.53,0.44,34.27,157.92,2557917,Kareem Hunt,27,2017,25,1,12:08:00,KC,1,10,KC,7,0,2557917,SHOTGUN,"1 RB, 3 TE, 1 WR",7.0,"3 DL, 2 LB, 6 DB",right,2017-09-08T00:53:14.000Z,2017-09-08T00:53:13.000Z,7,5-11,216,08/06/1995,Toledo,RB,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW,False,away,True,25,29.99,27.12,2017-09-07,22.10411,71


In [137]:
def train_test_split_by_games(data, proportion):
    games = data.GameId.unique()
    n_games = games.size
    train_games = games[:int(proportion*n_games)]
    
    train = data[data.GameId.isin(train_games)]
    y_train = train['Yards']
    X_train = train.drop(['Yards'], axis=1)
    
    test = data[data.GameId.isin(train_games) == False]
    y_test = test['Yards'].reset_index()['Yards']
    X_test = test.drop(['Yards'], axis=1)
    
    return (X_train, y_train, X_test, y_test)

X_train, y_train, X_test, y_test = train_test_split_by_games(data, 0.8)

In [138]:
categorical_features = ['OffenseFormation',
                        'PlayerCollegeName',
                        'Stadium',
                        'StadiumType',
                        'Turf',
                        'GameWeather',
                       'TeamOnOffense']
numerical_features = ['X_std','Y_std', 'S', 'A', 'Dis',
                     'YardsFromGoal', 'PlayerAge', 'PlayerHeightInches']
features = numerical_features + categorical_features

In [139]:
X_train = X_train[features]
X_test = X_test[features]
X_train.head()

Unnamed: 0,X_std,Y_std,S,A,Dis,YardsFromGoal,PlayerAge,PlayerHeightInches,OffenseFormation,PlayerCollegeName,Stadium,StadiumType,Turf,GameWeather,TeamOnOffense
18,41.25,22.803333,3.63,3.35,0.38,35,25.610959,70,SHOTGUN,Wisconsin,Gillette Stadium,Outdoor,Field Turf,Clear and warm,home
40,48.93,26.173333,3.06,2.41,0.34,43,25.610959,70,SHOTGUN,Wisconsin,Gillette Stadium,Outdoor,Field Turf,Clear and warm,home
62,71.34,34.223333,5.77,2.42,0.6,65,25.610959,70,SINGLEBACK,Wisconsin,Gillette Stadium,Outdoor,Field Turf,Clear and warm,home
84,104.47,27.973333,4.45,3.2,0.46,98,26.868493,71,JUMBO,Florida,Gillette Stadium,Outdoor,Field Turf,Clear and warm,home
98,29.99,27.12,3.9,2.53,0.44,25,22.10411,71,SHOTGUN,Toledo,Gillette Stadium,Outdoor,Field Turf,Clear and warm,away


In [140]:
xf = [OneHotVectorizer(columns=categorical_features)]
xf_pipe = Pipeline(xf)
X_train = xf_pipe.fit_transform(X_train)
X_test = xf_pipe.transform(X_test)

In [141]:
# For simplicity, we are starting with a few numerical features.
#
# TODO: Do some preprocessing to extract features like Game date,
#       player age, player height, game clock time, offense and defense
#       personnel, and more.
# TODO: Apply NimbusML transforms to featurize categorical features
#       like player's college, player position, Stadium, StadiumType,
#       Turf, GameWeather, WindDirection, OffenseFormation, and more.
# TODO: Read up on how tree regression residuals are distributed. We can
#       use LightGbmRegressor to do predictions, and use the appropriate
#       distribution to calculate the cdf.

# pipe = Pipeline([OneHotHashVectorizer(columns=categorical_features),
#                 LightGbmRegressor(feature=numerical_features, label='Yards',
#                                  booster=Dart(feature_fraction=0.9,
#                                              subsample_fraction=0.8),
#                                   number_of_iterations=50,
#                                 minimum_example_count_per_leaf=20,
#                                  number_of_leaves=20, learning_rate=0.1)])

# pipe = Pipeline([LightGbmRegressor(feature=['X','Y', 'S', 'A', 'Dis', 'Orientation', 'Dir'],
#                                    label='Yards',
#                                  booster=Dart(feature_fraction=0.7,
#                                              subsample_fraction=0.7),
#                                   number_of_iterations=10000,
#                                 minimum_example_count_per_leaf=20,
#                                  number_of_leaves=100, learning_rate=0.005)])

pipe = Pipeline([
    OrdinaryLeastSquaresRegressor()])
model = pipe.fit(X_train, y_train)

Automatically adding a MinMax normalization transform, use 'norm=Warn' or 'norm=No' to turn this behavior off.
Trainer solving for 309 parameters across 18568 examples
Coefficient of determination R2 = 0.074827336513872, or 0.0592211598145058 (adjusted)
Not training a calibrator because it is not needed.
Elapsed time: 00:00:00.7478409


In [142]:
y_pred = model.predict(X_test)
y_train_pred = model.predict(X_train)

In [143]:
def standard_error_of_prediction(X_train, X_test, y_train, y_train_pred):
    n_train = X_train.shape[0]
    s2 = np.sum(np.square(np.subtract(y_train, y_train_pred.Score))) / (n_train - model.summary().shape[1])
    mean = np.mean(X_train)
    norm2 = np.square(np.linalg.norm(X_test - mean, axis=1))
    sum_norm2 = np.sum(norm2)
    pred_se = np.sqrt(s2*(1 + 1/n_train + (norm2/sum_norm2)))
    return pred_se

pred_se = standard_error_of_prediction(X_train, X_test, y_train, y_train_pred)

In [144]:
pred_se

array([1.2091509 , 1.20909431, 1.20905118, ..., 1.20908022, 1.20919583,
       1.20920415])

In [145]:
def get_cdf_for_prediction(pred, var):
    if len(pred) != len(var):
        print('lengths of predictions and prediction variances fo not match')
        return None
    
    column_names = ['Yards'+str(i) for i in range(-99,100)]
    #result = pd.DataFrame(columns = column_names)
    res = []
    for i in range(len(pred)):
        dist = norm(pred[i], var[i])
        vals = dist.cdf(range(-99, 100))
        res.append(vals)
        #cdf = pd.DataFrame([vals], columns=column_names)
        #result = result.append(cdf)
    result = pd.DataFrame(res, columns=column_names)
    return result

def evaluate_crps(cdf, y_true):
    if cdf.shape[0] != len(y_true):
        print('true labels and number of predicted distributions do not match')
        return None
    h =  np.heaviside(range(-99, 100), 1)
    res = 0
    for i in range(len(y_true)):
        row = cdf.iloc[[i]]
        res += np.sum(np.square(np.subtract(row, h)), axis=1)[i]  
    res = res / (199 * len(y_true))
    return res

In [146]:
cdf = get_cdf_for_prediction(y_pred.Score, pred_se)
crps = evaluate_crps(cdf, y_test)
crps

0.022487825486175075