# Expected Yards Baseline Model

## 1 Notebook Set Up

In [2]:
# install packages

#!pip install nfl-data-py
#!pip install pyarrow

In [1]:
# import libraries
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import nfl_data_py as nfl
from sklearn import linear_model

import pyarrow as pa
import pyarrow.parquet as pq

pd.set_option('display.max_columns', None)

### 1.1 Pulling in data

In [2]:
# full data
years_required_full =[2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022]
NFL_PBP_Data = nfl.import_pbp_data(years = years_required_full, downcast=True, cache=False, alt_path=None)

2010 done.
2011 done.
2012 done.
2013 done.
2014 done.
2015 done.
2016 done.
2017 done.
2018 done.
2019 done.
2020 done.
2021 done.
2022 done.
Downcasting floats.


### 1.2 Prepping data for baseline model
Pulling in 6 years of data for the baseline model. 

In [160]:
#years_required_small =[2016,2017,2018,2019,2020,2021]
#NFL_PBP_Data_Small = nfl.import_pbp_data(years = years_required_small, downcast=True, cache=False, alt_path=None)

In [31]:
nfl_df = NFL_PBP_Data[NFL_PBP_Data['play_type'].isin(['pass', 'run'])]
nfl_df = nfl_df[nfl_df['yardline_100'].notna()]
nfl_df.shape

(425625, 384)

In [33]:
nfl_df.head(10)

Unnamed: 0,play_id,game_id,old_game_id,home_team,away_team,season_type,week,posteam,posteam_type,defteam,...,offense_formation,offense_personnel,defenders_in_box,defense_personnel,number_of_pass_rushers,players_on_play,offense_players,defense_players,n_offense,n_defense
2,58.0,2010_01_ARI_STL,2010091208,LA,ARI,REG,1,ARI,away,LA,...,,,,,,,,,,
3,82.0,2010_01_ARI_STL,2010091208,LA,ARI,REG,1,ARI,away,LA,...,,,,,,,,,,
4,103.0,2010_01_ARI_STL,2010091208,LA,ARI,REG,1,ARI,away,LA,...,,,,,,,,,,
5,132.0,2010_01_ARI_STL,2010091208,LA,ARI,REG,1,ARI,away,LA,...,,,,,,,,,,
6,156.0,2010_01_ARI_STL,2010091208,LA,ARI,REG,1,ARI,away,LA,...,,,,,,,,,,
7,177.0,2010_01_ARI_STL,2010091208,LA,ARI,REG,1,ARI,away,LA,...,,,,,,,,,,
8,201.0,2010_01_ARI_STL,2010091208,LA,ARI,REG,1,ARI,away,LA,...,,,,,,,,,,
9,222.0,2010_01_ARI_STL,2010091208,LA,ARI,REG,1,ARI,away,LA,...,,,,,,,,,,
10,254.0,2010_01_ARI_STL,2010091208,LA,ARI,REG,1,LA,home,ARI,...,,,,,,,,,,
11,278.0,2010_01_ARI_STL,2010091208,LA,ARI,REG,1,LA,home,ARI,...,,,,,,,,,,


In [11]:
[i for i in nfl_df.columns]

['play_id',
 'game_id',
 'old_game_id',
 'home_team',
 'away_team',
 'season_type',
 'week',
 'posteam',
 'posteam_type',
 'defteam',
 'side_of_field',
 'yardline_100',
 'game_date',
 'quarter_seconds_remaining',
 'half_seconds_remaining',
 'game_seconds_remaining',
 'game_half',
 'quarter_end',
 'drive',
 'sp',
 'qtr',
 'down',
 'goal_to_go',
 'time',
 'yrdln',
 'ydstogo',
 'ydsnet',
 'desc',
 'play_type',
 'yards_gained',
 'shotgun',
 'no_huddle',
 'qb_dropback',
 'qb_kneel',
 'qb_spike',
 'qb_scramble',
 'pass_length',
 'pass_location',
 'air_yards',
 'yards_after_catch',
 'run_location',
 'run_gap',
 'field_goal_result',
 'kick_distance',
 'extra_point_result',
 'two_point_conv_result',
 'home_timeouts_remaining',
 'away_timeouts_remaining',
 'timeout',
 'timeout_team',
 'td_team',
 'td_player_name',
 'td_player_id',
 'posteam_timeouts_remaining',
 'defteam_timeouts_remaining',
 'total_home_score',
 'total_away_score',
 'posteam_score',
 'defteam_score',
 'score_differential',
 'po

## Modeling 

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

In [67]:
def Play_type_detailed(s):
    if s['play_type'] == 'pass' and s['pass_location'] != None and s['pass_length'] != None: 
        return 'PASS' + '_' + str(s['pass_location']).upper() + '_' + str(s['pass_length']).upper()
    elif s['play_type'] == 'run' and s['run_location'] == 'middle': 
        return 'RUSH' + '_' + str(s['run_location']).upper()
    elif s['play_type'] == 'run' and s['run_location'] != None and s['run_gap'] != None: 
        return 'RUSH' + '_' + str(s['run_location']).upper()+ '_' + str(s['run_gap']).upper()
    else:
        return 'OTHER'
    
def personnel (formation):
    """Finding the number of RB's, TE's, WR's in each formation
    """
    for i in range(len(formation)):
        # takes the personnel and saves it as 'form'
        form = formation[i]
        try:
            rbloc = form.find('RB')-2
            num_rb.append(int(form[rbloc]))
        except: 
            num_rb.append(int(0))
        try:
            teloc = form.find('TE')-2
            num_te.append(int(form[teloc]))
        except: 
            num_te.append(int(0))
        try:
            wrloc = form.find('WR')-2
            num_wr.append(int(form[wrloc]))
        except: 
            num_wr.append(int(0))

In [93]:
cols = ['yards_gained', 'play_id', 'game_id', 'home_team', 'away_team', 'season_type', 
        'week', 'posteam', 'defteam','yardline_100', 'game_seconds_remaining', 
        'down', 'ydstogo', 'desc', 'score_differential',
        'posteam_timeouts_remaining', 'defteam_timeouts_remaining', 'season', 'offense_personnel', 
        'play_type', 'pass_location','pass_length','run_location','run_gap']

nfl_model_data = nfl_df[cols]

nfl_model_data['Play_type_detailed'] = nfl_model_data.apply(Play_type_detailed, axis=1)
nfl_model_data = nfl_model_data[~(nfl_model_data['Play_type_detailed'] == 'OTHER')]


# get dummy for down
playtypedummy = pd.get_dummies(nfl_model_data['Play_type_detailed'])
playtypedummy.columns = ['PASS_LEFT_DEEP', 'PASS_LEFT_SHORT', 'PASS_MIDDLE_DEEP',
                        'PASS_MIDDLE_SHORT', 'PASS_RIGHT_DEEP', 'PASS_RIGHT_SHORT',
                        'RUSH_LEFT_END', 'RUSH_LEFT_GUARD', 'RUSH_LEFT_TACKLE', 'RUSH_MIDDLE', 
                        'RUSH_RIGHT_END', 'RUSH_RIGHT_GUARD', 'RUSH_RIGHT_TACKLE']
nfl_model_data = pd.concat([nfl_model_data, playtypedummy], axis=1) 
# drop RUSH_RIGHT_TACKLE to avoid multicollinearity
nfl_model_data = nfl_model_data.drop(columns=['RUSH_RIGHT_TACKLE'])

operson = np.array(nfl_model_data['offense_personnel'])

num_rb = []
num_te = []
num_wr = []
            
personnel(operson)

# appending the new columns to the df
nfl_model_data['num_rb'] = num_rb
nfl_model_data['num_te'] = num_te
nfl_model_data['num_wr'] = num_wr

# get dummy for down
dummy = pd.get_dummies(nfl_model_data['down'])
dummy.columns = ['firstdown', 'seconddown', 'thirddown', 'fourthdown']
nfl_model_data = pd.concat([nfl_model_data, dummy], axis=1) 
# drop fourthdown to avoid multicollinearity
nfl_model_data = nfl_model_data.drop(columns=['fourthdown'])

# get dummy for offensive team
posteamdummy = pd.get_dummies(nfl_model_data['posteam'])
posteamdummy.columns = ['oARI', 'oATL', 'oBAL', 'oBUF', 'oCAR', 'oCHI', 'oCIN', 'oCLE', 'oDAL', 
                        'oDEN','oDET', 'oGB', 'oHOU', 'oIND', 'oJAX', 'oKC', 'oLA', 'oLAC', 'oLV', 
                        'oMIA', 'oMIN','oNE', 'oNO', 'oNYG', 'oNYJ', 'oPHI', 'oPIT', 'oSEA', 'oSF', 
                        'oTB', 'oTEN', 'oWAS']
nfl_model_data = pd.concat([nfl_model_data, posteamdummy], axis=1) 
# drop oWAS to avoid multicollinearity
nfl_model_data = nfl_model_data.drop(columns=['oWAS'])

# get dummy for defensive team
defteamdummy = pd.get_dummies(nfl_model_data['defteam'])
defteamdummy.columns = ['dARI', 'dATL', 'dBAL', 'dBUF', 'dCAR', 'dCHI', 'dCIN', 'dCLE', 'dDAL', 
                        'dDEN','dDET', 'dGB', 'dHOU', 'dIND', 'dJAX', 'dKC', 'dLA', 'dLAC', 'dLV', 
                        'dMIA', 'dMIN','dNE', 'dNO', 'dNYG', 'dNYJ', 'dPHI', 'dPIT', 'dSEA', 'dSF', 
                        'dTB', 'dTEN', 'dWAS']
nfl_model_data = pd.concat([nfl_model_data, defteamdummy], axis=1) 
# drop dWAS to avoid multicollinearity
nfl_model_data = nfl_model_data.drop(columns=['dWAS'])

nfl_model_data = nfl_model_data[nfl_model_data['season'] > 2015]
# it looks like 2016 is the start of when they tracked personnel
seasondummy = pd.get_dummies(nfl_model_data['season'])
seasondummy.columns = ['2016', '2017', '2018', '2019', '2020', '2021', '2022']
nfl_model_data = pd.concat([nfl_model_data, seasondummy], axis=1) 
# drop 2022 to avoid multicollinearity
nfl_model_data = nfl_model_data.drop(columns=['2022'])

nfl_model_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nfl_model_data['Play_type_detailed'] = nfl_model_data.apply(Play_type_detailed, axis=1)


Unnamed: 0,yards_gained,play_id,game_id,home_team,away_team,season_type,week,posteam,defteam,yardline_100,game_seconds_remaining,down,ydstogo,desc,score_differential,posteam_timeouts_remaining,defteam_timeouts_remaining,season,offense_personnel,play_type,pass_location,pass_length,run_location,run_gap,Play_type_detailed,PASS_LEFT_DEEP,PASS_LEFT_SHORT,PASS_MIDDLE_DEEP,PASS_MIDDLE_SHORT,PASS_RIGHT_DEEP,PASS_RIGHT_SHORT,RUSH_LEFT_END,RUSH_LEFT_GUARD,RUSH_LEFT_TACKLE,RUSH_MIDDLE,RUSH_RIGHT_END,RUSH_RIGHT_GUARD,num_rb,num_te,num_wr,firstdown,seconddown,thirddown,oARI,oATL,oBAL,oBUF,oCAR,oCHI,oCIN,oCLE,oDAL,oDEN,oDET,oGB,oHOU,oIND,oJAX,oKC,oLA,oLAC,oLV,oMIA,oMIN,oNE,oNO,oNYG,oNYJ,oPHI,oPIT,oSEA,oSF,oTB,oTEN,dARI,dATL,dBAL,dBUF,dCAR,dCHI,dCIN,dCLE,dDAL,dDEN,dDET,dGB,dHOU,dIND,dJAX,dKC,dLA,dLAC,dLV,dMIA,dMIN,dNE,dNO,dNYG,dNYJ,dPHI,dPIT,dSEA,dSF,dTB,dTEN,2016,2017,2018,2019,2020,2021
290679,6.0,58.0,2016_01_BUF_BAL,BAL,BUF,REG,1,BAL,BUF,86.0,3597.0,1.0,10.0,(14:57) 29-J.Forsett right end pushed ob at BA...,0.0,3.0,3.0,2016,"2 RB, 1 TE, 2 WR",run,,,right,end,RUSH_RIGHT_END,0,0,0,0,0,0,0,0,0,0,1,0,2,1,2,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
290680,5.0,85.0,2016_01_BUF_BAL,BAL,BUF,REG,1,BAL,BUF,80.0,3572.0,2.0,4.0,"(14:32) (No Huddle, Shotgun) 5-J.Flacco pass s...",0.0,3.0,3.0,2016,"2 RB, 1 TE, 2 WR",pass,right,short,,,PASS_RIGHT_SHORT,0,0,0,0,0,1,0,0,0,0,0,0,2,1,2,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
290681,0.0,109.0,2016_01_BUF_BAL,BAL,BUF,REG,1,BAL,BUF,75.0,3541.0,1.0,10.0,"(14:01) (No Huddle, Shotgun) 29-J.Forsett left...",0.0,3.0,3.0,2016,"2 RB, 1 TE, 2 WR",run,,,left,guard,RUSH_LEFT_GUARD,0,0,0,0,0,0,0,1,0,0,0,0,2,1,2,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
290682,9.0,130.0,2016_01_BUF_BAL,BAL,BUF,REG,1,BAL,BUF,75.0,3515.0,2.0,10.0,(13:35) (No Huddle) 5-J.Flacco pass short righ...,0.0,3.0,3.0,2016,"2 RB, 1 TE, 2 WR",pass,right,short,,,PASS_RIGHT_SHORT,0,0,0,0,0,1,0,0,0,0,0,0,2,1,2,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
290683,0.0,154.0,2016_01_BUF_BAL,BAL,BUF,REG,1,BAL,BUF,66.0,3474.0,3.0,1.0,(12:54) (No Huddle) 28-T.West right tackle to ...,0.0,3.0,3.0,2016,"1 RB, 2 TE, 2 WR",run,,,right,tackle,RUSH_RIGHT_TACKLE,0,0,0,0,0,0,0,0,0,0,0,0,1,2,2,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [94]:
label = ['yards_gained']

features = ['yardline_100', 'firstdown', 'seconddown', 'thirddown', 
            'ydstogo', 'game_seconds_remaining', 'score_differential', 
            'num_rb', 'num_te', 'num_wr','posteam_timeouts_remaining', 'defteam_timeouts_remaining',
            'oARI', 'oATL', 'oBAL', 'oBUF', 'oCAR', 'oCHI', 'oCIN', 'oCLE', 'oDAL', 
            'oDEN','oDET', 'oGB', 'oHOU', 'oIND', 'oJAX', 'oKC', 'oLA', 'oLAC', 'oLV', 
            'oMIA', 'oMIN','oNE', 'oNO', 'oNYG', 'oNYJ', 'oPHI', 'oPIT', 'oSEA', 'oSF', 
            'oTB', 'oTEN',
            'dARI', 'dATL', 'dBAL', 'dBUF', 'dCAR', 'dCHI', 'dCIN', 'dCLE', 'dDAL', 
            'dDEN','dDET', 'dGB', 'dHOU', 'dIND', 'dJAX', 'dKC', 'dLA', 'dLAC', 'dLV', 
            'dMIA', 'dMIN','dNE', 'dNO', 'dNYG', 'dNYJ', 'dPHI', 'dPIT', 'dSEA', 'dSF', 
            'dTB', 'dTEN',
            '2016', '2017', '2018', '2019', '2020', '2021',
            'PASS_LEFT_DEEP', 'PASS_LEFT_SHORT', 'PASS_MIDDLE_DEEP',
            'PASS_MIDDLE_SHORT', 'PASS_RIGHT_DEEP', 'PASS_RIGHT_SHORT',
            'RUSH_LEFT_END', 'RUSH_LEFT_GUARD', 'RUSH_LEFT_TACKLE', 'RUSH_MIDDLE', 
            'RUSH_RIGHT_END', 'RUSH_RIGHT_GUARD']
clean_nfl_model_data = nfl_model_data[features + label]

In [95]:
# importing train_test_split from sklearn
training_data, testing_data = train_test_split(clean_nfl_model_data, test_size=0.2, random_state=2022)
training_data, validation_data = train_test_split(training_data, test_size=0.25, random_state=2022)
print(f"No. of training examples: {training_data.shape[0]}")
print(f"No. of testing examples: {validation_data.shape[0]}")
print(f"No. of testing examples: {testing_data.shape[0]}")
# very simple model of play_type_binary ~ features
x_train = training_data[features].to_numpy()
y_train = training_data['yards_gained'].to_numpy()
x_val =  validation_data[features].to_numpy()
y_val =  validation_data['yards_gained'].to_numpy()
x_test =  testing_data[features].to_numpy()
y_test =  testing_data['yards_gained'].to_numpy()

No. of training examples: 127674
No. of testing examples: 42558
No. of testing examples: 42558


In [96]:
clf = LinearRegression(normalize = True)
reg = clf.fit(x_train, y_train)
predictions = reg.predict(x_val)
MSE = metrics.mean_squared_error(y_val, predictions, squared=False)
print("MSE:", round(MSE,6))
reg.coef_

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)




MSE: 8.226873


array([ 3.31450546e-02,  3.21446894e-01,  2.52395844e-01,  3.89652376e-01,
        4.60701206e-02,  5.83619719e-06,  3.91468905e-03,  8.51395035e-02,
        3.22771581e-04,  1.47362825e-01,  1.10646415e-01,  7.06589407e-02,
       -7.42871924e-02,  6.00246223e-01,  2.84577025e-01,  2.38403156e-01,
        8.56089119e-02,  1.07894095e-02,  2.08057500e-01,  3.19535654e-01,
        6.54286870e-01, -2.19200670e-01,  2.53151961e-01,  4.61713171e-01,
        2.29788432e-01,  1.34341593e-01, -2.90931811e-01,  8.22663828e-01,
        3.15220164e-01,  3.06374663e-01,  3.28737945e-01,  1.01369269e-02,
        3.61631080e-01,  4.22830383e-01,  8.07224260e-01, -1.54206726e-01,
       -3.68057910e-01,  2.79694303e-01, -2.55771103e-01,  6.71564378e-01,
        6.72922394e-01,  2.82213657e-01,  4.78181291e-01, -2.25484106e-02,
        9.78024225e-02, -4.43315189e-01, -9.15938150e-02,  1.03094372e-01,
       -3.06935670e-01,  1.23989103e-01,  1.88496417e-01, -1.20619102e-01,
       -3.30792605e-01,  

In [98]:
pd.set_option('display.max_rows', 100)
coeff_table = pd.DataFrame({'Features': features, 'Coefficients': list(reg.coef_)}, columns=['Features', 'Coefficients'])
coeff_table

Unnamed: 0,Features,Coefficients
0,yardline_100,0.033145
1,firstdown,0.321447
2,seconddown,0.252396
3,thirddown,0.389652
4,ydstogo,0.04607
5,game_seconds_remaining,6e-06
6,score_differential,0.003915
7,num_rb,0.08514
8,num_te,0.000323
9,num_wr,0.147363


In [167]:
! pip install huggingface_hub

Collecting huggingface_hub
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 4.0 MB/s eta 0:00:01
Installing collected packages: huggingface-hub
Successfully installed huggingface-hub-0.10.1


## Uploading the Yards Gained Prediction Model

In [99]:
from huggingface_hub import notebook_login
notebook_login()

Login successful
Your token has been saved to /Users/ic.mac16/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [102]:
from huggingface_hub import HfApi
api = HfApi()
api.create_repo(repo_id="yards-gained-model", private=True)

In [100]:
from joblib import dump, load
dump(clf, 'yards-gained-clf.joblib')

['yards-gained-clf.joblib']

In [103]:
api.upload_file(path_or_fileobj="yards-gained-clf.joblib", 
                path_in_repo="yards-gained-clf.joblib",
                repo_id="ic-hua/yards-gained-model")

'https://huggingface.co/ic-hua/yards-gained-model/blob/main/yards-gained-clf.joblib'

## Downloading the Yards Gained Prediction Model

In [104]:
from huggingface_hub import hf_hub_download
hf_hub_download(repo_id="ic-hua/yards-gained-model", filename="yards-gained-clf.joblib")
download_yards_gained_model = load('yards-gained-clf.joblib') 

Downloading:   0%|          | 0.00/2.00k [00:00<?, ?B/s]

In [105]:
yards_prediction=download_yards_gained_model.predict(x_test)
yards_prediction

array([5.81899911, 5.71399204, 8.17666481, ..., 7.72511064, 6.22149648,
       7.15680742])