# Expected Yards Baseline Model

## 1 Notebook Set Up

In [2]:
# install packages

#!pip install nfl-data-py
#!pip install pyarrow

In [1]:
# import libraries
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import nfl_data_py as nfl
from sklearn import linear_model

import pyarrow as pa
import pyarrow.parquet as pq

### 1.1 Pulling in data

In [2]:
# full data
years_required_full =[2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022]
NFL_PBP_Data = nfl.import_pbp_data(years = years_required_full, downcast=True, cache=False, alt_path=None)

2010 done.
2011 done.
2012 done.
2013 done.
2014 done.
2015 done.
2016 done.
2017 done.
2018 done.
2019 done.
2020 done.
2021 done.
2022 done.
Downcasting floats.


### 1.2 Prepping data for baseline model
Pulling in 6 years of data for the baseline model. 

In [160]:
#years_required_small =[2016,2017,2018,2019,2020,2021]
#NFL_PBP_Data_Small = nfl.import_pbp_data(years = years_required_small, downcast=True, cache=False, alt_path=None)

In [155]:
nfl_df = NFL_PBP_Data[NFL_PBP_Data['play_type'].isin(['pass', 'run'])]
nfl_df = nfl_df[nfl_df['yardline_100'].notna()]
nfl_df.shape

(420660, 384)

In [12]:
nfl_df.head(10)

Unnamed: 0,play_id,game_id,old_game_id,home_team,away_team,season_type,week,posteam,posteam_type,defteam,...,offense_formation,offense_personnel,defenders_in_box,defense_personnel,number_of_pass_rushers,players_on_play,offense_players,defense_players,n_offense,n_defense
2,58.0,2010_01_ARI_STL,2010091208,LA,ARI,REG,1,ARI,away,LA,...,,,,,,,,,,
3,82.0,2010_01_ARI_STL,2010091208,LA,ARI,REG,1,ARI,away,LA,...,,,,,,,,,,
4,103.0,2010_01_ARI_STL,2010091208,LA,ARI,REG,1,ARI,away,LA,...,,,,,,,,,,
5,132.0,2010_01_ARI_STL,2010091208,LA,ARI,REG,1,ARI,away,LA,...,,,,,,,,,,
6,156.0,2010_01_ARI_STL,2010091208,LA,ARI,REG,1,ARI,away,LA,...,,,,,,,,,,
7,177.0,2010_01_ARI_STL,2010091208,LA,ARI,REG,1,ARI,away,LA,...,,,,,,,,,,
8,201.0,2010_01_ARI_STL,2010091208,LA,ARI,REG,1,ARI,away,LA,...,,,,,,,,,,
9,222.0,2010_01_ARI_STL,2010091208,LA,ARI,REG,1,ARI,away,LA,...,,,,,,,,,,
10,254.0,2010_01_ARI_STL,2010091208,LA,ARI,REG,1,LA,home,ARI,...,,,,,,,,,,
11,278.0,2010_01_ARI_STL,2010091208,LA,ARI,REG,1,LA,home,ARI,...,,,,,,,,,,


In [6]:
[i for i in nfl_df.columns]

['play_id',
 'game_id',
 'old_game_id',
 'home_team',
 'away_team',
 'season_type',
 'week',
 'posteam',
 'posteam_type',
 'defteam',
 'side_of_field',
 'yardline_100',
 'game_date',
 'quarter_seconds_remaining',
 'half_seconds_remaining',
 'game_seconds_remaining',
 'game_half',
 'quarter_end',
 'drive',
 'sp',
 'qtr',
 'down',
 'goal_to_go',
 'time',
 'yrdln',
 'ydstogo',
 'ydsnet',
 'desc',
 'play_type',
 'yards_gained',
 'shotgun',
 'no_huddle',
 'qb_dropback',
 'qb_kneel',
 'qb_spike',
 'qb_scramble',
 'pass_length',
 'pass_location',
 'air_yards',
 'yards_after_catch',
 'run_location',
 'run_gap',
 'field_goal_result',
 'kick_distance',
 'extra_point_result',
 'two_point_conv_result',
 'home_timeouts_remaining',
 'away_timeouts_remaining',
 'timeout',
 'timeout_team',
 'td_team',
 'td_player_name',
 'td_player_id',
 'posteam_timeouts_remaining',
 'defteam_timeouts_remaining',
 'total_home_score',
 'total_away_score',
 'posteam_score',
 'defteam_score',
 'score_differential',
 'po

## Modeling 

In [147]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

In [161]:
cols = ['yards_gained', 'play_id', 'game_id', 'home_team', 'away_team', 'season_type', 
        'week', 'posteam', 'defteam','yardline_100', 'game_seconds_remaining', 
        'down', 'ydstogo', 'desc', 'play_type', 'score_differential',
        'posteam_timeouts_remaining', 'defteam_timeouts_remaining', 'season', 'offense_personnel']

nfl_model_data = nfl_df[cols].dropna()

operson = np.array(nfl_model_data['offense_personnel'])

num_rb = []
num_te = []
num_wr = []

def personnel (formation):
    """Finding the number of RB's, TE's, WR's in each formation
    """
    for i in range(len(formation)):
        # takes the personnel and saves it as 'form'
        form = formation[i]
        try:
            rbloc = form.find('RB')-2
            num_rb.append(int(form[rbloc]))
        except: 
            num_rb.append(int(0))
        try:
            teloc = form.find('TE')-2
            num_te.append(int(form[teloc]))
        except: 
            num_te.append(int(0))
        try:
            wrloc = form.find('WR')-2
            num_wr.append(int(form[wrloc]))
        except: 
            num_wr.append(int(0))
            
personnel(operson)

# appending the new columns to the df
nfl_model_data['num_rb'] = num_rb
nfl_model_data['num_te'] = num_te
nfl_model_data['num_wr'] = num_wr

# get dummy for down
dummy = pd.get_dummies(nfl_model_data['down'])
dummy.columns = ['firstdown', 'seconddown', 'thirddown', 'fourthdown']
nfl_model_data = pd.concat([nfl_model_data, dummy], axis=1) 
# drop fourthdown to avoid multicollinearity
nfl_model_data = nfl_model_data.drop(columns=['fourthdown'])

# get dummy for offensive team
posteamdummy = pd.get_dummies(nfl_model_data['posteam'])
posteamdummy.columns = ['oARI', 'oATL', 'oBAL', 'oBUF', 'oCAR', 'oCHI', 'oCIN', 'oCLE', 'oDAL', 
                        'oDEN','oDET', 'oGB', 'oHOU', 'oIND', 'oJAX', 'oKC', 'oLA', 'oLAC', 'oLV', 
                        'oMIA', 'oMIN','oNE', 'oNO', 'oNYG', 'oNYJ', 'oPHI', 'oPIT', 'oSEA', 'oSF', 
                        'oTB', 'oTEN', 'oWAS']
nfl_model_data = pd.concat([nfl_model_data, posteamdummy], axis=1) 
# drop oWAS to avoid multicollinearity
nfl_model_data = nfl_model_data.drop(columns=['oWAS'])

# get dummy for defensive team
defteamdummy = pd.get_dummies(nfl_model_data['defteam'])
defteamdummy.columns = ['dARI', 'dATL', 'dBAL', 'dBUF', 'dCAR', 'dCHI', 'dCIN', 'dCLE', 'dDAL', 
                        'dDEN','dDET', 'dGB', 'dHOU', 'dIND', 'dJAX', 'dKC', 'dLA', 'dLAC', 'dLV', 
                        'dMIA', 'dMIN','dNE', 'dNO', 'dNYG', 'dNYJ', 'dPHI', 'dPIT', 'dSEA', 'dSF', 
                        'dTB', 'dTEN', 'dWAS']
nfl_model_data = pd.concat([nfl_model_data, defteamdummy], axis=1) 
# drop dWAS to avoid multicollinearity
nfl_model_data = nfl_model_data.drop(columns=['dWAS'])

# get dummy for each season
# it looks like 2016 is the start of when they tracked personnel
seasondummy = pd.get_dummies(nfl_model_data['season'])
seasondummy.columns = ['2016', '2017', '2018', '2019', '2020', '2021', '2022']
nfl_model_data = pd.concat([nfl_model_data, seasondummy], axis=1) 
# drop 2022 to avoid multicollinearity
nfl_model_data = nfl_model_data.drop(columns=['2022'])

nfl_model_data.head()

Unnamed: 0,yards_gained,play_id,game_id,home_team,away_team,season_type,week,posteam,defteam,yardline_100,...,dSEA,dSF,dTB,dTEN,2016,2017,2018,2019,2020,2021
290679,6.0,58.0,2016_01_BUF_BAL,BAL,BUF,REG,1,BAL,BUF,86.0,...,0,0,0,0,1,0,0,0,0,0
290680,5.0,85.0,2016_01_BUF_BAL,BAL,BUF,REG,1,BAL,BUF,80.0,...,0,0,0,0,1,0,0,0,0,0
290681,0.0,109.0,2016_01_BUF_BAL,BAL,BUF,REG,1,BAL,BUF,75.0,...,0,0,0,0,1,0,0,0,0,0
290682,9.0,130.0,2016_01_BUF_BAL,BAL,BUF,REG,1,BAL,BUF,75.0,...,0,0,0,0,1,0,0,0,0,0
290683,0.0,154.0,2016_01_BUF_BAL,BAL,BUF,REG,1,BAL,BUF,66.0,...,0,0,0,0,1,0,0,0,0,0


In [162]:
label = ['yards_gained']

features = ['yardline_100', 'firstdown', 'seconddown', 'thirddown', 
            'ydstogo', 'game_seconds_remaining', 'score_differential', 
            'num_rb', 'num_te', 'num_wr','posteam_timeouts_remaining', 'defteam_timeouts_remaining',
            'oARI', 'oATL', 'oBAL', 'oBUF', 'oCAR', 'oCHI', 'oCIN', 'oCLE', 'oDAL', 
            'oDEN','oDET', 'oGB', 'oHOU', 'oIND', 'oJAX', 'oKC', 'oLA', 'oLAC', 'oLV', 
            'oMIA', 'oMIN','oNE', 'oNO', 'oNYG', 'oNYJ', 'oPHI', 'oPIT', 'oSEA', 'oSF', 
            'oTB', 'oTEN',
            'dARI', 'dATL', 'dBAL', 'dBUF', 'dCAR', 'dCHI', 'dCIN', 'dCLE', 'dDAL', 
            'dDEN','dDET', 'dGB', 'dHOU', 'dIND', 'dJAX', 'dKC', 'dLA', 'dLAC', 'dLV', 
            'dMIA', 'dMIN','dNE', 'dNO', 'dNYG', 'dNYJ', 'dPHI', 'dPIT', 'dSEA', 'dSF', 
            'dTB', 'dTEN',
            '2016', '2017', '2018', '2019', '2020', '2021']
clean_nfl_model_data = nfl_model_data[features + label]

In [163]:
# importing train_test_split from sklearn
training_data, testing_data = train_test_split(clean_nfl_model_data, test_size=0.2, random_state=2022)
training_data, validation_data = train_test_split(training_data, test_size=0.25, random_state=2022)
print(f"No. of training examples: {training_data.shape[0]}")
print(f"No. of testing examples: {validation_data.shape[0]}")
print(f"No. of testing examples: {testing_data.shape[0]}")
# very simple model of play_type_binary ~ features
x_train = training_data[features].to_numpy()
y_train = training_data['yards_gained'].to_numpy()
x_val =  validation_data[features].to_numpy()
y_val =  validation_data['yards_gained'].to_numpy()
x_test =  testing_data[features].to_numpy()
y_test =  testing_data['yards_gained'].to_numpy()

No. of training examples: 128550
No. of testing examples: 42850
No. of testing examples: 42851


In [164]:
clf = LinearRegression(normalize = True)
reg = clf.fit(x_train, y_train)
predictions = reg.predict(x_val)
MSE = metrics.mean_squared_error(y_val, predictions, squared=False)
print("MSE:", round(MSE,6))
reg.coef_

MSE: 8.621244


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)




array([ 3.40317106e-02, -3.76858518e-02,  5.67501268e-02,  1.57109442e-01,
        8.47823007e-02,  1.38026747e-05, -2.65668428e-03,  3.80608901e-01,
        4.70516320e-01,  7.64137298e-01, -5.79798515e-03,  4.74135706e-02,
       -1.09253254e-01,  7.21806900e-01,  6.51464330e-01,  4.96163036e-01,
        1.87626010e-01,  8.19422431e-02,  1.29219091e-01,  3.68471166e-01,
        5.69483438e-01, -5.25562003e-02,  5.65066019e-01,  4.99682072e-01,
        2.32575018e-02,  5.05574046e-01,  3.74618366e-02,  1.29259991e+00,
        4.34091369e-01,  5.61900248e-01,  6.25234743e-01,  1.05239525e-01,
        4.08832727e-01,  9.22777330e-01,  1.07853961e+00, -6.31312097e-02,
       -4.22148506e-01,  3.23157566e-01,  2.89198279e-01,  7.29243020e-01,
        6.39822181e-01,  7.79317012e-01,  6.39067098e-01, -3.25086254e-01,
        6.07258817e-02, -3.58124332e-01, -5.09851361e-01, -1.17659735e-01,
       -4.87362968e-01,  6.00452024e-02, -4.90704710e-02, -1.96532560e-01,
       -4.39199472e-01,  

In [166]:
pd.set_option('display.max_rows', 80)
coeff_table = pd.DataFrame({'Features': features, 'Coefficients': list(reg.coef_)}, columns=['Features', 'Coefficients'])
coeff_table

Unnamed: 0,Features,Coefficients
0,yardline_100,0.034032
1,firstdown,-0.037686
2,seconddown,0.05675
3,thirddown,0.157109
4,ydstogo,0.084782
5,game_seconds_remaining,1.4e-05
6,score_differential,-0.002657
7,num_rb,0.380609
8,num_te,0.470516
9,num_wr,0.764137


In [167]:
! pip install huggingface_hub

Collecting huggingface_hub
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 4.0 MB/s eta 0:00:01
Installing collected packages: huggingface-hub
Successfully installed huggingface-hub-0.10.1


## Uploading the Yards Gained Prediction Model

In [172]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [169]:
from huggingface_hub import HfApi
api = HfApi()
api.create_repo(repo_id="yards-gained-model", private=True)

'https://huggingface.co/ic-hua/yards-gained-model'

In [170]:
from joblib import dump, load
dump(clf, 'yards-gained-clf.joblib')

['yards-gained-clf.joblib']

In [171]:
api.upload_file(path_or_fileobj="yards-gained-clf.joblib", 
                path_in_repo="yards-gained-clf.joblib",
                repo_id="ic-hua/yards-gained-model")

'https://huggingface.co/ic-hua/yards-gained-model/blob/main/yards-gained-clf.joblib'

## Downloading the Yards Gained Prediction Model

In [175]:
from huggingface_hub import hf_hub_download
hf_hub_download(repo_id="ic-hua/yards-gained-model", filename="yards-gained-clf.joblib")
download_yards_gained_model = load('yards-gained-clf.joblib') 

Downloading:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

In [179]:
yards_prediction=download_yards_gained_model.predict(x_test)
yards_prediction

array([6.72791598, 3.68546363, 6.80413756, ..., 6.1917763 , 5.09696422,
       5.76593172])