# Logistic Regression Baseline Model

## 1 Notebook Set Up

In [1]:
# install packages
#!pip install nfl-data-py
#!pip install pyarrow

In [2]:
# import libraries
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import nfl_data_py as nfl
from sklearn import linear_model

import pyarrow as pa
import pyarrow.parquet as pq

### 1.1 Pulling in data

In [None]:
# full data
years_required_full =[2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022]
NFL_PBP_Data = nfl.import_pbp_data(years = years_required_full, downcast=True, cache=False, alt_path=None)

2010 done.
2011 done.
2012 done.
2013 done.
2014 done.
2015 done.
2016 done.
2017 done.
2018 done.
2019 done.
2020 done.
2021 done.
2022 done.
Downcasting floats.


  plays.loc[:, cols] = plays.loc[:, cols].astype(numpy.float32)


### 1.2 Prepping data for baseline model

In [None]:
years_required_small = [2017,2018,2019,2020,2021]
NFL_PBP_Data_Small = nfl.import_pbp_data(years = years_required_small, downcast=True, cache=False, alt_path=None)

In [22]:
NFL_PBP_Data_Small.head()

Unnamed: 0,play_id,game_id,old_game_id,home_team,away_team,season_type,week,posteam,posteam_type,defteam,...,offense_formation,offense_personnel,defenders_in_box,defense_personnel,number_of_pass_rushers,players_on_play,offense_players,defense_players,n_offense,n_defense
0,1.0,2015_01_BAL_DEN,2015091309,DEN,BAL,REG,1,,,,...,,,,,,,,,,
1,36.0,2015_01_BAL_DEN,2015091309,DEN,BAL,REG,1,BAL,away,DEN,...,,,,,,,,,,
2,51.0,2015_01_BAL_DEN,2015091309,DEN,BAL,REG,1,BAL,away,DEN,...,,,,,,,,,,
3,75.0,2015_01_BAL_DEN,2015091309,DEN,BAL,REG,1,BAL,away,DEN,...,,,,,,,,,,
4,96.0,2015_01_BAL_DEN,2015091309,DEN,BAL,REG,1,BAL,away,DEN,...,,,,,,,,,,


In [6]:
nfl_df = NFL_PBP_Data_Small[NFL_PBP_Data_Small['play_type'].isin(['pass', 'run', 'punt', 
                                                                  'extra_point', 'field_goal'])]
nfl_df = nfl_df[~(nfl_df['game_half'] == 'Overtime')]
nfl_df = nfl_df[nfl_df['yardline_100'].notna()]
nfl_df.shape

(268742, 384)

### Parsing the personnel to pimary positions

In [7]:
def transform_off_personnel(row):

    rb_count = 0
    te_count = 0
    wr_count = 0
    ol_count = 0
    dl_count = 0
    db_count = 0

    if not pd.isna(row['offense_personnel']):
        personnel = row['offense_personnel'].split(', ')
        for p in personnel:
            if p[2:4] == 'RB':
                rb_count = int(p[0])
            elif p[2:4] == 'TE':
                 te_count = int(p[0])
            elif p[2:4] == 'WR':
                 wr_count = int(p[0])
            elif p[2:4] == 'OL':
                 ol_count = int(p[0])
            elif p[2:4] == 'DL':
                 dl_count = int(p[0])
            elif p[2:4] == 'DB':
                db_count = int(p[0])

    return pd.Series([rb_count,te_count,wr_count,ol_count,dl_count, db_count])

nfl_df[['off_rb_count','off_te_count','off_wr_count','off_ol_count','off_dl_count', 'off_db_count']] = nfl_df.apply(transform_off_personnel, axis=1)

In [8]:
def transform_def_personnel(row):

    dl_count = 0
    db_count = 0
    lb_count = 0
    rb_count = 0
    wr_count = 0
    ol_count = 0
   
    if not pd.isna(row['defense_personnel']):
        personnel = row['defense_personnel'].split(', ')
        for p in personnel:
            if p[2:4] == 'LB':
                lb_count = int(p[0])
            elif p[2:4] == 'DL':
                dl_count = int(p[0])
            elif p[2:4] == 'DB':
                db_count = int(p[0])
            elif p[2:4] == 'WR':
                wr_count = int(p[0])
            elif p[2:4] == 'RB':
                rb_count = int(p[0])
            elif p[2:4] == 'OL':
                ol_count = int(p[0])
                
    return pd.Series([dl_count,db_count,lb_count,rb_count,wr_count,ol_count])

nfl_df[['def_dl_count','def_db_count','def_lb_count','def_rb_count','def_wr_count','def_ol_count']] = nfl_df.apply(transform_def_personnel, axis=1)

In [9]:
nfl_df_2 = nfl_df[[
    'defense_personnel',
                   'def_dl_count',
                   'def_db_count',
                   'def_lb_count',
                   'def_rb_count',
                   'def_wr_count',
                   'def_ol_count']]

In [10]:
nfl_df_2.drop_duplicates()

Unnamed: 0,defense_personnel,def_dl_count,def_db_count,def_lb_count,def_rb_count,def_wr_count,def_ol_count
2,,0,0,0,0,0,0
48871,"4 DL, 3 LB, 4 DB",4,4,3,0,0,0
48878,"3 DL, 4 LB, 4 DB",3,4,4,0,0,0
48880,"1 DL, 5 LB, 5 DB",1,5,5,0,0,0
48886,"3 DL, 3 LB, 5 DB",3,5,3,0,0,0
...,...,...,...,...,...,...,...
329859,"1 DL, 4 LB, 5 DB, 1 OL",1,5,4,0,0,1
332553,"2 DL, 1 LB, 4 DB, 1 RB,3 WR",2,4,1,1,0,0
335982,"1 DL, 3 LB, 4 DB, 1 TE,2 WR",1,4,3,0,0,0
336733,"1 DL, 3 LB, 4 DB, 1 RB,2 WR",1,4,3,1,0,0


In [19]:
cross_tab = pd.crosstab(index=nfl_df['play_type'],
                        columns=nfl_df['play_type_nfl'])
cross_tab

play_type_nfl,FIELD_GOAL,PASS,PAT2,PUNT,RUSH,SACK,XP_KICK
play_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
extra_point,0,0,0,0,0,0,5261
field_goal,4043,0,0,0,0,0,0
pass,0,74848,399,0,0,5123,0
punt,0,0,0,8691,0,0,0
run,0,0,151,0,55373,0,0


In [38]:
bins = [0,10,20,30,40,50,60,70,80,90,100]
nfl_df['yardline_binned'] = pd.cut(nfl_df['yardline_100'], bins)

### Basic Model (Run/Pass)

In [93]:
# filter for just pass and run
nfl_df_pass_rush = nfl_df[nfl_df['play_type'].isin(['pass', 'run'])]
nfl_df_pass_rush['play_type_binary'] = nfl_df_pass_rush['play_type'].map(lambda p: 1 if p == 'pass' else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nfl_df_pass_rush['play_type_binary'] = nfl_df_pass_rush['play_type'].map(lambda p: 1 if p == 'pass' else 0)


In [94]:
nfl_df_pass_rush[['play_type_binary', 'play_type']].head()

Unnamed: 0,play_type_binary,play_type
2,1,pass
3,0,run
4,1,pass
5,0,run
6,1,pass


Index(['ARI', 'ATL', 'BAL', 'BUF', 'CAR', 'CHI', 'CIN', 'CLE', 'DAL', 'DEN',
       'DET', 'GB', 'HOU', 'IND', 'JAX', 'KC', 'LA', 'LAC', 'LV', 'MIA', 'MIN',
       'NE', 'NO', 'NYG', 'NYJ', 'PHI', 'PIT', 'SEA', 'SF', 'TB', 'TEN',
       'WAS'],
      dtype='object')

In [121]:
nfl_model_data = nfl_df_pass_rush
dummy = pd.get_dummies(nfl_model_data['down'])
dummy.columns = ['firstdown', 'seconddown', 'thirddown', 'fourthdown']
nfl_model_data = pd.concat([nfl_model_data, dummy], axis=1)  
nfl_model_data = nfl_model_data.drop(columns=['fourthdown'])
defteamdummy = pd.get_dummies(nfl_model_data['defteam'])
defteamdummy.columns = ['dARI', 'dATL', 'dBAL', 'dBUF', 'dCAR', 'dCHI', 'dCIN', 'dCLE', 'dDAL', 
                        'dDEN','dDET', 'dGB', 'dHOU', 'dIND', 'dJAX', 'dKC', 'dLA', 'dLAC', 'dLV', 
                        'dMIA', 'dMIN','dNE', 'dNO', 'dNYG', 'dNYJ', 'dPHI', 'dPIT', 'dSEA', 'dSF', 
                        'dTB', 'dTEN', 'dWAS']
nfl_model_data = pd.concat([nfl_model_data, defteamdummy], axis=1)  
nfl_model_data = nfl_model_data.drop(columns=['dWAS'])
nfl_model_data.columns

Index(['play_id', 'game_id', 'old_game_id', 'home_team', 'away_team',
       'season_type', 'week', 'posteam', 'posteam_type', 'defteam',
       ...
       'dNE', 'dNO', 'dNYG', 'dNYJ', 'dPHI', 'dPIT', 'dSEA', 'dSF', 'dTB',
       'dTEN'],
      dtype='object', length=431)

In [97]:
nfl_model_data[['yardline_100', 'firstdown', 'seconddown', 'thirddown', 'ydstogo', 'game_seconds_remaining', 
            'score_differential', 'dARI', 'dATL', 'dBAL', 'dBUF', 'dCAR', 'dCHI', 'dCIN', 'dCLE', 'dDAL', 
                        'dDEN','dDET', 'dGB', 'dHOU', 'dIND', 'dJAX', 'dKC', 'dLA', 'dLAC', 'dLV', 
                        'dMIA', 'dMIN','dNE', 'dNO', 'dNYG', 'dNYJ', 'dPHI', 'dPIT', 'dSEA', 'dSF', 
                        'dTB', 'dTEN']]

Unnamed: 0,yardline_100,firstdown,seconddown,thirddown,ydstogo,game_seconds_remaining,score_differential,dARI,dATL,dBAL,...,dNE,dNO,dNYG,dNYJ,dPHI,dPIT,dSEA,dSF,dTB,dTEN
2,80.0,1,0,0,10.0,3600.0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,77.0,0,1,0,7.0,3573.0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,75.0,0,0,1,5.0,3532.0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,65.0,1,0,0,10.0,3494.0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,65.0,0,1,0,10.0,3450.0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340411,75.0,1,0,0,10.0,85.0,-3.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
340412,58.0,1,0,0,10.0,79.0,-3.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
340413,49.0,0,1,0,1.0,54.0,-3.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
340414,49.0,0,0,1,1.0,48.0,-3.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [90]:
[i for i in nfl_model_data.columns]

['play_id',
 'game_id',
 'old_game_id',
 'home_team',
 'away_team',
 'season_type',
 'week',
 'posteam',
 'posteam_type',
 'defteam',
 'side_of_field',
 'yardline_100',
 'game_date',
 'quarter_seconds_remaining',
 'half_seconds_remaining',
 'game_seconds_remaining',
 'game_half',
 'quarter_end',
 'drive',
 'sp',
 'qtr',
 'down',
 'goal_to_go',
 'time',
 'yrdln',
 'ydstogo',
 'ydsnet',
 'desc',
 'play_type',
 'yards_gained',
 'shotgun',
 'no_huddle',
 'qb_dropback',
 'qb_kneel',
 'qb_spike',
 'qb_scramble',
 'pass_length',
 'pass_location',
 'air_yards',
 'yards_after_catch',
 'run_location',
 'run_gap',
 'field_goal_result',
 'kick_distance',
 'extra_point_result',
 'two_point_conv_result',
 'home_timeouts_remaining',
 'away_timeouts_remaining',
 'timeout',
 'timeout_team',
 'td_team',
 'td_player_name',
 'td_player_id',
 'posteam_timeouts_remaining',
 'defteam_timeouts_remaining',
 'total_home_score',
 'total_away_score',
 'posteam_score',
 'defteam_score',
 'score_differential',
 'po

In [122]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

In [123]:
label = ['play_type_binary']
features = ['yardline_100', 'firstdown', 'seconddown', 'thirddown', 'ydstogo', 'game_seconds_remaining', 
            'score_differential', 'dARI', 'dATL', 'dBAL', 'dBUF', 'dCAR', 'dCHI', 'dCIN', 'dCLE', 'dDAL', 
                        'dDEN','dDET', 'dGB', 'dHOU', 'dIND', 'dJAX', 'dKC', 'dLA', 'dLAC', 'dLV', 
                        'dMIA', 'dMIN','dNE', 'dNO', 'dNYG', 'dNYJ', 'dPHI', 'dPIT', 'dSEA', 'dSF', 
                        'dTB', 'dTEN']
clean_nfl_model_data = nfl_model_data[features + label].dropna()

The number of timeouts remaining does not help: `posteam_timeouts_remaining`, `defteam_timeouts_remaining`

### Splitting the data

In [124]:
# importing train_test_split from sklearn
training_data, testing_data = train_test_split(clean_nfl_model_data, test_size=0.2, random_state=2022)
training_data, validation_data = train_test_split(training_data, test_size=0.25, random_state=2022)
print(f"No. of training examples: {training_data.shape[0]}")
print(f"No. of testing examples: {validation_data.shape[0]}")
print(f"No. of testing examples: {testing_data.shape[0]}")

No. of training examples: 141883
No. of testing examples: 47295
No. of testing examples: 47295


In [125]:
# very simple model of play_type_binary ~ features
x_train = training_data[features].to_numpy()
y_train = training_data['play_type_binary'].to_numpy()
x_val =  validation_data[features].to_numpy()
y_val =  validation_data['play_type_binary'].to_numpy()
x_test =  testing_data[features].to_numpy()
y_test =  testing_data['play_type_binary'].to_numpy()

In [134]:
clf = LogisticRegression(random_state=2022, max_iter=300).fit(x_train, y_train)

In [116]:
training_data[features]

Unnamed: 0,yardline_100,firstdown,seconddown,thirddown,ydstogo,game_seconds_remaining,score_differential,dARI,dATL,dBAL,...,dNE,dNO,dNYG,dNYJ,dPHI,dPIT,dSEA,dSF,dTB,dTEN
100625,47.0,0,1,0,20.0,109.0,-14.0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
224748,52.0,0,0,1,8.0,2133.0,-3.0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
138853,82.0,0,1,0,10.0,73.0,-11.0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
303689,75.0,1,0,0,10.0,509.0,-5.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40628,67.0,0,1,0,6.0,1389.0,1.0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78425,32.0,0,0,1,2.0,1265.0,4.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
299669,75.0,1,0,0,10.0,539.0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
310566,49.0,1,0,0,10.0,2133.0,10.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
166061,54.0,1,0,0,10.0,3164.0,-7.0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [117]:
y_train

array([1, 1, 1, ..., 1, 0, 0])

In [135]:
prediction = clf.predict(x_val)
accuracy = metrics.accuracy_score(y_val, prediction)
print("The accuracy of the base model is","{:.2f}".format(accuracy *100),"%")

The accuracy of the base model is 65.28 %


In [120]:
y_val

array([0, 1, 0, ..., 0, 1, 0])