# Importing Libraries

In [1]:
#pip uninstall sklearn

In [2]:
#pip uninstall numpy
#!pip install statsmodels

In [3]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn import linear_model
import matplotlib.pyplot as plt


In [4]:
#!pip install sagemaker
!pip install boto3
#!pip install pyarrow
#!pip install pyarrow.parquet
import pyarrow as pa
import pyarrow.parquet as pq
#from sagemaker import get_execution_role
#import boto3



## Basic Clean Up  (with smaller sample size)

In [5]:
years_required_2 =[2016,2017,2018,2019,2020,2021]
#download data
!aws s3 cp s3://capstone-nfl-data-adi/NFL_PBP_Data_2010_2022_original.parquet NFL_PBP_Data_2010_2022_original.parquet

fatal error: An error occurred (403) when calling the HeadObject operation: Forbidden


In [6]:
#Load Data
df = pd.read_parquet('NFL_PBP_Data_2010_2022_original.parquet', engine='pyarrow')
print(df.shape)

(588078, 384)


In [7]:
NFL_PBP_Data_small= df[df['season'].isin([2016,2017,2018,2019,2020,2021])]
NFL_PBP_Data_small.shape

(291550, 384)

In [8]:
print(NFL_PBP_Data_small.season.unique())
print(NFL_PBP_Data_small.play_type.unique())

[2016 2017 2018 2019 2020 2021]
[None 'kickoff' 'run' 'pass' 'no_play' 'punt' 'field_goal' 'extra_point'
 'qb_kneel' 'qb_spike']


In [9]:
#nfl_df = NFL_PBP_Data_small.loc[NFL_PBP_Data_small['season'] == 2020 ]
nfl_df = NFL_PBP_Data_small
print(nfl_df.season.unique())
print(nfl_df.shape)

#nfl_df.to_excel (r'nfl_df_2020.xlsx', index = True, header=True)

[2016 2017 2018 2019 2020 2021]
(291550, 384)


### Removing Kneels, None, Qb Spikes & no_play 

In [10]:
nfl_df = nfl_df[nfl_df['play_type'].isin(['pass', 'run', 'punt', 'extra_point', 'field_goal']) ]
print(nfl_df.shape)

(231732, 384)


### Removing overtime - teams will act very different during overtime...

In [11]:
nfl_df = nfl_df[~(nfl_df['game_half'] == 'Overtime' )]
print(nfl_df.shape)

(230218, 384)


### Normalizing the Yardline as an absolute matrix (Yards to End-Zone)

In [12]:
#  yardline currently moves from 0 to 50.. so '30' could be subjective based on who possesses the ball 
#so I am normalizing it to 0 to 99 to make it easy to follow..

#Using yardline_100 much clearer matrix

nfl_df = nfl_df[nfl_df['yardline_100'].notna()]
print(nfl_df.shape)

(230218, 384)


### Parsing the offense_personnel to pimary positions

In [13]:
print(nfl_df.offense_personnel.unique())

['2 RB, 1 TE, 2 WR' '1 RB, 2 TE, 2 WR' None '6 OL, 1 RB, 2 TE, 1 WR'
 '1 RB, 1 TE, 3 WR' '2 RB, 2 TE, 1 WR' '6 OL, 2 RB, 2 TE, 0 WR'
 '6 OL, 1 RB, 1 TE, 2 WR' '1 RB, 3 TE, 1 WR' '6 OL, 2 RB, 0 TE, 2 WR'
 '1 RB, 0 TE, 4 WR' '2 RB, 0 TE, 3 WR' '6 OL, 2 RB, 1 TE, 1 WR'
 '0 RB, 1 TE, 4 WR' '2 RB, 3 TE, 0 WR' '' '1 RB, 1 TE, 1 WR,1 P,1 LS,1 DB'
 '6 OL, 1 RB, 2 TE, 0 WR,1 DL' '0 RB, 2 TE, 3 WR' '3 RB, 1 TE, 1 WR'
 '0 RB, 3 TE, 2 WR' '2 RB, 2 TE, 2 WR' '7 OL, 2 RB, 1 TE, 0 WR'
 '3 RB, 0 TE, 4 WR' '6 OL, 1 RB, 3 TE, 0 WR' '6 OL, 1 RB, 0 TE, 3 WR'
 '2 QB, 1 RB, 1 TE, 2 WR' '7 OL, 1 RB, 1 TE, 1 WR'
 '0 RB, 2 TE, 0 WR,1 P,1 LS,1 DL,1 K' '1 RB, 2 TE, 3 WR'
 '1 RB, 1 TE, 4 WR' '2 RB, 1 TE, 3 WR' '0 RB, 0 TE, 5 WR'
 '7 OL, 1 RB, 2 TE, 0 WR' '1 RB, 4 TE, 0 WR' '7 OL, 1 RB, 1 TE, 0 WR,1 DL'
 '2 RB, 2 TE, 5 WR' '0 RB, 1 TE, 0 WR,1 P,4 LB,1 LS,1 DL,3 DB'
 '6 OL, 1 RB, 1 TE, 1 WR,1 DL' '1 RB, 3 TE, 3 WR' '1 RB, 1 TE, 2 WR'
 '2 RB, 2 TE, 0 WR,1 DL' '2 RB, 3 TE, 4 WR' '2 RB, 3 TE, 1 WR'
 '2 RB, 1 TE, 2 WR,

In [14]:
def transform_off_personnel(row):

   rb_count = 0
   te_count = 0
   wr_count = 0
   ol_count = 0
   dl_count = 0
   db_count = 0

   if not pd.isna(row['offense_personnel']):
       personnel = row['offense_personnel'].split(', ')
       for p in personnel:
           if p[2:4] == 'RB':
               rb_count = int(p[0])
           elif p[2:4] == 'TE':
                te_count = int(p[0])
           elif p[2:4] == 'WR':
                wr_count = int(p[0])
           elif p[2:4] == 'OL':
                ol_count = int(p[0])
           elif p[2:4] == 'DL':
                dl_count = int(p[0])
           elif p[2:4] == 'DB':
               db_count = int(p[0])
       
   off_formation = str(rb_count) + str(te_count)
   return pd.Series([rb_count,te_count,wr_count,ol_count,dl_count, db_count, off_formation])

nfl_df[['off_rb_count','off_te_count','off_wr_count','off_ol_count','off_dl_count', 'off_db_count','off_formation']] = nfl_df.apply(transform_off_personnel, axis=1)

#### Validation

In [15]:
nfl_df_2 = nfl_df[['offense_personnel','off_rb_count','off_te_count','off_wr_count','off_ol_count','off_dl_count', 'off_db_count','off_formation']]

In [16]:
nfl_df_2.drop_duplicates()

Unnamed: 0,offense_personnel,off_rb_count,off_te_count,off_wr_count,off_ol_count,off_dl_count,off_db_count,off_formation
290679,"2 RB, 1 TE, 2 WR",2,1,2,0,0,0,21
290683,"1 RB, 2 TE, 2 WR",1,2,2,0,0,0,12
290685,,0,0,0,0,0,0,00
290687,"6 OL, 1 RB, 2 TE, 1 WR",1,2,1,6,0,0,12
290688,"1 RB, 1 TE, 3 WR",1,1,3,0,0,0,11
...,...,...,...,...,...,...,...,...
573370,"2 RB, 0 TE, 4 WR",2,0,4,0,0,0,20
574361,"1 RB, 1 TE, 0 WR,1 P,5 LB,1 LS,2 DB",1,1,0,0,0,0,11
575689,"6 OL, 1 RB, 0 TE, 0 WR,1 P,1 LS,1 DL,1 K",1,0,0,6,0,0,10
578541,"0 RB, 2 TE, 1 WR,1 P,4 LB,1 LS,2 DB",0,2,1,0,0,0,02


### Parsing the defense_personnel to pimary positions

In [17]:
print(nfl_df.defense_personnel.unique())

['4 DL, 3 LB, 4 DB' None '3 DL, 4 LB, 4 DB' '1 DL, 5 LB, 5 DB'
 '3 DL, 3 LB, 5 DB' '2 DL, 4 LB, 5 DB' '2 DL, 3 LB, 6 DB'
 '4 DL, 4 LB, 3 DB' '5 DL, 3 LB, 3 DB' '4 DL, 2 LB, 5 DB'
 '1 DL, 4 LB, 6 DB' '3 DL, 5 LB, 3 DB' '6 DL, 3 LB, 2 DB'
 '2 DL, 5 LB, 4 DB' '5 DL, 4 LB, 2 DB' '4 DL, 1 LB, 6 DB' ''
 '1 DL, 3 LB, 5 DB, 2 RB' '6 DL, 4 LB, 1 DB' '3 DL, 2 LB, 6 DB'
 '5 DL, 2 LB, 4 DB' '3 DL, 1 LB, 7 DB' '2 DL, 2 LB, 7 DB'
 '6 DL, 2 LB, 3 DB' '5 DL, 3 LB, 2 DB' '4 DL, 4 LB, 2 DB'
 '4 DL, 5 LB, 2 DB' '1 DL, 3 LB, 7 DB' '4 DL, 1 LB, 5 DB'
 '4 DL, 2 LB, 3 DB' '4 DL, 3 LB, 3 DB' '5 DL, 1 LB, 5 DB'
 '1 DL, 3 LB, 6 DB' '5 DL, 2 LB, 3 DB, 1 OL' '2 DL, 3 LB, 5 DB, 1 WR'
 '3 DL, 3 LB, 4 DB, 1 WR' '4 DL, 6 LB, 1 DB' '2 DL, 3 LB, 4 DB, 1 RB,1 WR'
 '4 DL, 0 LB, 7 DB' '5 DL, 5 LB, 1 DB' '4 DL, 2 LB, 4 DB'
 '3 DL, 3 LB, 4 DB' '2 DL, 3 LB, 5 DB' '3 DL, 2 LB, 5 DB, 1 OL'
 '0 DL, 3 LB, 4 DB, 1 RB,1 TE,2 WR' '2 DL, 5 LB, 3 DB, 1 OL'
 '6 DL, 1 LB, 4 DB' '3 DL, 2 LB, 5 DB' '0 DL, 2 LB, 3 DB, 3 RB,1 TE,2 WR'
 '3 

In [18]:
def transform_def_personnel(row):

   dl_count = 0
   db_count = 0
   lb_count = 0
   rb_count = 0
   wr_count = 0
   ol_count = 0
   
   if not pd.isna(row['defense_personnel']):
       personnel = row['defense_personnel'].split(', ')
       for p in personnel:
           if p[2:4] == 'LB':
                lb_count = int(p[0])
           elif p[2:4] == 'DL':
                dl_count = int(p[0])
           elif p[2:4] == 'DB':
                db_count = int(p[0])
           elif p[2:4] == 'WR':
                wr_count = int(p[0])
           elif p[2:4] == 'RB':
                rb_count = int(p[0])
           elif p[2:4] == 'OL':
                ol_count = int(p[0])
                
    
   return pd.Series([dl_count,db_count,lb_count,rb_count,wr_count,ol_count])

nfl_df[['def_dl_count','def_db_count','def_lb_count','def_rb_count','def_wr_count','def_ol_count']] = nfl_df.apply(transform_def_personnel, axis=1)

#### Validation

In [19]:
nfl_df_2 = nfl_df[[
    'defense_personnel',
                   'def_dl_count',
                   'def_db_count',
                   'def_lb_count',
                   'def_rb_count',
                   'def_wr_count',
                   'def_ol_count']]

In [20]:
nfl_df_2.drop_duplicates()

Unnamed: 0,defense_personnel,def_dl_count,def_db_count,def_lb_count,def_rb_count,def_wr_count,def_ol_count
290679,"4 DL, 3 LB, 4 DB",4,4,3,0,0,0
290685,,0,0,0,0,0,0
290686,"3 DL, 4 LB, 4 DB",3,4,4,0,0,0
290688,"1 DL, 5 LB, 5 DB",1,5,5,0,0,0
290694,"3 DL, 3 LB, 5 DB",3,5,3,0,0,0
...,...,...,...,...,...,...,...
571667,"1 DL, 4 LB, 5 DB, 1 OL",1,5,4,0,0,1
574361,"2 DL, 1 LB, 4 DB, 1 RB,3 WR",2,4,1,1,0,0
577790,"1 DL, 3 LB, 4 DB, 1 TE,2 WR",1,4,3,0,0,0
578541,"1 DL, 3 LB, 4 DB, 1 RB,2 WR",1,4,3,1,0,0


### Parsing the players_on_play to pimary positions

In [21]:
nfl_df_2 = nfl_df[['first_down_rush','first_down_pass','first_down_penalty','series','series_success','series_result','ydstogo','yards_gained']]

nfl_df_2.head(5)

Unnamed: 0,first_down_rush,first_down_pass,first_down_penalty,series,series_success,series_result,ydstogo,yards_gained
290679,0.0,0.0,0.0,1.0,1.0,First down,10.0,6.0
290680,0.0,1.0,0.0,1.0,1.0,First down,4.0,5.0
290681,0.0,0.0,0.0,2.0,0.0,Punt,10.0,0.0
290682,0.0,0.0,0.0,2.0,0.0,Punt,10.0,9.0
290683,0.0,0.0,0.0,2.0,0.0,Punt,1.0,0.0


In [22]:
print(nfl_df_2.series_result.unique())

['First down' 'Punt' 'Turnover' 'Field goal' 'Touchdown' 'End of half'
 'Missed field goal' 'Turnover on downs' 'Safety' 'Opp touchdown'
 'QB kneel']


In [23]:
cross_tab = pd.crosstab(index=nfl_df_2['series_result'],
                        columns=nfl_df_2['series_result'])
cross_tab

series_result,End of half,Field goal,First down,Missed field goal,Opp touchdown,Punt,QB kneel,Safety,Touchdown,Turnover,Turnover on downs
series_result,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
End of half,1033,0,0,0,0,0,0,0,0,0,0
Field goal,0,19780,0,0,0,0,0,0,0,0,0
First down,0,0,112247,0,0,0,0,0,0,0,0
Missed field goal,0,0,0,3512,0,0,0,0,0,0,0
Opp touchdown,0,0,0,0,1524,0,0,0,0,0,0
Punt,0,0,0,0,0,54372,0,0,0,0,0
QB kneel,0,0,0,0,0,0,140,0,0,0,0
Safety,0,0,0,0,0,0,0,205,0,0,0
Touchdown,0,0,0,0,0,0,0,0,23536,0,0
Turnover,0,0,0,0,0,0,0,0,0,7544,0


In [24]:
#assing passing as 1 & rush as 0
nfl_df_2['first_Down_success'] = nfl_df_2['series_result'].map(lambda p: 1 if p == 'First down' else 0)

nfl_df_2['Touch_Down_success'] = nfl_df_2['series_result'].map(lambda p: 1 if p == 'Touchdown' else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nfl_df_2['first_Down_success'] = nfl_df_2['series_result'].map(lambda p: 1 if p == 'First down' else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nfl_df_2['Touch_Down_success'] = nfl_df_2['series_result'].map(lambda p: 1 if p == 'Touchdown' else 0)


In [25]:
nfl_df_2.head(5)

Unnamed: 0,first_down_rush,first_down_pass,first_down_penalty,series,series_success,series_result,ydstogo,yards_gained,first_Down_success,Touch_Down_success
290679,0.0,0.0,0.0,1.0,1.0,First down,10.0,6.0,1,0
290680,0.0,1.0,0.0,1.0,1.0,First down,4.0,5.0,1,0
290681,0.0,0.0,0.0,2.0,0.0,Punt,10.0,0.0,0,0
290682,0.0,0.0,0.0,2.0,0.0,Punt,10.0,9.0,0,0
290683,0.0,0.0,0.0,2.0,0.0,Punt,1.0,0.0,0,0


# Creating a dataset for Successful First Down & Touch Down

In [26]:
#assing passing as 1 & rush as 0
nfl_df['first_Down_success'] = nfl_df['series_result'].map(lambda p: 1 if p == 'First down' else 0)

nfl_df['Touch_Down_success'] = nfl_df['series_result'].map(lambda p: 1 if p == 'Touchdown' else 0)
nfl_df.head(5)

Unnamed: 0,play_id,game_id,old_game_id,home_team,away_team,season_type,week,posteam,posteam_type,defteam,...,off_db_count,off_formation,def_dl_count,def_db_count,def_lb_count,def_rb_count,def_wr_count,def_ol_count,first_Down_success,Touch_Down_success
290679,58.0,2016_01_BUF_BAL,2016091101,BAL,BUF,REG,1,BAL,home,BUF,...,0,21,4,4,3,0,0,0,1,0
290680,85.0,2016_01_BUF_BAL,2016091101,BAL,BUF,REG,1,BAL,home,BUF,...,0,21,4,4,3,0,0,0,1,0
290681,109.0,2016_01_BUF_BAL,2016091101,BAL,BUF,REG,1,BAL,home,BUF,...,0,21,4,4,3,0,0,0,0,0
290682,130.0,2016_01_BUF_BAL,2016091101,BAL,BUF,REG,1,BAL,home,BUF,...,0,21,4,4,3,0,0,0,0,0
290683,154.0,2016_01_BUF_BAL,2016091101,BAL,BUF,REG,1,BAL,home,BUF,...,0,12,4,4,3,0,0,0,0,0


### Determining equating pass or rush as numberical

In [27]:
def PlayType_normalized(s):
  if s['play_type_nfl'] == 'PASS': 
    return 'PASS'
  elif s['play_type_nfl'] == 'RUSH': 
    return 'RUSH'
  else:
    return 'OTHER'

In [28]:
nfl_df['PlayType_normalized'] = nfl_df.apply(PlayType_normalized, axis=1)

In [29]:
#only keeping pass or rushes
nfl_df = nfl_df[~(nfl_df['PlayType_normalized'] == 'OTHER' )]

In [30]:
#assing passing as 1 & rush as 0
nfl_df['PlayType_Pass'] = nfl_df['PlayType_normalized'].map(lambda p: 1 if p == 'PASS' else 0)

#assing passing as 1 & rush as 0
nfl_df['PlayType_rush'] = nfl_df['PlayType_normalized'].map(lambda p: 1 if p == 'RUSH' else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nfl_df['PlayType_Pass'] = nfl_df['PlayType_normalized'].map(lambda p: 1 if p == 'PASS' else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nfl_df['PlayType_rush'] = nfl_df['PlayType_normalized'].map(lambda p: 1 if p == 'RUSH' else 0)


In [31]:
#[i for i in nfl_df.columns]

In [32]:
#Creating binary columns for timeouts remaining
temp_df= nfl_df[['home_team','away_team','possession_team','home_timeouts_remaining', 'away_timeouts_remaining','posteam_timeouts_remaining', 'defteam_timeouts_remaining']]
temp_df.head(5)
#Feild 'posteam_timeouts_remaining' is relevant for us

Unnamed: 0,home_team,away_team,possession_team,home_timeouts_remaining,away_timeouts_remaining,posteam_timeouts_remaining,defteam_timeouts_remaining
290679,BAL,BUF,BAL,3.0,3.0,3.0,3.0
290680,BAL,BUF,BAL,3.0,3.0,3.0,3.0
290681,BAL,BUF,BAL,3.0,3.0,3.0,3.0
290682,BAL,BUF,BAL,3.0,3.0,3.0,3.0
290683,BAL,BUF,BAL,3.0,3.0,3.0,3.0


In [33]:
# creating location based data - Situational
 
#Creating binary columns for Down

nfl_df['first_down_flag'] = nfl_df['down'].map(lambda p: 1 if p == 1 else 0)
nfl_df['second_down_flag'] = nfl_df['down'].map(lambda p: 1 if p == 2 else 0)
nfl_df['third_down_flag'] = nfl_df['down'].map(lambda p: 1 if p == 3 else 0)
nfl_df['forth_down_flag'] = nfl_df['down'].map(lambda p: 1 if p == 4 else 0)
temp_df = nfl_df[['down','first_down_flag', 'second_down_flag','third_down_flag', 'forth_down_flag']]
temp_df.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nfl_df['first_down_flag'] = nfl_df['down'].map(lambda p: 1 if p == 1 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nfl_df['second_down_flag'] = nfl_df['down'].map(lambda p: 1 if p == 2 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nfl_df['third_down_flag'] = nfl_df['down'].map(lam

Unnamed: 0,down,first_down_flag,second_down_flag,third_down_flag,forth_down_flag
290679,1.0,1,0,0,0
290680,2.0,0,1,0,0
290681,1.0,1,0,0,0
290682,2.0,0,1,0,0
290683,3.0,0,0,1,0


In [34]:
temp_df= nfl_df[['play_clock']]

In [35]:
temp_df['play_clock_clean']= ['0' if v is None else v for v in temp_df['play_clock']]

print(temp_df.play_clock.unique())

def clock_binned(row): 
    diff = int(row.play_clock_clean)
    if diff > 20: 
        return 'Greater_than_20 sec'
    elif diff >= 11 and diff <= 20: 
        return 'Between 11 and 20 sec'
    elif diff == 0: 
        return 'clock run-out'
    elif diff >= 5 and diff <= 10: 
        return 'Between 5 and 10 sec'
    elif diff > 0 and diff <= 5 : 
        return 'Between 0 and 5'
    else:
        return None

temp_df['Play_clock_categorized'] = temp_df.apply(clock_binned, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['play_clock_clean']= ['0' if v is None else v for v in temp_df['play_clock']]


['14' '17' '20' '1' '16' '3' '13' '8' '9' '7' '21' '10' '5' '4' '0' '6'
 '18' '11' '15' '2' '19' '22' '12' '24' '29' '39' '26' '38' '23' '27' '28'
 '30' '36' '33' '32' '35' '34' '31' '37' '40' '25' '54' '64' '74' '44']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['Play_clock_categorized'] = temp_df.apply(clock_binned, axis=1)


In [36]:
temp_df

Unnamed: 0,play_clock,play_clock_clean,Play_clock_categorized
290679,14,14,Between 11 and 20 sec
290680,17,17,Between 11 and 20 sec
290681,17,17,Between 11 and 20 sec
290682,20,20,Between 11 and 20 sec
290683,1,1,Between 0 and 5
...,...,...,...
582219,14,14,Between 11 and 20 sec
582220,9,9,Between 5 and 10 sec
582221,21,21,Greater_than_20 sec
582222,7,7,Between 5 and 10 sec


In [37]:
nfl_df['play_clock_clean']= ['0' if v is None else v for v in nfl_df['play_clock']]

#converting score differentails into possession gaps
def clock_binned(row): 
    diff = int(row.play_clock_clean)
    if diff > 20: 
        return 'Greater_than_20 sec'
    elif diff >= 11 and diff <= 20: 
        return 'Between 11 and 20 sec'
    elif diff == 0: 
        return 'clock run-out'
    elif diff >= 5 and diff <= 10: 
        return 'Between 5 and 10 sec'
    elif diff > 0 and diff <= 5 : 
        return 'Between 0 and 5'
    else:
        return None

#validations
    #temp_df['Play_clock_categorized'] = temp_df.apply(clock_binned, axis=1)
    #df_temp_1 = temp_df.drop_duplicates(keep='last')
    #df_temp_1
#applyig on the data set    
nfl_df['Play_clock_categorized'] = nfl_df.apply(clock_binned, axis=1)

#validation
#temp_df= nfl_df[['play_clock','Play_clock_categorized']]
#temp_df.drop_duplicates()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nfl_df['play_clock_clean']= ['0' if v is None else v for v in nfl_df['play_clock']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nfl_df['Play_clock_categorized'] = nfl_df.apply(clock_binned, axis=1)


In [38]:
#Creating binary columns for Play_clock_categorized

nfl_df['Play_clock_Greater_than_20'] = nfl_df['Play_clock_categorized'].map(lambda p: 1 if p == 'Greater_than_20 sec' else 0)
nfl_df['Play_clock_Between_11_and_20'] = nfl_df['Play_clock_categorized'].map(lambda p: 1 if p == 'Between 11 and 20 sec' else 0)
nfl_df['Play_clock_equal_0'] = nfl_df['Play_clock_categorized'].map(lambda p: 1 if p == 'clock run-out' else 0)
nfl_df['Play_clock_Between_5_and_10'] = nfl_df['Play_clock_categorized'].map(lambda p: 1 if p == 'Between 5 and 10 sec' else 0)
nfl_df['Play_clock_errors'] = nfl_df['Play_clock_categorized'].map(lambda p: 1 if p == None else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nfl_df['Play_clock_Greater_than_20'] = nfl_df['Play_clock_categorized'].map(lambda p: 1 if p == 'Greater_than_20 sec' else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nfl_df['Play_clock_Between_11_and_20'] = nfl_df['Play_clock_categorized'].map(lambda p: 1 if p == 'Between 11 and 20 sec' else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_gu

In [39]:
#converting score differentails into possession gaps
def score_diff_possession(row): 
    diff = row.score_differential
    if diff > 7: 
        return 2
    elif diff > 0 and diff < 7: 
        return 1
    elif diff == 0: 
        return 0
    elif diff < 0 and diff > -7: 
        return -1
    elif diff < -7: 
        return -2
    else:
        return None

nfl_df['poss_differential'] = nfl_df.apply(score_diff_possession, axis=1)


#Creating binary columns for Play_clock_categorized

nfl_df['poss_differential_2'] = nfl_df['poss_differential'].map(lambda p: 1 if p == 2 else 0)
nfl_df['poss_differential_1'] = nfl_df['poss_differential'].map(lambda p: 1 if p == 1 else 0)
nfl_df['poss_differential_0'] = nfl_df['poss_differential'].map(lambda p: 1 if p == 0 else 0)
nfl_df['poss_differential_-1'] = nfl_df['poss_differential'].map(lambda p: 1 if p == -1 else 0)
nfl_df['poss_differential_-2'] = nfl_df['poss_differential'].map(lambda p: 1 if p == -2 else 0)
temp_df = nfl_df[['score_differential','poss_differential_2','poss_differential_1', 'poss_differential_0','poss_differential_-1', 'poss_differential_-2']]
temp_df = temp_df.drop_duplicates(keep='last')
temp_df = temp_df.sort_values(by=['score_differential'], ascending=False)
temp_df.head(5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nfl_df['poss_differential'] = nfl_df.apply(score_diff_possession, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nfl_df['poss_differential_2'] = nfl_df['poss_differential'].map(lambda p: 1 if p == 2 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nfl_df['poss_differential_1'] = nfl_d

Unnamed: 0,score_differential,poss_differential_2,poss_differential_1,poss_differential_0,poss_differential_-1,poss_differential_-2
574288,49.0,1,0,0,0,0
517009,45.0,1,0,0,0,0
387467,44.0,1,0,0,0,0
350245,43.0,1,0,0,0,0
574307,42.0,1,0,0,0,0


In [40]:
temp_df= nfl_df[['defenders_in_box','n_offense','n_defense']]

print(temp_df.drop_duplicates())

#temp_df['defenders_in_box'] = (temp_df['defenders_in_box'] !='n').astype(int)
#temp_df['n_offense'] = (temp_df['n_offense'] !='n').astype(int)
#temp_df['n_defense'] = (temp_df['n_defense'] !='n').astype(int)

temp_df['defenders_in_box']= pd.to_numeric(temp_df['defenders_in_box'], errors='coerce')
temp_df['n_offense']= pd.to_numeric(temp_df['n_offense'], errors='coerce')
temp_df['n_defense']= pd.to_numeric(temp_df['n_defense'], errors='coerce')

temp_df['defenders_in_box'] = temp_df['defenders_in_box'].replace(np.nan, 0)
temp_df['n_offense'] = temp_df['n_offense'].replace(np.nan, 0)
temp_df['n_defense'] = temp_df['n_defense'].replace(np.nan, 0)


#temp_df = temp_df.drop_duplicates()
temp_df

        defenders_in_box  n_offense  n_defense
290679                 8       11.0       11.0
290680                 6       11.0       11.0
290681                 7       11.0       11.0
290686                 7       11.0       10.0
290688                 6       10.0       10.0
...                  ...        ...        ...
465974                 7        0.0       23.0
466038                 8        0.0       23.0
466094                 4        0.0       23.0
506671              <NA>        NaN        NaN
562781                 0       11.0       11.0

[117 rows x 3 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['defenders_in_box']= pd.to_numeric(temp_df['defenders_in_box'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['n_offense']= pd.to_numeric(temp_df['n_offense'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['n_defense']= pd.to_numeric(temp_

Unnamed: 0,defenders_in_box,n_offense,n_defense
290679,8,11.0,11.0
290680,6,11.0,11.0
290681,7,11.0,11.0
290682,7,11.0,11.0
290683,8,11.0,11.0
...,...,...,...
582219,6,0.0,0.0
582220,6,0.0,0.0
582221,5,0.0,0.0
582222,6,0.0,0.0


In [41]:
#Converting values in integer
#nfl_df['defenders_in_box'] = (nfl_df['defenders_in_box'] !='n').astype(int)
#nfl_df['n_offense'] = (nfl_df['n_offense'] !='n').astype(int)
#nfl_df['n_defense'] = (nfl_df['n_defense'] !='n').astype(int)

nfl_df['defenders_in_box']= pd.to_numeric(nfl_df['defenders_in_box'], errors='coerce')
nfl_df['n_offense']= pd.to_numeric(nfl_df['n_offense'], errors='coerce')
nfl_df['n_defense']= pd.to_numeric(nfl_df['n_defense'], errors='coerce')

nfl_df['defenders_in_box'] = nfl_df['defenders_in_box'].replace(np.nan, 0)
nfl_df['n_offense'] = nfl_df['n_offense'].replace(np.nan, 0)
nfl_df['n_defense'] = nfl_df['n_defense'].replace(np.nan, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nfl_df['defenders_in_box']= pd.to_numeric(nfl_df['defenders_in_box'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nfl_df['n_offense']= pd.to_numeric(nfl_df['n_offense'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nfl_df['n_defense']= pd.to_numeric(nfl_df['n_

In [42]:
nfl_df.head(5)

Unnamed: 0,play_id,game_id,old_game_id,home_team,away_team,season_type,week,posteam,posteam_type,defteam,...,Play_clock_Between_11_and_20,Play_clock_equal_0,Play_clock_Between_5_and_10,Play_clock_errors,poss_differential,poss_differential_2,poss_differential_1,poss_differential_0,poss_differential_-1,poss_differential_-2
290679,58.0,2016_01_BUF_BAL,2016091101,BAL,BUF,REG,1,BAL,home,BUF,...,1,0,0,0,0.0,0,0,1,0,0
290680,85.0,2016_01_BUF_BAL,2016091101,BAL,BUF,REG,1,BAL,home,BUF,...,1,0,0,0,0.0,0,0,1,0,0
290681,109.0,2016_01_BUF_BAL,2016091101,BAL,BUF,REG,1,BAL,home,BUF,...,1,0,0,0,0.0,0,0,1,0,0
290682,130.0,2016_01_BUF_BAL,2016091101,BAL,BUF,REG,1,BAL,home,BUF,...,1,0,0,0,0.0,0,0,1,0,0
290683,154.0,2016_01_BUF_BAL,2016091101,BAL,BUF,REG,1,BAL,home,BUF,...,0,0,0,0,0.0,0,0,1,0,0


### Subsetting the columns for model

In [43]:
nfl_model_data = nfl_df[['play_id','game_id','home_team','away_team','posteam','defteam',
                    'season_type','ydstogo','yardline_100','season',
                    'game_seconds_remaining',
                    #scoring feilds
                    'score_differential','poss_differential_2','poss_differential_1', 'poss_differential_0','poss_differential_-1', 'poss_differential_-2',
                    #play type details
                         'PlayType_normalized','pass_location','pass_length', 'run_location','run_gap',
                    #down fields
                    'down','first_down_flag', 'second_down_flag','third_down_flag', 'forth_down_flag','posteam_timeouts_remaining',
                    #play clock fields
                   'play_clock','Play_clock_errors','Play_clock_equal_0', 'Play_clock_Between_5_and_10','Play_clock_Between_11_and_20', 'Play_clock_Greater_than_20',
                    #offensive configurtion
                    'off_formation','off_rb_count','off_te_count','off_wr_count','off_ol_count','off_dl_count','off_db_count',
                    #defensive configurtion
                   'defense_personnel','def_dl_count','def_db_count','def_lb_count','def_rb_count','def_wr_count','def_ol_count',
                    #offense + defense count feilds
                    'offense_formation','defenders_in_box','n_offense','n_defense',
                        #success parameters
                        'first_Down_success','Touch_Down_success']]

In [44]:
Export_df = nfl_model_data.head(50)
#Export_df.to_excel (r'Export_df.xlsx', index = True, header=True)

In [45]:
for i in nfl_model_data.columns:
    print(i)

play_id
game_id
home_team
away_team
posteam
defteam
season_type
ydstogo
yardline_100
season
game_seconds_remaining
score_differential
poss_differential_2
poss_differential_1
poss_differential_0
poss_differential_-1
poss_differential_-2
PlayType_normalized
pass_location
pass_length
run_location
run_gap
down
first_down_flag
second_down_flag
third_down_flag
forth_down_flag
posteam_timeouts_remaining
play_clock
Play_clock_errors
Play_clock_equal_0
Play_clock_Between_5_and_10
Play_clock_Between_11_and_20
Play_clock_Greater_than_20
off_formation
off_rb_count
off_te_count
off_wr_count
off_ol_count
off_dl_count
off_db_count
defense_personnel
def_dl_count
def_db_count
def_lb_count
def_rb_count
def_wr_count
def_ol_count
offense_formation
defenders_in_box
n_offense
n_defense
first_Down_success
Touch_Down_success


In [46]:
# get dummy for offensive team
temp_df_o = nfl_model_data[['posteam']]
temp_df_o= pd.get_dummies(temp_df_o['posteam'], prefix='pos')
#validation
#temp_df_results.head(5)
nfl_model_data = pd.concat([nfl_model_data, temp_df_o], axis=1)
nfl_model_data.head(5)

# get dummy for defensive team
temp_df_d = nfl_model_data[['defteam']]
temp_df_d = pd.get_dummies(temp_df_d['defteam'], prefix='def')
#validation
#temp_df_results.head(5)
nfl_model_data = pd.concat([nfl_model_data, temp_df_d], axis=1)
nfl_model_data.head(5)

# get dummy for year

temp_df_s = nfl_model_data[['season']]
temp_df_s= pd.get_dummies(temp_df_s['season'])
#validation
#temp_df_results.head(5)
nfl_model_data = pd.concat([nfl_model_data, temp_df_s], axis=1)
nfl_model_data.head(5)


Unnamed: 0,play_id,game_id,home_team,away_team,posteam,defteam,season_type,ydstogo,yardline_100,season,...,def_SF,def_TB,def_TEN,def_WAS,2016,2017,2018,2019,2020,2021
290679,58.0,2016_01_BUF_BAL,BAL,BUF,BAL,BUF,REG,10.0,86.0,2016,...,0,0,0,0,1,0,0,0,0,0
290680,85.0,2016_01_BUF_BAL,BAL,BUF,BAL,BUF,REG,4.0,80.0,2016,...,0,0,0,0,1,0,0,0,0,0
290681,109.0,2016_01_BUF_BAL,BAL,BUF,BAL,BUF,REG,10.0,75.0,2016,...,0,0,0,0,1,0,0,0,0,0
290682,130.0,2016_01_BUF_BAL,BAL,BUF,BAL,BUF,REG,10.0,75.0,2016,...,0,0,0,0,1,0,0,0,0,0
290683,154.0,2016_01_BUF_BAL,BAL,BUF,BAL,BUF,REG,1.0,66.0,2016,...,0,0,0,0,1,0,0,0,0,0


In [47]:
# Creating more detailed pass & run categories
temp_df = nfl_model_data[['PlayType_normalized','pass_location','pass_length', 'run_location','run_gap']]

def Play_type_detailed(s):
  if s['PlayType_normalized'] == 'PASS' and s['pass_location'] != None and s['pass_length'] != None: 
    return 'PASS' + '_' + str(s['pass_location']).upper() + '_' + str(s['pass_length']).upper()
  elif s['PlayType_normalized'] == 'RUSH' and s['run_location'] != None and s['run_gap'] != None: 
    return 'RUSH' + '_' + str(s['run_location']).upper()+ '_' + str(s['run_gap']).upper()
  else:
    return 'OTHER'
temp_df['Play_type_detailed'] = temp_df.apply(Play_type_detailed, axis=1)
temp_df = temp_df[~(temp_df['Play_type_detailed'] == 'OTHER' )]

#validation
temp_df = temp_df.drop_duplicates(subset=['Play_type_detailed'])
temp_df

# applyin on the data model

nfl_model_data['Play_type_detailed'] = nfl_model_data.apply(Play_type_detailed, axis=1)
nfl_model_data = nfl_model_data[~(nfl_model_data['Play_type_detailed'] == 'OTHER' )]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['Play_type_detailed'] = temp_df.apply(Play_type_detailed, axis=1)


In [48]:
print(nfl_model_data.Play_type_detailed.unique())

['RUSH_RIGHT_END' 'PASS_RIGHT_SHORT' 'RUSH_LEFT_GUARD' 'RUSH_RIGHT_TACKLE'
 'PASS_MIDDLE_SHORT' 'PASS_LEFT_SHORT' 'PASS_MIDDLE_DEEP' 'PASS_LEFT_DEEP'
 'PASS_RIGHT_DEEP' 'RUSH_RIGHT_GUARD' 'RUSH_LEFT_TACKLE' 'RUSH_LEFT_END'
 'RUSH_MIDDLE_END']


In [49]:
#binning play type

# get dummy for year

temp_df = nfl_model_data[['Play_type_detailed']]
temp_df= pd.get_dummies(temp_df['Play_type_detailed'], prefix='PTDetailed')
#validation
#temp_df_results.head(5)
nfl_model_data = pd.concat([nfl_model_data, temp_df], axis=1)
nfl_model_data.head(5)
#Export_df.to_excel (r'Export_df.xlsx', index = True, header=True)

Unnamed: 0,play_id,game_id,home_team,away_team,posteam,defteam,season_type,ydstogo,yardline_100,season,...,PTDetailed_PASS_MIDDLE_SHORT,PTDetailed_PASS_RIGHT_DEEP,PTDetailed_PASS_RIGHT_SHORT,PTDetailed_RUSH_LEFT_END,PTDetailed_RUSH_LEFT_GUARD,PTDetailed_RUSH_LEFT_TACKLE,PTDetailed_RUSH_MIDDLE_END,PTDetailed_RUSH_RIGHT_END,PTDetailed_RUSH_RIGHT_GUARD,PTDetailed_RUSH_RIGHT_TACKLE
290679,58.0,2016_01_BUF_BAL,BAL,BUF,BAL,BUF,REG,10.0,86.0,2016,...,0,0,0,0,0,0,0,1,0,0
290680,85.0,2016_01_BUF_BAL,BAL,BUF,BAL,BUF,REG,4.0,80.0,2016,...,0,0,1,0,0,0,0,0,0,0
290681,109.0,2016_01_BUF_BAL,BAL,BUF,BAL,BUF,REG,10.0,75.0,2016,...,0,0,0,0,1,0,0,0,0,0
290682,130.0,2016_01_BUF_BAL,BAL,BUF,BAL,BUF,REG,10.0,75.0,2016,...,0,0,1,0,0,0,0,0,0,0
290683,154.0,2016_01_BUF_BAL,BAL,BUF,BAL,BUF,REG,1.0,66.0,2016,...,0,0,0,0,0,0,0,0,0,1


## 1st down Prediction - Model

In [50]:
nfl_model_data_Clean = nfl_model_data

In [51]:
# importing train_test_split from sklearn
from sklearn.model_selection import train_test_split
nfl_model_data = nfl_model_data.reset_index()
training_data, testing_data = train_test_split(nfl_model_data_Clean, test_size=0.2, random_state=210)
print(f"No. of training examples: {training_data.shape[0]}")
print(f"No. of testing examples: {testing_data.shape[0]}")

No. of training examples: 136825
No. of testing examples: 34207


In [52]:
print(training_data.shape)
training_data.head()
for i in training_data.columns:
    print(i)

(136825, 138)
play_id
game_id
home_team
away_team
posteam
defteam
season_type
ydstogo
yardline_100
season
game_seconds_remaining
score_differential
poss_differential_2
poss_differential_1
poss_differential_0
poss_differential_-1
poss_differential_-2
PlayType_normalized
pass_location
pass_length
run_location
run_gap
down
first_down_flag
second_down_flag
third_down_flag
forth_down_flag
posteam_timeouts_remaining
play_clock
Play_clock_errors
Play_clock_equal_0
Play_clock_Between_5_and_10
Play_clock_Between_11_and_20
Play_clock_Greater_than_20
off_formation
off_rb_count
off_te_count
off_wr_count
off_ol_count
off_dl_count
off_db_count
defense_personnel
def_dl_count
def_db_count
def_lb_count
def_rb_count
def_wr_count
def_ol_count
offense_formation
defenders_in_box
n_offense
n_defense
first_Down_success
Touch_Down_success
pos_ARI
pos_ATL
pos_BAL
pos_BUF
pos_CAR
pos_CHI
pos_CIN
pos_CLE
pos_DAL
pos_DEN
pos_DET
pos_GB
pos_HOU
pos_IND
pos_JAX
pos_KC
pos_LA
pos_LAC
pos_LV
pos_MIA
pos_MIN
pos_NE
po

In [53]:
print(testing_data.shape)
testing_data.head()
#testing_data.to_excel (r'Export_df_testing.xlsx', index = True, header=True)

(34207, 138)


Unnamed: 0,play_id,game_id,home_team,away_team,posteam,defteam,season_type,ydstogo,yardline_100,season,...,PTDetailed_PASS_MIDDLE_SHORT,PTDetailed_PASS_RIGHT_DEEP,PTDetailed_PASS_RIGHT_SHORT,PTDetailed_RUSH_LEFT_END,PTDetailed_RUSH_LEFT_GUARD,PTDetailed_RUSH_LEFT_TACKLE,PTDetailed_RUSH_MIDDLE_END,PTDetailed_RUSH_RIGHT_END,PTDetailed_RUSH_RIGHT_GUARD,PTDetailed_RUSH_RIGHT_TACKLE
474217,3776.0,2019_15_MIA_NYG,NYG,MIA,NYG,MIA,REG,10.0,20.0,2019,...,0,0,0,0,0,0,0,1,0,0
528253,3091.0,2020_17_NO_CAR,CAR,NO,CAR,NO,REG,5.0,70.0,2020,...,0,0,0,0,0,0,0,0,1,0
550968,2551.0,2021_08_CIN_NYJ,NYJ,CIN,NYJ,CIN,REG,11.0,72.0,2021,...,0,0,0,0,0,0,0,0,0,0
337119,3948.0,2016_17_SEA_SF,SF,SEA,SEA,SF,REG,1.0,61.0,2016,...,0,0,1,0,0,0,0,0,0,0
400988,773.0,2018_05_OAK_LAC,LAC,LV,LV,LAC,REG,10.0,10.0,2018,...,0,0,1,0,0,0,0,0,0,0


In [54]:
#[i for i in training_data.columns]

In [55]:
# very simple model of playstyle ~ yardline_100 + down
feature_cols= ['ydstogo', 'yardline_100', 'season', 'game_seconds_remaining', 'score_differential', 
               'poss_differential_2', 'poss_differential_1', 'poss_differential_0', 'poss_differential_-1', 'poss_differential_-2', 
  'first_down_flag', 'second_down_flag', 'third_down_flag', 'forth_down_flag', 
  'Play_clock_errors', 'Play_clock_equal_0', 'Play_clock_Between_5_and_10', 'Play_clock_Between_11_and_20', 'Play_clock_Greater_than_20',
  'off_rb_count', 'off_te_count', 'off_wr_count', 'off_ol_count', 'off_dl_count', 'off_db_count',
  'def_dl_count', 'def_db_count', 'def_lb_count', 'def_rb_count', 'def_wr_count', 'def_ol_count',
               # 'defenders_in_box', 'n_offense', 'n_defense',
 'pos_ARI', 'pos_ATL', 'pos_BAL', 'pos_BUF', 'pos_CAR', 'pos_CHI', 'pos_CIN', 'pos_CLE', 'pos_DAL', 'pos_DEN', 'pos_DET', 'pos_GB',
 'pos_HOU', 'pos_IND', 'pos_JAX', 'pos_KC', 'pos_LA', 'pos_LAC', 'pos_LV', 'pos_MIA', 'pos_MIN', 'pos_NE', 'pos_NO', 'pos_NYG', 'pos_NYJ',
 'pos_PHI', 'pos_PIT', 'pos_SEA', 'pos_SF', 'pos_TB', 'pos_TEN', 'pos_WAS', 'def_ARI', 'def_ATL', 'def_BAL', 'def_BUF', 'def_CAR', 'def_CHI', 'def_CIN', 'def_CLE', 'def_DAL', 'def_DEN', 'def_DET', 'def_GB',
 'def_HOU', 'def_IND', 'def_JAX', 'def_KC', 'def_LA', 'def_LAC', 'def_LV', 'def_MIA', 'def_MIN', 'def_NE', 'def_NO', 'def_NYG', 'def_NYJ',
 'def_PHI', 'def_PIT', 'def_SEA', 'def_SF', 'def_TB', 'def_TEN', 'def_WAS',  2016, 2017, 2018, 2019, 2020, 2021,
  'PTDetailed_PASS_LEFT_DEEP', 'PTDetailed_PASS_LEFT_SHORT', 'PTDetailed_PASS_MIDDLE_DEEP', 'PTDetailed_PASS_MIDDLE_SHORT', 'PTDetailed_PASS_RIGHT_DEEP',
 'PTDetailed_PASS_RIGHT_SHORT', 'PTDetailed_RUSH_LEFT_END', 'PTDetailed_RUSH_LEFT_GUARD', 'PTDetailed_RUSH_LEFT_TACKLE', 'PTDetailed_RUSH_MIDDLE_END',
 'PTDetailed_RUSH_RIGHT_END', 'PTDetailed_RUSH_RIGHT_GUARD', 'PTDetailed_RUSH_RIGHT_TACKLE']

print(len(feature_cols))
x_train_1st_down = training_data[feature_cols].to_numpy()
y_train_1st_down = training_data['first_Down_success'].to_numpy()
x_test_1st_down =  testing_data[feature_cols].to_numpy()
y_test_1st_down =  testing_data['first_Down_success'].to_numpy()

114


In [56]:
from sklearn.linear_model import LinearRegression
regressor_first_down = LinearRegression()

In [57]:
regressor_first_down.fit(x_train_1st_down, y_train_1st_down)

In [58]:
y_prediction=regressor_first_down.predict(x_test_1st_down)

In [60]:
import sklearn.metrics as metrics
from sklearn.metrics import classification_report
print(y_prediction)
rounded_predictions = np.round(y_prediction)
print(rounded_predictions)
accuracy = metrics.accuracy_score(y_test_1st_down, rounded_predictions)
print(classification_report(y_test_1st_down, rounded_predictions))

[0.38387299 0.77983856 0.39214706 ... 0.36829376 0.63450623 0.36618423]
[0. 1. 0. ... 0. 1. 0.]
              precision    recall  f1-score   support

           0       0.64      0.48      0.55     14559
           1       0.68      0.80      0.73     19648

    accuracy                           0.66     34207
   macro avg       0.66      0.64      0.64     34207
weighted avg       0.66      0.66      0.65     34207



# Touch Down Model - Linear Regression

In [66]:
x_train = training_data[feature_cols].to_numpy()
y_train = training_data['Touch_Down_success'].to_numpy()
x_test =  testing_data[feature_cols].to_numpy()
y_test =  testing_data['Touch_Down_success'].to_numpy()

In [67]:
regressor_Touch_Down = LinearRegression()

In [68]:
regressor_Touch_Down.fit(x_train, y_train)

In [69]:
y_prediction=regressor_Touch_Down.predict(x_test)

In [70]:
import sklearn.metrics as metrics
rounded_predictions = np.round(y_prediction)
print(classification_report(y_test, rounded_predictions))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96     31581
           1       0.80      0.00      0.00      2626

    accuracy                           0.92     34207
   macro avg       0.86      0.50      0.48     34207
weighted avg       0.91      0.92      0.89     34207



# Uploading Touch Down Model on Hugging face 

In [71]:
#! pip install huggingface_hub
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [72]:
from huggingface_hub import HfApi
api = HfApi()
api.create_repo(repo_id="regressor_Touch_Down", private=True)

'https://huggingface.co/mchawla33/regressor_Touch_Down'

In [73]:
from joblib import dump, load
dump(regressor_Touch_Down, 'regressor_Touch_Down.joblib')

['regressor_Touch_Down.joblib']

In [74]:
# api.upload_file(path_or_fileobj="regressor_Touch_Down.joblib", 
#             path_in_repo="regressor_Touch_Down.joblib",
#             repo_id= 'Adi-khurana-berk/regressor_Touch_Down')

api.upload_file(path_or_fileobj="regressor_Touch_Down.joblib", 
            path_in_repo="regressor_Touch_Down.joblib",
            repo_id= 'mchawla33/regressor_Touch_Down')


'https://huggingface.co/mchawla33/regressor_Touch_Down/blob/main/regressor_Touch_Down.joblib'

# Downloading the Touch_Down Prediction Model

In [77]:
from huggingface_hub import hf_hub_download
# hf_hub_download(repo_id="Adi-khurana-berk/regressor_Touch_Down", filename="regressor_Touch_Down.joblib")
location = hf_hub_download(repo_id="mchawla33/regressor_Touch_Down", filename="regressor_Touch_Down.joblib")
# downloaded_regressor_Touch_Down = load('regressor_Touch_Down.joblib')
downloaded_regressor_Touch_Down = load(location)

Downloading:   0%|          | 0.00/2.39k [00:00<?, ?B/s]

In [79]:
first_down_predictions = downloaded_regressor_Touch_Down.predict(x_test_1st_down)
first_down_predictions

array([ 0.24129105, -0.01120758, -0.07261658, ...,  0.22384071,
        0.12533188,  0.06753159])

# Uploading 1st Down Model on Hugging face 

In [80]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [81]:
api = HfApi()
api.create_repo(repo_id="regressor_first_down", private=True)

'https://huggingface.co/mchawla33/regressor_first_down'

In [82]:
from joblib import dump, load
dump(regressor_first_down, 'regressor_first_down.joblib')

['regressor_first_down.joblib']

In [83]:
# api.upload_file(path_or_fileobj="regressor_first_down.joblib", 
#             path_in_repo="regressor_first_down.joblib",
#             repo_id= 'Adi-khurana-berk/regressor_first_down')

api.upload_file(path_or_fileobj="regressor_first_down.joblib", 
            path_in_repo="regressor_first_down.joblib",
            repo_id= 'mchawla33/regressor_first_down')

'https://huggingface.co/mchawla33/regressor_first_down/blob/main/regressor_first_down.joblib'

In [84]:
# hf_hub_download(repo_id="Adi-khurana-berk/regressor_first_down", filename="regressor_first_down.joblib")
# downloaded_regressor_Touch_Down = load('regressor_first_down.joblib') 

location = hf_hub_download(repo_id="mchawla33/regressor_first_down", filename="regressor_first_down.joblib")
downloaded_regressor_Touch_Down = load(location) 

Downloading:   0%|          | 0.00/2.39k [00:00<?, ?B/s]

In [85]:
touch_down_prediction = downloaded_regressor_Touch_Down.predict(x_test_1st_down)
touch_down_prediction

array([0.38387299, 0.77983856, 0.39214706, ..., 0.36829376, 0.63450623,
       0.36618423])

# ** Work in Progress **

## Touch Down Predictions - Using softmax & cross entropy Linear regression

In [None]:
def one_hot(y, c):
    
    # y--> label/ground truth.
    # c--> Number of classes.
    
    # A zero matrix of size (m, c)
    y_hot = np.zeros((len(y), c))
    
    # Putting 1 for column where the label is,
    # Using multidimensional indexing.
    y_hot[np.arange(len(y)), y] = 1
    
    return y_hot

In [None]:
def softmax(z):
    
    # z--> linear part.
    
    # subtracting the max of z for numerical stability.
    exp = np.exp(z - np.max(z))
    
    # Calculating softmax for all examples.
    for i in range(len(z)):
        exp[i] /= np.sum(exp[i])
        
    return exp

In [None]:
def fit(X, y, lr, c, epochs):
    
    # X --> Input.
    # y --> true/target value.
    # lr --> Learning rate.
    # c --> Number of classes.
    # epochs --> Number of iterations.
    
        
    # m-> number of training examples
    # n-> number of features 
    m, n = X.shape
    
    # Initializing weights and bias randomly.
    w = np.random.random((n, c))
    b = np.random.random(c)
    # Empty list to store losses.
    losses = []
    
    # Training loop.
    for epoch in range(epochs):
        
        # Calculating hypothesis/prediction.
        z = X@w + b
        y_hat = softmax(z)
        
        # One-hot encoding y.
        y_hot = one_hot(y, c)
        
        # Calculating the gradient of loss w.r.t w and b.
        w_grad = (1/m)*np.dot(X.T, (y_hat - y_hot)) 
        b_grad = (1/m)*np.sum(y_hat - y_hot)
        
        # Updating the parameters.
        w = w - lr*w_grad
        b = b - lr*b_grad
        
        # Calculating loss and appending it in the list.
        loss = -np.mean(np.log(y_hat[np.arange(len(y)), y]))
        losses.append(loss)
        # Printing out the loss at every 100th iteration.
        if epoch%100==0:
            print('Epoch {epoch}==> Loss = {loss}'
                  .format(epoch=epoch, loss=loss))
    return w, b, losses

def predict(X, w, b):
    
    # X --> Input.
    # w --> weights.
    # b --> bias.
    
    # Predicting
    z = X@w + b
    y_hat = softmax(z)
    
    # Returning the class with highest probability.
    return np.argmax(y_hat, axis=1)

In [None]:
x_train = training_data[feature_cols].to_numpy()
y_train = training_data['Touch_Down_success'].to_numpy()

x_test =  testing_data[feature_cols].to_numpy()
y_test =  testing_data['Touch_Down_success'].to_numpy()

In [None]:
#w, b, l = fit(x_train, y_train, lr=1, c=2, epochs=300)

In [None]:
y_prediction  = predict(x_test, w, b)
rounded_predictions = np.round(y_prediction)
print(classification_report(y_test, rounded_predictions))