In [131]:
import pandas as pd
import numpy as np

In [132]:
# Read the four raw CSV tables from the local data/ directory.
games, plays, tackles, players = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/games.csv",
        "data/plays.csv",
        "data/tackles.csv",
        "data/players.csv",
    )
)

In [133]:
# Target variable: a play is labelled a run when its passResult is missing.
# (The play description could also be used to count scrambles as runs.)
plays = plays.assign(run=plays["passResult"].isna())

In [134]:
# Keep only the play-level columns used later in the notebook.
play_columns = [
    "gameId", "playId", "run",
    "quarter", "down", "yardsToGo",
    "possessionTeam", "defensiveTeam",
    "yardlineSide", "yardlineNumber",
    "gameClock",
    "preSnapHomeScore", "preSnapVisitorScore",
    "absoluteYardlineNumber",
    "preSnapHomeTeamWinProbability", "expectedPoints",
    "offenseFormation", "defendersInTheBox",
    "homeTeamWinProbabilityAdded", "expectedPointsAdded",
]
plays_filtered = plays[play_columns]

In [135]:
# Keep only the game-level columns used later in the notebook.
game_columns = [
    "gameId",
    "week",
    "gameDate",
    "gameTimeEastern",
    "homeFinalScore",
    "visitorFinalScore",
]
games_filtered = games[game_columns]

In [136]:
# Keep each player's position together with the nflId key.
player_columns = ["position", "nflId"]
players_filtered = players[player_columns]

In [137]:
# Collapse the tackle rows to one row per (gameId, playId) by summing the
# tackle, assist, forced-fumble and missed-tackle columns.
tackles_filtered = (
    tackles[["gameId", "playId", "tackle", "assist", "forcedFumble", "pff_missedTackle"]]
    .groupby(["gameId", "playId"])
    .sum()
    .reset_index()
)

In [138]:
# Attach the game-level columns to every play (inner join on gameId).
merged1 = pd.merge(plays_filtered, games_filtered, how="inner", on="gameId")

In [139]:
# Left-join the per-play tackle totals so that plays without a matching
# tackle row are kept (their tackle columns are left missing).
data = pd.merge(merged1, tackles_filtered, how="left", on=["gameId", "playId"])

In [140]:
# Display the merged frame (plays + games + per-play tackle totals) for a visual check.
data

Unnamed: 0,gameId,playId,run,quarter,down,yardsToGo,possessionTeam,defensiveTeam,yardlineSide,yardlineNumber,...,expectedPointsAdded,week,gameDate,gameTimeEastern,homeFinalScore,visitorFinalScore,tackle,assist,forcedFumble,pff_missedTackle
0,2022100908,3537,False,4,1,10,ATL,TB,ATL,41,...,0.981955,5,10/09/2022,13:00:00,21,15,1.0,0.0,0.0,0.0
1,2022091103,3126,True,4,1,10,PIT,CIN,PIT,34,...,-0.263424,1,09/11/2022,13:00:00,20,23,1.0,0.0,0.0,1.0
2,2022091111,1148,False,2,2,5,LV,LAC,LV,30,...,1.133666,1,09/11/2022,16:25:00,24,19,1.0,0.0,0.0,0.0
3,2022100212,2007,True,3,2,10,DEN,LV,DEN,37,...,-0.043580,4,10/02/2022,16:25:00,32,23,0.0,2.0,0.0,2.0
4,2022091900,1372,True,2,1,10,BUF,TEN,TEN,35,...,-0.167903,2,09/19/2022,19:15:00,41,7,0.0,2.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12481,2022100204,123,True,1,1,10,DAL,WAS,WAS,39,...,-0.504018,4,10/02/2022,13:00:00,25,10,1.0,0.0,0.0,0.0
12482,2022091200,3467,False,4,1,10,SEA,DEN,SEA,30,...,-0.444642,1,09/12/2022,20:15:00,17,16,1.0,1.0,0.0,0.0
12483,2022101605,3371,True,4,1,10,CIN,NO,CIN,41,...,0.203819,6,10/16/2022,13:00:00,26,30,1.0,1.0,0.0,1.0
12484,2022100207,2777,True,3,1,10,IND,TEN,TEN,34,...,-0.976039,4,10/02/2022,13:00:00,17,24,1.0,1.0,0.0,0.0


In [141]:
# Check the possible values of offenseFormation (missing values show up as nan).
# The earlier bare `data.columns` expression was removed: only the last
# expression of a cell is displayed, so it was evaluated and discarded.
data["offenseFormation"].unique()

array(['SHOTGUN', 'I_FORM', 'SINGLEBACK', 'EMPTY', 'PISTOL', 'WILDCAT',
       'JUMBO', nan], dtype=object)

In [142]:
# One-hot encode offenseFormation. The indicator columns are named
# "offenseFormation_<VALUE>" and are appended after the remaining columns.
data = pd.get_dummies(
    data,
    columns=["offenseFormation"],
    prefix="offenseFormation",
    prefix_sep="_",
)

data.columns

Index(['gameId', 'playId', 'run', 'quarter', 'down', 'yardsToGo',
       'possessionTeam', 'defensiveTeam', 'yardlineSide', 'yardlineNumber',
       'gameClock', 'preSnapHomeScore', 'preSnapVisitorScore',
       'absoluteYardlineNumber', 'preSnapHomeTeamWinProbability',
       'expectedPoints', 'defendersInTheBox', 'homeTeamWinProbabilityAdded',
       'expectedPointsAdded', 'week', 'gameDate', 'gameTimeEastern',
       'homeFinalScore', 'visitorFinalScore', 'tackle', 'assist',
       'forcedFumble', 'pff_missedTackle', 'offenseFormation_EMPTY',
       'offenseFormation_I_FORM', 'offenseFormation_JUMBO',
       'offenseFormation_PISTOL', 'offenseFormation_SHOTGUN',
       'offenseFormation_SINGLEBACK', 'offenseFormation_WILDCAT'],
      dtype='object')

Before starting the linear regression, let me write out what I predict will happen based on the features I have selected.

EPA will be a difficult variable to find a pattern in because, in theory, the mean EPA across all plays should be zero.  In addition, if both sides act with complete information and make perfect decisions, the decision to run or pass (and consequently the pre-snap tells, such as defenders in the box and offensive formation) should also have zero impact on EPA.  However, it is possible that some strategies turn out to be better than others, which would show up as a correlation between EPA and these parameters.

Another thing to account for is that NFL teams sometimes optimize for something other than EPA.  For example, possession resets at halftime.  Therefore, as teams are forced to rush to complete plays, we may find that EPA per play tails off right at the end of the first half, and at the end of the game as well.  In addition, I expect to find observable patterns at the end of games, where teams sacrifice EPA per play in order to achieve other objectives (running out the clock if ahead, or scoring quickly if behind).

In [143]:
# Feature preparation for the regression: drop columns that should not be
# used as predictors and turn the remaining string columns into numbers.
# NOTE: this cell reassigns `data`, so it is meant to be run once after the
# one-hot-encoding cell; re-running it fails because the columns are gone.

# remove all post-snap features other than EPA Added
data = data.drop(columns=["homeTeamWinProbabilityAdded", 'homeFinalScore', 'visitorFinalScore', 'tackle', 'assist',
       'forcedFumble', 'pff_missedTackle'])

# makes no sense to perform regression on these identifiers either
data = data.drop(columns=["gameId", "playId"])

# this could be added back.  for now, remove, as imo insights on what teams are good is not as valuable
# as insights as to how to optimize controlled pre-snap variables to achieve better outcomes
data = data.drop(columns=["defensiveTeam", "possessionTeam"])

# yardlineSide and yardlineNumber are redundant
# (presumably with absoluteYardlineNumber, which is kept -- TODO confirm)
data = data.drop(columns=["yardlineSide", "yardlineNumber"])

# gameTimeEastern is an "HH:MM:SS" string: convert to fractional hours.
# Split once and reuse the pieces; the seconds field is ignored, as before.
kickoff_parts = data["gameTimeEastern"].str.split(":", expand=True)
data["gameTimeEastern"] = kickoff_parts[0].astype(int) + kickoff_parts[1].astype(int) / 60

# gameDate is an "MM/DD/YYYY" string: convert to days after 2022-09-01.
# The explicit format avoids day/month ambiguity for dates such as 10/09/2022.
data["gameDate"] = (
    pd.to_datetime(data["gameDate"], format="%m/%d/%Y") - pd.to_datetime("2022-09-01")
).dt.days

# validate that gameDate is in int form; is_integer_dtype accepts any integer
# width, whereas comparing against 'int' depends on the platform's default int
if not pd.api.types.is_integer_dtype(data["gameDate"]):
    print("gameDate is not in int form")

# convert gameClock ("MM:SS") to seconds
clock_parts = data["gameClock"].str.split(":", expand=True)
data["gameClock"] = clock_parts[0].astype(int) * 60 + clock_parts[1].astype(int)

data

Unnamed: 0,run,quarter,down,yardsToGo,gameClock,preSnapHomeScore,preSnapVisitorScore,absoluteYardlineNumber,preSnapHomeTeamWinProbability,expectedPoints,...,week,gameDate,gameTimeEastern,offenseFormation_EMPTY,offenseFormation_I_FORM,offenseFormation_JUMBO,offenseFormation_PISTOL,offenseFormation_SHOTGUN,offenseFormation_SINGLEBACK,offenseFormation_WILDCAT
0,False,4,1,10,472,21,7,69,0.976785,2.360609,...,5,38,13.000000,False,False,False,False,True,False,False
1,True,4,1,10,458,14,20,76,0.160485,1.733344,...,1,10,13.000000,False,False,False,False,True,False,False
2,False,2,2,5,537,10,3,40,0.756661,1.312855,...,1,10,16.416667,False,True,False,False,False,False,False
3,True,3,2,10,792,19,16,47,0.620552,1.641006,...,4,31,16.416667,False,False,False,False,False,True,False
4,True,2,1,10,513,7,7,75,0.836290,3.686428,...,2,18,19.250000,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12481,True,1,1,10,811,0,0,49,0.638600,3.642571,...,4,31,13.000000,False,False,False,False,False,True,False
12482,False,4,1,10,368,17,16,40,0.615241,1.434580,...,1,11,20.250000,False,False,False,False,False,True,False
12483,True,4,1,10,575,26,21,69,0.667054,2.115356,...,6,45,13.000000,False,False,False,False,True,False,False
12484,True,3,1,10,122,17,24,44,0.410611,3.946232,...,4,31,13.000000,False,False,False,False,True,False,False


In [144]:
# Fit a multivariate linear regression that predicts EPA from all other features.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Rows containing any missing value are discarded before modelling.
data = data.dropna()

# Separate the target (expectedPointsAdded) from the predictors.
y = data["expectedPointsAdded"]
X = data.drop(columns=["expectedPointsAdded"])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit on the training rows only.
model = LinearRegression().fit(X_train, y_train)

# Score the held-out rows with mean squared error.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mse

np.float64(1.2922277786022853)

How to gain insights from the linear regression?

In [145]:
# Pair every fitted coefficient with the name of the feature it belongs to.
coefficients = pd.Series(dict(zip(X.columns, model.coef_)))
coefficients

run                             -0.670988
quarter                         -0.046396
down                             0.158340
yardsToGo                       -0.023519
gameClock                        0.000167
preSnapHomeScore                 0.003436
preSnapVisitorScore              0.001358
absoluteYardlineNumber          -0.000104
preSnapHomeTeamWinProbability   -0.061565
expectedPoints                  -0.020220
defendersInTheBox               -0.004843
week                             0.106366
gameDate                        -0.015619
gameTimeEastern                 -0.001978
offenseFormation_EMPTY          -0.066626
offenseFormation_I_FORM         -0.040366
offenseFormation_JUMBO           0.103965
offenseFormation_PISTOL         -0.098139
offenseFormation_SHOTGUN        -0.023279
offenseFormation_SINGLEBACK     -0.040706
offenseFormation_WILDCAT         0.165151
dtype: float64

So, what did we learn from this?

First of all, the clearest result is that running the ball is associated with a much lower EPA than passing the ball (the `run` coefficient is about -0.67).  In fact, the conclusion is so striking that I would like to go back and make sure that sacks aren't counted as run plays.

Secondly, we see that the most effective formations are Wildcat and Jumbo, with everything else having a negative coefficient.  Both are non-traditional formations typically used in short-yardage scenarios (i.e. times when gaining 2 yards is similar in value to gaining 10, such as a third-and-one where the offense needs one yard to earn a new set of downs, or scenarios where the offense is one yard away from scoring a touchdown).  There are two possible explanations for this: either the Expected Points metric underestimates the value of these situations, resulting in a high EPA for all short-yardage situations, or Jumbo and Wildcat are effectively employed in the situations in which they are called upon.