In [222]:
import pandas as pd
import numpy as np

In [223]:
class selfDerivedLinearRegression:
    def __init__(self, learning_rate=0.01, iterations=1000):
        self.learning_rate = learning_rate
        self.iterations = iterations
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        n_samples, n_features = X.shape
        print(n_samples, n_features)
        self.weights = np.zeros(n_features)
        self.bias = 0

        for _ in range(self.iterations):
            y_predicted = np.dot(X, self.weights) + self.bias
            dw = (1 / n_samples) * np.dot(X.T, (y_predicted - y))
            db = (1 / n_samples) * np.sum(y_predicted - y)
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db
            # if self.weights and bias are not null, then print them
            if self.weights.all() != None and self.bias != None:
                print(self.weights, self.bias, _)



    def predict(self, X):
        return np.dot(X, self.weights) + self.bias

In [224]:
# load dataframes
games = pd.read_csv("data/games.csv")
plays = pd.read_csv("data/plays.csv")
tackles = pd.read_csv("data/tackles.csv")
players = pd.read_csv("data/players.csv")

In [225]:
# add target var (can also use play desc to count scrambles as runs)
plays["run"] = plays["passResult"].isna()
def get_direction(playDescription):
        if("left" in playDescription):
                return 0
        if("right" in playDescription):
                return 2
        return 1
plays["direction"] = plays["playDescription"].apply(get_direction)

In [226]:
# filter plays dataframe
plays_filtered=plays[["gameId", "playId", "run", "direction", "quarter", "down", "yardsToGo", "possessionTeam", "defensiveTeam",
                      "yardlineSide", "yardlineNumber", "gameClock", "preSnapHomeScore", "preSnapVisitorScore", "absoluteYardlineNumber",
                      "preSnapHomeTeamWinProbability", "expectedPoints", "offenseFormation", "defendersInTheBox",
                      "homeTeamWinProbabilityAdded", "expectedPointsAdded"]]

In [227]:
# filter games dataframe
games_filtered = games[["gameId", "week", "gameDate", "gameTimeEastern", "homeFinalScore", "visitorFinalScore"]]

In [228]:
# filter players dataframe
players_filtered = players[["position", "nflId"]]

In [229]:
# filter tackles
tackles_filtered = tackles[["gameId", "playId", "tackle", "assist", "forcedFumble", "pff_missedTackle"]]
tackles_filtered = tackles_filtered.groupby(["gameId", "playId"]).sum().reset_index()

In [230]:
# merge plays and games
merged1 = plays_filtered.merge(games_filtered, on="gameId")

In [231]:
# merge tackles and merged1
data = merged1.merge(tackles_filtered, on=["gameId", "playId"], how="left")

In [232]:
data

Unnamed: 0,gameId,playId,run,direction,quarter,down,yardsToGo,possessionTeam,defensiveTeam,yardlineSide,...,expectedPointsAdded,week,gameDate,gameTimeEastern,homeFinalScore,visitorFinalScore,tackle,assist,forcedFumble,pff_missedTackle
0,2022100908,3537,False,1,4,1,10,ATL,TB,ATL,...,0.981955,5,10/09/2022,13:00:00,21,15,1.0,0.0,0.0,0.0
1,2022091103,3126,True,2,4,1,10,PIT,CIN,PIT,...,-0.263424,1,09/11/2022,13:00:00,20,23,1.0,0.0,0.0,1.0
2,2022091111,1148,False,1,2,2,5,LV,LAC,LV,...,1.133666,1,09/11/2022,16:25:00,24,19,1.0,0.0,0.0,0.0
3,2022100212,2007,True,0,3,2,10,DEN,LV,DEN,...,-0.043580,4,10/02/2022,16:25:00,32,23,0.0,2.0,0.0,2.0
4,2022091900,1372,True,2,2,1,10,BUF,TEN,TEN,...,-0.167903,2,09/19/2022,19:15:00,41,7,0.0,2.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12481,2022100204,123,True,2,1,1,10,DAL,WAS,WAS,...,-0.504018,4,10/02/2022,13:00:00,25,10,1.0,0.0,0.0,0.0
12482,2022091200,3467,False,2,4,1,10,SEA,DEN,SEA,...,-0.444642,1,09/12/2022,20:15:00,17,16,1.0,1.0,0.0,0.0
12483,2022101605,3371,True,0,4,1,10,CIN,NO,CIN,...,0.203819,6,10/16/2022,13:00:00,26,30,1.0,1.0,0.0,1.0
12484,2022100207,2777,True,1,3,1,10,IND,TEN,TEN,...,-0.976039,4,10/02/2022,13:00:00,17,24,1.0,1.0,0.0,0.0


In [233]:
data.columns

# check the possible values of offenseFormation
data["offenseFormation"].unique()

array(['SHOTGUN', 'I_FORM', 'SINGLEBACK', 'EMPTY', 'PISTOL', 'WILDCAT',
       'JUMBO', nan], dtype=object)

In [234]:
# one hot encode offenseFormation
teams = data.possessionTeam.unique()
data = pd.get_dummies(data, columns=["offenseFormation", "possessionTeam", "defensiveTeam"])

data.columns

Index(['gameId', 'playId', 'run', 'direction', 'quarter', 'down', 'yardsToGo',
       'yardlineSide', 'yardlineNumber', 'gameClock', 'preSnapHomeScore',
       'preSnapVisitorScore', 'absoluteYardlineNumber',
       'preSnapHomeTeamWinProbability', 'expectedPoints', 'defendersInTheBox',
       'homeTeamWinProbabilityAdded', 'expectedPointsAdded', 'week',
       'gameDate', 'gameTimeEastern', 'homeFinalScore', 'visitorFinalScore',
       'tackle', 'assist', 'forcedFumble', 'pff_missedTackle',
       'offenseFormation_EMPTY', 'offenseFormation_I_FORM',
       'offenseFormation_JUMBO', 'offenseFormation_PISTOL',
       'offenseFormation_SHOTGUN', 'offenseFormation_SINGLEBACK',
       'offenseFormation_WILDCAT', 'possessionTeam_ARI', 'possessionTeam_ATL',
       'possessionTeam_BAL', 'possessionTeam_BUF', 'possessionTeam_CAR',
       'possessionTeam_CHI', 'possessionTeam_CIN', 'possessionTeam_CLE',
       'possessionTeam_DAL', 'possessionTeam_DEN', 'possessionTeam_DET',
       'possessionT

Before starting logistic regression, let me write out what I predict will happen based on the parameters I have set.  

EPA will be a difficult variable to find a pattern in because, in theory, the mean EPA of every play should be zero.  In addition, assuming actions that are completely informed and perfect on each side, the decision to run or pass (and consequently, the pre-snap tells such as defenders in box and offensive formation) will also have zero impact on EPA as well.  However, it is possible that we find that some strategies are better than others, leading to a correlation with these two parameters.

Another thing to account for is when NFL Teams might choose to account for other things rather than EPA.  For example, the position resets at halftime.  Therefore, as teams are forced to rush to complete plays, we may find that EPA per play tails off right at the end of the second half, and at the end of the game as well.   In addition, I expect to find observable patterns at the end of games, where teams sacrifice EPA per play in order to achieve other objectives (running out the clock if ahead, or scoring quickly if behind).  

In [235]:
# remove all post-snap features other than EPA Added

data = data.drop(columns=["homeTeamWinProbabilityAdded", 'homeFinalScore', 'visitorFinalScore', 'tackle', 'assist',
       'forcedFumble', 'pff_missedTackle'])

#makes no sense to preform regression of these either
data = data.drop(columns=["gameId", "playId"])


# yardLineSide and yardLineNumber are redundant
data = data.drop(columns=["yardlineSide", "yardlineNumber"])

# gameTimeEastern is a string, convert to hours, and convert gameDate to days after 2022-09-01
data["gameTimeEastern"] = data["gameTimeEastern"].str.split(":").str[0].astype(int) + data["gameTimeEastern"].str.split(":").str[1].astype(int)/60
data["gameDate"] = (pd.to_datetime(data["gameDate"]) - pd.to_datetime("2022-09-01")).dt.days

# validate that gameDate is in int form
if not (data["gameDate"].dtype == 'int'):
    print("gameDate is not in int form")




# convert gameClock to seconds
data["gameClock"] = data["gameClock"].str.split(":").str[0].astype(int)*60 + data["gameClock"].str.split(":").str[1].astype(int)

data

Unnamed: 0,run,direction,quarter,down,yardsToGo,gameClock,preSnapHomeScore,preSnapVisitorScore,absoluteYardlineNumber,preSnapHomeTeamWinProbability,...,defensiveTeam_NO,defensiveTeam_NYG,defensiveTeam_NYJ,defensiveTeam_PHI,defensiveTeam_PIT,defensiveTeam_SEA,defensiveTeam_SF,defensiveTeam_TB,defensiveTeam_TEN,defensiveTeam_WAS
0,False,1,4,1,10,472,21,7,69,0.976785,...,False,False,False,False,False,False,False,True,False,False
1,True,2,4,1,10,458,14,20,76,0.160485,...,False,False,False,False,False,False,False,False,False,False
2,False,1,2,2,5,537,10,3,40,0.756661,...,False,False,False,False,False,False,False,False,False,False
3,True,0,3,2,10,792,19,16,47,0.620552,...,False,False,False,False,False,False,False,False,False,False
4,True,2,2,1,10,513,7,7,75,0.836290,...,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12481,True,2,1,1,10,811,0,0,49,0.638600,...,False,False,False,False,False,False,False,False,False,True
12482,False,2,4,1,10,368,17,16,40,0.615241,...,False,False,False,False,False,False,False,False,False,False
12483,True,0,4,1,10,575,26,21,69,0.667054,...,True,False,False,False,False,False,False,False,False,False
12484,True,1,3,1,10,122,17,24,44,0.410611,...,False,False,False,False,False,False,False,False,True,False


In [236]:
# convert one hot encoded columns to int
data = data.astype({"run": int, "offenseFormation_EMPTY": int, "offenseFormation_I_FORM": int,
                    "offenseFormation_JUMBO": int, "offenseFormation_PISTOL": int, "offenseFormation_SHOTGUN": int,
                    "offenseFormation_SINGLEBACK": int, "offenseFormation_WILDCAT": int})
# do the same for offense and defense teams
for team in teams:
    data = data.astype({"possessionTeam_" + team: int, "defensiveTeam_" + team: int})

# show what data has missing values
data[data.isna().any(axis=1)]

Unnamed: 0,run,direction,quarter,down,yardsToGo,gameClock,preSnapHomeScore,preSnapVisitorScore,absoluteYardlineNumber,preSnapHomeTeamWinProbability,...,defensiveTeam_NO,defensiveTeam_NYG,defensiveTeam_NYJ,defensiveTeam_PHI,defensiveTeam_PIT,defensiveTeam_SEA,defensiveTeam_SF,defensiveTeam_TB,defensiveTeam_TEN,defensiveTeam_WAS
1975,0,0,4,3,6,194,16,10,90,0.80269,...,0,0,0,0,0,0,0,0,0,0
5770,1,2,4,4,7,13,31,25,84,0.995626,...,0,0,0,0,0,0,0,0,0,0
8558,1,1,1,2,1,288,0,7,11,0.386114,...,0,0,0,0,0,0,0,0,0,0
9079,0,0,1,3,10,496,0,7,88,0.255894,...,0,0,0,0,0,0,0,0,0,0
11460,0,2,2,1,10,451,3,6,36,0.380061,...,0,0,0,0,0,0,0,1,0,0
12277,1,1,2,1,10,88,21,3,21,0.931742,...,0,0,0,0,0,0,0,0,0,0


In [237]:
data.dropna(inplace=True)
data

Unnamed: 0,run,direction,quarter,down,yardsToGo,gameClock,preSnapHomeScore,preSnapVisitorScore,absoluteYardlineNumber,preSnapHomeTeamWinProbability,...,defensiveTeam_NO,defensiveTeam_NYG,defensiveTeam_NYJ,defensiveTeam_PHI,defensiveTeam_PIT,defensiveTeam_SEA,defensiveTeam_SF,defensiveTeam_TB,defensiveTeam_TEN,defensiveTeam_WAS
0,0,1,4,1,10,472,21,7,69,0.976785,...,0,0,0,0,0,0,0,1,0,0
1,1,2,4,1,10,458,14,20,76,0.160485,...,0,0,0,0,0,0,0,0,0,0
2,0,1,2,2,5,537,10,3,40,0.756661,...,0,0,0,0,0,0,0,0,0,0
3,1,0,3,2,10,792,19,16,47,0.620552,...,0,0,0,0,0,0,0,0,0,0
4,1,2,2,1,10,513,7,7,75,0.836290,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12481,1,2,1,1,10,811,0,0,49,0.638600,...,0,0,0,0,0,0,0,0,0,1
12482,0,2,4,1,10,368,17,16,40,0.615241,...,0,0,0,0,0,0,0,0,0,0
12483,1,0,4,1,10,575,26,21,69,0.667054,...,1,0,0,0,0,0,0,0,0,0
12484,1,1,3,1,10,122,17,24,44,0.410611,...,0,0,0,0,0,0,0,0,1,0


In [238]:
#empty dictionary of team names
data_by_team_test = {x : pd.DataFrame() for x in teams}
data_by_team_train = {x : pd.DataFrame() for x in teams}
data_test = pd.DataFrame()
data_train = pd.DataFrame()
#fills dictionary with all plays according to possesion team
for team in teams:
    data_by_team_test[team] = data[data["possessionTeam_"+str(team)] == 1]
    #seperate into train and test sets seperated by team 
    rows = len(data_by_team_test[team])
    rand_idx = np.random.randint(0, rows,size = int(rows/10))
    #make seperate train and test sets for each team
    data_by_team_train[team] = data_by_team_test[team].drop(data_by_team_test[team].index[rand_idx])
    data_by_team_test[team] = data_by_team_test[team].iloc[rand_idx]
    #create a joined train and test set(for overall)
    data_test = pd.concat([data_test, data_by_team_test[team]], ignore_index = True)
    data_train = pd.concat([data_train, data_by_team_train[team]], ignore_index = True)

#Randomize final sets to mix teams
data_test = data_test.sample(frac=1)
data_train = data_train.sample(frac=1)
#split datasets into parameters and result
data_test_run = data_test["run"]
#data_test_epa = data_test["expectedPoints"]
data_test_dir = data_test["direction"]
data_test.drop(["run","direction"],axis =1, inplace=True)

data_train_run = data_train["run"]
#data_train_epa = data_train["expectedPoints"]
data_train_dir = data_train["direction"]
data_train.drop(["run","direction"],axis =1, inplace=True)

data_test

Unnamed: 0,quarter,down,yardsToGo,gameClock,preSnapHomeScore,preSnapVisitorScore,absoluteYardlineNumber,preSnapHomeTeamWinProbability,expectedPoints,defendersInTheBox,...,defensiveTeam_NO,defensiveTeam_NYG,defensiveTeam_NYJ,defensiveTeam_PHI,defensiveTeam_PIT,defensiveTeam_SEA,defensiveTeam_SF,defensiveTeam_TB,defensiveTeam_TEN,defensiveTeam_WAS
489,4,1,10,889,17,9,84,0.830117,1.518351,7.0,...,0,0,0,0,0,0,0,0,0,0
452,4,2,6,792,17,13,85,0.833956,3.759383,7.0,...,0,1,0,0,0,0,0,0,0,0
517,3,2,4,694,3,12,84,0.201078,1.124028,7.0,...,0,0,0,0,0,0,0,1,0,0
538,1,2,4,857,0,0,36,0.765245,1.014981,7.0,...,0,0,0,0,0,0,0,0,0,0
928,1,1,10,386,0,0,80,0.400584,1.587692,7.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
972,2,2,10,500,0,13,51,0.128884,1.844602,6.0,...,0,0,0,0,0,0,0,0,0,0
788,4,3,3,74,17,17,25,0.476622,-0.120459,5.0,...,0,0,0,0,0,0,0,0,0,0
1124,4,1,5,783,14,26,105,0.041665,5.813514,8.0,...,0,0,0,0,0,0,0,0,0,0
57,4,1,10,818,38,3,75,0.999161,1.966373,6.0,...,0,0,0,0,0,0,0,0,0,0


In [239]:
# show me the data points where an NaN is present
data_test[data_test.isna().any(axis=1)]

Unnamed: 0,quarter,down,yardsToGo,gameClock,preSnapHomeScore,preSnapVisitorScore,absoluteYardlineNumber,preSnapHomeTeamWinProbability,expectedPoints,defendersInTheBox,...,defensiveTeam_NO,defensiveTeam_NYG,defensiveTeam_NYJ,defensiveTeam_PHI,defensiveTeam_PIT,defensiveTeam_SEA,defensiveTeam_SF,defensiveTeam_TB,defensiveTeam_TEN,defensiveTeam_WAS


In [240]:
# fit model
model = selfDerivedLinearRegression()
model.fit(data_train, data_train_run)

# predict
y_pred = model.predict(data_test)

tester = np.mean((data_test_run - y_pred) ** 2)
tester

11316 85
[ 1.30116649e-02  7.98868858e-03  4.02050194e-02  2.33303464e+00
  5.69627077e-02  4.92161541e-02  3.07798692e-01  2.85728030e-03
  1.30793474e-02  3.46200071e-02 -2.49955155e-04  2.51519972e-02
  1.90705196e-01  7.71300960e-02  3.18133616e-05  6.07104984e-04
  8.13008130e-05  3.53481796e-04  1.83280311e-03  2.12530930e-03
  6.09756098e-05  1.55531990e-04  2.17391304e-04  1.89112761e-04
  1.21951220e-04  1.49346059e-04  2.09437964e-04  1.53764581e-04
  2.00600919e-04  1.67903853e-04  1.47578650e-04  1.56415695e-04
  1.58183104e-04  1.38741605e-04  1.51997172e-04  1.90880170e-04
  1.29020855e-04  1.30788264e-04  1.29020855e-04  1.27253446e-04
  1.42276423e-04  1.35206787e-04  1.79392011e-04  1.73206080e-04
  1.66136444e-04  1.60834217e-04  1.87345352e-04  1.27253446e-04
  1.48462354e-04  1.61717922e-04  1.36090491e-04  1.79392011e-04
  1.70554966e-04  1.64369035e-04  1.72322375e-04  1.36090491e-04
  1.34323082e-04  1.97066101e-04  1.83810534e-04  1.83810534e-04
  1.50229763e-04

  self.weights -= self.learning_rate * dw


[-inf -inf -inf  nan  nan  nan -inf -inf  nan -inf  nan -inf -inf -inf
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan] -inf 89
[nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan] -inf 90
[nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan na

nan

How to gain insights from the linear regression?

In [241]:
print("Weights: ", model.weights)
print("Bias: ", model.bias)

Weights:  [nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan]
Bias:  -inf


So, first of all, what did we learn from this?

First of all, the most conclusive set of data we learned is that running the ball is almost certainly a worse decision than passing the ball.  In fact, the conclusion is so shocking that I thought sacks might be counted as run plays (they are not). 

Secondly, we see that the most effective formations are Wildcat and Jumbo, with everyting else having a negative effect.  Both of these formations are atraditional formations traditionally used in short-yardage scenarios (i.e. times where getting 2 yards is similar in value in getting 10, such as a third and one where teams need one yard to gain three more plays, or scenarios where the offense is one yard away from scoring a touchdown).  There are two possible explanations for this.  Firstly, the Estimated Points metric underestimates the value of these situations, resulting in a high EPA for all short yardage situations, or secondly, that Jumbo and Wildcat are effectively employed in the situations in which they are called upon.  