In [258]:
import pandas as pd
import numpy as np
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression

In [259]:
def get_direction(playDescription):
        if("left" in playDescription):
                return 0
        if("right" in playDescription):
                return 2
        return 0

In [260]:
# load dataframes
games = pd.read_csv("data/games.csv")
plays = pd.read_csv("data/plays.csv")
tackles = pd.read_csv("data/tackles.csv")
players = pd.read_csv("data/players.csv")


In [261]:
# add target var (can also use play desc to count scrambles as runs)
plays["run"] = plays["passResult"].isna()

In [262]:
#makes column for if run(1 true, 0 false)
plays["run"] = plays["passResult"].isna()
#makes column for play direction
plays["direction"] = plays["playDescription"].apply(get_direction)
#filters play data
plays_filtered = plays[["gameId","quarter", "down", "yardsToGo", "possessionTeam","defensiveTeam","absoluteYardlineNumber", "gameClock", "preSnapHomeScore",
                        "preSnapVisitorScore", "offenseFormation", "defendersInTheBox","run","direction","expectedPoints"]]

In [263]:
#filters game data
games_filtered = games[["gameId", "week","gameTimeEastern"]]

In [264]:
# filter players dataframe
players_filtered = players[["position", "nflId"]]

In [265]:
# # filter tackles
# tackles_filtered = tackles[["gameId", "playId", "tackle", "assist", "forcedFumble", "pff_missedTackle"]]
# tackles_filtered = tackles_filtered.groupby(["gameId", "playId"]).sum().reset_index()

In [266]:
#combine game data with play data
data = plays_filtered.merge(games_filtered, on="gameId")

In [267]:
#make all game times purly numbers (probably not necessary - most models can handle datetime or can convert to int differently)
def str_to_time(time):
    splt = time.split(":")
    return int(splt[0]) * 60 + int(splt[1])
data["gameTimeEastern"] = data["gameTimeEastern"].apply(str_to_time)
data["gameClock"] = data["gameClock"].apply(str_to_time)
#drop game id
data.drop("gameId",axis =1, inplace=True)
#normalization]
for param in ['yardsToGo','absoluteYardlineNumber',	'gameClock', 'week','preSnapHomeScore',	'preSnapVisitorScore',	'defendersInTheBox','gameTimeEastern']:
    data[param] = (data[param] - data[param].mean())/data[param].std()  +.5
    



In [268]:
#find all unique teams
teams = data.possessionTeam.unique()
#1 hot encode teams and formations
data = pd.get_dummies(data, prefix=['possessionTeam', 'defensiveTeam', 'offenseFormation'])
#fill 1 nan in defeive players in the box
data.fillna(0,inplace=True)

In [269]:
#empty dictionary of team names
data_by_team_test = {x : pd.DataFrame() for x in teams}
data_by_team_train = {x : pd.DataFrame() for x in teams}
data_test = pd.DataFrame()
data_train = pd.DataFrame()
#fills dictionary with all plays according to possesion team
for team in teams:
    data_by_team_test[team] = data[data["possessionTeam_"+str(team)]]
    #seperate into train and test sets seperated by team 
    rows = len(data_by_team_test[team])
    rand_idx = np.random.randint(0, rows,size = int(rows/10))
    #make seperate train and test sets for each team
    data_by_team_train[team] = data_by_team_test[team].drop(data_by_team_test[team].index[rand_idx])
    data_by_team_test[team] = data_by_team_test[team].iloc[rand_idx]
    #create a joined train and test set(for overall)
    data_test = pd.concat([data_test, data_by_team_test[team]], ignore_index = True)
    data_train = pd.concat([data_train, data_by_team_train[team]], ignore_index = True)

#Randomize final sets to mix teams
data_test = data_test.sample(frac=1)
data_train = data_train.sample(frac=1)
#split datasets into parameters and result
data_test_run = data_test["run"]
data_test_epa = data_test["expectedPoints"]
data_test_dir = data_test["direction"]
data_test.drop(["run","expectedPoints","direction"],axis =1, inplace=True)

data_train_run = data_train["run"]
data_train_epa = data_train["expectedPoints"]
data_train_dir = data_train["direction"]
data_train.drop(["run","expectedPoints","direction"],axis =1, inplace=True)

data_test

Unnamed: 0,quarter,down,yardsToGo,absoluteYardlineNumber,gameClock,preSnapHomeScore,preSnapVisitorScore,defendersInTheBox,week,gameTimeEastern,...,defensiveTeam_TB,defensiveTeam_TEN,defensiveTeam_WAS,offenseFormation_EMPTY,offenseFormation_I_FORM,offenseFormation_JUMBO,offenseFormation_PISTOL,offenseFormation_SHOTGUN,offenseFormation_SINGLEBACK,offenseFormation_WILDCAT
37,1,2,4.454164,1.151158,-0.194631,0.058287,-0.578827,1.067556,0.942939,-0.233174,...,False,False,False,False,False,False,False,True,False,False
858,2,3,-1.147026,1.527455,-0.157244,-0.047622,-0.578827,1.067556,0.551646,-0.233174,...,False,False,False,False,False,False,False,True,False,False
467,3,1,0.889770,-0.939381,1.887814,0.693743,0.952515,0.082071,1.334232,0.908454,...,False,False,False,False,False,False,False,True,False,False
410,3,1,0.889770,1.025725,0.418513,0.376015,1.280660,1.067556,-0.230940,-0.233174,...,False,False,False,False,False,False,False,True,False,False
285,3,2,0.889770,-1.524732,1.745744,0.799652,0.952515,1.067556,-0.230940,-0.233174,...,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
866,2,2,-1.401626,0.649428,-0.598409,0.270106,2.046331,1.067556,0.160353,-0.233174,...,False,False,False,False,False,False,False,False,True,False
506,4,2,0.635171,1.485644,-0.452600,1.329198,0.405607,1.067556,0.942939,0.797076,...,False,False,False,False,False,False,False,False,True,False
28,2,1,0.889770,1.067536,1.760699,1.541017,-0.578827,0.082071,1.334232,-0.233174,...,False,False,False,False,False,False,True,False,False,False
1030,2,1,0.889770,-0.563084,1.682187,0.799652,0.186844,1.067556,0.942939,-0.233174,...,False,False,False,False,False,False,False,False,True,False


Covert all to NP arrays incase this makes Custum Regression Easier

In [270]:
data_test_run = np.ravel(np.array(data_test_run))
data_test_epa = np.ravel(np.array(data_test_epa))
data_test_dir = np.ravel(np.array(data_test_dir))
data_test_param = np.array(data_test)

data_train_run = np.ravel(np.array(data_train_run))
data_train_epa = np.ravel(np.array(data_train_epa))
data_train_dir = np.ravel(np.array(data_train_dir))
data_train_param = np.array(data_train)

Random Forest

In [271]:
# initial random forest classifier
clf = RandomForestClassifier()

In [272]:
clf.fit(data_train, data_train_run)

In [273]:
preds = clf.predict(data_test)

In [274]:
acc = (preds == data_test_run).sum() / len(preds)
acc * 100

74.0470397404704

Linear Regression

In [275]:
model = LinearRegression()
model.fit(data_train_param, data_train_run)
pass_pred = model.predict(data_test_param)
pass_pred = np.where(pass_pred>.5,1,0)
matching = np.sum(pass_pred == data_test_run)
#percentage accuarcy
matching/len(pass_pred) * 100

72.34387672343877