In [890]:
import pandas as pd
import numpy as np
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression

In [891]:
# load dataframes
games = pd.read_csv("data/games.csv")
plays = pd.read_csv("data/plays.csv")
tackles = pd.read_csv("data/tackles.csv")
players = pd.read_csv("data/players.csv")


In [892]:
# add target var (can also use play desc to count scrambles as runs)
plays["run"] = plays["passResult"].isna()

In [893]:
#makes pass result binary(1 run, 0 pass)
plays["run"] = plays["passResult"].isna()
#filters play data
plays_filtered = plays[["gameId","quarter", "down", "yardsToGo", "possessionTeam","defensiveTeam","absoluteYardlineNumber", "gameClock", "preSnapHomeScore",
                        "preSnapVisitorScore", "run","offenseFormation", "defendersInTheBox","expectedPoints"]]

In [894]:
#filters game data
games_filtered = games[["gameId", "week","gameTimeEastern"]]

In [895]:
# filter players dataframe
players_filtered = players[["position", "nflId"]]

In [896]:
# filter tackles
tackles_filtered = tackles[["gameId", "playId", "tackle", "assist", "forcedFumble", "pff_missedTackle"]]
tackles_filtered = tackles_filtered.groupby(["gameId", "playId"]).sum().reset_index()

In [897]:
#combine game data with play data
data = plays_filtered.merge(games_filtered, on="gameId")

In [898]:
#make all game times purly numbers (probably not necessary - most models can handle datetime or can convert to int differently)
data['gameClock'] = data['gameClock'].replace(':', '', regex =True).astype(int)
data['gameTimeEastern'] = data['gameTimeEastern'].replace(':', '', regex =True).astype(int)
data.drop("gameId",axis =1, inplace=True)
#normalization]
for param in ['yardsToGo','absoluteYardlineNumber',	'gameClock', 'week','preSnapHomeScore',	'preSnapVisitorScore',	'defendersInTheBox','gameTimeEastern']:
    data[param] = (data[param] - data[param].mean())/data[param].std()  +.5

# def str_to_time(time):
#     splt = time.split(":")
#     return int(splt[0]) * 60 + int(splt[1])
# data["gameTimeEastern"] = data["gameTimeEastern"].apply(str_to_time)
# data["gameClock"] = data["gameClock"].apply(str_to_time)

In [899]:
#find all unique teams
teams = data.possessionTeam.unique()
#1 hot encode teams and formations
data = pd.get_dummies(data, prefix=['possessionTeam', 'defensiveTeam', 'offenseFormation'])
data.fillna(0,inplace=True)

In [900]:
#empty dictionary of team names
data_by_team_test = {x : pd.DataFrame() for x in teams}
data_by_team_train = {x : pd.DataFrame() for x in teams}
data_test = pd.DataFrame()
data_train = pd.DataFrame()
#fills dictionary with all plays according to possesion team
for team in teams:
    data_by_team_test[team] = data[data["possessionTeam_"+str(team)]]
    #seperate into train and test sets seperated by team 
    rows = len(data_by_team_test[team])
    rand_idx = np.random.randint(0, rows,size = int(rows/10))
    #make seperate train and test sets for each team
    data_by_team_train[team] = data_by_team_test[team].drop(data_by_team_test[team].index[rand_idx])
    data_by_team_test[team] = data_by_team_test[team].iloc[rand_idx]
    #create a joined train and test set(for overall)
    data_test = pd.concat([data_test, data_by_team_test[team]], ignore_index = True)
    data_train = pd.concat([data_train, data_by_team_train[team]], ignore_index = True)

#Randomize final sets to mix teams
data_test = data_test.sample(frac=1)
data_train = data_train.sample(frac=1)
#split datasets into parameters and result
data_test_run = data_test["run"]
data_test_epa = data_test["expectedPoints"]
data_test.drop(["run","expectedPoints"],axis =1, inplace=True)
data_train_run = data_train["run"]
data_train_epa = data_train["expectedPoints"]
data_train.drop(["run","expectedPoints"],axis =1, inplace=True)
data_test

Unnamed: 0,quarter,down,yardsToGo,absoluteYardlineNumber,gameClock,preSnapHomeScore,preSnapVisitorScore,defendersInTheBox,week,gameTimeEastern,possessionTeam_ARI,possessionTeam_ATL,possessionTeam_BAL,possessionTeam_BUF,possessionTeam_CAR,possessionTeam_CHI,possessionTeam_CIN,possessionTeam_CLE,possessionTeam_DAL,possessionTeam_DEN,possessionTeam_DET,possessionTeam_GB,possessionTeam_HOU,possessionTeam_IND,possessionTeam_JAX,possessionTeam_KC,possessionTeam_LA,possessionTeam_LAC,possessionTeam_LV,possessionTeam_MIA,possessionTeam_MIN,possessionTeam_NE,possessionTeam_NO,possessionTeam_NYG,possessionTeam_NYJ,possessionTeam_PHI,possessionTeam_PIT,possessionTeam_SEA,possessionTeam_SF,possessionTeam_TB,possessionTeam_TEN,possessionTeam_WAS,defensiveTeam_ARI,defensiveTeam_ATL,defensiveTeam_BAL,defensiveTeam_BUF,defensiveTeam_CAR,defensiveTeam_CHI,defensiveTeam_CIN,defensiveTeam_CLE,defensiveTeam_DAL,defensiveTeam_DEN,defensiveTeam_DET,defensiveTeam_GB,defensiveTeam_HOU,defensiveTeam_IND,defensiveTeam_JAX,defensiveTeam_KC,defensiveTeam_LA,defensiveTeam_LAC,defensiveTeam_LV,defensiveTeam_MIA,defensiveTeam_MIN,defensiveTeam_NE,defensiveTeam_NO,defensiveTeam_NYG,defensiveTeam_NYJ,defensiveTeam_PHI,defensiveTeam_PIT,defensiveTeam_SEA,defensiveTeam_SF,defensiveTeam_TB,defensiveTeam_TEN,defensiveTeam_WAS,offenseFormation_EMPTY,offenseFormation_I_FORM,offenseFormation_JUMBO,offenseFormation_PISTOL,offenseFormation_SHOTGUN,offenseFormation_SINGLEBACK,offenseFormation_WILDCAT
931,4,3,1.398969,0.231321,-0.623105,1.752836,1.608805,0.082071,1.334232,-0.224512,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False
511,2,3,-0.128628,1.569265,0.053780,0.376015,0.514989,-0.903415,0.551646,-0.224512,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False
1018,2,1,0.889770,-0.563084,-0.793447,1.117380,0.843134,-0.903415,0.551646,-0.224512,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False
572,4,3,-1.401626,2.530913,-0.598450,1.541017,1.827568,1.067556,0.551646,-0.224512,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False
631,2,1,0.889770,0.231321,2.066505,-0.683078,-0.250682,1.067556,-1.013527,-0.224512,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
710,3,1,0.889770,0.900293,1.701166,1.541017,-0.250682,0.082071,-0.622234,0.809435,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False
128,1,1,0.889770,-0.563084,1.851336,-0.683078,-0.578827,0.082071,1.725526,2.199330,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False
342,2,2,0.889770,1.569265,1.705649,-0.683078,-0.250682,0.082071,0.942939,0.809435,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
468,1,2,-0.383228,1.318401,2.097884,-0.683078,-0.578827,0.082071,-0.622234,2.199330,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False


In [901]:
data_test_run = np.ravel(np.array(data_test_run))
data_test_epa = np.ravel(np.array(data_test_epa))
data_test_param = np.array(data_test)
data_train_run = np.ravel(np.array(data_train_run))
data_train_epa = np.ravel(np.array(data_train_epa))
data_train_param = np.array(data_train)

In [902]:
# initial random forest classifier
clf = RandomForestClassifier()

In [903]:
clf.fit(data_train, data_train_run)

In [904]:
preds = clf.predict(data_test)

In [905]:
acc = (preds == data_test_run).sum() / len(preds)
acc * 100

75.18248175182481

In [906]:
model = LinearRegression()
model.fit(data_train_param, data_train_run)
pass_pred = model.predict(data_test_param)
pass_pred = np.where(pass_pred>.5,1,0)
matching = np.sum(pass_pred == data_test_run)
#percentage accuarcy
matching/len(pass_pred) * 100

74.93917274939173