In [64]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [65]:
# load dataframes
games = pd.read_csv("data/games.csv")
plays = pd.read_csv("data/plays.csv")
tackles = pd.read_csv("data/tackles.csv")
players = pd.read_csv("data/players.csv")


In [66]:
# add target var (can also use play desc to count scrambles as runs)
plays["run"] = plays["passResult"].isna()

In [67]:
# filter plays dataframe
plays_filtered=plays[["gameId", "playId", "run", "quarter", "down", "yardsToGo", "possessionTeam", "defensiveTeam",
                      "yardlineNumber", "gameClock", "preSnapHomeScore", "preSnapVisitorScore", "absoluteYardlineNumber",
                      "preSnapHomeTeamWinProbability", "expectedPoints", "offenseFormation", "defendersInTheBox",
                      "homeTeamWinProbabilityAdded", "expectedPointsAdded"]]

In [68]:
# filter games dataframe
games_filtered = games[["gameId", "week", "gameTimeEastern", "homeFinalScore", "visitorFinalScore"]]

In [69]:
# filter players dataframe
players_filtered = players[["position", "nflId"]]

In [70]:
# filter tackles
tackles_filtered = tackles[["gameId", "playId", "tackle", "assist", "forcedFumble", "pff_missedTackle"]]
tackles_filtered = tackles_filtered.groupby(["gameId", "playId"]).sum().reset_index()

In [71]:
# merge plays and games
merged1 = plays_filtered.merge(games_filtered, on="gameId")

In [72]:
# merge tackles and merged1
data = merged1.merge(tackles_filtered, on=["gameId", "playId"], how="left")

In [73]:
def str_to_time(time):
    splt = time.split(":")
    return int(splt[0]) * 60 + int(splt[1])
data["gameTimeEastern"] = data["gameTimeEastern"].apply(str_to_time)
data["gameClock"] = data["gameClock"].apply(str_to_time)

In [76]:
data = pd.get_dummies(data, prefix=['possessionTeam', 'defensiveTeam', 'offenseFormation'])

In [77]:
X = data.drop("run", axis=1)
y = data["run"]

In [78]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [79]:
X_train

Unnamed: 0,gameId,playId,quarter,down,yardsToGo,yardlineNumber,gameClock,preSnapHomeScore,preSnapVisitorScore,absoluteYardlineNumber,...,defensiveTeam_TB,defensiveTeam_TEN,defensiveTeam_WAS,offenseFormation_EMPTY,offenseFormation_I_FORM,offenseFormation_JUMBO,offenseFormation_PISTOL,offenseFormation_SHOTGUN,offenseFormation_SINGLEBACK,offenseFormation_WILDCAT
11187,2022110602,767,1,1,10,14,50,7,0,24,...,False,False,False,False,False,False,False,False,True,False
102,2022091103,1672,2,4,1,35,76,3,17,75,...,False,False,False,False,True,False,False,False,False,False
5268,2022103000,2649,3,1,1,1,351,10,7,11,...,False,False,False,False,False,False,False,True,False,False
11306,2022103008,954,1,3,2,29,83,7,7,39,...,False,False,False,False,False,False,False,True,False,False
11145,2022110602,2517,3,2,10,25,611,35,7,35,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5898,2022091801,4131,4,2,10,27,55,30,24,83,...,False,False,False,True,False,False,False,False,False,False
4639,2022092500,1046,2,1,10,25,849,10,0,35,...,False,False,False,False,False,False,True,False,False,False
11020,2022091803,2153,3,1,10,20,440,17,0,90,...,False,False,False,False,True,False,False,False,False,False
5658,2022100206,856,1,1,10,46,61,0,7,56,...,False,False,False,False,False,False,False,True,False,False


In [80]:
# initial random forest classifier
clf = RandomForestClassifier()

In [81]:
clf.fit(X_train, y_train)

In [83]:
preds = clf.predict(X_test)

In [86]:
acc = (preds == y_test).sum() / len(preds)
acc

0.790839205637412