## Random Forest Model

### Library Imports and Data Loading

In [1]:
#Sources: (for Random Forest implementation)
#https://towardsdatascience.com/random-forest-in-python-24d0893d51c0
#https://medium.com/analytics-vidhya/evaluating-a-random-forest-model-9d165595ad56
# (for hyperparameter tuning) https://www.analyticsvidhya.com/blog/2021/06/understanding-random-forest/
import pandas as pd, numpy as np
import matplotlib.pyplot as plt, seaborn as sb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, confusion_matrix
%matplotlib inline

# Load the cleaned NFL data set and discard every row that contains a null value.
df = pd.read_csv("NFL_data_super_cleaned.csv").dropna()
display(df)

# Sanity check after the drop: "any nulls left?" and the total null count.
null_mask = df.isnull()
print(null_mask.values.any(), null_mask.sum().sum())

Unnamed: 0,posteam,yardline_100,quarter_seconds_remaining,qtr,down,goal_to_go,ydstogo,play_type,score_margin
0,PIT,58.0,893.0,1,1.0,0.0,10,pass,0.0
1,PIT,53.0,856.0,1,2.0,0.0,5,run,0.0
2,PIT,56.0,815.0,1,3.0,0.0,8,pass,0.0
3,PIT,56.0,807.0,1,4.0,0.0,8,kick,0.0
4,TEN,98.0,796.0,1,1.0,0.0,10,run,0.0
...,...,...,...,...,...,...,...,...,...
353055,CAR,71.0,82.0,4,2.0,0.0,1,pass,-5.0
353056,CAR,71.0,77.0,4,3.0,0.0,1,pass,-5.0
353057,CAR,66.0,63.0,4,2.0,0.0,10,pass,-5.0
353058,CAR,66.0,58.0,4,3.0,0.0,10,pass,-5.0


False 0


### Data Preprocessing

In [5]:
# Random forest is very memory intensive, so work on a reproducible random
# subset of 20,000 rows instead of the full frame.
sample = df.sample(n=20000, random_state=21, axis=0)

# Indicator columns that get_dummies creates for the target column, play_type.
cat_y = ["play_type_kick", "play_type_pass", "play_type_run"]

# One-hot encode the categorical variables (posteam and play_type).
sample = pd.get_dummies(sample)

# The target is kept as a 2-D indicator array, one column per play type.
# NOTE(review): scikit-learn treats a 2-D indicator target as a multilabel
# problem -- confirm this is intended rather than a single class-label column.
labels = sample[cat_y].to_numpy()

# Every remaining column is a model feature.
sample = sample.drop(columns=cat_y)
feature_list = sample.columns.tolist()
features = np.array(sample)
display(sample)

# Split data into train and test sets, 80/20, seed = 21.
X_train, X_test, y_train, y_test = train_test_split(
    sample,
    labels,
    test_size=0.2,
    random_state=21,
)

Unnamed: 0,yardline_100,quarter_seconds_remaining,qtr,down,goal_to_go,ydstogo,score_margin,posteam_ARI,posteam_ATL,posteam_BAL,...,posteam_NO,posteam_NYG,posteam_NYJ,posteam_PHI,posteam_PIT,posteam_SEA,posteam_SF,posteam_TB,posteam_TEN,posteam_WAS
222356,61.0,119.0,1,3.0,0.0,22,0.0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
61840,78.0,702.0,1,4.0,0.0,4,-7.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2684,61.0,585.0,4,2.0,0.0,4,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
225580,73.0,483.0,3,3.0,0.0,3,-3.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
46520,70.0,201.0,1,1.0,0.0,10,7.0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233098,60.0,450.0,4,2.0,0.0,8,-32.0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
157573,80.0,900.0,1,1.0,0.0,10,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
262079,58.0,298.0,3,2.0,0.0,4,-14.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
93297,83.0,469.0,2,3.0,0.0,12,-3.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Model Training and Evaluation

In [6]:
%%time
# Hyperparameter tuning: exhaustive grid search over the number of trees,
# the maximum tree depth, and the minimum number of samples per leaf.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
params = dict(
    n_estimators=[10, 25, 30, 50, 100, 200],
    max_depth=[2, 3, 5, 10, 20],
    min_samples_leaf=[5, 10, 20, 50, 100, 200],
)

# 6 * 5 * 6 = 180 parameter combinations, each scored by accuracy with
# 4-fold cross-validation.
# NOTE(review): both the forest and the search request n_jobs=-1; check
# whether this nested parallelism oversubscribes the CPU on your machine.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=params,
    scoring="accuracy",
    cv=4,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)
print("Best accuracy:", grid_search.best_score_)

# Keep the best estimator found by the search; printing it shows the chosen
# hyperparameters in its repr.
rf_best = grid_search.best_estimator_
print("Best parameters:", rf_best)


Fitting 4 folds for each of 180 candidates, totalling 720 fits
Best accuracy: 0.69
Best parameters: RandomForestClassifier(max_depth=20, min_samples_leaf=5, n_estimators=200,
                       n_jobs=-1, random_state=42)
Wall time: 13min 20s
