In [8]:
import os
import pickle
import warnings

import matplotlib.pyplot as plt
import nfl_data_py as nfl
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from voyagerOne import learn
# Silence FutureWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=FutureWarning)

# When True, the data-preparation cell rebuilds the encoded dataset and
# re-saves it to 'data/encoded_df.pkl'; when False that cell does nothing
# and the previously saved pickle is what gets loaded further down.
import_data = True



In [12]:
if import_data:

    # Load the 2022 play-by-play data through nfl_data_py.
    data = nfl.import_pbp_data([2022], downcast=True, cache=False, alt_path=None)

    # Keep only the game-situation columns used as features, plus the
    # target column ('play_type') and the team columns needed below.
    df = data[['home_team',
               'away_team',
               'week',
               'posteam',
               'defteam',
               'yardline_100',
               'half_seconds_remaining',
               'game_seconds_remaining',
               'down',
               'goal_to_go',
               'ydstogo',
               'posteam_score',
               'defteam_score',
               'play_type']]

    # Restrict to the four play types being modelled: run, pass, punt, FG.
    criteria = df['play_type'].isin(['run', 'pass', 'punt', 'field_goal'])
    # .copy() so the column assignment below works on an independent frame
    # rather than a view of `data`.
    df = df.loc[criteria].copy()

    # Flag whether the possessing team is the home team.
    df['is_pos_home'] = df['posteam'] == df['home_team']

    # The raw home/away team names are no longer needed once the flag exists.
    df = df.drop(columns=['home_team', 'away_team'])
    # Drop every row that still has a missing value in any remaining column.
    df = df.dropna()

    # One-hot encode the possessing and defending team identifiers.
    encoded_df = pd.get_dummies(df, columns=['posteam', 'defteam'])

    # Save; create the output directory first so a fresh checkout does not
    # fail with FileNotFoundError.
    os.makedirs('data', exist_ok=True)
    encoded_df.to_pickle('data/encoded_df.pkl')


2022 done.
Downcasting floats.


In [27]:
# Load the one-hot encoded play-by-play frame from 'data/encoded_df.pkl'
# (the file written by the data-preparation cell).
# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# this project produced itself.
encoded_df = pd.read_pickle('data/encoded_df.pkl')

In [49]:
# Split into features (X) and target (y), then hold out 30% of rows for testing.
X = encoded_df.drop(columns=['play_type'])
y = encoded_df['play_type']

# random_state pins the shuffle so the reported score and the saved model can
# be reproduced on a re-run; stratify=y keeps the play-type proportions the
# same in the train and test partitions (punts and field goals are much rarer
# than runs and passes).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [50]:
# Hyperparameter space sampled by the randomized search.
param_dist = {
    'n_estimators': np.arange(100, 1500, 100),
    'max_depth': [None] + list(np.arange(2, 20, 2)),
    'min_samples_split': np.arange(2, 18, 2),
    'min_samples_leaf': np.arange(1, 12, 2),
    'max_features': ['sqrt', None]
}

# Instantiate the base model. The seed makes each forest's bootstrap samples
# and feature sub-sampling repeatable, so the search result is reproducible.
rf = RandomForestClassifier(random_state=42)

# Perform a randomized search (not an exhaustive grid search): 50 sampled
# parameter combinations, each scored with 3-fold cross-validation using the
# classifier's default scorer (accuracy).
random_search = RandomizedSearchCV(
    rf,
    param_distributions=param_dist,
    n_iter=50,  # Number of random parameter combinations to try
    cv=3,  # Number of cross-validation folds
    n_jobs=-1,  # Run the 150 fits in parallel on all available cores
    random_state=42
)
random_search.fit(X_train, y_train)

# Get the best estimator (refit on all of X_train) and its hyperparameters.
best_rf = random_search.best_estimator_
best_params = random_search.best_params_

# Make predictions on the held-out test data using the best model.
y_pred = best_rf.predict(X_test)

# Calculate accuracy_score to evaluate the best model.
acc = accuracy_score(y_test, y_pred)
print(f"Best Model - Score: {acc}")
print("Best Hyperparameters:", best_params)

Best Model - Score: 0.6979588321419343
Best Hyperparameters: {'n_estimators': 1000, 'min_samples_split': 4, 'min_samples_leaf': 7, 'max_features': 'sqrt', 'max_depth': 16}


In [47]:
# Rank the model inputs by the importance the tuned forest assigned to each,
# highest first, and show the top 35.
features = X.columns
importance = best_rf.feature_importances_
imp_df = pd.DataFrame({'feature': features, 'imp': importance})
imp_df = imp_df.sort_values(by='imp', ascending=False)
imp_df.head(35)

Unnamed: 0,feature,imp
4,down,0.330018
1,yardline_100,0.175629
5,ydstogo,0.150739
2,half_seconds_remaining,0.09245
3,game_seconds_remaining,0.06515
7,defteam_score,0.06123
6,posteam_score,0.047717
0,week,0.031166
8,is_pos_home,0.006833
14,posteam_CHI,0.005321


In [24]:
# Make sure both output directories exist; open(..., 'wb') does not create
# missing parent directories and would raise FileNotFoundError otherwise.
os.makedirs('model', exist_ok=True)
os.makedirs('data', exist_ok=True)

# Save the tuned model.
with open('model/rf_v1.pkl', 'wb') as f:
    pickle.dump(best_rf, f)

# Save the training column names, in order, as a plain list alongside the model.
with open('data/feature_cols.pkl', 'wb') as f:
    pickle.dump(list(X_train.columns), f)
