In [8]:
import pandas as pd
import joblib
import numpy as np
import nfl_data_py as nfl
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import pickle
from voyagerOne import learn
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Toggle for the data-preparation cell: when truthy, that cell reloads the 2022
# play-by-play data, rebuilds the encoded feature table and rewrites
# 'data/encoded_df.pkl'. When falsy, that cell does nothing.
import_data = True



In [12]:
# Rebuild the modelling table from the 2022 play-by-play data and cache it to
# 'data/encoded_df.pkl'. The whole cell is skipped when `import_data` is falsy.
if import_data:

    # Load the 2022 season play-by-play data (library-level caching disabled).
    data = nfl.import_pbp_data([2022], downcast=True, cache=False, alt_path=None)

    # Keep only the columns used for modelling; `play_type` is the target.
    df = data[['home_team',
               'away_team',
               'week',
               'posteam',
               'defteam',
               'yardline_100',
               'half_seconds_remaining',
               'game_seconds_remaining',
               'down',
               'goal_to_go',
               'ydstogo',
               'posteam_score',
               'defteam_score',
               'play_type']]

    # Keep only the four play types being predicted: run, pass, punt, field goal.
    criteria = df['play_type'].isin(['run', 'pass', 'punt', 'field_goal'])
    df = df.loc[criteria].copy()

    # Flag whether the possessing team is the home team.
    df['is_pos_home'] = df['posteam'] == df['home_team']

    # The raw team-identity columns are no longer needed once the flag exists;
    # afterwards drop every row that still has a missing value.
    df = df.drop(columns=['home_team', 'away_team'])
    df = df.dropna()

    # One-hot encode the possessing and defending teams.
    encoded_df = pd.get_dummies(df, columns=['posteam', 'defteam'])

    # Cache the encoded table so later runs can skip this cell.
    encoded_df.to_pickle('data/encoded_df.pkl')


2022 done.
Downcasting floats.


In [27]:
# Load the encoded feature table from 'data/encoded_df.pkl' (the file the
# data-preparation cell writes with `to_pickle`).
# NOTE(review): read_pickle can execute arbitrary code while unpickling; only
# load files produced by this notebook.
encoded_df = pd.read_pickle('data/encoded_df.pkl')

In [13]:
# Predictors: every column except the target.
X = encoded_df.drop(columns=['play_type'])

# Target: the play type to predict.
y = encoded_df['play_type']

# Hold out 30% of the plays for evaluation.
# - random_state fixes the shuffle so the split (and every metric computed from
#   it) is reproducible across kernel restarts.
# - stratify=y keeps the class proportions the same in both partitions, so the
#   less frequent play types are represented in train and test alike.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [16]:
# Search space sampled by the randomized hyperparameter search.
param_dist = {
    'n_estimators': np.arange(100, 1500, 100),  # Number of trees in the forest
    'max_depth': [None] + list(np.arange(2, 20, 2)),  # Maximum depth of the trees
    'min_samples_split': np.arange(2, 18, 2),  # Minimum samples required to split an internal node
    'min_samples_leaf': np.arange(1, 12, 2),  # Minimum samples required to be at a leaf node
    'max_features': ['sqrt', None]  # Number of features to consider for the best split
}

# Seed the forest itself: without this the bootstrap and feature sub-sampling
# differ on every fit, so the search is not repeatable even though the
# candidate sampling below is seeded.
rf = RandomForestClassifier(random_state=42)

random_search = RandomizedSearchCV(
    rf,
    param_distributions=param_dist,
    n_iter=200,  # Number of random parameter combinations to try
    cv=3,  # Number of cross-validation folds
    scoring='accuracy',  # Classification accuracy (what the classifier's default scorer computes)
    n_jobs=-1,  # Fit candidates in parallel on all available cores
    random_state=42
)

random_search.fit(X_train, y_train)

# Get the best estimator and its hyperparameters
best_rf = random_search.best_estimator_
best_params = random_search.best_params_

# Make predictions on the held-out test data using the best model
y_pred = best_rf.predict(X_test)

# Accuracy of the best model on the held-out test data
acc = accuracy_score(y_test, y_pred)
print(f"Best Model - Score: {acc}")
print("Best Hyperparameters:", best_params)

In [24]:
# Pair each predictor column with the fitted forest's importance score and
# rank them from most to least important.
importance = best_rf.feature_importances_
features = X.columns
imp_df = (
    pd.DataFrame({'feature': features, 'imp': importance})
    .sort_values(by='imp', ascending=False)
)

# Show the 25 highest-ranked features.
imp_df.head(n=25)

Unnamed: 0,feature,imp
7,ydstogo,0.198318
5,down,0.11888
3,half_seconds_remaining,0.102063
1,yardline_100,0.083486
4,game_seconds_remaining,0.07836
11,defteam_score,0.076556
10,posteam_score,0.066374
2,quarter_seconds_remaining,0.051873
0,week,0.042205
9,away_timeouts_remaining,0.009828


In [15]:
# Persist the tuned random forest to disk.
joblib.dump(value=best_rf, filename='model/rf_v1.joblib')

# Persist the training column names (in order) next to the model.
with open('data/feature_cols.pkl', mode='wb') as cols_file:
    pickle.dump(obj=list(X_train.columns), file=cols_file)


In [9]:
# Reload the 2022 play-by-play data for exploration.
# NOTE: this rebinds `data` and `df`. Unlike the data-preparation cell, `df`
# here is the plain column subset: no play-type filter, no dropna, no encoding.
data = nfl.import_pbp_data([2022], downcast=True, cache=False, alt_path=None)

df = data[['home_team',
           'away_team',
           'week',
           'posteam',
           'defteam',
           'yardline_100',
           'half_seconds_remaining',
           'game_seconds_remaining',
           'down',
           'goal_to_go',
           'ydstogo',
           'posteam_score',
           'defteam_score',
           'play_type']]



2022 done.
Downcasting floats.


In [10]:
# Count how many rows of `df` fall under each `play_type` value.
df['play_type'].value_counts()

play_type
pass           20393
run            15037
no_play         4563
kickoff         2835
punt            2294
extra_point     1253
field_goal      1105
qb_kneel         442
qb_spike          63
Name: count, dtype: int64