In [1]:
import os
import sys
from os.path import join
import json

from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
from IPython.display import HTML
import nfl_data_py as nfl

ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.insert(0, os.path.join(ROOT_DIR,'py'))

import util
from plot.plot_simple import plot_play_with_speed

pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)

with open("paths.json", 'r') as f:
    paths = json.load(f)

PROCESSED_DATA_PATH = paths['processed_data']

# Classify Underlying Run Concept of "Trick" plays
The motion player often receives the handoff on trick plays, but there are underlying run concepts on many of these plays. We want to classify these run concepts because whether or not the motion player receives the ball is a feature in the downstream model.

In [2]:
df_run_concept = pd.DataFrame()
for wk in range(1,10):
    df_tmp = pd.read_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'run_concept.pkl'))
    df_run_concept = pd.concat([df_run_concept, df_tmp]).reset_index(drop=True)
del df_tmp
print(df_run_concept.shape)
df_run_concept.head()

(5149, 17)


Unnamed: 0,game_play_id,rb_dir_post_snap,avg_oline_angle_1s_after_snap,var_oline_angle_1s_after_snap,avg_oline_angle_1s_after_snap_4_rightmost_oline,var_oline_angle_1s_after_snap_4_rightmost_oline,avg_oline_dx_1s_after_snap,dx_oline_1s_after_snap,n_pullers_left_of_center,right_gaurd_pulls,n_puller_behind_los_3s_after_snap,shotgun,singleback,i_form,pistol,jumbo,run_concept
0,2022091105_3712,52.610909,148.560227,24510.990381,185.127631,23951.924703,1.296,1.296,0.0,0.0,0.0,0,1,0,0,0,OUTSIDE ZONE
1,2022091104_1094,14.758182,15.144709,110.514458,17.400072,112.71308,1.97,1.97,1.0,0.0,1.0,0,1,0,0,0,OUTSIDE ZONE
2,2022091101_3923,88.350909,178.522905,1124.141592,189.261815,828.107547,-1.178,-1.178,1.0,0.0,1.0,0,1,0,0,0,INSIDE ZONE
3,2022091106_1380,75.493636,27.870588,98.452026,25.141191,85.798372,1.676,1.676,0.0,0.0,0.0,0,0,1,0,0,INSIDE ZONE
4,2022091100_501,324.989091,155.162638,26256.025791,190.553162,26558.086443,1.896,1.896,0.0,0.0,0.0,0,1,0,0,0,OUTSIDE ZONE


In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore")

# Separate TRICK plays from other run concepts
df_trick_plays = df_run_concept.query('run_concept == "TRICK"')
df = df_run_concept.query('run_concept != "TRICK"')

# Prepare the dataset for training
X = df.drop(columns=['game_play_id', 'run_concept'])
y = df['run_concept']

# Encode target labels as integers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Define models to train
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
    'KNeighbors': KNeighborsClassifier(),
    'SVC': SVC(probability=True, random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
}

# Perform k-fold cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

results = {}
trained_models = {}

for model_name, model in tqdm(models.items()):
    fold_accuracies = []
    for train_idx, val_idx in skf.split(X, y_encoded):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y_encoded[train_idx], y_encoded[val_idx]
        
        # Train the model
        model.fit(X_train, y_train)
        
        # Predict probabilities and select the class with max probability
        y_proba = model.predict_proba(X_val)
        y_pred = y_proba.argmax(axis=1)
        
        # Calculate accuracy
        accuracy = accuracy_score(y_val, y_pred)
        fold_accuracies.append(accuracy)
    
    # Store mean accuracy across folds
    results[model_name] = np.mean(fold_accuracies)
    trained_models[model_name] = model

# Find the best model
best_model_name = max(results, key=results.get)
best_model = trained_models[best_model_name]
print(f"Best Model: {best_model_name} with Accuracy: {results[best_model_name]:.2f}")

# Prepare TRICK plays for prediction
X_trick = df_trick_plays.drop(columns=['game_play_id', 'run_concept'])

# Make predictions on TRICK plays
trick_proba = best_model.predict_proba(X_trick)
trick_predictions = trick_proba.argmax(axis=1)

# Map predicted labels back to run concepts
trick_predictions_labels = label_encoder.inverse_transform(trick_predictions)

# Add predictions to the TRICK plays DataFrame
df_trick_plays['predicted_run_concept'] = trick_predictions_labels

for i, class_name in enumerate(label_encoder.classes_):
    df_trick_plays[f'proba_{class_name}'] = trick_proba[:, i]

# Display the predictions
cols = ['game_play_id', 'predicted_run_concept']
proba_cols = [col for col in df_trick_plays.columns if col.startswith('proba_')]
df_trick_plays = df_trick_plays[cols + proba_cols]

# Set proba cols to 2 decimal places
for col in proba_cols:
    df_trick_plays[col] = df_trick_plays[col].round(2)

100%|██████████| 6/6 [00:48<00:00,  8.11s/it]

Best Model: XGBoost with Accuracy: 0.80





In [6]:
results

{'RandomForest': 0.7838967932029626,
 'LogisticRegression': 0.4332235474561562,
 'GradientBoosting': 0.7899184678412141,
 'KNeighbors': 0.5583465520372949,
 'SVC': 0.48363161187463427,
 'XGBoost': 0.7959429678513048}

In [7]:
df_trick_plays.head()

Unnamed: 0,game_play_id,predicted_run_concept,proba_COUNTER,proba_DRAW,proba_INSIDE ZONE,proba_MAN,proba_OUTSIDE ZONE,proba_POWER,proba_PULL LEAD,proba_TRAP
19,2022091113_3722,PULL LEAD,0.0,0.0,0.02,0.0,0.31,0.0,0.65,0.01
31,2022091113_1389,OUTSIDE ZONE,0.0,0.0,0.0,0.0,0.93,0.0,0.07,0.0
50,2022091113_110,POWER,0.16,0.0,0.0,0.01,0.0,0.78,0.01,0.04
51,2022091109_767,INSIDE ZONE,0.0,0.0,0.54,0.01,0.44,0.0,0.01,0.0
77,2022091109_863,PULL LEAD,0.19,0.16,0.03,0.02,0.04,0.18,0.39,0.0


In [11]:
run_concepts = (
    pd.concat([
        df_run_concept.query('run_concept != "TRICK"')[['game_play_id', 'run_concept']],
        df_trick_plays[['game_play_id', 'predicted_run_concept']].rename(columns={'predicted_run_concept': 'run_concept'})
    ])
    .reset_index(drop=True)
)

In [18]:
df_trick_plays.predicted_run_concept.value_counts(normalize=True).astype(float).round(2)

OUTSIDE ZONE    0.34
MAN             0.18
POWER           0.16
PULL LEAD       0.12
INSIDE ZONE     0.12
COUNTER         0.05
DRAW            0.03
TRAP            0.01
Name: predicted_run_concept, dtype: float64

In [17]:
df_run_concept.query('run_concept != "TRICK"')[['game_play_id', 'run_concept']].run_concept.value_counts(normalize=True).astype(float).round(2)

OUTSIDE ZONE    0.30
INSIDE ZONE     0.24
MAN             0.17
POWER           0.09
PULL LEAD       0.08
COUNTER         0.06
DRAW            0.03
TRAP            0.02
Name: run_concept, dtype: float64

In [19]:
run_concepts.to_pickle(join(PROCESSED_DATA_PATH, 'run_concepts.pkl'))