In [114]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [115]:

#import dataset

df_raw = pd.read_csv('data/processed/df_clean.csv')

cols_to_use = [
      'gameId', 'playId', 'frameId', 'nflId', 'displayName',
      'position', 'club', 'possessionTeam', 'defensiveTeam',
      'preSnapHomeScore', 'preSnapVisitorScore', 'quarter',
      'gameClock', 'down', 'yardsToGo', 'yardlineNumber',
      'yardlineSide', 'offenseFormation', 'receiverAlignment',
      'preSnapHomeTeamWinProbability', 'preSnapVisitorTeamWinProbability',
      'o_clean', 'a_clean', 's_clean', 'x_clean', 'y_clean', 'dir_clean',
      'playDescription', 'passLocationType', 'rushLocationType', 
      'pff_runConceptPrimary', 'yardsGained', 'wasInitialPassRusher', 'event'
      ]

df = df_raw[cols_to_use]



In [116]:


#we need to group by game_id, play_id, and player_id, then find the x coordinate for the row where event = "line_set" and the row where event = "ball_snapped"

#then we can subtract the x coordinate for the "line_set" row from the x coordinate for the "ball_snapped" row

#we can then add this value to the dataframe as a new column
#actually since we're removing all frames between line_set and ball snapped we can just take the difference in the x coordinates between the two rows
#We need to take into consideration that since the plays are all going from left to right, 
# the defense is always facing the other way, so when we take the difference in x coords, 
# we need to * by -1 to get a positive value if the defender is creeping forward which is more intuitive

# Group by 'gameId', 'playId', and 'nflId' to calculate the difference in x coordinates for each player
# Using .agg() instead of .apply() to avoid warnings
crept_up_df = df.groupby(['gameId', 'playId', 'nflId'])['x_clean'].agg(lambda x: x.diff().iloc[-1]).reset_index()

# Multiply the resulting differences by -1
crept_up_df['creptDist'] = crept_up_df['x_clean'] * -1

# Now filter rows where the event is 'line_set' (we assume 'event' column exists)
df = df[df['event'] == 'line_set'].copy()  # Make a copy to avoid SettingWithCopyWarning

# Merge the 'creptDist' values into the filtered dataframe based on ['gameId', 'playId', 'nflId']
df = df.merge(crept_up_df[['gameId', 'playId', 'nflId', 'creptDist']], on=['gameId', 'playId', 'nflId'], how='left')

#confirmed with animation that the creptDist is correct

In [117]:
#now calculate horizontal distance to the ball for each player
# we get the y coord. of the ball by using the y coord. of the center.

#so we want to group by game_id, play_id, get the y coord of the player with "position" = "C" 
#then loop through each player on the defense and calculate the horizontal distance to the ball by subtracting their y coord. with the y coord. of the center
#then we'll add this value to the dataframe as a new column

# Group by 'gameId' and 'playId', then find the y coord of the center ('C') player
def get_center_y(group):
    center = group[group['position'] == 'C']
    if not center.empty:
        return center['y_clean'].iloc[0]
    return None  # In case there's no center player

# Apply this function to each group to get the y_clean for the center player
center_y_dict = df.groupby(['gameId', 'playId']).apply(get_center_y).to_dict()

# Map the center_y value to the dataframe based on 'gameId' and 'playId'
df['center_y'] = df.set_index(['gameId', 'playId']).index.map(center_y_dict)

# Now, calculate the horizontal distance to the ball for each player
# Check if 'center_y' is assigned properly, otherwise hDist will be NaN
df['hDist'] = df['y_clean'] - df['center_y']

#confirmed 


  center_y_dict = df.groupby(['gameId', 'playId']).apply(get_center_y).to_dict()


In [118]:

# now we need to calculate distance from teh line of scrimmage, which is the difference between the x coordinate of the player and the line of scrimmage
#lets do the same thing as before, using the center's x coordinate as the line of scrimmage

# Group by 'gameId' and 'playId', then find the x coord of the center ('C') player
def get_center_x(group):
    center = group[group['position'] == 'C']
    if not center.empty:
        return center['x_clean'].iloc[0]
    return None  # In case there's no center player

# Apply this function to each group to get the x_clean for the center player
center_x_dict = df.groupby(['gameId', 'playId']).apply(get_center_x).to_dict()

# Map the center_x value to the dataframe based on 'gameId' and 'playId'
df['center_x'] = df.set_index(['gameId', 'playId']).index.map(center_x_dict)

# Now, calculate the distance from the line of scrimmage for each player

df['losDist'] = df['x_clean'] - df['center_x']

#confirmed with animation that these values are correct


  center_x_dict = df.groupby(['gameId', 'playId']).apply(get_center_x).to_dict()


In [119]:
#ok now lets get only our features of interest for predicting wasInitialPassRusher
features = ['wasInitialPassRusher','position','creptDist', 'hDist', 'losDist', 'o_clean']

df_model = df[features]

#drop rows with NaN values in watInitialPassRusher (offense players)
df_model.dropna(subset=['wasInitialPassRusher'], inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_model.dropna(subset=['wasInitialPassRusher'], inplace=True)


In [120]:
import joblib
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import roc_auc_score, accuracy_score
from xgboost import XGBClassifier
import numpy as np

# Get the unique positions
positions = df_model['position'].unique()

# Get only positions that have more than 1 unique value in the target
positions = [position for position in positions if df_model[df_model['position'] == position]['wasInitialPassRusher'].nunique() > 1]

# Loop over each position
for position in positions:
    # Filter the dataset for the current position
    df_position = df_model[df_model['position'] == position].copy()  # Use a copy to avoid modifying df_model

    # Drop the 'position' column (it’s not needed for modeling)
    df_position.drop('position', axis=1, inplace=True)

    # Define the features (X) and target (y)
    X = df_position.drop('wasInitialPassRusher', axis=1)
    y = df_position['wasInitialPassRusher']

    # Split into training and testing sets (hold out a test set for final evaluation)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize the model
    model = XGBClassifier()

    # Set up the cross-validation strategy
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)  # 5-fold cross-validation

    # Check if both classes (0 and 1) are present in the training data before calculating AUC
    print(f'Position: {position}')
    
    # Check if both classes are present in the training set
    if y_train.nunique() > 1:
        try:
            auc_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='roc_auc')
            print(f'Average AUC (CV): {auc_scores.mean():.4f} ± {auc_scores.std():.4f}')
        except ValueError as e:
            print(f"Error calculating AUC for {position}: {e}")
            auc_scores = np.nan
            print(f'Average AUC: {auc_scores} (Error: Only one class present in some folds)')
    else:
        auc_scores = np.nan
        print(f'Average AUC: {auc_scores} (Only one class present in target)')
    
    # Get cross-validated predictions using cross_val_predict
    y_pred_train = cross_val_predict(model, X_train, y_train, cv=cv)

    # Calculate Accuracy on the training data across folds
    accuracy_train = accuracy_score(y_train, y_pred_train)
    print(f'Training Accuracy (CV): {accuracy_train * 100.0:.2f}%')

    # Train the model on the entire training set (after cross-validation)
    model.fit(X_train, y_train)

    # Save the trained model using joblib
    model_filename = f'models/model_{position}.joblib'  # Save with the position name
    joblib.dump(model, model_filename)
    print(f'Model for position {position} saved as {model_filename}')

    # Evaluate the final model on the test set
    y_pred_test = model.predict(X_test)
    y_pred_prob_test = model.predict_proba(X_test)[:, 1]  # Get probabilities for the positive class

    # Check if both classes are present in the test set before calculating AUC
    if y_test.nunique() > 1:
        auc_test = roc_auc_score(y_test, y_pred_prob_test)
        print(f'Test AUC: {auc_test:.4f}')
    else:
        auc_test = np.nan
        print(f'Test AUC: N/A (Only one class present in y_test)')

    # Calculate Accuracy on the test set
    accuracy_test = accuracy_score(y_test, y_pred_test)
    print(f'Test Accuracy: {accuracy_test * 100.0:.2f}%')

    print('\n')




Position: ILB
Average AUC (CV): 0.8046 ± 0.0120
Training Accuracy (CV): 85.23%
Model for position ILB saved as models/model_ILB.joblib
Test AUC: 0.8062
Test Accuracy: 86.16%


Position: DT
Average AUC (CV): 0.3696 ± 0.0569
Training Accuracy (CV): 99.64%
Model for position DT saved as models/model_DT.joblib
Test AUC: 0.5673
Test Accuracy: 99.57%


Position: CB
Average AUC (CV): 0.8564 ± 0.0111
Training Accuracy (CV): 98.15%
Model for position CB saved as models/model_CB.joblib
Test AUC: 0.8564
Test Accuracy: 98.08%


Position: DE
Average AUC (CV): 0.6541 ± 0.0223
Training Accuracy (CV): 96.82%
Model for position DE saved as models/model_DE.joblib
Test AUC: 0.6619
Test Accuracy: 96.52%


Position: SS
Average AUC (CV): 0.9206 ± 0.0079
Training Accuracy (CV): 95.67%
Model for position SS saved as models/model_SS.joblib
Test AUC: 0.9281
Test Accuracy: 96.01%


Position: NT
Average AUC (CV): 0.7123 ± 0.0608
Training Accuracy (CV): 99.43%
Model for position NT saved as models/model_NT.joblib


Traceback (most recent call last):
  File "/Users/michaelmontemurri/Downloads/NFLDataBowl2025/BigData25/venv/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 139, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "/Users/michaelmontemurri/Downloads/NFLDataBowl2025/BigData25/venv/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 376, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/michaelmontemurri/Downloads/NFLDataBowl2025/BigData25/venv/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/michaelmontemurri/Downloads/NFLDataBowl2025/BigData25/venv/lib/python3.12/site-packages/sklearn/metrics/_ranking.py", line 640, in roc_auc_score
    return _average_binary_score(
           ^^^^^^^^^^^^^^^^^^^^^^


Average AUC (CV): nan ± nan




Training Accuracy (CV): 94.57%
Model for position LB saved as models/model_LB.joblib
Test AUC: N/A (Only one class present in y_test)
Test Accuracy: 100.00%




In [121]:
import joblib
import numpy as np

# Define the features used for training
training_features = ['creptDist', 'hDist', 'losDist', 'o_clean']

# Function to predict InitialPassRusher probability for each row
def predict_xPassRush(row, model_dict):
    # Get the player's position for the current row
    position = row['position']
    
    # If the model exists for this position, use it to predict the probability
    if position in model_dict:
        model = model_dict[position]
        
        # Select only the features used for training (excluding 'position')
        X_row = row[training_features].values.reshape(1, -1)  # reshape for a single sample
        prob = model.predict_proba(X_row)[:, 1]  # Get probability for the positive class (InitialPassRush = 1)
        
        return prob[0]
    else:
        # If no model is available for the position, return NaN or 0 as a fallback
        return np.nan

# Load all the models (this assumes they were saved in the previous step)
model_dict = {}
positions = df_model['position'].unique()

for position in positions:
    model_filename = f'models/model_{position}.joblib'
    try:
        model_dict[position] = joblib.load(model_filename)
    except FileNotFoundError:
        print(f'Model for position {position} not found.')

# Now apply the prediction for each row in the original DataFrame
df['xPassRush'] = df.apply(lambda row: predict_xPassRush(row, model_dict), axis=1)


Model for position WR not found.


In [122]:

# Check the results for a specific player (e.g., Aaron Donald)

print(df[['gameId', 'playId', 'nflId', 'position', 'xPassRush']][df['displayName'] == 'Aaron Donald'])

#save the df with the new feature as a csv
df.to_csv('data/processed/df_processed.csv', index=False)

            gameId  playId  nflId position  xPassRush
12      2022090800      56  41239       DT   0.999991
34      2022090800      80  41239       DT   0.999963
56      2022090800     101  41239       DT   0.998359
78      2022090800     122  41239       DT   0.999781
100     2022090800     167  41239       DT   0.999979
...            ...     ...    ...      ...        ...
289960  2022110609    4058  41239       DT   0.999507
289982  2022110609    4104  41239       DT   0.999969
290004  2022110609    4133  41239       DT   0.999744
290026  2022110609    4162  41239       DT   0.999968
290048  2022110609    4224  41239       DT   0.999550

[339 rows x 5 columns]
