In [663]:
import pandas as pd
import os
import numpy as np
import math
import matplotlib.pyplot as plt
import itertools
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

In [2]:
# Read the CSV files
games = pd.read_csv("data/nfl-big-data-bowl-2024/games.csv")
players = pd.read_csv("data/nfl-big-data-bowl-2024/players.csv")

# Calculate height in inches
players['height'] = players['height'].str.extract(r'(\d+)').astype(int) * 12 + players['height'].str.extract(r'-(\d+)').astype(int)

# Select columns
players = players[['displayName', 'nflId', 'height', 'weight', 'position']]

plays = pd.read_csv("data/nfl-big-data-bowl-2024/plays.csv")
tackles = pd.read_csv("data/nfl-big-data-bowl-2024/tackles.csv")

# Read and combine tracking data for all weeks
tracking = pd.concat([pd.read_csv(f"data/nfl-big-data-bowl-2024/tracking_week_{week}.csv") for week in range(1, 10)])
ball_tracking = tracking.loc[tracking['nflId'].isna()][["gameId", "frameId" "playId", "x", "y"]].rename({"x" : "ball_x", "y" : "ball_y"}, axis = 1)

In [751]:
tackles.query("gameId == 2022090800 & playId ==  56")

Unnamed: 0,gameId,playId,nflId,tackle,assist,forcedFumble,pff_missedTackle
840,2022090800,56,43294,1,0,0,0


In [148]:
ball_tracking

Unnamed: 0,gameId,frameId,playId,ball_x,ball_y
484,2022090800,1,56,85.050003,33.810001
485,2022090800,2,56,83.150002,34.830002
486,2022090800,3,56,81.739998,35.590000
487,2022090800,4,56,80.139999,36.450001
488,2022090800,5,56,79.290001,36.930000
...,...,...,...,...,...
1150018,2022110700,40,3787,26.219999,19.680000
1150019,2022110700,41,3787,26.320000,19.610001
1150020,2022110700,42,3787,26.389999,19.559999
1150021,2022110700,43,3787,26.450001,19.520000


In [757]:
def euclidean_distance(x1, y1, x2, y2):
    return np.sqrt((x1 - x2)**2 + (y1 - y2)**2)

class play:
    def __init__(self, game_id, play_id):
        self.play = plays.query("gameId == @game_id & playId ==  @play_id")
        self.ball_carry_id = ball_carrier = self.play.ballCarrierId.reset_index(drop =1)[0]
        self.tracking_df = tracking.query("gameId == @game_id & playId ==  @play_id")
        self.ball_track = ball_tracking.query("gameId == @game_id & playId ==  @play_id")
        self.tackle_oppurtunities = tackles.query("gameId == @game_id & playId ==  @play_id")
        self.num_frames = max(self.tracking_df.frameId)
        self.tackle_attempts = self.get_tackle_attempt_frames()
        self.eop = self.get_end_of_play_location()
        self.tracking_refined = {frame_id : self.refine_tracking(frame_id = frame_id) for frame_id in range(1, self.num_frames)}
        self.tracking_refined_stratified = {frame_id : {player_type : self.tracking_refined.get(frame_id).loc[(self.tracking_refined.get(frame_id)['type'] == player_type)] 
                                                        for player_type in ["Offense", "Defense", "Carrier"]} for frame_id in range(1, self.num_frames)}
    
    def get_tackle_attempt_frames(self):
        tacklers = self.tackle_oppurtunities.nflId.unique()
        tackler_tracking = self.tracking_df.query("nflId in @tacklers").merge(
            self.ball_track, on = ["gameId", "playId", "frameId"], how = "left")
        tackler_tracking['distance_from_ball'] = euclidean_distance(tackler_tracking['x'], tackler_tracking['y'],
                                                                    tackler_tracking['ball_x'], tackler_tracking['ball_y'])
        tackler_tracking['min_distance_from_ball'] = tackler_tracking.groupby('nflId')['distance_from_ball'].transform('min')
        return tackler_tracking.query("min_distance_from_ball == distance_from_ball")[["nflId", "frameId", "ball_x", "ball_y"]].rename(
            {"ball_x" : "tackle_x", "ball_y" : "tackle_y"}, axis = 1
        )
    
    def get_tackle_attempt_matrix(self, N):
        tackles_attempt_mat = np.zeros((int(120/N), math.ceil(54/N)))
        for item in list(zip(self.tackle_attempts.tackle_x, self.tackle_attempts.tackle_y)):
            tackles_attempt_mat[int(item[0]/N), int(item[1]/N)] = 1
        return tackles_attempt_mat
    
    def get_end_of_play_location(self):
        ball_carrier = self.ball_carry_id
        last_frame = self.num_frames
        end_of_play_carrier = self.tracking_df.query("nflId == @ball_carrier & frameId == @last_frame")
        return end_of_play_carrier[["frameId", "x", "y"]].rename({"x" : "eop_x", "y" : "eop_y"}, axis = 1)
    
    def get_end_of_play_matrix(self, N):
        tackles_attempt_mat = np.zeros((int(120/N), math.ceil(54/N)))
        for item in list(zip(self.eop.eop_x, self.eop.eop_y)):
            tackles_attempt_mat[int(item[0]/N), int(item[1]/N)] = 1
        return tackles_attempt_mat
    
    def refine_tracking(self, frame_id):
        current_positions = self.tracking_df.query("frameId == @frame_id").merge(players, on = "nflId", how = "left")
        current_positions['type'] = current_positions['position'].apply(
            lambda x: "Offense" if x in ["QB", "TE", "WR", "G", "OLB", "RB", "C", "FB"] else "Defense")
        current_positions['type'] = current_positions.apply(lambda row: 'Ball' if pd.isna(row['nflId']) else row['type'], axis=1)
        current_positions.loc[current_positions.nflId == self.ball_carry_id, 'type'] = "Carrier"
        return current_positions[['nflId', 'x', 'y', 's', 'a', 'dis', 'o', 'dir', 'height', 'weight', 'type']]
        
    def plot_tackle_attempt_matrix(self, frame_id = 1):
        plt.imshow(self.get_tackle_attempt_matrix(), cmap='binary', interpolation='none')
        plt.colorbar()  
        plt.show()
    
    def get_grid_features(self, frame_id, N, matrix_form = True):
        stratified_dfs = self.tracking_refined_stratified[frame_id]
        grid_features = pd.DataFrame()
        return_mat = np.zeros((24, len(list(range(0, 120, N))), len(list(range(0, 54, N)))))
        for x_low in list(range(0, 120, N)):
            for y_low in list(range(0, 54, N)):
                off_df = stratified_dfs["Offense"]
                def_df = stratified_dfs["Defense"]
                ball_df = stratified_dfs["Carrier"]
                x_high = x_low + N
                y_high = y_low + N

                    # Extract relevant subsets of data
                off_subset = off_df[(off_df['x'] >= x_low) & (off_df['x'] < x_high) & (off_df['y'] >= y_low) & (off_df['y'] < y_high)]
                def_subset = def_df[(def_df['x'] >= x_low) & (def_df['x'] < x_high) & (def_df['y'] >= y_low) & (def_df['y'] < y_high)]
                ball_subset = ball_df[(ball_df['x'] >= x_low) & (ball_df['x'] < x_high) & (ball_df['y'] >= y_low) & (ball_df['y'] < y_high)]

                # Calculate statistics using vectorized operations
                current_offensive_player_density = len(off_subset)
                current_defensive_player_density = len(def_subset)
                current_ballcarrier_player_density = len(ball_subset)

                offense_directional_vector = np.cos(off_df['dir'] * (np.pi / 180)) * (x_low + N/2 - off_df['x']) + np.sin(off_df['dir'] * (math.pi / 180)) * (y_low + N/2 - off_df['y'])
                defense_directional_vector = np.cos(def_df['dir'] * (np.pi / 180)) * (x_low + N/2 - def_df['x']) + np.sin(def_df['dir'] * (math.pi / 180)) * (y_low + N/2 - def_df['y'])

                velocities_offensive_toward_point = off_df['s'] * offense_directional_vector
                velocities_defensive_toward_point = def_df['s'] * defense_directional_vector
                
                acceleration_offensive_toward_point = off_df['a'] * offense_directional_vector
                acceleration_defensive_toward_point = def_df['a'] * defense_directional_vector

                distance_offense_from_point = np.sqrt((off_df['x'] - (x_low + N/2))**2 + (off_df['y'] - (y_low + N/2))**2)
                distance_defensive_from_point = np.sqrt((def_df['x'] - (x_low + N/2))**2 + (def_df['y'] - (y_low + N/2))**2)

                velocities_ballcarrier_toward_point = ball_df['s'] * (np.cos(ball_df['dir'] * (math.pi / 180)) * (x_low + N/2 - ball_df['x']) +
                                                                    np.sin(ball_df['dir'] * (math.pi / 180)) * (y_low + N/2 - ball_df['y']))
                acceleration_ballcarrier_toward_point = ball_df['a'] * (np.cos(ball_df['dir'] * (math.pi / 180)) * (x_low + N/2 - ball_df['x']) +
                                                                        np.sin(ball_df['dir'] * (math.pi / 180)) * (y_low + N/2 - ball_df['y']))
                distance_ballcarrier_from_point = np.sqrt((ball_df['x'] - (x_low + N/2))**2 + (ball_df['y'] - (y_low + N/2))**2)
                ret = pd.DataFrame({'grid_id': [f"{x_low} {y_low}"],
                                                    'off_density': [current_offensive_player_density],
                                                    'def_density': [current_defensive_player_density],
                                                    'ballcarrier_density': [current_ballcarrier_player_density],
                                                    'off_velocity_mean': [np.mean(velocities_offensive_toward_point)],
                                                    'off_velocity_sum': [np.sum(velocities_offensive_toward_point)],
                                                    'off_velocity_std': [np.std(velocities_offensive_toward_point)],
                                                    'def_velocity_mean': [np.mean(velocities_defensive_toward_point)],
                                                    'def_velocity_sum': [np.sum(velocities_defensive_toward_point)],
                                                    'def_velocity_std': [np.std(velocities_defensive_toward_point)],
                                                    'ballcarrier_velocity': [velocities_ballcarrier_toward_point.values[0]],
                                                    'off_acc_mean': [np.mean(acceleration_offensive_toward_point)],
                                                    'off_acc_sum': [np.sum(acceleration_offensive_toward_point)],
                                                    'off_acc_std': [np.std(acceleration_offensive_toward_point)],
                                                    'def_acc_mean': [np.mean(acceleration_defensive_toward_point)],
                                                    'def_acc_sum': [np.sum(acceleration_defensive_toward_point)],
                                                    'def_acc_std': [np.std(acceleration_defensive_toward_point)],
                                                    'ballcarrier_acc': [acceleration_ballcarrier_toward_point.values[0]],
                                                    'off_distance_mean': [np.mean(distance_offense_from_point)],
                                                    'off_distance_sum': [np.sum(distance_offense_from_point)],
                                                    'off_distance_std': [np.std(distance_offense_from_point)],
                                                    'def_distance_mean': [np.mean(distance_defensive_from_point)],
                                                    'def_distance_sum': [np.sum(distance_defensive_from_point)],
                                                    'def_distance_std': [np.std(distance_defensive_from_point)],
                                                    'ballcarrier_distance': [distance_ballcarrier_from_point.values[0]]})
                if matrix_form:
                    return_mat[:, int(x_low/N), int(y_low/N)] = np.array(ret.drop(['grid_id'], axis = 1).iloc[0])
                else:
                    grid_features = pd.concat([grid_features, ret])
        if matrix_form:
            return return_mat
        else:
            return grid_features

class TackleAttemptDataset:

    def __init__(self, images, labels):
        self.images = images
        self.labels = labels
        self.num_samples = len(images)
    
    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        image = torch.FloatTensor(self.images[idx])
        label = torch.FloatTensor(self.labels[idx])
        return image, label

In [760]:
play_object = play(game_id=2022090800, play_id=56)
play_object.get_end_of_play_matrix(N = 20)

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 1., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [761]:
# Do for only t = 10 to start

images = []
labels = []
N = 20
for row in tqdm(range(5000)):
    play_row = plays.iloc[row,]
    play_object = play(game_id=play_row.gameId, play_id=play_row.playId)
    if play_object.num_frames < 10:
        continue # if not n frames happened
    if len(play_object.tracking_refined.get(1).type.unique()) != 4:
        continue # if not offense, defense, ball and carrier in play
    images.append(play_object.get_grid_features(frame_id = 10, N = N, matrix_form = True))
    labels.append(play_object.get_end_of_play_matrix(N = N))
train_data = TackleAttemptDataset(images = images, labels = labels)

100%|██████████| 5000/5000 [26:14<00:00,  3.18it/s]


In [762]:
len(train_data)

4997

In [763]:
tackle_dataset = TackleAttemptDataset(images = images, labels = labels)
train_data, val_data = torch.utils.data.random_split(tackle_dataset, [0.9, 0.1])

In [771]:
from torchvision.models import resnet50

class TackleNet(nn.Module):
    def __init__(self, N, nvar):
        super(TackleNet, self).__init__()
        
        # Convolutional layers
        self.conv1 = nn.Conv2d(nvar, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 20, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(20, 10, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(10, 20, kernel_size=3, padding=1)
        
        # Fully connected layers
        self.fc1 = nn.Linear(math.ceil(120/N)*math.ceil(54/N)*20, 128)
        self.fc2 = nn.Linear(128, math.ceil(120/N)*math.ceil(54/N))
        self.N = N
        
    def forward(self, x):
        # Input shape: (batch_size, 24, 12, 6)
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        
        # Flatten the output for the fully connected layers
        x = x.view(x.size(0), -1)
        
        x = F.relu(self.fc1(x))
        x = F.softmax(self.fc2(x), dim=1)
        
        # Apply softmax to ensure the output sums to 1 along the channel dimension (12*6)
        x = x.view(-1, math.ceil(120/self.N), math.ceil(54/self.N))
        
        return x
    
def plot_predictions(prediction_output, true):
    fig, axs = plt.subplots(8, 8, figsize=(16, 16))

    # Flatten the 8x8 grid of subplots to a 1D array for easier indexing
    axs = axs.flatten()

    # Loop through the 64 images and display each in a subplot
    for i in range(64):
        image = prediction_output[i].detach().numpy()
        true_image = true[i].detach().numpy()
        axs[i].imshow(image, cmap='viridis', interpolation='none')
        axs[i].axis('off')  # Turn off the axis for each subplot
        axs[i].set_title(f"Image {i + 1}")
        axs[i].grid()
        x_max = true_image.shape[0]
        y_max = true_image.shape[1]
        for k in range(x_max):
            for l in range(y_max):
                if true_image[k, l] == 1:
                    axs[i].plot(l, k, 'ro', markersize=5)

    # Adjust spacing between subplots to make them look better
    plt.subplots_adjust(wspace=0.2, hspace=0.2)
    plt.show()

In [772]:
# Create Data Loader
train_dataloader = DataLoader(train_data, batch_size=64, shuffle=True)
val_loader = DataLoader(val_data, batch_size=2)

# Define the loss function (you can choose an appropriate loss function for your task)
criterion = nn.CrossEntropyLoss()

# Create Model
model = resnet50()

# Define the optimizer (e.g., Stochastic Gradient Descent)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 5

for epoch in range(num_epochs):
    i = 0
    for X_batch, y_batch in train_dataloader:
        optimizer.zero_grad()
        output = model(X_batch)
        loss = criterion(output.flatten(), y_batch.flatten())
        loss.backward()
        optimizer.step()
        if i == 0:
          plot_predictions(output, y_batch)
        i += 1
    output = model(X_batch)
    loss = criterion(output.flatten(), y_batch.flatten())
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item()}')

# After training, you can use the model to make predictions
predictions = []
true_labels = []
with torch.no_grad():  # Disable gradient computation for inference
    for X_batch, y_batch in val_loader:
        outputs = model(X_batch)
        predictions.append(outputs.flatten())
        true_labels.append(y_batch.flatten())

predictions = torch.cat(predictions, dim=0) 
true_labels = torch.cat(true_labels, dim=0) 

RuntimeError: Given groups=1, weight of size [64, 3, 7, 7], expected input[64, 24, 6, 3] to have 3 channels, but got 24 channels instead

In [705]:
y_batch.shape

torch.Size([64, 12, 6])

In [709]:
train_dataloader.dataset[0][1].shape

torch.Size([12, 6])

In [576]:
train_dataloader.dataset[0][0].shape

torch.Size([24, 12, 6])

In [766]:
output

tensor([[[0.0000e+00, 0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 4.8101e-32],
         [1.0000e+00, 1.0000e+00, 1.0000e+00],
         [0.0000e+00, 0.0000e+00, 3.8891e-30],
         [0.0000e+00, 0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00]],

        [[0.0000e+00, 0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 1.1163e-37],
         [1.0000e+00, 1.0000e+00, 1.0000e+00],
         [0.0000e+00, 0.0000e+00, 8.9992e-23],
         [0.0000e+00, 0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00]],

        [[0.0000e+00, 0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 2.4764e-35],
         [1.0000e+00, 0.0000e+00, 1.0000e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00],
         [0.0000e+00, 1.0000e+00, 6.5216e-14],
         [0.0000e+00, 0.0000e+00, 0.0000e+00]],

        [[0.0000e+00, 0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 5.4651e-44],
         [1.0000e+00, 0.0000e+00, 1.0000e+00],
       

In [256]:
## Retry