In [None]:
# Libraries

import pandas as pd
import numpy as np
import seaborn as sns

from pyts.approximation import SymbolicAggregateApproximation
import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='pyts')

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import (
    Input, Embedding, SimpleRNN, Dense, Concatenate, BatchNormalization, Dropout
)

In [None]:
# Reduce every weekly tracking file to its BEFORE_SNAP frames, stack the weeks
# into one frame, then keep only NFC South clubs.
# Change TRACKING_DIR depending on where you have downloaded the data to.

TRACKING_DIR = "data/all_tracking_compressed"
WEEKS = range(1, 10)
CHUNK_ROWS = 100000

weekly_frames = []

for week in WEEKS:
    file_name = f"{TRACKING_DIR}/tracking_week_{week}.csv"

    # Filter chunk by chunk so the unfiltered week never has to sit in memory;
    # the context manager closes the file before it is rewritten below.
    with pd.read_csv(file_name, chunksize=CHUNK_ROWS) as chunk_iter:
        presnap_chunks = [
            chunk[chunk["frameType"] == "BEFORE_SNAP"] for chunk in chunk_iter
        ]
    week_df = pd.concat(presnap_chunks, ignore_index=True)

    # Earlier runs saved the frame together with its index, which comes back
    # as "Unnamed: N" columns on the next read; drop those leftovers.
    week_df = week_df.loc[:, ~week_df.columns.str.startswith("Unnamed:")]

    # Keep the on-disk reduction (the weekly file is overwritten with only its
    # pre-snap rows), but without the index so re-running is idempotent.
    week_df.to_csv(file_name, index=False)

    # adding "week" column so we can keep track of which week each row belongs to
    weekly_frames.append(week_df.assign(week=week))

# combining all dataframes into one (single concat instead of growing a frame in a loop)
combined_df = pd.concat(weekly_frames, ignore_index=True)

# Save the combined DataFrame to a CSV file
#combined_df.to_csv("combined_tracking_data.csv", index=False)

# Display the first few rows of the combined DataFrame
print(combined_df.head())

# limit teams to those in the NFC South
nfc_south = ["CAR", "NO", "ATL", "TB"]

# .copy() so later cells can add columns without SettingWithCopyWarning
combined_df = combined_df[combined_df["club"].isin(nfc_south)].copy()

In [None]:
# convert play directions so that everything is "moving" in the same direction (left to right)

# Width constant used for the Y flip; the in-bounds filter below reuses the
# exact same value so flipped and unflipped rows are treated consistently.
FIELD_WIDTH = 160 / 3

# Work on an independent copy: combined_df may be a filtered slice of a larger
# frame, and assigning columns on a slice triggers SettingWithCopyWarning.
combined_df = combined_df.copy()

to_left = combined_df["playDirection"] == "left"
combined_df["ToLeft"] = to_left

# Mirror both axes for left-moving plays, then shift X by 10
combined_df['X_std'] = np.where(to_left, 120 - combined_df['x'], combined_df['x']) - 10
combined_df['Y_std'] = np.where(to_left, FIELD_WIDTH - combined_df['y'], combined_df['y'])

# standardize orientation (a flip of both axes is a 180-degree rotation)
combined_df['o_std'] = np.where(to_left, (combined_df['o'] + 180) % 360, combined_df['o'])

# standardize direction of motion (dir)
combined_df['dir_std'] = np.where(to_left, (combined_df['dir'] + 180) % 360, combined_df['dir'])

# limit positions to those within the field
# NOTE(review): X_std has already been shifted by -10, so if raw x spans 0-120
# the upper bound of 120 can never bind and the lower bound of 0 removes rows
# with raw x < 10 (or > 110 when flipped). Bounds kept as originally written;
# confirm they are intended.
in_bounds = (
    combined_df["X_std"].between(0, 120)
    & combined_df["Y_std"].between(0, FIELD_WIDTH)
)
combined_df = combined_df[in_bounds].copy()

In [None]:
def compute_group_features(group):
    """Add frame-to-frame movement features to one track.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single group, in time order, with columns ``time``
        (datetime or datetime-like strings), ``X_std`` and ``Y_std``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` with the added columns ``time_diff``,
        ``delta_x``, ``delta_y``, ``distance``, ``speed``, ``speed_diff``,
        ``acceleration`` and ``movement_angle`` (radians, from ``arctan2``).
        The first row has no predecessor, so its diff-based values are NaN
        (``time_diff`` is back-filled from the next row instead).
    """
    # Never mutate the frame handed over by groupby.apply
    group = group.copy()

    # `time` is still text when read straight from CSV; parsing is a no-op
    # when the column is already datetime.
    timestamps = pd.to_datetime(group['time'])

    group['time_diff'] = timestamps.diff().dt.total_seconds().bfill().ffill()
    group['delta_x'] = group['X_std'].diff()
    group['delta_y'] = group['Y_std'].diff()
    group['distance'] = np.sqrt(group['delta_x'] ** 2 + group['delta_y'] ** 2)
    group['speed'] = group['distance'] / group['time_diff']
    group['speed_diff'] = group['speed'].diff()
    group['acceleration'] = group['speed_diff'] / group['time_diff']
    group['movement_angle'] = np.arctan2(group['delta_y'], group['delta_x'])
    return group

# Derive movement features per (game, player, play) track, but only when the
# standardized coordinates are available.
# NOTE(review): `grouped_data` is reassigned by a later cell, so confirm these
# features actually reach the final dataset.
required_columns = {'X_std', 'Y_std'}
if required_columns.issubset(combined_df.columns):
    tracks = combined_df.groupby(['gameId', 'nflId', 'playId'], group_keys=False)
    grouped_data = tracks.apply(compute_group_features)

In [None]:
# perform SAX (Symbolic Aggregate approXimation) on every player track

alphabet_size = 15  # settled on 15 after trying smaller alphabets; more symbols = finer granularity
n_points = 10  # every sequence is resampled to this fixed length
# NOTE(review): pyts may reject n_bins larger than the number of timestamps
# (15 > 10 here) - confirm these two values against the pyts version in use.

# NOTE(review): the raw 'o' and 'dir' columns are encoded here, not the
# direction-standardized 'o_std' / 'dir_std' - confirm that is intended.
sax_feature_columns = ['X_std', 'Y_std', 'a', 's', 'o', 'dir']

sax_representations = []

# One SAX word per feature for each (gameId, nflId, playId) track
for track_key, track in combined_df.groupby(['gameId', 'nflId', 'playId']):
    gameId, nflId, playId = track_key

    # Linear interpolation from the original frame positions onto n_points
    # evenly spaced positions; the index grids are shared by all features.
    n_frames = len(track)
    source_index = np.arange(n_frames)
    target_index = np.linspace(0, n_frames - 1, n_points)
    resampled_features = np.array([
        np.interp(target_index, source_index, track[column].values)
        for column in sax_feature_columns
    ])

    # Each row (one feature) is discretized into alphabet_size quantile bins
    sax = SymbolicAggregateApproximation(n_bins=alphabet_size, strategy='quantile')
    sax_words = sax.fit_transform(resampled_features)

    # One record per track: the identifying keys plus one word per feature
    record = {'gameId': gameId, 'nflId': nflId, 'playId': playId}
    record.update({
        f'sax_{name}': ''.join(word)
        for name, word in zip(sax_feature_columns, sax_words)
    })
    sax_representations.append(record)

# Convert SAX results to a DataFrame
df_sax = pd.DataFrame(sax_representations)

# Attach the per-track words to every tracking row of that track
data = combined_df.merge(df_sax, on=['gameId', 'nflId', 'playId'], how='left')

In [None]:
# The SAX words live on `data` (combined_df merged with df_sax), which still has
# one row per time point; keep only one entry per track.
# `combined_df` itself has no sax_* columns, so grouping it on them would raise
# a KeyError.

sax_columns = ["sax_X_std", "sax_Y_std", "sax_a", "sax_s", "sax_o", "sax_dir"]

grouped_data = (
    data
    .groupby(['gameId', 'nflId', 'playId'] + sax_columns)
    .first()  # first non-null value of every remaining column within the track
    .reset_index()
)

In [None]:
# Attach play-, player-, player-play- and game-level columns to each track

plays = pd.read_csv("data/plays.csv")
players = pd.read_csv("data/players.csv")
player_play = pd.read_csv("data/player_play.csv")
games = pd.read_csv("data/games.csv")

# (lookup table, join keys), applied in this order. `merge` defaults to an
# inner join, so rows without a match in a lookup table are dropped.
merge_steps = [
    (plays, ["gameId", "playId"]),
    (players, "nflId"),
    (player_play, ["gameId", "playId", "nflId"]),
    (games, "gameId"),
]
for lookup_table, join_keys in merge_steps:
    grouped_data = grouped_data.merge(lookup_table, on = join_keys)

In [None]:
# convert height to height in inches

def height_to_inches(height):
    """Convert a ``'feet-inches'`` string such as ``'6-2'`` to total inches.

    Returns ``None`` when the value is not a string of exactly two
    integer parts separated by ``-`` (including missing values).
    """
    try:
        # Unpacking enforces exactly two parts; int() enforces numeric parts
        feet, inches = [int(part) for part in height.split('-')]
    except (ValueError, AttributeError):
        # Malformed text, wrong number of parts, or a non-string value
        return None
    return 12 * feet + inches

# Derive a numeric height (in inches) from the text height column
raw_heights = grouped_data['height']
grouped_data['height_inches'] = raw_heights.apply(height_to_inches)

In [None]:
# keep only rows whose player position is in the offensive list below
# NOTE(review): "TE" is not in this list - confirm that tight ends are meant
# to be excluded.

offensive_positions = ["T", "G", "C", "QB", "RB", "FB", "WR"]
is_offensive = grouped_data["position"].isin(offensive_positions)
grouped_data = grouped_data[is_offensive]

In [None]:
# add target variable; playOutcome

def determine_play_outcome(row):
    """Build the ``playOutcome`` label for one row.

    Uses ``row['routeRan']`` and ``row['hadRushAttempt']``:

    * route ``'Slant'``            -> ``'Slant'`` (whatever the rush flag is)
    * route present, rush flag 0   -> the route name
    * route missing, rush flag 1   -> ``'Rush'``
    * route missing, rush flag 0   -> ``'None'``
    * route present, rush flag 1   -> ``'<route> and Rush'``
    * any other rush flag value    -> ``'Unknown'``
    """
    route = row['routeRan']
    if route == 'Slant':
        return 'Slant'

    has_route = pd.notnull(route)
    rush_flag = row['hadRushAttempt']

    if rush_flag == 0:
        return route if has_route else 'None'
    if rush_flag == 1:
        return f"{route} and Rush" if has_route else 'Rush'
    return 'Unknown'

# Label every row with its play outcome (row-wise, hence axis=1)
play_outcomes = grouped_data.apply(determine_play_outcome, axis=1)
grouped_data['playOutcome'] = play_outcomes

In [None]:
# parse the timestamp column and derive calendar features from it

# errors='coerce' turns unparseable timestamps into NaT instead of raising
play_timestamps = pd.to_datetime(grouped_data['time'], errors='coerce')
grouped_data['time'] = play_timestamps
grouped_data['month'] = play_timestamps.dt.month
grouped_data['day'] = play_timestamps.dt.day
#grouped_data['day_of_week'] = grouped_data['time'].dt.dayofweek  # Monday=0, Sunday=6

In [None]:
# Age as a calendar-year difference from 2024.
# NOTE(review): 2024 is a fixed reference year, not the season of the game -
# confirm this is the intended definition of age.
grouped_data["age"] = 2024 - pd.to_datetime(grouped_data["birthDate"], format = "mixed").dt.year

# Split the clock on ":" instead of slicing fixed character positions, so an
# unpadded value such as "9:05" parses the same way as "09:05".
clock_parts = grouped_data["gameClock"].str.split(":", expand=True)
grouped_data["gameClock_min"] = clock_parts[0].astype(int)
grouped_data["gameClock_sec"] = clock_parts[1].astype(int)

In [None]:
# limit to only columns that are relevant to pre-snap info

# Merging frames with overlapping column names leaves <name>_x / <name>_y
# pairs: drop the right-hand copy and give the left-hand copy its plain name
# back. Only a trailing "_x" is stripped, never an "_x" inside a name.
is_right_copy = grouped_data.columns.str.endswith('_y')
grouped_data = grouped_data.loc[:, ~is_right_copy].rename(
    columns=lambda col: col[:-2] if col.endswith('_x') else col
)

selected_columns = [
    "playOutcome", "gameId", "playId", "nflId", "frameId", "time", "week",
    "norm_o", "o_minus_dir",
    "quarter", "down", "yardsToGo", "possessionTeam", "defensiveTeam", "yardlineNumber",
    "gameClock_min", "gameClock_sec",  # seconds were missing (minutes were listed twice)
    "preSnapHomeScore", "preSnapVisitorScore", "absoluteYardlineNumber",
    "preSnapHomeTeamWinProbability", "expectedPoints",
    "offenseFormation", "receiverAlignment", "routeRan", "pff_defensiveCoverageAssignment",
    "height_inches", "weight", "position", "age",
    "sax_X_std", "sax_Y_std", "sax_a", "sax_s", "sax_o", "sax_dir",
    "time_diff", "distance", "movement_angle_degrees",
]

# NOTE(review): "norm_o", "o_minus_dir" and "movement_angle_degrees" are not
# created by any cell visible in this notebook; fail with an explicit list
# rather than a bare KeyError if any selected column is absent.
missing_columns = [col for col in selected_columns if col not in grouped_data.columns]
if missing_columns:
    raise KeyError(f"columns missing from grouped_data: {missing_columns}")

data = grouped_data[selected_columns]

In [None]:
# Persist the reduced pre-snap dataset (the DataFrame index is written too)
output_path = "data/reduced_sax_presnap.csv"
data.to_csv(output_path)