In [None]:
# NFL Big Data Bowl 2026 - Model Analysis Notebook
# Physics-Informed Residual Trajectory Network (PIRTN)

import os
import sys
import warnings
import math
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor, Pool as CatPool
from catboost.utils import get_gpu_device_count

# Blanket filter: hide every library warning for the rest of the session.
warnings.filterwarnings("ignore")

# Competition dataset location. Outputs go to /kaggle/working when that dataset
# is mounted, otherwise to the current working directory.
DATA_DIR = Path("/kaggle/input/nfl-big-data-bowl-2026-prediction")
if DATA_DIR.exists():
    WORK_DIR = Path("/kaggle/working")
else:
    WORK_DIR = Path.cwd()

# Environment report
print("Python:", sys.version)
print("Data dir exists:", DATA_DIR.exists())
print("Work dir:", WORK_DIR)

# torch is optional in this cell: report CUDA availability when it can be
# imported and queried, otherwise print the reason instead of failing.
try:
    import torch
    print("CUDA available:", torch.cuda.is_available())
except Exception as exc:
    print("Torch not available:", exc)



ModuleNotFoundError: No module named 'catboost'

In [None]:
def predict_test_template(test_input: pd.DataFrame, test_template: pd.DataFrame, models_x, models_y, feature_columns):
    """Predict one (x, y) per row of ``test_template`` and return it as a submission frame.

    Steps: run the feature pipeline on ``test_input``, keep each player's last
    observed frame, attach the neighbour embedding, expand that snapshot onto
    every template row, add the physics baseline to the averaged residual
    predictions of ``models_x`` / ``models_y`` and clip to the field.

    Args:
        test_input: observed tracking rows (same schema the feature functions expect).
        test_template: rows to predict; must carry game_id, play_id, nfl_id, frame_id.
        models_x, models_y: iterables of fitted models exposing ``predict``.
        feature_columns: feature names, in the order the models were trained on.

    Returns:
        DataFrame with columns ``id`` ("game_play_nfl_frame"), ``x`` and ``y``,
        in template row order. Rows that end up NaN are written as 0.0.
    """
    keys = ['game_id', 'play_id', 'nfl_id']

    # Feature pipeline on the observed frames
    feats = (engineer_physics_features(test_input)
             .pipe(add_sequence_features)
             .pipe(add_formation_features)
             .pipe(add_time_features)
             .pipe(add_motion_ema_features)
             .pipe(add_orientation_features)
             .pipe(add_geometric_features))

    neighbor_emb = compute_neighbor_embeddings(feats)

    # One row per player: the last observed frame
    ordered = feats.sort_values(keys + ['frame_id'])
    snapshot = (ordered.groupby(keys, as_index=False)
                .tail(1)
                .rename(columns={'frame_id': 'last_frame_id'}))
    snapshot = snapshot.merge(neighbor_emb, on=keys, how='left')
    if 'gnn_opp_dist_min' in snapshot.columns:
        # closer nearest opponent -> larger value
        snapshot['pressure'] = 1.0 / (snapshot['gnn_opp_dist_min'] + 0.1)

    # Expand the snapshot to every horizon requested by the template
    future_rows = test_template.merge(snapshot, on=keys, how='left')

    # Players without observed frames get a zero horizon
    future_rows['last_frame_id'] = future_rows['last_frame_id'].fillna(future_rows['frame_id'])
    horizon = (future_rows['frame_id'] - future_rows['last_frame_id']).clip(lower=0)
    future_rows['delta_frames'] = horizon
    future_rows['delta_t'] = (horizon / 10.0).fillna(0.0)
    future_rows['waypoint_idx'] = (np.maximum(horizon - 1, 0) // 10).astype(int)
    future_rows['is_waypoint'] = (horizon % 10 == 0).astype(int)

    # Any feature the models expect but this frame lacks becomes a zero column
    for name in feature_columns:
        if name not in future_rows.columns:
            future_rows[name] = 0

    X_test = future_rows[feature_columns].fillna(0).values

    state_cols = ('x', 'y', 'velocity_x', 'velocity_y', 'ball_land_x', 'ball_land_y')
    baseline_x, baseline_y = steered_kinematics_baseline(
        *(future_rows[c].values for c in state_cols),
        np.nan_to_num(future_rows['delta_t'].values, nan=0.0)
    )

    # Fold-averaged residuals on top of the baseline, clipped to the field
    resid_x = np.mean([model.predict(X_test) for model in models_x], axis=0)
    resid_y = np.mean([model.predict(X_test) for model in models_y], axis=0)
    pred_x = np.clip(resid_x + baseline_x, 0, 120)
    pred_y = np.clip(resid_y + baseline_y, 0, 53.3)

    submission = test_template.copy()
    id_parts = [submission[c].astype(str) for c in ('game_id', 'play_id', 'nfl_id', 'frame_id')]
    submission['id'] = id_parts[0] + '_' + id_parts[1] + '_' + id_parts[2] + '_' + id_parts[3]
    submission['x'] = pred_x
    submission['y'] = pred_y
    for axis in ('x', 'y'):
        submission[axis] = submission[axis].fillna(0.0)
    return submission[['id', 'x', 'y']]


# Model Evaluation

In [None]:
# Self-contained pipeline: reads the competition files and writes submission.csv

# Kaggle locations when the dataset is mounted; otherwise a local mirror under
# the current working directory (optional, for dry runs).
DATA_DIR = Path("/kaggle/input/nfl-big-data-bowl-2026-prediction")
if DATA_DIR.exists():
    SAVE_PATH = Path("/kaggle/working/submission.csv")
else:
    DATA_DIR = Path.cwd() / "kaggle" / "input" / "nfl-big-data-bowl-2026-prediction"
    SAVE_PATH = Path.cwd() / "submission.csv"

# Cross-validation / CatBoost settings (read by train_catboost_models)
N_FOLDS = 5
ITERATIONS = 10000
LEARNING_RATE = 0.05
DEPTH = 8
L2_REG = 5.0
EARLY_STOPPING = 800
SEED = 42

# Defaults of steered_kinematics_baseline: speed cap, speed gain per second and
# turn limit in degrees per second (units presumably yards and seconds; verify)
V_MAX = 7.5
A_MAX = 3.0
TURN_MAX_DEG = 120.0

# compute_neighbor_embeddings: neighbours kept per player, search radius and
# the distance scale of the exp(-dist / TAU) weights
K_NEIGHBORS = 6
RADIUS_LIMIT = 30.0
TAU = 8.0

# Train on GPU only when CatBoost reports one; any failure of the probe means CPU.
try:
    USE_GPU = get_gpu_device_count() > 0
except Exception:
    USE_GPU = False

# Feature engineering

def add_time_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with frame-timing features (10 frames = 1 second).

    The play length is ``num_frames_output`` when that column exists, otherwise
    the largest ``frame_id`` of the (game_id, play_id) group (30 where missing).
    Requires ``frame_id``, ``x``, ``y``, ``velocity_x``, ``velocity_y`` and ``s``;
    the ball-error columns are added only when ``ball_land_x`` is present and
    additionally need ``ball_land_y`` and ``dist_to_ball``.
    """
    out = df.copy()

    if 'num_frames_output' in out.columns:
        play_len = out['num_frames_output']
    else:
        longest = out.groupby(['game_id', 'play_id'])['frame_id'].transform('max')
        play_len = pd.Series(longest, index=out.index).fillna(30)

    frame = out['frame_id']
    elapsed = frame / 10.0
    frames_left = play_len - frame

    out['max_play_duration'] = play_len / 10.0
    out['frame_time'] = elapsed
    out['progress_ratio'] = (frame / np.maximum(play_len, 1)).clip(0.0, 1.0)
    out['time_remaining'] = frames_left / 10.0
    out['frames_remaining'] = frames_left.clip(lower=0)

    # Straight-line extrapolation over the elapsed frame time.
    # NOTE(review): the column names say "at ball" but the horizon used is the
    # elapsed time, not the time remaining; confirm this is intended.
    out['expected_x_at_ball'] = out['x'] + out['velocity_x'] * elapsed
    out['expected_y_at_ball'] = out['y'] + out['velocity_y'] * elapsed
    if 'ball_land_x' in out.columns:
        miss_x = out['expected_x_at_ball'] - out['ball_land_x']
        miss_y = out['expected_y_at_ball'] - out['ball_land_y']
        out['error_from_ball_x'] = miss_x
        out['error_from_ball_y'] = miss_y
        out['error_from_ball'] = np.sqrt(miss_x**2 + miss_y**2)
        out['weighted_dist_by_time'] = out['dist_to_ball'] / (elapsed + 0.1)
        out['dist_scaled_by_progress'] = out['dist_to_ball'] * (1.0 - out['progress_ratio'])

    out['time_squared'] = elapsed ** 2
    for axis in ('x', 'y'):
        out[f'velocity_{axis}_progress'] = out[f'velocity_{axis}'] * out['progress_ratio']
    out['speed_scaled_by_time_left'] = out['s'] * out['time_remaining']
    out['actual_play_length'] = play_len
    out['length_ratio'] = play_len / 30.0
    return out


def add_motion_ema_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with per-player exponential moving averages.

    For each (game_id, play_id, nfl_id) group, in row order, adds
    ``velocity_x_ema``, ``velocity_y_ema`` and ``speed_ema`` computed with
    ``alpha=0.3`` and ``adjust=False`` (the first value seeds the average).
    """
    out = df.copy()
    per_player = out.groupby(['game_id', 'play_id', 'nfl_id'])

    def _ema(values):
        # recursive form: y[t] = 0.7 * y[t-1] + 0.3 * x[t]
        return values.ewm(alpha=0.3, adjust=False).mean()

    targets = (
        ('velocity_x', 'velocity_x_ema'),
        ('velocity_y', 'velocity_y_ema'),
        ('s', 'speed_ema'),
    )
    for source, smoothed in targets:
        out[smoothed] = per_player[source].transform(_ema)
    return out


def add_orientation_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with orientation-vs-motion features.

    Adds:
        orientation_diff: smallest absolute angle in degrees between ``o`` and
            ``dir`` (wrapped at 360).
        velocity_alignment: cosine of the angle between the direction of
            motion and the bearing to the ball landing spot (1 = moving
            straight at it, -1 = moving straight away).

    Requires the columns ``o``, ``dir`` (degrees) and ``angle_to_ball`` (radians).
    """
    df = df.copy()

    # orientation vs movement
    raw_diff = np.abs(df['o'] - df['dir'])
    df['orientation_diff'] = np.minimum(raw_diff, 360 - raw_diff)

    # ``dir`` is a heading measured from the +y axis towards +x: this file
    # derives velocity_x = s*sin(dir) and velocity_y = s*cos(dir).
    # ``angle_to_ball`` comes from arctan2(dy, dx), i.e. measured from the +x
    # axis towards +y. Convert the heading to that convention before taking the
    # difference; subtracting the raw heading compares angles in two different
    # frames.
    dir_rad = np.radians(df['dir'].fillna(0.0))
    heading_rad = np.pi / 2.0 - dir_rad
    df['velocity_alignment'] = np.cos(df['angle_to_ball'] - heading_rad)
    return df


def compute_geometric_endpoint(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with a geometric guess of where each row ends up.

    Adds ``geo_time_to_endpoint`` (``num_frames_output`` / 10 when available,
    else 3.0 seconds) and ``geo_endpoint_x`` / ``geo_endpoint_y``: the position
    after that time at the current velocity. Rows whose ``player_role`` is
    'Targeted Receiver' get the ball landing point instead, when both the role
    and ``ball_land_x`` columns exist. Endpoints are clipped to
    [0, 120] x [0, 53.3].
    """
    out = df.copy()

    if 'num_frames_output' in df.columns:
        horizon = out['num_frames_output'] / 10.0
    else:
        horizon = pd.Series(3.0, index=out.index)
    out['geo_time_to_endpoint'] = horizon

    # default: momentum projection
    for axis in ('x', 'y'):
        out[f'geo_endpoint_{axis}'] = out[axis] + out[f'velocity_{axis}'] * horizon

    # targeted receiver -> ball landing point
    if 'player_role' in df.columns and 'ball_land_x' in df.columns:
        is_target = out['player_role'] == 'Targeted Receiver'
        for axis in ('x', 'y'):
            out.loc[is_target, f'geo_endpoint_{axis}'] = out.loc[is_target, f'ball_land_{axis}']

    # clip to field
    for axis, upper in (('x', 120.0), ('y', 53.3)):
        column = f'geo_endpoint_{axis}'
        out[column] = out[column].clip(0.0, upper)
    return out


def add_geometric_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add features describing the straight path to the geometric endpoint.

    Calls ``compute_geometric_endpoint`` and then derives, per row: the vector
    and distance to the endpoint, the constant velocity and constant
    acceleration that would cover it in the available time, the gap between
    that velocity and the current one, and the cosine-like alignment of the
    current velocity with the path. Small offsets (0.1) keep the divisions
    finite.
    """
    out = compute_geometric_endpoint(df)
    vel_x, vel_y = out['velocity_x'], out['velocity_y']

    # vector to geometric endpoint
    to_end_x = out['geo_endpoint_x'] - out['x']
    to_end_y = out['geo_endpoint_y'] - out['y']
    path_len = np.sqrt(to_end_x**2 + to_end_y**2)
    out['geo_vector_x'] = to_end_x
    out['geo_vector_y'] = to_end_y
    out['geo_distance'] = path_len

    # required velocity to reach endpoint (missing time treated as 3.0 s)
    time_left = out['geo_time_to_endpoint'].fillna(3.0) + 0.1
    need_vx = to_end_x / time_left
    need_vy = to_end_y / time_left
    out['geo_required_vx'] = need_vx
    out['geo_required_vy'] = need_vy

    # velocity error relative to geometric path
    gap_x = need_vx - vel_x
    gap_y = need_vy - vel_y
    out['geo_velocity_error_x'] = gap_x
    out['geo_velocity_error_y'] = gap_y
    out['geo_velocity_error'] = np.sqrt(gap_x**2 + gap_y**2)

    # required constant acceleration to endpoint a = 2*dx/t^2, bounded to +/-10
    time_left_sq = time_left * time_left
    out['geo_required_ax'] = (2.0 * to_end_x / time_left_sq).clip(-10.0, 10.0)
    out['geo_required_ay'] = (2.0 * to_end_y / time_left_sq).clip(-10.0, 10.0)

    # alignment of the current velocity with the geometric path
    speed = np.sqrt(vel_x**2 + vel_y**2) + 0.1
    unit_x = to_end_x / (path_len + 0.1)
    unit_y = to_end_y / (path_len + 0.1)
    out['geo_alignment'] = (vel_x * unit_x + vel_y * unit_y) / speed
    return out

def height_to_inches(h):
    """Convert a 'feet-inches' height such as '6-2' to total inches.

    Anything that does not parse as exactly two integers separated by '-'
    (missing values included) yields the fallback 72.0.
    """
    try:
        feet, inches = [int(part) for part in str(h).split('-')]
    except Exception:
        return 72.0
    return feet * 12 + inches


def engineer_physics_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with per-row kinematic, ball-relative and role features.

    Heading convention: ``dir`` (degrees) is decomposed with ``sin`` for the x
    component and ``cos`` for the y component. Velocity and acceleration both
    use this decomposition, i.e. ``a`` is treated as acting along ``dir``.

    Requires: player_height, player_weight, dir, s, a, x, y, ball_land_x,
    ball_land_y, player_role, player_side.

    Adds: height_inches, bmi, velocity_x/y, acceleration_x/y, dist_to_ball,
    angle_to_ball (radians, from arctan2(dy, dx)), velocity/acceleration
    components parallel and perpendicular to the line to the ball,
    speed_squared, accel_magnitude, momentum_x/y, kinetic_energy and 0/1
    indicator columns for role and side.
    """
    df = df.copy()
    df['height_inches'] = df['player_height'].map(height_to_inches)
    df['bmi'] = (df['player_weight'] / (df['height_inches']**2)) * 703.0

    # Missing headings are treated as 0 degrees.
    dir_rad = np.radians(df['dir'].fillna(0.0))
    sin_dir = np.sin(dir_rad)
    cos_dir = np.cos(dir_rad)
    df['velocity_x'] = df['s'] * sin_dir
    df['velocity_y'] = df['s'] * cos_dir
    # Same decomposition as velocity. Using cos for x and sin for y here would
    # mirror the acceleration vector about the line y = x relative to velocity.
    df['acceleration_x'] = df['a'] * sin_dir
    df['acceleration_y'] = df['a'] * cos_dir

    # Vector from the player to the ball landing point
    dx = df['ball_land_x'] - df['x']
    dy = df['ball_land_y'] - df['y']
    dist = np.sqrt(dx**2 + dy**2)
    df['dist_to_ball'] = dist
    df['angle_to_ball'] = np.arctan2(dy, dx)

    # Unit vector towards the ball (u) and its 90-degree rotation (v);
    # the 1e-6 keeps the division finite when the player is on the spot.
    ux = dx / (dist + 1e-6)
    uy = dy / (dist + 1e-6)
    vx = -uy
    vy = ux

    df['velocity_parallel'] = df['velocity_x'] * ux + df['velocity_y'] * uy
    df['velocity_perpendicular'] = df['velocity_x'] * vx + df['velocity_y'] * vy
    df['acceleration_parallel'] = df['acceleration_x'] * ux + df['acceleration_y'] * uy
    df['acceleration_perpendicular'] = df['acceleration_x'] * vx + df['acceleration_y'] * vy

    df['speed_squared'] = df['s'] ** 2
    df['accel_magnitude'] = np.sqrt(df['acceleration_x']**2 + df['acceleration_y']**2)
    df['momentum_x'] = df['player_weight'] * df['velocity_x']
    df['momentum_y'] = df['player_weight'] * df['velocity_y']
    df['kinetic_energy'] = 0.5 * df['player_weight'] * df['speed_squared']

    # 0/1 indicators
    df['role_targeted_receiver'] = (df['player_role'] == 'Targeted Receiver').astype(int)
    df['role_defensive_coverage'] = (df['player_role'] == 'Defensive Coverage').astype(int)
    df['role_passer'] = (df['player_role'] == 'Passer').astype(int)
    df['side_offense'] = (df['player_side'] == 'Offense').astype(int)
    return df


def add_sequence_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` sorted by player and frame, with lag, rolling and delta features.

    Per (game_id, play_id, nfl_id) group, for every listed source column that
    exists in ``df``:
      * ``<col>_lag1`` .. ``<col>_lag5``: the value 1..5 rows earlier,
      * ``<col>_rolling_mean_<w>`` / ``<col>_rolling_std_<w>`` for w in 3, 5
        (``min_periods=1``, so the first std of each group is NaN),
      * ``<col>_delta``: first difference, for the four velocity columns.
    """
    keys = ['game_id', 'play_id', 'nfl_id']
    out = df.sort_values(keys + ['frame_id']).copy()

    lag_sources = ['x', 'y', 'velocity_x', 'velocity_y', 's', 'a',
                   'velocity_parallel', 'velocity_perpendicular',
                   'acceleration_parallel', 'acceleration_perpendicular']
    roll_sources = ['x', 'y', 'velocity_x', 'velocity_y', 's',
                    'velocity_parallel', 'velocity_perpendicular']
    delta_sources = ['velocity_x', 'velocity_y', 'velocity_parallel', 'velocity_perpendicular']

    # Generated names never collide with the source names, so availability can
    # be decided once up front.
    lag_present = [c for c in lag_sources if c in out.columns]
    roll_present = [c for c in roll_sources if c in out.columns]
    delta_present = [c for c in delta_sources if c in out.columns]

    for lag in range(1, 6):
        for col in lag_present:
            out[f'{col}_lag{lag}'] = out.groupby(keys)[col].shift(lag)

    def _flatten(rolled):
        # drop the three group levels so the result lines up with ``out``
        return rolled.reset_index(level=[0, 1, 2], drop=True)

    for window in (3, 5):
        for col in roll_present:
            out[f'{col}_rolling_mean_{window}'] = _flatten(
                out.groupby(keys)[col].rolling(window, min_periods=1).mean())
            out[f'{col}_rolling_std_{window}'] = _flatten(
                out.groupby(keys)[col].rolling(window, min_periods=1).std())

    for col in delta_present:
        out[f'{col}_delta'] = out.groupby(keys)[col].diff()
    return out


def add_formation_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with per-side formation features.

    Rows are grouped by (game_id, play_id, frame_id, player_side). Adds the
    group centroid, the spread of the group (std of y as ``team_width``, std of
    x as ``team_length``; 0.0 when undefined), each row's offset from the
    centroid, and the sine/cosine of the bearing from the centroid to the ball
    landing point.
    """
    out = df.copy()
    side_frame = out.groupby(['game_id', 'play_id', 'frame_id', 'player_side'])

    for axis in ('x', 'y'):
        out[f'team_centroid_{axis}'] = side_frame[axis].transform('mean')
    for name, axis in (('team_width', 'y'), ('team_length', 'x')):
        out[name] = side_frame[axis].transform('std').fillna(0.0)

    for axis in ('x', 'y'):
        out[f'rel_centroid_{axis}'] = out[axis] - out[f'team_centroid_{axis}']

    to_ball_x = out['ball_land_x'] - out['team_centroid_x']
    to_ball_y = out['ball_land_y'] - out['team_centroid_y']
    bearing = np.arctan2(to_ball_y, to_ball_x)
    out['formation_bearing_sin'] = np.sin(bearing)
    out['formation_bearing_cos'] = np.cos(bearing)
    return out


def compute_neighbor_embeddings(input_df: pd.DataFrame) -> pd.DataFrame:
    """Summarise each player's nearest neighbours at that player's last frame.

    For every (game_id, play_id, nfl_id), the other players present in the same
    play at the player's last ``frame_id`` are ranked by distance. The closest
    ``K_NEIGHBORS`` within ``RADIUS_LIMIT`` are kept and weighted by
    ``exp(-dist / TAU)``, normalised over the kept neighbours. Allies share the
    player's ``player_side``; everyone else counts as an opponent.

    Requires: game_id, play_id, nfl_id, frame_id, x, y, velocity_x,
    velocity_y, player_side.

    Returns:
        One row per player that has at least one kept neighbour, with the keys
        plus: weighted relative position/velocity sums for allies and
        opponents (``gnn_*_d{x,y,vx,vy}_mean``), ``gnn_ally_count``,
        ``gnn_opp_count`` and the nearest ally/opponent distance
        (``RADIUS_LIMIT`` when that kind of neighbour is absent). Players
        without any kept neighbour do not appear in the result.
    """
    keys = ['game_id', 'play_id', 'nfl_id']
    cols = keys + ['frame_id', 'x', 'y', 'velocity_x', 'velocity_y', 'player_side']
    src = input_df[cols].copy()

    last_frames = (src.sort_values(keys + ['frame_id'])
                   .groupby(keys, as_index=False)
                   .tail(1)
                   .rename(columns={'frame_id': 'last_frame_id'}))

    # Pair every player's last frame with all rows of the same play and frame.
    candidates = src.rename(columns={'frame_id': 'nb_frame_id', 'nfl_id': 'nfl_id_nb',
                                     'x': 'x_nb', 'y': 'y_nb',
                                     'velocity_x': 'vx_nb', 'velocity_y': 'vy_nb',
                                     'player_side': 'player_side_nb'})
    neighbors = last_frames.merge(candidates,
                                  left_on=['game_id', 'play_id', 'last_frame_id'],
                                  right_on=['game_id', 'play_id', 'nb_frame_id'], how='left')
    # Drop the self-pair. Each filter below takes an explicit copy so the
    # column assignments that follow never write into a slice of a bigger frame.
    neighbors = neighbors[neighbors['nfl_id_nb'] != neighbors['nfl_id']].copy()

    neighbors['dx'] = neighbors['x_nb'] - neighbors['x']
    neighbors['dy'] = neighbors['y_nb'] - neighbors['y']
    neighbors['dvx'] = neighbors['vx_nb'] - neighbors['velocity_x']
    neighbors['dvy'] = neighbors['vy_nb'] - neighbors['velocity_y']
    neighbors['dist'] = np.sqrt(neighbors['dx']**2 + neighbors['dy']**2)

    # Keep finite, non-zero distances inside the search radius
    in_range = (np.isfinite(neighbors['dist'])
                & (neighbors['dist'] > 1e-6)
                & (neighbors['dist'] <= RADIUS_LIMIT))
    neighbors = neighbors[in_range].copy()
    neighbors['is_ally'] = (neighbors['player_side_nb'].fillna("") == neighbors['player_side'].fillna("")).astype(float)

    # Closest K per player; ties are broken by row order
    neighbors['rank'] = neighbors.groupby(keys)['dist'].rank(method='first')
    neighbors = neighbors[neighbors['rank'] <= K_NEIGHBORS].copy()

    # Distance-decayed weights, normalised over the kept neighbours
    neighbors['weight'] = np.exp(-neighbors['dist'] / TAU)
    sumw = neighbors.groupby(keys)['weight'].transform('sum')
    neighbors['weight_norm'] = np.where(sumw > 0, neighbors['weight'] / sumw, 0.0)

    neighbors['weight_ally'] = neighbors['weight_norm'] * neighbors['is_ally']
    neighbors['weight_opp'] = neighbors['weight_norm'] * (1.0 - neighbors['is_ally'])

    for c in ['dx', 'dy', 'dvx', 'dvy']:
        neighbors[f'{c}_ally_weighted'] = neighbors[c] * neighbors['weight_ally']
        neighbors[f'{c}_opp_weighted'] = neighbors[c] * neighbors['weight_opp']

    # Masked helper columns let the counts and nearest distances use built-in
    # group reductions instead of per-group Python callbacks. A group with no
    # ally (or no opponent) reduces to NaN and is filled with RADIUS_LIMIT below.
    neighbors['is_opp'] = 1.0 - neighbors['is_ally']
    neighbors['dist_ally'] = neighbors['dist'].where(neighbors['is_ally'] > 0.5)
    neighbors['dist_opp'] = neighbors['dist'].where(neighbors['is_ally'] < 0.5)

    emb = neighbors.groupby(keys).agg(
        gnn_ally_dx_mean=('dx_ally_weighted', 'sum'),
        gnn_ally_dy_mean=('dy_ally_weighted', 'sum'),
        gnn_ally_dvx_mean=('dvx_ally_weighted', 'sum'),
        gnn_ally_dvy_mean=('dvy_ally_weighted', 'sum'),
        gnn_opp_dx_mean=('dx_opp_weighted', 'sum'),
        gnn_opp_dy_mean=('dy_opp_weighted', 'sum'),
        gnn_opp_dvx_mean=('dvx_opp_weighted', 'sum'),
        gnn_opp_dvy_mean=('dvy_opp_weighted', 'sum'),
        gnn_ally_count=('is_ally', 'sum'),
        gnn_opp_count=('is_opp', 'sum'),
        gnn_ally_dist_min=('dist_ally', 'min'),
        gnn_opp_dist_min=('dist_opp', 'min')
    ).reset_index()

    for c in ['gnn_ally_dx_mean', 'gnn_ally_dy_mean', 'gnn_ally_dvx_mean', 'gnn_ally_dvy_mean',
              'gnn_opp_dx_mean', 'gnn_opp_dy_mean', 'gnn_opp_dvx_mean', 'gnn_opp_dvy_mean',
              'gnn_ally_count', 'gnn_opp_count']:
        emb[c] = emb[c].fillna(0.0)
    for c in ['gnn_ally_dist_min', 'gnn_opp_dist_min']:
        emb[c] = emb[c].fillna(RADIUS_LIMIT)
    return emb

# Physics baseline

def steered_kinematics_baseline(x, y, vx, vy, ball_x, ball_y, dt, v_max=V_MAX, a_max=A_MAX, turn_rate_max_deg=TURN_MAX_DEG):
    """Physics baseline: steer from the current heading towards the ball and integrate.

    The heading is turned towards the ball landing point by at most
    ``turn_rate_max_deg * dt`` degrees, the speed grows by ``a_max * dt`` up to
    ``v_max``, and the position advances by ``dt`` times the average of the
    current and the steered velocity.

    All headings inside this function use one convention: the angle ``h`` for
    which ``vx = speed * sin(h)`` and ``vy = speed * cos(h)``.

    Args:
        x, y: current position (scalars or equally shaped arrays).
        vx, vy: current velocity components.
        ball_x, ball_y: ball landing point.
        dt: prediction horizon in seconds.
        v_max: speed cap.
        a_max: speed gain per second.
        turn_rate_max_deg: heading change limit in degrees per second.

    Returns:
        (pred_x, pred_y), clipped to [0, 120] x [0, 53.3].
    """
    eps = 1e-6
    speed = np.sqrt(vx**2 + vy**2)

    # Heading towards the ball in the same sin-for-x / cos-for-y convention used
    # to decode ``new_dir`` below. arctan2(dy, dx) would be the mirror image of
    # this angle and steer the player away from the intended direction.
    desired_dir = np.arctan2(ball_x - x, ball_y - y)
    # A (near-)stationary player has no heading of its own: face the ball.
    cur_dir = np.where(speed > eps, np.arctan2(vx, vy), desired_dir)

    # Signed shortest turn in [-pi, pi), limited by the turn rate
    ang_diff = (desired_dir - cur_dir + np.pi) % (2 * np.pi) - np.pi
    max_turn = np.radians(turn_rate_max_deg) * dt
    ang_step = np.clip(ang_diff, -max_turn, max_turn)
    new_dir = cur_dir + ang_step

    target_speed = np.minimum(v_max, speed + a_max * dt)
    vx_new = target_speed * np.sin(new_dir)
    vy_new = target_speed * np.cos(new_dir)

    # Trapezoidal step: average of the current and the steered velocity
    pred_x = x + 0.5 * (vx + vx_new) * dt
    pred_y = y + 0.5 * (vy + vy_new) * dt

    pred_x = np.clip(pred_x, 0.0, 120.0)
    pred_y = np.clip(pred_y, 0.0, 53.3)
    return pred_x, pred_y

# Feature list

def build_feature_list(df: pd.DataFrame):
    """Return the model feature names that are present in ``df``, in a fixed order.

    Order: base features, geometric features, every ``gnn_*`` column (in the
    order they appear in ``df``), lag features (lag 1..5), rolling mean/std
    features (windows 3 and 5) and velocity deltas. Names missing from ``df``
    are skipped.
    """
    base = [
        'x', 'y', 's', 'a', 'o', 'dir',
        'velocity_x', 'velocity_y', 'acceleration_x', 'acceleration_y',
        'velocity_parallel', 'velocity_perpendicular',
        'acceleration_parallel', 'acceleration_perpendicular',
        'player_weight', 'height_inches', 'bmi',
        'ball_land_x', 'ball_land_y', 'dist_to_ball', 'angle_to_ball',
        'speed_squared', 'accel_magnitude', 'momentum_x', 'momentum_y', 'kinetic_energy',
        'role_targeted_receiver', 'role_defensive_coverage', 'role_passer', 'side_offense',
        'team_centroid_x', 'team_centroid_y', 'team_width', 'team_length',
        'rel_centroid_x', 'rel_centroid_y', 'formation_bearing_sin', 'formation_bearing_cos',
        'delta_frames', 'delta_t', 'frame_id', 'waypoint_idx', 'is_waypoint',
        'pressure'
    ]
    # Geometric features (inspired by leader notebook)
    geo = [
        'geo_time_to_endpoint', 'geo_endpoint_x', 'geo_endpoint_y',
        'geo_vector_x', 'geo_vector_y', 'geo_distance',
        'geo_required_vx', 'geo_required_vy',
        'geo_velocity_error_x', 'geo_velocity_error_y', 'geo_velocity_error',
        'geo_required_ax', 'geo_required_ay', 'geo_alignment'
    ]
    gnn = [name for name in df.columns if name.startswith('gnn_')]

    lag_sources = ['x', 'y', 'velocity_x', 'velocity_y', 's', 'a',
                   'velocity_parallel', 'velocity_perpendicular',
                   'acceleration_parallel', 'acceleration_perpendicular']
    roll_sources = ['x', 'y', 'velocity_x', 'velocity_y', 's',
                    'velocity_parallel', 'velocity_perpendicular']

    lags = [f'{src}_lag{lag}' for lag in range(1, 6) for src in lag_sources]
    rolls = [f'{src}_rolling_{stat}_{window}'
             for window in (3, 5)
             for src in roll_sources
             for stat in ('mean', 'std')]
    deltas = ['velocity_x_delta', 'velocity_y_delta', 'velocity_parallel_delta', 'velocity_perpendicular_delta']

    candidates = base + geo + gnn + lags + rolls + deltas
    return [name for name in candidates if name in df.columns]

# Prepare training data

def prepare_training_data(input_df: pd.DataFrame, output_df: pd.DataFrame) -> pd.DataFrame:
    """Build the training table: one row per output frame, with last-observed-frame features.

    ``input_df`` goes through the feature pipeline; each player's last observed
    frame (plus the neighbour embedding and ``pressure``) is then joined onto
    every row of ``output_df``, whose ``x`` / ``y`` become ``target_x`` /
    ``target_y``. Horizon columns (``delta_frames``, ``delta_t``,
    ``waypoint_idx``, ``is_waypoint``) are derived from the frame ids.
    """
    keys = ['game_id', 'play_id', 'nfl_id']

    feats = input_df
    for step in (engineer_physics_features,
                 add_sequence_features,
                 add_formation_features,
                 add_time_features,
                 add_motion_ema_features,
                 add_orientation_features,
                 add_geometric_features):
        feats = step(feats)

    neighbor_emb = compute_neighbor_embeddings(feats)

    # One row per player: the last observed frame
    snapshot = (feats.sort_values(keys + ['frame_id'])
                .groupby(keys, as_index=False)
                .tail(1)
                .rename(columns={'frame_id': 'last_frame_id'}))
    snapshot = snapshot.merge(neighbor_emb, on=keys, how='left')
    # pressure from nearest opponent distance
    if 'gnn_opp_dist_min' in snapshot.columns:
        snapshot['pressure'] = 1.0 / (snapshot['gnn_opp_dist_min'] + 0.1)

    targets = output_df.rename(columns={'x': 'target_x', 'y': 'target_y'}).copy()
    train = targets.merge(snapshot, on=keys, how='left')

    # NOTE(review): this assumes output frame_id continues the numbering of the
    # input frames. If output frames restart at 1, most differences are negative
    # and get clipped to 0; confirm against the data description.
    horizon = (train['frame_id'] - train['last_frame_id']).clip(lower=0)
    train['delta_frames'] = horizon
    train['delta_t'] = horizon / 10.0
    train['waypoint_idx'] = (np.maximum(horizon - 1, 0) // 10).astype(int)
    train['is_waypoint'] = (horizon % 10 == 0).astype(int)
    return train

# Train CatBoost residual models

def train_catboost_models(training_data: pd.DataFrame):
    """Fit per-fold CatBoost models on the residuals of the physics baseline.

    Targets are ``target_x - baseline_x`` and ``target_y - baseline_y``. Folds
    come from ``GroupKFold`` with one group per (game_id, play_id), so a play
    never appears on both sides of a split. Each fold trains one model per
    axis with the fold's validation part as ``eval_set`` and prints the fold
    RMSE of the clipped final positions.

    Returns:
        (models_x, models_y, features): the fitted models per axis, one per
        fold, and the feature names in column order.
    """
    features = build_feature_list(training_data)
    X = training_data[features].fillna(0).values
    truth = {'x': training_data['target_x'].values,
             'y': training_data['target_y'].values}

    state_cols = ('x', 'y', 'velocity_x', 'velocity_y', 'ball_land_x', 'ball_land_y', 'delta_t')
    base_x, base_y = steered_kinematics_baseline(*(training_data[c].values for c in state_cols))
    baseline = {'x': base_x, 'y': base_y}
    residual = {axis: truth[axis] - baseline[axis] for axis in ('x', 'y')}
    field_max = {'x': 120, 'y': 53.3}

    groups = training_data['game_id'].astype(str) + '_' + training_data['play_id'].astype(str)
    splitter = GroupKFold(n_splits=N_FOLDS)

    params = dict(
        iterations=ITERATIONS,
        learning_rate=LEARNING_RATE,
        depth=DEPTH,
        l2_leaf_reg=L2_REG,
        random_seed=SEED,
        task_type='GPU' if USE_GPU else 'CPU',
        loss_function='RMSE',
        early_stopping_rounds=EARLY_STOPPING,
        verbose=200,
    )

    models = {'x': [], 'y': []}
    fold_scores = []

    for fold, (tr, va) in enumerate(splitter.split(X, groups=groups), 1):
        print(f"\nFold {fold}/{N_FOLDS}")
        X_tr, X_va = X[tr], X[va]

        # Fit x first, then y
        fitted = {}
        for axis in ('x', 'y'):
            model = CatBoostRegressor(**params)
            model.fit(CatPool(X_tr, residual[axis][tr]),
                      eval_set=CatPool(X_va, residual[axis][va]))
            models[axis].append(model)
            fitted[axis] = model

        # Fold score on final positions: residual + baseline, clipped to the field
        mse = {}
        for axis in ('x', 'y'):
            position = np.clip(fitted[axis].predict(X_va) + baseline[axis][va], 0, field_max[axis])
            mse[axis] = mean_squared_error(truth[axis][va], position)
        rmse = math.sqrt(0.5 * (mse['x'] + mse['y']))
        print(f"Fold {fold} RMSE: {rmse:.5f}")
        fold_scores.append(rmse)

    print(f"\nCV Scores: {[f'{s:.5f}' for s in fold_scores]}")
    print(f"Mean CV RMSE: {np.mean(fold_scores):.5f} ± {np.std(fold_scores):.5f}")
    return models['x'], models['y'], features

# Predict test

def predict_test(test_input: pd.DataFrame, test_template: pd.DataFrame, models_x, models_y, feature_columns):
    """Predict every row of ``test_template``; same contract as ``predict_test_template``.

    This function used to carry its own copy of the prediction pipeline, which
    joined the last-frame features back onto ``test_input``. Because both
    frames share their tracking columns, pandas suffixed all of them
    (``x`` became ``x_x`` / ``x_y`` and so on), the "missing feature" loop then
    replaced those features with zeros, and the ids were built from observed
    frames rather than from the frames to predict. The template-driven
    implementation has none of these problems, so this is now a thin wrapper
    around it.

    Args:
        test_input: observed tracking rows.
        test_template: rows to predict (game_id, play_id, nfl_id, frame_id).
        models_x, models_y: fitted residual models per axis.
        feature_columns: feature names in training order.

    Returns:
        DataFrame with columns ``id``, ``x``, ``y`` in template order and a
        fresh 0..n-1 index.
    """
    submission = predict_test_template(test_input, test_template, models_x, models_y, feature_columns)
    return submission.reset_index(drop=True)

# Load data exactly like Sample Notebooks
print("\n[1/3] Loading data...")
train_input_files = [DATA_DIR / f"train/input_2023_w{w:02d}.csv" for w in range(1, 19)]
train_output_files = [DATA_DIR / f"train/output_2023_w{w:02d}.csv" for w in range(1, 19)]

# Weeks whose file is missing are skipped; fail early with a clear message when
# nothing is there at all instead of an opaque concat error.
found_input_files = [f for f in train_input_files if f.exists()]
found_output_files = [f for f in train_output_files if f.exists()]
if not found_input_files or not found_output_files:
    raise FileNotFoundError(f"No weekly training files found under {DATA_DIR / 'train'}")

# ignore_index=True: every weekly CSV numbers its rows from 0, so a plain concat
# would leave duplicate row labels for the index-aligned feature code downstream.
train_input = pd.concat([pd.read_csv(f) for f in found_input_files], ignore_index=True)
train_output = pd.concat([pd.read_csv(f) for f in found_output_files], ignore_index=True)

test_input = pd.read_csv(DATA_DIR / "test_input.csv")
test_template = pd.read_csv(DATA_DIR / "test.csv")

print("[2/3] Preparing training data and training CatBoost (residuals)...")
training_data = prepare_training_data(train_input, train_output)
models_x, models_y, feature_columns = train_catboost_models(training_data)

print("[3/3] Predicting on test and writing submission.csv ...")
submission = predict_test_template(test_input, test_template, models_x, models_y, feature_columns)
submission = submission[['id', 'x', 'y']].astype({'id': str, 'x': float, 'y': float})
submission.to_csv(SAVE_PATH, index=False)
print(f"Saved submission to {SAVE_PATH}")

# Kaggle expects a root-level submission.csv. Compare resolved paths so that a
# SAVE_PATH already pointing at ./submission.csv is not copied onto itself, and
# report a failed copy instead of hiding it (the primary file is already saved).
root_submission = Path('submission.csv')
if SAVE_PATH.resolve() != root_submission.resolve():
    import shutil
    try:
        shutil.copy(str(SAVE_PATH), str(root_submission))
    except OSError as exc:
        print(f"Could not copy submission to {root_submission}: {exc}")

submission.head()


ModuleNotFoundError: No module named 'catboost'

# After Action

In [None]:
# =============================================================================
# MODEL PERFORMANCE ANALYSIS & VISUALIZATION
# =============================================================================
print("\n" + "="*80)
print("MODEL PERFORMANCE ANALYSIS ON TRAINING DATA")
print("="*80)

# Reuse the table the models were trained on instead of running the whole
# feature pipeline a second time. The copy keeps the columns added below out of
# `training_data`, so this cell can be re-run safely.
print("\n[1/4] Preparing training data for model evaluation...")
train_eval_data = training_data.copy()

# Get predictions on training data.
# NOTE: these are mostly in-sample predictions (every row was in the training
# part of all but one fold), so the errors below are optimistic compared with
# the cross-validation scores.
print("[2/4] Generating predictions on training data...")
features = list(feature_columns)  # exactly the columns the models were fitted on
X_train = train_eval_data[features].fillna(0).values

# Get baseline predictions
baseline_x, baseline_y = steered_kinematics_baseline(
    train_eval_data['x'].values,
    train_eval_data['y'].values,
    train_eval_data['velocity_x'].values,
    train_eval_data['velocity_y'].values,
    train_eval_data['ball_land_x'].values,
    train_eval_data['ball_land_y'].values,
    train_eval_data['delta_t'].values
)

# Get model predictions (residuals), averaged over the folds
pred_rx = np.mean([m.predict(X_train) for m in models_x], axis=0)
pred_ry = np.mean([m.predict(X_train) for m in models_y], axis=0)

# Combine baseline + residuals for final predictions
pred_x = np.clip(pred_rx + baseline_x, 0, 120)
pred_y = np.clip(pred_ry + baseline_y, 0, 53.3)

# Calculate errors
actual_x = train_eval_data['target_x'].values
actual_y = train_eval_data['target_y'].values
error_x = pred_x - actual_x
error_y = pred_y - actual_y
error_distance = np.sqrt(error_x**2 + error_y**2)

# Add predictions and errors to dataframe
train_eval_data['pred_x'] = pred_x
train_eval_data['pred_y'] = pred_y
train_eval_data['error_x'] = error_x
train_eval_data['error_y'] = error_y
train_eval_data['error_distance'] = error_distance

print("[3/4] Analyzing worst predictions by player and play...")

# Calculate cumulative residuals per player per play (adjusted for play length).
# Each group is one player in one play.
play_analysis = train_eval_data.groupby(['game_id', 'play_id', 'nfl_id']).agg({
    'error_distance': ['sum', 'mean', 'max', 'count'],
    'delta_t': 'max',  # Play duration
    'player_name': 'first',
    'player_position': 'first',
    'player_role': 'first'
}).round(4)

# Flatten column names
play_analysis.columns = ['cumulative_error', 'mean_error', 'max_error', 'frame_count', 'play_duration', 'player_name', 'position', 'role']

# Adjust for play length: longer plays naturally have higher cumulative errors
# Normalize by play duration and frame count (the 0.1 avoids division by zero)
play_analysis['error_per_second'] = play_analysis['cumulative_error'] / (play_analysis['play_duration'] + 0.1)
play_analysis['error_per_frame'] = play_analysis['cumulative_error'] / (play_analysis['frame_count'] + 0.1)

# Create a composite score that balances cumulative error with play length
play_analysis['problem_score'] = (
    play_analysis['cumulative_error'] * 0.4 +  # Raw cumulative error
    play_analysis['error_per_second'] * 0.3 +  # Error rate per second
    play_analysis['max_error'] * 0.3           # Worst single prediction
)

# Sort by problem score (highest = most problematic)
worst_predictions = play_analysis.sort_values('problem_score', ascending=False).head(20)

print(f"\n🏆 TOP 20 MOST PROBLEMATIC PREDICTIONS:")
print("="*100)
print(f"{'Rank':<4} {'Player':<20} {'Pos':<4} {'Role':<15} {'Game':<8} {'Play':<6} {'CumError':<10} {'MaxError':<10} {'Frames':<6} {'Duration':<8} {'Score':<8}")
print("-"*100)

for i, (idx, row) in enumerate(worst_predictions.iterrows(), 1):
    game_id, play_id, nfl_id = idx
    print(f"{i:<4} {row['player_name']:<20} {row['position']:<4} {row['role']:<15} {game_id:<8} {play_id:<6} {row['cumulative_error']:<10.2f} {row['max_error']:<10.2f} {row['frame_count']:<6.0f} {row['play_duration']:<8.2f} {row['problem_score']:<8.2f}")

# Highest-scoring (game, play, player) entry, looked up once for the report below
worst_row = worst_predictions.iloc[0]
worst_idx = worst_predictions.index[0]

print(f"\n📊 SUMMARY STATISTICS:")
print(f"  • Total plays analyzed: {len(play_analysis):,}")
print(f"  • Mean cumulative error: {play_analysis['cumulative_error'].mean():.2f} yards")
print(f"  • Mean error per second: {play_analysis['error_per_second'].mean():.2f} yards/sec")
print(f"  • Worst single prediction: {play_analysis['max_error'].max():.2f} yards")
print(f"  • Most problematic player: {worst_row['player_name']} (Score: {worst_row['problem_score']:.2f})")

# Show the worst prediction details
worst_details = train_eval_data[
    (train_eval_data['game_id'] == worst_idx[0]) &
    (train_eval_data['play_id'] == worst_idx[1]) &
    (train_eval_data['nfl_id'] == worst_idx[2])
].sort_values('frame_id')

print(f"\n🔍 DETAILED BREAKDOWN OF WORST PREDICTION:")
print(f"   Player: {worst_row['player_name']} ({worst_row['position']})")
print(f"   Game: {worst_idx[0]}, Play: {worst_idx[1]}")
print(f"   Role: {worst_row['role']}")
print(f"   Play Duration: {worst_row['play_duration']:.2f}s ({worst_row['frame_count']:.0f} frames)")
print(f"   Cumulative Error: {worst_row['cumulative_error']:.2f} yards")
print(f"   Max Single Error: {worst_row['max_error']:.2f} yards")

# Only the first 10 predicted frames are listed
print(f"\n   Frame-by-frame errors:")
print(f"   {'Frame':<6} {'Actual X':<8} {'Pred X':<8} {'Actual Y':<8} {'Pred Y':<8} {'Error':<8}")
print(f"   {'-'*6} {'-'*8} {'-'*8} {'-'*8} {'-'*8} {'-'*8}")
for _, row in worst_details.head(10).iterrows():
    print(f"   {row['frame_id']:<6.0f} {row['target_x']:<8.2f} {row['pred_x']:<8.2f} {row['target_y']:<8.2f} {row['pred_y']:<8.2f} {row['error_distance']:<8.2f}")

print("\n" + "="*80)


In [None]:
# =============================================================================
# PLAYER MOVEMENT VISUALIZATION
# =============================================================================
print("\n" + "="*80)
print("🏈 PLAYER MOVEMENT VISUALIZATION")
print("="*80)

# Additional imports for visualization
import matplotlib.pyplot as plt
from matplotlib import animation, rc
from matplotlib.patches import Rectangle
from matplotlib.lines import Line2D
import ipywidgets as widgets
from IPython.display import display, clear_output, HTML

rc('animation', html='jshtml')

# Marker color per player role. Roles missing from this mapping are drawn
# in gray by the plotting code; the insertion order is the legend order.
ROLE_COLORS = dict([
    ("Targeted Receiver", "red"),
    ("Passer", "blue"),
    ("Defensive Coverage", "darkred"),
    ("Other Route Runner", "orange"),
])

def draw_field(ax, start_x=0.0, end_x=120.0, play_direction="right"):
    """Draw an NFL field with endzones, yard lines, and hash marks.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes to draw on. Its limits are set to the field extent, the aspect
        is set to equal and the axis frame is turned off.
    start_x, end_x : float
        x-coordinates of the outer edges of the field. The first and last
        10 units of that span are drawn as endzones.
    play_direction : str
        ``"right"`` colors the left endzone light blue and the right one
        light red; any other value swaps the two colors.
    """
    field_width = 53.3
    field_len = end_x - start_x
    left_goal, right_goal = start_x + 10.0, end_x - 10.0
    # Distance between the goal lines; 100 for the default field. Used for the
    # yard numbers so they stay symmetric if start_x/end_x are changed.
    goal_to_goal = right_goal - left_goal

    # Field + endzones
    if play_direction == "right":
        left_color, right_color = "lightblue", "#f4cccc"
    else:
        left_color, right_color = "#f4cccc", "lightblue"
    ax.add_patch(Rectangle((start_x, 0), field_len, field_width, facecolor='forestgreen', edgecolor='black', lw=2, zorder=0))
    ax.add_patch(Rectangle((start_x, 0), 10, field_width, facecolor=left_color, zorder=1))
    ax.add_patch(Rectangle((end_x - 10, 0), 10, field_width, facecolor=right_color, zorder=1))

    # Sideline ticks: 0.5 long, inset 0.4 from each sideline. The upper tick is
    # mirrored so it points into the field instead of past the field edge.
    tick_inset, tick_len = 0.4, 0.5
    for x in np.arange(start_x, end_x + 0.1, 1.0):
        ax.plot([x, x], [tick_inset, tick_inset + tick_len], color='white', lw=0.4, zorder=2)
        ax.plot([x, x], [field_width - tick_inset - tick_len, field_width - tick_inset],
                color='white', lw=0.4, zorder=2)

    # Yard lines
    for x in np.arange(start_x + 10.0, end_x, 10.0):
        ax.plot([x, x], [0, field_width], color='white', lw=1.6, zorder=2)

    # Yard numbers on the interior 10-unit lines only. The range stops short of
    # the right goal line so no stray "0" is printed there (the left goal line
    # was never labelled either).
    for p in np.arange(left_goal + 10.0, right_goal - 0.01, 10.0):
        d = p - left_goal
        label = int(round(min(d, goal_to_goal - d)))
        ax.text(p, 5, str(label), color='white', fontsize=12, ha='center', va='center')
        ax.text(p, field_width - 5, str(label), color='white', fontsize=12, ha='center', va='center')

    # Inbounds hash marks (subtle, semi-transparent)
    hash_y = [18.37, 34.93]
    for x in np.arange(left_goal, right_goal + 0.1, 1.0):
        for y in hash_y:
            ax.plot([x, x], [y, y + 0.4], color='white', lw=0.8, alpha=0.5, zorder=2)

    # Limits / aspect
    ax.set_xlim(start_x, end_x)
    ax.set_ylim(0, field_width)
    ax.set_aspect('equal')
    ax.axis('off')

def draw_legend(ax):
    """Add a frameless, single-row legend of the role colors above ``ax``."""
    # One round proxy marker per role; the white line color hides the line
    # segment so only the colored dot shows in the legend.
    role_handles = []
    for role_name, role_color in ROLE_COLORS.items():
        role_handles.append(
            Line2D([0], [0], marker='o', color='w', label=role_name,
                   markerfacecolor=role_color, markersize=10)
        )

    ax.legend(
        handles=role_handles,
        loc='upper center',
        bbox_to_anchor=(0.5, 1.07),
        ncol=len(role_handles),
        framealpha=0.9,
        frameon=False,
    )

def visualize_play_with_predictions(game_id, play_id, df_in, df_out, train_eval_data, 
                                   show_predictions=True, figsize=(15, 6), subsample=1):
    """Visualize a play with actual vs predicted trajectories.

    Draws two static panels: the left one shows the last input frame with
    player ids, the right one shows the first frame that has predictions,
    with predicted markers, ground-truth markers and a dashed line labelled
    with the error between each matched pair.

    Parameters
    ----------
    game_id, play_id
        Identify the play; compared against the ``game_id`` / ``play_id``
        columns of all three frames.
    df_in : pd.DataFrame
        Input tracking rows. Reads ``frame_id``, ``play_direction``, ``x``,
        ``y``, ``player_role``, ``nfl_id`` and, when present, ``o``.
    df_out : pd.DataFrame
        Ground-truth rows. Reads ``frame_id``, ``nfl_id``, ``x``, ``y``.
    train_eval_data : pd.DataFrame
        Prediction rows. Reads ``frame_id``, ``nfl_id``, ``pred_x``,
        ``pred_y``, ``player_role`` and ``error_distance``.
    show_predictions : bool
        When False, prediction rows are ignored and the right panel only
        gets the field and role legend.
    figsize : tuple
        Passed to ``plt.subplots``.
    subsample : int
        When greater than 1, only rows whose ``frame_id`` is a multiple of
        this value are kept.

    Returns
    -------
    The figure created by ``plt.subplots`` (it is also shown via
    ``plt.show()``).

    Raises
    ------
    IndexError
        If ``df_in`` has no rows for the requested play.

    Notes
    -----
    Frames are enumerated from the ``frame_id`` values present in ``df_in``;
    prediction and ground-truth rows are only picked up for those ids.
    """
    
    # Get input data
    in_play = df_in.query("game_id==@game_id & play_id==@play_id").sort_values("frame_id")
    out_play = df_out.query("game_id==@game_id & play_id==@play_id").sort_values("frame_id")
    play_dir = in_play.play_direction.iloc[0]
    
    # Get predictions for this play
    play_predictions = train_eval_data[
        (train_eval_data['game_id'] == game_id) & 
        (train_eval_data['play_id'] == play_id)
    ].sort_values('frame_id')
    
    if subsample > 1:
        in_play = in_play[in_play["frame_id"] % subsample == 0]
        if not out_play.empty:
            out_play = out_play[out_play["frame_id"] % subsample == 0]
        play_predictions = play_predictions[play_predictions["frame_id"] % subsample == 0]

    # Build frames. Every point keeps the raw nfl_id; the 'type' field tells
    # the three sources apart. (Formatting the id into a string is fragile:
    # iterrows() upcasts all-numeric rows to float, so the same player can
    # render as "101" in one frame and "101.0" in another.)
    frames = []
    for frame_id in sorted(in_play.frame_id.unique()):
        frame_data = []
        
        # Add actual positions
        frame_in = in_play[in_play.frame_id == frame_id]
        for _, r in frame_in.iterrows():
            frame_data.append({
                'x': r.x, 'y': r.y,
                'color': ROLE_COLORS.get(r.player_role, 'gray'),
                'nfl_id': r.nfl_id,
                'o': getattr(r, "o", None),
                'type': 'actual'
            })
        
        # Add predicted positions if available
        if show_predictions and not play_predictions.empty:
            frame_pred = play_predictions[play_predictions.frame_id == frame_id]
            for _, r in frame_pred.iterrows():
                frame_data.append({
                    'x': r.pred_x, 'y': r.pred_y,
                    'color': ROLE_COLORS.get(r.player_role, 'gray'),
                    'nfl_id': r.nfl_id,
                    'o': None,
                    'type': 'predicted',
                    'error': r.error_distance
                })
        
        # Add output data (future frames)
        if not out_play.empty:
            frame_out = out_play[out_play.frame_id == frame_id]
            for _, r in frame_out.iterrows():
                frame_data.append({
                    'x': r.x, 'y': r.y,
                    'color': 'black',
                    'nfl_id': r.nfl_id,
                    'o': None,
                    'type': 'future'
                })
        
        frames.append(frame_data)

    # Create visualization
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=figsize)
    
    # Left plot: Input frames only
    draw_field(ax1, play_direction=play_dir)
    draw_legend(ax1)
    ax1.set_title(f"Input Frames - Game: {game_id}, Play: {play_id}", fontsize=12, fontweight="bold")
    
    # Right plot: Predictions vs Actual
    draw_field(ax2, play_direction=play_dir)
    draw_legend(ax2)
    ax2.set_title("Predictions vs Actual (Future Frames)", fontsize=12, fontweight="bold")
    
    # Plot static positions
    input_frames = [f for f in frames if any(p['type'] == 'actual' for p in f)]
    pred_frames = [f for f in frames if any(p['type'] == 'predicted' for p in f)]
    
    if input_frames:
        input_data = input_frames[-1]  # Last input frame
        actual_points = [p for p in input_data if p['type'] == 'actual']
        ax1.scatter([p['x'] for p in actual_points],
                    [p['y'] for p in actual_points],
                    c=[p['color'] for p in actual_points], s=100, zorder=5)
        
        # Add player IDs
        for p in actual_points:
            ax1.text(p['x'] + 0.5, p['y'] + 0.5, str(p['nfl_id']), 
                    fontsize=8, ha='center', va='center', zorder=6)
    
    if pred_frames:
        pred_data = pred_frames[0]  # First prediction frame
        predicted_points = [p for p in pred_data if p['type'] == 'predicted']
        future_points = [p for p in pred_data if p['type'] == 'future']

        ax2.scatter([p['x'] for p in predicted_points],
                    [p['y'] for p in predicted_points],
                    c=[p['color'] for p in predicted_points], s=100, marker='o', 
                    alpha=0.7, label='Predicted', zorder=5)
        
        # Add actual future positions
        ax2.scatter([p['x'] for p in future_points],
                    [p['y'] for p in future_points],
                    c=[p['color'] for p in future_points], s=100, marker='x', 
                    alpha=0.9, label='Actual', zorder=5)
        
        # Draw error lines. Pair each prediction with the ground-truth point
        # of the same player by numeric id (101 and 101.0 hash and compare
        # equal); the first ground-truth point per player wins.
        future_by_id = {}
        for p in future_points:
            future_by_id.setdefault(p['nfl_id'], p)
        for p in predicted_points:
            actual_p = future_by_id.get(p['nfl_id'])
            if actual_p is None:
                continue
            ax2.plot([p['x'], actual_p['x']], [p['y'], actual_p['y']], 
                    'r--', alpha=0.5, linewidth=1)
            ax2.text((p['x'] + actual_p['x'])/2, (p['y'] + actual_p['y'])/2, 
                    f"{p['error']:.1f}", fontsize=6, ha='center', va='center',
                    bbox=dict(boxstyle="round,pad=0.2", facecolor='white', alpha=0.8))
        
        # A second ax.legend() call replaces the axes' current legend, so pin
        # the role legend as a plain artist before adding Predicted/Actual.
        role_legend = ax2.get_legend()
        if role_legend is not None:
            ax2.add_artist(role_legend)
        ax2.legend(loc='upper right')
    
    plt.tight_layout()
    plt.show()
    
    return fig

# Usage hint for the helpers defined in this cell.
print(
    "[4/4] Setting up visualization functions...",
    "✅ Visualization functions ready!",
    "\nTo visualize a specific play, use:",
    "visualize_play_with_predictions(game_id, play_id, train_input, train_output, train_eval_data)",
    "\nExample with worst prediction:",
    sep="\n",
)
# The concrete example needs worst_idx from the error-analysis cell; skip it
# if that cell has not been run in this kernel.
if 'worst_idx' in locals():
    print(f"visualize_play_with_predictions({worst_idx[0]}, {worst_idx[1]}, train_input, train_output, train_eval_data)")

print("", "="*80, sep="\n")