In [1]:
import os

# Where is the kernel running? Relative paths used below resolve against this directory.
working_dir = os.getcwd()
print("Current working dir:", working_dir)

# Confirm the saved dataset is reachable from here before any cell tries to load it.
dataset_found = os.path.exists("processed_data/transformer_dataset")
print("Dataset path exists:", dataset_found)

Current working dir: /Users/PetersMacBook/Documents/GitHub/ST456 - Deep Learning/project/2025-projects-shallowlearning/code
Dataset path exists: True


In [2]:
import tensorflow as tf
import numpy as np

# Location of the saved tf.data dataset (relative to the working directory)
dataset_path = "processed_data/transformer_dataset"

# Load only the first 10 stored examples, then shuffle → batch → prefetch
# as a single chained pipeline.
ds = (
    tf.data.Dataset.load(dataset_path)
    .take(10)
    .shuffle(buffer_size=4096)
    .batch(64)
    .prefetch(tf.data.AUTOTUNE)
)

# Peek at the first batch: report tensor shapes and the first example in it
for x, y in ds.take(1):
    print("X shape:", x.shape)  # (batch_size, max_seq_len, num_feats)
    print("Y shape:", y.shape)  # (batch_size, num_feats)
    print("X sample:", x[0])
    print("Y sample:", y[0])


2025-04-25 10:07:33.298158: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


X shape: (10, 100, 46)
Y shape: (10, 46)
X sample: tf.Tensor(
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.5544926  0.5078434  0.53826725 ... 0.47774518 0.57133013 0.49457976]
 [0.5554875  0.508226   0.53895605 ... 0.4737916  0.57133013 0.49457976]
 [0.5564825  0.50873613 0.5397214  ... 0.4697105  0.5714067  0.49483484]], shape=(100, 46), dtype=float32)
Y sample: tf.Tensor(
[0.55732435 0.5093738  0.5403337  0.4422905  0.6238329  0.48782042
 0.54071635 0.5017217  0.5983469  0.49406964 0.61503136 0.5105216
 0.5578601  0.49419716 0.6399051  0.54495597 0.5329864  0.49929854
 0.5946732  0.5810483  0.5552579  0.48373932 0.58434105 0.5277388
 0.6070718  0.4231603  0.5476045  0.5147303  0.6028624  0.48577985
 0.64281344 0.45007014 0.5456146  0.51026654 0.5945967  0.49700293
 0.5405633  0.526591   0.6121231  0.4477

2025-04-25 10:07:56.377595: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [3]:
# Expected shape of a single example: x is (MAX_LEN, NUM_FEATS), y is (NUM_FEATS,)
NUM_FEATS = 46
MAX_LEN = 100

# Unbatched view of the first 1000 stored examples, used for per-example checks
ds = (
    tf.data.Dataset.load(dataset_path)
    .take(1000)
)

def sanity_check(x, y, max_len=None, num_feats=None):
    """Validate one (window, target) example and print a short padding/target report.

    Args:
        x: Window tensor exposing `.shape` and `.numpy()`; expected shape is
            (max_len, num_feats), with all-zero padding rows at the start.
        y: Target tensor exposing `.shape` and `.numpy()`; expected shape is
            (num_feats,).
        max_len: Expected number of rows in `x`. Defaults to the notebook-level
            MAX_LEN.
        num_feats: Expected number of features. Defaults to the notebook-level
            NUM_FEATS.

    Returns:
        int: Number of leading all-zero (padding) rows in `x`.

    Raises:
        AssertionError: If `x` or `y` does not have the expected shape.
    """
    max_len = MAX_LEN if max_len is None else max_len
    num_feats = NUM_FEATS if num_feats is None else num_feats

    # Check shapes for a single example
    assert x.shape == (max_len, num_feats), f"Unexpected X shape: {x.shape}"
    assert y.shape == (num_feats,), f"Unexpected Y shape: {y.shape}"

    x_seq = x.numpy()
    y_vec = y.numpy()

    # A row is "data" if any feature in it is non-zero; padding rows are all zeros.
    nonzero_rows = np.any(x_seq != 0, axis=1)
    has_data = bool(nonzero_rows.any())

    # Count *leading* zero rows. argmax returns the first True; an all-padding
    # window has no True at all, so it is handled explicitly instead of
    # indexing past the end of the array.
    pad_count = int(np.argmax(nonzero_rows)) if has_data else max_len

    print(f"\nPadding rows: {pad_count}")
    if has_data:
        print(f"First non-zero row (index {pad_count}): {x_seq[pad_count][:4]}...")
        # Left padding means every row after the first data row must also be data.
        if not nonzero_rows[pad_count:].all():
            print("⚠️ Warning: all-zero rows found after the first data row!")
    else:
        print("⚠️ Warning: x contains only padding rows!")

    # Check that the target isn't all zeros
    if np.allclose(y_vec, 0):
        print("⚠️ Warning: y vector is all zeros!")
    else:
        print(f"y sample (first few features): {y_vec[:4]}...")

    return pad_count

# Validate every example in the subset loaded above, one example at a time
for example in ds:
    x, y = example
    sanity_check(x, y)




Padding rows: 99
First non-zero row (index 99): [0.5528088  0.5069506  0.5368131  0.45976278]...
y sample (first few features): [0.55357414 0.5074608  0.53750193 0.4565744 ]...

Padding rows: 98
First non-zero row (index 98): [0.5528088  0.5069506  0.5368131  0.45976278]...
y sample (first few features): [0.5544926  0.5078434  0.53826725 0.45313096]...

Padding rows: 97
First non-zero row (index 97): [0.5528088  0.5069506  0.5368131  0.45976278]...
y sample (first few features): [0.5554875  0.508226   0.53895605 0.44956   ]...

Padding rows: 96
First non-zero row (index 96): [0.5528088  0.5069506  0.5368131  0.45976278]...
y sample (first few features): [0.5564825  0.50873613 0.5397214  0.445989  ]...

Padding rows: 95
First non-zero row (index 95): [0.5528088  0.5069506  0.5368131  0.45976278]...
y sample (first few features): [0.55732435 0.5093738  0.5403337  0.4422905 ]...

Padding rows: 94
First non-zero row (index 94): [0.5528088  0.5069506  0.5368131  0.45976278]...
y sample (fi

2025-04-25 10:08:08.172850: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [4]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Data dimensions (must agree with how the dataset was built)
NUM_FEATS = 46    # x,y for 23 entities
MAX_LEN = 100     # same value used in the dataset builder

# Transformer hyper-parameters
D_MODEL = 128     # transformer hidden size
N_HEADS = 4       # attention heads per encoder block
N_LAYERS = 4      # number of stacked encoder blocks
D_FF = 512        # width of the feed-forward layer inside each block
DROPOUT = 0.1     # dropout rate used throughout the encoder

In [5]:
# ╔═══════════════════╗
# ║ 2. Positional enc ║  (learnable 1‑D embedding)
# ╚═══════════════════╝
class PositionalEncoding(layers.Layer):
    """Adds a learnable positional embedding table to its input.

    Args:
        max_len: Number of positions (rows) in the embedding table.
        d_model: Width of each positional vector.
        **kwargs: Standard `keras.layers.Layer` arguments such as `name` or
            `dtype`, forwarded to the base class.

    The constructor arguments are stored and reported by `get_config()` so a
    saved model containing this layer can be rebuilt (pass the class through
    `custom_objects` when loading).
    """

    def __init__(self, max_len, d_model, **kwargs):
        super().__init__(**kwargs)
        self.max_len = max_len
        self.d_model = d_model
        self.pos_emb = self.add_weight(
            name="pos_emb",
            shape=(max_len, d_model),
            initializer="uniform",
            trainable=True,
        )

    def call(self, x):
        # pos_emb is (max_len, d_model); assumes x is (batch, max_len, d_model)
        # so the table broadcasts over the leading axis.
        return x + self.pos_emb

    def get_config(self):
        """Return the constructor arguments needed to re-create this layer."""
        config = super().get_config()
        config.update({"max_len": self.max_len, "d_model": self.d_model})
        return config


In [6]:
# ╔═══════════════════════════╗
# ║ 3. Padding‑mask function  ║
# ╚═══════════════════════════╝
class PaddingMask(layers.Layer):
    """Builds the attention mask for zero-padded sequences.

    A time step counts as padding when every feature in it is exactly 0.0.
    The returned boolean mask is True for real frames and False for padding:
    `keras.layers.MultiHeadAttention` interprets `attention_mask` as
    "True/1 = may be attended to, False/0 = blocked", so marking the padded
    frames True (as this layer previously did) restricts attention to the
    padding instead of hiding it.
    """

    def call(self, x):
        # x:  (B, T, F) — zero‐padded on the left
        # A frame is real if at least one of its features is non-zero.
        valid = tf.reduce_any(tf.not_equal(x, 0.0), axis=-1)   # → (B, T)
        # reshape to (B, 1, 1, T) so it broadcasts over heads and query positions
        return valid[:, tf.newaxis, tf.newaxis, :]



In [7]:
# ╔════════════════════════╗
# ║ 4. Transformer encoder ║
# ╚════════════════════════╝
def transformer_block(d_model, n_heads, d_ff, dropout):
    """Build one transformer encoder block as a standalone two-input Keras model.

    The block applies self-attention and a two-layer feed-forward network, each
    followed by dropout, a residual addition and layer normalisation.

    Args:
        d_model: Feature width of the block's input and output.
        n_heads: Number of attention heads; each head uses d_model // n_heads dims.
        d_ff: Width of the hidden feed-forward layer.
        dropout: Dropout rate for the attention weights and both sub-layer outputs.

    Returns:
        keras.Model taking `[sequence, mask]` and returning a tensor shaped like
        `sequence`. `mask` is a boolean tensor of shape (B, 1, 1, T) that is
        forwarded unchanged as the attention layer's `attention_mask`.
    """
    seq_in = layers.Input(shape=(None, d_model))
    mask_in = layers.Input(shape=(1, 1, None), dtype=tf.bool)

    # --- self-attention sub-layer (query and value are the same sequence) ---
    self_attention = layers.MultiHeadAttention(
        num_heads=n_heads,
        key_dim=d_model // n_heads,
        dropout=dropout,
    )
    attended = self_attention(seq_in, seq_in, attention_mask=mask_in)
    attended = layers.Dropout(dropout)(attended)
    attn_out = layers.LayerNormalization(epsilon=1e-6)(seq_in + attended)

    # --- position-wise feed-forward sub-layer ---
    ff = layers.Dense(d_ff, activation="relu")(attn_out)
    ff = layers.Dense(d_model)(ff)
    ff = layers.Dropout(dropout)(ff)
    block_out = layers.LayerNormalization(epsilon=1e-6)(attn_out + ff)

    return keras.Model([seq_in, mask_in], block_out)


In [8]:
# ╔════════════════════════════════╗
# ║ 5. End‑to‑end prediction model ║
# ╚════════════════════════════════╝
def build_model(
    num_feats=NUM_FEATS,
    max_len=MAX_LEN,
    d_model=D_MODEL,
    n_heads=N_HEADS,
    n_layers=N_LAYERS,
    d_ff=D_FF,
    dropout=DROPOUT,
):
    """Assemble the frame predictor: a window of frames in, one frame out.

    Args:
        num_feats: Features per frame (input width and output width).
        max_len: Number of frames in each input window.
        d_model: Hidden width used inside the encoder stack.
        n_heads: Attention heads per encoder block.
        n_layers: Number of encoder blocks to stack.
        d_ff: Feed-forward width inside each encoder block.
        dropout: Dropout rate passed to every encoder block.

    Returns:
        keras.Model named "NFL_Frame_Predictor" mapping a (B, max_len, num_feats)
        input called "sequence" to a (B, num_feats) output from layer "pred_xy".
    """
    frames = layers.Input(shape=(max_len, num_feats), name="sequence")   # (B,T,F)

    # Project each frame to the encoder width, then add learnable positions
    embedded = layers.Dense(d_model)(frames)
    hidden = PositionalEncoding(max_len, d_model)(embedded)

    # The mask is derived once from the raw input and shared by every block
    pad_mask = PaddingMask()(frames)

    # Each iteration builds a fresh encoder block (no weight sharing between layers)
    for _ in range(n_layers):
        encoder = transformer_block(d_model, n_heads, d_ff, dropout)
        hidden = encoder([hidden, pad_mask])

    # Keep only the hidden state of the final time step; with left padding the
    # most recent real frame always sits at index -1.
    last_state = layers.Lambda(lambda t: t[:, -1])(hidden)          # (B, D)

    # Linear regression head producing the predicted coordinates
    coords = layers.Dense(num_feats, name="pred_xy")(last_state)

    return keras.Model(frames, coords, name="NFL_Frame_Predictor")

# Instantiate the network with the notebook-level default hyper-parameters
# and print its layer-by-layer summary.
model = build_model()
model.summary()


In [None]:
# Physics-informed loss function #1
# TODO: use a separate speed constraint for the ball (~70 mph)
# TODO: confirm that m/s is the correct unit for the speed limit
# TODO: ask which other field constraints have been used so far and confirm consistency with the other scripts

def physics_informed_loss(
    y_true,
    y_pred,
    max_speed=11.0,
    field_length=120.0,
    field_width=53.3,
    lambda_speed=0.1,
    lambda_boundary=0.1,
):
    """MSE plus soft penalties on "speed" and on positions outside the field.

    Keras calls this as `loss(y_true, y_pred)`; the keyword arguments keep their
    previous hard-coded values as defaults and can be bound with
    `functools.partial` when tuning.

    Args:
        y_true: Target tensor, shape (batch, 46).
        y_pred: Predicted tensor, shape (batch, 46); reshaped to (batch, 23, 2).
        max_speed: Speed limit. A scalar, or anything broadcastable to
            (batch, 23), e.g. 23 per-entity limits so the ball can have its own.
            Default 11.0 (≈ 23 mph expressed in m/s).
        field_length: Upper bound for the first coordinate (default 120).
        field_width: Upper bound for the second coordinate (default 53.3).
        lambda_speed: Weight of the speed penalty.
        lambda_boundary: Weight of the boundary penalty.

    Returns:
        Scalar tensor: mse + lambda_speed * speed_penalty
        + lambda_boundary * boundary_penalty.

    NOTE(review): `velocity` below is the predicted tensor itself, so `speed`
    is only a speed if the labels are position differences. The sanity-check
    output earlier in this notebook shows targets that look like absolute
    positions with values around 0.5; in that case this term measures distance
    from the origin, and both it and the yard-based field limits appear to be
    inactive. A true speed needs the previous frame, which a
    `(y_true, y_pred)` loss cannot see — confirm units and label definition.
    """
    # Guard against a dtype mismatch between labels and predictions
    y_true = tf.cast(y_true, y_pred.dtype)
    mse_loss = tf.reduce_mean(tf.square(y_pred - y_true))

    # Reshape to (batch, num_players, 2) assuming (46 = (22 players + 1 ball) * 2D coords)
    y_pred_reshaped = tf.reshape(y_pred, (-1, 23, 2))

    # Treated as velocity only under the assumption described in the docstring
    velocity = y_pred_reshaped

    # Compute speed (Euclidean norm of velocity vectors)
    speed = tf.norm(velocity, axis=-1)  # shape: (batch, 23)

    # Penalize only the amount by which the limit is exceeded
    speed_limit = tf.cast(max_speed, speed.dtype)
    speed_penalty = tf.reduce_mean(tf.nn.relu(speed - speed_limit) ** 2)

    # Boundary penalty: distance outside [0, field_length] x [0, field_width]
    x, y = y_pred_reshaped[..., 0], y_pred_reshaped[..., 1]
    x_bounds = tf.nn.relu(-x) + tf.nn.relu(x - field_length)
    y_bounds = tf.nn.relu(-y) + tf.nn.relu(y - field_width)
    boundary_penalty = tf.reduce_mean(x_bounds ** 2 + y_bounds ** 2)

    # Combine all components
    total_loss = (
        mse_loss
        + lambda_speed * speed_penalty
        + lambda_boundary * boundary_penalty
    )

    return total_loss

In [None]:
# Physics-informed loss function #2
# TODO: confirm that frames are 0.1 seconds apart (used for dt)
# TODO: tune the physics penalties to reasonable thresholds
# TODO: in block #5 above, is the output of the architecture a single predicted frame or a sequence of frames?
#       If it is a single frame, the "out" layer probably needs to be updated, e.g.:
#       out = layers.TimeDistributed(layers.Dense(num_feats))(x)  # (B, T, 46) ???

def _mean_sq_excess(magnitude, limit):
    """Mean of relu(magnitude - limit)**2, defined as 0 when `magnitude` is empty."""
    excess_sq = tf.square(tf.nn.relu(magnitude - limit))
    count = tf.cast(tf.size(excess_sq), excess_sq.dtype)
    # reduce_mean over an empty tensor is NaN (0/0); divide_no_nan returns 0 instead
    return tf.math.divide_no_nan(tf.reduce_sum(excess_sq), count)


def physics_informed_sequence_loss(
    y_true,
    y_pred,
    dt=0.1,
    max_speed=11.0,
    max_accel=15.0,
    max_jerk=100.0,
    field_length=120.0,
    field_width=53.3,
    lambda_speed=0.1,
    lambda_accel=0.1,
    lambda_jerk=0.05,
    lambda_boundary=0.05,
):
    """MSE plus penalties on finite-difference speed, acceleration and jerk.

    y_true and y_pred shape: (batch, T, 46)

    Velocity, acceleration and jerk are first, second and third differences of
    the predicted positions along the time axis, divided by `dt` each time.
    They need T >= 2, 3 and 4 frames respectively; for shorter sequences the
    corresponding penalty is 0 instead of NaN.

    Args:
        y_true: Target positions, shape (batch, T, 46).
        y_pred: Predicted positions, shape (batch, T, 46).
        dt: Time between consecutive frames (default 0.1 — TODO confirm).
        max_speed, max_accel, max_jerk: Thresholds above which the respective
            magnitude is penalized (quadratically in the excess).
        field_length, field_width: Upper bounds for the two coordinates; the
            lower bound is 0 for both.
        lambda_speed, lambda_accel, lambda_jerk, lambda_boundary: Weights of
            the four penalty terms.

    Returns:
        Scalar tensor: weighted sum of the MSE and the four penalties.

    NOTE(review): the thresholds and field size are in physical units; they
    only bite if the positions fed to this loss use the same units — confirm
    against the dataset's scaling.
    """
    # Guard against a dtype mismatch between labels and predictions
    y_true = tf.cast(y_true, y_pred.dtype)
    mse_loss = tf.reduce_mean(tf.square(y_pred - y_true))

    # Reshape to: (batch, T, 23 players (22 players + 1 ball), 2 coordinates)
    positions = tf.reshape(y_pred, (-1, tf.shape(y_pred)[1], 23, 2))

    # Successive finite differences along the time axis
    velocity = (positions[:, 1:] - positions[:, :-1]) / dt        # (B, T-1, 23, 2)
    acceleration = (velocity[:, 1:] - velocity[:, :-1]) / dt      # (B, T-2, 23, 2)
    jerk = (acceleration[:, 1:] - acceleration[:, :-1]) / dt      # (B, T-3, 23, 2)

    # Penalize exceeding thresholds (magnitudes are norms over the coordinate axis)
    speed_penalty = _mean_sq_excess(tf.norm(velocity, axis=-1), max_speed)
    accel_penalty = _mean_sq_excess(tf.norm(acceleration, axis=-1), max_accel)
    jerk_penalty = _mean_sq_excess(tf.norm(jerk, axis=-1), max_jerk)

    # Keep within field boundaries (only the last frame is checked for now)
    last_pos = positions[:, -1]  # (B, 23, 2)
    x, y = last_pos[..., 0], last_pos[..., 1]
    x_bounds = tf.nn.relu(-x) + tf.nn.relu(x - field_length)
    y_bounds = tf.nn.relu(-y) + tf.nn.relu(y - field_width)
    boundary_penalty = tf.reduce_mean(x_bounds ** 2 + y_bounds ** 2)

    # Total loss with weighted terms
    total_loss = (
        mse_loss
        + lambda_speed * speed_penalty
        + lambda_accel * accel_penalty
        + lambda_jerk * jerk_penalty
        + lambda_boundary * boundary_penalty
    )

    return total_loss


In [None]:
# ╔════════════════════╗
# ║ 6. Compile & train ║
# ╚════════════════════╝
LR = 1e-4
BATCH_SIZE = 64
SHUFFLE_BUFFER = 4096
optimizer = keras.optimizers.Adam(LR)

# Alternative for a plain baseline: loss=keras.losses.MeanSquaredError().
# physics_informed_sequence_loss documents (batch, T, 46) inputs, whereas this
# model's output layer produces a single frame per example, so it is not used here.
model.compile(
    optimizer=optimizer,
    loss=physics_informed_loss,
    metrics=[keras.metrics.MeanAbsoluteError()],
)

# ------------------------------------------------------------------
# Dataset: split first, then shuffle → batch → prefetch
# ------------------------------------------------------------------
# The split must happen BEFORE shuffling. `shuffle` reshuffles on every
# iteration by default, so `take`/`skip` on a shuffled pipeline select a
# different, overlapping set of batches each epoch and training examples leak
# into validation.
# NOTE(review): earlier cells load "processed_data/transformer_dataset"
# (without "../") — confirm which path is correct for this notebook's cwd.
dataset_path = "../processed_data/transformer_dataset"
full_ds = tf.data.Dataset.load(dataset_path)

val_split = 0.05
# len() raises TypeError if the dataset's cardinality is unknown
n_val = int(len(full_ds) * val_split)

# Validation = the first n_val stored examples, in fixed order, never shuffled
val_ds = (full_ds
          .take(n_val)
          .batch(BATCH_SIZE)
          .prefetch(tf.data.AUTOTUNE))

# Training = everything after the validation slice, reshuffled every epoch
train_ds = (full_ds
            .skip(n_val)
            .shuffle(SHUFFLE_BUFFER)
            .batch(BATCH_SIZE)
            .prefetch(tf.data.AUTOTUNE))

EPOCHS = 15
history = model.fit(train_ds,
                    epochs=EPOCHS,
                    validation_data=val_ds,
                    verbose=2)


Epoch 1/15


In [None]:
# ╔═══════════════╗
# ║ 7. Evaluation ║
# ╚═══════════════╝
# Simple end‑to‑end evaluation on a held‑out batch
# Score the model on the first batch produced by the validation pipeline
for X_batch, y_batch in val_ds.take(1):
    y_pred = model(X_batch)
    squared_error = tf.square(y_pred - y_batch)
    mse = tf.reduce_mean(squared_error)
    print("Validation MSE (batch):", mse.numpy())
