In [2]:
import tensorflow as tf
import numpy as np

# Location of the saved tf.data dataset (adjust to your environment)
dataset_path = "../processed_data/transformer_dataset"

# Build the whole preview pipeline in one chain:
# load → keep the first 10 examples → shuffle → batch → prefetch
ds = (
    tf.data.Dataset.load(dataset_path)
    .take(10)
    .shuffle(buffer_size=4096)
    .batch(64)
    .prefetch(tf.data.AUTOTUNE)
)

# Peek at the first batch: shapes first, then the first example of each tensor
for x, y in ds.take(1):
    x_first = x[0]
    y_first = y[0]
    print("X shape:", x.shape)  # (batch_size, max_seq_len, num_feats)
    print("Y shape:", y.shape)  # (batch_size, num_feats)
    print("X sample:", x_first)
    print("Y sample:", y_first)


X shape: (10, 100, 46)
Y shape: (10, 46)
X sample: tf.Tensor(
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.3494398  0.4539401  0.38760507 ... 0.41309658 0.37457985 0.53496116]
 [0.3495098  0.45038846 0.38683474 ... 0.40799114 0.37457985 0.53507215]
 [0.34957984 0.44694784 0.38606444 ... 0.40277472 0.3745098  0.53507215]], shape=(100, 46), dtype=float32)
Y sample: tf.Tensor(
[0.34964988 0.44328526 0.38536417 0.5156493  0.39649862 0.61576027
 0.3609944  0.527636   0.38158265 0.5684795  0.40833336 0.5340733
 0.3959384  0.48523864 0.3767507  0.5337403  0.33305323 0.51786906
 0.41197482 0.5036626  0.35490197 0.49322975 0.38186276 0.54961157
 0.3654062  0.5485017  0.38837537 0.49589348 0.33354342 0.6165372
 0.39887956 0.43163154 0.33634454 0.5429523  0.29712886 0.5328524
 0.31792718 0.4720311  0.35637257 0.51531

In [45]:
# Expected per-example dimensions: x is (MAX_LEN, NUM_FEATS), y is (NUM_FEATS,)
MAX_LEN, NUM_FEATS = 100, 46

# Re-load the dataset unbatched and keep only the first 1000 examples
ds = tf.data.Dataset.load(dataset_path)
ds = ds.take(1000)

def sanity_check(x, y, max_len=None, num_feats=None):
    """Validate one (sequence, target) example and print a short summary.

    Verifies the shapes of ``x`` and ``y``, reports how many all-zero rows
    precede the first non-zero row of ``x`` (the left padding), and prints a
    warning instead of crashing when the example looks suspicious.

    Args:
        x: Sequence tensor exposing ``.shape`` and ``.numpy()``; expected
            shape ``(max_len, num_feats)``.
        y: Target tensor exposing ``.shape`` and ``.numpy()``; expected
            shape ``(num_feats,)``.
        max_len: Expected number of rows in ``x``. Defaults to the notebook
            constant ``MAX_LEN``.
        num_feats: Expected number of features. Defaults to the notebook
            constant ``NUM_FEATS``.

    Raises:
        AssertionError: If either shape differs from the expected one.
    """
    # Fall back to the notebook-level constants only when no override is given.
    if max_len is None:
        max_len = MAX_LEN
    if num_feats is None:
        num_feats = NUM_FEATS

    # Check shapes for a single example
    assert x.shape == (max_len, num_feats), f"Unexpected X shape: {x.shape}"
    assert y.shape == (num_feats,), f"Unexpected Y shape: {y.shape}"

    x_seq = x.numpy()
    y_vec = y.numpy()

    # Row-wise flag: True where the row contains at least one non-zero value.
    nonzero_rows = np.any(x_seq != 0, axis=1)

    if not nonzero_rows.any():
        # Every row is zero: there is no "first non-zero row" to index, so
        # report the situation instead of reading past the end of the array.
        pad_count = max_len
        print(f"\nPadding rows: {pad_count}")
        print("⚠️ Warning: x sequence is entirely padding!")
    else:
        # Leading padding = index of the first non-zero row. Counting *all*
        # zero rows would point at the wrong row if a zero row sits mid-sequence.
        pad_count = int(np.argmax(nonzero_rows))
        print(f"\nPadding rows: {pad_count}")
        print(f"First non-zero row (index {pad_count}): {x_seq[pad_count][:4]}...")

        # Padding is expected only at the start; flag zero rows found later.
        interior_zero = int(np.count_nonzero(~nonzero_rows[pad_count:]))
        if interior_zero:
            print(f"⚠️ Warning: {interior_zero} all-zero row(s) after the first non-zero row!")

    # Check that the target isn't all zeros
    if np.allclose(y_vec, 0):
        print("⚠️ Warning: y vector is all zeros!")
    else:
        print(f"y sample (first few features): {y_vec[:4]}...")

# Run sanity checks on every example yielded by `ds` (one call per example).
# NOTE(review): sanity_check prints several lines per example, so iterating a
# large `ds` produces a very long output — consider `ds.take(k)` when skimming.
for x, y in ds:
    sanity_check(x, y)




Padding rows: 99
First non-zero row (index 99): [0.3491597  0.46403998 0.39019608 0.5164262 ]...
y sample (first few features): [0.34929973 0.46071035 0.38935575 0.51609325]...

Padding rows: 98
First non-zero row (index 98): [0.3491597  0.46403998 0.39019608 0.5164262 ]...
y sample (first few features): [0.34936976 0.45726973 0.3884454  0.5158713 ]...

Padding rows: 97
First non-zero row (index 97): [0.3491597  0.46403998 0.39019608 0.5164262 ]...
y sample (first few features): [0.3494398  0.4539401  0.38760507 0.5157603 ]...

Padding rows: 96
First non-zero row (index 96): [0.3491597  0.46403998 0.39019608 0.5164262 ]...
y sample (first few features): [0.3495098  0.45038846 0.38683474 0.5156493 ]...

Padding rows: 95
First non-zero row (index 95): [0.3491597  0.46403998 0.39019608 0.5164262 ]...
y sample (first few features): [0.34957984 0.44694784 0.38606444 0.5156493 ]...

Padding rows: 94
First non-zero row (index 94): [0.3491597  0.46403998 0.39019608 0.5164262 ]...
y sample (fi

In [46]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# --- Data layout (must agree with the dataset builder) ---
NUM_FEATS = 2 * 23   # x,y for 23 entities
MAX_LEN = 100        # same value you used in dataset builder

# --- Transformer hyper-parameters ---
D_MODEL = 128        # transformer hidden size
N_HEADS = 4          # attention heads per encoder block
N_LAYERS = 4         # number of stacked encoder blocks
D_FF = 512           # width of the feed-forward hidden layer
DROPOUT = 0.1        # dropout rate used inside each encoder block

In [47]:
# ╔═══════════════════╗
# ║ 2. Positional enc ║  (learnable 1‑D embedding)
# ╚═══════════════════╝
class PositionalEncoding(layers.Layer):
    """Adds a learnable positional embedding to its input.

    A single trainable weight of shape ``(max_len, d_model)`` is created and
    added to the input, broadcasting over the leading batch axis. The input's
    trailing two dimensions must therefore be ``(max_len, d_model)``.

    Args:
        max_len: Number of positions (rows) in the embedding table.
        d_model: Embedding width (columns) of the table.
        **kwargs: Standard ``keras.layers.Layer`` arguments such as ``name``
            or ``dtype``; forwarded to the base class so the layer can be
            named and re-created from its config.
    """

    def __init__(self, max_len, d_model, **kwargs):
        super().__init__(**kwargs)
        # Stored so get_config() can report the constructor arguments.
        self.max_len = max_len
        self.d_model = d_model
        self.pos_emb = self.add_weight(
            name="pos_emb",
            shape=(max_len, d_model),
            initializer="uniform",
            trainable=True,
        )

    def call(self, x):
        """Return ``x`` with the positional table added element-wise."""
        return x + self.pos_emb

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized.

        Without this override, saving or cloning a model that contains the
        layer fails because the base config does not know about ``max_len``
        and ``d_model``.
        """
        config = super().get_config()
        config.update({"max_len": self.max_len, "d_model": self.d_model})
        return config


In [48]:
# ╔═══════════════════════════╗
# ║ 3. Padding‑mask function  ║
# ╚═══════════════════════════╝
class PaddingMask(layers.Layer):
    """Builds a boolean attention mask from a zero-padded ``(B, T, F)`` batch.

    A time step counts as padding when every feature in its row equals 0.
    The returned mask is ``True`` for *real* (non-padding) time steps and
    ``False`` for padding, because Keras ``MultiHeadAttention`` interprets a
    truthy ``attention_mask`` entry as "this key may be attended to".
    Returning ``True`` at padded positions (as an "is padding" flag) would
    make attention look only at padding.

    Returns:
        Boolean tensor of shape ``(B, 1, 1, T)``; the two singleton axes
        broadcast over attention heads and query positions.
    """

    def call(self, x):
        # x:  (B, T, F) — zero‐padded on the left
        # A row is valid when at least one feature is non-zero.
        valid = tf.reduce_any(tf.not_equal(x, 0.0), axis=-1)   # → (B, T), True = real frame
        # reshape to (B, 1, 1, T) for MultiHeadAttention
        return valid[:, tf.newaxis, tf.newaxis, :]



In [49]:
# ╔════════════════════════╗
# ║ 4. Transformer encoder ║
# ╚════════════════════════╝
def transformer_block(d_model, n_heads, d_ff, dropout):
    """Build one encoder block as a standalone two-input Keras model.

    The block applies masked multi-head self-attention followed by a two-layer
    feed-forward network. Each sub-layer is followed by dropout, a residual
    addition and layer normalization.

    Args:
        d_model: Width of the block's input and output features.
        n_heads: Number of attention heads; each head uses
            ``d_model // n_heads`` as its key dimension.
        d_ff: Width of the hidden feed-forward layer.
        dropout: Dropout rate used in attention and after both sub-layers.

    Returns:
        A ``keras.Model`` taking ``[sequence, mask]`` where ``sequence`` has
        shape ``(B, T, d_model)`` and ``mask`` is a boolean tensor of shape
        ``(B, 1, 1, T)`` passed to the attention layer as ``attention_mask``.
        The output has shape ``(B, T, d_model)``.
    """
    seq = layers.Input(shape=(None, d_model))
    mask = layers.Input(shape=(1, 1, None), dtype=tf.bool)  # attention mask

    # --- Sub-layer 1: self-attention → dropout → residual + norm ---
    self_attention = layers.MultiHeadAttention(
        num_heads=n_heads,
        key_dim=d_model // n_heads,
        dropout=dropout,
    )
    attn_out = self_attention(seq, seq, attention_mask=mask)
    attn_out = layers.Dropout(dropout)(attn_out)
    attn_normed = layers.LayerNormalization(epsilon=1e-6)(seq + attn_out)

    # --- Sub-layer 2: position-wise feed-forward → dropout → residual + norm ---
    ffn_hidden = layers.Dense(d_ff, activation="relu")(attn_normed)
    ffn_out = layers.Dense(d_model)(ffn_hidden)
    ffn_out = layers.Dropout(dropout)(ffn_out)
    block_out = layers.LayerNormalization(epsilon=1e-6)(attn_normed + ffn_out)

    return keras.Model(inputs=[seq, mask], outputs=block_out)


In [50]:
# ╔════════════════════════════════╗
# ║ 5. End‑to‑end prediction model ║
# ╚════════════════════════════════╝
def build_model(
    num_feats=NUM_FEATS,
    max_len=MAX_LEN,
    d_model=D_MODEL,
    n_heads=N_HEADS,
    n_layers=N_LAYERS,
    d_ff=D_FF,
    dropout=DROPOUT,
):
    """Assemble the sequence-to-frame regression model.

    Pipeline: linear projection to ``d_model`` → learnable positional
    encoding → ``n_layers`` encoder blocks sharing one mask derived from the
    raw input → hidden state of the last time step → linear regression head.

    Args:
        num_feats: Features per time step; also the size of the output.
        max_len: Fixed sequence length of the input.
        d_model: Hidden width used throughout the encoder.
        n_heads: Attention heads per encoder block.
        n_layers: Number of stacked encoder blocks.
        d_ff: Hidden width of each block's feed-forward network.
        dropout: Dropout rate passed to each encoder block.

    Returns:
        A ``keras.Model`` named ``"NFL_Frame_Predictor"`` mapping a
        ``(B, max_len, num_feats)`` input to a ``(B, num_feats)`` output.
    """
    seq_in = layers.Input(shape=(max_len, num_feats), name="sequence")   # (B,T,F)

    # Project raw features to the model width, then inject position information.
    projected = layers.Dense(d_model)(seq_in)
    hidden = PositionalEncoding(max_len, d_model)(projected)

    # The mask is derived once from the raw input and shared by every block.
    pad_mask = PaddingMask()(seq_in)

    # Encoder stack: each iteration creates a fresh block and applies it.
    for _layer_idx in range(n_layers):
        encoder = transformer_block(d_model, n_heads, d_ff, dropout)
        hidden = encoder([hidden, pad_mask])

    # With left padding the most recent frame always sits in the last row,
    # so its hidden state is simply index -1 along the time axis.
    take_last_step = layers.Lambda(lambda t: t[:, -1])
    last_hidden = take_last_step(hidden)          # (B, D)

    # Linear head regressing all co-ordinates at once.
    prediction = layers.Dense(num_feats, name="pred_xy")(last_hidden)

    return keras.Model(seq_in, prediction, name="NFL_Frame_Predictor")

# Instantiate the model with the default hyper-parameters of build_model()
# and print a per-layer overview (output shapes and parameter counts).
model = build_model()
model.summary()


In [None]:
# ╔════════════════════╗
# ║ 6. Compile & train ║
# ╚════════════════════╝
LR = 1e-4
optimizer = keras.optimizers.Adam(LR)
loss_fn   = keras.losses.MeanSquaredError()

model.compile(optimizer=optimizer, loss=loss_fn, metrics=[keras.metrics.MeanAbsoluteError()])

# ------------------------------------------------------------------
# Dataset: split FIRST, then shuffle (train only) → batch → prefetch
# ------------------------------------------------------------------
dataset_path   = "../processed_data/transformer_dataset"
BATCH_SIZE     = 64
SHUFFLE_BUFFER = 4096
val_split      = 0.05

full_ds = tf.data.Dataset.load(dataset_path)

# Original full pipeline, kept only so other cells that reference `ds` keep
# working. Do NOT derive train/val splits from it: shuffle() reshuffles on
# every iteration, so take()/skip() applied after it select different examples
# each epoch and the validation set ends up overlapping the training set.
ds = (full_ds
      .shuffle(SHUFFLE_BUFFER)
      .batch(BATCH_SIZE)
      .prefetch(tf.data.AUTOTUNE))

# Split on the unshuffled, unbatched dataset so both subsets are fixed and
# disjoint for the whole run. At least one example is reserved for validation.
n_examples = len(full_ds)
n_val      = max(1, int(n_examples * val_split))
assert n_val < n_examples, (
    f"Validation split ({n_val}) leaves no training data out of {n_examples} examples"
)

# Validation: deterministic order, no shuffling needed.
val_ds = (full_ds
          .take(n_val)
          .batch(BATCH_SIZE)
          .prefetch(tf.data.AUTOTUNE))

# Training: shuffle only the training portion; prefetch stays the last stage.
train_ds = (full_ds
            .skip(n_val)
            .shuffle(SHUFFLE_BUFFER)
            .batch(BATCH_SIZE)
            .prefetch(tf.data.AUTOTUNE))

EPOCHS = 15
history = model.fit(train_ds,
                    epochs=EPOCHS,
                    validation_data=val_ds,
                    verbose=2)


Epoch 1/15


In [None]:
# ╔═══════════════╗
# ║ 7. Evaluation ║
# ╚═══════════════╝
# Spot-check: hand-computed MSE on the first batch drawn from val_ds
for X_batch, y_batch in val_ds.take(1):
    y_pred = model(X_batch)
    # Mean of the squared element-wise residuals over the whole batch
    residual = y_pred - y_batch
    mse = tf.reduce_mean(residual * residual)
    print("Validation MSE (batch):", mse.numpy())
