In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import os

In [None]:
# Install the Kaggle API token so the `kaggle` CLI can authenticate.
# Expects kaggle.json to be present in the current working directory.
!mkdir -p ~/.kaggle
# The token is moved (not copied), so it no longer exists in the working directory afterwards.
!mv kaggle.json ~/.kaggle/
# Restrict the token to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the cse-251-b-2025 competition archive (cse-251-b-2025.zip) into the
# current working directory using the Kaggle CLI.
!kaggle competitions download -c cse-251-b-2025

Downloading cse-251-b-2025.zip to /content
100% 0.98G/0.98G [00:04<00:00, 189MB/s]
100% 0.98G/0.98G [00:04<00:00, 220MB/s]


In [None]:
!unzip cse-251-b-2025.zip -d argoverse_data/

Archive:  cse-251-b-2025.zip
  inflating: argoverse_data/test_input.npz  
  inflating: argoverse_data/train.npz  


In [None]:
# Load the training archive and pull out its 'data' array.
# The recorded run printed shape (10000, 50, 110, 6); later cells index axis 1 at 0
# for the ego vehicle, treat axis 2 as timesteps and the last axis as features.
# NOTE(review): the archive object returned by np.load is kept in `train_file`
# and is never explicitly closed here.
train_file = np.load('argoverse_data/train.npz')
train_data = train_file['data']
print("train_data's shape", train_data.shape)
# Load the test inputs the same way; the recorded run printed shape (2100, 50, 50, 6).
test_file = np.load('argoverse_data/test_input.npz')
test_data = test_file['data']
print("test_data's shape", test_data.shape)

train_data's shape (10000, 50, 110, 6)
test_data's shape (2100, 50, 50, 6)


In [None]:
def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0):
    """Apply one pre-norm Transformer encoder block to a Keras tensor.

    The block consists of two residual sub-blocks: layer-normalised multi-head
    self-attention, followed by a layer-normalised position-wise feed-forward
    network built from two kernel-size-1 ``Conv1D`` layers.

    Args:
        inputs: Tensor to transform; its last dimension sets the block's output width.
        head_size: ``key_dim`` passed to ``MultiHeadAttention``.
        num_heads: Number of attention heads.
        ff_dim: Number of filters in the first (GELU) feed-forward convolution.
        dropout: Rate used inside the attention layer and by both ``Dropout`` layers.

    Returns:
        Tensor whose last dimension matches that of ``inputs``.
    """
    # Attention sub-block: normalise, self-attend (query and value are the same
    # tensor), apply dropout, then add the skip connection.
    attn_in = layers.LayerNormalization(epsilon=1e-6)(inputs)
    self_attention = layers.MultiHeadAttention(
        key_dim=head_size, num_heads=num_heads, dropout=dropout
    )
    attn_out = self_attention(attn_in, attn_in)
    attn_out = layers.Dropout(dropout)(attn_out)
    skip = attn_out + inputs

    # Feed-forward sub-block: expand to ff_dim with GELU, then project back to
    # the input width so the second skip connection lines up.
    ffn = layers.LayerNormalization(epsilon=1e-6)(skip)
    ffn = layers.Conv1D(filters=ff_dim, kernel_size=1, activation="gelu")(ffn)
    ffn = layers.Dropout(dropout)(ffn)
    ffn = layers.Conv1D(filters=inputs.shape[-1], kernel_size=1)(ffn)
    return ffn + skip

In [None]:
class LearnablePositionalEncoding(layers.Layer):
    """Adds a trainable per-position embedding to a sequence tensor.

    A single weight of shape ``(1, max_len, d_model)`` is created and the slice
    covering the input's sequence length is added to the input, broadcasting
    over the batch dimension.

    Args:
        max_len: Number of positions to allocate embeddings for.
        d_model: Size of the last (feature) dimension of the inputs.
        **kwargs: Standard ``keras.layers.Layer`` arguments (``name``, ``dtype``, ...).

    Note:
        ``get_config`` is implemented so models containing this layer can be
        serialized; when loading, pass the class via ``custom_objects``.
    """

    def __init__(self, max_len, d_model, **kwargs):
        super().__init__(**kwargs)
        # Kept so get_config() can reproduce the constructor arguments.
        self.max_len = max_len
        self.d_model = d_model
        self.pos_emb = self.add_weight(
            shape=(1, max_len, d_model),
            initializer="random_normal",
            trainable=True,
            name="learnable_positional_encoding"
        )

    def call(self, x):
        """Return ``x`` plus the embeddings for its first ``shape(x)[1]`` positions."""
        # Slice by the runtime sequence length so inputs shorter than max_len work.
        return x + self.pos_emb[:, :tf.shape(x)[1], :]

    def get_config(self):
        """Return the layer config, including ``max_len`` and ``d_model``."""
        config = super().get_config()
        config.update({"max_len": self.max_len, "d_model": self.d_model})
        return config

In [None]:
def build_model(
    input_shape,
    head_size,
    num_heads,
    ff_dim,
    num_transformer_blocks,
    mlp_units,
    output_steps=60,
    output_dims=2,
    dropout=0,
    mlp_dropout=0
):
    """Assemble the Transformer-encoder trajectory regressor.

    Pipeline: learnable positional encoding -> ``num_transformer_blocks``
    encoder blocks -> global average pooling over time -> ReLU MLP head ->
    linear layer reshaped to ``(output_steps, output_dims)``.

    Args:
        input_shape: ``(timesteps, features)`` of a single input sequence.
        head_size: Attention ``key_dim`` forwarded to each encoder block.
        num_heads: Number of attention heads per encoder block.
        ff_dim: Feed-forward width inside each encoder block.
        num_transformer_blocks: How many encoder blocks to stack.
        mlp_units: Iterable of hidden sizes for the dense head.
        output_steps: Number of predicted timesteps.
        output_dims: Number of values predicted per timestep.
        dropout: Dropout rate inside the encoder blocks.
        mlp_dropout: Dropout rate after each dense layer of the head.

    Returns:
        An uncompiled ``keras.Model`` mapping ``(timesteps, features)`` inputs
        to ``(output_steps, output_dims)`` outputs.
    """
    seq_len, n_features = input_shape
    sequence_in = keras.Input(shape=(seq_len, n_features))

    # Position information is added directly onto the raw feature channels.
    encoded = LearnablePositionalEncoding(max_len=seq_len, d_model=n_features)(sequence_in)

    # Stack of encoder blocks.
    for _block in range(num_transformer_blocks):
        encoded = transformer_encoder(encoded, head_size, num_heads, ff_dim, dropout)

    # Collapse the time axis into one feature vector per sequence.
    head = layers.GlobalAveragePooling1D()(encoded)

    # Dense head: each hidden layer is followed by dropout.
    for width in mlp_units:
        head = layers.Dense(width, activation="relu")(head)
        head = layers.Dropout(mlp_dropout)(head)

    # Predict all future values at once, then fold them into (steps, dims).
    flat_prediction = layers.Dense(output_steps * output_dims)(head)
    trajectory = layers.Reshape((output_steps, output_dims))(flat_prediction)

    return keras.Model(sequence_in, trajectory)

In [None]:
# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and batch
# shuffling are repeatable across Restart & Run All.
SEED = 42
keras.utils.set_random_seed(SEED)

# Slice out agent 0 (the ego vehicle): the first 50 timesteps with all features
# form the model input; the first two features (x, y) of timesteps 50..109 are the target.
X_train_ego = train_data[:, 0, :50, :]  # (batch, timesteps, features) - ego vehicle only
y_train = train_data[:, 0, 50:110, :2]  # (batch, future_timesteps, xy)

# Tiny subset used as an overfitting sanity check: a model with this much
# capacity should be able to drive the training loss down on so few samples.
TINY_SIZE = 50
X_tiny = X_train_ego[:TINY_SIZE]
y_tiny = y_train[:TINY_SIZE]

print(f"X_train_ego shape: {X_train_ego.shape}")
print(f"y_train shape: {y_train.shape}")

# Deliberately over-sized configuration with all dropout disabled, to make
# memorising the tiny subset as easy as possible.
model = build_model(
    input_shape=X_train_ego.shape[1:],  # (timesteps, features)
    head_size=1024,            # attention key_dim per head
    num_heads=16,              # attention heads per encoder block
    ff_dim=1024,               # feed-forward width inside each encoder block
    num_transformer_blocks=8,  # stacked encoder blocks
    mlp_units=[1024, 512],     # hidden sizes of the dense head
    output_steps=60,
    output_dims=2,
    dropout=0.0,               # no dropout in the encoder: promote overfitting
    mlp_dropout=0.0            # no dropout in the MLP head
)

# NOTE(review): inputs and targets are used exactly as sliced from train_data
# (no scaling is applied in this cell), which is consistent with the very large
# MSE values seen in the training log.
model.compile(
    optimizer="adam",
    loss="mse"
)

model.summary()

# Small batches and many epochs: the goal is to fit the tiny subset, not speed.
BATCH_SIZE = 16
EPOCHS = 100

# Fit on the tiny subset; 10% of it is held out by Keras as validation data.
history = model.fit(
    X_tiny,
    y_tiny,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=0.1,
)

X_train_ego shape: (10000, 50, 6)
y_train shape: (10000, 60, 2)


Epoch 1/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 7s/step - loss: 8364161.0000 - val_loss: 9548551.0000
Epoch 2/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 177ms/step - loss: 7897741.0000 - val_loss: 6568812.0000
Epoch 3/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 155ms/step - loss: 4524692.0000 - val_loss: 3559397.2500
Epoch 4/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 170ms/step - loss: 1995874.8750 - val_loss: 2549578.5000
Epoch 5/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 166ms/step - loss: 1386799.5000 - val_loss: 2228633.5000
Epoch 6/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 182ms/step - loss: 1341335.6250 - val_loss: 759052.0000
Epoch 7/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 164ms/step - loss: 584954.0000 - val_loss: 371694.7812
Epoch 8/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 180ms/step - 