<a href="https://colab.research.google.com/github/mahesmeh001/AgroverseComp/blob/main/SimpleTransformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import os

In [12]:
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [13]:
!kaggle competitions download -c cse-251-b-2025

Downloading cse-251-b-2025.zip to /content
 99% 996M/0.98G [00:07<00:00, 131MB/s]
100% 0.98G/0.98G [00:07<00:00, 139MB/s]


In [14]:
!unzip cse-251-b-2025.zip -d argoverse_data/

Archive:  cse-251-b-2025.zip
  inflating: argoverse_data/test_input.npz  
  inflating: argoverse_data/train.npz  


In [15]:
train_file = np.load('argoverse_data/train.npz')
train_data = train_file['data']
print("train_data's shape", train_data.shape)
test_file = np.load('argoverse_data/test_input.npz')
test_data = test_file['data']
print("test_data's shape", test_data.shape)

train_data's shape (10000, 50, 110, 6)
test_data's shape (2100, 50, 50, 6)


In [16]:
def transformer_encoder(inputs, model_dim, num_heads, ff_dim, dropout=0):
    # Layer norm is applied to inputs (pre-norm)
    x = layers.LayerNormalization(epsilon=1e-6)(inputs)

    # MultiHeadAttention: key_dim should be model_dim // num_heads
    # The output of MHA will be model_dim
    attn_output = layers.MultiHeadAttention(
        num_heads=num_heads,
        key_dim=model_dim // num_heads, # Dimension of each head's projection
        dropout=dropout
    )(query=x, value=x, key=x)
    attn_output = layers.Dropout(dropout)(attn_output)
    res = attn_output + inputs # Residual connection

    # Feed Forward Part (pre-norm)
    x = layers.LayerNormalization(epsilon=1e-6)(res)
    x = layers.Conv1D(filters=ff_dim, kernel_size=1, activation="relu")(x)
    x = layers.Dropout(dropout)(x)
    x = layers.Conv1D(filters=model_dim, kernel_size=1)(x) # Project back to model_dim
    return x + res

In [17]:
class LearnablePositionalEncoding(layers.Layer):
    def __init__(self, max_len, d_model, **kwargs):
        super().__init__(**kwargs)
        self.pos_emb = self.add_weight(
            name="position_embeddings",
            shape=(1, max_len, d_model),
            initializer="random_normal",
            trainable=True,
        )

    def call(self, x):
        return x + self.pos_emb[:, :tf.shape(x)[1], :]

    def get_config(self):
        config = super().get_config()
        config.update({
            "max_len": self.pos_emb.shape[1],
            "d_model": self.pos_emb.shape[2],
        })
        return config

In [18]:
class LearnableQueries(layers.Layer):
    def __init__(self, num_queries, query_dim, **kwargs):
        super().__init__(**kwargs)
        self.num_queries = num_queries
        self.query_dim = query_dim
        self.queries = self.add_weight(
            name="learnable_queries",
            shape=(1, num_queries, query_dim),
            initializer="random_normal",
            trainable=True,
        )

    def call(self, inputs):
        # inputs here are just used to get the batch size, shape doesn't matter much
        batch_size = tf.shape(inputs)[0]
        return tf.tile(self.queries, [batch_size, 1, 1])

    def get_config(self):
        config = super().get_config()
        config.update({
            "num_queries": self.num_queries,
            "query_dim": self.query_dim,
        })
        return config

In [19]:
def transformer_decoder_block(query_input, key_value_input, model_dim, num_heads, ff_dim, dropout=0):
    # Cross-Attention (pre-norm)
    # query_input (from decoder) and key_value_input (from encoder)
    # both need to have a consistent feature dimension (model_dim)

    # LayerNorm for query input
    norm_query = layers.LayerNormalization(epsilon=1e-6)(query_input)

    # Cross-attention layer
    attn_output = layers.MultiHeadAttention(
        num_heads=num_heads,
        key_dim=model_dim // num_heads, # Dimension of each head's projection
        dropout=dropout
    )(query=norm_query, value=key_value_input, key=key_value_input) # Query from decoder, K/V from encoder

    attn_output = layers.Dropout(dropout)(attn_output)
    res_attn = attn_output + query_input # Residual connection

    # Feed Forward Part (pre-norm)
    x = layers.LayerNormalization(epsilon=1e-6)(res_attn)
    x = layers.Conv1D(filters=ff_dim, kernel_size=1, activation="relu")(x)
    x = layers.Dropout(dropout)(x)
    x = layers.Conv1D(filters=model_dim, kernel_size=1)(x) # Project back to model_dim
    return x + res_attn

In [20]:
def build_model(
    input_shape,
    model_dim, # Renamed from head_size to be explicitly the model dimension
    num_heads,
    ff_dim,
    num_transformer_blocks,
    num_decoder_blocks,
    mlp_units,
    output_steps=60,
    output_dims=2,
    dropout=0,
    mlp_dropout=0
):
    timesteps, features = input_shape

    inputs = keras.Input(shape=(timesteps, features))

    # Initial projection to the model_dim
    # This is critical if initial `features` != `model_dim`
    x = layers.Dense(model_dim)(inputs) # Project from `features` to `model_dim`

    # Add learnable positional encoding for encoder
    encoder_output = LearnablePositionalEncoding(max_len=timesteps, d_model=model_dim)(x)

    # Apply transformer blocks
    for _ in range(num_transformer_blocks):
        encoder_output = transformer_encoder(encoder_output, model_dim, num_heads, ff_dim, dropout)

    # Decoder
    # Learned queries for the decoder - query_dim MUST be `model_dim`
    decoder_input = LearnableQueries(num_queries=output_steps, query_dim=model_dim)(inputs)

    # Add positional encoding to decoder queries
    decoder_input = LearnablePositionalEncoding(max_len=output_steps, d_model=model_dim)(decoder_input)

    # Apply decoder blocks
    decoder_output = decoder_input
    for _ in range(num_decoder_blocks):
        decoder_output = transformer_decoder_block(
            decoder_output, encoder_output, model_dim, num_heads, ff_dim, dropout
        )

    # Global pooling applied to the decoder output (which now represents the predicted sequence)
    # This pools across the `output_steps` dimension, leaving (batch_size, model_dim)
    x = layers.GlobalAveragePooling1D()(encoder_output)

    # MLP head
    for dim in mlp_units:
        x = layers.Dense(dim, activation="relu")(x)
        x = layers.Dropout(mlp_dropout)(x)

    # Final output layer
    outputs = layers.Dense(output_steps * output_dims)(x)
    outputs = layers.Reshape((output_steps, output_dims))(outputs)

    return keras.Model(inputs, outputs)

In [21]:
X_train_ego = train_data[:, 0, :50, :]  # (batch, timesteps, features) - ego vehicle only
y_train = train_data[:, 0, 50:110, :2]  # (batch, future_timesteps, xy)

X_tiny = X_train_ego[:50]
y_tiny = y_train[:50]

print(f"X_train_ego shape: {X_train_ego.shape}")
print(f"y_train shape: {y_train.shape}")

X_train_ego shape: (10000, 50, 6)
y_train shape: (10000, 60, 2)


In [22]:
last_known_pos_xy = X_train_ego[:, -1, :2] # Shape: (batch, 2)

# Create y_train_deltas
y_train_deltas = np.zeros_like(y_train)
print(y_train.shape)
# The first delta is from the last known position to the first future position
y_train_deltas[:, 0, :] = y_train[:, 0, :] - last_known_pos_xy

# Subsequent deltas are between consecutive future positions
y_train_deltas[:, 1:, :] = y_train[:, 1:, :] - y_train[:, :-1, :]

# Now, use y_train_deltas for training
# y_tiny_deltas = y_train_deltas[:50] # If you're still using X_tiny

print(f"y_train_deltas shape: {y_train_deltas.shape}")

# Example of how y_train_deltas would look for X_tiny:
X_tiny = X_train_ego[:50]
y_tiny = y_train[:50]

last_known_pos_xy_tiny = X_tiny[:, -1, :2]
y_tiny_deltas = np.zeros_like(y_tiny)
y_tiny_deltas[:, 0, :] = y_tiny[:, 0, :] - last_known_pos_xy_tiny
y_tiny_deltas[:, 1:, :] = y_tiny[:, 1:, :] - y_tiny[:, :-1, :]

(10000, 60, 2)
y_train_deltas shape: (10000, 60, 2)


In [23]:
import numpy as np

# Assuming X_train_ego and y_train_deltas are your full training datasets
# (i.e., the original, larger X_train_ego and the y_train_deltas you calculated before slicing for X_tiny)

# --- For X_train_ego (input features) ---
# X_train_ego shape: (batch_size, timesteps, features)
# We want to calculate mean/std for each 'feature' across all 'batch_size' and 'timesteps'.
# Using axis=(0, 1) means we compute the mean/std along the 0th (batch) and 1st (timesteps) dimensions.
# keepdims=True ensures the output shape allows for easy broadcasting during subtraction/division.
X_mean = X_train_ego.mean(axis=(0, 1), keepdims=True) # Shape will be (1, 1, features)
X_std = X_train_ego.std(axis=(0, 1), keepdims=True)    # Shape will be (1, 1, features)

# Important: Avoid division by zero if a feature has zero standard deviation (i.e., it's constant).
# Replace 0 with 1.0 to prevent NaNs or Infs during division.
X_std[X_std == 0] = 1.0

print(f"X_train_ego original shape: {X_train_ego.shape}")
print(f"X_mean shape: {X_mean.shape}, X_std shape: {X_std.shape}")
print(f"First few X_mean values: {X_mean[0, 0, :5]}") # Print first 5 features' mean
print(f"First few X_std values: {X_std[0, 0, :5]}")   # Print first 5 features' std


# --- For y_train_deltas (target deltas) ---
# y_train_deltas shape: (batch_size, future_timesteps, xy_dims)
# We calculate mean/std for each 'xy_dim' (X or Y) across all 'batch_size' and 'future_timesteps'.
y_deltas_mean = y_train_deltas.mean(axis=(0, 1), keepdims=True) # Shape will be (1, 1, 2)
y_deltas_std = y_train_deltas.std(axis=(0, 1), keepdims=True)    # Shape will be (1, 1, 2)

# Prevent division by zero
y_deltas_std[y_deltas_std == 0] = 1.0

print(f"\ny_train_deltas original shape: {y_train_deltas.shape}")
print(f"y_deltas_mean shape: {y_deltas_mean.shape}, y_deltas_std shape: {y_deltas_std.shape}")
print(f"y_deltas_mean (X, Y): {y_deltas_mean[0, 0, :]}")
print(f"y_deltas_std (X, Y): {y_deltas_std[0, 0, :]}")

# --- IMPORTANT: SAVE THESE STATISTICS ---
# In a real project, you would save X_mean, X_std, y_deltas_mean, y_deltas_std
# (e.g., using np.savez, pickle, or a config file)
# so you can load them and apply them consistently during inference (prediction on new data).

X_train_ego original shape: (10000, 50, 6)
X_mean shape: (1, 1, 6), X_std shape: (1, 1, 6)
First few X_mean values: [ 2.72984436e+03  1.05158719e+03 -9.42919519e-02 -6.27655270e-02
 -1.15795715e-03]
First few X_std values: [3.36969193e+03 1.72704592e+03 5.34438801e+00 4.69611758e+00
 1.83653475e+00]

y_train_deltas original shape: (10000, 60, 2)
y_deltas_mean shape: (1, 1, 2), y_deltas_std shape: (1, 1, 2)
y_deltas_mean (X, Y): [-0.010136   -0.01060693]
y_deltas_std (X, Y): [0.52892473 0.46509676]


In [24]:
# Apply normalization to the *full* training datasets first
X_train_ego_normalized = (X_train_ego - X_mean) / X_std
y_train_deltas_normalized = (y_train_deltas - y_deltas_mean) / y_deltas_std

print(f"\nX_train_ego_normalized shape: {X_train_ego_normalized.shape}")
print(f"y_train_deltas_normalized shape: {y_train_deltas_normalized.shape}")

# Then, create your tiny subsets from the *normalized* data
X_tiny_normalized = X_train_ego_normalized[:50]
y_tiny_deltas_normalized = y_train_deltas_normalized[:50]

print(f"\nX_tiny_normalized shape: {X_tiny_normalized.shape}")
print(f"y_tiny_deltas_normalized shape: {y_tiny_deltas_normalized.shape}")


X_train_ego_normalized shape: (10000, 50, 6)
y_train_deltas_normalized shape: (10000, 60, 2)

X_tiny_normalized shape: (50, 50, 6)
y_tiny_deltas_normalized shape: (50, 60, 2)


In [25]:
model = build_model(
    input_shape=X_train_ego.shape[1:],  # (timesteps, features)
    model_dim=1024,         # Increase from 256
    num_heads=16,           # Increase from 4
    ff_dim=1024,            # Increase from 256
    num_transformer_blocks=8,  # Increase from 4
    num_decoder_blocks=4,   # Increase from 4
    mlp_units=[1024, 512],  # Back to larger MLP
    output_steps=60,
    output_dims=2,
    dropout=0.0,           # Turn off dropout to promote overfitting
    mlp_dropout=0.0        # Turn off MLP dropout
)

optimizer=keras.optimizers.Adam(learning_rate=5e-4)

model.compile(
    optimizer=optimizer,
    loss="mse",
    metrics=["mae"]
)

model.summary()

# Create smaller batches and use fewer epochs
BATCH_SIZE = 16  # Reduced from 32
EPOCHS = 10      # Reduced from 10

# Train with reduced memory footprint
history = model.fit(
    X_train_ego_normalized,
    y_train_deltas_normalized,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=0.1
)

Epoch 1/100
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m147s[0m 204ms/step - loss: 0.5111 - mae: 0.4600 - val_loss: 0.1305 - val_mae: 0.2306
Epoch 2/100
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m118s[0m 182ms/step - loss: 0.1427 - mae: 0.2509 - val_loss: 0.1136 - val_mae: 0.2105
Epoch 3/100
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 183ms/step - loss: 0.1250 - mae: 0.2331 - val_loss: 0.1190 - val_mae: 0.2184
Epoch 4/100
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 185ms/step - loss: 0.1210 - mae: 0.2303 - val_loss: 0.1351 - val_mae: 0.2506
Epoch 5/100
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 185ms/step - loss: 0.1235 - mae: 0.2346 - val_loss: 0.1087 - val_mae: 0.2173
Epoch 6/100
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 183ms/step - loss: 0.1188 - mae: 0.2293 - val_loss: 0.1080 - val_mae: 0.2204
Epoch 7/100
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

KeyboardInterrupt: 

In [None]:
# --- Prediction ---
# y_pred_normalized_deltas will be the output from model.predict()
y_pred_normalized_deltas = model.predict(X_tiny_normalized) # Predict from normalized inputs

# --- Denormalization ---
# Denormalize the predicted deltas using the *same* mean and std calculated from y_train_deltas
y_pred_denormalized_deltas = (y_pred_normalized_deltas * y_deltas_std) + y_deltas_mean

print(f"\ny_pred_normalized_deltas shape: {y_pred_normalized_deltas.shape}")
print(f"y_pred_denormalized_deltas shape: {y_pred_denormalized_deltas.shape}")


# --- Reconstruct Absolute Trajectory (as before) ---
# Crucially, last_known_pos_xy_pred should be from the *original, unnormalized* X_tiny.
# This ensures you're starting from the correct absolute position.
last_known_pos_xy_pred = X_tiny[:, -1, :2] # Assuming X_tiny is the unnormalized input history

# Create a container for the reconstructed absolute predictions
y_pred_absolute = np.zeros_like(y_pred_denormalized_deltas)

# The first predicted absolute position is the last known position + the first denormalized delta
y_pred_absolute[:, 0, :] = last_known_pos_xy_pred + y_pred_denormalized_deltas[:, 0, :]

# Subsequent absolute positions are the previous absolute position + the current denormalized delta
for i in range(1, y_pred_denormalized_deltas.shape[1]):
    y_pred_absolute[:, i, :] = y_pred_absolute[:, i-1, :] + y_pred_denormalized_deltas[:, i, :]

print(f"y_pred_absolute shape: {y_pred_absolute.shape}")

# --- Plotting ---
# Now use the original X_tiny, original y_tiny, and the reconstructed y_pred_absolute for plotting.
# (Your plot_trajectory_predictions_deltas function expects unnormalized absolute coordinates for all)

In [None]:
y_pred_deltas = model.predict(X_tiny)

# Reconstruct absolute predicted trajectory from deltas
# You'll need the last known position from the input sequence for each sample
# Make sure X_tiny still holds the input data
last_known_pos_xy_pred = X_tiny[:, -1, :2] # Shape: (batch, 2)

# Create a container for the reconstructed absolute predictions
y_pred_absolute = np.zeros_like(y_pred_deltas)

# The first predicted absolute position is the last known position + the first delta
y_pred_absolute[:, 0, :] = last_known_pos_xy_pred + y_pred_deltas[:, 0, :]

# Subsequent absolute positions are the previous absolute position + the current delta
for i in range(1, y_pred_deltas.shape[1]):
    y_pred_absolute[:, i, :] = y_pred_absolute[:, i-1, :] + y_pred_deltas[:, i, :]

print(f"y_pred_absolute shape: {y_pred_absolute.shape}")

# --- 3. Update Visualization Code ---
# Now use y_pred_absolute and y_tiny (the original absolute ground truth) for plotting

def plot_trajectory_predictions_deltas(
    input_trajectory,
    ground_truth_trajectory_absolute, # Now takes original absolute y
    predicted_trajectory_absolute,    # Now takes reconstructed absolute y_pred
    sample_idx,
    title=""
):
    """
    Plots the input, ground truth, and predicted trajectories for a single sample.
    Assumes ground_truth_trajectory_absolute and predicted_trajectory_absolute are in absolute coordinates.
    """
    plt.figure(figsize=(10, 8))

    # Plot Input Trajectory (Historical Data)
    plt.plot(
    input_trajectory[:, 0], input_trajectory[:, 1],
    'o-', color='blue', label='Input History (Ego)', alpha=0.7, markersize=4, zorder=3 # Add zorder
)

    # Plot Ground Truth Trajectory (Absolute)
    plt.plot(
        ground_truth_trajectory_absolute[:, 0], ground_truth_trajectory_absolute[:, 1],
        'o-', color='green', label='Ground Truth Future (Absolute)', alpha=0.7, markersize=4
    )

    # Plot Predicted Trajectory (Reconstructed Absolute)
    plt.plot(
        predicted_trajectory_absolute[:, 0], predicted_trajectory_absolute[:, 1],
        'x--', color='red', label='Predicted Future (Absolute Reconstructed)', alpha=0.8, markersize=5
    )

    plt.xlabel("X Coordinate")
    plt.ylabel("Y Coordinate")
    plt.title(f"{title} - Sample {sample_idx}")
    plt.legend()
    plt.grid(True)
    plt.axis('equal') # Keep aspect ratio equal for better visualization of movement
    plt.show()

# Visualize with the reconstructed absolute predictions
num_samples_to_visualize = 10

for i in range(min(num_samples_to_visualize, X_tiny.shape[0])):
    plot_trajectory_predictions_deltas(
        input_trajectory=X_tiny[i],
        ground_truth_trajectory_absolute=y_tiny[i], # Use original y_tiny (absolute)
        predicted_trajectory_absolute=y_pred_absolute[i], # Use reconstructed absolute predictions
        sample_idx=i,
        title="Trajectory Prediction (Predicting Deltas)"
    )