In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import torch.nn.functional as F
import math

In [None]:
class TrajectoryDataset(Dataset):
    """Dataset of (past, future) trajectory pairs read from a NumPy ``.npz`` archive.

    The archive must contain arrays stored under the keys ``past`` and
    ``future``. Both are copied into float32 tensors at construction time, so
    the archive is not needed again after ``__init__`` returns.
    """

    def __init__(self, npz_file):
        """Load the ``past`` and ``future`` arrays from ``npz_file``.

        Args:
            npz_file: Path to the ``.npz`` archive, passed to ``np.load``.
        """
        # np.load returns a lazily-read NpzFile that holds the archive open.
        # The context manager closes it deterministically once both arrays
        # have been copied into tensors.
        with np.load(npz_file) as data:
            self.past = torch.tensor(data['past'], dtype=torch.float32)
            self.future = torch.tensor(data['future'], dtype=torch.float32)

    def __len__(self):
        """Return the number of samples, taken from the ``past`` tensor."""
        return len(self.past)

    def __getitem__(self, idx):
        """Return the ``(past, future)`` pair stored at index ``idx``."""
        return self.past[idx], self.future[idx]

# NOTE: the dataset used for training is read from the processed *val* archive.
TRAIN_NPZ_PATH = "/content/drive/MyDrive/ECE271B project/val/processed_val_pit.npz"
BATCH_SIZE = 64

train_dataset = TrajectoryDataset(TRAIN_NPZ_PATH)
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# The test split is currently disabled; uncomment both lines to enable it.
# test_dataset = TrajectoryDataset("/content/drive/MyDrive/ECE271B project/test_obs/processed_test_pit.npz")
# test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to a batch-first sequence.

    Even feature columns hold ``sin(position * freq)`` and odd columns hold
    ``cos(position * freq)``. The table is registered as the buffer ``pe``
    with shape ``(1, max_len, d_model)``.
    """

    def __init__(self, d_model, max_len=500):
        """Precompute the encoding table.

        Args:
            d_model: Feature width of the sequences to encode (even or odd).
            max_len: Longest sequence length the table supports.
        """
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even column,
        # so the cosine half uses only the first d_model // 2 frequencies.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        Args:
            x: Tensor whose dimension 1 is the sequence axis and whose last
                dimension is ``d_model``.

        Raises:
            ValueError: If the sequence is longer than ``max_len``.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.pe.size(1)} of PositionalEncoding"
            )
        return x + self.pe[:, :seq_len]


class CustomAttention(nn.Module):
    """Multi-head scaled dot-product attention with one fused q/k/v projection.

    Queries are computed from ``x``; keys and values are computed from
    ``context`` (or from ``x`` itself when no context is given). The output
    always has the same shape as ``x``.
    """

    def __init__(self, d_model, num_heads, dropout=0.1):
        """Build the projections.

        Args:
            d_model: Feature width of the inputs and of the output.
            num_heads: Number of attention heads; must divide ``d_model``.
            dropout: Dropout probability applied to the attention weights.

        Raises:
            ValueError: If ``d_model`` is not divisible by ``num_heads``.
        """
        super().__init__()
        if d_model % num_heads != 0:
            raise ValueError(f"d_model ({d_model}) must be divisible by num_heads ({num_heads})")
        self.d_model = d_model
        self.head_dim = d_model // num_heads
        self.num_heads = num_heads
        self.scale = math.sqrt(self.head_dim)
        # Rows [0, d_model) of this layer produce q, the next d_model rows k,
        # and the last d_model rows v.
        self.qkv = nn.Linear(d_model, d_model * 3)
        self.fc_out = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def _split_heads(self, t):
        """Reshape ``(B, L, d_model)`` into ``(B, num_heads, L, head_dim)``."""
        return t.view(t.size(0), -1, self.num_heads, self.head_dim).transpose(1, 2)

    def forward(self, x, context=None):
        """Attend from ``x`` to ``context``.

        Args:
            x: Query sequence of shape ``(B, N, d_model)``.
            context: Key/value sequence of shape ``(B, M, d_model)``; defaults
                to ``x`` (self-attention). ``M`` may differ from ``N``.

        Returns:
            Tensor of shape ``(B, N, d_model)``.
        """
        B, N, _ = x.shape
        context = x if context is None else context

        # Apply only the relevant slice of the fused projection to each input.
        # Projecting the concatenation of x and context would make q, k and v
        # all span N + M tokens, and the output could no longer be reshaped
        # back to (B, N, d_model).
        d = self.d_model
        q = F.linear(x, self.qkv.weight[:d], self.qkv.bias[:d])
        k, v = F.linear(context, self.qkv.weight[d:], self.qkv.bias[d:]).chunk(2, dim=-1)
        q, k, v = self._split_heads(q), self._split_heads(k), self._split_heads(v)

        scores = (q @ k.transpose(-2, -1)) / self.scale  # (B, heads, N, M)
        attn = self.dropout(scores.softmax(-1))
        merged = (attn @ v).transpose(1, 2).reshape(B, N, -1)  # (B, N, d_model)
        return self.fc_out(merged)


class FeedForward(nn.Module):
    """Two-layer MLP: widen to ``d_ff``, apply GELU and dropout, project back to ``d_model``."""

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        # Layers are created in application order and stored as a single
        # nn.Sequential under ``net``.
        expand = nn.Linear(d_model, d_ff)
        activation = nn.GELU()
        drop = nn.Dropout(dropout)
        contract = nn.Linear(d_ff, d_model)
        self.net = nn.Sequential(expand, activation, drop, contract)

    def forward(self, x):
        """Run ``x`` through the MLP and return the result."""
        transformed = self.net(x)
        return transformed


class TransformerLayer(nn.Module):
    """Attention sub-layer followed by a feed-forward sub-layer.

    Each sub-layer output is added to its input and the sum is passed through
    a LayerNorm (normalisation happens after the residual addition).
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.mha = CustomAttention(d_model, num_heads, dropout)
        self.ff = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x, context=None):
        """Apply the layer to ``x``; ``context`` is forwarded to the attention module."""
        attended = self.mha(x, context)
        x = self.norm1(x + attended)
        transformed = self.ff(x)
        return self.norm2(x + transformed)


class DualPathStochasticDecoder(nn.Module):
    """Decoder that draws ``num_samples`` outputs by perturbing the memory with noise.

    For every sample, Gaussian noise (std 0.1) is added to ``memory``. The
    queries then pass through the coarse layers, which attend to the noisy
    memory, and through the refine layers, which attend to the noisy memory
    combined with the current decoder state.
    """

    def __init__(self, num_layers, d_model, num_heads, d_ff, num_samples):
        """Build both layer stacks.

        Args:
            num_layers: Number of layers in each of the two stacks.
            d_model: Feature width of queries and memory.
            num_heads: Attention heads per layer.
            d_ff: Hidden width of each layer's feed-forward block.
            num_samples: Number of stochastic samples returned by ``forward``.

        Raises:
            ValueError: If ``num_samples`` is smaller than 1.
        """
        super().__init__()
        if num_samples < 1:
            raise ValueError(f"num_samples must be >= 1, got {num_samples}")
        self.coarse_layers = nn.ModuleList([TransformerLayer(d_model, num_heads, d_ff) for _ in range(num_layers)])
        self.refine_layers = nn.ModuleList([TransformerLayer(d_model, num_heads, d_ff) for _ in range(num_layers)])
        self.num_samples = num_samples
        # One learnable d_model-wide offset, broadcast over every batch item
        # and every query position.
        self.latent_token = nn.Parameter(torch.randn(1, 1, d_model))

    @staticmethod
    def _refine_context(noisy_memory, state):
        """Combine the noisy memory with the decoder state for refinement.

        Equal shapes are summed element-wise. Otherwise the two sequences are
        concatenated along the sequence axis, so refinement also works when
        the memory length differs from the query length.
        """
        if noisy_memory.shape == state.shape:
            return noisy_memory + state
        return torch.cat([noisy_memory, state], dim=1)

    def forward(self, queries, memory):
        """Decode ``num_samples`` stochastic outputs.

        Args:
            queries: Tensor of shape ``(batch, seq, d_model)``.
            memory: Tensor of shape ``(batch, mem_seq, d_model)``; ``mem_seq``
                may differ from ``seq``.

        Returns:
            Tensor of shape ``(batch, num_samples, seq, d_model)``.
        """
        outputs = []

        for _ in range(self.num_samples):
            # The noise must have the memory's shape, not the queries' shape,
            # otherwise the sum fails when the two lengths differ.
            noise = torch.randn_like(memory) * 0.1
            conditioned_noise = memory + noise
            x = queries + self.latent_token

            # Coarse decoding
            for layer in self.coarse_layers:
                x = layer(x, conditioned_noise)

            # Fine refinement
            for layer in self.refine_layers:
                x = layer(x, self._refine_context(conditioned_noise, x))

            outputs.append(x)

        return torch.stack(outputs, dim=1)  # (batch, num_samples, seq, d_model)


class MonteCarloTransformerScratch(nn.Module):
    """Encoder-decoder Transformer that returns several sampled future trajectories.

    The past trajectory is embedded, position-encoded and encoded into a
    memory. The stochastic decoder then turns the future queries into
    ``num_samples`` sequences, which are projected to ``output_dim``.
    """

    def __init__(self, input_dim, output_dim, d_model=256, num_layers=4, num_heads=8, d_ff=1024, num_samples=10):
        """Build the model.

        Args:
            input_dim: Feature width of the past trajectory.
            output_dim: Feature width of the predicted trajectory and of raw
                future queries.
            d_model: Internal feature width.
            num_layers: Layers in the encoder and in each decoder stack.
            num_heads: Attention heads per layer.
            d_ff: Hidden width of the feed-forward blocks.
            num_samples: Number of trajectories sampled per input.
        """
        super().__init__()
        self.d_model = d_model
        self.input_proj = nn.Linear(input_dim, d_model)
        self.output_proj = nn.Linear(d_model, output_dim)
        self.encoder = nn.Sequential(*[TransformerLayer(d_model, num_heads, d_ff) for _ in range(num_layers)])
        self.decoder = DualPathStochasticDecoder(num_layers, d_model, num_heads, d_ff, num_samples)
        self.pos_encoding = PositionalEncoding(d_model)
        # Lifts raw queries (output_dim wide) to the decoder width. It is
        # created last so the initialisation of the modules above is not
        # affected by the extra layer.
        self.query_proj = nn.Linear(output_dim, d_model)

    def forward(self, past_trajectory, future_queries):
        """Predict sampled futures.

        Args:
            past_trajectory: Tensor of shape ``(batch, past_len, input_dim)``.
            future_queries: Either raw queries of shape
                ``(batch, future_len, output_dim)`` or queries already
                embedded to ``d_model``. Queries whose last dimension equals
                ``d_model`` are passed to the decoder unchanged.

        Returns:
            Tensor of shape ``(batch, num_samples, future_len, output_dim)``.
        """
        x = self.pos_encoding(self.input_proj(past_trajectory))
        memory = self.encoder(x)

        if future_queries.size(-1) != self.d_model:
            # The decoder works at d_model width, so raw queries are projected
            # first. The positional encoding is needed because identical
            # queries (for example all zeros) would otherwise decode to the
            # same value at every time step.
            future_queries = self.pos_encoding(self.query_proj(future_queries))

        decoded = self.decoder(future_queries, memory)
        return self.output_proj(decoded)


def losses(predicted_futures, future_gt, alpha=1.0, beta=1.0, gamma=0.1, delta=0.1):
    """Compute the multi-sample trajectory loss and its components.

    Args:
        predicted_futures: Predictions indexed as ``(batch, sample, time, feature)``.
        future_gt: Ground truth indexed as ``(batch, time, feature)``.
        alpha: Weight of the minADE term.
        beta: Weight of the minFDE term.
        gamma: Weight of the negative log-likelihood term.
        delta: Weight of the diversity reward (subtracted from the loss).

    Returns:
        Tuple ``(total_loss, minADE, minFDE, log_likelihood, diversity)`` of
        scalar tensors. ``log_likelihood`` is ``logsumexp`` over samples of the
        negated summed displacement, averaged over the batch; higher is better.
    """
    # Per-step Euclidean error of every sample: (batch, sample, time).
    distances = torch.norm(predicted_futures - future_gt.unsqueeze(1), dim=-1)
    minADE = distances.mean(dim=2).min(dim=1).values.mean()
    minFDE = distances[:, :, -1].min(dim=1).values.mean()

    # Negative log-likelihood surrogate: small when at least one sample has a
    # small summed displacement.
    nll = -torch.logsumexp(-distances.sum(dim=2), dim=1).mean()

    # Diversity: mean distance over all ordered sample pairs. The zero
    # self-pairs are included in the mean.
    diffs = predicted_futures.unsqueeze(2) - predicted_futures.unsqueeze(1)
    diversity = torch.mean(torch.norm(diffs, dim=-1))

    # The NLL must be *added*. Subtracting it would reward larger
    # displacement errors during optimisation.
    total_loss = alpha * minADE + beta * minFDE + gamma * nll - delta * diversity
    return total_loss, minADE, minFDE, -nll, diversity

In [None]:
def train_model(model, train_loader, num_epochs=50, lr=0.0005, device=None,
                alpha=1.0, beta=1.0, gamma=0.1, delta=0.1):
    """Train ``model`` with AdamW and print epoch-averaged metrics.

    Args:
        model: Module called as ``model(past, future_queries)``.
        train_loader: Iterable yielding ``(past, future)`` tensor batches.
        num_epochs: Number of passes over ``train_loader``.
        lr: AdamW learning rate.
        device: Device to train on. ``None`` selects CUDA when available and
            the CPU otherwise.
        alpha: Weight forwarded to ``losses`` (minADE term).
        beta: Weight forwarded to ``losses`` (minFDE term).
        gamma: Weight forwarded to ``losses`` (likelihood term).
        delta: Weight forwarded to ``losses`` (diversity term).

    Raises:
        ValueError: If ``train_loader`` yields no batches.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        model.train()
        # Running sums of (loss, minADE, minFDE, LL, diversity), so the log
        # line reports epoch averages rather than only the last batch.
        totals = [0.0] * 5
        num_batches = 0
        for past, future in train_loader:
            past, future = past.to(device), future.to(device)
            optimizer.zero_grad()
            # Queries are zero placeholders with the shape of the target.
            future_queries = torch.zeros_like(future)
            predicted_futures = model(past, future_queries)
            batch_metrics = losses(predicted_futures, future,
                                   alpha=alpha, beta=beta, gamma=gamma, delta=delta)
            batch_metrics[0].backward()
            optimizer.step()
            for i, value in enumerate(batch_metrics):
                totals[i] += value.item()
            num_batches += 1

        if num_batches == 0:
            raise ValueError("train_loader yielded no batches")

        avg_loss, avg_minADE, avg_minFDE, avg_ll, avg_diversity = (t / num_batches for t in totals)
        print(f"Epoch [{epoch+1}/{num_epochs}] | Loss: {avg_loss:.4f} | minADE: {avg_minADE:.4f} | "
              f"minFDE: {avg_minFDE:.4f} | LL: {avg_ll:.4f} | Diversity: {avg_diversity:.4f}")

In [None]:
def evaluate_model(model, test_loader):
    """Evaluate ``model`` and report sample-averaged minADE and minFDE.

    Args:
        model: Module called as ``model(past, future_queries)`` that returns
            predictions indexed as ``(batch, sample, time, feature)``.
        test_loader: Iterable yielding ``(past, future)`` tensor batches.

    Returns:
        Tuple ``(avg_minADE, avg_minFDE)`` of Python floats.

    Raises:
        ValueError: If ``test_loader`` yields no samples.
    """
    model.eval()
    # Run on whichever device the model lives on instead of relying on a
    # notebook-level global.
    first_param = next(model.parameters(), None)
    eval_device = first_param.device if first_param is not None else torch.device("cpu")

    sum_minADE, sum_minFDE, num_samples = 0.0, 0.0, 0

    with torch.no_grad():
        for past, future in test_loader:
            past, future = past.to(eval_device), future.to(eval_device)
            # Same zero-valued queries that are used during training.
            predicted_futures = model(past, torch.zeros_like(future))

            # Per-step error of every sample: (batch, sample, time).
            distances = torch.norm(predicted_futures - future.unsqueeze(1), dim=-1)

            # --- minADE --- best sample by mean displacement
            sum_minADE += distances.mean(dim=2).min(dim=1).values.sum().item()

            # --- minFDE --- best sample by final-step displacement
            sum_minFDE += distances[:, :, -1].min(dim=1).values.sum().item()

            num_samples += future.size(0)

    if num_samples == 0:
        raise ValueError("test_loader yielded no samples")

    # Averaging over samples rather than batches keeps a short final batch
    # from being over-weighted.
    avg_minADE = sum_minADE / num_samples
    avg_minFDE = sum_minFDE / num_samples
    print(f"Evaluation Results -> minADE: {avg_minADE:.4f}, minFDE: {avg_minFDE:.4f}")
    return avg_minADE, avg_minFDE

# Use the GPU when one is available; the same device is passed to training.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The model's feature widths are read from the loaded tensors, because both
# input_dim and output_dim are required constructor arguments.
input_dim = train_dataset.past.shape[-1]
output_dim = train_dataset.future.shape[-1]
model = MonteCarloTransformerScratch(input_dim=input_dim, output_dim=output_dim).to(device)

# The loss weights are left at the defaults of `losses`
# (alpha=1.0, beta=1.0, gamma=0.1), which are the values intended here.
train_model(model, train_loader, num_epochs=50, lr=0.0005, device=device)

# test_loader only exists when the test-split lines in the data cell are
# enabled; skip evaluation instead of failing with a NameError otherwise.
eval_loader = globals().get("test_loader")
if eval_loader is None:
    print("Skipping evaluation: test_loader is not defined.")
else:
    evaluate_model(model, eval_loader)