In [None]:
from typing import Dict

from tempfile import gettempdir
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from torchvision.models.resnet import resnet50
from tqdm import tqdm

from l5kit.configs import load_config_data
from l5kit.data import LocalDataManager, ChunkedDataset
from l5kit.dataset import AgentDataset, EgoDataset
from l5kit.rasterization import build_rasterizer
from l5kit.evaluation import write_gt_csv, write_pred_csv, compute_error_csv
from l5kit.geometry import transform_points
from l5kit.visualization import PREDICTED_POINTS_COLOR, TARGET_POINTS_COLOR, draw_trajectory
from prettytable import PrettyTable

import os

## Prepare Data path and load cfg

By setting the `L5KIT_DATA_FOLDER` variable, we can point the script to the folder where the data lies.

Then, we load our config file with relative paths and other configurations (rasteriser, training params...).

In [None]:
# Location of the dataset on disk. "PATH_TO_YOUR_DATA" is a placeholder and
# must be replaced with a real folder before running the notebook.
DATA_FOLDER = "PATH_TO_YOUR_DATA"
# Config file with relative data paths, rasteriser and training parameters.
CONFIG_PATH = "./agent_motion_config.yaml"

# set env variable for data
# NOTE(review): LocalDataManager(None) presumably falls back to this variable
# to locate the data folder - confirm against the l5kit docs.
os.environ["L5KIT_DATA_FOLDER"] = DATA_FOLDER
dm = LocalDataManager(None)

# get config
cfg = load_config_data(CONFIG_PATH)
print(cfg)

## Model

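Our baseline is a simple `resnet50` pretrained on `imagenet`. We must replace its first convolution (so that it accepts the number of channels produced by the rasterizer) and its final fully-connected layer (so that it outputs one `(X, Y)` pair per future step).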

In [None]:
def build_model(cfg: Dict) -> torch.nn.Module:
    """Build the baseline network: a pretrained ``resnet50`` with new first and last layers.

    Args:
        cfg: configuration dictionary. Reads
            ``cfg["model_params"]["history_num_frames"]`` and
            ``cfg["model_params"]["future_num_frames"]``.

    Returns:
        The ``resnet50`` whose ``conv1`` and ``fc`` layers have been replaced.
        Both replacements are freshly constructed layers, so they do not carry
        over the pretrained weights of the layers they replace.
    """
    # load pre-trained Conv2D model
    model = resnet50(pretrained=True)

    # change input channels number to match the rasterizer's output:
    # two channels per frame (current frame + history) plus 3 extra channels
    # NOTE(review): the 3 extra channels are presumably the RGB map layers - confirm
    # against the rasterizer selected in the config.
    num_history_channels = (cfg["model_params"]["history_num_frames"] + 1) * 2
    num_in_channels = 3 + num_history_channels
    model.conv1 = nn.Conv2d(
        num_in_channels,
        model.conv1.out_channels,
        kernel_size=model.conv1.kernel_size,
        stride=model.conv1.stride,
        padding=model.conv1.padding,
        bias=False,
    )
    # change output size to (X, Y) * number of future states
    num_targets = 2 * cfg["model_params"]["future_num_frames"]
    # Read the feature width from the layer being replaced instead of hard-coding
    # 2048, mirroring how conv1's geometry is copied above.
    model.fc = nn.Linear(in_features=model.fc.in_features, out_features=num_targets)

    return model

In [None]:
def forward(data, model, device, criterion):
    """Run one batch through ``model`` and compute the availability-masked loss.

    Args:
        data: mapping with the entries ``"image"``, ``"target_positions"`` and
            ``"target_availabilities"`` (tensors).
        model: callable applied to the image batch; its output is reshaped to
            the shape of ``data["target_positions"]``.
        device: device the three tensors are moved to.
        criterion: loss callable invoked as ``criterion(outputs, targets)``.
            It is expected to return an un-reduced, element-wise loss so that
            the availability mask can be applied - confirm at the call site.

    Returns:
        ``(loss, outputs)``: the mean of the masked loss and the reshaped
        model predictions.
    """
    images = data["image"].to(device)
    gt_positions = data["target_positions"].to(device)
    # Add a trailing axis so the mask broadcasts over the last axis of the loss.
    availability_mask = data["target_availabilities"].unsqueeze(-1).to(device)

    # Forward pass
    predictions = model(images).reshape(gt_positions.shape)

    # Not every output step is valid: masked entries are zeroed before the mean.
    # The mean is still taken over all elements, including the zeroed ones.
    masked_loss = criterion(predictions, gt_positions) * availability_mask
    return masked_loss.mean(), predictions

## Load the Train Data

Our data pipeline maps a raw `.zarr` folder into a multi-processing instance ready for training by:
- loading the `zarr` into a `ChunkedDataset` object. This object has a reference to the different arrays in the zarr (e.g. agents and traffic lights);
- wrapping the `ChunkedDataset` into an `AgentDataset`, which inherits from the torch `Dataset` class;
- passing the `AgentDataset` into a torch `DataLoader`.

In [None]:
# ===== INIT DATASET
# Loader settings for the training split (dataset key, shuffle, batch size, workers).
train_cfg = cfg["train_data_loader"]

# Rasterizer built from the global config and the data manager.
rasterizer = build_rasterizer(cfg, dm)

# Resolve the dataset key through the data manager, then open the chunked dataset.
train_zarr_location = dm.require(train_cfg["key"])
train_zarr = ChunkedDataset(train_zarr_location).open()

# Wrap the opened dataset and hand it to a torch DataLoader.
train_dataset = AgentDataset(cfg, train_zarr, rasterizer)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=train_cfg["batch_size"],
    shuffle=train_cfg["shuffle"],
    num_workers=train_cfg["num_workers"],
)
print(train_dataset)

In [None]:
# ==== INIT MODEL
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

model = build_model(cfg)
model = model.to(device)

# Adam over all model parameters with a fixed learning rate.
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# reduction="none" keeps the loss element-wise; `forward` multiplies it by the
# availabilities and takes the mean itself.
criterion = nn.MSELoss(reduction="none")

# Training

In [None]:
# ==== TRAIN LOOP
# Runs a fixed number of optimisation steps (cfg["train_params"]["max_num_steps"]),
# restarting the dataloader iterator whenever it is exhausted.
tr_it = iter(train_dataloader)
progress_bar = tqdm(range(cfg["train_params"]["max_num_steps"]))
losses_train = []

# Training mode and autograd are loop-invariant: enable them once up front
# instead of on every step.
model.train()
torch.set_grad_enabled(True)

for _ in progress_bar:
    try:
        data = next(tr_it)
    except StopIteration:
        # the dataloader ran out before max_num_steps: start a new pass
        tr_it = iter(train_dataloader)
        data = next(tr_it)
    loss, _ = forward(data, model, device, criterion)

    # Backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Read the scalar once and reuse it for both the history and the progress bar.
    loss_value = loss.item()
    losses_train.append(loss_value)
    progress_bar.set_description(f"loss: {loss_value} loss(avg): {np.mean(losses_train)}")

# Evaluation

In [None]:
# ===== INIT DATASET
# Loader settings for the validation split (dataset key, shuffle, batch size, workers).
eval_cfg = cfg["val_data_loader"]

# Rasterizer built from the global config and the data manager.
rasterizer = build_rasterizer(cfg, dm)

# Resolve the dataset key through the data manager, then open the chunked dataset.
eval_zarr_location = dm.require(eval_cfg["key"])
eval_zarr = ChunkedDataset(eval_zarr_location).open()

# Wrap the opened dataset and hand it to a torch DataLoader.
eval_dataset = AgentDataset(cfg, eval_zarr, rasterizer)
eval_dataloader = DataLoader(
    eval_dataset,
    batch_size=eval_cfg["batch_size"],
    shuffle=eval_cfg["shuffle"],
    num_workers=eval_cfg["num_workers"],
)
print(eval_dataset)

In [None]:
# ==== EVAL LOOP
# Runs the model over the whole evaluation dataloader, tracking the loss and
# collecting everything needed to write the prediction / ground-truth CSVs.
model.eval()
losses_eval = []

# store information for evaluation
future_coords_offsets_pd = []
future_coords_offsets_gt = []
target_availabilities = []

timestamps = []
agent_ids = []
progress_bar = tqdm(eval_dataloader)

# Disable autograd only for the duration of the loop. A bare
# torch.set_grad_enabled(False) would stay in effect for every cell run
# afterwards, which is hidden notebook state.
with torch.no_grad():
    for data in progress_bar:
        loss, outputs = forward(data, model, device, criterion)
        loss_value = loss.item()
        losses_eval.append(loss_value)
        progress_bar.set_description(f"Running EVAL, loss: {loss_value} loss(avg): {np.mean(losses_eval)}")

        # copy so the stored arrays do not alias the batch tensors' memory
        future_coords_offsets_pd.append(outputs.cpu().numpy().copy())
        future_coords_offsets_gt.append(data["target_positions"].numpy().copy())
        target_availabilities.append(data["target_availabilities"].numpy().copy())

        timestamps.append(data["timestamp"].numpy().copy())
        agent_ids.append(data["track_id"].numpy().copy())
    

### Save results and perform evaluation
After the model has predicted trajectories for our evaluation set, we can save them in a `csv` file. To simulate a complete evaluation session we can also save the ground truth in another `csv` and get the score.

Please note that our metric supports multi-modal predictions (i.e. multiple predictions for a single GT trajectory). In that case, you will need to provide a confidence for each prediction (confidences must all be between 0 and 1 and sum to 1).

In this simple example we don't generate multiple trajectories, so we won't pass any confidence vector. Internally, the metric computation will assume a single trajectory with confidence equal to 1.

In [None]:
# ==== COMPUTE CSV
# Both files are written to the system temporary directory.
pred_path = f"{gettempdir()}/pred.csv"
gt_path = f"{gettempdir()}/gt.csv"

# The per-batch timestamps and track ids are shared by both files, so join them once.
all_timestamps = np.concatenate(timestamps)
all_track_ids = np.concatenate(agent_ids)

# Ground truth: target coordinates plus their availabilities.
write_gt_csv(
    gt_path,
    timestamps=all_timestamps,
    track_ids=all_track_ids,
    coords=np.concatenate(future_coords_offsets_gt),
    avails=np.concatenate(target_availabilities),
)

# Predictions: no confidences are passed (a single trajectory per agent).
write_pred_csv(
    pred_path,
    timestamps=all_timestamps,
    track_ids=all_track_ids,
    coords=np.concatenate(future_coords_offsets_pd),
)

# Score the predictions against the ground truth.
error = compute_error_csv(gt_path, pred_path)
print(error)


### Visualise Results
We can also visualise some results from the ego (AV) point of view. 

We can use the `get_frame_indices` function to find a frame with interesting agents.

In [None]:
# Ego (AV) view over the same underlying dataset and rasterizer as the agent dataset.
eval_ego_dataset = EgoDataset(cfg, eval_dataset.dataset, rasterizer)

# Pick the first frame that has at least `min_agents_to_pick` agent samples.
min_agents_to_pick = 4
num_frames = len(eval_ego_dataset.dataset.frames)
for idx_frame in range(num_frames):
    if len(eval_dataset.get_frame_indices(idx_frame)) < min_agents_to_pick:
        continue
    frame_number = idx_frame
    break
else:
    # the scan finished without finding a suitable frame
    raise ValueError(f"can't find a frame with at least {min_agents_to_pick} agents in it")

model.eval()
torch.set_grad_enabled(False)

# get AV point-of-view frame
data_ego = eval_ego_dataset[frame_number]
# move axis 0 of the raster to the end before converting it to RGB
# (assumes the raster is stored channels-first - TODO confirm)
ego_raster = data_ego["image"].transpose(1, 2, 0)
im_ego = rasterizer.to_rgb(ego_raster)

# Not used by the drawing code below; kept because later cells may read it.
raster_params = cfg["raster_params"]
center = np.asarray(raster_params["ego_center"]) * raster_params["raster_size"]

# Run the model on every agent sample of the chosen frame.
agent_indices = eval_dataset.get_frame_indices(frame_number)
predicted_world = []
target_world = []
for v_index in agent_indices:
    data_agent = eval_dataset[v_index]
    agent_xy = data_agent["centroid"][:2]

    # add a leading batch axis of size 1 for the network
    agent_batch = torch.from_numpy(data_agent["image"]).unsqueeze(0).to(device)
    out_net = model(agent_batch)
    out_pos = out_net[0].reshape(-1, 2).detach().cpu().numpy()

    # store absolute world coordinates (offsets shifted by the agent centroid)
    predicted_world.append(out_pos + agent_xy)
    target_world.append(data_agent["target_positions"] + agent_xy)

# convert coordinates to AV point-of-view so we can draw them
world_to_image = data_ego["world_to_image"]
predicted_positions = transform_points(np.concatenate(predicted_world), world_to_image)
target_positions = transform_points(np.concatenate(target_world), world_to_image)

# one zero yaw per point is passed to the drawing helper
yaws = np.zeros((len(predicted_positions), 1))
draw_trajectory(im_ego, predicted_positions, yaws, PREDICTED_POINTS_COLOR)
draw_trajectory(im_ego, target_positions, yaws, TARGET_POINTS_COLOR)

plt.imshow(im_ego[::-1])