In [None]:
from typing import Dict

from tempfile import gettempdir

import numpy as np
import torch
from torch import nn, optim
from torchvision.models.resnet import resnet50
from tqdm import tqdm

from l5kit.configs import load_config_data
from l5kit.data import LocalDataManager
from l5kit.dataset import AgentDataset, EgoDataset
from l5kit.dataset.utilities import build_dataloader
from l5kit.rasterization import build_rasterizer
from l5kit.evaluation import write_coords_as_csv, compute_mse_error_csv
from l5kit.geometry import transform_points
from l5kit.visualization import PREDICTED_POINTS_COLOR, TARGET_POINTS_COLOR, draw_trajectory
from matplotlib import pyplot as plt
from prettytable import PrettyTable

import os

## Prepare Data path and load cfg

By setting the `L5KIT_DATA_FOLDER` variable, we can point the script to the folder where the data lie.

Then, we load our config file with relative paths and other configurations (rasterizer, training params...).

In [None]:
# set env variable for data: the dataset root is communicated through
# L5KIT_DATA_FOLDER (presumably picked up by LocalDataManager(None) in a later
# cell -- confirm against l5kit), so this assignment has to run first
os.environ["L5KIT_DATA_FOLDER"] = "/tmp/l5kit_data"
# get config; the path is relative to the notebook's working directory.
# Later cells index the result by section ("model_params", "train_params",
# "raster_params").
cfg = load_config_data("./agent_motion_config.yaml")
# echo the loaded settings so they are visible in the cell output
print(cfg)

## Model

Our baseline is a simple `resnet50` pretrained on `imagenet`. We must replace the input and the final layer to address our requirements.

In [None]:
def build_model(cfg: Dict) -> torch.nn.Module:
    """Build the ResNet-50 baseline with its input and output layers replaced.

    Args:
        cfg: configuration dictionary; this function reads
            ``cfg["model_params"]["history_num_frames"]`` and
            ``cfg["model_params"]["future_num_frames"]``.

    Returns:
        A ``resnet50`` whose first convolution accepts
        ``3 + 2 * (history_num_frames + 1)`` input channels and whose final
        linear layer emits ``2 * future_num_frames`` values.
    """
    # load pre-trained Conv2D model
    model = resnet50(pretrained=True)

    # change input size
    # two channels per frame for the history frames plus the current one
    # (presumably agent and ego masks -- confirm against the rasterizer) ...
    num_history_channels = (cfg["model_params"]["history_num_frames"] + 1) * 2
    # ... on top of 3 more channels (presumably the RGB map raster)
    num_in_channels = 3 + num_history_channels
    # The replacement keeps the geometry of the original first convolution but
    # is a fresh layer, so its weights are newly initialised, not pre-trained.
    model.conv1 = nn.Conv2d(
        num_in_channels,
        model.conv1.out_channels,
        kernel_size=model.conv1.kernel_size,
        stride=model.conv1.stride,
        padding=model.conv1.padding,
        bias=False,
    )
    # change output size
    # X, Y  * number of future states
    num_targets = 2 * cfg["model_params"]["future_num_frames"]
    # Take the feature width from the layer being replaced instead of
    # hard-coding it, so the head always matches the backbone.
    model.fc = nn.Linear(in_features=model.fc.in_features, out_features=num_targets)

    return model

In [None]:
def forward(data, model, device, criterion):
    """Run one batch through the model and compute its mean loss.

    Args:
        data: batch mapping with "image" and "target_positions" entries.
        model: callable applied to the image batch.
        device: device both tensors are moved to before use.
        criterion: loss callable invoked as ``criterion(outputs, targets)``.

    Returns:
        ``(loss, outputs)`` where ``loss`` is the mean of the criterion's
        result and ``outputs`` is the raw model output.
    """
    image_batch = data["image"].to(device)

    # Keep the batch dimension and flatten everything else so the targets
    # can be compared against the model output.
    raw_targets = data["target_positions"]
    flat_targets = raw_targets.to(device).reshape(len(raw_targets), -1)

    # Forward pass
    predictions = model(image_batch)
    mean_loss = criterion(predictions, flat_targets).mean()
    return mean_loss, predictions

## Load the data

In [None]:
# ===== INIT DATASETS
# The manager is created without an explicit root (presumably it falls back
# to the L5KIT_DATA_FOLDER environment variable -- confirm against l5kit).
dm = LocalDataManager(None)
rasterizer = build_rasterizer(cfg, dm)

# Both loaders share the config, data manager, dataset class and rasterizer;
# only the split name differs. "train" is built first, then "val".
train_dataloader, eval_dataloader = (
    build_dataloader(cfg, split_name, dm, AgentDataset, rasterizer)
    for split_name in ("train", "val")
)


In [None]:
# ==== INIT MODEL
# Use the first CUDA device when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

model = build_model(cfg).to(device)
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)
# reduction="none" keeps the element-wise losses; forward() averages them.
criterion = nn.MSELoss(reduction="none")

# Training

In [None]:
# ==== TRAIN LOOP
# Put the model in training mode and enable autograd once, before the loop:
# neither setting changes between steps.
model.train()
torch.set_grad_enabled(True)

tr_it = iter(train_dataloader)
progress_bar = tqdm(range(cfg["train_params"]["max_num_steps"]))
losses_train = []
for _ in progress_bar:
    # Training runs for a fixed number of steps rather than epochs, so the
    # dataloader is restarted whenever it is exhausted.
    try:
        data = next(tr_it)
    except StopIteration:
        tr_it = iter(train_dataloader)
        data = next(tr_it)

    loss, _ = forward(data, model, device, criterion)

    # Backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Read the scalar back from the device once and reuse it for both the
    # history and the progress-bar text.
    loss_value = loss.item()
    losses_train.append(loss_value)
    progress_bar.set_description(f"loss: {loss_value} loss(avg): {np.mean(losses_train)}")

# Evaluation
We can now run inference and store the predicted and annotated trajectories.

In this example we run it on a single scene from the eval dataset because of computational constraints.

In [None]:
# ==== EVAL LOOP
# Inference mode for the model; autograd is switched off globally and stays
# off after this cell finishes.
model.eval()
torch.set_grad_enabled(False)
losses_eval = []

# store information for evaluation
future_coords_offsets_pd = []  # per-batch predictions, reshaped to (batch, -1, 2)
future_coords_offsets_gt = []  # per-batch targets, reshaped the same way

timestamps = []
agent_ids = []
progress_bar = tqdm(eval_dataloader)
for data in progress_bar:
    loss, outputs = forward(data, model, device, criterion)
    # Read the scalar back from the device once and reuse it below.
    loss_value = loss.item()
    losses_eval.append(loss_value)
    progress_bar.set_description(f"Running EVAL, loss: {loss_value} loss(avg): {np.mean(losses_eval)}")

    # The model output is flat per sample; regroup it into coordinate pairs.
    batch_size = len(outputs)
    future_coords_offsets_pd.append(outputs.reshape(batch_size, -1, 2).cpu().numpy())
    future_coords_offsets_gt.append(data["target_positions"].reshape(batch_size, -1, 2).cpu().numpy())

    # Collected per batch so each prediction can be identified when the
    # results are written out.
    timestamps.append(data["timestamp"].numpy())
    agent_ids.append(data["track_id"].numpy())
    

### Save results in the competition format and perform evaluation
After the model has predicted trajectories for our evaluation set, we can save them in a `csv` file in the competition format. To simulate a complete evaluation session we can also save the GT in another `csv` and get the scores.

We will get `future_num_frames` values, each corresponding to the MSE (mean of squared errors) for that timestep.

In [None]:
# ==== COMPUTE CSV
pred_path = f"{gettempdir()}/pred.csv"
gt_path = f"{gettempdir()}/gt.csv"

# Predictions and ground truth are written with the same call; only the
# destination file and the coordinate batches differ.
for csv_path, coords_batches in (
    (pred_path, future_coords_offsets_pd),
    (gt_path, future_coords_offsets_gt),
):
    write_coords_as_csv(
        csv_path,
        future_num_frames=cfg["model_params"]["future_num_frames"],
        future_coords_offsets=np.concatenate(coords_batches),
        timestamps=np.concatenate(timestamps),
        agent_ids=np.concatenate(agent_ids),
    )

# get a pretty visualisation of the errors
errors = compute_mse_error_csv(gt_path, pred_path)
steps = range(1, cfg["model_params"]["future_num_frames"] + 1)

table = PrettyTable(field_names=["future step", "MSE"])
table.float_format = ".2"
# One row per future step, numbered from 1.
for step_idx, step_mse in zip(steps, errors):
    table.add_row([step_idx, step_mse])
print(table)

### Visualise results
We can also visualise some results from the ego (AV) point of view. Let's have a look at frame number `5198`.

In [None]:
# Dig through the dataloader's dataset wrappers to reach the agent dataset
# (presumably a concat-of-subsets built by build_dataloader -- confirm).
eval_agent_dataset = eval_dataloader.dataset.datasets[0].dataset
eval_ego_dataset = EgoDataset(cfg, eval_agent_dataset.dataset, rasterizer)
frame_number = 5198

model.eval()
torch.set_grad_enabled(False)

# get AV point-of-view frame
data_ego = eval_ego_dataset[frame_number]
# move the first image axis to the end before converting to RGB
im_ego = rasterizer.to_rgb(data_ego["image"].transpose(1, 2, 0))

# NOTE: computed here but not used further in this cell
center = np.asarray(cfg["raster_params"]["ego_center"]) * cfg["raster_params"]["raster_size"]
agent_indices = eval_agent_dataset.get_frame_indices(frame_number)

predicted_positions = []
target_positions = []

for agent_index in agent_indices:
    agent_sample = eval_agent_dataset[agent_index]

    # run the network on this single agent (batch of one)
    image_batch = torch.from_numpy(agent_sample["image"]).unsqueeze(0).to(device)
    predicted_offsets = model(image_batch)[0].reshape(-1, 2).detach().cpu().numpy()

    # store absolute world coordinates: offsets are shifted by the agent centroid
    centroid_xy = agent_sample["centroid"][:2]
    predicted_positions.append(predicted_offsets + centroid_xy)
    target_positions.append(agent_sample["target_positions"] + centroid_xy)

# convert coordinates to AV point-of-view so we can draw them
world_to_image = data_ego["world_to_image"]
predicted_positions = transform_points(np.concatenate(predicted_positions), world_to_image)
target_positions = transform_points(np.concatenate(target_positions), world_to_image)

# the same all-zero yaws are passed for both trajectories
yaws = np.zeros((len(predicted_positions), 1))
draw_trajectory(im_ego, predicted_positions, yaws, PREDICTED_POINTS_COLOR)
draw_trajectory(im_ego, target_positions, yaws, TARGET_POINTS_COLOR)

# rows are reversed so the image is shown flipped vertically
plt.imshow(im_ego[::-1])