In [1]:
import typing

import numpy.typing as npt
from tensorflow.data import TFRecordDataset

from protos import scenario_pb2

2023-08-05 17:03:16.596898: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-05 17:03:16.637679: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-05 17:03:16.638158: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
import os

def getFiles(path: str) -> list[str]:
    """
    List the regular files located directly inside ``path``.

    Args:
        path: Directory to scan. A leading ``~`` is expanded to the user's home.

    Returns:
        Full paths of the entries that are regular files (sub-directories are
        skipped; there is no recursion), sorted by file name.

    Raises:
        OSError: propagated from ``os.listdir`` (e.g. the directory is missing).
    """
    root = os.path.expanduser(path)
    # os.listdir returns entries in arbitrary order; sorting makes the order in
    # which files are processed reproducible from run to run.
    candidates = (os.path.join(root, name) for name in sorted(os.listdir(root)))
    return [candidate for candidate in candidates if os.path.isfile(candidate)]

# Collect the files found directly under ~/data/waymo/ (hardcoded location;
# the `~` is expanded inside getFiles).
files = getFiles('~/data/waymo/')

In [3]:
import tqdm
from dataclasses import dataclass
import numpy as np

@dataclass
class State:
    """
    One valid timestep of a track, as built by ``parse_scenario``.

    NOTE(review): units and coordinate frame are not visible in this notebook —
    confirm against the scenario proto definition.
    """
    # Copied from the track state's `heading` field.
    heading: float
    # float32 array `[velocity_x, velocity_y]` assembled from the track state.
    velocity: np.ndarray
    # Copied from the track state's `length`, `width` and `height` fields.
    length: float
    width: float
    height: float



def parse_scenario(scenario: scenario_pb2.Scenario) -> list[State]:
    """
    Extract the states of the track selected by ``scenario.sdc_track_index``.

    Entries whose ``valid`` flag is falsy are dropped; every remaining entry is
    converted to a ``State`` (heading, float32 ``[velocity_x, velocity_y]``,
    length, width, height), preserving the original order.
    """
    sdc_track = scenario.tracks[scenario.sdc_track_index]
    return [
        State(
            step.heading,
            np.array([step.velocity_x, step.velocity_y], dtype=np.float32),
            step.length,
            step.width,
            step.height,
        )
        for step in sdc_track.states
        if step.valid
    ]


# Accumulates one list of States per scenario record, across all files.
h: list[list[State]] = []

# NOTE(review): the saved output of this cell ends in KeyboardInterrupt at 23/100
# files (~40 s per file), so in that session `h` held only part of the data.
# Consider caching the parsed result so a full re-run is feasible.
for file_path in tqdm.tqdm(files):
    # compression_type="" — presumably the shards are uncompressed TFRecords; confirm.
    for data in TFRecordDataset(file_path, compression_type="").as_numpy_iterator():
        # Each record is deserialized into a Scenario proto and reduced to a
        # list of State objects by parse_scenario.
        scenario = scenario_pb2.Scenario()
        scenario.ParseFromString(data)
        h.append(parse_scenario(scenario))


  0%|          | 0/100 [00:00<?, ?it/s]

2023-08-05 17:03:18.677044: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-08-05 17:03:18.698764: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
 23%|██▎       | 23/100 [15:14<51:02, 39.77s/it]  


KeyboardInterrupt: 

In [None]:
# NOTE(review): `scenarios` is not defined in any visible cell and this cell has
# no execution count; on a fresh kernel it likely raises NameError.
# TODO: either keep the parsed Scenario protos in a `scenarios` list or remove this cell.
s = scenarios[0]
s.tracks[s.sdc_track_index]

In [7]:
# Report how many trajectories were loaded and, if any, their mean length.
n_trajectories = len(h)
print("trajectories: ", n_trajectories)
if n_trajectories > 0:
    lens = list(map(len, h))
    print("avg len: ", sum(lens) / len(lens))

trajectories:  1719
avg len:  198.4822571262362


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# create an idm that attempts to predict: throttle and steering
# given state at current timestep, and state at next timestep:
# each state contains: velocity_x, velocity_y, heading, length, width, and height 
class IDM(nn.Module):
    """
    Small network mapping a ``(batch, 6, 2)`` tensor to a ``(batch, 2)`` tensor.

    Architecture: ``Conv1d(6 -> 64, kernel_size=2)`` -> ReLU -> flatten ->
    ``Linear(64 -> 32)`` -> ReLU -> ``Linear(32 -> 2)``.
    """

    def __init__(self):
        super().__init__()
        # The kernel covers both positions of the length-2 axis, so the
        # convolution output has length 1: (B, 6, 2) -> (B, 64, 1).
        self.conv1 = nn.Conv1d(in_channels=6, out_channels=64, kernel_size=2)
        self.fc1 = nn.Linear(in_features=64, out_features=32)
        self.fc2 = nn.Linear(in_features=32, out_features=2)

    def forward(self, x: torch.Tensor):
        """Run the network on ``x`` of shape (B, 6, 2) and return a (B, 2) tensor."""
        features = torch.relu(self.conv1(x))      # (B, 64, 1)
        features = features.flatten(start_dim=1)  # (B, 64)
        hidden = torch.relu(self.fc1(features))   # (B, 32)
        return self.fc2(hidden)                   # (B, 2)


# How to train the Inverse Dynamics Model

An IDM (Inverse Dynamics Model) is a model that predicts the control input (steering angle and acceleration) given the current state of the vehicle and the next state of the vehicle. In RL parlance, we have $s_t$ and $s_{t+1}$ and we want to predict $a_t$.

We're training the IDM on the Waymo Motion Dataset. The dataset contains several thousand trajectories of vehicles driving in a variety of environments. The trajectories are sampled at 10Hz, and each sample contains the state of the vehicle (position, velocity, heading, etc.) and the environment (traffic lights, other vehicles, etc.). However, it does not contain the control input (steering angle and acceleration) of the vehicle. This is what we want to predict.

To do this, we'll leverage the MetaDrive simulator. We reformulate the problem of predicting the action as an RL game in which the model tries to take the action that makes the simulator's next state as close as possible to the ground-truth next state. The reward is the negative of the distance between the simulated next state and the ground-truth next state. We train the model using PPO. 

In [8]:
import metadrive
import gymnasium as gym

def run_game(env:gym.Env, st: State, st1: State,  policy:typing.Callable[[State, State], tuple[float, float]]) -> tuple[tuple[float, float], float]:
    """
    Play a single-step game: start the simulated vehicle in state ``st``, apply the
    action chosen by ``policy`` and score how close the simulator lands to ``st1``.

    Args:
        env: Environment exposing ``reset``, ``step`` and a ``vehicle`` with
            ``set_velocity``, ``set_heading_theta``, ``velocity`` and ``heading_theta``.
        st: State at the current timestep.
        st1: Ground-truth state at the next timestep.
        policy: Callable mapping ``(st, st1)`` to the action passed to ``env.step``.

    Returns:
        ``(action, reward)`` where ``reward`` is the negative of
        ``velocity_error + heading_error`` (0 is a perfect match).
    """
    # State stores the velocity as a single 2-vector (there are no velocity_x /
    # velocity_y attributes on State).
    st_velocity = np.asarray(st.velocity)
    st1_velocity = np.asarray(st1.velocity)

    # set the state
    env.reset()
    env.vehicle.set_velocity(st_velocity)
    env.vehicle.set_heading_theta(st.heading)

    # run the policy
    action = policy(st, st1)
    env.step(action)

    # compare the simulated next state with the ground truth
    sim_st1_velocity = env.vehicle.velocity[:2]
    velocity_error = np.linalg.norm(sim_st1_velocity - st1_velocity)

    sim_st1_heading = env.vehicle.heading_theta
    heading_diff = sim_st1_heading - st1.heading
    # Wrap the difference into [-pi, pi] so headings on either side of the
    # +/-pi seam are not treated as far apart.
    heading_error = abs(np.arctan2(np.sin(heading_diff), np.cos(heading_diff)))

    # NOTE(review): the two error terms are summed unweighted although they are
    # different quantities (velocity vs. angle) — confirm the intended weighting.
    reward = -float(velocity_error + heading_error)

    return action, reward

def obs_batch_to_tensor(obs: list[npt.NDArray[np.float32]], device: torch.device) -> torch.Tensor:
    """
    Stack per-sample (H, W, C) observations into one float32 tensor on ``device``
    and reorder the axes from (B, H, W, C) to (B, C, H, W).
    """
    stacked = np.stack(obs)  # (B, H, W, C)
    batch_bhwc = torch.tensor(stacked, dtype=torch.float32, device=device)
    # Move the channel axis right after the batch axis.
    return batch_bhwc.permute(0, 3, 1, 2)


def deviceof(m: nn.Module) -> torch.device:
    """
    Return the device holding the first parameter of ``m``.

    Raises ``StopIteration`` when the module has no parameters.
    """
    first_param = next(m.parameters())
    return first_param.device

def rewards_to_go(trajectory_rewards: list[float], gamma: float) -> list[float]:
    """
    Computes the gamma discounted reward-to-go for each state in the trajectory.

    ``rtg[t] = r[t] + gamma * rtg[t + 1]``, with ``rtg[T - 1] = r[T - 1]``.

    Args:
        trajectory_rewards: Per-step rewards of one trajectory, in time order.
        gamma: Discount factor applied once per step.

    Returns:
        A list of Python floats with the same length as ``trajectory_rewards``;
        an empty trajectory yields an empty list.
    """
    trajectory_len = len(trajectory_rewards)

    # An empty trajectory has no last element to seed the recurrence with.
    if trajectory_len == 0:
        return []

    v_batch = np.zeros(trajectory_len)

    # The final step has nothing after it, so its reward-to-go is its own reward.
    v_batch[-1] = trajectory_rewards[-1]

    # Walk backwards, discounting the already-computed tail by gamma.
    for t in reversed(range(trajectory_len - 1)):
        v_batch[t] = trajectory_rewards[t] + gamma * v_batch[t + 1]

    # tolist() yields plain Python floats, matching the declared return type.
    return v_batch.tolist()

Successfully registered the following environments: ['MetaDrive-validation-v0', 'MetaDrive-10env-v0', 'MetaDrive-100envs-v0', 'MetaDrive-1000envs-v0', 'SafeMetaDrive-validation-v0', 'SafeMetaDrive-10env-v0', 'SafeMetaDrive-100envs-v0', 'SafeMetaDrive-1000envs-v0', 'MARLTollgate-v0', 'MARLBottleneck-v0', 'MARLRoundabout-v0', 'MARLIntersection-v0', 'MARLParkingLot-v0', 'MARLMetaDrive-v0'].


In [None]:
def compute_policy_gradient_loss(
    # Action distribution produced by the current policy network for each state
    # batch shape = (Batch,), event shape = (2,)
    pi_theta_given_st: torch.distributions.MultivariateNormal,
    # Actions that were actually taken
    # in (Batch, 2)
    a_t: torch.Tensor,
    # Rewards-to-go associated with each taken action
    # in (Batch,)
    R_t: torch.Tensor,
) -> torch.Tensor:
    r"""
    Mean-reduced policy gradient surrogate loss over a batch of transitions.

    The standard policy gradient is the expectation over trajectories of:

    :math:`\sum_{t=0}^{T} \nabla_{\theta} (\log \pi_{\theta}(a_t|s_t))R_t`

    where:
    * :math:`\pi_{\theta}(a_t|s_t)` is the probability the current policy assigns to action :math:`a_t` in state :math:`s_t`
    * :math:`R_t` is the rewards-to-go from time t until the end of the episode the transition came from.

    Rather than handling raw gradients, this returns the surrogate
    :math:`-\log \pi_{\theta}(a_t|s_t)R_t`, whose derivative with respect to
    :math:`\theta` is the (negated) policy gradient for that transition.
    """

    # Note: the value of this loss does not say whether an action was good or bad;
    # it is a dummy quantity that exists only so autograd yields the policy gradient.

    # in (Batch,)
    log_prob_of_chosen = pi_theta_given_st.log_prob(a_t)
    weighted_log_prob = log_prob_of_chosen * R_t

    # negate (we minimise) and average over all examples
    return torch.mean(-weighted_log_prob)


def train_policygradient(
    policy_network: PolicyNetwork,
    policy_optimizer: torch.optim.Optimizer,
    observation_batch: list[npt.NDArray],
    action_batch: list[tuple[float, float]],
    rtg_batch: list[float],
) -> float:
    """
    Perform one policy gradient optimisation step on a batch of transitions.

    Args:
        policy_network: Network whose output supports ``log_prob`` (it is passed to
            ``compute_policy_gradient_loss`` as the action distribution).
            NOTE(review): ``PolicyNetwork`` is not defined in the visible notebook — confirm.
        policy_optimizer: Optimizer that updates ``policy_network``.
        observation_batch: Observations, converted with ``obs_batch_to_tensor``.
        action_batch: The action taken for each observation.
        rtg_batch: The reward-to-go for each observation.

    Returns:
        The scalar policy loss for this batch.

    Raises:
        AssertionError: if the three batches differ in length.
    """
    # assert that the batch_lengths are the same
    assert len(observation_batch) == len(action_batch)
    assert len(observation_batch) == len(rtg_batch)

    # get device
    device = deviceof(policy_network)

    # convert data to tensors on correct device

    # in (Batch, C, H, W)
    observation_batch_tensor = obs_batch_to_tensor(observation_batch, device)

    # in (Batch,)
    rtg_batch_tensor = torch.tensor(
        rtg_batch, dtype=torch.float32, device=device
    )

    # in (Batch, 2)
    # Explicit float32, like the other tensors, so actions that arrive as
    # float64 or ints do not produce a differently-typed tensor.
    chosen_action_tensor = torch.tensor(
        action_batch, dtype=torch.float32, device=device
    )

    # train policy
    policy_optimizer.zero_grad()
    # Call the module itself rather than .forward() so registered hooks run.
    action_probs = policy_network(observation_batch_tensor)
    policy_loss = compute_policy_gradient_loss(
        action_probs, chosen_action_tensor, rtg_batch_tensor
    )
    policy_loss.backward()
    policy_optimizer.step()

    # return the policy loss as a Python float
    return policy_loss.item()

In [None]:
# Create the simulator environment. "MetaDrive-validation-v0" is one of the ids
# listed in the registration message printed when metadrive was imported; an
# empty config dict is passed.
env = gym.make("MetaDrive-validation-v0", config={})

In [None]:
# train idm model using metadrive as the ground truth


In [None]:
# Close the environment created above once it is no longer needed.
env.close()