In [1]:
import typing

from tensorflow.data import TFRecordDataset

from protos import scenario_pb2

2023-08-05 19:33:36.548978: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-05 19:33:36.584248: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-05 19:33:36.584700: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
import os

def getFiles(path: str) -> list[str]:
    """
    List the regular files directly inside a directory.

    Args:
        path: Directory to scan; a leading ``~`` is expanded to the user's
            home directory.

    Returns:
        Full paths of the regular files in the directory. Sub-directories are
        skipped and nothing is scanned recursively. The result is sorted by
        file name so the order is identical on every run.

    Raises:
        OSError: propagated from ``os.listdir`` when the directory cannot be
            read (for example, when it does not exist).
    """
    path = os.path.expanduser(path)
    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    candidates = (os.path.join(path, name) for name in sorted(os.listdir(path)))
    return [candidate for candidate in candidates if os.path.isfile(candidate)]

# Collect the regular files found directly under the data directory
# (getFiles expands "~" and skips sub-directories).
files = getFiles('~/data/waymo/')

In [4]:
import tqdm
from dataclasses import dataclass
import numpy as np

@dataclass
class State:
    """Heading and planar velocity of a vehicle at a single timestep."""

    # Heading angle copied from the track state (presumably radians — TODO confirm).
    heading: float
    # 2-element array [velocity_x, velocity_y], as built by parse_scenario.
    velocity: np.ndarray


def parse_scenario(scenario: scenario_pb2.Scenario) -> list[State]:
    """
    Extract the valid states of the self-driving car from a scenario.

    Only the track at ``scenario.sdc_track_index`` is read. Entries whose
    ``valid`` flag is false are dropped, so the returned list can be shorter
    than the track itself.

    Returns:
        One ``State`` per valid entry, in track order, with the velocity
        stored as a float32 array ``[velocity_x, velocity_y]``.
    """
    sdc_track = scenario.tracks[scenario.sdc_track_index]
    return [
        State(
            step.heading,
            np.array([step.velocity_x, step.velocity_y], dtype=np.float32),
        )
        for step in sdc_track.states
        if step.valid
    ]


# One entry per scenario: the valid self-driving-car states of that scenario.
h: list[list[State]] = []

for file_path in tqdm.tqdm(files):
    # Every record in the file is parsed as one serialized Scenario message.
    serialized_records = TFRecordDataset(
        file_path, compression_type=""
    ).as_numpy_iterator()
    for data in serialized_records:
        scenario = scenario_pb2.Scenario()
        scenario.ParseFromString(data)
        h.append(parse_scenario(scenario))


  0%|          | 0/100 [00:00<?, ?it/s]

8.535740852355957 -0.9118360877037048
8.434904098510742 -0.89930659532547
8.238272666931152 -0.8825401663780212
8.038335800170898 -0.8822664618492126
7.8264923095703125 -0.8686396479606628
7.622013568878174 -0.8484378457069397
7.418258190155029 -0.8321307897567749
7.216777324676514 -0.8089041113853455
7.027111053466797 -0.7868269085884094
6.839354991912842 -0.7673588991165161
6.668615341186523 -0.7504806518554688
6.518078804016113 -0.7356753945350647
6.368095397949219 -0.7246295213699341
6.222995281219482 -0.702028751373291
6.088409423828125 -0.6884111762046814
5.962471008300781 -0.6800428628921509
5.845017433166504 -0.6604745388031006
5.730380058288574 -0.6475902199745178
5.622725486755371 -0.6325628757476807
5.52157735824585 -0.6270589232444763
5.421697616577148 -0.6179238557815552
5.325798511505127 -0.5993459820747375
5.233363628387451 -0.594534158706665
5.142998218536377 -0.5833777189254761
5.0534868240356445 -0.5706144571304321
4.96184778213501 -0.5609612464904785
4.86977100372314

  0%|          | 0/100 [00:03<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Scratch inspection of the self-driving-car track of the first scenario.
# NOTE(review): `scenarios` is not assigned in any cell visible in this
# notebook, so this cell would raise NameError on a fresh kernel — confirm
# where it is meant to come from, or remove the cell.
s = scenarios[0]
s.tracks[s.sdc_track_index]

In [7]:
# Report how many trajectories were parsed and their mean length.
print("trajectories: ", len(h))
if h:
    # Skipped for an empty list so the mean never divides by zero.
    lens = list(map(len, h))
    print("avg len: ", sum(lens) / len(lens))

trajectories:  1719
avg len:  198.4822571262362


In [None]:
import random

# Fixed seed so the shuffled order (and therefore the training run that
# consumes it) is the same after every kernel restart.
SHUFFLE_SEED = 0

# Convert each trajectory into (state, next_state) tuples of neighbouring
# list entries; trajectories with fewer than two states contribute nothing.
# NOTE(review): parse_scenario drops invalid states, so neighbouring entries
# are not guaranteed to be adjacent timesteps — confirm this is acceptable.
dataset = [
    (current, following)
    for states in h
    for current, following in zip(states, states[1:])
]

# Shuffle with a dedicated seeded generator so the global `random` state is
# left untouched.
random.Random(SHUFFLE_SEED).shuffle(dataset)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# create an idm that attempts to predict: throttle and steering
# given state at current timestep, and state at next timestep:
# each state contains: velocity_x, velocity_y, heading
class IDM(nn.Module):
    """
    Small network mapping a pair of states to a 2-element output.

    Input shape:  (batch_size, 3, 2) — 3 channels, each of length 2.
    Output shape: (batch_size, 2).
    """

    def __init__(self):
        super().__init__()
        # A kernel of width 2 over a length-2 input collapses the last axis.
        self.conv1 = nn.Conv1d(3, 64, 2)  # (B, 3, 2) -> (B, 64, 1)
        self.fc1 = nn.Linear(64, 32)      # (B, 64)   -> (B, 32)
        self.fc2 = nn.Linear(32, 2)       # (B, 32)   -> (B, 2)

    def forward(self, x: torch.Tensor):
        """Run the network on a (B, 3, 2) tensor and return a (B, 2) tensor."""
        features = torch.relu(self.conv1(x))     # (B, 64, 1)
        features = features.flatten(1)           # (B, 64)
        hidden = torch.relu(self.fc1(features))  # (B, 32)
        # No activation on the output layer.
        return self.fc2(hidden)                  # (B, 2)


# How to train the Inverse Dynamics Model

An IDM (Inverse Dynamics Model) is a model that predicts the control input (steering angle and acceleration) given the current state of the vehicle and the next state of the vehicle. In RL parlance, we have $s_t$ and $s_{t+1}$ and we want to predict $a_t$.

We're training the IDM on the Waymo Motion Dataset. The dataset contains several thousand trajectories of vehicles driving in a variety of environments. The trajectories are sampled at 10Hz, and each sample contains the state of the vehicle (position, velocity, heading, etc.) and the environment (traffic lights, other vehicles, etc.). However, it does not contain the control input (steering angle and acceleration) of the vehicle. This is what we want to predict.

To do this, we'll leverage the MetaDrive simulator. We reformulate the problem of predicting the action as an RL game in which the model tries to take the action that makes the next state in the simulator as close as possible to the ground-truth next state. The reward function is the negative of the distance between the simulated next state and the ground-truth next state. We train the model using PPO. (Note: the training cells below currently implement a plain policy-gradient loss rather than PPO.)

In [8]:
import metadrive
import gymnasium as gym

def normalize_angle(angle: float) -> float:
    """
    Wrap an angle (in radians) into the half-open interval [-pi, pi).
    """
    full_turn = 2 * np.pi
    # Shift by pi so the modulo lands in [0, 2*pi), then shift back.
    shifted = angle + np.pi
    return shifted % full_turn - np.pi

def run_game(
    env: gym.Env,
    st: State,
    st1: State,
    policy: typing.Callable[[tuple[State, State]], tuple[float, float]],
) -> tuple[tuple[float, float], float]:
    """
    Play a one-step game: start the simulator at ``st``, apply the policy's
    action, and score how close the simulator ends up to ``st1``.

    Args:
        env: Simulator environment exposing ``reset``, ``step`` and a
            ``vehicle`` with ``set_velocity`` / ``set_heading_theta``.
        st: State the vehicle is placed in before the action is applied.
        st1: Ground-truth next state the action should reproduce.
        policy: Callable taking the ``(st, st1)`` pair as a single observation
            and returning the action to apply.

    Returns:
        ``(action, reward)`` where ``reward`` is the negative sum of the
        velocity error (Euclidean norm) and the squared heading error.
    """

    # reset
    env.reset()

    # allow car to settle
    for _ in range(10):
        env.step([0, 0])

    # set the initial state
    env.vehicle.set_velocity(st.velocity)
    env.vehicle.set_heading_theta(st.heading)

    # run the policy; the pair is passed as ONE observation tuple, which is
    # the calling convention of NNPolicy.__call__
    action = policy((st, st1))
    env.step(action)

    # compute the error
    velocity_error = np.linalg.norm(env.vehicle.velocity[:2] - st1.velocity)

    # in order to prevent errors from wrapping around, we compare the
    # change in heading relative to st rather than the raw headings
    sim_heading_diff = normalize_angle(env.vehicle.heading_theta - st.heading)
    real_heading_diff = normalize_angle(st1.heading - st.heading)

    heading_error = (sim_heading_diff - real_heading_diff) ** 2

    reward = -velocity_error - heading_error

    return action, reward

def obs_batch_to_tensor(obs: list[tuple[State, State]], device: torch.device) -> torch.Tensor:
    """
    Pack (state, next_state) pairs into a float32 tensor of shape (batch_size, 3, 2).

    Axis 1 holds the features [velocity_x, velocity_y, heading]; axis 2 holds
    [current state, next state]. The tensor is created on ``device``.
    """

    def pair_to_array(current, following):
        # One row per feature, one column per timestep.
        return np.array([
            [current.velocity[0], following.velocity[0]],
            [current.velocity[1], following.velocity[1]],
            [current.heading, following.heading],
        ])

    stacked = np.stack([pair_to_array(st, st1) for st, st1 in obs])
    return torch.tensor(stacked, dtype=torch.float32, device=device)


def deviceof(m: nn.Module) -> torch.device:
    """
    Return the device on which the module's first parameter lives.
    """
    first_parameter = next(m.parameters())
    return first_parameter.device

Successfully registered the following environments: ['MetaDrive-validation-v0', 'MetaDrive-10env-v0', 'MetaDrive-100envs-v0', 'MetaDrive-1000envs-v0', 'SafeMetaDrive-validation-v0', 'SafeMetaDrive-10env-v0', 'SafeMetaDrive-100envs-v0', 'SafeMetaDrive-1000envs-v0', 'MARLTollgate-v0', 'MARLBottleneck-v0', 'MARLRoundabout-v0', 'MARLIntersection-v0', 'MARLParkingLot-v0', 'MARLMetaDrive-v0'].


In [None]:
class NNPolicy:
    """
    Stochastic policy that samples an action around a network's output.

    The wrapped network is called on a single observation and its output is
    used as the mean of a 2-D Gaussian with a fixed diagonal covariance of
    0.01; the sampled pair is returned as (throttle, steering).
    """

    def __init__(self, net: nn.Module):
        # Network mapping a (1, 3, 2) observation tensor to a (1, 2) mean.
        self.net = net

    def __call__(self, obs:tuple[State, State]) -> tuple[float, float]:
        """
        Sample an action for one (state, next_state) observation.

        Returns:
            ``(throttle, steering)`` as Python floats.
        """
        # sample an action from the policy network
        obs_tensor = obs_batch_to_tensor([obs], deviceof(self.net))
        with torch.no_grad():
            mu = self.net(obs_tensor)[0]
            # Build the covariance diagonal on mu's device and dtype; a plain
            # CPU tensor here fails when the network lives on CUDA.
            sigma = torch.full_like(mu, 0.01)
            distribution = torch.distributions.MultivariateNormal(mu, torch.diag_embed(sigma))
            throttle, steering = distribution.sample()
        return throttle.item(), steering.item()


def compute_policy_gradient_loss(
    pi_theta_given_st: torch.distributions.MultivariateNormal,
    a_t: torch.Tensor,
    R_t: torch.Tensor,
) -> torch.Tensor:
    """
    Policy-gradient loss for a batch of examples, reduced with the mean.

    Args:
        pi_theta_given_st: The current policy's action distribution for each
            state; inner shape (Batch, 2).
        a_t: The actions that were actually chosen, shape (Batch, 2).
        R_t: Rewards-to-go for the chosen actions, shape (Batch,).

    Returns:
        Scalar tensor: the mean over the batch of ``-log_prob(a_t) * R_t``.
    """
    # (Batch,) log-likelihood of each chosen action under the current policy
    log_prob_of_action = pi_theta_given_st.log_prob(a_t)

    return torch.mean(-log_prob_of_action * R_t)


def train_policygradient(
    policy_network: nn.Module,
    policy_optimizer: torch.optim.Optimizer,
    observation_batch: list[tuple[State, State]],
    action_batch: list[tuple[float, float]],
    rtg_batch: list[float],
    action_variance: float = 0.01,
) -> float:
    """
    Run one policy-gradient update on a batch and return the loss value.

    Args:
        policy_network: Network mapping a (Batch, 3, 2) observation tensor to
            a (Batch, 2) tensor of action means.
        policy_optimizer: Optimizer over ``policy_network``'s parameters.
        observation_batch: (state, next_state) pairs, one per example.
        action_batch: The action taken for each observation.
        rtg_batch: Reward-to-go for each action.
        action_variance: Diagonal covariance entry of the Gaussian action
            distribution. The default matches the 0.01 that NNPolicy uses
            when sampling, which is required for the log-probabilities to
            refer to the distribution the actions were drawn from.

    Returns:
        The scalar policy loss as a Python float.
    """
    # assert that the batch_lengths are the same
    assert len(observation_batch) == len(action_batch)
    assert len(observation_batch) == len(rtg_batch)

    # get device
    device = deviceof(policy_network)

    # convert data to tensors on correct device

    # in (Batch, 3, 2)
    observation_batch_tensor = obs_batch_to_tensor(observation_batch, device)

    # in (Batch,)
    rtg_batch_tensor = torch.tensor(
        rtg_batch, dtype=torch.float32, device=device
    )

    # in (Batch, 2)
    chosen_action_tensor = torch.tensor(
        action_batch, dtype=torch.float32, device=device
    )

    # train policy
    policy_optimizer.zero_grad()

    # The network outputs action means, not a distribution: wrap them in the
    # same fixed-covariance Gaussian used for sampling so log_prob is defined.
    mu = policy_network(observation_batch_tensor)  # (Batch, 2)
    covariance = torch.diag_embed(torch.full_like(mu, action_variance))  # (Batch, 2, 2)
    action_distribution = torch.distributions.MultivariateNormal(mu, covariance)

    policy_loss = compute_policy_gradient_loss(
        action_distribution, chosen_action_tensor, rtg_batch_tensor
    )
    policy_loss.backward()
    policy_optimizer.step()

    # return the loss
    return policy_loss.item()

In [None]:
def set_lr(optimizer: torch.optim.Optimizer, lr: float) -> None:
    """Set the learning rate of every parameter group of ``optimizer`` to ``lr``."""
    for group in optimizer.param_groups:
        group.update(lr=lr)

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The policy is the IDM network: `PolicyNetwork` is not defined anywhere in
# this notebook, and IDM has the (B, 3, 2) -> (B, 2) shape the policy code uses.
policy_network = IDM().to(device)

policy_optimizer = torch.optim.Adam(policy_network.parameters())

# Single pass over the shuffled (state, next_state) pairs.
dataset_iterator = iter(dataset)

# Training progress, kept at module level so the training cell can be resumed.
step = 0
returns = []
losses = []

In [None]:
# Create the simulator environment passed to run_game during training.
# "MetaDrive-validation-v0" is one of the ids listed in the registration
# message printed after `import metadrive`.
env = gym.make("MetaDrive-validation-v0", config={})

In [None]:
# train idm model using metadrive as the ground truth
set_lr(policy_optimizer, 1e-4)
TRAIN_EPOCHS = 100
EPISODES_PER_BATCH = 32

# Train
while step < TRAIN_EPOCHS:
    # Fresh batch for every optimisation step.
    obs_batch: list[tuple[State, State]] = []
    act_batch: list[tuple[float, float]] = []
    rtg_batch: list[float] = []

    trajectory_returns = []

    for _ in range(EPISODES_PER_BATCH):
        st0, st1 = next(dataset_iterator)
        # Play a one-step game for this (state, next_state) pair
        action, reward = run_game(env, st0, st1, NNPolicy(policy_network))

        # Update batch: each game is a single step, so the reward-to-go of
        # the action is simply the reward of that step
        obs_batch.append((st0, st1))
        act_batch.append(action)
        rtg_batch.append(reward)

        # Update trajectory returns
        trajectory_returns.append(reward)

    policy_loss = train_policygradient(
        policy_network,
        policy_optimizer,
        obs_batch,
        act_batch,
        rtg_batch,
    )

    # collect statistics
    losses.append(policy_loss)
    returns.append(trajectory_returns)

    print(f"Step {step}, Avg. Returns: {np.mean(trajectory_returns):.3f} +/- {np.std(trajectory_returns):.3f}, Median: {np.median(trajectory_returns):.3f}, Policy Loss: {losses[-1]:.3f}")

    step += 1

In [None]:
# Close the environment created for training.
env.close()