In [1]:
import argparse
import os
import random
import time
from distutils.util import strtobool
from typing import Callable

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.normal import Normal
from torch.utils.tensorboard import SummaryWriter
from radar_maps.env.radar_map_double_integrator import RadarMap_DoubleIntegrator

In [2]:
from actor_utils import ActorNet
from critic_utils import CriticNet

In [3]:
def unflatten_obs(x):
    # print(x.shape)
    obs = {}
    img = torch.clone(x[:, 0:14400])
    # print("img shape: ", img.shape)
    state = torch.clone(x[:, -4:])
    obs['img'] = img.view(x.shape[0], 1, 120, 120)
    obs['state'] = state.view(x.shape[0], 4)
    return obs

def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    torch.nn.init.orthogonal_(layer.weight, std)
    torch.nn.init.constant_(layer.bias, bias_const)
    return layer

def make_env(env_id, idx, capture_video, run_name, gamma):
    map_size = 1000
    def thunk():
        if capture_video and idx == 0:
            env = gym.make(env_id, render_mode="rgb_array")
            env = gym.wrappers.RecordVideo(env, f"videos/{run_name}")
        else:
            env = gym.make(env_id,
                            map_size = map_size, 
                            goal_location=[map_size, map_size], 
                            radar_detection_range=300,
                            grid_size=5,
                            dist_between_radars=map_size/5.0,
                            num_radars=10)
        env = gym.wrappers.FlattenObservation(env)  # deal with dm_control's Dict observation space
        env = gym.wrappers.RecordEpisodeStatistics(env)
        env = gym.wrappers.ClipAction(env)
        # env = gym.wrappers.NormalizeObservation(env)
        # env = gym.wrappers.TransformObservation(env, lambda obs: np.clip(obs, -10, 10))
        env = gym.wrappers.NormalizeReward(env, gamma=gamma)
        env = gym.wrappers.TransformReward(env, lambda reward: np.clip(reward, -10, 10))
        return env

    return thunk


Set parameters.

In [4]:
seed=1
torch_deterministic = True
env_id = "RadarMap-DoubleIntegrator-v0"
exp_name = "rl_finetune"
capture_video = False
run_name = run_name = f"{env_id}__{exp_name}__{seed}__{int(time.time())}"
gamma = 0.99
num_envs = 1
learning_rate = 3e-4
num_steps = 2048
total_timesteps = 100
batch_size = int(num_envs * num_steps)
anneal_lr = True
target_kl = None
max_grad_norm = 0.5
vf_coef = 0.5
ent_coef = 0.0
clip_coef = 0.2
clip_vloss = True
gae_lambda = 0.95
num_minibatches = 32
norm_adv = True
update_epochs = 10
minibatch_size = int(batch_size // num_minibatches)
save_model = True

Change the following path to point to the agent trained from imitation learning.

In [5]:
agant_path = "/home/lucas/Documents/GitHub/mpc_imitation_learning/agent"

In [6]:
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = torch_deterministic

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# env setup
envs = gym.vector.SyncVectorEnv(
    [make_env(env_id, i, capture_video, run_name, gamma) for i in range(num_envs)]
)
assert isinstance(envs.single_action_space, gym.spaces.Box), "only continuous action space is supported"

dummy_env = RadarMap_DoubleIntegrator(1000, [1000, 1000], 300, 5, 200, 10)
actor = ActorNet(dummy_env.action_space, dummy_env.observation_space, hidden_sizes=[64, 64])
actor.load_state_dict(torch.load(agant_path))
critic = CriticNet(actor.feature_extractor, [64, 64])
actor.to(device)
critic.to(device)

all_params = list(actor.parameters()) + list(critic.parameters())
all_params = dict.fromkeys(all_params).keys()

  logger.warn(


In [7]:
'''
Freeze weights of actor net to train critic.
'''
for name, param in actor.named_parameters():
        param.requires_grad = False
optimizer = optim.Adam(all_params, lr=learning_rate, eps=1e-5)

In [8]:
# ALGO Logic: Storage setup
obs = torch.zeros((num_steps, num_envs) + envs.single_observation_space.shape).to(device)
actions = torch.zeros((num_steps, num_envs) + envs.single_action_space.shape).to(device)
logprobs = torch.zeros((num_steps, num_envs)).to(device)
rewards = torch.zeros((num_steps, num_envs)).to(device)
dones = torch.zeros((num_steps, num_envs)).to(device)
values = torch.zeros((num_steps, num_envs)).to(device)

# TRY NOT TO MODIFY: start the game
global_step = 0
start_time = time.time()
next_obs, _ = envs.reset(seed=seed)
next_obs = torch.Tensor(next_obs).to(device)
next_done = torch.zeros(num_envs).to(device)
num_updates = total_timesteps // batch_size

In [9]:
for update in range(1, num_updates + 1):
    # Annealing the rate if instructed to do so.
    if anneal_lr:
        frac = 1.0 - (update - 1.0) / num_updates
        lrnow = frac * learning_rate
        optimizer.param_groups[0]["lr"] = lrnow

    for step in range(0, num_steps):
        global_step += 1 * num_envs
        obs[step] = next_obs
        dones[step] = next_done

        # ALGO LOGIC: action logic
        with torch.no_grad():
            obs_unflattened = unflatten_obs(next_obs)
            # print("Obs: ", next_obs.shape)
            action, act_dist = actor.forward(obs_unflattened)
            logprob = act_dist.log_prob(action)
            value = critic.forward(obs_unflattened)

            values[step] = value.flatten()
        actions[step] = action
        logprobs[step] = logprob

        # TRY NOT TO MODIFY: execute the game and log data.
        next_obs, reward, terminations, truncations, infos = envs.step(action.cpu().numpy())
        done = np.logical_or(terminations, truncations)
        rewards[step] = torch.tensor(reward).to(device).view(-1)
        next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(device)

        # Only print when at least 1 env is done
        if "final_info" not in infos:
            continue

        for info in infos["final_info"]:
            # Skip the envs that are not done
            if info is None:
                continue
            print(f"global_step={global_step}, episodic_return={info['episode']['r']}")


In [10]:
# bootstrap value if not done
with torch.no_grad():
    # next_value = agent.get_value(next_obs).reshape(1, -1)
    next_value = critic.forward(unflatten_obs(next_obs))
    advantages = torch.zeros_like(rewards).to(device)
    lastgaelam = 0
    for t in reversed(range(num_steps)):
        if t == num_steps - 1:
            nextnonterminal = 1.0 - next_done
            nextvalues = next_value
        else:
            nextnonterminal = 1.0 - dones[t + 1]
            nextvalues = values[t + 1]
        delta = rewards[t] + gamma * nextvalues * nextnonterminal - values[t]
        advantages[t] = lastgaelam = delta + gamma * gae_lambda * nextnonterminal * lastgaelam
    returns = advantages + values

# flatten the batch
b_obs = obs.reshape((-1,) + envs.single_observation_space.shape)
b_logprobs = logprobs.reshape(-1)
b_actions = actions.reshape((-1,) + envs.single_action_space.shape)
b_advantages = advantages.reshape(-1)
b_returns = returns.reshape(-1)
b_values = values.reshape(-1)

# Optimizing the policy and value network
b_inds = np.arange(batch_size)
clipfracs = []
for epoch in range(update_epochs):
    np.random.shuffle(b_inds)
    for start in range(0, batch_size, minibatch_size):
        end = start + minibatch_size
        mb_inds = b_inds[start:end]

        # _, newlogprob, entropy, newvalue = agent.get_action_and_value(b_obs[mb_inds], b_actions[mb_inds])
        obs_unflattened = unflatten_obs(b_obs[mb_inds])
        _, act_dist = actor.forward(obs_unflattened)
        newlogprob = act_dist.log_prob(b_actions[mb_inds])
        entropy = act_dist.entropy()
        newvalue = critic.forward(obs_unflattened)

        logratio = newlogprob - b_logprobs[mb_inds]
        ratio = logratio.exp()

        with torch.no_grad():
            # calculate approx_kl http://joschu.net/blog/kl-approx.html
            old_approx_kl = (-logratio).mean()
            approx_kl = ((ratio - 1) - logratio).mean()
            clipfracs += [((ratio - 1.0).abs() > clip_coef).float().mean().item()]

        mb_advantages = b_advantages[mb_inds]
        if norm_adv:
            mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-8)

        # Policy loss
        pg_loss1 = -mb_advantages * ratio
        pg_loss2 = -mb_advantages * torch.clamp(ratio, 1 - clip_coef, 1 + clip_coef)
        pg_loss = torch.max(pg_loss1, pg_loss2).mean()

        # Value loss
        newvalue = newvalue.view(-1)
        if clip_vloss:
            v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
            v_clipped = b_values[mb_inds] + torch.clamp(
                newvalue - b_values[mb_inds],
                -clip_coef,
                clip_coef,
            )
            v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
            v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
            v_loss = 0.5 * v_loss_max.mean()
        else:
            v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()

        entropy_loss = entropy.mean()
        loss = pg_loss - ent_coef * entropy_loss + v_loss * vf_coef

        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(all_params, max_grad_norm)
        optimizer.step()

    if target_kl is not None:
        if approx_kl > target_kl:
            break

y_pred, y_true = b_values.cpu().numpy(), b_returns.cpu().numpy()
var_y = np.var(y_true)
explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y


Change the following to where you wish to store the trained model.

In [15]:
model_path = f"/home/lucas/Documents/GitHub/mpc_imitation_learning/runs"

In [16]:
if save_model:
    torch.save(actor.state_dict(), model_path + '/actor')
    torch.save(critic.state_dict(), model_path + '/critic')
    print(f"model saved to {model_path}")

RuntimeError: Parent directory /home/lucas/Documents/GitHub/mpc_imitation_learning/runs/RadarMap-DoubleIntegrator-v0__rl_finetune__1__1711134596 does not exist.