# Trust Region Policy Optimization Practice

# -1. Setting

If you run in jupyter, turn 

```
colab = False
```

In [1]:
colab = False
if colab:
    !pip install gym pyvirtualdisplay > /dev/null 2>&1
    !apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
    !apt-get update > /dev/null 2>&1
    !apt-get install cmake > /dev/null 2>&1
    !pip install --upgrade setuptools 2>&1
    !pip install ez_setup > /dev/null 2>&1
    !pip3 install box2d-py
    !pip3 install gym[Box_2D]

In [2]:
if colab:
    from google.colab import drive
    drive.mount('/content/drive')

    %cd /content/drive/MyDrive/rl-master/rl-master/day4/trpo
    !ls

In [3]:
import numpy as np
import time
import csv
import torch
import os
import copy
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Independent
from torch.distributions.normal import Normal
from torch.optim import Adam
from memory import OnPolicyMemory
from utils import *
import glfw
import mujoco_py

In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('current device : ', device)

current device :  cuda


# 0. Network Architectures

In [5]:
class Actor(nn.Module):
    def __init__(self, obs_dim, act_dim, hidden1, hidden2):
        # actor f_\phi(s)
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(obs_dim, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)

        self.fc3 = nn.Linear(hidden2, act_dim)  # for \mu
        self.fc4 = nn.Linear(hidden2, act_dim)  # for \sigma

    def forward(self, obs):
        x = torch.tanh(self.fc1(obs))
        x = torch.tanh(self.fc2(x))

        mu = self.fc3(x)
        log_sigma = self.fc4(x)

        sigma = torch.exp(log_sigma)

        return mu, sigma

    def log_prob(self, obs, act):
        mu, sigma = self.forward(obs)
        act_distribution = Independent(Normal(mu, sigma), 1)
        log_prob = act_distribution.log_prob(act)
        return log_prob

class Critic(nn.Module):
    # critic V(s ; \theta)
    def __init__(self, obs_dim, hidden1, hidden2):
        super(Critic, self).__init__()
        self.fc1 = nn.Linear(obs_dim, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, 1)

    def forward(self, obs):
        x = torch.tanh(self.fc1(obs))
        x = torch.tanh(self.fc2(x))

        return self.fc3(x)

# 1. Agent Definition

In [6]:
class TRPOAgent:
    def __init__(
                 self,
                 obs_dim,
                 act_dim,
                 hidden1=64,
                 hidden2=32,
                 ):

        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.hidden1 = hidden1
        self.hidden2 = hidden2

        self.pi = Actor(obs_dim, act_dim, hidden1, hidden2).to(device)
        self.V = Critic(obs_dim, hidden1, hidden2).to(device)

    def act(self, obs, deterministic=False):
        obs = torch.tensor(obs, dtype=torch.float).to(device)
        with torch.no_grad():
            mu, sigma = self.pi(obs)
            if deterministic:
                action = mu
                log_prob = None
                val = None
            else:
                act_distribution = Independent(Normal(mu, sigma), 1)
                action = act_distribution.sample()
                log_prob = act_distribution.log_prob(action)
                val = self.V(obs)
                log_prob = log_prob.cpu().numpy()
                val = val.cpu().numpy()

        action = action.cpu().numpy()
        

        return action, log_prob, val

# 2. Policy & Value Function Approximation Update

Objective:
\begin{align*}
g = \nabla_\phi J(\phi) &\approx \nabla_\phi \mathbb{E}_{s \sim \rho_{\phi_{\text{old}}}, a \sim \pi_{\phi_{\text{old}}}}\left( \frac{\pi_{\phi}(s, a)}{\pi_{\phi_{\text{old}}}(s, a)} A^{\pi_{\phi_{\text{old}}}}(s, a) \right) \\
&\approx \nabla_\phi \frac{1}{N} \sum_{i = 1}^N \left( \frac{\pi_{\phi}(s_i, a_i)}{\pi_{\phi_{\text{old}}}(s_i, a_i)} \hat A(s_i, a_i) \right).
\end{align*} \\
Since we take into account approximated trust region constraint, the final update direction is
\begin{equation*}
s = H^{-1}g, \quad H s = g,
\end{equation*}
 and the stepsize is
 \begin{equation*}
\alpha = \sqrt{\frac{2\delta}{g^\top H^{-1} g}}.
 \end{equation*}
 Thus, the update is done as follows:
 \begin{gather*}
 \phi_{\text{old}} \longleftarrow \phi, \\
\phi \longleftarrow \phi + \alpha \cdot s.
 \end{gather*}

In [7]:
def update(agent, memory, critic_optim, delta, num_updates):
    
    batch = memory.load()

    states = torch.Tensor(batch['state']).to(device)
    actions = torch.Tensor(batch['action']).to(device)
    target_v = torch.Tensor(batch['val']).to(device)
    A = torch.Tensor(batch['A']).to(device)
    old_log_probs = torch.Tensor(batch['log_prob']).to(device)
    
    for _ in range(num_updates):
        ################
        # train critic #
        ################
        out = agent.V(states)
        critic_loss = torch.mean((out - target_v)**2)

        critic_optim.zero_grad()
        critic_loss.backward()
        critic_optim.step()

        ###################
        # policy gradient #
        ###################
        log_probs = agent.pi.log_prob(states, actions)

        # TODO : calculate below to get probabiltiy ratio
        # Hint : use log_probs and old_log_probs
        # \pi(a_t | s_t ; \phi) / \pi(a_t | s_t ; \phi_old)
        #prob_ratio = torch.exp()
        prob_ratio = torch.exp(log_probs - old_log_probs)

        actor_loss = torch.mean(prob_ratio * A)
        loss_grad = torch.autograd.grad(actor_loss, agent.pi.parameters())
        # flatten gradients of params
        g = torch.cat([grad.view(-1) for grad in loss_grad]).data

        s = cg(fisher_vector_product, g, agent.pi, states)

        sAs = torch.sum(fisher_vector_product(s, agent.pi, states) * s, dim=0, keepdim=True)
        step_size = torch.sqrt(2 * delta / sAs)[0]    # stepsize : move as far as possible within trust region
        step = step_size * s

        old_actor = Actor(agent.obs_dim, agent.act_dim, agent.hidden1, agent.hidden2).to(device)
        old_actor.load_state_dict(agent.pi.state_dict())

        params = flat_params(agent.pi)

        backtracking_line_search(old_actor, agent.pi, actor_loss, g,
                                 old_log_probs, params, step, delta, A, states, actions)    # line search => for improvement guarantee!
    return

In [8]:
def evaluate(agent, env, num_episodes=5):

    scores = np.zeros(num_episodes)
    for i in range(num_episodes):
        obs = env.reset()
        done = False
        score = 0.
        while not done:
            # if i == 0:
            #     env.render()
            action = agent.act(obs, deterministic=True)[0]
            obs, rew, done, _ = env.step(action)
            score += rew
        # if i == 0:
        #     env.close()
        #     glfw.terminate()
            
        scores[i] = score
    avg_score = np.mean(scores)
    std_score = np.std(scores)
    
    return avg_score, std_score

# 3. Training!

In [9]:
def train(env, agent, max_iter, gamma=0.99, lr=3e-4, lam=0.95, delta=1e-3, steps_per_epoch=10000, eval_interval=10000, snapshot_interval=10000):
    
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    max_ep_len = env._max_episode_steps
    memory = OnPolicyMemory(obs_dim, act_dim, gamma, lam, lim=steps_per_epoch)
    test_env = copy.deepcopy(env)
    critic_optim = Adam(agent.V.parameters(), lr=lr)

    save_path = './snapshots/'
    os.makedirs(save_path, exist_ok=True)
    os.makedirs('./learning_curves/', exist_ok=True)
    log_file = open('./learning_curves/res.csv',
                    'w',
                    encoding='utf-8',
                    newline=''
                   )
    logger = csv.writer(log_file)
    num_epochs = max_iter // steps_per_epoch
    total_t = 0
    begin = time.time()
    for epoch in range(num_epochs):
        # start agent-env interaction
        state = env.reset()
        step_count = 0
        ep_reward = 0

        for t in range(steps_per_epoch):
            # collect transition samples by executing the policy
            action, log_prob, v = agent.act(state)

            next_state, reward, done, _ = env.step(action)
            memory.append(state, action, reward, v, log_prob)

            ep_reward += reward
            step_count += 1

            if (step_count == max_ep_len) or (t == steps_per_epoch - 1):
                # termination of env by env wrapper, or by truncation due to memory size
                s_last = torch.tensor(next_state, dtype=torch.float).to(device)
                v_last = agent.V(s_last).item()
                memory.compute_values(v_last)
            elif done:
                # episode done as the agent reach a terminal state
                v_last = 0.0
                memory.compute_values(v_last)

            state = next_state

            if done:
                state = env.reset()
                step_count = 0
                ep_reward = 0

            if total_t % eval_interval == 0:
                avg_score, std_score = evaluate(agent, test_env, num_episodes=5)
                elapsed_t = time.time() - begin
                print('[elapsed time : {:.1f}s| iter {}] score = {:.2f}'.format(elapsed_t, total_t, avg_score), u'\u00B1', '{:.4f}'.format(std_score))
                evaluation_log = [t, avg_score, std_score]
                logger.writerow(evaluation_log)


            if total_t % snapshot_interval == 0:
                snapshot_path = save_path + 'iter{}_'.format(total_t)
                # save weight & training progress
                save_snapshot(agent, snapshot_path)

            total_t += 1

        # train agent at the end of each epoch
        update(agent, memory, critic_optim, delta, num_updates=1)
    log_file.close()
    return

In [10]:
env = gym.make('HalfCheetah-v2')
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]
print('observation space dim. : {} / action space dim. : {}'.format(obs_dim, act_dim))

observation space dim. : 17 / action space dim. : 6


In [11]:
agent = TRPOAgent(obs_dim, act_dim, hidden1=128, hidden2=128)

In [12]:
next(agent.pi.parameters()).is_cuda

True

In [13]:
train(env, agent, max_iter=400000, gamma=0.99, lr=3e-4, lam=0.95, steps_per_epoch=10000, eval_interval=10000)

[elapsed time : 1.7s| iter 0] score = -48.80 ± 0.8013
[elapsed time : 10.1s| iter 10000] score = -48.56 ± 1.1181
[elapsed time : 18.3s| iter 20000] score = -42.14 ± 0.7771
[elapsed time : 26.5s| iter 30000] score = -40.30 ± 0.6180
[elapsed time : 35.1s| iter 40000] score = -44.53 ± 0.5490
[elapsed time : 43.5s| iter 50000] score = -38.59 ± 1.0267
[elapsed time : 52.2s| iter 60000] score = -45.89 ± 0.9653
[elapsed time : 60.7s| iter 70000] score = -41.22 ± 0.9451
[elapsed time : 69.0s| iter 80000] score = -44.23 ± 0.6897
[elapsed time : 77.2s| iter 90000] score = -35.01 ± 0.4106
[elapsed time : 85.8s| iter 100000] score = -38.00 ± 0.2626
[elapsed time : 94.2s| iter 110000] score = -30.36 ± 0.5363
[elapsed time : 102.7s| iter 120000] score = -35.58 ± 0.3909
[elapsed time : 111.4s| iter 130000] score = -32.90 ± 1.5522
[elapsed time : 119.9s| iter 140000] score = -28.92 ± 0.5863
[elapsed time : 128.5s| iter 150000] score = -17.67 ± 0.9945
[elapsed time : 137.2s| iter 160000] score = -12.77

# 4. Watch how your agent solve the task!

In [14]:
if colab:
    import gym
    from gym.wrappers import Monitor
    import glob
    import io
    import base64
    from IPython.display import HTML
    from pyvirtualdisplay import Display
    from IPython import display as ipythondisplay

    display = Display(visible=0, size=(1400, 900))
    display.start()

    def show_video():
      mp4list = glob.glob('video/*.mp4')
      if len(mp4list) > 0:
        mp4 = mp4list[0]
        video = io.open(mp4, 'r+b').read()
        encoded = base64.b64encode(video)
        ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                    loop controls style="height: 400px;">
                    <source src="data:video/mp4;base64,{0}" type="video/mp4" />
                </video>'''.format(encoded.decode('ascii'))))
      else: 
        print("Could not find video")
        

    def wrap_env(env):
      env = Monitor(env, './video', force=True)
      return env

    env = wrap_env(env)

In [15]:
env = gym.make('LunarLanderContinuous-v2')
if colab:
  env = wrap_env(env)

load_model(agent, './snapshots/trained.pth.tar', device)

obs = env.reset()

done = False
score = 0.
while not done:
    env.render()
    obs, rew, done, _ = env.step(agent.act(obs, deterministic=True)[0])
    score += rew
env.close()
print('score : ', score)

if colab:
  show_video()

loading pre-trained weight...


FileNotFoundError: [Errno 2] No such file or directory: './snapshots/trained.pth.tar'

# Proximal Policy Optimization

In contrast to TRPO, PPO uses the following simple $1^{\text{st}}$-order objective!
\begin{equation*}
L(\phi) \approx \frac{1}{N} \sum_{i = 1}^N \min\left( r_i(\phi)\hat A_i, \text{clip}(r_i(\phi), 1 - \varepsilon, 1 + \varepsilon) \hat A_i  \right).
\end{equation*}
While we performed complex parameter updates in TRPO, we just build the above loss and use popular optimizers provided by PyTorch...

In [None]:
from ppo import *

In [None]:
env = gym.make('LunarLanderContinuous-v2')
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]
print('observation space dim. : {} / action space dim. : {}'.format(obs_dim, act_dim))

In [None]:
ppo_agent = PPOAgent(obs_dim, act_dim, hidden1=128, hidden2=128, device=device)

In [None]:
ppo_train(env, ppo_agent, max_iter=500000, gamma=0.99, lr=3e-4, lam=0.95, epsilon=0.2, steps_per_epoch=10000, eval_interval=10000)