If you run this notebook in a local Jupyter environment (instead of Google Colab), set

```
colab = False
```




In [None]:
colab = True
if colab:
    !pip install gym pyvirtualdisplay > /dev/null 2>&1
    !apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
    !apt-get update > /dev/null 2>&1
    !apt-get install cmake > /dev/null 2>&1
    !pip install --upgrade setuptools 2>&1
    !pip install ez_setup > /dev/null 2>&1
    !pip3 install box2d-py
    !pip3 install gym[Box_2D]



In [None]:
if colab:
    # Mount Google Drive so that files stored there become visible under /content/drive.
    from google.colab import drive
    drive.mount('/content/drive')

    # Move into the project folder; the later `from buffer import ...` and
    # `from utils import ...` statements are resolved relative to this directory.
    # NOTE(review): the path is specific to one Drive layout - adjust it to your own.
    %cd /content/drive/MyDrive/Colab\ Notebooks/rl-master/day3/ddpg
    # List the folder contents as a quick sanity check.
    !ls

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/rl-master/day3/ddpg
buffer.py  ddpg.ipynb  __pycache__  snapshots  utils.py  video


In [None]:
import torch
import torch.nn as nn
from torch.nn import MSELoss
import torch.nn.functional as F
import copy
import os
import numpy as np
from tqdm import tqdm
import torch
from torch.optim import Adam
from buffer import ReplayBuffer
from utils import save_snapshot, recover_snapshot, load_model
import gym

In [None]:
# Pick the compute device once; every network and tensor below is moved onto it.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('current device =', device)

current device = cuda


# 0. Define Q-network & policy-network

In [None]:
# critic (action-value) network
# MLP with two hidden layers that scores an (observation, action) pair
class Critic(nn.Module):
    """Q-network: maps a batch of concatenated (obs, act) rows to one value per row."""

    def __init__(self, obs_dim, act_dim, hidden1, hidden2):
        super().__init__()
        joint_dim = obs_dim + act_dim
        self.fc1 = nn.Linear(joint_dim, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, 1)

    def forward(self, obs, act):
        # observation and action are joined along the feature axis (dim=1)
        joint = torch.cat((obs, act), dim=1)
        hidden = torch.relu(self.fc1(joint))
        hidden = torch.relu(self.fc2(hidden))
        # no activation on the output layer
        q_value = self.fc3(hidden)
        return q_value
    
    
# actor (policy) network
# MLP with two hidden layers that outputs a bounded action
class Actor(nn.Module):
    """Deterministic policy: maps an observation batch to actions scaled by ctrl_range."""

    def __init__(self, obs_dim, act_dim, ctrl_range, hidden1, hidden2):
        super().__init__()
        self.fc1 = nn.Linear(obs_dim, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, act_dim)
        self.ctrl_range = ctrl_range

    def forward(self, obs):
        hidden = torch.relu(self.fc1(obs))
        hidden = torch.relu(self.fc2(hidden))
        # tanh squashes to [-1, 1]; multiplying by ctrl_range rescales the output
        squashed = torch.tanh(self.fc3(hidden))
        return self.ctrl_range * squashed

# 1. Define DDPG agent

In [None]:
class DDPGAgent:
    """Bundles the DDPG actor (policy) network and critic (Q) network."""

    def __init__(self, obs_dim, act_dim, ctrl_range, hidden1, hidden2):
        # networks, both placed on the module-level `device`
        self.actor = Actor(obs_dim, act_dim, ctrl_range, hidden1, hidden2).to(device)
        self.critic = Critic(obs_dim, act_dim, hidden1, hidden2).to(device)

    def act(self, obs):
        """Return the policy's (noise-free) action for a single observation.

        Args:
            obs: numpy array holding one observation (no batch dimension).

        Returns:
            numpy array with the action produced by ``self.actor``, with the
            temporary batch dimension removed again.
        """
        # numpy ndarray to torch tensor
        # we first add an extra (batch) dimension
        obs = obs[np.newaxis, ...]
        with torch.no_grad():
            obs_tensor = torch.Tensor(obs).to(device)
            # query the policy network; previously left as a TODO, which made
            # every call fail with NameError on `act_tensor`
            act_tensor = self.actor(obs_tensor)

        # torch tensor to numpy ndarray
        # remove the extra dimension
        action = act_tensor.cpu().detach().numpy()
        action = np.squeeze(action, axis=0)

        return action
    

## 1.1. Test

In [None]:
# smoke test: feed one 4-dimensional observation through a small agent and show the action
agent = DDPGAgent(obs_dim=4, act_dim=2, ctrl_range=3, hidden1=32, hidden2=32)
action = agent.act(
    np.array([3., -1., 2., -5.])
)
print(action)

NameError: ignored

# 2. Implement one-step param update

In [None]:
def update(agent, replay_buf, gamma, actor_optim, critic_optim, target_actor, target_critic, tau, batch_size):
    """Perform one DDPG update: a critic step, an actor step, then a soft target update.

    Args:
        agent: object exposing the trainable ``actor`` and ``critic`` networks.
        replay_buf: buffer whose ``sample_batch(batch_size=...)`` returns a batch
            with ``obs``, ``act``, ``next_obs``, ``rew`` and ``done`` fields.
        gamma: discount factor used in the bootstrapped target.
        actor_optim: torch optimizer over ``agent.actor`` parameters.
        critic_optim: torch optimizer over ``agent.critic`` parameters.
        target_actor: target copy of the actor, used only to build the target.
        target_critic: target copy of the critic, used only to build the target.
        tau: interpolation weight of the soft target update.
        batch_size: number of transitions to sample.

    Returns:
        None. The networks, target networks and optimizers are modified in place.
    """
    batch = replay_buf.sample_batch(batch_size=batch_size)

    # target construction does not need backward ftns
    with torch.no_grad():
        # unroll batch
        obs = torch.Tensor(batch.obs).to(device)
        act = torch.Tensor(batch.act).to(device)
        next_obs = torch.Tensor(batch.next_obs).to(device)
        # Force rew/done into column vectors. If the buffer hands them back as
        # 1-D arrays, adding them to the (B, 1) critic output would silently
        # broadcast to (B, B). For data that is already (B, 1) this is a no-op.
        rew = torch.Tensor(batch.rew).to(device).reshape(-1, 1)
        done = torch.Tensor(batch.done).to(device).reshape(-1, 1)

        ################
        # train critic #
        ################
        # mask is 0 for terminal transitions, so no bootstrapping happens there
        mask = 1. - done
        target = rew + gamma * mask * target_critic(next_obs, target_actor(next_obs))

    out = agent.critic(obs, act)

    # mean squared error between the critic's estimate and the bootstrapped target
    critic_loss = torch.mean((out - target) ** 2)

    critic_optim.zero_grad()
    critic_loss.backward()
    critic_optim.step()

    ###############
    # train actor #
    ###############

    # freeze the critic during actor training: the actor loss is back-propagated
    # *through* the critic, but only the actor's parameters should get gradients
    for p in agent.critic.parameters():
        p.requires_grad_(False)

    # maximise Q(s, pi(s)) by minimising its negative
    actor_loss = -torch.mean(agent.critic(obs, agent.actor(obs)))

    actor_optim.zero_grad()
    actor_loss.backward()
    actor_optim.step()

    # unfreeze critic after actor training
    for p in agent.critic.parameters():
        p.requires_grad_(True)

    # soft target update (both actor & critic network):
    #   targ <- (1 - tau) * targ + tau * online
    # done under no_grad so the interpolation does not build an autograd graph
    with torch.no_grad():
        for p, targ_p in zip(agent.actor.parameters(), target_actor.parameters()):
            targ_p.copy_((1. - tau) * targ_p + tau * p)
        for p, targ_p in zip(agent.critic.parameters(), target_critic.parameters()):
            targ_p.copy_((1. - tau) * targ_p + tau * p)

    return

In [None]:
def evaluate(agent, env, num_episodes=5):
    """Return the mean undiscounted episode return of `agent` over `num_episodes` rollouts.

    Each rollout starts from `env.reset()` and keeps calling `agent.act` /
    `env.step` until the environment reports `done`.
    """
    total_return = 0.

    for _episode in range(num_episodes):
        state = env.reset()
        episode_return = 0.

        # at least one step is always taken, exactly as with a fresh `done = False`
        while True:
            state, reward, finished, _info = env.step(agent.act(state))
            episode_return += reward
            if finished:
                break

        total_return += episode_return

    return total_return / num_episodes

# 3. Combining these, we finally have...

In [None]:
def train(agent, env, gamma, 
          actor_lr, critic_lr, tau, noise_std,
          ep_len, num_updates, batch_size,
          init_buffer=5000, buffer_size=100000,
          start_train=2000, train_interval=50,
          eval_interval=2000, snapshot_interval=10000, path=None):
    """Main DDPG training loop: interact with `env`, fill the replay buffer, update the agent.

    Args:
        agent: agent exposing `actor`, `critic` and `act(obs)`.
        env: environment with `reset()`, `step(action)` returning a 4-tuple,
            `observation_space` and `action_space`.
        gamma: discount factor, forwarded to `update`.
        actor_lr / critic_lr: learning rates of the two Adam optimizers.
        tau: soft target update weight, forwarded to `update`.
        noise_std: standard deviation of the Gaussian noise added to policy actions.
        ep_len: step count at which an episode is cut off and the env is reset.
        num_updates: the loop runs for `num_updates + 1` environment steps.
        batch_size: batch size forwarded to `update`.
        init_buffer: number of initial steps taken with uniformly sampled actions.
        buffer_size: capacity passed to `ReplayBuffer`.
        start_train: updates only happen for iterations `t > start_train`.
        train_interval: updates run every `train_interval` iterations, and each
            time `train_interval` calls to `update` are made.
        eval_interval: NOTE(review): accepted but not used anywhere in this
            function (the evaluation call below is commented out).
        snapshot_interval: a snapshot is saved whenever `t % snapshot_interval == 0`
            (this includes `t == 0`).
        path: if not None, passed to `recover_snapshot` to restore networks and
            optimizers before training starts.
    """
    
    # target networks start as exact copies of the online networks
    target_actor = copy.deepcopy(agent.actor)
    target_critic = copy.deepcopy(agent.critic)
    
    # freeze target networks
    for p in target_actor.parameters():
        p.requires_grad_(False)
    for p in target_critic.parameters():
        p.requires_grad_(False)

    actor_optim = Adam(agent.actor.parameters(), lr=actor_lr)
    critic_optim = Adam(agent.critic.parameters(), lr=critic_lr)
    
    if path is not None:
        # load snapshot
        recover_snapshot(path, agent.actor, agent.critic,
                   target_actor, target_critic,
                   actor_optim, critic_optim,
                   device=device
                  )
    
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    # the first entry of `high` is used as a symmetric bound for every action dimension
    ctrl_range = env.action_space.high[0]
    
    replay_buf = ReplayBuffer(obs_dim, act_dim, buffer_size)
    
    save_path = './snapshots/'
    os.makedirs(save_path, exist_ok=True)
    
    # main loop
    obs = env.reset()
    done = False
    step_count = 0
    
    for t in tqdm(range(num_updates + 1)):
        if t < init_buffer:
            # perform random action until we collect sufficiently many samples
            # this is for exploration purpose
            action = env.action_space.sample()
        else:
            # executes noisy action
            # a_t = \pi(s_t) + N(0, \sigma^2)
            action = agent.act(obs) + noise_std * np.random.randn(act_dim)
            action = np.clip(action, -ctrl_range, ctrl_range)
            
        next_obs, rew, done, _ = env.step(action)
        step_count += 1
        if step_count == ep_len:
            # if the next_state is not terminal but done is set to True by gym env wrapper
            # (a time-limit cut-off is stored as non-terminal, so `update` still bootstraps)
            done = False
            
        # the (possibly overridden) done flag is what gets stored in the buffer
        replay_buf.append(obs, action, next_obs, rew, done)
        obs = next_obs
        
        if done == True or step_count == ep_len:
            # reset environment if current environment reaches a terminal state 
            # or step count reaches predefined length
            obs = env.reset()
            done = False
            step_count = 0
            # score = evaluate(agent, env)
            # print('[iteration {}] evaluation score : {}'.format(t, score))
        
        
        if t > start_train and t % train_interval == 0:
            # start training after fixed number of steps
            # this may mitigate overfitting of networks to the 
            # small number of samples collected during the initial stage of training
            # train_interval updates per trigger, i.e. on average one update per env step
            for _ in range(train_interval):
                update(agent,
                       replay_buf,
                       gamma,
                       actor_optim,
                       critic_optim,
                       target_actor,
                       target_critic,
                       tau,
                       batch_size
                      )
        if t % snapshot_interval == 0:
            # snapshot files share the prefix './snapshots/iter<t>_'
            snapshot_path = save_path + 'iter{}_'.format(t)
            # save weight & training progress
            save_snapshot(snapshot_path, agent.actor, agent.critic,
                          target_actor, target_critic,
                          actor_optim, critic_optim)

# 4. Let's test the code!

In [None]:
# build the environment and read the problem dimensions off its spaces
env = gym.make('LunarLanderContinuous-v2')
obs_dim, act_dim = env.observation_space.shape[0], env.action_space.shape[0]
# upper bound of the first action dimension, used as the actor's output scale
ctrl_range = env.action_space.high[0]

print('observation space dim : {} / action space dim : {}'.format(obs_dim, act_dim))
print('ctrl range : ', ctrl_range)

observation space dim : 8 / action space dim : 2
ctrl range :  1.0


In [None]:
# fresh agent sized for the environment above, with two hidden layers of 256 units each
agent = DDPGAgent(
    obs_dim=obs_dim,
    act_dim=act_dim,
    ctrl_range=ctrl_range,
    hidden1=256,
    hidden2=256,
)

In [None]:
# discounting and target tracking
gamma = 0.99          # discount factor
tau = 0.001           # soft target update weight

# optimizer step sizes
actor_lr, critic_lr = 1e-4, 1e-3

# exploration and rollout
noise_std = 0.1       # std of the Gaussian noise added to policy actions
ep_len = 1000         # episode step cap

# training budget
num_updates = 200_000  # number of main-loop iterations
batch_size = 128

In [None]:
# launch training; every argument is spelled out by keyword so the call reads as a config
train(
    agent=agent,
    env=env,
    gamma=gamma,
    actor_lr=actor_lr,
    critic_lr=critic_lr,
    tau=tau,
    noise_std=noise_std,
    ep_len=ep_len,
    num_updates=num_updates,
    batch_size=batch_size,
    init_buffer=5000,
    buffer_size=100000,
    start_train=2000,
    train_interval=50,
    eval_interval=5000,
)

# 5. Watch the trained agent!

In [None]:
if colab:
    # Colab-only dependencies are imported here so the notebook still runs locally.
    import gym
    from gym.wrappers import Monitor
    import glob
    import io
    import base64
    from IPython.display import HTML
    from pyvirtualdisplay import Display
    from IPython import display as ipythondisplay

    # start an invisible virtual display for rendering
    display = Display(visible=0, size=(1400, 900))
    display.start()

    def show_video():
        """Embed the first recorded mp4 found in ./video into the notebook output.

        Prints a message instead when no recording exists.
        """
        # sorted so the chosen file does not depend on filesystem listing order
        mp4list = sorted(glob.glob('video/*.mp4'))
        if len(mp4list) > 0:
            mp4 = mp4list[0]
            # read-only binary mode; the context manager closes the file handle
            with io.open(mp4, 'rb') as video_file:
                video = video_file.read()
            encoded = base64.b64encode(video)
            ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                    loop controls style="height: 400px;">
                    <source src="data:video/mp4;base64,{0}" type="video/mp4" />
                </video>'''.format(encoded.decode('ascii'))))
        else:
            print("Could not find video")

    def wrap_env(env):
        """Wrap `env` in a gym Monitor that writes into ./video (force=True)."""
        env = Monitor(env, './video', force=True)
        return env

    env = wrap_env(env)

In [None]:
# fresh environment for the demo rollout (recorded through the Monitor wrapper on Colab)
env = gym.make('LunarLanderContinuous-v2')
if colab:
    env = wrap_env(env)
# was `env_reset()`, an undefined name
obs = env.reset()
done = False
score = 0.

# roll out a single episode, capped at 1000 steps
for _ in range(1000):
    env.render(mode='human')
    obs, rew, done, _ = env.step(agent.act(obs))
    score += rew
    if done:
        # stop at the end of the episode; stepping a finished environment
        # keeps accumulating reward that does not belong to the episode
        break
print('score : ', score)
env.close()

if colab:
    show_video()

loading pre-trained weight...
score :  -51008.39558069934
