# Quiz

In this quiz, we review <span style = 'color:blue'>DDPG </span>, which is a simple yet powerful off-policy actor-critic algorithm. As we already have some experience writing DDPG-related code, you may refer to the code we wrote on day 5 if you have difficulty filling in the blanks. Please feel free to e-mail me if you have any problems, either with running the code or with solving the quiz. 

In [None]:
import os
import torch
import torch.nn as nn
from torch.nn import MSELoss
import torch.nn.functional as F
import copy
import numpy as np
import torch
from torch.optim import Adam
from buffer import ReplayBuffer
import gym
import matplotlib.pyplot as plt

# 1. Define Q-network & policy-network

In this section, we define an actor and a critic as neural networks with two hidden layers. Note that you may choose a different non-linear activation instead of ReLU and test the cell.

In [None]:
class Critic(nn.Module):
    """Critic network Q(obs, act): a multi-layer perceptron with 2 hidden layers.

    The observation and the action are concatenated along dim 1, passed
    through two ReLU-activated linear layers, and mapped by a final linear
    layer to a single value per row.
    """

    def __init__(self, obs_dim, act_dim, hidden1, hidden2):
        super().__init__()
        # Input is the concatenation [obs, act], hence obs_dim + act_dim.
        self.fc1 = nn.Linear(obs_dim + act_dim, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        # Output head: one scalar per (obs, act) pair.
        self.fc3 = nn.Linear(hidden2, 1)

    def forward(self, obs, act):
        """Return the critic output for a batch of (obs, act) rows."""
        joint_input = torch.cat((obs, act), dim=1)
        hidden = torch.relu(self.fc1(joint_input))
        hidden = torch.relu(self.fc2(hidden))
        return self.fc3(hidden)
    
    
class Actor(nn.Module):
    """Actor network: a multi-layer perceptron with 2 hidden layers.

    Maps an observation to an action. The last layer is squashed with tanh
    and scaled by ``ctrl_range``, so every output component lies in
    [-ctrl_range, ctrl_range].
    """

    def __init__(self, obs_dim, act_dim, ctrl_range, hidden1, hidden2):
        super().__init__()
        self.fc1 = nn.Linear(obs_dim, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, act_dim)
        # Scale factor applied to the tanh output in forward().
        self.ctrl_range = ctrl_range

    def forward(self, obs):
        """Return the action for a batch of observations."""
        hidden = torch.relu(self.fc1(obs))
        hidden = torch.relu(self.fc2(hidden))
        squashed = torch.tanh(self.fc3(hidden))
        return self.ctrl_range * squashed

# 2. Define DDPG agent

What should our agent have? In principle, all we need is a single actor network (and a critic network in some cases), which performs an appropriate action for a given observation. For simplicity, we assume that the agent also has target networks as its components (although we don't need them after training is done). <span style='color:red'> You have some code to fill in here. </span>

In [None]:
class DDPGAgent:
    """DDPG agent holding an actor, a critic, and a target copy of each.

    Args:
        obs_dim: dimension of the observation vector.
        act_dim: dimension of the action vector.
        ctrl_range: scale applied to the actor's tanh output.
        hidden1: width of the first hidden layer of both networks.
        hidden2: width of the second hidden layer of both networks.
    """

    def __init__(self, obs_dim, act_dim, ctrl_range, hidden1, hidden2):

        # networks
        self.actor = Actor(obs_dim, act_dim, ctrl_range, hidden1, hidden2)
        self.critic = Critic(obs_dim, act_dim, hidden1, hidden2)

        # target networks start as exact (deep) copies of the networks above
        self.targ_actor = copy.deepcopy(self.actor)
        self.targ_critic = copy.deepcopy(self.critic)
        print('networks loaded')

    def act(self, obs):
        """Return the actor's (noise-free) action for one observation.

        Args:
            obs: a single, unbatched observation (array-like of length obs_dim).

        Returns:
            numpy ndarray holding the action, with the batch dimension removed.
        """
        # add a leading batch dimension: (obs_dim,) -> (1, obs_dim)
        obs = np.asarray(obs, dtype=np.float32)[np.newaxis]

        # action selection needs no gradient bookkeeping
        with torch.no_grad():
            obs_tensor = torch.as_tensor(obs, dtype=torch.float32)  # numpy ndarray to torch tensor

            # (Question 1) the action is the output of the actor network
            act_tensor = self.actor(obs_tensor)

        action = act_tensor.numpy()  # torch tensor to numpy ndarray
        action = np.squeeze(action, axis=0)  # remove extra dimension

        return action
    
    

## 2.1.Test

In [None]:
# Smoke test for DDPGAgent.act: build a small agent
# (obs_dim=4, act_dim=2, ctrl_range=3, hidden layers of 32 units each)
# and query it with a single unbatched observation.
agent = DDPGAgent(4, 2, 3, 32, 32)
obs = np.array([3., -1., 2., -5.])
print('observation type = {} / observation shape = {}'.format(type(obs), obs.shape))
action = agent.act(obs)
print('selected action : ', action)

# 3. Implement one-step param update

In this section, we will learn how to perform one-step parameter updates of the actor and the critic. <span style='color:red'> Again, we have some work left for you. </span>

## 3.1. Update

In [None]:
def update(agent, replay_buf, gamma, actor_optim, critic_optim, tau, batch_size):
    """Perform one DDPG update: a critic step, an actor step, then a soft target update.

    Args:
        agent: agent with networks to be trained; must expose ``actor``,
            ``critic``, ``targ_actor`` and ``targ_critic``.
        replay_buf: replay buffer from which a batch is sampled via
            ``sample_batch(batch_size=...)``; the batch must expose ``obs``,
            ``act``, ``next_obs``, ``rew`` and ``done``.
        gamma: discount factor used in the critic target.
        actor_optim: torch optimizer over the actor's parameters.
        critic_optim: torch optimizer over the critic's parameters.
        tau: parameter for soft target update (weight of the online network).
        batch_size: number of transitions to sample.

    Returns:
        None. The networks held by ``agent`` are modified in place.
    """
    batch = replay_buf.sample_batch(batch_size=batch_size)

    # target construction does not need backward ftns
    with torch.no_grad():
        # unroll batch
        obs = torch.as_tensor(batch.obs, dtype=torch.float32)
        act = torch.as_tensor(batch.act, dtype=torch.float32)
        next_obs = torch.as_tensor(batch.next_obs, dtype=torch.float32)
        # The buffer's storage layout is not visible here. Force rew/done into
        # column vectors so they line up with the critic's (batch, 1) output
        # whether they are stored as (batch,) or (batch, 1); a (batch,) tensor
        # would otherwise silently broadcast to (batch, batch).
        rew = torch.as_tensor(batch.rew, dtype=torch.float32).reshape(-1, 1)
        done = torch.as_tensor(batch.done, dtype=torch.float32).reshape(-1, 1)

        ################
        # train critic #
        ################
        # mask is 0 for terminal transitions, so they do not bootstrap
        mask = 1. - done

        # (Question 2) target = r + gamma * (1 - done) * Q_targ(s', pi_targ(s'))
        next_q = agent.targ_critic(next_obs, agent.targ_actor(next_obs))
        target = rew + gamma * mask * next_q

    out = agent.critic(obs, act)

    loss_ftn = MSELoss()
    critic_loss = loss_ftn(out, target)
    # alternative : loss = torch.mean((target - out)**2)

    critic_optim.zero_grad()
    critic_loss.backward()
    critic_optim.step()

    ###############
    # train actor #
    ###############

    # freeze critic during actor training: the actor loss is backpropagated
    # *through* the critic, and freezing avoids computing and accumulating
    # gradients for the critic's own parameters in that pass
    for p in agent.critic.parameters():
        p.requires_grad_(False)

    actor_loss = -torch.mean(agent.critic(obs, agent.actor(obs)))

    # (Question 3) gradient step on the actor parameters
    actor_optim.zero_grad()
    actor_loss.backward()
    actor_optim.step()

    # unfreeze critic after actor training
    for p in agent.critic.parameters():
        p.requires_grad_(True)

    # soft target update (both actor & critic network):
    # targ <- (1 - tau) * targ + tau * online; no autograd graph is needed
    with torch.no_grad():
        for p, targ_p in zip(agent.actor.parameters(), agent.targ_actor.parameters()):
            targ_p.copy_((1. - tau) * targ_p + tau * p)
        for p, targ_p in zip(agent.critic.parameters(), agent.targ_critic.parameters()):
            targ_p.copy_((1. - tau) * targ_p + tau * p)

    return

## 3.2. Evaluation

When training, you will want to check whether the agent is actually improving by periodically evaluating it on the environment. This is how we do it.

In [None]:
def evaluate(agent, env, num_episodes=5):
    """Run the agent for ``num_episodes`` episodes and return the average score.

    The rollouts happen on a deep copy of ``env``. Only the first episode is
    rendered, and the copied environment's ``close()`` is called once that
    episode ends.

    Args:
        agent: object exposing ``act(obs)``.
        env: environment exposing ``reset``, ``step``, ``render`` and ``close``.
        num_episodes: number of episodes to average over (must be positive).

    Returns:
        Sum of per-episode scores divided by ``num_episodes``.
    """
    assert num_episodes > 0

    eval_env = copy.deepcopy(env)
    total_return = 0.

    for episode in range(num_episodes):
        show = episode == 0  # render the first episode only
        obs = eval_env.reset()
        episode_return = 0.
        finished = False

        while not finished:
            if show:
                eval_env.render()
            obs, rew, finished, _ = eval_env.step(agent.act(obs))
            episode_return += rew

        total_return += episode_return
        if show:
            eval_env.close()

    return total_return / num_episodes

## 3.3. Test

Just check that the functions you have defined work correctly.

In [None]:
# Smoke test for evaluate(): run one episode with an untrained agent.
env_id = 'BipedalWalkerHardcore-v3'
env = gym.make(env_id)
# Network sizes are read off the environment's observation/action spaces.
obs_dim, act_dim = env.observation_space.shape[0], env.action_space.shape[0]
# Uses the upper bound of the first action component as the scale for every
# component (assumes the action bounds are symmetric and identical across
# components; verify for other environments).
ctrl_range = env.action_space.high[0]

print('env id : ', env_id)
print('observation space dim = {} / action space dim = {}'.format(obs_dim, act_dim))
print('action space range = {}'.format(ctrl_range))
# Freshly initialised (untrained) agent with two hidden layers of 32 units.
random_agent = DDPGAgent(obs_dim, act_dim, ctrl_range, 32, 32)

print('score : ', evaluate(random_agent, env, num_episodes=1))

## 3.4. Writer definition

In [None]:
class Writer:
    """Fixed-capacity log of (time step, score) rows, saved as ``scores.npy``.

    Attributes:
        scores: array of shape (num_log, 2); 1st col is the time step,
            2nd col is the score.
        head: index of the next row to be written.
        end: capacity of the log (``num_log``).
        path: file path the scores are saved to.
    """

    def __init__(self, path, num_log):
        self.end = num_log
        self.head = 0
        self.scores = np.zeros((num_log, 2))
        # scores are stored as 'scores.npy' inside the given directory
        self.path = os.path.join(path, 'scores.npy')

    def write(self, step, score):
        """Record one (step, score) row; the log must not already be full."""
        assert self.head < self.end

        row = self.head
        self.scores[row, :] = [step, score]
        self.head = row + 1

    def save(self):
        """Save the whole score table to ``self.path`` and reset the log."""
        np.save(self.path, self.scores)
        # start over with an empty table
        self.head = 0
        self.scores = np.zeros((self.end, 2))

# 4. Combining these, we finally have...

Now we have all the components we need for complete training! Let's see what training the agent looks like. <span style = 'color:red'> You have some blanks to fill in. This will be the last part you have to code yourself. </span> 

In [None]:
def train(agent, env, gamma, 
          actor_lr, critic_lr, tau, noise_std,
          ep_len, num_updates, batch_size,
          init_buffer=5000, buffer_size=100000,
          start_train=2000, train_interval=50,
          eval_interval=2000, writer=None):
    """Train a DDPG agent by interacting with ``env``.

    Args:
        agent: agent exposing ``actor``, ``critic`` and ``act(obs)``.
        env: environment with ``reset``/``step`` and Box-like
            ``observation_space``/``action_space``.
        gamma: forwarded to ``update`` (discount factor).
        actor_lr: learning rate of the actor's Adam optimizer.
        critic_lr: learning rate of the critic's Adam optimizer.
        tau: forwarded to ``update`` (soft target update parameter).
        noise_std: standard deviation of the Gaussian exploration noise.
        ep_len: number of steps after which the environment is reset even if
            it did not terminate.
        num_updates: the main loop runs for ``num_updates + 1`` environment steps.
        batch_size: forwarded to ``update``.
        init_buffer: number of initial steps taken with random actions.
        buffer_size: capacity handed to the replay buffer.
        start_train: no parameter update happens until ``t`` exceeds this.
        train_interval: updates run every ``train_interval`` steps, and
            ``train_interval`` updates are performed each time.
        eval_interval: the agent is evaluated every ``eval_interval`` steps.
        writer: optional logger exposing ``write(step, score)`` and ``save()``;
            when ``None`` nothing is logged or saved.
    """
    actor_optim = Adam(agent.actor.parameters(), lr=actor_lr)
    critic_optim = Adam(agent.critic.parameters(), lr=critic_lr)

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    ctrl_range = env.action_space.high[0]

    replay_buf = ReplayBuffer(obs_dim, act_dim, buffer_size)

    # main loop
    obs = env.reset()
    done = False
    step_count = 0

    for t in range(num_updates + 1):
        if t < init_buffer:
            # perform random action until we collect sufficiently many samples
            # this is for exploration purpose
            action = env.action_space.sample()
        else:
            # executes noisy action
            # (Question 4) a_t = \pi(s_t) + N(0, \sigma^2)
            noise = noise_std * np.random.randn(act_dim)
            action = agent.act(obs) + noise

            # keep the perturbed action inside the admissible control range
            action = np.clip(action, -ctrl_range, ctrl_range)

        next_obs, rew, done, _ = env.step(action)
        step_count += 1
        timed_out = step_count == ep_len
        if timed_out:
            # if the next_state is not terminal but done is set to True by gym env wrapper
            done = False

        replay_buf.append(obs, action, next_obs, rew, done)
        obs = next_obs

        if done or timed_out:
            # reset environment if current environment reaches a terminal state
            # or step count reaches predefined length
            obs = env.reset()
            done = False
            step_count = 0

        if t > start_train and t % train_interval == 0:
            # start training after fixed number of steps
            # this may mitigate overfitting of networks to the
            # small number of samples collected during the initial stage of training
            for _ in range(train_interval):
                update(agent, replay_buf, gamma, actor_optim, critic_optim, tau, batch_size)

        if t % eval_interval == 0:
            score = evaluate(agent, env)
            print('[iteration {}] evaluation score : {}'.format(t, score))
            if writer is not None:
                writer.write(t, score)

    # writer is optional (defaults to None), so only save when one was given
    if writer is not None:
        writer.save()

# 5. Let's test the code!

Congratulations! You've just finished your quiz. The rest of the code will show you how training is done in practice and how to analyze the result.

In [None]:
# Environment used for the actual training run.
env_id = 'LunarLanderContinuous-v2'
env = gym.make(env_id)
# Network sizes are read off the environment's observation/action spaces.
obs_dim, act_dim = env.observation_space.shape[0], env.action_space.shape[0]
# Upper bound of the first action component, used as the actor's output scale.
ctrl_range = env.action_space.high[0]

print('env id : ', env_id)
print('observation space dim = {} / action space dim = {}'.format(obs_dim, act_dim))
print('action space range = {}'.format(ctrl_range))

In [None]:
# Agent to be trained: actor and critic each use two hidden layers of 256 units.
agent = DDPGAgent(obs_dim=obs_dim, act_dim=act_dim, ctrl_range=ctrl_range, hidden1=256, hidden2=256)

## 5.1. Hyperparameters setup

We introduce some well-tuned hyperparameters for DDPG. You may use them without any modification, or just try other choices for fun.

In [None]:
# Hyperparameters handed to train() below.
gamma = 0.99  # discount factor (forwarded to update())
actor_lr = 1e-4  # learning rate of the actor's Adam optimizer
critic_lr = 1e-3  # learning rate of the critic's Adam optimizer
tau = 1e-3  # parameter for soft target update
noise_std = 0.1  # std of the Gaussian exploration noise added to actions
ep_len = 500  # environment is reset after this many steps in an episode
eval_interval = 5000  # evaluate the agent every eval_interval steps
num_updates = 200000  # train() loops over num_updates + 1 environment steps
batch_size = 128  # number of transitions sampled per update

In [None]:
# Scores are stored as 'scores.npy' in the current working directory.
pth = os.getcwd()
print('evaluation history will be saved at : ', pth)
# One log row per evaluation: train() evaluates at t = 0, eval_interval, ...,
# num_updates, i.e. num_updates // eval_interval + 1 times.
writer = Writer(pth, num_updates // eval_interval + 1)
print(writer)

## 5.2. Let's train!

Don't worry about the performance; even if your agent does not perform well on the task, it will have little effect on your quiz score.

In [None]:
# Launch training. Note that buffer_size here (1000000) overrides train()'s
# default of 100000; eval_interval must match the value used to size `writer`.
train(agent, env, gamma,
      actor_lr, critic_lr, tau, noise_std,
      ep_len, num_updates, batch_size,
      init_buffer=5000, buffer_size=1000000,
      start_train=2000, train_interval=50,
      eval_interval=5000, writer=writer)

# 6. Watch the trained agent

In [None]:
# Roll out one rendered episode with the trained agent (no exploration noise)
# and report the accumulated reward.
obs = env.reset()
done = False
score = 0.

while not done:
    env.render()
    obs, rew, done, _ = env.step(agent.act(obs))
    score += rew
    
# close the environment (and its render window) once the episode is over
env.close()
print('score : ', score)

In [None]:
# Load the evaluation history saved by Writer.save() and plot score vs. step.
scores = np.load(pth + '/scores.npy')

# 1st col : time step (shown in thousands), 2nd col : score
steps = scores[:, 0] / 1000.
score_arr = scores[:, 1]

plt.plot(steps, score_arr)

plt.title(env_id)
plt.xlabel(r'step ($\times 10^3 $)')
plt.ylabel('score')
# x-axis spans exactly the first to the last logged step
plt.xlim(steps[0], steps[-1])

plt.grid()
