# Continuous Control

---

In [2]:
from collections import deque

from unityagents import UnityEnvironment
import numpy as np

import random
import collections
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Make double precision the default floating-point tensor type; the networks
# below explicitly cast their inputs with `.double()` to match.
torch.set_default_tensor_type('torch.DoubleTensor')

# Hyperparameters
lr_mu        = 0.0001  # learning rate passed to the Adam optimizer of the actor (mu)
lr_q         = 0.0001  # learning rate passed to the Adam optimizer of the critic (q)
gamma        = 0.99    # discount factor applied to the bootstrapped value in train()
batch_size   = 128     # number of transitions drawn from the replay buffer per update
buffer_limit = 100000  # maxlen of the replay buffer deque
tau          = 0.001 # for target network soft update

class ReplayBuffer():
    """Bounded FIFO store of transitions with uniform random sampling.

    The underlying deque holds at most ``buffer_limit`` items; appending to a
    full buffer silently drops the oldest transition.
    """

    def __init__(self):
        self.buffer = collections.deque(maxlen=buffer_limit)

    def put(self, transition):
        """Append one ``(s, a, r, s_prime, done_mask)`` tuple to the buffer."""
        self.buffer.append(transition)

    def sample(self, n):
        """Draw ``n`` distinct transitions and collate them into tensors.

        Returns a 5-tuple ``(s, a, r, s_prime, done_mask)``. States are cast
        to ``torch.float``; action, reward and done entries are each wrapped
        in a one-element list before conversion, which adds a singleton axis
        after the batch axis.
        """
        picked = random.sample(self.buffer, n)

        # Each comprehension unpacks the full 5-tuple so that a malformed
        # transition still raises, exactly as a plain unpacking loop would.
        states = [s for s, _, _, _, _ in picked]
        actions = [[a] for _, a, _, _, _ in picked]
        rewards = [[r] for _, _, r, _, _ in picked]
        next_states = [s_prime for _, _, _, s_prime, _ in picked]
        done_masks = [[done_mask] for _, _, _, _, done_mask in picked]

        state_batch = torch.tensor(states, dtype=torch.float)
        action_batch = torch.tensor(actions)
        reward_batch = torch.tensor(rewards)
        next_state_batch = torch.tensor(next_states, dtype=torch.float)
        done_batch = torch.tensor(done_masks)
        return state_batch, action_batch, reward_batch, next_state_batch, done_batch

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

class MuNet(nn.Module):
    """Deterministic actor: maps a state to an action in ``[-1, 1]``.

    Args:
        state_size: Number of features in one observation (default 33).
        action_size: Number of action components (default 4).

    The defaults reproduce the original fixed 33 -> 256 -> 128 -> 4 layout,
    and the layer attribute names are unchanged so existing ``state_dict``
    checkpoints still load.
    """

    def __init__(self, state_size=33, action_size=4):
        super(MuNet, self).__init__()
        self.fc1 = nn.Linear(state_size, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc_mu = nn.Linear(128, action_size)

    def forward(self, x):
        """Return the tanh-squashed action for a batch of states ``x``.

        The input is cast once to the dtype of the network's own parameters.
        With double-precision parameters this is the same as ``x.double()``,
        but the module no longer fails when its parameters use another
        floating-point dtype.
        """
        x = x.to(self.fc1.weight.dtype)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        mu = torch.tanh(self.fc_mu(x))
        return mu

class QNet(nn.Module):
    """Critic: estimates Q(s, a) for a batch of state/action pairs.

    The state is embedded by one linear layer; the raw action is then
    concatenated to that embedding before the remaining two layers.

    Args:
        state_size: Number of features in one observation (default 33).
        action_size: Number of action components (default 4).

    The defaults reproduce the original 33 -> 256, (256 + 4) -> 128 -> 1
    layout, and the layer attribute names are unchanged so existing
    ``state_dict`` checkpoints still load.
    """

    def __init__(self, state_size=33, action_size=4):
        super(QNet, self).__init__()

        self.fc_s = nn.Linear(state_size, 256)
        self.fc_q = nn.Linear(256 + action_size, 128)
        self.fc_3 = nn.Linear(128, 1)

    def forward(self, x, a):
        """Return Q-values of shape ``(batch, 1)``.

        Args:
            x: States, one row per sample.
            a: Actions, either ``(batch, action_size)`` or with extra
                singleton axes such as ``(batch, 1, action_size)``.
        """
        dtype = self.fc_s.weight.dtype
        h1 = F.relu(self.fc_s(x.to(dtype)))

        # Flatten everything after the batch axis. A bare ``squeeze()`` would
        # also drop the batch axis when the batch holds a single sample (or
        # the action has a single component), which breaks the concatenation.
        a = a.to(dtype).reshape(h1.shape[0], -1)

        cat = torch.cat([h1, a], dim=1)
        q = F.relu(self.fc_q(cat))
        q = self.fc_3(q)
        return q

class OrnsteinUhlenbeckNoise:
    """Stateful, temporally correlated noise generator.

    Each call advances the internal state by one step: it is pulled toward
    ``mu`` at rate ``theta`` and perturbed by Gaussian noise scaled by
    ``sigma * sqrt(dt)``. The state starts at zero.
    """

    def __init__(self, mu):
        self.theta = 0.15
        self.dt = 0.01
        self.sigma = 0.2
        self.mu = mu
        self.x_prev = np.zeros_like(self.mu)

    def __call__(self):
        """Advance the process one step and return the new state."""
        drift = self.theta * (self.mu - self.x_prev) * self.dt
        diffusion = self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        self.x_prev = self.x_prev + drift + diffusion
        return self.x_prev
      
def train(mu, mu_target, q, q_target, memory, q_optimizer, mu_optimizer):
    """Run one critic update followed by one actor update.

    A mini-batch of ``batch_size`` transitions is drawn from ``memory``.
    The critic ``q`` is regressed (smooth L1) onto the one-step target
    ``r + gamma * q_target(s', mu_target(s'))``; the actor ``mu`` is then
    updated to maximise ``q(s, mu(s))``.

    NOTE(review): the done flag returned by the buffer is unpacked but does
    not enter the target, so every transition bootstraps from the next state.
    """
    states, actions, rewards, next_states, _done = memory.sample(batch_size)

    # Critic step: the target comes from the target networks and is detached
    # so that no gradient flows back through it.
    next_actions = mu_target(next_states)
    td_target = (rewards + gamma * q_target(next_states, next_actions)).detach()
    critic_loss = F.smooth_l1_loss(q(states, actions), td_target)
    q_optimizer.zero_grad()
    critic_loss.backward()
    q_optimizer.step()

    # Actor step: ascend the critic's value of the actor's own actions.
    actor_loss = -q(states, mu(states)).mean()
    mu_optimizer.zero_grad()
    actor_loss.backward()
    mu_optimizer.step()
    
def soft_update(net, net_target):
    """Blend the parameters of ``net`` into ``net_target`` in place.

    Every target parameter becomes ``(1 - tau) * target + tau * source``,
    with parameters paired in the order yielded by ``parameters()``.
    """
    for target_p, source_p in zip(net_target.parameters(), net.parameters()):
        blended = target_p.data * (1.0 - tau) + source_p.data * tau
        target_p.data.copy_(blended)
    

In [None]:
# Training cell: builds the environment, the networks and their optimizers,
# then runs up to 1000 episodes while tracking a 100-episode moving average.

# Alternative binary path, kept for reference:
#env = UnityEnvironment(file_name='Reacher.app')
env = UnityEnvironment(file_name='/data/Reacher_One_Linux_NoVis/Reacher_One_Linux_NoVis.x86_64')

# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

memory = ReplayBuffer()

# Target networks start as exact copies of the online networks.
q, q_target = QNet(), QNet()
q_target.load_state_dict(q.state_dict())
mu, mu_target = MuNet(), MuNet()
mu_target.load_state_dict(mu.state_dict())

score = 0.0
print_interval = 1

mu_optimizer = optim.Adam(mu.parameters(), lr=lr_mu)
q_optimizer  = optim.Adam(q.parameters(), lr=lr_q)
ou_noise = OrnsteinUhlenbeckNoise(mu=np.zeros(4))

reward_buffer = deque(maxlen=100)  # returns of the most recent 100 episodes
reward_hist = []                   # return of every episode
softened_reward = []               # moving average recorded after every episode

try:
    for n_epi in range(1000):
        env_info = env.reset(train_mode=True)[brain_name]
        s = env_info.vector_observations
        # Reset the return at the start of every episode. Resetting only in
        # the print branch let episode 0's return leak into episode 1 and
        # summed several episodes together whenever print_interval > 1.
        score = 0.0

        while True:  # runs until the environment reports the episode as done
            a = mu(torch.from_numpy(s))
            a = a.detach().numpy() + ou_noise()  # exploration noise

            env_info = env.step(a)[brain_name]
            s_prime = env_info.vector_observations
            r = env_info.rewards[0]
            done = env_info.local_done

            memory.put((s[0], a[0], r, s_prime[0], done))
            score += r
            s = s_prime

            # Start learning once more than one batch of transitions exists.
            if memory.size() > batch_size:
                train(mu, mu_target, q, q_target, memory, q_optimizer, mu_optimizer)

                soft_update(mu, mu_target)
                soft_update(q,  q_target)

            if np.any(done):
                reward_buffer.append(score)
                reward_hist.append(score)
                softened_reward.append(np.mean(reward_buffer))
                break

        if n_epi % print_interval == 0 and n_epi != 0:
            print("# of episode :{}, avg score : {:.1f}, score {}".format(n_epi, np.mean(reward_buffer), score))

        if np.mean(reward_buffer) > 30:
            print('Solved!')
finally:
    # Always release the Unity process, even if training fails or is interrupted.
    env.close()

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


# of episode :1, avg score : 0.4, score 0.8099999818950891
# of episode :2, avg score : 0.5, score 0.6999999843537807
# of episode :3, avg score : 0.5, score 0.3199999928474426
# of episode :4, avg score : 0.4, score 0.23999999463558197
# of episode :5, avg score : 0.5, score 0.6599999852478504
# of episode :6, avg score : 0.5, score 0.7099999841302633
# of episode :7, avg score : 0.6, score 1.2499999720603228
# of episode :8, avg score : 0.6, score 0.46999998949468136
# of episode :9, avg score : 0.6, score 1.1399999745190144
# of episode :10, avg score : 0.7, score 0.9399999789893627
# of episode :11, avg score : 0.6, score 0.2199999950826168
# of episode :12, avg score : 0.6, score 0.5699999872595072
# of episode :13, avg score : 0.8, score 2.509999943897128
# of episode :14, avg score : 0.8, score 1.529999965801835
# of episode :15, avg score : 0.9, score 1.6899999622255564
# of episode :16, avg score : 0.9, score 1.409999968484044
# of episode :17, avg score : 0.9, score 0.3799999

In [None]:
# Persist the trained actor and critic weights, one checkpoint file each.
for _net, _path in ((mu, 'mu.pth'), (q, 'q.pth')):
    torch.save(_net.state_dict(), _path)