In [1]:
#https://github.com/seungeunrho/minimalRL/blob/master/ddpg.py

In [2]:
from unityagents import UnityEnvironment
import numpy as np

import random
import collections
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.set_default_tensor_type('torch.DoubleTensor')

#Hyperparameters
lr_mu        = 0.0001
lr_q         = 0.0001
gamma        = 0.99
batch_size   = 128
buffer_limit = 100000
tau          = 0.001 # for target network soft update

class ReplayBuffer():
    def __init__(self):
        self.buffer = collections.deque(maxlen=buffer_limit)

    def put(self, transition):
        self.buffer.append(transition)
    
    def sample(self, n):
        mini_batch = random.sample(self.buffer, n)
        s_lst, a_lst, r_lst, s_prime_lst, done_mask_lst = [], [], [], [], []

        for transition in mini_batch:
            s, a, r, s_prime, done_mask = transition
            s_lst.append(s)
            a_lst.append([a])
            r_lst.append([r])
            s_prime_lst.append(s_prime)
            done_mask_lst.append([done_mask])
        
        return torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \
               torch.tensor(r_lst), torch.tensor(s_prime_lst, dtype=torch.float), \
               torch.tensor(done_mask_lst)
    
    def size(self):
        return len(self.buffer)

class MuNet(nn.Module):
    def __init__(self):
        super(MuNet, self).__init__()
        self.fc1 = nn.Linear(33, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc_mu = nn.Linear(128, 4)

    def forward(self, x):
        x = F.relu(self.fc1(x.double()))
        x = F.relu(self.fc2(x.double()))
        mu = torch.tanh(self.fc_mu(x))
        return mu

class QNet(nn.Module):
    def __init__(self):
        super(QNet, self).__init__()
        
        self.fc_s = nn.Linear(33, 256)
        #self.fc_a = nn.Linear(4,  28)
        self.fc_q = nn.Linear(260, 128)
        self.fc_3 = nn.Linear(128,  1)

    def forward(self, x, a):
        h1 = self.fc_s(x.double())
        h1 = F.relu(h1)
        
        a = torch.squeeze(a.double())
        #h2 = self.fc_a(a)
        #h2 = F.relu(h2)

        cat = torch.cat([h1, a], dim=1)
        q = F.relu(self.fc_q(cat))
        q = self.fc_3(q)
        return q

class OrnsteinUhlenbeckNoise:
    def __init__(self, mu):
        self.theta, self.dt, self.sigma = 0.15, 0.01, 0.2
        self.mu = mu
        self.x_prev = np.zeros_like(self.mu)

    def __call__(self):
        x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + \
                self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        self.x_prev = x
        return x
      
def train(mu, mu_target, q, q_target, memory, q_optimizer, mu_optimizer):
    s,a,r,s_prime,done_mask  = memory.sample(batch_size)
    
    target = r + gamma * q_target(s_prime, mu_target(s_prime))
    q_loss = F.smooth_l1_loss(q(s,a), target.detach())
    q_optimizer.zero_grad()
    q_loss.backward()
    q_optimizer.step()
    
    mu_loss = -q(s,mu(s)).mean() # That's all for the policy loss.
    mu_optimizer.zero_grad()
    mu_loss.backward()
    mu_optimizer.step()
    
def soft_update(net, net_target):
    for param_target, param in zip(net_target.parameters(), net.parameters()):
        param_target.data.copy_(param_target.data * (1.0 - tau) + param.data * tau)
    

In [3]:
env = UnityEnvironment(file_name='Reacher.app')

# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

memory = ReplayBuffer()

q, q_target = QNet(), QNet()
q_target.load_state_dict(q.state_dict())
mu, mu_target = MuNet(), MuNet()
mu_target.load_state_dict(mu.state_dict())

score = 0.0
print_interval = 1

mu_optimizer = optim.Adam(mu.parameters(), lr=lr_mu)
q_optimizer  = optim.Adam(q.parameters(), lr=lr_q)
ou_noise = OrnsteinUhlenbeckNoise(mu=np.zeros(4))

for n_epi in range(1000):
    env_info = env.reset(train_mode=True)[brain_name]
    s = env_info.vector_observations

    while True: # maximum length of episode is 200 for Pendulum-v0
        a = mu(torch.from_numpy(s)) 
        a = a.detach().numpy() + ou_noise()
        
        env_info = env.step(a)[brain_name]
        s_prime = env_info.vector_observations
        r = env_info.rewards[0]
        done = env_info.local_done
        
        memory.put((s[0], a[0], r,s_prime[0], done))
        score += r
        s = s_prime
        
        if memory.size() > batch_size:
            train(mu, mu_target, q, q_target, memory, q_optimizer, mu_optimizer)
            
            soft_update(mu, mu_target)
            soft_update(q,  q_target)
            
        if np.any(done):
            #print(f'episode {n_epi}: {score}')
            break
    
    if n_epi%print_interval==0 and n_epi!=0:
        print("# of episode :{}, avg score : {:.1f}".format(n_epi, score/print_interval))
        score = 0.0

env.close()

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_size -> 5.0
		goal_speed -> 1.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


# of episode :1, avg score : 2.2
# of episode :2, avg score : 0.3
# of episode :3, avg score : 0.8
# of episode :4, avg score : 0.4
# of episode :5, avg score : 0.8
# of episode :6, avg score : 0.5
# of episode :7, avg score : 0.5
# of episode :8, avg score : 0.5
# of episode :9, avg score : 0.0
# of episode :10, avg score : 0.7
# of episode :11, avg score : 0.1
# of episode :12, avg score : 1.2
# of episode :13, avg score : 0.7
# of episode :14, avg score : 0.4
# of episode :15, avg score : 0.4
# of episode :16, avg score : 0.5
# of episode :17, avg score : 0.1
# of episode :18, avg score : 0.0
# of episode :19, avg score : 0.2
# of episode :20, avg score : 1.5
# of episode :21, avg score : 1.0
# of episode :22, avg score : 0.6
# of episode :23, avg score : 0.7
# of episode :24, avg score : 0.7
# of episode :25, avg score : 0.7
# of episode :26, avg score : 1.3
# of episode :27, avg score : 0.5
# of episode :28, avg score : 1.0
# of episode :29, avg score : 0.1
# of episode :30, avg s

# of episode :236, avg score : 26.4
# of episode :237, avg score : 27.6
# of episode :238, avg score : 30.6
# of episode :239, avg score : 31.8
# of episode :240, avg score : 29.6
# of episode :241, avg score : 30.3
# of episode :242, avg score : 27.6
# of episode :243, avg score : 30.6
# of episode :244, avg score : 24.8
# of episode :245, avg score : 29.2
# of episode :246, avg score : 28.4
# of episode :247, avg score : 32.4
# of episode :248, avg score : 28.2
# of episode :249, avg score : 24.8
# of episode :250, avg score : 31.6
# of episode :251, avg score : 30.1
# of episode :252, avg score : 21.6
# of episode :253, avg score : 21.0
# of episode :254, avg score : 37.6
# of episode :255, avg score : 23.9
# of episode :256, avg score : 28.1
# of episode :257, avg score : 33.0
# of episode :258, avg score : 29.0
# of episode :259, avg score : 28.7
# of episode :260, avg score : 34.3
# of episode :261, avg score : 32.7
# of episode :262, avg score : 26.0
# of episode :263, avg score

KeyboardInterrupt: 

In [10]:
torch.save(mu.state_dict(), 'mu.pth')
torch.save(q.state_dict(), 'q.pth')

In [5]:
q

QNet(
  (fc_s): Linear(in_features=33, out_features=256, bias=True)
  (fc_q): Linear(in_features=260, out_features=128, bias=True)
  (fc_3): Linear(in_features=128, out_features=1, bias=True)
)