In [90]:
import os
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

import random

In [91]:
class OUActionNoise(object):
    def __init__(self, mu, sigma=0.15, theta=0.2, dt= 1e-2, x0=None):

        self.theta = theta 
        self.mu = mu
        self.sigma = sigma
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + \
            self.sigma * np.sqrt(self.dt) * np.random.normal(size = self.mu.shape)
        self.x_prev = x
        return x
    
    def reset(self):
        self.x_prev =  self.x0 if self.x0 is not None else np.zeros_like(self.mu)

In [92]:
class ReplayBuffer:
  def __init__(self, buffer_size, batch_size, seed = 0):
    self.buffer = []
    self.max_size = buffer_size
    self.batch_size = batch_size
    self.random_generator = np.random.RandomState(seed)

  def append(self, state, action, reward, terminal, next_state):
    if(len(self.buffer) == self.max_size):
      del self.buffer[0]

    self.buffer.append([state, action, reward, terminal, next_state])

  def sample(self):
    batch = random.sample(self.buffer, self.batch_size)

    state, action, reward, terminal, next_state = map(np.stack, zip(*batch))
    
    return state, action, reward, terminal, next_state

  def get_buffer_size(self):
    return len(self.buffer)

In [93]:
class CriticNet(nn.Module):
    def __init__(self, beta, input_dims, fc1_dims, fc2_dims, n_actions, name, chkpt_dir = 'tmp/ddpg'):
        super(CriticNet, self).__init__()

        self.input_dims = input_dims
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.n_actions = n_actions

        self.checkpoint_file = os.path.join(chkpt_dir, name+'_ddpg')

        self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims)
        #initializing the weights
        f1 = 1/np.sqrt(self.fc1.weight.data.size()[0])
        T.nn.init.uniform_(self.fc1.weight.data, -f1, f1)
        T.nn.init.uniform_(self.fc1.bias.data, -f1, f1)
        self.bn1 = nn.LayerNorm(self.fc1_dims)
        
        self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
        #initializing the weights
        f2 = 1/ np.sqrt(self.fc2.weight.data.size()[0])
        T.nn.init.uniform_(self.fc2.weight.data, -f2, f2)
        T.nn.init.uniform_(self.fc2.bias.data, -f2, f2)
        self.bn2 = nn.LayerNorm(self.fc2_dims)

        self.action_value = nn.Linear(self.n_actions, self.fc2_dims)
        
        f3 = 0.003
        self.q = nn.Linear(self.fc2_dims, 1)
        T.nn.init.uniform_(self.q.weight.data, -f3, f3)
        T.nn.init.uniform_(self.q.bias.data, -f3, f3)

        self.optimizer = optim.Adam(self.parameters(), lr = beta)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')

        self.to(self.device)

    def forward(self, state, action):
        state_value = self.fc1(state)
        state_value = self.bn1(state_value)
        state_value = F.relu(state_value)

        state_value = self.fc2(state_value)
        state_value = self.bn2(state_value)

        action_value = F.relu(self.action_value(action))
        state_action_value = F.relu(T.add(state_value, action_value))
        state_action_value = self.q(state_action_value)

        return state_action_value

    def save_checkpoint(self):
        print(".... saving checkpoint ....")
        T.save(self.state_dict(), self.checkpoint_file)
    
    def load_checkpoint(self):
        print(".... loading checkpoint ...")
        self.load_state_dict(T.load(self.checkpoint_file))

In [94]:
class ActorNetwork(nn.Module):
    def __init__(self, alpha, input_dims, fc1_dims, fc2_dims, n_actions, name, chkpt_dir = 'tmp/ddpg'):
        super(ActorNetwork, self).__init__()
        self.input_dims = input_dims
        self.n_actions = n_actions
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims

        self.checkpoint_file = os.path.join(chkpt_dir, name+'_ddpg')
        self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims)
        #Init weights
        f1 = 1/np.sqrt(self.fc1.weight.data.size()[0])
        T.nn.init.uniform_(self.fc1.weight.data, -f1, f1)
        T.nn.init.uniform_(self.fc1.bias.data, -f1, f1)
        self.bn1 = nn.LayerNorm(self.fc1_dims)

        self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
        #Init weights
        f2 = 1/np.sqrt(self.fc2.weight.data.size()[0])
        T.nn.init.uniform_(self.fc2.weight.data, -f2, f2)
        T.nn.init.uniform_(self.fc2.bias.data, -f2, f2)
        self.bn2 = nn.LayerNorm(self.fc2_dims)

        f3 = 0.003
        self.mu = nn.Linear(self.fc2_dims, self.n_actions)
        T.nn.init.uniform_(self.mu.weight.data, -f3, f3)
        T.nn.init.uniform_(self.mu.bias.data, -f3, f3)

        self.optimizer = optim.Adam(self.parameters(), lr=alpha)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')

        self.to(self.device)

    def forward(self, state):
        state = self.fc1(state)
        state = self.bn1(state)
        state = F.relu(state)
        state = self.fc2(state)
        state = self.bn2(state)
        state = F.relu(state)
        output = T.tanh(self.mu(state))

        return output


    def save_checkpoint(self):
        print(".... saving checkpoint ....")
        T.save(self.state_dict(), self.checkpoint_file)
    
    def load_checkpoint(self):
        print(".... loading checkpoint ...")
        self.load_state_dict(T.load(self.checkpoint_file))

In [95]:
class Agent(object):
    def __init__(self, alpha, beta, input_dims, tau, env, 
               gamma = 0.99, n_actions = 2, max_size = 1000000, 
               layer1_size = 400, layer2_size = 300, batch_size = 64):

        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, batch_size)
        self.batch_size = batch_size

        self.actor = ActorNetwork(alpha, input_dims, layer1_size,
                                   layer2_size, n_actions=n_actions, name='Actor')

        self.target_actor = ActorNetwork(alpha, input_dims, layer1_size,
                                   layer2_size, n_actions=n_actions, name='TargetActor')


        self.critic = CriticNet(beta, input_dims, layer1_size,
                                   layer2_size, n_actions=n_actions, name='Critic')

        self.target_critic = CriticNet(beta, input_dims, layer1_size,
                                   layer2_size, n_actions=n_actions, name='TargetCritic')

        self.noise = OUActionNoise(mu = np.zeros(n_actions))

        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        self.actor.eval()
     
        observation = T.tensor([observation], dtype = T.float).to(self.actor.device)

        mu = self.actor.forward(observation).to(self.actor.device)
        mu_prime = mu + T.tensor(self.noise(),
                                 dtype = T.float).to(self.actor.device)

        self.actor.train()
        return mu_prime.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, new_state, done):
        self.memory.append(state, action, reward, new_state, done)

    def learn(self):
        if self.memory.get_buffer_size() < self.batch_size:
            return
        
        state, action, reward, next_state, terminal = self.memory.sample()

        state_batch = T.FloatTensor(state).to(self.actor.device)
        next_state_batch = T.FloatTensor(next_state).to(self.actor.device)
        action_batch = T.FloatTensor(action).to(self.actor.device)
        reward_batch = T.FloatTensor(reward).to(self.actor.device).unsqueeze(1)
        terminal_batch = T.FloatTensor(terminal).to(self.actor.device).unsqueeze(1)

        self.target_actor.eval()
        self.target_critic.eval()
        self.critic.eval()

        target_actions = self.target_actor.forward(next_state_batch)
        target_critic_value = self.target_critic.forward(next_state_batch, target_actions)
        critic_value = self.critic.forward(state_batch, action_batch)

        td = reward_batch + self.gamma * (1 - terminal_batch) * target_critic_value - critic_value
      
        critic_loss = (td ** 2).mean()
        

        self.critic.train()

        self.critic.optimizer.zero_grad()
        critic_loss.backward()
        self.critic.optimizer.step()
        self.critic.eval()

        self.actor.optimizer.zero_grad()
        mu = self.actor.forward(state_batch)
        self.actor.train()
        actor_loss = -self.critic.forward(state_batch, mu)
        actor_loss = T.mean(actor_loss)
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

    def update_network_parameters(self, tau = None):
        if tau is None:
            tau = self.tau

        actor_params = self.actor.named_parameters()
        critic_params = self.critic.named_parameters()
        target_actor_params = self.target_actor.named_parameters()
        target_critic_params = self.target_critic.named_parameters()

        critic_state_dict = dict(critic_params)
        actor_state_dict = dict(actor_params)
        target_critict_dict = dict(target_critic_params)
        target_actor_dict = dict(target_actor_params)


        for name in critic_state_dict:
            critic_state_dict[name] = tau * critic_state_dict[name].clone() + (1-tau) * target_critict_dict[name].clone()

        self.target_critic.load_state_dict(critic_state_dict)

        for name in actor_state_dict:
            actor_state_dict[name] = tau * actor_state_dict[name].clone() + (1-tau) * target_actor_dict[name].clone()

        self.target_actor.load_state_dict(actor_state_dict)

    def save_models(self):
        self.actor.save_checkpoint()
        self.critic.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.target_critic.save_checkpoint()

    def load_models(self):
        self.actor.load_checkpoint()
        self.critic.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.target_critic.load_checkpoint()

In [96]:
import gym
import numpy as np
#from utils import plotLearning


In [98]:
env = gym.make('LunarLanderContinuous-v2')
agent = Agent(alpha = 0.000025, beta= 0.00025, input_dims = [8], tau=0.001, env = env, batch_size=64, layer1_size=400, layer2_size=300, n_actions = 2)

np.random.seed(0)

score_history = []

for i in range(1500):
    done = False
    score = 0
    obs = env.reset()[0]
    
    while not done:
        
        act = agent.choose_action(obs)
        print(act)
        new_state, reward, done, info, _ = env.step(act)
        agent.remember(obs,act,reward,new_state, int(done))
        agent.learn()

        score += reward
        obs = new_state

    score_history.append(score)
    print('episode', i, 'score %.2f' % score, '100 game average %.2f' % np.mean(score_history[-100:]))

    # if i % 25 == 0:
    #     agent.save_models()

# filename = 'lunar-lander.png'
# plotLearning(score_history, filename, window=100)




[-0.00406663  0.00155556]
[0.01016638 0.0348267 ]
[0.03831946 0.02015677]
[0.05270043 0.01790472]
[0.05096894 0.02401411]
[0.05319593 0.04601924]
[0.06468512 0.0480765 ]
[0.07140475 0.05320141]
[0.09377005 0.05009263]
[0.09836566 0.03728613]
[0.05986036 0.0469351 ]
[0.07268023 0.03573225]
[0.10669285 0.01394102]
[0.10719976 0.01102991]
[0.12997803 0.03306344]
[0.1322335  0.03884035]
[0.11871859 0.00903658]
[0.11321467 0.01125168]
[0.13156402 0.02934746]
[0.12552756 0.02475123]
[0.10945971 0.00329004]
[0.08376385 0.03266037]
[0.07596058 0.02594471]
[0.05687886 0.03728554]
[0.03250423 0.03388443]
[0.01913264 0.03969482]
[0.01163173 0.02209189]
[0.01139783 0.02867567]
[0.01256338 0.03332879]
[0.00320122 0.02799425]
[-0.00685771  0.02255445]
[-0.01909531 -0.00392472]
[-0.01653371 -0.01052235]
[-0.0411497  -0.00419524]
[-0.05478102 -0.00399873]
[-0.04381657 -0.00267074]
[-0.02669076 -0.02179175]
[-0.02064181 -0.03259953]
[-0.03375666 -0.04183081]
[-0.03853888 -0.04154197]
[-0.05618752 -0.02

  if not isinstance(terminated, (bool, np.bool8)):


[0.00810658 0.26438794]
[0.00857037 0.29075176]
episode 0 score -140.17 100 game average -140.17
[0.03503239 0.26397565]
[0.03639638 0.2497655 ]
[0.05551297 0.22893426]
[0.04001869 0.21735688]
[0.03408635 0.24088521]
[0.04994725 0.23688455]
[0.03214039 0.24384153]
[0.01669678 0.21564244]
[0.03318577 0.21550775]
[0.04519006 0.21558575]
[0.05534215 0.2009757 ]
[0.03644572 0.20642301]
[0.02024749 0.19117674]
[0.00890482 0.18656763]
[-0.00097256  0.16141744]
[-0.01606302  0.12295166]
[-0.01221024  0.09351998]
[-0.03431176  0.08857978]
[-0.05028898  0.10564658]
[-0.07372098  0.10320713]
[-0.07706809  0.07894905]
[-0.07050628  0.06953307]
[-0.05918932  0.07496325]
[-0.02614803  0.08800506]
[-0.03072566  0.07723214]
[-0.012959    0.07976533]
[-0.00179765  0.04813355]
[-0.00050997  0.02968216]
[0.00509045 0.02104415]
[0.02021794 0.01917885]
[0.03305578 0.00588426]
[ 0.01905957 -0.00635937]
[ 0.01808951 -0.00665125]
[ 0.05076409 -0.01268488]
[ 0.03423452 -0.02306581]
[ 0.02381545 -0.02085544]
[

KeyboardInterrupt: 