In [129]:
#Path related stuff
import os
import copy

#Math stuff
import numpy as np
import random

#Environment stuff
import gym
#import mujoco_py


#Pytorch Stuff 
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Normal
import torch.nn.functional as F
from torch.optim import Adam
from torch.optim import NAdam


In [None]:
class ReplayBuffer:
  def __init__(self, buffer_size, batch_size, seed = 4):
    self.buffer = []
    self.max_size = buffer_size
    self.batch_size = batch_size
    self.random_generator = np.random.RandomState(seed)

  def append(self, state, action, noise_action, reward, terminal, next_state):
    if(len(self.buffer) == self.max_size):
      del self.buffer[0]

    self.buffer.append([state, action, noise_action, reward, terminal, next_state])

  def sample(self):
    batch = random.sample(self.buffer, self.batch_size)

    state, action, noise_action, reward, terminal, next_state = map(np.stack, zip(*batch))
    
    return state, action, noise_action, reward, terminal, next_state

  def get_buffer_size(self):
    return len(self.buffer)


In [135]:
LOG_SIG_MAX = 2
LOG_SIG_MIN = -20
epsilon = 1e-6

#Initialize Policy weights
def weights_init_(m):
    if isinstance(m, nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight, gain=1)
        torch.nn.init.constant_(m.bias, 0)

class QNetwork(nn.Module):
    def __init__(self, num_inputs, num_actions, hidden_dim):
        super(QNetwork, self).__init__()
        

        #n_action = *num_actions
        # Q1 architecture
        self.linear1 = nn.Linear(*np.add(num_inputs, num_actions), hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = nn.Linear(hidden_dim, hidden_dim)
        self.linear4 = nn.Linear(hidden_dim, 1)

        # Q2 architecture
        self.linear5 = nn.Linear(*np.add(num_inputs, num_actions), hidden_dim)
        self.linear6 = nn.Linear(hidden_dim, hidden_dim)
        self.linear7 = nn.Linear(hidden_dim, hidden_dim)
        self.linear8 = nn.Linear(hidden_dim, 1)

        self.apply(weights_init_)

    def forward(self, state, action):
        xu = torch.cat([state, action], 1)
        
        x1 = F.relu(self.linear1(xu))
        x1 = F.relu(self.linear2(x1))
        x1 = F.relu(self.linear3(x1))
        
        x1 = self.linear4(x1)

        x2 = F.relu(self.linear5(xu))
        x2 = F.relu(self.linear6(x2))
        x2 = F.relu(self.linear7(x2))
        x2 = self.linear8(x2)

        return x1, x2


class GaussianPolicy(nn.Module):
    def __init__(self, num_inputs, num_actions, hidden_dim, action_range=None):
        super(GaussianPolicy, self).__init__()
        
        self.linear1 = nn.Linear(*num_inputs, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = nn.Linear(hidden_dim, hidden_dim)

        self.mean_linear = nn.Linear(hidden_dim, *num_actions)
        self.log_std_linear = nn.Linear(hidden_dim, *num_actions)

        self.apply(weights_init_)

        # action rescaling
        if action_range is None:
            self.action_scale = torch.tensor(1.)
            self.action_bias = torch.tensor(0.)
        else:
            self.action_scale = torch.FloatTensor([
                (np.max(action_range) - np.min(action_range)) / 2.
                ])
            self.action_bias = torch.FloatTensor([
                (np.max(action_range) - np.max(action_range)) / 2.
                ])

    def forward(self, state):
        x = F.relu(self.linear1(state))
        x = F.relu(self.linear2(x))
        x = F.relu(self.linear3(x))

        mean = self.mean_linear(x)
        log_std = self.log_std_linear(x)
        log_std = torch.clamp(log_std, min=LOG_SIG_MIN, max=LOG_SIG_MAX)
        return mean, log_std

    def sample(self, state):
        
        mean, log_std = self.forward(state)
        std = log_std.exp()
        normal = Normal(mean, std)
        x_t = normal.rsample()  # for reparameterization trick (mean + std * N(0,1))
        y_t = torch.tanh(x_t)
        action = y_t * self.action_scale + self.action_bias
        log_prob = normal.log_prob(x_t)
        # Enforcing Action Bound
        log_prob -= torch.log(self.action_scale * (1 - y_t.pow(2)) + epsilon)
        log_prob = log_prob.sum(1, keepdim=True)
        
        mean = torch.tanh(mean) * self.action_scale + self.action_bias
        return action, log_prob, mean, std

    def to(self, device):
        self.action_scale = self.action_scale.to(device)
        self.action_bias = self.action_bias.to(device)
        return super(GaussianPolicy, self).to(device)

In [136]:
class SAC_Agent(object):
    def __init__(self, observation_space, action_space, gamma, c_lr, a_lr,  tau, alpha, hidden_network_size, 
                 action_range, batch_size):
      

        self.REPLAY_BATCH_SIZE = batch_size
        self.replay_buffer = ReplayBuffer(1000000, self.REPLAY_BATCH_SIZE)

        self.gamma = gamma
        self.clr = c_lr
        self.alr = a_lr
        self.tau = tau
        self.alpha = alpha

        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"

      

        self.critic = QNetwork(observation_space, action_space, hidden_network_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=self.clr)

        self.critic_target = QNetwork(observation_space, action_space, hidden_network_size).to(self.device)
        

        #perfoming a soft copy of the parameters
        for target_param, param in zip( self.critic_target.parameters(), self.critic.parameters()):
            target_param.data.copy_( param.data * self.tau + (1-self.tau) * target_param.data)

        self.policy = GaussianPolicy(observation_space, action_space, hidden_network_size, action_range).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=self.alr)


        self.target_entropy = -torch.prod(torch.Tensor(*action_space).to(self.device)).item()
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha_optim = NAdam([self.log_alpha], lr=0.00005)


    def select_action(self, state, evaluate=False):
        state = torch.FloatTensor([state]).to(self.device)
        if evaluate is False:
            action, _, _, _ = self.policy.sample(state)

        return action.cpu().detach().numpy()[0]

    
    def update_parameters(self):
        # Sample a batch from memory
        if self.replay_buffer.get_buffer_size() < self.REPLAY_BATCH_SIZE:
            return

        state_batch, action_batch, noise_action_batch, reward_batch, mask_batch, next_state_batch = self.replay_buffer.sample()

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device).unsqueeze(1)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)
        
        noise_action_batch = torch.FloatTensor(noise_action_batch).to(self.device)

        intrinsic_reward = torch.norm(action_batch - noise_action_batch, p=2, dim=1, keepdim=True)
        

        with torch.no_grad():

            next_state_action, next_state_log_pi, _, _ = self.policy.sample(next_state_batch)
            
            qf1_next_target, qf2_next_target = self.critic_target(next_state_batch, next_state_action)
            min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - self.alpha * next_state_log_pi
            next_q_value = (reward_batch - intrinsic_reward) + (1-mask_batch) * self.gamma * (min_qf_next_target)

        qf1, qf2 = self.critic(state_batch, action_batch) # Two Q-functions to mitigate positive bias in the policy improvement step

        qf1_loss = F.mse_loss(qf1, next_q_value)
        qf2_loss = F.mse_loss(qf2, next_q_value) 
       
        qf_loss = (qf1_loss + qf2_loss).mean()
        
        self.critic_optim.zero_grad()
        qf_loss.backward()
        self.critic_optim.step()

        for target_param, param in zip( self.critic_target.parameters(), self.critic.parameters()):
            target_param.data.copy_( param.data * self.tau + (1-self.tau) * target_param.data)
        

        pi, log_pi, _, _ = self.policy.sample(state_batch)
        

        qf1_pi, qf2_pi = self.critic(state_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)
        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean()

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()


        # alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()
        # self.alpha_optim.zero_grad()
        # alpha_loss.backward()
        # self.alpha_optim.step()

        return qf1_loss.item(), qf2_loss.item(), policy_loss.item()



In [137]:
import gym
import numpy as np
#from utils import plotLearning

In [138]:
env = gym.make('LunarLanderContinuous-v2')
env2 = gym.make('LunarLanderContinuous-v2')
agent = SAC_Agent(observation_space = [8], action_space=[2], gamma=0.99, c_lr = 0.00025, a_lr = 0.000025, tau = 0.001, alpha = 1.9, hidden_network_size = 256, action_range = [-1,1], batch_size=64)


np.random.seed(0)

score_history = []

for i in range(1500):
    done = False
    score = 0
    obs = env.reset()[0]
    
    for _ in range(600):
        noisy_state_1 = copy.deepcopy(obs)
        noise = np.random.normal(0,0.05, size=6)
        noise = np.concatenate(([0.0,0.0],noise))
        noisy_obs = np.add(obs, noise)
        
        act_noise = agent.select_action(noisy_obs)
        
        #--------------------------------------------------------
        #--------------------------------------------------------

        act = agent.select_action(obs)
        new_state, reward, done, info, _ = env.step(act)
   
        agent.replay_buffer.append(obs, act, act_noise, reward, int(done), new_state)
        agent.update_parameters()
        
        score += reward

        if done:
            break
        
        obs = new_state

    score_history.append(score)
    print('episode', i, 'score %.2f' % score, '100 game average %.2f' % np.mean(score_history[-100:]))
    

    # if i % 25 == 0:
    #     agent.save_models()

# filename = 'lunar-lander.png'
# plotLearning(score_history, filename, window=100)




  if not isinstance(terminated, (bool, np.bool8)):


episode 0 score -89.48 100 game average -89.48
episode 1 score -161.01 100 game average -125.24
episode 2 score -76.35 100 game average -108.94
episode 3 score -312.21 100 game average -159.76
episode 4 score -529.76 100 game average -233.76
episode 5 score -94.80 100 game average -210.60
episode 6 score -214.61 100 game average -211.17
episode 7 score -115.99 100 game average -199.28
episode 8 score -75.18 100 game average -185.49
episode 9 score -226.20 100 game average -189.56
episode 10 score 22.16 100 game average -170.31
episode 11 score -546.15 100 game average -201.63
episode 12 score -91.80 100 game average -193.18
episode 13 score -161.43 100 game average -190.91
episode 14 score -118.70 100 game average -186.10
episode 15 score -237.56 100 game average -189.32
episode 16 score -278.75 100 game average -194.58
episode 17 score -71.93 100 game average -187.76
episode 18 score -91.23 100 game average -182.68
episode 19 score -251.60 100 game average -186.13
episode 20 score -5.

KeyboardInterrupt: 

In [None]:
agent.log_alpha

tensor([-3.1385], device='cuda:0', requires_grad=True)

array([ 0.11204466,  0.0933779 , -0.04886389,  0.04750442, -0.00756786,
       -0.00516094,  0.02052993,  0.00720218])