# Importing Section

In [1]:
import datetime
import itertools
import os
import pickle
import random

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal
from torch.optim import Adam
from torch.utils.tensorboard import SummaryWriter

# Data

In [2]:
# Gym id of the task; kept in sync with the environment created further down.
# (The previous value 'Pendulum-vo' was not a valid gym id.)
env_name = 'LunarLanderContinuous-v2'
# Number of transitions drawn from the replay buffer per gradient update.
batch_size = 32

# Clamp range applied to the policy's predicted log standard deviation.
LOG_SIG_MAX = 2
LOG_SIG_MIN = -20
# Small constant added inside the log of the tanh correction to avoid log(0).
epsilon = 1e-6

# Replay Buffer

In [3]:
class ReplayMemory:
    """Fixed-capacity ring buffer of transitions.

    Every item is the tuple ``(state, action, reward, next_state, done)``
    exactly as handed to :meth:`push`. Once ``capacity`` items are stored,
    new transitions overwrite the oldest ones.
    """

    def __init__(self, capacity, seed):
        """Create an empty buffer.

        Args:
            capacity: Maximum number of stored transitions. Integral floats
                such as ``1e6`` are accepted and converted to ``int``.
            seed: Value passed to ``random.seed``. This seeds the *global*
                ``random`` module, which :meth:`sample` draws from.

        Raises:
            ValueError: If ``capacity`` is smaller than 1.
        """
        capacity = int(capacity)
        if capacity < 1:
            raise ValueError('capacity must be >= 1, got {}'.format(capacity))

        random.seed(seed)
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def push(self, state, action, reward, next_state, done):
        """Store one transition, overwriting the oldest one when full."""
        if len(self.buffer) < self.capacity:
            # Grow the list by one slot; it is filled right below.
            self.buffer.append(None)

        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            Five numpy arrays ``(state, action, reward, next_state, done)``,
            each stacked along a new leading batch axis.

        Raises:
            ValueError: Propagated from ``random.sample`` when ``batch_size``
                exceeds the number of stored transitions.
        """
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = map(np.stack, zip(*batch))
        return state, action, reward, next_state, done

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

    # TO DO SEE HOW THIS INTERACTS WITH MY CODE AND MAYBE CHANGE IT

    def save_buffer(self, env_name, suffix = "", save_path = None):
        """Pickle the stored transitions to disk.

        Args:
            env_name: Used to build the default file name.
            suffix: Appended to the default file name.
            save_path: Explicit destination; when ``None`` the file goes to
                ``checkpoints/sac_buffer_<env_name>_<suffix>``.
        """
        if save_path is None:
            save_path = 'checkpoints/sac_buffer_{}_{}'.format(env_name, suffix)

        # Create the directory that will actually hold the file.
        parent_dir = os.path.dirname(save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        print('Saving buffer to {}'.format(save_path))

        with open(save_path, 'wb') as f:
            pickle.dump(self.buffer, f)

    def load_buffer(self, save_path):
        """Replace the buffer contents with a list pickled by :meth:`save_buffer`.

        WARNING: ``pickle.load`` can execute arbitrary code; only load files
        that come from a trusted source.
        """
        print('Loading buffer from {}'.format(save_path))

        with open(save_path, 'rb') as f:
            self.buffer = pickle.load(f)
            # Resume writing right after the last loaded transition.
            self.position = len(self.buffer) % self.capacity

# Nets

In [4]:
def weights_init_(m):
    """Initialise an ``nn.Linear`` layer in place; ignore any other module.

    Weights get Xavier-uniform initialisation with gain 1 and the bias, when
    the layer has one, is set to zero. Intended for ``module.apply(...)``.
    """
    if isinstance(m, nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight, gain = 1)
        # Layers created with bias=False expose ``bias`` as None.
        if m.bias is not None:
            torch.nn.init.constant_(m.bias, 0)
        
class ValueNetwork(nn.Module):
    """MLP with two ReLU hidden layers and a single-unit linear output."""

    def __init__(self, num_inputs, hidden_dim):
        """Build the three linear layers and apply ``weights_init_`` to them."""
        super().__init__()

        self.linear1 = nn.Linear(num_inputs, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = nn.Linear(hidden_dim, 1)

        self.apply(weights_init_)

    def forward(self, state):
        """Map ``state`` through the network and return the output of the last layer."""
        hidden = torch.relu(self.linear1(state))
        hidden = torch.relu(self.linear2(hidden))
        return self.linear3(hidden)
    
class QNetwork(nn.Module):
    """Two independent Q-value MLPs evaluated on the same (state, action) input."""

    def __init__(self, num_inputs, num_actions, hidden_dim):
        """Create both three-layer heads and apply ``weights_init_`` to every layer."""
        super().__init__()
        joint_dim = num_inputs + num_actions

        # Head 1: linear1 -> linear2 -> linear3
        self.linear1 = nn.Linear(joint_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = nn.Linear(hidden_dim, 1)

        # Head 2: linear4 -> linear5 -> linear6
        self.linear4 = nn.Linear(joint_dim, hidden_dim)
        self.linear5 = nn.Linear(hidden_dim, hidden_dim)
        self.linear6 = nn.Linear(hidden_dim, 1)

        self.apply(weights_init_)

    def forward(self, state, action):
        """Return ``(q1, q2)`` for ``state`` and ``action`` concatenated along dim 1."""
        state_action = torch.cat((state, action), dim=1)

        q1 = torch.relu(self.linear1(state_action))
        q1 = torch.relu(self.linear2(q1))
        q1 = self.linear3(q1)

        q2 = torch.relu(self.linear4(state_action))
        q2 = torch.relu(self.linear5(q2))
        q2 = self.linear6(q2)

        return q1, q2
    
class GaussianPolicy(nn.Module):
    """Tanh-squashed diagonal Gaussian policy.

    The network predicts a mean and a clamped log standard deviation per
    action dimension. Sampled values are passed through ``tanh`` and then
    mapped with ``action_scale`` / ``action_bias``.
    """

    def __init__(self, num_inputs, num_actions, hidden_dim, action_space = None):
        """Build the network.

        Args:
            num_inputs: Size of the state vector.
            num_actions: Number of action dimensions.
            hidden_dim: Width of the two hidden layers.
            action_space: Optional object exposing ``high`` and ``low``;
                when given, actions are rescaled to ``[low, high]``. When
                ``None`` the scale is 1 and the bias 0.
        """
        super(GaussianPolicy, self).__init__()

        self.linear1 = nn.Linear(num_inputs, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)

        self.mean_linear = nn.Linear(hidden_dim, num_actions)
        self.log_std_linear = nn.Linear(hidden_dim, num_actions)

        self.apply(weights_init_)

        # Plain tensor attributes (not parameters/buffers), so they are not
        # part of state_dict(); `to` below moves them along with the module.
        if action_space is None:
            self.action_scale = torch.tensor(1.)
            self.action_bias = torch.tensor(0.)
        else:
            self.action_scale = torch.FloatTensor((action_space.high-action_space.low)/2.)
            self.action_bias = torch.FloatTensor((action_space.high + action_space.low)/2.)

    def forward(self, state):
        """Return ``(mean, log_std)``; ``log_std`` is clamped to [LOG_SIG_MIN, LOG_SIG_MAX]."""
        x = F.relu(self.linear1(state))
        x = F.relu(self.linear2(x))
        mean = self.mean_linear(x)
        log_std = self.log_std_linear(x)
        log_std = torch.clamp(log_std, min = LOG_SIG_MIN, max = LOG_SIG_MAX)
        return mean, log_std

    def sample(self, state):
        """Draw a reparameterised action for each row of ``state``.

        Returns:
            ``(action, log_prob, mean)`` where ``action`` is the squashed and
            rescaled sample, ``log_prob`` is its log-density summed over
            dim 1 (``keepdim=True``), and ``mean`` is the squashed and
            rescaled Gaussian mean.
        """
        mean, log_std = self.forward(state)
        std = log_std.exp()
        normal = Normal(mean, std)
        x_t = normal.rsample()  # reparameterisation trick: gradients flow through the sample
        y_t = torch.tanh(x_t)
        action = y_t * self.action_scale + self.action_bias
        log_prob = normal.log_prob(x_t)
        # Change of variables for a = scale * tanh(u) + bias:
        #   log pi(a) = log N(u) - log|da/du|,  da/du = scale * (1 - tanh(u)^2).
        # `epsilon` keeps the log finite when tanh saturates at +-1.
        log_prob -= torch.log(self.action_scale * (1 - y_t.pow(2)) + epsilon)
        log_prob = log_prob.sum(1, keepdim = True)
        mean = torch.tanh(mean) * self.action_scale + self.action_bias

        return action, log_prob, mean

    def to(self, *args, **kwargs):
        """Move/cast the module and its scale/bias tensors.

        Accepts the same arguments as ``nn.Module.to`` (for example
        ``to(device)``, ``to(device=...)`` or ``to(device, dtype)``).
        """
        self.action_scale = self.action_scale.to(*args, **kwargs)
        self.action_bias = self.action_bias.to(*args, **kwargs)
        return super(GaussianPolicy, self).to(*args, **kwargs)

In [5]:
# Scratch check: summing over dim 1 with keepdim=True yields one column per row.
prova = torch.tensor([[1., 1.], [0., 0.]])
prova.sum(dim=1, keepdim=True)

tensor([[2.],
        [0.]])

In [6]:
# Scratch check: display the layer layout for an 8-dim state and 2-dim action.
QNetwork(num_inputs=8, num_actions=2, hidden_dim=128)

QNetwork(
  (linear1): Linear(in_features=10, out_features=128, bias=True)
  (linear2): Linear(in_features=128, out_features=128, bias=True)
  (linear3): Linear(in_features=128, out_features=1, bias=True)
  (linear4): Linear(in_features=10, out_features=128, bias=True)
  (linear5): Linear(in_features=128, out_features=128, bias=True)
  (linear6): Linear(in_features=128, out_features=1, bias=True)
)

# Utils

In [7]:
def soft_update(target, source, tau):
    """Polyak-average ``source`` into ``target`` in place.

    Each target parameter becomes ``(1 - tau) * target + tau * source``.
    Parameters are paired positionally via ``parameters()``.
    """
    # no_grad instead of `.data`: the in-place write stays outside the autograd
    # graph but remains visible to autograd's version tracking.
    with torch.no_grad():
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.copy_(target_param * (1.0 - tau) + param * tau)
        
def hard_update(target, source):
    """Copy every parameter of ``source`` into ``target`` in place.

    Parameters are paired positionally via ``parameters()``.
    """
    # no_grad instead of `.data`: same in-place copy without bypassing
    # autograd's version tracking.
    with torch.no_grad():
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.copy_(param)

# Agent

In [8]:
# Scratch check of the target-entropy magnitude for a 2-dim action space.
# The shape must be passed as a tuple: torch.FloatTensor(2,) is the same call
# as torch.FloatTensor(2) and allocates an *uninitialised* tensor of size 2.
torch.prod(torch.Tensor((2,))).item()

2.0

In [9]:
# Scratch check: a single trainable zero on the CPU.
torch.zeros(1, device='cpu', requires_grad=True)

tensor([0.], requires_grad=True)

In [10]:
class SAC(object):
    """Soft Actor-Critic agent with twin Q critics and a Gaussian policy.

    Owns the critic, its target copy, the policy, their Adam optimizers and,
    when automatic entropy tuning is enabled, the learnable ``log_alpha``.
    """

    def __init__(self, num_inputs, action_space, dictionary):
        """Build networks and optimizers.

        Args:
            num_inputs: Size of the state vector.
            action_space: Object exposing ``shape`` (and ``high``/``low``,
                which are forwarded to ``GaussianPolicy``).
            dictionary: Hyper-parameters. Keys read here: ``gamma``, ``tau``,
                ``alpha``, ``target_update_interval``, ``policy``,
                ``automatic_entropy_tuning``, ``cuda``, ``hidden_size``,
                ``lr1`` (critic), ``lr2`` (entropy temperature) and
                ``lr3`` (policy).

        Raises:
            NotImplementedError: If ``dictionary['policy']`` is not
                ``'Gaussian'``.
        """
        self.gamma = dictionary['gamma']
        self.tau = dictionary['tau']
        self.alpha = dictionary['alpha']
        self.target_update_interval = dictionary['target_update_interval']

        self.policy_type = dictionary['policy']
        # Normalised to bool so construction and update_parameters agree.
        self.automatic_entropy_tuning = bool(dictionary['automatic_entropy_tuning'])

        # Honour the 'cuda' flag only when a CUDA device is actually present.
        use_cuda = bool(dictionary['cuda']) and torch.cuda.is_available()
        if dictionary['cuda'] and not use_cuda:
            print('CUDA requested but not available, falling back to CPU')
        self.device = torch.device('cuda' if use_cuda else 'cpu')

        self.lr1= dictionary['lr1']
        self.lr2 = dictionary['lr2']
        self.lr3 = dictionary['lr3']

        num_actions = action_space.shape[0]
        hidden_size = dictionary['hidden_size']

        self.critic = QNetwork(num_inputs, num_actions, hidden_size).to(device = self.device)
        self.critic_tg = QNetwork(num_inputs, num_actions, hidden_size).to(device = self.device)
        self.critic_optim = Adam(self.critic.parameters(), self.lr1)

        # Start the target critic as an exact copy of the online critic.
        hard_update(self.critic_tg, self.critic)

        if self.policy_type != 'Gaussian':
            # TO DO Add deterministic
            raise NotImplementedError(
                "Unsupported policy type {!r}; only 'Gaussian' is implemented".format(self.policy_type))

        if self.automatic_entropy_tuning:
            # Target entropy is minus the number of action dimensions.
            self.target_entropy = -float(np.prod(action_space.shape))
            self.log_alpha = torch.zeros(1, requires_grad = True, device = self.device)
            self.alpha_optim = Adam([self.log_alpha], lr = self.lr2)

        self.policy = GaussianPolicy(num_inputs, num_actions, hidden_size, action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), self.lr3)

    def _to_tensor(self, array):
        """Convert array-like data to a float32 tensor on ``self.device``."""
        return torch.as_tensor(array, dtype=torch.float32, device=self.device)

    def select_action(self, state, evaluate = False):
        """Return one action for a single state as a numpy array.

        Args:
            state: A single (unbatched) state.
            evaluate: When true, return the policy mean instead of a sample.
        """
        state = self._to_tensor(state).unsqueeze(0)
        # Acting needs no gradients; skip building the autograd graph.
        with torch.no_grad():
            sampled_action, _, mean_action = self.policy.sample(state)
        action = mean_action if evaluate else sampled_action
        return action.cpu().numpy()[0]

    def update_parameters(self, memory, batch_size, updates):
        """Run one gradient step on the critic, the policy and (optionally) alpha.

        Args:
            memory: Object whose ``sample(batch_size=...)`` returns
                ``(state, action, reward, next_state, mask)`` batches.
            batch_size: Number of transitions to sample.
            updates: Update counter; the target critic is soft-updated when
                ``updates % target_update_interval == 0``.

        Returns:
            Floats ``(qf1_loss, qf2_loss, policy_loss, alpha_loss, alpha)``.
        """
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(batch_size = batch_size)

        state_batch = self._to_tensor(state_batch)
        next_state_batch = self._to_tensor(next_state_batch)
        action_batch = self._to_tensor(action_batch)
        # Rewards and masks become column vectors to line up with the Q outputs.
        reward_batch = self._to_tensor(reward_batch).unsqueeze(1)
        mask_batch = self._to_tensor(mask_batch).unsqueeze(1)

        # Soft Bellman target from the target critic; no gradients needed.
        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.policy.sample(next_state_batch)
            qf1_next_target, qf2_next_target = self.critic_tg(next_state_batch, next_state_action)
            min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - self.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * self.gamma * (min_qf_next_target)

        qf1, qf2 = self.critic(state_batch, action_batch)
        qf1_loss = F.mse_loss(qf1, next_q_value)
        qf2_loss = F.mse_loss(qf2, next_q_value)
        qf_loss = qf1_loss + qf2_loss

        self.critic_optim.zero_grad()
        qf_loss.backward()
        self.critic_optim.step()

        # Policy step: minimise alpha * log_pi - min Q on freshly sampled actions.
        pi, log_pi, _ = self.policy.sample(state_batch)
        qf1_pi, qf2_pi = self.critic(state_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)

        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean()

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        if self.automatic_entropy_tuning:
            alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()

            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()

            # Detached: alpha is used as a constant in the other losses, so it
            # must not carry a graph back to log_alpha.
            self.alpha = self.log_alpha.exp().detach()
            alpha_tlogs = self.alpha.clone()

        else:
            alpha_loss = torch.tensor(0.).to(self.device)
            alpha_tlogs = torch.tensor(self.alpha)

        if updates % self.target_update_interval == 0:
            soft_update(self.critic_tg, self.critic, self.tau)

        return qf1_loss.item(), qf2_loss.item(), policy_loss.item(), alpha_loss.item(), alpha_tlogs.item()

    def save_checkpoint(self, env_name, suffix="", ckpt_path=None):
        """Save network and optimizer state with ``torch.save``.

        Args:
            env_name: Used to build the default file name.
            suffix: Appended to the default file name.
            ckpt_path: Explicit destination; defaults to
                ``checkpoints/sac_checkpoint_<env_name>_<suffix>``.
        """
        if ckpt_path is None:
            ckpt_path = "checkpoints/sac_checkpoint_{}_{}".format(env_name, suffix)
        parent_dir = os.path.dirname(ckpt_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        print('Saving models to {}'.format(ckpt_path))

        checkpoint = {'policy_state_dict': self.policy.state_dict(),
                      'critic_state_dict': self.critic.state_dict(),
                      'critic_target_state_dict': self.critic_tg.state_dict(),
                      'critic_optimizer_state_dict': self.critic_optim.state_dict(),
                      'policy_optimizer_state_dict': self.policy_optim.state_dict()}
        if self.automatic_entropy_tuning:
            # Extra keys so a resumed run keeps its learned temperature.
            checkpoint['log_alpha'] = self.log_alpha.detach().cpu().clone()
            checkpoint['alpha_optimizer_state_dict'] = self.alpha_optim.state_dict()
        torch.save(checkpoint, ckpt_path)

    # Load model parameters
    def load_checkpoint(self, ckpt_path, evaluate=False):
        """Restore state written by :meth:`save_checkpoint`.

        Args:
            ckpt_path: File to load; ``None`` makes this a no-op.
            evaluate: When true the networks are put in ``eval()`` mode,
                otherwise in ``train()`` mode.
        """
        print('Loading models from {}'.format(ckpt_path))
        if ckpt_path is not None:
            # map_location lets a checkpoint saved on GPU load on a CPU-only host.
            checkpoint = torch.load(ckpt_path, map_location=self.device)
            self.policy.load_state_dict(checkpoint['policy_state_dict'])
            self.critic.load_state_dict(checkpoint['critic_state_dict'])
            self.critic_tg.load_state_dict(checkpoint['critic_target_state_dict'])
            self.critic_optim.load_state_dict(checkpoint['critic_optimizer_state_dict'])
            self.policy_optim.load_state_dict(checkpoint['policy_optimizer_state_dict'])

            # Optional keys: absent in checkpoints written by older code.
            if self.automatic_entropy_tuning and 'log_alpha' in checkpoint:
                with torch.no_grad():
                    self.log_alpha.copy_(checkpoint['log_alpha'])
                self.alpha = self.log_alpha.exp().detach()
                if 'alpha_optimizer_state_dict' in checkpoint:
                    self.alpha_optim.load_state_dict(checkpoint['alpha_optimizer_state_dict'])

            if evaluate:
                self.policy.eval()
                self.critic.eval()
                self.critic_tg.eval()
            else:
                self.policy.train()
                self.critic.train()
                self.critic_tg.train()

# Environment Instantiation

In [11]:
# TODO try to install mujoco

In [12]:
# Seed the global torch / numpy generators.
torch.manual_seed(5)
np.random.seed(5)

# Build the environment and seed it together with its action space.
env = gym.make("LunarLanderContinuous-v2")
env.seed(5)
env.action_space.seed(5)

# Show the action / observation shapes and the episode step limit.
print(env.action_space.shape)
print(env.observation_space.shape)
env._max_episode_steps

(2,)
(8,)


1000

# Agent and Memory Instantiation

In [13]:
# Hyper-parameters consumed by SAC.__init__.
dictionary = {
    'gamma': 0.98,                         # discount factor
    # NOTE(review): tau = 0.9 applied every 20 updates is close to a hard
    # copy of the critic into its target - confirm this is intended.
    'tau': 0.9,                            # target-network interpolation factor
    'alpha': 0.5,                          # initial entropy temperature
    'target_update_interval': 20,          # updates between target-network syncs
    'policy': 'Gaussian',
    'cuda': torch.cuda.is_available(),     # use the GPU only when one is present
    'lr1': 1e-4,                           # critic learning rate
    'lr2': 1e-4,                           # entropy-temperature learning rate
    'lr3': 1e-4,                           # policy learning rate
    'hidden_size': 128,
    'automatic_entropy_tuning': True,
}

# Replay-buffer capacity as an integer count of transitions.
capacity_size = int(1e6)
seed = 4

In [14]:
# Build the agent from the environment's spaces and the replay buffer it learns from.
agent = SAC(
    num_inputs=env.observation_space.shape[0],
    action_space=env.action_space,
    dictionary=dictionary,
)
memory = ReplayMemory(capacity=capacity_size, seed=seed)

# Tensorboard Instantiation

In [15]:
# Read the flag from the agent configuration so the run label cannot drift
# from what the agent actually does.
automatic_entropy_tuning = dictionary['automatic_entropy_tuning']

In [16]:
# Label the TensorBoard run with the environment and policy actually in use
# (the run name previously hard-coded 'Pendulum-v0').
writer = SummaryWriter('runs/{}_SAC_{}_{}_{}'.format(
    datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"),
    env.spec.id,
    dictionary['policy'],
    'autotune' if automatic_entropy_tuning else ""))

# Main

In [17]:
# Training-loop settings.
num_episodes = 1000            # episodes to run
initial_step = 500             # environment steps taken with random actions first
max_num_steps = float('inf')   # stop once total steps exceed this
evaluation = True              # run test episodes every 10th training episode

In [18]:
def evaluate_policy(agent, env, episodes=10):
    """Average episode return over ``episodes`` rollouts using the policy mean.

    Kept in a function so the rollout variables do not overwrite the training
    loop's ``state`` / ``done`` / ``episode_reward``.
    """
    avg_reward = 0.
    for _ in range(episodes):
        eval_state = env.reset()
        eval_episode_reward = 0
        eval_done = False
        while not eval_done:
            eval_action = agent.select_action(eval_state, evaluate=True)
            eval_state, eval_reward, eval_done, _ = env.step(eval_action)
            eval_episode_reward += eval_reward
        avg_reward += eval_episode_reward
    return avg_reward / episodes


total_numsteps = 0
updates = 0

try:
    for i_episode in range(num_episodes):
        episode_reward = 0
        episode_steps = 0
        done = False
        state = env.reset()

        while not done:
            # Random actions until `initial_step` environment steps are collected.
            if initial_step > total_numsteps:
                action = env.action_space.sample()
            else:
                action = agent.select_action(state)

            # One gradient update per environment step once enough data is stored.
            if len(memory) > batch_size:
                critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = agent.update_parameters(memory, batch_size, updates)

                writer.add_scalar('loss_critic_1', critic_1_loss, updates)
                writer.add_scalar('loss_critic_2', critic_2_loss, updates)
                writer.add_scalar('loss_policy', policy_loss, updates)
                writer.add_scalar('loss_entropy_loss', ent_loss, updates)
                writer.add_scalar('entropy_temp_alpha', alpha, updates)
                updates += 1

            next_state, reward, done, _ = env.step(action)
            episode_steps += 1
            total_numsteps += 1
            episode_reward += reward

            # Keep bootstrapping (mask = 1) when the episode ended only because
            # the step limit was reached; otherwise mask = 0 on termination.
            mask = 1 if episode_steps == env._max_episode_steps else float(not done)

            memory.push(state, action, reward, next_state, mask)

            state = next_state

        if total_numsteps > max_num_steps:
            break

        writer.add_scalar('reward/train', episode_reward, i_episode)
        print("Episode: {}, total numsteps: {}, episode steps: {}, reward: {}".format(i_episode, total_numsteps, episode_steps, round(episode_reward, 2)))

        if i_episode % 10 == 0 and evaluation is True:
            episodes = 10
            avg_reward = evaluate_policy(agent, env, episodes)

            writer.add_scalar('avg_reward/test', avg_reward, i_episode)

            print("----------------------------------------")
            print("Test Episodes: {}, Avg. Reward: {}".format(episodes, round(avg_reward, 2)))
            print("----------------------------------------")
finally:
    # Also runs on KeyboardInterrupt: persist pending logs and release the env.
    writer.flush()
    env.close()


Episode: 0, total numsteps: 97, episode steps: 97, reward: -269.27
----------------------------------------
Test Episodes: 10, Avg. Reward: -150.44
----------------------------------------
Episode: 1, total numsteps: 197, episode steps: 100, reward: -300.62
Episode: 2, total numsteps: 317, episode steps: 120, reward: -431.93
Episode: 3, total numsteps: 455, episode steps: 138, reward: -305.36
Episode: 4, total numsteps: 578, episode steps: 123, reward: -136.1
Episode: 5, total numsteps: 648, episode steps: 70, reward: -72.01
Episode: 6, total numsteps: 742, episode steps: 94, reward: -262.11
Episode: 7, total numsteps: 865, episode steps: 123, reward: -134.76
Episode: 8, total numsteps: 978, episode steps: 113, reward: -27.82
Episode: 9, total numsteps: 1100, episode steps: 122, reward: -284.35
Episode: 10, total numsteps: 1255, episode steps: 155, reward: -87.95
----------------------------------------
Test Episodes: 10, Avg. Reward: -130.09
----------------------------------------
Ep

Episode: 99, total numsteps: 60364, episode steps: 1000, reward: -23.69
Episode: 100, total numsteps: 61364, episode steps: 1000, reward: -57.23
----------------------------------------
Test Episodes: 10, Avg. Reward: -58.61
----------------------------------------
Episode: 101, total numsteps: 62364, episode steps: 1000, reward: -21.22
Episode: 102, total numsteps: 63364, episode steps: 1000, reward: -76.07
Episode: 103, total numsteps: 64364, episode steps: 1000, reward: 6.75
Episode: 104, total numsteps: 65364, episode steps: 1000, reward: -6.35
Episode: 105, total numsteps: 66364, episode steps: 1000, reward: -44.02
Episode: 106, total numsteps: 67364, episode steps: 1000, reward: -89.16
Episode: 107, total numsteps: 68364, episode steps: 1000, reward: 22.42
Episode: 108, total numsteps: 69364, episode steps: 1000, reward: 36.8
Episode: 109, total numsteps: 70364, episode steps: 1000, reward: -19.88
Episode: 110, total numsteps: 71364, episode steps: 1000, reward: -96.11
----------

Episode: 195, total numsteps: 154274, episode steps: 1000, reward: -65.9
Episode: 196, total numsteps: 155274, episode steps: 1000, reward: -51.27
Episode: 197, total numsteps: 156274, episode steps: 1000, reward: -15.66
Episode: 198, total numsteps: 157274, episode steps: 1000, reward: -16.41
Episode: 199, total numsteps: 158274, episode steps: 1000, reward: 11.97
Episode: 200, total numsteps: 159274, episode steps: 1000, reward: -4.06
----------------------------------------
Test Episodes: 10, Avg. Reward: -9.41
----------------------------------------
Episode: 201, total numsteps: 160274, episode steps: 1000, reward: -7.85
Episode: 202, total numsteps: 161274, episode steps: 1000, reward: -35.32
Episode: 203, total numsteps: 162274, episode steps: 1000, reward: 6.36
Episode: 204, total numsteps: 163274, episode steps: 1000, reward: -50.04
Episode: 205, total numsteps: 164274, episode steps: 1000, reward: 8.82
Episode: 206, total numsteps: 165274, episode steps: 1000, reward: -30.26


KeyboardInterrupt: 

In [None]:
# `device` is not defined at notebook scope; inspect the agent's device instead.
agent.device