In [1]:
import numpy as np
import copy
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
from torch.distributions import Normal

import gym
from collections import deque
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (16, 10)

In [2]:
class LinearLayer(nn.Module):
    def __init__(self, in_features, out_features):
        super().__init__()
        self.linear_layer = nn.Sequential(nn.Linear(in_features, out_features), nn.ReLU())
    def forward(self, x):
        x = self.linear_layer(x)
        return x

In [48]:
class ActorStochastic(nn.Module):
    def __init__(self, s_dim, a_dim, action_boundaries, hidden_dim=256, num_hidden_layers=2):
        super(ActorStochastic, self).__init__()

        self.s_dim = s_dim
        self.a_dim = a_dim

        layers = [LinearLayer(s_dim, hidden_dim)]
        for _ in range(num_hidden_layers):
            layers.append(LinearLayer(hidden_dim, hidden_dim))

        self.f = nn.Sequential(*layers)
        self.mean = nn.Linear(hidden_dim, a_dim)
        self.log_std = nn.Linear(hidden_dim, a_dim)

        self.action_boundaries = action_boundaries

    def forward(self, state):
        if not isinstance(state, torch.Tensor):
            state = torch.tensor(state, dtype=torch.float32)

        out = self.f(state)
        mean = self.mean(out)
        log_std = self.log_std(out)

        #out = torch.tanh(out)
        #log_std = torch.tanh(log_std)
        #std = torch.exp(log_std)
        #noise = torch.normal(0, 1, out.shape[0])
        #action = mean + noise * std
        #dist=torch.distributions.Normal(mean, log_std)
        # log_std = torch.clamp(log_std, -20, 2)
        
        log_std = torch.clamp(log_std, -2, 2)
        
        return mean, log_std

    def sampling(self, state):
        mean, log_std = self.forward(state)
        std = log_std.exp()
        dist = Normal(mean, std)
        x_t = dist.rsample() #reparametrization trick implemented by pytorch
        action = torch.tanh(x_t) #Bounds the action
        log_prob = dist.log_prob(x_t) # Log probability(/ies if state in batch)
        log_Jacobian = torch.log(1-action**2+1e-10).sum(dim=1, keepdim=True)
        #print(log_Jacobian)
        log_prob -=log_Jacobian
        return action, log_prob


In [49]:
class QDNN(nn.Module):
    def __init__(self, s_dim, a_dim, hidden_dim=256, num_hidden_layers=2):
        super(QDNN, self).__init__()

        self.s_dim = s_dim
        self.a_dim = a_dim

        layers = [LinearLayer(s_dim + a_dim, hidden_dim)]
        for _ in range(num_hidden_layers):
            layers.append(LinearLayer(hidden_dim, hidden_dim))
        layers.append(nn.Linear(hidden_dim, 1))

        self.f = nn.Sequential(*layers)

    def forward(self, state, action):
        x = torch.cat([state, action], dim=1)
        out = self.f(x)
        return out

In [50]:
class Critic(nn.Module):
    def __init__(self, s_dim, a_dim, hidden_dim=256, num_hidden_layers=2):
        super(Critic, self).__init__()

        self.s_dim = s_dim
        self.a_dim = a_dim

        # Two DNNs to mitigate positive bias
        self.Q1 = QDNN(s_dim, a_dim, hidden_dim, num_hidden_layers)
        self.Q2 = QDNN(s_dim, a_dim, hidden_dim, num_hidden_layers)

    def forward(self, state, action):
        q1 = self.Q1(state, action)
        q2 = self.Q2(state, action)
        return q1, q2

In [51]:
class SAC():
    def __init__(self, s_dim, a_dim, hidden_dim_actor=256, hidden_dim_critic=256, 
                 num_layer_actor=2, num_layer_critic=2, lr_act=3e-4, lr_crit=3e-4, 
                 gamma=0.99, tau=0.005, alpha=0.2, lambd=0.005, target_upd_inter=1, 
                 buffer_capacity=int(1000), batch_size=32, grad_steps = 1, device="cpu"):
        
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.device = device
        self.batch_size = batch_size
        self.buffer = deque(maxlen=buffer_capacity)
        self.grad_steps = grad_steps

        self.alpha = alpha
        self.lambd = lambd
        self.gamma = gamma
        self.tau = tau
        self.target_upd_inter = target_upd_inter

        self.actor = ActorStochastic(s_dim, a_dim, hidden_dim_actor, num_hidden_layers=num_layer_actor).to(device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=lr_act)

        self.critic = Critic(s_dim, a_dim, hidden_dim_critic, num_hidden_layers=num_layer_critic).to(device)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=lr_crit)
        self.critic_target = copy.deepcopy(self.critic).to(device)

    def choose_action(self, state, evaluate=False):
        #print(state.shape)
        if evaluate:
            # Choosing action to give to the environnement and not train the model
            with torch.no_grad():
                state = torch.tensor(state, dtype=torch.float32, device=self.device).unsqueeze(0)
                action, log_prob = self.actor.sampling(state)

            return action.cpu().detach().numpy()[0], log_prob
        
        state = torch.tensor(state, dtype=torch.float32, device=self.device)
        # To train actor model and critic
        action, log_prob = self.actor.sampling(state)
        
        return action, log_prob
    
    def critic_train(self, states, actions, rewards, next_states, dones):
        
        with torch.no_grad():
            next_actions, next_log_probs = self.choose_action(next_states, evaluate=False)
            q1_next, q2_next = self.critic_target(next_states, next_actions)
            min_q_next = torch.min(q1_next, q2_next)
            target_q_value = rewards + self.gamma *(torch.ones_like(dones)-dones).unsqueeze(1)*(min_q_next - self.alpha * next_log_probs)
     
        
        #print(target_q_value.shape)
        q1, q2 = self.critic(states, actions)
        #print(q1.shape, q2.shape)    
        critic_loss = F.mse_loss(q1, target_q_value) + F.mse_loss(q2, target_q_value)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        return critic_loss

    
    def actor_train(self, states):
        actions, log_probs = self.choose_action(states, evaluate=False)
        q1_actor, q2_actor = self.critic(states, actions)
        min_q_actor = torch.min(q1_actor, q2_actor)

        #print(actions.shape, log_probs.shape)
            
        actor_loss = (self.alpha * log_probs.unsqueeze(1) - min_q_actor).mean()

        #print(actor_loss)
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        return actor_loss

    def train(self, update_interval):
        if len(self.buffer) < self.batch_size:
            return None, None

        for i in range(self.grad_steps):
            
            states, actions, rewards, next_states, dones = self.sample_batch()
            
            # Critic train
            self.critic.train()
            critic_loss= self.critic_train(states, actions, rewards, next_states, dones)

            # Actor train
            self.actor.train()
            actor_loss = self.actor_train(states)

            # Soft update of target networks
            if update_interval % self.target_upd_inter == 0:
                for target_parameters, parameters in zip(self.critic_target.parameters(), self.critic.parameters()):
                    target_parameters.data.copy_(self.tau * parameters.data + (1.0 - self.tau) * target_parameters.data)

        return critic_loss.item(), actor_loss.item()

    def add_elements_to_buffer(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample_batch(self):
        batch = random.sample(self.buffer, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        states = torch.tensor(states, dtype=torch.float32).to(self.device)
        actions = torch.tensor(actions, dtype=torch.float32).to(self.device)
        rewards = torch.tensor(rewards, dtype=torch.float32).unsqueeze(1).to(self.device)
        next_states = torch.tensor(next_states, dtype=torch.float32).to(self.device)
        dones = torch.tensor(dones, dtype=torch.float32).to(self.device)

        return states, actions, rewards, next_states, dones

## Cartpole

In [52]:
# Create the environment
env = gym.make('MountainCarContinuous-v0')

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters
s_dim = env.observation_space.shape[0]
a_dim = env.action_space.shape[0]
hidden_dim_actor = 128
hidden_dim_critic = 128
num_layer_actor = 2
num_layer_critic = 2
lr_act = 3e-4
lr_crit = 3e-4
gamma = 0.99
tau = 0.05
alpha = 0.2
batch_size = 64
num_episodes = 100
update_interval = 1
target_upd_inter = 1
buffer_capacity = int(1e3)
grad_step=2

#print(a_dim)
# Initialize SAC agent
agent = SAC(s_dim, a_dim, hidden_dim_actor=hidden_dim_actor, hidden_dim_critic=hidden_dim_critic, 
            num_layer_actor=num_layer_actor, num_layer_critic=num_layer_critic, lr_act=lr_act, 
            lr_crit=lr_crit, gamma=gamma, tau=tau, alpha=alpha, batch_size=batch_size, 
            grad_steps=grad_step, device=device, buffer_capacity=buffer_capacity)

# Training loop
episode_rewards = []
critic_loss=None
actor_loss=None

for episode in range(num_episodes):
    state = env.reset() # shape of (2,)
    episode_reward = 0
    for t in range(1000):
        #print(state.shape)
        action, _ = agent.choose_action(state, evaluate=True)
        next_state, reward, done, _ = env.step(action)
        agent.add_elements_to_buffer(state, action, reward, next_state, done)
        state = next_state.copy()
        episode_reward += reward
        
        if t % update_interval == 0:
            critic_loss, actor_loss = agent.train(t)

        if done:
            break

    episode_rewards.append(episode_reward)
    if critic_loss is not None:
        print("critic loss: ",critic_loss, "actor loss : ", actor_loss)
    if (episode + 1) % 10 == 0:
        print(f"Episode {episode + 1}, Reward: {episode_reward}")

# Plotting
plt.plot(episode_rewards)
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.title('SAC on MountainCarContinuous-v0')
plt.show()

  state = torch.tensor(state, dtype=torch.float32, device=self.device)


critic loss:  0.026462532579898834 actor loss :  -6.294869899749756
critic loss:  0.01235608197748661 actor loss :  -8.125930786132812
critic loss:  0.0032486505806446075 actor loss :  -8.753801345825195
critic loss:  0.006489700637757778 actor loss :  -8.840635299682617
critic loss:  0.8339219093322754 actor loss :  -8.201393127441406
critic loss:  0.003921173978596926 actor loss :  -8.504669189453125
critic loss:  0.003561805235221982 actor loss :  -8.508953094482422
critic loss:  2.3079466819763184 actor loss :  -8.50600814819336
critic loss:  0.023715583607554436 actor loss :  -8.06904125213623
critic loss:  0.00984214711934328 actor loss :  -8.285770416259766
Episode 10, Reward: -28.531345378714583
critic loss:  1.856057047843933 actor loss :  -8.389808654785156
critic loss:  0.008682656101882458 actor loss :  -8.44276237487793
critic loss:  0.03514431044459343 actor loss :  -8.338154792785645
critic loss:  0.006717576179653406 actor loss :  -8.245298385620117
critic loss:  0.0053

KeyboardInterrupt: 

In [None]:
state = env.reset()

In [None]:
print(state)

[-0.5606734  0.       ]


In [None]:
next_state, reward, done, _ = env.step(np.array([0]))
#next_state, reward, done, _ = env.step([0])

In [None]:
print(next_state.shape)

(2,)


In [None]:
agent.add_elements_to_buffer(state, 0, reward, next_state, done)
state = next_state
print(state.shape)

(2,)


In [None]:
action = env.action_space.sample()[0]

In [None]:
agent.choose_action(state, True)

(array([0.85067993], dtype=float32), tensor([[-1.7696]], device='cuda:0'))