In [1]:
import numpy as np
import copy
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
from torch.distributions import Normal

import gym

from collections import deque
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (16, 10)

In [2]:
# Class that enables to modulate the number of hidden layers
class LinearLayer(nn.Module):
    def __init__(self, in_features, out_features):
        super().__init__()
        self.linear_layer = nn.Sequential(nn.Linear(in_features, out_features), nn.ReLU())
    def forward(self, x):
        x = self.linear_layer(x)
        return x

In [3]:
class ActorStochastic(nn.Module):
    def __init__(self, s_dim, a_dim, action_boundaries, hidden_dim=256, num_hidden_layers=2, device="cpu"):
        super(ActorStochastic, self).__init__()

        self.s_dim = s_dim
        self.a_dim = a_dim

        layers = [LinearLayer(s_dim, hidden_dim)]
        for _ in range(num_hidden_layers):
            layers.append(LinearLayer(hidden_dim, hidden_dim))

        self.f = nn.Sequential(*layers)
        self.mean = nn.Linear(hidden_dim, a_dim)
        self.log_std = nn.Linear(hidden_dim, a_dim)

        self.action_scale = torch.FloatTensor((action_boundaries.high - action_boundaries.low) / 2.0).to(device)
        self.action_bias = torch.FloatTensor((action_boundaries.high + action_boundaries.low) / 2.0).to(device)

    def forward(self, state):
        if not isinstance(state, torch.Tensor):
            state = torch.tensor(state, dtype=torch.float32)

        out = self.f(state)
        mean = self.mean(out)
        log_std = self.log_std(out)
        # Clamping the log_std because it is told to be more stable
        log_std = torch.clamp(log_std, -20, 2)
        
        return mean, log_std

    def sampling(self, state):
        mean, log_std = self.forward(state)
        std = log_std.exp()
        dist = Normal(mean, std)
        x_t = dist.rsample() #reparametrization trick implemented by pytorch
        squashed = torch.tanh(x_t) #Bounds the action / squashing Gaussian

        log_prob = dist.log_prob(x_t)-(2*(np.log(2)-x_t-F.softplus(-2*x_t))).sum(dim=1, keepdim=True)
        action = self.action_scale * squashed + self.action_bias
        
        return action, log_prob


In [4]:
class QDNN(nn.Module):
    def __init__(self, s_dim, a_dim, hidden_dim=256, num_hidden_layers=2):
        super(QDNN, self).__init__()

        self.s_dim = s_dim
        self.a_dim = a_dim

        layers = [LinearLayer(s_dim + a_dim, hidden_dim)]
        for _ in range(num_hidden_layers):
            layers.append(LinearLayer(hidden_dim, hidden_dim))
        layers.append(nn.Linear(hidden_dim, 1))

        self.f = nn.Sequential(*layers)

    def forward(self, state, action):
        x = torch.cat([state, action], dim=1)
        out = self.f(x)
        return out

In [5]:
class Critic(nn.Module):
    def __init__(self, s_dim, a_dim, hidden_dim=256, num_hidden_layers=2):
        super(Critic, self).__init__()

        self.s_dim = s_dim
        self.a_dim = a_dim

        # Two DNNs to mitigate positive bias
        self.Q1 = QDNN(s_dim, a_dim, hidden_dim, num_hidden_layers)
        self.Q2 = QDNN(s_dim, a_dim, hidden_dim, num_hidden_layers)

    def forward(self, state, action):
        q1 = self.Q1(state, action)
        q2 = self.Q2(state, action)
        return q1, q2

In [6]:
class SAC():
    def __init__(self, s_dim, a_dim, action_boundaries,hidden_dim_actor=256, hidden_dim_critic=256, 
                 num_layer_actor=2, num_layer_critic=2, lr_act=3e-4, lr_crit=3e-4, 
                 gamma=0.99, tau=0.005, alpha=0.2, lambd=0.005, target_upd_inter=1, 
                 buffer_capacity=int(1000), batch_size=32, grad_steps = 1, device="cpu"):
        
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.device = device
        self.batch_size = batch_size
        self.buffer = deque(maxlen=buffer_capacity)
        self.grad_steps = grad_steps

        self.alpha = alpha
        self.lambd = lambd
        self.gamma = gamma
        self.tau = tau
        self.target_upd_inter = target_upd_inter

        self.actor = ActorStochastic(s_dim, a_dim, action_boundaries, hidden_dim_actor, num_hidden_layers=num_layer_actor, device=device).to(device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=lr_act)

        self.critic = Critic(s_dim, a_dim, hidden_dim_critic, num_hidden_layers=num_layer_critic).to(device)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=lr_crit)
        self.critic_target = copy.deepcopy(self.critic).to(device)

        for param in self.critic_target.parameters():
            param.requires_grad = False

    def choose_action(self, state):
            # Choosing action to give to the environnement and not train the model
        with torch.no_grad():
            state = torch.tensor(state, dtype=torch.float32, device=self.device).unsqueeze(0)
            action, _ = self.actor.sampling(state)

        return action.cpu().detach().numpy()[0]
    
    def critic_train(self, states, actions, rewards, next_states, dones):
        
        with torch.no_grad():
            next_actions, next_log_probs = self.actor.sampling(next_states)
            q1_next, q2_next = self.critic_target(next_states, next_actions)
            min_q_next = torch.min(q1_next, q2_next)
            target_q_value = rewards + self.gamma *(torch.ones_like(dones)-dones).unsqueeze(1)*(min_q_next - self.alpha * next_log_probs)
            
        
        #print(target_q_value.shape)
        q1, q2 = self.critic(states, actions)
        #print(q1.shape, q2.shape)    
        critic_loss = F.mse_loss(q1, target_q_value) + F.mse_loss(q2, target_q_value)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        return critic_loss

    
    def actor_train(self, states):
        actions, log_probs = self.actor.sampling(states)
        q1_actor, q2_actor = self.critic(states, actions)
        min_q_actor = torch.min(q1_actor, q2_actor)
            
        actor_loss = (self.alpha * log_probs - min_q_actor).mean(dim=0)

        #print(actor_loss)
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        return actor_loss

    def train(self, update_interval):
        if len(self.buffer) < self.batch_size:
            return None, None

        for i in range(self.grad_steps):
            
            states, actions, rewards, next_states, dones = self.sample_batch()
            
            # Critic train
            self.critic.train()
            critic_loss= self.critic_train(states, actions, rewards, next_states, dones)

            # Actor train
            self.actor.train()
            actor_loss = self.actor_train(states)

            # Soft update of target networks
            if update_interval % self.target_upd_inter == 0:
                for target_parameters, parameters in zip(self.critic_target.parameters(), self.critic.parameters()):
                    target_parameters.data.copy_(self.tau * parameters.data + (1.0 - self.tau) * target_parameters.data)

        return critic_loss.item(), actor_loss.item()

    def add_elements_to_buffer(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample_batch(self):
        batch = random.sample(self.buffer, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        states = torch.tensor(states, dtype=torch.float32).to(self.device)
        actions = torch.tensor(actions, dtype=torch.float32).to(self.device)
        rewards = torch.tensor(rewards, dtype=torch.float32).unsqueeze(1).to(self.device)
        next_states = torch.tensor(next_states, dtype=torch.float32).to(self.device)
        dones = torch.tensor(dones, dtype=torch.float32).to(self.device)

        return states, actions, rewards, next_states, dones

## Training

In [7]:
import pandas as pd
def train_SAC(agent, env, n_games, training_interval=1, update_interval=1, print_num_episodes=10):
    
    best_score = env.reward_range[0]
    score_history = []

    avg_score = 0
    n_steps = 0
    warmup = 100

    for i in range(n_games):
        state = env.reset()
        done = False
        score = 0
        for _ in range(env.spec.max_episode_steps):
            action = agent.choose_action(state)
            next_state, reward, done, _ = env.step(action)
            score += reward
            
            agent.add_elements_to_buffer(state, action, reward, next_state, done)
            state = next_state.squeeze()
            update_interval += 1
            n_steps += 1
            if n_steps % training_interval == 0:
                critic_loss, actor_loss = agent.train(update_interval)
            if done:
                break

        score_history.append(score)
        avg_score = np.mean(score_history[-10:])
        if i % print_num_episodes == 0:
            print('episode', i, 'score %.1f' % score, 'avg score %.1f' % avg_score)
    
    env.close()

    return score_history

def plot_scores(mean_scores, std_scores):
    # Create the plot
    fig = plt.figure(figsize=(20, 6))
    ax = fig.add_subplot(111)

    episodes = np.arange(1, len(mean_scores) + 1)
    ax.plot(episodes, mean_scores, color='green', label='Mean Score')
    ax.fill_between(episodes, mean_scores - std_scores, mean_scores + std_scores, color='green', alpha=0.3, label='Std Dev')

    ax.set_ylabel('Total reward (= time balanced)', fontsize=20)
    ax.set_xlabel('Episode #', fontsize=20)
    ax.tick_params(axis='both', which='major', labelsize=15)
    ax.legend(fontsize=20)

    plt.show()

def run_experiments(env, agent_class, agent_kwargs, n_games=300, n_runs=3, training_interval=1, update_interval=1):
    all_scores = []
    
    for run in range(n_runs):
        # Reinitialize the agent for each run
        agent = agent_class(**agent_kwargs)
        env.seed(25)
        score_history = train_SAC(agent, env, n_games, training_interval=training_interval, update_interval=update_interval, print_num_episodes=10)
        all_scores.append(score_history)
    
    all_scores = np.array(all_scores)
    mean_scores = np.mean(all_scores, axis=0)
    std_scores = np.std(all_scores, axis=0)
    
    return mean_scores, std_scores

## Pendulum-v1

In [14]:
env = gym.make('Pendulum-v1')
env.seed(25)

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
agent_kwargs = {
    's_dim': state_dim,
    'a_dim': action_dim,
    'action_boundaries':env.action_space,
    'buffer_capacity': int(1e6),
    'hidden_dim_actor': 64,
    'hidden_dim_critic': 64,
    'num_layer_actor': 3,
    'num_layer_critic': 3,
    'lr_act': 3e-3,
    'lr_crit': 3e-3,
    'alpha': 0.2,
    'tau': 0.05,
    'batch_size': 128,
    'grad_steps': 2,
    'device': device
}

mean_scores, std_scores = run_experiments(env, SAC, agent_kwargs, n_games=300, n_runs=3, training_interval=3, update_interval=1)

# Save the results to a CSV file
df = pd.DataFrame({'Episode': np.arange(1, len(mean_scores) + 1), 'Mean Score': mean_scores, 'Std Dev': std_scores})
df.to_csv('sac_pendulum_good.csv', index=False)

# Plot the learning progress
plot_scores(mean_scores, std_scores)
env.close()

episode 0 score -1416.5 avg score -1416.5
episode 10 score -1394.9 avg score -1236.0
episode 20 score -379.2 avg score -297.3
episode 30 score -244.0 avg score -158.4
episode 40 score -128.9 avg score -217.3
episode 50 score -234.3 avg score -265.3
episode 60 score -238.0 avg score -142.9


## Mountain car continuous

In [18]:
env = gym.make('MountainCarContinuous-v0')
env.seed(25)

s_dim = env.observation_space.shape[0]
a_dim = env.action_space.shape[0]
agent = SAC(s_dim, a_dim, env.action_space,  buffer_capacity=int(1e6), hidden_dim_actor = 64, hidden_dim_critic=64, 
            num_layer_actor=4, num_layer_critic=4, lr_act=4e-4, lr_crit=4e-4, alpha=0.2, tau=0.005, grad_steps=1, batch_size=64, device="cuda")

mean_scores, std_scores = run_experiments(env, agent, n_games=300, n_runs=3, training_interval=10)

# Save the results to a CSV file
df = pd.DataFrame({'Episode': np.arange(1, len(mean_scores) + 1), 'Mean Score': mean_scores, 'Std Dev': std_scores})
df.to_csv('sac_mountain.csv', index=False)

# Plot the learning progress
plot_scores(mean_scores, std_scores)
env.close()

  deprecation(
  deprecation(


AttributeError: 'ActorStochastic' object has no attribute 'device'