In [1]:
python_libraries = ["numpy","pandas","gym","seaborn","pyvirtualdisplay","imageio","nnfigs","box2d-py"]

packages = ["xvfb","x11-utils"]

with open('script.sh', 'w') as file:
  for library in python_libraries:
    file.write("pip install " + library +"\n")
  for package in packages:
    file.write("apt install " + package +"\n")

!bash script.sh

Reading package lists... Done
Building dependency tree       
Reading state information... Done
xvfb is already the newest version (2:1.19.6-1ubuntu4.10).
The following package was automatically installed and is no longer required:
  libnvidia-common-470
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 39 not upgraded.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
x11-utils is already the newest version (7.7+3build1).
The following package was automatically installed and is no longer required:
  libnvidia-common-470
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 39 not upgraded.


In [2]:
import gym
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import math
import numpy as np
import copy
import pandas as pd
import seaborn as sns
import time
import imageio
import IPython
from IPython.display import Image
import pyvirtualdisplay
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import deque, namedtuple
import torch.optim as optim
import random
import utils

# Create a non-visible 1400x900 virtual display and start it
# (presumably so that environments can be rendered on a machine without a screen).
_display = pyvirtualdisplay.Display(visible=False, size=(1400, 900))
# The return value of start() is bound to `_` so the cell does not echo it.
_ = _display.start()

In [3]:
# Code inspired by https://github.com/udacity/deep-reinforcement-learning/tree/master/ddpg-bipedal and https://github.com/Jonathan-Pearce/DDPG_PER

class Actor(torch.nn.Module):
    """Policy network: maps a state to an action with components in [-1, 1].

    Parameters
    ----------
    state_size: int
        Number of input features (dimension of a state vector).
    action_space: object with a `shape` attribute
        Action space of the environment; `action_space.shape[0]` is the
        number of outputs of the network.
    layer1_size: int
        Width of the first hidden layer.
    layer2_size: int
        Width of the second hidden layer.
    """

    def __init__(self, state_size, action_space, layer1_size=400, layer2_size=300):
        super(Actor, self).__init__()
        self.action_space = action_space
        self.layer1 = torch.nn.Linear(state_size, layer1_size)
        self.layer2 = torch.nn.Linear(layer1_size, layer2_size)
        self.layer3 = torch.nn.Linear(layer2_size, action_space.shape[0])

        # Initialization as in the DDPG paper: hidden layers are drawn from
        # U(-1/sqrt(f), 1/sqrt(f)) where f is the fan-in of the layer.
        # nn.Linear stores its weight as (out_features, in_features), so the
        # fan-in is `in_features` (weight.size()[0] would be the fan-out).
        bound1 = 1.0/np.sqrt(self.layer1.in_features)
        nn.init.uniform_(self.layer1.weight.data, -bound1, bound1)
        nn.init.uniform_(self.layer1.bias.data, -bound1, bound1)

        bound2 = 1.0/np.sqrt(self.layer2.in_features)
        nn.init.uniform_(self.layer2.weight.data, -bound2, bound2)
        nn.init.uniform_(self.layer2.bias.data, -bound2, bound2)

        # The output layer starts with small weights so initial actions are near zero.
        bound3 = 0.003
        nn.init.uniform_(self.layer3.weight.data, -bound3, bound3)
        nn.init.uniform_(self.layer3.bias.data, -bound3, bound3)

    def forward(self, x):
        """Return tanh-squashed actions for the state tensor `x`."""
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        x = self.layer3(x)
        return torch.tanh(x)


class Critic(torch.nn.Module):
    """Action-value network: maps a (state, action) pair to a single Q value.

    The state goes through the first layer alone; the action is concatenated
    to the first layer's output before the second layer.

    Parameters
    ----------
    state_size: int
        Number of state features.
    action_size: int
        Number of action components.
    layer1_size: int
        Width of the first hidden layer.
    layer2_size: int
        Width of the second hidden layer.
    """

    def __init__(self, state_size, action_size, layer1_size=400, layer2_size=300):
        super(Critic, self).__init__()
        self.layer1 = torch.nn.Linear(state_size, layer1_size)
        self.layer2 = torch.nn.Linear(layer1_size+action_size, layer2_size)
        self.layer3 = torch.nn.Linear(layer2_size, 1)

        # Initialization as in the DDPG paper: hidden layers are drawn from
        # U(-1/sqrt(f), 1/sqrt(f)) where f is the fan-in of the layer.
        # nn.Linear stores its weight as (out_features, in_features), so the
        # fan-in is `in_features` (weight.size()[0] would be the fan-out).
        bound1 = 1.0/np.sqrt(self.layer1.in_features)
        nn.init.uniform_(self.layer1.weight.data, -bound1, bound1)
        nn.init.uniform_(self.layer1.bias.data, -bound1, bound1)

        bound2 = 1.0/np.sqrt(self.layer2.in_features)
        nn.init.uniform_(self.layer2.weight.data, -bound2, bound2)
        nn.init.uniform_(self.layer2.bias.data, -bound2, bound2)

        # NOTE(review): the Actor's final layer uses 0.003; confirm that the
        # ten times smaller bound used here is intentional.
        bound3 = 0.0003
        nn.init.uniform_(self.layer3.weight.data, -bound3, bound3)
        nn.init.uniform_(self.layer3.bias.data, -bound3, bound3)

    def forward(self, x, a):
        """Return Q values of shape (batch, 1) for states `x` and actions `a`.

        Both inputs must be 2-D (batch first) because they are concatenated
        along dim 1.
        """
        layer1_out = F.relu(self.layer1(x))
        layer2_out = F.relu(self.layer2(torch.cat([layer1_out, a], dim=1)))
        return self.layer3(layer2_out)

In [4]:
# Hyperparameters shared by the agent and the replay buffer
buffer_size = 10 ** 6   # Maximum number of transitions kept in the replay buffer
batch_size = 256        # Number of transitions per learning minibatch
gamma = 0.99            # Discount factor applied to future rewards
tau = 0.005             # Interpolation factor of the soft target-network update
lr_actor = 0.00001      # Adam learning rate of the actor network
lr_critic = 0.0001      # Adam learning rate of the critic network
decay = 0.01            # Weight decay passed to the critic's Adam optimizer

# Run on the first CUDA device when one is available, otherwise on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

class Agent():
    """DDPG agent that interacts with and learns from the environment.

    Holds a local and a target copy of both the actor and the critic, an
    exploration-noise process, and a prioritized replay buffer.

    Parameters
    ----------
    state_size: int
        Dimension of a state vector.
    action_size: int
        Dimension of an action vector.
    action_space: object with a `shape` attribute
        Action space handed to the actor networks.
    random_seed: int
        Seed for Python's `random` module and for the noise process.
    """

    # Added to |TD error| so that every priority written back is strictly positive.
    PRIORITY_EPS = 1e-6

    def __init__(self, state_size, action_size, action_space, random_seed):
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed first and then remember the seed itself.
        random.seed(random_seed)
        self.seed = random_seed

        # Initialization of actor networks; the target starts as an exact copy
        # of the local network, as in the DDPG algorithm.
        self.actor_local = Actor(state_size, action_space).to(device)
        self.actor_target = Actor(state_size, action_space).to(device)
        self.actor_target.load_state_dict(self.actor_local.state_dict())
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

        # Initialization of critic networks (target again copied from local)
        self.critic_local = Critic(state_size, action_size).to(device)
        self.critic_target = Critic(state_size, action_size).to(device)
        self.critic_target.load_state_dict(self.critic_local.state_dict())
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic, weight_decay=decay)

        # Noise process for action taking
        self.noise = OUNoise(action_size, random_seed)

        # Prioritized Experience Replay
        self.memory = PrioritizedReplayBuffer(buffer_size, alpha=0.5)

    def step(self, state, action, reward, next_state, done):
        """Store one transition and, once enough are stored, run one learning iteration."""
        # Save experience in memory
        self.memory.add(state, action, reward, next_state, done)

        # If enough samples are available in memory, one iteration of the
        # learning process for both the actor and the critic
        if len(self.memory) > batch_size:
            experiences = self.memory.sample(batch_size, beta=0.5)
            self.learn(experiences, gamma)

    def act(self, state, add_noise=True):
        """Return the current policy's action for `state`, clipped to [-1, 1].

        `state` is a numpy array. When `add_noise` is true, a sample of the
        noise process is added before clipping (exploration during training).
        """
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        """Reset the exploration-noise process (called at the start of an episode)."""
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update critic, actor, replay priorities and target networks from one batch.

        `experiences` is the 7-tuple produced by the prioritized buffer's
        `sample`: (states, actions, rewards, next_states, dones, weights, idxes),
        where `weights` are the importance-sampling weights of the drawn
        transitions and `idxes` their positions in the buffer.
        """
        states, actions, rewards, next_states, dones, weights, idxes = experiences

        # Critic network update
        # The bootstrap target comes from the target networks and is a constant
        # with respect to the optimisation, so it is built without gradients.
        with torch.no_grad():
            actions_next = self.actor_target(next_states)
            Q_targets_next = self.critic_target(next_states, actions_next)
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.critic_local(states, actions)
        td_errors = Q_targets - Q_expected
        # Importance-sampling weights correct the bias of prioritized sampling.
        is_weights = torch.as_tensor(weights, dtype=Q_expected.dtype, device=Q_expected.device).reshape(-1, 1)
        critic_loss = (is_weights * td_errors.pow(2)).mean()
        # Minimize the loss and update the parameters
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Feed the new absolute TD errors back as priorities; without this the
        # buffer would keep every transition at its initial priority.
        new_priorities = td_errors.detach().abs().cpu().numpy().reshape(-1).astype(np.float64) + self.PRIORITY_EPS
        self.memory.update_priorities(idxes, new_priorities)

        # Actor network update
        # The actor is trained to maximise the critic's value of its own actions.
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss and update the parameters
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Target networks updates
        self.soft_update(self.critic_local, self.critic_target, tau)
        self.soft_update(self.actor_local, self.actor_target, tau)

    def soft_update(self, local_model, target_model, tau):
        """Move `target_model` towards `local_model`: target = tau*local + (1-tau)*target."""
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

class OUNoise:
    """Mean-reverting (Ornstein-Uhlenbeck style) noise added to actions.

    Parameters
    ----------
    size: int
        Number of noise components (one per action component).
    seed: int
        Seed of the noise generator; two processes built with the same seed
        produce the same sequence of samples.
    mu: float
        Value the noise reverts to.
    theta: float
        Strength of the pull towards `mu`.
    sigma: float
        Scale of the Gaussian perturbation added at every step.
    """

    def __init__(self, size, seed, mu=0., theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        # random.seed() returns None; seed the module and remember the seed itself.
        random.seed(seed)
        self.seed = seed
        # sample() draws from numpy, which seeding `random` does not affect.
        # A dedicated seeded generator makes the noise reproducible and
        # independent of other users of the global numpy state.
        self._rng = np.random.RandomState(seed)
        self.reset()

    def reset(self):
        """Set the internal state back to `mu`."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process by one step and return the new state."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * self._rng.randn(len(x))
        self.state = x + dx
        return self.state

class ReplayBuffer(object):
    """Fixed-capacity store of (obs, action, reward, next_obs, done) transitions.

    Once `size` transitions are stored, each new one overwrites the oldest.
    """

    def __init__(self, size):
        """Create an empty buffer holding at most `size` transitions."""
        self._storage = []
        self._maxsize = size
        self._next_idx = 0  # slot the next transition is written to

    def __len__(self):
        return len(self._storage)

    def add(self, obs_t, action, reward, obs_tp1, done):
        """Store one transition, overwriting the oldest one when the buffer is full."""
        data = (obs_t, action, reward, obs_tp1, done)
        if self._next_idx >= len(self._storage):
            self._storage.append(data)
        else:
            self._storage[self._next_idx] = data
        self._next_idx = (self._next_idx + 1) % self._maxsize

    def _encode_sample(self, idxes):
        """Gather the transitions at `idxes` into five numpy arrays.

        Returns (observations, actions, rewards, next_observations, dones),
        each with one entry per index, in the order of `idxes`.
        """
        obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], []
        for i in idxes:
            obs_t, action, reward, obs_tp1, done = self._storage[i]
            # np.asarray avoids a copy when the input is already an array and
            # copies otherwise; np.array(..., copy=False) raises on NumPy 2.x
            # whenever a copy is required (e.g. for list inputs).
            obses_t.append(np.asarray(obs_t))
            actions.append(np.asarray(action))
            rewards.append(reward)
            obses_tp1.append(np.asarray(obs_tp1))
            dones.append(done)
        return np.array(obses_t), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(dones)

    def sample(self, batch_size):
        """Return `batch_size` transitions drawn uniformly at random, with replacement."""
        idxes = [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)]
        return self._encode_sample(idxes)

class PrioritizedReplayBuffer(ReplayBuffer):
    """Replay buffer that samples transitions according to stored priorities.

    Priorities (raised to the power `alpha`) are kept in two segment trees
    from `utils`: one queried for sums, one queried for the minimum.
    """

    def __init__(self, size, alpha):
        """Create an empty prioritized buffer.

        Parameters
        ----------
        size: int
            Maximum number of stored transitions; older ones are overwritten
            once the buffer is full.
        alpha: float
            Exponent applied to every priority before it is stored
            (0 makes all stored priorities equal). Must be >= 0.
        """
        super(PrioritizedReplayBuffer, self).__init__(size)
        assert alpha >= 0
        self._alpha = alpha

        # The trees are sized to the smallest power of two that is >= size.
        tree_capacity = 1
        while tree_capacity < size:
            tree_capacity *= 2

        self._it_sum = utils.SumSegmentTree(tree_capacity)
        self._it_min = utils.MinSegmentTree(tree_capacity)
        self._max_priority = 1.0

    def add(self, *args, **kwargs):
        """Store a transition (same arguments as ReplayBuffer.add) at the current maximum priority."""
        slot = self._next_idx  # read before the parent advances the write position
        super().add(*args, **kwargs)
        initial_priority = self._max_priority ** self._alpha
        self._it_sum[slot] = initial_priority
        self._it_min[slot] = initial_priority

    def _sample_proportional(self, batch_size):
        """Draw `batch_size` indices, one from each of `batch_size` equal slices of the priority mass."""
        total_mass = self._it_sum.sum(0, len(self._storage) - 1)
        slice_len = total_mass / batch_size
        return [
            self._it_sum.find_prefixsum_idx(random.random() * slice_len + k * slice_len)
            for k in range(batch_size)
        ]

    @staticmethod
    def _as_float_tensor(column, as_uint8=False):
        """Stack the non-None entries of `column` vertically and move them to `device` as floats."""
        stacked = np.vstack([entry for entry in column if entry is not None])
        if as_uint8:
            stacked = stacked.astype(np.uint8)
        return torch.from_numpy(stacked).float().to(device)

    def sample(self, batch_size, beta):
        """Sample a batch of transitions together with importance weights.

        Parameters
        ----------
        batch_size: int
            Number of transitions to draw.
        beta: float
            Exponent of the importance-sampling correction. Must be > 0.

        Returns
        -------
        tuple
            (states, actions, rewards, next_states, dones, weights, idxes):
            the first five are float tensors on `device`, `weights` is a numpy
            array of importance weights divided by the largest possible
            weight, and `idxes` is the list of sampled buffer indices.
        """
        assert beta > 0

        idxes = self._sample_proportional(batch_size)

        # Weight of the least likely transition; used to normalise all weights.
        lowest_prob = self._it_min.min() / self._it_sum.sum()
        max_weight = (lowest_prob * len(self._storage)) ** (-beta)

        weights = np.array([
            ((self._it_sum[i] / self._it_sum.sum()) * len(self._storage)) ** (-beta) / max_weight
            for i in idxes
        ])

        obs_col, act_col, rew_col, next_col, done_col = self._encode_sample(idxes)
        states = self._as_float_tensor(obs_col)
        actions = self._as_float_tensor(act_col)
        rewards = self._as_float_tensor(rew_col)
        next_states = self._as_float_tensor(next_col)
        dones = self._as_float_tensor(done_col, as_uint8=True)

        return (states, actions, rewards, next_states, dones, weights, idxes)

    def update_priorities(self, idxes, priorities):
        """Overwrite the priorities of previously sampled transitions.

        Parameters
        ----------
        idxes: [int]
            Buffer indices of the transitions to update.
        priorities: [float]
            New priority for each index in `idxes`; every value must be > 0.
        """
        assert len(idxes) == len(priorities)
        for position, new_priority in zip(idxes, priorities):
            assert new_priority > 0
            assert 0 <= position < len(self._storage)
            scaled = new_priority ** self._alpha
            self._it_sum[position] = scaled
            self._it_min[position] = scaled

            # Remembered so that transitions added later start at the maximum priority.
            self._max_priority = max(self._max_priority, new_priority)

In [5]:
class RewardScaler(gym.RewardWrapper):
    """Reward wrapper that multiplies every reward of the wrapped environment by 10."""

    def reward(self, reward):
        return reward * 10


# Build and seed the raw environment first: the agent reads the observation
# and action dimensions from it before it is wrapped.
env = gym.make('BipedalWalker-v3')
env.seed(10)
agent = Agent(
    state_size=env.observation_space.shape[0],
    action_size=env.action_space.shape[0],
    action_space=env.action_space,
    random_seed=10,
)
# From here on, every reward returned by env.step() is the scaled one.
env = RewardScaler(env)




In [6]:
# Training loop
def ddpg(n_episodes=2000, max_t=700):
    """Train the global `agent` on the global `env` and return the episode scores.

    Parameters
    ----------
    n_episodes: int
        Number of episodes to run.
    max_t: int
        Maximum number of steps per episode.

    Returns
    -------
    list
        Sum of rewards of every episode, in order.

    Every 100 episodes the local actor and critic weights are written to
    'checkpoint_actor<episode>.pth' and 'checkpoint_critic<episode>.pth'.
    """
    progress_line = '\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}'
    summary_line = '\rEpisode {}\tAverage Score: {:.2f}'

    # Only the last 100 scores are kept here, for the running average.
    recent_scores = deque(maxlen=100)
    all_scores = []

    for episode_index in range(n_episodes):
        i_episode = episode_index + 1
        state = env.reset()
        agent.reset()

        # Act, observe and learn until the episode ends or the step budget is used up.
        score = 0
        for _ in range(max_t):
            action = agent.act(state)
            next_state, reward, done, _info = env.step(action)
            agent.step(state, action, reward, next_state, done)
            score += reward
            state = next_state
            if done:
                break

        recent_scores.append(score)
        all_scores.append(score)
        print(progress_line.format(i_episode, np.mean(recent_scores), score), end="")

        if i_episode % 100 != 0:
            continue
        # Periodic checkpoint plus a summary line that stays in the output.
        torch.save(agent.actor_local.state_dict(), 'checkpoint_actor'+str(i_episode)+'.pth')
        torch.save(agent.critic_local.state_dict(), 'checkpoint_critic'+str(i_episode)+'.pth')
        print(summary_line.format(i_episode, np.mean(recent_scores)))
    return all_scores

# Run the training and keep the per-episode scores on disk.
scores = ddpg()
np.save("scores.npy", np.array(scores))

# Score of every episode against its 1-based episode number.
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(np.arange(1, len(scores)+1), scores)
ax.set_ylabel('Reward')
ax.set_xlabel('Episode')
plt.show()

Episode 30	Average Score: -1128.28	Score: -1181.75

KeyboardInterrupt: ignored

In [None]:
# Install (or upgrade) colabgymrender; the next cell imports Recorder from it.
!pip install -U colabgymrender

In [None]:
# Visualization of the agent behaviour after training
from colabgymrender.recorder import Recorder

# Episode whose checkpoint is loaded. The training loop writes
# 'checkpoint_actor<episode>.pth' / 'checkpoint_critic<episode>.pth' every
# 100 episodes, so this must be a multiple of 100 that training reached
# (2000 is the last one with the default n_episodes).
CHECKPOINT_EPISODE = 2000

env = gym.make('BipedalWalker-v3')
env = Recorder(env, '.')
env.seed(10)
agent = Agent(state_size=env.observation_space.shape[0], action_size=env.action_space.shape[0], action_space=env.action_space, random_seed=10)

# map_location lets weights saved on a GPU be loaded on a CPU-only machine.
agent.actor_local.load_state_dict(torch.load('checkpoint_actor'+str(CHECKPOINT_EPISODE)+'.pth', map_location=device))
agent.critic_local.load_state_dict(torch.load('checkpoint_critic'+str(CHECKPOINT_EPISODE)+'.pth', map_location=device))

state = env.reset()
agent.reset()
while True:
    # No exploration noise: the recording should show the learned policy itself.
    action = agent.act(state, add_noise=False)
    next_state, reward, done, _ = env.step(action)
    state = next_state
    if done:
        break

env.play()