In [1]:
python_libraries = ["numpy","pandas","gym","seaborn","pyvirtualdisplay","imageio","nnfigs","box2d-py"]

packages = ["xvfb","x11-utils"]

with open('script.sh', 'w') as file:
  for library in python_libraries:
    file.write("pip install " + library +"\n")
  for package in packages:
    file.write("apt install " + package +"\n")

!bash script.sh

Collecting pyvirtualdisplay
  Downloading PyVirtualDisplay-3.0-py3-none-any.whl (15 kB)
Installing collected packages: pyvirtualdisplay
Successfully installed pyvirtualdisplay-3.0
Collecting nnfigs
  Downloading nnfigs-0.1.dev0.tar.gz (15 kB)
Building wheels for collected packages: nnfigs
  Building wheel for nnfigs (setup.py) ... [?25l[?25hdone
  Created wheel for nnfigs: filename=nnfigs-0.1.dev0-py3-none-any.whl size=6264 sha256=f35ef5b5e73e41119ceb48ab6dc62a94bb8167c23f11dd570078c01ca5a44ed9
  Stored in directory: /root/.cache/pip/wheels/3e/1d/26/caa4891e0d41c00294c8b65728417233d4d8a414564eeba202
Successfully built nnfigs
Installing collected packages: nnfigs
Successfully installed nnfigs-0.1.dev0
Collecting box2d-py
  Downloading box2d_py-2.3.8-cp37-cp37m-manylinux1_x86_64.whl (448 kB)
[K     |████████████████████████████████| 448 kB 5.4 MB/s 
[?25hInstalling collected packages: box2d-py
Successfully installed box2d-py-2.3.8
Reading package lists... Done
Building dependency tre

In [2]:
import gym
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import math
import numpy as np
import copy
import pandas as pd
import seaborn as sns
import time
import imageio
import IPython
from IPython.display import Image
import pyvirtualdisplay
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import deque, namedtuple
import torch.optim as optim
import random

# Start an invisible 1400x900 virtual display (pyvirtualdisplay); presumably needed so that
# environment rendering works on a machine without a physical screen.
_display = pyvirtualdisplay.Display(visible=False, size=(1400, 900))
_ = _display.start()

In [3]:
# Code inspired by https://github.com/udacity/deep-reinforcement-learning/tree/master/ddpg-bipedal

class Actor(torch.nn.Module):
    """Policy network: maps a state to an action whose components lie in [-1, 1].

    Three fully connected layers (ReLU, ReLU, tanh).

    Args:
        state_size: number of input features of a state.
        action_space: object whose ``shape[0]`` gives the number of action components.
        layer1_size: width of the first hidden layer.
        layer2_size: width of the second hidden layer.
    """

    def __init__(self, state_size, action_space, layer1_size=400, layer2_size=300):
        super(Actor, self).__init__()
        self.action_space = action_space
        self.layer1 = torch.nn.Linear(state_size, layer1_size)
        self.layer2 = torch.nn.Linear(layer1_size, layer2_size)
        self.layer3 = torch.nn.Linear(layer2_size, action_space.shape[0])

        # Initialization methods are taken as in the DDPG paper: hidden layers are drawn
        # uniformly from [-1/sqrt(f), 1/sqrt(f)] where f is the fan-in of the layer.
        # nn.Linear stores its weight as (out_features, in_features), so the fan-in is
        # in_features (weight.size()[0] would be the fan-out).
        for hidden in (self.layer1, self.layer2):
            bound = 1.0/np.sqrt(hidden.in_features)
            nn.init.uniform_(hidden.weight.data, -bound, bound)
            nn.init.uniform_(hidden.bias.data, -bound, bound)

        # Small fixed range for the output layer so initial actions start close to zero.
        bound3 = 0.003
        nn.init.uniform_(self.layer3.weight.data, -bound3, bound3)
        nn.init.uniform_(self.layer3.bias.data, -bound3, bound3)

    def forward(self, x):
        """Return the action for state tensor ``x``; tanh bounds each component to [-1, 1]."""
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        x = self.layer3(x)
        return torch.tanh(x)


class Critic(torch.nn.Module):
    """Q-value network: maps a (state, action) pair to a single scalar value.

    The state goes through the first layer alone; the action is concatenated to that
    layer's output before the second layer.

    Args:
        state_size: number of input features of a state.
        action_size: number of action components.
        layer1_size: width of the first hidden layer.
        layer2_size: width of the second hidden layer.
    """

    def __init__(self, state_size, action_size, layer1_size=400, layer2_size=300):
        super(Critic, self).__init__()
        self.layer1 = torch.nn.Linear(state_size, layer1_size)
        self.layer2 = torch.nn.Linear(layer1_size+action_size, layer2_size)
        self.layer3 = torch.nn.Linear(layer2_size, 1)

        # Initialization methods are taken as in the DDPG paper: hidden layers are drawn
        # uniformly from [-1/sqrt(f), 1/sqrt(f)] where f is the fan-in of the layer.
        # nn.Linear stores its weight as (out_features, in_features), so the fan-in is
        # in_features (weight.size()[0] would be the fan-out).
        for hidden in (self.layer1, self.layer2):
            bound = 1.0/np.sqrt(hidden.in_features)
            nn.init.uniform_(hidden.weight.data, -bound, bound)
            nn.init.uniform_(hidden.bias.data, -bound, bound)

        # Small fixed range for the output layer so initial value estimates start close to zero.
        bound3 = 0.0003
        nn.init.uniform_(self.layer3.weight.data, -bound3, bound3)
        nn.init.uniform_(self.layer3.bias.data, -bound3, bound3)


    def forward(self, x, a):
        """Return the value of taking actions ``a`` in states ``x``.

        ``x`` and ``a`` are concatenated along dim 1, so both must be 2-D batches
        with the same number of rows.
        """
        layer1_out = F.relu(self.layer1(x))
        layer2_out = F.relu(self.layer2(torch.cat([layer1_out, a], dim=1)))
        return self.layer3(layer2_out)

In [4]:
# Hyperparameters
buffer_size = 10 ** 6   # Replay buffer capacity, in transitions
batch_size = 256        # Number of transitions per training minibatch
gamma = 0.99            # Discount factor applied to the bootstrapped next-state value
tau = 0.005             # Interpolation factor of the soft target-network updates
lr_actor = 0.00001      # Learning rate of the actor's Adam optimizer
lr_critic = 0.0001      # Learning rate of the critic's Adam optimizer
decay = 0.01            # Weight decay passed to the critic's Adam optimizer

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

class Agent():
    """DDPG agent.

    Holds a local and a target copy of both the actor and the critic, their Adam
    optimizers, the exploration-noise process and the replay buffer. Reads the
    notebook-level hyperparameters (batch_size, gamma, tau, lr_actor, lr_critic,
    decay, buffer_size) and ``device``.
    """

    def __init__(self, state_size, action_size, action_space, random_seed):
        """Build the networks, optimizers, noise process and replay buffer.

        Args:
            state_size: number of features of a state.
            action_size: number of action components.
            action_space: forwarded to ``Actor`` (its ``shape[0]`` sizes the output layer).
            random_seed: seed for Python's ``random`` module; also forwarded to the
                noise process and the replay buffer.
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed first and keep the seed value itself.
        random.seed(random_seed)
        self.seed = random_seed

        # Initialization of actor networks
        self.actor_local = Actor(state_size, action_space).to(device)
        self.actor_target = Actor(state_size, action_space).to(device)
        # DDPG starts each target network as an exact copy of its local network; two
        # independently initialised networks would make the first targets arbitrary.
        self.actor_target.load_state_dict(self.actor_local.state_dict())
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

        # Initialization of critic networks
        self.critic_local = Critic(state_size, action_size).to(device)
        self.critic_target = Critic(state_size, action_size).to(device)
        self.critic_target.load_state_dict(self.critic_local.state_dict())
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic, weight_decay=decay)

        # Noise process for action taking
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory buffer
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, random_seed)

    def step(self, state, action, reward, next_state, done):
        """Store one transition and, once the buffer holds more than batch_size
        transitions, run one learning iteration on a sampled minibatch."""
        # Save experience in memory
        self.memory.add(state, action, reward, next_state, done)

        # If enough samples are available in memory, one iteration of the learning process for both the actor and the critic
        if len(self.memory) > batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, gamma)

    def act(self, state, add_noise=True):
        """Return the local actor's action for ``state`` (a NumPy array), clipped to [-1, 1].

        When ``add_noise`` is true, a sample of the noise process is added before
        clipping (exploration); pass False to get the policy's own action.
        """
        state = torch.from_numpy(state).float().to(device)
        # eval()/no_grad(): plain inference, no graph is built for action selection.
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        """Reset the exploration-noise process."""
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update critic, actor and both target networks from one minibatch.

        Args:
            experiences: tuple (states, actions, rewards, next_states, dones) of tensors.
            gamma: discount factor.
        """
        states, actions, rewards, next_states, dones = experiences

        # Critic network update
        # The TD targets are constants for the regression: compute them without autograd
        # so no graph is kept and no gradients pile up on the target networks.
        with torch.no_grad():
            actions_next = self.actor_target(next_states)
            Q_targets_next = self.critic_target(next_states, actions_next)
            # (1 - dones) removes the bootstrap term for terminal transitions.
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss and update the parameters
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Actor network update
        # The actor is trained to maximise the critic's value of its own actions.
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss and update the parameters
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Target networks updates
        self.soft_update(self.critic_local, self.critic_target, tau)
        self.soft_update(self.actor_local, self.actor_target, tau)

    def soft_update(self, local_model, target_model, tau):
        """Move every target parameter towards its local counterpart:
        target <- tau * local + (1 - tau) * target."""
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

class OUNoise:
    """Ornstein-Uhlenbeck style noise process used to perturb actions.

    Each sample moves the internal state towards ``mu`` by a fraction ``theta`` and
    adds Gaussian noise of scale ``sigma``, so consecutive samples are correlated.
    """

    def __init__(self, size, seed, mu=0., theta=0.15, sigma=0.2):
        """Initialize the parameters of the noise.

        Args:
            size: number of noise components.
            seed: seed of the process; two instances built with the same seed
                produce the same sequence of samples.
            mu: value the state reverts to (and is reset to).
            theta: fraction of the distance to ``mu`` recovered at each sample.
            sigma: scale of the Gaussian term.
        """
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        # The global `random` module is still seeded as before (random.seed() returns
        # None, so the seed value itself is what gets stored).
        random.seed(seed)
        self.seed = seed
        # The Gaussian term comes from NumPy, which random.seed() does not affect:
        # use a private, seeded generator so the noise is reproducible.
        self.rng = np.random.RandomState(seed)
        self.reset()

    def reset(self):
        """Set the internal state back to ``mu``."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process by one step and return the new internal state."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * self.rng.randn(len(x))
        self.state = x + dx
        return self.state

class ReplayBuffer:
    """Bounded FIFO store of transitions with uniform random minibatch sampling."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        """Create an empty buffer.

        buffer_size caps the number of stored transitions (oldest are dropped first),
        batch_size is the number of transitions returned by sample(), and seed is
        used to seed Python's global ``random`` module.
        """
        self.action_size = action_size
        self.batch_size = batch_size
        self.memory = deque(maxlen=buffer_size)
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        self.seed = random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Draw batch_size transitions uniformly at random.

        Returns a tuple (states, actions, rewards, next_states, dones) of float
        tensors on ``device``, each with one row per sampled transition.
        """
        drawn = [e for e in random.sample(self.memory, k=self.batch_size) if e is not None]

        def stack(field, cast=None):
            # Stack one field of every drawn transition row-wise and move it to `device`.
            rows = np.vstack([getattr(e, field) for e in drawn])
            if cast is not None:
                rows = rows.astype(cast)
            return torch.from_numpy(rows).float().to(device)

        return (
            stack("state"),
            stack("action"),
            stack("reward"),
            stack("next_state"),
            stack("done", np.uint8),  # booleans become 0/1 before the float conversion
        )

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

In [5]:
class RewardScaler(gym.RewardWrapper):
    """Reward wrapper that multiplies every reward given by the environment by 10."""

    def reward(self, reward):
        return reward * 10


# Build and seed the raw environment, size the agent from its spaces, then wrap the
# environment so that training sees the rescaled rewards.
env = gym.make('BipedalWalker-v3')
env.seed(10)
agent = Agent(
    state_size=env.observation_space.shape[0],
    action_size=env.action_space.shape[0],
    action_space=env.action_space,
    random_seed=10,
)
env = RewardScaler(env)




In [6]:
# Training loop
def ddpg(n_episodes=2000, max_t=700):
    """Train the notebook-level ``agent`` on the notebook-level ``env``.

    Args:
        n_episodes: number of episodes to run.
        max_t: maximum number of time steps per episode.

    Returns:
        List with the total reward of every episode, in order.

    Every 100 episodes the local actor and critic weights are saved both to numbered
    files (checkpoint_actor<N>.pth / checkpoint_critic<N>.pth) and to the un-numbered
    checkpoint_actor.pth / checkpoint_critic.pth, which always hold the latest save.
    """
    # Initialization of a queue to easily compute running means of rewards
    scores_deque = deque(maxlen=100)
    scores = []
    for i_episode in range(1, n_episodes+1):
        state = env.reset()
        agent.reset()
        score = 0
        # For each time step, take an action, get the reward and learn
        for t in range(max_t):
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_deque.append(score)
        scores.append(score)
        # '\r' with end="" rewrites the same output line for every episode.
        print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}'.format(i_episode, np.mean(scores_deque), score), end="")
        if i_episode % 100 == 0:
            actor_weights = agent.actor_local.state_dict()
            critic_weights = agent.critic_local.state_dict()
            torch.save(actor_weights, 'checkpoint_actor'+str(i_episode)+'.pth')
            torch.save(critic_weights, 'checkpoint_critic'+str(i_episode)+'.pth')
            # The visualisation cell loads the un-numbered file names; without these two
            # saves those files never exist and loading fails.
            torch.save(actor_weights, 'checkpoint_actor.pth')
            torch.save(critic_weights, 'checkpoint_critic.pth')
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
    return scores

# Run the training, store the per-episode scores on disk, then plot them.
scores = ddpg()
np.save("scores.npy", np.array(scores))

# Score of each episode against its 1-based episode number.
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(np.arange(1, len(scores)+1), scores)
ax.set_ylabel('Reward')
ax.set_xlabel('Episode')
plt.show()

Episode 18	Average Score: -888.45	Score: -684.56

KeyboardInterrupt: ignored

In [None]:
# Install (or upgrade) colabgymrender, which provides the Recorder imported in the next cell.
!pip install -U colabgymrender

In [None]:
# Visualization of the agent behaviour after training
from colabgymrender.recorder import Recorder
# A fresh environment wrapped in a Recorder writing to the current directory.
env = gym.make('BipedalWalker-v3')
env = Recorder(env, '.')
env.seed(10)
agent = Agent(state_size=env.observation_space.shape[0], action_size=env.action_space.shape[0], action_space=env.action_space, random_seed=10)

# Expects checkpoint_actor.pth and checkpoint_critic.pth in the working directory.
# map_location remaps the stored tensors onto the current device, so weights saved on
# a GPU runtime can also be loaded on a CPU-only one.
agent.actor_local.load_state_dict(torch.load('checkpoint_actor.pth', map_location=device))
agent.critic_local.load_state_dict(torch.load('checkpoint_critic.pth', map_location=device))

state = env.reset()
agent.reset()
# Run a single episode until the environment reports termination.
while True:
    # Show the learned policy itself: exploration noise is only meant for training.
    action = agent.act(state, add_noise=False)
    next_state, reward, done, _ = env.step(action)
    state = next_state
    if done:
        break

env.play()