In [16]:
# References: https://towardsdatascience.com/td3-learning-to-run-with-ai-40dfc512f93 [1]
# References: https://github.com/honghaow/FORK [2]

%%capture
!apt update
!pip install 'gym[box2d]'
!apt install xvfb -y
!pip install pyvirtualdisplay

import gym
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
import sys
from pyvirtualdisplay import Display
from IPython import display as disp
%matplotlib inline

# --- run configuration (tunable) ---
seed = 42  # for result replication
max_episodes = 8000  # train longer for hardcore environment
plot_interval = 10  # update the plot every N episodes
video_every = 20  # videos can take a very long time to render so only do it every N episodes

# start a virtual (invisible) display before the environment is created
display = Display(visible=0, size=(600, 600))
display.start()

# use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

env = gym.make("BipedalWalker-v3")
# env = gym.make("BipedalWalkerHardcore-v3")

In [17]:
!pip install ufal.pybox2d

  and should_run_async(code)




In [18]:
# Actor network architecture
class Actor(nn.Module):
    """Policy network: maps a batch of states to a batch of bounded actions.

    The state passes through two hidden ReLU layers (400 and 300 units) and a
    tanh output layer; the tanh output is multiplied by ``max_action``.
    """

    def __init__(self, state_dim, action_dim, max_action):
        """
        Args:
            state_dim (int): number of input features per state
            action_dim (int): number of output features per action
            max_action (float): factor applied to the tanh output
        """
        super().__init__()
        self.max_action = max_action

        # layers are created in input-to-output order and stored under `net`
        stack = [
            nn.Linear(state_dim, 400),
            nn.ReLU(),
            nn.Linear(400, 300),
            nn.ReLU(),
            nn.Linear(300, action_dim),
            nn.Tanh(),
        ]
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Return the action for state batch ``x``, scaled by ``max_action``."""
        squashed = self.net(x)
        return self.max_action * squashed

In [19]:
# Critic network architectures
class Critic(nn.Module):
    """Pair of independent Q-networks evaluated on the same (state, action) input.

    Each network takes the state and action concatenated along dimension 1 and
    produces a single value per row through two hidden ReLU layers (400, 300).
    """

    def __init__(self, state_dim, action_dim):
        """
        Args:
            state_dim (int): number of features per state
            action_dim (int): number of features per action
        """
        super().__init__()
        joint_dim = state_dim + action_dim

        # Q1 is built first, then Q2
        self.q1 = self._make_head(joint_dim)
        self.q2 = self._make_head(joint_dim)

    @staticmethod
    def _make_head(in_features):
        """Build one Q-network: in_features -> 400 -> 300 -> 1."""
        return nn.Sequential(
            nn.Linear(in_features, 400),
            nn.ReLU(),
            nn.Linear(400, 300),
            nn.ReLU(),
            nn.Linear(300, 1),
        )

    def forward(self, x, u):
        """Return the outputs of both Q-networks for states ``x`` and actions ``u``."""
        joint = torch.cat([x, u], 1)
        return self.q1(joint), self.q2(joint)

    def Q1(self, x, u):
        """Return only the first Q-network's output for states ``x`` and actions ``u``."""
        joint = torch.cat([x, u], 1)
        return self.q1(joint)

In [20]:
# Forward-Looking network architecture
class FORK(nn.Module):
    """Forward model: maps a (state, action) pair to a vector of size ``state_dim``.

    The Agent trains this network to regress the observed next state.
    """

    def __init__(self, state_dim, action_dim):
        """
        Args:
            state_dim (int): number of features per state (also the output size)
            action_dim (int): number of features per action
        """
        super().__init__()

        joint_dim = state_dim + action_dim
        stack = [
            nn.Linear(joint_dim, 400),
            nn.ReLU(),
            nn.Linear(400, 300),
            nn.ReLU(),
            nn.Linear(300, state_dim),
        ]
        self.net = nn.Sequential(*stack)

    def forward(self, x, u):
        """Return the network output for states ``x`` and actions ``u`` (joined on dim 1)."""
        joint = torch.cat([x, u], 1)
        return self.net(joint)

In [21]:
# Original code: https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py

# Expects tuples of (state, action, next_state, reward, done)
class ReplayBuffer(object):
    """Fixed-capacity buffer of experience tuples with uniform random sampling.

    Each stored item is a tuple ``(state, action, next_state, reward, done)``.
    Once ``max_size`` items are stored, new items overwrite existing ones in
    insertion order.
    """

    def __init__(self, max_size=1000000):
        """
        Args:
            max_size (int): total amount of tuples to store
        """

        self.storage = []
        self.max_size = max_size
        self.ptr = 0  # index of the next slot to overwrite once the buffer is full

    def add(self, data):
        """Add an experience tuple to the buffer.

        Args:
            data (tuple): experience tuple (state, action, next_state, reward, done)
        """

        if len(self.storage) == self.max_size:
            # buffer is full: overwrite the slot at ptr and advance circularly
            self.storage[int(self.ptr)] = data
            self.ptr = (self.ptr + 1) % self.max_size
        else:
            self.storage.append(data)

    def sample(self, batch_size):
        """Sample ``batch_size`` experiences uniformly at random, with replacement.

        Args:
            batch_size (int): size of sample

        Returns:
            tuple of np.ndarray: (states, actions, next_states, rewards, dones);
            rewards and dones are reshaped to a single column (``(-1, 1)``).

        Raises:
            ValueError: raised by ``np.random.randint`` when the buffer is empty.
        """

        ind = np.random.randint(0, len(self.storage), size=batch_size)
        states, actions, next_states, rewards, dones = [], [], [], [], []

        for i in ind:
            s, a, s_, r, d = self.storage[i]
            # np.asarray avoids a copy when possible but, unlike
            # np.array(..., copy=False) on NumPy >= 2.0, does not raise when a
            # copy is required (e.g. for Python scalar rewards / done flags).
            states.append(np.asarray(s))
            actions.append(np.asarray(a))
            next_states.append(np.asarray(s_))
            rewards.append(np.asarray(r))
            dones.append(np.asarray(d))

        return (
            np.array(states),
            np.array(actions),
            np.array(next_states),
            np.array(rewards).reshape(-1, 1),
            np.array(dones).reshape(-1, 1),
        )

In [22]:
class Agent(object):
    """TD3-style agent (actor, twin critic, target networks) with a FORK forward model.

    The forward model is trained to predict the next state; when its loss is
    low enough, the actor loss additionally includes the critic's value of the
    actor's actions in the predicted next one and two states.
    """

    def __init__(self, state_dim, action_dim, max_action, max_state, env):
        """
        Args:
            state_dim (int): number of features per state
            action_dim (int): number of features per action
            max_action (float): bound used to scale and clamp actions
            max_state (float): bound used to clamp states predicted by FORK
            env: environment; only its ``action_space`` is read (shape, low, high)
        """
        self.actor = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=1e-3)

        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = Critic(state_dim, action_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=1e-3)

        self.fork = FORK(state_dim, action_dim).to(device)
        self.fork_optimizer = torch.optim.Adam(self.fork.parameters(), lr=1e-3)

        self.max_action = max_action
        self.max_state = max_state
        self.env = env

    def select_action(self, state, noise=0.1):
        """Return the actor's action for ``state``, optionally with Gaussian noise.

        Args:
            state (np.ndarray): a single state; it is reshaped to one row
            noise (float): standard deviation of the exploration noise; 0 disables it

        Returns:
            np.ndarray: action clipped to the environment's action-space bounds
        """

        state = torch.FloatTensor(state.reshape(1, -1)).to(device)

        # sample policy action (no autograd graph is needed for acting)
        with torch.no_grad():
            action = self.actor(state).cpu().numpy().flatten()

        if noise != 0:
            # add noise to action
            action = action + np.random.normal(0, noise, size=self.env.action_space.shape[0])

        # clip noise added action to lie within action space
        return action.clip(self.env.action_space.low, self.env.action_space.high)

    @staticmethod
    def _soft_update(source, target, tau):
        """Move ``target`` parameters towards ``source`` by the fraction ``tau``."""
        for param, target_param in zip(source.parameters(), target.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

    def train(self, replay_buffer, iterations, weight, batch_size=100, discount=0.99, tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2, fork_threshold=0.020):
        """Run ``iterations`` gradient updates on batches sampled from ``replay_buffer``.

        Args:
            replay_buffer: object whose ``sample(batch_size)`` returns
                (states, actions, next_states, rewards, dones) arrays
            iterations (int): number of update steps
            weight (float): weight of the FORK look-ahead terms in the actor loss
            batch_size (int): batch size per update
            discount (float): discount factor applied to the target Q-value
            tau (float): soft-update rate for the target networks
            policy_noise (float): std of the noise added to the target action
            noise_clip (float): bound applied to that noise
            policy_freq (int): the actor and targets are updated every
                ``policy_freq`` iterations
            fork_threshold (float): the look-ahead terms are only applied when
                the forward-model loss of the current batch is below this value
        """

        # accept numpy scalars as well as Python floats
        weight = float(weight)

        for it in range(iterations):

            # retrieve sample from replay buffer
            s, a, s_, r, d = replay_buffer.sample(batch_size)
            state = torch.FloatTensor(s).to(device)
            action = torch.FloatTensor(a).to(device)
            next_state = torch.FloatTensor(s_).to(device)
            not_done = torch.FloatTensor(1 - d).to(device)  # 0 where the episode ended
            reward = torch.FloatTensor(r).to(device)

            # the target is a constant for the critic regression
            with torch.no_grad():
                # propose next action (target action plus clipped noise)
                noise = (torch.randn_like(action) * policy_noise).clamp(-noise_clip, noise_clip)
                next_action = (self.actor_target(next_state) + noise).clamp(-self.max_action, self.max_action)

                # get target Q value (minimum of the two target critics)
                target_Q1, target_Q2 = self.critic_target(next_state, next_action)
                target_Q = reward + not_done * discount * torch.min(target_Q1, target_Q2)

            # regress Q-networks to target network
            current_Q1, current_Q2 = self.critic(state, action)
            critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # train Forward looking network
            pred_next_state = self.fork(state, action)
            fork_loss = F.mse_loss(pred_next_state, next_state)

            self.fork_optimizer.zero_grad()
            fork_loss.backward()
            self.fork_optimizer.step()

            # update actor (and targets) every policy_freq iterations
            if it % policy_freq == 0:

                # get base actor loss
                actor_loss = -self.critic.Q1(state, self.actor(state)).mean()

                # apply loss from future actions iff FORK is accurate enough
                if fork_loss.item() < fork_threshold:

                    # apply weighted loss of action in s'
                    # NOTE: the actor output is already scaled by max_action in
                    # Actor.forward, so it must not be scaled again here.
                    pred_next_state = self.fork(state, action).clamp(-self.max_state, self.max_state)
                    next_action = self.actor(pred_next_state.detach())
                    actor_loss = actor_loss - weight * self.critic.Q1(pred_next_state, next_action).mean()

                    # apply weighted loss of action in s''
                    pred_next_state = self.fork(pred_next_state, next_action).clamp(-self.max_state, self.max_state)
                    next_action = self.actor(pred_next_state.detach())
                    actor_loss = actor_loss - 0.5 * weight * self.critic.Q1(pred_next_state, next_action).mean()

                # train actor
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # update target networks
                self._soft_update(self.critic, self.critic_target, tau)
                self._soft_update(self.actor, self.actor_target, tau)

In [23]:
%%capture
# wrap the environment so that every `video_every`-th episode is recorded to ./video
env = gym.wrappers.record_video.RecordVideo(
    env,
    "./video",
    episode_trigger=lambda ep_id: ep_id % video_every == 0,
)

# sizes of the observation and action vectors
state_dim, act_dim = env.observation_space.shape[0], env.action_space.shape[0]

# bounds are read from the first component of each space's `high` vector
max_action, max_state = float(env.action_space.high[0]), float(env.observation_space.high[0])

In [24]:
# report the problem size and the compute device in use
print('The environment has {} observations and the agent can take {} actions'.format(state_dim, act_dim))
print('The device is: {}'.format(device))

# warn when a non-CPU device was selected
if not device.type == 'cpu':
    print('It\'s recommended to train on the cpu for this')

The environment has 24 observations and the agent can take 4 actions
The device is: cuda
It's recommended to train on the cpu for this


In [25]:
# seed every source of randomness used by the notebook
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
env.seed(seed)
env.action_space.seed(seed)

done = False

# initialise replay buffer
replay_buffer = ReplayBuffer()

# initialise agent
agent = Agent(state_dim, act_dim, max_action, max_state, env)

  deprecation(


In [27]:
# logging variables
ep_reward = 0
reward_list = []     # episode rewards since the last plot update (cleared every plot_interval episodes)
recent_rewards = []  # rolling window of the last 100 episode rewards, used for the FORK weight
plot_data = []

# max timestep per episode
max_timesteps = 2000

# number of timesteps in observation stage
random_timesteps = 25000

state = env.reset()

i = 0         # total environment steps taken so far
episode = 0
expcount = 0  # counter incremented on every -100 reward and reduced when a successful episode is stored

# the context manager guarantees the log file is closed even if training is interrupted
with open("agent-log.txt", "w+") as log_f:

    while episode < max_episodes:

        episode += 1
        temp_replay_buffer = []

        for t in range(max_timesteps):

            i += 1

            if i < random_timesteps:
                # sample random action in observation stage
                action = env.action_space.sample()
            else:
                # sample policy action outside observation stage
                action = agent.select_action(state, 0.1)

            # get results of action
            next_state, reward, done, _ = env.step(action)
            ep_reward += reward  # the logged episode reward uses the unscaled reward

            # apply reward scaling
            if reward == -100:
                add_reward = -1
                reward = -5
                expcount += 1
            else:
                add_reward = 0
                reward = 5 * reward

            temp_replay_buffer.append((state, action, next_state, reward, done))

            state = next_state

            # at end of episode
            if done or t == (max_timesteps - 1):

                # Episodes that ended on a -100 reward or scored below 250 are
                # always stored; other episodes are stored with probability 0.5
                # while expcount is positive (each one lowers expcount by 10).
                totrain = 0
                if add_reward == -1 or ep_reward < 250:
                    totrain = 1
                    for temp in temp_replay_buffer:
                        replay_buffer.add(temp)
                elif expcount > 0 and np.random.rand() > 0.5:
                    totrain = 1
                    expcount -= 10
                    for temp in temp_replay_buffer:
                        replay_buffer.add(temp)

                reward_list.append(ep_reward)

                # reward_list is cleared at every plot update, so the 100-episode
                # average is computed from its own rolling window instead
                recent_rewards.append(ep_reward)
                del recent_rewards[:-100]
                avg_reward = np.mean(recent_rewards)

                if i > random_timesteps:

                    # weight to be applied to actions on FORK predicted states
                    weight = 1 - np.clip(avg_reward / 300, 0, 1)
                    if totrain == 1:
                        agent.train(replay_buffer, t, weight)
                    else:
                        agent.train(replay_buffer, 100, weight)
                    totrain = 0

                done = False

                state = env.reset()

                break

        log_f.write('episode: {}, reward: {}\n'.format(episode, ep_reward))
        log_f.flush()
        ep_reward = 0

        # print reward data every so often - add a graph like this in your report
        if episode % plot_interval == 0:
            plot_data.append([episode, np.array(reward_list).mean(), np.array(reward_list).std()])
            reward_list = []
            plt.plot([x[0] for x in plot_data], [x[1] for x in plot_data], '-', color='tab:grey')
            plt.fill_between([x[0] for x in plot_data], [x[1]-x[2] for x in plot_data], [x[1]+x[2] for x in plot_data], alpha=0.2, color='tab:grey')
            plt.xlabel('Episode number')
            plt.ylabel('Episode reward')
            plt.show()
            disp.clear_output(wait=True)



KeyboardInterrupt: ignored