##### install and basic setup

Install the system and Python packages needed to run this notebook in Colab.

In [None]:
%%capture
!apt update && apt install -y libpq-dev libsdl2-dev swig xorg-dev xvfb
%pip install -U tf-agents pyvirtualdisplay
%pip install -U gym>=0.21.0
%pip install -U gym[box2d,atari,accept-rom-license]

imports

In [None]:
import sklearn
import tensorflow as tf
import numpy as np
import pandas as pd
import os
                   
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

import matplotlib.animation as animation

import gym

import pyvirtualdisplay

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import random
from collections import deque
import copy
import sys

In [None]:
# Plot defaults: font sizes for axis labels and tick labels, and the HTML
# representation used when an animation is displayed in the notebook.
for _rc_group, _rc_settings in (
    ('axes', {'labelsize': 14}),
    ('xtick', {'labelsize': 12}),
    ('ytick', {'labelsize': 12}),
    ('animation', {'html': 'jshtml'}),
):
    mpl.rc(_rc_group, **_rc_settings)

In [None]:
# Smoke test: create the environment, reset it with a fixed seed, start a
# virtual display and render once.
env = gym.make('BipedalWalker-v3')
obs = env.reset(seed=0)
# NOTE(review): presumably the virtual display is required because the Colab
# machine has no X server for env.render() to draw on - confirm.
display = pyvirtualdisplay.Display(visible=0, size=(1400, 900)).start()
env.render()

##### functions

###### actor & critic

In [None]:
class Actor(nn.Module):
    """Policy network mapping a state batch to an action batch.

    Two ReLU hidden layers of width ``hidden_dim`` are followed by a linear
    output layer whose ``tanh`` is scaled by ``max_actions``, so every output
    component lies in ``[-max_actions, max_actions]``.
    """

    def __init__(self, state_dim, action_dim, hidden_dim, max_actions):
        super().__init__()

        self.max_actions = max_actions

        # The attribute names l1..l3 become the state_dict keys used by saved
        # checkpoints, so they are kept as-is.
        self.l1 = nn.Linear(state_dim, hidden_dim)
        self.l2 = nn.Linear(hidden_dim, hidden_dim)
        self.l3 = nn.Linear(hidden_dim, action_dim)

    def forward(self, state):
        """Return the scaled, tanh-squashed action for ``state``."""
        hidden = state
        for layer in (self.l1, self.l2):
            hidden = F.relu(layer(hidden))
        return self.max_actions * torch.tanh(self.l3(hidden))

class Critic(nn.Module):
    """Pair of independent Q-networks evaluated on the same (state, action) input.

    Follows the twin-critic layout of TD3 (https://arxiv.org/pdf/1802.09477.pdf):
    layers ``l1``-``l3`` form the first Q-function and ``l4``-``l6`` the second.
    """

    def __init__(self, state_dim, action_dim, hidden_dim):
        super().__init__()
        in_features = state_dim + action_dim

        # First Q-function. The attribute names are the state_dict keys used
        # by saved checkpoints, so they are kept as-is.
        self.l1 = nn.Linear(in_features, hidden_dim)
        self.l2 = nn.Linear(hidden_dim, hidden_dim)
        self.l3 = nn.Linear(hidden_dim, 1)

        # Second Q-function.
        self.l4 = nn.Linear(in_features, hidden_dim)
        self.l5 = nn.Linear(hidden_dim, hidden_dim)
        self.l6 = nn.Linear(hidden_dim, 1)

    @staticmethod
    def _q_head(joint, first, second, out):
        """Apply two ReLU hidden layers and a linear output layer to ``joint``."""
        return out(F.relu(second(F.relu(first(joint)))))

    def forward(self, state, action):
        """Return ``(q1, q2)`` for the state batch concatenated with the action batch."""
        joint = torch.cat((state, action), dim=1)
        q_first = self._q_head(joint, self.l1, self.l2, self.l3)
        q_second = self._q_head(joint, self.l4, self.l5, self.l6)
        return (q_first, q_second)

###### experience replay

In [None]:
from pandas.io.stata import StataValueLabel
class ExperienceReplay:
    """Fixed-capacity ring buffer of transitions for off-policy training.

    Transitions are written into pre-allocated NumPy arrays; once ``max_size``
    rows have been written, the write pointer wraps and the oldest rows are
    overwritten.

    Args:
        state_dim: length of a state vector.
        action_dim: length of an action vector.
        batch_size: number of transitions returned by ``sample``.
        device: torch device (or device string) sampled tensors are placed on.
        max_size: capacity in transitions; a float such as the default ``1e6``
            is converted to ``int``.
    """

    # Names of the per-transition arrays; each is saved to / loaded from "<name>.csv".
    _ARRAY_FIELDS = ('state', 'action', 'reward', 'next_state', 'dead')

    def __init__(self, state_dim, action_dim, batch_size, device, max_size=1e6):
        # NumPy shapes and the modulo arithmetic on the write pointer need an
        # int; the float default would otherwise make np.zeros raise TypeError.
        self.max_size = int(max_size)
        self.ptr = 0  # index of the next row to write
        self.size = 0  # number of valid rows currently stored

        self.state = np.zeros((self.max_size, state_dim))
        self.action = np.zeros((self.max_size, action_dim))
        self.reward = np.zeros((self.max_size, 1))
        self.next_state = np.zeros((self.max_size, state_dim))
        self.dead = np.zeros((self.max_size, 1))

        self.batch_size = batch_size
        self.device = device

    def store_transition(self, state, action, reward, new_state, dead):
        """Write one transition at the write pointer, overwriting the oldest row when full."""
        self.state[self.ptr] = state
        self.action[self.ptr] = action
        self.reward[self.ptr] = reward
        self.next_state[self.ptr] = new_state
        self.dead[self.ptr] = dead

        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self):
        """Return ``batch_size`` transitions drawn uniformly with replacement.

        Returns:
            Tuple ``(state, action, reward, next_state, dead)`` of float32
            tensors on ``self.device``.

        Raises:
            ValueError: if no transition has been stored yet.
        """
        if self.size == 0:
            raise ValueError('cannot sample from an empty experience replay')
        idx = np.random.randint(0, self.size, size=self.batch_size)

        return tuple(
            torch.as_tensor(getattr(self, name)[idx], dtype=torch.float32, device=self.device)
            for name in self._ARRAY_FIELDS
        )

    def save(self):
        """Write every array plus ``ptr`` and ``size`` to CSV files in the working directory.

        The full pre-allocated arrays are written (all ``max_size`` rows), so
        the files can be large for big buffers.
        """
        for name in self._ARRAY_FIELDS:
            pd.DataFrame(getattr(self, name)).to_csv(f'{name}.csv', index=False)
        for name in ('ptr', 'size'):
            pd.DataFrame([getattr(self, name)], dtype=int).to_csv(f'{name}.csv', index=False)
        print('experience replay saved')

    def load(self):
        """Restore the buffer from the CSV files written by ``save``."""
        for name in self._ARRAY_FIELDS:
            setattr(self, name, pd.read_csv(f'{name}.csv').to_numpy())
        # read_csv yields a 1x1 frame; unwrap it to a plain int so that ptr and
        # size keep behaving as scalars in store_transition and sample.
        self.ptr = int(pd.read_csv('ptr.csv').to_numpy()[0, 0])
        self.size = int(pd.read_csv('size.csv').to_numpy()[0, 0])
        # Keep the wrap-around point consistent with the arrays just loaded.
        self.max_size = len(self.state)

###### TD3

In [None]:
class TD3():
    """Twin Delayed DDPG agent (https://arxiv.org/pdf/1802.09477.pdf).

    Holds an actor and a twin critic, a target copy of each, and one Adam
    optimizer per online network.

    Args:
        state_dim: length of a state vector.
        action_dim: length of an action vector.
        max_action: bound used to scale the actor output and to clip target actions.
        env: the environment; stored on the instance.
        device: torch device the networks are placed on.
        config: hyper-parameter dict. Reads ``hidden_dim`` and ``learning_rate``
            here, and ``noise_policy``, ``noise_clip``, ``policy_delay``, ``tau``
            and (optionally) ``discount_factor`` in ``train``.
    """

    def __init__(self, state_dim, action_dim, max_action, env, device, config):
        # Keep the hyper-parameters on the instance so the methods do not
        # depend on a module-level variable that happens to be named `config`.
        self.config = config
        self.device = device
        self.max_action = max_action
        self.env = env

        # set up actor (deepcopy already copies the weights into the target)
        self.actor = Actor(state_dim, action_dim, config['hidden_dim'], max_action).to(device)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=config['learning_rate'])

        # set up critic
        self.critic = Critic(state_dim, action_dim, config['hidden_dim']).to(device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=config['learning_rate'])

    def select_action(self, state, noise=0.1):
        """Return the actor's action for a single ``state`` as a flat NumPy array.

        The returned action is deterministic. ``noise`` is accepted for
        backward compatibility only: the previous implementation built a noisy
        action but then returned a second, noise-free forward pass, and the
        callers in this file add their own exploration noise to the result.
        """
        state = torch.as_tensor(
            np.asarray(state, dtype=np.float32).reshape(1, -1), device=self.device
        )
        with torch.no_grad():
            return self.actor(state).cpu().numpy().flatten()

    def save(self):
        """Write the weights of all four networks and both optimizers to ``*.pth`` files."""
        torch.save(self.actor.state_dict(), 'actor.pth')
        # The target files must hold the target networks' weights, not a second
        # copy of the online networks.
        torch.save(self.actor_target.state_dict(), 'actor_target.pth')
        torch.save(self.actor_optimizer.state_dict(), 'actor_opt.pth')
        torch.save(self.critic.state_dict(), 'critic.pth')
        torch.save(self.critic_target.state_dict(), 'critic_target.pth')
        torch.save(self.critic_optimizer.state_dict(), 'critic_opt.pth')

    def load(self):
        """Restore networks and optimizers from the files written by ``save``."""
        self.actor.load_state_dict(torch.load('actor.pth', map_location='cpu'))
        self.actor_target.load_state_dict(torch.load('actor_target.pth', map_location='cpu'))
        self.actor_optimizer.load_state_dict(torch.load('actor_opt.pth', map_location='cpu'))
        self.critic.load_state_dict(torch.load('critic.pth', map_location='cpu'))
        self.critic_target.load_state_dict(torch.load('critic_target.pth', map_location='cpu'))
        self.critic_optimizer.load_state_dict(torch.load('critic_opt.pth', map_location='cpu'))

    @staticmethod
    def _soft_update(source, target, tau):
        """Move ``target``'s parameters a fraction ``tau`` of the way toward ``source``'s."""
        with torch.no_grad():
            for param, target_param in zip(source.parameters(), target.parameters()):
                target_param.copy_(tau * param + (1 - tau) * target_param)

    def train(self, replay_buffer, current_iteration):
        """Run one TD3 update on a batch sampled from ``replay_buffer``.

        The critic is updated on every call. The actor and both target
        networks are updated only when ``current_iteration`` is a multiple of
        ``config['policy_delay']``.
        """
        # pseudocode: http://bicmr.pku.edu.cn/~wenzw/bigdata/lect-dyna3w.pdf
        cfg = self.config

        # sample batch transitions from replay_buffer.
        state, action, reward, next_state, done = replay_buffer.sample()

        # The target is a fixed regression label: no gradient is needed
        # through the target networks.
        with torch.no_grad():
            # target policy smoothing: clipped Gaussian noise on the target action
            noise = (torch.randn_like(action) * cfg['noise_policy']).clamp(
                -cfg['noise_clip'], cfg['noise_clip']
            )
            next_action = (self.actor_target(next_state) + noise).clamp(
                -self.max_action, self.max_action
            )

            # clipped double-Q: bootstrap from the smaller of the two target estimates,
            # discounted by gamma and masked out on terminal transitions
            target_q1, target_q2 = self.critic_target(next_state, next_action)
            gamma = cfg.get('discount_factor', 0.99)
            target_q = reward + (1 - done) * gamma * torch.min(target_q1, target_q2)

        curr_q1, curr_q2 = self.critic(state, action)
        critic_loss = F.mse_loss(curr_q1, target_q) + F.mse_loss(curr_q2, target_q)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()  # update Q-functions

        # delayed updates: policy and target networks only every policy_delay-th call
        if current_iteration % cfg['policy_delay'] == 0:
            # maximise Q1 of the actor's own action
            actor_loss = -self.critic(state, self.actor(state))[0].mean()
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            self._soft_update(self.critic, self.critic_target, cfg['tau'])
            self._soft_update(self.actor, self.actor_target, cfg['tau'])

###### main

In [None]:
def _run_episode(env, policy, config, action_dim, max_action, buffer=None, render=False):
    """Roll out one episode with Gaussian action noise.

    When ``buffer`` is given, every transition is stored in it and the policy
    is trained after each step once the buffer holds more than
    ``config['batch_size']`` transitions.

    Returns:
        ``(total_reward, last_step)``: the summed reward and the index of the
        last step attempted.
    """
    state = env.reset()
    total_reward = 0
    step = 0
    for step in range(config['timesteps']):
        # select an action and add exploration noise, then clip to the valid range
        action = policy.select_action(state) + np.random.normal(
            0, max_action * config['noise'], size=action_dim
        )
        action = action.clip(env.action_space.low, env.action_space.high)
        if np.isnan(action).any():
            # Nothing changes between retries (same state, no training step),
            # so retrying cannot recover; end the episode with the reward so far.
            break
        next_state, reward, done, _ = env.step(action)
        if buffer is not None:
            buffer.store_transition(state, action, reward, next_state, done)
        state = next_state
        total_reward += reward
        if render:
            env.render()
        if buffer is not None and buffer.size > config['batch_size']:
            policy.train(buffer, step)
        if done:
            break
    # The total is returned whether the episode ended via `done`, a NaN action
    # or the step limit, so callers always get one score per rollout.
    return total_reward, step


def main(config):
    """Train a TD3 agent on BipedalWalker-v3 and persist the results.

    Each of the ``config['episodes']`` iterations runs one training rollout
    followed by nine further rollouts that only collect scores. The agent is
    saved whenever an iteration's best score beats the best score seen so far.
    After the loop, the replay buffer and the score history (``TD3_res.csv``)
    are written to the working directory.

    When ``config['train_from_scratch']`` is false, the agent and the score
    history are first loaded from the files of a previous run.
    """
    env = gym.make('BipedalWalker-v3')

    # set seed for reproducible results
    seed = 0
    env.reset(seed=seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    random.seed(seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(f'running on {device}')

    policy = TD3(state_dim, action_dim, max_action, env, device, config)
    buffer = ExperienceReplay(state_dim, action_dim, config['batch_size'], device, config['buffer_size'])

    if config['train_from_scratch']:
        best_reward = float('-inf')
        print('actor and critic training from scratch')
        scores_over_episodes = []
    else:
        policy.load()
        scores_over_episodes = [[float(y) for y in x] for x in pd.read_csv('TD3_res.csv').values]
        best_reward = max(max(row) for row in scores_over_episodes)
        print(f'actor and critic loaded with max score: {round(best_reward, 3)}')
        # buffer.load()
        # print('experience replay loaded')

    extra_rollouts = 9  # score-only rollouts after each training rollout

    try:
        for episode in range(config['episodes']):
            # one training rollout (stores transitions, trains, renders) ...
            score, last_step = _run_episode(
                env, policy, config, action_dim, max_action, buffer=buffer, render=True
            )
            episode_scores = [score]

            # ... followed by rollouts that only record the score
            for _ in range(extra_rollouts):
                score, last_step = _run_episode(env, policy, config, action_dim, max_action)
                episode_scores.append(score)

            scores_over_episodes.append(episode_scores)

            print(f'{episode+1}/{config["episodes"]} episodes finished')
            print(f'{" "*4+"min reward:":<18}{round(min(episode_scores), 3)}')
            print(f'{" "*4+"avg reward:":<18}{round(sum(episode_scores)/len(episode_scores), 3)}')
            print(f'{" "*4+"max reward:":<18}{round(max(episode_scores), 3)}')
            print(f'{" "*4+"timestap:":<18}{last_step}')

            if max(episode_scores) > best_reward:
                print(' '*4+'saving agent - score was better than best-known score')
                best_reward = max(episode_scores)
                policy.save()
    finally:
        # release the environment even if a rollout fails
        env.close()

    print('saving experience replay')
    buffer.save()

    df = pd.DataFrame(scores_over_episodes)
    df.to_csv('TD3_res.csv', index=False)
    print('reward history saved')

##### run stuff

In [None]:
# Hyper-parameters and run settings shared by the cells below; passed to main().
config = dict(
    # replay buffer
    batch_size=100,
    discount_factor=0.99,
    explore_policy=0.1,
    # networks and optimisation
    learning_rate=0.001,
    policy_delay=2,
    tau=0.005,
    noise_policy=0.2,
    noise_clip=0.5,
    save_score=400,
    # training loop
    episodes=100,  # 800
    timesteps=2000,
    buffer_size=1000000,
    noise=0.1,
    hidden_dim=512,
    # False resumes from the saved agent files and TD3_res.csv
    train_from_scratch=False,
)

In [None]:
# Run training with the settings in `config` (resumes from saved files when
# config['train_from_scratch'] is False).
main(config)

---

save files

In [None]:
from google.colab import files

# Download the score history and the saved agent files, one at a time.
for _artifact in (
    "TD3_res.csv",
    "actor.pth",
    "actor_target.pth",
    "actor_opt.pth",
    "critic.pth",
    "critic_target.pth",
    "critic_opt.pth",
):
    files.download(_artifact)

# Replay-buffer downloads are kept disabled inside a string literal.
'''files.download("action.csv") 
files.download("dead.csv") 
files.download("next_state.csv") 
files.download("ptr.csv") 
files.download("reward.csv") 
files.download("size.csv") 
files.download("state.csv")'''

---

Make video

In [None]:
# Roll out one episode with the saved agent while RecordVideo writes it to 'video'.
env = gym.make('BipedalWalker-v3')
env = gym.wrappers.RecordVideo(env, 'video')

# Everything after env creation runs under try/finally so the environment is
# closed even when loading the agent or stepping fails.
try:
    # set seed for reproducible results
    seed = 0
    env.reset(seed=seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    random.seed(seed)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(f'running on {device}')
    policy = TD3(state_dim, action_dim, max_action, env, device, config)
    # NOTE(review): the buffer is not used by the rollout below; kept so the
    # module-level name `buffer` still exists for any later cell.
    buffer = ExperienceReplay(state_dim, action_dim, config['batch_size'], device, config['buffer_size'])

    policy.load()
    scores_over_episodes = [[float(y) for y in x] for x in pd.read_csv('TD3_res.csv').values]
    best_reward = max([max(_) for _ in scores_over_episodes])
    print(f'actor and critic loaded with max score: {round(best_reward, 3)}')
    # buffer.load()
    # print('experience replay loaded')

    state = env.reset()
    done = False
    avg_reward = 0  # running sum of rewards over the episode
    while not done:
        # same noisy action selection as in training
        action = policy.select_action(state) + np.random.normal(0, max_action * config['noise'], size=action_dim)
        action = action.clip(env.action_space.low, env.action_space.high)
        next_state, reward, done, _ = env.step(action)
        state = next_state
        avg_reward += reward
    print(f'avg_reward : {avg_reward}')
finally:
    env.close()