In [None]:
# Code adapted from the book "Grokking Deep Reinforcement Learning" by Miguel Morales
# DDPG, TD3

In [None]:
# Check if GPU is connected
# ---DO NOT TOUCH---
!nvidia-smi

Mon Aug 30 08:39:02 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Libraries and setup
# ---DO NOT TOUCH---
import warnings ; warnings.filterwarnings('ignore')
import os
os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID'
os.environ['OMP_NUM_THREADS'] = '1'

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.multiprocessing as mp
import threading
from torch.distributions import Normal

import numpy as np
from IPython.display import display
from collections import namedtuple, deque
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
from itertools import cycle, count
from textwrap import wrap

import matplotlib
import subprocess
import os.path
import tempfile
import random
import base64
import pprint
import glob
import time
import json
import sys
import gym
import io
import os
import gc

from gym import wrappers
from skimage.transform import resize
from skimage.color import rgb2gray
from subprocess import check_output
from IPython.display import display, HTML

# Minimum number of seconds between two progress lines that are left on screen
# (the training loop compares the time since the last kept line against it)
LEAVE_PRINT_EVERY_N_SECS = 300
# ANSI escape sequence that erases the current terminal line
ERASE_LINE = '\x1b[2K'
# Small positive constant (not referenced in the visible cells;
# presumably a numerical-stability epsilon - TODO confirm)
EPS = 1e-6
# Emits the ASCII bell character through the shell's printf
BEEP = lambda: os.system("printf '\a'")
# Relative path '../results' (not referenced in the visible cells)
RESULTS_DIR = os.path.join('..', 'results')
# Random seeds; the training cell runs one full training per seed
SEEDS = (12, 34, 56, 78, 90)

%matplotlib inline

In [None]:
# Setup plotting parameters
# ---DO NOT TOUCH---
# Apply the 'fivethirtyeight' matplotlib style sheet to all figures
plt.style.use('fivethirtyeight')
# Default figure size and font sizes, pushed into matplotlib's global rcParams below
params = {
    'figure.figsize': (15, 8),
    'font.size': 24,
    'legend.fontsize': 20,
    'axes.titlesize': 28,
    'axes.labelsize': 24,
    'xtick.labelsize': 20,
    'ytick.labelsize': 20
}
pylab.rcParams.update(params)
# Print numpy floats in fixed-point notation instead of scientific notation
np.set_printoptions(suppress=True)


In [None]:
# Check for GPU acceleration 
# Remember to enable it in the menu Runtime/Change runtime type
# ---DO NOT TOUCH---
torch.cuda.is_available()

True

In [None]:
# Get gym environment function
# ---DO NOT TOUCH---
def get_make_env_fn(**kargs):
    """
    Build a gym environment factory together with the keyword arguments for it

    kargs = keyword arguments to forward later to the factory (e.g. env_name)

    Returns the tuple (make_env_fn, kargs); the factory is meant to be called
    as make_env_fn(**kargs, seed=...)
    """
    def make_env_fn(env_name, seed=None, render=None, record=False,
                    unwrapped=False, monitor_mode=None, 
                    inner_wrappers=None, outer_wrappers=None):
        """
        Create and configure a gym environment

        env_name = gym environment id
        seed = seed passed to env.seed (skipped when None)
        render = forwarded to gym.make as `render=`; if that call fails the
                 environment is created again without it
        record = value returned by the Monitor video callback for every episode
        unwrapped = use env.unwrapped instead of the wrapped environment
        monitor_mode = Monitor mode; the environment is only wrapped in
                       wrappers.Monitor when this is truthy
        inner_wrappers = wrappers applied before the Monitor
        outer_wrappers = wrappers applied after the Monitor
        """
        env = None
        if render:
            try:
                env = gym.make(env_name, render=render)
            except Exception:
                # Best effort: fall back to the plain constructor below.
                # `Exception` (instead of a bare except) lets KeyboardInterrupt
                # and SystemExit propagate.
                env = None
        if env is None:
            env = gym.make(env_name)
        if seed is not None:
            env.seed(seed)
        if unwrapped:
            env = env.unwrapped
        for wrapper in inner_wrappers or ():
            env = wrapper(env)
        if monitor_mode:
            # The temporary directory is only needed by the Monitor, so it is
            # created lazily instead of on every call
            mdir = tempfile.mkdtemp()
            env = wrappers.Monitor(
                env, mdir, force=True, 
                mode=monitor_mode, 
                video_callable=lambda e_idx: record)
        for wrapper in outer_wrappers or ():
            env = wrapper(env)
        return env
    return make_env_fn, kargs

# DDPG

In [None]:
# Q-value neural network class
# There is a task here!

class FCQV(nn.Module):
    def __init__(self, 
                 input_dim, 
                 output_dim,
                 hidden_dims=(256, 256),
                 activation_fc=F.relu):
        """
        Fully connected state-action value network Q(s, a)

        input_dim = state dimension
        output_dim = action dimension (the action is concatenated to the
                     output of the input layer, right before the first hidden layer)

        hidden_dims = dimension for hidden layers, format (dimension_1, ..., dimension_n);
                      at least two entries are required so that a layer receives the action
        activation_fc = activation function applied after every layer but the output one

        device = processing device ("cuda:0" when available, otherwise "cpu")
        """
        super(FCQV, self).__init__()

        hidden_dims = tuple(hidden_dims)
        if len(hidden_dims) < 2:
            # With a single entry no hidden layer is built and forward() would
            # never concatenate the action, i.e. Q would silently ignore it
            raise ValueError('hidden_dims needs at least two entries, got {}'.format(hidden_dims))

        self.activation_fc = activation_fc

        self.input_layer = nn.Linear(input_dim, hidden_dims[0])
        self.hidden_layers = nn.ModuleList()
        for i in range(len(hidden_dims)-1):
            in_dim = hidden_dims[i]
            if i == 0: 
                # the first hidden layer also receives the action
                in_dim += output_dim
            hidden_layer = nn.Linear(in_dim, hidden_dims[i+1])
            self.hidden_layers.append(hidden_layer)
        self.output_layer = nn.Linear(hidden_dims[-1], 1)

        device = "cpu"
        if torch.cuda.is_available():
            device = "cuda:0"
        self.device = torch.device(device)
        self.to(self.device)
    
    def _format(self, state, action):
        """
        Format the state and the action for pytorch

        state = state from environment
        action = action from policy

        Inputs that are not tensors are converted to float32 tensors on
        self.device and get a leading batch dimension; tensors pass unchanged
        """
        x, u = state, action
        if not isinstance(x, torch.Tensor):
            x = torch.tensor(x, 
                             device=self.device, 
                             dtype=torch.float32)
            x = x.unsqueeze(0)
        if not isinstance(u, torch.Tensor):
            u = torch.tensor(u, 
                             device=self.device, 
                             dtype=torch.float32)
            u = u.unsqueeze(0)
        return x, u

    def forward(self, state, action):
        """
        Forward function for neural network

        state = state from environment
        action = action from policy

        Returns the Q value estimate, one column per batch row
        """
        x, u = self._format(state, action)
        x = self.activation_fc(self.input_layer(x))
        for i, hidden_layer in enumerate(self.hidden_layers):
            if i == 0:
                # inject the action after the state has gone through the input layer
                x = torch.cat((x, u), dim=1)
            x = self.activation_fc(hidden_layer(x))
        return self.output_layer(x)
    
    def load(self, experiences):
        """
        Convert a batch sampled from the replay buffer into float tensors on self.device

        experiences = tuple of five numpy arrays, in the order produced by the
                      replay buffer: (states, actions, rewards, next_states, is_terminals)

        The order of the tuple is preserved in the returned tensors
        """
        states, actions, rewards, next_states, is_terminals = experiences
        states = torch.from_numpy(states).float().to(self.device)
        actions = torch.from_numpy(actions).float().to(self.device)
        rewards = torch.from_numpy(rewards).float().to(self.device)
        next_states = torch.from_numpy(next_states).float().to(self.device)
        is_terminals = torch.from_numpy(is_terminals).float().to(self.device)
        return states, actions, rewards, next_states, is_terminals

In [None]:
# Deterministic policy neural network class
# There is a task here!

class FCDP(nn.Module):
    def __init__(self, 
                 input_dim,
                 action_bounds,
                 out_activation_fc=F.tanh,
                 hidden_dims=(256, 256),
                 activation_fc=F.relu):
        """
        Fully connected deterministic policy network

        input_dim = state dimension
        action_bounds = tuple (lower bounds, upper bounds) of the action space;
                        the number of actions is taken from len(upper bounds)

        out_activation_fc = output activation function (must be bounded, its limits
                            at -inf/+inf are used to rescale the output)
        hidden_dims = dimension for hidden layers, format (dimension_1, ..., dimension_n)
        activation_fc = activation function applied after every layer but the output one

        device = processing device ("cuda:0" when available, otherwise "cpu")
        """
        super(FCDP, self).__init__()

        self.activation_fc = activation_fc
        self.out_activation_fc = out_activation_fc
        self.env_min, self.env_max = action_bounds

        hidden_dims = tuple(hidden_dims)
        self.input_layer = nn.Linear(input_dim, hidden_dims[0])
        self.hidden_layers = nn.ModuleList()
        for i in range(len(hidden_dims)-1):
            hidden_layer = nn.Linear(hidden_dims[i], hidden_dims[i+1])
            self.hidden_layers.append(hidden_layer)
        self.output_layer = nn.Linear(hidden_dims[-1], len(self.env_max))

        device = "cpu"
        if torch.cuda.is_available():
            device = "cuda:0"
        self.device = torch.device(device)
        self.to(self.device)

        self.env_min = torch.tensor(self.env_min,
                                    device=self.device, 
                                    dtype=torch.float32)

        self.env_max = torch.tensor(self.env_max,
                                    device=self.device, 
                                    dtype=torch.float32)

        # Range of the output activation, obtained by evaluating it at -inf and +inf
        self.nn_min = self.out_activation_fc(
            torch.tensor([float('-inf')])).to(self.device)
        self.nn_max = self.out_activation_fc(
            torch.tensor([float('inf')])).to(self.device)

    def rescale_fn(self, x):
        """
        Map the output activation range [nn_min, nn_max] linearly onto the
        action range [env_min, env_max]

        x = output of the output activation function
        """
        return (x - self.nn_min) * (self.env_max - self.env_min) / \
               (self.nn_max - self.nn_min) + self.env_min

    def _format(self, state):
        """
        Format the state for pytorch

        state = current state

        A state that is not a tensor is converted to a float32 tensor on
        self.device and gets a leading batch dimension; tensors pass unchanged
        """
        x = state
        if not isinstance(x, torch.Tensor):
            x = torch.tensor(x, 
                             device=self.device, 
                             dtype=torch.float32)
            x = x.unsqueeze(0)
        return x

    def forward(self, state):
        """
        Forward function for neural network

        state = current state

        Returns the action rescaled to the bounds of the environment
        """
        x = self._format(state)
        x = self.activation_fc(self.input_layer(x))
        for hidden_layer in self.hidden_layers:
            x = self.activation_fc(hidden_layer(x))
        x = self.output_layer(x)
        x = self.out_activation_fc(x)
        return self.rescale_fn(x)

In [None]:
# Replay buffer class
# There is a task here!

class ReplayBuffer():
    def __init__(self, 
                 max_size=10000, 
                 batch_size=64):
        """
        Initialize class 

        ss_mem = state buffer
        as_mem = action buffer
        rs_mem = reward buffer
        ps_mem = new state buffer
        ds_mem = terminal state flag buffer

        max_size = maximum buffer size
        batch_size = sample batch size
        _idx = buffer index (position of the next write)
        size = ongoing buffer size
        """
        self.ss_mem = np.empty(shape=(max_size), dtype=np.ndarray)
        self.as_mem = np.empty(shape=(max_size), dtype=np.ndarray)
        self.rs_mem = np.empty(shape=(max_size), dtype=np.ndarray)
        self.ps_mem = np.empty(shape=(max_size), dtype=np.ndarray)
        self.ds_mem = np.empty(shape=(max_size), dtype=np.ndarray)

        self.max_size = max_size
        self.batch_size = batch_size
        self._idx = 0
        self.size = 0
    
    def store(self, sample):
        """
        store a sample in the buffer

        sample = tuple (s, a, r, p, d)
        s = state
        a = action
        r = reward
        p = new_state
        d = flag for terminal state

        ss_mem = memory buffer for current states
        as_mem = memory buffer for actions
        rs_mem = memory buffer for rewards
        ps_mem = memory buffer for new states
        ds_mem = memory buffer for terminal state flag
        """
        s, a, r, p, d = sample

        self.ss_mem[self._idx] = s
        self.as_mem[self._idx] = a
        self.rs_mem[self._idx] = r
        self.ps_mem[self._idx] = p
        self.ds_mem[self._idx] = d

        # circular buffer: once full, the oldest entries are overwritten
        self._idx += 1
        self._idx = self._idx % self.max_size

        self.size += 1
        self.size = min(self.size, self.max_size)

    def sample(self, batch_size=None):
        """
        get a random batch of samples from the buffer

        batch_size = number of samples (self.batch_size when None)

        idxs = index (drawn without replacement, so batch_size must not
               exceed the ongoing buffer size)
        experiences = tuple of stacked arrays
                      (states, actions, rewards, new_states, terminal flags)
        """
        if batch_size is None:
            batch_size = self.batch_size

        idxs = np.random.choice(
            self.size, batch_size, replace=False)
        experiences = np.vstack(self.ss_mem[idxs]), \
                      np.vstack(self.as_mem[idxs]), \
                      np.vstack(self.rs_mem[idxs]), \
                      np.vstack(self.ps_mem[idxs]), \
                      np.vstack(self.ds_mem[idxs])
        return experiences

    def __len__(self):
        """
        get buffer size
        """
        return self.size

In [None]:
# Compute the greedy strategy
# There is a task here!

class GreedyStrategy():

    def __init__(self, bounds):
        """
        Initialize class 

        bounds = upper and lower bounds for action
        low = lower bound for action
        high = upper bound for action
        ratio_noise_injected = noise in deterministic greedy policy for exploration
                               (always 0, the greedy strategy adds no noise)
        """
        self.low, self.high = bounds
        self.ratio_noise_injected = 0

    def select_action(self, model, state):
        """
        Select greedy action

        model = policy model
        state = current state

        greedy_action = compute action from the policy
        action = action after clipping, reshaped to the shape of the upper bound
        """
        with torch.no_grad():
            greedy_action = model(state).cpu().detach().data.numpy().squeeze()

        # keep the action inside the bounds of the environment
        action = np.clip(greedy_action, self.low, self.high)
        return np.reshape(action, self.high.shape)

In [None]:
# Compute noise for exploration
# There is a task here!

class NormalNoiseStrategy():
    def __init__(self, bounds, exploration_noise_ratio=0.1):
        """
        class initialization

        bounds = upper and lower bounds for action
        exploration_noise_ratio = noise exploration ratio (fraction of the upper
                                  bound used as standard deviation of the noise)
        ratio_noise_injected = noise injected in the last selected action,
                               relative to the width of the action range
        """
        self.low, self.high = bounds
        self.exploration_noise_ratio = exploration_noise_ratio
        self.ratio_noise_injected = 0

    def select_action(self, model, state, max_exploration=False):
        """
        select DDPG policy

        model = policy model
        state = current state
        max_exploration = exploration strategy options (True: the standard deviation
                          of the noise is the full upper bound of the action)

        noise_scale = standard deviation for normal distribution
        noise = noise for exploration
        noisy_action = DDPG action with exploration noise
        action = clipped DDPG action
        """
        if max_exploration:
            noise_scale = self.high
        else:
            noise_scale = self.exploration_noise_ratio * self.high

        with torch.no_grad():
            greedy_action = model(state).cpu().detach().data.numpy().squeeze()

        noise = np.random.normal(loc=0, scale=noise_scale, size=len(self.high))

        # exploration: perturb the deterministic action with zero-mean gaussian noise
        noisy_action = greedy_action + noise

        # keep the action inside the bounds of the environment
        action = np.clip(noisy_action, self.low, self.high)

        self.ratio_noise_injected = np.mean(abs((greedy_action - action)/(self.high - self.low)))
        return action

In [None]:
# DDPG agent class
# There is a task here!

class DDPG():
    def __init__(self, 
                 replay_buffer_fn,
                 policy_model_fn, 
                 policy_max_grad_norm, 
                 policy_optimizer_fn, 
                 policy_optimizer_lr,
                 value_model_fn, 
                 value_max_grad_norm, 
                 value_optimizer_fn, 
                 value_optimizer_lr, 
                 training_strategy_fn,
                 evaluation_strategy_fn,
                 n_warmup_batches,
                 update_target_every_steps,
                 tau):
        """
        Class initialization

        replay_buffer_fn = replay buffer function

        policy_model_fn = policy neural network architecture
        policy_max_grad_norm = maximum gradient norm for policy model
        policy_optimizer_fn = optimizer for policy neural network
        policy_optimizer_lr = learning rate for policy neural network

        value_model_fn = value function neural network architecture
        value_max_grad_norm = maximum gradient norm for Q-value model
        value_optimizer_fn = optimizer for value function neural network
        value_optimizer_lr = learning rate for value function neural network

        training_strategy_fn = exploration strategy - Normal Noise Strategy
        evaluation_strategy_fn = evaluation strategy - Greedy Strategy
        n_warmup_batches = warm up batches for training
        update_target_every_steps = updating rate
        tau = Polyak averaging factor
        """
        self.replay_buffer_fn = replay_buffer_fn

        self.policy_model_fn = policy_model_fn
        self.policy_max_grad_norm = policy_max_grad_norm
        self.policy_optimizer_fn = policy_optimizer_fn
        self.policy_optimizer_lr = policy_optimizer_lr
        
        self.value_model_fn = value_model_fn
        self.value_max_grad_norm = value_max_grad_norm
        self.value_optimizer_fn = value_optimizer_fn
        self.value_optimizer_lr = value_optimizer_lr

        self.training_strategy_fn = training_strategy_fn
        self.evaluation_strategy_fn = evaluation_strategy_fn

        self.n_warmup_batches = n_warmup_batches
        self.update_target_every_steps = update_target_every_steps
        self.tau = tau

    def optimize_model(self, experiences):
        """
        Optimize and update parameters in neural network models (Q value and policy models)

        experiences = batch of tensors (states, actions, rewards, next_states, is_terminals)
        
        argmax_a_q_sp = greedy policy for next state
        max_a_q_sp = max Q value function for next state
        target_q_sa = target Q value function
        q_sa = current Q value function
        td_error = TD error
        value_loss = loss function for Q value function neural network model

        argmax_a_q_s = greedy action for current state
        max_a_q_s = Q value from greedy action in current state
        policy_loss = loss function for policy neural network model
        """
        states, actions, rewards, next_states, is_terminals = experiences

        # The TD target is a constant for the value update, so it is computed
        # with the target networks and without tracking gradients
        with torch.no_grad():
            # Compute greedy policy for next state
            argmax_a_q_sp = self.target_policy_model(next_states)

            # Compute max Q value function for next state
            max_a_q_sp = self.target_value_model(next_states, argmax_a_q_sp)

            # Update target Q value function (no bootstrapping from terminal states)
            target_q_sa = rewards + self.gamma * max_a_q_sp * (1 - is_terminals)

        # Compute current Q value function
        q_sa = self.online_value_model(states, actions)

        # TD error between the current estimate and the (constant) target
        td_error = q_sa - target_q_sa.detach()

        # Value loss: Monte Carlo estimate (batch mean) of half the squared TD error
        value_loss = td_error.pow(2).mul(0.5).mean()

        # Backpropagation for value function neural network
        self.value_optimizer.zero_grad()
        value_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.online_value_model.parameters(), 
                                       self.value_max_grad_norm)
        self.value_optimizer.step()

        # Compute greedy action with policy model for current state
        argmax_a_q_s = self.online_policy_model(states)

        # Get Q value from greedy action and current state
        max_a_q_s = self.online_value_model(states, argmax_a_q_s)

        # Compute loss for policy model (gradient ascent on Q)
        policy_loss = -max_a_q_s.mean()

        # Backpropagation for policy neural network
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.online_policy_model.parameters(), 
                                       self.policy_max_grad_norm)        
        self.policy_optimizer.step()

    def interaction_step(self, state, env):
        """
        agent interacts with the environment by applying action

        min_samples = number of stored samples below which the agent explores
                      with maximum noise (warm-up phase)
        action = action from the policy model
        new_state = new state in the environment
        reward = reward from the current interaction
        is_failure = flag for terminal state (an episode cut by the time limit
                     is not a failure, so its last state is still bootstrapped)
        experience = new sample tuple (state, action, reward, new_state, is_failure) for buffer replay

        Returns the tuple (new_state, is_terminal)
        """
        min_samples = self.replay_buffer.batch_size * self.n_warmup_batches
        action = self.training_strategy.select_action(self.online_policy_model, 
                                                      state, 
                                                      len(self.replay_buffer) < min_samples)
        new_state, reward, is_terminal, info = env.step(action)
        is_truncated = 'TimeLimit.truncated' in info and info['TimeLimit.truncated']
        is_failure = is_terminal and not is_truncated
        experience = (state, action, reward, new_state, float(is_failure))

        self.replay_buffer.store(experience)
        self.episode_reward[-1] += reward
        self.episode_timestep[-1] += 1
        self.episode_exploration[-1] += self.training_strategy.ratio_noise_injected
        return new_state, is_terminal

    @staticmethod
    def _polyak_update(target_model, online_model, tau):
        """
        Blend the online weights into the target: target <- (1 - tau) * target + tau * online
        """
        for target, online in zip(target_model.parameters(), 
                                  online_model.parameters()):
            target_ratio = (1.0 - tau) * target.data
            online_ratio = tau * online.data
            mixed_weights = target_ratio + online_ratio
            target.data.copy_(mixed_weights)
    
    def update_networks(self, tau=None):
        """
        Update neural network models parameters (policy and Q-value function) via Polyak Averaging
        * We use this technique to avoid aggressive model parameters updates

        tau = Polyak averaging factor (self.tau when None; tau=1.0 copies the
              online weights into the target networks)
        target_ratio = averaging ratio
        mixed_weights = new mixed weights
        """
        # The resolved `tau` has to be used below (not self.tau), otherwise the
        # tau=1.0 hard copy done at the start of training is silently ignored
        tau = self.tau if tau is None else tau

        self._polyak_update(self.target_value_model, self.online_value_model, tau)
        self._polyak_update(self.target_policy_model, self.online_policy_model, tau)

    def train(self, make_env_fn, make_env_kargs, seed, gamma, 
              max_minutes, max_episodes, goal_mean_100_reward):
        """
        Training function and computing stats 

        make_env_fn = make environment function
        make_env_kargs = arguments for make environment function
        seed = seed for random numbers
        gamma = discount factor
        max_minutes = maximum training time
        max_episodes = maximum training episodes
        goal_mean_100_reward = target reward goal

        Returns the tuple (result, final_eval_score, training_time, wallclock_time);
        result has one row per episode with the columns (total_step, mean_100_reward,
        mean_100_eval_score, training_time, wallclock_elapsed), rows of episodes
        that were not run stay NaN
        """

        # Setup environment
        training_start, last_debug_time = time.time(), float('-inf')

        self.checkpoint_dir = tempfile.mkdtemp()
        self.make_env_fn = make_env_fn
        self.make_env_kargs = make_env_kargs
        self.seed = seed
        self.gamma = gamma
        
        env = self.make_env_fn(**self.make_env_kargs, seed=self.seed)
        torch.manual_seed(self.seed) ; np.random.seed(self.seed) ; random.seed(self.seed)
    
        # Initialize actor-critic agent
        nS, nA = env.observation_space.shape[0], env.action_space.shape[0]
        action_bounds = env.action_space.low, env.action_space.high
        self.episode_timestep = []
        self.episode_reward = []
        self.episode_seconds = []
        self.evaluation_scores = []        
        self.episode_exploration = []
        
        # Setup target and online Q-value functions
        self.target_value_model = self.value_model_fn(nS, nA)
        self.online_value_model = self.value_model_fn(nS, nA)

        # Setup target and online policy model
        self.target_policy_model = self.policy_model_fn(nS, action_bounds)
        self.online_policy_model = self.policy_model_fn(nS, action_bounds)

        # Setup optimize and update parameters functions
        # (tau=1.0 makes the target networks start as exact copies of the online ones)
        self.update_networks(tau=1.0)
        self.value_optimizer = self.value_optimizer_fn(self.online_value_model, 
                                                       self.value_optimizer_lr)        
        self.policy_optimizer = self.policy_optimizer_fn(self.online_policy_model, 
                                                         self.policy_optimizer_lr)

        # Setup replay buffer, and training/evaluation strategies
        # (the strategy factories given to the constructor are used, not notebook globals)
        self.replay_buffer = self.replay_buffer_fn()
        self.training_strategy = self.training_strategy_fn(action_bounds)
        self.evaluation_strategy = self.evaluation_strategy_fn(action_bounds)
                    
        result = np.empty((max_episodes, 5))
        result[:] = np.nan
        training_time = 0

        # Episodic interaction agent-environment 
        for episode in range(1, max_episodes + 1):
            episode_start = time.time()
            
            state, is_terminal = env.reset(), False
            self.episode_reward.append(0.0)
            self.episode_timestep.append(0.0)
            self.episode_exploration.append(0.0)

            for step in count():
                state, is_terminal = self.interaction_step(state, env)

                # Start learning only after the warm-up samples have been collected
                min_samples = self.replay_buffer.batch_size * self.n_warmup_batches
                if len(self.replay_buffer) > min_samples:
                    experiences = self.replay_buffer.sample()
                    experiences = self.online_value_model.load(experiences)
                    self.optimize_model(experiences)

                if np.sum(self.episode_timestep) % self.update_target_every_steps == 0:
                    self.update_networks()

                if is_terminal:
                    gc.collect()
                    break
            
            # save stats
            #---DO NOT TOUCH---
            episode_elapsed = time.time() - episode_start
            self.episode_seconds.append(episode_elapsed)
            training_time += episode_elapsed
            evaluation_score, _ = self.evaluate(self.online_policy_model, env)
            self.save_checkpoint(episode-1, self.online_policy_model)

            total_step = int(np.sum(self.episode_timestep))
            self.evaluation_scores.append(evaluation_score)
            
            mean_10_reward = np.mean(self.episode_reward[-10:])
            std_10_reward = np.std(self.episode_reward[-10:])
            mean_100_reward = np.mean(self.episode_reward[-100:])
            std_100_reward = np.std(self.episode_reward[-100:])
            mean_100_eval_score = np.mean(self.evaluation_scores[-100:])
            std_100_eval_score = np.std(self.evaluation_scores[-100:])
            lst_100_exp_rat = np.array(
                self.episode_exploration[-100:])/np.array(self.episode_timestep[-100:])
            mean_100_exp_rat = np.mean(lst_100_exp_rat)
            std_100_exp_rat = np.std(lst_100_exp_rat)
            
            wallclock_elapsed = time.time() - training_start
            result[episode-1] = total_step, mean_100_reward, \
                mean_100_eval_score, training_time, wallclock_elapsed
            
            reached_debug_time = time.time() - last_debug_time >= LEAVE_PRINT_EVERY_N_SECS
            reached_max_minutes = wallclock_elapsed >= max_minutes * 60
            reached_max_episodes = episode >= max_episodes
            reached_goal_mean_reward = mean_100_eval_score >= goal_mean_100_reward
            training_is_over = reached_max_minutes or \
                               reached_max_episodes or \
                               reached_goal_mean_reward
            elapsed_str = time.strftime("%H:%M:%S", time.gmtime(time.time() - training_start))
            debug_message = 'el {}, ep {:04}, ts {:07}, '
            debug_message += 'ar 10 {:05.1f}\u00B1{:05.1f}, '
            debug_message += '100 {:05.1f}\u00B1{:05.1f}, '
            debug_message += 'ex 100 {:02.1f}\u00B1{:02.1f}, '
            debug_message += 'ev {:05.1f}\u00B1{:05.1f}'
            debug_message = debug_message.format(
                elapsed_str, episode-1, total_step, mean_10_reward, std_10_reward, 
                mean_100_reward, std_100_reward, mean_100_exp_rat, std_100_exp_rat,
                mean_100_eval_score, std_100_eval_score)
            print(debug_message, end='\r', flush=True)
            if reached_debug_time or training_is_over:
                print(ERASE_LINE + debug_message, flush=True)
                last_debug_time = time.time()
            if training_is_over:
                if reached_max_minutes: print(u'--> reached_max_minutes \u2715')
                if reached_max_episodes: print(u'--> reached_max_episodes \u2715')
                if reached_goal_mean_reward: print(u'--> reached_goal_mean_reward \u2713')
                break

        # End training and save results      
        final_eval_score, score_std = self.evaluate(self.online_policy_model, env, n_episodes=100)
        wallclock_time = time.time() - training_start
        print('Training complete.')
        print('Final evaluation score {:.2f}\u00B1{:.2f} in {:.2f}s training time,'
              ' {:.2f}s wall-clock time.\n'.format(
                  final_eval_score, score_std, training_time, wallclock_time))
        env.close() ; del env
        self.get_cleaned_checkpoints()
        return result, final_eval_score, training_time, wallclock_time
    
    def evaluate(self, eval_policy_model, eval_env, n_episodes=1):
        """
        evaluate trained policy
        
        eval_policy_model = policy model to evaluate
        eval_env = environment to evaluate
        n_episodes = number of episodes to evaluate
        a = action
        s = current state
        r = reward
        d = flag for end of episode

        Returns the mean and the standard deviation of the episode returns
        """
        rs = []
        for _ in range(n_episodes):
            s, d = eval_env.reset(), False
            rs.append(0)
            for _ in count():
                a = self.evaluation_strategy.select_action(eval_policy_model, s)
                s, r, d, _ = eval_env.step(a)
                rs[-1] += r
                if d: break
        return np.mean(rs), np.std(rs)

    def get_cleaned_checkpoints(self, n_checkpoints=4):
        """
        Keep n_checkpoints evenly spaced checkpoints (first and last episode
        included) and delete the other checkpoint files

        The selection is done once; later calls return the cached dictionary
        {episode index: checkpoint path}
        """
        try: 
            return self.checkpoint_paths
        except AttributeError:
            self.checkpoint_paths = {}

        paths = glob.glob(os.path.join(self.checkpoint_dir, '*.tar'))
        # file names look like model.<episode index>.tar
        paths_dic = {int(path.split('.')[-2]):path for path in paths}
        last_ep = max(paths_dic.keys())
        # builtin int: the np.int alias was removed in NumPy 1.24
        checkpoint_idxs = np.linspace(1, last_ep+1, n_checkpoints, endpoint=True, dtype=int)-1

        for idx, path in paths_dic.items():
            if idx in checkpoint_idxs:
                self.checkpoint_paths[idx] = path
            else:
                os.unlink(path)

        return self.checkpoint_paths

    def save_checkpoint(self, episode_idx, model):
        """
        Save the state_dict of the model as model.<episode_idx>.tar in the checkpoint directory
        """
        torch.save(model.state_dict(), 
                   os.path.join(self.checkpoint_dir, 'model.{}.tar'.format(episode_idx)))

In [None]:
# DDPG training/evaluation routine
# There is a task here
# NOTE: this cell is an exercise template; it is not valid Python until every "To complete" placeholder is filled in.

# One result array per seed, plus the agent that reached the best final evaluation score
ddpg_results = []
best_agent, best_eval_score = None, float('-inf')

# Train one independent agent per seed (SEEDS is expected to come from an earlier cell)
for seed in SEEDS:
    environment_settings = {
        'env_name': 'Pendulum-v0',
        'gamma': 0.99,
        'max_minutes': 5,
        'max_episodes': 500,
        'goal_mean_100_reward': -150
    }

    # Setup DDPG agent 
    #---Policy neural network---
    policy_model_fn = lambda nS, bounds: FCDP(nS, bounds)
    # An infinite max norm means the gradient-clipping call never rescales the gradients
    policy_max_grad_norm = float('inf')

    #TODO: select an optimization algorithm for the policy neural network. For further information: https://pytorch.org/docs/stable/optim.html - Algorithms
    policy_optimizer_fn = lambda net, lr: # To complete -> Follow the format optim.Optimization_Algorithm(net.parameters(), lr=lr)

    #TODO: select a suitable learning rate for the optimization algorithm
    policy_optimizer_lr = # To complete

    #---Value function neural network---
    value_model_fn = lambda nS, nA: FCQV(nS, nA)
    value_max_grad_norm = float('inf')

    #TODO: select an optimization algorithm for the value neural network. For further information: https://pytorch.org/docs/stable/optim.html - Algorithms
    value_optimizer_fn = lambda net, lr: # To complete -> Follow the format optim.Optimization_Algorithm(net.parameters(), lr=lr)

    #TODO: select a suitable learning rate for the optimization algorithm
    value_optimizer_lr = # To complete

    # Training/evaluation strategy: noisy actions while training, greedy actions while evaluating
    training_strategy_fn = lambda bounds: NormalNoiseStrategy(bounds, exploration_noise_ratio=0.1)
    evaluation_strategy_fn = lambda bounds: GreedyStrategy(bounds)

    # Replay buffer and target-network update settings
    replay_buffer_fn = lambda: ReplayBuffer(max_size=100000, batch_size=256)
    n_warmup_batches = 5
    update_target_every_steps = 1
    tau = 0.005

    # Unpack the settings in declaration order (dicts preserve insertion order in Python 3.7+)
    env_name, gamma, max_minutes, \
    max_episodes, goal_mean_100_reward = environment_settings.values()

    # Build a fresh agent from the configuration above
    agent = DDPG(replay_buffer_fn,
                 policy_model_fn, 
                 policy_max_grad_norm, 
                 policy_optimizer_fn, 
                 policy_optimizer_lr,
                 value_model_fn, 
                 value_max_grad_norm, 
                 value_optimizer_fn, 
                 value_optimizer_lr, 
                 training_strategy_fn,
                 evaluation_strategy_fn,
                 n_warmup_batches,
                 update_target_every_steps,
                 tau)

    # Train/evaluate agent
    make_env_fn, make_env_kargs = get_make_env_fn(env_name=env_name)

    result, final_eval_score, training_time, wallclock_time = agent.train(
        make_env_fn, make_env_kargs, seed, gamma, max_minutes, max_episodes, goal_mean_100_reward)

    # Save results
    ddpg_results.append(result)

    # Remember the agent with the highest final evaluation score across seeds
    if final_eval_score > best_eval_score:
        best_eval_score = final_eval_score
        best_agent = agent

# Stack the per-seed results into a single array (first axis = seed)
ddpg_results = np.array(ddpg_results)
# BEEP is defined elsewhere in the notebook - presumably an end-of-training notification; TODO confirm
_ = BEEP()

[2Kel 00:00:12, ep 0000, ts 0000200, ar 10 -1296.7±000.0, 100 -1296.7±000.0, ex 100 0.3±0.0, ev -1391.7±000.0


In [None]:
# Save results from DDPG agent 
# ---DO NOT TOUCH---
# Reduce over the seed axis (axis 0), then transpose so each of the five result
# columns unpacks into its own per-episode series.
(ddpg_max_t, ddpg_max_r, ddpg_max_s,
 ddpg_max_sec, ddpg_max_rt) = ddpg_results.max(axis=0).T
(ddpg_min_t, ddpg_min_r, ddpg_min_s,
 ddpg_min_sec, ddpg_min_rt) = ddpg_results.min(axis=0).T
(ddpg_mean_t, ddpg_mean_r, ddpg_mean_s,
 ddpg_mean_sec, ddpg_mean_rt) = ddpg_results.mean(axis=0).T
# Shared x axis for the plots: one position per episode
ddpg_x = np.arange(len(ddpg_mean_s))

In [None]:
# Plot results
# ---DO NOT TOUCH---
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(15, 10), sharex=True, sharey=False)

# DDPG: every panel shows the min/max envelope across seeds plus the dotted mean
for panel_ax, lower, upper, mean in (
        (axs[0], ddpg_min_r, ddpg_max_r, ddpg_mean_r),
        (axs[1], ddpg_min_s, ddpg_max_s, ddpg_mean_s)):
    panel_ax.plot(upper, 'r', linewidth=1)
    panel_ax.plot(lower, 'r', linewidth=1)
    panel_ax.plot(mean, 'r:', label='DDPG', linewidth=2)
    panel_ax.fill_between(ddpg_x, lower, upper, facecolor='r', alpha=0.3)

# ALL
for panel_ax, panel_title in zip(axs, ('Moving Avg Reward (Training)',
                                       'Moving Avg Reward (Evaluation)')):
    panel_ax.set_title(panel_title)
plt.xlabel('Episodes')
axs[0].legend(loc='upper left')
plt.show()

In [None]:
# Plot results
# ---DO NOT TOUCH---
fig, axs = plt.subplots(nrows=3, ncols=1, figsize=(15, 15), sharex=True, sharey=False)

# DDPG: every panel shows the min/max envelope across seeds plus the dotted mean
for panel_ax, lower, upper, mean in (
        (axs[0], ddpg_min_t, ddpg_max_t, ddpg_mean_t),
        (axs[1], ddpg_min_sec, ddpg_max_sec, ddpg_mean_sec),
        (axs[2], ddpg_min_rt, ddpg_max_rt, ddpg_mean_rt)):
    panel_ax.plot(upper, 'r', linewidth=1)
    panel_ax.plot(lower, 'r', linewidth=1)
    panel_ax.plot(mean, 'r:', label='DDPG', linewidth=2)
    panel_ax.fill_between(ddpg_x, lower, upper, facecolor='r', alpha=0.3)

# ALL
for panel_ax, panel_title in zip(axs, ('Total Steps',
                                       'Training Time',
                                       'Wall-clock Time')):
    panel_ax.set_title(panel_title)
plt.xlabel('Episodes')
axs[0].legend(loc='upper left')
plt.show()

# TD3

In [None]:
# Noise decay strategy for exploration
# There is a task here!
class NormalNoiseDecayStrategy():
    # Exploration strategy: adds Gaussian noise to the greedy action, with a noise
    # ratio that decays linearly from init_noise_ratio to min_noise_ratio.
    def __init__(self, bounds, init_noise_ratio=0.5, min_noise_ratio=0.1, decay_steps=10000):
      """
      Initialize class

      bounds = (low, high) action bounds; the noise scale is derived from high
      init_noise_ratio = noise ratio used at the first step
      min_noise_ratio = floor that the noise ratio decays to
      decay_steps = number of steps over which the ratio decays linearly from init to min
      """
      self.t = 0  # number of ratio updates performed so far
      self.low, self.high = bounds
      self.noise_ratio = init_noise_ratio
      self.init_noise_ratio = init_noise_ratio
      self.min_noise_ratio = min_noise_ratio
      self.decay_steps = decay_steps
      self.ratio_noise_injected = 0  # refreshed by select_action on every call

    def _noise_ratio_update(self):
      """
      Compute the noise ratio for the current step and advance the step counter

      The ratio is interpolated linearly between init_noise_ratio (t = 0) and
      min_noise_ratio (t = decay_steps), then clipped to that interval so that it
      stays at min_noise_ratio once t exceeds decay_steps.

      Returns the new ratio; the caller is responsible for storing it.
      """
      noise_ratio = 1 - self.t / self.decay_steps
      noise_ratio = (self.init_noise_ratio - self.min_noise_ratio) * noise_ratio + self.min_noise_ratio
      noise_ratio = np.clip(noise_ratio, self.min_noise_ratio, self.init_noise_ratio)
      self.t += 1
      return noise_ratio

    def select_action(self, model, state, max_exploration=False):
      """
      Select noisy action for exploration

      model = policy network, called as model(state) to obtain the greedy action
      state = state in the environment
      max_exploration = if True the noise scale is the full upper action bound (self.high),
                        otherwise it is self.noise_ratio * self.high

      Returns the action produced by the TODO lines below. As side effects, the noise
      ratio is decayed by one step and self.ratio_noise_injected is updated.
      """
      if max_exploration:
          noise_scale = self.high
      else:
          noise_scale = self.noise_ratio * self.high

      # The greedy action is only used for acting, so no gradient tracking is needed
      with torch.no_grad():
          greedy_action = model(state).cpu().detach().data.numpy().squeeze()

      # One independent zero-mean Gaussian sample per action dimension
      noise = np.random.normal(loc=0, scale=noise_scale, size=len(self.high))

      #TODO: compute the TD3 action with an exploration strategy. Use the values greedy_action and noise
      noisy_action = #To complete

      #TODO use the np.clip function to clip the TD3 action between the values 'self.low' and 'self.high'
      action = # To complete. You may find more information about the clip function in https://numpy.org/doc/stable/reference/generated/numpy.clip.html

      self.noise_ratio = self._noise_ratio_update()
      # Mean absolute deviation from the greedy action, as a fraction of the action range
      self.ratio_noise_injected = np.mean(abs((greedy_action - action)/(self.high - self.low)))
      return action

In [None]:
# Plot noise decay function
# --- DO NOT TOUCH ---
# Sample the schedule for 50000 steps (default decay_steps is 10000, so the floor is visible)
s = NormalNoiseDecayStrategy(bounds=([-2], [2]))
noise_ratios = [s._noise_ratio_update() for _ in range(50000)]
plt.plot(noise_ratios)
plt.title('Normal Noise Linear ratio')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Q-value neural network class - Double Q-learning
# There is a task here!

class FCTQV(nn.Module):
    # Fully connected twin Q-value network: two independent streams (a and b) that
    # each map a (state, action) pair to one scalar Q-value.
    def __init__(self, 
                 input_dim, 
                 output_dim):
        """
        Class initialization

        input_dim = state dimension
        output_dim = action dimension (state and action are concatenated, so the
                     first layer takes input_dim + output_dim features)

        Set inside the constructor (not parameters):
        hidden_dims = dimension for hidden layers
        activation_fc = activation function applied after the input and hidden layers
        device = processing device ("cuda:0" when available, otherwise "cpu")
        """
        super(FCTQV, self).__init__()
        
        #TODO: Choose a non-linear activation function from https://pytorch.org/docs/stable/nn.html#non-linear-activations-weighted-sum-nonlinearity
        activation_fc = #To complete. Use the format ---> F.activation_function
        self.activation_fc = activation_fc

        #TODO: propose the dimensions for the hidden layers
        hidden_dims = #To complete. Use the same format (dimension_1, ..., dimension_n)

        # Initialize Q-value neural network A
        self.input_layer_a = nn.Linear(input_dim + output_dim, hidden_dims[0])
        self.hidden_layers_a = nn.ModuleList()

        for i in range(len(hidden_dims)-1):
            hidden_layer_a = nn.Linear(hidden_dims[i], hidden_dims[i+1])
            self.hidden_layers_a.append(hidden_layer_a)

        self.output_layer_a = nn.Linear(hidden_dims[-1], 1)
        
        #TODO: initialize the second Q-value neural network - Use the format nn_property_b
        # forward() reads self.input_layer_b, self.hidden_layers_b and self.output_layer_b,
        # so those three attributes must be created here.
        #---YOUR CODE GOES HERE---








        #-------------------------

        device = "cpu"
        if torch.cuda.is_available():
            device = "cuda:0"
        self.device = torch.device(device)
        self.to(self.device)

    def _format(self, state, action):
        """
        Format the state and the action for pytorch

        state = state from environment
        action = action from policy

        Inputs that are not already tensors are converted to float32 tensors on
        self.device and get a leading dimension added with unsqueeze(0); tensors
        are returned untouched.
        """
        x, u = state, action
        if not isinstance(x, torch.Tensor):
            x = torch.tensor(x, 
                             device=self.device, 
                             dtype=torch.float32)
            x = x.unsqueeze(0)
        if not isinstance(u, torch.Tensor):
            u = torch.tensor(u, 
                             device=self.device, 
                             dtype=torch.float32)
            u = u.unsqueeze(0)
        return x, u

    def forward(self, state, action):
        """
        Forward function for neural network - Q-values of both streams

        state = state from environment
        action = action from policy

        Returns the tuple (Q-value from stream a, Q-value from stream b).
        """
        x, u = self._format(state, action)
        # Both streams receive the same concatenated (state, action) features
        x = torch.cat((x, u), dim=1)

        xa = self.activation_fc(self.input_layer_a(x))
        xb = self.activation_fc(self.input_layer_b(x))

        # zip stops at the shorter list, so both streams are expected to have the same depth
        for hidden_layer_a, hidden_layer_b in zip(self.hidden_layers_a, self.hidden_layers_b):
            xa = self.activation_fc(hidden_layer_a(xa))
            xb = self.activation_fc(hidden_layer_b(xb))
        
        xa = self.output_layer_a(xa)
        xb = self.output_layer_b(xb)
        return xa, xb
    
    def Qa(self, state, action):
        """
        Forward function for stream a only - returns a single Q-value estimate

        state = state from environment
        action = action from policy
        """
        x, u = self._format(state, action)
        x = torch.cat((x, u), dim=1)
        xa = self.activation_fc(self.input_layer_a(x))
        for hidden_layer_a in self.hidden_layers_a:
            xa = self.activation_fc(hidden_layer_a(xa))
        return self.output_layer_a(xa)
    
    def load(self, experiences):
      """
      Convert a sampled batch of numpy arrays into float tensors on self.device

      experiences = 5-tuple of numpy arrays sampled from the replay buffer

      The five arrays are converted and returned in the same positional order.
      NOTE(review): the local names below do not match the stored order; the TD3 agent
      stores and unpacks (state, action, reward, next_state, is_terminal), so the
      third array presumably holds rewards and the fourth next states. This is harmless
      because only the position matters - confirm against ReplayBuffer.sample().
      """
      states, actions, new_states, rewards, is_terminals = experiences
      states = torch.from_numpy(states).float().to(self.device)
      actions = torch.from_numpy(actions).float().to(self.device)
      new_states = torch.from_numpy(new_states).float().to(self.device)
      rewards = torch.from_numpy(rewards).float().to(self.device)
      is_terminals = torch.from_numpy(is_terminals).float().to(self.device)
      return states, actions, new_states, rewards, is_terminals

In [None]:
# TD3 agent class
# There is a task here!

class TD3():
    def __init__(self, 
                 replay_buffer_fn,
                 policy_model_fn, 
                 policy_max_grad_norm, 
                 policy_optimizer_fn, 
                 policy_optimizer_lr,
                 value_model_fn, 
                 value_max_grad_norm, 
                 value_optimizer_fn, 
                 value_optimizer_lr, 
                 training_strategy_fn,
                 evaluation_strategy_fn,
                 n_warmup_batches,
                 update_value_target_every_steps,
                 update_policy_target_every_steps,
                 train_policy_every_steps,
                 tau,
                 policy_noise_ratio,
                 policy_noise_clip_ratio):
      """
      Class initialization

      replay_buffer_fn = replay buffer function

      policy_model_fn = policy neural network architecture
      policy_max_grad_norm = maximum gradient norm for policy model
      policy_optimizer_fn = optimizer for policy neural network
      policy_optimizer_lr = learning rate for policy neural network

      value_model_fn = value function neural network architecture
      value_max_grad_norm = maximum gradient norm for Q-value model
      value_optimizer_fn = optimizer for value function neural network
      value_optimizer_lr = learning rate for value function neural network

      training_strategy_fn = exploration strategy - Normal Noise Strategy
      evaluation_strategy_fn = evaluation strategy - Greedy Strategy
      n_warmup_batches = warm up batches for training
      update_value_target_every_steps = update frecuency for Q-value model
      update_policy_target_every_steps = update frecuency for policy model
      tau = Polyak averaging factor
      policy_noise_ratio = noise ratio for policy exploration
      policy_noise_clip_ratio = upper and lower bounds for actions
      """
      self.replay_buffer_fn = replay_buffer_fn

      self.policy_model_fn = policy_model_fn
      self.policy_max_grad_norm = policy_max_grad_norm
      self.policy_optimizer_fn = policy_optimizer_fn
      self.policy_optimizer_lr = policy_optimizer_lr
      
      self.value_model_fn = value_model_fn
      self.value_max_grad_norm = value_max_grad_norm
      self.value_optimizer_fn = value_optimizer_fn
      self.value_optimizer_lr = value_optimizer_lr

      self.training_strategy_fn = training_strategy_fn
      self.evaluation_strategy_fn = evaluation_strategy_fn

      self.n_warmup_batches = n_warmup_batches
      self.update_value_target_every_steps = update_value_target_every_steps
      self.update_policy_target_every_steps = update_policy_target_every_steps
      self.train_policy_every_steps = train_policy_every_steps
      
      self.tau = tau
      self.policy_noise_ratio = policy_noise_ratio
      self.policy_noise_clip_ratio = policy_noise_clip_ratio

    def optimize_model(self, experiences):
      """
      Optimize and update parameters in neural network models (Q value and policy models)

      experiences= experience buffer replay - Database
      
      argmax_a_q_sp_a = greedy policy for next state in model a
      argmax_a_q_sp_b = greedy policy for next state in model b
      max_a_q_sp = max Q value function for next state
      target_q_sa = target Q value function
      q_sa = current Q value function
      td_error_a = TD error for model a
      td_error_b = TD error for model b
      value_loss = loss function for Q value function neural network model

      argmax_a_q_s = greedy action for current state
      noisy_argmax_a_q_sp
      max_a_q_s = Q value from greedy action in current state
      policy_loss = loss function for policy neural network model

      a_ran = range of actions
      a_noise = noise for exploration with actions
      n_min = minimum value for actions
      n_max = maximum value for actions
      a_noise = clipped noise for exploration
      """
      # Get samples from replay buffer 
      states, actions, rewards, next_states, is_terminals = experiences
      batch_size = len(is_terminals)

      with torch.no_grad():
        
        # Compute noise for exploration with action
        a_ran = self.target_policy_model.env_max - self.target_policy_model.env_min
        a_noise = torch.randn_like(actions) * self.policy_noise_ratio * a_ran
        n_min = self.target_policy_model.env_min * self.policy_noise_clip_ratio
        n_max = self.target_policy_model.env_max * self.policy_noise_clip_ratio  

        #TODO: apply the clipping function to a_noise by using n_min and n_max. You may want to use torch.max and torch.min
        a_noise = #To complete

        # Compute action from policy model
        argmax_a_q_sp = self.target_policy_model(next_states)

        #TODO: use argmax_a_q_sp and a_noise to compute the noisy action
        noisy_argmax_a_q_sp = # To complete

        noisy_argmax_a_q_sp = torch.max(torch.min(noisy_argmax_a_q_sp, 
                                                  self.target_policy_model.env_max),
                                        self.target_policy_model.env_min)

        # Compute Q-value functions with models a and b 
        max_a_q_sp_a, max_a_q_sp_b = self.target_value_model(next_states, noisy_argmax_a_q_sp)

        #TODO: compute the target Q value function using max_a_q_sp_a and max_a_q_sp_b. You may want to use torch.min
        max_a_q_sp = # To complete --> Check out the agorithm in slide 32 from lecture 13

        target_q_sa = rewards + self.gamma * max_a_q_sp * (1 - is_terminals)

      # Compute current Q value function for models a and b
      q_sa_a, q_sa_b = self.online_value_model(states, actions)

      #TODO: compute TD error for neural network a with target_q_sa and q_sa_a, and q_sa_b
      td_error_a = # To complete
      td_error_b = # To complete

      #TODO: compute the value loss function to train the neural network, use td_error_a and td_error_b (check out slide 35 from lecture 13)
      value_loss = # To complete ---> Recall that we are dealing with td error samples, don't forget to compute the expectation via Monte Carlo
      
      # Backpropagation for value function neural network
      self.value_optimizer.zero_grad()
      value_loss.backward()
      torch.nn.utils.clip_grad_norm_(self.online_value_model.parameters(), 
                                      self.value_max_grad_norm)
      self.value_optimizer.step()

      if np.sum(self.episode_timestep) % self.train_policy_every_steps == 0:
        # Compute greedy action with policy model for current state
        argmax_a_q_s = self.online_policy_model(states)

        # Get Q value from greedy action and current state
        max_a_q_s = self.online_value_model.Qa(states, argmax_a_q_s)

        # Compute loss for policy model
        policy_loss = -max_a_q_s.mean()

        # Backpropagation for policy neural network
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.online_policy_model.parameters(), 
                                        self.policy_max_grad_norm)        
        self.policy_optimizer.step()

    def interaction_step(self, state, env):
      """
      agent interacts with the environment by applying action

      min_samples = database from the buffer replay - Train neural networks
      action = action from the policy model
      new_state = new state in the environment
      reward = reward from the current interaction
      is_failure = flag for terminal state
      experience = new sample tuple (state, action, reward, new_state, is_failure) for buffer replay
      """
      min_samples = self.replay_buffer.batch_size * self.n_warmup_batches
      action = self.training_strategy.select_action(self.online_policy_model, 
                                                    state, 
                                                    len(self.replay_buffer) < min_samples)
      new_state, reward, is_terminal, info = env.step(action)
      is_truncated = 'TimeLimit.truncated' in info and info['TimeLimit.truncated']
      is_failure = is_terminal and not is_truncated
      experience = (state, action, reward, new_state, float(is_failure))

      self.replay_buffer.store(experience)
      self.episode_reward[-1] += reward
      self.episode_timestep[-1] += 1
      self.episode_exploration[-1] += self.training_strategy.ratio_noise_injected
      return new_state, is_terminal

    def update_value_network(self, tau=None):
      """
      Update Q-value neural network models parameters (policy and Q-value function) via Polyak Averaging
      * We use this technique to avoid aggressive model parameters updates

      tau = Polyak averaging factor
      target_ratio = averaging ratio
      mixed_weights = new mixed weights
      """
      tau = self.tau if tau is None else tau
      for target, online in zip(self.target_value_model.parameters(), 
                                self.online_value_model.parameters()):
          target_ratio = (1.0 - self.tau) * target.data
          online_ratio = self.tau * online.data
          mixed_weights = target_ratio + online_ratio
          target.data.copy_(mixed_weights)

    def update_policy_network(self, tau=None):
      """
      Update Q-value neural network models parameters (policy and Q-value function) via Polyak Averaging
      * We use this technique to avoid aggressive model parameters updates

      tau = Polyak averaging factor
      target_ratio = averaging ratio
      mixed_weights = new mixed weights
      """
      tau = self.tau if tau is None else tau
      for target, online in zip(self.target_policy_model.parameters(), 
                                self.online_policy_model.parameters()):
          target_ratio = (1.0 - self.tau) * target.data
          online_ratio = self.tau * online.data
          mixed_weights = target_ratio + online_ratio
          target.data.copy_(mixed_weights)

    def train(self, make_env_fn, make_env_kargs, seed, gamma, 
              max_minutes, max_episodes, goal_mean_100_reward):
      """
      Training function and computing stats 

      make_env_fn = make environment function
      make_env_kargs = arguments for make environment function
      seed = seed for random numbers
      gamma = discount factor
      max_minutes = maximum training time
      max_episodes = maximum training episodes
      goal_mean_100_reward = target reward goal
      """
      # Setup environment
      training_start, last_debug_time = time.time(), float('-inf')

      self.checkpoint_dir = tempfile.mkdtemp()
      self.make_env_fn = make_env_fn
      self.make_env_kargs = make_env_kargs
      self.seed = seed
      self.gamma = gamma
      
      env = self.make_env_fn(**self.make_env_kargs, seed=self.seed)
      torch.manual_seed(self.seed) ; np.random.seed(self.seed) ; random.seed(self.seed)

      # Initialize actor-critic agent
      nS, nA = env.observation_space.shape[0], env.action_space.shape[0]
      action_bounds = env.action_space.low, env.action_space.high
      self.episode_timestep = []
      self.episode_reward = []
      self.episode_seconds = []
      self.evaluation_scores = []        
      self.episode_exploration = []
      
      # Setup target and online Q-value functions
      self.target_value_model = self.value_model_fn(nS, nA)
      self.online_value_model = self.value_model_fn(nS, nA)
      self.update_value_network(tau=1.0)

      # Setup target and online policy model
      self.target_policy_model = self.policy_model_fn(nS, action_bounds)
      self.online_policy_model = self.policy_model_fn(nS, action_bounds)
      self.update_policy_network(tau=1.0)

      # Setup optimize and update parameters functions
      self.value_optimizer = self.value_optimizer_fn(self.online_value_model, 
                                                      self.value_optimizer_lr)        
      self.policy_optimizer = self.policy_optimizer_fn(self.online_policy_model, 
                                                        self.policy_optimizer_lr)

      # Setup replay buffer, and training/evaluation strategies
      self.replay_buffer = self.replay_buffer_fn()
      self.training_strategy = training_strategy_fn(action_bounds)
      self.evaluation_strategy = evaluation_strategy_fn(action_bounds)
                  
      result = np.empty((max_episodes, 5))
      result[:] = np.nan
      training_time = 0

      # Episodic interaction agent-environment 
      for episode in range(1, max_episodes + 1):
          episode_start = time.time()
          
          state, is_terminal = env.reset(), False
          self.episode_reward.append(0.0)
          self.episode_timestep.append(0.0)
          self.episode_exploration.append(0.0)

          for step in count():
              state, is_terminal = self.interaction_step(state, env)

              min_samples = self.replay_buffer.batch_size * self.n_warmup_batches
              if len(self.replay_buffer) > min_samples:
                  experiences = self.replay_buffer.sample()
                  experiences = self.online_value_model.load(experiences)
                  self.optimize_model(experiences)

              if np.sum(self.episode_timestep) % self.update_value_target_every_steps == 0:
                  self.update_value_network()

              if np.sum(self.episode_timestep) % self.update_policy_target_every_steps == 0:
                  self.update_policy_network()

              if is_terminal:
                  gc.collect()
                  break
          
          # save stats
          #---DO NOT TOUCH---
          episode_elapsed = time.time() - episode_start
          self.episode_seconds.append(episode_elapsed)
          training_time += episode_elapsed
          evaluation_score, _ = self.evaluate(self.online_policy_model, env)
          self.save_checkpoint(episode-1, self.online_policy_model)

          total_step = int(np.sum(self.episode_timestep))
          self.evaluation_scores.append(evaluation_score)
          
          mean_10_reward = np.mean(self.episode_reward[-10:])
          std_10_reward = np.std(self.episode_reward[-10:])
          mean_100_reward = np.mean(self.episode_reward[-100:])
          std_100_reward = np.std(self.episode_reward[-100:])
          mean_100_eval_score = np.mean(self.evaluation_scores[-100:])
          std_100_eval_score = np.std(self.evaluation_scores[-100:])
          lst_100_exp_rat = np.array(
              self.episode_exploration[-100:])/np.array(self.episode_timestep[-100:])
          mean_100_exp_rat = np.mean(lst_100_exp_rat)
          std_100_exp_rat = np.std(lst_100_exp_rat)
          
          wallclock_elapsed = time.time() - training_start
          result[episode-1] = total_step, mean_100_reward, \
              mean_100_eval_score, training_time, wallclock_elapsed
          
          reached_debug_time = time.time() - last_debug_time >= LEAVE_PRINT_EVERY_N_SECS
          reached_max_minutes = wallclock_elapsed >= max_minutes * 60
          reached_max_episodes = episode >= max_episodes
          reached_goal_mean_reward = mean_100_eval_score >= goal_mean_100_reward
          training_is_over = reached_max_minutes or \
                              reached_max_episodes or \
                              reached_goal_mean_reward
          elapsed_str = time.strftime("%H:%M:%S", time.gmtime(time.time() - training_start))
          debug_message = 'el {}, ep {:04}, ts {:07}, '
          debug_message += 'ar 10 {:05.1f}\u00B1{:05.1f}, '
          debug_message += '100 {:05.1f}\u00B1{:05.1f}, '
          debug_message += 'ex 100 {:02.1f}\u00B1{:02.1f}, '
          debug_message += 'ev {:05.1f}\u00B1{:05.1f}'
          debug_message = debug_message.format(
              elapsed_str, episode-1, total_step, mean_10_reward, std_10_reward, 
              mean_100_reward, std_100_reward, mean_100_exp_rat, std_100_exp_rat,
              mean_100_eval_score, std_100_eval_score)
          print(debug_message, end='\r', flush=True)
          if reached_debug_time or training_is_over:
              print(ERASE_LINE + debug_message, flush=True)
              last_debug_time = time.time()
          if training_is_over:
              if reached_max_minutes: print(u'--> reached_max_minutes \u2715')
              if reached_max_episodes: print(u'--> reached_max_episodes \u2715')
              if reached_goal_mean_reward: print(u'--> reached_goal_mean_reward \u2713')
              break
      
      # End training and save results      
      final_eval_score, score_std = self.evaluate(self.online_policy_model, env, n_episodes=100)
      wallclock_time = time.time() - training_start
      print('Training complete.')
      print('Final evaluation score {:.2f}\u00B1{:.2f} in {:.2f}s training time,'
            ' {:.2f}s wall-clock time.\n'.format(
                final_eval_score, score_std, training_time, wallclock_time))
      env.close() ; del env
      self.get_cleaned_checkpoints()
      return result, final_eval_score, training_time, wallclock_time
    
    def evaluate(self, eval_policy_model, eval_env, n_episodes=1):
      """
      evaluate trained policy
      
      eval_policy_model = policy model to evaluate
      eval_env = environment to evaluate
      n_episodes = number of episodes to evaluate
      a = action
      s = current state
      r = reward
      d = next state
      """
      rs = []
      for _ in range(n_episodes):
          s, d = eval_env.reset(), False
          rs.append(0)
          for _ in count():
              a = self.evaluation_strategy.select_action(eval_policy_model, s)
              s, r, d, _ = eval_env.step(a)
              rs[-1] += r
              if d: break
      return np.mean(rs), np.std(rs)

    def get_cleaned_checkpoints(self, n_checkpoints=4):
      """
      clean database for saving
      """
      try: 
          return self.checkpoint_paths
      except AttributeError:
          self.checkpoint_paths = {}

      paths = glob.glob(os.path.join(self.checkpoint_dir, '*.tar'))
      paths_dic = {int(path.split('.')[-2]):path for path in paths}
      last_ep = max(paths_dic.keys())
      checkpoint_idxs = np.linspace(1, last_ep+1, n_checkpoints, endpoint=True, dtype=np.int)-1

      for idx, path in paths_dic.items():
          if idx in checkpoint_idxs:
              self.checkpoint_paths[idx] = path
          else:
              os.unlink(path)

      return self.checkpoint_paths

    def save_checkpoint(self, episode_idx, model):
      """
      Save model
      """
      torch.save(model.state_dict(), 
                  os.path.join(self.checkpoint_dir, 'model.{}.tar'.format(episode_idx)))

In [None]:
# TD3 training/evaluation routine
# There is a task here
td3_results = []
best_agent, best_eval_score = None, float('-inf')

# Prepare environment
for seed in SEEDS:
    # Run-level settings. They are unpacked by key at the end of this
    # section and forwarded to get_make_env_fn() / agent.train() further down.
    environment_settings = {
        'env_name': 'Pendulum-v0',
        'gamma': 0.99,
        'max_minutes': 5,
        'max_episodes': 500,
        'goal_mean_100_reward': -150
    }

    # Setup TD3 agent
    #---Policy neural network---
    # Factories are lambdas so that the agent can build the networks itself;
    # FCDP is defined elsewhere in this notebook.
    policy_model_fn = lambda nS, bounds: FCDP(nS, bounds)
    # An infinite max-norm means the policy gradients are effectively never clipped.
    policy_max_grad_norm = float('inf')

    #TODO: select an optimization algorithm for the policy neural network. For further information: https://pytorch.org/docs/stable/optim.html - Algorithms
    # Adam is filled in as a runnable default so the cell parses; any other
    # algorithm works as long as it follows the format
    # optim.Optimization_Algorithm(net.parameters(), lr=lr)
    policy_optimizer_fn = lambda net, lr: optim.Adam(net.parameters(), lr=lr)

    #TODO: select a suitable learning rate for the optimization algorithm
    # 3e-4 is only a starting point for Adam; tune it as part of the exercise.
    policy_optimizer_lr = 0.0003

    #---Value function neural network---
    # FCTQV is defined elsewhere in this notebook.
    value_model_fn = lambda nS, nA: FCTQV(nS, nA)
    # Same as for the policy: effectively no gradient clipping.
    value_max_grad_norm = float('inf')

    #TODO: select an optimization algorithm for the value neural network. For further information: https://pytorch.org/docs/stable/optim.html - Algorithms
    # Adam is filled in as a runnable default (see the policy optimizer above).
    value_optimizer_fn = lambda net, lr: optim.Adam(net.parameters(), lr=lr)

    #TODO: select a suitable learning rate for the optimization algorithm
    # 3e-4 is only a starting point for Adam; tune it as part of the exercise.
    value_optimizer_lr = 0.0003

    # Training/evaluation strategy
    # The exact meaning of the noise ratios and decay_steps is defined by
    # NormalNoiseDecayStrategy, which lives elsewhere in this notebook.
    training_strategy_fn = lambda bounds: NormalNoiseDecayStrategy(bounds,
                                                                   init_noise_ratio=0.5,
                                                                   min_noise_ratio=0.1,
                                                                   decay_steps=200000)
    evaluation_strategy_fn = lambda bounds: GreedyStrategy(bounds)

    replay_buffer_fn = lambda: ReplayBuffer(max_size=1000000, batch_size=256)

    # TD3 hyperparameters; they are handed positionally to the TD3
    # constructor below, which defines how each one is used.
    # NOTE(review): the *_every_steps values presumably implement TD3's
    # delayed policy/target updates and tau the soft-update mixing factor --
    # confirm against the TD3 class.
    n_warmup_batches = 5
    update_value_target_every_steps = 2
    update_policy_target_every_steps = 2
    train_policy_every_steps = 2
    policy_noise_ratio = 0.1
    policy_noise_clip_ratio = 0.5
    tau = 0.005

    # Read the run settings by key instead of unpacking .values() by
    # position, so reordering or extending environment_settings can never
    # silently bind a value to the wrong name.
    env_name = environment_settings['env_name']
    gamma = environment_settings['gamma']
    max_minutes = environment_settings['max_minutes']
    max_episodes = environment_settings['max_episodes']
    goal_mean_100_reward = environment_settings['goal_mean_100_reward']

    # Update agent
    # Build the TD3 agent from the factories and hyperparameters configured
    # above. Every argument is passed positionally, so the order here must
    # match the parameter order of TD3.__init__ (defined elsewhere in the
    # notebook) -- do not reorder these lines.
    agent = TD3(replay_buffer_fn,
                policy_model_fn, 
                policy_max_grad_norm, 
                policy_optimizer_fn, 
                policy_optimizer_lr,
                value_model_fn,
                value_max_grad_norm, 
                value_optimizer_fn, 
                value_optimizer_lr, 
                training_strategy_fn,
                evaluation_strategy_fn,
                n_warmup_batches,
                update_value_target_every_steps,
                update_policy_target_every_steps,
                train_policy_every_steps,
                tau,
                policy_noise_ratio,
                policy_noise_clip_ratio)
    # Train/evaluate agent
    # get_make_env_fn returns an environment factory plus the kwargs to call
    # it with; both are handed to agent.train.
    make_env_fn, make_env_kargs = get_make_env_fn(env_name=env_name)

    # agent.train returns four values; only `result` and `final_eval_score`
    # are used in the rest of this cell.
    result, final_eval_score, training_time, wallclock_time = agent.train(
        make_env_fn, make_env_kargs, seed, gamma, max_minutes, max_episodes, goal_mean_100_reward)
    
    # Save results
    # One entry per training run; the list is stacked into an ndarray once
    # all runs have finished.
    td3_results.append(result)

    # Keep the agent with the highest final evaluation score seen so far.
    # The comparison is strict, so on a tie the earlier agent is kept.
    if final_eval_score > best_eval_score:
        best_eval_score = final_eval_score
        best_agent = agent

# Stack the per-run results appended above into a single ndarray so that
# the following cells can reduce across runs with axis=0.
td3_results = np.array(td3_results)
# BEEP is defined elsewhere in the notebook -- presumably an end-of-training
# notification; confirm against its definition. Binding the result to `_`
# keeps the return value from being echoed as the cell's output.
_ = BEEP()

In [None]:
# Save results from TD3 agent 
# ---DO NOT TOUCH---
# Reduce td3_results over axis 0 (one entry per training run) and split the
# transposed result into its five per-episode series. The five-way unpack
# requires the last axis of each run's result to have exactly 5 columns; the
# plotting cells label them as: t = total steps, r = training reward,
# s = evaluation score, sec = training time, rt = wall-clock time.
(td3_max_t, td3_max_r, td3_max_s,
 td3_max_sec, td3_max_rt) = np.transpose(np.max(td3_results, axis=0))
(td3_min_t, td3_min_r, td3_min_s,
 td3_min_sec, td3_min_rt) = np.transpose(np.min(td3_results, axis=0))
(td3_mean_t, td3_mean_r, td3_mean_s,
 td3_mean_sec, td3_mean_rt) = np.transpose(np.mean(td3_results, axis=0))

# Shared x coordinates (one per entry of a series) for fill_between.
td3_x = np.arange(len(td3_mean_s))

In [None]:
# Plot results
# ---DO NOT TOUCH---
fig, axs = plt.subplots(2, 1, figsize=(15,10), sharey=False, sharex=True)

# TD3
# One panel per series: thin solid lines for the max and min across runs,
# a dotted line for the mean, and a shaded band between min and max.
for panel_ax, panel_title, (band_min, band_mean, band_max) in zip(
        axs,
        ('Moving Avg Reward (Training)', 'Moving Avg Reward (Evaluation)'),
        ((td3_min_r, td3_mean_r, td3_max_r),
         (td3_min_s, td3_mean_s, td3_max_s))):
    panel_ax.plot(band_max, 'b', linewidth=1)
    panel_ax.plot(band_min, 'b', linewidth=1)
    panel_ax.plot(band_mean, 'b:', label='TD3', linewidth=2)
    panel_ax.fill_between(
        td3_x, band_min, band_max, facecolor='b', alpha=0.3)
    panel_ax.set_title(panel_title)

# ALL
# plt.xlabel labels the current axes; only the top panel carries the legend.
plt.xlabel('Episodes')
axs[0].legend(loc='upper left')
plt.show()

In [None]:
# Plot results
# ---DO NOT TOUCH---
fig, axs = plt.subplots(3, 1, figsize=(15,15), sharey=False, sharex=True)

# TD3
# One panel per series: thin solid lines for the max and min across runs,
# a dotted line for the mean, and a shaded band between min and max.
for panel_ax, panel_title, (band_min, band_mean, band_max) in zip(
        axs,
        ('Total Steps', 'Training Time', 'Wall-clock Time'),
        ((td3_min_t, td3_mean_t, td3_max_t),
         (td3_min_sec, td3_mean_sec, td3_max_sec),
         (td3_min_rt, td3_mean_rt, td3_max_rt))):
    panel_ax.plot(band_max, 'b', linewidth=1)
    panel_ax.plot(band_min, 'b', linewidth=1)
    panel_ax.plot(band_mean, 'b:', label='TD3', linewidth=2)
    panel_ax.fill_between(
        td3_x, band_min, band_max, facecolor='b', alpha=0.3)
    panel_ax.set_title(panel_title)

# ALL
# plt.xlabel labels the current axes; only the top panel carries the legend.
plt.xlabel('Episodes')
axs[0].legend(loc='upper left')
plt.show()