# Deep Deterministic Policy Gradient


### General Information
* Authors: Daniil Bobrovskiy and Nicolaj Schmid
* Course: Artificial Neural Networks and Reinforcement Learning (CS-456, EPFL)
* Project: DDPG implementation to balance an inverted pendulum
* References: 
    * Code: based on OpenAI's _Gym_ environment
    * Algorithm: _Continuous control with deep reinforcement learning_ from Timothy P. Lillicrap, Jonathan J. Hunt, Alexander Pritzel, Nicolas Heess, Tom Erez, Yuval Tassa, David Silver and Daan Wierstra, 2019


### Notebook Structure
This notebook is structured in three parts:
1. __Imports__: import all libraries
2. __Classes__: DDPG implementation split into separate classes
3. __Training and Results__: run the algorithms and show the results
    * Heuristic Policy
    * Q-function
    * Minimal Implementation of DDPG
    * Target DDPG
    * Ornstein-Uhlenbeck Noise
    
Please read the report for the discussion of the results.

## Imports

In [None]:
import os
import shutil
import torch
import gym

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.animation as animation

from abc import abstractmethod
from datetime import datetime

## Classes

### Environment

In [None]:
class NormalizedEnv(gym.ActionWrapper):
    """Action wrapper that rescales actions linearly between the bounds of `action_space`.

    `action` maps a normalized action to the interval spanned by
    `action_space.low` / `action_space.high`; `reverse_action` applies the
    inverse mapping.
    """

    def action(self, action):
        """Map a normalized action onto the [low, high] interval of the action space."""
        low, high = self.action_space.low, self.action_space.high
        half_range = (high - low) / 2.
        midpoint = (high + low) / 2.
        return half_range * action + midpoint

    def reverse_action(self, action):
        """Map an action from the [low, high] interval back to the normalized scale."""
        low, high = self.action_space.low, self.action_space.high
        inv_half_range = 2. / (high - low)
        midpoint = (high + low) / 2.
        return inv_half_range * (action - midpoint)

### Replay Buffer

In [None]:
class ReplayBuffer():
    """Fixed-size ring buffer of transitions stored in pre-allocated tensors.

    Each transition consists of a state (3 values), an action, a reward, the
    next state (3 values) and a truncation flag. Once `buffer_size`
    transitions have been written, the oldest entries are overwritten.
    """

    # number of columns every field is reshaped to by numpy2tensor
    _FIELD_WIDTHS = {
        "state": 3,
        "action": 1,
        "reward": 1,
        "next_state": 3,
        "trunc": 1,
    }

    def __init__(self, buffer_size, seed):
        """Allocate storage for `buffer_size` transitions.

        Args:
            buffer_size: maximum number of transitions kept in the buffer.
            seed: seed of the numpy random generator used for sampling.
        """
        self._buffer = {
            "state" : torch.empty((buffer_size,3), dtype=torch.float32, requires_grad=False),
            "action" : torch.empty((buffer_size), dtype=torch.float32, requires_grad=False),
            "reward" : torch.empty((buffer_size), dtype=torch.float32, requires_grad=False),
            "next_state" : torch.empty((buffer_size,3), dtype=torch.float32, requires_grad=False),
            "trunc" : torch.empty((buffer_size), dtype=torch.bool, requires_grad=False)
        }
        self._rng = np.random.default_rng(seed=seed)

        self._buffer_size = buffer_size
        self._idx = 0           # position of the next write
        self._full = False      # True once the write position has wrapped around

    def addTransition(self, state, action ,reward, next_state, trunc):
        """Write one transition at the current position and advance the ring index.

        Inputs may be numpy values, Python scalars or tensors; tensors are
        detached (see numpy2tensor) so the storage never joins an autograd graph.
        """
        transition = {
            "state": state,
            "action": action,
            "reward": reward,
            "next_state": next_state,
            "trunc": trunc,
        }
        for key, value in transition.items():
            self._buffer[key][self._idx] = self.numpy2tensor(value, shape_type=key)

        # increment index and wrap around when the end of the storage is reached
        self._idx = self._idx + 1
        if self._idx >= self._buffer_size:
            self._idx = 0
            self._full = True

    def sampleBatch(self, batch_size):
        """Draw `batch_size` distinct stored transitions uniformly at random.

        Sampled transitions stay in the buffer.

        Returns:
            A dict with the keys "state", "action", "reward", "next_state" and
            "trunc", or None if fewer than `batch_size` transitions are stored.
        """
        # determine current buffer size
        current_buffer_size = self._buffer_size if self._full else self._idx

        # return None if batch_size is larger than current buffer size
        if batch_size > current_buffer_size:
            return None

        # choose random samples without repetition inside one batch
        rand_idx = self._rng.choice(current_buffer_size, size=batch_size, replace=False)
        return {
            "state" : self._buffer["state"][rand_idx,:],
            "action" : self._buffer["action"][rand_idx],
            "reward" : self._buffer["reward"][rand_idx],
            "next_state" : self._buffer["next_state"][rand_idx,:],
            "trunc" : self._buffer["trunc"][rand_idx]
        }

    def numpy2tensor(self, array, shape_type):
        """Convert `array` to a tensor of shape (-1, width) for the given field.

        Args:
            array: numpy array, Python scalar or tensor.
            shape_type: one of "state", "action", "reward", "next_state", "trunc".

        Returns:
            A tensor without autograd history. Non-tensor inputs are converted
            to float32; tensor inputs are detached and keep their dtype.

        Raises:
            ValueError: if `shape_type` is not one of the known fields.
        """
        if shape_type not in self._FIELD_WIDTHS:
            raise ValueError("Invalid shape_type: {}".format(shape_type))

        if torch.is_tensor(array):
            # detach so that writing into the buffer does not attach the
            # storage tensors to the graph that produced `array`
            tensor = array.detach()
        else:
            tensor = torch.tensor(array, dtype=torch.float32, requires_grad=False)

        return tensor.reshape((-1, self._FIELD_WIDTHS[shape_type]))

    def detachClone(self, batch):
        """Return a detached copy of every tensor in `batch` (None is passed through)."""
        if batch is not None:
            batch = {
                "state" : batch["state"].clone().detach(),
                "action" : batch["action"].clone().detach(),
                "reward" : batch["reward"].clone().detach(),
                "next_state" : batch["next_state"].clone().detach(),
                "trunc" : batch["trunc"].clone().detach()
            }
        return batch

### Noise

In [None]:
class ActionNoise():
    """Common base of the exploration-noise classes.

    Holds the noise scale, a default `theta` of 1.0 and a seeded torch
    generator. The two methods are marked abstract, but the class does not
    derive from ABC, so this is not enforced at instantiation time.
    """

    def __init__(self, sigma, seed):
        """Store the noise scale and create a generator seeded with `seed`."""
        rng = torch.Generator()
        rng.manual_seed(seed)

        self._sigma = sigma
        self.theta = 1.0
        self.generator = rng

    @abstractmethod
    def getNoisyAction(self, actions):
        """Return a noisy version of `actions`; implemented by subclasses."""
        pass

    @abstractmethod
    def reset(self):
        """Reset any internal noise state; implemented by subclasses."""
        pass

class GaussianActionNoise(ActionNoise):
    """Adds independent zero-mean Gaussian noise with standard deviation sigma."""

    def __init__(self, sigma, seed=0) -> None:
        super().__init__(sigma, seed)

    def getNoisyAction(self, actions):
        """Return `actions` plus Gaussian noise, clipped to [-1, 1]."""
        mean = torch.zeros_like(actions)
        std = torch.ones_like(actions)*self._sigma
        sample = torch.normal(mean, std, generator=self.generator)
        return torch.clip(actions + sample, -1, 1)

    def reset(self):
        """Nothing to reset: this noise keeps no state between calls."""
        pass
    

class OUActionNoise(ActionNoise):
    """Temporally correlated noise: each sample keeps (1 - theta) of the previous one."""

    def __init__(self, sigma, theta, seed=0):
        super().__init__(sigma, seed)

        self.theta = theta
        self.prev_noise = 0.0

    def getNoisyAction(self, actions):
        """Return `actions` plus the updated correlated noise, clipped to [-1, 1]."""
        fresh = torch.normal(
            torch.zeros_like(actions),
            torch.ones_like(actions)*self._sigma,
            generator=self.generator,
        )
        noise = (1-self.theta)*self.prev_noise + fresh

        # remember the noise so the next call can build on it
        self.prev_noise = noise
        return torch.clip(actions + noise, -1, 1)

    def reset(self):
        """Forget the previously generated noise."""
        self.prev_noise = 0.0

### Critic

In [None]:
class CriticNetwork(torch.nn.Module):
    """Q-network: maps a 3-value state and a scalar action to a single Q-value.

    Architecture: Linear(4, 32) -> ReLU -> Linear(32, 32) -> ReLU -> Linear(32, 1).
    """

    def __init__(self) -> None:
        super().__init__()

        self.fc1 = torch.nn.Linear(4, 32)
        self.fc2 = torch.nn.Linear(32, 32)
        self.fc3 = torch.nn.Linear(32, 1)

        self.relu = torch.nn.ReLU()

    def forward(self, states, actions):
        """Return Q-values of shape (N, 1) for N state-action pairs.

        `actions` is reshaped to one column before being appended to `states`.
        """
        hidden = torch.cat((states, actions.reshape(-1,1)), dim=1)
        for layer in (self.fc1, self.fc2):
            hidden = self.relu(layer(hidden))
        return self.fc3(hidden)
    

class Critic():
    """Q-function learner with an online network and a target network.

    With `tau == 1.0` the target network is not used: bootstrapping is done
    with the online networks and no target update is performed. Otherwise the
    target network tracks the online network with rate `tau`.
    """

    def __init__(self, gamma, lr, tau):
        """Create the networks and the optimizer.

        Args:
            gamma: discount factor applied to the next Q-value.
            lr: learning rate of the Adam optimizer.
            tau: soft-update rate of the target network (1.0 disables it).
        """
        # hyperparameters
        self.gamma = gamma
        self.lr = lr
        self.tau = tau

        # initialize critic network
        self.critic_net = CriticNetwork()
        self.actor = None

        # initialize critic optimizer
        self.critic_optimizer = torch.optim.Adam(self.critic_net.parameters(), lr=self.lr)

        # initialize target network as an exact copy of the online network,
        # so that early bootstrap targets are not produced by unrelated weights
        self.critic_target_net = CriticNetwork()
        self.critic_target_net.load_state_dict(self.critic_net.state_dict())

        # logging values
        self.log_losses = []

    def saveModels(self, path):
        """Save the online and target networks into the directory `path`."""
        torch.save(self.critic_net, os.path.join(path, "critic_net.pt"))
        torch.save(self.critic_target_net, os.path.join(path, "critic_target_net.pt"))

    def computeQValues(self, states, actions, target=False):
        """Evaluate the target network (`target=True`) or the online network."""
        if target:
            return self.critic_target_net.forward(states=states, actions=actions)
        else:
            return self.critic_net.forward(states=states, actions=actions)

    def trainStep(self, batch, actor):
        """Run one gradient step on `batch` and update the target network.

        If `batch` is None (replay buffer not yet large enough), no training
        happens and a loss of 0 is logged so the log keeps one entry per step.
        """
        # do not train if replay buffer is not large enough
        if batch is None:
            self.log_losses.append(0)
            return

        # gradient descent step for critic network
        self.critic_optimizer.zero_grad()
        critic_loss = self._computeCriticsLoss(batch=batch, actor=actor)
        critic_loss.backward()
        self.critic_optimizer.step()

        # log loss
        self.log_losses.append(critic_loss.item())

        # update target network
        self._updateTargetNetworks()

    def _computeCriticsLoss(self, batch, actor):
        """Return half the mean squared error between Q(s, a) and the bootstrap target."""
        # the target is treated as a constant: no gradients flow through it
        with torch.no_grad():
            # without a target network (tau == 1.0) bootstrap from the online networks
            use_target = self.tau != 1.0
            next_actions = actor.computeActions(states=batch["next_state"], target=use_target, deterministic=True)
            next_q_values = self.computeQValues(states=batch["next_state"], actions=next_actions, target=use_target)

            # set next Q values to 0 where the transition is flagged as truncated
            next_q_values = torch.where(batch["trunc"].reshape_as(next_q_values), 0, next_q_values)

            # calculate target
            targets = batch["reward"].reshape_as(next_q_values) + self.gamma * next_q_values

        # Q-values of the stored state-action pairs
        q_values = self.computeQValues(states=batch["state"], actions=batch["action"], target=False)

        return 0.5 * torch.pow(q_values - targets, 2).mean()

    def _updateTargetNetworks(self):
        """Move the target network towards the online network by a factor `tau`."""
        if self.tau == 1.0:
            return

        with torch.no_grad():
            critic_net_dict = self.critic_net.state_dict()
            critic_target_net_dict = self.critic_target_net.state_dict()
            for key in critic_target_net_dict:
                critic_target_net_dict[key] = self.tau * critic_net_dict[key] + (1-self.tau) * critic_target_net_dict[key]
            self.critic_target_net.load_state_dict(critic_target_net_dict)

### Actor

In [None]:
class ActorNetwork(torch.nn.Module):
    """Policy network: maps a 3-value state to one action in (-1, 1).

    Architecture: Linear(3, 32) -> ReLU -> Linear(32, 32) -> ReLU -> Linear(32, 1) -> Tanh.
    """

    def __init__(self) -> None:
        super().__init__()

        self.fc1 = torch.nn.Linear(3, 32)
        self.fc2 = torch.nn.Linear(32, 32)
        self.fc3 = torch.nn.Linear(32, 1)

        self.relu = torch.nn.ReLU()
        self.tanh = torch.nn.Tanh()

    def forward(self, states):
        """Return actions of shape (N, 1) for a batch of N states."""
        hidden = states
        for layer in (self.fc1, self.fc2):
            hidden = self.relu(layer(hidden))
        # tanh bounds the output to the normalized action range
        return self.tanh(self.fc3(hidden))
    
    
class RandomActor():
    """Actor that ignores the state and draws actions uniformly from [-1, 1)."""

    def __init__(self):
        pass

    def computeActions(self, states, target=False, deterministic=True):
        """Return one random action per row of `states`, shape (N, 1).

        Raises:
            ValueError: if a target network or noisy actions are requested.
        """
        if not deterministic or target:
            raise ValueError("Random actor does not have a target network or noise")

        # torch.rand samples from [0, 1); rescale to [-1, 1)
        uniform01 = torch.rand((states.shape[0],1))
        return 2*uniform01 - 1
    

class HeuristicActor():
    """Rule-based actor that applies a torque of fixed magnitude.

    The sign of the action is the negated product of the signs of the first
    and third state components. The class offers the same methods as the
    learned actor so it can be used in its place.
    """

    def __init__(self, const_torque):
        """Store the torque magnitude.

        Raises:
            ValueError: if `const_torque` lies outside [0, 1].
        """
        if const_torque < 0 or const_torque > 1:
            raise ValueError("Constant torque must be between 0 and 1")
        self.const_torque = const_torque

        self.log_losses = []

    def saveModels(self, path):
        """Nothing to save: the heuristic has no parameters."""
        pass

    def computeActions(self, states, target=False, deterministic=True):
        """Return the heuristic action for every row of `states`, shape (N, 1).

        `target` and `deterministic` are accepted but ignored.
        """
        direction = -torch.sign(states[:,0]) * torch.sign(states[:,2])

        actions = torch.empty((states.shape[0], 1))
        actions[:,0] = direction * self.const_torque
        return actions

    def trainStep(self, batch, critic):
        """No training; log a loss of 0 to keep one entry per step."""
        self.log_losses.append(0)


class Actor():
    """Deterministic policy learner with an online network and a target network.

    With `tau == 1.0` no target update is performed; otherwise the target
    network tracks the online network with rate `tau`.
    """

    def __init__(self, lr, tau, noise:ActionNoise):
        """Create the networks and the optimizer.

        Args:
            lr: learning rate of the Adam optimizer.
            tau: soft-update rate of the target network (1.0 disables it).
            noise: noise object applied when non-deterministic actions are requested.
        """
        # hyperparameters
        self.lr = lr
        self.tau = tau

        self.noise = noise

        # initialize actor network
        self.actor_net = ActorNetwork()

        # initialize actor optimizer
        self.actor_optimizer = torch.optim.Adam(self.actor_net.parameters(), lr=self.lr)

        # initialize target network as an exact copy of the online network
        self.actor_target_net = ActorNetwork()
        self.actor_target_net.load_state_dict(self.actor_net.state_dict())

        # logging values
        self.log_losses = []

    def saveModels(self, path):
        """Save the online and target networks into the directory `path`."""
        torch.save(self.actor_net, os.path.join(path, "actor_net.pt"))
        torch.save(self.actor_target_net, os.path.join(path, "actor_target_net.pt"))

    def computeActions(self, states, target=False, deterministic=True):
        """Return the policy's actions for `states`.

        Args:
            states: batch of states fed to the network.
            target: evaluate the target network instead of the online network.
            deterministic: if False, pass the actions through the noise object.
        """
        if target:
            actions = self.actor_target_net.forward(states=states)
        else:
            actions = self.actor_net.forward(states=states)

        if not deterministic:
            # getNoisyAction already returns action + noise (clipped), so its
            # result replaces the action instead of being added to it again
            actions = self.noise.getNoisyAction(actions=actions)
        return actions

    def trainStep(self, batch:dict, critic):
        """Run one gradient step on `batch` and update the target network.

        If `batch` is None (replay buffer not yet large enough), no training
        happens and a loss of 0 is logged so the log keeps one entry per step.
        """
        # do not train if replay buffer is not large enough
        if batch is None:
            self.log_losses.append(0)
            return

        # freeze critic network to avoid unnecessary computations of gradients
        for p in critic.critic_net.parameters():
            p.requires_grad = False

        try:
            # gradient descent step for actor network
            self.actor_optimizer.zero_grad()
            actor_loss = self._computeActorLoss(batch=batch, critic=critic)
            actor_loss.backward()
            self.actor_optimizer.step()
            self.log_losses.append(actor_loss.item())
        finally:
            # unfreeze critic network even if the step above failed
            for p in critic.critic_net.parameters():
                p.requires_grad = True

        # update target network
        self._updateTargetNetworks()

    def _computeActorLoss(self, batch, critic):
        """Return the negated mean Q-value of the policy's actions on the batch states."""
        # estimate action from state
        actions = self.computeActions(states=batch['state'], target=False, deterministic=True)

        # calculate q values for state-action pairs
        q_values = critic.computeQValues(states=batch['state'], actions=actions, target=False)

        # minimizing the negative Q-value maximizes the Q-value
        return - q_values.mean()

    def _updateTargetNetworks(self):
        """Move the target network towards the online network by a factor `tau`."""
        if self.tau == 1.0:
            return

        with torch.no_grad():
            actor_net_dict = self.actor_net.state_dict()
            actor_target_net_dict = self.actor_target_net.state_dict()
            for key in actor_target_net_dict:
                actor_target_net_dict[key] = self.tau * actor_net_dict[key] + (1-self.tau) * actor_target_net_dict[key]
            self.actor_target_net.load_state_dict(actor_target_net_dict)

### Simulation

In [None]:
class Simulation():
    """Couples environment, critic, actor and replay buffer for evaluation and training.

    Every episode is expected to be truncated by the environment after
    exactly 200 steps; this is asserted while running and again when plotting.
    """

    def __init__(
            self,
            env:NormalizedEnv, 
            critic:Critic,
            actor:Actor,
            buffer:ReplayBuffer,
        ) -> None:

        self.env = env
        self.critic = critic
        self.actor = actor
        self.buffer = buffer

    def run(self, num_episodes, render, plot):
        """Evaluate the actor with deterministic actions; no training is done.

        Args:
            num_episodes: number of episodes to run.
            render: if True, collect rendered frames and show them as an animation.
            plot: if True, plot the rewards at the end.

        Returns:
            (step_rewards, cum_rewards): the reward of every step and the
            summed reward of every episode.
        """
        # create figure for rendering
        if render:
            fig = plt.figure()
            frames = []

        # run episodes
        step_rewards = []
        cum_rewards = []
        for i in range(num_episodes):
            # print testing progress
            if i%10 == 0:
                print(f"Testing episode: {i}/{num_episodes}")

            # remember where the rewards of this episode start
            episode_start = len(step_rewards)

            # reset environment; the first element of the returned tuple is the state
            state = self.buffer.numpy2tensor(self.env.reset()[0], shape_type="state")
            while True:
                # take action and update environment (no gradients needed for a rollout)
                with torch.no_grad():
                    action = self.actor.computeActions(states=state, target=False, deterministic=True)
                next_state, reward, term, trunc, info = self.env.step(action=action.detach().numpy().flatten())
                state = self.buffer.numpy2tensor(next_state, shape_type="state")

                # log reward
                step_rewards.append(reward)

                # render environment
                if render:
                    env_screen = self.env.render()
                    frames.append([plt.imshow(env_screen)])

                # check if episode is truncated
                if trunc:
                    assert len(step_rewards) % 200 == 0 # verify that episode length is 200
                    break

            # log cumulative reward of this episode
            cum_rewards.append(np.sum(step_rewards[episode_start:]))

        # show animation of environment
        if render:
            ani = animation.ArtistAnimation(fig, frames, interval=50, blit=True, repeat_delay=1000)
            plt.show()

        # plot rewards
        if plot:
            self._plotReward(step_rewards=step_rewards, cum_rewards=cum_rewards, title="reward_testing")

        return step_rewards, cum_rewards

    def train(self, num_episodes, batch_size):
        """Train critic and actor on transitions collected with noisy actions.

        A polar heat map of the critic is plotted before and after training,
        together with the reward and loss curves.

        Args:
            num_episodes: number of training episodes.
            batch_size: number of transitions sampled for every training step.

        Returns:
            (step_rewards, cum_rewards): the reward of every step and the
            summed reward of every episode.
        """
        # plot heat map before training
        # (this class defines only the polar heat map, so that is the one plotted)
        self._plotPolarHeatMap(title="polar_heatmap_before_training")

        # run episodes
        step_rewards = []
        cum_rewards = []
        for i in range(num_episodes):
            # print training progress
            if i%10 == 0:
                print(f"Training episode: {i}/{num_episodes}")

            # remember where the rewards of this episode start
            episode_start = len(step_rewards)

            # reset environment; the first element of the returned tuple is the state
            state = self.buffer.numpy2tensor(self.env.reset()[0], shape_type="state")
            if hasattr(self.actor, "noise"):
                self.actor.noise.reset()
            while True:
                # take action and update environment; the rollout itself is never
                # differentiated, so no autograd graph is recorded for it
                with torch.no_grad():
                    action = self.actor.computeActions(states=state, target=False, deterministic=False)
                next_state, reward, term, trunc, info = self.env.step(action=action.detach().numpy().flatten())

                # add transition to replay buffer
                self.buffer.addTransition(state=state, action=action.detach(), reward=reward, next_state=next_state, trunc=trunc)
                state = self.buffer.numpy2tensor(next_state, shape_type="state")

                # train Q and policy networks (both skip the step while the batch is None)
                batch = self.buffer.sampleBatch(batch_size=batch_size)
                self.critic.trainStep(batch=self.buffer.detachClone(batch), actor=self.actor)
                self.actor.trainStep(batch=self.buffer.detachClone(batch), critic=self.critic)

                # log reward
                step_rewards.append(reward)

                # check if episode is truncated
                if trunc:
                    assert len(step_rewards) % 200 == 0 # verify that episode length is 200
                    break

            # log cumulative reward of this episode
            cum_rewards.append(np.sum(step_rewards[episode_start:]))

        # plot rewards, losses and heat maps
        self._plotReward(step_rewards=step_rewards, cum_rewards=cum_rewards, title="reward_training")    
        self._plotLosses(critic_losses=self.critic.log_losses, actor_losses=self.actor.log_losses)
        self._plotPolarHeatMap()

        return step_rewards, cum_rewards

    def _plotReward(self, step_rewards, cum_rewards, title="reward"):
        """Plot the cumulative reward and the mean / 5-95 percentile step reward per episode.

        `title` is accepted but currently not used.
        """
        # assure that episode length is 200
        assert len(step_rewards) % 200 == 0

        # split the step rewards into episodes of 200 steps and summarize each one
        episodes = [np.array(step_rewards[k:k+200]) for k in range(0, len(step_rewards), 200)]
        episode_mean = [ep.mean() for ep in episodes]
        episode_per5 = [np.percentile(ep, 5) for ep in episodes]
        episode_per95 = [np.percentile(ep, 95) for ep in episodes]

        fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(10, 10))
        fig.suptitle("Rewards per episode")

        axs[0].plot(range(len(episode_mean)), cum_rewards, label="Cummulative", color="red")
        axs[0].set_xlabel("Episode")
        axs[0].set_ylabel("Reward")
        axs[0].legend()
        axs[0].set_title(f"Cummulative reward per episode (avg={np.round(np.mean(cum_rewards), 3)})")

        axs[1].plot(range(len(episode_mean)), episode_mean, label="Mean", color="red")
        axs[1].fill_between(x=range(len(episode_mean)), y1=episode_per5, y2=episode_per95, alpha=0.2, color="blue", label="Percentile 5-95%")
        axs[1].set_xlabel("Episode")
        axs[1].set_ylabel("Reward")
        axs[1].legend()
        axs[1].set_title(f"Mean reward per episode (avg={np.round(np.mean(episode_mean), 3)})")

        plt.show()

    def _plotLosses(self, critic_losses, actor_losses):
        """Plot the per-episode mean of the logged critic and actor losses."""
        # assure that episode length is 200
        assert len(critic_losses) % 200 == 0 and len(critic_losses) == len(actor_losses)

        # average the losses over the 200 steps of every episode
        starts = range(0, len(critic_losses), 200)
        c_losses = [np.mean(critic_losses[k:k+200]) for k in starts]
        a_losses = [np.mean(actor_losses[k:k+200]) for k in starts]

        fig = plt.figure()
        plt.plot(c_losses, label="Critic Loss", color="green")
        plt.plot(a_losses, label="Actor Loss", color="blue")
        plt.title("Losses")
        plt.xlabel("Episode")
        plt.ylabel("Loss")
        plt.legend()
        plt.show()

    def _plotPolarHeatMap(self, title="polar_heatmap"):
        """Plot the critic's Q-values over the angle for two velocities and three torques.

        The radial axis carries no information: the same Q-value is drawn for
        every radius of a given angle. `title` is accepted but currently not used.
        """
        res_angle = 360
        res_radial = 10

        radius = np.linspace(0, 1, res_radial)
        angle = np.linspace(-np.pi, np.pi, res_angle)

        r, a = np.meshgrid(radius, angle)
        cos_a = torch.cos(torch.tensor(a, dtype=torch.float32)).reshape(-1,1)
        sin_a = torch.sin(torch.tensor(a, dtype=torch.float32)).reshape(-1,1)   

        velocities = [0, 2.5]
        torques = [-1, 0, 1]
        q_values = []
        # velocity-major order: entry idx belongs to velocities[idx // 3], torques[idx % 3]
        with torch.no_grad():
            for v in velocities:
                for torque in torques:
                    vel = v * torch.ones_like(cos_a, dtype=torch.float32)
                    states = torch.cat((cos_a, sin_a, vel), dim=1)
                    actions = torque * torch.ones_like(cos_a, dtype=torch.float32)
                    q_val = self.critic.computeQValues(states=states, actions=actions.reshape(-1,1), target=False)

                    q_val = q_val.detach().numpy().reshape(res_angle, res_radial)
                    q_values.append(q_val)

        # common colour scale for all subplots
        q_val_max = np.max(q_values)
        q_val_min = np.min(q_values)

        fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(12,7), subplot_kw={'projection':"polar"})      
        for idx, q_val in enumerate(q_values):
            i = idx // len(torques)
            j = idx % len(torques)

            cb = axs[i,j].pcolormesh(a, r, q_val, vmin=q_val_min, vmax=q_val_max)
            axs[i,j].plot(angle, r, color='k', ls='none') 
            fig.colorbar(cb, ax=axs[i,j])
            axs[i,j].set_yticks([],[])
            axs[i,j].set_theta_offset(np.pi/2)
            axs[i,j].set_theta_direction(-1)
            # label each panel with its own velocity, not the last one of the loop above
            axs[i,j].set_title(f"Torque={2*torques[j]}Nm, vel={velocities[i]}m/s")

        plt.show()

## Training and Results

### Chapter 3 - Heuristic Policy

#### Random Agent

In [None]:
# baseline: evaluate an agent that picks random actions (no critic involved)
env = NormalizedEnv(env=gym.make("Pendulum-v1", render_mode="rgb_array"))
critic = None
actor = RandomActor()
buffer = ReplayBuffer(buffer_size=100000, seed=1)

# evaluate for 10 episodes and plot the rewards
simu = Simulation(env=env, critic=critic, actor=actor, buffer=buffer)
step_rewards, cum_rewards = simu.run(num_episodes=10, render=False, plot=True)

print(f"Mean cummulative reward: {np.mean(cum_rewards)}, std: {np.std(cum_rewards)}")

#### Heuristic Agent

In [None]:
# torque magnitudes (in normalized action units) to evaluate
torques = np.linspace(0, 1, 11)

# environment and buffer are shared by all runs; the heuristic needs no critic
env = NormalizedEnv(env=gym.make("Pendulum-v1", render_mode="rgb_array"))
critic = None
buffer = ReplayBuffer(buffer_size=100000, seed=1)

# mean / std of the cumulative and of the per-step rewards, one entry per torque
cum_sums, cum_stds = [], []
step_sums, step_stds = [], []
for torque in torques:
    # evaluate the heuristic with this torque for 10 episodes
    actor = HeuristicActor(const_torque=torque)
    simu = Simulation(env=env, critic=critic, actor=actor, buffer=buffer)
    step_rewards, cum_rewards = simu.run(num_episodes=10, render=False, plot=False)

    cum_sums.append(np.mean(cum_rewards))
    cum_stds.append(np.std(cum_rewards))
    step_sums.append(np.mean(step_rewards))
    step_stds.append(np.std(step_rewards))

# transform torques in action space [0, 1] to torque space [0, 2]
torques = torques * 2

# plot both statistics side by side
fig, axs = plt.subplots(ncols=2, nrows=1, figsize=(10, 5))
panels = [
    (cum_sums, cum_stds, "Cummulative reward"),
    (step_sums, step_stds, "Average reward"),
]
for ax, (means, stds, panel_title) in zip(axs, panels):
    ax.errorbar(x=torques, y=means, yerr=stds, ecolor="red", label="Mean and std")
    ax.set_xlabel("Constant Torque")
    ax.set_ylabel("Reward")
    ax.legend()
    ax.set_title(panel_title)

plt.show()

### Chapter 4 - Q-function

In [None]:
# learn the Q-function of the heuristic policy: the critic is trained without
# a target network (tau=1.0) while the actor stays the fixed heuristic
env = NormalizedEnv(env=gym.make("Pendulum-v1", render_mode="rgb_array"))
critic = Critic(gamma=0.99, lr=1e-4, tau=1.0)
actor = HeuristicActor(const_torque=1.0)
buffer = ReplayBuffer(buffer_size=10000, seed=1)

# train for 1000 episodes
simu = Simulation(env=env, critic=critic, actor=actor, buffer=buffer)
simu.train(num_episodes=1000, batch_size=128)

### Chapter 5 - Minimal Implementation of DDPG

In [None]:
# minimal DDPG: learned critic and actor without target networks (tau=1.0),
# explored with Gaussian action noise
env = NormalizedEnv(env=gym.make("Pendulum-v1", render_mode="rgb_array"))
critic = Critic(gamma=0.99, lr=1e-4, tau=1.0)
action_noise = GaussianActionNoise(sigma=0.3, seed=0)
actor = Actor(lr=1e-4, tau=1.0, noise=action_noise)
buffer = ReplayBuffer(buffer_size=100000, seed=1)

# train for 1000 episodes, then evaluate for 100 episodes
simu = Simulation(env=env, critic=critic, actor=actor, buffer=buffer)
simu.train(num_episodes=1000, batch_size=128)
simu.run(num_episodes=100, render=False, plot=True)

### Chapter 6 - Target DDPG

In [None]:
# soft-update rate shared by the critic and actor target networks
tau = 0.01

# DDPG with target networks and Gaussian action noise
env = NormalizedEnv(env=gym.make("Pendulum-v1", render_mode="rgb_array"))
critic = Critic(gamma=0.99, lr=1e-4, tau=tau)
action_noise = GaussianActionNoise(sigma=0.3, seed=0)
actor = Actor(lr=1e-4, tau=tau, noise=action_noise)
buffer = ReplayBuffer(buffer_size=100000, seed=1)

# train for 1000 episodes, then evaluate for 100 episodes
simu = Simulation(env=env, critic=critic, actor=actor, buffer=buffer)
simu.train(num_episodes=1000, batch_size=128)
simu.run(num_episodes=100, render=False, plot=True)

### Chapter 7 - Ornstein-Uhlenbeck Noise

In [None]:
# OU noise parameter: each noise sample keeps (1 - theta) of the previous one
theta = 0.0

# DDPG with target networks (tau=0.01) and Ornstein-Uhlenbeck action noise
env = NormalizedEnv(env=gym.make("Pendulum-v1", render_mode="rgb_array"))
critic = Critic(gamma=0.99, lr=1e-4, tau=0.01)
action_noise = OUActionNoise(sigma=0.3, theta=theta, seed=0)
actor = Actor(lr=1e-4, tau=0.01, noise=action_noise)
buffer = ReplayBuffer(buffer_size=100000, seed=1)

# train for 1000 episodes, then evaluate for 100 episodes
simu = Simulation(env=env, critic=critic, actor=actor, buffer=buffer)
simu.train(num_episodes=1000, batch_size=128)
simu.run(num_episodes=100, render=False, plot=True)