# Simplified DDPG in PyTorch Lightning
### AJ Zerouali, 2023/06/21

Goals of this notebook:
* Modify pl-bolts' implementation of SAC to get a simplified DDPG.
* Reduce the number of imports for the *nn.LightningModule* implementation (e.g. get rid of the agent class, as well as the nets).
* Replace the algo's *env* attribute by *test_env* passed as a parameter.
* Wrap all this in a general agent class that has a *train()* method that creates a *Trainer* attribute. Instead of having the agent as an attribute of the *LightningModule*, make an agent that has the *LightningModule* as an attribute. 


In [1]:
import argparse
from typing import Dict, List, Tuple

import numpy as np
import torch
from pytorch_lightning import LightningModule, Trainer, seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint
from torch import Tensor, optim
from torch.nn import functional as F
from torch.optim.optimizer import Optimizer
from torch.utils.data import DataLoader


Temporary imports:

In [2]:
from RL_lightning_bolts_template.pl_bolts_replay_buffers import Experience, ExperienceSourceDataset, MultiStepBuffer
from RL_lightning_bolts_template.pl_bolts_agents import SoftActorCriticAgent
from RL_lightning_bolts_template.pl_bolts_nets import MLP, ContinuousMLP



## 1) Replay Buffer

## 2) Neural nets

In [None]:
import math
from typing import Tuple

import numpy as np
import torch
from torch import FloatTensor, Tensor, nn
from torch.distributions import Categorical, Normal, MultivariateNormal
from torch.nn import functional as F

In [None]:
'''
Source:

https://github.com/Lightning-Universe/lightning-bolts/blob/0.5.0/pl_bolts/models/rl/common/distributions.py
'''
class TanhMultivariateNormal(torch.distributions.MultivariateNormal):
    """The distribution of X is an affine of tanh applied on a normal distribution.
    X = action_scale * tanh(Z) + action_bias
    Z ~ Normal(mean, variance)
    
    AJ Zerouali, 23/06/21: They forgot about the devices
    """

    def __init__(self, action_bias, action_scale, **kwargs):
        super().__init__(**kwargs)

        self.action_bias = action_bias
        self.action_scale = action_scale

    def rsample_with_z(self, sample_shape=torch.Size()):
        """Samples X using reparametrization trick with the intermediate variable Z.
        Returns:
            Sampled X and Z
        """
        z = super().rsample()
        '''
        # DEBUG
        print(f"z.device = {z.device}")
        print(f"type(z) = {type(z)}")
        print(f"self.action_scale.device = {self.action_scale.device}")
        print(f"self.action_bias.device = {self.action_bias.device}")
        #print(f"next(self.parameters()).is_cuda = {next(self.parameters()).is_cuda}")
        '''
        
        action_scale = torch.Tensor(self.action_scale).to(z.device)
        action_bias = torch.Tensor(self.action_bias).to(z.device)
        
        output = (action_scale * torch.tanh(z) + action_bias, z)
        
        return output

    def log_prob_with_z(self, value, z):
        """Computes the log probability of a sampled X.
        Refer to the original paper of SAC for more details in equation (20), (21)
        Args:
            value: the value of X
            z: the value of Z
        Returns:
            Log probability of the sample
        """
        action_scale = torch.Tensor(self.action_scale).to(z.device)
        action_bias = torch.Tensor(self.action_bias).to(z.device)
        
        value = (value - action_bias) / action_scale
        z_logprob = super().log_prob(z)
        correction = torch.log(action_scale * (1 - value ** 2) + 1e-7).sum(1)
        return z_logprob - correction

    def rsample_and_log_prob(self, sample_shape=torch.Size()):
        """Samples X and computes the log probability of the sample.
        Returns:
            Sampled X and log probability
        """
        
        z = super().rsample()
        z_logprob = super().log_prob(z)
        value = torch.tanh(z)
        
        action_scale = torch.Tensor(self.action_scale).to(z.device)
        action_bias = torch.Tensor(self.action_bias).to(z.device)        
        
        correction = torch.log(action_scale * (1 - value ** 2) + 1e-7).sum(1)
        return action_scale * value + action_bias, z_logprob - correction

    def rsample(self, sample_shape=torch.Size()):
        fz, z = self.rsample_with_z(sample_shape)
        return fz

    def log_prob(self, value):
        
        action_scale = torch.Tensor(self.action_scale).to(value.device)
        action_bias = torch.Tensor(self.action_bias).to(value.device)
        
        value = (value - action_bias) / action_scale
        z = torch.log(1 + value) / 2 - torch.log(1 - value) / 2
        return self.log_prob_with_z(value, z)
  

In [None]:
  
'''
Source:
 
https://github.com/Lightning-Universe/lightning-bolts/blob/0.5.0/pl_bolts/models/rl/common/networks.py
'''
class MLP(nn.Module):
    """Simple MLP network."""

    def __init__(self, input_shape: Tuple[int], n_actions: int, hidden_size: int = 128):
        """
        Args:
            input_shape: observation shape of the environment
            n_actions: number of discrete actions available in the environment
            hidden_size: size of hidden layers
        """
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_shape[0], hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions),
        )

    def forward(self, input_x):
        """Forward pass through network.
        Args:
            x: input to network
        Returns:
            output of network
        """
        return self.net(input_x.float())

class ContinuousMLP(nn.Module):
    """MLP network that outputs continuous value via Gaussian distribution."""

    def __init__(
        self,
        input_shape: Tuple[int],
        n_actions: int,
        hidden_size: int = 128,
        action_bias: int = 0,
        action_scale: int = 1,
    ):
        """
        Args:
            input_shape: observation shape of the environment
            n_actions: dimension of actions in the environment
            hidden_size: size of hidden layers
            action_bias: the center of the action space
            action_scale: the scale of the action space
        """
        super().__init__()
        self.action_bias = action_bias
        self.action_scale = action_scale

        self.shared_net = nn.Sequential(
            nn.Linear(input_shape[0], hidden_size), nn.ReLU(), nn.Linear(hidden_size, hidden_size), nn.ReLU()
        )
        self.mean_layer = nn.Linear(hidden_size, n_actions)
        self.logstd_layer = nn.Linear(hidden_size, n_actions)

    def forward(self, x: FloatTensor) -> TanhMultivariateNormal:
        """Forward pass through network. Calculates the action distribution.
        Args:
            x: input to network
        Returns:
            action distribution
        """
        # DEBUG
        #print(f"x.device = {x.device}")
        #print(f"next(self.parameters()).is_cuda = {next(self.parameters()).is_cuda}")
        
        x = self.shared_net(x.float())
        batch_mean = self.mean_layer(x)
        logstd = torch.clamp(self.logstd_layer(x), -20, 2)
        batch_scale_tril = torch.diag_embed(torch.exp(logstd))
        output = TanhMultivariateNormal(action_bias=self.action_bias, 
                                        action_scale=self.action_scale, 
                                        loc=batch_mean, 
                                        scale_tril=batch_scale_tril,)
        return output

    def get_action(self, x: FloatTensor) -> Tensor:
        """Get the action greedily (without sampling)
        Args:
            x: input to network
        Returns:
            mean action
        """
        x = self.shared_net(x.float())
        batch_mean = self.mean_layer(x)
        return self.action_scale * torch.tanh(batch_mean) + self.action_bias

## 3) Lightning Module

In [3]:
import gym

In [11]:
class DDPGS(LightningModule):
    def __init__(
        self,
        env: gym.Env, # Initially a str
        eps_start: float = 1.0,
        eps_end: float = 0.02,
        eps_last_frame: int = 150000,
        sync_rate: int = 1,
        gamma: float = 0.99,
        policy_learning_rate: float = 3e-4,
        q_learning_rate: float = 3e-4,
        target_alpha: float = 5e-3,
        batch_size: int = 128,
        replay_size: int = 1000000,
        warm_start_size: int = 10000,
        avg_reward_len: int = 100,
        min_episode_reward: int = -21,
        seed: int = 123,
        batches_per_epoch: int = 10000,
        n_steps: int = 1,
        **kwargs,
    ):
        super().__init__()

        ### NOTE: I dislike this
        # Training environment
        self.env = env
        #self.env = gym.make(env)
        #self.test_env = gym.make(env)

        self.obs_shape = self.env.observation_space.shape
        self.n_actions = self.env.action_space.shape[0]

        # Model Attributes
        self.buffer = None
        self.dataset = None

        self.policy = None
        self.q_net = None
        self.target_q_net = None
        self.build_networks()

        '''
        ### IMPORTANT: The policy net is the agent's net.
        self.agent = SoftActorCriticAgent(self.policy)
        '''

        # Hyperparameters
        self.save_hyperparameters()

        # Metrics
        self.total_episode_steps = [0]
        self.total_rewards = [0]
        self.done_episodes = 0
        self.total_steps = 0

        # Average Rewards
        self.avg_reward_len = avg_reward_len

        for _ in range(avg_reward_len):
            self.total_rewards.append(torch.tensor(min_episode_reward, device=self.device))

        self.avg_rewards = float(np.mean(self.total_rewards[-self.avg_reward_len :]))

        ### NOTE: Change this
        # I don't think this is needed
        #self.state, _ = self.env.reset()

        self.automatic_optimization = False

    '''
        AJZerouali
    '''
    # This method is originally from SoftActorCriticAgent(Agent)
    def get_action(self, states: Tensor, device: str) -> List[float]:
        """Get the action greedily (without sampling)
        Args:
            states: current state of the environment
            device: the device used for the current batch
        Returns:
            action defined by policy
        
        if not isinstance(states, list):
            states = [states]

        if not isinstance(states, Tensor):
            states = torch.tensor(states, device=device)

        # CRUCIAL: Replace self.net by the appropriate network
        # The get_action() method here is that of ContinuousMLP
        actions = [self.net.get_action(states).cpu().numpy()]

        return actions
        """
        raise NotImplementedError()
    
    # This is originally SoftActorCriticAgent.__call__()
    def policy_eval(self, states: Tensor, device: str) -> List[float]:
        """Takes in the current state and returns the action based on the agents policy.
        Args:
            states: current state of the environment
            device: the device used for the current batch
        Returns:
            action defined by policy
        """
        if not isinstance(states, list):
            states = [states]

        if not isinstance(states, Tensor):
            states = torch.tensor(states, device=device)

        # CRUCIAL: Replace self.net by the appropriate network
        #dist = self.net(states)
        #policy_out = self.policy(states)
        actions_ = self.policy(states)
        actions = actions_.cpu().detach().numpy()
        
        # IMPORTANT: Change this
        #actions = [a for a in dist.sample().cpu().numpy()]
        #actions = [a for a in policy_out.cpu().numpy()]

        return actions
    
    '''
        PL-BOLTS
    '''
    def run_n_episodes(self, env, n_epsiodes: int = 1) -> List[int]:
        """Carries out N episodes of the environment with the current agent without exploration.

        Args:
            env: environment to use, either train environment or test environment
            n_epsiodes: number of episodes to run
        """
        total_rewards = []

        for _ in range(n_epsiodes):
            episode_state, _ = env.reset()
            done = False
            episode_reward = 0

            while not done:
                #### CRUCIAL: Replace self.agent
                #action = self.agent.get_action(episode_state, self.device)
                #action = self.get_action(self.state, self.device)
                action = self.policy_eval(self.state, self.device)
                
                # gym v0.26.2+: step() returns (observation, reward, terminated, truncated, info)
                next_state, reward, done, _, _ = env.step(action[0])
                episode_state = next_state
                episode_reward += reward

            total_rewards.append(episode_reward)

        return total_rewards

    def populate(self, warm_start: int) -> None:
        """Populates the buffer with initial experience."""
        if warm_start > 0:
            self.state, _ = self.env.reset()

            for _ in range(warm_start):
                #### CRUCIAL: Replace self.agent
                #action = self.agent(self.state, self.device)
                action = self.policy_eval(self.state, self.device)
                
                next_state, reward, done, _, _ = self.env.step(action[0])
                
                # NOTE: Change this shit
                exp = Experience(state=self.state, 
                                 action=action[0], 
                                 reward=reward, 
                                 done=done, 
                                 new_state=next_state)
                
                self.buffer.append(exp)
                self.state = next_state

                if done:
                    self.state, _ = self.env.reset()

    def build_networks(self) -> None:
        """Initializes the DDPG policy and q network with target"""
        '''
        # NOTE: Remove these
        action_bias = torch.from_numpy((self.env.action_space.high + self.env.action_space.low) / 2)
        action_scale = torch.from_numpy((self.env.action_space.high - self.env.action_space.low) / 2)
        
        # IMPORTANT: This has to change. The policy is deterministic
        self.policy = ContinuousMLP(self.obs_shape, self.n_actions, 
                                    action_bias=action_bias, 
                                    action_scale=action_scale)
        '''
        self.policy = MLP(self.obs_shape, self.n_actions)

        concat_shape = [self.obs_shape[0] + self.n_actions]
        self.q_net = MLP(concat_shape, 1)
        self.target_q_net = MLP(concat_shape, 1)
        self.target_q_net.load_state_dict(self.q_net.state_dict())

    def soft_update_target(self):
        """Update the weights in target network using a weighted sum.

        w_target := (1-a) * w_target + a * w_q

        Args:
            q_net: the critic (q) network
            target_net: the target (q) network
        """
        ### IMPORTANT: Change this call too. This is incomprehensible...
        for q_param, target_param in zip(self.q_net.parameters(), self.target_q_net.parameters()):
            target_param.data.copy_(
                (1.0 - self.hparams.target_alpha) * target_param.data + self.hparams.target_alpha * q_param
            )

    def forward(self, state: Tensor) -> Tensor:
        """Passes in a state x through the network and gets the q_values of each action as an output.

        Args:
            state: environment state

        Returns:
            q values
        """
        # IMPORTANT: no sample() here for DDPG
        #output = self.policy(x).sample()
        output = self.policy(state)
        return output

    def train_batch(
        self,
    ) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]:
        """Contains the logic for generating a new batch of data to be passed to the DataLoader.

        Returns:
            yields a Experience tuple containing the state, action, reward, done and next_state.
        """
        episode_reward = 0
        episode_steps = 0

        while True:
            self.total_steps += 1
            
            #### CRUCIAL: Replace self.agent
            #action = self.agent(self.state, self.device)
            action = self.policy_eval(self.state, self.device)

            next_state, r, is_done, _, _ = self.env.step(action[0])

            episode_reward += r
            episode_steps += 1

            # IMPORTANT: Please change these names before I lose it
            exp = Experience(state=self.state, 
                             action=action[0], 
                             reward=r, 
                             done=is_done, 
                             new_state=next_state)

            self.buffer.append(exp)
            self.state = next_state

            if is_done:
                self.done_episodes += 1
                self.total_rewards.append(episode_reward)
                self.total_episode_steps.append(episode_steps)
                self.avg_rewards = float(np.mean(self.total_rewards[-self.avg_reward_len :]))
                self.state, _ = self.env.reset()
                episode_steps = 0
                episode_reward = 0

            states, actions, rewards, dones, new_states = self.buffer.sample(self.hparams.batch_size)

            for idx, _ in enumerate(dones):
                yield states[idx], actions[idx], rewards[idx], dones[idx], new_states[idx]

            # Simulates epochs
            if self.total_steps % self.hparams.batches_per_epoch == 0:
                break

    def loss(self, batch: Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]) -> Tuple[Tensor, Tensor, Tensor]:
        """Calculates the loss for SAC which contains a total of 3 losses.

        Args:
            batch: a batch of states, actions, rewards, dones, and next states
        """
        states, actions, rewards, dones, next_states = batch
        rewards = rewards.unsqueeze(-1)
        dones = dones.float().unsqueeze(-1)

        # actor
        '''
        ### IMPORTANT: Has to be changed
        dist = self.policy(states)
        new_actions, new_logprobs = dist.rsample_and_log_prob()
        new_logprobs = new_logprobs.unsqueeze(-1)
        '''
        new_actions = self.policy(states)
        '''
        new_states_actions = torch.cat((states, new_actions), 1)
        new_q1_values = self.q1(new_states_actions)
        new_q2_values = self.q2(new_states_actions)
        new_qmin_values = torch.min(new_q1_values, new_q2_values)
        '''
        new_states_actions = torch.cat((states, new_actions), 1)
        new_q_values = self.q_net(new_states_actions)

        '''
        policy_loss = (new_logprobs - new_qmin_values).mean()
        '''
        policy_loss = -new_q_values.mean()

        # critic
        ## Everything's fucking backwards in this implementation...
        states_actions = torch.cat((states, actions), 1)
        '''
        q1_values = self.q1(states_actions)
        q2_values = self.q2(states_actions)
        '''
        q_values = self.q_net(states_actions)

        with torch.no_grad():
            '''
            next_dist = self.policy(next_states)
            new_next_actions, new_next_logprobs = next_dist.rsample_and_log_prob()
            new_next_logprobs = new_next_logprobs.unsqueeze(-1)

            new_next_states_actions = torch.cat((next_states, new_next_actions), 1)
            next_q1_values = self.target_q1(new_next_states_actions)
            next_q2_values = self.target_q2(new_next_states_actions)
            next_qmin_values = torch.min(next_q1_values, next_q2_values) - new_next_logprobs
            target_values = rewards + (1.0 - dones) * self.hparams.gamma * next_qmin_values
            '''
            new_next_actions = self.policy(next_states)
            new_next_states_actions = torch.cat((next_states, new_next_actions), 1)
            next_q_values = self.target_q_net(new_next_states_actions)
            
            target_q_values = rewards + (1.0 - dones) * self.hparams.gamma * next_q_values
            
        '''
        q1_loss = F.mse_loss(q1_values, target_values)
        q2_loss = F.mse_loss(q2_values, target_values)
        
        return policy_loss, q1_loss, q2_loss
        '''
        critic_loss= F.mse_loss(q_values, target_q_values)
        
        return policy_loss, critic_loss


    def training_step(self, batch: Tuple[Tensor, Tensor], _):
        """Carries out a single step through the environment to update the replay buffer. Then calculates loss
        based on the minibatch recieved.

        Args:
            batch: current mini batch of replay data
            _: batch number, not used
        """
        # IMPORTANT: This changes
        #policy_optim, q1_optim, q2_optim = self.optimizers()
        #policy_loss, q1_loss, q2_loss = self.loss(batch)
        policy_optim, critic_optim = self.optimizers()
        policy_loss, critic_loss = self.loss(batch)

        policy_optim.zero_grad()
        self.manual_backward(policy_loss)
        policy_optim.step()
        
        '''
        q1_optim.zero_grad()
        self.manual_backward(q1_loss)
        q1_optim.step()
        
        q2_optim.zero_grad()
        self.manual_backward(q2_loss)
        q2_optim.step()
        '''
        critic_optim.zero_grad()
        self.manual_backward(critic_loss)
        critic_optim.step()

        # Soft update of target network
        if self.global_step % self.hparams.sync_rate == 0:
            self.soft_update_target()

        self.log_dict(
            {
                "total_reward": self.total_rewards[-1],
                "avg_reward": self.avg_rewards,
                "policy_loss": policy_loss,
                "critic_loss": critic_loss,
                "episodes": self.done_episodes,
                "episode_steps": self.total_episode_steps[-1],
            }
        )

    ## Question: What is this?
    def test_step(self, *args, **kwargs) -> Dict[str, Tensor]:
        """Evaluate the agent for 10 episodes."""
        test_reward = self.run_n_episodes(self.test_env, 1)
        avg_reward = sum(test_reward) / len(test_reward)
        return {"test_reward": avg_reward}

    def test_epoch_end(self, outputs) -> Dict[str, Tensor]:
        """Log the avg of the test results."""
        rewards = [x["test_reward"] for x in outputs]
        avg_reward = sum(rewards) / len(rewards)
        self.log("avg_test_reward", avg_reward)
        return {"avg_test_reward": avg_reward}

    '''
        NOTE: The replay buffer changes
    '''
    def _dataloader(self) -> DataLoader:
        """Initialize the Replay Buffer dataset used for retrieving experiences."""
        self.buffer = MultiStepBuffer(self.hparams.replay_size, self.hparams.n_steps)
        self.populate(self.hparams.warm_start_size)

        self.dataset = ExperienceSourceDataset(self.train_batch)
        return DataLoader(dataset=self.dataset, batch_size=self.hparams.batch_size)

    def train_dataloader(self) -> DataLoader:
        """Get train loader."""
        return self._dataloader()

    def test_dataloader(self) -> DataLoader:
        """Get test loader."""
        return self._dataloader()

    def configure_optimizers(self) -> Tuple[Optimizer]:
        """Initialize Adam optimizer."""
        '''
        policy_optim = optim.Adam(self.policy.parameters(), self.hparams.policy_learning_rate)
        q1_optim = optim.Adam(self.q1.parameters(), self.hparams.q_learning_rate)
        q2_optim = optim.Adam(self.q2.parameters(), self.hparams.q_learning_rate)
        return policy_optim, q1_optim, q2_optim
        '''
        policy_optim = optim.Adam(self.policy.parameters(), self.hparams.policy_learning_rate)
        critic_optim = optim.Adam(self.q_net.parameters(), self.hparams.q_learning_rate)
        return policy_optim, critic_optim

    @staticmethod
    def add_model_specific_args(
        arg_parser: argparse.ArgumentParser,
    ) -> argparse.ArgumentParser:
        """Adds arguments for DQN model.

        Note:
            These params are fine tuned for Pong env.

        Args:
            arg_parser: parent parser
        """
        arg_parser.add_argument(
            "--sync_rate",
            type=int,
            default=1,
            help="how many frames do we update the target network",
        )
        arg_parser.add_argument(
            "--replay_size",
            type=int,
            default=1000000,
            help="capacity of the replay buffer",
        )
        arg_parser.add_argument(
            "--warm_start_size",
            type=int,
            default=10000,
            help="how many samples do we use to fill our buffer at the start of training",
        )
        arg_parser.add_argument("--batches_per_epoch", type=int, default=10000, help="number of batches in an epoch")
        arg_parser.add_argument("--batch_size", type=int, default=128, help="size of the batches")
        arg_parser.add_argument("--policy_lr", type=float, default=3e-4, help="policy learning rate")
        arg_parser.add_argument("--q_lr", type=float, default=3e-4, help="q learning rate")
        arg_parser.add_argument("--env", type=str, required=True, help="gym environment tag")
        arg_parser.add_argument("--gamma", type=float, default=0.99, help="discount factor")

        arg_parser.add_argument(
            "--avg_reward_len",
            type=int,
            default=100,
            help="how many episodes to include in avg reward",
        )
        arg_parser.add_argument(
            "--n_steps",
            type=int,
            default=1,
            help="how many frames do we update the target network",
        )

        return arg_parser


# Testing the Algo (Version 2 - 2306272150)

I modifed the hard-coded MLP sizes in pl_bolts_nets.py

In [1]:
import argparse
from typing import Dict, List, Tuple

import numpy as np
import torch
from pytorch_lightning import LightningModule, Trainer, seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint
from torch import Tensor, optim, FloatTensor, Tensor, nn
from torch.distributions import Categorical, Normal, MultivariateNormal
from torch.nn import functional as F
from torch.optim.optimizer import Optimizer
from torch.utils.data import DataLoader

import gym



In [2]:
from RL_lightning_bolts_template.pl_ddpgs import DDPGS
from RL_lightning_bolts_template.pl_bolts_nets import TanhMultivariateNormal

## A) Training

### Critic net

In [3]:
class MLP(nn.Module):
    """Simple MLP network."""

    def __init__(self, input_shape: Tuple[int], n_actions: int, hidden_size: int = 128):
        """
        Args:
            input_shape: observation shape of the environment
            n_actions: number of discrete actions available in the environment
            hidden_size: size of hidden layers
        """
        super().__init__()
        '''
        self.net = nn.Sequential(
            nn.Linear(input_shape[0], hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions),
        )
        '''
        # AJZ, 23/06/25
        self.net = nn.Sequential(
            nn.Linear(input_shape[0], 400),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(400, 300),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(300, n_actions),
        )

    def forward(self, input_x):
        """Forward pass through network.
        Args:
            x: input to network
        Returns:
            output of network
        """
        return self.net(input_x.float())

### Actor net

In [6]:
class ContinuousMLP(nn.Module):
    """MLP network that outputs continuous value via Gaussian distribution."""

    def __init__(
        self,
        input_shape: Tuple[int],
        n_actions: int,
        hidden_size: int = 128,
        action_bias: int = 0,
        action_scale: int = 1,
    ):
        """
        Args:
            input_shape: observation shape of the environment
            n_actions: dimension of actions in the environment
            hidden_size: size of hidden layers
            action_bias: the center of the action space
            action_scale: the scale of the action space
        """
        super().__init__()
        self.action_bias = action_bias
        self.action_scale = action_scale
        '''
        self.shared_net = nn.Sequential(
            nn.Linear(input_shape[0], hidden_size), nn.ReLU(), nn.Linear(hidden_size, hidden_size), nn.ReLU()
        )
        self.mean_layer = nn.Linear(hidden_size, n_actions)
        self.logstd_layer = nn.Linear(hidden_size, n_actions)
        '''
        self.shared_net = nn.Sequential(
            nn.Linear(input_shape[0], 400),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(400, 300), 
            nn.ReLU(),
            nn.Dropout(0.5),
        )
        self.mean_layer = nn.Linear(300, n_actions)
        self.logstd_layer = nn.Linear(300, n_actions)

    def forward(self, x: FloatTensor) -> TanhMultivariateNormal:
        """Forward pass through network. Calculates the action distribution.
        Args:
            x: input to network
        Returns:
            action distribution
        """
        # DEBUG
        #print(f"x.device = {x.device}")
        #print(f"next(self.parameters()).is_cuda = {next(self.parameters()).is_cuda}")
        
        x = self.shared_net(x.float())
        batch_mean = self.mean_layer(x)
        logstd = torch.clamp(self.logstd_layer(x), -20, 2)
        batch_scale_tril = torch.diag_embed(torch.exp(logstd))
        output = TanhMultivariateNormal(action_bias=self.action_bias, 
                                        action_scale=self.action_scale, 
                                        loc=batch_mean, 
                                        scale_tril=batch_scale_tril,)
        return output

    def get_action(self, x: FloatTensor) -> Tensor:
        """Get the action greedily (without sampling)
        Args:
            x: input to network
        Returns:
            mean action
        """
        x = self.shared_net(x.float())
        batch_mean = self.mean_layer(x)
        return self.action_scale * torch.tanh(batch_mean) + self.action_bias

### Training

Environment and *LightningModule*:

In [3]:
# train_env = gym.make("HalfCheetah-v4", render_mode = "rgb_array") # gym v0.26.2
train_env = gym.make("HalfCheetah-v4")

  logger.warn(
  logger.warn(


In [4]:
model = DDPGS(env = train_env,
              sync_rate = 20,
              gamma = 0.98,
              warm_start_size = 5000, #5000
              min_episode_reward = 0,
              seed = 101,
              n_steps = 10,
              policy_learning_rate = 1e-4,
              q_learning_rate = 1e-4,
              target_alpha = 5e-3,
              batch_size = 128,
              replay_size = 1000000,
              avg_reward_len = 100,
              batches_per_epoch = 10000,
              num_workers = 6,
             )

Instantiate trainer:

In [5]:
# save checkpoints based on avg_reward
checkpoint_callback = ModelCheckpoint(save_top_k=1, 
                                      monitor="avg_reward", 
                                      mode="max", 
                                      verbose=True)
seed_everything(123)
trainer = Trainer(accelerator="gpu", 
                  max_steps=500000,  # 100000, 1000000
                  callbacks = checkpoint_callback)

Global seed set to 123
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [5]:
'''
    ##########################################
    ### TRAINER TEST (NO MODEL CHECKPOINT) ###
    ##########################################
'''
seed_everything(123)
trainer = Trainer(accelerator="gpu", 
                  max_steps=5000,)

Global seed set to 123
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Train:

In [6]:
torch.autograd.set_detect_anomaly(True)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x7f1b29ca2e50>

In [6]:
trainer.fit(model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type          | Params
-----------------------------------------------
0 | policy       | ContinuousMLP | 73.5 K
1 | q_net        | MLP           | 72.2 K
2 | target_q_net | MLP           | 72.2 K
-----------------------------------------------
217 K     Trainable params
0         Non-trainable params
217 K     Total params
0.871     Total estimated model params size (MB)
  if not hasattr(tensorboard, '__version__') or LooseVersion(tensorboard.__version__) < LooseVersion('1.15'):
  import imp
  'nearest': pil_image.NEAREST,
  'bilinear': pil_image.BILINEAR,
  'bicubic': pil_image.BICUBIC,
  'hamming': pil_image.HAMMING,
  'box': pil_image.BOX,
  'lanczos': pil_image.LANCZOS,


==> ENTERING train_dataloader()
==> ENTERING _dataloader()
==> ENTERING populate()


  states = torch.tensor(states, device=device)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  np.array(dones, dtype=np.bool),
  return torch.as_tensor(batch)


==> STARTING train_batch()
==> STARTING training_step()
==> STARTING training_step()
==> STARTING training_step()
==> STARTING training_step()
==> STARTING training_step()
==> STARTING training_step()
==> STARTING training_step()
==> STARTING training_step()
==> STARTING training_step()
==> STARTING training_step()
==> STARTING training_step()
==> STARTING training_step()
==> STARTING training_step()
==> STARTING training_step()
==> STARTING training_step()
==> STARTING training_step()
==> STARTING training_step()
==> STARTING training_step()
==> STARTING training_step()
==> STARTING training_step()
==> STARTING training_step()
==> STARTING training_step()
==> STARTING training_step()
==> STARTING training_step()
==> STARTING training_step()
==> STARTING training_step()
==> STARTING training_step()
==> STARTING training_step()
==> STARTING training_step()
==> STARTING training_step()
==> STARTING training_step()
==> STARTING training_step()
==> STARTING training_step()
==> STARTING tra

`Trainer.fit` stopped: `max_steps=5000` reached.


In [7]:
torch.save(model.state_dict(), "./pl_ddpg_gym0240_2306272310")

## B) Test trained model

To load a model from a checkpoint:

https://lightning.ai/docs/pytorch/stable/common/checkpointing_basic.html

In [8]:
from gym import wrappers
from pyvirtualdisplay import Display

In [9]:
# Last saved model
# test_model = DDPGS(env = gym.make("HalfCheetah-v4", render_mode = "rgb_array"))
test_model = DDPGS(env = gym.make("HalfCheetah-v4"))
test_model.load_state_dict(torch.load("./pl_ddpg_gym0240_2306272205"))

  logger.warn(
  logger.warn(


<All keys matched successfully>

In [5]:
del test_model

In [9]:
# Best model from Trainer checkpoint
#test_model = DDPGS(env = gym.make("HalfCheetah-v4", render_mode = "rgb_array"))
test_model = DDPGS.load_from_checkpoint("./lightning_logs/version_7/checkpoints/epoch=24-step=500000.ckpt")

# disable randomness, dropout, etc...
test_model.eval()

DDPGS(
  (policy): ContinuousMLP(
    (shared_net): Sequential(
      (0): Linear(in_features=17, out_features=256, bias=True)
      (1): ReLU()
      (2): Linear(in_features=256, out_features=256, bias=True)
      (3): ReLU()
    )
    (mean_layer): Linear(in_features=256, out_features=6, bias=True)
    (logstd_layer): Linear(in_features=256, out_features=6, bias=True)
  )
  (q_net): MLP(
    (net): Sequential(
      (0): Linear(in_features=23, out_features=256, bias=True)
      (1): ReLU()
      (2): Linear(in_features=256, out_features=256, bias=True)
      (3): ReLU()
      (4): Linear(in_features=256, out_features=1, bias=True)
    )
  )
  (target_q_net): MLP(
    (net): Sequential(
      (0): Linear(in_features=23, out_features=256, bias=True)
      (1): ReLU()
      (2): Linear(in_features=256, out_features=256, bias=True)
      (3): ReLU()
      (4): Linear(in_features=256, out_features=1, bias=True)
    )
  )
)

In [10]:
# Make virtual display
virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

# Trigger for wrapper.RecordVideo() object
def epsd_trigger(episode_id: int) -> bool:
    '''
        Records all episodes
    '''
    if episode_id < 10:
        return True
    else:
        return False

In [11]:
# Test env
#env = gym.make("HalfCheetah-v4", render_mode = "rgb_array")
env = gym.make("HalfCheetah-v4")
env = wrappers.RecordVideo(env = env, 
                           video_folder="vids/",
                           name_prefix="DDPGs_pl_gym0240_100kSteps_TweedNets_2306272310",
                           episode_trigger = epsd_trigger)

  logger.warn(
  logger.warn(
  logger.warn(


In [12]:
# Main loop
for episode in range(3):
    
    # Init. env. and counters
    state = env.reset()
    step = 0
    total_reward = 0
    n_prds = 0
    done = False
    trunc = False
    
    # Episodic loop
    while not done and step<5001:
        
        # Render
        env.render()
        
        # Get action
        with torch.no_grad():
            state_ = torch.FloatTensor(np.array(state)).to(test_model.device)
            # Get actions and UPolicy output
            action_ = test_model.policy.get_action(state_)
            # Get np arrays
            action = action_.cpu().detach().numpy()
        
        # Environment step
        state_next, reward, done, info = env.step(action)
        # Update tot. score
        total_reward += reward
        # Update step
        step += 1
        # Update state
        state = state_next
        # End of episode
        if done:
            print(f"Episode: {episode},\tSteps: {step},\tscore: {total_reward}")
            break

# Close environment
env.close()

Episode: 0,	Steps: 1000,	score: 1745.4410614420235
Episode: 1,	Steps: 1000,	score: 1826.341708302102
Episode: 2,	Steps: 1000,	score: 1722.5220128503026


# Testing the Algo (Version 1 - 2306221408)

In [1]:
import argparse
from typing import Dict, List, Tuple

import numpy as np
import torch
from pytorch_lightning import LightningModule, Trainer, seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint
from torch import Tensor, optim, FloatTensor, Tensor, nn
from torch.distributions import Categorical, Normal, MultivariateNormal
from torch.nn import functional as F
from torch.optim.optimizer import Optimizer
from torch.utils.data import DataLoader

import gym



In [2]:
from RL_lightning_bolts_template.pl_ddpgs import DDPGS
from RL_lightning_bolts_template.pl_bolts_nets import TanhMultivariateNormal

## A) Training

### Critic net

In [3]:
class MLP(nn.Module):
    """Simple MLP network."""

    def __init__(self, input_shape: Tuple[int], n_actions: int, hidden_size: int = 128):
        """
        Args:
            input_shape: observation shape of the environment
            n_actions: number of discrete actions available in the environment
            hidden_size: size of hidden layers
        """
        super().__init__()
        '''
        self.net = nn.Sequential(
            nn.Linear(input_shape[0], hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions),
        )
        '''
        # AJZ, 23/06/25
        self.net = nn.Sequential(
            nn.Linear(input_shape[0], 400),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(400, 300),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(300, n_actions),
        )

    def forward(self, input_x):
        """Forward pass through network.
        Args:
            x: input to network
        Returns:
            output of network
        """
        return self.net(input_x.float())

### Actor net

In [6]:
class ContinuousMLP(nn.Module):
    """MLP network that outputs continuous value via Gaussian distribution."""

    def __init__(
        self,
        input_shape: Tuple[int],
        n_actions: int,
        hidden_size: int = 128,
        action_bias: int = 0,
        action_scale: int = 1,
    ):
        """
        Args:
            input_shape: observation shape of the environment
            n_actions: dimension of actions in the environment
            hidden_size: size of hidden layers
            action_bias: the center of the action space
            action_scale: the scale of the action space
        """
        super().__init__()
        self.action_bias = action_bias
        self.action_scale = action_scale
        '''
        self.shared_net = nn.Sequential(
            nn.Linear(input_shape[0], hidden_size), nn.ReLU(), nn.Linear(hidden_size, hidden_size), nn.ReLU()
        )
        self.mean_layer = nn.Linear(hidden_size, n_actions)
        self.logstd_layer = nn.Linear(hidden_size, n_actions)
        '''
        self.shared_net = nn.Sequential(
            nn.Linear(input_shape[0], 400),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(400, 300), 
            nn.ReLU(),
            nn.Dropout(0.5),
        )
        self.mean_layer = nn.Linear(300, n_actions)
        self.logstd_layer = nn.Linear(300, n_actions)

    def forward(self, x: FloatTensor) -> TanhMultivariateNormal:
        """Forward pass through network. Calculates the action distribution.
        Args:
            x: input to network
        Returns:
            action distribution
        """
        # DEBUG
        #print(f"x.device = {x.device}")
        #print(f"next(self.parameters()).is_cuda = {next(self.parameters()).is_cuda}")
        
        x = self.shared_net(x.float())
        batch_mean = self.mean_layer(x)
        logstd = torch.clamp(self.logstd_layer(x), -20, 2)
        batch_scale_tril = torch.diag_embed(torch.exp(logstd))
        output = TanhMultivariateNormal(action_bias=self.action_bias, 
                                        action_scale=self.action_scale, 
                                        loc=batch_mean, 
                                        scale_tril=batch_scale_tril,)
        return output

    def get_action(self, x: FloatTensor) -> Tensor:
        """Get the action greedily (without sampling)
        Args:
            x: input to network
        Returns:
            mean action
        """
        x = self.shared_net(x.float())
        batch_mean = self.mean_layer(x)
        return self.action_scale * torch.tanh(batch_mean) + self.action_bias

Environment and *LightningModule*:

In [7]:
# train_env = gym.make("HalfCheetah-v4", render_mode = "rgb_array") # gym v0.26.2
train_env = gym.make("HalfCheetah-v4")

  logger.warn(
  logger.warn(


In [8]:
model = DDPGS(env = train_env,
              sync_rate = 20,
              gamma = 0.98,
              warm_start_size = 10000, #5000
              min_episode_reward = 0,
              seed = 101,
              n_steps = 10,
              policy_learning_rate = 1e-4,
              q_learning_rate = 1e-4,
              target_alpha = 5e-3,
              batch_size = 100,
              replay_size = 200000,
              avg_reward_len = 100,
              batches_per_epoch = 10000,
             )

Instantiate trainer:

In [9]:
# save checkpoints based on avg_reward
checkpoint_callback = ModelCheckpoint(save_top_k=1, 
                                      monitor="avg_reward", 
                                      mode="max", 
                                      verbose=True)
seed_everything(123)
trainer = Trainer(accelerator="gpu", 
                  max_steps=100000,  # 100000, 1000000
                  callbacks = checkpoint_callback)

Global seed set to 123
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Train:

In [6]:
torch.autograd.set_detect_anomaly(True)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x7f1b29ca2e50>

In [10]:
trainer.fit(model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type          | Params
-----------------------------------------------
0 | policy       | ContinuousMLP | 131 K 
1 | q_net        | MLP           | 130 K 
2 | target_q_net | MLP           | 130 K 
-----------------------------------------------
391 K     Trainable params
0         Non-trainable params
391 K     Total params
1.566     Total estimated model params size (MB)
  if not hasattr(tensorboard, '__version__') or LooseVersion(tensorboard.__version__) < LooseVersion('1.15'):
  import imp
  'nearest': pil_image.NEAREST,
  'bilinear': pil_image.BILINEAR,
  'bicubic': pil_image.BICUBIC,
  'hamming': pil_image.HAMMING,
  'box': pil_image.BOX,
  'lanczos': pil_image.LANCZOS,
  states = torch.tensor(states, device=device)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  np.array(dones, dtype=np.bool),
  return torch.as_tensor(batch)
Epoch 0, global step 20000: 'avg_reward' reached -30.47791 (best -30.47791), saving model to '/notebooks/Lightning/lightning_logs/version_5/checkpoints/epoch=0-step=20000.ckpt' as top 1
Epoch 1, global step 40000: 'avg_reward' reached -26.98513 (best -26.98513), saving model to '/notebooks/Lightning/lightning_logs/version_5/checkpoints/epoch=1-step=40000.ckpt' as top 1
Epoch 2, global step 60000: 'avg_reward' reached -12.04049 (best -12.04049), saving model to '/notebooks/Lightning/lightning_logs/version_5/checkpoints/epoch=2-step=60000.ckpt' as top 1
Epoch 3, global step 80000: 'avg_reward' reached 21.86908 (best 21.86908), saving model to '/notebooks/Lightning/lightning_logs/version_5/checkpoints/epoch=3-step=80000.ckpt' as top 1
Epoch 4, global step 100000: 'avg_reward' reached 78.26254 (best 78.26

In [12]:
torch.save(model.state_dict(), "./pl_ddpg_gym0240_2306251635")

## B) Test trained model

To load a model from a checkpoint:

https://lightning.ai/docs/pytorch/stable/common/checkpointing_basic.html

In [3]:
from gym import wrappers
from pyvirtualdisplay import Display

In [4]:
# Last saved model
# test_model = DDPGS(env = gym.make("HalfCheetah-v4", render_mode = "rgb_array"))
test_model = DDPGS(env = gym.make("HalfCheetah-v4"))
test_model.load_state_dict(torch.load("./pl_ddpg_gym0240_2306251635"))

  logger.warn(
  logger.warn(


<All keys matched successfully>

In [5]:
del test_model

In [4]:
# Best model from Trainer checkpoint
#test_model = DDPGS(env = gym.make("HalfCheetah-v4", render_mode = "rgb_array"))
test_model = DDPGS.load_from_checkpoint("./lightning_logs/version_4/checkpoints/epoch=4-step=100000.ckpt")

# disable randomness, dropout, etc...
test_model.eval()

DDPGS(
  (policy): ContinuousMLP(
    (shared_net): Sequential(
      (0): Linear(in_features=17, out_features=400, bias=True)
      (1): ReLU()
      (2): Linear(in_features=400, out_features=300, bias=True)
      (3): ReLU()
    )
    (mean_layer): Linear(in_features=300, out_features=6, bias=True)
    (logstd_layer): Linear(in_features=300, out_features=6, bias=True)
  )
  (q_net): MLP(
    (net): Sequential(
      (0): Linear(in_features=23, out_features=400, bias=True)
      (1): ReLU()
      (2): Linear(in_features=400, out_features=300, bias=True)
      (3): ReLU()
      (4): Linear(in_features=300, out_features=1, bias=True)
    )
  )
  (target_q_net): MLP(
    (net): Sequential(
      (0): Linear(in_features=23, out_features=400, bias=True)
      (1): ReLU()
      (2): Linear(in_features=400, out_features=300, bias=True)
      (3): ReLU()
      (4): Linear(in_features=300, out_features=1, bias=True)
    )
  )
)

In [15]:
# Make virtual display
virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

# Trigger for wrapper.RecordVideo() object
def epsd_trigger(episode_id: int) -> bool:
    '''
        Records all episodes
    '''
    if episode_id < 10:
        return True
    else:
        return False

In [16]:
# Test env
#env = gym.make("HalfCheetah-v4", render_mode = "rgb_array")
env = gym.make("HalfCheetah-v4")
env = wrappers.RecordVideo(env = env, 
                           video_folder="vids/",
                           name_prefix="DDPGs_pl_gym0240_100kSteps_Corrected_2306261653",
                           episode_trigger = epsd_trigger)

  logger.warn(
  logger.warn(
  logger.warn(


In [11]:
for episode in range(2):
    #state, _ = env.reset()
    state = env.reset()
    step = 0
    total_reward = 0
    done = False
    while not done and step<5001:
        step += 1
        env.render()
        # Get action
        with torch.no_grad():
            # Convert to torch tensors
            state_ = torch.FloatTensor(np.array(state)).to(test_model.device)
            # Get actions and UPolicy output
            #action_ = actor(control, state_, t_)
            #action_ = test_model(state_)
            action_ = test_model.policy.get_action(state_)
            # Get np arrays
            action = action_.cpu().detach().numpy()
        
        #observation, reward, done, trunc, info = env.step(action)
        observation, reward, done, info = env.step(action)
        total_reward += reward
        if done:
            print("Episode: {0},\tSteps: {1},\tscore: {2}"
                  .format(episode, step, total_reward)
            )
            break
env.close()

Episode: 0,	Steps: 1000,	score: -75.82073089709843
Episode: 1,	Steps: 1000,	score: -50.94959431896886


In [17]:
# Main loop
for episode in range(2):
    
    # Init. env. and counters
    state = env.reset()
    step = 0
    total_reward = 0
    n_prds = 0
    done = False
    trunc = False
    '''
    # Initial ampl omega and phi
    with th.no_grad():
        state_ = th.FloatTensor(np.array(state)).to(test_control.device)
        ampl_, omega_, phi_ = test_control(state_)
    '''
    
    # Episodic loop
    while not done and step<5001:
        
        # Render
        env.render()
        '''
        t = float(step-n_prds*n_delay_steps)
        '''
        
        # Get action
        with torch.no_grad():
            '''
            # Convert to torch tensors
            t_ = th.FloatTensor(np.array(t)).to(test_control.device)
            '''
            state_ = torch.FloatTensor(np.array(state)).to(test_model.device)
            # Get actions and UPolicy output
            action_ = test_model.policy.get_action(state_)
            # Get np arrays
            action = action_.cpu().detach().numpy()
        
        # Environment step
        state_next, reward, done, info = env.step(action)
        # Update tot. score
        total_reward += reward
        # Update step
        step += 1
        # Update state
        state = state_next
        '''
        # End of period updates
        if step % n_delay_steps == 0:
            
            # Update no. of periods
            n_prds += 1
            
            # Update amplitude, frequ. and phase
            with th.no_grad():
                state_ = th.FloatTensor(np.array(state)).to(test_control.device)
                ampl_, omega_, phi_ = test_control(state_)
        '''
        
        # End of episode
        if done:
            print(f"Episode: {episode},\tSteps: {step},\tscore: {total_reward}")
            break

# Close environment
env.close()

Episode: 0,	Steps: 1000,	score: 746.109141219302
Episode: 1,	Steps: 1000,	score: 751.0345682717375


# Testing the Algo (Version 0 - 2306220120)

## A) Training

Environment and *LightningModule*:

In [7]:
train_env = gym.make("HalfCheetah-v4", render_mode = "rgb_array")

In [12]:
model = DDPGS(env = train_env,
    sync_rate = 10,
    gamma = 0.98,
    warm_start_size = 5000,
    min_episode_reward = -21,
    seed = 101,
    n_steps = 10,)

Instantiate trainer:

In [13]:
# save checkpoints based on avg_reward
checkpoint_callback = ModelCheckpoint(save_top_k=1, 
                                      monitor="avg_reward", 
                                      mode="max", 
                                      verbose=True)
seed_everything(101)
trainer = Trainer(accelerator="gpu", 
                  max_steps=100000,  
                  callbacks = checkpoint_callback)

Global seed set to 101
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Train:

In [14]:
trainer.fit(model)

You are using a CUDA device ('NVIDIA RTX A4000') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type | Params
--------------------------------------
0 | policy       | MLP  | 3.1 K 
1 | q_net        | MLP  | 3.2 K 
2 | target_q_net | MLP  | 3.2 K 
--------------------------------------
9.5 K     Trainable params
0         Non-trainable params
9.5 K     Total params
0.038     Total estimated model params size (MB)
  states = torch.tensor(states, device=device)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Epoch 0, global step 20000: 'avg_reward' reached -21.00000 (best -21.00000), saving model to '/notebooks/Deep_Forecasting/lightning_logs/version_8/checkpoints/epoch=0-step=20000.ckpt' as top 1
Epoch 1, global step 40000: 'avg_reward' was not in top 1
Epoch 2, global step 60000: 'avg_reward' was not in top 1
Epoch 3, global step 80000: 'avg_reward' was not in top 1
Epoch 4, global step 100000: 'avg_reward' was not in top 1
`Trainer.fit` stopped: `max_steps=100000` reached.


In [15]:
torch.save(model.state_dict(), "./ddpg_lightning_gym0262_2306220130")

## B) Test trained model

In [19]:
from gym import wrappers
from pyvirtualdisplay import Display

In [16]:
test_model = DDPGS(env = gym.make("HalfCheetah-v4", render_mode = "rgb_array"))
test_model.load_state_dict(torch.load("./ddpg_lightning_gym0262_2306220130"))

<All keys matched successfully>

In [20]:
# Make virtual display
virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

# Trigger for wrapper.RecordVideo() object
def epsd_trigger(episode_id: int) -> bool:
    '''
        Records all episodes
    '''
    if episode_id < 10:
        return True
    else:
        return False

# Test env
env = gym.make("HalfCheetah-v4", render_mode = "rgb_array")

<pyvirtualdisplay.display.Display at 0x7f5176972d00>

In [21]:
env = wrappers.RecordVideo(env = env, 
                           video_folder="vids/",
                           name_prefix="DDPGs_pl_gym0262_2306220135",
                           episode_trigger = epsd_trigger)

  logger.warn(


In [22]:
for episode in range(3):
    state, _ = env.reset()
    step = 0
    total_reward = 0
    done = False
    while not done and step<5001:
        step += 1
        #env.render()
        # Get action
        with torch.no_grad():
            # Convert to torch tensors
            state_ = torch.FloatTensor(np.array(state)).to(test_model.device)
            # Get actions and UPolicy output
            #action_ = actor(control, state_, t_)
            action_ = test_model(state_)
            # Get np arrays
            action = action_.cpu().detach().numpy()
        
        observation, reward, done, trunc, info = env.step(action)
        total_reward += reward
        if done:
            print("Episode: {0},\tSteps: {1},\tscore: {2}"
                  .format(episode, step, total_reward)
            )
            break
env.close()

Moviepy - Building video /notebooks/Deep_Forecasting/vids/DDPGs_pl_gym0262_2306220135-episode-0.mp4.
Moviepy - Writing video /notebooks/Deep_Forecasting/vids/DDPGs_pl_gym0262_2306220135-episode-0.mp4



                                                                

Moviepy - Done !
Moviepy - video ready /notebooks/Deep_Forecasting/vids/DDPGs_pl_gym0262_2306220135-episode-0.mp4
Moviepy - Building video /notebooks/Deep_Forecasting/vids/DDPGs_pl_gym0262_2306220135-episode-1.mp4.
Moviepy - Writing video /notebooks/Deep_Forecasting/vids/DDPGs_pl_gym0262_2306220135-episode-1.mp4



                                                                

Moviepy - Done !
Moviepy - video ready /notebooks/Deep_Forecasting/vids/DDPGs_pl_gym0262_2306220135-episode-1.mp4
Moviepy - Building video /notebooks/Deep_Forecasting/vids/DDPGs_pl_gym0262_2306220135-episode-2.mp4.
Moviepy - Writing video /notebooks/Deep_Forecasting/vids/DDPGs_pl_gym0262_2306220135-episode-2.mp4



                                                                

Moviepy - Done !
Moviepy - video ready /notebooks/Deep_Forecasting/vids/DDPGs_pl_gym0262_2306220135-episode-2.mp4


**First version conclusion (23/06/22 - 01:35):** Some hyperparameter tuning is needed. We're on the right track though.