In [None]:
"""
Implementing PPO
First mistake I made -> ran the action using the policy, then calculated the ratio using old_policy. The correct way is to run the action using old_policy, then calculate the ratio using the new policy.
Second mistake -> didn't run K epochs while calculating the surrogate objective.
Then, it started behaving like a trained model, though did not achieve goal.
Another fix. Added minibatching according to the original paper.
Minibatching made the training significantly slower due to more frequent optimizer steps.
"""

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
!pip install gymnasium
!pip install pygame
!pip install wheel setuptools
!pip install swig
!pip install gymnasium[box2d]

Collecting swig
  Downloading swig-4.3.1-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (3.5 kB)
Downloading swig-4.3.1-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m60.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: swig
Successfully installed swig-4.3.1
Collecting box2d-py==2.3.5 (from gymnasium[box2d])
  Downloading box2d-py-2.3.5.tar.gz (374 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.4/374.4 kB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: box2d-py
  Building wheel for box2d-py (setup.py) ... [?25l[?25hdone
  Created wheel for box2d-py: filename=box2d_py-2.3.5-cp311-cp311-linux_x86_64.whl size=2379371 sha256=f2a46bc6d62fbb5c7e4d8f71af7d1d4564c0c774b848e7437199f522a648ca31
  Stored in directory: /root/.cache/pip/wheels/ab

In [2]:
!pip uninstall -y box2d-py
!pip install box2d pygame swig
!pip install "gymnasium[box2d]" --no-deps

Found existing installation: box2d-py 2.3.5
Uninstalling box2d-py-2.3.5:
  Successfully uninstalled box2d-py-2.3.5
Collecting box2d
  Downloading Box2D-2.3.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (573 bytes)
Downloading Box2D-2.3.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.7/3.7 MB[0m [31m58.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: box2d
Successfully installed box2d-2.3.10


In [74]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class ActorCriticPPO(nn.Module):

    def __init__(self, input_dim, output_dim, hidden_dims=(64, 64)):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dims[0]),
            nn.ReLU(),
            nn.Linear(hidden_dims[0], hidden_dims[1]),
            nn.ReLU()
        )
        self.actor_layer = nn.Linear(hidden_dims[1], output_dim)
        self.critic_layer = nn.Linear(hidden_dims[1], 1)

    def forward(self, x):
        x = self.net(x)
        action_probs = F.softmax(self.actor_layer(x), dim=-1)
        value = self.critic_layer(x)
        return action_probs, value


In [94]:
from tqdm import tqdm
from torch.optim.lr_scheduler import StepLR
import numpy as np
import copy

class PPOAgent:
    def __init__(self, env_id, num_episodes=1000, max_steps=500, epsilon=float('inf'), gamma=0.99, lambda_GAE=1, lr=1e-3, num_steps=0, num_envs=8, num_epochs=4, minibatch_size=4, vectorization_mode = "sync", seed=123):
        # using vectorized environments to boost training speed
        self.env = gym.make_vec(env_id, num_envs=num_envs, vectorization_mode=vectorization_mode)
        self.num_envs = num_envs
        self.num_episodes = num_episodes
        self.num_epochs = num_epochs
        self.max_steps = max_steps
        self.minibatch_size = minibatch_size
        self.epsilon = epsilon
        self.gamma = gamma
        self.lambda_GAE = lambda_GAE
        self.lr = lr
        self.num_steps = num_steps
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.policy_net = ActorCriticPPO(self.env.single_observation_space.shape[0], self.env.single_action_space.n).to(self.device)
        self.old_policy_net = copy.deepcopy(self.policy_net)
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.lr)
        # added scheduler after observing divergence after getting close to solving
        self.scheduler = StepLR(self.optimizer, step_size=100, gamma=0.9)
        self.loss = nn.MSELoss()
        self.seed = seed

    # choosing action from policy's probability distribution
    def choose_action(self, state):
        probs, _ = self.old_policy_net(state)
        action_dist = torch.distributions.Categorical(probs)
        action = action_dist.sample()
        return action

    # computing GAE
    # def compute_returns(self, rewards, values, next_value):
    #     rewards = torch.stack(rewards)
    #     values = torch.cat([values, next_value.unsqueeze(0)], dim=0)

    #     T, N = rewards.shape
    #     advantages = torch.zeros_like(rewards)
    #     gae = torch.zeros(N, device=rewards.device)
    #     for t in reversed(range(T)):
    #         # temporal difference error
    #         td = rewards[t] + self.gamma * values[t + 1] - values[t]
    #         # higher labmda -> more sampling, lower lambda -> more bootstrapping
    #         gae = td + self.gamma * self.lambda_GAE * gae
    #         advantages[t] = gae

    #     # compute returns by adding value to advantage
    #     returns = advantages + values[:-1]
    #     returns = (returns - returns.mean()) / (returns.std() + 1e-8)

    #     # normalize advantage across timesteps and environments
    #     advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

    #     return returns, advantages

    # computing the gamma decaying rewards
    def compute_returns(self, rewards, values, next_value):
        """
        Args:
            rewards: torch.Tensor of shape [T, N] where
                    T = rollout steps, N = num_envs
        Returns:
            returns: torch.Tensor of shape [T, N], normalized
        """
        rewards = torch.stack(rewards)

        T, N = rewards.shape
        returns = torch.zeros_like(rewards)
        R = torch.zeros(N, device=rewards.device)
        for t in reversed(range(T)):
            R = rewards[t] + self.gamma * R
            returns[t] = R

        # Normalize returns across all timesteps and environments
        returns = (returns - returns.mean()) / (returns.std() + 1e-8)
        advantages = returns - values

        return returns, advantages


    def train(self):
        episode_rewards = []
        episode_steps = []

        for episode in tqdm(range(self.num_episodes)):
            state, _ = self.env.reset(seed=self.seed + episode)
            done = np.zeros(self.num_envs, dtype=bool)
            episode_reward = np.zeros(self.num_envs)
            old_states, old_actions, old_log_probs, old_rewards, old_values = [], [], [], [], []
            done_mask = np.zeros(self.num_envs, dtype=bool)
            done_steps = np.zeros(self.num_envs)
            steps = 0

            while not np.all(done_mask) and steps < self.max_steps:
                steps += 1
                state_tensor = torch.tensor(state, dtype=torch.float32).to(self.device)
                old_states.append(state_tensor)

                with torch.no_grad():
                    old_action_probs, old_value = self.old_policy_net(state_tensor)
                    old_action_dist = torch.distributions.Categorical(old_action_probs)
                    old_action = old_action_dist.sample()
                    old_log_prob = old_action_dist.log_prob(old_action)

                next_state, old_reward, terminated, truncated, _ = self.env.step(old_action.cpu().numpy())
                done = np.logical_or(terminated, truncated)
                done_steps = np.where(np.logical_and(done, ~done_mask), steps, done_steps)
                done_mask = np.logical_or(done_mask, done)
                # record when each environment is done
                old_reward = np.where(done_mask, 0.0, old_reward)

                # saves the values, rewards, log_probs which are used to calculate the n_step returns, actor loss, and critic loss
                old_values.append(old_value.squeeze())
                old_rewards.append(torch.tensor(old_reward, dtype=torch.float32).to(self.device))  # shape: (num_envs,)
                old_log_probs.append(old_log_prob)
                old_actions.append(old_action)

                episode_reward += old_reward
                state = next_state


                # finish full trajectory, then update
                if self.num_steps == 0:
                    if np.any(done):
                        with torch.no_grad():
                            next_old_state_tensor = torch.tensor(next_state, dtype=torch.float32).to(self.device)
                            _, next_old_value = self.policy_net(next_old_state_tensor)
                            done_tensor = torch.tensor(done, dtype=torch.float32).to(self.device)
                            next_old_value = next_old_value.squeeze() * (1 - done_tensor)

                        old_states = torch.stack(old_states).to(device)
                        old_values = torch.stack(old_values).to(device)
                        old_log_probs = torch.stack(old_log_probs).to(device)
                        old_actions = torch.stack(old_actions).to(device)
                        old_returns, old_advantages = self.compute_returns(old_rewards, old_values, next_old_value)

                        # flatten inputs
                        T, N = old_actions.shape[:2]
                        old_states = old_states.reshape(T * N, -1)
                        old_actions = old_actions.reshape(T * N)
                        old_log_probs = old_log_probs.reshape(T * N)
                        old_returns = old_returns.reshape(T * N)
                        old_advantages = old_advantages.reshape(T * N)

                        batch_size = old_states.shape[0]
                        minibatch_size = self.minibatch_size

                        # repeat in K epochs
                        for _ in range(self.num_epochs):
                            indices = torch.randperm(batch_size)

                            # mini batching
                            for i in range(0, batch_size, minibatch_size):
                                # selecting inputs for this mini batch
                                mb_idx = indices[i : i + minibatch_size]
                                mb_states = old_states[mb_idx]
                                mb_actions = old_actions[mb_idx]
                                mb_log_probs_old = old_log_probs[mb_idx]
                                mb_returns = old_returns[mb_idx]
                                mb_advantages = old_advantages[mb_idx]

                                # use current policy to find log_probs of the trajectory ran by old_policy
                                action_probs, values = self.policy_net(mb_states)
                                action_dist = torch.distributions.Categorical(action_probs)
                                mb_log_probs = action_dist.log_prob(mb_actions)

                                prob_ratio = torch.exp(mb_log_probs - mb_log_probs)
                                values = values.squeeze(-1)
                                surrogate = torch.min(prob_ratio * mb_advantages, prob_ratio.clamp(1 - self.epsilon, 1 + self.epsilon) * mb_advantages)
                                actor_loss = -surrogate.mean()
                                # clipped_values = torch.clamp(values, old_values.detach() * (1 - self.epsilon), old_values.detach() * (1 + self.epsilon))
                                # critic_loss = min(self.loss(returns, values), self.loss(returns, clipped_values))
                                critic_loss = self.loss(mb_returns, values)

                                loss = actor_loss + critic_loss
                                self.optimizer.zero_grad()
                                loss.backward()
                                self.optimizer.step()

                        old_states, old_actions, old_log_probs, old_rewards, old_values = [], [], [], [], []
                        # self.scheduler.step()


                # every n steps for each environment, calculate losses, update the actor & critic, then refresh the saved lists
                else:
                    if (steps % self.num_steps == 0) or np.any(done):
                        with torch.no_grad():
                            next_old_state_tensor = torch.tensor(next_state, dtype=torch.float32).to(self.device)
                            _, next_old_value = self.policy_net(next_old_state_tensor)
                            done_tensor = torch.tensor(done, dtype=torch.float32).to(self.device)
                            next_old_value = next_old_value.squeeze() * (1 - done_tensor)

                        old_states = torch.stack(old_states).to(device)
                        old_values = torch.stack(old_values).to(device)
                        old_log_probs = torch.stack(old_log_probs).to(device)
                        old_actions = torch.stack(old_actions).to(device)
                        old_returns, old_advantages = self.compute_returns(old_rewards, old_values, next_old_value)

                        # flatten inputs
                        T, N = old_actions.shape[:2]
                        old_states = old_states.reshape(T * N, -1)
                        old_actions = old_actions.reshape(T * N)
                        old_log_probs = old_log_probs.reshape(T * N)
                        old_returns = old_returns.reshape(T * N)
                        old_advantages = old_advantages.reshape(T * N)

                        batch_size = old_states.shape[0]
                        minibatch_size = self.minibatch_size

                        # repeat in K epochs
                        for _ in range(self.num_epochs):
                            indices = torch.randperm(batch_size)

                            # mini batching
                            for i in range(0, batch_size, minibatch_size):
                                # selecting inputs for this mini batch
                                mb_idx = indices[i : i + minibatch_size]
                                mb_states = old_states[mb_idx]
                                mb_actions = old_actions[mb_idx]
                                mb_log_probs_old = old_log_probs[mb_idx]
                                mb_returns = old_returns[mb_idx]
                                mb_advantages = old_advantages[mb_idx]

                                # use current policy to find log_probs of the trajectory ran by old_policy
                                action_probs, values = self.policy_net(mb_states)
                                action_dist = torch.distributions.Categorical(action_probs)
                                mb_log_probs = action_dist.log_prob(mb_actions)

                                prob_ratio = torch.exp(mb_log_probs - mb_log_probs_old)
                                values = values.squeeze(-1)
                                surrogate = torch.min(prob_ratio * mb_advantages, prob_ratio.clamp(1 - self.epsilon, 1 + self.epsilon) * mb_advantages)
                                actor_loss = -surrogate.mean()
                                # clipped_values = torch.clamp(values, old_values.detach() * (1 - self.epsilon), old_values.detach() * (1 + self.epsilon))
                                # critic_loss = min(self.loss(returns, values), self.loss(returns, clipped_values))
                                critic_loss = self.loss(mb_returns, values)

                                loss = actor_loss + critic_loss
                                self.optimizer.zero_grad()
                                loss.backward()
                                self.optimizer.step()

                        old_states, old_actions, old_log_probs, old_rewards, old_values = [], [], [], [], []
                        # self.scheduler.step()


            self.old_policy_net = copy.deepcopy(self.policy_net)

            episode_rewards.append(episode_reward)
            episode_steps.append(steps)

            if episode % 20 == 0:
               print('Episode {}\tlengths: {}\treward: {}]\tfull length: {}'.format(episode, done_steps, episode_reward, steps))
            if episode % 10 == 0:
                print(f"\n[Episode {episode}]")
                print(f"Reward (mean): {np.mean(episode_reward):.2f}")
                print(f"Actor Loss: {actor_loss.item():.4f} | Critic Loss: {critic_loss.item():.4f}")
                print(f"Prob Ratio - mean: {prob_ratio.mean().item():.4f}, max: {prob_ratio.max().item():.4f}, min: {prob_ratio.min().item():.4f}")

        self.env.close()
        return np.array(episode_rewards), np.array(episode_steps)


In [None]:
import gymnasium as gym

env_id = 'LunarLander-v3'
num_episodes = 100
max_steps = 500
lr = 1e-4


ppo_model =  PPOAgent(env_id, num_episodes=num_episodes, max_steps=max_steps, lr=lr, epsilon=0.2, num_envs=8, num_steps=32, minibatch_size=16)

rewards, steps = ppo_model.train()



  1%|          | 1/100 [00:02<04:47,  2.90s/it]

Episode 0	lengths: [ 68.  75. 100.  79.  91. 100. 116.  65.]	reward: [  23.42290852  -21.0415077  -185.99135794 -122.54408225 -133.07595182
  -72.50355055 -164.4432558     1.19082493]]	full length: 116

[Episode 0]
Reward (mean): -84.37
Actor Loss: -0.1178 | Critic Loss: 0.1815
Prob Ratio - mean: 1.0107, max: 1.0831, min: 0.8825


 11%|█         | 11/100 [00:35<04:21,  2.94s/it]


[Episode 10]
Reward (mean): -161.28
Actor Loss: -0.4140 | Critic Loss: 1.8274
Prob Ratio - mean: 1.0174, max: 1.2128, min: 0.8223


 21%|██        | 21/100 [01:10<04:25,  3.37s/it]

Episode 20	lengths: [107. 100. 136. 133.  82.  94. 124. 118.]	reward: [-284.2452314   -25.55426189 -163.74740984   -3.49345575   -7.73744353
 -165.24696666 -210.96473393 -309.9329062 ]]	full length: 136

[Episode 20]
Reward (mean): -146.37
Actor Loss: -0.1504 | Critic Loss: 0.1337
Prob Ratio - mean: 0.9907, max: 1.1293, min: 0.7797


 31%|███       | 31/100 [01:53<05:19,  4.63s/it]


[Episode 30]
Reward (mean): -107.98
Actor Loss: 0.0385 | Critic Loss: 0.6195
Prob Ratio - mean: 0.9770, max: 1.1405, min: 0.8083


 41%|████      | 41/100 [02:50<06:26,  6.55s/it]

Episode 40	lengths: [180. 108. 150. 148. 114.  98. 136. 143.]	reward: [ 15.61733602  76.55506404  37.81381911  -0.58695517  80.46928083
  84.41673134  58.24204205 128.58270396]]	full length: 180

[Episode 40]
Reward (mean): 60.14
Actor Loss: -0.2583 | Critic Loss: 0.9100
Prob Ratio - mean: 1.0360, max: 1.3309, min: 0.8516


 51%|█████     | 51/100 [04:16<06:25,  7.86s/it]


[Episode 50]
Reward (mean): 66.05
Actor Loss: -0.1918 | Critic Loss: 0.7023
Prob Ratio - mean: 1.0338, max: 1.1305, min: 0.9037


 59%|█████▉    | 59/100 [04:52<02:44,  4.00s/it]

In [56]:
import gymnasium as gym
import torch
import numpy as np
from gymnasium.wrappers import RecordVideo
import os

# Create folder to save the video
video_folder = "./video"
os.makedirs(video_folder, exist_ok=True)

# Wrap the environment with RecordVideo
env = gym.make('LunarLander-v3', render_mode='rgb_array')
env = RecordVideo(env, video_folder=video_folder, episode_trigger=lambda e: True)

num_episodes = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

episode_rewards = []

for i in range(num_episodes):
    state, _ = env.reset()
    state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)
    episode_reward = 0
    done = False

    while not done:
        with torch.no_grad():
            action_probs, _ = ppo_model.policy_net(state)
            # action_dist = torch.distributions.Categorical(action_probs)
            # action = action_dist.sample().item()
            action = torch.argmax(action_probs, dim=1).item()

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        episode_reward += reward

        state = torch.tensor(next_state, dtype=torch.float32).unsqueeze(0).to(device)

    episode_rewards.append(episode_reward)
    print(f"Episode {i+1} Reward: {episode_reward}")

env.close()

episode_rewards = np.array(episode_rewards)
print(f"Average Reward over {num_episodes} episodes: {np.mean(episode_rewards)}")


  logger.warn(


Episode 1 Reward: -54.78416117561745
Episode 2 Reward: -119.94616451933459
Episode 3 Reward: -110.71677044528491
Episode 4 Reward: -91.66937614894313
Episode 5 Reward: -131.8112768704142
Episode 6 Reward: -51.91420243696872
Episode 7 Reward: -102.09082294946438
Episode 8 Reward: -94.30228997606928
Episode 9 Reward: -116.33677594644324
Episode 10 Reward: -80.7753335795721
Average Reward over 10 episodes: -95.4347174048112


In [None]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np

class LinearValueEnv(gym.Env):
    def __init__(self, gamma=0.99, episode_length=100):
        super().__init__()
        self.gamma = gamma
        self.episode_length = episode_length
        self.current_step = 0

        # Observation: continuous scalar in [-1, 1]
        self.observation_space = spaces.Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float32)

        # Action: continuous scalar (not used)
        self.action_space = spaces.Discrete(1)

        self.state = None

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.state = np.random.uniform(-1.0, 1.0, size=(1,)).astype(np.float32)
        self.current_step = 0
        return self.state.copy(), {}

    def step(self, action):
        # Reward is simply the state value
        reward = float(self.state[0])
        self.current_step += 1

        terminated = self.current_step >= self.episode_length
        truncated = False
        return self.state.copy(), reward, terminated, truncated, {}

    def render(self):
        print(f"State: {self.state}")

    def close(self):
        pass




In [None]:
from gymnasium.envs.registration import register

register(
    id="LinearValue-v0",
    entry_point="__main__:LinearValueEnv",  # if you're running in a script
    max_episode_steps=100
)


In [None]:
env_id = "LinearValue-v0"
num_episodes = 1000
max_steps = 500
lr = 1e-4


ppo_model_value =  PPOAgent(env_id, num_episodes=num_episodes, max_steps=max_steps, lr=lr, epsilon=0.2, num_envs=2, num_steps=0)

rewards, steps = ppo_model_value.train()


  0%|          | 1/1000 [00:00<12:09,  1.37it/s]

Episode 0	lengths: [100. 100.]	reward: [-72.96960783  84.58938628]]	full length: 100

[Episode 0]
Reward (mean): 5.81
Actor Loss: 0.3642 | Critic Loss: 0.9067
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


  1%|          | 11/1000 [00:03<04:25,  3.73it/s]


[Episode 10]
Reward (mean): -41.61
Actor Loss: 0.2812 | Critic Loss: 1.0306
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


  2%|▏         | 21/1000 [00:06<04:13,  3.86it/s]

Episode 20	lengths: [100. 100.]	reward: [-32.27667063  45.82619548]]	full length: 100

[Episode 20]
Reward (mean): 6.77
Actor Loss: 0.3617 | Critic Loss: 1.0112
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


  3%|▎         | 31/1000 [00:08<04:14,  3.81it/s]


[Episode 30]
Reward (mean): 49.95
Actor Loss: 0.4339 | Critic Loss: 1.0572
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


  4%|▍         | 41/1000 [00:11<04:05,  3.90it/s]

Episode 40	lengths: [100. 100.]	reward: [-12.85368513  34.16484547]]	full length: 100

[Episode 40]
Reward (mean): 10.66
Actor Loss: 0.3643 | Critic Loss: 1.0583
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


  5%|▌         | 51/1000 [00:14<04:51,  3.26it/s]


[Episode 50]
Reward (mean): -12.35
Actor Loss: 0.3224 | Critic Loss: 0.9231
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


  6%|▌         | 61/1000 [00:17<04:10,  3.75it/s]

Episode 60	lengths: [100. 100.]	reward: [ 41.3785345  -51.92305827]]	full length: 100

[Episode 60]
Reward (mean): -5.27
Actor Loss: 0.3325 | Critic Loss: 0.9675
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


  7%|▋         | 71/1000 [00:19<03:58,  3.90it/s]


[Episode 70]
Reward (mean): -60.01
Actor Loss: 0.2350 | Critic Loss: 1.0500
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


  8%|▊         | 81/1000 [00:22<03:56,  3.89it/s]

Episode 80	lengths: [100. 100.]	reward: [18.60985883 19.40882935]]	full length: 100

[Episode 80]
Reward (mean): 19.01
Actor Loss: 0.3707 | Critic Loss: 1.1324
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


  9%|▉         | 91/1000 [00:25<04:43,  3.21it/s]


[Episode 90]
Reward (mean): 54.82
Actor Loss: 0.4316 | Critic Loss: 1.1794
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


 10%|█         | 101/1000 [00:28<03:59,  3.75it/s]

Episode 100	lengths: [100. 100.]	reward: [36.77222595 51.44622749]]	full length: 100

[Episode 100]
Reward (mean): 44.11
Actor Loss: 0.4109 | Critic Loss: 1.1559
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


 11%|█         | 111/1000 [00:30<03:50,  3.86it/s]


[Episode 110]
Reward (mean): 5.18
Actor Loss: 0.3404 | Critic Loss: 1.0804
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


 12%|█▏        | 121/1000 [00:33<03:42,  3.95it/s]

Episode 120	lengths: [100. 100.]	reward: [-27.61505038  77.55969197]]	full length: 100

[Episode 120]
Reward (mean): 24.97
Actor Loss: 0.3734 | Critic Loss: 0.9805
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


 13%|█▎        | 131/1000 [00:35<03:45,  3.85it/s]


[Episode 130]
Reward (mean): -27.30
Actor Loss: 0.2786 | Critic Loss: 0.9290
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


 14%|█▍        | 141/1000 [00:39<04:24,  3.24it/s]

Episode 140	lengths: [100. 100.]	reward: [-91.24863052  44.33011508]]	full length: 100

[Episode 140]
Reward (mean): -23.46
Actor Loss: 0.2832 | Critic Loss: 0.8766
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


 15%|█▌        | 151/1000 [00:41<03:39,  3.87it/s]


[Episode 150]
Reward (mean): -92.36
Actor Loss: 0.1577 | Critic Loss: 1.0184
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


 16%|█▌        | 161/1000 [00:44<03:34,  3.91it/s]

Episode 160	lengths: [100. 100.]	reward: [ 3.80020684 92.52385944]]	full length: 100

[Episode 160]
Reward (mean): 48.16
Actor Loss: 0.4072 | Critic Loss: 1.0393
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


 17%|█▋        | 171/1000 [00:46<03:33,  3.89it/s]


[Episode 170]
Reward (mean): -14.98
Actor Loss: 0.2918 | Critic Loss: 0.9436
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


 18%|█▊        | 181/1000 [00:49<04:19,  3.15it/s]

Episode 180	lengths: [100. 100.]	reward: [-11.27119991   1.15310721]]	full length: 100

[Episode 180]
Reward (mean): -5.06
Actor Loss: 0.3075 | Critic Loss: 1.0708
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


 19%|█▉        | 191/1000 [00:52<03:34,  3.77it/s]


[Episode 190]
Reward (mean): 41.69
Actor Loss: 0.3900 | Critic Loss: 1.0467
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


 20%|██        | 201/1000 [00:55<03:23,  3.92it/s]

Episode 200	lengths: [100. 100.]	reward: [-81.71754402 -39.18708792]]	full length: 100

[Episode 200]
Reward (mean): -60.45
Actor Loss: 0.2027 | Critic Loss: 0.9949
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


 21%|██        | 211/1000 [00:57<03:23,  3.87it/s]


[Episode 210]
Reward (mean): -59.89
Actor Loss: 0.2012 | Critic Loss: 1.0341
Prob Ratio - mean: 1.0000, max: 1.0000, min: 1.0000


 21%|██▏       | 213/1000 [00:58<03:35,  3.65it/s]


KeyboardInterrupt: 