In [58]:
import numpy
import torch
import gym 
import imageio
from stable_baselines3 import PPO
from stable_baselines3 import PPO 

In [59]:
# Rendering switch:
#   False -> the environment returns RGB frames so an episode can be saved as a GIF
#   True  -> the environment renders on screen (pygame window) instead
show = False

env = gym.make("CartPole-v1", render_mode="human" if show else "rgb_array")

# Baseline agent: PPO with the default MLP policy, trained for 10k timesteps.
model = PPO(
    "MlpPolicy",
    env,
    verbose=1,
    device='cuda',
)
model.learn(total_timesteps=10000)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 24       |
|    ep_rew_mean     | 24       |
| time/              |          |
|    fps             | 3682     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 26          |
|    ep_rew_mean          | 26          |
| time/                   |             |
|    fps                  | 1945        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.007992635 |
|    clip_fraction        | 0.0919      |
|    clip_range           | 0.2         |
|    entropy_loss   

<stable_baselines3.ppo.ppo.PPO at 0x1606dfde5d0>

In [None]:
# Evaluate the trained agent for a fixed number of episodes using the
# deterministic policy, and capture the frames of the first episode when
# rendering off-screen (show == False).
custom_rewards = []
episodes = 10
frames = []

for episode in range(episodes):
    state, _ = env.reset()
    done = False
    episode_reward = 0

    while not done:
        if not show and episode == 0:
            frames.append(env.render())
        action, _ = model.predict(state, deterministic=True)
        state, reward, terminated, truncated, _ = env.step(action)
        # The episode is over when the environment terminates OR when it is
        # truncated (e.g. by a time limit). Ignoring `truncated` lets the loop
        # keep stepping past the limit, inflating the reward and potentially
        # never ending for a policy that does not fail.
        done = terminated or truncated
        episode_reward += reward

    custom_rewards.append(episode_reward)

avg_custom_reward = sum(custom_rewards) / episodes
print(f"Average reward : {avg_custom_reward}")

# Save frames as a GIF (no frames are collected when rendering on screen,
# so there is nothing to write in that case).
if frames:
    imageio.mimsave('cartpole_agent_part1.gif', frames, fps=30)
    print("GIF saved as 'cartpole_agent_part1.gif'")

Average reward : 320.4
GIF saved as 'cartpole_agent_part1.gif'


In [61]:
import numpy as np
from gym import RewardWrapper

class CustomCartPoleReward(RewardWrapper):
    """Reward wrapper that penalizes the pole's deviation from upright.

    The shaped reward is ``reward - |theta|``, where ``theta`` is the third
    entry of the wrapped environment's ``state``. Only the angle is
    penalized; cart position and the velocities are not used.
    """

    def __init__(self, env):
        super().__init__(env)

    def reward(self, reward):
        """Return the base reward minus the absolute pole angle.

        Args:
            reward: Reward produced by the wrapped environment for this step.

        Returns:
            ``reward - abs(theta)``.
        """
        # Read the state from the innermost environment rather than relying on
        # intermediate wrappers forwarding the `state` attribute.
        _, _, theta, _ = self.unwrapped.state
        return reward - np.abs(theta)  # Penalize for angle from upright

# Build the reward-shaped environment; the render mode follows the `show` switch
# (on-screen window when True, RGB frame arrays when False).
custom_env = CustomCartPoleReward(
    gym.make("CartPole-v1", render_mode="human" if show else "rgb_array")
)

In [62]:

# Train a second PPO agent, this time on the reward-shaped environment.
custom_model = PPO(
    "MlpPolicy",
    custom_env,
    verbose=1,
    device='cuda',
)
custom_model.learn(total_timesteps=10000)


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 23.1     |
|    ep_rew_mean     | 21.2     |
| time/              |          |
|    fps             | 3175     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 28.2        |
|    ep_rew_mean          | 26          |
| time/                   |             |
|    fps                  | 1798        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.007637154 |
|    clip_fraction        | 0.0603      |
|    clip_range           | 0.2         |
|    entropy_loss   

<stable_baselines3.ppo.ppo.PPO at 0x16062514bc0>

In [63]:
# Evaluate the agent trained on the reward-shaped environment for a fixed
# number of episodes (deterministic policy), capturing the frames of the first
# episode when rendering off-screen (show == False).
custom_rewards = []
episodes = 10
frames = []

for episode in range(episodes):
    state, _ = custom_env.reset()
    done = False
    episode_reward = 0

    while not done:
        if not show and episode == 0:
            frames.append(custom_env.render())
        action, _ = custom_model.predict(state, deterministic=True)
        state, reward, terminated, truncated, _ = custom_env.step(action)
        # The episode is over when the environment terminates OR when it is
        # truncated (e.g. by a time limit). Ignoring `truncated` lets the loop
        # keep stepping past the limit, inflating the reward and potentially
        # never ending for a policy that does not fail.
        done = terminated or truncated
        episode_reward += reward

    custom_rewards.append(episode_reward)

avg_custom_reward = sum(custom_rewards) / episodes
print(f"Average reward with custom reward function: {avg_custom_reward}")

# No frames are collected when rendering on screen, so only write the GIF
# when there is something to save.
if frames:
    imageio.mimsave('cartpole_agent_part2.gif', frames, fps=30)
    print("GIF saved as 'cartpole_agent_part2.gif'")

Average reward with custom reward function: 442.5095105268057
GIF saved as 'cartpole_agent_part2.gif'


In [64]:
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
import torch as th
from torch import nn
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.utils import set_random_seed

class CustomMLP(nn.Module):
    """Feed-forward network: four hidden ReLU layers plus a linear output layer.

    With the default arguments this is the original fixed architecture
    (4 inputs -> 64 -> 64 -> 64 -> 64 -> 2 outputs). The defaults are
    presumably sized for CartPole's observation/action spaces -- confirm
    before reusing with another environment.

    The layers are moved to CUDA when it is available, otherwise they stay on
    the CPU.

    NOTE(review): in the visible notebook cells the PPO model is configured via
    ``policy_kwargs`` and does not reference this class.

    Args:
        input_dim: Size of the input feature vector.
        hidden_dim: Width of each of the four hidden layers.
        output_dim: Size of the output vector.
    """

    def __init__(self, input_dim=4, hidden_dim=64, output_dim=2):
        super().__init__()
        device = th.device('cuda' if th.cuda.is_available() else 'cpu')
        # Layer order (and therefore the state_dict keys) matches the original
        # hard-coded definition.
        self.network = nn.Sequential(
            nn.Linear(input_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        ).to(device)

    def forward(self, x):
        """Apply the network to ``x`` (must be on the same device as the layers)."""
        return self.network(x)

# Recreate the PPO agent with a customized policy network:
# ReLU activations and two hidden layers of 128 units each.
model = PPO(
    "MlpPolicy",
    env,
    policy_kwargs={"activation_fn": th.nn.ReLU, "net_arch": [128, 128]},
    verbose=1,
    device='cuda',
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [65]:
# Train the agent with the customized policy network for 10k timesteps.
model.learn(total_timesteps=10_000)

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 20.9     |
|    ep_rew_mean     | 20.9     |
| time/              |          |
|    fps             | 3302     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 28          |
|    ep_rew_mean          | 28          |
| time/                   |             |
|    fps                  | 1822        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.011595517 |
|    clip_fraction        | 0.0764      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.687      |
|    explained_variance   | -0.0182     |
|    learning_rate        | 0.

<stable_baselines3.ppo.ppo.PPO at 0x1606dfdc350>

In [66]:
# Evaluate the agent with the customized policy network for a fixed number of
# episodes (deterministic policy), capturing the frames of the first episode
# when rendering off-screen (show == False).
custom_rewards = []
episodes = 10
frames = []

for episode in range(episodes):
    state, _ = env.reset()
    done = False
    episode_reward = 0

    while not done:
        if not show and episode == 0:
            frames.append(env.render())
        action, _ = model.predict(state, deterministic=True)
        state, reward, terminated, truncated, _ = env.step(action)
        # The episode is over when the environment terminates OR when it is
        # truncated (e.g. by a time limit). Ignoring `truncated` lets the loop
        # keep stepping past the limit, inflating the reward and potentially
        # never ending for a policy that does not fail.
        done = terminated or truncated
        episode_reward += reward

    custom_rewards.append(episode_reward)

avg_custom_reward = sum(custom_rewards) / episodes
print(f"Average reward : {avg_custom_reward}")

# No frames are collected when rendering on screen, so only write the GIF
# when there is something to save.
if frames:
    imageio.mimsave('cartpole_agent_part3.gif', frames, fps=30)
    print("GIF saved as 'cartpole_agent_part3.gif'")

Average reward : 283.6
GIF saved as 'cartpole_agent_part3.gif'
