# Training PPO for CartPole

This notebook trains an agent on the CartPole-v1 environment using [PPOPolicy from Tianshou](https://tianshou.org/en/stable/03_api/policy/modelfree/ppo.html).

In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each cell is executed.
%load_ext autoreload
%autoreload 2

# Install dependencies

In [2]:
!pip install gymnasium==0.29.1 pygame==2.3.0 pettingzoo==1.24.3 tianshou==0.5.1

Collecting gymnasium==0.29.1
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pygame==2.3.0
  Downloading pygame-2.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m51.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pettingzoo==1.24.3
  Downloading pettingzoo-1.24.3-py3-none-any.whl (847 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m847.8/847.8 kB[0m [31m60.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tianshou==0.5.1
  Downloading tianshou-0.5.1-py3-none-any.whl (163 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.1/163.1 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium==0.29.1)
  Downloading Farama_Notifications-0.0.4-py3-none-an

# Setup environment

In [15]:
import gymnasium as gym
import torch

import tianshou as ts

# Run on the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

def get_env(render_mode=None):
    """Create a new CartPole-v1 environment.

    Args:
        render_mode: Forwarded unchanged to ``gym.make``; ``None`` by default.

    Returns:
        The environment object returned by ``gym.make``.
    """
    cartpole = gym.make("CartPole-v1", render_mode=render_mode)
    return cartpole

# Single reference environment; later cells read its observation/action spaces.
env = get_env()

In [16]:
# DummyVectorEnv takes a list of environment factories; `get_env` is reused for
# every slot, 10 times for training and 100 times for testing.
# NOTE(review): the trainer cell requests only 10 test episodes, so confirm
# that 100 test environments are actually needed.
train_env_fns = [get_env] * 10
test_env_fns = [get_env] * 100
train_envs = ts.env.DummyVectorEnv(train_env_fns)
test_envs = ts.env.DummyVectorEnv(test_env_fns)

# Create Policy

In [17]:
# Use the space's `shape` when it is non-empty, otherwise fall back to `n`
# (a discrete space has an empty shape).
state_shape = env.observation_space.shape or env.observation_space.n
action_shape = env.action_space.shape or env.action_space.n

# Feature extractor passed as `preprocess_net` to both the actor and the critic.
net = ts.utils.net.common.Net(state_shape=state_shape, hidden_sizes=[64, 64], device=device)

# Use the `action_shape` computed above rather than re-reading the action space,
# so the value is derived in exactly one place.
actor = ts.utils.net.discrete.Actor(preprocess_net=net, action_shape=action_shape, device=device).to(device)
critic = ts.utils.net.discrete.Critic(preprocess_net=net, device=device).to(device)
actor_critic = ts.utils.net.common.ActorCritic(actor, critic)
# A single optimizer over the combined actor-critic parameters.
optim = torch.optim.Adam(actor_critic.parameters(), lr=0.0003)

policy = ts.policy.PPOPolicy(
    actor=actor,
    critic=critic,
    optim=optim,
    dist_fn=torch.distributions.Categorical,
    action_space=env.action_space,
    action_scaling=False,
)

In [18]:
# Size the vectorized buffer from the vector env itself (one sub-buffer per
# training environment) so the two cannot drift apart if the env count changes.
train_buffer = ts.data.VectorReplayBuffer(20000, len(train_envs))
train_collector = ts.data.Collector(
    policy, train_envs, train_buffer, exploration_noise=True
)
test_collector = ts.data.Collector(policy, test_envs, exploration_noise=True)

# Train the agent
Using the `OnpolicyTrainer` (PPO is an on-policy algorithm).

In [20]:
def reached_target_reward(mean_reward):
    """Stop criterion: true once the mean test reward is at least 500."""
    return mean_reward >= 500

# Build the trainer first, then run it, so the configuration reads on its own.
trainer = ts.trainer.OnpolicyTrainer(
    policy=policy,
    train_collector=train_collector,
    test_collector=test_collector,
    max_epoch=10,
    step_per_epoch=50000,
    step_per_collect=2000,
    repeat_per_collect=10,
    episode_per_test=10,
    batch_size=256,
    stop_fn=reached_target_reward,
)
result = trainer.run()
print(f'Finished training! Use {result["duration"]}')

result

Epoch #1: 50001it [00:16, 3005.81it/s, env_step=50000, len=157, loss=57.326, loss/clip=-0.004, loss/ent=0.584, loss/vf=114.670, n/ep=14, n/st=2000, rew=157.43]                           


Epoch #1: test_reward: 143.300000 ± 83.316325, best_reward: 143.300000 ± 83.316325 in #1


Epoch #2: 50001it [00:17, 2881.67it/s, env_step=100000, len=160, loss=57.699, loss/clip=0.003, loss/ent=0.570, loss/vf=115.403, n/ep=17, n/st=2000, rew=160.88]                           


Epoch #2: test_reward: 145.500000 ± 102.473655, best_reward: 145.500000 ± 102.473655 in #2


Epoch #3: 50001it [00:16, 3100.44it/s, env_step=150000, len=169, loss=48.687, loss/clip=0.000, loss/ent=0.547, loss/vf=97.383, n/ep=11, n/st=2000, rew=169.64]                           


Epoch #3: test_reward: 204.900000 ± 98.658451, best_reward: 204.900000 ± 98.658451 in #3


Epoch #4: 50001it [00:16, 2990.13it/s, env_step=200000, len=411, loss=18.393, loss/clip=0.008, loss/ent=0.579, loss/vf=36.781, n/ep=6, n/st=2000, rew=411.67]                           


Epoch #4: test_reward: 234.000000 ± 104.868489, best_reward: 234.000000 ± 104.868489 in #4


Epoch #5: 50001it [00:17, 2939.66it/s, env_step=250000, len=300, loss=13.316, loss/clip=-0.002, loss/ent=0.576, loss/vf=26.648, n/ep=1, n/st=2000, rew=300.00]                           


Epoch #5: test_reward: 243.900000 ± 208.074242, best_reward: 243.900000 ± 208.074242 in #5


Epoch #6: 50001it [00:17, 2903.97it/s, env_step=300000, len=133, loss=17.080, loss/clip=0.002, loss/ent=0.550, loss/vf=34.167, n/ep=2, n/st=2000, rew=133.00]                           


Epoch #6: test_reward: 387.500000 ± 175.150935, best_reward: 387.500000 ± 175.150935 in #6


Epoch #7: 50001it [00:17, 2826.54it/s, env_step=350000, len=175, loss=78.020, loss/clip=-0.000, loss/ent=0.551, loss/vf=156.052, n/ep=16, n/st=2000, rew=175.38]                           


Epoch #7: test_reward: 303.900000 ± 174.958538, best_reward: 387.500000 ± 175.150935 in #6


Epoch #8: 50001it [00:16, 2948.04it/s, env_step=400000, len=275, loss=48.700, loss/clip=-0.000, loss/ent=0.555, loss/vf=97.411, n/ep=10, n/st=2000, rew=275.00]                           


Epoch #8: test_reward: 117.900000 ± 60.769153, best_reward: 387.500000 ± 175.150935 in #6


Epoch #9:  52%|#####2    | 26000/50000 [00:08<00:07, 3011.79it/s, env_step=426000, len=500, n/ep=4, n/st=2000, rew=500.00]

Finished training! Use 148.60s





{'duration': '148.60s',
 'train_time/model': '76.91s',
 'test_step': 35838,
 'test_episode': 130,
 'test_time': '6.93s',
 'test_speed': '5168.02 step/s',
 'best_reward': 500.0,
 'best_result': '500.00 ± 0.00',
 'train_step': 426000,
 'train_episode': 2886,
 'train_time/collector': '64.75s',
 'train_speed': '3007.10 step/s'}

# Play

Run the trained agent for a number of episodes and display the mean episode reward.

In [21]:
policy.eval()

# Build the evaluation vector env under its own name and hand DummyVectorEnv the
# factory directly. This keeps `env` bound to the single gym environment from
# the setup cell; rebinding it to a vector env would break re-running the cell
# that reads `env.observation_space` / `env.action_space`.
eval_envs = ts.env.DummyVectorEnv([get_env])
collector = ts.data.Collector(policy, eval_envs, exploration_noise=True)
result = collector.collect(n_episode=100, render=None)
rews, lens = result["rews"], result["lens"]

display(rews.mean())

480.35

# Plot results

In [22]:
import plotly.figure_factory as ff

# Distribution plot of the values stored under result['rews'].
fig = ff.create_distplot([result['rews']], ['reward'])
# The agent trained in this notebook is PPO, so label the figure accordingly.
fig.update_layout(title_text='CartPole PPO Result')
fig.show()