In [2]:
import os # to log to specific positions
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy # average reward over episodes. plus std deviation
from datetime import datetime


In [3]:
environment_name = 'CartPole-v0'
env = gym.make(environment_name)



In [4]:
episodes = 5
for episode in range(1, episodes +1):
    state = env.reset()
    done = False
    score = 0
    
    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score += reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()



Episode:1 Score:12.0
Episode:2 Score:12.0
Episode:3 Score:57.0
Episode:4 Score:23.0
Episode:5 Score:20.0


In [5]:
log_path = os.path.join(os.getcwd(), 'logs')

env = gym.make(environment_name)
env = DummyVecEnv([lambda: env])


In [8]:
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

model.learn(total_timesteps=20_000)

PPO_Path = os.path.join(os.getcwd(), 'models', 'PPO_model_' + environment_name + '_' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))

model.save(PPO_Path)

Using cuda device
Logging to d:\git\rl-test\nicholas-renotte\3h-rl\logs\PPO_7
-----------------------------
| time/              |      |
|    fps             | 610  |
|    iterations      | 1    |
|    time_elapsed    | 3    |
|    total_timesteps | 2048 |
-----------------------------
----------------------------------------
| time/                   |            |
|    fps                  | 491        |
|    iterations           | 2          |
|    time_elapsed         | 8          |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.00983984 |
|    clip_fraction        | 0.122      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.686     |
|    explained_variance   | -0.000463  |
|    learning_rate        | 0.0003     |
|    loss                 | 6.82       |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.0196    |
|    value_loss           | 52.6       |
---------------

## 6. Test Model

In [10]:
obs = env.reset()
action, _ = model.predict(obs)
action

array([1], dtype=int64)

In [16]:
episodes = 5
for episodes in range(1, episodes +1):
    obs = env.reset()
    done = False
    score = 0
    
    while not done:
        env.render()
        action, _ = model.predict(obs)
        state, reward, done, info = env.step(action)
        score += reward
    print('Episode:{} Score:{}'.format(episode, score))

Episode:5 Score:[32.]
Episode:5 Score:[25.]
Episode:5 Score:[12.]
Episode:5 Score:[18.]
Episode:5 Score:[19.]


In [14]:
env.close()

In [23]:
action, state = model.predict(obs)

## 7. Viewing Logs in Tensorboard

In [27]:
training_log_path = os.path.join(log_path, 'PPO_2')
training_log_path

'd:\\git\\rl-test\\nicholas-renotte\\3h-rl\\logs\\PPO_2'

In [28]:
!tensorboard --logdir=${training_log_path}

^C


tensorboard --logdir=d:\\git\\rl-test\\nicholas-renotte\\3h-rl\\logs\\PPO_2

## 8. Adding a callback to the training Stage

In [29]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [34]:
save_path = os.path.join('Training', 'Saved_Models')

In [36]:
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose = 1)
eval_callback = EvalCallback(env,
                            callback_on_new_best=stop_callback,
                            eval_freq=10_000,
                            best_model_save_path=save_path,
                            verbose=1)

In [38]:
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cuda device


In [39]:
model.learn(total_timesteps=20_000, callback=eval_callback)

Logging to d:\git\rl-test\nicholas-renotte\3h-rl\logs\PPO_8
-----------------------------
| time/              |      |
|    fps             | 495  |
|    iterations      | 1    |
|    time_elapsed    | 4    |
|    total_timesteps | 2048 |
-----------------------------
----------------------------------------
| time/                   |            |
|    fps                  | 416        |
|    iterations           | 2          |
|    time_elapsed         | 9          |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.00944485 |
|    clip_fraction        | 0.107      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.686     |
|    explained_variance   | -0.00111   |
|    learning_rate        | 0.0003     |
|    loss                 | 5.96       |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.0155    |
|    value_loss           | 48.8       |
---------------------------------



Eval num_timesteps=10000, episode_reward=181.60 +/- 17.22
Episode length: 181.60 +/- 17.22
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 182         |
|    mean_reward          | 182         |
| time/                   |             |
|    total_timesteps      | 10000       |
| train/                  |             |
|    approx_kl            | 0.006620636 |
|    clip_fraction        | 0.0525      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.615      |
|    explained_variance   | 0.248       |
|    learning_rate        | 0.0003      |
|    loss                 | 20.9        |
|    n_updates            | 40          |
|    policy_gradient_loss | -0.0143     |
|    value_loss           | 56.6        |
-----------------------------------------
New best mean reward!
------------------------------
| time/              |       |
|    fps             | 367   |
|    iterations      | 5     |
|    ti

<stable_baselines3.ppo.ppo.PPO at 0x16e0c0cf7c0>

## 9. Changing Policies

In [41]:
new_arch = [dict(pi=[128, 128, 128, 128], vf=[128,128,128,128])]

In [42]:
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path, policy_kwargs=dict(net_arch=new_arch))

Using cuda device


In [43]:
model.learn(total_timesteps=20_000, callback=eval_callback)

Logging to d:\git\rl-test\nicholas-renotte\3h-rl\logs\PPO_9
-----------------------------
| time/              |      |
|    fps             | 458  |
|    iterations      | 1    |
|    time_elapsed    | 4    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 364         |
|    iterations           | 2           |
|    time_elapsed         | 11          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.014991329 |
|    clip_fraction        | 0.223       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.681      |
|    explained_variance   | 0.00265     |
|    learning_rate        | 0.0003      |
|    loss                 | 3.1         |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0269     |
|    value_loss           | 20.6        |
----------------



Eval num_timesteps=10000, episode_reward=200.00 +/- 0.00
Episode length: 200.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 200         |
|    mean_reward          | 200         |
| time/                   |             |
|    total_timesteps      | 10000       |
| train/                  |             |
|    approx_kl            | 0.020241303 |
|    clip_fraction        | 0.128       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.568      |
|    explained_variance   | 0.276       |
|    learning_rate        | 0.0003      |
|    loss                 | 9.4         |
|    n_updates            | 40          |
|    policy_gradient_loss | -0.0164     |
|    value_loss           | 44.3        |
-----------------------------------------
------------------------------
| time/              |       |
|    fps             | 317   |
|    iterations      | 5     |
|    time_elapsed    | 32    |


<stable_baselines3.ppo.ppo.PPO at 0x16e0c0c5370>

## 10. Using an Alternate Algorithm

In [44]:
from stable_baselines3 import DQN

In [45]:
model = DQN('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cuda device


In [46]:
model.learn(total_timesteps=20_000, callback=eval_callback)

Logging to d:\git\rl-test\nicholas-renotte\3h-rl\logs\DQN_1
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.933    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 11142    |
|    time_elapsed     | 0        |
|    total_timesteps  | 142      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.898    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 12117    |
|    time_elapsed     | 0        |
|    total_timesteps  | 215      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.854    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 12443    |
|    time_elapsed     | 0        |
|    total_timesteps  | 308      |
------------------------------



Eval num_timesteps=9520, episode_reward=61.80 +/- 47.39
Episode length: 61.80 +/- 47.39
----------------------------------
| eval/               |          |
|    mean_ep_length   | 61.8     |
|    mean_reward      | 61.8     |
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    total_timesteps  | 9520     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 412      |
|    fps              | 9985     |
|    time_elapsed     | 0        |
|    total_timesteps  | 9531     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 416      |
|    fps              | 10002    |
|    time_elapsed     | 0        |
|    total_timesteps  | 9587     |
--

<stable_baselines3.dqn.dqn.DQN at 0x16e0c0c54c0>