In [None]:
!pip install stable-baselines3[extra]

In [None]:
import stable_baselines3
print(stable_baselines3.__version__)

2.3.2


In [12]:
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Current device: {torch.cuda.current_device() if torch.cuda.is_available() else 'CPU'}")
if torch.cuda.is_available():
    print(f"Device name: {torch.cuda.get_device_name()}")

CUDA available: True
Current device: 0
Device name: NVIDIA GeForce GTX 1650


In [4]:
import gymnasium as gym 
from stable_baselines3 import PPO   #algorithm
from stable_baselines3.common.vec_env import DummyVecEnv    #environment
from stable_baselines3.common.evaluation import evaluate_policy #performance evaluation

In [5]:
print(gym.__version__)

0.29.1


Environment

In [None]:
env = gym.make("CartPole-v1", render_mode="human")
episodes = 5
for episode in range(1, episodes + 1):
    state = env.reset() #observations of the environment, which will be given to agent
    done = False
    score = 0 

    while not done:
        env.render()
        action = env.action_space.sample()  #generate random action from action space
                                            #which is Discrete(2) for CartPole-v0
                                            # -> 0 or 1
        n_state, reward, done, truncated, info = env.step(action)   #next step
        score += reward
        done = done or truncated  #if either 'done' or 'truncated', end episode
        
    print('Episode:{} Score:{}'.format(episode, score))

env.close()

Episode:1 Score:18.0
Episode:2 Score:19.0
Episode:3 Score:24.0
Episode:4 Score:43.0
Episode:5 Score:46.0


What is each value:
https://www.gymlibrary.dev/environments/classic_control/cart_pole/

In [7]:
env.action_space

Discrete(2)

In [8]:
env.observation_space #4 values float32

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

Train the model

In [9]:
import os
log_path = os.path.join('Training', 'Logs')
log_path

'Training\\Logs'

In [10]:
env = gym.make("CartPole-v1")
env = DummyVecEnv([lambda: env])
model = PPO('MlpPolicy', env, verbose = 1) #PPO alg, MlpPolicy is the neural network, env, verbose (print)

Using cuda device


In [11]:
model.learn(total_timesteps=20000)

-----------------------------
| time/              |      |
|    fps             | 577  |
|    iterations      | 1    |
|    time_elapsed    | 3    |
|    total_timesteps | 2048 |
-----------------------------
-------------------------------------------
| time/                   |               |
|    fps                  | 466           |
|    iterations           | 2             |
|    time_elapsed         | 8             |
|    total_timesteps      | 4096          |
| train/                  |               |
|    approx_kl            | 0.007853181   |
|    clip_fraction        | 0.0795        |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.687        |
|    explained_variance   | 0.00095808506 |
|    learning_rate        | 0.0003        |
|    loss                 | 6.45          |
|    n_updates            | 10            |
|    policy_gradient_loss | -0.0128       |
|    value_loss           | 50.2          |
------------------------------------------

<stable_baselines3.ppo.ppo.PPO at 0x2d615ea1e20>

Save and reload model

In [13]:
PPO_path = os.path.join('Training', 'Saved Models', 'PPO_model')

In [14]:
model.save(PPO_path)

In [15]:
#delete model and load it again
#del model
#model = PPO.load(PPO_path, env=env)

Evaluate

In [16]:
env = gym.make("CartPole-v1", render_mode="human")
evaluate_policy(model, env, n_eval_episodes=10, render=True) #put render=True to see the agent in action (slower)



(np.float64(500.0), np.float64(0.0))

In [17]:
env.close()

^ Number of rewards (500 is max) and standard deviation (0 is best)

Test model

In [18]:
env = gym.make("CartPole-v1", render_mode="human")
episodes = 5
for episode in range(1, episodes + 1):
    obs, info = env.reset()
    done = False
    score = 0 

    while not done:
        env.render()
        action, _ = model.predict(obs) #now using model
        obs, reward, done, truncated, info = env.step(action)   #next step
        score += reward
        done = done or truncated  #if either 'done' or 'truncated', end episode
        
    print('Episode:{} Score:{}'.format(episode, score))

env.close()

Episode:1 Score:265.0
Episode:2 Score:430.0
Episode:3 Score:500.0
Episode:4 Score:500.0
Episode:5 Score:287.0


In [19]:
env = gym.make("CartPole-v1", render_mode="human")
obs, info = env.reset()
print(model.predict(obs)) #using agent to predict action
env.action_space.sample() #instead of random action

(array(0), None)


np.int64(0)

In [20]:
env.step(action)
#reward is 1 for every successful step
#current state, reward, done, info

(array([-0.0443561 , -0.22341287, -0.01687777,  0.32119074], dtype=float32),
 1.0,
 False,
 False,
 {})

In [21]:
env.close()

View logs in tensorboard

In [22]:
#training_log_path = os.path.join(log_path, 'PPO_3')
#!tensorboard --logdir={training_log_path}

Adding a callback to the training stage


In [23]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
import os

In [24]:
save_path = os.path.join('Training', 'Saved Models')
log_path = os.path.join('Training', 'Logs')

In [25]:
env = gym.make("CartPole-v1", render_mode="human")
env = DummyVecEnv([lambda: env])

In [None]:
#setup callbacks
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)
eval_callback = EvalCallback(env, 
                             callback_on_new_best=stop_callback, #if new best model, stop training on this model
                             eval_freq=10000, 
                             best_model_save_path=save_path, 
                             verbose=1)

In [27]:
model = PPO('MlpPolicy', env, verbose = 1, tensorboard_log=log_path)

Using cuda device


In [28]:
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to Training\Logs\PPO_2
-----------------------------
| time/              |      |
|    fps             | 46   |
|    iterations      | 1    |
|    time_elapsed    | 43   |
|    total_timesteps | 2048 |
-----------------------------
-------------------------------------------
| time/                   |               |
|    fps                  | 45            |
|    iterations           | 2             |
|    time_elapsed         | 89            |
|    total_timesteps      | 4096          |
| train/                  |               |
|    approx_kl            | 0.0083107     |
|    clip_fraction        | 0.0912        |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.686        |
|    explained_variance   | -0.0035799742 |
|    learning_rate        | 0.0003        |
|    loss                 | 7.87          |
|    n_updates            | 10            |
|    policy_gradient_loss | -0.0157       |
|    value_loss           | 53.2          |
-----------

<stable_baselines3.ppo.ppo.PPO at 0x2d636bb3a60>

Changing policies

In [30]:
net_arch=[dict(pi=[128, 128, 128, 128], vf=[128, 128, 128, 128])]

In [32]:
model = PPO('MlpPolicy', env, verbose = 1, tensorboard_log=log_path, policy_kwargs={'net_arch': net_arch})

Using cuda device




In [None]:
model.learn(total_timesteps=20000, callback=eval_callback)

Using alternate algorithm

In [None]:
model = DQN('MlpPolicy', env, verbose = 1, tensorboard_log=log_path)

In [None]:
model.learn(total_timesteps=20000, callback=eval_callback)

In [None]:
dqn_path = os.path.join('Training', 'Saved Models', 'DQN_model')

In [None]:
model.save(dqn_path)

In [None]:
model = DQN.load(dqn_path, env=env)

In [None]:
evaluate_policy(model, env, n_eval_episodes=10, render=True)

In [None]:
env.close()