# Stable Baselines3 - Car Racing

In [4]:
# !apt-get update -qq && apt-get install swig cmake -qq
!apt-get install swig
!pip install box2d-py -q
!pip install -q "stable-baselines3[extra]"

E: Could not open lock file /var/lib/dpkg/lock-frontend - open (13: Permission denied)
E: Unable to acquire the dpkg frontend lock (/var/lib/dpkg/lock-frontend), are you root?


## Importações

In [5]:
import gymnasium as gym
import numpy as np
import pandas as pd
import numpy as np

from stable_baselines3.common.vec_env import DummyVecEnv

### Importando Algoritmo e Política

In [6]:
from stable_baselines3 import PPO

#### Função Auxiliar

In [7]:
def evaluate(model, num_episodes=10, deterministic=True, qi=0, results=None):
    """
    Evaluate an RL agent on the (single) vectorized environment attached to it.

    Each episode's total reward is also written to a results table, in row
    ``qi`` and column ``episode number`` (1-based).

    :param model: (BaseRLModel object) the RL Agent; must expose ``get_env()``
        and ``predict(obs, deterministic=...)``
    :param num_episodes: (int) number of episodes to evaluate it
    :param deterministic: (bool) forwarded to ``model.predict``
    :param qi: row label of the results table that receives the episode rewards
    :param results: (pandas.DataFrame or None) table to fill; when ``None`` the
        notebook-level ``df_treinos`` is used
    :return: (float) Mean reward for the last num_episodes
    """
    if results is None:
        # Backward-compatible default: the table created at notebook level.
        results = df_treinos

    # This function will only work for a single Environment
    vec_env = model.get_env()
    all_episode_rewards = []
    for i in range(num_episodes):
        episode_rewards = []
        done = False
        obs = vec_env.reset()
        while not done:
            # _states are only useful when using LSTM policies
            action, _states = model.predict(obs, deterministic=deterministic)
            # The vectorized env returns a 4-tuple whose rewards and dones are
            # arrays with one entry per sub-environment.
            obs, rewards, dones, infos = vec_env.step(action)
            episode_rewards.append(rewards)
            # Single environment: its flag is the first (only) entry.
            done = bool(dones[0])

        episode_return = sum(episode_rewards)
        all_episode_rewards.append(episode_return)
        # One .loc call with (row, column): chained indexing such as
        # results.loc[qi][i + 1] = ... may assign to a temporary copy and
        # leave the table unchanged.
        # NOTE: if `qi` is not an existing row label, .loc adds a new row.
        results.loc[qi, i + 1] = episode_return[0]

    mean_episode_reward = np.mean(all_episode_rewards)
    print("Mean reward:", mean_episode_reward, "Num episodes:", num_episodes)

    return mean_episode_reward

### Método de avaliação do Stable Baselines

In [8]:
from stable_baselines3.common.evaluation import evaluate_policy

### Criando Ambiente

In [9]:
# Training environment with discrete actions, wrapped as a single-env VecEnv.
# (Add render_mode="human" to gym.make to visualise during evaluation.)
env = DummyVecEnv([lambda: gym.make("CarRacing-v2", continuous=False)])

In [10]:
# Observation space
env.observation_space

Box(0, 255, (96, 96, 3), uint8)

In [11]:
# Action space
env.action_space

Discrete(5)

### Avaliando agente não treinado

In [13]:
# Baseline: score a freshly initialised (untrained) PPO agent.
# The evaluation uses its own environment instance, separate from `env`.
eval_env = gym.make("CarRacing-v2", continuous=False)
model = PPO("CnnPolicy", env)

# evaluate_policy returns the mean and standard deviation of the episode rewards.
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10, deterministic=True)

print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

mean_reward=-93.02 +/- 0.4213733267964002


## Treina o agente

### Inicialização

In [14]:
# Training budgets (total timesteps) to compare, and the number of evaluation
# episodes run for each budget.
# NOTE(review): the budgets are floats, so they render as e.g. "20000.0" when
# formatted into strings such as the checkpoint file names.
quantidades = [2e4, 5e4, 1e5, 2e5, 4e5]
qtd_teste = 10

In [15]:
# Results table: one row per training budget, one column per evaluation
# episode (numbered 1..qtd_teste). Cells are empty until evaluate() fills them.
df_treinos = pd.DataFrame(
    index=quantidades,
    columns=np.arange(1, qtd_teste + 1),
)

### Execução

In [None]:


for qtd in quantidades:
    print(qtd)

    # Algorithm + policy: a fresh, untrained agent for every budget.
    model = PPO("CnnPolicy", env)

    # Train the agent. SB3 documents total_timesteps as an int, while the
    # budgets are stored as floats, hence the cast.
    model.learn(total_timesteps=int(qtd))

    # Save the model. The float-formatted name (e.g. "20000.0k_...") is kept
    # unchanged because later cells load the checkpoints by this exact name.
    model.save(f"./{qtd}k_car_racing_PPO")

    # Evaluate the model; per-episode rewards go to row `qtd` of df_treinos.
    evaluate(model, num_episodes=qtd_teste, qi=qtd)

    # Drop the model before training the next one.
    del model

    # Reset the environment.
    env.reset()
    print()



20000.0
Mean reward: -24.9728 Num episodes: 10

50000.0
Mean reward: 462.34772 Num episodes: 10

100000.0
Mean reward: -30.031702 Num episodes: 10

200000.0
Mean reward: 866.6138 Num episodes: 10

400000.0


In [None]:
#model.save("./200k_car_racing_PPO")
#model = PPO.load(f"./{numer}k_car_racing_PPO", env)

## Avalia o agente treinado

### Avaliação com visualização (jupyter notebook)

In [None]:
# Rebuild the environment with an on-screen window, load the checkpoint of the
# largest training budget and watch it for a few episodes.
env = DummyVecEnv(
    [lambda: gym.make("CarRacing-v2", continuous=False, render_mode="human")]
)
model = PPO.load(f"./{4e5}k_car_racing_PPO", env)
mean_reward, std_reward = evaluate_policy(
    model,
    env,
    n_eval_episodes=3,
    render=True,
)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

In [None]:
# Close the environment that `env` currently refers to.
env.close()

#### Resultado

In [None]:
# Per-episode evaluation rewards: rows are training budgets, columns are episode numbers.
df_treinos

In [None]:
# Transpose so that each training budget becomes a column, cast to float and
# summarise the episode rewards of each budget.
df_treinos.T.astype(float).describe()

### Prepare video recording

In [None]:
# Set up fake display; otherwise rendering will fail.
# Launches Xvfb on display :1 in the background and points DISPLAY at it.
# NOTE(review): the exit status of os.system is not checked, so a missing Xvfb
# binary goes unnoticed here.
import os
os.system("Xvfb :1 -screen 0 1024x768x24 &")
os.environ['DISPLAY'] = ':1'

In [None]:
import base64
from pathlib import Path

from IPython import display as ipythondisplay


def show_videos(video_path="", prefix=""):
    """
    Display the MP4 files of a folder inline, embedded as base64 data URIs.

    Taken from https://github.com/eleurent/highway-env

    :param video_path: (str) Path to the folder containing videos
    :param prefix: (str) Filter the videos, showing only those whose file name
        starts with this prefix
    """
    html = []
    # glob() yields files in arbitrary order; sort for a stable display order.
    for mp4 in sorted(Path(video_path).glob("{}*.mp4".format(prefix))):
        video_b64 = base64.b64encode(mp4.read_bytes())
        html.append(
            """<video alt="{}" autoplay
                    loop controls style="height: 400px;">
                    <source src="data:video/mp4;base64,{}" type="video/mp4" />
                </video>""".format(
                mp4, video_b64.decode("ascii")
            )
        )
    ipythondisplay.display(ipythondisplay.HTML(data="<br>".join(html)))

We will record a video using the [VecVideoRecorder](https://stable-baselines3.readthedocs.io/en/master/guide/vec_envs.html#vecvideorecorder) wrapper. You will learn about these wrappers in the next notebook.

In [None]:
from stable_baselines3.common.vec_env import VecVideoRecorder, DummyVecEnv


def record_video(env_id, model, video_length=500, prefix="", video_folder="videos/"):
    """
    Run ``model`` in a CarRacing-v2 environment and record the rollout as a video.

    :param env_id: (str) currently unused: the environment is always
        "CarRacing-v2" with discrete actions. The parameter is kept so that
        existing calls keep working.
    :param model: (RL model) agent whose ``predict`` chooses the actions
    :param video_length: (int) number of steps to run and record
    :param prefix: (str) name prefix passed to the video recorder
    :param video_folder: (str) folder passed to the video recorder
    """
    eval_env = DummyVecEnv([lambda: gym.make("CarRacing-v2", render_mode="rgb_array", continuous=False)])
    # Start the video at step=0 and record `video_length` steps
    eval_env = VecVideoRecorder(
        eval_env,
        video_folder=video_folder,
        record_video_trigger=lambda step: step == 0,
        video_length=video_length,
        name_prefix=prefix,
    )

    try:
        obs = eval_env.reset()
        for _ in range(video_length):
            action, _ = model.predict(obs)
            obs, _, _, _ = eval_env.step(action)
    finally:
        # Close the video recorder even if the rollout raises.
        eval_env.close()

### Visualize trained agent



In [None]:
#record_video("CarRacing-v2", model, video_length=500, prefix="ppo-car-racing")

### Carrega os modelos salvos e grava os vídeos

In [None]:
for qtd in quantidades:
    # Only predict() is used while recording, so the checkpoint is loaded
    # without attaching an environment (the notebook-level `env` was closed
    # in an earlier cell).
    model = PPO.load(f"{qtd}k_car_racing_PPO")
    # The first argument is the environment id; the prefix names the checkpoint.
    record_video("CarRacing-v2", model, video_length=500, prefix=f"ppo-{qtd}k_car_racing_PPO")

In [None]:
# Embed every video in "videos" whose file name starts with "ppo".
show_videos("videos", prefix="ppo")