# Gym Environments and Implementing Reinforcement Learning Agents with Stable Baselines

In [1]:
import time
import numpy as np
import gymnasium as gym
import matplotlib.pyplot as plt
from training import latest_model
from stable_baselines3 import PPO,A2C
from stable_baselines3.ppo.policies import MlpPolicy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

Trying the environment

In [2]:
# Build the environment once and grab the initial observation.
# NOTE(review): no render_mode is passed here, so later env.render() calls
# may not display anything — confirm the intended render mode.
ENV_ID = 'CarRacing-v2'
env = gym.make(ENV_ID)
obs, info = env.reset()

Let's see what the environment looks like

In [3]:
# Draw one random action and read the observation shape before printing,
# so the cell shows the action format and the image dimensions.
random_action = env.action_space.sample()
observation_shape = env.observation_space.shape

print("sample action:", random_action)
print("observation space shape:", observation_shape)

# A sampled observation is intentionally not printed: it is a full image
# array and would flood the cell output.

sample action: [0.05034393 0.67812884 0.6773829 ]
observation space shape: (96, 96, 3)


Let's test the model before training

In [None]:
# Baseline: evaluate a freshly initialised (never trained) PPO agent so the
# trained models below have something to be compared against.
N_EVAL_EPISODES = 100

untrained_model = PPO(policy=MlpPolicy, env=env, verbose=0)

# warn=False silences evaluate_policy's warning output for this run.
mean_reward, std_reward = evaluate_policy(
    model=untrained_model,
    env=env,
    n_eval_episodes=N_EVAL_EPISODES,
    warn=False,
)

print(f"mean_reward: {mean_reward:.2f} +/- {std_reward:.2f}")

The `training.py` script creates a model and starts training it; if a saved model already exists, it continues training that model instead:

```bash
python training.py PPO
```

```bash
python training.py A2C
```

The models are saved in the `models` folder; below we load the latest saved model for each algorithm and test it.

To see the training progress we can use tensorboard:

```bash
tensorboard --logdir=logs
```

PPO algorithm

In [None]:
# Load the most recently saved PPO model and watch it drive.
ppo_model = PPO.load(latest_model("PPO"), env=env)

episodes = 100

for ep in range(episodes):
    # Every episode must start from a fresh reset: the `obs` left over from an
    # earlier cell may be stale, and stepping an environment whose episode has
    # already ended is not supported by Gymnasium.
    obs, info = env.reset()
    done = False
    episode_reward = 0.0
    while not done:
        action, _states = ppo_model.predict(obs)
        obs, rewards, terminated, truncated, info = env.step(action)
        # An episode is over when it reaches a terminal state OR is cut off
        # (e.g. by a time limit); ignoring `truncated` could loop forever.
        done = terminated or truncated
        episode_reward += rewards
        # NOTE(review): only displays something if `env` was created with a
        # render_mode — confirm against the cell that builds the environment.
        env.render()
    # One summary line per episode instead of one line per step.
    print(f"episode {ep + 1}/{episodes}: total reward = {episode_reward:.2f}")

A2C algorithm

In [None]:
# Load the most recently saved A2C model and watch it drive.
a2c_model = A2C.load(latest_model("A2C"), env=env)

episodes = 100

for ep in range(episodes):
    # Every episode must start from a fresh reset: the `obs` left over from an
    # earlier cell may be stale, and stepping an environment whose episode has
    # already ended is not supported by Gymnasium.
    obs, info = env.reset()
    done = False
    episode_reward = 0.0
    while not done:
        action, _states = a2c_model.predict(obs)
        obs, rewards, terminated, truncated, info = env.step(action)
        # An episode is over when it reaches a terminal state OR is cut off
        # (e.g. by a time limit); ignoring `truncated` could loop forever.
        done = terminated or truncated
        episode_reward += rewards
        # NOTE(review): only displays something if `env` was created with a
        # render_mode — confirm against the cell that builds the environment.
        env.render()
    # One summary line per episode instead of one line per step.
    print(f"episode {ep + 1}/{episodes}: total reward = {episode_reward:.2f}")

Review code below