In [12]:
import os, sys
import numpy as np
import cv2, base64, imageio
from IPython.display import HTML

import gymnasium as gym
from minigrid.wrappers import RGBImgPartialObsWrapper, ImgObsWrapper
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

# Put the parent directory on sys.path before the project imports below.
# NOTE(review): this is relative to the current working directory, so it
# presumably assumes the notebook is run from a subfolder of the project
# root — confirm.
sys.path.append(os.path.abspath(".."))

import my_envs            # never referenced by name in the cells shown; presumably imported for a side effect such as registering the custom env ids — confirm
# `make_env as make_env` is a no-op alias: the name is bound as `make_env` either way.
from src.envs import make_env as make_env
from src.training.model_factory import ModelFactory
from src.evaluation.evaluation import evaluate
from src.visualization.render_video import record_episode, display_video

In [2]:
# Environment ids used by the rest of the notebook: the first is used to
# build the training environments, the second for the recorded rollout;
# both are passed to evaluate() at the end.
env_id_train, env_id_test = (
    "MiniGrid-DistributionalShift-Train-v0",
    "MiniGrid-DistributionalShift-Test-v0",
)

In [3]:
# Vectorize for stable-baselines3 compatibility
def _make_train_env():
    """Build a single training environment for the vectorized wrapper."""
    return make_env(env_id_train)


# Four copies of the training environment behind one vectorized interface.
vec_env = make_vec_env(_make_train_env, n_envs=4)

In [4]:
# Create and train the agent
factory = ModelFactory(algorithm="PPO", policy="CnnPolicy", env=vec_env)

# Hyperparameters handed to the factory as keyword arguments.
# With the 4 environments in vec_env, n_steps=128 gives 128 * 4 = 512
# transitions per rollout, which equals batch_size.
ppo_hyperparams = {
    "n_steps": 128,
    "batch_size": 512,
    "learning_rate": 2.5e-4,
    "ent_coef": 0.01,
}
model = factory.build_model(**ppo_hyperparams)

# train (kept as the last expression so the cell still echoes its return value)
factory.train(total_timesteps=10_000)

Using cpu device
Wrapping the env in a VecTransposeImage.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 46       |
|    ep_rew_mean     | -95      |
| time/              |          |
|    fps             | 2769     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 512      |
---------------------------------


  logger.warn(f"{pre} is not within the observation space.")
  logger.warn(f"{pre} is not within the observation space.")


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 48.5        |
|    ep_rew_mean          | -97.5       |
| time/                   |             |
|    fps                  | 710         |
|    iterations           | 2           |
|    time_elapsed         | 1           |
|    total_timesteps      | 1024        |
| train/                  |             |
|    approx_kl            | 0.041448817 |
|    clip_fraction        | 0.288       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.93       |
|    explained_variance   | -0.00366    |
|    learning_rate        | 0.00025     |
|    loss                 | 121         |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0111     |
|    value_loss           | 396         |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 73.6  

<stable_baselines3.ppo.ppo.PPO at 0x15a568110>

In [5]:
# Checkpoint handling.
#
# The original cell always loaded this file while the matching save call was
# commented out, so a fresh "Restart & Run All" failed on any machine that
# did not already have the checkpoint. Now:
#   * if the checkpoint exists, it is loaded for testing (same as before —
#     note this replaces the `model` trained in the previous cell);
#   * otherwise the model held by `factory` is saved to that path, so later
#     cells and later runs can load it.
model_path = "../runs/models/ppo_DS_new_reward_model.zip"

if os.path.exists(model_path):
    # load for testing
    model = factory.load(model_path)
else:
    # save (create the target folder first so the write cannot fail on a
    # missing directory)
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    factory.save(model_path)

In [6]:
# Read the checkpoint directly with stable-baselines3. This rebinds `model`,
# replacing whatever an earlier cell assigned to that name.
model_path = "../runs/models/ppo_DS_new_reward_model.zip"
model = PPO.load(path=model_path)

In [7]:
# Show video
# Roll the loaded model out once in the test environment and render it.
env = make_env(env_id_test)
try:
    frames = record_episode(env, model)
finally:
    # Release the environment (and any renderer it holds) even when the
    # rollout raises; the original cell never closed it.
    env.close()

# Kept as the last top-level expression so that, if display_video returns a
# displayable object, the notebook still renders it.
display_video(frames)


  logger.warn(f"{pre} is not within the observation space.")
  logger.warn(f"{pre} is not within the observation space.")


In [13]:
# Evaluate the same model on both environments with an identical setting.
# The value 50 is forwarded to evaluate() unchanged — presumably the number
# of evaluation episodes; confirm against src.evaluation.evaluation.
eval_setting = 50
train_metrics = evaluate(model, make_env, env_id_train, eval_setting)
test_metrics = evaluate(model, make_env, env_id_test, eval_setting)

# One print call, one line per environment.
print(
    f'Train metrics: {train_metrics}',
    f'Test metrics: {test_metrics}',
    sep="\n",
)



Train metrics: {'avg_return': np.float64(-324.0), 'success_rate': 0.0, 'lava_rate': 0.0, 'timeout_rate': 1.0}
Test metrics: {'avg_return': np.float64(30.0), 'success_rate': 1.0, 'lava_rate': 0.0, 'timeout_rate': 0.0}
