In [4]:
from customenv import CustomDoorKey

from minigrid.wrappers import ImgObsWrapper
from stable_baselines3 import PPO

from customfeatureextractor import CNNFeaturesExtractor, CustomFeatureExtractor, CustomImgObsWrapper

from callback import CustomRewardCallback
from plot import make_plot

import matplotlib.pyplot as plt

# Base DoorKey environment for the 8x8 experiments.
#   size                -> grid size
#   intermediate_reward -> whether picking up the key / opening the door gives reward
#   randomize_goal      -> whether the goal is randomized (a cell on the last column)
# NOTE(review): stock MiniGrid render modes are "human" and "rgb_array";
# confirm that CustomDoorKey really accepts "rgb".
env = CustomDoorKey(
    size=8,
    intermediate_reward=True,
    randomize_goal=True,
    render_mode="rgb",
)

# Both wrappers are built around the very same `env` object, so stepping or
# resetting through one of them also advances the env seen by the other.
default_env = ImgObsWrapper(env)
custom_env = CustomImgObsWrapper(env)

In [5]:
# Policy settings that plug CNNFeaturesExtractor into the PPO policy
# (used together with "CnnPolicy" on `default_env`).
policy_kwargs = {
    "features_extractor_class": CNNFeaturesExtractor,
    "features_extractor_kwargs": {"features_dim": 128, "regularization": False},
}

# Policy settings that plug CustomFeatureExtractor into the PPO policy
# (used together with "MultiInputPolicy" on `custom_env`).
custom_policy_kwargs = {
    "features_extractor_class": CustomFeatureExtractor,
    "features_extractor_kwargs": {"cnn_features_dim": 128, "mlp_features_dim": 32},
}

# Reward threshold handed to the callback; the callback is configured with
# check_freq=1000.
max_reward = 0.9
callback = CustomRewardCallback(check_freq=1000, reward_threshold=max_reward)  # set callback


In [None]:
# custom behavior, Babak can tamper with this
# Trains PPO with the multi-input policy and the custom feature extractor on
# the custom-wrapped environment.
# NOTE(review): unlike the default run, no callback is passed to learn() here —
# confirm that is intended.
model = PPO("MultiInputPolicy", custom_env, policy_kwargs=custom_policy_kwargs, verbose=1)
# total_timesteps is a step count: pass it as an int (200_000) rather than the
# float 2e5, matching the int(2e5) used by the transfer-learning cell.
model.learn(total_timesteps=int(2e5))

In [6]:
# default behavior, Baldur can use this
# Trains PPO with the CNN policy on the image-only wrapped environment and
# stores the resulting weights as "model_8x8s".
model = PPO("CnnPolicy", default_env, policy_kwargs=policy_kwargs, verbose=1)
# total_timesteps is a step count: pass it as an int (200_000) rather than the
# float 2e5, matching the int(2e5) used by the transfer-learning cell.
model.learn(total_timesteps=int(2e5), callback=callback)
model.save("model_8x8s")

# Write the current matplotlib figure to disk.
# NOTE(review): no plotting call (e.g. the imported make_plot) is made in this
# cell; presumably CustomRewardCallback draws onto the current figure during
# training — confirm, otherwise this saves an empty image.
plt.savefig('8x8s.png')


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 0.0496   |
| time/              |          |
|    fps             | 524      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1e+03       |
|    ep_rew_mean          | 0.126       |
| time/                   |             |
|    fps                  | 437         |
|    iterations           | 2           |
|    time_elapsed         | 9           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.013294364 |
|    clip_fraction        | 0.0692      |
|    clip_range     

In [8]:
# Transfer experiment: continue training a previously saved model on a 12x12 grid.
env = CustomDoorKey(
    size=12,
    intermediate_reward=True,
    randomize_goal=True,
    render_mode="rgb",
)
default_env = ImgObsWrapper(env)

# NOTE(review): policy_kwargs is rebuilt here but is not passed to PPO.load
# below — confirm whether it is still needed in this cell.
policy_kwargs = {
    "features_extractor_class": CNNFeaturesExtractor,
    "features_extractor_kwargs": {"features_dim": 128, "regularization": False},
}

# Load the trained model and attach the new, larger environment to it.
# NOTE(review): the cells visible in this notebook save "model_8x8s";
# "model_5x5s" must already exist on disk from another run — verify the path.
model = PPO.load("model_5x5s", env=default_env)

# Same callback configuration as the from-scratch run.
max_reward = 0.9
callback = CustomRewardCallback(check_freq=1000, reward_threshold=max_reward)

# Continue training on the 12x12 grid, then persist the fine-tuned weights.
model.learn(total_timesteps=int(2e5), callback=callback)
model.save("model_5x5_transfer_12x12")

# Write the current matplotlib figure to disk.
# NOTE(review): no plotting call is visible in this cell; presumably the
# callback produces the figure — confirm.
plt.savefig("5x5_transfer_12x12.png")

# Release the 12x12 environment once training is finished.
env.close()

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 0.167    |
| time/              |          |
|    fps             | 494      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1e+03       |
|    ep_rew_mean          | 0.0835      |
| time/                   |             |
|    fps                  | 459         |
|    iterations           | 2           |
|    time_elapsed         | 8           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.015561059 |
|    clip_fraction        | 0.163       |
|    clip_range           | 0.2       