In [1]:
# virtual display

from pyvirtualdisplay import Display

virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()


<pyvirtualdisplay.display.Display at 0x7e6d90758d10>

In [1]:
# importing the essential libraries
import gymnasium

from huggingface_sb3 import load_from_hub, package_to_hub
from huggingface_hub import notebook_login

from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor


In [2]:
import gymnasium as gym

env = gym.make("LunarLander-v2")
observation, info = env.reset()

for _ in range(20):
  action = env.action_space.sample()
  print(f"Action taken: {action}")

  observation, reward, terminated, truncated, info = env.step(action)

  if terminated or truncated:
    print("Environment is reset")
    observation, info = env.reset()
env.close()

Action taken: 3
Action taken: 0
Action taken: 0
Action taken: 0
Action taken: 0
Action taken: 0
Action taken: 1
Action taken: 0
Action taken: 1
Action taken: 0
Action taken: 1
Action taken: 3
Action taken: 3
Action taken: 3
Action taken: 1
Action taken: 2
Action taken: 1
Action taken: 3
Action taken: 0
Action taken: 3


In [3]:
env = gym.make("LunarLander-v2")
env.reset()
print("____OBSERVATION SPACE____\n")
print(f"Observation space shape: {env.observation_space.shape}")
print(f"Sample observation: {env.observation_space.sample()}")

____OBSERVATION SPACE____

Observation space shape: (8,)
Sample observation: [-39.034153   -72.051254     2.0247571    3.4367871    0.3081179
  -0.7239273    0.80091375   0.55113107]


In [4]:
print("\n___ACTION SPACE____ \n")
print(f"Action space shape: {env.action_space.n}")
print(f"Action space sample: {env.action_space.sample()}")


___ACTION SPACE____ 

Action space shape: 4
Action space sample: 1


In [5]:
env = make_vec_env("LunarLander-v2", n_envs=16)

In [6]:
# create environment

env = gym.make("LunarLander-v2")

# instantiate the agent
model = PPO("MlpPolicy", env, verbose=1)

# train the agent
model.learn(total_timesteps=int(2e5))

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 88.6     |
|    ep_rew_mean     | -161     |
| time/              |          |
|    fps             | 607      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 92.8        |
|    ep_rew_mean          | -191        |
| time/                   |             |
|    fps                  | 503         |
|    iterations           | 2           |
|    time_elapsed         | 8           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.010449717 |
|    clip_fraction        | 0.0525      |
|    clip_range           | 0.2         |
|    entropy_loss  

<stable_baselines3.ppo.ppo.PPO at 0x7c334a386fd0>

In [7]:
# solution
model = PPO(
    policy="MlpPolicy",
    env=env,
    n_steps=1024,
    batch_size=64,
    n_epochs=4,
    gamma=0.999,
    gae_lambda=0.98,
    ent_coef=0.01,
    verbose=1,
)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [8]:
model_name = "ppo-LunarLander-v2"

In [9]:
model.learn(total_timesteps=1000000)
model_name = "LunarLander-v2"
model.save(model_name)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
|    value_loss           | 101           |
-------------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 327         |
|    ep_rew_mean          | 212         |
| time/                   |             |
|    fps                  | 564         |
|    iterations           | 740         |
|    time_elapsed         | 1341        |
|    total_timesteps      | 757760      |
| train/                  |             |
|    approx_kl            | 0.004619845 |
|    clip_fraction        | 0.0217      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.813      |
|    explained_variance   | 0.9771421   |
|    learning_rate        | 0.0003      |
|    loss                 | 5.43        |
|    n_updates            | 2956        |
|    policy_gradient_loss | -0.00182    |
|    value_loss           | 17.7        |
-------

In [10]:
# creating an environment for evaluation
from stable_baselines3.common.evaluation import evaluate_policy

eval_env = gym.make("LunarLander-v2")

mean_reward, std_reward = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=10,
    render=False
)
print(f"Mean reward: {mean_reward}, Std reward: {std_reward}")



Mean reward: 259.09728928954365, Std reward: 48.226560589289626
