**Due to many problems encountered with other versions, it is strongly advised to use Python 3.9 and the package versions specified in the requirements.**

In [15]:
import gymnasium as gym
import cv2
from stable_baselines3 import SAC
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.env_util import make_vec_env
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorboard
from tbparse import SummaryReader
import torch as th
import torch.nn as nn

### Example for a game with random moves

In [2]:
# Build the continuous Lunar Lander environment with every physics option spelled
# out. `env` is intentionally left open: the following cells keep using it.
lander_options = dict(
    continuous=True,
    gravity=-10.0,
    enable_wind=False,
    wind_power=15.0,
    turbulence_power=1.5,
    render_mode="rgb_array",
)
env = gym.make("LunarLanderContinuous-v3", **lander_options)

state, _ = env.reset()
terminated = truncated = False

# Play a single episode with uniformly sampled actions and show every frame.
while not terminated and not truncated:
    action = env.action_space.sample()
    next_state, reward, terminated, truncated, _ = env.step(action)
    # The environment renders RGB arrays (render_mode="rgb_array"); OpenCV displays BGR.
    frame_bgr = cv2.cvtColor(env.render(), cv2.COLOR_RGB2BGR)
    cv2.imshow("frame", frame_bgr)
    cv2.waitKey(50)  # pause 50 ms between frames

cv2.destroyAllWindows()

### Example for using SAC from stable_baselines3 for the Lunar Lander Problem

In [3]:
# Train a SAC agent on the environment created above; only the policy type and
# verbosity are set explicitly, everything else uses the library defaults.
model = SAC(policy="MlpPolicy", env=env, verbose=1)
# Short demonstration run; training statistics are printed every 4 episodes.
model.learn(total_timesteps=10_000, log_interval=4)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 111      |
|    ep_rew_mean     | -246     |
| time/              |          |
|    episodes        | 4        |
|    fps             | 101      |
|    time_elapsed    | 4        |
|    total_timesteps | 443      |
| train/             |          |
|    actor_loss      | 0.184    |
|    critic_loss     | 25.2     |
|    ent_coef        | 0.908    |
|    ent_coef_loss   | -0.241   |
|    learning_rate   | 0.0003   |
|    n_updates       | 342      |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 128      |
|    ep_rew_mean     | -163     |
| time/              |          |
|    episodes        | 8        |
|    fps             | 87       |
|    time_elapsed    | 11       |
|    total_timesteps | 1021     |
| train/             |

<stable_baselines3.sac.sac.SAC at 0x24f9b55f970>

In [4]:
# Roll out one episode with the trained policy and display it frame by frame.
obs, info = env.reset()
terminated = False
truncated = False

while not (terminated or truncated):
    action, _states = model.predict(obs, deterministic=True)
    # The observation returned by step() must be fed into the next predict()
    # call; otherwise the policy keeps acting on the initial reset observation.
    obs, reward, terminated, truncated, info = env.step(action)
    frame = env.render()
    # The environment renders RGB arrays; OpenCV displays BGR.
    cv2.imshow("frame", cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
    cv2.waitKey(50)

cv2.destroyAllWindows()

### Hyperparameter Tuning

The best hyperparameters for this problem can be found at https://github.com/DLR-RM/rl-baselines3-zoo/blob/master/hyperparams/sac.yml:

```yml
LunarLanderContinuous-v3:
  n_timesteps: !!float 5e5
  policy: 'MlpPolicy'
  batch_size: 256
  learning_rate: lin_7.3e-4
  buffer_size: 1000000
  ent_coef: 'auto'
  gamma: 0.99
  tau: 0.01
  train_freq: 1
  gradient_steps: 1
  learning_starts: 10000
  policy_kwargs: "dict(net_arch=[400, 300])"
```

It is safe to assume that those are the most important hyperparameters for training. Here is a brief description of each one:
- *n_timesteps* - total number of time steps to train the model;
- *policy* - choice of neural network (MlpPolicy means Multi-Layer Perceptron policy);
- *policy_kwargs* - the architecture of the network (here [400, 300] means two hidden layers with 400 and 300 neurons);
- *batch_size* - number of samples per training update;
- *learning_rate* - a linear schedule for the learning rate (here starting at 7.3e-4 and decreasing linearly to 0 over training);
- *learning_starts* - the agent collects n steps before starting to learn;
- *ent_coef* - entropy coefficient that balances exploration and exploitation (here it is learned automatically);
- *gamma* - discount factor for future rewards;
- *tau* - controls the soft update speed of the target networks;
- *train_freq* - the model is trained every n steps;
- *gradient_steps* - how many gradient steps are done after each rollout.

In [2]:
# Sweep configuration: every hyperparameter set is trained `n_runs` times for
# `timesteps` environment steps each.
n_runs = 10
timesteps = 50000

# Candidate settings; the keys match SAC's keyword arguments so each dict can be
# forwarded to the constructor unchanged.
param_sets = [
    dict(learning_rate=3e-4, batch_size=256),
    dict(learning_rate=7.3e-4, batch_size=256),
    dict(learning_rate=1e-3, batch_size=128),
]

log_root = "./logs"

for i, params in enumerate(param_sets):
    # All runs of one hyperparameter set share a TensorBoard directory.
    set_log_dir = f"{log_root}/set_{i}"

    for run in range(n_runs):
        env = Monitor(gym.make("LunarLanderContinuous-v3", render_mode="rgb_array"))

        # The run index doubles as the random seed, so repetitions differ
        # from each other but are the same for every hyperparameter set.
        model = SAC(
            "MlpPolicy",
            env,
            verbose=0,
            seed=run,
            tensorboard_log=set_log_dir,
            **params,
        )

        model.learn(total_timesteps=timesteps, tb_log_name=f"run_{run}")
        env.close()

### Learning curves

In [10]:
# Where the learning-curve figures are written; created up front so that the
# savefig calls further down cannot fail on a missing directory.
output_dir = "./plots"
os.makedirs(output_dir, exist_ok=True)

# Where the TensorBoard logs are read from, and how many hyperparameter sets
# (set_0 ... set_{n-1}) to plot. Note that `param_sets` is a count in this cell.
log_root = "./logs"
param_sets = 3

def load_rewards_from_tensorboard(log_dir):
    """Collect the mean-episode-reward series of every run stored under ``log_dir``.

    Args:
        log_dir: Directory whose sub-directories each contain the TensorBoard
            logs of one training run.

    Returns:
        A list with one ``(steps, values)`` tuple of NumPy arrays per run, read
        from the ``rollout/ep_rew_mean`` scalar. Runs are visited in sorted name
        order. Entries of ``log_dir`` that are not directories, and runs that
        logged no ``rollout/ep_rew_mean`` values, are skipped.
    """
    rewards = []
    # Sorted so the result does not depend on the file system's listing order.
    for run_dir in sorted(os.listdir(log_dir)):
        full_path = os.path.join(log_dir, run_dir)
        # Stray files (e.g. OS metadata files) are not run directories.
        if not os.path.isdir(full_path):
            continue

        df = SummaryReader(full_path).scalars
        # A run without any scalar events may not even have a 'tag' column.
        if "tag" not in df.columns:
            continue

        reward_df = df[df["tag"] == "rollout/ep_rew_mean"]
        # An empty series cannot be interpolated by the caller, so leave it out.
        if reward_df.empty:
            continue

        rewards.append((reward_df["step"].values, reward_df["value"].values))

    return rewards

# One (steps, mean, std) triple per hyperparameter set, reused for the combined figure.
all_sets_data = []

for i in range(param_sets):
    reward_curves = load_rewards_from_tensorboard(f"{log_root}/set_{i}")

    # Runs log at different timesteps, so build a common x-axis from the union
    # of all logged steps and interpolate every run onto it.
    all_x = np.array(sorted({step for steps, _ in reward_curves for step in steps}))
    aligned_rewards = np.array(
        [np.interp(all_x, steps, values) for steps, values in reward_curves]
    )

    # Aggregate across runs (axis 0 indexes the runs).
    mean_rewards = aligned_rewards.mean(axis=0)
    std_rewards = aligned_rewards.std(axis=0)

    all_sets_data.append((all_x, mean_rewards, std_rewards))

    # Per-set figure: mean curve with a +/- one standard deviation band.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(all_x, mean_rewards, label=f"Set {i}")
    ax.fill_between(
        all_x,
        mean_rewards - std_rewards,
        mean_rewards + std_rewards,
        alpha=0.3,
    )
    ax.set_title(f"Learning Curve for Hyperparameter Set {i}")
    ax.set_xlabel("Timesteps")
    ax.set_ylabel("Episode Reward")
    ax.legend()
    ax.grid(True)
    fig.tight_layout()
    fig.savefig(f"{output_dir}/set_{i}.png")
    plt.close(fig)

# Combined figure: all hyperparameter sets on the same axes.
fig, ax = plt.subplots(figsize=(10, 6))
for i, (x, mean, std) in enumerate(all_sets_data):
    ax.plot(x, mean, label=f"Set {i}")
    ax.fill_between(x, mean - std, mean + std, alpha=0.3)
ax.set_title("Combined Learning Curves")
ax.set_xlabel("Timesteps")
ax.set_ylabel("Episode Reward")
ax.legend()
ax.grid(True)
fig.tight_layout()
fig.savefig(f"{output_dir}/combined.png")
plt.close(fig)

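### The Stable Baselines3 Zoo model
For comparison, here is a model trained with the recommended hyperparameters. Note that the cell below deviates from the Zoo configuration in two places: it trains for 50,000 timesteps (instead of 5e5) and uses a replay buffer of 100,000 transitions (instead of 1,000,000).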

In [12]:
# TensorBoard output location for the run with the Zoo-style hyperparameters.
ideal_log_dir = "./logs/ideal"
os.makedirs(ideal_log_dir, exist_ok=True)

env = Monitor(gym.make("LunarLanderContinuous-v3", render_mode="rgb_array"))

ideal_hyperparams = dict(
    batch_size=256,
    # Linear schedule: the callable receives the remaining training progress
    # and scales the initial rate of 7.3e-4 by it.
    learning_rate=lambda progress_remaining: 7.3e-4 * progress_remaining,
    # NOTE: the Zoo entry quoted earlier in this notebook lists 1000000 here.
    buffer_size=100000,
    ent_coef="auto",
    gamma=0.99,
    tau=0.01,
    train_freq=1,
    gradient_steps=1,
    learning_starts=10000,
    policy_kwargs=dict(net_arch=[400, 300]),
)

model = SAC(
    "MlpPolicy",
    env,
    tensorboard_log=ideal_log_dir,
    verbose=1,
    **ideal_hyperparams,
)

model.learn(total_timesteps=50000, tb_log_name="run_0")
env.close()

Using cpu device
Wrapping the env in a DummyVecEnv.
Logging to ./logs/ideal\run_0_2
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 104      |
|    ep_rew_mean     | -344     |
| time/              |          |
|    episodes        | 4        |
|    fps             | 2623     |
|    time_elapsed    | 0        |
|    total_timesteps | 414      |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 115      |
|    ep_rew_mean     | -329     |
| time/              |          |
|    episodes        | 8        |
|    fps             | 2736     |
|    time_elapsed    | 0        |
|    total_timesteps | 922      |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 110      |
|    ep_rew_mean     | -279     |
| time/              |          |
|    episodes        | 12       |
|    fps             | 2670     

ValueError: File or directory not found: ./logs/ideal/run_0

In [14]:
# Plot the learning curve of the most recent "ideal" training run.
#
# Every call to learn(tb_log_name="run_0") writes to a new "run_0_<n>"
# directory, so a hard-coded suffix stops matching as soon as training is
# re-run. Pick the highest existing suffix instead.
ideal_log_root = "./logs/ideal"
ideal_run_prefix = "run_0_"

ideal_run_ids = [
    int(name[len(ideal_run_prefix):])
    for name in os.listdir(ideal_log_root)
    if name.startswith(ideal_run_prefix)
    and name[len(ideal_run_prefix):].isdecimal()
    and os.path.isdir(os.path.join(ideal_log_root, name))
]
if not ideal_run_ids:
    raise FileNotFoundError(
        f"No '{ideal_run_prefix}<n>' run directory found in {ideal_log_root}; "
        "train the model first."
    )

ideal_tb_dir = os.path.join(ideal_log_root, f"{ideal_run_prefix}{max(ideal_run_ids)}")

reader = SummaryReader(ideal_tb_dir)
df = reader.scalars

# Keep only the mean episode reward series. Dedicated names are used so the
# `timesteps` training budget defined earlier in the notebook is not overwritten.
reward_df = df[df["tag"] == "rollout/ep_rew_mean"]
ideal_steps = reward_df["step"].values
ideal_rewards = reward_df["value"].values

plt.figure(figsize=(10, 6))
plt.plot(ideal_steps, ideal_rewards, label="Ideal Parameters")
plt.title("Learning Curve (Ideal Parameters)")
plt.xlabel("Timesteps")
plt.ylabel("Episode Reward")
plt.grid(True)
plt.legend()
plt.tight_layout()

# Make sure the target directory exists even if this cell is run on its own.
os.makedirs("./plots", exist_ok=True)
output_path = "./plots/ideal.png"
plt.savefig(output_path)
plt.close()

In [13]:
# Persist the agent trained with the Zoo-style hyperparameters; the
# deterministic-evaluation cell at the end of the notebook loads it again.
model.save(path="ideal_sac_model")

### Testing different network architectures

In Stable Baselines3 the network architecture can be controlled with policies. The information about them and the default values can be found here:

https://stable-baselines3.readthedocs.io/en/master/_modules/stable_baselines3/sac/policies.html

In the "ideal" example the architecture of the network (Multi-Layer Perceptron) is:
- 2 hidden layers - first with 400 units and second with 300 units;
- ReLU activation.

To test a different architecture all other hyperparameters will be kept the same except for the architecture which will be:
- 3 hidden layers - 256, 256 and 128 units respectively;
- Tanh activation.

In [16]:
# TensorBoard output location for the run with the alternative architecture.
changed_log_dir = "./logs/changed_network"
os.makedirs(changed_log_dir, exist_ok=True)

env = Monitor(gym.make("LunarLanderContinuous-v3", render_mode="rgb_array"))

# Only the policy network differs from the "ideal" run: three hidden layers
# (256, 256, 128) with Tanh activations.
changed_policy_kwargs = dict(net_arch=[256, 256, 128], activation_fn=nn.Tanh)

changed_hyperparams = dict(
    batch_size=256,
    # Linear schedule: the callable receives the remaining training progress
    # and scales the initial rate of 7.3e-4 by it.
    learning_rate=lambda progress_remaining: 7.3e-4 * progress_remaining,
    buffer_size=100000,
    ent_coef="auto",
    gamma=0.99,
    tau=0.01,
    train_freq=1,
    gradient_steps=1,
    learning_starts=10000,
    policy_kwargs=changed_policy_kwargs,
)

model = SAC(
    "MlpPolicy",
    env,
    tensorboard_log=changed_log_dir,
    verbose=1,
    **changed_hyperparams,
)

model.learn(total_timesteps=50000, tb_log_name="run_0")
env.close()

Using cpu device
Wrapping the env in a DummyVecEnv.
Logging to ./logs/changed_network\run_0_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 117      |
|    ep_rew_mean     | -218     |
| time/              |          |
|    episodes        | 4        |
|    fps             | 2299     |
|    time_elapsed    | 0        |
|    total_timesteps | 468      |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 108      |
|    ep_rew_mean     | -186     |
| time/              |          |
|    episodes        | 8        |
|    fps             | 2540     |
|    time_elapsed    | 0        |
|    total_timesteps | 868      |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 113      |
|    ep_rew_mean     | -214     |
| time/              |          |
|    episodes        | 12       |
|    fps             |

In [17]:
# Plot the learning curve of the most recent changed-network training run.
#
# Every call to learn(tb_log_name="run_0") writes to a new "run_0_<n>"
# directory, so a hard-coded suffix stops matching as soon as training is
# re-run. Pick the highest existing suffix instead.
changed_log_root = "./logs/changed_network"
changed_run_prefix = "run_0_"

changed_run_ids = [
    int(name[len(changed_run_prefix):])
    for name in os.listdir(changed_log_root)
    if name.startswith(changed_run_prefix)
    and name[len(changed_run_prefix):].isdecimal()
    and os.path.isdir(os.path.join(changed_log_root, name))
]
if not changed_run_ids:
    raise FileNotFoundError(
        f"No '{changed_run_prefix}<n>' run directory found in {changed_log_root}; "
        "train the model first."
    )

changed_tb_dir = os.path.join(
    changed_log_root, f"{changed_run_prefix}{max(changed_run_ids)}"
)

reader = SummaryReader(changed_tb_dir)
df = reader.scalars

# Keep only the mean episode reward series. Dedicated names are used so the
# `timesteps` training budget defined earlier in the notebook is not overwritten.
reward_df = df[df["tag"] == "rollout/ep_rew_mean"]
changed_steps = reward_df["step"].values
changed_rewards = reward_df["value"].values

plt.figure(figsize=(10, 6))
plt.plot(changed_steps, changed_rewards, label="Ideal Parameters with changed network")
plt.title("Learning Curve (Ideal Parameters with changed network)")
plt.xlabel("Timesteps")
plt.ylabel("Episode Reward")
plt.grid(True)
plt.legend()
plt.tight_layout()

# Make sure the target directory exists even if this cell is run on its own.
os.makedirs("./plots", exist_ok=True)
output_path = "./plots/changed_network.png"
plt.savefig(output_path)
plt.close()

### Running the model with deterministic actions

In [21]:
# Evaluate the saved "ideal" agent over 10 episodes with deterministic actions
# and report the mean undiscounted episode return.
env = gym.make("LunarLanderContinuous-v3", render_mode="rgb_array")
env = Monitor(env)

model = SAC.load("ideal_sac_model")
model.set_env(env)

rewards = []

try:
    for i in range(10):
        state, _ = env.reset()
        terminated = False
        truncated = False
        total_reward = 0.0
        while not (terminated or truncated):
            action, _ = model.predict(state, deterministic=True)
            state, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
        rewards.append(total_reward)
finally:
    # Release the environment even if an episode fails or is interrupted,
    # matching the training cells, which all close their environments.
    env.close()

print("Mean reward:", sum(rewards) / len(rewards))


Wrapping the env in a DummyVecEnv.
Mean reward: 86.20785724310558
