In [40]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation

import gymnasium as gym
from gymnasium import spaces

from stable_baselines3.common.env_checker import check_env
from stable_baselines3 import PPO, A2C, DQN
from stable_baselines3.common.env_util import make_vec_env

### A2C Hyperparameters

| **Hyperparameter**             | **Value**  | **Reasoning** |
|--------------------------------|-----------------------|---------------|
| **Discount Factor (γ)**        | `0.99`                | Ensures long-term reward optimization, critical for docking. |
| **Learning Rate (α)**          | `3e-4`                | Balanced learning speed; lower values improve stability. |
| **Number of Environments**     | `8-16`                | Helps stabilize updates via multiple experiences. |
| **Entropy Coefficient (β)**    | `0.01 - 0.05`         | Encourages exploration but keeps it controlled. |
| **Value Function Coefficient (c1)** | `0.5`          | Ensures a good balance between policy and value updates. |
| **Gradient Clip**              | `0.5`                 | Prevents exploding gradients, improving stability. |
| **Batch Size**                 | `128 - 256`           | Larger batches improve stability for continuous control. |
| **Max Episode Length**         | `500 - 1000` steps    | Allows enough time for docking maneuvers. |
| **Activation Function**        | `ReLU` (or `Tanh`)    | `Tanh` can help with smoother control in continuous actions. |
| **Policy Network Architecture** | `2-3 layers, 256-512 neurons each` | Sufficient capacity for learning complex docking strategies. |
| **Value Network Architecture** | `Same as policy network` | A2C uses separate policy and value networks. |
| **Optimizer**                  | `Adam`                | Standard and stable for RL applications. |
| **Frame Skip** (if using vision) | `1-4`               | Helps reduce training complexity in high-frame-rate environments. |



In [4]:
# Import environment creator function
from Environment_Creator import env_creator, Environments_enum
# Instantiate one of the custom environments
config = None

In [23]:
# Fetch environment
def make_navigation_env():
    return env_creator(Environments_enum.Navigation.value)
    
# Register the custom environment with Gym
gym.register(
    id="SpacecraftNavigation_v2",
    entry_point=make_navigation_env
)
# Verify environment registry
gym.pprint_registry()

# Make our environment
gym_env = gym.make("SpacecraftNavigation_v2")

# Instantiate the env
vec_env = make_vec_env("SpacecraftNavigation_v2", n_envs=8)

===== classic_control =====
Acrobot-v1                  CartPole-v0                 CartPole-v1
MountainCar-v0              MountainCarContinuous-v0    Pendulum-v1
===== phys2d =====
phys2d/CartPole-v0          phys2d/CartPole-v1          phys2d/Pendulum-v0
===== box2d =====
BipedalWalker-v3            BipedalWalkerHardcore-v3    CarRacing-v2
LunarLander-v2              LunarLanderContinuous-v2
===== toy_text =====
Blackjack-v1                CliffWalking-v0             FrozenLake-v1
FrozenLake8x8-v1            Taxi-v3
===== tabular =====
tabular/Blackjack-v0        tabular/CliffWalking-v0
===== mujoco =====
Ant-v2                      Ant-v3                      Ant-v4
HalfCheetah-v2              HalfCheetah-v3              HalfCheetah-v4
Hopper-v2                   Hopper-v3                   Hopper-v4
Humanoid-v2                 Humanoid-v3                 Humanoid-v4
HumanoidStandup-v2          HumanoidStandup-v4          InvertedDoublePendulum-v2
InvertedDoublePendulum-v4   Invert

  logger.warn(f"Overriding environment {new_spec.id} already in registry.")


In [25]:
model = A2C(
    policy='MlpPolicy',  # Use MLP Policy for a standard neural network (can change to custom policy if needed)
    env=vec_env,  # Your environment
    learning_rate=3e-4,  # From the table
    n_steps=5,  # Default value, you can increase this to match batch size preferences
    gamma=0.99,  # From the table
    gae_lambda=1.0,  # From the table, helps with Generalized Advantage Estimation
    ent_coef=0.01,  # From the table, controls entropy for exploration
    vf_coef=0.5,  # From the table, balance between policy and value function
    max_grad_norm=0.5,  # From the table, limits gradient explosion
    rms_prop_eps=1e-5,  # Default value for RMSProp, stability for gradient updates
    use_rms_prop=True,  # Using RMSProp for stable optimization
    use_sde=False,  # You can enable for stochastic networks (usually optional)
    sde_sample_freq=-1,  # Only relevant if use_sde=True
    normalize_advantage=False,  # Keep advantage normalization as False unless needed
    stats_window_size=100,  # The size of the window for stats tracking
    tensorboard_log=None,  # Optional, specify a path if you want TensorBoard logs
    policy_kwargs=None,  # For additional custom policy configuration (e.g., network architecture)
    verbose=1,  # Verbosity level, 1 will print some details during training
    seed=None,  # Set a seed for reproducibility if desired
    device='auto',  # Automatically selects the device (CPU or GPU)
    _init_setup_model=True,  # Initialize model setup if you haven't done that elsewhere
)
model.learn(1e6)

Using cpu device


  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 92.9     |
|    ep_rew_mean        | -91.8    |
| time/                 |          |
|    fps                | 2331     |
|    iterations         | 100      |
|    time_elapsed       | 1        |
|    total_timesteps    | 4000     |
| train/                |          |
|    entropy_loss       | -1.71    |
|    explained_variance | -0.00226 |
|    learning_rate      | 0.0003   |
|    n_updates          | 99       |
|    policy_loss        | -4.52    |
|    value_loss         | 18.8     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 93.1     |
|    ep_rew_mean        | -90.6    |
| time/                 |          |
|    fps                | 2212     |
|    iterations         | 200      |
|    time_elapsed       | 3        |
|    total_timesteps    | 8000     |
| train/                |          |
|

<stable_baselines3.a2c.a2c.A2C at 0x1c663243970>

In [51]:
# Test the trained agent
# using the vecenv
obs = vec_env.reset()
n_steps = 20
for step in range(n_steps):
    action, _ = model.predict(obs, deterministic=True)
    print(f"Step {step + 1} Action {action}")
    obs, reward, done, info = vec_env.step(action)
    print("reward=", reward)
    vec_env.render()

Step 1 Action [2 2 2 2 2 2 2 2]
reward= [0.10454353 0.10454353 0.10454353 0.10454353 0.10454353 0.10454353
 0.10454353 0.10454353]
Step 2 Action [4 4 4 4 4 4 4 4]
reward= [0.15242527 0.15242527 0.15242527 0.15242527 0.15242527 0.15242527
 0.15242527 0.15242527]
Step 3 Action [4 4 4 4 4 4 4 4]
reward= [0.04273289 0.04273289 0.04273289 0.04273289 0.04273289 0.04273289
 0.04273289 0.04273289]
Step 4 Action [0 0 0 0 0 0 0 0]
reward= [0.26143742 0.26143742 0.26143742 0.26143742 0.26143742 0.26143742
 0.26143742 0.26143742]
Step 5 Action [4 4 4 4 4 4 4 4]
reward= [-0.02939921 -0.02939921 -0.02939921 -0.02939921 -0.02939921 -0.02939921
 -0.02939921 -0.02939921]
Step 6 Action [0 0 0 0 0 0 0 0]
reward= [0.20489605 0.20489605 0.20489605 0.20489605 0.20489605 0.20489605
 0.20489605 0.20489605]
Step 7 Action [0 0 0 0 0 0 0 0]
reward= [0.07642822 0.07642822 0.07642822 0.07642822 0.07642822 0.07642822
 0.07642822 0.07642822]
Step 8 Action [2 2 2 2 2 2 2 2]
reward= [0.35251176 0.35251176 0.35251176 0

