In [1]:
# --- 2. Imports ---
import sys
import numpy as np
import torch
import matplotlib.pyplot as plt

# If your code is in a local directory, you might need to add it to sys.path:
# (Adjust the path to where your environment code or models are located)
# sys.path.append("/path/to/your/project")

# Example: from your module that implements HockeyEnv_BasicOpponent
from hockey.hockey_env import HockeyEnv_BasicOpponent, Mode


# Import your DDPG modules (Agent/Trainer) from your project structure
try:
    from models.ddpg.DDPG import DDPGAgent
    from models.ddpg.DDPGTrainer import DDPGTrainer
except ImportError:
    print("Could not import your DDPG modules. Adjust your paths accordingly.")
    raise

In [9]:
# --- 3. Initialize the Environment & Check Spaces ---

# `HockeyEnv_BasicOpponent` is the single-agent wrapper: the learning agent
# controls one player while a built-in opponent controls the other.
# Notes carried over from the original cell:
#   - mode may be Mode.NORMAL, Mode.TRAIN_SHOOTING or Mode.TRAIN_DEFENSE
#   - keep_mode=True is the default
#   - `weak_opponent` toggles between the weaker and the stronger basic opponent
#
# The action space has shape (4,), so it fits a single-agent continuous-control
# algorithm such as DDPG.
env = HockeyEnv_BasicOpponent(mode=Mode.NORMAL, weak_opponent=False)

# Print both spaces so mismatches with the agent's network sizes are caught early.
print(f"Observation space: {env.observation_space!s}")
print(f"Action space: {env.action_space!s}")

# Expected output for this configuration:
#   Observation space: Box(-inf, inf, (18,), float32)   (shape (18,) with keep_mode=True)
#   Action space:      Box(-1.0, 1.0, (4,), float32)
# The four action components are:
#   1) force in x
#   2) force in y
#   3) torque (racket rotation)
#   4) shoot command (0 or 1)

Observation space: Box(-inf, inf, (18,), float32)
Action space: Box(-1.0, 1.0, (4,), float32)


In [None]:
# --- 4. Configure & Instantiate the Trainer ---

# DDPGTrainer is constructed from:
#   - env_name:        name of the environment
#   - training_config: episode/timestep budget, logging and checkpoint cadence
#   - model_config:    DDPG hyperparameters
#   - experiment_path: directory under which logs/stats are written
#   - wandb_run:       optional Weights & Biases run object
#
# If a trainer builds its environment via `gym.make(env_name)`, a dummy name can
# be passed and the environment replaced afterwards with `trainer.env = env`
# (left disabled at the bottom of this cell).
#
# NOTE: this cell needs `DDPGTrainer` from the imports cell; running it on a
# fresh kernel without that cell raises NameError.

training_config = dict(
    max_episodes=2000,   # number of training episodes
    max_timesteps=250,   # max steps per episode (original note: ~250 in normal mode)
    log_interval=10,
    save_interval=100,
    render=False,        # set True to see the environment window
    train_iter=32,       # DDPG updates per episode (per the original note)
    seed=42,             # for reproducibility
)

model_config = dict(
    eps=0.1,                      # noise scale
    discount=0.95,                # discount factor
    buffer_size=int(1e5),         # replay buffer size
    batch_size=64,                # minibatch size
    learning_rate_actor=1e-4,
    learning_rate_critic=1e-3,
    hidden_sizes_actor=[128, 128],
    hidden_sizes_critic=[128, 128, 64],
    update_target_every=100,
    use_target_net=True,
)

experiment_path = "rl_experiments/experiments/HockeyEnv_DDPG_Test"

# Build the trainer; wandb_run=None disables Weights & Biases logging.
trainer = DDPGTrainer(
    env_name="HockeyEnv",
    training_config=training_config,
    model_config=model_config,
    experiment_path=experiment_path,
    wandb_run=None,
)

# # Overwrite the default environment in trainer with our custom env:
# trainer.env = env


NameError: name 'DDPGTrainer' is not defined

In [8]:
# --- 5. Train the Agent ---

# Run the trainer and keep whatever `train()` returns so it can be inspected
# in later cells.
final_metrics = trainer.train()

# One call, two output lines: completion notice followed by the returned metrics.
print("Training finished.", f"Final metrics: {final_metrics!s}", sep="\n")


2025-01-14 16:35:01 [INFO] Starting DDPG Training...
2025-01-14 16:35:01 [INFO] Environment: HockeyEnv_BasicOpponent, max_episodes=2000, max_timesteps=250, train_iter=32
2025-01-14 16:35:02 [INFO] Episode 10	Avg Length: 171.20	Avg Reward: -23.683
2025-01-14 16:35:03 [INFO] Episode 20	Avg Length: 176.20	Avg Reward: -27.634
2025-01-14 16:35:04 [INFO] Episode 30	Avg Length: 184.60	Avg Reward: -22.109
2025-01-14 16:35:04 [INFO] Episode 40	Avg Length: 206.10	Avg Reward: -25.396
2025-01-14 16:35:05 [INFO] Episode 50	Avg Length: 167.90	Avg Reward: -16.978
2025-01-14 16:35:06 [INFO] Episode 60	Avg Length: 201.50	Avg Reward: -20.222
2025-01-14 16:35:07 [INFO] Episode 70	Avg Length: 169.40	Avg Reward: -16.232
2025-01-14 16:35:08 [INFO] Episode 80	Avg Length: 177.10	Avg Reward: -18.295
2025-01-14 16:35:09 [INFO] Episode 90	Avg Length: 195.80	Avg Reward: -18.544
2025-01-14 16:35:10 [INFO] Saved checkpoint at episode 100 -> rl_experiments/experiments/HockeyEnv_DDPG_Test/results/training/saved_model

In [4]:
import torch

# Load a saved DDPG checkpoint onto the CPU and push it into the trainer's agent.
#
# NOTE(review): the file name encodes "gamma0.99" and "ep2000", while
# `model_config` in the trainer cell uses discount=0.95 -- confirm this
# checkpoint belongs to the run you intend to evaluate.
# NOTE(review): `torch.load` unpickles the file, so only load checkpoints you
# trust. The recorded output shows a warning on the load line (presumably the
# `weights_only` FutureWarning -- confirm); consider passing `weights_only=True`
# once the checkpoint contents are known to be plain tensors/containers.
checkpoint_path = "rl_experiments/experiments/HockeyEnv_DDPG_Test/results/training/saved_models/DDPG_HockeyEnv_BasicOpponent_eps0.1_alr0.0001_clr0.001_gamma0.99_checkpoint_ep2000.pth"
checkpoint = torch.load(checkpoint_path, map_location="cpu")

# Hand the loaded state to the agent so its networks match the checkpoint.
trainer.agent.restore_state(checkpoint)

  checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))


In [8]:
# --- 6. Testing / Demo (Optional) ---

# Roll out a few evaluation episodes with the noise scale set to zero and
# report each episode's cumulative reward.
num_test_episodes = 10
for ep in range(num_test_episodes):
    obs, _ = env.reset()
    episode_reward = 0.0
    done = trunc = False

    # Keep stepping until the environment reports termination or truncation.
    while not (done or trunc):
        # eps=0.0 -> act with zero noise
        action = trainer.agent.act(obs, eps=0.0)
        obs, reward, done, trunc, info = env.step(action)
        episode_reward += reward

        # Render after every step (including the last one); the returned
        # frame is not stored in this cell.
        env.render(mode="rgb_array")

    print(f"Test Episode {ep+1}, Reward: {episode_reward:.2f}")

env.close()


Test Episode 1, Reward: -10.77
Test Episode 2, Reward: 7.07
Test Episode 3, Reward: -12.45
Test Episode 4, Reward: -13.12
Test Episode 5, Reward: -10.02
Test Episode 6, Reward: 9.27
Test Episode 7, Reward: -13.03
Test Episode 8, Reward: -34.52
Test Episode 9, Reward: -11.60
Test Episode 10, Reward: -3.28


In [13]:
import imageio
import numpy as np

# Record noise-free evaluation episodes and save each one as an animated GIF.
#
# For every episode the cell resets the environment and the agent, steps until
# the environment reports termination or truncation, captures one RGB frame
# per step, prints the cumulative reward and writes the frames to
# `ddpg_laserhockey_episode{ep}.gif`.
#
# The environment is closed once, after all episodes: the same `env` object is
# reset and stepped again on the next iteration, so it must not be closed
# inside the loop.
num_test_episodes = 10
for ep in range(num_test_episodes):
    frames = []  # RGB frames of the current episode only
    obs, info = env.reset()
    done = False
    # Reset the agent between episodes
    # (presumably clears exploration-noise state -- confirm in DDPGAgent).
    trainer.agent.reset()

    episode_reward = 0.0
    while not done:
        # Act with no noise at test time
        action = trainer.agent.act(obs, eps=0.0)
        obs, reward, done, trunc, info = env.step(action)
        episode_reward += reward

        # Capture the current frame in rgb_array mode
        frame_rgb = env.render(mode='rgb_array')
        frames.append(frame_rgb)

        # Truncation also ends the episode even though `done` stays False.
        if done or trunc:
            break

    print(f"Episode reward: {episode_reward}")

    # Save this episode's frames as a GIF
    gif_path = f"ddpg_laserhockey_episode{ep}.gif"
    imageio.mimsave(gif_path, frames, fps=15)  # set fps as desired
    print(f"Saved GIF to {gif_path}")

# Release rendering resources only after every episode has been recorded.
env.close()

Episode reward: -12.310578733653294
Saved GIF to ddpg_laserhockey_episode0.gif
Episode reward: -22.159581652821615
Saved GIF to ddpg_laserhockey_episode1.gif
Episode reward: -8.00500476368653
Saved GIF to ddpg_laserhockey_episode2.gif
Episode reward: -16.056604594272734
Saved GIF to ddpg_laserhockey_episode3.gif
Episode reward: -8.771734848460094
Saved GIF to ddpg_laserhockey_episode4.gif
Episode reward: -8.991283737925658
Saved GIF to ddpg_laserhockey_episode5.gif
Episode reward: -2.0722434075932195
Saved GIF to ddpg_laserhockey_episode6.gif
Episode reward: -13.641432833137229
Saved GIF to ddpg_laserhockey_episode7.gif
Episode reward: -4.937669097315655
Saved GIF to ddpg_laserhockey_episode8.gif
Episode reward: -7.79536010486082
Saved GIF to ddpg_laserhockey_episode9.gif


In [12]:

# Save the frames currently held in `frames` as one standalone GIF.
# NOTE(review): this cell defines neither `frames` nor `imageio`; both come from
# the recording cell, where `frames` ends up holding only the last recorded
# episode. Run that cell first, or drop this cell if the per-episode GIFs suffice.
gif_path = "ddpg_laserhockey_episode.gif"
imageio.mimsave(gif_path, frames, fps=15)  # adjust fps to change playback speed
print("Saved GIF to " + gif_path)

Saved GIF to ddpg_laserhockey_episode.gif
