In [None]:
import gymnasium as gym
import stable_baselines3 as sb
from stable_baselines3.common.env_util import make_vec_env
import torch
import pickle

In [None]:
import os
import time
import gymnasium as gym
import pandas as pd
import stable_baselines3 as sb

from stable_baselines3.common.callbacks import BaseCallback

# Output path: directory used below for the TD3 run's spreadsheet and video.
out_dir = "./TD3/Outputs"
# Create the directory (and missing parents); exist_ok keeps re-runs from failing.
os.makedirs(out_dir, exist_ok=True)


# Callback to collect episode stats
class EpisodeStatsCallback(BaseCallback):
    """Collect per-episode reward, length and elapsed training time.

    On every step the callback scans the ``infos`` entry of the training
    loop's locals.  Each info dict that carries an ``"episode"`` entry
    contributes one row:

    * ``ep_rewards`` -- the entry's ``"r"`` value,
    * ``ep_lengths`` -- the entry's ``"l"`` value,
    * ``times``      -- seconds elapsed since training started.

    Args:
        verbose: Verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, verbose=0):
        super().__init__(verbose)
        self.ep_rewards = []
        self.ep_lengths = []
        self.times = []
        # Reference point for elapsed time; set when training starts.
        self.t0 = None

    def _on_training_start(self):
        # A monotonic clock cannot jump backwards (e.g. on a system clock
        # adjustment), so the recorded elapsed times stay non-decreasing.
        self.t0 = time.monotonic()

    def _on_step(self):
        """Record statistics for every episode reported on this step.

        Returns:
            Always ``True`` so that training continues.
        """
        if self.t0 is None:
            # Defensive: start the clock here if the training-start hook
            # has not run, instead of failing on ``float - None``.
            self.t0 = time.monotonic()

        for info in self.locals.get("infos", []):
            episode = info.get("episode")
            if episode is None:
                continue
            self.ep_rewards.append(episode["r"])
            self.ep_lengths.append(episode["l"])
            self.times.append(time.monotonic() - self.t0)
        return True


# Train TD3 on the non-hardcore BipedalWalker, save the model and the
# per-episode statistics, then record one evaluation video.

# Define environment
env_id = "BipedalWalker-v3"

env = gym.make(env_id, hardcore=False, render_mode=None)
env = gym.wrappers.RecordEpisodeStatistics(env)


# model
callback = EpisodeStatsCallback()

# Close the training env even if training fails or is interrupted.
try:
    # NOTE(review): no action_noise is passed to TD3 -- confirm that the
    # default exploration behaviour is what is intended for this task.
    model = sb.TD3("MlpPolicy", env, verbose=1)

    model.learn(
        total_timesteps=1_000_000,
        callback=callback,
    )
finally:
    env.close()

model.save("./TD3/td3_easy")


# save episode data to excel
df = pd.DataFrame(
    {
        "episode": range(1, len(callback.ep_rewards) + 1),
        "episode_reward": callback.ep_rewards,
        "episode_length_timesteps": callback.ep_lengths,
        "cumulative_time_seconds": callback.times,
    }
)

excel_path = os.path.join(out_dir, "episode_stats.xlsx")
df.to_excel(excel_path, index=False)

print(f"Saved episode stats to {excel_path}")


# Record one video after training
video_env = gym.make(env_id, hardcore=False, render_mode="rgb_array")

video_env = gym.wrappers.RecordVideo(
    video_env,
    video_folder=out_dir,
    episode_trigger=lambda episode_id: True,
    name_prefix="td3_easy_video",
)

# Always close the recording env, including when the rollout raises.
try:
    obs, info = video_env.reset()

    # Cap the rollout at 2000 steps; stop earlier once the episode ends.
    for _step in range(2000):
        # Use distinct names so the loop variable is not rebound by the
        # second element of the tuple returned by predict().
        action, _state = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = video_env.step(action)
        done = terminated or truncated
        if done:
            break
finally:
    video_env.close()


In [None]:
# Train SAC on the non-hardcore BipedalWalker, save the model and the
# per-episode statistics, then record one evaluation video.

# Output path
out_dir = "./SAC/Outputs"
os.makedirs(out_dir, exist_ok=True)


# Define environment
# Defined here as well (like the hardcore cells) so this cell does not rely
# on a value left behind by a previously executed cell.
env_id = "BipedalWalker-v3"

env = gym.make(env_id, hardcore=False, render_mode=None)
env = gym.wrappers.RecordEpisodeStatistics(env)


# model
callback = EpisodeStatsCallback()

# Close the training env even if training fails or is interrupted.
try:
    model = sb.SAC(
        "MlpPolicy",
        env,
        verbose=1,
    )

    model.learn(total_timesteps=1_000_000, callback=callback)
finally:
    env.close()

model.save("./SAC/sac_easy")


# save episode data to excel
df = pd.DataFrame(
    {
        "episode": range(1, len(callback.ep_rewards) + 1),
        "episode_reward": callback.ep_rewards,
        "episode_length_timesteps": callback.ep_lengths,
        "cumulative_time_seconds": callback.times,
    }
)

excel_path = os.path.join(out_dir, "episode_stats.xlsx")
df.to_excel(excel_path, index=False)

print(f"Saved episode stats to {excel_path}")


# Record one video after training
video_env = gym.make(env_id, hardcore=False, render_mode="rgb_array")

video_env = gym.wrappers.RecordVideo(
    video_env,
    video_folder=out_dir,
    episode_trigger=lambda episode_id: True,
    name_prefix="sac_easy_video",
)

# Always close the recording env, including when the rollout raises.
try:
    obs, info = video_env.reset()

    # Cap the rollout at 2000 steps; stop earlier once the episode ends.
    for _step in range(2000):
        # Use distinct names so the loop variable is not rebound by the
        # second element of the tuple returned by predict().
        action, _state = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = video_env.step(action)
        done = terminated or truncated
        if done:
            break
finally:
    video_env.close()

In [None]:
# Train PPO on the non-hardcore BipedalWalker, save the model and the
# per-episode statistics, then record one evaluation video.

# Output path
out_dir = "./PPO/Outputs"
os.makedirs(out_dir, exist_ok=True)


# Define environment
# Defined here as well (like the hardcore cells) so this cell does not rely
# on a value left behind by a previously executed cell.
env_id = "BipedalWalker-v3"

env = gym.make(env_id, hardcore=False, render_mode=None)
env = gym.wrappers.RecordEpisodeStatistics(env)


# model
callback = EpisodeStatsCallback()

# Close the training env even if training fails or is interrupted.
try:
    model = sb.PPO("MlpPolicy", env, verbose=1)

    model.learn(total_timesteps=1_000_000, callback=callback)
finally:
    env.close()

model.save("./PPO/ppo_easy")


# save episode data to excel
df = pd.DataFrame(
    {
        "episode": range(1, len(callback.ep_rewards) + 1),
        "episode_reward": callback.ep_rewards,
        "episode_length_timesteps": callback.ep_lengths,
        "cumulative_time_seconds": callback.times,
    }
)

excel_path = os.path.join(out_dir, "episode_stats.xlsx")
df.to_excel(excel_path, index=False)

print(f"Saved episode stats to {excel_path}")


# Record one video after training
video_env = gym.make(env_id, hardcore=False, render_mode="rgb_array")

video_env = gym.wrappers.RecordVideo(
    video_env,
    video_folder=out_dir,
    episode_trigger=lambda episode_id: True,
    name_prefix="ppo_easy_video",
)

# Always close the recording env, including when the rollout raises.
try:
    obs, info = video_env.reset()

    # Cap the rollout at 2000 steps; stop earlier once the episode ends.
    for _step in range(2000):
        # Use distinct names so the loop variable is not rebound by the
        # second element of the tuple returned by predict().
        action, _state = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = video_env.step(action)
        done = terminated or truncated
        if done:
            break
finally:
    video_env.close()

In [None]:
# Continue training the saved TD3 model on the hardcore BipedalWalker, save
# the result and the per-episode statistics, then record one video.

# Output path
out_dir = "./TD3/Outputs"
os.makedirs(out_dir, exist_ok=True)


# Define environment
env_id = "BipedalWalker-v3"

env = gym.make(env_id, hardcore=True, render_mode=None)
env = gym.wrappers.RecordEpisodeStatistics(env)


# Model
callback = EpisodeStatsCallback()

# Close the training env even if loading or training fails.
try:
    # Start from the model saved by the non-hardcore TD3 run.
    model = sb.TD3.load("./TD3/td3_easy")

    # Set environment to hardcore
    model.set_env(env)

    model.learn(total_timesteps=100_000, callback=callback)
finally:
    env.close()

model.save("./TD3/td3_hard")


# save episode data to excel
df = pd.DataFrame(
    {
        "episode": range(1, len(callback.ep_rewards) + 1),
        "episode_reward": callback.ep_rewards,
        "episode_length_timesteps": callback.ep_lengths,
        "cumulative_time_seconds": callback.times,
    }
)

excel_path = os.path.join(out_dir, "episode_stats_hardcore.xlsx")
df.to_excel(excel_path, index=False)

print(f"Saved episode stats to {excel_path}")


# Record one video after training
video_env = gym.make(env_id, hardcore=True, render_mode="rgb_array")

video_env = gym.wrappers.RecordVideo(
    video_env,
    video_folder=out_dir,
    episode_trigger=lambda episode_id: True,
    name_prefix="td3_hard_video",
)

# Always close the recording env, including when the rollout raises.
try:
    obs, info = video_env.reset()

    # Cap the rollout at 2000 steps; stop earlier once the episode ends.
    for _step in range(2000):
        # Use distinct names so the loop variable is not rebound by the
        # second element of the tuple returned by predict().
        action, _state = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = video_env.step(action)
        done = terminated or truncated
        if done:
            break
finally:
    video_env.close()

In [None]:
# Continue training the saved SAC model on the hardcore BipedalWalker, save
# the result and the per-episode statistics, then record one video.

# Output path
out_dir = "./SAC/Outputs"
os.makedirs(out_dir, exist_ok=True)


# Define environment
env_id = "BipedalWalker-v3"

env = gym.make(env_id, hardcore=True, render_mode=None)
env = gym.wrappers.RecordEpisodeStatistics(env)


# Model
callback = EpisodeStatsCallback()

# Close the training env even if loading or training fails.
try:
    # Start from the model saved by the non-hardcore SAC run.
    model = sb.SAC.load("./SAC/sac_easy")

    # Set environment to hardcore
    model.set_env(env)

    model.learn(total_timesteps=200_000, callback=callback)
finally:
    env.close()

model.save("./SAC/sac_hard")


# save episode data to excel
df = pd.DataFrame(
    {
        "episode": range(1, len(callback.ep_rewards) + 1),
        "episode_reward": callback.ep_rewards,
        "episode_length_timesteps": callback.ep_lengths,
        "cumulative_time_seconds": callback.times,
    }
)

excel_path = os.path.join(out_dir, "episode_stats_hardcore.xlsx")
df.to_excel(excel_path, index=False)

print(f"Saved episode stats to {excel_path}")


# Record one video after training
video_env = gym.make(env_id, hardcore=True, render_mode="rgb_array")

video_env = gym.wrappers.RecordVideo(
    video_env,
    video_folder=out_dir,
    episode_trigger=lambda episode_id: True,
    name_prefix="sac_hard_video",
)

# Always close the recording env, including when the rollout raises.
try:
    obs, info = video_env.reset()

    # Cap the rollout at 2000 steps; stop earlier once the episode ends.
    for _step in range(2000):
        # Use distinct names so the loop variable is not rebound by the
        # second element of the tuple returned by predict().
        action, _state = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = video_env.step(action)
        done = terminated or truncated
        if done:
            break
finally:
    video_env.close()

In [None]:
# Continue training the saved PPO model on the hardcore BipedalWalker, save
# the result and the per-episode statistics, then record one video.

# Output path
out_dir = "./PPO/Outputs"
os.makedirs(out_dir, exist_ok=True)


# Define environment
env_id = "BipedalWalker-v3"

env = gym.make(env_id, hardcore=True, render_mode=None)
env = gym.wrappers.RecordEpisodeStatistics(env)


# Model
callback = EpisodeStatsCallback()

# Close the training env even if loading or training fails.
try:
    # Start from the model saved by the non-hardcore PPO run.
    model = sb.PPO.load("./PPO/ppo_easy")

    # Set environment to hardcore
    model.set_env(env)

    model.learn(total_timesteps=1_000_000, callback=callback)
finally:
    env.close()

model.save("./PPO/ppo_hard")


# save episode data to excel
df = pd.DataFrame(
    {
        "episode": range(1, len(callback.ep_rewards) + 1),
        "episode_reward": callback.ep_rewards,
        "episode_length_timesteps": callback.ep_lengths,
        "cumulative_time_seconds": callback.times,
    }
)

excel_path = os.path.join(out_dir, "episode_stats_hardcore.xlsx")
df.to_excel(excel_path, index=False)

print(f"Saved episode stats to {excel_path}")


# Record one video after training
video_env = gym.make(env_id, hardcore=True, render_mode="rgb_array")

video_env = gym.wrappers.RecordVideo(
    video_env,
    video_folder=out_dir,
    episode_trigger=lambda episode_id: True,
    name_prefix="ppo_hard_video",
)

# Always close the recording env, including when the rollout raises.
try:
    obs, info = video_env.reset()

    # Cap the rollout at 2000 steps; stop earlier once the episode ends.
    for _step in range(2000):
        # Use distinct names so the loop variable is not rebound by the
        # second element of the tuple returned by predict().
        action, _state = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = video_env.step(action)
        done = terminated or truncated
        if done:
            break
finally:
    video_env.close()