# 06 — RL with Different Algorithms

This notebook trains and evaluates multiple RL algorithms (PPO, A2C, SAC, TD3) on the same microgrid environment.



## Imports and Paths

In [1]:
import os, sys

import numpy as np
import pandas as pd

# Path for local package
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

# Microgrid sim imports
from microgrid_sim.core import MicrogridGymEnv, MicrogridEnv
from microgrid_sim.data import SyntheticDataBuilder
from microgrid_sim.utils import plot_simulation, plot_reward_progress
from microgrid_sim.components import PVGenerator, WindTurbine, FossilGenerator, GridIntertie, BatteryStorage, ResidentialLoad, FactoryLoad
from microgrid_sim.control import RLController

# SB3
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.noise import NormalActionNoise
from stable_baselines3 import PPO, A2C, SAC, TD3

print("Imports OK")


Imports OK


## Global Configurations

In [2]:
# --- Global constants ---
# Experiment size and time resolution.
NO_OF_EPISODES = 100     # episodes' worth of timesteps used for training
TOTAL_HOURS = 24 * 5     # 120 hours of synthetic data per dataset
CONTROL_DT = 60          # minutes per control decision
SIM_DT = 1               # simulation step minutes
SEED = 42

# Derived quantities: control decisions in one episode, and the total
# training budget expressed in control steps.
CONTROL_STEPS_PER_EP = TOTAL_HOURS * 60 // CONTROL_DT  # 120 control steps/episode
TRAINING_TIMESTEPS = NO_OF_EPISODES * CONTROL_STEPS_PER_EP

# Output locations (relative to the notebook's working directory).
MODEL_DIR = "./models"
LOG_DIR = "./logs/algos"
RESULTS_DIR = "./plots/06_rl_algos"

# Create every output directory up front so later cells can write freely.
for _out_dir in (MODEL_DIR, LOG_DIR, RESULTS_DIR):
    os.makedirs(_out_dir, exist_ok=True)

print(
    "Config OK",
    f"Control steps/episode: {CONTROL_STEPS_PER_EP}",
    f"Total training timesteps: {TRAINING_TIMESTEPS}",
    sep="\n",
)


Config OK
Control steps/episode: 120
Total training timesteps: 12000


## Environment Setup

In [3]:
def setup_data_builder(seed: int) -> SyntheticDataBuilder:
    """Create the synthetic-data builder used to drive one environment.

    Registers two PV series, one wind series, five load series and a grid
    price series on a builder covering ``TOTAL_HOURS`` at ``SIM_DT``-minute
    resolution.

    Args:
        seed: Seed forwarded to ``SyntheticDataBuilder``.

    Returns:
        The configured builder.
    """
    data_builder = SyntheticDataBuilder(
        total_hours=TOTAL_HOURS,
        sim_dt_minutes=SIM_DT,
        seed=seed,
    )

    # Series names must match the component names registered in
    # setup_microgrid_env().
    for pv_name, peak_irradiance in (("pv_1", 900), ("pv_2", 500)):
        data_builder.add_pv(pv_name, peak_irr=peak_irradiance)

    data_builder.add_wind("wind_a", mean_speed=7.0)

    # (name, base_kw, profile) for every load, in registration order.
    load_specs = (
        ("factory_a", 20.0, "factory"),
        ("factory_b", 13.5, "factory"),
        ("house_a", 3.5, "residential"),
        ("house_b", 2.5, "residential"),
        ("house_c", 4.5, "residential"),
    )
    for load_name, base_kw, profile in load_specs:
        data_builder.add_load(load_name, base_kw=base_kw, profile=profile)

    data_builder.add_grid_prices("grid")
    return data_builder


def setup_microgrid_env() -> MicrogridEnv:
    """Assemble the microgrid shared by every algorithm in this notebook.

    The environment contains two PV generators, one wind turbine, two diesel
    generators, three residential and two factory loads, two batteries and a
    grid intertie. Time settings come from the global configuration
    (``TOTAL_HOURS``, ``CONTROL_DT``, ``SIM_DT``).

    Returns:
        A ``MicrogridEnv`` with all components registered.
    """
    microgrid = MicrogridEnv(
        simulation_hours=TOTAL_HOURS,
        control_interval_minutes=CONTROL_DT,
        sim_dt_minutes=SIM_DT,
    )

    # Renewable generators
    renewables = [
        PVGenerator("pv_1", capacity_kw=5.0, time_step_minutes=SIM_DT),
        PVGenerator("pv_2", capacity_kw=10.0, time_step_minutes=SIM_DT),
        WindTurbine("wind_a", rated_kw=10.5, time_step_minutes=SIM_DT),
    ]

    # Dispatchable diesel units
    diesel_units = [
        FossilGenerator("diesel_1", p_min_kw=2.0, p_max_kw=10.0,
                        time_step_minutes=SIM_DT, fuel_cost_per_kwh=0.35),
        FossilGenerator("diesel_2", p_min_kw=0.0, p_max_kw=6.0,
                        time_step_minutes=SIM_DT, fuel_cost_per_kwh=0.35),
    ]

    # Loads (noise disabled on the component side)
    loads = [
        ResidentialLoad("house_a", base_kw=3.5, noise_std=0.0),
        ResidentialLoad("house_b", base_kw=2.5, noise_std=0.0),
        ResidentialLoad("house_c", base_kw=4.5, noise_std=0.0),
        FactoryLoad("factory_a", base_kw=20.0, noise_std=0.0),
        FactoryLoad("factory_b", base_kw=13.5, noise_std=0.0),
    ]

    # Storage
    batteries = [
        BatteryStorage("bat_1", capacity_kwh=20.0, time_step_minutes=SIM_DT,
                       max_charge_kw=8.0, max_discharge_kw=8.0),
        BatteryStorage("bat_2", capacity_kwh=15.0, time_step_minutes=SIM_DT,
                       max_charge_kw=5.0, max_discharge_kw=5.0),
    ]

    # Grid connection
    grid_tie = GridIntertie("grid", time_step_minutes=SIM_DT,
                            import_limit_kw=25.0, export_limit_kw=10.0,
                            price_export_per_kwh=0.05, price_import_per_kwh=0.15)

    # Register everything; the grid is added last and flagged as the grid.
    for component in (*renewables, *diesel_units, *loads, *batteries):
        microgrid.add_component(component)
    microgrid.add_component(grid_tie, is_grid=True)

    return microgrid


def build_wrapped_env(seed: int, log_csv_path: str):
    """Build a monitored, flattened environment ready for SB3.

    Args:
        seed: Seed for the synthetic data builder.
        log_csv_path: File path handed to the SB3 ``Monitor`` wrapper, which
            records episode rewards for the reward-progress plots.

    Returns:
        A ``(monitored_flat_env, dict_env)`` tuple:
        - ``monitored_flat_env``: the flattened env wrapped in ``Monitor``
          (this is what gets passed to the SB3 algorithms).
        - ``dict_env``: the ``MicrogridGymEnv`` the flattened env was created
          from, kept as a reference for retrieving result DataFrames.
    """
    # Weights of the individual reward terms.
    reward_weights = {
        "cost": 5.0,
        "unmet": 5.0,
        "curtailment": 0.1,
        "soc_deviation": 0.0,
    }

    dict_env = MicrogridGymEnv(
        microgrid_env=setup_microgrid_env(),
        data_builder=setup_data_builder(seed),
        reward_weights=reward_weights,
    )

    monitored_env = Monitor(dict_env.create_flattened_env(), filename=log_csv_path)
    return monitored_env, dict_env


In [4]:
def run_full_episode(env_flat, policy, seed=SEED):
    """Roll out one complete episode with deterministic policy actions.

    Args:
        env_flat: Flattened (and possibly wrapped) environment. Its innermost
            env must expose ``.env.get_results(as_dataframe=True)``.
        policy: Object with an SB3-style ``predict(obs, deterministic=True)``
            method returning ``(action, state)``.
        seed: Seed passed to ``env_flat.reset``.

    Returns:
        ``(df_results, total_reward, steps)``: the results DataFrame, the
        summed reward as a float, and the number of env steps taken.
    """
    obs, _ = env_flat.reset(seed=seed)
    episode_return = 0.0
    n_steps = 0

    while True:
        action, _ = policy.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, _ = env_flat.step(action)
        episode_return += reward
        n_steps += 1
        # Either termination flag ends the episode.
        if terminated or truncated:
            break

    # Results are stored on the simulator inside the unwrapped gym env.
    results_df = env_flat.unwrapped.env.get_results(as_dataframe=True)
    return results_df, float(episode_return), n_steps


def summarize_results(df, sim_dt_minutes=SIM_DT):
    """Reduce a results DataFrame to three headline metrics.

    Args:
        df: Results DataFrame; the columns ``total_cashflow``,
            ``unmet_load_kw`` and ``curtailed_gen_kw`` are used when present.
        sim_dt_minutes: Minutes represented by each row, used to convert
            summed kW into kWh.
            NOTE(review): assumes ``df`` has one row per simulation step — confirm.

    Returns:
        Dict with keys ``"Total Cost ($)"``, ``"Unmet Energy (kWh)"`` and
        ``"Curtailed Energy (kWh)"``. A metric whose source column is missing
        is reported as ``0.0``.
    """
    hours_per_row = sim_dt_minutes / 60.0

    def energy_kwh(power_column):
        # Sum of per-row kW times row duration; 0.0 when the column is absent.
        if power_column in df.columns:
            return float(df[power_column].sum() * hours_per_row)
        return 0.0

    if "total_cashflow" in df.columns:
        total_cost = float(df["total_cashflow"].sum())
    else:
        total_cost = 0.0

    return {
        "Total Cost ($)": total_cost,
        "Unmet Energy (kWh)": energy_kwh("unmet_load_kw"),
        "Curtailed Energy (kWh)": energy_kwh("curtailed_gen_kw"),
    }


# PPO

## Build PPO envs

In [5]:
# Monitor CSV of the training env; read again by the reward-progress plot.
ppo_log_csv = os.path.join(LOG_DIR, "PPO_monitor.csv")
ppo_env_train, ppo_env_dict_ref = build_wrapped_env(
    seed=SEED,
    log_csv_path=ppo_log_csv,
)

# Evaluation runs on its own env instance (same seed and configuration) so
# that no state from training carries over.
ppo_env_test, _ = build_wrapped_env(
    seed=SEED,
    log_csv_path=os.path.join(LOG_DIR, "PPO_test_monitor.csv"),
)

print("PPO envs ready")


PPO envs ready


## Train PPO

In [None]:
# Destination of the trained PPO policy.
ppo_model_path = os.path.join(MODEL_DIR, "ppo_microgrid")

# PPO is trained through the project's RLController wrapper.
# n_steps is set to the number of control steps in one episode.
ppo_controller = RLController(
    algo="PPO",
    env=ppo_env_train,
    policy="MultiInputPolicy",
    n_steps=CONTROL_STEPS_PER_EP,
    batch_size=12,
    n_epochs=5,
)
ppo_controller.train(total_timesteps=TRAINING_TIMESTEPS, log_dir=LOG_DIR)

ppo_controller.save(ppo_model_path)
print("PPO trained & saved")


Output()

## PPO reward progression plot

In [None]:
# Plot the training reward logged in the PPO Monitor CSV and save the figure.
# `rolling` is presumably the smoothing window in episodes — confirm against
# plot_reward_progress. The title's em dash was mis-encoded and is repaired here.
plot_reward_progress(
    monitor_csv_path=ppo_log_csv,
    title="PPO — Training Reward",
    out_path=os.path.join(RESULTS_DIR, "ppo_reward_progress.png"),
    rolling=10
);


## PPO evaluation & plots

In [None]:
# Reload the policy from disk so the evaluation uses exactly what was saved.
ppo_loaded = PPO.load(ppo_model_path, env=ppo_env_train)

# One deterministic episode on the separate test env, then headline metrics.
df_ppo, ppo_reward, ppo_steps = run_full_episode(ppo_env_test, ppo_loaded)
ppo_metrics = summarize_results(df_ppo)

print(f"PPO steps: {ppo_steps}")
print(f"PPO reward: {ppo_reward:.2f}")
print(f"PPO metrics: {ppo_metrics}")

# Save the simulation plots for this run under RESULTS_DIR.
_ = plot_simulation(df_ppo, sim_dt_minutes=SIM_DT, sim_name="06-PPO",
                    save=True, base_dir=RESULTS_DIR)


# A2C

## Build A2C envs

In [None]:
# Monitor CSV of the training env; read again by the reward-progress plot.
a2c_log_csv = os.path.join(LOG_DIR, "A2C_monitor.csv")
a2c_env_train, a2c_env_dict_ref = build_wrapped_env(
    seed=SEED,
    log_csv_path=a2c_log_csv,
)

# Separate env instance (same seed and configuration) for evaluation.
a2c_env_test, _ = build_wrapped_env(
    seed=SEED,
    log_csv_path=os.path.join(LOG_DIR, "A2C_test_monitor.csv"),
)

print("A2C envs ready")

## Train A2C

In [None]:
# A2C with one rollout per episode (n_steps = control steps per episode).
# `seed=SEED` makes weight initialisation and action sampling reproducible;
# without it every run of this cell trains a different model.
a2c = A2C(
    policy="MultiInputPolicy",
    env=a2c_env_train,
    learning_rate=2.5e-4,
    n_steps=CONTROL_STEPS_PER_EP,
    gamma=0.99,
    gae_lambda=0.95,
    ent_coef=0.0,
    vf_coef=0.5,
    max_grad_norm=0.5,
    seed=SEED,
    verbose=0,
    device="auto"
)

a2c.learn(total_timesteps=TRAINING_TIMESTEPS, progress_bar=True)

# Persist the trained policy so the evaluation cell can reload it.
a2c_model_path = os.path.join(MODEL_DIR, "a2c_microgrid")
a2c.save(a2c_model_path)
print("A2C trained & saved")


## A2C reward progression plot

In [None]:
# Plot the training reward logged in the A2C Monitor CSV and save the figure.
# `rolling` is presumably the smoothing window in episodes — confirm against
# plot_reward_progress. The title's em dash was mis-encoded and is repaired here.
plot_reward_progress(
    monitor_csv_path=a2c_log_csv,
    title="A2C — Training Reward",
    out_path=os.path.join(RESULTS_DIR, "a2c_reward_progress.png"),
    rolling=10
);


## A2C evaluation & plots

In [None]:
# Reload the policy from disk so the evaluation uses exactly what was saved.
a2c_loaded = A2C.load(a2c_model_path, env=a2c_env_train)

# One deterministic episode on the separate test env, then headline metrics.
df_a2c, a2c_reward, a2c_steps = run_full_episode(a2c_env_test, a2c_loaded)
a2c_metrics = summarize_results(df_a2c)

print(f"A2C steps: {a2c_steps}")
print(f"A2C reward: {a2c_reward:.2f}")
print(f"A2C metrics: {a2c_metrics}")

# Save the simulation plots for this run under RESULTS_DIR.
_ = plot_simulation(df_a2c, sim_dt_minutes=SIM_DT, sim_name="06-A2C",
                    save=True, base_dir=RESULTS_DIR)


# SAC

## Build SAC envs

In [None]:
# Monitor CSV of the training env; read again by the reward-progress plot.
sac_log_csv = os.path.join(LOG_DIR, "SAC_monitor.csv")
sac_env_train, sac_env_dict_ref = build_wrapped_env(
    seed=SEED,
    log_csv_path=sac_log_csv,
)

# Separate env instance (same seed and configuration) for evaluation.
sac_env_test, _ = build_wrapped_env(
    seed=SEED,
    log_csv_path=os.path.join(LOG_DIR, "SAC_test_monitor.csv"),
)

print("SAC envs ready")


## Train SAC

In [None]:
# SAC expects Box actions (we have flattened Box) and supports MultiInputPolicy for Dict obs.
# `seed=SEED` makes network initialisation, replay sampling and action
# sampling reproducible; without it every run trains a different model.
sac = SAC(
    policy="MultiInputPolicy",
    env=sac_env_train,
    learning_rate=3e-4,
    buffer_size=100_000,
    batch_size=256,
    tau=0.005,
    gamma=0.99,
    train_freq=1,
    gradient_steps=1,
    learning_starts=CONTROL_STEPS_PER_EP,  # after one episode of experience
    ent_coef="auto",
    seed=SEED,
    verbose=0,
    device="auto"
)

sac.learn(total_timesteps=TRAINING_TIMESTEPS, progress_bar=True)

# Persist the trained policy so the evaluation cell can reload it.
sac_model_path = os.path.join(MODEL_DIR, "sac_microgrid")
sac.save(sac_model_path)
print("SAC trained & saved")


## SAC reward progression plot

In [None]:
# Plot the training reward logged in the SAC Monitor CSV and save the figure.
# `rolling` is presumably the smoothing window in episodes — confirm against
# plot_reward_progress. The title's em dash was mis-encoded and is repaired here.
plot_reward_progress(
    monitor_csv_path=sac_log_csv,
    title="SAC — Training Reward",
    out_path=os.path.join(RESULTS_DIR, "sac_reward_progress.png"),
    rolling=10
);


## SAC evaluation & plots

In [None]:
# Reload the policy from disk so the evaluation uses exactly what was saved.
sac_loaded = SAC.load(sac_model_path, env=sac_env_train)

# One deterministic episode on the separate test env, then headline metrics.
df_sac, sac_reward, sac_steps = run_full_episode(sac_env_test, sac_loaded)
sac_metrics = summarize_results(df_sac)

print(f"SAC steps: {sac_steps}")
print(f"SAC reward: {sac_reward:.2f}")
print(f"SAC metrics: {sac_metrics}")

# Save the simulation plots for this run under RESULTS_DIR.
_ = plot_simulation(df_sac, sim_dt_minutes=SIM_DT, sim_name="06-SAC",
                    save=True, base_dir=RESULTS_DIR)


# TD3

## Build TD3 envs

In [None]:
# Monitor CSV of the training env; read again by the reward-progress plot.
td3_log_csv = os.path.join(LOG_DIR, "TD3_monitor.csv")
td3_env_train, td3_env_dict_ref = build_wrapped_env(
    seed=SEED,
    log_csv_path=td3_log_csv,
)

# Separate env instance (same seed and configuration) for evaluation.
td3_env_test, _ = build_wrapped_env(
    seed=SEED,
    log_csv_path=os.path.join(LOG_DIR, "TD3_test_monitor.csv"),
)

print("TD3 envs ready")


## Train TD3

In [None]:
# TD3's actor is deterministic and SB3 leaves `action_noise` at None by
# default, so after the `learning_starts` warm-up the agent would act with no
# exploration noise at all. Add zero-mean Gaussian noise (sigma 0.1, the
# value used in the SB3 TD3 example) on every action dimension.
td3_n_actions = td3_env_train.action_space.shape[-1]
td3_action_noise = NormalActionNoise(
    mean=np.zeros(td3_n_actions),
    sigma=0.1 * np.ones(td3_n_actions),
)

# `seed=SEED` makes network initialisation, replay sampling and the
# exploration noise reproducible across runs.
td3 = TD3(
    policy="MultiInputPolicy",
    env=td3_env_train,
    learning_rate=1e-3,
    buffer_size=100_000,
    batch_size=256,
    tau=0.005,
    gamma=0.99,
    train_freq=1,
    gradient_steps=1,
    learning_starts=CONTROL_STEPS_PER_EP,  # after one episode of experience
    action_noise=td3_action_noise,
    policy_delay=2,
    seed=SEED,
    verbose=0,
    device="auto"
)

td3.learn(total_timesteps=TRAINING_TIMESTEPS, progress_bar=True)

# Persist the trained policy so the evaluation cell can reload it.
td3_model_path = os.path.join(MODEL_DIR, "td3_microgrid")
td3.save(td3_model_path)
print("TD3 trained & saved")


## TD3 reward progression plot

In [None]:
# Plot the training reward logged in the TD3 Monitor CSV and save the figure.
# `rolling` is presumably the smoothing window in episodes — confirm against
# plot_reward_progress. The title's em dash was mis-encoded and is repaired here.
plot_reward_progress(
    monitor_csv_path=td3_log_csv,
    title="TD3 — Training Reward",
    out_path=os.path.join(RESULTS_DIR, "td3_reward_progress.png"),
    rolling=10
);


## TD3 evaluation & plots

In [None]:
# Reload the policy from disk so the evaluation uses exactly what was saved.
td3_loaded = TD3.load(td3_model_path, env=td3_env_train)

# One deterministic episode on the separate test env, then headline metrics.
df_td3, td3_reward, td3_steps = run_full_episode(td3_env_test, td3_loaded)
td3_metrics = summarize_results(df_td3)

print(f"TD3 steps: {td3_steps}")
print(f"TD3 reward: {td3_reward:.2f}")
print(f"TD3 metrics: {td3_metrics}")

# Save the simulation plots for this run under RESULTS_DIR.
_ = plot_simulation(df_td3, sim_dt_minutes=SIM_DT, sim_name="06-TD3",
                    save=True, base_dir=RESULTS_DIR)


# Random Policy Baseline

## Random policy (for context)

In [None]:
# Baseline: uniformly random actions on a fresh env built like the test envs.
rand_env_test, _ = build_wrapped_env(seed=SEED, log_csv_path=os.path.join(LOG_DIR, "RANDOM_test_monitor.csv"))

# action_space.sample() draws from the space's own RNG, which env.reset(seed=...)
# does not seed. Seed it explicitly so the baseline numbers are reproducible.
rand_env_test.action_space.seed(SEED)

obs, info = rand_env_test.reset(seed=SEED)
rand_reward = 0.0
done = False
rand_steps = 0
while not done:
    action = rand_env_test.action_space.sample()
    obs, reward, terminated, truncated, info = rand_env_test.step(action)
    rand_reward += reward
    rand_steps += 1
    # Either termination flag ends the episode.
    done = terminated or truncated

# Results are stored on the simulator inside the unwrapped gym env.
df_random = rand_env_test.unwrapped.env.get_results(as_dataframe=True)
rand_metrics = summarize_results(df_random)

print("Random steps:", rand_steps)
print("Random reward:", f"{rand_reward:.2f}")
print("Random metrics:", rand_metrics)

# Save the simulation plots for this run under RESULTS_DIR.
_ = plot_simulation(
    df_random,
    sim_dt_minutes=SIM_DT,
    sim_name="06-RANDOM",
    save=True,
    base_dir=RESULTS_DIR
)


# Comparison of All Algorithms

In [None]:
def fmt_money(x):
    """Format a number as US-style currency text.

    The value is rounded to cents, thousands are comma-separated, and the
    minus sign is placed before the dollar sign (``-$1,234.50``, not
    ``$-1,234.50``). Values that round to zero never carry a minus sign.

    Args:
        x: Numeric amount.

    Returns:
        The formatted string, e.g. ``"$1,234.50"``.
    """
    cents_rounded = round(x, 2)
    # `-0.0 < 0` is False, so amounts that round to zero print as "$0.00".
    sign = "-" if cents_rounded < 0 else ""
    return f"{sign}${abs(cents_rounded):,.2f}"

# Collect each controller's episode metrics and test reward, keyed by name.
results = {
    "Random": {**rand_metrics, "Test Reward": rand_reward},
    "PPO":    {**ppo_metrics,  "Test Reward": ppo_reward},
    "A2C":    {**a2c_metrics,  "Test Reward": a2c_reward},
    "SAC":    {**sac_metrics,  "Test Reward": sac_reward},
    "TD3":    {**td3_metrics,  "Test Reward": td3_reward},
}

# Numeric table: one row per controller, one column per metric. It is left
# numeric so it can still be sorted, plotted or exported afterwards.
comparison_df = pd.DataFrame(results).T

# Text formatting is applied to a copy that is used for display only.
column_formatters = {
    "Total Cost ($)": fmt_money,
    "Unmet Energy (kWh)": "{:.3f}".format,
    "Curtailed Energy (kWh)": "{:.3f}".format,
    "Test Reward": "{:.3f}".format,
}
comparison_display = comparison_df.copy()
for column, formatter in column_formatters.items():
    comparison_display[column] = comparison_display[column].apply(formatter)

print("\n" + "="*80)
print("🏁 ALGORITHM COMPARISON")  # banner emoji was mis-encoded; repaired
print("="*80)
print(comparison_display.to_string())
print("="*80)
