# 06 — RL with Different Algorithms

This notebook trains and evaluates multiple RL algorithms (PPO, A2C, SAC, TD3) on the same microgrid environment.



## Imports and Paths

In [1]:
import os, sys
import pandas as pd

# Put the parent of the current working directory on sys.path so the local
# package imported below can be found (only appended if not already present).
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

# Microgrid sim imports
from microgrid_sim.core import MicrogridGymEnv, MicrogridEnv
from microgrid_sim.data import SyntheticDataBuilder
from microgrid_sim.utils import plot_simulation, plot_reward_progress
from microgrid_sim.components import PVGenerator, WindTurbine, FossilGenerator, GridIntertie, BatteryStorage, ResidentialLoad, FactoryLoad
from microgrid_sim.control import RLController

# SB3
from stable_baselines3.common.monitor import Monitor
from stable_baselines3 import PPO, A2C, SAC, TD3

print("Imports OK")


Imports OK


## Global Configurations

In [None]:
# --- Global constants ---
NO_OF_EPISODES = 1000
TOTAL_HOURS = 24 * 5  # 120 hours of synthetic data per dataset
CONTROL_DT = 60       # minutes per control decision
SIM_DT = 1            # simulation step minutes
SEED = 42

# Derived sizes: control decisions in one episode, and the overall training budget.
CONTROL_STEPS_PER_EP = (TOTAL_HOURS * 60) // CONTROL_DT  # 120 control steps/episode
TRAINING_TIMESTEPS = CONTROL_STEPS_PER_EP * NO_OF_EPISODES

# Reward weights handed to MicrogridGymEnv (soc_deviation is switched off with 0.0).
REWARD_WEIGHTS = dict(
    cost=50.0,
    unmet=50.0,
    curtailment=1.0,
    soc_deviation=0.0,
)

# Output locations; created up front so later cells can write into them.
MODEL_DIR = "./models"
LOG_DIR = "./logs/algos"
RESULTS_DIR = "./plots/06_rl_algos"

for _out_dir in (MODEL_DIR, LOG_DIR, RESULTS_DIR):
    os.makedirs(_out_dir, exist_ok=True)

# When True, manage_training tries to load a saved model before training a new one.
USE_SAVED_MODELS = False

# Shared notebook state; "results" is filled per algorithm by the evaluation cells.
global_data = {section: {} for section in ("results", "envs", "models")}

print("Config OK")
print(f"Control steps/episode: {CONTROL_STEPS_PER_EP}")
print(f"Total training timesteps: {TRAINING_TIMESTEPS}")


Config OK
Control steps/episode: 120
Total training timesteps: 240000


## Environment Setup

In [3]:
def setup_data_builder(seed: int) -> SyntheticDataBuilder:
    """Create the synthetic data builder with every series the notebook registers.

    Registers two PV series, one wind series, five load series and one
    grid-price series, in that order.

    Args:
        seed: Seed forwarded to ``SyntheticDataBuilder``.

    Returns:
        The configured ``SyntheticDataBuilder``.
    """
    data_builder = SyntheticDataBuilder(
        total_hours=TOTAL_HOURS, sim_dt_minutes=SIM_DT, seed=seed
    )

    # Series names must match the component names used in setup_microgrid_env.
    pv_specs = (("pv_1", 900), ("pv_2", 500))
    for pv_name, peak_irradiance in pv_specs:
        data_builder.add_pv(pv_name, peak_irr=peak_irradiance)

    data_builder.add_wind("wind_a", mean_speed=7.0)

    load_specs = (
        ("factory_a", 20.0, "factory"),
        ("factory_b", 13.5, "factory"),
        ("house_a", 3.5, "residential"),
        ("house_b", 2.5, "residential"),
        ("house_c", 4.5, "residential"),
    )
    for load_name, load_base_kw, load_profile in load_specs:
        data_builder.add_load(load_name, base_kw=load_base_kw, profile=load_profile)

    data_builder.add_grid_prices("grid")
    return data_builder


def setup_microgrid_env() -> MicrogridEnv:
    """Assemble the microgrid used by every algorithm in this notebook.

    The environment gets two PV generators, one wind turbine, two fossil
    generators, three residential loads, two factory loads, two batteries
    and one grid intertie (registered with ``is_grid=True``).

    Returns:
        The populated ``MicrogridEnv``.
    """
    microgrid = MicrogridEnv(
        simulation_hours=TOTAL_HOURS,
        control_interval_minutes=CONTROL_DT,
        sim_dt_minutes=SIM_DT,
    )

    # Non-grid components, listed in the order they are registered.
    components = [
        # Generators
        PVGenerator("pv_1", capacity_kw=20.0, time_step_minutes=SIM_DT),
        PVGenerator("pv_2", capacity_kw=25.0, time_step_minutes=SIM_DT),
        WindTurbine("wind_a", rated_kw=17.5, time_step_minutes=SIM_DT),
        # Dispatchable
        FossilGenerator(
            "diesel_1",
            p_min_kw=0.0,
            p_max_kw=20.0,
            time_step_minutes=SIM_DT,
            fuel_cost_per_kwh=0.35,
        ),
        FossilGenerator(
            "diesel_2",
            p_min_kw=0.0,
            p_max_kw=15.0,
            time_step_minutes=SIM_DT,
            fuel_cost_per_kwh=0.35,
        ),
        # Loads
        ResidentialLoad("house_a", base_kw=3.5, noise_std=0.0),
        ResidentialLoad("house_b", base_kw=2.5, noise_std=0.0),
        ResidentialLoad("house_c", base_kw=4.5, noise_std=0.0),
        FactoryLoad("factory_a", base_kw=20.0, noise_std=0.0),
        FactoryLoad("factory_b", base_kw=13.5, noise_std=0.0),
        # Storage
        BatteryStorage(
            "bat_1",
            capacity_kwh=50.0,
            time_step_minutes=SIM_DT,
            max_charge_kw=40.0,
            max_discharge_kw=35.0,
        ),
        BatteryStorage(
            "bat_2",
            capacity_kwh=30.0,
            time_step_minutes=SIM_DT,
            max_charge_kw=20.0,
            max_discharge_kw=15.0,
        ),
    ]

    # Grid connection: registered separately because it is flagged as the grid.
    grid_tie = GridIntertie(
        "grid",
        time_step_minutes=SIM_DT,
        import_limit_kw=50.0,
        export_limit_kw=30.0,
        price_export_per_kwh=0.15,
        price_import_per_kwh=0.25,
    )

    for component in components:
        microgrid.add_component(component)
    microgrid.add_component(grid_tie, is_grid=True)

    return microgrid


def build_wrapped_env(seed: int, log_csv_path: str):
    """Build a monitored, flattened Gym environment for one data seed.

    Args:
        seed: Seed passed to ``setup_data_builder`` for the synthetic data.
        log_csv_path: File name handed to the SB3 ``Monitor`` wrapper.

    Returns:
        A ``(monitored_env, dict_env)`` tuple:
        - ``monitored_env``: the flattened env wrapped in ``Monitor``; this is
          the env given to SB3 and the one whose rewards are logged.
        - ``dict_env``: the underlying ``MicrogridGymEnv`` before flattening,
          kept as a reference to the original dict env.
    """
    microgrid = setup_microgrid_env()
    builder = setup_data_builder(seed)

    dict_env = MicrogridGymEnv(
        microgrid_env=microgrid,
        data_builder=builder,
        reward_weights=REWARD_WEIGHTS,
    )
    monitored_env = Monitor(dict_env.create_flattened_env(), filename=log_csv_path)

    return monitored_env, dict_env


In [4]:
def run_full_episode(env_flat, policy, seed=SEED):
    """Roll out one complete episode with deterministic actions from ``policy``.

    Args:
        env_flat: Flattened (wrapped) environment exposing ``reset``/``step``.
        policy: Object with a ``predict(obs, deterministic=True)`` method.
        seed: Seed passed to ``env_flat.reset``.

    Returns:
        ``(df_results, total_reward, steps)``: the results DataFrame read from
        the unwrapped env, the summed reward as a ``float``, and the number of
        steps taken.
    """
    obs, _info = env_flat.reset(seed=seed)
    episode_return = 0.0
    n_steps = 0

    while True:
        action, _state = policy.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, _info = env_flat.step(action)
        episode_return += reward
        n_steps += 1
        # Either termination signal ends the episode.
        if terminated or truncated:
            break

    results_df = env_flat.unwrapped.env.get_results(as_dataframe=True)
    return results_df, float(episode_return), n_steps


def summarize_results(df, sim_dt_minutes=SIM_DT):
    """Reduce a results DataFrame to three headline metrics.

    A metric whose source column is absent from ``df`` is reported as ``0.0``.

    Args:
        df: Results DataFrame; the columns read are ``total_cashflow``,
            ``unmet_load_kw`` and ``curtailed_gen_kw``.
        sim_dt_minutes: Minutes per row, used to turn summed kW into kWh.

    Returns:
        Dict with keys ``"Total Cost ($)"``, ``"Unmet Energy (kWh)"`` and
        ``"Curtailed Energy (kWh)"``.
    """
    hours_per_row = sim_dt_minutes / 60.0
    available = df.columns

    if "total_cashflow" in available:
        total_cost = float(df["total_cashflow"].sum())
    else:
        total_cost = 0.0

    if "unmet_load_kw" in available:
        unmet_kwh = float(df["unmet_load_kw"].sum() * hours_per_row)
    else:
        unmet_kwh = 0.0

    if "curtailed_gen_kw" in available:
        curtailed_kwh = float(df["curtailed_gen_kw"].sum() * hours_per_row)
    else:
        curtailed_kwh = 0.0

    return {
        "Total Cost ($)": total_cost,
        "Unmet Energy (kWh)": unmet_kwh,
        "Curtailed Energy (kWh)": curtailed_kwh,
    }


In [5]:
def manage_training(AlgoClass, algo_name, env_train, total_timesteps, **kwargs):
    """Return a trained policy for ``algo_name``, loading it from disk if allowed.

    Args:
        AlgoClass: SB3 algorithm class, used only for ``AlgoClass.load``.
        algo_name: Algorithm name; used in the model file name, in log
            messages and as the ``algo`` argument of ``RLController``.
        env_train: Environment to train on (or to attach to a loaded model).
        total_timesteps: Training budget passed to ``RLController.train``.
        **kwargs: Extra keyword arguments forwarded to ``RLController``.

    Returns:
        ``(policy, model_path)`` where ``model_path`` has no ``.zip`` suffix.
    """
    model_path = os.path.join(MODEL_DIR, f"{algo_name}_microgrid")
    saved_zip = model_path + ".zip"

    # 1. Reuse a saved model when enabled and present; any failure falls
    #    through to training from scratch.
    policy = None
    if USE_SAVED_MODELS and os.path.exists(saved_zip):
        try:
            print(f"\n--- {algo_name}: Loading existing model ---")
            # NOTE(review): `algo=` is not an obvious `load` parameter; confirm it is needed.
            policy = AlgoClass.load(model_path, env=env_train, algo=algo_name)
        except Exception as e:
            print(f"Failed to load {algo_name}: {e}. Training new model.")
            policy = None

    if policy is not None:
        return policy, model_path

    # 2. No usable saved model: train a new one and save it.
    print(f"\n--- {algo_name}: Building and training new model ---")
    rl_controller = RLController(algo=algo_name, env=env_train, **kwargs)
    rl_controller.train(total_timesteps=total_timesteps, log_dir=LOG_DIR)
    print(f"{algo_name} training complete. Saving model...")

    rl_controller.save(model_path)
    print(f"{algo_name} trained and saved.")

    return rl_controller.model, model_path

# Setup all environments

In [6]:
# Global setup of environments (run once).
# build_wrapped_env returns (monitored env, dict env); only the monitored env is kept.
# Despite the `_vec` suffix, each of these is a single Monitor-wrapped env.

# Training envs: synthetic data generated with SEED.
ppo_env_train_vec, _ = build_wrapped_env(
    log_csv_path=os.path.join(LOG_DIR, "PPO_monitor.csv"),
    seed=SEED,
)
a2c_env_train_vec, _ = build_wrapped_env(
    log_csv_path=os.path.join(LOG_DIR, "A2C_monitor.csv"),
    seed=SEED,
)
sac_env_train_vec, _ = build_wrapped_env(
    log_csv_path=os.path.join(LOG_DIR, "SAC_monitor.csv"),
    seed=SEED,
)
td3_env_train_vec, _ = build_wrapped_env(
    log_csv_path=os.path.join(LOG_DIR, "TD3_monitor.csv"),
    seed=SEED,
)

# Test envs: synthetic data generated with SEED + 1.
ppo_env_test_vec, _ = build_wrapped_env(
    log_csv_path=os.path.join(LOG_DIR, "PPO_test_monitor.csv"),
    seed=SEED + 1,
)
a2c_env_test_vec, _ = build_wrapped_env(
    log_csv_path=os.path.join(LOG_DIR, "A2C_test_monitor.csv"),
    seed=SEED + 1,
)
sac_env_test_vec, _ = build_wrapped_env(
    log_csv_path=os.path.join(LOG_DIR, "SAC_test_monitor.csv"),
    seed=SEED + 1,
)
td3_env_test_vec, _ = build_wrapped_env(
    log_csv_path=os.path.join(LOG_DIR, "TD3_test_monitor.csv"),
    seed=SEED + 1,
)

print("Environments for training and testing set up.")

Environments for training and testing set up.


# PPO

## Train PPO

In [7]:
# Train PPO on its training env (or load a saved model if USE_SAVED_MODELS is set).
ppo_policy, ppo_model_path = manage_training(
    AlgoClass=PPO,
    algo_name="PPO",
    env_train=ppo_env_train_vec,
    total_timesteps=TRAINING_TIMESTEPS,
)


--- PPO: Building and training new model ---


Output()

## PPO reward progression plot

In [None]:
# Plot PPO's training reward from its Monitor CSV and save the figure.
ppo_log_csv = os.path.join(LOG_DIR, "PPO_monitor.csv")
_ = plot_reward_progress(
    monitor_csv_path=ppo_log_csv,
    title="PPO — Training Reward",  # em dash restored (was mis-decoded bytes)
    out_path=os.path.join(RESULTS_DIR, "ppo_reward_progress.png"),
    rolling=10
)

## PPO evaluation & plots

In [None]:
# Evaluate the PPO policy for one episode on its test env (seed SEED + 1).
df_ppo, ppo_reward, ppo_steps = run_full_episode(
    ppo_env_test_vec, ppo_policy, seed=SEED + 1
)
ppo_metrics = summarize_results(df_ppo)

# Store the metrics plus the episode reward for the final comparison table.
global_data["results"]["PPO"] = {**ppo_metrics, "Test Reward": ppo_reward}

print(f"PPO reward: {ppo_reward:.2f}")
print(f"PPO metrics: {ppo_metrics}")

_ = plot_simulation(
    df_ppo,
    sim_dt_minutes=SIM_DT,
    sim_name="06-PPO",
    save=True,
    base_dir=RESULTS_DIR,
)

# A2C

## Train A2C

In [None]:
# Train A2C on its training env (or load a saved model if USE_SAVED_MODELS is set).
a2c_policy, a2c_model_path = manage_training(
    AlgoClass=A2C,
    algo_name="A2C",
    env_train=a2c_env_train_vec,
    total_timesteps=TRAINING_TIMESTEPS,
)

## A2C reward progression plot

In [None]:
# Plot A2C's training reward from its Monitor CSV and save the figure.
a2c_log_csv = os.path.join(LOG_DIR, "A2C_monitor.csv")
_ = plot_reward_progress(
    monitor_csv_path=a2c_log_csv,
    title="A2C — Training Reward",  # em dash restored (was mis-decoded bytes)
    out_path=os.path.join(RESULTS_DIR, "a2c_reward_progress.png"),
    rolling=10
)


## A2C evaluation & plots

In [None]:
# Evaluate the A2C policy for one episode on its test env (seed SEED + 1).
df_a2c, a2c_reward, a2c_steps = run_full_episode(
    a2c_env_test_vec, a2c_policy, seed=SEED + 1
)
a2c_metrics = summarize_results(df_a2c)

# Store the metrics plus the episode reward for the final comparison table.
global_data["results"]["A2C"] = {**a2c_metrics, "Test Reward": a2c_reward}

print(f"A2C reward: {a2c_reward:.2f}")
print(f"A2C metrics: {a2c_metrics}")

_ = plot_simulation(
    df_a2c,
    sim_dt_minutes=SIM_DT,
    sim_name="06-A2C",
    save=True,
    base_dir=RESULTS_DIR,
)


# SAC

## Train SAC

In [None]:
# Train SAC on its training env (or load a saved model if USE_SAVED_MODELS is set).
sac_policy, sac_model_path = manage_training(
    AlgoClass=SAC,
    algo_name="SAC",
    env_train=sac_env_train_vec,
    total_timesteps=TRAINING_TIMESTEPS,
)

## SAC reward progression plot

In [None]:
# Plot SAC's training reward from its Monitor CSV and save the figure.
sac_log_csv = os.path.join(LOG_DIR, "SAC_monitor.csv")
_ = plot_reward_progress(
    monitor_csv_path=sac_log_csv,
    title="SAC — Training Reward",  # em dash restored (was mis-decoded bytes)
    out_path=os.path.join(RESULTS_DIR, "sac_reward_progress.png"),
    rolling=10
)


## SAC evaluation & plots

In [None]:
# Evaluate the SAC policy for one episode on its test env (seed SEED + 1).
df_sac, sac_reward, sac_steps = run_full_episode(
    sac_env_test_vec, sac_policy, seed=SEED + 1
)
sac_metrics = summarize_results(df_sac)

# Store the metrics plus the episode reward for the final comparison table.
global_data["results"]["SAC"] = {**sac_metrics, "Test Reward": sac_reward}

print(f"SAC reward: {sac_reward:.2f}")
print(f"SAC metrics: {sac_metrics}")

_ = plot_simulation(
    df_sac,
    sim_dt_minutes=SIM_DT,
    sim_name="06-SAC",
    save=True,
    base_dir=RESULTS_DIR,
)


# TD3

## Train TD3

In [None]:
# Train TD3 on its training env (or load a saved model if USE_SAVED_MODELS is set).
td3_policy, td3_model_path = manage_training(
    AlgoClass=TD3,
    algo_name="TD3",
    env_train=td3_env_train_vec,
    total_timesteps=TRAINING_TIMESTEPS,
)

## TD3 reward progression plot

In [None]:
# Plot TD3's training reward from its Monitor CSV and save the figure.
td3_log_csv = os.path.join(LOG_DIR, "TD3_monitor.csv")
_ = plot_reward_progress(
    monitor_csv_path=td3_log_csv,
    title="TD3 — Training Reward",  # em dash restored (was mis-decoded bytes)
    out_path=os.path.join(RESULTS_DIR, "td3_reward_progress.png"),
    rolling=10
)


## TD3 evaluation & plots

In [None]:
# Evaluate the TD3 policy for one episode on its test env (seed SEED + 1).
df_td3, td3_reward, td3_steps = run_full_episode(
    td3_env_test_vec, td3_policy, seed=SEED + 1
)
td3_metrics = summarize_results(df_td3)

# Store the metrics plus the episode reward for the final comparison table.
global_data["results"]["TD3"] = {**td3_metrics, "Test Reward": td3_reward}

print(f"TD3 reward: {td3_reward:.2f}")
print(f"TD3 metrics: {td3_metrics}")

_ = plot_simulation(
    df_td3,
    sim_dt_minutes=SIM_DT,
    sim_name="06-TD3",
    save=True,
    base_dir=RESULTS_DIR,
)

# Random Policy Baseline

## Random policy (for context)

In [None]:
# Random-action baseline, evaluated on the same held-out data as the trained
# agents (data seed SEED + 1) so its row in the comparison table is comparable.
rand_env_test, _ = build_wrapped_env(
    seed=SEED + 1,
    log_csv_path=os.path.join(LOG_DIR, "RANDOM_test_monitor.csv"),
)

# Seed the action space so the sampled actions are reproducible across runs;
# resetting the env with a seed does not seed `action_space.sample()`.
rand_env_test.action_space.seed(SEED + 1)

obs, info = rand_env_test.reset(seed=SEED + 1)
rand_reward = 0.0
done = False
rand_steps = 0
while not done:
    action = rand_env_test.action_space.sample()
    obs, reward, terminated, truncated, info = rand_env_test.step(action)
    rand_reward += reward
    rand_steps += 1
    done = terminated or truncated

# Store a plain float, matching what run_full_episode returns for the agents.
rand_reward = float(rand_reward)

df_random = rand_env_test.unwrapped.env.get_results(as_dataframe=True)
rand_metrics = summarize_results(df_random)

global_data["results"]["Random"] = {**rand_metrics, "Test Reward": rand_reward}

print("Random steps:", rand_steps)
print("Random reward:", f"{rand_reward:.2f}")
print("Random metrics:", rand_metrics)

_ = plot_simulation(
    df_random,
    sim_dt_minutes=SIM_DT,
    sim_name="06-RANDOM",
    save=True,
    base_dir=RESULTS_DIR
)


# Comparison of All Algorithms

In [None]:
def fmt_money(x):
    """Format a number as dollars with thousands separators and two decimals."""
    return f"${x:,.2f}"


# One row per algorithm (the keys of global_data["results"]), one column per metric.
comparison_df = pd.DataFrame(global_data["results"]).T

# Convert every column to display strings; after this the frame is for printing only.
comparison_df["Total Cost ($)"] = comparison_df["Total Cost ($)"].apply(fmt_money)
comparison_df["Unmet Energy (kWh)"] = comparison_df["Unmet Energy (kWh)"].apply(lambda v: f"{v:.3f}")
comparison_df["Curtailed Energy (kWh)"] = comparison_df["Curtailed Energy (kWh)"].apply(lambda v: f"{v:.3f}")
comparison_df["Test Reward"] = comparison_df["Test Reward"].apply(lambda v: f"{v:.3f}")

print("\n" + "="*80)
print("🏁 ALGORITHM COMPARISON")  # emoji restored (was mis-decoded bytes)
print("="*80)
print(comparison_df.to_string())
print("="*80)