**FULLY REFACTORED VERSION** - A template for RL training, with the H-network trained alongside the DDQN agent

This notebook uses the refactored environment factory pattern for creating discrete environments.
It uses the unified factory pattern for environments, H-network modules, and callbacks.

In [1]:
import numpy as np
import pandas as pd

from pathlib import Path
from datetime import datetime
import sys

from utils import print_log

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# Folder holding the pre-split dataset (1-minute downsampled, per the folder name).
dataset_folder_path = Path("dataset") / "20250707_downsampled_1min" / "split"

In [4]:
from utils_data_split import load_split_data_from_folder, convert_to_naive_datetimes, convert_to_naive_datetimes_df

In [5]:
# Load the 'test' split from the dataset folder; the helper returns two objects,
# unpacked here as the load segments and the aggregate-load frame.
aggregate_load_segments_test, aggregate_load_df_test = load_split_data_from_folder(dataset_folder_path, 'test') 

In [6]:
# Convert both structures to naive datetimes; a dedicated helper exists for
# the segments and another for the DataFrame.
aggregate_load_segments_test = convert_to_naive_datetimes(aggregate_load_segments_test)
aggregate_load_df_test = convert_to_naive_datetimes_df(aggregate_load_df_test)

---

Experiment start

In [None]:
# Point the notebook at an existing experiment folder; it holds the checkpoint
# to evaluate and receives the test logs written further down.

# Alternative run timestamps, kept for quick switching:
# rl_datetime = datetime(2025, 8, 11, 2, 46, 31)
# rl_datetime = datetime(2025, 8, 11, 4, 45, 55)
rl_datetime = datetime(2025, 8, 11, 6, 42, 39)
ACTION_TYPE = "continuous"
REWARD_LAMBDA = "1.0"  # between 0 and 1; a string because it is embedded verbatim in the folder name

run_timestamp = rl_datetime.strftime('%Y%m%d_%H%M%S')
experiment_folder = Path("experiments") / (
    run_timestamp + f"_action_{ACTION_TYPE}_reward_lambda_{REWARD_LAMBDA}"
)

# Fail fast: nothing below can work without the experiment folder.
if not experiment_folder.exists():
    raise FileNotFoundError(f"Experiment folder {experiment_folder} does not exist. Please check the path.")

print_log(f"Experiment folder: {experiment_folder}")

[2025-08-12 12:10:07:207] Experiment folder: experiments/20250811_064239_action_continuous_reward_lambda_1.0


In [8]:
# Locate the checkpoint chosen for evaluation. It lives in a sub-folder of
# `selected_ckpt` whose name ends with the episode number (e.g. `episode_0703`).

selected_ckpt_root = experiment_folder / "selected_ckpt"
if not selected_ckpt_root.is_dir():
    raise FileNotFoundError(f"Selected checkpoint folder {selected_ckpt_root} does not exist.")

# Path.iterdir() yields entries in arbitrary order, so sort them to make the
# choice deterministic when more than one sub-folder is present.
sub_folders = sorted(f for f in selected_ckpt_root.iterdir() if f.is_dir())
if not sub_folders:
    raise FileNotFoundError(f"No checkpoint sub-folder found under {selected_ckpt_root}.")
if len(sub_folders) > 1:
    print_log(
        f"Warning: found {len(sub_folders)} sub-folders under {selected_ckpt_root}; "
        "using the first one in sorted order."
    )

selected_ckpt_folder = sub_folders[0]
# The episode number is the last underscore-separated token of the folder name.
selected_ckpt_episode_number = int(selected_ckpt_folder.name.split('_')[-1])

print_log(f"Selected checkpoint folder: {selected_ckpt_folder}")

[2025-08-12 12:10:07:222] Selected checkpoint folder: experiments/20250811_064239_action_continuous_reward_lambda_1.0/selected_ckpt/episode_0703


TODO: read the experiment_details under the experiment_folder, find the seed element, and use it to seed the action_space and the RL model for reproducibility

Create data loader

In [9]:
# Extend the import search path with the project sub-packages before importing from them.
for extra_import_root in (Path('rl_env'), Path('model', 'H_network')):
    sys.path.append(str(extra_import_root))

from rl_env.data_loader import SimpleSmartMeterDataLoader

# Data loader over the test split; its episode registry file sits inside the experiment folder.
sm_dl_test = SimpleSmartMeterDataLoader(
    registry_path=experiment_folder / "simple_episode_registry_test.json",
    aggregate_load_df=aggregate_load_df_test,
    aggregate_load_segments=aggregate_load_segments_test,
)

# Show how many divided segments (episodes) the loader produced.
sm_dl_test.get_divided_segments_length()

[SimpleDataLoader] Generated 80 episodes (1 day each)


80

In [10]:
# Spot-check one divided segment; the output shows a [start, end] datetime pair.
sm_dl_test.divided_segments[7]

array([datetime.datetime(2013, 2, 15, 0, 0),
       datetime.datetime(2013, 2, 15, 23, 59, 59, 999000)], dtype=object)

In [11]:
# Sample segment: display the aggregate-load rows belonging to segment index 13.

sm_dl_test.get_aggregate_load_segment(13)

Unnamed: 0,timestamp,aggregate,datetime,segment_index,episode_content_id,episode_length_days
18720,1362355201,144.000000,2013-03-04 00:00:01,13,9eafa9c7f9a1,1
18721,1362355261,146.640530,2013-03-04 00:01:01,13,9eafa9c7f9a1,1
18722,1362355321,144.977225,2013-03-04 00:02:01,13,9eafa9c7f9a1,1
18723,1362355381,145.282810,2013-03-04 00:03:01,13,9eafa9c7f9a1,1
18724,1362355441,144.273479,2013-03-04 00:04:01,13,9eafa9c7f9a1,1
...,...,...,...,...,...,...
20155,1362441301,259.357749,2013-03-04 23:55:01,13,9eafa9c7f9a1,1
20156,1362441361,250.485841,2013-03-04 23:56:01,13,9eafa9c7f9a1,1
20157,1362441421,309.990317,2013-03-04 23:57:01,13,9eafa9c7f9a1,1
20158,1362441481,285.959002,2013-03-04 23:58:01,13,9eafa9c7f9a1,1


Create the environment

In [12]:
import torch

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

In [13]:
from model.H_network.common.factories import create_h_network_module_with_defaults
from model.H_network.h_network_arch import HNetworkType

# The factory requires an optimizer class and its kwargs even though the
# testing loop does not use them, so placeholder values are supplied.
h_network_kwargs = {
    "h_network_type": HNetworkType.H_NETWORK,
    "optimizer_class": torch.optim.Adam,
    "optimizer_kwargs": {"lr": 0.001},
}

h_network_rl_module = create_h_network_module_with_defaults(
    device=DEVICE, action_type=ACTION_TYPE, **h_network_kwargs
)

In [14]:
from rl_env.base.env_module import SmartMeterEnvFactory
from rl_env.training_mode import TrainingMode
from decimal import Decimal

# REWARD_LAMBDA is defined as a string, so turn it into a number for the environment.
reward_lambda_value = float(REWARD_LAMBDA)

# Test-mode environment over the test data loader, with rendering switched off.
env_test = SmartMeterEnvFactory.create(
    mode=TrainingMode.TEST,
    action_type=ACTION_TYPE,
    reward_lambda=reward_lambda_value,
    smart_meter_data_loader=sm_dl_test,
    h_network_rl_module=h_network_rl_module,
    render_mode=None,
)

[2025-08-12 12:10:08:398] [SmartMeterContinuousEnv Test] Using data loader: SimpleSmartMeterDataLoader
[2025-08-12 12:10:08:398] [SmartMeterContinuousEnv Test] Curriculum enabled: False
[2025-08-12 12:10:08:403] [SmartMeterContinuousEnv Test]] Render mode set to 'None'. Render server at 127.0.0.1:50007. render_connected: False. render_client_socket: None


In [None]:
# Create a fresh H-network, attach it to the module, then initialise the module's training state.
# NOTE(review): presumably needed so the checkpoint weights have a network to load into - confirm.
fresh_h_network = h_network_rl_module.initialize_h_network()
h_network_rl_module.set_h_network(fresh_h_network)
h_network_rl_module.initialize_h_network_training()

In [16]:
# Restore the H-network stored with the selected checkpoint. The file name
# carries the zero-padded episode number (four digits) and a `.pth` suffix.
h_network_ckpt_name = "h_network" + f"_{selected_ckpt_episode_number:04d}" + ".pth"
h_network_rl_module.load_h_network(selected_ckpt_folder / h_network_ckpt_name)

In [17]:
# Display the architecture of the H-network now held by the module.
h_network_rl_module.h_network

HNetwork(
  (LSTM_1): LSTM(2, 44, batch_first=True, bidirectional=True)
  (ac1): Tanh()
  (LSTM_2): LSTM(88, 44, batch_first=True, bidirectional=True)
  (ac2): Tanh()
  (fc): Linear(in_features=88, out_features=1, bias=True)
)

In [18]:
import torchinfo

# Layer-by-layer summary for a dummy input of shape (1, 1, 2); the last dimension
# matches the first LSTM's input size of 2 shown in the architecture printout.
torchinfo.summary(h_network_rl_module.h_network, input_size=(1, 1, 2))

Layer (type:depth-idx)                   Output Shape              Param #
HNetwork                                 [1, 1]                    --
├─LSTM: 1-1                              [1, 1, 88]                16,896
├─Tanh: 1-2                              [1, 1, 88]                --
├─LSTM: 1-3                              [1, 1, 88]                47,168
├─Tanh: 1-4                              [1, 1, 88]                --
├─Linear: 1-5                            [1, 1, 1]                 89
Total params: 64,153
Trainable params: 64,153
Non-trainable params: 0
Total mult-adds (M): 0.06
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.26
Estimated Total Size (MB): 0.26

In [19]:
from gymnasium.utils.env_checker import check_env

# Run Gymnasium's environment checker, which catches many common issues.
# Problems are reported rather than raised so the notebook keeps running.
try:
    check_env(env_test)
except Exception as exc:
    print(f"Environment has issues: {exc}")
else:
    print("Environment passes all checks!")

[2025-08-12 12:10:09:405] [SmartMeterContinuousEnv Test] Resetting environment with episode 21. Episode info: {'length': 1440, 'datetime_range': (Timestamp('2013-04-23 00:00:05'), Timestamp('2013-04-23 23:59:05'))}
[2025-08-12 12:10:09:409] [SmartMeterContinuousEnv Test] Resetting environment with episode 1. Episode info: {'length': 1440, 'datetime_range': (Timestamp('2013-01-02 00:00:26'), Timestamp('2013-01-02 23:59:26'))}
[2025-08-12 12:10:09:411] [SmartMeterContinuousEnv Test] Resetting environment with episode 54. Episode info: {'length': 1440, 'datetime_range': (Timestamp('2013-09-17 00:00:02'), Timestamp('2013-09-17 23:59:02'))}
[2025-08-12 12:10:09:414] [SmartMeterContinuousEnv Test] Resetting environment with episode 1. Episode info: {'length': 1440, 'datetime_range': (Timestamp('2013-01-02 00:00:26'), Timestamp('2013-01-02 23:59:26'))}
[2025-08-12 12:10:09:416] [SmartMeterContinuousEnv Test] Resetting environment with episode 54. Episode info: {'length': 1440, 'datetime_range

  logger.warn(


In [20]:
# Reset the environment's render window before the evaluation run.
# NOTE(review): render_mode is None in this notebook - confirm this call is still needed.
env_test.reset_render_window()

Also initialize a validation environment for periodic validation

In [21]:
# Initialize a PPO agent (not DQN) on the test environment with library-default
# hyperparameters. It is only a placeholder: a later cell replaces `rl_model`
# with the model loaded from the selected checkpoint.
from stable_baselines3 import PPO

# Hyperparameter setup kept commented out for reference only
# (NOTE(review): apparently the training-time values - confirm before reuse).
# rl_lr = 2.5e-4
# rl_batch_size=64
# n_updates_btw_h_network_training = 2

# rl_gamma = 1
# policy_kwargs = {
#     "net_arch": [64, 64],
#     "activation_fn": torch.nn.ReLU,
# }
# n_epochs = 15
# gae_lambda = 1 - 2.9011e-3


# # n_steps is automatically computed from n_updates_between_h_network_training
# # Target timesteps per environment per PPO update
# target_steps_per_env = 24 * 60 // n_updates_btw_h_network_training
# # Quantize n_steps to be a multiple of batch_size, close to target_steps_per_env
# n_steps = max(1, target_steps_per_env // rl_batch_size) * rl_batch_size
# # If n_steps is 0 (when target_steps_per_env < batch_size), set it to batch_size
# if n_steps == 0:
#     n_steps = rl_batch_size

# agent_params = {
#     "learning_rate": rl_lr,
#     "n_steps": n_steps,
#     "batch_size": rl_batch_size,
#     "gamma": rl_gamma,
#     "gae_lambda": gae_lambda,
#     "n_epochs": n_epochs,
#     "policy_kwargs": policy_kwargs,
# }


rl_model = PPO(
    "MultiInputPolicy",
    env_test,
    verbose=2,
)

2025-08-12 12:10:09.721445: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


2025-08-12 12:10:09.730430: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754997009.740943   75845 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754997009.744091   75845 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1754997009.752396   75845 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1754997009.752424   75845 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1754997009.752425   75845 computation_placer.cc:177] computation placer alr

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [22]:
# Replace the placeholder agent with the one stored in the selected checkpoint.
# `load` returns a model, which is why the result is re-bound to `rl_model`.
rl_model_ckpt_path = selected_ckpt_folder / ("rl_model" + f"_{selected_ckpt_episode_number:04d}")  # path to the saved RL model
rl_model = rl_model.load(rl_model_ckpt_path)

In [23]:
# Display the policy network of the loaded model.
rl_model.policy

MultiInputActorCriticPolicy(
  (features_extractor): CombinedExtractor(
    (extractors): ModuleDict(
      (aggregate_load): Flatten(start_dim=1, end_dim=-1)
      (battery_soc): Flatten(start_dim=1, end_dim=-1)
      (timestamp_features): Flatten(start_dim=1, end_dim=-1)
    )
  )
  (pi_features_extractor): CombinedExtractor(
    (extractors): ModuleDict(
      (aggregate_load): Flatten(start_dim=1, end_dim=-1)
      (battery_soc): Flatten(start_dim=1, end_dim=-1)
      (timestamp_features): Flatten(start_dim=1, end_dim=-1)
    )
  )
  (vf_features_extractor): CombinedExtractor(
    (extractors): ModuleDict(
      (aggregate_load): Flatten(start_dim=1, end_dim=-1)
      (battery_soc): Flatten(start_dim=1, end_dim=-1)
      (timestamp_features): Flatten(start_dim=1, end_dim=-1)
    )
  )
  (mlp_extractor): MlpExtractor(
    (policy_net): Sequential(
      (0): Linear(in_features=5, out_features=64, bias=True)
      (1): ReLU()
      (2): Linear(in_features=64, out_features=64, bias=Tr

In [24]:
# Inspect the rollout length (n_steps) stored in the loaded model.
rl_model.n_steps

704

In [25]:
# Evaluate the loaded RL model together with the H-network on every test
# episode, then persist per-episode results and summary statistics as JSON.

import json
from utils import print_log


def convert_numpy(obj):
    """Recursively convert numpy scalars/arrays to native Python types.

    Lists, tuples and dicts are traversed so nested numpy values are converted
    too. Anything else is returned unchanged (json.dump's `default=str`
    handles what remains).
    """
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, (list, tuple)):
        return [convert_numpy(item) for item in obj]
    if isinstance(obj, dict):
        return {key: convert_numpy(value) for key, value in obj.items()}
    return obj


# Make sure the output folder exists before anything is written into it.
test_log_folder = experiment_folder / "logs_test"
test_log_folder.mkdir(parents=True, exist_ok=True)

total_episodes = env_test.smart_meter_data_loader.get_divided_segments_length()
if total_episodes == 0:
    raise ValueError("The test data loader contains no episodes; nothing to evaluate.")
episode_indices = list(range(total_episodes))

all_episode_rewards = []
all_episode_lengths = []
all_episode_infos = []
per_episode_reward_stats = []

for i, episode_idx in enumerate(episode_indices):
    obs, info = env_test.reset(episode_idx=episode_idx)

    # Initialize episode tracking variables
    episode_reward = 0.0
    episode_length = 0
    done = False
    state = None  # For recurrent policies

    while not done:
        # Get action from the RL model (deterministic for reproducible evaluation)
        action, state = rl_model.predict(obs, state=state, deterministic=True)

        # Step the environment
        obs, reward, terminated, truncated, info = env_test.step(action)

        # An episode is over when it terminates OR is truncated; checking only
        # the first flag would loop forever on a truncated episode.
        done = bool(terminated or truncated)

        # Update episode tracking variables
        episode_reward += reward
        episode_length += 1

    # Store results for this episode
    all_episode_rewards.append(episode_reward)
    all_episode_lengths.append(episode_length)
    all_episode_infos.append({
        "episode_idx": episode_idx,
        "episode_info": env_test.episode.get_episode_info() if hasattr(env_test, 'episode') else {}
    })

    per_episode_reward_stats.append(env_test.prev_episode_reward_stats)

    env_test.save_episode_info(
        log_folder=test_log_folder,
        episode_training_idx=episode_idx
    )

    if (i + 1) % 10 == 0:
        print_log(f"Episode {i + 1}/{total_episodes} completed.")


# Calculate comprehensive statistics over the per-episode total rewards
mean_reward = np.mean(all_episode_rewards)
std_reward = np.std(all_episode_rewards)
min_reward = np.min(all_episode_rewards)
max_reward = np.max(all_episode_rewards)
median_reward = np.median(all_episode_rewards)

# Per-episode sums of the f and g signals as reported by the environment
f_signal_sums = np.array([item['f_signal_sum'] for item in per_episode_reward_stats])
g_signal_sums = np.array([item['g_signal_sum'] for item in per_episode_reward_stats])

# get the length of each episode
_episode_lengths = np.array(all_episode_lengths)
sum_episode_lengths = np.sum(_episode_lengths)

# True per-step means: total signal over all episodes divided by the total
# number of steps. (The per-episode values are already sums, so they must not
# be multiplied by the episode lengths again.)
f_signal_mean = f_signal_sums.sum() / sum_episode_lengths if sum_episode_lengths > 0 else 0.0
g_signal_mean = g_signal_sums.sum() / sum_episode_lengths if sum_episode_lengths > 0 else 0.0

mean_length = np.mean(all_episode_lengths)
std_length = np.std(all_episode_lengths)

# Prepare test data
test_data = {
    'evaluated_episodes': len(episode_indices),
    'total_available_episodes': total_episodes,
    'episode_indices': episode_indices,
    'episode_rewards': all_episode_rewards,
    'episode_lengths': all_episode_lengths,
    'episode_infos': all_episode_infos,
    'summary_stats': {
        'mean_reward': mean_reward,
        'std_reward': std_reward,
        'min_reward': min_reward,
        'max_reward': max_reward,
        'median_reward': median_reward,
        'mean_f_signal': f_signal_mean,
        'mean_g_signal': g_signal_mean,
        'mean_length': mean_length,
        'std_length': std_length
    }
}

test_stats_file_path = test_log_folder / "test_stats_file.json"
# Convert before opening the file so a conversion failure cannot leave a
# truncated/empty stats file behind.
serializable_test_data = convert_numpy(test_data)
with open(test_stats_file_path, 'w') as f:
    json.dump(serializable_test_data, f, indent=2, default=str)

sm_dl_test._save_registry()
print_log(f"Test results saved to {test_stats_file_path}")

[2025-08-12 12:10:10:835] [SmartMeterContinuousEnv Test] Resetting environment with episode 0. Episode info: {'length': 1440, 'datetime_range': (Timestamp('2013-01-01 00:00:01'), Timestamp('2013-01-01 23:59:01'))}
[2025-08-12 12:10:15:669] [SmartMeterContinuousEnv Test] Episode finished. Sum of rewards: -1.3686312500000033. Mean of rewards: -0.0009510988533703983. Std of rewards: 0.0021914362508851436
[2025-08-12 12:10:15:669] [SmartMeterContinuousEnv Test] Episode f_signal sum: -54.52307310271354. Mean: -0.03788955740285861. Std: 0.07402099787859324
[2025-08-12 12:10:15:669] [SmartMeterContinuousEnv Test] Episode g_signal sum: 1.368631250000003. Mean: 0.0009510988533703983. Std: 0.0021914362508851436
[2025-08-12 12:10:15:684] [SmartMeterContinuousEnv Test] Episode 0000 info saved to experiments/20250811_064239_action_continuous_reward_lambda_1.0/logs_test/episode_info/episode_0000_info.json
[2025-08-12 12:10:15:688] [SmartMeterContinuousEnv Test] Resetting environment with episode 1. 