**FULLY REFACTORED VERSION** - A template of the RL training, with H-network trained alongside the DDQN agent

This notebook uses the refactored environment factory pattern for creating discrete environments.
It uses the unified factory pattern for environments, H-network modules, and callbacks.

In [1]:
import numpy as np
import pandas as pd

from pathlib import Path
from datetime import datetime
import sys

from utils import print_log

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# Path to the folder that contains the split dataset (the data itself is loaded in a later cell)
dataset_folder_path = Path("dataset", "20250707_downsampled_1min", "split")

In [4]:
from utils_data_split import load_split_data_from_folder, convert_to_naive_datetimes, convert_to_naive_datetimes_df

In [5]:
# Load the 'test' split from the dataset folder; the helper returns a pair that is unpacked into the segments object and the aggregate-load table
aggregate_load_segments_test, aggregate_load_df_test = load_split_data_from_folder(dataset_folder_path, 'test') 

In [6]:
# Convert both test objects to naive datetimes, one helper call per object
aggregate_load_segments_test = convert_to_naive_datetimes(aggregate_load_segments_test)
aggregate_load_df_test = convert_to_naive_datetimes_df(aggregate_load_df_test)

---

(DDQN) only

We have to quantize the data into steps of 0.05 kW (= 50 W). This is required for the charging action, among other things.

In [7]:
# https://stackoverflow.com/questions/47949053/round-to-nearest-1000-in-pandas

def round_to_nearest_50(x):
    """Round values to the nearest multiple of 50 (exact ties round up).

    Parameters
    ----------
    x : pandas.Series or numpy.ndarray
        Numeric values to quantize.

    Returns
    -------
    Object of the same container type as ``x`` with an integer dtype, where
    every value is a multiple of 50.

    Notes
    -----
    ``np.floor`` is applied before the integer cast because a bare
    ``astype(int)`` truncates toward zero, which sends negative inputs to the
    wrong multiple (e.g. -30 would become 0 instead of -50). Results for
    non-negative inputs are unchanged. NaN values are not handled here.
    """
    return np.floor((x + 25) / 50).astype(int) * 50


# Quantize the aggregate column to multiples of 50 first, then cap it at 5000 (5 kW)
quantized_aggregate = round_to_nearest_50(aggregate_load_df_test["aggregate"])
aggregate_load_df_test["aggregate"] = quantized_aggregate.clip(upper=5000)

---

Experiment start

In [8]:
# Locate the experiment folder used to store related information/data for future analysis.
# Its name is built from the run timestamp, the action type and the reward lambda.

# Alternative run timestamps (presumably other experiments), kept for quick switching:
# rl_datetime = datetime(2025, 8, 7, 5, 26, 15)
# rl_datetime = datetime(2025, 8, 7, 8, 13, 47)
rl_datetime = datetime(2025, 8, 7, 11, 5, 34)
ACTION_TYPE = "discrete"
REWARD_LAMBDA = 1.0  # between 0 and 1

run_timestamp = rl_datetime.strftime('%Y%m%d_%H%M%S')
run_suffix = f"_action_{ACTION_TYPE}_reward_lambda_{REWARD_LAMBDA:.1f}"
experiment_folder = Path("experiments") / (run_timestamp + run_suffix)

# This notebook only reads from an existing run, so stop early when the folder is missing
if not experiment_folder.exists():
    raise FileNotFoundError(f"Experiment folder {experiment_folder} does not exist. Please check the path.")

In [9]:
# Find the selected checkpoint: a sub-folder of <experiment_folder>/selected_ckpt
# whose name ends in "_<episode number>" (e.g. "episode_0700").

selected_ckpt_root = experiment_folder / "selected_ckpt"

# Sort the sub-folders: Path.iterdir() yields entries in arbitrary order, so taking
# element 0 of an unsorted list could pick a different checkpoint from run to run
# whenever more than one sub-folder is present.
sub_folders = sorted(f for f in selected_ckpt_root.iterdir() if f.is_dir())
# assert len(sub_folders) == 1, "There should be exactly one sub-folder in selected_ckpt_folder."

# Fail with a readable message instead of an IndexError when nothing was selected
if not sub_folders:
    raise FileNotFoundError(f"No checkpoint sub-folder found in {selected_ckpt_root}. Please check the path.")
if len(sub_folders) > 1:
    print_log(f"Found {len(sub_folders)} sub-folders in {selected_ckpt_root}; using the first one in sorted order.")

selected_ckpt_folder = sub_folders[0]
# The episode number is the last "_"-separated token of the folder name
selected_ckpt_episode_number = int(selected_ckpt_folder.name.split('_')[-1])

print_log(f"Selected checkpoint folder: {selected_ckpt_folder}")

[2025-08-12 12:27:01:464] Selected checkpoint folder: experiments/20250807_110534_action_discrete_reward_lambda_1.0/selected_ckpt/episode_0700


TODO: read the experiment_details under the experiment_folder, find the seed element, and use it to set the action_space and RL model for reproducibility

Create data loader

In [10]:
# Extend the import search path with the project folders before importing from them
for extra_import_dir in (Path('rl_env'), Path('model', 'H_network')):
    sys.path.append(str(extra_import_dir))

from rl_env.data_loader import SimpleSmartMeterDataLoader

# Data loader over the test split; its episode registry file lives inside the experiment folder
test_registry_path = experiment_folder / "simple_episode_registry_test.json"
sm_dl_test = SimpleSmartMeterDataLoader(
    aggregate_load_segments=aggregate_load_segments_test,
    aggregate_load_df=aggregate_load_df_test,
    registry_path=test_registry_path,
)

# Displayed as the cell output
sm_dl_test.get_divided_segments_length()

[SimpleDataLoader] Generated 80 episodes (1 day each)


80

In [11]:
# Peek at one entry of the divided segments; per the output below it is a pair of datetimes (start and end of a single day)
sm_dl_test.divided_segments[7]

array([datetime.datetime(2013, 2, 15, 0, 0),
       datetime.datetime(2013, 2, 15, 23, 59, 59, 999000)], dtype=object)

In [12]:
# Sample segment: display the aggregate-load rows returned for index 13
# (every row in the output table below carries segment_index 13)

sm_dl_test.get_aggregate_load_segment(13)

Unnamed: 0,timestamp,aggregate,datetime,segment_index,episode_content_id,episode_length_days
18720,1362355201,150,2013-03-04 00:00:01,13,9eafa9c7f9a1,1
18721,1362355261,150,2013-03-04 00:01:01,13,9eafa9c7f9a1,1
18722,1362355321,150,2013-03-04 00:02:01,13,9eafa9c7f9a1,1
18723,1362355381,150,2013-03-04 00:03:01,13,9eafa9c7f9a1,1
18724,1362355441,150,2013-03-04 00:04:01,13,9eafa9c7f9a1,1
...,...,...,...,...,...,...
20155,1362441301,250,2013-03-04 23:55:01,13,9eafa9c7f9a1,1
20156,1362441361,250,2013-03-04 23:56:01,13,9eafa9c7f9a1,1
20157,1362441421,300,2013-03-04 23:57:01,13,9eafa9c7f9a1,1
20158,1362441481,300,2013-03-04 23:58:01,13,9eafa9c7f9a1,1


Create the environment

In [13]:
import torch

# Use the GPU when PyTorch reports CUDA as available, otherwise fall back to the CPU
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

In [14]:
from model.H_network.common.factories import create_h_network_module_with_defaults
from model.H_network.h_network_arch import HNetworkType

# The optimizer kwargs do not matter for the testing loop, but they are still
# required to create the module.
h_network_kwargs = dict(
    h_network_type=HNetworkType.H_NETWORK,
    optimizer_class=torch.optim.RMSprop,
    optimizer_kwargs=dict(lr=0.001),
)

h_network_rl_module = create_h_network_module_with_defaults(
    action_type=ACTION_TYPE,
    **h_network_kwargs,
    device=DEVICE,
)

In [15]:
from rl_env.base.env_module import SmartMeterEnvFactory
from rl_env.training_mode import TrainingMode
from decimal import Decimal

# Gather the settings of the test environment first, then pass them to the factory in one call
env_test_kwargs = dict(
    action_type=ACTION_TYPE,
    smart_meter_data_loader=sm_dl_test,
    h_network_rl_module=h_network_rl_module,
    mode=TrainingMode.TEST,
    reward_lambda=REWARD_LAMBDA,
    render_mode=None,
    aggregate_step_size=50,  # step size for the aggregate load
    battery_step_size=Decimal("0.05"),
)
env_test = SmartMeterEnvFactory.create(**env_test_kwargs)

[2025-08-12 12:27:02:626] [SmartMeterDiscreteEnv Test] Using data loader: SimpleSmartMeterDataLoader
[2025-08-12 12:27:02:626] [SmartMeterDiscreteEnv Test] Curriculum enabled: False
[2025-08-12 12:27:02:631] [SmartMeterDiscreteEnv Test]] Render mode set to 'None'. Render server at 127.0.0.1:50007. render_connected: False. render_client_socket: None


In [16]:
# output_dim is the number of states in the aggregate load space
aggregate_state_count = env_test.observation_space["aggregate_logit"].n

# Build the H-network with that output size, register it on the module, then set up its training state
initial_h_network = h_network_rl_module.initialize_h_network(output_dim=aggregate_state_count)
h_network_rl_module.set_h_network(initial_h_network)
h_network_rl_module.initialize_h_network_training()

In [17]:
# Load the h_network stored in the selected checkpoint folder;
# the file name carries the zero-padded episode number of that checkpoint.

h_network_ckpt_name = "h_network" + f"_{selected_ckpt_episode_number:04d}" + ".pth"
h_network_rl_module.load_h_network(selected_ckpt_folder / h_network_ckpt_name)

In [18]:
# Display the module's current H-network (its layer structure is printed as the cell output)
h_network_rl_module.h_network

HNetwork(
  (LSTM_1): LSTM(2, 44, batch_first=True, bidirectional=True)
  (ac1): Tanh()
  (LSTM_2): LSTM(88, 44, batch_first=True, bidirectional=True)
  (ac2): Tanh()
  (fc): Linear(in_features=88, out_features=101, bias=True)
)

In [19]:
import torchinfo


# Layer-by-layer summary for a dummy input of shape (1, 1, 2); the printed architecture shows a batch_first LSTM with 2 input features
torchinfo.summary(h_network_rl_module.h_network, input_size=(1, 1, 2))

Layer (type:depth-idx)                   Output Shape              Param #
HNetwork                                 [1, 1, 101]               --
├─LSTM: 1-1                              [1, 1, 88]                16,896
├─Tanh: 1-2                              [1, 1, 88]                --
├─LSTM: 1-3                              [1, 1, 88]                47,168
├─Tanh: 1-4                              [1, 1, 88]                --
├─Linear: 1-5                            [1, 1, 101]               8,989
Total params: 73,053
Trainable params: 73,053
Non-trainable params: 0
Total mult-adds (M): 0.07
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.29
Estimated Total Size (MB): 0.29

In [20]:
from gymnasium.utils.env_checker import check_env

# Run Gymnasium's environment checker (it catches many common issues) and
# report the outcome as text instead of aborting the notebook.
try:
    check_env(env_test)
except Exception as check_error:
    print(f"Environment has issues: {check_error}")
else:
    print("Environment passes all checks!")

[2025-08-12 12:27:03:704] [SmartMeterWorldDiscrete Test] Resetting environment with episode 74. Episode info: {'length': 1440, 'datetime_range': (Timestamp('2013-12-16 00:00:06'), Timestamp('2013-12-16 23:59:06'))}
[2025-08-12 12:27:03:707] [SmartMeterWorldDiscrete Test] Resetting environment with episode 1. Episode info: {'length': 1440, 'datetime_range': (Timestamp('2013-01-02 00:00:26'), Timestamp('2013-01-02 23:59:26'))}
[2025-08-12 12:27:03:709] [SmartMeterWorldDiscrete Test] Resetting environment with episode 54. Episode info: {'length': 1440, 'datetime_range': (Timestamp('2013-09-17 00:00:02'), Timestamp('2013-09-17 23:59:02'))}
[2025-08-12 12:27:03:712] [SmartMeterWorldDiscrete Test] Resetting environment with episode 1. Episode info: {'length': 1440, 'datetime_range': (Timestamp('2013-01-02 00:00:26'), Timestamp('2013-01-02 23:59:26'))}
[2025-08-12 12:27:03:714] [SmartMeterWorldDiscrete Test] Resetting environment with episode 54. Episode info: {'length': 1440, 'datetime_range

  logger.warn(


In [21]:
# Reset the environment's render window before the test loop (runs after the environment check above)
env_test.reset_render_window()

Also initiate a validation environment for time-to-time validation

In [22]:
# callbacks for training - using refactored callback system
from rl_env.env_callbacks import TrainHNetworkEveryNEpisodes, SaveCheckpointEveryNEpisodes, ValidateEveryNEpisodes, EnvLoggingCallback, UpdateGlobalTimestepCallback

2025-08-12 12:27:04.028693: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-12 12:27:04.037362: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754998024.046897   78725 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754998024.049870   78725 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1754998024.057699   78725 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [23]:
# Initialize a DoubleDQN agent on the test environment. No hyper-parameters are passed here;
# the following cell reassigns `rl_model` to the result of loading the selected checkpoint.
from model.DDQN.ddqn import DoubleDQN

# NOTE(review): the commented-out values below look like the training-time
# hyper-parameters, kept for reference only — confirm against the training notebook.
# rl_datetime = datetime.now()

# rl_lr = 0.00025
# rl_buffer_size = 150000  # scaled to match with our episode length
# rl_batch_size = 128     # keep the same as the paper
# rl_gamma = 1  # keep the same as the paper
# rl_train_freq = (120, "step")  # scaled to match with our episode length
# rl_target_update_interval = 7500  # scaled to match with our episode length
# policy_kwargs = {
#     "optimizer_class": torch.optim.RMSprop,     # keep the same as the paper
# }

rl_model = DoubleDQN(
    "MultiInputPolicy",
    env_test,
    verbose=2,

)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [24]:
# Path to the saved RL model inside the selected checkpoint folder
rl_model_ckpt_path = selected_ckpt_folder / ("rl_model" + f"_{selected_ckpt_episode_number:04d}")

# `load` returns the loaded model, so the name is rebound to it
rl_model = rl_model.load(rl_model_ckpt_path)

In [25]:
rl_model.policy

MultiInputPolicy(
  (q_net): QNetwork(
    (features_extractor): CombinedExtractor(
      (extractors): ModuleDict(
        (aggregate_logit): Flatten(start_dim=1, end_dim=-1)
        (battery_soc): Flatten(start_dim=1, end_dim=-1)
        (timestamp_features): Flatten(start_dim=1, end_dim=-1)
      )
    )
    (q_net): Sequential(
      (0): Linear(in_features=105, out_features=64, bias=True)
      (1): ReLU()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): ReLU()
      (4): Linear(in_features=64, out_features=161, bias=True)
    )
  )
  (q_net_target): QNetwork(
    (features_extractor): CombinedExtractor(
      (extractors): ModuleDict(
        (aggregate_logit): Flatten(start_dim=1, end_dim=-1)
        (battery_soc): Flatten(start_dim=1, end_dim=-1)
        (timestamp_features): Flatten(start_dim=1, end_dim=-1)
      )
    )
    (q_net): Sequential(
      (0): Linear(in_features=105, out_features=64, bias=True)
      (1): ReLU()
      (2): Linear(in_feature

In [26]:
# Test the RL model and the H-network on every episode of the test data loader,
# then store per-episode results and summary statistics under
# <experiment_folder>/logs_test.

import json
from utils import print_log


def convert_numpy(obj):
    """Recursively convert numpy scalars/arrays into JSON-serializable Python types.

    Lists, tuples and dicts are walked recursively (tuples become lists, which is
    how JSON represents them anyway). Any other object is returned unchanged;
    ``json.dump`` below falls back to ``str`` for objects it cannot serialize.
    """
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, (list, tuple)):
        return [convert_numpy(item) for item in obj]
    if isinstance(obj, dict):
        return {key: convert_numpy(value) for key, value in obj.items()}
    return obj


total_episodes = env_test.smart_meter_data_loader.get_divided_segments_length()
episode_indices = list(range(total_episodes))

# Without episodes the statistics below are undefined (np.min / np.max raise on
# empty input), so stop here with a clear message.
if total_episodes == 0:
    raise ValueError("The test data loader contains no episodes; nothing to evaluate.")

test_log_folder = experiment_folder / "logs_test"

all_episode_rewards = []
all_episode_lengths = []
all_episode_infos = []
per_episode_reward_stats = []

for i, episode_idx in enumerate(episode_indices):
    obs, info = env_test.reset(episode_idx=episode_idx)

    # Initialize episode tracking variables
    episode_reward = 0.0
    episode_length = 0
    done = False
    state = None  # For recurrent policies

    while not done:
        # Get action from the RL model
        action, state = rl_model.predict(obs, state=state, deterministic=True)

        # Step the environment
        obs, reward, terminated, truncated, info = env_test.step(action)

        # In the Gymnasium step API an episode ends when it is either terminated
        # or truncated; looking at the first flag alone would keep stepping an
        # environment that has already been truncated.
        done = bool(terminated or truncated)

        # Update episode tracking variables
        episode_reward += reward
        episode_length += 1

    # Store results for this episode
    all_episode_rewards.append(episode_reward)
    all_episode_lengths.append(episode_length)
    all_episode_infos.append({
        "episode_idx": episode_idx,
        "episode_info": env_test.episode.get_episode_info() if hasattr(env_test, 'episode') else {}
    })

    per_episode_reward_stats.append(env_test.prev_episode_reward_stats)

    env_test.save_episode_info(
        log_folder=test_log_folder,
        episode_training_idx=episode_idx
    )

    if (i + 1) % 10 == 0:
        print_log(f"Episode {i + 1}/{total_episodes} completed.")


# Calculate comprehensive statistics over the per-episode reward sums
mean_reward = np.mean(all_episode_rewards)
std_reward = np.std(all_episode_rewards)
min_reward = np.min(all_episode_rewards)
max_reward = np.max(all_episode_rewards)
median_reward = np.median(all_episode_rewards)

# Per-episode f_signal / g_signal sums as reported by the environment
f_signal_sums = np.array([item['f_signal_sum'] for item in per_episode_reward_stats])
g_signal_sums = np.array([item['g_signal_sum'] for item in per_episode_reward_stats])

# get the length of each episode
_episode_lengths = np.array(all_episode_lengths)
sum_episode_lengths = np.sum(_episode_lengths)

# Length-weighted average of the per-episode values.
# NOTE(review): the values above are read from the '*_sum' keys, so this is a
# length-weighted average of episode sums, not a per-step mean. If a per-step
# mean is intended, `f_signal_sums.sum() / sum_episode_lengths` would be the
# formula — confirm before changing, since the stored metric would change.
f_signal_mean = f_signal_sums @ _episode_lengths / sum_episode_lengths if sum_episode_lengths > 0 else 0.0
g_signal_mean = g_signal_sums @ _episode_lengths / sum_episode_lengths if sum_episode_lengths > 0 else 0.0

mean_length = np.mean(all_episode_lengths)
std_length = np.std(all_episode_lengths)

# Prepare test data
test_data = {
    'evaluated_episodes': len(episode_indices),
    'total_available_episodes': total_episodes,
    'episode_indices': episode_indices,
    'episode_rewards': all_episode_rewards,
    'episode_lengths': all_episode_lengths,
    'episode_infos': all_episode_infos,
    'summary_stats': {
        'mean_reward': mean_reward,
        'std_reward': std_reward,
        'min_reward': min_reward,
        'max_reward': max_reward,
        'median_reward': median_reward,
        'mean_f_signal': f_signal_mean,
        'mean_g_signal': g_signal_mean,
        'mean_length': mean_length,
        'std_length': std_length
    }
}

# Make sure the output folder exists, and convert the data before opening the
# file so a conversion error cannot leave a truncated JSON file behind.
test_log_folder.mkdir(parents=True, exist_ok=True)
test_stats_file_path = test_log_folder / "test_stats_file.json"
serializable_test_data = convert_numpy(test_data)

with open(test_stats_file_path, 'w') as f:
    json.dump(serializable_test_data, f, indent=2, default=str)

sm_dl_test._save_registry()
print_log(f"Validation results saved to {test_stats_file_path}")

[2025-08-12 12:27:05:044] [SmartMeterWorldDiscrete Test] Resetting environment with episode 0. Episode info: {'length': 1440, 'datetime_range': (Timestamp('2013-01-01 00:00:01'), Timestamp('2013-01-01 23:59:01'))}
[2025-08-12 12:27:09:448] [SmartMeterDiscreteEnv Test] Episode finished. Sum of rewards: -0.7876317500000001. Mean of rewards: -0.00054734659485754. Std of rewards: 0.0004644640847213585
[2025-08-12 12:27:09:448] [SmartMeterDiscreteEnv Test] Episode f_signal sum: -315.90230377204716. Mean: -0.2195290505712628. Std: 0.09901721263733372
[2025-08-12 12:27:09:448] [SmartMeterDiscreteEnv Test] Episode g_signal sum: 0.78763175. Mean: 0.00054734659485754. Std: 0.0004644640847213585
[2025-08-12 12:27:09:461] [SmartMeterDiscreteEnv Test] Episode 0000 info saved to experiments/20250807_110534_action_discrete_reward_lambda_1.0/logs_test/episode_info/episode_0000_info.json
[2025-08-12 12:27:09:464] [SmartMeterWorldDiscrete Test] Resetting environment with episode 1. Episode info: {'lengt