A template for RL training, with the H-network trained alongside the PPO agent

In [1]:
import numpy as np
import pandas as pd

from pathlib import Path
from datetime import datetime

from utils import print_log

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# Location of the pre-split dataset (relative to the notebook's working directory).
# Only the path is defined here; the files are read in a later cell.
dataset_folder_path = Path("dataset", "20250707_downsampled_1min", "split")

In [4]:
# copied from 03_data_split.ipynb

# Helper functions for the new split folder structure
def load_split_data_from_folder(split_folder, split_type='train'):
    """Load the time segments and the aggregate dataframe of one dataset split.

    Args:
        split_folder: Folder (``str`` or ``pathlib.Path``) containing
            ``{split_type}_segments.txt`` and ``{split_type}_aggregate_df.pkl``.
        split_type: File-name prefix of the split to read. Defaults to 'train'.

    Returns:
        A ``(segments, df)`` tuple. ``segments`` is a list of ``(start, end)``
        ``datetime`` pairs, one per non-blank line of the segments file;
        ``df`` is the unpickled aggregate dataframe.

    Raises:
        FileNotFoundError: If either file is missing.
        ValueError: If a non-blank line is not of the form
            ``<ISO datetime> - <ISO datetime>``.
    """
    # Accept plain strings as well as Path objects.
    split_folder = Path(split_folder)

    segments = []
    with open(split_folder / f'{split_type}_segments.txt', 'r') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # Tolerate blank lines (e.g. an extra newline at the end of the file)
                # instead of failing on the tuple unpack below.
                continue
            start_str, end_str = line.split(' - ')
            segments.append(
                (datetime.fromisoformat(start_str), datetime.fromisoformat(end_str))
            )

    # NOTE: unpickling can execute arbitrary code; only load pickles produced by this project.
    df = pd.read_pickle(split_folder / f'{split_type}_aggregate_df.pkl')
    return segments, df

def load_signatures_from_split_folder(split_folder, split_type, appliance):
    """Load one appliance's load signatures and selected ranges from a split folder.

    Files are looked up under
    ``<split_folder>/load_signature_library/<split_type>/<appliance>/``.

    Args:
        split_folder: Root folder of the split (``str`` or ``pathlib.Path``).
        split_type: Sub-folder name of the split.
        appliance: Sub-folder name of the appliance.

    Returns:
        A ``(signatures_df, ranges)`` tuple. When ``load_signatures.pkl`` does not
        exist, an empty ``DataFrame`` and an empty list are returned. ``ranges`` is
        a list of ``(start, end)`` integer pairs read from ``selected_ranges.txt``
        (empty when that file is absent).

    Raises:
        ValueError: If a non-blank line of ``selected_ranges.txt`` is not two
            comma-separated integers.
    """
    # Build the shared prefix once; Path() also lets callers pass a plain string.
    appliance_dir = Path(split_folder) / 'load_signature_library' / split_type / appliance
    sig_path = appliance_dir / 'load_signatures.pkl'
    ranges_path = appliance_dir / 'selected_ranges.txt'

    if not sig_path.exists():
        return pd.DataFrame(), []

    # NOTE: unpickling can execute arbitrary code; only load pickles produced by this project.
    signatures_df = pd.read_pickle(sig_path)

    ranges = []
    if ranges_path.exists():
        with open(ranges_path, 'r') as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line:
                    # Skip blank lines rather than failing on int('').
                    continue
                start, end = map(int, line.split(','))
                ranges.append((start, end))

    return signatures_df, ranges

In [5]:
# convert datetime objects to timezone-naive datetime objects
def convert_to_naive_datetimes_df(df):
    """Convert datetime objects in DataFrame to timezone-naive datetime objects"""
    df['datetime'] = df['datetime'].apply(lambda x: x.replace(tzinfo=None) if isinstance(x, datetime) else x)

    return df

def convert_to_naive_datetimes(segments):
    """Return a new list of (start, end) pairs with timezone info removed from both ends."""
    naive_segments = []
    for start, end in segments:
        naive_pair = (start.replace(tzinfo=None), end.replace(tzinfo=None))
        naive_segments.append(naive_pair)
    return naive_segments

In [6]:
# Read every split from disk; each call returns (segments, aggregate dataframe).
# The validation split is stored under the 'val' file prefix.
(
    aggregate_load_segments_train,
    aggregate_load_df_train,
) = load_split_data_from_folder(dataset_folder_path, split_type='train')
(
    aggregate_load_segments_test,
    aggregate_load_df_test,
) = load_split_data_from_folder(dataset_folder_path, split_type='test')
(
    aggregate_load_segments_validation,
    aggregate_load_df_validation,
) = load_split_data_from_folder(dataset_folder_path, split_type='val')

In [7]:
# Drop timezone info from the segments and from the dataframe of every split.
aggregate_load_segments_train = convert_to_naive_datetimes(aggregate_load_segments_train)
aggregate_load_df_train = convert_to_naive_datetimes_df(aggregate_load_df_train)

aggregate_load_segments_test = convert_to_naive_datetimes(aggregate_load_segments_test)
aggregate_load_df_test = convert_to_naive_datetimes_df(aggregate_load_df_test)

aggregate_load_segments_validation = convert_to_naive_datetimes(aggregate_load_segments_validation)
aggregate_load_df_validation = convert_to_naive_datetimes_df(aggregate_load_df_validation)

In [8]:
# Inspect the training dataframe (columns in the output: datetime, aggregate, timestamp).
aggregate_load_df_train

Unnamed: 0,datetime,aggregate,timestamp
0,2013-01-08 00:00:05,234.000000,1357603205
1,2013-01-08 00:01:05,230.407069,1357603265
2,2013-01-08 00:02:05,230.680121,1357603325
3,2013-01-08 00:03:05,231.607379,1357603385
4,2013-01-08 00:04:05,231.280688,1357603445
...,...,...,...
231546,2013-12-30 23:55:01,176.973052,1388447701
231547,2013-12-30 23:56:01,177.850890,1388447761
231548,2013-12-30 23:57:01,177.333811,1388447821
231549,2013-12-30 23:58:01,178.462801,1388447881


In [9]:
# Inspect the (start, end) datetime pairs that delimit the training segments.
aggregate_load_segments_train

[(datetime.datetime(2013, 1, 8, 0, 0),
  datetime.datetime(2013, 1, 8, 23, 59, 59, 999000)),
 (datetime.datetime(2013, 1, 9, 0, 0),
  datetime.datetime(2013, 1, 9, 23, 59, 59, 999000)),
 (datetime.datetime(2013, 1, 10, 0, 0),
  datetime.datetime(2013, 1, 10, 23, 59, 59, 999000)),
 (datetime.datetime(2013, 2, 27, 0, 0),
  datetime.datetime(2013, 2, 27, 23, 59, 59, 999000)),
 (datetime.datetime(2013, 2, 28, 0, 0),
  datetime.datetime(2013, 2, 28, 23, 59, 59, 999000)),
 (datetime.datetime(2013, 3, 8, 0, 0),
  datetime.datetime(2013, 3, 8, 23, 59, 59, 999000)),
 (datetime.datetime(2013, 3, 9, 0, 0),
  datetime.datetime(2013, 3, 9, 23, 59, 59, 999000)),
 (datetime.datetime(2013, 3, 10, 0, 0),
  datetime.datetime(2013, 3, 10, 23, 59, 59, 999000)),
 (datetime.datetime(2013, 3, 28, 0, 0),
  datetime.datetime(2013, 3, 28, 23, 59, 59, 999000)),
 (datetime.datetime(2013, 3, 29, 0, 0),
  datetime.datetime(2013, 3, 29, 23, 59, 59, 999000)),
 (datetime.datetime(2013, 3, 30, 0, 0),
  datetime.datetim

In [10]:
from rl_env.env_data_loader import SmartMeterDataLoader

# Wrap the training segments and the training dataframe in the project's data loader.
sm_dl_train = SmartMeterDataLoader(
    aggregate_load_segments=aggregate_load_segments_train,
    aggregate_load_df=aggregate_load_df_train
)

# Displayed as the cell output; presumably the number of segments the loader
# derived from the input segments (TODO confirm against SmartMeterDataLoader).
sm_dl_train.get_divided_segments_length()

162

In [11]:
# Peek at one divided segment (index 7); the output is a [start, end] pair of datetimes.
sm_dl_train.divided_segments[7]

array([datetime.datetime(2013, 3, 10, 0, 0),
       datetime.datetime(2013, 3, 10, 23, 59, 59, 999000)], dtype=object)

In [12]:
# Sample segment: fetch the aggregate-load rows of divided segment 13
# (the output has the columns timestamp, aggregate and datetime).

sm_dl_train.get_aggregate_load_segment(13)

Unnamed: 0,timestamp,aggregate,datetime
11231,1363996805,335.000000,2013-03-23 00:00:05
11232,1363996865,317.206591,2013-03-23 00:01:05
11233,1363996925,293.301546,2013-03-23 00:02:05
11234,1363996985,276.229767,2013-03-23 00:03:05
11235,1363997045,276.501419,2013-03-23 00:04:05
...,...,...,...
12666,1364082905,173.758178,2013-03-23 23:55:05
12667,1364082965,172.392595,2013-03-23 23:56:05
12668,1364083025,174.270419,2013-03-23 23:57:05
12669,1364083085,171.201633,2013-03-23 23:58:05


In [13]:
# Create the data loaders for the validation and test sets,
# built the same way as the training loader (segments + aggregate dataframe).
sm_dl_validation = SmartMeterDataLoader(
    aggregate_load_segments=aggregate_load_segments_validation,
    aggregate_load_df=aggregate_load_df_validation
)

sm_dl_test = SmartMeterDataLoader(
    aggregate_load_segments=aggregate_load_segments_test,
    aggregate_load_df=aggregate_load_df_test
)

(Optional) Load the pre-trained H-network and related components

In [14]:

# Optional: uncomment this cell to load pre-trained H-network weights from disk.
# NOTE(review): the snippet calls torch.load, but `torch` is only imported in a
# later cell of this notebook; import it here before uncommenting.

# import torch
# from model.H_network.h_network import HNetwork

# h_network_datetime = datetime(2025, 7, 13)

# h_network_path = Path("model_trained", f"h_network_{h_network_datetime.strftime('%Y%m%d')}.pth")

# h_network = HNetwork(2, 44, 1)
# h_network.load_state_dict(torch.load(h_network_path))
# h_network.eval()

Create the environment

In [15]:
import torch
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

In [16]:
import sys

# Make the H-network package directory importable. The membership check keeps the
# cell idempotent: re-running it no longer appends a duplicate entry to sys.path.
# NOTE(review): presumably needed so modules inside model/H_network can import
# their siblings by bare name — confirm.
h_network_dir = str(Path('model', 'H_network'))
if h_network_dir not in sys.path:
    sys.path.append(h_network_dir)

from model.H_network.h_network_rl_module import HNetworkRLModule
from model.H_network.h_network_arch import HNetworkType

# RL-side wrapper around the H-network, configured for the H_NETWORK2 variant
# and placed on the device selected above.
h_network_rl_module = HNetworkRLModule(
    h_network_type=HNetworkType.H_NETWORK2,
    device=DEVICE
)

In [17]:
# Make the RL environment directory importable. The membership check keeps the
# cell idempotent: re-running it no longer appends a duplicate entry to sys.path.
rl_env_dir = str(Path('rl_env'))
if rl_env_dir not in sys.path:
    sys.path.append(rl_env_dir)

from rl_env.hrl_env_hnetwork_loop import SmartMeterWorld


# Training environment: draws its episodes from the training data loader and is
# given the H-network RL module. Rendering is disabled for training.
env_train = SmartMeterWorld(
    smart_meter_data_loader=sm_dl_train,
    h_network_rl_module=h_network_rl_module,
    # render_mode="human",
    render_mode=None,
)

In [18]:
# Sanity check: the module reports the H-network type it was constructed with (expected: True).
HNetworkType.H_NETWORK2 is h_network_rl_module.h_network_type

True

In [19]:
# Build a new H-network, register it on the RL module, then run the module's
# training initialisation (NOTE(review): presumably optimizer/buffer setup — confirm).
initial_h_network = h_network_rl_module.initialize_h_network()
h_network_rl_module.set_h_network(initial_h_network)
h_network_rl_module.initialize_h_network_training()

In [20]:
from gymnasium.utils.env_checker import check_env

# Run Gymnasium's environment checker to surface common API issues.
# Failures are reported as text instead of aborting the notebook.
try:
    check_env(env_train)
except Exception as e:
    print(f"Environment has issues: {e}")
else:
    print("Environment passes all checks!")

[2025-07-16 04:19:03:549] [SmartMeterWorld] Resetting environment with a new episode. Episode info: {'length': 1440, 'datetime_range': (Timestamp('2013-10-29 00:00:01'), Timestamp('2013-10-29 23:59:01'))}
[2025-07-16 04:19:03:552] [SmartMeterWorld] Resetting environment with a new episode. Episode info: {'length': 693, 'datetime_range': (Timestamp('2013-01-10 00:00:02'), Timestamp('2013-01-10 11:32:02'))}
[2025-07-16 04:19:03:554] [SmartMeterWorld] Resetting environment with a new episode. Episode info: {'length': 1440, 'datetime_range': (Timestamp('2013-09-26 00:00:00'), Timestamp('2013-09-26 23:59:00'))}
[2025-07-16 04:19:03:556] [SmartMeterWorld] Resetting environment with a new episode. Episode info: {'length': 693, 'datetime_range': (Timestamp('2013-01-10 00:00:02'), Timestamp('2013-01-10 11:32:02'))}
[2025-07-16 04:19:03:559] [SmartMeterWorld] Resetting environment with a new episode. Episode info: {'length': 1440, 'datetime_range': (Timestamp('2013-09-26 00:00:00'), Timestamp('2

  logger.warn(


In [21]:
# Start a new episode and display the initial observation
# (a dict with aggregate_load, battery_soc and timestamp_features, per the output).
obs, info = env_train.reset()
obs

[2025-07-16 04:19:03:727] [SmartMeterWorld] Resetting environment with a new episode. Episode info: {'length': 1440, 'datetime_range': (Timestamp('2013-09-26 00:00:00'), Timestamp('2013-09-26 23:59:00'))}


{'aggregate_load': array([-0.4250123], dtype=float32),
 'battery_soc': array([0.], dtype=float32),
 'timestamp_features': array([-0.5 ,  0.  ,  0.25], dtype=float32)}

In [22]:
# NOTE(review): presumably clears/re-creates the render window; env_train was built
# with render_mode=None, so confirm this call is still needed.
env_train.reset_render_window()

In [23]:
from stable_baselines3.common.callbacks import EveryNTimesteps, ConvertCallback
from typing import Any

class TrainHNetworkEveryNTimesteps(EveryNTimesteps):
    """Callback that runs one H-network training pass every ``n_steps`` timesteps.

    The training routine is wrapped in a ``ConvertCallback`` and handed to
    ``EveryNTimesteps``, which decides when to invoke it.
    """

    def __init__(self, n_steps: int, h_network_rl_module: HNetworkRLModule):
        """Store the H-network module and register ``_train`` as the periodic callback.

        Args:
            n_steps: Number of timesteps between two H-network training passes.
            h_network_rl_module: Module whose ``train()`` method is called.
        """
        periodic_train = ConvertCallback(self._train)
        super().__init__(n_steps=n_steps, callback=periodic_train)
        self.h_network_rl_module = h_network_rl_module

    def _train(self, _locals: dict[str, Any], _globals: dict[str, Any]) -> bool:
        """Train the H-network once and log the resulting losses.

        Both arguments are supplied by the callback machinery and are unused.
        Always returns True, so this callback never asks training to stop.
        """
        print_log("Training H-network...")

        # train() yields (average loss, per-step losses); a None average means
        # there was nothing in the replay buffer to train on.
        avg_loss, loss_list = self.h_network_rl_module.train()

        if avg_loss is not None:
            print_log(f"Average loss: {avg_loss:.4f}")
            print_log(f"Loss list: {loss_list}")
        else:
            print_log("No episodes in the replay buffer to train the H-network. Skipping training.")

        return True

2025-07-16 04:19:03.971327: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-16 04:19:03.980655: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752635943.991059 1306938 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752635943.994232 1306938 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1752635944.003389 1306938 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [24]:
# initialize a PPO agent
from stable_baselines3 import PPO

# Fixed seed so that policy initialisation and action sampling are repeatable
# between runs (GPU kernels may still be non-deterministic).
# NOTE(review): whether SmartMeterWorld's episode sampling honours the seed depends
# on its reset() implementation — confirm.
RL_SEED = 42

rl_datetime = datetime.now()
# One TensorBoard directory per run, keyed by the launch timestamp.
tensorboard_log_path = Path("rl_model", "PPO", f"{rl_datetime.strftime('%Y%m%d_%H%M%S')}")

# "MultiInputPolicy" is used because the environment's observation is a dict.
rl_model = PPO(
    "MultiInputPolicy",
    env_train,
    verbose=2,
    seed=RL_SEED,
    # Stable-Baselines3 documents tensorboard_log as a str, so convert the Path explicitly.
    tensorboard_log=str(tensorboard_log_path),
)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [25]:
# A full-day episode is 24 * 60 = 1440 one-minute steps.
steps_per_day = 24 * 60
# The H-network is trained after every 5 days' worth of environment steps ...
h_network_train_interval = steps_per_day * 5
# ... and the whole run spans 10 such intervals.
total_training_timesteps = h_network_train_interval * 10

h_network_training_callback = TrainHNetworkEveryNTimesteps(
    n_steps=h_network_train_interval,
    h_network_rl_module=h_network_rl_module,
)

rl_model.learn(
    total_timesteps=total_training_timesteps,
    progress_bar=True,
    tb_log_name="PPO_SmartMeterWorld",
    callback=[h_network_training_callback],
)

[2025-07-16 04:19:05:056] [SmartMeterWorld] Resetting environment with a new episode. Episode info: {'length': 1440, 'datetime_range': (Timestamp('2013-08-28 00:00:02'), Timestamp('2013-08-28 23:59:02'))}
Logging to rl_model/PPO/20250716_041905/PPO_SmartMeterWorld_1


Output()

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.44e+03 |
|    ep_rew_mean     | 37.6     |
| time/              |          |
|    fps             | 214      |
|    iterations      | 1        |
|    time_elapsed    | 9        |
|    total_timesteps | 2048     |
---------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.44e+03    |
|    ep_rew_mean          | 43.1        |
| time/                   |             |
|    fps                  | 185         |
|    iterations           | 2           |
|    time_elapsed         | 22          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.004127084 |
|    clip_fraction        | 0.0192      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.42       |
|    explained_variance   | -0.0464     |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0281      |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00293    |
|    std                  | 0.998       |
|    value_loss           | 0.114       |
-----------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.44e+03     |
|    ep_rew_mean          | 43.9         |
| time/                   |              |
|    fps                  | 181          |
|    iterations           | 3            |
|    time_elapsed         | 33           |
|    total_timesteps      | 6144         |
| train/                  |              |
|    approx_kl            | 0.0036211456 |
|    clip_fraction        | 0.0164       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.4         |
|    explained_variance   | -0.000503    |
|    learning_rate        | 0.0003       |
|    loss                 | 0.0278       |
|    n_updates            | 20           |
|    policy_gradient_loss | -0.00215     |
|    std                  | 0.964        |
|    value_loss           | 0.12         |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.44e+03     |
|    ep_rew_mean          | 60.7         |
| time/                   |              |
|    fps                  | 178          |
|    iterations           | 4            |
|    time_elapsed         | 46           |
|    total_timesteps      | 8192         |
| train/                  |              |
|    approx_kl            | 0.0052989707 |
|    clip_fraction        | 0.0259       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.38        |
|    explained_variance   | 0.1          |
|    learning_rate        | 0.0003       |
|    loss                 | 0.0994       |
|    n_updates            | 30           |
|    policy_gradient_loss | -0.00246     |
|    std                  | 0.962        |
|    value_loss           | 0.247        |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.44e+03     |
|    ep_rew_mean          | 57.3         |
| time/                   |              |
|    fps                  | 175          |
|    iterations           | 5            |
|    time_elapsed         | 58           |
|    total_timesteps      | 10240        |
| train/                  |              |
|    approx_kl            | 0.0030524258 |
|    clip_fraction        | 0.00898      |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.38        |
|    explained_variance   | 0.502        |
|    learning_rate        | 0.0003       |
|    loss                 | 0.138        |
|    n_updates            | 40           |
|    policy_gradient_loss | -0.00139     |
|    std                  | 0.956        |
|    value_loss           | 0.298        |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.44e+03     |
|    ep_rew_mean          | 58.8         |
| time/                   |              |
|    fps                  | 173          |
|    iterations           | 6            |
|    time_elapsed         | 70           |
|    total_timesteps      | 12288        |
| train/                  |              |
|    approx_kl            | 0.0029736757 |
|    clip_fraction        | 0.0172       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.39        |
|    explained_variance   | 0.682        |
|    learning_rate        | 0.0003       |
|    loss                 | 0.0628       |
|    n_updates            | 50           |
|    policy_gradient_loss | -0.00221     |
|    std                  | 0.973        |
|    value_loss           | 0.171        |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.44e+03     |
|    ep_rew_mean          | 56.8         |
| time/                   |              |
|    fps                  | 170          |
|    iterations           | 7            |
|    time_elapsed         | 83           |
|    total_timesteps      | 14336        |
| train/                  |              |
|    approx_kl            | 0.0035460864 |
|    clip_fraction        | 0.0233       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.39        |
|    explained_variance   | 0.667        |
|    learning_rate        | 0.0003       |
|    loss                 | 0.122        |
|    n_updates            | 60           |
|    policy_gradient_loss | -0.00211     |
|    std                  | 0.975        |
|    value_loss           | 0.203        |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.44e+03     |
|    ep_rew_mean          | 61.8         |
| time/                   |              |
|    fps                  | 171          |
|    iterations           | 8            |
|    time_elapsed         | 95           |
|    total_timesteps      | 16384        |
| train/                  |              |
|    approx_kl            | 0.0029693954 |
|    clip_fraction        | 0.0406       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.4         |
|    explained_variance   | 0.94         |
|    learning_rate        | 0.0003       |
|    loss                 | 0.0325       |
|    n_updates            | 70           |
|    policy_gradient_loss | -0.00228     |
|    std                  | 0.987        |
|    value_loss           | 0.0842       |
------------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.44e+03    |
|    ep_rew_mean          | 61.2        |
| time/                   |             |
|    fps                  | 170         |
|    iterations           | 9           |
|    time_elapsed         | 107         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.002512382 |
|    clip_fraction        | 0.00957     |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.4        |
|    explained_variance   | 0.0471      |
|    learning_rate        | 0.0003      |
|    loss                 | 0.204       |
|    n_updates            | 80          |
|    policy_gradient_loss | -0.000852   |
|    std                  | 0.978       |
|    value_loss           | 0.465       |
-----------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.44e+03     |
|    ep_rew_mean          | 66.6         |
| time/                   |              |
|    fps                  | 170          |
|    iterations           | 10           |
|    time_elapsed         | 119          |
|    total_timesteps      | 20480        |
| train/                  |              |
|    approx_kl            | 0.0019012955 |
|    clip_fraction        | 0.0169       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.41        |
|    explained_variance   | 0.875        |
|    learning_rate        | 0.0003       |
|    loss                 | 0.0605       |
|    n_updates            | 90           |
|    policy_gradient_loss | -0.00159     |
|    std                  | 0.999        |
|    value_loss           | 0.151        |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.44e+03     |
|    ep_rew_mean          | 65.2         |
| time/                   |              |
|    fps                  | 171          |
|    iterations           | 11           |
|    time_elapsed         | 131          |
|    total_timesteps      | 22528        |
| train/                  |              |
|    approx_kl            | 0.0023807003 |
|    clip_fraction        | 0.0184       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.43        |
|    explained_variance   | 0.38         |
|    learning_rate        | 0.0003       |
|    loss                 | 0.0861       |
|    n_updates            | 100          |
|    policy_gradient_loss | -0.00272     |
|    std                  | 1.02         |
|    value_loss           | 0.282        |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.44e+03     |
|    ep_rew_mean          | 64.8         |
| time/                   |              |
|    fps                  | 170          |
|    iterations           | 12           |
|    time_elapsed         | 143          |
|    total_timesteps      | 24576        |
| train/                  |              |
|    approx_kl            | 0.0030627998 |
|    clip_fraction        | 0.0115       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.44        |
|    explained_variance   | 0.719        |
|    learning_rate        | 0.0003       |
|    loss                 | 0.0674       |
|    n_updates            | 110          |
|    policy_gradient_loss | -0.00113     |
|    std                  | 1.02         |
|    value_loss           | 0.141        |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.44e+03     |
|    ep_rew_mean          | 63.7         |
| time/                   |              |
|    fps                  | 170          |
|    iterations           | 13           |
|    time_elapsed         | 155          |
|    total_timesteps      | 26624        |
| train/                  |              |
|    approx_kl            | 0.0037982883 |
|    clip_fraction        | 0.0215       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.43        |
|    explained_variance   | 0.723        |
|    learning_rate        | 0.0003       |
|    loss                 | 0.114        |
|    n_updates            | 120          |
|    policy_gradient_loss | -0.00187     |
|    std                  | 1.01         |
|    value_loss           | 0.308        |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.44e+03     |
|    ep_rew_mean          | 62.4         |
| time/                   |              |
|    fps                  | 170          |
|    iterations           | 14           |
|    time_elapsed         | 168          |
|    total_timesteps      | 28672        |
| train/                  |              |
|    approx_kl            | 0.0058030123 |
|    clip_fraction        | 0.0373       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.42        |
|    explained_variance   | 0.748        |
|    learning_rate        | 0.0003       |
|    loss                 | 0.0657       |
|    n_updates            | 130          |
|    policy_gradient_loss | -0.00197     |
|    std                  | 1            |
|    value_loss           | 0.168        |
------------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.44e+03    |
|    ep_rew_mean          | 63.7        |
| time/                   |             |
|    fps                  | 170         |
|    iterations           | 15          |
|    time_elapsed         | 180         |
|    total_timesteps      | 30720       |
| train/                  |             |
|    approx_kl            | 0.002049874 |
|    clip_fraction        | 0.0111      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.42       |
|    explained_variance   | 0.831       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.099       |
|    n_updates            | 140         |
|    policy_gradient_loss | -0.00205    |
|    std                  | 1.01        |
|    value_loss           | 0.168       |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.44e+03    |
|    ep_rew_mean          | 62.8        |
| time/                   |             |
|    fps                  | 170         |
|    iterations           | 16          |
|    time_elapsed         | 192         |
|    total_timesteps      | 32768       |
| train/                  |             |
|    approx_kl            | 0.003264641 |
|    clip_fraction        | 0.0281      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.42       |
|    explained_variance   | 0.551       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.239       |
|    n_updates            | 150         |
|    policy_gradient_loss | -0.00316    |
|    std                  | 1           |
|    value_loss           | 0.348       |
-----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1.44e+03   |
|    ep_rew_mean          | 60.6       |
| time/                   |            |
|    fps                  | 170        |
|    iterations           | 17         |
|    time_elapsed         | 204        |
|    total_timesteps      | 34816      |
| train/                  |            |
|    approx_kl            | 0.00119726 |
|    clip_fraction        | 0.0144     |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.41      |
|    explained_variance   | 0.652      |
|    learning_rate        | 0.0003     |
|    loss                 | 0.0345     |
|    n_updates            | 160        |
|    policy_gradient_loss | -0.00164   |
|    std                  | 0.981      |
|    value_loss           | 0.123      |
----------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.44e+03     |
|    ep_rew_mean          | 59.5         |
| time/                   |              |
|    fps                  | 169          |
|    iterations           | 18           |
|    time_elapsed         | 217          |
|    total_timesteps      | 36864        |
| train/                  |              |
|    approx_kl            | 0.0069969567 |
|    clip_fraction        | 0.0702       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.4         |
|    explained_variance   | 0.667        |
|    learning_rate        | 0.0003       |
|    loss                 | 0.0996       |
|    n_updates            | 170          |
|    policy_gradient_loss | -0.00559     |
|    std                  | 0.978        |
|    value_loss           | 0.317        |
------------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.44e+03    |
|    ep_rew_mean          | 57.3        |
| time/                   |             |
|    fps                  | 169         |
|    iterations           | 19          |
|    time_elapsed         | 229         |
|    total_timesteps      | 38912       |
| train/                  |             |
|    approx_kl            | 0.002844797 |
|    clip_fraction        | 0.0292      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.39       |
|    explained_variance   | 0.872       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0167      |
|    n_updates            | 180         |
|    policy_gradient_loss | -0.00246    |
|    std                  | 0.969       |
|    value_loss           | 0.0892      |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.44e+03    |
|    ep_rew_mean          | 56.6        |
| time/                   |             |
|    fps                  | 169         |
|    iterations           | 20          |
|    time_elapsed         | 241         |
|    total_timesteps      | 40960       |
| train/                  |             |
|    approx_kl            | 0.001108799 |
|    clip_fraction        | 0.00273     |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.39       |
|    explained_variance   | 0.84        |
|    learning_rate        | 0.0003      |
|    loss                 | 0.044       |
|    n_updates            | 190         |
|    policy_gradient_loss | -0.000724   |
|    std                  | 0.971       |
|    value_loss           | 0.138       |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.44e+03    |
|    ep_rew_mean          | 56.3        |
| time/                   |             |
|    fps                  | 169         |
|    iterations           | 21          |
|    time_elapsed         | 254         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.004964751 |
|    clip_fraction        | 0.0351      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.39       |
|    explained_variance   | 0.879       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0958      |
|    n_updates            | 200         |
|    policy_gradient_loss | -0.00268    |
|    std                  | 0.968       |
|    value_loss           | 0.174       |
-----------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.44e+03     |
|    ep_rew_mean          | 54.9         |
| time/                   |              |
|    fps                  | 169          |
|    iterations           | 22           |
|    time_elapsed         | 266          |
|    total_timesteps      | 45056        |
| train/                  |              |
|    approx_kl            | 0.0051074005 |
|    clip_fraction        | 0.0501       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.38        |
|    explained_variance   | 0.855        |
|    learning_rate        | 0.0003       |
|    loss                 | 0.052        |
|    n_updates            | 210          |
|    policy_gradient_loss | -0.00387     |
|    std                  | 0.958        |
|    value_loss           | 0.165        |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.44e+03     |
|    ep_rew_mean          | 54.4         |
| time/                   |              |
|    fps                  | 169          |
|    iterations           | 23           |
|    time_elapsed         | 278          |
|    total_timesteps      | 47104        |
| train/                  |              |
|    approx_kl            | 0.0035212503 |
|    clip_fraction        | 0.0176       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.37        |
|    explained_variance   | 0.867        |
|    learning_rate        | 0.0003       |
|    loss                 | 0.115        |
|    n_updates            | 220          |
|    policy_gradient_loss | -0.00204     |
|    std                  | 0.939        |
|    value_loss           | 0.188        |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.44e+03     |
|    ep_rew_mean          | 51.9         |
| time/                   |              |
|    fps                  | 169          |
|    iterations           | 24           |
|    time_elapsed         | 290          |
|    total_timesteps      | 49152        |
| train/                  |              |
|    approx_kl            | 0.0028364677 |
|    clip_fraction        | 0.0176       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.34        |
|    explained_variance   | 0.842        |
|    learning_rate        | 0.0003       |
|    loss                 | 0.0662       |
|    n_updates            | 230          |
|    policy_gradient_loss | -0.00155     |
|    std                  | 0.918        |
|    value_loss           | 0.157        |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.44e+03     |
|    ep_rew_mean          | 51.1         |
| time/                   |              |
|    fps                  | 169          |
|    iterations           | 25           |
|    time_elapsed         | 302          |
|    total_timesteps      | 51200        |
| train/                  |              |
|    approx_kl            | 0.0030834232 |
|    clip_fraction        | 0.0281       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.33        |
|    explained_variance   | 0.939        |
|    learning_rate        | 0.0003       |
|    loss                 | 0.0397       |
|    n_updates            | 240          |
|    policy_gradient_loss | -0.00211     |
|    std                  | 0.915        |
|    value_loss           | 0.108        |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.44e+03     |
|    ep_rew_mean          | 48.7         |
| time/                   |              |
|    fps                  | 168          |
|    iterations           | 26           |
|    time_elapsed         | 315          |
|    total_timesteps      | 53248        |
| train/                  |              |
|    approx_kl            | 0.0014808383 |
|    clip_fraction        | 0.00732      |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.32        |
|    explained_variance   | 0.857        |
|    learning_rate        | 0.0003       |
|    loss                 | 0.0452       |
|    n_updates            | 250          |
|    policy_gradient_loss | -0.000879    |
|    std                  | 0.901        |
|    value_loss           | 0.12         |
------------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.44e+03    |
|    ep_rew_mean          | 47.7        |
| time/                   |             |
|    fps                  | 168         |
|    iterations           | 27          |
|    time_elapsed         | 327         |
|    total_timesteps      | 55296       |
| train/                  |             |
|    approx_kl            | 0.005910723 |
|    clip_fraction        | 0.0409      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.31       |
|    explained_variance   | 0.848       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0573      |
|    n_updates            | 260         |
|    policy_gradient_loss | -0.00404    |
|    std                  | 0.893       |
|    value_loss           | 0.216       |
-----------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.44e+03     |
|    ep_rew_mean          | 46.6         |
| time/                   |              |
|    fps                  | 168          |
|    iterations           | 28           |
|    time_elapsed         | 340          |
|    total_timesteps      | 57344        |
| train/                  |              |
|    approx_kl            | 0.0028373203 |
|    clip_fraction        | 0.0177       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.3         |
|    explained_variance   | 0.924        |
|    learning_rate        | 0.0003       |
|    loss                 | 0.108        |
|    n_updates            | 270          |
|    policy_gradient_loss | -0.00187     |
|    std                  | 0.883        |
|    value_loss           | 0.192        |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.44e+03     |
|    ep_rew_mean          | 44.1         |
| time/                   |              |
|    fps                  | 168          |
|    iterations           | 29           |
|    time_elapsed         | 352          |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0030430215 |
|    clip_fraction        | 0.0211       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.3         |
|    explained_variance   | 0.946        |
|    learning_rate        | 0.0003       |
|    loss                 | 0.0196       |
|    n_updates            | 280          |
|    policy_gradient_loss | -0.00268     |
|    std                  | 0.888        |
|    value_loss           | 0.144        |
------------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.44e+03    |
|    ep_rew_mean          | 43.1        |
| time/                   |             |
|    fps                  | 168         |
|    iterations           | 30          |
|    time_elapsed         | 365         |
|    total_timesteps      | 61440       |
| train/                  |             |
|    approx_kl            | 0.004251739 |
|    clip_fraction        | 0.0266      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.29       |
|    explained_variance   | 0.891       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.101       |
|    n_updates            | 290         |
|    policy_gradient_loss | -0.00259    |
|    std                  | 0.876       |
|    value_loss           | 0.298       |
-----------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.44e+03     |
|    ep_rew_mean          | 40.9         |
| time/                   |              |
|    fps                  | 168          |
|    iterations           | 31           |
|    time_elapsed         | 377          |
|    total_timesteps      | 63488        |
| train/                  |              |
|    approx_kl            | 0.0046960833 |
|    clip_fraction        | 0.0476       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.28        |
|    explained_variance   | 0.904        |
|    learning_rate        | 0.0003       |
|    loss                 | 0.131        |
|    n_updates            | 300          |
|    policy_gradient_loss | -0.00539     |
|    std                  | 0.862        |
|    value_loss           | 0.254        |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.42e+03     |
|    ep_rew_mean          | 38.6         |
| time/                   |              |
|    fps                  | 168          |
|    iterations           | 32           |
|    time_elapsed         | 389          |
|    total_timesteps      | 65536        |
| train/                  |              |
|    approx_kl            | 0.0045583798 |
|    clip_fraction        | 0.024        |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.26        |
|    explained_variance   | 0.951        |
|    learning_rate        | 0.0003       |
|    loss                 | 0.101        |
|    n_updates            | 310          |
|    policy_gradient_loss | -0.00168     |
|    std                  | 0.847        |
|    value_loss           | 0.252        |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.42e+03     |
|    ep_rew_mean          | 37.9         |
| time/                   |              |
|    fps                  | 168          |
|    iterations           | 33           |
|    time_elapsed         | 401          |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0022182185 |
|    clip_fraction        | 0.00723      |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.25        |
|    explained_variance   | 0.934        |
|    learning_rate        | 0.0003       |
|    loss                 | 0.0939       |
|    n_updates            | 320          |
|    policy_gradient_loss | -0.000534    |
|    std                  | 0.839        |
|    value_loss           | 0.259        |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.42e+03     |
|    ep_rew_mean          | 36.5         |
| time/                   |              |
|    fps                  | 168          |
|    iterations           | 34           |
|    time_elapsed         | 413          |
|    total_timesteps      | 69632        |
| train/                  |              |
|    approx_kl            | 0.0023662418 |
|    clip_fraction        | 0.0113       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.23        |
|    explained_variance   | 0.847        |
|    learning_rate        | 0.0003       |
|    loss                 | 0.222        |
|    n_updates            | 330          |
|    policy_gradient_loss | -0.00178     |
|    std                  | 0.824        |
|    value_loss           | 0.661        |
------------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.41e+03    |
|    ep_rew_mean          | 34.5        |
| time/                   |             |
|    fps                  | 168         |
|    iterations           | 35          |
|    time_elapsed         | 425         |
|    total_timesteps      | 71680       |
| train/                  |             |
|    approx_kl            | 0.004009382 |
|    clip_fraction        | 0.0117      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.23       |
|    explained_variance   | 0.907       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.315       |
|    n_updates            | 340         |
|    policy_gradient_loss | -0.00107    |
|    std                  | 0.824       |
|    value_loss           | 0.624       |
-----------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.41e+03     |
|    ep_rew_mean          | 33.9         |
| time/                   |              |
|    fps                  | 168          |
|    iterations           | 36           |
|    time_elapsed         | 437          |
|    total_timesteps      | 73728        |
| train/                  |              |
|    approx_kl            | 0.0054983958 |
|    clip_fraction        | 0.0275       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.22        |
|    explained_variance   | 0.936        |
|    learning_rate        | 0.0003       |
|    loss                 | 0.404        |
|    n_updates            | 350          |
|    policy_gradient_loss | -0.00155     |
|    std                  | 0.817        |
|    value_loss           | 0.67         |
------------------------------------------


<stable_baselines3.ppo.ppo.PPO at 0x7f0ede509040>

Create a validation environment

Then attach the trained PPO policy to the validation environment and roll it out for one episode.

In [74]:
import sys

# Make the modules under rl_env/ importable. Guarded so that re-running this
# cell does not keep appending the same entry to sys.path.
rl_env_dir = str(Path('rl_env'))
if rl_env_dir not in sys.path:
    sys.path.append(rl_env_dir)
from rl_env.hrl_env_hnetwork_loop import SmartMeterWorld

# Validation environment: same env class as imported above, fed by the
# validation data loader and the shared H-network RL module.
# With render_mode="human" the env connects to a render server on construction
# (see the log line printed below this cell) -- presumably that server must
# already be running; confirm against SmartMeterWorld.
env_valid = SmartMeterWorld(
    smart_meter_data_loader=sm_dl_validation,
    h_network_rl_module=h_network_rl_module,
    render_mode="human",
)

[2025-07-16 06:32:03:889] [SmartMeterWorld] Render mode set to 'human'. Render server at 127.0.0.1:50007. render_connected: True. render_client_socket: <socket.socket fd=91, family=AddressFamily.AF_INET, type=SocketKind.SOCK_STREAM, proto=0, laddr=('127.0.0.1', 34520), raddr=('127.0.0.1', 50007)>


In [77]:
# Reset the validation env's render window before the rollout cell below
# (presumably clears anything drawn by a previous episode -- confirm in SmartMeterWorld).
env_valid.reset_render_window()

In [75]:
# Point the trained model at the validation env. Per the output below, SB3 wraps
# the env in Monitor + DummyVecEnv internally; the rollout cell still steps the
# raw `env_valid` object directly.
rl_model.set_env(env_valid)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [78]:
env_valid_seed = 71

# Roll the trained policy (deterministic actions) through one validation episode.
obs, info = env_valid.reset(env_valid_seed)
done = False
while not done:
    action, _states = rl_model.predict(obs, deterministic=True)
    obs, reward, terminated, truncated, info = env_valid.step(action)
    print_log(f"Step: {env_valid.episode.get_current_step()}, Action: {action}, Reward: {reward}")
    env_valid.render()
    # The 5-tuple step API signals the end of an episode through either flag;
    # stopping only on the first one would keep stepping a truncated episode.
    done = terminated or truncated

[2025-07-16 06:32:41:872] [SmartMeterWorld] Resetting environment with a new episode. Episode info: {'length': 1440, 'datetime_range': (Timestamp('2013-12-11 00:00:04'), Timestamp('2013-12-11 23:59:04'))}
[2025-07-16 06:32:41:882] Step: 1, Action: [1.], Reward: 0.09314211941560109
[2025-07-16 06:32:41:892] Step: 2, Action: [1.], Reward: 0.12112230546077092
[2025-07-16 06:32:41:900] Step: 3, Action: [1.], Reward: 0.14987683034737906
[2025-07-16 06:32:41:909] Step: 4, Action: [1.], Reward: 0.16101221478780112
[2025-07-16 06:32:41:917] Step: 5, Action: [1.], Reward: 0.16472628451188406
[2025-07-16 06:32:41:927] Step: 6, Action: [1.], Reward: 0.16780815757115683
[2025-07-16 06:32:41:932] Step: 7, Action: [1.], Reward: 0.16920895613034567
[2025-07-16 06:32:41:939] Step: 8, Action: [1.], Reward: 0.17024972773392996
[2025-07-16 06:32:41:947] Step: 9, Action: [1.], Reward: 0.17117571569283804
[2025-07-16 06:32:41:951] Step: 10, Action: [1.], Reward: 0.17134493327935538
[2025-07-16 06:32:41:959

In [29]:
# Inspect the DataFrame backing the current validation episode.
# NOTE(review): the stored output of this cell comes from a different episode
# than the rollout log above (different dates) -- re-run after the rollout to refresh it.
env_valid.episode.df

Unnamed: 0,timestamp,aggregate,datetime,grid_load,battery_soc,aggregate_std
50897,1373673601,151.000000,2013-07-13 00:00:01,4151.0,0.0,-0.501991
50898,1373673661,151.323431,2013-07-13 00:01:01,4151.323431,0.008333,-0.500805
50899,1373673721,152.664198,2013-07-13 00:02:01,4152.664198,0.016667,-0.495890
50900,1373673781,152.053380,2013-07-13 00:03:01,4152.05338,0.025,-0.498129
50901,1373673841,151.406045,2013-07-13 00:04:01,4151.406045,0.033333,-0.500502
...,...,...,...,...,...,...
52332,1373759701,196.775577,2013-07-13 23:55:01,691.687844,0.894624,-0.334177
52333,1373759761,195.696554,2013-07-13 23:56:01,700.430594,0.895655,-0.338133
52334,1373759821,196.530304,2013-07-13 23:57:01,692.055694,0.896706,-0.335077
52335,1373759881,195.362123,2013-07-13 23:58:01,701.592358,0.897739,-0.339359


In [83]:
# save the graph

# Output folder for this run, keyed by the formatted `rl_datetime`.
graph_valid_dir = Path("rl_model", "PPO", f"{rl_datetime.strftime('%Y%m%d_%H%M%S')}")
# Create the folder up front: in notebook order this cell runs before the
# model is saved, so the run folder may not exist yet.
graph_valid_dir.mkdir(parents=True, exist_ok=True)

env_valid.save_graph(
    {"fname": str(graph_valid_dir / f"graph_valid_index_{env_valid.selected_idx}.png"),
     "dpi": 300}
)

In [73]:
# Close the validation env once the rollout and graph export are done
# (presumably also releases the render-server socket opened at construction -- confirm).
env_valid.close()

[2025-07-16 06:31:55:987] [SmartMeterWorld] Environment closed.


In [32]:
# Persist the trained PPO agent inside this run's folder
# (rl_model/PPO/<formatted rl_datetime>/rl_model.zip).
rl_run_stamp = rl_datetime.strftime('%Y%m%d_%H%M%S')
rl_model_path = Path("rl_model", "PPO") / rl_run_stamp / "rl_model.zip"
rl_model.save(rl_model_path)

---

In [None]:
# load the model & environment
import sys

# Make the modules under rl_env/ importable. Guarded so that re-running this
# cell does not keep appending the same entry to sys.path.
rl_env_dir = str(Path('rl_env'))
if rl_env_dir not in sys.path:
    sys.path.append(rl_env_dir)

# Imported under an alias: the validation section binds the bare name
# `SmartMeterWorld` to rl_env.hrl_env_hnetwork_loop.SmartMeterWorld, which is a
# different class. Rebinding the same name here would silently change which
# class earlier cells get when they are re-run.
# NOTE(review): confirm that the test env is really meant to be the rl_env.hrl_env
# variant rather than the hnetwork_loop one used for validation.
from rl_env.hrl_env import SmartMeterWorld as SmartMeterWorldTest
from stable_baselines3 import PPO
from model.H_network.h_network_arch import HNetworkType

env_test = SmartMeterWorldTest(
    sm_dl_test,
    h_model_type=HNetworkType.H_NETWORK,
    render_mode="human",
)

# This env variant takes the H-network and its scaler through setters after construction.
env_test.set_h_network(h_network)
env_test.set_h_network_stdscaler(h_network_stdscaler)

# Load a previously saved agent; the run to load is selected by this hard-coded timestamp.
rl_model_path = Path("rl_model", "PPO", f"{datetime(2025,7,12,18,25,2).strftime('%Y%m%d_%H%M%S')}", "rl_model.zip")
rl_model_loaded = PPO.load(rl_model_path, env=env_test)

In [None]:
# Named like `env_valid_seed` in the validation section instead of a bare literal,
# so the episode used for the test rollout is easy to find and change.
env_test_seed = 43

obs, info = env_test.reset(env_test_seed)

In [None]:
# Show the `info` value returned by the env_test.reset() call above.
info

In [None]:
# Reset the test env's render window before the rollout cell below
# (presumably clears anything drawn by a previous episode -- confirm in SmartMeterWorld).
env_test.reset_render_window()

In [None]:
# Inspect the DataFrame backing the current test episode.
env_test.episode.df

In [None]:
# Roll the loaded policy (deterministic actions) through the test episode that
# was started by the env_test.reset() cell above.
done = False
while not done:
    action, _states = rl_model_loaded.predict(obs, deterministic=True)
    obs, reward, terminated, truncated, info = env_test.step(action)
    print_log(f"Step: {env_test.episode.get_current_step()}, Action: {action}, Reward: {reward}, Info: {info}")
    env_test.render()
    # The 5-tuple step API signals the end of an episode through either flag;
    # stopping only on the first one would keep stepping a truncated episode.
    done = terminated or truncated

In [None]:
# Save the test graph next to the model that was actually loaded.
# `rl_model_path` is set by the load cell above; `rl_datetime` belongs to the
# training run, which is unrelated to the loaded run and is undefined when
# only the load/test section is executed.
# NOTE(review): the validation section passes save_graph a dict
# ({"fname": ..., "dpi": ...}) while this env variant is given a plain path
# string -- kept as is; confirm against rl_env.hrl_env.SmartMeterWorld.save_graph.
env_test.save_graph(
    str(rl_model_path.parent / "graph_test.png")
)

In [None]:
# Close the test env once the rollout and graph export are done.
env_test.close()