A template for RL training

In [1]:
import numpy as np
import pandas as pd

from pathlib import Path
from datetime import datetime

from utils import print_log

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# Folder that holds the train/val/test split files; the actual loading happens
# in the cells below (presumably produced by 03_data_split.ipynb — confirm).
dataset_folder_path = Path("dataset", "20250707", "split")

In [4]:
# copied from 03_data_split.ipynb

# Helper functions for the new split folder structure
def load_split_data_from_folder(split_folder, split_type='train'):
    """Load the aggregate-load segments and DataFrame of one data split.

    Args:
        split_folder: Directory (``str`` or ``pathlib.Path``) containing
            ``{split_type}_segments.txt`` and ``{split_type}_aggregate_df.pkl``.
        split_type: File-name prefix of the split to read (default ``'train'``).

    Returns:
        ``(segments, df)``: ``segments`` is a list of ``(start, end)``
        ``datetime`` tuples parsed from the text file (one
        ``"<iso start> - <iso end>"`` pair per line); ``df`` is the unpickled
        object stored in the ``.pkl`` file.

    Raises:
        FileNotFoundError: if either file is missing.
        ValueError: if a non-blank line does not have the expected format.
    """
    # Accept plain strings as well as Path objects.
    split_folder = Path(split_folder)

    segments = []
    with open(split_folder / f'{split_type}_segments.txt', 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at the end of the file)
                # carry no segment and would otherwise break the unpacking.
                continue
            start_str, end_str = line.split(' - ')
            segments.append(
                (datetime.fromisoformat(start_str), datetime.fromisoformat(end_str))
            )

    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    df = pd.read_pickle(split_folder / f'{split_type}_aggregate_df.pkl')
    return segments, df

def load_signatures_from_split_folder(split_folder, split_type, appliance):
    """Load the load signatures of one appliance for one data split.

    Files are looked up under
    ``<split_folder>/load_signature_library/<split_type>/<appliance>/``.

    Args:
        split_folder: Root directory of the split (``str`` or ``pathlib.Path``).
        split_type: Sub-folder name of the split.
        appliance: Sub-folder name of the appliance.

    Returns:
        ``(signatures_df, ranges)``. ``signatures_df`` is the unpickled content
        of ``load_signatures.pkl``; ``ranges`` is a list of ``(start, end)``
        integer tuples read from ``selected_ranges.txt`` (empty when that file
        does not exist). When ``load_signatures.pkl`` is missing, an empty
        DataFrame and an empty list are returned.

    Raises:
        ValueError: if a non-blank line of ``selected_ranges.txt`` is not two
            comma-separated integers.
    """
    library_dir = Path(split_folder) / 'load_signature_library' / split_type / appliance
    sig_path = library_dir / 'load_signatures.pkl'
    ranges_path = library_dir / 'selected_ranges.txt'

    if not sig_path.exists():
        return pd.DataFrame(), []

    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    signatures_df = pd.read_pickle(sig_path)

    ranges = []
    if ranges_path.exists():
        with open(ranges_path, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Skip blank lines (e.g. trailing newline) instead of
                    # failing on int('').
                    continue
                start, end = map(int, line.split(','))
                ranges.append((start, end))

    return signatures_df, ranges

In [5]:
# convert datetime objects to timezone-naive datetime objects
def convert_to_naive_datetimes_df(df):
    """Convert datetime objects in DataFrame to timezone-naive datetime objects"""
    df['datetime'] = df['datetime'].apply(lambda x: x.replace(tzinfo=None) if isinstance(x, datetime) else x)

    return df

def convert_to_naive_datetimes(segments):
    """Return a new list of ``(start, end)`` tuples with ``tzinfo`` removed.

    ``segments`` is an iterable of ``(start, end)`` datetime pairs; the input
    is not modified.
    """
    naive_segments = []
    for seg_start, seg_end in segments:
        naive_pair = (seg_start.replace(tzinfo=None), seg_end.replace(tzinfo=None))
        naive_segments.append(naive_pair)
    return naive_segments

In [6]:
# Load (segments, aggregate DataFrame) for each split.
# Note the validation files use the 'val' prefix while the variables are named *_validation.
aggregate_load_segments_train, aggregate_load_df_train = load_split_data_from_folder(dataset_folder_path, 'train')
aggregate_load_segments_test, aggregate_load_df_test = load_split_data_from_folder(dataset_folder_path, 'test') 
aggregate_load_segments_validation, aggregate_load_df_validation = load_split_data_from_folder(dataset_folder_path, 'val')

In [7]:
# Drop timezone info from every split so segments and DataFrame rows compare
# as naive datetimes. Re-running this cell is harmless (already-naive values
# stay unchanged).
aggregate_load_segments_train = convert_to_naive_datetimes(aggregate_load_segments_train)
aggregate_load_df_train = convert_to_naive_datetimes_df(aggregate_load_df_train)

aggregate_load_segments_test = convert_to_naive_datetimes(aggregate_load_segments_test)
aggregate_load_df_test = convert_to_naive_datetimes_df(aggregate_load_df_test)

aggregate_load_segments_validation = convert_to_naive_datetimes(aggregate_load_segments_validation)
aggregate_load_df_validation = convert_to_naive_datetimes_df(aggregate_load_df_validation)

In [8]:
# Preview the training aggregate-load DataFrame (the notebook display truncates long frames).
aggregate_load_df_train

Unnamed: 0,timestamp,aggregate,datetime,washing_machine,dishwasher,fridge,kettle,microwave,toaster,tv,htpc,gas_oven,kitchen_lights
0,1.357603e+09,234.0,2013-01-08 00:00:05,0.0,1.0,0.0,1.0,1.0,0.0,1.0,69.0,,0.0
1,1.357603e+09,231.0,2013-01-08 00:00:11,0.0,1.0,0.0,1.0,1.0,0.0,1.0,70.0,,0.0
2,1.357603e+09,234.0,2013-01-08 00:00:17,0.0,1.0,0.0,1.0,1.0,0.0,1.0,70.0,,0.0
3,1.357603e+09,232.0,2013-01-08 00:00:23,0.0,1.0,0.0,1.0,1.0,0.0,1.0,68.0,,0.0
4,1.357603e+09,232.0,2013-01-08 00:00:30,0.0,1.0,0.0,1.0,1.0,0.0,1.0,70.0,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2231636,1.388448e+09,178.0,2013-12-30 23:59:35,0.0,1.0,0.0,1.0,1.0,0.0,1.0,2.0,3.0,0.0
2231637,1.388448e+09,177.0,2013-12-30 23:59:41,0.0,1.0,0.0,1.0,1.0,0.0,1.0,2.0,3.0,0.0
2231638,1.388448e+09,178.0,2013-12-30 23:59:47,0.0,1.0,0.0,1.0,1.0,0.0,1.0,2.0,3.0,0.0
2231639,1.388448e+09,178.0,2013-12-30 23:59:53,0.0,1.0,0.0,1.0,1.0,0.0,1.0,2.0,3.0,0.0


In [9]:
# (start, end) datetime pairs of the training split, timezone-naive after the conversion above.
aggregate_load_segments_train

[(datetime.datetime(2013, 1, 8, 0, 0),
  datetime.datetime(2013, 1, 10, 23, 59, 59, 999999)),
 (datetime.datetime(2013, 2, 27, 0, 0),
  datetime.datetime(2013, 2, 28, 23, 59, 59, 999999)),
 (datetime.datetime(2013, 3, 8, 0, 0),
  datetime.datetime(2013, 3, 10, 23, 59, 59, 999999)),
 (datetime.datetime(2013, 3, 28, 0, 0),
  datetime.datetime(2013, 3, 31, 23, 59, 59, 999999)),
 (datetime.datetime(2013, 3, 22, 0, 0),
  datetime.datetime(2013, 3, 26, 23, 59, 59, 999999)),
 (datetime.datetime(2013, 4, 8, 0, 0),
  datetime.datetime(2013, 4, 10, 23, 59, 59, 999999)),
 (datetime.datetime(2013, 4, 26, 0, 0),
  datetime.datetime(2013, 4, 30, 23, 59, 59, 999999)),
 (datetime.datetime(2013, 4, 1, 0, 0),
  datetime.datetime(2013, 4, 7, 23, 59, 59, 999999)),
 (datetime.datetime(2013, 5, 15, 0, 0),
  datetime.datetime(2013, 5, 16, 23, 59, 59, 999999)),
 (datetime.datetime(2013, 5, 1, 0, 0),
  datetime.datetime(2013, 5, 7, 23, 59, 59, 999999)),
 (datetime.datetime(2013, 5, 25, 0, 0),
  datetime.dateti

In [10]:
from rl_env.env_data_loader import SmartMeterDataLoader

# Wrap the training segments and aggregate DataFrame in the project's data loader.
sm_dl_train = SmartMeterDataLoader(
    aggregate_load_segments=aggregate_load_segments_train,
    aggregate_load_df=aggregate_load_df_train
)

# Number of divided segments held by the loader.
# NOTE(review): judging by the single-day entry shown in the next cell, the loader
# appears to split the input segments into one-day pieces — confirm in env_data_loader.
sm_dl_train.get_divided_segments_length()

162

In [11]:
# Inspect one divided segment: a (start, end) pair of datetimes.
sm_dl_train.divided_segments[7]

array([datetime.datetime(2013, 3, 10, 0, 0),
       datetime.datetime(2013, 3, 10, 23, 59, 59, 999999)], dtype=object)

In [12]:
# sample segment
# Rows of the aggregate-load data returned for divided segment index 13.

sm_dl_train.get_aggregate_load_segment(13)

Unnamed: 0,timestamp,aggregate,datetime
104747,1.363997e+09,335.0,2013-03-23 00:00:05
104748,1.363997e+09,336.0,2013-03-23 00:00:11
104749,1.363997e+09,333.0,2013-03-23 00:00:17
104750,1.363997e+09,334.0,2013-03-23 00:00:24
104751,1.363997e+09,331.0,2013-03-23 00:00:30
...,...,...,...
118501,1.364083e+09,179.0,2013-03-23 23:59:30
118502,1.364083e+09,171.0,2013-03-23 23:59:37
118503,1.364083e+09,171.0,2013-03-23 23:59:43
118504,1.364083e+09,171.0,2013-03-23 23:59:49


In [13]:
# create dataloader for validation and test sets
# Same construction as sm_dl_train, fed with the validation / test segments and DataFrames.
sm_dl_validation = SmartMeterDataLoader(
    aggregate_load_segments=aggregate_load_segments_validation,
    aggregate_load_df=aggregate_load_df_validation
)

sm_dl_test = SmartMeterDataLoader(
    aggregate_load_segments=aggregate_load_segments_test,
    aggregate_load_df=aggregate_load_df_test
)

(Optional) Load the pre-trained H-network and related components

In the final product, the H-network should be trained together with the DDQL/PPO agent.

In [14]:
import torch
from model.H_network.h_network import HNetwork

# Date stamp embedded in the file names of the pre-trained H-network artefacts.
h_network_datetime = datetime(2025, 7, 13)

h_network_path = Path("model_trained", f"h_network_{h_network_datetime.strftime('%Y%m%d')}.pth")

# Constructor arguments must match the checkpoint; the printed module shows
# LSTM(2, 44, ...) layers and a final Linear with out_features=1.
h_network = HNetwork(2, 44, 1)
# map_location="cpu" lets a checkpoint that was saved from a GPU load on a
# CPU-only machine; load_state_dict copies the values into the (CPU) model
# either way. The checkpoint is a pickle-based file — only load trusted files
# (consider weights_only=True on PyTorch versions that support it).
h_network.load_state_dict(torch.load(h_network_path, map_location="cpu"))
# Inference mode; the last expression also displays the module structure.
h_network.eval()

HNetwork(
  (LSTM_1): LSTM(2, 44, batch_first=True, bidirectional=True)
  (ac1): Tanh()
  (LSTM_2): LSTM(88, 44, batch_first=True, bidirectional=True)
  (ac2): Tanh()
  (fc): Linear(in_features=88, out_features=1, bias=True)
)

In [15]:
import joblib

# Scaler saved alongside the H-network; its file name carries the same date stamp.
# NOTE: joblib.load unpickles the file — only load trusted files.
h_network_stdscaler_path = Path(
    "model_trained",
    f"h_network_standardscaler_{h_network_datetime.strftime('%Y%m%d')}.pkl",
)
h_network_stdscaler = joblib.load(h_network_stdscaler_path)

Create the environment

In [16]:
import sys
# Put the rl_env/ folder (relative to the working directory) on the import path.
# NOTE(review): presumably needed because modules inside rl_env import their
# siblings by bare name — confirm. Re-running the cell appends the entry again.
sys.path.append(str(Path('rl_env')))

from rl_env.hrl_env import SmartMeterWorld
from model.H_network.h_network_arch import HNetworkType

# Training environment backed by the training data loader.
# With render_mode="human" the environment tries to connect to a render server
# (127.0.0.1:50007 in the logged output); a refused connection is only logged.
env_train = SmartMeterWorld(
    smart_meter_data_loader=sm_dl_train,
    h_model_type=HNetworkType.H_NETWORK,
    render_mode="human",
)

# Attach the pre-trained H-network and its scaler to the environment.
env_train.set_h_network(h_network)
env_train.set_h_network_stdscaler(h_network_stdscaler)

[2025-07-14 00:22:45:390] [SmartMeterWorld] Could not connect to render server: [Errno 111] Connection refused
[2025-07-14 00:22:45:390] [SmartMeterWorld] Render mode set to 'human'. Render server at 127.0.0.1:50007. render_connected: False. render_client_socket: <socket.socket fd=65, family=AddressFamily.AF_INET, type=SocketKind.SOCK_STREAM, proto=0, laddr=('0.0.0.0', 50744)>


In [17]:
from gymnasium.utils.env_checker import check_env

# This will catch many common issues
# check_env resets the environment several times (see the logged resets).
# Any exception is reported as a message instead of aborting the notebook;
# gymnasium warnings are still emitted separately.
try:
    check_env(env_train)
    print("Environment passes all checks!")
except Exception as e:
    print(f"Environment has issues: {e}")

[2025-07-14 00:22:45:421] [SmartMeterWorld] Resetting environment with a new episode. Episode info: {'length': 13836, 'datetime_range': (Timestamp('2013-08-29 00:00:05'), Timestamp('2013-08-29 23:59:56'))}
[2025-07-14 00:22:45:432] [SmartMeterWorld] Resetting environment with a new episode. Episode info: {'length': 13880, 'datetime_range': (Timestamp('2013-08-28 00:00:02'), Timestamp('2013-08-28 23:59:59'))}
[2025-07-14 00:22:45:442] [SmartMeterWorld] Resetting environment with a new episode. Episode info: {'length': 13745, 'datetime_range': (Timestamp('2013-03-28 00:00:05'), Timestamp('2013-03-28 23:59:57'))}
[2025-07-14 00:22:45:452] [SmartMeterWorld] Resetting environment with a new episode. Episode info: {'length': 13880, 'datetime_range': (Timestamp('2013-08-28 00:00:02'), Timestamp('2013-08-28 23:59:59'))}
[2025-07-14 00:22:45:463] [SmartMeterWorld] Resetting environment with a new episode. Episode info: {'length': 13745, 'datetime_range': (Timestamp('2013-03-28 00:00:05'), Times

  logger.warn(
  logger.warn(


In [18]:
# reset() returns an (observation, info) pair; unpack it so that `obs` really
# holds the observation dict and not the whole tuple.
obs, info = env_train.reset()
# Display both parts.
obs, info

[2025-07-14 00:22:45:546] [SmartMeterWorld] Resetting environment with a new episode. Episode info: {'length': 13745, 'datetime_range': (Timestamp('2013-03-28 00:00:05'), Timestamp('2013-03-28 23:59:57'))}


({'aggregate_load': array([176.], dtype=float32),
  'battery_soc': array([0.22035988], dtype=float32),
  'timestamp_features': array([-0.5 ,  0.  , -0.25], dtype=float32)},
 {'current_step': 0,
  'battery_soc (kWh)': 0.22035988,
  'user_load (W)': 176.0,
  '(prev) grid_load (W)': None,
  'last_action (kW)': None,
  'last_battery_actiuon (kW)': None,
  'last_reward': None,
  'last_f_signal': None,
  'last_g_signal': None})

In [19]:
# Reset the render window of the training environment (project-specific helper; its exact effect is not visible in this notebook).
env_train.reset_render_window()

In [20]:
# initialize a PPO agent
from stable_baselines3 import PPO

# Fixed seed so that policy initialisation and rollout sampling are repeatable
# between runs (full determinism is still not guaranteed on CUDA).
RL_SEED = 42
# Training budget; PPO collects whole rollouts, so the final count can exceed
# this value (10240 in the logged run).
TOTAL_TIMESTEPS = 10000

# The timestamp identifies this run: TensorBoard logs, the saved model and the
# validation graph are all written under rl_model/PPO/<timestamp>/.
rl_datetime = datetime.now()
tensorboard_log_path = Path("rl_model", "PPO", f"{rl_datetime.strftime('%Y%m%d_%H%M%S')}")

rl_model = PPO(
    "MultiInputPolicy",  # the observation is a dict of arrays
    env_train,
    verbose=2,
    tensorboard_log=str(tensorboard_log_path),
    seed=RL_SEED,
)

rl_model.learn(
    total_timesteps=TOTAL_TIMESTEPS,
    progress_bar=True,
    tb_log_name="PPO_SmartMeterWorld"
)

2025-07-14 00:22:45.777725: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-14 00:22:45.785113: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752448965.793806  861821 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752448965.796502  861821 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1752448965.804223  861821 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
[2025-07-14 00:22:47:281] [SmartMeterWorld] Resetting environment with a new episode. Episode info: {'length': 13955, 'datetime_range': (Timestamp('2013-06-29 00:00:03'), Timestamp('2013-06-29 23:59:55'))}
Logging to rl_model/PPO/20250714_002246/PPO_SmartMeterWorld_1


Output()

-----------------------------
| time/              |      |
|    fps             | 96   |
|    iterations      | 1    |
|    time_elapsed    | 21   |
|    total_timesteps | 2048 |
-----------------------------


------------------------------------------
| time/                   |              |
|    fps                  | 81           |
|    iterations           | 2            |
|    time_elapsed         | 50           |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0012185706 |
|    clip_fraction        | 0.0122       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.41        |
|    explained_variance   | -0.00115     |
|    learning_rate        | 0.0003       |
|    loss                 | 0.244        |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.00038     |
|    std                  | 0.989        |
|    value_loss           | 0.606        |
------------------------------------------


------------------------------------------
| time/                   |              |
|    fps                  | 72           |
|    iterations           | 3            |
|    time_elapsed         | 85           |
|    total_timesteps      | 6144         |
| train/                  |              |
|    approx_kl            | 0.0037497217 |
|    clip_fraction        | 0.0159       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.41        |
|    explained_variance   | 0.0848       |
|    learning_rate        | 0.0003       |
|    loss                 | 0.00263      |
|    n_updates            | 20           |
|    policy_gradient_loss | -0.000248    |
|    std                  | 0.993        |
|    value_loss           | 0.0743       |
------------------------------------------


-------------------------------------------
| time/                   |               |
|    fps                  | 65            |
|    iterations           | 4             |
|    time_elapsed         | 126           |
|    total_timesteps      | 8192          |
| train/                  |               |
|    approx_kl            | 0.00013526666 |
|    clip_fraction        | 9.77e-05      |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.42         |
|    explained_variance   | -0.125        |
|    learning_rate        | 0.0003        |
|    loss                 | 0.378         |
|    n_updates            | 30            |
|    policy_gradient_loss | -0.000278     |
|    std                  | 1             |
|    value_loss           | 0.836         |
-------------------------------------------


------------------------------------------
| time/                   |              |
|    fps                  | 59           |
|    iterations           | 5            |
|    time_elapsed         | 172          |
|    total_timesteps      | 10240        |
| train/                  |              |
|    approx_kl            | 0.0012297058 |
|    clip_fraction        | 0.00176      |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.42        |
|    explained_variance   | 0.122        |
|    learning_rate        | 0.0003       |
|    loss                 | 0.604        |
|    n_updates            | 40           |
|    policy_gradient_loss | -0.00117     |
|    std                  | 1.01         |
|    value_loss           | 1.18         |
------------------------------------------


<stable_baselines3.ppo.ppo.PPO at 0x7fbb0face7c0>

Create a validation environment

and put the policy into the validation env

In [22]:
# These imports and the sys.path entry repeat the training-environment cell;
# they are only needed when this section is run without that cell.
import sys
sys.path.append(str(Path('rl_env')))
from rl_env.hrl_env import SmartMeterWorld
from model.H_network.h_network_arch import HNetworkType

# Validation environment: same configuration as env_train, but backed by the
# validation data loader.
env_valid = SmartMeterWorld(
    smart_meter_data_loader=sm_dl_validation,
    h_model_type=HNetworkType.H_NETWORK,
    render_mode="human",
)

[2025-07-14 00:28:30:159] [SmartMeterWorld] Render mode set to 'human'. Render server at 127.0.0.1:50007. render_connected: True. render_client_socket: <socket.socket fd=85, family=AddressFamily.AF_INET, type=SocketKind.SOCK_STREAM, proto=0, laddr=('127.0.0.1', 44686), raddr=('127.0.0.1', 50007)>


In [23]:
# Attach the same pre-trained H-network and scaler that the training environment uses.
env_valid.set_h_network(h_network)
env_valid.set_h_network_stdscaler(h_network_stdscaler)

In [24]:
# Point the trained model at the validation environment (Stable-Baselines3 wraps
# it in Monitor / DummyVecEnv, as the output shows). The rollout cell below
# steps env_valid directly rather than through that wrapper.
rl_model.set_env(env_valid)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [25]:
# Roll out the trained policy deterministically on the validation environment.
for i in range(1):
    # Start every episode from a fresh reset so `obs` is never stale when the
    # number of episodes is increased.
    obs, info = env_valid.reset()
    done = False
    while not done:
        action, _states = rl_model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = env_valid.step(action)
        # An episode ends on termination OR truncation; checking only the
        # first flag would loop forever on a truncated episode.
        done = terminated or truncated
        print_log(f"Step: {env_valid.episode.get_current_step()}, Action: {action}, Reward: {reward}")
        env_valid.render()

[2025-07-14 00:28:42:106] [SmartMeterWorld] Resetting environment with a new episode. Episode info: {'length': 13263, 'datetime_range': (Timestamp('2013-02-23 00:00:04'), Timestamp('2013-02-23 23:59:59'))}
[2025-07-14 00:28:42:115] Step: 1, Action: [0.01884734], Reward: 0.011010190246357273
[2025-07-14 00:28:42:123] Step: 2, Action: [0.01884733], Reward: 0.002394291655180355
[2025-07-14 00:28:42:131] Step: 3, Action: [0.01884069], Reward: 0.0007039229625482112
[2025-07-14 00:28:42:137] Step: 4, Action: [0.01884733], Reward: 0.0010338612032644453
[2025-07-14 00:28:42:142] Step: 5, Action: [0.0188373], Reward: 0.001052549227113525
[2025-07-14 00:28:42:147] Step: 6, Action: [0.01884403], Reward: 0.0015427948317863047
[2025-07-14 00:28:42:153] Step: 7, Action: [0.01884403], Reward: 0.0019851068324496348
[2025-07-14 00:28:42:159] Step: 8, Action: [0.01884402], Reward: 0.002530804791664084
[2025-07-14 00:28:42:165] Step: 9, Action: [0.01884068], Reward: 0.0024306421114371883
[2025-07-14 00:2

In [26]:
# Per-step record of the validation episode; after the rollout it carries the grid_load and battery_soc columns.
env_valid.episode.df

Unnamed: 0,timestamp,aggregate,datetime,grid_load,battery_soc
57522,1.361578e+09,183.0,2013-02-23 00:00:04,258.389341,0.254655
57523,1.361578e+09,183.0,2013-02-23 00:00:11,258.389326,0.254673
57524,1.361578e+09,181.0,2013-02-23 00:00:17,256.362742,0.254689
57525,1.361578e+09,183.0,2013-02-23 00:00:23,258.389318,0.254705
57526,1.361578e+09,180.0,2013-02-23 00:00:29,255.349189,0.254721
...,...,...,...,...,...
70780,1.361664e+09,293.0,2013-02-23 23:59:09,369.223552,0.482636
70781,1.361664e+09,295.0,2013-02-23 23:59:16,371.228306,0.482654
70782,1.361664e+09,293.0,2013-02-23 23:59:22,369.223552,0.48267
70783,1.361664e+09,292.0,2013-02-23 23:59:53,368.221116,0.482752


In [None]:
# save the graph
# Written into this run's timestamped folder (the same folder used for the TensorBoard logs).

env_valid.save_graph(
    str(Path("rl_model", "PPO", f"{rl_datetime.strftime('%Y%m%d_%H%M%S')}", "graph_valid.png"))
)

In [None]:
# Close the validation environment once the rollout and graph export are done.
env_valid.close()

In [21]:
# save the model
# Stored in this run's timestamped folder, next to the TensorBoard logs.
# NOTE(review): this cell was executed as In [21], i.e. right after training and
# before the validation cells above — consider moving it directly below the
# training cell so the notebook order matches the execution order.
rl_model_path = Path("rl_model", "PPO", f"{rl_datetime.strftime('%Y%m%d_%H%M%S')}", "rl_model.zip")
rl_model.save(rl_model_path)

---

In [17]:
# load the model & environment
# The imports and sys.path entry repeat earlier cells so that this section can
# be run without the training section.
import sys
sys.path.append(str(Path('rl_env')))

from rl_env.hrl_env import SmartMeterWorld
from stable_baselines3 import PPO
from model.H_network.h_network_arch import HNetworkType

# Test environment: same configuration as the training / validation
# environments, backed by the test data loader (passed by keyword for
# consistency with the other two constructions).
env_test = SmartMeterWorld(
    smart_meter_data_loader=sm_dl_test,
    h_model_type=HNetworkType.H_NETWORK,
    render_mode="human",
)

env_test.set_h_network(h_network)
env_test.set_h_network_stdscaler(h_network_stdscaler)

# Timestamp of the training run whose saved policy is evaluated here.
# NOTE: this rebinds rl_model_path, which the "save the model" cell also sets.
loaded_run_datetime = datetime(2025, 7, 12, 18, 25, 2)
rl_model_path = Path("rl_model", "PPO", loaded_run_datetime.strftime('%Y%m%d_%H%M%S'), "rl_model.zip")
rl_model_loaded = PPO.load(rl_model_path, env=env_test)

[2025-07-13 19:27:26:817] [SmartMeterWorld] Render mode set to 'human'. Render server at 127.0.0.1:50007. render_connected: True. render_client_socket: <socket.socket fd=85, family=AddressFamily.AF_INET, type=SocketKind.SOCK_STREAM, proto=0, laddr=('127.0.0.1', 57668), raddr=('127.0.0.1', 50007)>
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [18]:
# Start a test episode and keep the initial observation / info.
# NOTE(review): 43 is passed positionally; presumably it is the seed (gymnasium's
# Env.reset takes `seed` as a keyword) — confirm against SmartMeterWorld.reset.
obs, info = env_test.reset(43)

[2025-07-13 19:27:29:289] [SmartMeterWorld] Resetting environment with a new episode. Episode info: {'length': 13821, 'datetime_range': (Timestamp('2013-06-19 00:00:05'), Timestamp('2013-06-19 23:59:54'))}


In [19]:
# Info dict returned by reset(); the "last_*" entries are None before the first step.
info

{'current_step': 0,
 'battery_soc (kWh)': 0.6522993,
 'user_load (W)': 205.0,
 '(prev) grid_load (W)': None,
 'last_action (kW)': None,
 'last_battery_actiuon (kW)': None,
 'last_reward': None,
 'last_f_signal': None,
 'last_g_signal': None}

In [20]:
# Reset the render window of the test environment (project-specific helper; its exact effect is not visible in this notebook).
env_test.reset_render_window()

In [21]:
# Episode data before any step: grid_load and battery_soc are still empty (NaN) here.
env_test.episode.df

Unnamed: 0,timestamp,aggregate,datetime,grid_load,battery_soc
434548,1.371600e+09,205.0,2013-06-19 00:00:05,,
434549,1.371600e+09,205.0,2013-06-19 00:00:11,,
434550,1.371600e+09,208.0,2013-06-19 00:00:17,,
434551,1.371600e+09,205.0,2013-06-19 00:00:23,,
434552,1.371600e+09,204.0,2013-06-19 00:00:30,,
...,...,...,...,...,...
448364,1.371686e+09,239.0,2013-06-19 23:59:30,,
448365,1.371686e+09,238.0,2013-06-19 23:59:36,,
448366,1.371686e+09,238.0,2013-06-19 23:59:42,,
448367,1.371686e+09,240.0,2013-06-19 23:59:48,,


In [22]:
# Roll out the loaded policy deterministically on the test episode that was
# started by the reset cell above. `obs` comes from that reset, so running more
# than one episode would require a new reset per episode.
for i in range(1):
    done = False
    while not done:
        action, _states = rl_model_loaded.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = env_test.step(action)
        # An episode ends on termination OR truncation; checking only the
        # first flag would loop forever on a truncated episode.
        done = terminated or truncated
        print_log(f"Step: {env_test.episode.get_current_step()}, Action: {action}, Reward: {reward}, Info: {info}")
        env_test.render()

[2025-07-13 19:27:35:685] Step: 1, Action: [-0.16942872], Reward: 0.0012156357950739065, Info: {'current_step': 1, 'battery_soc (kWh)': 0.65225655, 'user_load (W)': 205.0, '(prev) grid_load (W)': 0.0, 'last_action (kW)': -0.6777148842811584, 'last_battery_actiuon (kW)': -0.205, 'last_reward': 0.0012156357950739065, 'last_f_signal': -0.0013545406982302666, 'last_g_signal': 3.4508333333333336e-05}
[2025-07-13 19:27:35:691] Step: 2, Action: [-0.16942874], Reward: 0.0036868651825211446, Info: {'current_step': 2, 'battery_soc (kWh)': 0.6522139, 'user_load (W)': 208.0, '(prev) grid_load (W)': 0.0, 'last_action (kW)': -0.6777149438858032, 'last_battery_actiuon (kW)': -0.205, 'last_reward': 0.0036868651825211446, 'last_f_signal': -0.004100351128727198, 'last_g_signal': 3.4508333333333336e-05}
[2025-07-13 19:27:35:697] Step: 3, Action: [-0.16943836], Reward: 0.009197186373137158, Info: {'current_step': 3, 'battery_soc (kWh)': 0.65217054, 'user_load (W)': 205.0, '(prev) grid_load (W)': 0.0, 'las

In [None]:
# Save the test graph next to the model that was actually evaluated
# (rl_model_path is set in the "load the model & environment" cell). Using
# rl_datetime here would file the graph under a different training run, and
# would raise NameError when the training cell was not executed.
env_test.save_graph(
    str(rl_model_path.parent / "graph_test.png")
)

In [None]:
# Close the test environment once the rollout and graph export are done.
env_test.close()