A template for RL training

In [1]:
import numpy as np
import pandas as pd

from pathlib import Path
from datetime import datetime

from utils import print_log

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# folder that holds the created dataset split
dataset_folder_path = Path("dataset") / "20250707" / "split"

In [4]:
# copied from 03_data_split.ipynb

# Helper functions for the new split folder structure
def load_split_data_from_folder(split_folder, split_type='train'):
    """Load the segment list and the aggregate DataFrame of one data split.

    Args:
        split_folder: Directory (``str`` or ``pathlib.Path``) containing
            ``{split_type}_segments.txt`` and ``{split_type}_aggregate_df.pkl``.
        split_type: File-name prefix of the split to read (default ``'train'``).

    Returns:
        Tuple ``(segments, df)``. ``segments`` is a list of ``(start, end)``
        ``datetime`` pairs parsed from the text file (one
        ``<iso start> - <iso end>`` pair per line); ``df`` is the object
        unpickled from the ``.pkl`` file.

    Raises:
        ValueError: If a non-blank line is not of the form
            ``<iso start> - <iso end>``.
    """
    split_folder = Path(split_folder)  # also accept plain string paths

    segments = []
    with open(split_folder / f'{split_type}_segments.txt', 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line (e.g. an extra newline at the end of the file)
                # would otherwise break the two-value unpacking below.
                continue
            start_str, end_str = line.split(' - ')
            segments.append(
                (datetime.fromisoformat(start_str), datetime.fromisoformat(end_str))
            )

    # NOTE: unpickling can execute arbitrary code; only load split files you trust.
    df = pd.read_pickle(split_folder / f'{split_type}_aggregate_df.pkl')
    return segments, df

def load_signatures_from_split_folder(split_folder, split_type, appliance):
    """Load the load signatures and selected ranges of one appliance.

    Files are read from
    ``<split_folder>/load_signature_library/<split_type>/<appliance>/``.

    Args:
        split_folder: Root split directory (``str`` or ``pathlib.Path``).
        split_type: Name of the split sub-directory.
        appliance: Name of the appliance sub-directory.

    Returns:
        Tuple ``(signatures_df, ranges)``. ``signatures_df`` is the object
        unpickled from ``load_signatures.pkl``; ``ranges`` is a list of
        ``(start, end)`` integer pairs read from ``selected_ranges.txt``
        (empty when that file is absent). When ``load_signatures.pkl`` does
        not exist, an empty DataFrame and an empty list are returned.

    Raises:
        ValueError: If a non-blank line of ``selected_ranges.txt`` is not
            two comma-separated integers.
    """
    # Both files live in the same directory; build it once.
    appliance_dir = Path(split_folder) / 'load_signature_library' / split_type / appliance
    sig_path = appliance_dir / 'load_signatures.pkl'
    ranges_path = appliance_dir / 'selected_ranges.txt'

    if not sig_path.exists():
        return pd.DataFrame(), []

    # NOTE: unpickling can execute arbitrary code; only load signature files you trust.
    signatures_df = pd.read_pickle(sig_path)

    ranges = []
    if ranges_path.exists():
        with open(ranges_path, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Skip blank lines; int('') would raise ValueError.
                    continue
                start, end = map(int, line.split(','))
                ranges.append((start, end))

    return signatures_df, ranges

In [9]:
# convert datetime objects to timezone-naive datetime objects
def convert_to_naive_datetimes_df(df):
    """Convert datetime objects in DataFrame to timezone-naive datetime objects"""
    df['datetime'] = df['datetime'].apply(lambda x: x.replace(tzinfo=None) if isinstance(x, datetime) else x)

    return df

def convert_to_naive_datetimes(segments):
    """Return a new list of ``(start, end)`` pairs with tzinfo removed from both ends."""
    naive_segments = []
    for segment_start, segment_end in segments:
        naive_pair = (
            segment_start.replace(tzinfo=None),
            segment_end.replace(tzinfo=None),
        )
        naive_segments.append(naive_pair)
    return naive_segments

In [8]:
# Load the (segments, aggregate DataFrame) pair of each split from the dataset folder.
# Note that the validation files use the 'val' prefix on disk.
aggregate_load_segments_train, aggregate_load_df_train = load_split_data_from_folder(dataset_folder_path, 'train')
aggregate_load_segments_test, aggregate_load_df_test = load_split_data_from_folder(dataset_folder_path, 'test') 
aggregate_load_segments_validation, aggregate_load_df_validation = load_split_data_from_folder(dataset_folder_path, 'val')

In [10]:
# Strip timezone info from every split, one assignment per object.
# Segments are rebuilt as new lists; each DataFrame has its 'datetime' column overwritten.
aggregate_load_segments_train = convert_to_naive_datetimes(aggregate_load_segments_train)
aggregate_load_df_train = convert_to_naive_datetimes_df(aggregate_load_df_train)

aggregate_load_segments_test = convert_to_naive_datetimes(aggregate_load_segments_test)
aggregate_load_df_test = convert_to_naive_datetimes_df(aggregate_load_df_test)

aggregate_load_segments_validation = convert_to_naive_datetimes(aggregate_load_segments_validation)
aggregate_load_df_validation = convert_to_naive_datetimes_df(aggregate_load_df_validation)

In [11]:
# Display the training aggregate DataFrame (pandas truncates the rendered rows).
aggregate_load_df_train

Unnamed: 0,timestamp,aggregate,datetime,washing_machine,dishwasher,fridge,kettle,microwave,toaster,tv,htpc,gas_oven,kitchen_lights
0,1.357603e+09,234.0,2013-01-08 00:00:05,0.0,1.0,0.0,1.0,1.0,0.0,1.0,69.0,,0.0
1,1.357603e+09,231.0,2013-01-08 00:00:11,0.0,1.0,0.0,1.0,1.0,0.0,1.0,70.0,,0.0
2,1.357603e+09,234.0,2013-01-08 00:00:17,0.0,1.0,0.0,1.0,1.0,0.0,1.0,70.0,,0.0
3,1.357603e+09,232.0,2013-01-08 00:00:23,0.0,1.0,0.0,1.0,1.0,0.0,1.0,68.0,,0.0
4,1.357603e+09,232.0,2013-01-08 00:00:30,0.0,1.0,0.0,1.0,1.0,0.0,1.0,70.0,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2231636,1.388448e+09,178.0,2013-12-30 23:59:35,0.0,1.0,0.0,1.0,1.0,0.0,1.0,2.0,3.0,0.0
2231637,1.388448e+09,177.0,2013-12-30 23:59:41,0.0,1.0,0.0,1.0,1.0,0.0,1.0,2.0,3.0,0.0
2231638,1.388448e+09,178.0,2013-12-30 23:59:47,0.0,1.0,0.0,1.0,1.0,0.0,1.0,2.0,3.0,0.0
2231639,1.388448e+09,178.0,2013-12-30 23:59:53,0.0,1.0,0.0,1.0,1.0,0.0,1.0,2.0,3.0,0.0


In [12]:
# Display the training segments as a list of (start, end) datetime pairs.
aggregate_load_segments_train

[(datetime.datetime(2013, 1, 8, 0, 0),
  datetime.datetime(2013, 1, 10, 23, 59, 59, 999999)),
 (datetime.datetime(2013, 2, 27, 0, 0),
  datetime.datetime(2013, 2, 28, 23, 59, 59, 999999)),
 (datetime.datetime(2013, 3, 8, 0, 0),
  datetime.datetime(2013, 3, 10, 23, 59, 59, 999999)),
 (datetime.datetime(2013, 3, 28, 0, 0),
  datetime.datetime(2013, 3, 31, 23, 59, 59, 999999)),
 (datetime.datetime(2013, 3, 22, 0, 0),
  datetime.datetime(2013, 3, 26, 23, 59, 59, 999999)),
 (datetime.datetime(2013, 4, 8, 0, 0),
  datetime.datetime(2013, 4, 10, 23, 59, 59, 999999)),
 (datetime.datetime(2013, 4, 26, 0, 0),
  datetime.datetime(2013, 4, 30, 23, 59, 59, 999999)),
 (datetime.datetime(2013, 4, 1, 0, 0),
  datetime.datetime(2013, 4, 7, 23, 59, 59, 999999)),
 (datetime.datetime(2013, 5, 15, 0, 0),
  datetime.datetime(2013, 5, 16, 23, 59, 59, 999999)),
 (datetime.datetime(2013, 5, 1, 0, 0),
  datetime.datetime(2013, 5, 7, 23, 59, 59, 999999)),
 (datetime.datetime(2013, 5, 25, 0, 0),
  datetime.dateti

In [None]:
from rl_env.env_data_loader import SmartMeterDataLoader

# Build the training data loader from the training segments and aggregate DataFrame.
sm_dl_train = SmartMeterDataLoader(
    aggregate_load_segments=aggregate_load_segments_train,
    aggregate_load_df=aggregate_load_df_train
)

# Show how many divided segments the loader holds (rendered as the cell output).
sm_dl_train.get_divided_segments_length()

162

In [None]:
# Inspect one entry of divided_segments; the rendered output is a (start, end) pair
# spanning a single day.
sm_dl_train.divided_segments[7]

array([datetime.datetime(2013, 3, 10, 0, 0),
       datetime.datetime(2013, 3, 10, 23, 59, 59, 999999)], dtype=object)

In [None]:
# Sample segment: fetch the aggregate-load rows that belong to divided segment 13.

sm_dl_train.get_aggregate_load_segment(13)

Unnamed: 0,timestamp,aggregate,datetime
104747,1.363997e+09,335.0,2013-03-23 00:00:05
104748,1.363997e+09,336.0,2013-03-23 00:00:11
104749,1.363997e+09,333.0,2013-03-23 00:00:17
104750,1.363997e+09,334.0,2013-03-23 00:00:24
104751,1.363997e+09,331.0,2013-03-23 00:00:30
...,...,...,...
118501,1.364083e+09,179.0,2013-03-23 23:59:30
118502,1.364083e+09,171.0,2013-03-23 23:59:37
118503,1.364083e+09,171.0,2013-03-23 23:59:43
118504,1.364083e+09,171.0,2013-03-23 23:59:49


In [33]:
# Create data loaders for the validation and test sets, mirroring the training loader:
# each one gets the segments and aggregate DataFrame of its own split.
sm_dl_validation = SmartMeterDataLoader(
    aggregate_load_segments=aggregate_load_segments_validation,
    aggregate_load_df=aggregate_load_df_validation
)

sm_dl_test = SmartMeterDataLoader(
    aggregate_load_segments=aggregate_load_segments_test,
    aggregate_load_df=aggregate_load_df_test
)

(Optional) Load the pre-trained H-network and related components

In the final product, the H-network should be trained together with the DDQL/PPO agent.

In [16]:
import torch
from model.H_network.h_network import HNetwork

# Date stamp embedded in the file names of the pre-trained H-network artifacts.
h_network_datetime = datetime(2025, 7, 12)

h_network_path = Path("model_trained", f"h_network_{h_network_datetime.strftime('%Y%m%d')}.pth")

# NOTE(review): the constructor arguments must match the architecture stored in the
# checkpoint; presumably they are (input size, hidden size, output size) — confirm
# against HNetwork.
h_network = HNetwork(2, 44, 1)

# map_location="cpu": a checkpoint saved from a GPU would otherwise fail to load on a
#   machine without CUDA; load_state_dict copies the values into the model's own
#   parameters, so the model's device is unaffected.
# weights_only=True: restrict unpickling to tensors/plain containers, since a .pth
#   file is a pickle and could otherwise execute arbitrary code.
h_network.load_state_dict(
    torch.load(h_network_path, map_location="cpu", weights_only=True)
)

# Switch to evaluation mode; as the last expression this also displays the module.
h_network.eval()

HNetwork(
  (LSTM_1): LSTM(2, 44, batch_first=True, bidirectional=True)
  (ac1): Tanh()
  (LSTM_2): LSTM(88, 1, batch_first=True, bidirectional=True)
  (ac2): Tanh()
  (fc): Linear(in_features=2, out_features=1, bias=True)
)

In [17]:
import joblib

# The scaler file carries the same date stamp as the H-network checkpoint.
h_network_stdscaler_path = Path(
    "model_trained",
    f"h_network_standardscaler_{h_network_datetime.strftime('%Y%m%d')}.pkl",
)
# NOTE: joblib.load unpickles the file; only load scaler files you trust.
h_network_stdscaler = joblib.load(h_network_stdscaler_path)

Create the environment

In [None]:
import sys

# Put the rl_env directory on sys.path (presumably so modules inside it can import
# each other by bare name — confirm). The membership check keeps re-runs of this
# cell from appending the same entry again and again.
rl_env_dir = str(Path('rl_env'))
if rl_env_dir not in sys.path:
    sys.path.append(rl_env_dir)

from rl_env.hrl_env import SmartMeterWorld

# Training environment driven by the training data loader.
# NOTE(review): with render_mode="human" the environment logs a connection to a
# render server on 127.0.0.1:50007 — presumably that server must already be running.
env_train = SmartMeterWorld(
    smart_meter_data_loader=sm_dl_train,
    render_mode="human",
)

# Attach the pre-trained H-network and its scaler to the environment.
env_train.set_h_network(h_network)
env_train.set_h_network_stdscaler(h_network_stdscaler)

[2025-07-12 16:14:44:217] [SmartMeterWorld] Render mode set to 'human'. Render server at 127.0.0.1:50007. render_connected: True. render_client_socket: <socket.socket fd=92, family=AddressFamily.AF_INET, type=SocketKind.SOCK_STREAM, proto=0, laddr=('127.0.0.1', 41422), raddr=('127.0.0.1', 50007)>


In [None]:
from gymnasium.utils.env_checker import check_env

# Run Gymnasium's environment checker; it catches many common API issues.
# A failure is reported as a message instead of aborting the notebook.
try:
    check_env(env_train)
except Exception as e:
    print(f"Environment has issues: {e}")
else:
    print("Environment passes all checks!")

[2025-07-12 14:57:51:161] [SmartMeterWorld] Resetting environment with a new episode. Episode info: {'length': 13864, 'datetime_range': (Timestamp('2013-07-01 00:00:03'), Timestamp('2013-07-01 23:59:54'))}
[2025-07-12 14:57:51:172] [SmartMeterWorld] Resetting environment with a new episode. Episode info: {'length': 13880, 'datetime_range': (Timestamp('2013-08-28 00:00:02'), Timestamp('2013-08-28 23:59:59'))}
[2025-07-12 14:57:51:180] [SmartMeterWorld] Resetting environment with a new episode. Episode info: {'length': 13745, 'datetime_range': (Timestamp('2013-03-28 00:00:05'), Timestamp('2013-03-28 23:59:57'))}
[2025-07-12 14:57:51:190] [SmartMeterWorld] Resetting environment with a new episode. Episode info: {'length': 13880, 'datetime_range': (Timestamp('2013-08-28 00:00:02'), Timestamp('2013-08-28 23:59:59'))}
[2025-07-12 14:57:51:200] [SmartMeterWorld] Resetting environment with a new episode. Episode info: {'length': 13745, 'datetime_range': (Timestamp('2013-03-28 00:00:05'), Times

  logger.warn(
  logger.warn(


In [None]:
# NOTE: reset() returns an (observation, info) tuple — see the rendered output —
# so `obs` holds both elements, not just the observation dict.
obs = env_train.reset()
obs

[2025-07-12 14:57:57:845] [SmartMeterWorld] Resetting environment with a new episode. Episode info: {'length': 13745, 'datetime_range': (Timestamp('2013-03-28 00:00:05'), Timestamp('2013-03-28 23:59:57'))}


({'aggregate_load': array([176.], dtype=float32),
  'battery_soc': array([0.22035988], dtype=float32),
  'timestamp_features': array([-0.5 ,  0.  , -0.25], dtype=float32)},
 {})

In [None]:
# Reset the render window before training.
# NOTE(review): presumably clears the external render client's display — confirm in SmartMeterWorld.
env_train.reset_render_window()

In [None]:
# initialize a PPO agent
from stable_baselines3 import PPO

# Tunable run settings, named here instead of being buried in the calls below.
PPO_SEED = 42                 # fixed seed so a run can be repeated
PPO_TOTAL_TIMESTEPS = 100000  # total environment steps to train for

# Each run logs into its own timestamped TensorBoard directory.
rl_datetime = datetime.now()
tensorboard_log_path = Path("rl_model", "PPO", f"{rl_datetime.strftime('%Y%m%d_%H%M%S')}")

# NOTE(review): the logged training output shows value_loss growing to ~1e8 with
# explained_variance staying at ~0 and ep_rew_mean around 7e6, i.e. the value function
# is not fitting the unscaled returns. Consider scaling the reward inside the
# environment or normalising it (e.g. with VecNormalize) before longer runs.
rl_model = PPO(
    "MultiInputPolicy",
    env_train,
    verbose=2,
    seed=PPO_SEED,
    # SB3 documents tensorboard_log as a string path, so convert the Path explicitly.
    tensorboard_log=str(tensorboard_log_path),
)

rl_model.learn(
    total_timesteps=PPO_TOTAL_TIMESTEPS,
    progress_bar=True,
    tb_log_name="PPO_SmartMeterWorld"
)

2025-07-12 14:58:07.867761: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-12 14:58:07.999517: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752328688.046938  448068 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752328688.062176  448068 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1752328688.170229  448068 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
[2025-07-12 14:58:10:488] [SmartMeterWorld] Resetting environment with a new episode. Episode info: {'length': 13955, 'datetime_range': (Timestamp('2013-06-29 00:00:03'), Timestamp('2013-06-29 23:59:55'))}
Logging to rl_model/PPO/20250712_145809/PPO_SmartMeterWorld_1


Output()

-----------------------------
| time/              |      |
|    fps             | 86   |
|    iterations      | 1    |
|    time_elapsed    | 23   |
|    total_timesteps | 2048 |
-----------------------------


------------------------------------------
| time/                   |              |
|    fps                  | 77           |
|    iterations           | 2            |
|    time_elapsed         | 52           |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0024565784 |
|    clip_fraction        | 0.00801      |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.42        |
|    explained_variance   | 1.55e-06     |
|    learning_rate        | 0.0003       |
|    loss                 | 6.36e+05     |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.000994    |
|    std                  | 0.995        |
|    value_loss           | 1.45e+06     |
------------------------------------------


------------------------------------------
| time/                   |              |
|    fps                  | 70           |
|    iterations           | 3            |
|    time_elapsed         | 87           |
|    total_timesteps      | 6144         |
| train/                  |              |
|    approx_kl            | 0.0040488555 |
|    clip_fraction        | 0.00684      |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.41        |
|    explained_variance   | 0            |
|    learning_rate        | 0.0003       |
|    loss                 | 5.25e+06     |
|    n_updates            | 20           |
|    policy_gradient_loss | -0.0016      |
|    std                  | 0.991        |
|    value_loss           | 1.09e+07     |
------------------------------------------


-------------------------------------------
| time/                   |               |
|    fps                  | 65            |
|    iterations           | 4             |
|    time_elapsed         | 125           |
|    total_timesteps      | 8192          |
| train/                  |               |
|    approx_kl            | 0.00019045983 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.41         |
|    explained_variance   | 0             |
|    learning_rate        | 0.0003        |
|    loss                 | 2.03e+07      |
|    n_updates            | 30            |
|    policy_gradient_loss | -7.02e-05     |
|    std                  | 0.989         |
|    value_loss           | 4.05e+07      |
-------------------------------------------


------------------------------------------
| time/                   |              |
|    fps                  | 60           |
|    iterations           | 5            |
|    time_elapsed         | 169          |
|    total_timesteps      | 10240        |
| train/                  |              |
|    approx_kl            | 9.014082e-05 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.41        |
|    explained_variance   | 0            |
|    learning_rate        | 0.0003       |
|    loss                 | 4.15e+07     |
|    n_updates            | 40           |
|    policy_gradient_loss | -2.94e-05    |
|    std                  | 0.989        |
|    value_loss           | 8.39e+07     |
------------------------------------------


-------------------------------------------
| time/                   |               |
|    fps                  | 56            |
|    iterations           | 6             |
|    time_elapsed         | 217           |
|    total_timesteps      | 12288         |
| train/                  |               |
|    approx_kl            | 2.4603825e-05 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.41         |
|    explained_variance   | 0             |
|    learning_rate        | 0.0003        |
|    loss                 | 7.04e+07      |
|    n_updates            | 50            |
|    policy_gradient_loss | 2.63e-08      |
|    std                  | 0.989         |
|    value_loss           | 1.43e+08      |
-------------------------------------------


-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1.4e+04       |
|    ep_rew_mean          | 7.37e+06      |
| time/                   |               |
|    fps                  | 54            |
|    iterations           | 7             |
|    time_elapsed         | 262           |
|    total_timesteps      | 14336         |
| train/                  |               |
|    approx_kl            | 2.7257222e-05 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.41         |
|    explained_variance   | 0             |
|    learning_rate        | 0.0003        |
|    loss                 | 1.06e+08      |
|    n_updates            | 60            |
|    policy_gradient_loss | -3.37e-05     |
|    std                  | 0.991         |
|    value_loss           | 2.07e+08      |
-------------------------------------------


-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1.4e+04       |
|    ep_rew_mean          | 7.37e+06      |
| time/                   |               |
|    fps                  | 56            |
|    iterations           | 8             |
|    time_elapsed         | 287           |
|    total_timesteps      | 16384         |
| train/                  |               |
|    approx_kl            | 0.00016736984 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.41         |
|    explained_variance   | -1.19e-07     |
|    learning_rate        | 0.0003        |
|    loss                 | 1.35e+08      |
|    n_updates            | 70            |
|    policy_gradient_loss | -0.000124     |
|    std                  | 0.99          |
|    value_loss           | 2.44e+08      |
-------------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.4e+04     |
|    ep_rew_mean          | 7.37e+06    |
| time/                   |             |
|    fps                  | 58          |
|    iterations           | 9           |
|    time_elapsed         | 317         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.003242892 |
|    clip_fraction        | 0.0199      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.41       |
|    explained_variance   | 0           |
|    learning_rate        | 0.0003      |
|    loss                 | 1.3e+06     |
|    n_updates            | 80          |
|    policy_gradient_loss | -0.00287    |
|    std                  | 0.992       |
|    value_loss           | 2.45e+06    |
-----------------------------------------


-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1.4e+04       |
|    ep_rew_mean          | 7.37e+06      |
| time/                   |               |
|    fps                  | 58            |
|    iterations           | 10            |
|    time_elapsed         | 352           |
|    total_timesteps      | 20480         |
| train/                  |               |
|    approx_kl            | 0.00061232987 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.41         |
|    explained_variance   | 5.96e-08      |
|    learning_rate        | 0.0003        |
|    loss                 | 5.69e+06      |
|    n_updates            | 90            |
|    policy_gradient_loss | -5.8e-05      |
|    std                  | 0.994         |
|    value_loss           | 1.28e+07      |
-------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.4e+04      |
|    ep_rew_mean          | 7.37e+06     |
| time/                   |              |
|    fps                  | 57           |
|    iterations           | 11           |
|    time_elapsed         | 392          |
|    total_timesteps      | 22528        |
| train/                  |              |
|    approx_kl            | 0.0010316729 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.41        |
|    explained_variance   | 1.19e-07     |
|    learning_rate        | 0.0003       |
|    loss                 | 2.35e+07     |
|    n_updates            | 100          |
|    policy_gradient_loss | -0.000367    |
|    std                  | 0.993        |
|    value_loss           | 4.51e+07     |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.4e+04      |
|    ep_rew_mean          | 7.37e+06     |
| time/                   |              |
|    fps                  | 56           |
|    iterations           | 12           |
|    time_elapsed         | 437          |
|    total_timesteps      | 24576        |
| train/                  |              |
|    approx_kl            | 0.0008587333 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.41        |
|    explained_variance   | 0            |
|    learning_rate        | 0.0003       |
|    loss                 | 5.69e+07     |
|    n_updates            | 110          |
|    policy_gradient_loss | -0.000469    |
|    std                  | 0.992        |
|    value_loss           | 1.13e+08     |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.4e+04      |
|    ep_rew_mean          | 7.37e+06     |
| time/                   |              |
|    fps                  | 54           |
|    iterations           | 13           |
|    time_elapsed         | 486          |
|    total_timesteps      | 26624        |
| train/                  |              |
|    approx_kl            | 5.992246e-06 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.41        |
|    explained_variance   | -1.19e-07    |
|    learning_rate        | 0.0003       |
|    loss                 | 1.1e+08      |
|    n_updates            | 120          |
|    policy_gradient_loss | 3.59e-06     |
|    std                  | 0.992        |
|    value_loss           | 2.2e+08      |
------------------------------------------


-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1.39e+04      |
|    ep_rew_mean          | 7.83e+06      |
| time/                   |               |
|    fps                  | 54            |
|    iterations           | 14            |
|    time_elapsed         | 524           |
|    total_timesteps      | 28672         |
| train/                  |               |
|    approx_kl            | 5.9768994e-05 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.41         |
|    explained_variance   | 0             |
|    learning_rate        | 0.0003        |
|    loss                 | 1.79e+08      |
|    n_updates            | 130           |
|    policy_gradient_loss | -6.31e-05     |
|    std                  | 0.992         |
|    value_loss           | 3.68e+08      |
-------------------------------------------


-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1.39e+04      |
|    ep_rew_mean          | 7.83e+06      |
| time/                   |               |
|    fps                  | 55            |
|    iterations           | 15            |
|    time_elapsed         | 549           |
|    total_timesteps      | 30720         |
| train/                  |               |
|    approx_kl            | 0.00026363906 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.41         |
|    explained_variance   | 0             |
|    learning_rate        | 0.0003        |
|    loss                 | 1.26e+08      |
|    n_updates            | 140           |
|    policy_gradient_loss | -0.00015      |
|    std                  | 0.992         |
|    value_loss           | 2.47e+08      |
-------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.39e+04     |
|    ep_rew_mean          | 7.83e+06     |
| time/                   |              |
|    fps                  | 56           |
|    iterations           | 16           |
|    time_elapsed         | 580          |
|    total_timesteps      | 32768        |
| train/                  |              |
|    approx_kl            | 0.0036125574 |
|    clip_fraction        | 0.0168       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.41        |
|    explained_variance   | -2.38e-07    |
|    learning_rate        | 0.0003       |
|    loss                 | 2.12e+06     |
|    n_updates            | 150          |
|    policy_gradient_loss | -0.00197     |
|    std                  | 0.99         |
|    value_loss           | 4.24e+06     |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.39e+04     |
|    ep_rew_mean          | 7.83e+06     |
| time/                   |              |
|    fps                  | 56           |
|    iterations           | 17           |
|    time_elapsed         | 616          |
|    total_timesteps      | 34816        |
| train/                  |              |
|    approx_kl            | 0.0031752014 |
|    clip_fraction        | 0.00269      |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.41        |
|    explained_variance   | -1.19e-07    |
|    learning_rate        | 0.0003       |
|    loss                 | 8.2e+06      |
|    n_updates            | 160          |
|    policy_gradient_loss | -0.00067     |
|    std                  | 0.989        |
|    value_loss           | 1.69e+07     |
------------------------------------------


-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1.39e+04      |
|    ep_rew_mean          | 7.83e+06      |
| time/                   |               |
|    fps                  | 56            |
|    iterations           | 18            |
|    time_elapsed         | 657           |
|    total_timesteps      | 36864         |
| train/                  |               |
|    approx_kl            | 0.00027860855 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.4          |
|    explained_variance   | 0             |
|    learning_rate        | 0.0003        |
|    loss                 | 2.05e+07      |
|    n_updates            | 170           |
|    policy_gradient_loss | -0.000602     |
|    std                  | 0.98          |
|    value_loss           | 4.15e+07      |
-------------------------------------------


-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1.39e+04      |
|    ep_rew_mean          | 7.83e+06      |
| time/                   |               |
|    fps                  | 55            |
|    iterations           | 19            |
|    time_elapsed         | 702           |
|    total_timesteps      | 38912         |
| train/                  |               |
|    approx_kl            | 0.00017853561 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.4          |
|    explained_variance   | -3.58e-07     |
|    learning_rate        | 0.0003        |
|    loss                 | 3.66e+07      |
|    n_updates            | 180           |
|    policy_gradient_loss | -5.02e-05     |
|    std                  | 0.981         |
|    value_loss           | 7.49e+07      |
-------------------------------------------


-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1.39e+04      |
|    ep_rew_mean          | 7.83e+06      |
| time/                   |               |
|    fps                  | 54            |
|    iterations           | 20            |
|    time_elapsed         | 752           |
|    total_timesteps      | 40960         |
| train/                  |               |
|    approx_kl            | 0.00037057232 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.4          |
|    explained_variance   | 0             |
|    learning_rate        | 0.0003        |
|    loss                 | 6.15e+07      |
|    n_updates            | 190           |
|    policy_gradient_loss | -0.000192     |
|    std                  | 0.98          |
|    value_loss           | 1.23e+08      |
-------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.4e+04      |
|    ep_rew_mean          | 7.43e+06     |
| time/                   |              |
|    fps                  | 54           |
|    iterations           | 21           |
|    time_elapsed         | 787          |
|    total_timesteps      | 43008        |
| train/                  |              |
|    approx_kl            | 7.657331e-05 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.4         |
|    explained_variance   | 0            |
|    learning_rate        | 0.0003       |
|    loss                 | 1.01e+08     |
|    n_updates            | 200          |
|    policy_gradient_loss | -5.73e-05    |
|    std                  | 0.981        |
|    value_loss           | 2.03e+08     |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.4e+04      |
|    ep_rew_mean          | 7.43e+06     |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 22           |
|    time_elapsed         | 813          |
|    total_timesteps      | 45056        |
| train/                  |              |
|    approx_kl            | 0.0012986539 |
|    clip_fraction        | 9.77e-05     |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.4         |
|    explained_variance   | 0            |
|    learning_rate        | 0.0003       |
|    loss                 | 6.35e+07     |
|    n_updates            | 210          |
|    policy_gradient_loss | -0.000576    |
|    std                  | 0.985        |
|    value_loss           | 1.21e+08     |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.4e+04      |
|    ep_rew_mean          | 7.43e+06     |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 23           |
|    time_elapsed         | 845          |
|    total_timesteps      | 47104        |
| train/                  |              |
|    approx_kl            | 0.0030226323 |
|    clip_fraction        | 0.0129       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.4         |
|    explained_variance   | -2.38e-07    |
|    learning_rate        | 0.0003       |
|    loss                 | 1.7e+06      |
|    n_updates            | 220          |
|    policy_gradient_loss | -0.0019      |
|    std                  | 0.983        |
|    value_loss           | 3.5e+06      |
------------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.4e+04     |
|    ep_rew_mean          | 7.43e+06    |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 24          |
|    time_elapsed         | 881         |
|    total_timesteps      | 49152       |
| train/                  |             |
|    approx_kl            | 0.004120515 |
|    clip_fraction        | 0.012       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.4        |
|    explained_variance   | 0           |
|    learning_rate        | 0.0003      |
|    loss                 | 7.96e+06    |
|    n_updates            | 230         |
|    policy_gradient_loss | -0.00167    |
|    std                  | 0.982       |
|    value_loss           | 1.5e+07     |
-----------------------------------------


--------------------------------------------
| rollout/                |                |
|    ep_len_mean          | 1.4e+04        |
|    ep_rew_mean          | 7.43e+06       |
| time/                   |                |
|    fps                  | 55             |
|    iterations           | 25             |
|    time_elapsed         | 921            |
|    total_timesteps      | 51200          |
| train/                  |                |
|    approx_kl            | 0.000109519286 |
|    clip_fraction        | 0              |
|    clip_range           | 0.2            |
|    entropy_loss         | -1.4           |
|    explained_variance   | 0              |
|    learning_rate        | 0.0003         |
|    loss                 | 1.85e+07       |
|    n_updates            | 240            |
|    policy_gradient_loss | 2.4e-05        |
|    std                  | 0.982          |
|    value_loss           | 3.61e+07       |
--------------------------------------------


-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1.4e+04       |
|    ep_rew_mean          | 7.43e+06      |
| time/                   |               |
|    fps                  | 55            |
|    iterations           | 26            |
|    time_elapsed         | 967           |
|    total_timesteps      | 53248         |
| train/                  |               |
|    approx_kl            | 0.00011007214 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.4          |
|    explained_variance   | 0             |
|    learning_rate        | 0.0003        |
|    loss                 | 3.24e+07      |
|    n_updates            | 250           |
|    policy_gradient_loss | -4.33e-05     |
|    std                  | 0.984         |
|    value_loss           | 6.54e+07      |
-------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.4e+04      |
|    ep_rew_mean          | 7.43e+06     |
| time/                   |              |
|    fps                  | 54           |
|    iterations           | 27           |
|    time_elapsed         | 1016         |
|    total_timesteps      | 55296        |
| train/                  |              |
|    approx_kl            | 0.0041307304 |
|    clip_fraction        | 0.00752      |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.4         |
|    explained_variance   | -1.19e-07    |
|    learning_rate        | 0.0003       |
|    loss                 | 5.12e+07     |
|    n_updates            | 260          |
|    policy_gradient_loss | -0.00297     |
|    std                  | 0.982        |
|    value_loss           | 1.03e+08     |
------------------------------------------


-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1.39e+04      |
|    ep_rew_mean          | 6.95e+06      |
| time/                   |               |
|    fps                  | 54            |
|    iterations           | 28            |
|    time_elapsed         | 1043          |
|    total_timesteps      | 57344         |
| train/                  |               |
|    approx_kl            | 8.1016595e-05 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.4          |
|    explained_variance   | -1.19e-07     |
|    learning_rate        | 0.0003        |
|    loss                 | 7.78e+07      |
|    n_updates            | 270           |
|    policy_gradient_loss | -5.19e-05     |
|    std                  | 0.98          |
|    value_loss           | 1.57e+08      |
-------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.39e+04     |
|    ep_rew_mean          | 6.95e+06     |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 29           |
|    time_elapsed         | 1070         |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0021172455 |
|    clip_fraction        | 9.77e-05     |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.4         |
|    explained_variance   | 0            |
|    learning_rate        | 0.0003       |
|    loss                 | 1.62e+07     |
|    n_updates            | 280          |
|    policy_gradient_loss | -0.000279    |
|    std                  | 0.979        |
|    value_loss           | 2.92e+07     |
------------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.39e+04    |
|    ep_rew_mean          | 6.95e+06    |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 30          |
|    time_elapsed         | 1103        |
|    total_timesteps      | 61440       |
| train/                  |             |
|    approx_kl            | 0.004090028 |
|    clip_fraction        | 0.0142      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.4        |
|    explained_variance   | -2.38e-07   |
|    learning_rate        | 0.0003      |
|    loss                 | 9.65e+06    |
|    n_updates            | 290         |
|    policy_gradient_loss | -0.00214    |
|    std                  | 0.978       |
|    value_loss           | 2.05e+07    |
-----------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.39e+04     |
|    ep_rew_mean          | 6.95e+06     |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 31           |
|    time_elapsed         | 1139         |
|    total_timesteps      | 63488        |
| train/                  |              |
|    approx_kl            | 0.0012522176 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.4         |
|    explained_variance   | 4.17e-07     |
|    learning_rate        | 0.0003       |
|    loss                 | 3.4e+07      |
|    n_updates            | 300          |
|    policy_gradient_loss | -0.000599    |
|    std                  | 0.976        |
|    value_loss           | 6.43e+07     |
------------------------------------------


-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1.39e+04      |
|    ep_rew_mean          | 6.95e+06      |
| time/                   |               |
|    fps                  | 55            |
|    iterations           | 32            |
|    time_elapsed         | 1179          |
|    total_timesteps      | 65536         |
| train/                  |               |
|    approx_kl            | 0.00035089997 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.39         |
|    explained_variance   | 1.19e-07      |
|    learning_rate        | 0.0003        |
|    loss                 | 6.43e+07      |
|    n_updates            | 310           |
|    policy_gradient_loss | -0.000184     |
|    std                  | 0.975         |
|    value_loss           | 1.31e+08      |
-------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.39e+04     |
|    ep_rew_mean          | 6.95e+06     |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 33           |
|    time_elapsed         | 1225         |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0005186721 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.39        |
|    explained_variance   | 5.96e-08     |
|    learning_rate        | 0.0003       |
|    loss                 | 1.14e+08     |
|    n_updates            | 320          |
|    policy_gradient_loss | -0.000421    |
|    std                  | 0.974        |
|    value_loss           | 2.26e+08     |
------------------------------------------


--------------------------------------------
| rollout/                |                |
|    ep_len_mean          | 1.38e+04       |
|    ep_rew_mean          | 7.33e+06       |
| time/                   |                |
|    fps                  | 55             |
|    iterations           | 34             |
|    time_elapsed         | 1262           |
|    total_timesteps      | 69632          |
| train/                  |                |
|    approx_kl            | 0.000107144646 |
|    clip_fraction        | 0              |
|    clip_range           | 0.2            |
|    entropy_loss         | -1.39          |
|    explained_variance   | -1.19e-07      |
|    learning_rate        | 0.0003         |
|    loss                 | 1.72e+08       |
|    n_updates            | 330            |
|    policy_gradient_loss | -9.79e-05      |
|    std                  | 0.974          |
|    value_loss           | 3.54e+08       |
--------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.38e+04     |
|    ep_rew_mean          | 7.33e+06     |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 35           |
|    time_elapsed         | 1285         |
|    total_timesteps      | 71680        |
| train/                  |              |
|    approx_kl            | 4.629555e-05 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.39        |
|    explained_variance   | 0            |
|    learning_rate        | 0.0003       |
|    loss                 | 1.79e+08     |
|    n_updates            | 340          |
|    policy_gradient_loss | 3.49e-06     |
|    std                  | 0.974        |
|    value_loss           | 2.95e+08     |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.38e+04     |
|    ep_rew_mean          | 7.33e+06     |
| time/                   |              |
|    fps                  | 56           |
|    iterations           | 36           |
|    time_elapsed         | 1313         |
|    total_timesteps      | 73728        |
| train/                  |              |
|    approx_kl            | 0.0021784003 |
|    clip_fraction        | 0.000977     |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.39        |
|    explained_variance   | 0            |
|    learning_rate        | 0.0003       |
|    loss                 | 1.91e+06     |
|    n_updates            | 350          |
|    policy_gradient_loss | -0.000113    |
|    std                  | 0.974        |
|    value_loss           | 3.7e+06      |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.38e+04     |
|    ep_rew_mean          | 7.33e+06     |
| time/                   |              |
|    fps                  | 56           |
|    iterations           | 37           |
|    time_elapsed         | 1347         |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0010317068 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.39        |
|    explained_variance   | -2.38e-07    |
|    learning_rate        | 0.0003       |
|    loss                 | 8.74e+06     |
|    n_updates            | 360          |
|    policy_gradient_loss | -0.00028     |
|    std                  | 0.971        |
|    value_loss           | 1.75e+07     |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.38e+04     |
|    ep_rew_mean          | 7.33e+06     |
| time/                   |              |
|    fps                  | 56           |
|    iterations           | 38           |
|    time_elapsed         | 1385         |
|    total_timesteps      | 77824        |
| train/                  |              |
|    approx_kl            | 9.753916e-05 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.39        |
|    explained_variance   | 2.98e-07     |
|    learning_rate        | 0.0003       |
|    loss                 | 3.15e+07     |
|    n_updates            | 370          |
|    policy_gradient_loss | -2.63e-05    |
|    std                  | 0.973        |
|    value_loss           | 6.45e+07     |
------------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.38e+04    |
|    ep_rew_mean          | 7.33e+06    |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 39          |
|    time_elapsed         | 1428        |
|    total_timesteps      | 79872       |
| train/                  |             |
|    approx_kl            | 0.000716207 |
|    clip_fraction        | 0           |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.39       |
|    explained_variance   | 0           |
|    learning_rate        | 0.0003      |
|    loss                 | 7.23e+07    |
|    n_updates            | 380         |
|    policy_gradient_loss | -0.00048    |
|    std                  | 0.972       |
|    value_loss           | 1.46e+08    |
-----------------------------------------


-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1.38e+04      |
|    ep_rew_mean          | 7.33e+06      |
| time/                   |               |
|    fps                  | 55            |
|    iterations           | 40            |
|    time_elapsed         | 1476          |
|    total_timesteps      | 81920         |
| train/                  |               |
|    approx_kl            | 2.7327158e-05 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.39         |
|    explained_variance   | 0             |
|    learning_rate        | 0.0003        |
|    loss                 | 1.24e+08      |
|    n_updates            | 390           |
|    policy_gradient_loss | -0.000139     |
|    std                  | 0.975         |
|    value_loss           | 2.46e+08      |
-------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.38e+04     |
|    ep_rew_mean          | 7.57e+06     |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 41           |
|    time_elapsed         | 1510         |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 7.679968e-05 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.39        |
|    explained_variance   | 0            |
|    learning_rate        | 0.0003       |
|    loss                 | 1.83e+08     |
|    n_updates            | 400          |
|    policy_gradient_loss | -6.18e-05    |
|    std                  | 0.975        |
|    value_loss           | 3.68e+08     |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.38e+04     |
|    ep_rew_mean          | 7.57e+06     |
| time/                   |              |
|    fps                  | 56           |
|    iterations           | 42           |
|    time_elapsed         | 1534         |
|    total_timesteps      | 86016        |
| train/                  |              |
|    approx_kl            | 0.0009885672 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.39        |
|    explained_variance   | -1.19e-07    |
|    learning_rate        | 0.0003       |
|    loss                 | 1.18e+08     |
|    n_updates            | 410          |
|    policy_gradient_loss | -0.000527    |
|    std                  | 0.976        |
|    value_loss           | 2.12e+08     |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.38e+04     |
|    ep_rew_mean          | 7.57e+06     |
| time/                   |              |
|    fps                  | 56           |
|    iterations           | 43           |
|    time_elapsed         | 1563         |
|    total_timesteps      | 88064        |
| train/                  |              |
|    approx_kl            | 0.0012295481 |
|    clip_fraction        | 0.00439      |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.4         |
|    explained_variance   | 0            |
|    learning_rate        | 0.0003       |
|    loss                 | 2.57e+06     |
|    n_updates            | 420          |
|    policy_gradient_loss | -0.000686    |
|    std                  | 0.986        |
|    value_loss           | 5.07e+06     |
------------------------------------------


-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1.38e+04      |
|    ep_rew_mean          | 7.57e+06      |
| time/                   |               |
|    fps                  | 56            |
|    iterations           | 44            |
|    time_elapsed         | 1597          |
|    total_timesteps      | 90112         |
| train/                  |               |
|    approx_kl            | 0.00038726596 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.4          |
|    explained_variance   | -2.38e-07     |
|    learning_rate        | 0.0003        |
|    loss                 | 9.74e+06      |
|    n_updates            | 430           |
|    policy_gradient_loss | -3.48e-05     |
|    std                  | 0.985         |
|    value_loss           | 1.98e+07      |
-------------------------------------------


-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1.38e+04      |
|    ep_rew_mean          | 7.57e+06      |
| time/                   |               |
|    fps                  | 56            |
|    iterations           | 45            |
|    time_elapsed         | 1636          |
|    total_timesteps      | 92160         |
| train/                  |               |
|    approx_kl            | 0.00014925696 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.4          |
|    explained_variance   | 2.98e-07      |
|    learning_rate        | 0.0003        |
|    loss                 | 3.14e+07      |
|    n_updates            | 440           |
|    policy_gradient_loss | -7.4e-05      |
|    std                  | 0.986         |
|    value_loss           | 6.03e+07      |
-------------------------------------------


-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1.38e+04      |
|    ep_rew_mean          | 7.57e+06      |
| time/                   |               |
|    fps                  | 56            |
|    iterations           | 46            |
|    time_elapsed         | 1680          |
|    total_timesteps      | 94208         |
| train/                  |               |
|    approx_kl            | 0.00062993076 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.41         |
|    explained_variance   | 5.96e-08      |
|    learning_rate        | 0.0003        |
|    loss                 | 5.55e+07      |
|    n_updates            | 450           |
|    policy_gradient_loss | -0.000352     |
|    std                  | 0.986         |
|    value_loss           | 1.1e+08       |
-------------------------------------------


-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1.38e+04      |
|    ep_rew_mean          | 7.57e+06      |
| time/                   |               |
|    fps                  | 55            |
|    iterations           | 47            |
|    time_elapsed         | 1728          |
|    total_timesteps      | 96256         |
| train/                  |               |
|    approx_kl            | 0.00018839748 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.4          |
|    explained_variance   | 0             |
|    learning_rate        | 0.0003        |
|    loss                 | 8.83e+07      |
|    n_updates            | 460           |
|    policy_gradient_loss | -0.000143     |
|    std                  | 0.985         |
|    value_loss           | 1.76e+08      |
-------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.38e+04     |
|    ep_rew_mean          | 7.55e+06     |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 48           |
|    time_elapsed         | 1758         |
|    total_timesteps      | 98304        |
| train/                  |              |
|    approx_kl            | 0.0011467711 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.4         |
|    explained_variance   | -1.19e-07    |
|    learning_rate        | 0.0003       |
|    loss                 | 1.36e+08     |
|    n_updates            | 470          |
|    policy_gradient_loss | -0.00105     |
|    std                  | 0.984        |
|    value_loss           | 2.72e+08     |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.38e+04     |
|    ep_rew_mean          | 7.55e+06     |
| time/                   |              |
|    fps                  | 56           |
|    iterations           | 49           |
|    time_elapsed         | 1783         |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0011077211 |
|    clip_fraction        | 9.77e-05     |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.4         |
|    explained_variance   | 0            |
|    learning_rate        | 0.0003       |
|    loss                 | 4.94e+07     |
|    n_updates            | 480          |
|    policy_gradient_loss | -0.000471    |
|    std                  | 0.981        |
|    value_loss           | 9.96e+07     |
------------------------------------------


<stable_baselines3.ppo.ppo.PPO at 0x7f104c7dbbb0>

Test the trained agent on the training environment

In [None]:
# Roll out the trained policy deterministically on the training environment,
# logging and rendering every step.
n_episodes = 1
for episode_idx in range(n_episodes):
    # Reset at the start of every episode so that raising n_episodes never
    # keeps stepping an episode that has already finished.
    obs, info = env_train.reset()
    done = False
    while not done:
        action, _states = rl_model.predict(obs, deterministic=True)
        # step() returns a 5-tuple; in the Gymnasium API the third and fourth
        # values are the `terminated` and `truncated` flags.
        obs, reward, terminated, truncated, info = env_train.step(action)
        # An episode is over when either flag is set; checking only the first
        # one would keep looping after a truncation.
        done = terminated or truncated
        print_log(f"Step: {env_train.episode.get_current_step()}, Action: {action}, Reward: {reward}")
        env_train.render()

[2025-07-12 16:08:33:089] [SmartMeterWorld] Resetting environment with a new episode. Episode info: {'length': 13636, 'datetime_range': (Timestamp('2013-12-03 00:00:02'), Timestamp('2013-12-03 23:59:58'))}
[2025-07-12 16:08:33:095] Step: 1, Action: [-0.11176914], Reward: 0.3195078111757835
[2025-07-12 16:08:33:107] Step: 2, Action: [-0.11176719], Reward: 0.3444152867542605
[2025-07-12 16:08:33:117] Step: 3, Action: [-0.11176597], Reward: 0.20051824648367367
[2025-07-12 16:08:33:126] Step: 4, Action: [-0.11176914], Reward: 0.1035867738970452
[2025-07-12 16:08:33:133] Step: 5, Action: [-0.11177026], Reward: 0.05550562221085032
[2025-07-12 16:08:33:139] Step: 6, Action: [-0.11176758], Reward: 0.04985590043287724
[2025-07-12 16:08:33:145] Step: 7, Action: [-0.11177026], Reward: 0.05582076873700652
[2025-07-12 16:08:33:154] Step: 8, Action: [-0.11176875], Reward: 0.07269255363021294
[2025-07-12 16:08:33:160] Step: 9, Action: [-0.11176836], Reward: 0.0971282958035469
[2025-07-12 16:08:33:166

In [None]:
# Display the DataFrame of the episode currently held by the training environment.
env_train.episode.df

Unnamed: 0,timestamp,aggregate,datetime,grid_load,battery_soc
0,1.357603e+09,234.0,2013-01-08 00:00:05,,
1,1.357603e+09,231.0,2013-01-08 00:00:11,,
2,1.357603e+09,234.0,2013-01-08 00:00:17,,
3,1.357603e+09,232.0,2013-01-08 00:00:23,,
4,1.357603e+09,232.0,2013-01-08 00:00:30,,
...,...,...,...,...,...
13375,1.357690e+09,178.0,2013-01-08 23:59:33,,
13376,1.357690e+09,177.0,2013-01-08 23:59:39,,
13377,1.357690e+09,178.0,2013-01-08 23:59:46,,
13378,1.357690e+09,177.0,2013-01-08 23:59:52,,


In [None]:
# Save the graph of the training-environment rollout into the timestamped
# directory of this training run.
graph_train_path = Path("rl_model", "PPO", f"{rl_datetime.strftime('%Y%m%d_%H%M%S')}", "graph_train.png")

# This cell runs before the model itself is saved, so the run directory may not
# exist yet; create it up front (no-op when it is already there).
graph_train_path.parent.mkdir(parents=True, exist_ok=True)

env_train.save_graph(str(graph_train_path))

In [None]:
# Close the training environment now that its rollout has been inspected and saved.
env_train.close()

[2025-07-12 04:44:05:356] [SmartMeterWorld] Environment closed.


In [30]:
# Persist the trained agent as rl_model.zip inside the timestamped run directory.
run_stamp = rl_datetime.strftime('%Y%m%d_%H%M%S')
rl_model_path = Path("rl_model") / "PPO" / run_stamp / "rl_model.zip"
rl_model.save(rl_model_path)

---

In [34]:
# load the model & environment
import sys

# Put the rl_env/ directory on the import path. The membership check keeps this
# cell idempotent, so re-running it does not keep appending duplicate entries.
rl_env_dir = str(Path('rl_env'))
if rl_env_dir not in sys.path:
    sys.path.append(rl_env_dir)

from rl_env.hrl_env import SmartMeterWorld
from stable_baselines3 import PPO

# Evaluation environment built from sm_dl_test (presumably the test-split data
# loader -- confirm against the cell that defines it), rendered in "human" mode.
env_test = SmartMeterWorld(
    sm_dl_test,
    render_mode="human",
)

# Attach the same h-network and its standard scaler objects to the test environment.
env_test.set_h_network(h_network)
env_test.set_h_network_stdscaler(h_network_stdscaler)

# Timestamp naming the run directory of the saved model to evaluate; change
# this single value to evaluate a different run.
rl_run_datetime = datetime(2025, 7, 12, 14, 58, 9)
rl_model_path = Path("rl_model", "PPO", f"{rl_run_datetime.strftime('%Y%m%d_%H%M%S')}", "rl_model.zip")
rl_model_loaded = PPO.load(rl_model_path, env=env_test)

[2025-07-12 17:01:03:406] [SmartMeterWorld] Render mode set to 'human'. Render server at 127.0.0.1:50007. render_connected: True. render_client_socket: <socket.socket fd=94, family=AddressFamily.AF_INET, type=SocketKind.SOCK_STREAM, proto=0, laddr=('127.0.0.1', 55314), raddr=('127.0.0.1', 50007)>
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [35]:
# Roll out the loaded policy deterministically on the test environment,
# logging and rendering every step.
n_episodes = 1
for episode_idx in range(n_episodes):
    # Reset at the start of every episode so that raising n_episodes never
    # keeps stepping an episode that has already finished.
    obs, info = env_test.reset()
    done = False
    while not done:
        action, _states = rl_model_loaded.predict(obs, deterministic=True)
        # step() returns a 5-tuple; in the Gymnasium API the third and fourth
        # values are the `terminated` and `truncated` flags.
        obs, reward, terminated, truncated, info = env_test.step(action)
        # An episode is over when either flag is set; checking only the first
        # one would keep looping after a truncation.
        done = terminated or truncated
        print_log(f"Step: {env_test.episode.get_current_step()}, Action: {action}, Reward: {reward}")
        env_test.render()

[2025-07-12 17:01:09:418] [SmartMeterWorld] Resetting environment with a new episode. Episode info: {'length': 13838, 'datetime_range': (Timestamp('2013-08-15 00:00:01'), Timestamp('2013-08-15 23:59:59'))}
[2025-07-12 17:01:09:427] Step: 1, Action: [-0.11162357], Reward: 0.3197169332109988
[2025-07-12 17:01:09:436] Step: 2, Action: [-0.11163278], Reward: 0.5596025769556463
[2025-07-12 17:01:09:456] Step: 3, Action: [-0.11163721], Reward: 0.48428668455776075
[2025-07-12 17:01:09:464] Step: 4, Action: [-0.11163279], Reward: 0.34103424595346055
[2025-07-12 17:01:09:470] Step: 5, Action: [-0.11162825], Reward: 0.21234606818104784
[2025-07-12 17:01:09:476] Step: 6, Action: [-0.11163279], Reward: 0.12094011353005966
[2025-07-12 17:01:09:487] Step: 7, Action: [-0.11163279], Reward: 0.06497681485166153
[2025-07-12 17:01:09:495] Step: 8, Action: [-0.1116328], Reward: 0.03361730095612258
[2025-07-12 17:01:09:501] Step: 9, Action: [-0.1116328], Reward: 0.017724320196613668
[2025-07-12 17:01:09:50

In [40]:
# Save the test-rollout graph beside the model file that was actually evaluated.
# rl_model_path is the path the agent was loaded from, so using its parent keeps
# the graph and the model in the same run directory even when rl_datetime refers
# to a different training run or is not defined in this session.
graph_test_path = rl_model_path.parent / "graph_test.png"
env_test.save_graph(str(graph_test_path))

In [41]:
# Close the test environment now that the evaluation rollout and its graph are done.
env_test.close()

[2025-07-12 17:07:43:300] [SmartMeterWorld] Environment closed.
