A template for RL training

In [1]:
import numpy as np
import pandas as pd

from pathlib import Path
from datetime import datetime

from utils import print_log

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# load the created dataset
# Folder holding the pre-split dataset files. The path is relative, so it is
# resolved against the notebook's current working directory. It is passed as
# `split_folder` to load_split_data_from_folder further down.
dataset_folder_path = Path("dataset", "20250707", "split")

In [4]:
# copied from 03_data_split.ipynb

# Helper functions for the new split folder structure
def load_split_data_from_folder(split_folder, split_type='train'):
    """Load the segment list and aggregate DataFrame of one data split.

    Two files are read from ``split_folder``:

    * ``{split_type}_segments.txt`` -- one segment per line, written as
      ``<start> - <end>`` with both ends in ISO-8601 format.
    * ``{split_type}_aggregate_df.pkl`` -- a pickled pandas object.

    Args:
        split_folder: Folder containing the split files (``str`` or ``Path``).
        split_type: File-name prefix of the split to load, e.g. ``'train'``.

    Returns:
        A ``(segments, df)`` tuple. ``segments`` is a list of ``(start, end)``
        ``datetime`` pairs in file order; ``df`` is the unpickled object.

    Raises:
        FileNotFoundError: If either file does not exist.
        ValueError: If a non-blank line is not of the form ``<start> - <end>``
            or a bound is not a valid ISO-8601 datetime.
    """
    # Accept plain strings as well as Path objects.
    split_folder = Path(split_folder)

    segments = []
    with open(split_folder / f'{split_type}_segments.txt', 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at the end of the file)
                # carry no segment and would otherwise break the unpacking.
                continue
            start_str, end_str = line.split(' - ')
            segments.append(
                (datetime.fromisoformat(start_str), datetime.fromisoformat(end_str))
            )

    # NOTE: unpickling can execute arbitrary code; only load files that were
    # produced by a trusted pipeline.
    df = pd.read_pickle(split_folder / f'{split_type}_aggregate_df.pkl')
    return segments, df

def load_signatures_from_split_folder(split_folder, split_type, appliance):
    """Load one appliance's load signatures and selected ranges.

    Files are looked up under
    ``split_folder/load_signature_library/<split_type>/<appliance>/``:

    * ``load_signatures.pkl`` -- pickled pandas object with the signatures.
    * ``selected_ranges.txt`` -- optional; one ``start,end`` integer pair per
      line.

    Args:
        split_folder: Root folder of the split (``str`` or ``Path``).
        split_type: Sub-folder name of the split, e.g. ``'train'``.
        appliance: Sub-folder name of the appliance.

    Returns:
        A ``(signatures_df, ranges)`` tuple. If the signature pickle does not
        exist, an empty ``DataFrame`` and an empty list are returned. If only
        the ranges file is missing, ``ranges`` is an empty list.

    Raises:
        ValueError: If a non-blank line of the ranges file is not two
            comma-separated integers.
    """
    # Both files live in the same directory; build it once.
    library_dir = Path(split_folder) / 'load_signature_library' / split_type / appliance
    sig_path = library_dir / 'load_signatures.pkl'
    ranges_path = library_dir / 'selected_ranges.txt'

    if not sig_path.exists():
        return pd.DataFrame(), []

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    signatures_df = pd.read_pickle(sig_path)

    ranges = []
    if ranges_path.exists():
        with open(ranges_path, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Skip blank lines instead of failing on int('').
                    continue
                start, end = map(int, line.split(','))
                ranges.append((start, end))

    return signatures_df, ranges

In [5]:
# convert datetime objects to timezone-naive datetime objects
def convert_to_naive_datetimes_df(df):
    """Make the ``'datetime'`` column of ``df`` timezone-naive.

    The wall-clock time of every value is kept; only the timezone information
    is dropped (no conversion to UTC or to any other zone takes place).

    Args:
        df: DataFrame with a ``'datetime'`` column.

    Returns:
        The same ``df`` object. The column is replaced in place, so the
        caller's DataFrame is modified as well.

    Raises:
        KeyError: If ``df`` has no ``'datetime'`` column.
    """
    column = df['datetime']

    if isinstance(column.dtype, pd.DatetimeTZDtype):
        # Vectorised path for tz-aware datetime columns: tz_localize(None)
        # removes the timezone while keeping the local wall time, which is
        # what ``x.replace(tzinfo=None)`` does element by element.
        df['datetime'] = column.dt.tz_localize(None)
    else:
        # Object / mixed columns: strip tzinfo from datetime values and leave
        # anything else (strings, numbers, ...) untouched.
        df['datetime'] = column.apply(
            lambda x: x.replace(tzinfo=None) if isinstance(x, datetime) else x
        )

    return df

def convert_to_naive_datetimes(segments):
    """Return a new list of ``(start, end)`` pairs with tzinfo removed.

    Each bound keeps its wall-clock time; only the timezone information is
    dropped. The input sequence itself is not modified.
    """
    naive_segments = []
    for begin, finish in segments:
        naive_pair = (begin.replace(tzinfo=None), finish.replace(tzinfo=None))
        naive_segments.append(naive_pair)
    return naive_segments

In [6]:
# Load the (segments, DataFrame) pair of each split from the dataset folder.
# Note that the validation files use the 'val' prefix on disk while the
# variables are named '*_validation'.
aggregate_load_segments_train, aggregate_load_df_train = load_split_data_from_folder(dataset_folder_path, 'train')
aggregate_load_segments_test, aggregate_load_df_test = load_split_data_from_folder(dataset_folder_path, 'test') 
aggregate_load_segments_validation, aggregate_load_df_validation = load_split_data_from_folder(dataset_folder_path, 'val')

In [7]:
# Drop timezone information from every split so that the segment bounds and
# the 'datetime' column are both timezone-naive. Segments are rebound to new
# lists; the DataFrames are updated in place and rebound to the same objects.
aggregate_load_segments_train = convert_to_naive_datetimes(aggregate_load_segments_train)
aggregate_load_df_train = convert_to_naive_datetimes_df(aggregate_load_df_train)

aggregate_load_segments_test = convert_to_naive_datetimes(aggregate_load_segments_test)
aggregate_load_df_test = convert_to_naive_datetimes_df(aggregate_load_df_test)

aggregate_load_segments_validation = convert_to_naive_datetimes(aggregate_load_segments_validation)
aggregate_load_df_validation = convert_to_naive_datetimes_df(aggregate_load_df_validation)

In [8]:
aggregate_load_df_train

Unnamed: 0,timestamp,aggregate,datetime,washing_machine,dishwasher,fridge,kettle,microwave,toaster,tv,htpc,gas_oven,kitchen_lights
0,1.357603e+09,234.0,2013-01-08 00:00:05,0.0,1.0,0.0,1.0,1.0,0.0,1.0,69.0,,0.0
1,1.357603e+09,231.0,2013-01-08 00:00:11,0.0,1.0,0.0,1.0,1.0,0.0,1.0,70.0,,0.0
2,1.357603e+09,234.0,2013-01-08 00:00:17,0.0,1.0,0.0,1.0,1.0,0.0,1.0,70.0,,0.0
3,1.357603e+09,232.0,2013-01-08 00:00:23,0.0,1.0,0.0,1.0,1.0,0.0,1.0,68.0,,0.0
4,1.357603e+09,232.0,2013-01-08 00:00:30,0.0,1.0,0.0,1.0,1.0,0.0,1.0,70.0,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2231636,1.388448e+09,178.0,2013-12-30 23:59:35,0.0,1.0,0.0,1.0,1.0,0.0,1.0,2.0,3.0,0.0
2231637,1.388448e+09,177.0,2013-12-30 23:59:41,0.0,1.0,0.0,1.0,1.0,0.0,1.0,2.0,3.0,0.0
2231638,1.388448e+09,178.0,2013-12-30 23:59:47,0.0,1.0,0.0,1.0,1.0,0.0,1.0,2.0,3.0,0.0
2231639,1.388448e+09,178.0,2013-12-30 23:59:53,0.0,1.0,0.0,1.0,1.0,0.0,1.0,2.0,3.0,0.0


In [9]:
aggregate_load_segments_train

[(datetime.datetime(2013, 1, 8, 0, 0),
  datetime.datetime(2013, 1, 10, 23, 59, 59, 999999)),
 (datetime.datetime(2013, 2, 27, 0, 0),
  datetime.datetime(2013, 2, 28, 23, 59, 59, 999999)),
 (datetime.datetime(2013, 3, 8, 0, 0),
  datetime.datetime(2013, 3, 10, 23, 59, 59, 999999)),
 (datetime.datetime(2013, 3, 28, 0, 0),
  datetime.datetime(2013, 3, 31, 23, 59, 59, 999999)),
 (datetime.datetime(2013, 3, 22, 0, 0),
  datetime.datetime(2013, 3, 26, 23, 59, 59, 999999)),
 (datetime.datetime(2013, 4, 8, 0, 0),
  datetime.datetime(2013, 4, 10, 23, 59, 59, 999999)),
 (datetime.datetime(2013, 4, 26, 0, 0),
  datetime.datetime(2013, 4, 30, 23, 59, 59, 999999)),
 (datetime.datetime(2013, 4, 1, 0, 0),
  datetime.datetime(2013, 4, 7, 23, 59, 59, 999999)),
 (datetime.datetime(2013, 5, 15, 0, 0),
  datetime.datetime(2013, 5, 16, 23, 59, 59, 999999)),
 (datetime.datetime(2013, 5, 1, 0, 0),
  datetime.datetime(2013, 5, 7, 23, 59, 59, 999999)),
 (datetime.datetime(2013, 5, 25, 0, 0),
  datetime.dateti

In [10]:
from rl_env.env_data_loader import SmartMeterDataLoader

# Wrap the training split (segment list + aggregate DataFrame) in the
# project's data loader, which is later handed to the RL environment.
sm_dl_train = SmartMeterDataLoader(
    aggregate_load_segments=aggregate_load_segments_train,
    aggregate_load_df=aggregate_load_df_train
)

# Number of "divided" segments held by the loader.
# NOTE(review): the sample shown further down spans exactly one calendar day,
# so the loader presumably cuts multi-day segments into days -- TODO confirm
# against SmartMeterDataLoader.
sm_dl_train.get_divided_segments_length()

162

In [11]:
sm_dl_train.divided_segments[7]

array([datetime.datetime(2013, 3, 10, 0, 0),
       datetime.datetime(2013, 3, 10, 23, 59, 59, 999999)], dtype=object)

In [12]:
# sample segment
# Display the aggregate-load rows of one divided segment, selected by index,
# as a sanity check of the loader.

sm_dl_train.get_aggregate_load_segment(13)

Unnamed: 0,timestamp,aggregate,datetime
104747,1.363997e+09,335.0,2013-03-23 00:00:05
104748,1.363997e+09,336.0,2013-03-23 00:00:11
104749,1.363997e+09,333.0,2013-03-23 00:00:17
104750,1.363997e+09,334.0,2013-03-23 00:00:24
104751,1.363997e+09,331.0,2013-03-23 00:00:30
...,...,...,...
118501,1.364083e+09,179.0,2013-03-23 23:59:30
118502,1.364083e+09,171.0,2013-03-23 23:59:37
118503,1.364083e+09,171.0,2013-03-23 23:59:43
118504,1.364083e+09,171.0,2013-03-23 23:59:49


In [13]:
# create dataloader for validation and test sets
# Same construction as sm_dl_train, fed with the validation and test splits.
sm_dl_validation = SmartMeterDataLoader(
    aggregate_load_segments=aggregate_load_segments_validation,
    aggregate_load_df=aggregate_load_df_validation
)

sm_dl_test = SmartMeterDataLoader(
    aggregate_load_segments=aggregate_load_segments_test,
    aggregate_load_df=aggregate_load_df_test
)

(Optional) Load the pre-trained H-network and related components

In the final product, the H-network should be trained along with the DDQL/PPO agent

In [14]:
import torch
from model.H_network.h_network import HNetwork

# Date stamp that identifies which pre-trained H-network artefacts to load;
# the scaler loaded in the following cell reuses the same stamp.
h_network_datetime = datetime(2025, 7, 12)

h_network_path = Path("model_trained", f"h_network_{h_network_datetime.strftime('%Y%m%d')}.pth")

# The constructor arguments must match the architecture the checkpoint was
# trained with, otherwise load_state_dict raises on mismatched shapes.
h_network = HNetwork(2, 44, 1)

# map_location="cpu" lets the checkpoint load on machines without a GPU even
# if it was saved from CUDA tensors. load_state_dict copies the values into the
# model's existing parameters, so the device the model lives on is unchanged.
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted
# source.
h_network.load_state_dict(torch.load(h_network_path, map_location="cpu"))

# Switch to inference mode; the call returns the module, so the cell displays
# its layer structure.
h_network.eval()

HNetwork(
  (LSTM_1): LSTM(2, 44, batch_first=True, bidirectional=True)
  (ac1): Tanh()
  (LSTM_2): LSTM(88, 1, batch_first=True, bidirectional=True)
  (ac2): Tanh()
  (fc): Linear(in_features=2, out_features=1, bias=True)
)

In [15]:
import joblib

# Scaler artefact saved next to the H-network weights; it carries the same
# date stamp as the checkpoint.
# NOTE: joblib.load unpickles the file; only load artefacts from a trusted
# source.
h_network_stdscaler_path = (
    Path("model_trained")
    / f"h_network_standardscaler_{h_network_datetime.strftime('%Y%m%d')}.pkl"
)
h_network_stdscaler = joblib.load(h_network_stdscaler_path)

Create the environment

In [16]:
import sys

# Make the rl_env folder importable as a top-level location.
# NOTE(review): presumably needed because modules inside rl_env import their
# siblings by bare name -- TODO confirm. The path is relative, so it depends on
# the notebook's working directory.
rl_env_dir = str(Path('rl_env'))
if rl_env_dir not in sys.path:
    # Guard against piling up duplicate entries when the cell is re-run.
    sys.path.append(rl_env_dir)

from rl_env.hrl_env import SmartMeterWorld

# Training environment backed by the training data loader. With
# render_mode="human" the environment connects to a render server (see the log
# line printed by this cell).
env_train = SmartMeterWorld(
    smart_meter_data_loader=sm_dl_train,
    render_mode="human",
)

# Attach the pre-trained H-network and its scaler to the environment.
env_train.set_h_network(h_network)
env_train.set_h_network_stdscaler(h_network_stdscaler)

[2025-07-12 18:24:45:225] [SmartMeterWorld] Render mode set to 'human'. Render server at 127.0.0.1:50007. render_connected: True. render_client_socket: <socket.socket fd=69, family=AddressFamily.AF_INET, type=SocketKind.SOCK_STREAM, proto=0, laddr=('127.0.0.1', 36982), raddr=('127.0.0.1', 50007)>


In [17]:
from gymnasium.utils.env_checker import check_env

# This will catch many common issues
# Any exception raised by the checker is printed rather than re-raised, so the
# cells that follow still run when the check fails. Warnings emitted by the
# checker are not exceptions and only appear in the cell output.
try:
    check_env(env_train)
    print("Environment passes all checks!")
except Exception as e:
    print(f"Environment has issues: {e}")

[2025-07-12 18:24:45:265] [SmartMeterWorld] Resetting environment with a new episode. Episode info: {'length': 13780, 'datetime_range': (Timestamp('2013-11-24 00:00:04'), Timestamp('2013-11-24 23:59:58'))}
[2025-07-12 18:24:45:275] [SmartMeterWorld] Resetting environment with a new episode. Episode info: {'length': 13880, 'datetime_range': (Timestamp('2013-08-28 00:00:02'), Timestamp('2013-08-28 23:59:59'))}
[2025-07-12 18:24:45:284] [SmartMeterWorld] Resetting environment with a new episode. Episode info: {'length': 13745, 'datetime_range': (Timestamp('2013-03-28 00:00:05'), Timestamp('2013-03-28 23:59:57'))}
[2025-07-12 18:24:45:295] [SmartMeterWorld] Resetting environment with a new episode. Episode info: {'length': 13880, 'datetime_range': (Timestamp('2013-08-28 00:00:02'), Timestamp('2013-08-28 23:59:59'))}
[2025-07-12 18:24:45:305] [SmartMeterWorld] Resetting environment with a new episode. Episode info: {'length': 13745, 'datetime_range': (Timestamp('2013-03-28 00:00:05'), Times

  logger.warn(
  logger.warn(


In [18]:
# Start a fresh episode. As the cell output shows, reset() returns an
# (observation, info) tuple, so `obs` holds the whole tuple here: the
# observation dict is obs[0] and the (empty) info dict is obs[1].
obs = env_train.reset()
obs

[2025-07-12 18:24:45:406] [SmartMeterWorld] Resetting environment with a new episode. Episode info: {'length': 13745, 'datetime_range': (Timestamp('2013-03-28 00:00:05'), Timestamp('2013-03-28 23:59:57'))}


({'aggregate_load': array([176.], dtype=float32),
  'battery_soc': array([0.22035988], dtype=float32),
  'timestamp_features': array([-0.5 ,  0.  , -0.25], dtype=float32)},
 {})

In [19]:
env_train.reset_render_window()

In [20]:
# initialize a PPO agent
from stable_baselines3 import PPO

# Timestamp of this run; it names the TensorBoard log directory so every run
# writes to its own folder.
rl_datetime = datetime.now()
tensorboard_log_path = Path("rl_model", "PPO", f"{rl_datetime.strftime('%Y%m%d_%H%M%S')}")

# "MultiInputPolicy" is used because the environment returns a dict
# observation (aggregate_load, battery_soc, timestamp_features).
# NOTE(review): no `seed` is passed, so runs are not reproducible.
# NOTE(review): in the logged run value_loss is around 1e6-1e8,
# explained_variance stays near 0 and ep_rew_mean is around 7e6; the reward
# scale looks too large for the value function to fit. Consider reward
# scaling/normalisation -- TODO confirm against the environment's reward
# definition.
rl_model = PPO(
    "MultiInputPolicy", 
    env_train, 
    verbose=2,
    tensorboard_log=tensorboard_log_path
)

# Long-running cell: the logged run takes roughly 15 minutes per 50k timesteps
# at about 55 fps.
rl_model.learn(
    total_timesteps=100000,
    progress_bar=True,
    tb_log_name="PPO_SmartMeterWorld"
)

2025-07-12 18:25:01.557801: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-12 18:25:01.571294: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752341101.581645  513872 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752341101.585092  513872 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1752341101.595501  513872 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
[2025-07-12 18:25:03:156] [SmartMeterWorld] Resetting environment with a new episode. Episode info: {'length': 13955, 'datetime_range': (Timestamp('2013-06-29 00:00:03'), Timestamp('2013-06-29 23:59:55'))}
Logging to rl_model/PPO/20250712_182502/PPO_SmartMeterWorld_1


Output()

-----------------------------
| time/              |      |
|    fps             | 85   |
|    iterations      | 1    |
|    time_elapsed    | 24   |
|    total_timesteps | 2048 |
-----------------------------


------------------------------------------
| time/                   |              |
|    fps                  | 78           |
|    iterations           | 2            |
|    time_elapsed         | 52           |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0033738734 |
|    clip_fraction        | 0.00566      |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.42        |
|    explained_variance   | 1.49e-06     |
|    learning_rate        | 0.0003       |
|    loss                 | 7.19e+05     |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.00047     |
|    std                  | 0.998        |
|    value_loss           | 1.47e+06     |
------------------------------------------


--------------------------------------------
| time/                   |                |
|    fps                  | 69             |
|    iterations           | 3              |
|    time_elapsed         | 88             |
|    total_timesteps      | 6144           |
| train/                  |                |
|    approx_kl            | 0.000117620744 |
|    clip_fraction        | 0              |
|    clip_range           | 0.2            |
|    entropy_loss         | -1.42          |
|    explained_variance   | 5.96e-08       |
|    learning_rate        | 0.0003         |
|    loss                 | 5.04e+06       |
|    n_updates            | 20             |
|    policy_gradient_loss | 2.77e-05       |
|    std                  | 1              |
|    value_loss           | 1.09e+07       |
--------------------------------------------


-----------------------------------------
| time/                   |             |
|    fps                  | 62          |
|    iterations           | 4           |
|    time_elapsed         | 130         |
|    total_timesteps      | 8192        |
| train/                  |             |
|    approx_kl            | 0.003905599 |
|    clip_fraction        | 0.00796     |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.42       |
|    explained_variance   | -2.38e-07   |
|    learning_rate        | 0.0003      |
|    loss                 | 1.91e+07    |
|    n_updates            | 30          |
|    policy_gradient_loss | -0.00226    |
|    std                  | 1           |
|    value_loss           | 3.9e+07     |
-----------------------------------------


------------------------------------------
| time/                   |              |
|    fps                  | 58           |
|    iterations           | 5            |
|    time_elapsed         | 174          |
|    total_timesteps      | 10240        |
| train/                  |              |
|    approx_kl            | 0.0007370424 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.42        |
|    explained_variance   | 2.98e-07     |
|    learning_rate        | 0.0003       |
|    loss                 | 3.95e+07     |
|    n_updates            | 40           |
|    policy_gradient_loss | -0.000437    |
|    std                  | 1            |
|    value_loss           | 7.99e+07     |
------------------------------------------


-------------------------------------------
| time/                   |               |
|    fps                  | 55            |
|    iterations           | 6             |
|    time_elapsed         | 221           |
|    total_timesteps      | 12288         |
| train/                  |               |
|    approx_kl            | 0.00058060477 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.42         |
|    explained_variance   | 5.96e-08      |
|    learning_rate        | 0.0003        |
|    loss                 | 6.24e+07      |
|    n_updates            | 50            |
|    policy_gradient_loss | -0.000404     |
|    std                  | 1             |
|    value_loss           | 1.32e+08      |
-------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.4e+04      |
|    ep_rew_mean          | 7.11e+06     |
| time/                   |              |
|    fps                  | 53           |
|    iterations           | 7            |
|    time_elapsed         | 269          |
|    total_timesteps      | 14336        |
| train/                  |              |
|    approx_kl            | 0.0007666254 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.42        |
|    explained_variance   | -1.19e-07    |
|    learning_rate        | 0.0003       |
|    loss                 | 9.56e+07     |
|    n_updates            | 60           |
|    policy_gradient_loss | -0.000617    |
|    std                  | 1            |
|    value_loss           | 1.9e+08      |
------------------------------------------


-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1.4e+04       |
|    ep_rew_mean          | 7.11e+06      |
| time/                   |               |
|    fps                  | 55            |
|    iterations           | 8             |
|    time_elapsed         | 293           |
|    total_timesteps      | 16384         |
| train/                  |               |
|    approx_kl            | 0.00018795172 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.42         |
|    explained_variance   | 0             |
|    learning_rate        | 0.0003        |
|    loss                 | 1.05e+08      |
|    n_updates            | 70            |
|    policy_gradient_loss | -0.000145     |
|    std                  | 1             |
|    value_loss           | 2.2e+08       |
-------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.4e+04      |
|    ep_rew_mean          | 7.11e+06     |
| time/                   |              |
|    fps                  | 57           |
|    iterations           | 9            |
|    time_elapsed         | 323          |
|    total_timesteps      | 18432        |
| train/                  |              |
|    approx_kl            | 0.0017921743 |
|    clip_fraction        | 0.000635     |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.42        |
|    explained_variance   | 0            |
|    learning_rate        | 0.0003       |
|    loss                 | 1.15e+06     |
|    n_updates            | 80           |
|    policy_gradient_loss | -0.000162    |
|    std                  | 1            |
|    value_loss           | 2.45e+06     |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.4e+04      |
|    ep_rew_mean          | 7.11e+06     |
| time/                   |              |
|    fps                  | 57           |
|    iterations           | 10           |
|    time_elapsed         | 358          |
|    total_timesteps      | 20480        |
| train/                  |              |
|    approx_kl            | 0.0034355486 |
|    clip_fraction        | 0.00527      |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.42        |
|    explained_variance   | -1.19e-07    |
|    learning_rate        | 0.0003       |
|    loss                 | 6.2e+06      |
|    n_updates            | 90           |
|    policy_gradient_loss | -0.00106     |
|    std                  | 1            |
|    value_loss           | 1.28e+07     |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.4e+04      |
|    ep_rew_mean          | 7.11e+06     |
| time/                   |              |
|    fps                  | 56           |
|    iterations           | 11           |
|    time_elapsed         | 398          |
|    total_timesteps      | 22528        |
| train/                  |              |
|    approx_kl            | 0.0017333918 |
|    clip_fraction        | 0.000195     |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.42        |
|    explained_variance   | 0            |
|    learning_rate        | 0.0003       |
|    loss                 | 2.38e+07     |
|    n_updates            | 100          |
|    policy_gradient_loss | -0.000778    |
|    std                  | 1            |
|    value_loss           | 4.77e+07     |
------------------------------------------


-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1.4e+04       |
|    ep_rew_mean          | 7.11e+06      |
| time/                   |               |
|    fps                  | 55            |
|    iterations           | 12            |
|    time_elapsed         | 442           |
|    total_timesteps      | 24576         |
| train/                  |               |
|    approx_kl            | 0.00017077423 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.42         |
|    explained_variance   | 0             |
|    learning_rate        | 0.0003        |
|    loss                 | 5.51e+07      |
|    n_updates            | 110           |
|    policy_gradient_loss | -0.000119     |
|    std                  | 1             |
|    value_loss           | 1.17e+08      |
-------------------------------------------


-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1.4e+04       |
|    ep_rew_mean          | 7.11e+06      |
| time/                   |               |
|    fps                  | 54            |
|    iterations           | 13            |
|    time_elapsed         | 491           |
|    total_timesteps      | 26624         |
| train/                  |               |
|    approx_kl            | 0.00031607194 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.42         |
|    explained_variance   | 5.96e-08      |
|    learning_rate        | 0.0003        |
|    loss                 | 1.14e+08      |
|    n_updates            | 120           |
|    policy_gradient_loss | -0.000266     |
|    std                  | 1             |
|    value_loss           | 2.2e+08       |
-------------------------------------------


-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1.39e+04      |
|    ep_rew_mean          | 7.69e+06      |
| time/                   |               |
|    fps                  | 54            |
|    iterations           | 14            |
|    time_elapsed         | 530           |
|    total_timesteps      | 28672         |
| train/                  |               |
|    approx_kl            | 3.8005295e-05 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.42         |
|    explained_variance   | 1.19e-07      |
|    learning_rate        | 0.0003        |
|    loss                 | 1.76e+08      |
|    n_updates            | 130           |
|    policy_gradient_loss | -1.58e-05     |
|    std                  | 1             |
|    value_loss           | 3.55e+08      |
-------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.39e+04     |
|    ep_rew_mean          | 7.69e+06     |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 15           |
|    time_elapsed         | 555          |
|    total_timesteps      | 30720        |
| train/                  |              |
|    approx_kl            | 0.0001429769 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.42        |
|    explained_variance   | -1.19e-07    |
|    learning_rate        | 0.0003       |
|    loss                 | 1.05e+08     |
|    n_updates            | 140          |
|    policy_gradient_loss | -8.43e-05    |
|    std                  | 1.01         |
|    value_loss           | 2.38e+08     |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.39e+04     |
|    ep_rew_mean          | 7.69e+06     |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 16           |
|    time_elapsed         | 587          |
|    total_timesteps      | 32768        |
| train/                  |              |
|    approx_kl            | 0.0001800489 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.42        |
|    explained_variance   | 1.19e-07     |
|    learning_rate        | 0.0003       |
|    loss                 | 2.74e+06     |
|    n_updates            | 150          |
|    policy_gradient_loss | -0.000293    |
|    std                  | 0.993        |
|    value_loss           | 5.24e+06     |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.39e+04     |
|    ep_rew_mean          | 7.69e+06     |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 17           |
|    time_elapsed         | 625          |
|    total_timesteps      | 34816        |
| train/                  |              |
|    approx_kl            | 0.0008569119 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.41        |
|    explained_variance   | 0            |
|    learning_rate        | 0.0003       |
|    loss                 | 1.05e+07     |
|    n_updates            | 160          |
|    policy_gradient_loss | -0.000245    |
|    std                  | 0.991        |
|    value_loss           | 2.03e+07     |
------------------------------------------


-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1.39e+04      |
|    ep_rew_mean          | 7.69e+06      |
| time/                   |               |
|    fps                  | 55            |
|    iterations           | 18            |
|    time_elapsed         | 667           |
|    total_timesteps      | 36864         |
| train/                  |               |
|    approx_kl            | 0.00042952166 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.41         |
|    explained_variance   | -1.19e-07     |
|    learning_rate        | 0.0003        |
|    loss                 | 2.48e+07      |
|    n_updates            | 170           |
|    policy_gradient_loss | -0.000209     |
|    std                  | 0.987         |
|    value_loss           | 4.86e+07      |
-------------------------------------------


-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1.39e+04      |
|    ep_rew_mean          | 7.69e+06      |
| time/                   |               |
|    fps                  | 54            |
|    iterations           | 19            |
|    time_elapsed         | 713           |
|    total_timesteps      | 38912         |
| train/                  |               |
|    approx_kl            | 0.00044127434 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.41         |
|    explained_variance   | 5.96e-08      |
|    learning_rate        | 0.0003        |
|    loss                 | 4.27e+07      |
|    n_updates            | 180           |
|    policy_gradient_loss | -0.000271     |
|    std                  | 0.989         |
|    value_loss           | 8.88e+07      |
-------------------------------------------


-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1.39e+04      |
|    ep_rew_mean          | 7.69e+06      |
| time/                   |               |
|    fps                  | 53            |
|    iterations           | 20            |
|    time_elapsed         | 763           |
|    total_timesteps      | 40960         |
| train/                  |               |
|    approx_kl            | 0.00059548626 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.41         |
|    explained_variance   | 0             |
|    learning_rate        | 0.0003        |
|    loss                 | 7.37e+07      |
|    n_updates            | 190           |
|    policy_gradient_loss | -0.000428     |
|    std                  | 0.99          |
|    value_loss           | 1.46e+08      |
-------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.4e+04      |
|    ep_rew_mean          | 7.52e+06     |
| time/                   |              |
|    fps                  | 53           |
|    iterations           | 21           |
|    time_elapsed         | 800          |
|    total_timesteps      | 43008        |
| train/                  |              |
|    approx_kl            | 0.0005829751 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.41        |
|    explained_variance   | -4.77e-07    |
|    learning_rate        | 0.0003       |
|    loss                 | 1.15e+08     |
|    n_updates            | 200          |
|    policy_gradient_loss | -0.000519    |
|    std                  | 0.988        |
|    value_loss           | 2.36e+08     |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.4e+04      |
|    ep_rew_mean          | 7.52e+06     |
| time/                   |              |
|    fps                  | 54           |
|    iterations           | 22           |
|    time_elapsed         | 826          |
|    total_timesteps      | 45056        |
| train/                  |              |
|    approx_kl            | 0.0025329872 |
|    clip_fraction        | 0.00146      |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.41        |
|    explained_variance   | 2.38e-07     |
|    learning_rate        | 0.0003       |
|    loss                 | 6.75e+07     |
|    n_updates            | 210          |
|    policy_gradient_loss | -0.00126     |
|    std                  | 0.99         |
|    value_loss           | 1.36e+08     |
------------------------------------------


-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1.4e+04       |
|    ep_rew_mean          | 7.52e+06      |
| time/                   |               |
|    fps                  | 54            |
|    iterations           | 23            |
|    time_elapsed         | 857           |
|    total_timesteps      | 47104         |
| train/                  |               |
|    approx_kl            | 0.00021729703 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.41         |
|    explained_variance   | 0             |
|    learning_rate        | 0.0003        |
|    loss                 | 2.3e+06       |
|    n_updates            | 220           |
|    policy_gradient_loss | 3.88e-05      |
|    std                  | 0.986         |
|    value_loss           | 4.82e+06      |
-------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.4e+04      |
|    ep_rew_mean          | 7.52e+06     |
| time/                   |              |
|    fps                  | 54           |
|    iterations           | 24           |
|    time_elapsed         | 895          |
|    total_timesteps      | 49152        |
| train/                  |              |
|    approx_kl            | 0.0012477294 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.4         |
|    explained_variance   | 2.98e-07     |
|    learning_rate        | 0.0003       |
|    loss                 | 9.53e+06     |
|    n_updates            | 230          |
|    policy_gradient_loss | -0.000291    |
|    std                  | 0.986        |
|    value_loss           | 1.87e+07     |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.4e+04      |
|    ep_rew_mean          | 7.52e+06     |
| time/                   |              |
|    fps                  | 54           |
|    iterations           | 25           |
|    time_elapsed         | 936          |
|    total_timesteps      | 51200        |
| train/                  |              |
|    approx_kl            | 0.0014769395 |
|    clip_fraction        | 9.77e-05     |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.41        |
|    explained_variance   | 0            |
|    learning_rate        | 0.0003       |
|    loss                 | 2.22e+07     |
|    n_updates            | 240          |
|    policy_gradient_loss | -0.000603    |
|    std                  | 0.987        |
|    value_loss           | 4.3e+07      |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.4e+04      |
|    ep_rew_mean          | 7.52e+06     |
| time/                   |              |
|    fps                  | 54           |
|    iterations           | 26           |
|    time_elapsed         | 983          |
|    total_timesteps      | 53248        |
| train/                  |              |
|    approx_kl            | 0.0007532175 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.41        |
|    explained_variance   | -3.58e-07    |
|    learning_rate        | 0.0003       |
|    loss                 | 3.79e+07     |
|    n_updates            | 250          |
|    policy_gradient_loss | -0.000318    |
|    std                  | 0.987        |
|    value_loss           | 7.75e+07     |
------------------------------------------


-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1.4e+04       |
|    ep_rew_mean          | 7.52e+06      |
| time/                   |               |
|    fps                  | 53            |
|    iterations           | 27            |
|    time_elapsed         | 1034          |
|    total_timesteps      | 55296         |
| train/                  |               |
|    approx_kl            | 0.00032076114 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.41         |
|    explained_variance   | 5.96e-08      |
|    learning_rate        | 0.0003        |
|    loss                 | 6.02e+07      |
|    n_updates            | 260           |
|    policy_gradient_loss | -0.000203     |
|    std                  | 0.987         |
|    value_loss           | 1.2e+08       |
-------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.39e+04     |
|    ep_rew_mean          | 7.13e+06     |
| time/                   |              |
|    fps                  | 53           |
|    iterations           | 28           |
|    time_elapsed         | 1062         |
|    total_timesteps      | 57344        |
| train/                  |              |
|    approx_kl            | 0.0014773323 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.41        |
|    explained_variance   | 0            |
|    learning_rate        | 0.0003       |
|    loss                 | 8.82e+07     |
|    n_updates            | 270          |
|    policy_gradient_loss | -0.000958    |
|    std                  | 0.987        |
|    value_loss           | 1.76e+08     |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.39e+04     |
|    ep_rew_mean          | 7.13e+06     |
| time/                   |              |
|    fps                  | 54           |
|    iterations           | 29           |
|    time_elapsed         | 1092         |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0020507686 |
|    clip_fraction        | 0.000977     |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.4         |
|    explained_variance   | 0            |
|    learning_rate        | 0.0003       |
|    loss                 | 1.92e+07     |
|    n_updates            | 280          |
|    policy_gradient_loss | -0.000545    |
|    std                  | 0.978        |
|    value_loss           | 3.18e+07     |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.39e+04     |
|    ep_rew_mean          | 7.13e+06     |
| time/                   |              |
|    fps                  | 54           |
|    iterations           | 30           |
|    time_elapsed         | 1126         |
|    total_timesteps      | 61440        |
| train/                  |              |
|    approx_kl            | 0.0012014109 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.4         |
|    explained_variance   | 0            |
|    learning_rate        | 0.0003       |
|    loss                 | 1e+07        |
|    n_updates            | 290          |
|    policy_gradient_loss | -0.000214    |
|    std                  | 0.976        |
|    value_loss           | 2.07e+07     |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.39e+04     |
|    ep_rew_mean          | 7.13e+06     |
| time/                   |              |
|    fps                  | 54           |
|    iterations           | 31           |
|    time_elapsed         | 1162         |
|    total_timesteps      | 63488        |
| train/                  |              |
|    approx_kl            | 0.0017802758 |
|    clip_fraction        | 0.000391     |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.4         |
|    explained_variance   | 0            |
|    learning_rate        | 0.0003       |
|    loss                 | 3.18e+07     |
|    n_updates            | 300          |
|    policy_gradient_loss | -0.000803    |
|    std                  | 0.977        |
|    value_loss           | 6.37e+07     |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.39e+04     |
|    ep_rew_mean          | 7.13e+06     |
| time/                   |              |
|    fps                  | 54           |
|    iterations           | 32           |
|    time_elapsed         | 1203         |
|    total_timesteps      | 65536        |
| train/                  |              |
|    approx_kl            | 3.207766e-05 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.4         |
|    explained_variance   | 0            |
|    learning_rate        | 0.0003       |
|    loss                 | 6.42e+07     |
|    n_updates            | 310          |
|    policy_gradient_loss | -3.76e-05    |
|    std                  | 0.979        |
|    value_loss           | 1.33e+08     |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.39e+04     |
|    ep_rew_mean          | 7.13e+06     |
| time/                   |              |
|    fps                  | 54           |
|    iterations           | 33           |
|    time_elapsed         | 1248         |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0003599249 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.4         |
|    explained_variance   | 0            |
|    learning_rate        | 0.0003       |
|    loss                 | 1.11e+08     |
|    n_updates            | 320          |
|    policy_gradient_loss | -0.000283    |
|    std                  | 0.98         |
|    value_loss           | 2.25e+08     |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.38e+04     |
|    ep_rew_mean          | 7.47e+06     |
| time/                   |              |
|    fps                  | 54           |
|    iterations           | 34           |
|    time_elapsed         | 1286         |
|    total_timesteps      | 69632        |
| train/                  |              |
|    approx_kl            | 0.0001060146 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.4         |
|    explained_variance   | -2.38e-07    |
|    learning_rate        | 0.0003       |
|    loss                 | 1.76e+08     |
|    n_updates            | 330          |
|    policy_gradient_loss | -0.000125    |
|    std                  | 0.979        |
|    value_loss           | 3.47e+08     |
------------------------------------------


-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1.38e+04      |
|    ep_rew_mean          | 7.47e+06      |
| time/                   |               |
|    fps                  | 54            |
|    iterations           | 35            |
|    time_elapsed         | 1309          |
|    total_timesteps      | 71680         |
| train/                  |               |
|    approx_kl            | 0.00074082066 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.4          |
|    explained_variance   | 5.96e-08      |
|    learning_rate        | 0.0003        |
|    loss                 | 1.32e+08      |
|    n_updates            | 340           |
|    policy_gradient_loss | -0.000542     |
|    std                  | 0.98          |
|    value_loss           | 2.89e+08      |
-------------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.38e+04    |
|    ep_rew_mean          | 7.47e+06    |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 36          |
|    time_elapsed         | 1338        |
|    total_timesteps      | 73728       |
| train/                  |             |
|    approx_kl            | 0.001898983 |
|    clip_fraction        | 0.000977    |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.39       |
|    explained_variance   | 0           |
|    learning_rate        | 0.0003      |
|    loss                 | 2.05e+06    |
|    n_updates            | 350         |
|    policy_gradient_loss | -0.000212   |
|    std                  | 0.973       |
|    value_loss           | 4.01e+06    |
-----------------------------------------


-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1.38e+04      |
|    ep_rew_mean          | 7.47e+06      |
| time/                   |               |
|    fps                  | 55            |
|    iterations           | 37            |
|    time_elapsed         | 1375          |
|    total_timesteps      | 75776         |
| train/                  |               |
|    approx_kl            | 0.00050182285 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.39         |
|    explained_variance   | 0             |
|    learning_rate        | 0.0003        |
|    loss                 | 8.86e+06      |
|    n_updates            | 360           |
|    policy_gradient_loss | -0.000143     |
|    std                  | 0.968         |
|    value_loss           | 1.79e+07      |
-------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.38e+04     |
|    ep_rew_mean          | 7.47e+06     |
| time/                   |              |
|    fps                  | 54           |
|    iterations           | 38           |
|    time_elapsed         | 1419         |
|    total_timesteps      | 77824        |
| train/                  |              |
|    approx_kl            | 0.0028336453 |
|    clip_fraction        | 0.00181      |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.39        |
|    explained_variance   | 0            |
|    learning_rate        | 0.0003       |
|    loss                 | 3.17e+07     |
|    n_updates            | 370          |
|    policy_gradient_loss | -0.00121     |
|    std                  | 0.968        |
|    value_loss           | 6.32e+07     |
------------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.38e+04    |
|    ep_rew_mean          | 7.47e+06    |
| time/                   |             |
|    fps                  | 54          |
|    iterations           | 39          |
|    time_elapsed         | 1466        |
|    total_timesteps      | 79872       |
| train/                  |             |
|    approx_kl            | 5.37085e-06 |
|    clip_fraction        | 0           |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.39       |
|    explained_variance   | 0           |
|    learning_rate        | 0.0003      |
|    loss                 | 7.07e+07    |
|    n_updates            | 380         |
|    policy_gradient_loss | 2.08e-05    |
|    std                  | 0.968       |
|    value_loss           | 1.42e+08    |
-----------------------------------------


-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1.38e+04      |
|    ep_rew_mean          | 7.47e+06      |
| time/                   |               |
|    fps                  | 53            |
|    iterations           | 40            |
|    time_elapsed         | 1520          |
|    total_timesteps      | 81920         |
| train/                  |               |
|    approx_kl            | 0.00017688773 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.39         |
|    explained_variance   | 0             |
|    learning_rate        | 0.0003        |
|    loss                 | 1.26e+08      |
|    n_updates            | 390           |
|    policy_gradient_loss | -0.000236     |
|    std                  | 0.965         |
|    value_loss           | 2.43e+08      |
-------------------------------------------


-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1.38e+04      |
|    ep_rew_mean          | 7.67e+06      |
| time/                   |               |
|    fps                  | 53            |
|    iterations           | 41            |
|    time_elapsed         | 1559          |
|    total_timesteps      | 83968         |
| train/                  |               |
|    approx_kl            | 0.00016190702 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.38         |
|    explained_variance   | -1.19e-07     |
|    learning_rate        | 0.0003        |
|    loss                 | 1.83e+08      |
|    n_updates            | 400           |
|    policy_gradient_loss | -0.000168     |
|    std                  | 0.966         |
|    value_loss           | 3.67e+08      |
-------------------------------------------


-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1.38e+04      |
|    ep_rew_mean          | 7.67e+06      |
| time/                   |               |
|    fps                  | 54            |
|    iterations           | 42            |
|    time_elapsed         | 1587          |
|    total_timesteps      | 86016         |
| train/                  |               |
|    approx_kl            | 0.00030314265 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.38         |
|    explained_variance   | 1.79e-07      |
|    learning_rate        | 0.0003        |
|    loss                 | 1.11e+08      |
|    n_updates            | 410           |
|    policy_gradient_loss | -0.000157     |
|    std                  | 0.965         |
|    value_loss           | 2.09e+08      |
-------------------------------------------


-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1.38e+04      |
|    ep_rew_mean          | 7.67e+06      |
| time/                   |               |
|    fps                  | 54            |
|    iterations           | 43            |
|    time_elapsed         | 1621          |
|    total_timesteps      | 88064         |
| train/                  |               |
|    approx_kl            | 0.00046375598 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.38         |
|    explained_variance   | 0             |
|    learning_rate        | 0.0003        |
|    loss                 | 2.93e+06      |
|    n_updates            | 420           |
|    policy_gradient_loss | -0.000255     |
|    std                  | 0.954         |
|    value_loss           | 5.43e+06      |
-------------------------------------------


-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1.38e+04      |
|    ep_rew_mean          | 7.67e+06      |
| time/                   |               |
|    fps                  | 54            |
|    iterations           | 44            |
|    time_elapsed         | 1661          |
|    total_timesteps      | 90112         |
| train/                  |               |
|    approx_kl            | 0.00041989927 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.37         |
|    explained_variance   | 0             |
|    learning_rate        | 0.0003        |
|    loss                 | 1.04e+07      |
|    n_updates            | 430           |
|    policy_gradient_loss | -1.56e-05     |
|    std                  | 0.953         |
|    value_loss           | 2.1e+07       |
-------------------------------------------


-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1.38e+04      |
|    ep_rew_mean          | 7.67e+06      |
| time/                   |               |
|    fps                  | 53            |
|    iterations           | 45            |
|    time_elapsed         | 1707          |
|    total_timesteps      | 92160         |
| train/                  |               |
|    approx_kl            | 3.8700353e-05 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.37         |
|    explained_variance   | 0             |
|    learning_rate        | 0.0003        |
|    loss                 | 3e+07         |
|    n_updates            | 440           |
|    policy_gradient_loss | 2.26e-05      |
|    std                  | 0.953         |
|    value_loss           | 6.32e+07      |
-------------------------------------------


-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 1.38e+04      |
|    ep_rew_mean          | 7.67e+06      |
| time/                   |               |
|    fps                  | 53            |
|    iterations           | 46            |
|    time_elapsed         | 1756          |
|    total_timesteps      | 94208         |
| train/                  |               |
|    approx_kl            | 4.8036774e-05 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.37         |
|    explained_variance   | 0             |
|    learning_rate        | 0.0003        |
|    loss                 | 5.68e+07      |
|    n_updates            | 450           |
|    policy_gradient_loss | -5.21e-05     |
|    std                  | 0.951         |
|    value_loss           | 1.13e+08      |
-------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.38e+04     |
|    ep_rew_mean          | 7.67e+06     |
| time/                   |              |
|    fps                  | 53           |
|    iterations           | 47           |
|    time_elapsed         | 1808         |
|    total_timesteps      | 96256        |
| train/                  |              |
|    approx_kl            | 0.0011249157 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.37        |
|    explained_variance   | -1.19e-07    |
|    learning_rate        | 0.0003       |
|    loss                 | 8.62e+07     |
|    n_updates            | 460          |
|    policy_gradient_loss | -0.000838    |
|    std                  | 0.952        |
|    value_loss           | 1.8e+08      |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.38e+04     |
|    ep_rew_mean          | 7.66e+06     |
| time/                   |              |
|    fps                  | 53           |
|    iterations           | 48           |
|    time_elapsed         | 1840         |
|    total_timesteps      | 98304        |
| train/                  |              |
|    approx_kl            | 6.485122e-05 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.37        |
|    explained_variance   | -5.96e-07    |
|    learning_rate        | 0.0003       |
|    loss                 | 1.38e+08     |
|    n_updates            | 470          |
|    policy_gradient_loss | -8.23e-05    |
|    std                  | 0.95         |
|    value_loss           | 2.77e+08     |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.38e+04     |
|    ep_rew_mean          | 7.66e+06     |
| time/                   |              |
|    fps                  | 53           |
|    iterations           | 49           |
|    time_elapsed         | 1868         |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0018679299 |
|    clip_fraction        | 0.000781     |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.37        |
|    explained_variance   | 5.96e-08     |
|    learning_rate        | 0.0003       |
|    loss                 | 5.8e+07      |
|    n_updates            | 480          |
|    policy_gradient_loss | -0.000516    |
|    std                  | 0.95         |
|    value_loss           | 1.02e+08     |
------------------------------------------


<stable_baselines3.ppo.ppo.PPO at 0x7f2a7c7ce160>

Create a validation environment

and attach the trained policy to the validation environment.

In [27]:
# Validation environment: a SmartMeterWorld fed by the validation data loader,
# using the "human" render mode.
env_valid = SmartMeterWorld(smart_meter_data_loader=sm_dl_validation, render_mode="human")

[2025-07-12 19:45:46:881] [SmartMeterWorld] Render mode set to 'human'. Render server at 127.0.0.1:50007. render_connected: True. render_client_socket: <socket.socket fd=87, family=AddressFamily.AF_INET, type=SocketKind.SOCK_STREAM, proto=0, laddr=('127.0.0.1', 33722), raddr=('127.0.0.1', 50007)>


In [28]:
# Hand the h-network and its standard scaler to the validation environment.
# NOTE(review): presumably these are the same objects the training environment used —
# confirm against the earlier cells.
env_valid.set_h_network(h_network)
env_valid.set_h_network_stdscaler(h_network_stdscaler)

In [29]:
# Re-bind the trained model to the validation environment. As the cell output shows,
# the env gets wrapped in a `Monitor` wrapper and a `DummyVecEnv` at this point.
rl_model.set_env(env_valid)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [30]:
# Roll out the trained policy deterministically on the validation environment,
# logging and rendering every step.
for i in range(1):
    # Reset at the start of every episode so that raising the episode count
    # does not continue stepping an already finished episode.
    obs, info = env_valid.reset()
    done = False
    while not done:
        action, _states = rl_model.predict(obs, deterministic=True)
        # step() returns a 5-tuple (Gymnasium-style: obs, reward, terminated, truncated, info).
        # The episode is over when EITHER flag is set; watching only the first flag
        # would keep looping after a truncation.
        obs, reward, terminated, truncated, info = env_valid.step(action)
        done = terminated or truncated
        print_log(f"Step: {env_valid.episode.get_current_step()}, Action: {action}, Reward: {reward}")
        env_valid.render()

[2025-07-12 19:45:53:291] [SmartMeterWorld] Resetting environment with a new episode. Episode info: {'length': 14080, 'datetime_range': (Timestamp('2013-05-08 00:00:05'), Timestamp('2013-05-08 23:59:55'))}
[2025-07-12 19:45:53:305] Step: 1, Action: [-0.16950542], Reward: 0.33101090490818025
[2025-07-12 19:45:53:318] Step: 2, Action: [-0.16950324], Reward: 0.47096859812736513
[2025-07-12 19:45:53:330] Step: 3, Action: [-0.16950102], Reward: 0.34303130507469176
[2025-07-12 19:45:53:345] Step: 4, Action: [-0.16950332], Reward: 0.20556457936763764
[2025-07-12 19:45:53:357] Step: 5, Action: [-0.1695011], Reward: 0.11220863834023476
[2025-07-12 19:45:53:367] Step: 6, Action: [-0.16950113], Reward: 0.060318984836339955
[2025-07-12 19:45:53:378] Step: 7, Action: [-0.1695034], Reward: 0.03724815845489502
[2025-07-12 19:45:53:386] Step: 8, Action: [-0.16950344], Reward: 0.029599212482571604
[2025-07-12 19:45:53:395] Step: 9, Action: [-0.16951], Reward: 0.0346350833773613
[2025-07-12 19:45:53:403

In [29]:
# Debug aid: display the DataFrame held by the training environment's episode object
# (columns seen in the output: timestamp, aggregate, datetime, grid_load, battery_soc).
env_train.episode.df

Unnamed: 0,timestamp,aggregate,datetime,grid_load,battery_soc
0,1.357603e+09,234.0,2013-01-08 00:00:05,,
1,1.357603e+09,231.0,2013-01-08 00:00:11,,
2,1.357603e+09,234.0,2013-01-08 00:00:17,,
3,1.357603e+09,232.0,2013-01-08 00:00:23,,
4,1.357603e+09,232.0,2013-01-08 00:00:30,,
...,...,...,...,...,...
13375,1.357690e+09,178.0,2013-01-08 23:59:33,,
13376,1.357690e+09,177.0,2013-01-08 23:59:39,,
13377,1.357690e+09,178.0,2013-01-08 23:59:46,,
13378,1.357690e+09,177.0,2013-01-08 23:59:52,,


In [None]:
# save the graph

# Run-specific output path, in the same folder the model is saved to.
graph_valid_path = Path("rl_model", "PPO", f"{rl_datetime.strftime('%Y%m%d_%H%M%S')}", "graph_valid.png")
# This cell sits before the model-saving cell, so on a top-to-bottom run the run
# folder may not exist yet; create it up front instead of relying on save_graph to do so.
graph_valid_path.parent.mkdir(parents=True, exist_ok=True)
env_valid.save_graph(str(graph_valid_path))

In [None]:
# Shut down the validation environment once the rollout and graph export are done.
# NOTE(review): presumably this also releases the render-server socket opened at
# construction — confirm in SmartMeterWorld.close().
env_valid.close()

[2025-07-12 04:44:05:356] [SmartMeterWorld] Environment closed.


In [26]:
# Persist the trained policy as rl_model/PPO/<rl_datetime stamp>/rl_model.zip
rl_run_stamp = rl_datetime.strftime('%Y%m%d_%H%M%S')
rl_model_path = Path("rl_model", "PPO", rl_run_stamp, "rl_model.zip")
rl_model.save(rl_model_path)

---

In [62]:
# Reload a previously saved PPO policy and bind it to a fresh test environment.
import sys
sys.path.append(str(Path('rl_env')))

from rl_env.hrl_env import SmartMeterWorld
from stable_baselines3 import PPO

# Test environment: fed by the test data loader, using the "human" render mode.
env_test = SmartMeterWorld(sm_dl_test, render_mode="human")

env_test.set_h_network(h_network)
env_test.set_h_network_stdscaler(h_network_stdscaler)

# Folder stamp of the saved run to evaluate (hard-coded to one specific run).
loaded_run_stamp = datetime(2025, 7, 12, 14, 58, 9).strftime('%Y%m%d_%H%M%S')
rl_model_path = Path("rl_model", "PPO", loaded_run_stamp, "rl_model.zip")
rl_model_loaded = PPO.load(rl_model_path, env=env_test)

[2025-07-12 18:15:46:013] [SmartMeterWorld] Render mode set to 'human'. Render server at 127.0.0.1:50007. render_connected: True. render_client_socket: <socket.socket fd=96, family=AddressFamily.AF_INET, type=SocketKind.SOCK_STREAM, proto=0, laddr=('127.0.0.1', 55842), raddr=('127.0.0.1', 50007)>
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [63]:
# Roll out the loaded policy deterministically on the test environment,
# logging and rendering every step.
for i in range(1):
    # Reset at the start of every episode so that raising the episode count
    # does not continue stepping an already finished episode.
    obs, info = env_test.reset()
    done = False
    while not done:
        action, _states = rl_model_loaded.predict(obs, deterministic=True)
        # step() returns a 5-tuple (Gymnasium-style: obs, reward, terminated, truncated, info).
        # The episode is over when EITHER flag is set; watching only the first flag
        # would keep looping after a truncation.
        obs, reward, terminated, truncated, info = env_test.step(action)
        done = terminated or truncated
        print_log(f"Step: {env_test.episode.get_current_step()}, Action: {action}, Reward: {reward}")
        env_test.render()

[2025-07-12 18:15:47:464] [SmartMeterWorld] Resetting environment with a new episode. Episode info: {'length': 13809, 'datetime_range': (Timestamp('2013-08-18 00:00:05'), Timestamp('2013-08-18 23:59:54'))}
[2025-07-12 18:15:47:474] Step: 1, Action: [-0.11176261], Reward: 0.33047990798950194
[2025-07-12 18:15:47:482] Step: 2, Action: [-0.11176547], Reward: 0.4861467361450195
[2025-07-12 18:15:47:490] Step: 3, Action: [-0.11176455], Reward: 0.36622543931007384
[2025-07-12 18:15:47:500] Step: 4, Action: [-0.1117636], Reward: 0.22244373410940171
[2025-07-12 18:15:47:509] Step: 5, Action: [-0.11176723], Reward: 0.12244832664728166
[2025-07-12 18:15:47:517] Step: 6, Action: [-0.11176636], Reward: 0.06538385376334191
[2025-07-12 18:15:47:528] Step: 7, Action: [-0.11176637], Reward: 0.03864115029573441
[2025-07-12 18:15:47:536] Step: 8, Action: [-0.11176547], Reward: 0.028842618316411973
[2025-07-12 18:15:47:550] Step: 9, Action: [-0.11176547], Reward: 0.028882988914847373
[2025-07-12 18:15:47

KeyboardInterrupt: 

In [40]:
# Store the test graph next to the model that was actually evaluated: `rl_model_path`
# is the path the policy was loaded from. The previous target folder was derived from
# `rl_datetime`, which is not set by the model-loading cell and may refer to a
# different run.
graph_test_path = rl_model_path.parent / "graph_test.png"
env_test.save_graph(str(graph_test_path))

In [64]:
# Shut down the test environment once the rollout and graph export are done.
# NOTE(review): presumably this also releases the render-server socket opened at
# construction — confirm in SmartMeterWorld.close().
env_test.close()

[2025-07-12 18:16:34:523] [SmartMeterWorld] Environment closed.
