In [1]:
import os
os.environ['NUMEXPR_MAX_THREADS'] = '1'

import logging
import numexpr as ne
import numpy as np
import torch
import datetime
from ddopai.envs.pricing.dynamic import DynamicPricingEnv
from ddopai.envs.pricing.dynamic_RL2 import RL2DynamicPricingEnv
from ddopai.envs.pricing.dynamic_lag import LagDynamicPricingEnv
from ddopai.envs.pricing.dynamic_inventory import DynamicPricingInvEnv
from ddopai.envs.actionprocessors import ClipAction, RoundAction
from ddopai.agents.obsprocessors import ConvertDictSpace

from ddopai.experiments.experiment_functions_online import run_experiment, run_hp_experiment, evaluate_val
from ddopai.experiments.meta_experiment_functions import *
import requests
import yaml
import re
import pandas as pd
import wandb
from copy import deepcopy
import warnings
import gc
#from mushroom_rl import core 
from ddopai.experiments.meta_core import Core
import pickle

In [2]:
logging_level = logging.INFO
logging.basicConfig(level=logging_level)

ne.set_num_threads(1)
torch.backends.cudnn.enabled = False
torch.set_num_threads(1)

set_warnings(logging.INFO) # turn off warnings for any level higher or equal to the input level
LIBRARIES_TO_TRACK = ["ddopai", "mushroom_rl"]
PROJECT_NAME = "pricing_cMDP_test"

ENVCLASS = DynamicPricingEnv
RESULTS_DIR = "results"
def get_ENVCLASS(class_name):
    if class_name == "DynamicPricingEnv":
        return DynamicPricingEnv
    elif class_name == "DynamicPricingInvEnv":
        return DynamicPricingInvEnv
    elif class_name == "LagDynamicPricingEnv":
        return LagDynamicPricingEnv
    elif class_name == "RL2DynamicPricingEnv":
        return RL2DynamicPricingEnv
    else:
        raise ValueError(f"Unknown class name {class_name}")

# Experiment preparations
## Set-up WandB
### Init WandB

In [3]:
project_name = "pricing_cMDP"


### Track library versions and git hash of experiment

# Experiment parameters

In [4]:
config_train, config_agent, config_env, AgentClass, agent_name = prep_experiment(
        PROJECT_NAME,
        LIBRARIES_TO_TRACK,
        config_train_name="config_train.yaml",
        config_agent_name="config_agent.yaml",
        config_env_name="config_env.yaml",
    )

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mtimlachner[0m. Use [1m`wandb login --relogin`[0m to force relogin


INFO:root:ddopai: 0.0.7
INFO:root:mushroom_rl: 1.10.1
INFO:root:Git hash: 1b303091ee923ebc8cc99f44a2502ad060a25ffc
INFO:root:Configuration file 'config_train.yaml' successfully loaded.
INFO:root:Configuration file 'config_agent.yaml' successfully loaded.
INFO:root:Configuration file 'config_env.yaml' successfully loaded.


In [5]:
if config_env['lag_window_params'].get("lag_window") is not None:
        for env_kwargs in config_env["env_kwargs"]:
            env_kwargs["lag_window"] = config_env['lag_window_params']['lag_window']
            env_kwargs["env_class"] = "LagDynamicPricingEnv"
if "gamma" in config_agent:
        for env_kwargs in config_env["env_kwargs"]:
            env_kwargs["gamma"] = config_agent["gamma"]
        del config_agent["gamma"]

In [6]:
#artifact = wandb.use_artifact('raw_data:latest')
#path = artifact.download()
#raw_data = pickle.load(open(os.path.join(path, 'raw_data.pkl'), 'rb'))

In [7]:
raw_data, val_index_start, test_index_start = get_online_data(
            config_env,
            overwrite=False
        )

In [8]:
raw_data[0][0][100]

array([0.45329345, 0.45625252, 0.2714313 , 0.35211957, 0.31304172])

## Environment parameters

* Get the environment parameters from the config file 
* Overwrite the ```lag_window```parameter with the parameter specified in the agent, if it is specified (since lag window is provided by the environment, but a tunable hyperparameter of the agent)

In [9]:
round_action = RoundAction(unit_size=config_env["unit_size"])
clip_action = ClipAction(lower=config_env["setup_kwargs"]["p_bound_low"], upper=config_env["setup_kwargs"]["p_bound_high"])
postprocessors = [round_action, clip_action]

#ENVCLASS = get_ENVCLASS(config_env["env_class"])
#environment = set_up_env_online(ENVCLASS, raw_data, val_index_start, test_index_start, config_env, postprocessors)
environments = prepare_env_online(get_ENVCLASS=get_ENVCLASS, raw_data=raw_data, val_index_start=0, test_index_start=0, config_env=config_env, postprocessors=postprocessors)

In [10]:
environments[0].inv

array([3000.])

In [11]:
environments[0].observation_space

Dict('features': Box([0.00135587 0.0005734  0.00041923 0.00048459 0.00530169], [0.49910572 0.4991851  0.49827215 0.4935362  0.49968392], (5,), float32), 'inventory': Box(0.0, 1.0, (1,), float32), 'prev_action': Box(0.0, 5.0, (1,), float32), 'prev_done': Box(0.0, 1.0, (1,), float32), 'prev_reward': Box(-inf, inf, (1,), float32))

In [12]:
obs, _ = environments[0].get_observation()
obs

{'features': array([0.09599362, 0.14803715, 0.04638279, 0.05974782, 0.01263212]),
 'inventory': array([[1.]], dtype=float32),
 'prev_action': array([0.], dtype=float32),
 'prev_reward': array([0.], dtype=float32),
 'prev_done': array([1.], dtype=float32)}

In [13]:
conv = ConvertDictSpace(keep_time_dim=False)
X = conv(environments[0].get_observation()[0])
X

array([0.09599362, 0.14803715, 0.04638279, 0.05974782, 0.01263212,
       1.        , 0.        , 0.        , 1.        ])

## Agent Parameter

In [14]:
n_steps_per_fit = config_agent.pop("n_steps_per_fit", None)
n_episodes_per_fit = config_agent.pop("n_episodes_per_fit", None)

In [15]:
config_agent

{'learning_rate_actor': 0.0001,
 'learning_rate_critic': 0.0002,
 'n_epochs_policy': 6,
 'meta_episodes_per_policy_update': 2,
 'meta_episodes_per_learner_batch': 64,
 'eps_ppo': 0.1,
 'lam': 0.95,
 'ent_coeff': 0.03,
 'hidden_layers_RNN': 2,
 'num_hidden_units_RNN': 512,
 'hidden_layers_MLP': [256, 128],
 'activation': 'tanh',
 'drop_prob': 0,
 'init_method': 'kaiming_uniform',
 'optimizer': 'Adam',
 'loss': 'MSE',
 'device': 'cpu',
 'RNN_cell': 'GRU'}

In [16]:
logging.info(f"Agent: {agent_name}")
if agent_name in ["SAC", "PPORNN", "RL2PPO"]:
    obsprocessors = [ConvertDictSpace(keep_time_dim=False, )]
else:
    obsprocessors = []
if AgentClass.train_mode == "env_interaction":
    if "link" in config_agent:
        glm_link, price_function = set_up_agent(AgentClass, environments[0], config_agent)
        config_agent["g"] = glm_link
        config_agent["price_function"] = price_function
        
        del config_agent["link"]
    if agent_name == "Clairvoyant":
        agent = AgentClass(
        alpha=environments[0].alpha,
        beta=environments[0].beta,
        environment_info=environments[0].mdp_info,
        **config_agent
        )
    else:
        agent = AgentClass(
        environment_info=environments[0].mdp_info,
        obsprocessors=obsprocessors,
        **config_agent
        )

else:
    raise ValueError("Invalid train_mode for online training")

INFO:root:Agent: RL2PPO
INFO:root:Actor (RL²) network:


Layer (type:depth-idx)                   Output Shape              Param #
RL2RNNActor                              [1, 1, 1]                 --
├─RL2RNN: 1-1                            [1, 1, 1]                 --
│    └─SpecificRNNWrapperHS: 2-1         [1, 1, 512]               --
│    │    └─GRU: 3-1                     [1, 1, 512]               2,379,264
│    └─Sequential: 2-2                   [1, 1]                    --
│    │    └─Linear: 3-2                  [1, 256]                  131,328
│    │    └─Tanh: 3-3                    [1, 256]                  --
│    │    └─Dropout: 3-4                 [1, 256]                  --
│    │    └─Linear: 3-5                  [1, 128]                  32,896
│    │    └─Tanh: 3-6                    [1, 128]                  --
│    │    └─Dropout: 3-7                 [1, 128]                  --
│    │    └─Linear: 3-8                  [1, 1]                    129
Total params: 2,543,617
Trainable params: 2,543,617
Non-trainable pa

INFO:root:Critic (RL²) network:


Layer (type:depth-idx)                   Output Shape              Param #
RL2RNNValue                              [1, 1, 1]                 --
├─RL2RNN: 1-1                            [1, 1, 1]                 --
│    └─SpecificRNNWrapperHS: 2-1         [1, 1, 512]               --
│    │    └─GRU: 3-1                     [1, 1, 512]               2,379,264
│    └─Sequential: 2-2                   [1, 1]                    --
│    │    └─Linear: 3-2                  [1, 256]                  131,328
│    │    └─Tanh: 3-3                    [1, 256]                  --
│    │    └─Dropout: 3-4                 [1, 256]                  --
│    │    └─Linear: 3-5                  [1, 128]                  32,896
│    │    └─Tanh: 3-6                    [1, 128]                  --
│    │    └─Dropout: 3-7                 [1, 128]                  --
│    │    └─Linear: 3-8                  [1, 1]                    129
Total params: 2,543,617
Trainable params: 2,543,617
Non-trainable pa

In [17]:
earlystoppinghandler = set_up_earlystoppinghandler(config_train)

In [18]:
eval_envs = np.random.choice(environments, size=16, replace=False).tolist()
train_envs = [env for env in environments if env not in eval_envs]

def run_experiment(
        *,
        agent,                                     # BaseAgent
        train_envs:   List = None,                 # envs used for training
        eval_envs:    List = None,                 # envs used for val / test
        experiment_type: str = "train_only",       # 'train_only'|'train_val'|'only_eval'
        # ---------- training hyper‑params ----------
        n_epochs: int = 1,
        n_steps: int = None,
        n_steps_per_fit: int = 1,
        n_episodes_per_fit: int = None,
        # ---------- validation ----------
        val_every: int = 10,                       # how often to call evaluate_val
        # ---------- misc ----------
        early_stopping_handler: Union[EarlyStoppingHandler, None] = None,
        save_best: bool = True,
        performance_criterion: str = "J",
        tracking: Union[str, None] = None,         # 'wandb' or None
        results_dir: str = "results",
        run_id: str | None = None,
        print_freq: int = 1,
        eval_step_info: bool = False,
):

In [19]:
dataset = run_experiment(
        agent=agent,
        train_envs=train_envs,
        eval_envs=eval_envs,
        experiment_type="train_val",
        val_every= 250,    
        n_epochs=config_train["n_epochs"],
        n_steps=config_train["n_steps"],
        n_steps_per_fit=n_steps_per_fit,
        n_episodes_per_fit=n_episodes_per_fit,
        early_stopping_handler=earlystoppinghandler,
        save_best=config_train["save_best"],
        run_id=wandb.run.id,
        tracking="wandb",
        eval_step_info=False,
        print_freq=1,
        results_dir = RESULTS_DIR
    )

INFO:root:★ Experiment dir: results/8dvi7i6k
epochs:   1%|          | 249/40000 [01:39<3:34:02,  3.10it/s] INFO:root:[VAL] running validation after epoch 250
INFO:root:[VAL] mean True_R=336.36  mean R=363.85
epochs:   1%|          | 499/40000 [03:40<3:40:27,  2.99it/s] INFO:root:[VAL] running validation after epoch 500
INFO:root:[VAL] mean True_R=337.38  mean R=369.39
epochs:   2%|▏         | 749/40000 [05:40<3:30:47,  3.10it/s] INFO:root:[VAL] running validation after epoch 750
INFO:root:[VAL] mean True_R=300.20  mean R=331.68
epochs:   2%|▏         | 999/40000 [07:41<3:27:27,  3.13it/s] INFO:root:[VAL] running validation after epoch 1000
INFO:root:[VAL] mean True_R=226.39  mean R=245.92
epochs:   3%|▎         | 1249/40000 [09:39<3:20:11,  3.23it/s] INFO:root:[VAL] running validation after epoch 1250
INFO:root:[VAL] mean True_R=225.60  mean R=251.32
epochs:   4%|▎         | 1499/40000 [11:42<3:26:32,  3.11it/s] INFO:root:[VAL] running validation after epoch 1500
INFO:root:[VAL] mean T

In [20]:
wandb.finish()

0,1
Action,▁▄▅▁█▃▂▁▅▄▄▄▅▃▅▁▃▁▁▅▁▁▂▁▅▂▆▃▃▂▄▁▃▃▂▄▆▃▃▃
Cumulative_Reward,▆▁▄▆▂▃█▅▃▄▃▆▅▁▄▂▄▁▇▂▂▄▃▄▃▇▁▁▃▄▄▁▄▄▄▄▁▂▂▁
Cumulative_True_Reward,▃▄▁▁▃▆▅▃▃▄▄▂▃▆▆▆▄▅▁▂▂▄▂▂▄▂▃█▇▄▄▂▃▅▄▂▃▄█▄
Epoch,▁▁▁▁▁▂▂▂▃▃▃▃▃▄▄▄▅▅▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇█
Inventory,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Reward,▁▅▄▄▁▁▁▂▅▁▂▅▁▁▁▂▆▂▁█▁▁▁▂▁▁▁▄▆▂▁▁▅▁▁▂▁▁▁▂
True_Reward,▅▃▆▁▁▁▅▁▁▄▄▇▃▃▄▆▃▂▁▂▃▄▂▆▁▄█▂▄▁▅▄▁▂▁▄▅▆▂▁
Val_Action,▆▁▆▂▆▆▆▄▁▁▆▇▄▁▃▅▄▂▇▁▅▁▇▁▁█▆▂▇▇▇▇▂▆▆▂▇█▂▃
Val_Cumulative_Reward,▃▅▅▄▂▁▇▆▄▂▂▄█▃▁▃▂▃▂▃▅▃▂▂▄▃▅█▂▂▇█▅▄▂▆▁▇▃▂
Val_Cumulative_True_Reward,▄▂▁▃▁▁▆▂▁▁▅▆▁▅█▁▂▁▂▄▅▅▄▁▄▄▁▅▃▂▄▂▇▃▁█▆▇▁▂

0,1
Action,0.02
Cumulative_Reward,343.59381
Cumulative_True_Reward,286.46385
Epoch,20249
Inventory,3000
Reward,0.03609
True_Reward,0.02781
Val_Action,0.06
Val_Cumulative_Reward,39.73909
Val_Cumulative_True_Reward,34.56135


In [21]:
agent.eval()
agent.reset_hidden()
env = environments[0]
env.reset()
for i in range(300):
    obs = conv(env.get_observation()[0])  # Preprocess the observation
    action = agent.predict(obs)
    print(env.step(action[0]))
  

({'features': array([0.3749225 , 0.49686624, 0.4088976 , 0.42476839, 0.43450523]), 'inventory': array([[1.]], dtype=float32), 'prev_action': array([[-0.88]], dtype=float32), 'prev_reward': array([[0.]], dtype=float32), 'prev_done': array([0.], dtype=float32)}, array([0.]), array([False]), {'inv': array([3000.]), 'demand': array([0.]), 'true_demand': array([0.48489932]), 'action': array([0.3]), 'reward': array([0.]), 'true_reward': array([0.1454698])})
({'features': array([0.44248337, 0.34391384, 0.32635113, 0.05098269, 0.25883157]), 'inventory': array([[1.]], dtype=float32), 'prev_action': array([[-0.856]], dtype=float32), 'prev_reward': array([[353446.]], dtype=float32), 'prev_done': array([0.], dtype=float32)}, array([353445.99751488]), array([False]), {'inv': array([3000.]), 'demand': array([0.98179444]), 'true_demand': array([2.33301445]), 'action': array([0.36]), 'reward': array([0.353446]), 'true_reward': array([0.8398852])})
({'features': array([0.38869095, 0.08539285, 0.1998374

In [22]:
agent.eval()
env = environments[0]
env.reset() 
agent.reset_hidden()
processed_obs = conv(env.get_observation()[0])  # Preprocess the observation
print(agent.predict(processed_obs))


(array([[0.29700312]], dtype=float32), None, None)
