In [1]:
import os
os.environ['NUMEXPR_MAX_THREADS'] = '1'

import logging
import numexpr as ne
import numpy as np
import torch
import datetime
from ddopai.envs.pricing.dynamic import DynamicPricingEnv
from ddopai.envs.pricing.dynamic_RL2 import RL2DynamicPricingEnv
from ddopai.envs.actionprocessors import ClipAction, RoundAction
from ddopai.agents.obsprocessors import ConvertDictSpace

#from ddopai.experiments.experiment_functions_online import run_experiment, run_hp_experiment, evaluate_val
from ddopai.experiments.experiment_functions_meta import run_experiment, test_agent, run_test_episode
from ddopai.experiments.meta_experiment_functions import *
import requests
import yaml
import re
import pandas as pd
import wandb
from copy import deepcopy
import warnings
import gc
from mushroom_rl import core 
#from ddopai.experiments.meta_core import Core
import pickle

In [2]:
# --- Run-wide constants ------------------------------------------------------
PROJECT_NAME = "pricing_cMDP_test"              # handed to prep_experiment
LIBRARIES_TO_TRACK = ["ddopai", "mushroom_rl"]  # handed to prep_experiment
RESULTS_DIR = "results"                         # handed to run_experiment as results_dir
ENVCLASS = DynamicPricingEnv                    # environment class handed to set_up_env_online

# --- Logging -----------------------------------------------------------------
logging_level = logging.INFO
logging.basicConfig(level=logging_level)

# --- Threading / backend settings --------------------------------------------
# Restrict numexpr and torch to a single thread each and switch cuDNN off.
ne.set_num_threads(1)
torch.backends.cudnn.enabled = False
torch.set_num_threads(1)

set_warnings(logging.INFO) # turn off warnings for any level higher or equal to the input level
def get_ENVCLASS(class_name):
    """Map an environment class name to the environment class object.

    Args:
        class_name: Name of the environment class, one of
            "DynamicPricingEnv", "DynamicPricingInvEnv",
            "LagDynamicPricingEnv" or "RL2DynamicPricingEnv".

    Returns:
        The class object bound to the matching name.

    Raises:
        ValueError: If ``class_name`` matches none of the known names.

    NOTE(review): DynamicPricingInvEnv and LagDynamicPricingEnv are not
    imported explicitly in the visible import cell; presumably they come
    from a star import -- confirm, otherwise selecting them raises NameError.
    """
    # Each entry resolves its class lazily, so a class name is only looked up
    # when its entry is actually selected.
    known_envs = (
        ("DynamicPricingEnv", lambda: DynamicPricingEnv),
        ("DynamicPricingInvEnv", lambda: DynamicPricingInvEnv),
        ("LagDynamicPricingEnv", lambda: LagDynamicPricingEnv),
        ("RL2DynamicPricingEnv", lambda: RL2DynamicPricingEnv),
    )
    for known_name, resolve in known_envs:
        if class_name == known_name:
            return resolve()
    raise ValueError(f"Unknown class name {class_name}")

# Experiment preparations
## Set-up WandB
### Init WandB

In [3]:
# NOTE(review): this lowercase name is not referenced by any other visible cell;
# prep_experiment is called with PROJECT_NAME ("pricing_cMDP_test") instead.
# Confirm which project name is intended.
project_name = "pricing_cMDP"


### Track library versions and git hash of experiment

# Experiment parameters

In [4]:
# Prepare the run. Per the recorded output below, this logs in to WandB, logs
# the tracked library versions and the git hash, and loads the three YAML
# configuration files.
(
    config_train,
    config_agent,
    config_env,
    AgentClass,
    agent_name,
) = prep_experiment(
    PROJECT_NAME,
    LIBRARIES_TO_TRACK,
    config_train_name="config_train.yaml",
    config_agent_name="config_agent.yaml",
    config_env_name="config_env.yaml",
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mtimlachner[0m. Use [1m`wandb login --relogin`[0m to force relogin


INFO:root:ddopai: 0.0.7
INFO:root:mushroom_rl: 1.10.1
INFO:root:Git hash: 855d846a6d43bb012b88168fd8200f7f5ae3b8bd
INFO:root:Configuration file 'config_train.yaml' successfully loaded.
INFO:root:Configuration file 'config_agent.yaml' successfully loaded.
INFO:root:Configuration file 'config_env.yaml' successfully loaded.


In [5]:
# If the agent config carries "gamma" (presumably the discount factor --
# confirm), move it over to the environment config so that it is no longer
# part of config_agent.
if "gamma" in config_agent:
    config_env["gamma"] = config_agent.pop("gamma")

In [6]:
#artifact = wandb.use_artifact('raw_data:latest')
#path = artifact.download()
#raw_data = pickle.load(open(os.path.join(path, 'raw_data.pkl'), 'rb'))

In [7]:
# Load the raw data for the environment together with the start indices of the
# validation and test segments (as the variable names suggest -- confirm in
# get_online_data). overwrite=False is passed through unchanged.
raw_data, val_index_start, test_index_start = get_online_data(config_env, overwrite=False)

## Environment parameters

* Get the environment parameters from the config file 
* Overwrite the `lag_window` parameter with the value specified in the agent config, if one is specified (the lag window is provided by the environment, but it is a tunable hyperparameter of the agent)

In [8]:
# Show which top-level sections the environment config contains.
config_env.keys()

dict_keys(['const_params', 'env_class', 'env_kwargs', 'normalize_features', 'size_test', 'size_train', 'size_val', 'unit_size'])

In [9]:
# Action post-processors, listed in the order round -> clip. The rounding unit
# and the price bounds are read from the environment config.
round_action = RoundAction(unit_size=config_env["unit_size"])
clip_action = ClipAction(
    lower=config_env["env_kwargs"]["p_bound_low"],
    upper=config_env["env_kwargs"]["p_bound_high"],
)
postprocessors = [round_action, clip_action]

# NOTE(review): while the next line stays commented out, config_env["env_class"]
# is not used and the ENVCLASS constant set in the configuration cell decides
# which environment is built.
#ENVCLASS = get_ENVCLASS(config_env["env_class"])
environment = set_up_env_online(
    ENVCLASS,
    raw_data,
    val_index_start,
    test_index_start,
    config_env,
    postprocessors,
)

## Agent parameters

In [10]:
# Display the agent hyperparameters as loaded from the config
# (after "gamma" was moved to config_env, if it was present).
config_agent

{'learning_rate_actor': 0.005,
 'learning_rate_critic': 0.005,
 'initial_replay_size': 32,
 'max_replay_size': 256,
 'batch_size': 32,
 'hidden_layers': [32, 8],
 'activation': 'relu',
 'lr_alpha': 0.005,
 'log_std_min': -20,
 'log_std_max': 2,
 'warmup_transitions': 32,
 'tau': 0.005,
 'target_entropy': -1.0,
 'optimizer': 'Adam',
 'loss': 'MSE',
 'device': 'cpu',
 'use_log_alpha_loss': True,
 'n_steps_per_fit': 1}

In [11]:
# Take the fit-frequency settings out of config_agent (None when absent) so
# that they are not forwarded to the agent constructor via **config_agent.
n_steps_per_fit, n_episodes_per_fit = (
    config_agent.pop(setting, None)
    for setting in ("n_steps_per_fit", "n_episodes_per_fit")
)

In [12]:
logging.info(f"Agent: {agent_name}")

# Only these agents get a ConvertDictSpace observation processor; every other
# agent is created with an empty processor list.
obsprocessors = (
    [ConvertDictSpace(keep_time_dim=False)]
    if agent_name in ["SAC", "PPORNN", "RL2PPO"]
    else []
)

# Configs that carry a "link" entry have it replaced by the two objects
# returned from set_up_agent, stored under "g" and "price_function".
if "link" in config_agent:
    glm_link, price_function = set_up_agent(AgentClass, environment, config_agent)
    config_agent["g"] = glm_link
    config_agent["price_function"] = price_function
    del config_agent["link"]

# The Clairvoyant agent receives the environment's task and no observation
# processors; all other agents receive the observation processors instead.
if agent_name == "Clairvoyant":
    agent = AgentClass(
        task=environment.get_task(),
        environment_info=environment.mdp_info,
        **config_agent,
    )
else:
    agent = AgentClass(
        environment_info=environment.mdp_info,
        obsprocessors=obsprocessors,
        **config_agent,
    )


INFO:root:Agent: SAC
INFO:root:Actor network (mu network):


Layer (type:depth-idx)                   Output Shape              Param #
MLPActor                                 [1, 1]                    --
├─Sequential: 1-1                        [1, 1]                    --
│    └─Linear: 2-1                       [1, 32]                   224
│    └─ReLU: 2-2                         [1, 32]                   --
│    └─Dropout: 2-3                      [1, 32]                   --
│    └─Linear: 2-4                       [1, 8]                    264
│    └─ReLU: 2-5                         [1, 8]                    --
│    └─Dropout: 2-6                      [1, 8]                    --
│    └─Linear: 2-7                       [1, 1]                    9
│    └─Identity: 2-8                     [1, 1]                    --
Total params: 497
Trainable params: 497
Non-trainable params: 0
Total mult-adds (M): 0.00
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.00


INFO:root:################################################################################
INFO:root:Critic network:


Layer (type:depth-idx)                   Output Shape              Param #
MLPStateAction                           --                        --
├─Sequential: 1-1                        [1, 1]                    --
│    └─Linear: 2-1                       [1, 32]                   256
│    └─ReLU: 2-2                         [1, 32]                   --
│    └─Dropout: 2-3                      [1, 32]                   --
│    └─Linear: 2-4                       [1, 8]                    264
│    └─ReLU: 2-5                         [1, 8]                    --
│    └─Dropout: 2-6                      [1, 8]                    --
│    └─Linear: 2-7                       [1, 1]                    9
│    └─Identity: 2-8                     [1, 1]                    --
Total params: 529
Trainable params: 529
Non-trainable params: 0
Total mult-adds (M): 0.00
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.00


In [13]:
# Display the observation processors that were passed to the agent.
obsprocessors

[<ddopai.agents.obsprocessors.ConvertDictSpace at 0x1530540d0>]

In [14]:

# Display the preprocessor chain held by the agent. In the recorded output it
# contains the ConvertDictSpace instance passed in plus a FlattenTimeDimNumpy.
agent.preprocessors

[<ddopai.agents.obsprocessors.ConvertDictSpace at 0x1530540d0>,
 <ddopai.agents.obsprocessors.FlattenTimeDimNumpy at 0x1124d2140>]

In [15]:
# Build the early-stopping handler from the training config; it is handed to
# run_experiment as early_stopping_handler.
earlystoppinghandler = set_up_earlystoppinghandler(config_train)

In [16]:
# Attach the n_steps_per_fit value that was popped from config_agent earlier
# (None if the config did not define it).
# NOTE(review): n_episodes_per_fit is popped as well but is not applied in any
# visible cell -- confirm whether that is intended.
agent.n_steps_per_fit = n_steps_per_fit
#agent.train_mode = "online_eval"

In [None]:
# Train the agent on the environment. Epoch and step counts as well as the
# save_best flag come from config_train; the run is tracked in WandB under the
# id of the currently active run and results go to RESULTS_DIR.
dataset = run_experiment(
    agent=agent,
    env=environment,
    eval_freq=1,
    n_epochs=config_train["n_epochs"],
    n_steps=config_train["n_steps"],
    early_stopping_handler=earlystoppinghandler,
    save_best=config_train["save_best"],
    run_id=wandb.run.id,
    tracking="wandb",
    eval_step_info=False,
    print_freq=1,
    results_dir=RESULTS_DIR,
)

INFO:root:Starting experiment
INFO:root:Starting training with env_interaction


Experiment directory: results/9ff5tcvl


  4%|▎         | 9/250 [00:07<03:27,  1.16it/s]INFO:root:Epoch 10: R=242.18407319700964, J=0.4774198859973497
  4%|▍         | 10/250 [00:08<03:28,  1.15it/s]

Episode 0: R=2421.8407319700964, J=4.774198859973497
Episode 1: R=0, J=0
Episode 2: R=0, J=0
Episode 3: R=0, J=0
Episode 4: R=0, J=0
Episode 5: R=0, J=0
Episode 6: R=0, J=0
Episode 7: R=0, J=0
Episode 8: R=0, J=0
Episode 9: R=0, J=0


  8%|▊         | 19/250 [00:17<03:35,  1.07it/s]


KeyboardInterrupt: 

In [None]:
# Close the active WandB run; the run history and summary are printed below.
wandb.finish()

0,1
val/Episode,▄▆▁▂█▁▆█▄▃▆▆▂▃▃▁▃▇▃▆█▁▆▃▄▆▇▃▆█▅▇█▃▇▁▂█▅▆
val/J,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁▁▁█▁▁
val/R,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁

0,1
val/Episode,9
val/J,0
val/R,0
