In [1]:
import os
import torch
import torch.nn as nn
import math
import numpy as np
import tensorflow as tf
import pandas as pd

from google.cloud import storage
from tensorboardX import SummaryWriter
import gin.tf

from batch_rl.fixed_replay.agents import quantile_agent
from batch_rl.fixed_replay.environments import ACPulse
from dopamine.discrete_domains import checkpointer


from running_stats import RunningMeanStd



In [2]:
class NN(nn.Module):
    def __init__(self, input_size):
        super(NN, self).__init__()
        self.fc1 = nn.Linear(input_size, 512, bias=True)
        # nn.init.zeros_(self.fc1.weight)
        self.fc2 = nn.Linear(512, 256, bias=True)
        # nn.init.zeros_(self.fc2.weight)
        self.fc3 = nn.Linear(256, 128, bias=True)
        self.fc4 = nn.Linear(128, 1, bias=True)
        # nn.init.zeros_(self.fc4.weight)
        self.tanh = torch.nn.Tanh()
        self.softp = torch.nn.Softplus()

    def forward(self, x):
        x = self.fc1(x)
        x = self.tanh(x)
        x = self.fc2(x)
        x = self.tanh(x)
        x = self.fc3(x)
        x = self.tanh(x)
        x = self.fc4(x)
        # x = self.softp(x)
        return x
    
class RewardPredictor:
    def __init__(self, input_size, checkpoint_dir):

        self.model = NN(input_size + 1)

        self.running_stats = RunningMeanStd()

        checkpoint = os.path.join(checkpoint_dir, "checkpoint")
        model_state, optimizer_state, scheduler_state, running_stats_state = torch.load(
            checkpoint
        )
        self.model.load_state_dict(model_state)
        self.running_stats.load_dict(running_stats_state)

    def predict(self, x):
        scores = self.model(x)
        scores_raw = (torch.exp(scores) - 1 + 0.003) * math.sqrt(
            (self.running_stats.var)
        )  # just the inverse transofrmation for the predicted rewards
        return scores_raw
    
def estimate(predictor, actions, action_probs, obs):
    obs = torch.Tensor(
        np.concatenate(
            (obs, np.reshape(actions, (actions[0].shape[0], 1))), axis=1
        )
    )  # concatenate actions and observations for input obs are usually [[obs1],[obs2],[obs3]] and
    # actions are usually [1,0,1,0] so the goal is to make actions like this: [[1],[0],[1]]
    scores_raw = predictor.predict(obs).detach().numpy()
    results = {}
    results["score"] = (scores_raw * action_probs).mean()
    results["pred_reward_mean"] = scores_raw.mean()
    results["pred_reward_total"] = scores_raw.sum()
    return results

In [3]:
tf.compat.v1.disable_v2_behavior()
config = tf.compat.v1.ConfigProto(allow_soft_placement=True)
config.gpu_options.allow_growth = True
_sess = tf.compat.v1.Session('', config=config)

Instructions for updating:
non-resource variables are not supported in the long term


Instructions for updating:
non-resource variables are not supported in the long term


In [4]:
environment = ACPulse((80,), np.float32)

replay_data_dir = "cql-dataset/v5_scoremax_dataset_cql/replay_logs"
checkpoint_dir = "runs/checkpoints"

agent = quantile_agent.FixedReplayQuantileAgent(
    _sess, 
    num_actions=environment.action_space.n,
    observation_shape=environment.observation_shape,
    observation_dtype=environment.observation_dtype,
    replay_data_dir=replay_data_dir,
    init_checkpoint_dir=checkpoint_dir,
    replay_scheme="uniform",
)
agent.eval_mode = True

INFO:tensorflow:Creating FixedReplayAgent with replay directory: cql-dataset/v5_scoremax_dataset_cql/replay_logs


INFO:tensorflow:Creating FixedReplayAgent with replay directory: cql-dataset/v5_scoremax_dataset_cql/replay_logs


INFO:tensorflow:	 init_checkpoint_dir: runs/checkpoints


INFO:tensorflow:	 init_checkpoint_dir: runs/checkpoints


INFO:tensorflow:	 replay_suffix None


INFO:tensorflow:	 replay_suffix None


Created Quantile Agent....
min Q weight (QR-DQN):  10.0
Ckpt suffixes:  ['0' '7' '5' '2' '1' '6' '3' '4']
Loading buffer in fixed replay buffer
Loading buffer in fixed replay bufferLoading buffer in fixed replay buffer
Loading buffer in fixed replay buffer

Loading buffer in fixed replay buffer
Loading buffer in fixed replay buffer
Loading buffer in fixed replay buffer
Loading buffer in fixed replay buffer
INFO:tensorflow:Loaded replay buffer ckpt 3 from cql-dataset/v5_scoremax_dataset_cql/replay_logs


INFO:tensorflow:Loaded replay buffer ckpt 3 from cql-dataset/v5_scoremax_dataset_cql/replay_logs


INFO:tensorflow:Loaded replay buffer ckpt 0 from cql-dataset/v5_scoremax_dataset_cql/replay_logs


INFO:tensorflow:Loaded replay buffer ckpt 0 from cql-dataset/v5_scoremax_dataset_cql/replay_logs


INFO:tensorflow:Loaded replay buffer ckpt 2 from cql-dataset/v5_scoremax_dataset_cql/replay_logs


INFO:tensorflow:Loaded replay buffer ckpt 2 from cql-dataset/v5_scoremax_dataset_cql/replay_logs


INFO:tensorflow:Loaded replay buffer ckpt 7 from cql-dataset/v5_scoremax_dataset_cql/replay_logs


INFO:tensorflow:Loaded replay buffer ckpt 7 from cql-dataset/v5_scoremax_dataset_cql/replay_logs


INFO:tensorflow:Loaded replay buffer ckpt 4 from cql-dataset/v5_scoremax_dataset_cql/replay_logs


INFO:tensorflow:Loaded replay buffer ckpt 4 from cql-dataset/v5_scoremax_dataset_cql/replay_logs


INFO:tensorflow:Loaded replay buffer ckpt 1 from cql-dataset/v5_scoremax_dataset_cql/replay_logs


INFO:tensorflow:Loaded replay buffer ckpt 1 from cql-dataset/v5_scoremax_dataset_cql/replay_logs


INFO:tensorflow:Loaded replay buffer ckpt 6 from cql-dataset/v5_scoremax_dataset_cql/replay_logs


INFO:tensorflow:Loaded replay buffer ckpt 6 from cql-dataset/v5_scoremax_dataset_cql/replay_logs


INFO:tensorflow:Loaded replay buffer ckpt 5 from cql-dataset/v5_scoremax_dataset_cql/replay_logs


INFO:tensorflow:Loaded replay buffer ckpt 5 from cql-dataset/v5_scoremax_dataset_cql/replay_logs


Number of replay buffers:  8
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


Instructions for updating:
Use `tf.cast` instead.


Instructions for updating:
Use `tf.cast` instead.


Instructions for updating:
Use `tf.cast` instead.


Instructions for updating:
Use `tf.cast` instead.


MIN Q WEIGHT:  10.0
---------- (80,)
---------- <class 'numpy.float32'>


In [5]:
def download_files(path):
    print("----- Download files from Bucket")
    bucket = storage.Client().get_bucket("ac-rl-artifacts")
    blobs = bucket.list_blobs(prefix=path)
    for blob in blobs:
        print(blob.name)
        os.makedirs(os.path.dirname(blob.name), exist_ok=True)
        blob.download_to_filename(blob.name)

In [15]:
input_size = environment.observation_shape[0]

# Get checkpoint dir
chkpt_path = "models/reward_pred_v0_model/release/80_input"
download_files(chkpt_path)

# Load reward predictor
rew_pred = RewardPredictor(input_size, os.path.abspath(chkpt_path))

----- Download files from Bucket
models/reward_pred_v0_model/release/80_input/.is_checkpoint
models/reward_pred_v0_model/release/80_input/.tune_metadata
models/reward_pred_v0_model/release/80_input/checkpoint


In [16]:
# Create Tensorboard Writer
save_folder = "evaluation"
dataset_name = "test_dataset_users"
dataset_version = "v5"

writer = SummaryWriter(
    os.path.join(save_folder, dataset_name.split("_")[0])
)

In [17]:
# Get validation dataset
dataset_path = "data/processed/{}_dataset/{}/".format(
    dataset_version, dataset_name
)
# download_files(dataset_path)
validation_dataset = [
    os.path.join(dataset_path, f)
    for f in os.listdir(dataset_path)
    if os.path.isfile(os.path.join(dataset_path, f))
]

In [18]:
from tqdm import tqdm
from ray.rllib.offline.json_reader import JsonReader
from statistics import mean


checkpoints = [9, 10, 11, 12]
_checkpointer = checkpointer.Checkpointer(checkpoint_dir, 'ckpt')

for checkpoint in checkpoints:
    experiment_data = _checkpointer.load_checkpoint(checkpoint)
    agent.unbundle(checkpoint_dir, checkpoint, experiment_data)
    agent.eval_mode = True

    actions = []
    estimation_eps = {
        "dm/score": [],
        "dm/pred_reward_mean": [],
        "dm/pred_reward_total": [],
    }
    for n_eps in tqdm(range(len(validation_dataset[0]))):
        reader = JsonReader(validation_dataset[0])
        batch = reader.next()
        estimation = {
            "dm/score": [],
            "dm/pred_reward_mean": [],
            "dm/pred_reward_total": [],
        }
        for episode in batch.split_by_episode():
            action = []
            action_probs = []
            for i in range(len(episode["eps_id"])): 
                action.append(agent.step(episode["rewards"][i], episode["obs"][i]))
                action_probs.append(1.0)
                
                
            actions.extend(action)
            action = np.array([action])
            action_probs = np.array([action_probs])
            scores = estimate(rew_pred, action, action_probs, episode["obs"])
            estimation["dm/score"].append(scores["score"])
            estimation["dm/pred_reward_mean"].append(scores["pred_reward_mean"])
            estimation["dm/pred_reward_total"].append(scores["pred_reward_total"])
            
        estimation_eps["dm/score"].append(mean(estimation["dm/score"]))
        estimation_eps["dm/pred_reward_mean"].append(mean(estimation["dm/pred_reward_mean"]))
        estimation_eps["dm/pred_reward_total"].append(mean(estimation["dm/pred_reward_total"]))
                        
    est_mean = pd.DataFrame.from_dict(estimation_eps).mean(axis=0)
    print(est_mean.head())
    
    # DM Estimation ------------------------
    writer.add_scalar(
        "evaluation/dm/score", est_mean["dm/score"], checkpoint
    )
    writer.add_scalar(
        "evaluation/dm/pred_reward_mean",
        est_mean["dm/pred_reward_mean"],
        checkpoint,
    )
    writer.add_scalar(
        "evaluation/dm/pred_reward_mean_total",
        est_mean["dm/pred_reward_total"],
        checkpoint,
    )
    
    # Action
    writer.add_scalar(
        "evaluation/actions_prob",
        float(actions.count(1)) / len(actions),
        checkpoint,
    )

LOAD CHECKPOINT  runs/checkpoints/ckpt.9
INFO:tensorflow:Restoring parameters from runs/checkpoints/tf_ckpt-9


INFO:tensorflow:Restoring parameters from runs/checkpoints/tf_ckpt-9
100%|██████████| 88/88 [00:48<00:00,  1.83it/s]

dm/score                0.053695
dm/pred_reward_mean     0.053695
dm/pred_reward_total    0.597074
dtype: float64
LOAD CHECKPOINT  runs/checkpoints/ckpt.10
INFO:tensorflow:Restoring parameters from runs/checkpoints/tf_ckpt-10



INFO:tensorflow:Restoring parameters from runs/checkpoints/tf_ckpt-10
100%|██████████| 88/88 [00:47<00:00,  1.85it/s]

dm/score                0.054368
dm/pred_reward_mean     0.054368
dm/pred_reward_total    0.599134
dtype: float64
LOAD CHECKPOINT  runs/checkpoints/ckpt.11
INFO:tensorflow:Restoring parameters from runs/checkpoints/tf_ckpt-11



INFO:tensorflow:Restoring parameters from runs/checkpoints/tf_ckpt-11
100%|██████████| 88/88 [00:48<00:00,  1.82it/s]

dm/score                0.054276
dm/pred_reward_mean     0.054276
dm/pred_reward_total    0.599352
dtype: float64
LOAD CHECKPOINT  runs/checkpoints/ckpt.12
INFO:tensorflow:Restoring parameters from runs/checkpoints/tf_ckpt-12



INFO:tensorflow:Restoring parameters from runs/checkpoints/tf_ckpt-12
100%|██████████| 88/88 [00:47<00:00,  1.84it/s]

dm/score                0.054109
dm/pred_reward_mean     0.054109
dm/pred_reward_total    0.600885
dtype: float64





In [14]:
est_mean

dm/score                0.054293
dm/pred_reward_mean     0.054293
dm/pred_reward_total    0.600544
dtype: float64