In [None]:
# Reinforcement Learning with Fourier/Polynomial State Weighted Q Basis for Homogeneous Multi-Component Maintenance
# By Joseph Wittrock

# Environment/training code adapted from:
# https://pytorch.org/rl/stable/tutorials/torchrl_envs.html
# https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html

# The environment is composed of n components with m condition states. Each component degrades each step according to its degradation transition matrix.
# The actions to the environment are 0 for do nothing, or 1 for repair.
# Only k components can be repaired at a time, and the repair cost is proportional to the number of components repaired.
# If a component is in a failed state, it will have a failure cost no matter what action is taken. (Though I want to change how this works for multiperiod adaptation.)
# The goal is to maximize the reward over a fixed number of steps; the reward is negative for repair costs and failure costs.

# The environment is written using TorchRL and TensorDict for efficient parallel computation on a CUDA-enabled GPU.

# The state space is reduced by considering the distribution of the components over the condition states, rather than the condition state of each component.
# This condenses the observation to m values: the fraction of components in each condition state.

# To generalize the action space we search for an "equitable" policy, i.e. a component in worse condition is always repaired before a component in better condition.
# This reduces the action to the number of components repaired, i.e. k+1 actions (0 through k).

# For large values of k, a traditional deep Q network would not generalize well, as there are too many output neurons (one per action).
# Instead, we use a basis of continuous-valued functions over [0,1], and map the action space via i \mapsto i/k for i in [0,k].
# Note when i=0, no components are repaired, and when i=k, the maximum of k components is repaired, so there are k+1 actions in total.

# The degree of approximation determines the number of output nodes of the neural network; the output is then combined, via a dot product, with the basis functions evaluated at the precomputed action-domain points.
# This allows for scale-free computation for increasing values of the repair constraint.

In [None]:
from typing import Optional

from matplotlib import pyplot as plt # for plotting
import numpy as np # for cpu based computation
import torch # for efficient (gpu) computation and automatic differentiation
from tqdm import tqdm # for progress bars
from tensordict import TensorDict, TensorDictBase # for handling dictionaries of tensors in a pytorch friendly way, e.g. for batched data
from torch import nn # for neural networks
import torch.optim as optim # for optimizers
import torch.nn.functional as F # for activation functions
from torch.utils.tensorboard import SummaryWriter # for logging to tensorboard


# TorchRL
from torchrl.data import BoundedTensorSpec, CompositeSpec, UnboundedContinuousTensorSpec # for defining the shape and type of data [Legacy]
from torchrl.data import Bounded, Composite, Unbounded # for defining the shape and type of data
from torchrl.envs import (
    CatTensors, # Concatenate tensors
    EnvBase, # Tensordict based env
    Transform, # Transform for envs
)
from torchrl.envs.transforms.transforms import _apply_to_composite # for applying a transform to a composite spec
from torchrl.envs.utils import check_env_specs, step_mdp # for checking env specs and stepping through an MDP

# Select the compute device: CUDA when it is available, otherwise the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
print(f"Using device: {device}")
# Newly created tensors default to the selected device from here on.
torch.set_default_device(device)

In [None]:
# ----------------------------------------------------------------------
# HYPERPARAMETERS
# ----------------------------------------------------------------------

# --- Environment ---
N_COMPONENTS = 1000         # components in each environment
MAX_REPAIR_CONSTRAINT = 50  # upper bound on the number of repairs in a single step
EPISODE_LENGTH = 128        # steps per episode
STEP_BATCH_SIZE = 128       # environments executed in parallel
# Cost settings (the fixed per-action costs are currently commented out)
# REPAIR_COST = 10.0   # cost per repair action taken each step
# FAILURE_COST = 20.0  # cost per failed-state (maximum state) component each step
REWARD_BLEND = 0.8          # weight for blending orm costs and utility

# --- Optimization ---
OPTIMIZATION_BATCH_SIZE = EPISODE_LENGTH * 64   # transitions used in each optimization batch
BUFFER_SIZE = STEP_BATCH_SIZE * EPISODE_LENGTH  # transitions held by the replay buffer
OPTIMIZATION_PASSES = 1                         # optimization passes made in each step
GAMMA = 0.98                                    # reward discount factor, in [0, 1]
TAU = 1e-2                                      # soft target-network update rate, in [0, 1]
LR = 1e-2                                       # learning rate of the policy network
N_EPISODES = 100
EPOCHS = EPISODE_LENGTH * N_EPISODES            # total number of training epochs

# --- Entropy (random-action) schedule ---
ENTROPY_START = 0.5    # initial entropy value for random actions
ENTROPY_END = 0.05     # final entropy value for random actions
ENTROPY_DECAY = 1_000  # entropy decay rate

# --- Agent network ---
BASIS = "polynomial"      # alternative: "fourier"
N_DEEP_NODES = 32         # nodes in the deep network
DEGREE_APPROXIMATION = 3  # basis elements generated for the action-value approximation



In [None]:
###### Derived parameters ######

# Evaluate every basis function at each admissible action.
# Action n is mapped to the point n / MAX_REPAIR_CONSTRAINT in [0, 1],
# for n in {0, 1, ..., MAX_REPAIR_CONSTRAINT}.
_basis_degrees = range(DEGREE_APPROXIMATION)
_action_indices = range(MAX_REPAIR_CONSTRAINT + 1)

# Row k holds the k-th monomial evaluated on the action grid.
_polynomial_rows = [
    [(n / MAX_REPAIR_CONSTRAINT) ** k for n in _action_indices]
    for k in _basis_degrees
]
# Row k holds cos(pi * k * a) evaluated on the action grid.
_fourier_rows = [
    [np.cos(np.pi * k * n / MAX_REPAIR_CONSTRAINT) for n in _action_indices]
    for k in _basis_degrees
]

basis_switch = {
    "polynomial": torch.tensor(_polynomial_rows, dtype=torch.float32),
    "fourier": torch.tensor(_fourier_rows, dtype=torch.float32),
}
basis = basis_switch[BASIS]


def get_basis(degree: int, max_repair_constraint: int, basis_type: str = "polynomial") -> torch.Tensor:
    """Evaluate the first ``degree`` basis functions on the normalized action grid.

    The grid consists of the points ``n / max_repair_constraint`` for
    ``n = 0, 1, ..., max_repair_constraint``.

    Args:
        degree: Number of basis functions to evaluate (one row each).
        max_repair_constraint: Largest action index; the grid has
            ``max_repair_constraint + 1`` points.
        basis_type: ``"polynomial"`` gives row ``k`` as ``(n / max_repair_constraint) ** k``;
            ``"fourier"`` gives row ``k`` as ``cos(pi * k * n / max_repair_constraint)``.

    Returns:
        A ``float32`` tensor with one row per basis function and one column
        per action.

    Raises:
        ValueError: If ``basis_type`` is neither ``"polynomial"`` nor ``"fourier"``.
            (Previously an unknown type silently returned ``None``.)
    """
    action_indices = range(max_repair_constraint + 1)
    if basis_type == "polynomial":
        rows = [
            [(n / max_repair_constraint) ** k for n in action_indices]
            for k in range(degree)
        ]
    elif basis_type == "fourier":
        rows = [
            [np.cos(np.pi * k * n / max_repair_constraint) for n in action_indices]
            for k in range(degree)
        ]
    else:
        raise ValueError(
            f"Unknown basis_type {basis_type!r}; expected 'polynomial' or 'fourier'."
        )
    return torch.tensor(rows, dtype=torch.float32)

# Move the selected basis onto the compute device.
basis = basis.to(device)
basis.shape  # mid-cell expression: evaluated but not displayed

# Degradation transition matrix.
# The literal below is transposed before use. Each of its columns sums to 1
# (up to floating-point rounding), so every row of `transition_matrix_degrade` does too.
_degrade_literal = [
    [0.90, 0.00, 0.00, 0.0, 0.0, 0.0],
    [0.10, 0.90, 0.00, 0.0, 0.0, 0.0],
    [0.00, 0.05, 0.80, 0.0, 0.0, 0.0],
    [0.00, 0.05, 0.15, 0.8, 0.0, 0.0],
    [0.00, 0.00, 0.05, 0.1, 0.8, 0.0],
    [0.00, 0.00, 0.00, 0.1, 0.2, 1.0],
]
transition_matrix_degrade = torch.tensor(_degrade_literal, device=device).t()

# Number of discrete condition states per component.
N_CONDITION_STATES = transition_matrix_degrade.size(0)

# Repair matrix: from every state, all probability mass goes to the first state.
transition_matrix_repair = torch.zeros(N_CONDITION_STATES, N_CONDITION_STATES, device=device)
transition_matrix_repair[:, 0] = 1.0

# Pack both matrices into one tensor: index 0 = degrade, index 1 = repair.
transition_matrices = torch.stack((transition_matrix_degrade, transition_matrix_repair), dim=0)

# Reward table builder
def make_r_s_a(repair_cost, failure_cost):
    """Build the (condition state, action) reward table.

    The table has N_CONDITION_STATES rows and two columns and starts at zero.
    Column 1 is set to ``-repair_cost`` for every state except the last one,
    and the whole last row is set to ``-failure_cost`` (both columns), so the
    last state carries the failure cost regardless of the action column.

    Returns:
        Tensor of shape (N_CONDITION_STATES, 2) on ``device``.
    """
    rewards = torch.zeros(N_CONDITION_STATES, 2, device=device)
    rewards[:-1, 1] = -repair_cost
    rewards[-1] = -failure_cost
    return rewards

# Legacy reward table (disabled):
# r_s_a = make_r_s_a(REPAIR_COST, FAILURE_COST)


# Homogeneous setup: every component uses the same (degrade, repair) matrix pair.
# Resulting shape: (N_COMPONENTS, 2, N_CONDITION_STATES, N_CONDITION_STATES).
transition_tensor = transition_matrices.unsqueeze(0).repeat(N_COMPONENTS, 1, 1, 1)
# rsa_tensor = torch.cat([r_s_a for _ in range(N_COMPONENTS)]).reshape(N_COMPONENTS, N_CONDITION_STATES, 2)

# Legacy reward normalization (disabled): the maximum cost was used to scale rewards.
# max_cost = FAILURE_COST * N_COMPONENTS

# rsa_tensor /= max_cost

In [None]:
# Hyperparameter dictionary, so that other scripts can look the settings up by name.

hyperparameters = TensorDict(
    N_COMPONENTS=N_COMPONENTS,
    N_CONDITION_STATES=N_CONDITION_STATES,
    MAX_REPAIR_CONSTRAINT=MAX_REPAIR_CONSTRAINT,
    EPISODE_LENGTH=EPISODE_LENGTH,
    STEP_BATCH_SIZE=STEP_BATCH_SIZE,
    # REPAIR_COST=REPAIR_COST,
    # FAILURE_COST=FAILURE_COST,
    REWARD_BLEND=REWARD_BLEND,
    OPTIMIZATION_BATCH_SIZE=OPTIMIZATION_BATCH_SIZE,
    BUFFER_SIZE=BUFFER_SIZE,
    OPTIMIZATION_PASSES=OPTIMIZATION_PASSES,
    GAMMA=GAMMA,
    TAU=TAU,
    LR=LR,
    N_EPISODES=N_EPISODES,
    EPOCHS=EPOCHS,
    ENTROPY_START=ENTROPY_START,
    ENTROPY_END=ENTROPY_END,
    ENTROPY_DECAY=ENTROPY_DECAY,
    BASIS=BASIS,
    BASIS_DOMAIN=basis,
    N_DEEP_NODES=N_DEEP_NODES,
    DEGREE_APPROXIMATION=DEGREE_APPROXIMATION,
)

# `TensorDict.to` returns the converted TensorDict rather than modifying it in
# place, so the result must be bound back to the name; the bare call used
# previously discarded the conversion.
hyperparameters = hyperparameters.to(device)

# hyperparameters

In [None]:
from maitenance_util_cost_env import DiscreteMaitenanceEnv, gen_params,  load_dynamics, generate_maitenance_env, reset_maitenance_env

# Per-component utility values: N_COMPONENTS evenly spaced points from 0 to 1.
utility = torch.linspace(start=0, end=1, steps=N_COMPONENTS, device=device)
# One cost entry per admissible repair count (0 .. MAX_REPAIR_CONSTRAINT), evenly spaced from 0 to 1.
orm_costs = torch.linspace(start=0, end=1, steps=MAX_REPAIR_CONSTRAINT + 1, device=device)

# Hand the dynamics, costs, utilities and hyperparameters to the environment module.
load_dynamics(transition_tensor, orm_costs, utility, hyperparameters)

# Build the environment from the loaded hyperparameters and transition/reward tensors.
env = generate_maitenance_env(device=device)


In [None]:
# Tuned settings per model family, applied to `hyperparameters` before each sweep below.

# Linear
linear_best_lr = 0.013366665691137314

# Linear AK (polynomial basis)
linear_AK_best_lr = 0.006733333226293325
linear_AK_best_degree = 3

# Linear AK (Fourier basis)
linear_AK_fourier_best_lr = 0.019170833751559258
linear_AK_fourier_best_degree = 8

# Deep Q
DQ_best_lr = 0.011708333157002926
DQ_best_deep = 24

# Deep Q AK (polynomial basis)
DQAK_best_lr = 0.007562499959021807
DQAK_best_degree = 3
DQAK_best_deep = 48

# Deep Q AK (Fourier basis)
DQAK_forier_best_lr = 0.00507500022649765
DQAK_forier_best_degree = 5
DQAK_forier_best_deep = 128

# Sample observation: six entries (one per condition state) that sum to 1.
test_observation = torch.tensor(
    [0.4290, 0.1980, 0.1290, 0.1580, 0.0610, 0.0250],
    device=device,
)

In [None]:
from deep_q_target_training import MaitenanceDQBNTrainer

# Reward-blend weights swept by every experiment below: 20 evenly spaced values from 0 to 1.
reward_blend_range = torch.linspace(start=0, end=1, steps=20, device=device)

In [None]:
# Linear baseline: train one model per reward-blend weight and benchmark it.
hyperparameters["LR"] = linear_best_lr

# One benchmark result per blend weight.
linear_utility = torch.zeros_like(reward_blend_range)
linear_orm_costs = torch.zeros_like(reward_blend_range)

for blend_idx, reward_blend in enumerate(reward_blend_range):
    print("test:", blend_idx)
    hyperparameters["REWARD_BLEND"] = reward_blend
    dqt_linear = MaitenanceDQBNTrainer(hyperparameters, env)
    dqt_linear.create_linear_models()
    dqt_linear.train()

    blend_utility, blend_cost = dqt_linear.benchmark_UC(episode_length=300)
    linear_utility[blend_idx] = blend_utility
    linear_orm_costs[blend_idx] = blend_cost


# Row 0: utility, row 1: orm cost.
linear_rewards = torch.stack([linear_utility, linear_orm_costs], dim=0)

# Persist the sweep results.
torch.save(linear_rewards, "linear_rewards.pt")

In [None]:
# Cost/utility scatter for the linear baseline sweep.
fig, ax = plt.subplots()
ax.scatter(linear_orm_costs.cpu().numpy(), linear_utility.cpu().numpy())
ax.set_xlabel("Cost")
ax.set_ylabel("Utility")
ax.set_title("Linear Approximation")
plt.show()

In [None]:
# Linear SWQB (formerly "linear AK") with a polynomial basis:
# train one model per reward-blend weight and benchmark it.
hyperparameters["LR"] = linear_AK_best_lr
hyperparameters["DEGREE_APPROXIMATION"] = linear_AK_best_degree
hyperparameters["BASIS_DOMAIN"] = get_basis(
    linear_AK_best_degree, MAX_REPAIR_CONSTRAINT, basis_type="polynomial"
)

# One benchmark result per blend weight.
linear_AK_utility = torch.zeros_like(reward_blend_range)
linear_AK_orm_costs = torch.zeros_like(reward_blend_range)

for blend_idx, reward_blend in enumerate(reward_blend_range):
    print("test:", blend_idx)
    hyperparameters["REWARD_BLEND"] = reward_blend
    dqt_linear_AK = MaitenanceDQBNTrainer(hyperparameters, env)
    dqt_linear_AK.create_linear_AK_models()
    dqt_linear_AK.train()

    blend_utility, blend_cost = dqt_linear_AK.benchmark_UC(episode_length=300)
    linear_AK_utility[blend_idx] = blend_utility
    linear_AK_orm_costs[blend_idx] = blend_cost


# Row 0: utility, row 1: orm cost.
linear_AK_rewards = torch.stack([linear_AK_utility, linear_AK_orm_costs], dim=0)

# Persist the sweep results.
torch.save(linear_AK_rewards, "linear_AK_rewards.pt")

In [None]:
# Cost/utility trade-off of the linear baseline versus the linear SWQB (polynomial) model.
# Labels, axis names and a legend are required to tell the two curves apart.
plt.plot(linear_orm_costs.cpu().numpy(), linear_utility.cpu().numpy(), label="Linear")
plt.plot(linear_AK_orm_costs.cpu().numpy(), linear_AK_utility.cpu().numpy(), label="Linear SWQB")
plt.xlabel("Cost")
plt.ylabel("Utility")
plt.title("Linear vs. Linear SWQB")
plt.legend()
plt.show()

In [None]:
# Linear SWQB ("linear AK") with a Fourier basis:
# train one model per reward-blend weight and benchmark it.
# NOTE(review): hyperparameters["BASIS"] is not updated here and keeps its earlier value;
# confirm the trainer reads only BASIS_DOMAIN.
hyperparameters["LR"] = linear_AK_fourier_best_lr
hyperparameters["DEGREE_APPROXIMATION"] = linear_AK_fourier_best_degree
hyperparameters["BASIS_DOMAIN"] = get_basis(
    linear_AK_fourier_best_degree, MAX_REPAIR_CONSTRAINT, basis_type="fourier"
)

# One benchmark result per blend weight.
linear_AK_fourier_utility = torch.zeros_like(reward_blend_range)
linear_AK_fourier_orm_costs = torch.zeros_like(reward_blend_range)

for blend_idx, reward_blend in enumerate(reward_blend_range):
    print("test:", blend_idx)
    hyperparameters["REWARD_BLEND"] = reward_blend
    dqt_linear_AK_fourier = MaitenanceDQBNTrainer(hyperparameters, env)
    dqt_linear_AK_fourier.create_linear_AK_models()
    dqt_linear_AK_fourier.train()

    blend_utility, blend_cost = dqt_linear_AK_fourier.benchmark_UC(episode_length=300)
    linear_AK_fourier_utility[blend_idx] = blend_utility
    linear_AK_fourier_orm_costs[blend_idx] = blend_cost


# Row 0: utility, row 1: orm cost.
linear_AK_fourier_rewards = torch.stack(
    [linear_AK_fourier_utility, linear_AK_fourier_orm_costs], dim=0
)

# Persist the sweep results.
torch.save(linear_AK_fourier_rewards, "linear_AK_fourier_rewards.pt")

In [None]:

# Cost/utility trade-off of all three linear variants.
# Each curve is labelled and a legend is drawn so the curves can be told apart;
# the title now covers all three models instead of only the polynomial one.
plt.plot(linear_orm_costs.cpu().numpy(), linear_utility.cpu().numpy(), label="Linear")
plt.plot(linear_AK_orm_costs.cpu().numpy(), linear_AK_utility.cpu().numpy(), label="Linear SWQB")
plt.plot(linear_AK_fourier_orm_costs.cpu().numpy(), linear_AK_fourier_utility.cpu().numpy(), label="Linear SWQB Fourier")
plt.xlabel("Cost")
plt.ylabel("Utility")
plt.title("Linear, Linear SWQB Polynomial and Fourier Approximations")
plt.legend()
plt.show()

In [17]:
# Deep Q network: train one model per reward-blend weight and benchmark it.
hyperparameters["LR"] = DQ_best_lr
hyperparameters["N_DEEP_NODES"] = DQ_best_deep

# One benchmark result per blend weight.
DQ_utility = torch.zeros_like(reward_blend_range)
DQ_orm_costs = torch.zeros_like(reward_blend_range)

for blend_idx, reward_blend in enumerate(reward_blend_range):
    print("test:", blend_idx)
    hyperparameters["REWARD_BLEND"] = reward_blend
    dqt_DQ = MaitenanceDQBNTrainer(hyperparameters, env)
    dqt_DQ.create_models()
    dqt_DQ.train()

    blend_utility, blend_cost = dqt_DQ.benchmark_UC(episode_length=300)
    DQ_utility[blend_idx] = blend_utility
    DQ_orm_costs[blend_idx] = blend_cost


# (The per-model scatter plot is omitted here; the combined plots at the end
# of the notebook include the Deep Q results.)

# Row 0: utility, row 1: orm cost.
DQ_rewards = torch.stack([DQ_utility, DQ_orm_costs], dim=0)

# Persist the sweep results.
torch.save(DQ_rewards, "DQ_rewards.pt")

test: 0


entropy: 0.05: 100%|██████████| 12800/12800 [01:02<00:00, 205.73it/s]


test: 1


entropy: 0.05: 100%|██████████| 12800/12800 [01:01<00:00, 207.70it/s]


test: 2


entropy: 0.05: 100%|██████████| 12800/12800 [01:01<00:00, 207.64it/s]


test: 3


entropy: 0.05: 100%|██████████| 12800/12800 [01:01<00:00, 207.01it/s]


test: 4


entropy: 0.05: 100%|██████████| 12800/12800 [01:01<00:00, 207.02it/s]


test: 5


entropy: 0.05: 100%|██████████| 12800/12800 [01:01<00:00, 207.93it/s]


test: 6


entropy: 0.05: 100%|██████████| 12800/12800 [01:01<00:00, 207.29it/s]


test: 7


entropy: 0.05: 100%|██████████| 12800/12800 [01:01<00:00, 207.48it/s]


test: 8


entropy: 0.05: 100%|██████████| 12800/12800 [01:01<00:00, 206.76it/s]


test: 9


entropy: 0.05: 100%|██████████| 12800/12800 [01:02<00:00, 205.71it/s]


test: 10


entropy: 0.05: 100%|██████████| 12800/12800 [01:02<00:00, 205.47it/s]


test: 11


entropy: 0.05: 100%|██████████| 12800/12800 [01:02<00:00, 205.98it/s]


test: 12


entropy: 0.05: 100%|██████████| 12800/12800 [01:02<00:00, 205.56it/s]


test: 13


entropy: 0.05: 100%|██████████| 12800/12800 [01:02<00:00, 206.04it/s]


test: 14


entropy: 0.05: 100%|██████████| 12800/12800 [01:02<00:00, 205.75it/s]


test: 15


entropy: 0.05: 100%|██████████| 12800/12800 [01:02<00:00, 205.56it/s]


test: 16


entropy: 0.05: 100%|██████████| 12800/12800 [01:02<00:00, 206.20it/s]


test: 17


entropy: 0.05: 100%|██████████| 12800/12800 [01:02<00:00, 205.85it/s]


test: 18


entropy: 0.05: 100%|██████████| 12800/12800 [01:01<00:00, 206.73it/s]


test: 19


entropy: 0.05: 100%|██████████| 12800/12800 [01:02<00:00, 205.40it/s]


In [19]:
# Deep SWQB with a polynomial basis: train one model per reward-blend weight and benchmark it.
hyperparameters["LR"] = DQAK_best_lr
hyperparameters["DEGREE_APPROXIMATION"] = DQAK_best_degree
hyperparameters["N_DEEP_NODES"] = DQAK_best_deep
hyperparameters["BASIS_DOMAIN"] = get_basis(
    DQAK_best_degree, MAX_REPAIR_CONSTRAINT, basis_type="polynomial"
)

# One benchmark result per blend weight.
DQAK_utility = torch.zeros_like(reward_blend_range)
DQAK_orm_costs = torch.zeros_like(reward_blend_range)

for blend_idx, reward_blend in enumerate(reward_blend_range):
    print("test:", blend_idx)
    hyperparameters["REWARD_BLEND"] = reward_blend
    dqt_DQAK = MaitenanceDQBNTrainer(hyperparameters, env)
    dqt_DQAK.create_AK_models()
    dqt_DQAK.train()

    blend_utility, blend_cost = dqt_DQAK.benchmark_UC(episode_length=300)
    DQAK_utility[blend_idx] = blend_utility
    DQAK_orm_costs[blend_idx] = blend_cost


# (The per-model scatter plot is omitted here; the combined plots at the end
# of the notebook include the Deep SWQB results.)

# Row 0: utility, row 1: orm cost.
DQAK_rewards = torch.stack([DQAK_utility, DQAK_orm_costs], dim=0)

# Persist the sweep results.
torch.save(DQAK_rewards, "DQAK_rewards.pt")

test: 0


entropy: 0.05: 100%|██████████| 12800/12800 [01:01<00:00, 207.68it/s]


test: 1


entropy: 0.05: 100%|██████████| 12800/12800 [01:01<00:00, 207.42it/s]


test: 2


entropy: 0.05: 100%|██████████| 12800/12800 [01:02<00:00, 206.39it/s]


test: 3


entropy: 0.05: 100%|██████████| 12800/12800 [01:02<00:00, 204.53it/s]


test: 4


entropy: 0.05: 100%|██████████| 12800/12800 [01:02<00:00, 205.84it/s]


test: 5


entropy: 0.05: 100%|██████████| 12800/12800 [01:02<00:00, 204.33it/s]


test: 6


entropy: 0.05: 100%|██████████| 12800/12800 [01:02<00:00, 203.97it/s]


test: 7


entropy: 0.05: 100%|██████████| 12800/12800 [01:02<00:00, 203.62it/s]


test: 8


entropy: 0.05: 100%|██████████| 12800/12800 [01:02<00:00, 203.94it/s]


test: 9


entropy: 0.05: 100%|██████████| 12800/12800 [01:03<00:00, 202.43it/s]


test: 10


entropy: 0.05: 100%|██████████| 12800/12800 [01:03<00:00, 202.91it/s]


test: 11


entropy: 0.05: 100%|██████████| 12800/12800 [01:03<00:00, 202.46it/s]


test: 12


entropy: 0.05: 100%|██████████| 12800/12800 [01:03<00:00, 203.01it/s]


test: 13


entropy: 0.05: 100%|██████████| 12800/12800 [01:02<00:00, 203.83it/s]


test: 14


entropy: 0.05: 100%|██████████| 12800/12800 [01:03<00:00, 202.23it/s]


test: 15


entropy: 0.05: 100%|██████████| 12800/12800 [01:02<00:00, 203.24it/s]


test: 16


entropy: 0.05: 100%|██████████| 12800/12800 [01:03<00:00, 203.10it/s]


test: 17


entropy: 0.05: 100%|██████████| 12800/12800 [01:03<00:00, 202.83it/s]


test: 18


entropy: 0.05: 100%|██████████| 12800/12800 [01:02<00:00, 203.18it/s]


test: 19


entropy: 0.05: 100%|██████████| 12800/12800 [01:03<00:00, 202.43it/s]


In [20]:
# Deep SWQB with a Fourier basis: train one model per reward-blend weight and benchmark it.
# NOTE(review): hyperparameters["BASIS"] is not updated here and keeps its earlier value;
# confirm the trainer reads only BASIS_DOMAIN.
hyperparameters["LR"] = DQAK_forier_best_lr
hyperparameters["DEGREE_APPROXIMATION"] = DQAK_forier_best_degree
hyperparameters["N_DEEP_NODES"] = DQAK_forier_best_deep
hyperparameters["BASIS_DOMAIN"] = get_basis(
    DQAK_forier_best_degree, MAX_REPAIR_CONSTRAINT, basis_type="fourier"
)

# One benchmark result per blend weight.
DQAK_forier_utility = torch.zeros_like(reward_blend_range)
DQAK_forier_orm_costs = torch.zeros_like(reward_blend_range)

for blend_idx, reward_blend in enumerate(reward_blend_range):
    print("test:", blend_idx)
    hyperparameters["REWARD_BLEND"] = reward_blend
    dqt_DQAK_forier = MaitenanceDQBNTrainer(hyperparameters, env)
    dqt_DQAK_forier.create_AK_models()
    dqt_DQAK_forier.train()

    blend_utility, blend_cost = dqt_DQAK_forier.benchmark_UC(episode_length=300)
    DQAK_forier_utility[blend_idx] = blend_utility
    DQAK_forier_orm_costs[blend_idx] = blend_cost


# (The per-model scatter plot is omitted here; the combined plots at the end
# of the notebook include the Deep SWQB Fourier results.)

# Row 0: utility, row 1: orm cost.
DQAK_forier_rewards = torch.stack([DQAK_forier_utility, DQAK_forier_orm_costs], dim=0)

# Persist the sweep results.
torch.save(DQAK_forier_rewards, "DQAK_forier_rewards.pt")

test: 0


entropy: 0.05: 100%|██████████| 12800/12800 [01:02<00:00, 203.32it/s]


test: 1


entropy: 0.05: 100%|██████████| 12800/12800 [01:02<00:00, 203.96it/s]


test: 2


entropy: 0.05: 100%|██████████| 12800/12800 [01:02<00:00, 203.22it/s]


test: 3


entropy: 0.05: 100%|██████████| 12800/12800 [01:02<00:00, 203.39it/s]


test: 4


entropy: 0.05: 100%|██████████| 12800/12800 [01:02<00:00, 203.69it/s]


test: 5


entropy: 0.05: 100%|██████████| 12800/12800 [01:02<00:00, 203.78it/s]


test: 6


entropy: 0.05: 100%|██████████| 12800/12800 [01:02<00:00, 203.42it/s]


test: 7


entropy: 0.05: 100%|██████████| 12800/12800 [01:02<00:00, 203.84it/s]


test: 8


entropy: 0.05: 100%|██████████| 12800/12800 [01:02<00:00, 203.68it/s]


test: 9


entropy: 0.05: 100%|██████████| 12800/12800 [01:03<00:00, 202.60it/s]


test: 10


entropy: 0.05: 100%|██████████| 12800/12800 [01:03<00:00, 203.02it/s]


test: 11


entropy: 0.05: 100%|██████████| 12800/12800 [01:02<00:00, 203.25it/s]


test: 12


entropy: 0.05: 100%|██████████| 12800/12800 [01:03<00:00, 202.17it/s]


test: 13


entropy: 0.05: 100%|██████████| 12800/12800 [01:03<00:00, 202.48it/s]


test: 14


entropy: 0.05: 100%|██████████| 12800/12800 [01:03<00:00, 203.08it/s]


test: 15


entropy: 0.05: 100%|██████████| 12800/12800 [01:03<00:00, 201.90it/s]


test: 16


entropy: 0.05: 100%|██████████| 12800/12800 [01:03<00:00, 201.67it/s]


test: 17


entropy: 0.05: 100%|██████████| 12800/12800 [01:03<00:00, 201.53it/s]


test: 18


entropy: 0.05: 100%|██████████| 12800/12800 [01:04<00:00, 198.17it/s]


test: 19


entropy: 0.05: 100%|██████████| 12800/12800 [01:01<00:00, 209.46it/s]


In [None]:
# Cost/utility curves of all six models.
# Each curve is labelled and a legend is drawn; without them the six lines
# cannot be told apart.
plt.plot(linear_orm_costs.cpu().numpy(), linear_utility.cpu().numpy(), label="Linear")
plt.plot(linear_AK_orm_costs.cpu().numpy(), linear_AK_utility.cpu().numpy(), label="Linear SWQB")
plt.plot(linear_AK_fourier_orm_costs.cpu().numpy(), linear_AK_fourier_utility.cpu().numpy(), label="Linear SWQB Fourier")
plt.plot(DQ_orm_costs.cpu().numpy(), DQ_utility.cpu().numpy(), label="Deep Q")
plt.plot(DQAK_orm_costs.cpu().numpy(), DQAK_utility.cpu().numpy(), label="Deep SWQB")
plt.plot(DQAK_forier_orm_costs.cpu().numpy(), DQAK_forier_utility.cpu().numpy(), label="Deep SWQB Fourier")
plt.xlabel("Cost")
plt.ylabel("Utility")
plt.title("Cost vs. Utility for All Models")
plt.legend()
plt.show()

In [None]:
# Plot all Pareto fronts (one scatter series per model).
# The Linear SWQB Fourier sweep was computed above but missing from this figure;
# it is included so the plot really shows all models. `plt` is already imported
# at the top of the notebook, so no import is needed here.
plt.scatter(linear_orm_costs.cpu().numpy(), linear_utility.cpu().numpy(), label="Linear")
plt.scatter(linear_AK_orm_costs.cpu().numpy(), linear_AK_utility.cpu().numpy(), label="Linear SWQB")
plt.scatter(linear_AK_fourier_orm_costs.cpu().numpy(), linear_AK_fourier_utility.cpu().numpy(), label="Linear SWQB Fourier")
plt.scatter(DQ_orm_costs.cpu().numpy(), DQ_utility.cpu().numpy(), label="Deep Q")
plt.scatter(DQAK_orm_costs.cpu().numpy(), DQAK_utility.cpu().numpy(), label="Deep SWQB")
plt.scatter(DQAK_forier_orm_costs.cpu().numpy(), DQAK_forier_utility.cpu().numpy(), label="Deep SWQB Fourier")
plt.xlabel("Cost")
plt.ylabel("Utility")
plt.title("Pareto Fronts")
plt.legend()
plt.show()

In [None]:
# Plot all Pareto fronts, zoomed in on the high-cost / high-utility corner.
# The Linear SWQB Fourier sweep was computed above but missing from this figure;
# it is included so the plot really shows all models.
plt.scatter(linear_orm_costs.cpu().numpy(), linear_utility.cpu().numpy(), label="Linear")
plt.scatter(linear_AK_orm_costs.cpu().numpy(), linear_AK_utility.cpu().numpy(), label="Linear SWQB")
plt.scatter(linear_AK_fourier_orm_costs.cpu().numpy(), linear_AK_fourier_utility.cpu().numpy(), label="Linear SWQB Fourier")
plt.scatter(DQ_orm_costs.cpu().numpy(), DQ_utility.cpu().numpy(), label="Deep Q")
plt.scatter(DQAK_orm_costs.cpu().numpy(), DQAK_utility.cpu().numpy(), label="Deep SWQB")
plt.scatter(DQAK_forier_orm_costs.cpu().numpy(), DQAK_forier_utility.cpu().numpy(), label="Deep SWQB Fourier")
plt.xlabel("Cost")
plt.ylabel("Utility")
plt.title("Pareto Fronts")
plt.legend()
# Restrict the view to utility in [0.5, 1.0] and cost in [0.8, 1.0].
plt.ylim(0.5, 1.0)
plt.xlim(0.8, 1.0)
plt.show()