# Combined Oracle Bandits

Analyze the performance of various oracle bandits that solve the combined activity and matching task.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import random 
import matplotlib.pyplot as plt
import json 
import argparse 
import sys
from openrl.envs.common import make
from gymnasium.envs.registration import register

In [3]:
from rmab.simulator import RMABSimulator
from rmab.baselines import optimal_whittle,  optimal_q_iteration, optimal_whittle_sufficient, greedy_policy, random_policy, greedy_iterative_policy, mcts_policy
from rmab.fr_dynamics import get_all_transitions
from rmab.compute_whittle import arm_value_iteration_exponential
from rmab.utils import get_save_path, delete_duplicate_results, filter_pareto_optimal, is_pareto_optimal


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# True when the 'ipykernel' module has been loaded into this process; the
# configuration cell uses this flag to pick inline defaults over argparse.
is_jupyter = 'ipykernel' in sys.modules

In [55]:
if is_jupyter:
    # Interactive run: fixed settings for a three-arm instance.
    seed, n_arms, budget = 42, 3, 2
    discount, alpha = 0.99, 3
    n_episodes, episode_len, n_epochs = 300, 20, 10
    match_prob = 0.5
    save_with_date = False
    # The save name embeds the arm count chosen above.
    save_name = 'heterogenous_arms_{}'.format(n_arms)
else:
    # Script run: the same settings are read from the command line.
    parser = argparse.ArgumentParser()
    parser.add_argument('--n_arms', '-N', type=int, default=8,
                        help='num beneficiaries (arms)')
    parser.add_argument('--episode_len', '-H', type=int, default=20,
                        help='episode length')
    parser.add_argument('--n_episodes', '-T', type=int, default=30,
                        help='num episodes')
    parser.add_argument('--budget', '-B', type=int, default=3,
                        help='budget')
    parser.add_argument('--n_epochs', '-E', type=int, default=10,
                        help='number of epochs (num_repeats)')
    parser.add_argument('--discount', '-d', type=float, default=0.9,
                        help='discount factor')
    parser.add_argument('--alpha', '-a', type=float, default=3,
                        help='alpha: for conf radius')
    parser.add_argument('--seed', '-s', type=int, default=42,
                        help='random seed')
    parser.add_argument('--save_name', '-n', type=str, default='combined_lamb',
                        help='save name')
    parser.add_argument('--match_prob', '-m', type=float, default=0.5,
                        help='match probability')
    parser.add_argument('--use_date', action='store_true')

    args = parser.parse_args()

    # Unpack the parsed namespace into the module-level names used by later cells.
    seed = args.seed
    n_arms = args.n_arms
    budget = args.budget
    discount = args.discount
    alpha = args.alpha
    n_episodes = args.n_episodes
    episode_len = args.episode_len
    n_epochs = args.n_epochs
    match_prob = args.match_prob
    save_name = args.save_name
    save_with_date = args.use_date



In [56]:
# Sizes of the per-arm state and action spaces. n_states is passed to the
# simulator as number_states; n_actions is not referenced in the visible cells.
n_states = 2
n_actions = 2

In [57]:
# Build a pool of transition arrays via get_all_transitions.
# NOTE(review): both names are reassigned by the hand-built three-arm cell
# further down, so this pool only feeds the shape check that follows.
all_population_size = 100 # number of random arms to generate
all_transitions = get_all_transitions(all_population_size)

In [58]:
# Display the shape of the transition array as a quick sanity check.
all_transitions.shape

(100, 2, 2, 2)

In [68]:
# Hand-built transition array that replaces the pool generated earlier.
# Alternative setting kept for reference: x1 = x2 = 0.5, x3 = 7/8
x1, x2, x3 = 0.5, 0.57, 0.57

# Index layout is presumably [arm, state, action, next_state] -- TODO confirm
# against RMABSimulator.
all_transitions = np.zeros((n_arms,2,2,2))

for s in range(2):
    # Third index 0: every arm goes to next-state 0 with probability 1.
    all_transitions[:,s,0] = [1, 0]

    # Third index 1: arm k reaches next-state 1 with probability x_k,
    # regardless of the current state s.
    all_transitions[0,s,1] = [1-x1, x1]
    all_transitions[1,s,1] = [1-x2, x2]
    all_transitions[2,s,1] = [1-x3, x3]

# NOTE(review): only arms 0, 1 and 2 receive non-zero rows for third index 1,
# so this cell is only consistent when n_arms == 3.
all_population_size = n_arms

In [69]:
# One integer feature per arm in the population: 0 .. all_population_size-1.
all_features = np.arange(all_population_size)

In [70]:
# Draw one match probability per arm in the population.
# The values come from the stdlib `random` module, so that generator must be
# seeded for the draw to be reproducible; seeding NumPy alone has no effect on
# random.random(). NumPy is still seeded to keep both generators in step.
np.random.seed(seed)
random.seed(seed)
match_probabilities = [random.random() for _ in range(all_population_size)]

In [71]:
# Hand-set per-arm match probabilities; this replaces the randomly drawn list
# from the previous cell. The list has three entries, one per arm of the
# three-arm instance.
# Alternative setting kept for reference:
# match_probabilities = [1,1,0]
match_probabilities = [1,0.75,0.75]

In [72]:
# Reseed both generators right before building the simulator.
random.seed(seed)
np.random.seed(seed)
simulator = RMABSimulator(
    all_population_size,
    all_features,
    all_transitions,
    n_arms,
    episode_len,
    n_epochs,
    n_episodes,
    budget,
    number_states=n_states,
    reward_style='match',
    match_probability=match_prob,
)
# Install the hand-set per-arm match probabilities on the simulator.
simulator.match_probability_list = match_probabilities

cohort [0 1 2]
cohort [0 1 2]
cohort [2 0 1]
cohort [2 1 0]
cohort [1 0 2]
cohort [2 1 0]
cohort [2 1 0]
cohort [0 1 2]
cohort [0 1 2]
cohort [0 2 1]


In [73]:
# Force every row of the cohort table to be the identity ordering
# [0, 1, ..., n_arms-1] instead of the shuffled cohorts drawn at construction.
# The row count follows n_epochs (the value passed to RMABSimulator in the
# instance-count position) rather than a hard-coded 10, so the table keeps the
# right height when --n_epochs is changed.
simulator.cohort_selection = np.tile(np.arange(n_arms), (n_epochs, 1))
simulator.cohort_selection

array([[0, 1, 2],
       [0, 1, 2],
       [0, 1, 2],
       [0, 1, 2],
       [0, 1, 2],
       [0, 1, 2],
       [0, 1, 2],
       [0, 1, 2],
       [0, 1, 2],
       [0, 1, 2]])

In [74]:
# Suppress all logging calls of severity CRITICAL and below for the rest of
# the session, so library log records do not flood the cell outputs.
import logging
logging.disable(logging.CRITICAL)

In [75]:
# Grid of lamb values swept below: each raw weight is divided by the number of arms.
lamb_list = [weight / n_arms for weight in (0, 1, 2, 4, 6, 8, 12, 16, 24, 32, 48, 64)]

## Heterogeneous Match Probability

In [35]:
if is_jupyter:
    # Single run of greedy_policy on the 'combined' reward with lamb = 1.
    lamb = 1
    random.seed(seed)
    np.random.seed(seed)
    greedy_reward = greedy_policy(
        simulator, n_episodes, n_epochs, discount,
        reward_function='combined', lamb=lamb,
    )
    # simulator.total_active is read right after the run and normalised by
    # (number of reward entries * number of arms).
    greedy_active_rate = simulator.total_active / (greedy_reward.size * n_arms)
    # Report mean reward plus lamb * n_arms * active rate.
    print(np.mean(greedy_reward) + lamb * n_arms * greedy_active_rate)

first state [1 1 1]
instance 0, ep 1, state [0 0 0]
instance 0, ep 2, state [1 1 1]
instance 0, ep 3, state [0 1 1]
instance 0, ep 4, state [0 0 0]
instance 0, ep 5, state [0 1 0]
instance 0, ep 6, state [0 1 0]
instance 0, ep 7, state [0 0 0]
instance 0, ep 8, state [1 0 1]
instance 0, ep 9, state [1 0 1]
instance 0, ep 10, state [0 0 1]
instance 0, ep 11, state [1 1 0]
instance 0, ep 12, state [0 1 0]
instance 0, ep 13, state [0 0 0]
instance 0, ep 14, state [1 0 1]
instance 0, ep 15, state [0 1 1]
instance 0, ep 16, state [0 1 1]
instance 0, ep 17, state [1 1 1]
instance 0, ep 18, state [1 0 0]
instance 0, ep 19, state [0 0 0]
instance 0, ep 20, state [0 1 0]
instance 0, ep 21, state [0 1 0]
instance 0, ep 22, state [1 0 1]
instance 0, ep 23, state [1 0 0]
instance 0, ep 24, state [1 1 1]
instance 0, ep 25, state [1 0 0]
instance 0, ep 26, state [0 1 1]
instance 0, ep 27, state [0 0 0]
instance 0, ep 28, state [0 1 1]
instance 0, ep 29, state [1 0 0]
first state [1 1 1]
instance 1, 

In [36]:
if is_jupyter:
    # Single run of greedy_iterative_policy on the 'combined' reward with lamb = 1.
    lamb = 1
    random.seed(seed)
    np.random.seed(seed)
    greedy_iterative_reward = greedy_iterative_policy(
        simulator, n_episodes, n_epochs, discount,
        reward_function='combined', lamb=lamb,
    )
    # simulator.total_active is read right after the run and normalised by
    # (number of reward entries * number of arms).
    greedy_iterative_active_rate = simulator.total_active / (greedy_iterative_reward.size * n_arms)
    # Report mean reward plus lamb * n_arms * active rate.
    print(np.mean(greedy_iterative_reward) + lamb * n_arms * greedy_iterative_active_rate)

first state [1 1 1]
instance 0, ep 1, state [0 0 0]
instance 0, ep 2, state [1 1 1]
instance 0, ep 3, state [0 1 1]
instance 0, ep 4, state [0 0 0]
instance 0, ep 5, state [0 1 0]
instance 0, ep 6, state [0 1 0]
instance 0, ep 7, state [0 0 0]
instance 0, ep 8, state [1 0 1]
instance 0, ep 9, state [1 0 1]
instance 0, ep 10, state [0 0 1]
instance 0, ep 11, state [1 1 0]
instance 0, ep 12, state [0 1 0]
instance 0, ep 13, state [0 0 0]
instance 0, ep 14, state [1 0 1]
instance 0, ep 15, state [0 1 1]
instance 0, ep 16, state [0 1 1]
instance 0, ep 17, state [1 1 1]
instance 0, ep 18, state [1 0 0]
instance 0, ep 19, state [0 0 0]
instance 0, ep 20, state [0 1 0]
instance 0, ep 21, state [0 1 0]
instance 0, ep 22, state [1 0 1]
instance 0, ep 23, state [1 0 0]
instance 0, ep 24, state [1 1 1]
instance 0, ep 25, state [1 0 0]
instance 0, ep 26, state [0 1 1]
instance 0, ep 27, state [0 0 0]
instance 0, ep 28, state [0 1 1]
instance 0, ep 29, state [1 0 0]
first state [1 1 1]
instance 1, 

In [76]:
if is_jupyter:
    # Single run of greedy_iterative_policy with use_Q=True, 'combined' reward, lamb = 1.
    lamb = 1
    random.seed(seed)
    np.random.seed(seed)
    greedy_iterative_q_reward = greedy_iterative_policy(
        simulator, n_episodes, n_epochs, discount,
        reward_function='combined', lamb=lamb, use_Q=True,
    )
    # simulator.total_active is read right after the run and normalised by
    # (number of reward entries * number of arms).
    greedy_iterative_q_active_rate = simulator.total_active / (greedy_iterative_q_reward.size * n_arms)
    # Report mean reward plus lamb * n_arms * active rate.
    print(np.mean(greedy_iterative_q_reward) + lamb * n_arms * greedy_iterative_q_active_rate)

first state [1 1 1]
instance 0, ep 1, state [0 0 0]
instance 0, ep 2, state [1 1 1]
instance 0, ep 3, state [0 1 1]
instance 0, ep 4, state [0 0 0]
instance 0, ep 5, state [0 1 0]
instance 0, ep 6, state [0 1 0]
instance 0, ep 7, state [0 0 0]
instance 0, ep 8, state [1 0 1]
instance 0, ep 9, state [1 0 1]
instance 0, ep 10, state [0 0 1]
instance 0, ep 11, state [1 1 0]
instance 0, ep 12, state [0 1 0]
instance 0, ep 13, state [0 0 0]
instance 0, ep 14, state [1 0 1]
instance 0, ep 15, state [0 1 1]
instance 0, ep 16, state [0 1 1]
instance 0, ep 17, state [1 1 1]
instance 0, ep 18, state [1 0 0]
instance 0, ep 19, state [0 0 0]
instance 0, ep 20, state [0 1 0]
instance 0, ep 21, state [0 1 0]
instance 0, ep 22, state [1 0 1]
instance 0, ep 23, state [1 0 0]
instance 0, ep 24, state [1 1 1]
instance 0, ep 25, state [1 0 0]
instance 0, ep 26, state [0 1 1]
instance 0, ep 27, state [0 0 0]
instance 0, ep 28, state [0 1 1]
instance 0, ep 29, state [1 0 0]
instance 0, ep 30, state [1 1 1]

In [38]:
if is_jupyter:
    # Single run of greedy_iterative_policy with use_shapley=True, 'combined' reward, lamb = 1.
    lamb = 1
    random.seed(seed)
    np.random.seed(seed)
    greedy_iterative_shapley_reward = greedy_iterative_policy(
        simulator, n_episodes, n_epochs, discount,
        reward_function='combined', lamb=lamb, use_shapley=True,
    )
    # simulator.total_active is read right after the run and normalised by
    # (number of reward entries * number of arms).
    greedy_iterative_shapley_active_rate = simulator.total_active / (greedy_iterative_shapley_reward.size * n_arms)
    # Report mean reward plus lamb * n_arms * active rate.
    print(np.mean(greedy_iterative_shapley_reward) + lamb * n_arms * greedy_iterative_shapley_active_rate)

first state [1 1 1]
instance 0, ep 1, state [0 0 0]
instance 0, ep 2, state [1 1 1]
instance 0, ep 3, state [0 1 1]
instance 0, ep 4, state [0 0 0]
instance 0, ep 5, state [0 1 0]
instance 0, ep 6, state [0 1 0]
instance 0, ep 7, state [0 0 0]
instance 0, ep 8, state [1 0 1]
instance 0, ep 9, state [1 0 1]
instance 0, ep 10, state [0 0 1]
instance 0, ep 11, state [1 1 0]
instance 0, ep 12, state [0 1 0]
instance 0, ep 13, state [0 0 0]
instance 0, ep 14, state [1 0 1]
instance 0, ep 15, state [0 1 1]
instance 0, ep 16, state [0 1 1]
instance 0, ep 17, state [1 1 1]
instance 0, ep 18, state [1 0 0]
instance 0, ep 19, state [0 0 0]
instance 0, ep 20, state [0 1 0]
instance 0, ep 21, state [0 1 0]
instance 0, ep 22, state [1 0 1]
instance 0, ep 23, state [1 0 0]
instance 0, ep 24, state [1 1 1]
instance 0, ep 25, state [1 0 0]
instance 0, ep 26, state [0 1 1]
instance 0, ep 27, state [0 0 0]
instance 0, ep 28, state [0 1 1]
instance 0, ep 29, state [1 0 0]
first state [1 1 1]
instance 1, 

In [39]:
if is_jupyter:
    # Single run of greedy_iterative_policy with use_Q=True and use_shapley=True,
    # 'combined' reward, lamb = 1.
    lamb = 1
    random.seed(seed)
    np.random.seed(seed)
    greedy_iterative_shapley_q_reward = greedy_iterative_policy(
        simulator, n_episodes, n_epochs, discount,
        reward_function='combined', lamb=lamb, use_Q=True, use_shapley=True,
    )
    # simulator.total_active is read right after the run and normalised by
    # (number of reward entries * number of arms).
    greedy_iterative_shapley_q_active_rate = simulator.total_active / (greedy_iterative_shapley_q_reward.size * n_arms)
    # Report mean reward plus lamb * n_arms * active rate.
    print(np.mean(greedy_iterative_shapley_q_reward) + lamb * n_arms * greedy_iterative_shapley_q_active_rate)

first state [1 1 1]
instance 0, ep 1, state [0 0 0]
instance 0, ep 2, state [1 1 1]
instance 0, ep 3, state [0 1 1]
instance 0, ep 4, state [0 0 0]
instance 0, ep 5, state [0 1 0]
instance 0, ep 6, state [0 1 0]
instance 0, ep 7, state [0 0 0]
instance 0, ep 8, state [1 0 1]
instance 0, ep 9, state [1 0 1]
instance 0, ep 10, state [0 0 1]
instance 0, ep 11, state [1 1 0]
instance 0, ep 12, state [0 1 0]
instance 0, ep 13, state [0 0 0]
instance 0, ep 14, state [1 0 1]
instance 0, ep 15, state [0 1 1]
instance 0, ep 16, state [0 1 1]
instance 0, ep 17, state [1 1 1]
instance 0, ep 18, state [1 0 0]
instance 0, ep 19, state [0 0 0]
instance 0, ep 20, state [0 1 0]
instance 0, ep 21, state [0 1 0]
instance 0, ep 22, state [1 0 1]
instance 0, ep 23, state [1 0 0]
instance 0, ep 24, state [1 1 1]
instance 0, ep 25, state [1 0 0]
instance 0, ep 26, state [0 1 1]
instance 0, ep 27, state [0 0 0]
instance 0, ep 28, state [0 1 1]
instance 0, ep 29, state [1 0 0]
first state [1 1 1]
instance 1, 

In [40]:
if is_jupyter:
    # Single run of optimal_whittle on the 'combined' reward with lamb = 1.
    lamb = 1
    random.seed(seed)
    np.random.seed(seed)
    approximate_combined_reward = optimal_whittle(
        simulator, n_episodes, n_epochs, discount,
        reward_function='combined', lamb=lamb,
    )
    # simulator.total_active is read right after the run and normalised by
    # (number of reward entries * number of arms).
    approximate_combined_active_rate = simulator.total_active / (approximate_combined_reward.size * n_arms)
    # Report mean reward plus lamb * n_arms * active rate.
    print(np.mean(approximate_combined_reward) + lamb * n_arms * approximate_combined_active_rate)

first state [1 1 1]
   state [1 1 1] state_WI [0.87 1.5  1.5 ] sorted [2 1]
   state [0 1 0] state_WI [0.86 1.5  0.98] sorted [1 2]
   state [0 0 0] state_WI [0.86 0.98 0.98] sorted [2 1]
   state [0 0 1] state_WI [0.86 0.98 1.5 ] sorted [2 1]
   state [0 0 1] state_WI [0.86 0.98 1.5 ] sorted [2 1]
   state [0 0 0] state_WI [0.86 0.98 0.98] sorted [2 1]
   state [0 0 0] state_WI [0.86 0.98 0.98] sorted [2 1]
   state [0 1 0] state_WI [0.86 1.5  0.98] sorted [1 2]
   state [0 1 0] state_WI [0.86 1.5  0.98] sorted [1 2]
   state [0 0 1] state_WI [0.86 0.98 1.5 ] sorted [2 1]
   state [0 1 1] state_WI [0.86 1.5  1.5 ] sorted [2 1]
   state [0 1 0] state_WI [0.86 1.5  0.98] sorted [1 2]
   state [0 1 1] state_WI [0.86 1.5  1.5 ] sorted [2 1]
   state [0 1 0] state_WI [0.86 1.5  0.98] sorted [1 2]
   state [0 0 1] state_WI [0.86 0.98 1.5 ] sorted [2 1]
   state [0 0 1] state_WI [0.86 0.98 1.5 ] sorted [2 1]
   state [0 1 0] state_WI [0.86 1.5  0.98] sorted [1 2]
   state [0 0 1] state_WI [0

In [41]:
if is_jupyter:
    # Single run of mcts_policy on the 'combined' reward with lamb = 1.
    lamb = 1
    random.seed(seed)
    np.random.seed(seed)
    mcts_reward = mcts_policy(
        simulator, n_episodes, n_epochs, discount,
        reward_function='combined', lamb=lamb,
    )
    # simulator.total_active is read right after the run and normalised by
    # (number of reward entries * number of arms).
    mcts_active_rate = simulator.total_active / (mcts_reward.size * n_arms)
    # Report mean reward plus lamb * n_arms * active rate.
    print(np.mean(mcts_reward) + lamb * n_arms * mcts_active_rate)

first state [1 1 1]
instance 0, ep 1, state [0 0 0]
instance 0, ep 2, state [1 1 1]


instance 0, ep 3, state [0 1 1]
instance 0, ep 4, state [0 0 0]
instance 0, ep 5, state [0 1 0]
instance 0, ep 6, state [0 1 0]
instance 0, ep 7, state [0 0 0]
instance 0, ep 8, state [1 0 1]
instance 0, ep 9, state [1 0 1]
instance 0, ep 10, state [0 0 1]
instance 0, ep 11, state [1 1 0]
instance 0, ep 12, state [0 1 0]
instance 0, ep 13, state [0 0 0]
instance 0, ep 14, state [1 0 1]
instance 0, ep 15, state [0 1 1]
instance 0, ep 16, state [0 1 1]
instance 0, ep 17, state [1 1 1]
instance 0, ep 18, state [1 0 0]
instance 0, ep 19, state [0 0 0]
instance 0, ep 20, state [0 1 0]
instance 0, ep 21, state [0 1 0]
instance 0, ep 22, state [1 0 1]
instance 0, ep 23, state [1 0 0]
instance 0, ep 24, state [1 1 1]
instance 0, ep 25, state [1 0 0]
instance 0, ep 26, state [0 1 1]
instance 0, ep 27, state [0 0 0]
instance 0, ep 28, state [0 1 1]
instance 0, ep 29, state [1 0 0]
first state [1 1 1]
instance 1, ep 1, state [0 1 0]
instance 1, ep 2, state [0 0 0]
instance 1, ep 3, state [1 0 1]


In [42]:
if is_jupyter:
    # Single run of mcts_policy with use_Q=True, 'combined' reward, lamb = 1.
    lamb = 1
    random.seed(seed)
    np.random.seed(seed)
    mcts_q_reward = mcts_policy(
        simulator, n_episodes, n_epochs, discount,
        reward_function='combined', lamb=lamb, use_Q=True,
    )
    # simulator.total_active is read right after the run and normalised by
    # (number of reward entries * number of arms).
    mcts_q_active_rate = simulator.total_active / (mcts_q_reward.size * n_arms)
    # Report mean reward plus lamb * n_arms * active rate.
    print(np.mean(mcts_q_reward) + lamb * n_arms * mcts_q_active_rate)

first state [1 1 1]
instance 0, ep 1, state [0 0 0]
instance 0, ep 2, state [1 1 1]
instance 0, ep 3, state [0 1 1]
instance 0, ep 4, state [0 0 0]
instance 0, ep 5, state [0 1 0]
instance 0, ep 6, state [0 1 0]
instance 0, ep 7, state [0 0 0]
instance 0, ep 8, state [1 0 1]
instance 0, ep 9, state [1 0 1]
instance 0, ep 10, state [0 0 1]
instance 0, ep 11, state [1 1 0]
instance 0, ep 12, state [0 1 0]
instance 0, ep 13, state [0 0 0]
instance 0, ep 14, state [1 0 1]
instance 0, ep 15, state [0 1 1]
instance 0, ep 16, state [0 1 1]
instance 0, ep 17, state [1 1 1]
instance 0, ep 18, state [1 0 0]
instance 0, ep 19, state [0 0 0]
instance 0, ep 20, state [0 1 0]
instance 0, ep 21, state [0 1 0]
instance 0, ep 22, state [1 0 1]
instance 0, ep 23, state [1 0 0]
instance 0, ep 24, state [1 1 1]
instance 0, ep 25, state [1 0 0]
instance 0, ep 26, state [0 1 1]
instance 0, ep 27, state [0 0 0]
instance 0, ep 28, state [0 1 1]
instance 0, ep 29, state [1 0 0]
first state [1 1 1]
instance 1, 

In [54]:
if is_jupyter:
    # Single run of optimal_q_iteration on the 'combined' reward with lamb = 1.
    lamb = 1
    random.seed(seed)
    np.random.seed(seed)
    optimal_reward = optimal_q_iteration(
        simulator, n_episodes, n_epochs, discount,
        reward_function='combined', lamb=lamb,
    )
    # simulator.total_active is read right after the run and normalised by
    # (number of reward entries * number of arms).
    optimal_active_rate = simulator.total_active / (optimal_reward.size * n_arms)
    # Report mean reward plus lamb * n_arms * active rate.
    print(np.mean(optimal_reward) + lamb * n_arms * optimal_active_rate)

first state [1 1 1]
instance 0, ep 1, state [0 0 0]
instance 0, ep 2, state [1 1 1]
instance 0, ep 3, state [0 1 1]
instance 0, ep 4, state [0 0 0]
instance 0, ep 5, state [0 1 0]
instance 0, ep 6, state [0 1 0]
instance 0, ep 7, state [0 0 0]
instance 0, ep 8, state [1 0 1]
instance 0, ep 9, state [1 0 1]
instance 0, ep 10, state [0 0 1]
instance 0, ep 11, state [1 1 0]
instance 0, ep 12, state [0 1 0]
instance 0, ep 13, state [0 0 0]
instance 0, ep 14, state [1 0 1]
instance 0, ep 15, state [0 1 1]
instance 0, ep 16, state [0 1 1]
instance 0, ep 17, state [1 1 1]
instance 0, ep 18, state [1 0 0]
instance 0, ep 19, state [0 0 0]
instance 0, ep 20, state [0 1 0]
instance 0, ep 21, state [0 1 0]
instance 0, ep 22, state [1 0 1]
instance 0, ep 23, state [1 0 0]
instance 0, ep 24, state [1 1 1]
instance 0, ep 25, state [1 0 0]
instance 0, ep 26, state [0 1 1]
instance 0, ep 27, state [0 0 0]
instance 0, ep 28, state [0 1 1]
instance 0, ep 29, state [1 0 0]
first state [1 1 1]
instance 1, 

In [44]:
# Sweep greedy_policy over every lamb in lamb_list, recording the mean reward
# and the active rate for each value.
greedy_reward_list, greedy_active_rate_list = [], []

for lamb in lamb_list:
    # Same seeds for every lamb so the runs differ only in lamb.
    random.seed(seed)
    np.random.seed(seed)
    greedy_reward = greedy_policy(
        simulator, n_episodes, n_epochs, discount,
        reward_function='combined', lamb=lamb,
    )
    greedy_active_rate = simulator.total_active / (greedy_reward.size * n_arms)
    greedy_reward_list.append(np.mean(greedy_reward))
    greedy_active_rate_list.append(greedy_active_rate)

first state [1 1 1]
instance 0, ep 1, state [0 0 0]
instance 0, ep 2, state [1 1 1]
instance 0, ep 3, state [0 1 1]
instance 0, ep 4, state [0 0 0]
instance 0, ep 5, state [0 1 0]
instance 0, ep 6, state [0 1 0]
instance 0, ep 7, state [0 0 0]
instance 0, ep 8, state [1 0 1]
instance 0, ep 9, state [1 0 1]
instance 0, ep 10, state [0 0 1]
instance 0, ep 11, state [1 1 0]
instance 0, ep 12, state [0 1 0]
instance 0, ep 13, state [0 0 0]
instance 0, ep 14, state [1 0 1]
instance 0, ep 15, state [0 1 1]
instance 0, ep 16, state [0 1 1]
instance 0, ep 17, state [1 1 1]
instance 0, ep 18, state [1 0 0]
instance 0, ep 19, state [0 0 0]
instance 0, ep 20, state [0 1 0]
instance 0, ep 21, state [0 1 0]
instance 0, ep 22, state [1 0 1]
instance 0, ep 23, state [1 0 0]
instance 0, ep 24, state [1 1 1]
instance 0, ep 25, state [1 0 0]
instance 0, ep 26, state [0 1 1]
instance 0, ep 27, state [0 0 0]
instance 0, ep 28, state [0 1 1]
instance 0, ep 29, state [1 0 0]
first state [1 1 1]
instance 1, 

In [24]:
# Sweep greedy_iterative_policy over every lamb in lamb_list, recording the
# mean reward and the active rate for each value.
greedy_iterative_reward_list, greedy_iterative_active_rate_list = [], []

for lamb in lamb_list:
    # Same seeds for every lamb so the runs differ only in lamb.
    random.seed(seed)
    np.random.seed(seed)
    greedy_iterative_reward = greedy_iterative_policy(
        simulator, n_episodes, n_epochs, discount,
        reward_function='combined', lamb=lamb,
    )
    greedy_iterative_active_rate = simulator.total_active / (greedy_iterative_reward.size * n_arms)
    greedy_iterative_reward_list.append(np.mean(greedy_iterative_reward))
    greedy_iterative_active_rate_list.append(greedy_iterative_active_rate)


first state [0 1 1 1]
instance 0, ep 1, state [1 1 1 1]
instance 0, ep 2, state [0 0 0 1]
instance 0, ep 3, state [1 1 0 1]
instance 0, ep 4, state [0 1 1 1]
instance 0, ep 5, state [0 0 0 1]
instance 0, ep 6, state [0 0 0 1]
instance 0, ep 7, state [0 1 1 0]
instance 0, ep 8, state [1 1 0 1]
instance 0, ep 9, state [1 1 1 1]
instance 0, ep 10, state [0 0 0 0]
instance 0, ep 11, state [0 1 0 0]
instance 0, ep 12, state [0 1 0 1]
instance 0, ep 13, state [0 0 0 0]
instance 0, ep 14, state [1 0 1 0]
instance 0, ep 15, state [0 0 0 1]
instance 0, ep 16, state [0 0 0 0]
instance 0, ep 17, state [1 1 0 1]
instance 0, ep 18, state [0 1 0 0]
instance 0, ep 19, state [0 1 1 1]
instance 0, ep 20, state [0 1 1 0]
instance 0, ep 21, state [0 0 1 0]
instance 0, ep 22, state [1 0 1 1]
instance 0, ep 23, state [0 1 1 1]
instance 0, ep 24, state [0 0 1 1]
instance 0, ep 25, state [0 0 1 0]
instance 0, ep 26, state [1 1 1 1]
instance 0, ep 27, state [0 1 1 0]
instance 0, ep 28, state [0 1 1 0]
instanc

In [25]:
# Sweep greedy_iterative_policy (use_Q=True) over every lamb in lamb_list,
# recording the mean reward and the active rate for each value.
greedy_iterative_q_reward_list, greedy_iterative_q_active_rate_list = [], []

for lamb in lamb_list:
    # Same seeds for every lamb so the runs differ only in lamb.
    random.seed(seed)
    np.random.seed(seed)
    greedy_iterative_q_reward = greedy_iterative_policy(
        simulator, n_episodes, n_epochs, discount,
        reward_function='combined', lamb=lamb, use_Q=True,
    )
    greedy_iterative_q_active_rate = simulator.total_active / (greedy_iterative_q_reward.size * n_arms)
    greedy_iterative_q_reward_list.append(np.mean(greedy_iterative_q_reward))
    greedy_iterative_q_active_rate_list.append(greedy_iterative_q_active_rate)

first state [0 1 1 1]
instance 0, ep 1, state [1 1 1 1]
instance 0, ep 2, state [0 0 0 1]
instance 0, ep 3, state [1 1 0 1]
instance 0, ep 4, state [0 1 1 1]
instance 0, ep 5, state [0 0 0 1]
instance 0, ep 6, state [0 0 0 1]
instance 0, ep 7, state [0 1 1 0]
instance 0, ep 8, state [1 1 0 1]
instance 0, ep 9, state [1 1 1 1]
instance 0, ep 10, state [0 0 0 0]
instance 0, ep 11, state [0 1 0 0]
instance 0, ep 12, state [0 1 0 1]
instance 0, ep 13, state [0 0 0 0]
instance 0, ep 14, state [1 0 1 0]
instance 0, ep 15, state [0 0 0 1]
instance 0, ep 16, state [0 0 0 0]
instance 0, ep 17, state [1 1 0 1]
instance 0, ep 18, state [0 1 0 0]
instance 0, ep 19, state [0 1 1 1]
instance 0, ep 20, state [0 1 1 0]
instance 0, ep 21, state [0 0 1 0]
instance 0, ep 22, state [1 0 1 1]
instance 0, ep 23, state [0 1 1 1]
instance 0, ep 24, state [0 0 1 1]
instance 0, ep 25, state [0 0 1 0]
instance 0, ep 26, state [1 1 1 1]
instance 0, ep 27, state [0 1 1 0]
instance 0, ep 28, state [0 1 1 0]
instanc

In [26]:
# Sweep greedy_iterative_policy (use_shapley=True) over every lamb in
# lamb_list, recording the mean reward and the active rate for each value.
greedy_iterative_shapley_reward_list, greedy_iterative_shapley_active_rate_list = [], []

for lamb in lamb_list:
    # Same seeds for every lamb so the runs differ only in lamb.
    random.seed(seed)
    np.random.seed(seed)
    greedy_iterative_shapley_reward = greedy_iterative_policy(
        simulator, n_episodes, n_epochs, discount,
        reward_function='combined', lamb=lamb, use_shapley=True,
    )
    greedy_iterative_shapley_active_rate = simulator.total_active / (greedy_iterative_shapley_reward.size * n_arms)
    greedy_iterative_shapley_reward_list.append(np.mean(greedy_iterative_shapley_reward))
    greedy_iterative_shapley_active_rate_list.append(greedy_iterative_shapley_active_rate)

first state [0 1 1 1]
instance 0, ep 1, state [1 1 1 1]
instance 0, ep 2, state [0 0 0 1]
instance 0, ep 3, state [1 1 0 1]
instance 0, ep 4, state [0 1 1 1]
instance 0, ep 5, state [0 0 0 1]
instance 0, ep 6, state [0 0 0 1]
instance 0, ep 7, state [0 1 1 0]
instance 0, ep 8, state [1 1 0 1]
instance 0, ep 9, state [1 1 1 1]
instance 0, ep 10, state [0 0 0 0]
instance 0, ep 11, state [0 1 0 0]
instance 0, ep 12, state [0 1 0 1]
instance 0, ep 13, state [0 0 0 0]
instance 0, ep 14, state [1 0 1 0]
instance 0, ep 15, state [0 0 0 1]
instance 0, ep 16, state [0 0 0 0]
instance 0, ep 17, state [1 1 0 1]
instance 0, ep 18, state [0 1 0 0]
instance 0, ep 19, state [0 1 1 1]
instance 0, ep 20, state [0 1 1 0]
instance 0, ep 21, state [0 0 1 0]
instance 0, ep 22, state [1 0 1 1]
instance 0, ep 23, state [0 1 1 1]
instance 0, ep 24, state [0 0 1 1]
instance 0, ep 25, state [0 0 1 0]
instance 0, ep 26, state [1 1 1 1]
instance 0, ep 27, state [0 1 1 0]
instance 0, ep 28, state [0 1 1 0]
instanc

In [27]:
# Sweep greedy_iterative_policy (use_Q=True, use_shapley=True) over every lamb
# in lamb_list, recording the mean reward and the active rate for each value.
greedy_iterative_shapley_q_reward_list, greedy_iterative_shapley_q_active_rate_list = [], []

for lamb in lamb_list:
    # Same seeds for every lamb so the runs differ only in lamb.
    random.seed(seed)
    np.random.seed(seed)
    greedy_iterative_shapley_q_reward = greedy_iterative_policy(
        simulator, n_episodes, n_epochs, discount,
        reward_function='combined', lamb=lamb, use_Q=True, use_shapley=True,
    )
    greedy_iterative_shapley_q_active_rate = simulator.total_active / (greedy_iterative_shapley_q_reward.size * n_arms)
    greedy_iterative_shapley_q_reward_list.append(np.mean(greedy_iterative_shapley_q_reward))
    greedy_iterative_shapley_q_active_rate_list.append(greedy_iterative_shapley_q_active_rate)

first state [0 1 1 1]
instance 0, ep 1, state [1 1 1 1]
instance 0, ep 2, state [0 0 0 1]
instance 0, ep 3, state [1 1 0 1]
instance 0, ep 4, state [0 1 1 1]
instance 0, ep 5, state [0 0 0 1]
instance 0, ep 6, state [0 0 0 1]
instance 0, ep 7, state [0 1 1 0]
instance 0, ep 8, state [1 1 0 1]
instance 0, ep 9, state [1 1 1 1]
instance 0, ep 10, state [0 0 0 0]
instance 0, ep 11, state [0 1 0 0]
instance 0, ep 12, state [0 1 0 1]
instance 0, ep 13, state [0 0 0 0]
instance 0, ep 14, state [1 0 1 0]
instance 0, ep 15, state [0 0 0 1]
instance 0, ep 16, state [0 0 0 0]
instance 0, ep 17, state [1 1 0 1]
instance 0, ep 18, state [0 1 0 0]
instance 0, ep 19, state [0 1 1 1]
instance 0, ep 20, state [0 1 1 0]
instance 0, ep 21, state [0 0 1 0]
instance 0, ep 22, state [1 0 1 1]
instance 0, ep 23, state [0 1 1 1]
instance 0, ep 24, state [0 0 1 1]
instance 0, ep 25, state [0 0 1 0]
instance 0, ep 26, state [1 1 1 1]
instance 0, ep 27, state [0 1 1 0]
instance 0, ep 28, state [0 1 1 0]
instanc

In [28]:
# Sweep optimal_whittle over every lamb in lamb_list, recording the mean
# reward and the active rate for each value.
approximate_combined_reward_list, approximate_combined_active_rate_list = [], []

for lamb in lamb_list:
    # Same seeds for every lamb so the runs differ only in lamb.
    random.seed(seed)
    np.random.seed(seed)
    approximate_combined_reward = optimal_whittle(
        simulator, n_episodes, n_epochs, discount,
        reward_function='combined', lamb=lamb,
    )
    approximate_combined_active_rate = simulator.total_active / (approximate_combined_reward.size * n_arms)
    approximate_combined_reward_list.append(np.mean(approximate_combined_reward))
    approximate_combined_active_rate_list.append(approximate_combined_active_rate)

first state [0 1 1 1]
   state [0 1 1 1] state_WI [0.32 0.94 0.21 0.23] sorted [1 0 3]
   state [0 1 0 1] state_WI [0.32 0.94 0.11 0.23] sorted [1 0 3]
   state [0 1 0 0] state_WI [0.32 0.94 0.11 0.12] sorted [1 0 3]
   state [1 0 0 1] state_WI [0.9  0.48 0.11 0.23] sorted [0 1 3]
   state [1 0 0 1] state_WI [0.9  0.48 0.11 0.23] sorted [0 1 3]
   state [1 1 0 1] state_WI [0.9  0.94 0.11 0.23] sorted [1 0 3]
   state [1 1 0 1] state_WI [0.9  0.94 0.11 0.23] sorted [1 0 3]
   state [1 1 0 0] state_WI [0.9  0.94 0.11 0.12] sorted [1 0 3]
   state [1 0 0 1] state_WI [0.9  0.48 0.11 0.23] sorted [0 1 3]
   state [1 1 0 0] state_WI [0.9  0.94 0.11 0.12] sorted [1 0 3]
   state [1 1 0 1] state_WI [0.9  0.94 0.11 0.23] sorted [1 0 3]
   state [1 1 0 0] state_WI [0.9  0.94 0.11 0.12] sorted [1 0 3]
   state [0 1 0 1] state_WI [0.32 0.94 0.11 0.23] sorted [1 0 3]
   state [1 1 0 1] state_WI [0.9  0.94 0.11 0.23] sorted [1 0 3]
   state [1 1 0 0] state_WI [0.9  0.94 0.11 0.12] sorted [1 0 3]
   

In [29]:
# Sweep mcts_policy over every lamb in lamb_list, recording the mean reward
# and the active rate for each value.
mcts_reward_list, mcts_active_rate_list = [], []

for lamb in lamb_list:
    # Same seeds for every lamb so the runs differ only in lamb.
    random.seed(seed)
    np.random.seed(seed)
    mcts_reward = mcts_policy(
        simulator, n_episodes, n_epochs, discount,
        reward_function='combined', lamb=lamb,
    )
    mcts_active_rate = simulator.total_active / (mcts_reward.size * n_arms)
    mcts_reward_list.append(np.mean(mcts_reward))
    mcts_active_rate_list.append(mcts_active_rate)

first state [0 1 1 1]
instance 0, ep 1, state [1 1 1 1]
instance 0, ep 2, state [0 0 0 1]
instance 0, ep 3, state [1 1 0 1]
instance 0, ep 4, state [0 1 1 1]
instance 0, ep 5, state [0 0 0 1]
instance 0, ep 6, state [0 0 0 1]
instance 0, ep 7, state [0 1 1 0]
instance 0, ep 8, state [1 1 0 1]
instance 0, ep 9, state [1 1 1 1]
instance 0, ep 10, state [0 0 0 0]
instance 0, ep 11, state [0 1 0 0]
instance 0, ep 12, state [0 1 0 1]
instance 0, ep 13, state [0 0 0 0]
instance 0, ep 14, state [1 0 1 0]
instance 0, ep 15, state [0 0 0 1]
instance 0, ep 16, state [0 0 0 0]
instance 0, ep 17, state [1 1 0 1]
instance 0, ep 18, state [0 1 0 0]
instance 0, ep 19, state [0 1 1 1]
instance 0, ep 20, state [0 1 1 0]
instance 0, ep 21, state [0 0 1 0]
instance 0, ep 22, state [1 0 1 1]
instance 0, ep 23, state [0 1 1 1]
instance 0, ep 24, state [0 0 1 1]
instance 0, ep 25, state [0 0 1 0]
instance 0, ep 26, state [1 1 1 1]
instance 0, ep 27, state [0 1 1 0]
instance 0, ep 28, state [0 1 1 0]
instanc

In [30]:
# Sweep mcts_policy (use_Q=True) over every lamb in lamb_list, recording the
# mean reward and the active rate for each value.
mcts_q_reward_list, mcts_q_active_rate_list = [], []

for lamb in lamb_list:
    # Same seeds for every lamb so the runs differ only in lamb.
    random.seed(seed)
    np.random.seed(seed)
    mcts_q_reward = mcts_policy(
        simulator, n_episodes, n_epochs, discount,
        reward_function='combined', lamb=lamb, use_Q=True,
    )
    mcts_q_active_rate = simulator.total_active / (mcts_q_reward.size * n_arms)
    mcts_q_reward_list.append(np.mean(mcts_q_reward))
    mcts_q_active_rate_list.append(mcts_q_active_rate)

first state [0 1 1 1]
instance 0, ep 1, state [1 1 1 1]
instance 0, ep 2, state [0 0 0 1]
instance 0, ep 3, state [1 1 0 1]
instance 0, ep 4, state [0 1 1 1]
instance 0, ep 5, state [0 0 0 1]
instance 0, ep 6, state [0 0 0 1]
instance 0, ep 7, state [0 1 1 0]
instance 0, ep 8, state [1 1 0 1]
instance 0, ep 9, state [1 1 1 1]
instance 0, ep 10, state [0 0 0 0]
instance 0, ep 11, state [0 1 0 0]
instance 0, ep 12, state [0 1 0 1]
instance 0, ep 13, state [0 0 0 0]
instance 0, ep 14, state [1 0 1 0]
instance 0, ep 15, state [0 0 0 1]
instance 0, ep 16, state [0 0 0 0]
instance 0, ep 17, state [1 1 0 1]
instance 0, ep 18, state [0 1 0 0]
instance 0, ep 19, state [0 1 1 1]
instance 0, ep 20, state [0 1 1 0]
instance 0, ep 21, state [0 0 1 0]
instance 0, ep 22, state [1 0 1 1]
instance 0, ep 23, state [0 1 1 1]
instance 0, ep 24, state [0 0 1 1]
instance 0, ep 25, state [0 0 1 0]
instance 0, ep 26, state [1 1 1 1]
instance 0, ep 27, state [0 1 1 0]
instance 0, ep 28, state [0 1 1 0]
instanc

## Write Results

In [31]:
# Results payload that is later serialised to JSON.
# For each policy there is a '<name>_match' list (mean reward per lamb) and a
# '<name>_active' list (active rate per lamb); both are aligned with
# parameters['lambda_list']. The key names and their order define the layout
# of the saved file.
data = {
    # optimal_whittle sweep
    'whittle_match': approximate_combined_reward_list, 
    'whittle_active': approximate_combined_active_rate_list,
    # greedy_policy sweep
    'greedy_match': greedy_reward_list, 
    'greedy_active': greedy_active_rate_list,
    # greedy_iterative_policy sweeps (plain, use_Q, use_shapley, both)
    'iterative_match': greedy_iterative_reward_list,
    'iterative_active': greedy_iterative_active_rate_list, 
    'iterative_q_match': greedy_iterative_q_reward_list, 
    'iterative_q_active': greedy_iterative_q_active_rate_list, 
    'iterative_shapley_match': greedy_iterative_shapley_reward_list, 
    'iterative_shapley_active': greedy_iterative_shapley_active_rate_list, 
    'iterative_q_shapley_match': greedy_iterative_shapley_q_reward_list,
    'iterative_q_shapley_active': greedy_iterative_shapley_q_active_rate_list,
    # mcts_policy sweeps (plain, use_Q)
    'mcts_match': mcts_reward_list,
    'mcts_active': mcts_active_rate_list, 
    'mcts_q_match': mcts_q_reward_list, 
    'mcts_q_active': mcts_q_active_rate_list,
    # Run configuration, stored alongside the results.
    # NOTE(review): alpha is recorded here but not otherwise used in the visible cells.
    'parameters': 
        {'seed'      : seed,
        'n_arms'    : n_arms,
        'budget'    : budget,
        'discount'  : discount, 
        'alpha'     : alpha, 
        'n_episodes': n_episodes, 
        'episode_len': episode_len, 
        'n_epochs'  : n_epochs, 
        'match_prob': match_prob, 
        'lambda_list': lamb_list,} 
}

In [32]:
# Add optimal_q_iteration reference results to `data`; this is only done for
# instances with at most 6 arms (presumably because the joint computation
# becomes too expensive beyond that -- confirm against optimal_q_iteration).
if n_arms <= 6:
    # Activity-only run: only the resulting active rate is kept.
    # The result is bound to a descriptive name rather than `_`, because
    # assigning to `_` shadows IPython's last-output variable for the rest of
    # the session.
    np.random.seed(seed)
    random.seed(seed)
    optimal_activity_reward = optimal_q_iteration(simulator, n_episodes, n_epochs, discount,reward_function='activity')
    optimal_active_rate = simulator.total_active/(optimal_activity_reward.size*n_arms)

    # Run with optimal_q_iteration's default reward function (not shown here);
    # its mean reward is stored as 'optimal_match'.
    np.random.seed(seed)
    random.seed(seed)
    optimal_match_reward = optimal_q_iteration(simulator, n_episodes, n_epochs, discount)

    # Combined-reward sweep over the same lamb grid as the other policies.
    joint_match = []
    joint_active = []

    for lamb in lamb_list:
        # Same seeds for every lamb so the runs differ only in lamb.
        np.random.seed(seed)
        random.seed(seed)
        joint_combined_reward = optimal_q_iteration(simulator, n_episodes, n_epochs, discount,reward_function='combined',lamb=lamb)
        joint_combined_active_rate = simulator.total_active/(joint_combined_reward.size*n_arms)

        joint_match.append(np.mean(joint_combined_reward))
        joint_active.append(joint_combined_active_rate)

    data['joint_match'] = joint_match 
    data['joint_active'] = joint_active 
    data['optimal_match'] = np.mean(optimal_match_reward)
    data['optimal_active'] = optimal_active_rate

first state [0 1 1 1]
instance 0, ep 1, state [1 1 1 1]
instance 0, ep 2, state [0 0 0 1]
instance 0, ep 3, state [1 1 0 1]
instance 0, ep 4, state [0 1 1 1]
instance 0, ep 5, state [0 0 0 1]
instance 0, ep 6, state [0 0 0 1]
instance 0, ep 7, state [0 1 1 0]
instance 0, ep 8, state [1 1 0 1]
instance 0, ep 9, state [1 1 1 1]
instance 0, ep 10, state [0 0 0 0]
instance 0, ep 11, state [0 1 0 0]
instance 0, ep 12, state [0 1 0 1]
instance 0, ep 13, state [0 0 0 0]
instance 0, ep 14, state [1 0 1 0]
instance 0, ep 15, state [0 0 0 1]
instance 0, ep 16, state [0 0 0 0]
instance 0, ep 17, state [1 1 0 1]
instance 0, ep 18, state [0 1 0 0]
instance 0, ep 19, state [0 1 1 1]
instance 0, ep 20, state [0 1 1 0]
instance 0, ep 21, state [0 0 1 0]
instance 0, ep 22, state [1 0 1 1]
instance 0, ep 23, state [0 1 1 1]
instance 0, ep 24, state [0 0 1 1]
instance 0, ep 25, state [0 0 1 0]
instance 0, ep 26, state [1 1 1 1]
instance 0, ep 27, state [0 1 1 0]
instance 0, ep 28, state [0 1 1 0]
instanc

In [33]:
# Ask the project helper for the output file name of this run, based on the
# 'combined' category, the save name and the seed; use_date is forwarded from
# the save_with_date setting.
save_path = get_save_path('combined',save_name,seed,use_date=save_with_date)

In [34]:
# Presumably removes previously saved results that match this run before the
# new file is written -- confirm against rmab.utils.delete_duplicate_results.
delete_duplicate_results('combined',save_name,data)

In [35]:
# Write the results payload as JSON under ../results/. The context manager
# guarantees the file is flushed and closed even if serialisation fails,
# instead of leaving an open handle behind.
with open('../results/'+save_path,'w') as results_file:
    json.dump(data, results_file)