# Large Combined Oracle Bandits

Analyze the performance of various algorithms for solving the joint matching + activity task when the number of volunteers is large and structured.

In [15]:
# Enable IPython's autoreload extension in mode 2 so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
import numpy as np
import random 
import matplotlib.pyplot as plt
import json 
import argparse 
import sys
import time 

In [17]:
from rmab.simulator import RMABSimulator
from rmab.omniscient_policies import *
from rmab.mcts_policies import mcts_policy, mcts_mcts_policy, mcts_whittle_policy
from rmab.fr_dynamics import get_all_transitions
from rmab.utils import get_save_path, delete_duplicate_results

In [18]:
# True when the ipykernel module has been loaded. Used below to choose between
# hard-coded notebook settings and command-line arguments, and to gate the
# interactive sanity-check cells.
is_jupyter = 'ipykernel' in sys.modules

In [19]:
# Experiment configuration. Outside a notebook every setting is read from the
# command line; inside a notebook a small fixed configuration is used instead.
if not is_jupyter:
    parser = argparse.ArgumentParser()
    parser.add_argument('--n_arms', '-N', type=int, default=3,
                        help='num beneficiaries (arms)')
    parser.add_argument('--volunteers_per_arm', '-V', type=int, default=2,
                        help='volunteers per arm')
    parser.add_argument('--episode_len', '-H', type=int, default=20,
                        help='episode length')
    parser.add_argument('--n_episodes', '-T', type=int, default=30,
                        help='num episodes')
    parser.add_argument('--budget', '-B', type=int, default=3,
                        help='budget')
    parser.add_argument('--n_epochs', '-E', type=int, default=10,
                        help='number of epochs (num_repeats)')
    parser.add_argument('--discount', '-d', type=float, default=0.9,
                        help='discount factor')
    parser.add_argument('--alpha', '-a', type=float, default=3,
                        help='alpha: for conf radius')
    parser.add_argument('--seed', '-s', type=int, default=42,
                        help='random seed')
    parser.add_argument('--save_name', '-n', type=str, default='combined_lamb',
                        help='save name')
    parser.add_argument('--time_per_run', '-t', type=float, default=.01*1000,
                        help='time per MCTS run')
    parser.add_argument('--use_date', action='store_true')

    args = parser.parse_args()

    # Copy the parsed options into the module-level names the rest of the
    # notebook reads.
    seed = args.seed
    n_arms = args.n_arms
    volunteers_per_arm = args.volunteers_per_arm
    budget = args.budget
    discount = args.discount
    alpha = args.alpha
    n_episodes = args.n_episodes
    episode_len = args.episode_len
    n_epochs = args.n_epochs
    save_with_date = args.use_date
    TIME_PER_RUN = args.time_per_run
    save_name = args.save_name
else:
    # Notebook defaults: a small instance (2 arms x 2 volunteers).
    seed = 42
    n_arms = 2
    volunteers_per_arm = 2
    budget = 3
    discount = 0.9
    alpha = 3
    n_episodes = 30
    episode_len = 20
    n_epochs = 10
    save_with_date = False
    TIME_PER_RUN = 0.01 * 1000
    # The "two_step" prefix is what later cells match on to pick the lambda
    # grid and the set of policies.
    save_name = 'two_step_{}_{}'.format(int(TIME_PER_RUN),seed)



In [20]:
# Binary state and action spaces; n_states is handed to the simulator as
# number_states.
n_states, n_actions = 2, 2

In [21]:
# Size of the arm pool that features, transitions and match probabilities are
# generated for.
all_population_size = 100 # number of random arms to generate
# Transitions for the whole pool, produced by the project helper in
# rmab.fr_dynamics (array layout is not visible from this notebook).
all_transitions = get_all_transitions(all_population_size)

In [22]:
# Seed both Python's and NumPy's global random number generators.
# NOTE(review): get_all_transitions() is called in the previous cell, before
# any seed is set; if it draws random numbers the transitions are not
# reproducible across runs. Confirm against rmab.fr_dynamics.
random.seed(seed)
np.random.seed(seed)

In [23]:
# Arm features are the integer indices 0 .. all_population_size - 1.
all_features = np.arange(all_population_size)
# all_population_size * volunteers_per_arm independent draws from
# random.random(), i.e. uniform on [0, 1).
match_probabilities = [
    random.random()
    for _ in range(all_population_size * volunteers_per_arm)
]

In [24]:
# Re-seed both global RNGs right before constructing the simulator, then build
# it with the 'match' reward style and the sampled match probabilities.
random.seed(seed)
np.random.seed(seed)
simulator = RMABSimulator(
    all_population_size, all_features, all_transitions,
    n_arms, volunteers_per_arm, episode_len, n_epochs, n_episodes, budget, discount,
    number_states=n_states,
    reward_style='match',
    match_probability_list=match_probabilities,
    TIME_PER_RUN=TIME_PER_RUN,
)

acting should always be good! 0.000 < 0.044
acting should always be good! 0.000 < 0.162
acting should always be good! 0.108 < 0.183
good start state should always be good! 0.380 < 0.508
good start state should always be good! 0.506 < 0.760
cohort [83 53]
cohort [41 39]
cohort [79 72]
cohort [92 64]
cohort [ 7 38]
cohort [12 59]
cohort [48 64]
cohort [18 53]
cohort [69 64]
cohort [46 87]


In [25]:
# Activity weight (lambda) used by the interactive cells below: 64 divided by
# the total number of volunteers (n_arms * volunteers_per_arm). The printed
# objective multiplies it back by that same count.
lamb = 64/(n_arms*volunteers_per_arm)

## Index Policies

In [26]:
# Notebook-only sanity check: run greedy_policy at weight lamb and print
# mean reward + lamb * n_arms * volunteers_per_arm * active rate.
if is_jupyter:
    policy = greedy_policy
    greedy_reward, greedy_active_rate = run_heterogenous_policy(simulator, n_episodes, n_epochs, discount,policy,seed,lamb=lamb)
    print(np.mean(greedy_reward) + lamb*n_arms*volunteers_per_arm*greedy_active_rate)

instance 0, ep 1
instance 0, ep 2
instance 0, ep 3
instance 0, ep 4
instance 0, ep 5
instance 0, ep 6
instance 0, ep 7
instance 0, ep 8
instance 0, ep 9
instance 0, ep 10
instance 0, ep 11
instance 0, ep 12
instance 0, ep 13
instance 0, ep 14
instance 0, ep 15
instance 0, ep 16
instance 0, ep 17
instance 0, ep 18
instance 0, ep 19
instance 0, ep 20
instance 0, ep 21
instance 0, ep 22
instance 0, ep 23
instance 0, ep 24
instance 0, ep 25
instance 0, ep 26
instance 0, ep 27
instance 0, ep 28
instance 0, ep 29
instance 1, ep 1
instance 1, ep 2
instance 1, ep 3
instance 1, ep 4
instance 1, ep 5
instance 1, ep 6
instance 1, ep 7
instance 1, ep 8
instance 1, ep 9
instance 1, ep 10
instance 1, ep 11
instance 1, ep 12
instance 1, ep 13
instance 1, ep 14
instance 1, ep 15
instance 1, ep 16
instance 1, ep 17
instance 1, ep 18
instance 1, ep 19
instance 1, ep 20
instance 1, ep 21
instance 1, ep 22
instance 1, ep 23
instance 1, ep 24
instance 1, ep 25
instance 1, ep 26
instance 1, ep 27
instance 1

In [27]:
# Notebook-only sanity check: run random_policy at weight lamb and print
# mean reward + active rate * lamb * n_arms * volunteers_per_arm.
if is_jupyter:
    policy = random_policy
    random_reward, random_active_rate = run_heterogenous_policy(simulator, n_episodes, n_epochs, discount,policy,seed,lamb=lamb)
    print(np.mean(random_reward) + random_active_rate*lamb*n_arms*volunteers_per_arm)

instance 0, ep 1
instance 0, ep 2
instance 0, ep 3
instance 0, ep 4
instance 0, ep 5
instance 0, ep 6
instance 0, ep 7
instance 0, ep 8
instance 0, ep 9
instance 0, ep 10
instance 0, ep 11
instance 0, ep 12
instance 0, ep 13
instance 0, ep 14
instance 0, ep 15
instance 0, ep 16
instance 0, ep 17
instance 0, ep 18
instance 0, ep 19
instance 0, ep 20
instance 0, ep 21
instance 0, ep 22
instance 0, ep 23
instance 0, ep 24
instance 0, ep 25
instance 0, ep 26
instance 0, ep 27
instance 0, ep 28
instance 0, ep 29
instance 1, ep 1
instance 1, ep 2
instance 1, ep 3
instance 1, ep 4
instance 1, ep 5
instance 1, ep 6
instance 1, ep 7
instance 1, ep 8
instance 1, ep 9
instance 1, ep 10
instance 1, ep 11
instance 1, ep 12
instance 1, ep 13
instance 1, ep 14
instance 1, ep 15
instance 1, ep 16
instance 1, ep 17
instance 1, ep 18
instance 1, ep 19
instance 1, ep 20
instance 1, ep 21
instance 1, ep 22
instance 1, ep 23
instance 1, ep 24
instance 1, ep 25
instance 1, ep 26
instance 1, ep 27
instance 1

In [28]:
# Notebook-only sanity check: run whittle_policy at weight lamb and print
# mean reward + active rate * lamb * n_arms * volunteers_per_arm.
if is_jupyter:
    policy = whittle_policy
    whittle_reward, whittle_active_rate = run_heterogenous_policy(simulator, n_episodes, n_epochs, discount,policy,seed,lamb=lamb)
    print(np.mean(whittle_reward) + whittle_active_rate*lamb*n_arms*volunteers_per_arm)

instance 0, ep 1
instance 0, ep 2
instance 0, ep 3
instance 0, ep 4
instance 0, ep 5
instance 0, ep 6
instance 0, ep 7
instance 0, ep 8
instance 0, ep 9
instance 0, ep 10
instance 0, ep 11
instance 0, ep 12
instance 0, ep 13
instance 0, ep 14
instance 0, ep 15
instance 0, ep 16
instance 0, ep 17
instance 0, ep 18
instance 0, ep 19
instance 0, ep 20
instance 0, ep 21
instance 0, ep 22
instance 0, ep 23
instance 0, ep 24
instance 0, ep 25
instance 0, ep 26
instance 0, ep 27
instance 0, ep 28
instance 0, ep 29
instance 1, ep 1
instance 1, ep 2
instance 1, ep 3
instance 1, ep 4
instance 1, ep 5
instance 1, ep 6
instance 1, ep 7
instance 1, ep 8
instance 1, ep 9
instance 1, ep 10
instance 1, ep 11
instance 1, ep 12
instance 1, ep 13
instance 1, ep 14
instance 1, ep 15
instance 1, ep 16
instance 1, ep 17
instance 1, ep 18
instance 1, ep 19
instance 1, ep 20
instance 1, ep 21
instance 1, ep 22
instance 1, ep 23
instance 1, ep 24
instance 1, ep 25
instance 1, ep 26
instance 1, ep 27
instance 1

In [29]:
# Notebook-only sanity check: run greedy_one_step_policy at weight lamb and
# print mean reward + active rate * lamb * n_arms * volunteers_per_arm.
if is_jupyter:
    policy = greedy_one_step_policy
    greedy_one_step_reward, greedy_one_step_active_rate = run_heterogenous_policy(simulator, n_episodes, n_epochs, discount,policy,seed,lamb=lamb)
    print(np.mean(greedy_one_step_reward) + greedy_one_step_active_rate*lamb*n_arms*volunteers_per_arm)

instance 0, ep 1
instance 0, ep 2
instance 0, ep 3
instance 0, ep 4
instance 0, ep 5
instance 0, ep 6
instance 0, ep 7
instance 0, ep 8
instance 0, ep 9
instance 0, ep 10
instance 0, ep 11
instance 0, ep 12
instance 0, ep 13
instance 0, ep 14
instance 0, ep 15
instance 0, ep 16
instance 0, ep 17
instance 0, ep 18
instance 0, ep 19
instance 0, ep 20
instance 0, ep 21
instance 0, ep 22
instance 0, ep 23
instance 0, ep 24
instance 0, ep 25
instance 0, ep 26
instance 0, ep 27
instance 0, ep 28
instance 0, ep 29
instance 1, ep 1
instance 1, ep 2
instance 1, ep 3
instance 1, ep 4
instance 1, ep 5
instance 1, ep 6
instance 1, ep 7
instance 1, ep 8
instance 1, ep 9
instance 1, ep 10
instance 1, ep 11
instance 1, ep 12
instance 1, ep 13
instance 1, ep 14
instance 1, ep 15
instance 1, ep 16
instance 1, ep 17
instance 1, ep 18
instance 1, ep 19
instance 1, ep 20
instance 1, ep 21
instance 1, ep 22
instance 1, ep 23
instance 1, ep 24
instance 1, ep 25
instance 1, ep 26
instance 1, ep 27
instance 1

In [30]:
# Notebook-only sanity check: run shapley_whittle_policy at weight lamb and
# print mean reward + active rate * lamb * n_arms * volunteers_per_arm.
if is_jupyter:
    policy = shapley_whittle_policy 
    whittle_shapley_reward, whittle_shapley_active_rate = run_heterogenous_policy(simulator, n_episodes, n_epochs, discount,policy,seed,lamb=lamb)
    print(np.mean(whittle_shapley_reward) + whittle_shapley_active_rate*lamb*n_arms*volunteers_per_arm)

instance 0, ep 1
instance 0, ep 2
instance 0, ep 3
instance 0, ep 4
instance 0, ep 5
instance 0, ep 6
instance 0, ep 7
instance 0, ep 8
instance 0, ep 9
instance 0, ep 10
instance 0, ep 11
instance 0, ep 12
instance 0, ep 13
instance 0, ep 14
instance 0, ep 15
instance 0, ep 16
instance 0, ep 17
instance 0, ep 18
instance 0, ep 19
instance 0, ep 20
instance 0, ep 21
instance 0, ep 22
instance 0, ep 23
instance 0, ep 24
instance 0, ep 25
instance 0, ep 26
instance 0, ep 27
instance 0, ep 28
instance 0, ep 29
instance 1, ep 1
instance 1, ep 2
instance 1, ep 3
instance 1, ep 4
instance 1, ep 5
instance 1, ep 6
instance 1, ep 7
instance 1, ep 8
instance 1, ep 9
instance 1, ep 10
instance 1, ep 11
instance 1, ep 12
instance 1, ep 13
instance 1, ep 14
instance 1, ep 15
instance 1, ep 16
instance 1, ep 17
instance 1, ep 18
instance 1, ep 19
instance 1, ep 20
instance 1, ep 21
instance 1, ep 22
instance 1, ep 23
instance 1, ep 24
instance 1, ep 25
instance 1, ep 26
instance 1, ep 27
instance 1

In [31]:
# Notebook-only sanity check: run whittle_greedy_policy at weight lamb and
# print mean reward + active rate * lamb * n_arms * volunteers_per_arm.
if is_jupyter:
    policy = whittle_greedy_policy 
    whittle_greedy_reward, whittle_greedy_active_rate = run_heterogenous_policy(simulator, n_episodes, n_epochs, discount,policy,seed,lamb=lamb)
    print(np.mean(whittle_greedy_reward) + whittle_greedy_active_rate*lamb*n_arms*volunteers_per_arm)

instance 0, ep 1
instance 0, ep 2
instance 0, ep 3
instance 0, ep 4
instance 0, ep 5
instance 0, ep 6
instance 0, ep 7
instance 0, ep 8
instance 0, ep 9
instance 0, ep 10
instance 0, ep 11
instance 0, ep 12
instance 0, ep 13
instance 0, ep 14
instance 0, ep 15
instance 0, ep 16
instance 0, ep 17
instance 0, ep 18
instance 0, ep 19
instance 0, ep 20
instance 0, ep 21
instance 0, ep 22
instance 0, ep 23
instance 0, ep 24
instance 0, ep 25
instance 0, ep 26
instance 0, ep 27
instance 0, ep 28
instance 0, ep 29
instance 1, ep 1
instance 1, ep 2
instance 1, ep 3
instance 1, ep 4
instance 1, ep 5
instance 1, ep 6
instance 1, ep 7
instance 1, ep 8
instance 1, ep 9
instance 1, ep 10
instance 1, ep 11
instance 1, ep 12
instance 1, ep 13
instance 1, ep 14
instance 1, ep 15
instance 1, ep 16
instance 1, ep 17
instance 1, ep 18
instance 1, ep 19
instance 1, ep 20
instance 1, ep 21
instance 1, ep 22
instance 1, ep 23
instance 1, ep 24
instance 1, ep 25
instance 1, ep 26
instance 1, ep 27
instance 1

## MCTS Policies

In [32]:
# Notebook-only sanity check: run mcts_policy at weight lamb and print
# mean reward + active rate * lamb * n_arms * volunteers_per_arm.
if is_jupyter:
    policy = mcts_policy 
    mcts_reward, mcts_active_rate = run_heterogenous_policy(simulator, n_episodes, n_epochs, discount,policy,seed,lamb=lamb)
    print(np.mean(mcts_reward) + mcts_active_rate*lamb*n_arms*volunteers_per_arm)

instance 0, ep 1
instance 0, ep 2
instance 0, ep 3
instance 0, ep 4
instance 0, ep 5
instance 0, ep 6
instance 0, ep 7
instance 0, ep 8
instance 0, ep 9
instance 0, ep 10
instance 0, ep 11
instance 0, ep 12
instance 0, ep 13
instance 0, ep 14
instance 0, ep 15
instance 0, ep 16
instance 0, ep 17
instance 0, ep 18
instance 0, ep 19
instance 0, ep 20
instance 0, ep 21
instance 0, ep 22
instance 0, ep 23
instance 0, ep 24
instance 0, ep 25
instance 0, ep 26
instance 0, ep 27
instance 0, ep 28
instance 0, ep 29
instance 1, ep 1
instance 1, ep 2
instance 1, ep 3
instance 1, ep 4
instance 1, ep 5
instance 1, ep 6
instance 1, ep 7
instance 1, ep 8
instance 1, ep 9
instance 1, ep 10
instance 1, ep 11
instance 1, ep 12
instance 1, ep 13
instance 1, ep 14
instance 1, ep 15
instance 1, ep 16
instance 1, ep 17
instance 1, ep 18
instance 1, ep 19
instance 1, ep 20
instance 1, ep 21
instance 1, ep 22
instance 1, ep 23
instance 1, ep 24
instance 1, ep 25
instance 1, ep 26
instance 1, ep 27
instance 1

In [33]:
# Notebook-only sanity check: run mcts_mcts_policy at weight lamb and print
# mean reward + active rate * lamb * n_arms * volunteers_per_arm.
if is_jupyter:
    policy = mcts_mcts_policy
    mcts_mcts_reward, mcts_mcts_active_rate = run_heterogenous_policy(simulator, n_episodes, n_epochs, discount,policy,seed,lamb=lamb)
    print(np.mean(mcts_mcts_reward) + mcts_mcts_active_rate*lamb*n_arms*volunteers_per_arm)

instance 0, ep 1
instance 0, ep 2
instance 0, ep 3
instance 0, ep 4
instance 0, ep 5
instance 0, ep 6
instance 0, ep 7
instance 0, ep 8
instance 0, ep 9
instance 0, ep 10
instance 0, ep 11
instance 0, ep 12
instance 0, ep 13
instance 0, ep 14
instance 0, ep 15
instance 0, ep 16
instance 0, ep 17
instance 0, ep 18
instance 0, ep 19
instance 0, ep 20
instance 0, ep 21
instance 0, ep 22
instance 0, ep 23
instance 0, ep 24
instance 0, ep 25
instance 0, ep 26
instance 0, ep 27
instance 0, ep 28
instance 0, ep 29
instance 1, ep 1
instance 1, ep 2
instance 1, ep 3
instance 1, ep 4


KeyboardInterrupt: 

In [None]:
# Notebook-only sanity check: run mcts_whittle_policy at weight lamb and print
# mean reward + active rate * lamb * n_arms * volunteers_per_arm.
# The results are stored under mcts_whittle_* names so they no longer overwrite
# the mcts_mcts_reward / mcts_mcts_active_rate values from the mcts_mcts cell.
if is_jupyter:
    policy = mcts_whittle_policy
    mcts_whittle_reward, mcts_whittle_active_rate = run_heterogenous_policy(simulator, n_episodes, n_epochs, discount,policy,seed,lamb=lamb)
    print(np.mean(mcts_whittle_reward) + mcts_whittle_active_rate*lamb*n_arms*volunteers_per_arm)

instance 0, ep 1
instance 0, ep 2
instance 0, ep 3
instance 0, ep 4
instance 0, ep 5
instance 0, ep 6
instance 0, ep 7
instance 0, ep 8
instance 0, ep 9
instance 0, ep 10
instance 0, ep 11
instance 0, ep 12
instance 0, ep 13
instance 0, ep 14
instance 0, ep 15
instance 0, ep 16
instance 0, ep 17
instance 0, ep 18
instance 0, ep 19
instance 0, ep 20
instance 0, ep 21
instance 0, ep 22
instance 0, ep 23
instance 0, ep 24
instance 0, ep 25
instance 0, ep 26
instance 0, ep 27
instance 0, ep 28
instance 0, ep 29
instance 1, ep 1
instance 1, ep 2
instance 1, ep 3
instance 1, ep 4
instance 1, ep 5
instance 1, ep 6
instance 1, ep 7
instance 1, ep 8
instance 1, ep 9
instance 1, ep 10
instance 1, ep 11
instance 1, ep 12
instance 1, ep 13
instance 1, ep 14
instance 1, ep 15
instance 1, ep 16
instance 1, ep 17
instance 1, ep 18
instance 1, ep 19
instance 1, ep 20
instance 1, ep 21
instance 1, ep 22
instance 1, ep 23
instance 1, ep 24
instance 1, ep 25
instance 1, ep 26
instance 1, ep 27
instance 1

## Optimal Policy

In [None]:
# Notebook-only sanity check of the Q-iteration policy. It is only run when
# there are at most 6 volunteers in total (n_arms * volunteers_per_arm <= 6).
# q_iteration_epoch is passed to the runner as its per_epoch_function.
if is_jupyter and n_arms*volunteers_per_arm <= 6:
    policy = q_iteration_policy
    per_epoch_function = q_iteration_epoch
    q_reward, q_active_rate = run_heterogenous_policy(simulator, n_episodes, n_epochs, discount,policy,seed,lamb=lamb,per_epoch_function=per_epoch_function)
    print(np.mean(q_reward) + q_active_rate*lamb*n_arms*volunteers_per_arm)

## Actual Experiments

In [None]:
# Lambda grid for the sweep: a short three-point grid for "two_step" runs, the
# full ten-point grid otherwise. Every entry is divided by the total number of
# volunteers (n_arms * volunteers_per_arm).
lamb_list = [
    unscaled/(n_arms*volunteers_per_arm)
    for unscaled in (
        [1,16,64]
        if "two_step" in save_name
        else [0,0.25,0.5,1,2,4,8,16,32,64]
    )
]

In [None]:
# Choose which policies the sweep evaluates, based on a keyword in save_name.
# The checks run in order, so a name containing several keywords takes the
# first match ("combined", then "mcts", then "two_step").
if "combined" in save_name:
    policies = [random_policy,greedy_policy,greedy_one_step_policy,whittle_policy,whittle_activity_policy,shapley_whittle_policy,whittle_greedy_policy]
    policy_names = ["random","greedy","greedy_one_step","whittle","whittle_activity","shapley_whittle","whittle_greedy"]
elif "mcts" in save_name:
    policies = [mcts_policy,mcts_mcts_policy]
    policy_names = ["mcts","mcts_mcts"]
elif "two_step" in save_name:
    policies = [whittle_greedy_policy,mcts_policy,mcts_mcts_policy,mcts_whittle_policy]
    policy_names = ["whittle_greedy","mcts","mcts_mcts","mcts_whittle"]
else:
    # Without this branch `policies` stays unbound (NameError later in a fresh
    # run) or, in a reused kernel, silently keeps the list from a previous run.
    raise ValueError(
        "save_name {!r} must contain one of 'combined', 'mcts' or 'two_step' "
        "to select the policies to run".format(save_name)
    )

In [None]:
# Results container. The run configuration is stored under 'parameters' so the
# saved JSON records the settings it was produced with.
results = {
    'parameters': dict(
        seed=seed,
        n_arms=n_arms,
        volunteers_per_arm=volunteers_per_arm,
        budget=budget,
        discount=discount,
        alpha=alpha,
        n_episodes=n_episodes,
        episode_len=episode_len,
        n_epochs=n_epochs,
        lambda_list=lamb_list,
        time_per_run=TIME_PER_RUN,
    ),
}

In [None]:
# Sweep the Q-iteration policy over lamb_list and record, per lambda, the mean
# reward and the active rate. Only run when there are at most 6 volunteers in
# total (n_arms * volunteers_per_arm <= 6).
if (n_arms * volunteers_per_arm) <= 6:
    print("Running optimal")
    policy = q_iteration_policy
    per_epoch_function = q_iteration_epoch

    match_reward_list = []
    active_rate_list = []

    # The loop variable is deliberately not called `lamb`: that name holds the
    # weight used by the interactive cells, and an interrupted sweep would
    # otherwise leave it at an arbitrary grid value.
    for sweep_lamb in lamb_list:
        reward, active_rate = run_heterogenous_policy(simulator, n_episodes, n_epochs, discount,policy,seed,lamb=sweep_lamb,per_epoch_function=per_epoch_function)
        match_reward_list.append(np.mean(reward))
        active_rate_list.append(active_rate)

    results['optimal_match'] = match_reward_list
    results['optimal_active'] = active_rate_list

In [None]:
# Sweep every selected policy over lamb_list. For each policy the per-lambda
# mean reward and active rate are stored under '<name>_match' and
# '<name>_active'.
for policy,name in zip(policies,policy_names):
    match_reward_list = []
    active_rate_list = []

    print("On policy {}".format(name))

    # The loop variable is deliberately not called `lamb`: that name holds the
    # weight used by the interactive cells, and an interrupted sweep would
    # otherwise leave it at an arbitrary grid value.
    for sweep_lamb in lamb_list:
        reward, active_rate = run_heterogenous_policy(simulator, n_episodes, n_epochs, discount,policy,seed,lamb=sweep_lamb)
        match_reward_list.append(np.mean(reward))
        active_rate_list.append(active_rate)

    results['{}_match'.format(name)] = match_reward_list
    results['{}_active'.format(name)] = active_rate_list

On policy whittle_greedy
instance 0, ep 1
instance 0, ep 2
instance 0, ep 3
instance 0, ep 4
instance 0, ep 5
instance 0, ep 6
instance 0, ep 7
instance 0, ep 8
instance 0, ep 9
instance 0, ep 10
instance 0, ep 11
instance 0, ep 12
instance 0, ep 13
instance 0, ep 14
instance 0, ep 15
instance 0, ep 16
instance 0, ep 17
instance 0, ep 18
instance 0, ep 19
instance 0, ep 20
instance 0, ep 21
instance 0, ep 22
instance 0, ep 23
instance 0, ep 24
instance 0, ep 25
instance 0, ep 26
instance 0, ep 27
instance 0, ep 28
instance 0, ep 29
instance 1, ep 1
instance 1, ep 2
instance 1, ep 3
instance 1, ep 4
instance 1, ep 5
instance 1, ep 6
instance 1, ep 7
instance 1, ep 8
instance 1, ep 9
instance 1, ep 10
instance 1, ep 11
instance 1, ep 12
instance 1, ep 13
instance 1, ep 14
instance 1, ep 15
instance 1, ep 16
instance 1, ep 17
instance 1, ep 18
instance 1, ep 19
instance 1, ep 20
instance 1, ep 21
instance 1, ep 22
instance 1, ep 23
instance 1, ep 24
instance 1, ep 25
instance 1, ep 26
ins

In [None]:
# Build the output path for this run under the 'combined_large' results group.
# The path is later appended to '../results/', so it is presumably relative to
# that directory; confirm in rmab.utils.get_save_path.
save_path = get_save_path('combined_large',save_name,seed,use_date=save_with_date)

In [None]:
# Project helper from rmab.utils. Presumably removes earlier saved results in
# 'combined_large' that duplicate this run; TODO confirm the matching rule.
delete_duplicate_results('combined_large',save_name,results)

In [None]:
# Write the results as JSON. The context manager guarantees the file is
# flushed and closed even if serialization raises part-way through.
with open('../results/'+save_path,'w') as results_file:
    json.dump(results, results_file)