# All Policies

Analyze the performance of our Whittle and Adaptive Policies

In [1]:
%load_ext autoreload
%autoreload 2

In [44]:
import numpy as np
import random 
import matplotlib.pyplot as plt
import json 
import argparse 
import sys
import secrets
from itertools import combinations

In [42]:
from rmab.whittle_policies import *
from rmab.baseline_policies import *
from rmab.mcts_policies import *
from rmab.utils import get_save_path, delete_duplicate_results, restrict_resources
from rmab.simulator import RMABSimulator, run_heterogenous_policy, get_discounted_reward, create_random_transitions
from rmab.fr_dynamics import get_all_transitions, get_db_data, get_all_transitions_partition
from rmab.compute_whittle import get_q_vals,Q_multi_prob, fast_Q_multi_prob
from rmab.multiple_notifications import get_notification_text, get_volunteer_trip_info, get_donor_info, get_recipient_info

In [4]:
is_jupyter = 'ipykernel' in sys.modules

In [5]:
if is_jupyter: 
    seed        = 43
    n_arms      = 20
    volunteers_per_arm = 1
    budget      = 10
    discount    = 0.9
    alpha       = 3 
    n_episodes  = 105
    episode_len = 50 
    n_epochs    = 1
    save_with_date = False 
    lamb = 0.5
    prob_distro = 'food_rescue_top'
    reward_type = "probability"
    reward_parameters = {'universe_size': 20, 'arm_set_low': 0, 'arm_set_high': 1}
    out_folder = 'iterative'
    time_limit = 100
else:
    parser = argparse.ArgumentParser()
    parser.add_argument('--n_arms',         '-N', help='num beneficiaries (arms)', type=int, default=2)
    parser.add_argument('--volunteers_per_arm',         '-V', help='volunteers per arm', type=int, default=5)
    parser.add_argument('--episode_len',    '-H', help='episode length', type=int, default=50)
    parser.add_argument('--n_episodes',     '-T', help='num episodes', type=int, default=105)
    parser.add_argument('--budget',         '-B', help='budget', type=int, default=3)
    parser.add_argument('--n_epochs',       '-E', help='number of epochs (num_repeats)', type=int, default=1)
    parser.add_argument('--discount',       '-d', help='discount factor', type=float, default=0.9)
    parser.add_argument('--alpha',          '-a', help='alpha: for conf radius', type=float, default=3)
    parser.add_argument('--lamb',          '-l', help='lambda for matching-engagement tradeoff', type=float, default=0.5)
    parser.add_argument('--universe_size', help='For set cover, total num unvierse elems', type=int, default=10)
    parser.add_argument('--arm_set_low', help='Least size of arm set, for set cover', type=float, default=3)
    parser.add_argument('--arm_set_high', help='Largest size of arm set, for set cover', type=float, default=6)
    parser.add_argument('--reward_type',          '-r', help='Which type of custom reward', type=str, default='set_cover')
    parser.add_argument('--seed',           '-s', help='random seed', type=int, default=42)
    parser.add_argument('--prob_distro',           '-p', help='which prob distro [uniform,uniform_small,uniform_large,normal]', type=str, default='uniform')
    parser.add_argument('--out_folder', help='Which folder to write results to', type=str, default='iterative')
    parser.add_argument('--time_limit', help='Online time limit for computation', type=float, default=100)
    parser.add_argument('--use_date', action='store_true')

    args = parser.parse_args()

    n_arms      = args.n_arms
    volunteers_per_arm = args.volunteers_per_arm
    budget      = args.budget
    discount    = args.discount
    alpha       = args.alpha 
    seed        = args.seed
    n_episodes  = args.n_episodes
    episode_len = args.episode_len
    n_epochs    = args.n_epochs
    lamb = args.lamb
    save_with_date = args.use_date
    prob_distro = args.prob_distro
    out_folder = args.out_folder
    reward_type = args.reward_type
    reward_parameters = {'universe_size': args.universe_size,
                        'arm_set_low': args.arm_set_low, 
                        'arm_set_high': args.arm_set_high}
    time_limit = args.time_limit 

save_name = secrets.token_hex(4)  

In [6]:
results = {}
results['parameters'] = {'seed'      : seed,
        'n_arms'    : n_arms,
        'volunteers_per_arm': volunteers_per_arm, 
        'budget'    : budget,
        'discount'  : discount, 
        'alpha'     : alpha, 
        'n_episodes': n_episodes, 
        'episode_len': episode_len, 
        'n_epochs'  : n_epochs, 
        'lamb': lamb,
        'prob_distro': prob_distro, 
        'reward_type': reward_type, 
        'universe_size': reward_parameters['universe_size'],
        'arm_set_low': reward_parameters['arm_set_low'], 
        'arm_set_high': reward_parameters['arm_set_high'],
        'time_limit': time_limit, 
        } 

In [7]:
n_states = 2
n_actions = 2


In [8]:
np.random.seed(seed)
all_population_size = 100 
max_transition_prob = 0.25
all_transitions = create_random_transitions(all_population_size,max_transition_prob)


In [9]:
def partition_volunteers(probs_by_num,num_by_section):
    total = sum([len(probs_by_num[i]) for i in probs_by_num])
    num_per_section = total//num_by_section

    nums_by_partition = []
    current_count = 0
    current_partition = []

    keys = sorted(probs_by_num.keys())

    for i in keys:
        if current_count >= num_per_section*(len(nums_by_partition)+1):
            nums_by_partition.append(current_partition)
            current_partition = []
        
        current_partition.append(i)
        current_count += len(probs_by_num[i])
    return nums_by_partition


In [10]:
if prob_distro == "food_rescue_top":
    all_population_size = 20 
    probs_by_user = json.load(open("../../../results/food_rescue/match_probs.json","r"))
    donation_id_to_latlon, recipient_location_to_latlon, rescues_by_user, all_rescue_data, user_id_to_latlon = get_db_data()
    probs_by_num = {}
    for i in rescues_by_user:
        if str(i) in probs_by_user and probs_by_user[str(i)] > 0 and len(rescues_by_user[i]) >= 100:
            if len(rescues_by_user[i]) not in probs_by_num:
                probs_by_num[len(rescues_by_user[i])] = []
            probs_by_num[len(rescues_by_user[i])].append(probs_by_user[str(i)])

    partitions = partition_volunteers(probs_by_num,all_population_size)
    probs_by_partition = []

    for i in range(len(partitions)):
        temp_probs = []
        for j in partitions[i]:
            temp_probs += (probs_by_num[j])
        probs_by_partition.append(temp_probs)

    all_transitions = get_all_transitions_partition(all_population_size,partitions)

    for i,partition in enumerate(partitions):
        current_transitions = np.array(all_transitions[i])
        partition_scale = np.array([len(probs_by_num[j]) for j in partition])
        partition_scale = partition_scale/np.sum(partition_scale)
        prod = current_transitions*partition_scale[:,np.newaxis,np.newaxis,np.newaxis]
        new_transition = np.sum(prod,axis=0)
        all_transitions[i] = new_transition
    all_transitions = np.array(all_transitions)


In [12]:
def create_environment(seed):
    random.seed(seed)
    np.random.seed(seed)

    all_features = np.arange(all_population_size)
    N = all_population_size*volunteers_per_arm
    print(probs_by_partition[0])
    match_probabilities = [np.random.choice(probs_by_partition[i//volunteers_per_arm]) for i in range(N)] 

    simulator = RMABSimulator(all_population_size, all_features, all_transitions,
                n_arms, volunteers_per_arm, episode_len, n_epochs, n_episodes, budget, discount,number_states=n_states, reward_style='custom',match_probability_list=match_probabilities)
    simulator.reward_type = reward_type 
    simulator.reward_parameters = reward_parameters 
    return simulator 


In [13]:
def run_multi_seed(seed_list,policy,is_mcts=False,per_epoch_function=None,train_iterations=0,test_iterations=400,test_length=20,shapley_iterations=1000):
    memories = []
    scores = {
        'reward': [],
        'time': [], 
        'match': [], 
        'active_rate': [],
    }

    for seed in seed_list:
        simulator = create_environment(seed)
        simulator.time_limit = time_limit

        simulator.mcts_train_iterations = train_iterations
        simulator.mcts_test_iterations = test_iterations
        simulator.shapley_iterations = shapley_iterations 

        if prob_distro == "linearity":
            set_list = []

            for i in range(budget):
                set_list.append(set(list(range(1,int(reward_parameters['arm_set_high'])+1))))
            
            for i in range(n_arms*volunteers_per_arm-budget):
                nums = list(range(1,int(reward_parameters['universe_size'])+1))[i*int(reward_parameters['arm_set_low']):(i+1)*int(reward_parameters['arm_set_low']):]
                set_list.append(set(nums))
            
            simulator.match_probability_list[simulator.cohort_selection[0]]  = set_list 

        if prob_distro == "one_time":
            N = n_arms*volunteers_per_arm
            simulator.first_init_states = np.array([[[1 for i in range(N)] for i in range(n_episodes)]])
            random.seed(seed)
            shuffled_list = [reward_parameters['arm_set_high'] for i in range(2)] + [reward_parameters['arm_set_high'] for i in range(N-2)]
            random.shuffle(shuffled_list)

            simulator.match_probability_list[simulator.cohort_selection[0]] = shuffled_list

        if is_mcts:
            match, active_rate, memory = run_heterogenous_policy(simulator, n_episodes, n_epochs, discount,policy,seed,lamb=lamb,should_train=False,test_T=test_length,get_memory=True,per_epoch_function=per_epoch_function)
        else:
            match, active_rate = run_heterogenous_policy(simulator, n_episodes, n_epochs, discount,policy,seed,lamb=lamb,should_train=False,test_T=test_length,per_epoch_function=per_epoch_function)

        num_timesteps = match.size
        match = match.reshape((num_timesteps//episode_len,episode_len))
        active_rate = active_rate.reshape((num_timesteps//episode_len,episode_len))

        time_whittle = simulator.time_taken
        discounted_reward = get_discounted_reward(match,active_rate,discount,lamb)
        scores['reward'].append(discounted_reward)
        scores['time'].append(time_whittle)
        scores['match'].append(np.mean(match))
        scores['active_rate'].append(np.mean(active_rate))
        if is_mcts:
            memories.append(memory)

    return scores, memories, simulator


In [46]:
volunteer_id = 37683
donor_id = 6220
recipient_id = 7848

In [52]:
get_notification_text(get_volunteer_trip_info(volunteer_id,donor_id,recipient_id)[0],
                    get_donor_info(donor_id),
                    get_recipient_info(recipient_id)
                    ,get_volunteer_trip_info(volunteer_id,donor_id,recipient_id)[1])

'A trip you completed in past is available again! From United Dairy Farmers in Northside to St Leo Food Pantry in North Fairmount'

## Finding Transitions

In [13]:
from datetime import timedelta
import rmab.secret as secret
from rmab.database import open_connection
from rmab.fr_dynamics import get_transitions, compute_days_till, get_data_all_users
from collections import Counter

In [45]:
db_name = secret.database_name 
username = secret.database_username 
password = secret.database_password 
ip_address = secret.ip_address
port = secret.database_port

In [46]:
connection_dict = open_connection(db_name,username,password,ip_address,port)
connection = connection_dict['connection']
cursor = connection_dict['cursor']

In [47]:
data_by_user = get_data_all_users(cursor)

In [48]:
get_transitions_multiple_states_match(data_by_user,200,0.01)[1,1]

array([9.89901010e-05, 6.92200636e-01, 1.09989001e-01, 5.49945005e-01,
       1.00000000e-02, 0.00000000e+00, 0.00000000e+00])

## Speed of Value Iteration

In [243]:
n_states = 25
transitions = np.zeros((n_states,2,n_states))
transitions = np.random.random((n_states,2,n_states))
for i in range(len(transitions)):
    for j in range(len(transitions[i])):
        transitions[i,j] /= np.sum(transitions[i,j])
state = 1
subsidy_break = 1e-3 
whittle_threshold = 1e-3 

In [190]:
def get_q_vals_full(transitions, state, predicted_subsidy, discount, threshold=1e-3, reward_function='activity', lamb=0,
                    match_prob=0.5, get_v=False, num_arms=1):
    """Get the Q values for one arm

    Arguments: 
        transitions: 2x2 numpy array; P_{i}(s,a,1)
        state: integer, which state s_{i} (0 or 1) is an arm in
        predicted_subsidy: float, w, how much to penalize pulling an arm by
        discount: float, \gamma, discount for reward
        lamb: float, \alpha, tradeoff between R_{i} and R_{glob}
        match_prob: p_{i}(s_{i}), current marginal reward
        num_arms: N, total number of arms

    Returns: List of Q values for current state; [Q(s,0),Q(s,1)] 
    """
    assert discount < 1

    n_states, n_actions, _ = transitions.shape
    value_func = np.random.random(n_states)
    difference = np.ones(n_states)

    # lambda-adjusted reward function
    def reward(s, a):
        return s / num_arms - a * predicted_subsidy

    def reward_matching(s, a):
        return s * a * match_prob - a * predicted_subsidy 

    def combined_reward(s, a):
        return s * a * match_prob * (1 - lamb) + lamb * s / num_arms - a * predicted_subsidy

    reward_functions = {
        'activity': reward,
        'matching': reward_matching,
        'combined': combined_reward
    }

    if reward_function not in reward_functions:
        raise ValueError(f"Reward function {reward_function} not found")

    r = reward_functions[reward_function]

    r_matrix = np.array([[r(i, j) for j in range(n_actions)] for i in range(n_states)])

    while np.max(difference) >= threshold:
        orig_value_func = np.copy(value_func)
        print("r matrix {} transitions {} value func {}".format(r_matrix.shape,transitions.shape,value_func.shape))
        Q_func = r_matrix + discount * np.dot(transitions, value_func)

        value_func = np.max(Q_func, axis=1)
        difference = np.abs(orig_value_func - value_func)

    if get_v:
        return Q_func[state, :], value_func

    return Q_func[state, :]

In [191]:
start = time.time() 
lb, ub = -1000,1000

while abs(ub - lb) > 1e-3:
    predicted_subsidy = (lb + ub) / 2

    action = np.argmax(get_q_vals_full(transitions,state,predicted_subsidy,discount,1e-3,reward_function='combined',lamb=lamb,match_prob=0.5,num_arms=4))

    if action == 0:
        # optimal action is passive: subsidy is too high
        ub = predicted_subsidy
    elif action == 1:
        # optimal action is active: subsidy is too low
        lb = predicted_subsidy
    else:
        raise Exception(f'action not binary: {action}')

subsidy = (ub + lb) / 2
subsidy
time.time()-start

r matrix (25, 2) transitions (25, 2, 25) value func (25,)
r matrix (25, 2) transitions (25, 2, 25) value func (25,)
r matrix (25, 2) transitions (25, 2, 25) value func (25,)
r matrix (25, 2) transitions (25, 2, 25) value func (25,)
r matrix (25, 2) transitions (25, 2, 25) value func (25,)
r matrix (25, 2) transitions (25, 2, 25) value func (25,)
r matrix (25, 2) transitions (25, 2, 25) value func (25,)
r matrix (25, 2) transitions (25, 2, 25) value func (25,)
r matrix (25, 2) transitions (25, 2, 25) value func (25,)
r matrix (25, 2) transitions (25, 2, 25) value func (25,)
r matrix (25, 2) transitions (25, 2, 25) value func (25,)
r matrix (25, 2) transitions (25, 2, 25) value func (25,)
r matrix (25, 2) transitions (25, 2, 25) value func (25,)
r matrix (25, 2) transitions (25, 2, 25) value func (25,)
r matrix (25, 2) transitions (25, 2, 25) value func (25,)
r matrix (25, 2) transitions (25, 2, 25) value func (25,)
r matrix (25, 2) transitions (25, 2, 25) value func (25,)
r matrix (25, 

0.03153395652770996

In [240]:
def Q_multi_prob_slow(transitions, state, predicted_subsidy, discount, threshold=1e-3, reward_function='combined', lamb=0.5,
                      match_prob=0.5, match_prob_now=1, num_arms=1):
    """Value iteration when initial and subsequent rewards differ
        We do this through Q Iteration on 3 states
        State 0 and 1 are as normal, while 
        State 2 assumes we get match_prob_now reward, then we transition
        to state 0 or 1, with the same transitions as state 1

    Arguments:
        transitions: Numpy array of size 2x2 (for 2 states x 2 actions)
        state: Integer; which state the arm is currently in
            Will compute Q(s,0) and Q(s,1)
        predicted_subsidy: Float, how much to penalize pulling an arm by
        discount: Float, \gamma, how much to discount future rewards
        threshold: Loop exit condition; when value error <= threshold
            we break
        match_prob: Float; how much reward we get when pulling an arm
            in later time steps
        match_prob_now: Float; how much reward we get when pulling an arm
            in this time step
        num_arms: Total number of arms, N
    
    Returns: 3x2 Numpy array, where Q(2,a) is 
        the rewards for pulling/not pulling an arm now
    """
    assert discount < 1
    assert state == 1

    # transitions = new_transitions 

    n_states, n_actions, _ = transitions.shape

    value_func = np.random.random(n_states)
    difference = 1

    def combined_reward(s, a):
        s = min(s,1)
        return s * a * match_prob * (1 - lamb) + lamb * s / num_arms - a * predicted_subsidy

    reward_functions = {
        'combined': combined_reward
    }

    if reward_function not in reward_functions:
        raise ValueError(f"Reward function {reward_function} not found")

    r = reward_functions[reward_function]

    r_matrix = np.array([[r(min(i, n_states-1), j) for j in range(n_actions)] for i in range(n_states)])

    while difference >= threshold:
        start = time.time() 
        orig_value_func = np.copy(value_func)

        Q_func = r_matrix + discount * np.dot(transitions, value_func)
        value_func = np.max(Q_func, axis=1)
        difference = np.max(np.abs(orig_value_func - value_func))

        print("Last",time.time()-start)

    return Q_func


In [244]:
lb, ub = -1,1 # return lower and upper bounds on WI
state = 1
assert state == 1
transitions = np.zeros((2,2,2))
transitions[:,:,1] = np.random.random((2,2))
transitions[:,:,0] = 1-transitions[:,:,1] 

start = time.time()
use_fast = False

if use_fast:
    transitions = transitions[:,:,1]
else:
    new_transitions = np.zeros((3, 2, 3))
    new_transitions[:2, :2, :2] = transitions 
    new_transitions[2, :, :2] = transitions[1]
    transitions = new_transitions 


while abs(ub - lb) > 1e-3:
    predicted_subsidy = (lb + ub) / 2

    if use_fast: 
        Q_multi = fast_Q_multi_prob(transitions, state, discount,lamb=lamb,
                    match_prob=0.5,match_prob_now=0.25,num_arms=4)
    else:
        Q_multi = Q_multi_prob_slow(transitions, state, predicted_subsidy, discount,reward_function='combined',lamb=lamb,
                    match_prob=0.5,match_prob_now=0.25,num_arms=4)
    
    if use_fast:
        action = np.argmax(Q_multi)
    else:
        action = np.argmax(Q_multi[2,:])
    if action == 0:
        # optimal action is passive: subsidy is too high
        ub = predicted_subsidy
    elif action == 1:
        # optimal action is active: subsidy is too low
        lb = predicted_subsidy
    else:
        raise Exception(f'action not binary: {action}')

subsidy = (ub + lb) / 2
time.time()-start


Copy 7.152557373046875e-06
Middle 0.00027942657470703125
Value Func 0.0003039836883544922
Last 0.00031948089599609375
Copy 2.384185791015625e-06
Middle 1.430511474609375e-05
Value Func 2.574920654296875e-05
Last 3.790855407714844e-05
Copy 3.814697265625e-06
Middle 1.621246337890625e-05
Value Func 2.8371810913085938e-05
Last 3.9577484130859375e-05
Copy 2.86102294921875e-06
Middle 1.4066696166992188e-05
Value Func 2.5510787963867188e-05
Last 3.62396240234375e-05
Copy 3.0994415283203125e-06
Middle 1.3828277587890625e-05
Value Func 2.5272369384765625e-05
Last 3.62396240234375e-05
Copy 2.86102294921875e-06
Middle 1.3589859008789062e-05
Value Func 2.5272369384765625e-05
Last 3.600120544433594e-05
Copy 2.6226043701171875e-06
Middle 1.3589859008789062e-05
Value Func 2.4557113647460938e-05
Last 3.552436828613281e-05
Copy 2.86102294921875e-06
Middle 1.33514404296875e-05
Value Func 2.4318695068359375e-05
Last 3.457069396972656e-05
Copy 2.86102294921875e-06
Middle 1.3589859008789062e-05
Value Func

0.013901710510253906

## Index Policies

In [141]:
seed_list = [seed]
restrict_resources()

In [142]:
policy = greedy_policy
name = "greedy"

rewards, memory, simulator = run_multi_seed(seed_list,policy,results['parameters'],test_length=episode_len*(n_episodes%50))
results['{}_reward'.format(name)] = rewards['reward']
results['{}_match'.format(name)] =  rewards['match'] 
results['{}_active'.format(name)] = rewards['active_rate']
results['{}_time'.format(name)] =  rewards['time']
print(np.mean(rewards['reward']))

[0.0025209872185948017, 0.05889281507656066, 0.001668307168715904, 0.0035241048773611504, 0.004228866240960799, 0.09711538461538462, 0.0014444253761226474, 0.006340636574800678, 0.06408629441624365, 0.010603674540682414, 0.01527063804052011, 0.056361607142857144, 0.018603794437281267, 0.040383846461415435, 0.021632041122295994, 0.019370924434215574, 0.0035861383326232068, 0.062461726883037354, 0.022972972972972974, 0.015042029199233151, 0.013845527351703543, 0.001932184125781398, 0.0194954128440367, 0.006313834726090994, 0.002111824165009329, 0.00638640873015873, 0.0065755873340143005, 0.0017701548455840651, 0.0036838340486409155, 0.03698384201077199]
cohort [ 6 19 10  7  1  9 12 16 17  8 14  0 13  4  5  3 18 11  2 15]
instance 0, ep 1
instance 0, ep 2
instance 0, ep 3
instance 0, ep 4
instance 0, ep 5
instance 0, ep 6
instance 0, ep 7
instance 0, ep 8
instance 0, ep 9
instance 0, ep 10
instance 0, ep 11
instance 0, ep 12
instance 0, ep 13
instance 0, ep 14
instance 0, ep 15
instance 0

In [9]:
policy = random_policy
name = "random"

rewards, memory, simulator = run_multi_seed(seed_list,policy,results['parameters'],test_length=episode_len*(n_episodes%50))
results['{}_reward'.format(name)] = rewards['reward']
results['{}_match'.format(name)] =  rewards['match'] 
results['{}_active'.format(name)] = rewards['active_rate']
results['{}_time'.format(name)] =  rewards['time']
print(np.mean(rewards['reward']))

cohort [40 88 42 87]
instance 0, ep 1
instance 0, ep 2
instance 0, ep 3
instance 0, ep 4
Took 0.04671072959899902 time for inference and 0.0002827644348144531 time for training
7.148556983964163


In [10]:
policy = whittle_activity_policy
name = "whittle_activity"

rewards, memory, simulator = run_multi_seed(seed_list,policy,results['parameters'],test_length=episode_len*(n_episodes%50))
results['{}_reward'.format(name)] = rewards['reward']
results['{}_match'.format(name)] =  rewards['match'] 
results['{}_active'.format(name)] = rewards['active_rate']
results['{}_time'.format(name)] =  rewards['time']
print(np.mean(rewards['reward']))

cohort [40 88 42 87]
instance 0, ep 1
instance 0, ep 2
instance 0, ep 3
instance 0, ep 4
Took 0.06586885452270508 time for inference and 0.0005936622619628906 time for training
13.314422290729675


In [143]:
policy = whittle_policy
name = "linear_whittle"

rewards, memory, simulator = run_multi_seed(seed_list,policy,results['parameters'],test_length=episode_len*(n_episodes%50))
results['{}_reward'.format(name)] = rewards['reward']
results['{}_match'.format(name)] =  rewards['match'] 
results['{}_active'.format(name)] = rewards['active_rate']
results['{}_time'.format(name)] =  rewards['time']
print(np.mean(rewards['reward']))

[0.0025209872185948017, 0.05889281507656066, 0.001668307168715904, 0.0035241048773611504, 0.004228866240960799, 0.09711538461538462, 0.0014444253761226474, 0.006340636574800678, 0.06408629441624365, 0.010603674540682414, 0.01527063804052011, 0.056361607142857144, 0.018603794437281267, 0.040383846461415435, 0.021632041122295994, 0.019370924434215574, 0.0035861383326232068, 0.062461726883037354, 0.022972972972972974, 0.015042029199233151, 0.013845527351703543, 0.001932184125781398, 0.0194954128440367, 0.006313834726090994, 0.002111824165009329, 0.00638640873015873, 0.0065755873340143005, 0.0017701548455840651, 0.0036838340486409155, 0.03698384201077199]
cohort [ 6 19 10  7  1  9 12 16 17  8 14  0 13  4  5  3 18 11  2 15]
instance 0, ep 1
instance 0, ep 2
instance 0, ep 3
instance 0, ep 4
instance 0, ep 5
instance 0, ep 6
instance 0, ep 7
instance 0, ep 8
instance 0, ep 9
instance 0, ep 10
instance 0, ep 11
instance 0, ep 12
instance 0, ep 13
instance 0, ep 14
instance 0, ep 15
instance 0

In [12]:
if n_arms * volunteers_per_arm <= 4:
    policy = q_iteration_policy
    per_epoch_function = q_iteration_custom_epoch()
    name = "optimal"

    rewards, memory, simulator = run_multi_seed(seed_list,policy,results['parameters'],per_epoch_function=per_epoch_function,test_length=episode_len*(n_episodes%50))
    results['{}_reward'.format(name)] = rewards['reward']
    results['{}_match'.format(name)] =  rewards['match'] 
    results['{}_active'.format(name)] = rewards['active_rate']
    results['{}_time'.format(name)] =  rewards['time']
    print(np.mean(rewards['reward']))

cohort [40 88 42 87]
instance 0, ep 1
instance 0, ep 2
instance 0, ep 3
instance 0, ep 4
Took 0.05080914497375488 time for inference and 5.8581366539001465 time for training
13.572425195709457


In [11]:
if n_arms * volunteers_per_arm <= 1000:
    policy = shapley_whittle_custom_policy 
    name = "shapley_whittle_custom"

    rewards, memory, simulator = run_multi_seed(seed_list,policy,results['parameters'],test_length=episode_len*(n_episodes%50),shapley_iterations=1000)
    results['{}_reward'.format(name)] = rewards['reward']
    results['{}_match'.format(name)] =  rewards['match'] 
    results['{}_active'.format(name)] = rewards['active_rate']
    results['{}_time'.format(name)] =  rewards['time']
    print(np.mean(rewards['reward']))

acting should always be good! (0, 1) 0.108 < 0.183
good start state should always be good! 0.380 < 0.508
good start state should always be good! 0.506 < 0.760
cohort [80  1 98 86 89 61 50 66 18 49 17  5 40 73 23 20 24 44 32 54]
instance 0, ep 1
instance 0, ep 2
instance 0, ep 3
instance 0, ep 4
Took 0.14577889442443848 time for inference and 0.1260237693786621 time for training
3.1664266867321205


In [16]:
if n_arms * volunteers_per_arm <= 25:
    policy = mcts_linear_policy
    name = "mcts_linear"

    rewards, memory, simulator = run_multi_seed(seed_list,policy,results['parameters'],test_length=episode_len*(n_episodes%50),mcts_test_iterations=400)
    results['{}_reward'.format(name)] = rewards['reward']
    results['{}_match'.format(name)] =  rewards['match'] 
    results['{}_active'.format(name)] = rewards['active_rate']
    results['{}_time'.format(name)] =  rewards['time']
    print(np.mean(rewards['reward']))


cohort [82  9 66 53]


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  choices_weights = [(c.q() / c.n()) + c_param * np.sqrt((2 * np.log(self.n()) / c.n())) for c in self.children]
  choices_weights = [(c.q() / c.n()) + c_param * np.sqrt((2 * np.log(self.n()) / c.n())) for c in self.children]


instance 0, ep 1
instance 0, ep 2
instance 0, ep 3
instance 0, ep 4
Took 15.201551914215088 time for inference and 0.12212157249450684 time for training
5.75516038777406


In [51]:
if n_arms * volunteers_per_arm <= 25:
    policy = mcts_shapley_policy
    name = "mcts_shapley"

    rewards, memory, simulator = run_multi_seed(seed_list,policy,results['parameters'],test_length=episode_len*(n_episodes%50))
    results['{}_reward'.format(name)] = rewards['reward']
    results['{}_match'.format(name)] =  rewards['match'] 
    results['{}_active'.format(name)] = rewards['active_rate']
    results['{}_time'.format(name)] =  rewards['time']
    print(np.mean(rewards['reward']))


[0.001668307168715904, 0.0025209872185948017, 0.0035241048773611504, 0.004228866240960799, 0.05889281507656066, 0.09711538461538462, 0.018603794437281267, 0.010603674540682414, 0.040383846461415435, 0.0014444253761226474, 0.021632041122295994, 0.019370924434215574, 0.006340636574800678, 0.0035861383326232068, 0.06408629441624365, 0.01527063804052011, 0.056361607142857144, 0.001932184125781398, 0.006313834726090994, 0.015042029199233151, 0.062461726883037354, 0.013845527351703543, 0.022972972972972974, 0.0194954128440367, 0.0017701548455840651, 0.0036838340486409155, 0.0065755873340143005, 0.002111824165009329, 0.00638640873015873, 0.03698384201077199]
cohort [ 6 19 10  7  1  9 12 16 17  8 14  0 13  4  5  3 18 11  2 15]
instance 0, ep 1
instance 0, ep 2
instance 0, ep 3
instance 0, ep 4
instance 0, ep 5
instance 0, ep 6
instance 0, ep 7
instance 0, ep 8
instance 0, ep 9
instance 0, ep 10
instance 0, ep 11
instance 0, ep 12
instance 0, ep 13
instance 0, ep 14
instance 0, ep 15
instance 0

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  choices_weights = [(c.q() / c.n()) + c_param * np.sqrt((2 * np.log(self.n()) / c.n())) for c in self.children]
  choices_weights = [(c.q() / c.n()) + c_param * np.sqrt((2 * np.log(self.n()) / c.n())) for c in self.children]


instance 0, ep 101
instance 0, ep 102
instance 0, ep 103
instance 0, ep 104
Took 188.50051403045654 time for inference and 2.601569175720215 time for training
6.625318690059936


In [18]:
if n_arms * volunteers_per_arm <= 25:
    policy = mcts_shapley_policy
    name = "mcts_shapley_40"

    rewards, memory, simulator = run_multi_seed(seed_list,policy,results['parameters'],test_length=episode_len*(n_episodes%50),mcts_test_iterations=40)
    results['{}_reward'.format(name)] = rewards['reward']
    results['{}_match'.format(name)] =  rewards['match'] 
    results['{}_active'.format(name)] = rewards['active_rate']
    results['{}_time'.format(name)] =  rewards['time']
    print(np.mean(rewards['reward']))


cohort [82  9 66 53]
instance 0, ep 1
instance 0, ep 2


KeyboardInterrupt: 

In [None]:
if n_arms * volunteers_per_arm <= 25:
    policy = mcts_shapley_policy
    name = "mcts_shapley_4"

    rewards, memory, simulator = run_multi_seed(seed_list,policy,results['parameters'],test_length=episode_len*(n_episodes%50),mcts_test_iterations=4)
    results['{}_reward'.format(name)] = rewards['reward']
    results['{}_match'.format(name)] =  rewards['match'] 
    results['{}_active'.format(name)] = rewards['active_rate']
    results['{}_time'.format(name)] =  rewards['time']
    print(np.mean(rewards['reward']))


cohort [61 54 87 93]
instance 0, ep 1
instance 0, ep 2
instance 0, ep 3
instance 0, ep 4
instance 0, ep 5
instance 0, ep 6
instance 0, ep 7
instance 0, ep 8
instance 0, ep 9
instance 0, ep 10
instance 0, ep 11
instance 0, ep 12
instance 0, ep 13
instance 0, ep 14
instance 0, ep 15
instance 0, ep 16
instance 0, ep 17
instance 0, ep 18
instance 0, ep 19
instance 0, ep 20
instance 0, ep 21
instance 0, ep 22
instance 0, ep 23
instance 0, ep 24
instance 0, ep 25
instance 0, ep 26
instance 0, ep 27
instance 0, ep 28
instance 0, ep 29
instance 0, ep 30
instance 0, ep 31
instance 0, ep 32
instance 0, ep 33
instance 0, ep 34
instance 0, ep 35
instance 0, ep 36
instance 0, ep 37
instance 0, ep 38
instance 0, ep 39
instance 0, ep 40
instance 0, ep 41
instance 0, ep 42
instance 0, ep 43
instance 0, ep 44
instance 0, ep 45
instance 0, ep 46
instance 0, ep 47
instance 0, ep 48
instance 0, ep 49
instance 0, ep 50
instance 0, ep 51
instance 0, ep 52
instance 0, ep 53
instance 0, ep 54
instance 0, ep 5

In [None]:
if n_arms * volunteers_per_arm <= 250:
    policy = whittle_iterative_policy
    name = "iterative_whittle"

    rewards, memory, simulator = run_multi_seed(seed_list,policy,results['parameters'],test_length=episode_len*(n_episodes%50))
    results['{}_reward'.format(name)] = rewards['reward']
    results['{}_match'.format(name)] =  rewards['match'] 
    results['{}_active'.format(name)] = rewards['active_rate']
    results['{}_time'.format(name)] =  rewards['time']
    print(np.mean(rewards['reward']))

cohort [61 54 87 93]
instance 0, ep 1
instance 0, ep 2
instance 0, ep 3
instance 0, ep 4
instance 0, ep 5
instance 0, ep 6
instance 0, ep 7
instance 0, ep 8
instance 0, ep 9
instance 0, ep 10
instance 0, ep 11
instance 0, ep 12
instance 0, ep 13
instance 0, ep 14
instance 0, ep 15
instance 0, ep 16
instance 0, ep 17
instance 0, ep 18
instance 0, ep 19
instance 0, ep 20
instance 0, ep 21
instance 0, ep 22
instance 0, ep 23
instance 0, ep 24
instance 0, ep 25
instance 0, ep 26
instance 0, ep 27
instance 0, ep 28
instance 0, ep 29
instance 0, ep 30
instance 0, ep 31
instance 0, ep 32
instance 0, ep 33
instance 0, ep 34
instance 0, ep 35
instance 0, ep 36
instance 0, ep 37
instance 0, ep 38
instance 0, ep 39
instance 0, ep 40
instance 0, ep 41
instance 0, ep 42
instance 0, ep 43
instance 0, ep 44
instance 0, ep 45
instance 0, ep 46
instance 0, ep 47
instance 0, ep 48
instance 0, ep 49
instance 0, ep 50
instance 0, ep 51
instance 0, ep 52
instance 0, ep 53
instance 0, ep 54
instance 0, ep 5

In [10]:
if n_arms * volunteers_per_arm <= 25:
    policy = shapley_whittle_iterative_policy
    name = "shapley_iterative_whittle"

    rewards, memory, simulator = run_multi_seed(seed_list,policy,results['parameters'],test_length=episode_len*(n_episodes%50),shapley_iterations=1000)
    results['{}_reward'.format(name)] = rewards['reward']
    results['{}_match'.format(name)] =  rewards['match'] 
    results['{}_active'.format(name)] = rewards['active_rate']
    results['{}_time'.format(name)] =  rewards['time']
    print(np.mean(rewards['reward']))

acting should always be good! (0, 1) 0.108 < 0.183
good start state should always be good! 0.380 < 0.508
good start state should always be good! 0.506 < 0.760
cohort [80  1 98 86 89 61 50 66 18 49 17  5 40 73 23 20 24 44 32 54]


  shapley_indices = shapley_indices / num_by_shapley_index


KeyboardInterrupt: 

In [None]:
if n_arms * volunteers_per_arm <= 50:
    policy = shapley_whittle_iterative_policy
    name = "shapley_iterative_whittle_100"

    rewards, memory, simulator = run_multi_seed(seed_list,policy,results['parameters'],test_length=episode_len*(n_episodes%50),shapley_iterations=100)
    results['{}_reward'.format(name)] = rewards['reward']
    results['{}_match'.format(name)] =  rewards['match'] 
    results['{}_active'.format(name)] = rewards['active_rate']
    results['{}_time'.format(name)] =  rewards['time']
    print(np.mean(rewards['reward']))

cohort [61 54 87 93]
instance 0, ep 1
instance 0, ep 2
instance 0, ep 3
instance 0, ep 4
instance 0, ep 5
instance 0, ep 6
instance 0, ep 7
instance 0, ep 8
instance 0, ep 9
instance 0, ep 10
instance 0, ep 11
instance 0, ep 12
instance 0, ep 13
instance 0, ep 14
instance 0, ep 15
instance 0, ep 16
instance 0, ep 17
instance 0, ep 18
instance 0, ep 19
instance 0, ep 20
instance 0, ep 21
instance 0, ep 22
instance 0, ep 23
instance 0, ep 24
instance 0, ep 25
instance 0, ep 26
instance 0, ep 27
instance 0, ep 28
instance 0, ep 29
instance 0, ep 30
instance 0, ep 31
instance 0, ep 32
instance 0, ep 33
instance 0, ep 34
instance 0, ep 35
instance 0, ep 36
instance 0, ep 37
instance 0, ep 38
instance 0, ep 39
instance 0, ep 40
instance 0, ep 41
instance 0, ep 42
instance 0, ep 43
instance 0, ep 44
instance 0, ep 45
instance 0, ep 46
instance 0, ep 47
instance 0, ep 48
instance 0, ep 49
instance 0, ep 50
instance 0, ep 51
instance 0, ep 52
instance 0, ep 53
instance 0, ep 54
instance 0, ep 5

In [None]:
if n_arms * volunteers_per_arm <= 50:
    policy = shapley_whittle_iterative_policy
    name = "shapley_iterative_whittle_10"

    rewards, memory, simulator = run_multi_seed(seed_list,policy,results['parameters'],test_length=episode_len,shapley_iterations=10)
    results['{}_reward'.format(name)] = rewards['reward']
    results['{}_match'.format(name)] =  rewards['match'] 
    results['{}_active'.format(name)] = rewards['active_rate']
    results['{}_time'.format(name)] =  rewards['time']
    print(np.mean(rewards['reward']))

cohort [61 54 87 93]
instance 0, ep 1
instance 0, ep 2
instance 0, ep 3
instance 0, ep 4
instance 0, ep 5
instance 0, ep 6
instance 0, ep 7
instance 0, ep 8
instance 0, ep 9
instance 0, ep 10
instance 0, ep 11
instance 0, ep 12
instance 0, ep 13
instance 0, ep 14
instance 0, ep 15
instance 0, ep 16
instance 0, ep 17
instance 0, ep 18
instance 0, ep 19
instance 0, ep 20
instance 0, ep 21
instance 0, ep 22
instance 0, ep 23
instance 0, ep 24
instance 0, ep 25
instance 0, ep 26
instance 0, ep 27
instance 0, ep 28
instance 0, ep 29
instance 0, ep 30
instance 0, ep 31
instance 0, ep 32
instance 0, ep 33
instance 0, ep 34
instance 0, ep 35
instance 0, ep 36
instance 0, ep 37
instance 0, ep 38
instance 0, ep 39
instance 0, ep 40
instance 0, ep 41
instance 0, ep 42
instance 0, ep 43
instance 0, ep 44
instance 0, ep 45
instance 0, ep 46
instance 0, ep 47
instance 0, ep 48
instance 0, ep 49
instance 0, ep 50
instance 0, ep 51
instance 0, ep 52
instance 0, ep 53
instance 0, ep 54
instance 0, ep 5

In [None]:
if n_arms * volunteers_per_arm <= 50:
    policy = shapley_whittle_iterative_policy
    name = "shapley_iterative_whittle_1"

    rewards, memory, simulator = run_multi_seed(seed_list,policy,results['parameters'],test_length=episode_len*(n_episodes%50),shapley_iterations=1)
    results['{}_reward'.format(name)] = rewards['reward']
    results['{}_match'.format(name)] =  rewards['match'] 
    results['{}_active'.format(name)] = rewards['active_rate']
    results['{}_time'.format(name)] =  rewards['time']
    print(np.mean(rewards['reward']))

cohort [61 54 87 93]
instance 0, ep 1


instance 0, ep 2
instance 0, ep 3
instance 0, ep 4
instance 0, ep 5
instance 0, ep 6
instance 0, ep 7
instance 0, ep 8
instance 0, ep 9
instance 0, ep 10
instance 0, ep 11
instance 0, ep 12
instance 0, ep 13
instance 0, ep 14
instance 0, ep 15
instance 0, ep 16
instance 0, ep 17
instance 0, ep 18
instance 0, ep 19
instance 0, ep 20
instance 0, ep 21
instance 0, ep 22
instance 0, ep 23
instance 0, ep 24
instance 0, ep 25
instance 0, ep 26
instance 0, ep 27
instance 0, ep 28
instance 0, ep 29
instance 0, ep 30
instance 0, ep 31
instance 0, ep 32
instance 0, ep 33
instance 0, ep 34
instance 0, ep 35
instance 0, ep 36
instance 0, ep 37
instance 0, ep 38
instance 0, ep 39
instance 0, ep 40
instance 0, ep 41
instance 0, ep 42
instance 0, ep 43
instance 0, ep 44
instance 0, ep 45
instance 0, ep 46
instance 0, ep 47
instance 0, ep 48
instance 0, ep 49
instance 0, ep 50
instance 0, ep 51
instance 0, ep 52
instance 0, ep 53
instance 0, ep 54
instance 0, ep 55
instance 0, ep 56
instance 0, ep 57


  shapley_indices /= num_by_shapley_index


## Write Data

In [None]:
save_path = get_save_path(out_folder,save_name,seed,use_date=save_with_date)

In [None]:
delete_duplicate_results(out_folder,"",results)

In [None]:
json.dump(results,open('../../results/'+save_path,'w'))