# MCTS + Whittle Indices Experiments

Analyze the performance of various algorithms for solving the joint matching + activity task when the number of volunteers is large and structured.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import argparse
import json
import random
import secrets
import sys

import matplotlib.pyplot as plt
import numpy as np
import scipy.special
import torch

In [3]:
from rmab.simulator import RMABSimulator, run_heterogenous_policy, get_discounted_reward
from rmab.omniscient_policies import *
from rmab.fr_dynamics import get_all_transitions
from rmab.compute_whittle import arm_compute_whittle_multi_prob
from rmab.mcts_policies import *
from rmab.utils import get_save_path, delete_duplicate_results, create_prob_distro
import resource

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Cap the resources this process may use: GPU memory fraction, CPU threads, address space.
# The CUDA call initialises the CUDA runtime and raises on a host without a usable GPU,
# so it is only issued when CUDA is actually available.
if torch.cuda.is_available():
    torch.cuda.set_per_process_memory_fraction(0.5)
torch.set_num_threads(1)
# Soft address-space limit of 30 GiB; -1 is passed as the hard limit
# (presumably RLIM_INFINITY on this platform — confirm).
resource.setrlimit(resource.RLIMIT_AS, (30 * 1024 * 1024 * 1024, -1))

In [5]:
# True when the `ipykernel` module is loaded, i.e. this code is running inside a
# Jupyter kernel; the configuration cell uses it to skip command-line parsing.
is_jupyter = 'ipykernel' in sys.modules

In [6]:
# Experiment configuration. Both branches define the same set of module-level names:
# hard-coded values when run interactively, command-line flags when run as a script.
if is_jupyter: 
    # Interactive defaults (small three-arm instance).
    seed        = 43
    n_arms      = 3
    volunteers_per_arm = 1
    budget      = 2
    discount    = 0.9
    alpha       = 3 
    n_episodes  = 50
    episode_len = 100 
    n_epochs    = 1
    save_with_date = False 
    TIME_PER_RUN = 0.01 * 1000
    lamb = 0
    prob_distro = 'uniform'
    reward_type = "max"
    reward_parameters = {'universe_size': 20, 'arm_set_low': 15, 'arm_set_high': 20}
    policy_lr=5e-3
    value_lr=1e-4
    train_iterations = 30
    test_iterations = 30
    out_folder = 'mcts_exploration/mcts_shapley'
else:
    # Script mode: read the same settings from the command line.
    parser = argparse.ArgumentParser()
    parser.add_argument('--n_arms',         '-N', help='num beneficiaries (arms)', type=int, default=2)
    parser.add_argument('--volunteers_per_arm',         '-V', help='volunteers per arm', type=int, default=5)
    parser.add_argument('--episode_len',    '-H', help='episode length', type=int, default=20)
    parser.add_argument('--n_episodes',     '-T', help='num episodes', type=int, default=1)
    parser.add_argument('--budget',         '-B', help='budget', type=int, default=3)
    parser.add_argument('--n_epochs',       '-E', help='number of epochs (num_repeats)', type=int, default=3)
    parser.add_argument('--discount',       '-d', help='discount factor', type=float, default=0.9)
    parser.add_argument('--alpha',          '-a', help='alpha: for conf radius', type=float, default=3)
    parser.add_argument('--lamb',          '-l', help='lambda for matching-engagement tradeoff', type=float, default=0.5)
    parser.add_argument('--universe_size', help='For set cover, total num unvierse elems', type=int, default=10)
    parser.add_argument('--arm_set_low', help='Least size of arm set, for set cover', type=int, default=3)
    parser.add_argument('--arm_set_high', help='Largest size of arm set, for set cover', type=int, default=6)
    parser.add_argument('--reward_type',          '-r', help='Which type of custom reward', type=str, default='set_cover')
    parser.add_argument('--seed',           '-s', help='random seed', type=int, default=42)
    parser.add_argument('--prob_distro',           '-p', help='which prob distro [uniform,uniform_small,uniform_large,normal]', type=str, default='uniform')
    parser.add_argument('--time_per_run',      '-t', help='time per MCTS run', type=float, default=.01*1000)
    parser.add_argument('--policy_lr', help='Learning Rate Policy', type=float, default=5e-3)
    parser.add_argument('--value_lr', help='Learning Rate Value', type=float, default=1e-4)
    parser.add_argument('--train_iterations', help='Number of MCTS train iterations', type=int, default=30)
    parser.add_argument('--test_iterations', help='Number of MCTS test iterations', type=int, default=30)
    parser.add_argument('--out_folder', help='Which folder to write results to', type=str, default='mcts_exploration/mcts_shapley')

    # Flag only: when present, save_with_date becomes True.
    parser.add_argument('--use_date', action='store_true')

    args = parser.parse_args()

    # Copy the parsed flags into the same names the interactive branch defines.
    n_arms      = args.n_arms
    volunteers_per_arm = args.volunteers_per_arm
    budget      = args.budget
    discount    = args.discount
    alpha       = args.alpha 
    seed        = args.seed
    n_episodes  = args.n_episodes
    episode_len = args.episode_len
    n_epochs    = args.n_epochs
    lamb = args.lamb
    save_with_date = args.use_date
    TIME_PER_RUN = args.time_per_run
    prob_distro = args.prob_distro
    policy_lr = args.policy_lr 
    value_lr = args.value_lr 
    out_folder = args.out_folder
    train_iterations = args.train_iterations 
    test_iterations = args.test_iterations 
    reward_type = args.reward_type
    # Set-cover reward settings, bundled in the same dict shape as the interactive branch.
    reward_parameters = {'universe_size': args.universe_size,
                        'arm_set_low': args.arm_set_low, 
                        'arm_set_high': args.arm_set_high}

# Random 8-hex-character name for this run; passed to get_save_path when results are written.
save_name = secrets.token_hex(4)  

In [27]:
# Binary state and action spaces; n_states is passed to RMABSimulator as `number_states`.
n_states = 2
n_actions = 2

In [28]:
# Size of the full population for which transition dynamics are loaded.
all_population_size = 100 
# presumably one 2x2x2 transition array per arm, indexed by arm id — confirm in rmab.fr_dynamics
all_transitions = get_all_transitions(all_population_size)
# Disabled experiment: round every transition probability to 0 or 1.
# all_transitions[all_transitions > 0.5] = 1 # np.random.random(all_transitions[all_transitions > 0.5].shape)*(1-1) + 1
# all_transitions[all_transitions <= 0.5] = 0

In [29]:
# Arm ids whose transitions and match probabilities are overridden by hand in the
# following cells; only the first three of the ten candidate ids are kept.
cohort = [87,53,47,7,61,18,10,46,21,96][:3]

In [30]:
# Every cohort arm except the last gets a transition array whose innermost pairs are
# all [1-p, p] with p = 1, i.e. probability 1 on index 1 of the last axis
# (presumably next_state = 1 — confirm the axis order used by RMABSimulator).
a = b = c = d = 1
for i in range(len(cohort)-1):
    all_transitions[cohort[i]] = np.array([
        [[1-a, a], [1-b, b]],
        [[1-c, c], [1-d, d]],
    ])

In [51]:
# Dynamics for the last cohort arm; each innermost pair is [1-p, p].
# NOTE(review): assumes the axes are (state, action, next_state) — TODO confirm.
# Under that reading the arm reaches state 1 with probability 0.17 when acted on (b, d)
# and never when left alone (a, c), regardless of its current state.
a = 0
b = 0.17
c = 0
d = b
all_transitions[cohort[-1]] = np.array([[[1-a,a],[1-b,b]],[[1-c,c],[1-d,d]]])

In [52]:
def create_environment(seed):
    """Seed the RNGs and build an RMABSimulator from the notebook-level configuration.

    Both `random` and `np.random` are seeded with `seed`. For the "set_cover"
    reward type each volunteer gets a random set of integers (set size drawn
    from [arm_set_low, arm_set_high], elements from [0, universe_size]);
    otherwise match probabilities come from `create_prob_distro`. The entries
    for the hand-picked `cohort` are then overwritten with 0.24 for all but the
    last arm and 0.62 for the last one.

    Arguments:
        seed: Value passed to `random.seed` and `np.random.seed`.

    Returns: The configured RMABSimulator instance.
    """
    random.seed(seed)
    np.random.seed(seed)

    total_volunteers = all_population_size*volunteers_per_arm
    if reward_type == "set_cover":
        match_probabilities = []
        for _ in range(total_volunteers):
            # Draw the set size first, then its elements (same RNG order as a nested comprehension).
            set_size = random.randint(reward_parameters['arm_set_low'],reward_parameters['arm_set_high'])
            match_probabilities.append({random.randint(0,reward_parameters['universe_size']) for _ in range(set_size)})
    else:
        match_probabilities = np.array(create_prob_distro(prob_distro,total_volunteers))

    all_features = np.arange(all_population_size)
    simulator = RMABSimulator(
        all_population_size, all_features, all_transitions,
        n_arms, volunteers_per_arm, episode_len, n_epochs, n_episodes, budget, discount,
        number_states=n_states,
        reward_style='custom',
        match_probability_list=match_probabilities,
        TIME_PER_RUN=TIME_PER_RUN,
    )
    simulator.reward_type = reward_type
    simulator.reward_parameters = reward_parameters

    # Hard-coded match probabilities for the cohort arms.
    # TODO: Remove this (original note; presumably refers to this override)
    simulator.match_probability_list[cohort] = [0.24]*(len(cohort)-1) + [0.62]
    return simulator

In [53]:
def run_multi_seed(seed_list,policy,is_mcts=False,per_epoch_function=None,train_iterations=0,test_iterations=0,test_length=20):
    """Run `policy` once per seed on a freshly built environment and collect metrics.

    Arguments:
        seed_list: Iterable of seeds; one environment is created and evaluated per seed.
        policy: Policy callable handed to `run_heterogenous_policy`.
        is_mcts: When True, the MCTS settings (train/test iterations, learning
            rates) are attached to the simulator and the run also returns a memory object.
        per_epoch_function: Forwarded unchanged to `run_heterogenous_policy`.
        train_iterations: Stored as `simulator.mcts_train_iterations` (MCTS only).
        test_iterations: Stored as `simulator.mcts_test_iterations` (MCTS only).
        test_length: Forwarded as `test_T`.

    Returns: Tuple `(scores, memories, simulator)`.
        scores: dict with keys 'reward', 'time', 'match', 'active_rate', each a
            list with one entry per seed.
        memories: one memory object per seed for MCTS runs, otherwise empty.
        simulator: the simulator built for the last seed, or None when
            `seed_list` is empty (previously this case raised UnboundLocalError).
    """
    memories = []
    scores = {
        'reward': [],
        'time': [], 
        'match': [], 
        'active_rate': [],
    }
    # Defined up front so an empty seed_list still returns a well-formed tuple.
    simulator = None

    for seed in seed_list:
        simulator = create_environment(seed)
        # Every episode starts with all cohort arms in state 1 except the last, which starts in state 0.
        simulator.first_init_states = np.array([[[1 for k in range(len(cohort)-1)]+[0] for i in range(n_episodes)]])     
        # simulator.cohort_selection = np.array([[[87,53,47] for i in range(n_episodes)]])

        if is_mcts:
            simulator.mcts_train_iterations = train_iterations
            simulator.mcts_test_iterations = test_iterations
            simulator.policy_lr = policy_lr
            simulator.value_lr = value_lr
            match, active_rate, memory = run_heterogenous_policy(simulator, n_episodes, n_epochs, discount,policy,seed,lamb=lamb,should_train=True,test_T=test_length,get_memory=True,per_epoch_function=per_epoch_function)
            memories.append(memory)
        else:
            match, active_rate = run_heterogenous_policy(simulator, n_episodes, n_epochs, discount,policy,seed,lamb=lamb,should_train=True,test_T=test_length,per_epoch_function=per_epoch_function)

        scores['reward'].append(get_discounted_reward(match,active_rate,discount,lamb))
        scores['time'].append(simulator.time_taken)
        scores['match'].append(np.mean(match))
        scores['active_rate'].append(np.mean(active_rate))

    return scores, memories, simulator

In [54]:
# Results container; the configuration is stored under 'parameters' so the
# JSON written at the end of the notebook records the settings that produced it.
results = {
    'parameters': {
        'seed': seed,
        'n_arms': n_arms,
        'volunteers_per_arm': volunteers_per_arm,
        'budget': budget,
        'discount': discount,
        'alpha': alpha,
        'n_episodes': n_episodes,
        'episode_len': episode_len,
        'n_epochs': n_epochs,
        'lamb': lamb,
        'time_per_run': TIME_PER_RUN,
        'prob_distro': prob_distro,
        'policy_lr': policy_lr,
        'value_lr': value_lr,
        'reward_type': reward_type,
        'universe_size': reward_parameters['universe_size'],
        'arm_set_low': reward_parameters['arm_set_low'],
        'arm_set_high': reward_parameters['arm_set_high'],
    },
}

## Upper and Lower Bounds

In [158]:
# Per-arm values for the three-arm example; these are the same numbers that
# create_environment hard-codes as the cohort's match probabilities.
max_values = [0.24,0.24,0.62]
# `budget` is deliberately NOT reassigned here. A hard-coded `budget = 2` would
# silently override the configured / command-line budget for every later
# simulation while results['parameters'] still reported the configured value.
num_start = 8

In [170]:
# Per-item values fed to our_reward by the sampling cells below; index 0 is the
# item our_reward treats as the "star".
# NOTE(review): the saved output of the In [163] cell (values 7 and 10) appears to
# come from an earlier version of this list — re-run before trusting it.
prob_values = [1000,2/3+.01,1]

In [171]:
def our_reward(state,combo,corresponding_probabilities,s,d):
    """Reward of a selection under a "star item" rule.

    The element-wise product `state * combo * corresponding_probabilities`
    gives each item's contribution. If the first item contributes a positive
    amount, the reward is the first plus the second contribution; otherwise it
    is the second plus the third.

    Arguments:
        state: Per-item state vector.
        combo: Per-item selection vector (0/1).
        corresponding_probabilities: Per-item values.
        s: Unused.
        d: Unused.

    Returns: The scalar reward described above.
    """
    contributions = state*combo*corresponding_probabilities
    first = contributions[0]
    second = sum(contributions[1:2])
    third = sum(contributions[2:3])
    return first + second if first > 0 else second + third

In [172]:
# All-ones vector with one entry per value in prob_values.
# NOTE(review): the next cell rebinds `state` to an all-ones list, so this cell has no lasting effect.
state = np.ones(len(prob_values))

In [173]:
# Monte Carlo estimate of each item's average marginal contribution under our_reward:
# sample random selections with fewer than `budget` items, then average the gain
# from adding item i over the samples that do not already contain it.
state = [1 for i in range(len(prob_values))]
shapley_indices = [0 for i in range(len(state))]
state_str = "".join([str(i) for i in state])

# Indices of items whose state is non-zero (all of them here) and their values.
state_1 = [i for i in range(len(state)) if state[i] != 0]
match_probabilities = np.array(prob_values)
corresponding_probabilities = match_probabilities[state_1]
num_random_combos = 200*len(state_1)
# num_random_combos = min(num_random_combos,100000)

# One row per sampled selection, one 0/1 column per item.
combinations = np.zeros((num_random_combos, len(corresponding_probabilities)), dtype=int)

# Fix for when the number of combinations is small (with respect to the budget)
# In that scenario, we can essentially just manually compute
# Weight each selection size k in [0, budget) by the number of size-k subsets.
budget_probs = np.array([scipy.special.comb(len(corresponding_probabilities),k) for k in range(0,budget)])
budget_probs /= np.sum(budget_probs)



# Draw a size k from budget_probs, then k distinct item indices to switch on.
# NOTE(review): `random` is not seeded in this cell, so results depend on earlier RNG use.
for i in range(num_random_combos):
    k = random.choices(list(range(len(budget_probs))), weights=budget_probs,k=1)[0]
    ones_indices = random.sample(list(range(len(corresponding_probabilities))),k)
    combinations[i, ones_indices] = 1

state = [int(i) for i in state]

# Reward of every sampled selection as drawn.
scores = [our_reward(state,combo,corresponding_probabilities,"max",{}) for combo in combinations]
scores = np.array(scores)
print("Combinations {} Score {}".format(combinations[0],scores[0]))

# For item i: mean of reward(selection with i switched on) - reward(selection),
# taken over the sampled selections in which i is off.
for i in range(len(state_1)):
    shapley_indices[state_1[i]] = np.mean([our_reward(state,np.array([1 if idx == i else val for idx, val in enumerate(combo)]),corresponding_probabilities,"max",{}) - scores[j] for j,combo in enumerate(combinations) if combo[i] == 0])
# Last expression of the cell: display the estimates.
list(shapley_indices)

Combinations [0 1 0] Score 0.6766666666666666


[999.6507936507936, 0.6766666666666503, 0.6434977578475336]

In [191]:
# Exhaustive check over every ordered pair of subsets (A, B) of the three items that
# our_reward satisfies f(A) + f(B) >= f(A and B) + f(A or B); fails with AssertionError otherwise.
state = np.array([1,1,1])
for i in range(2**3):
    # 3-bit membership vector of subset i, most significant bit first.
    i_binary = np.array([int(bit) for bit in format(i, '03b')])
    i_score = our_reward(state,i_binary,corresponding_probabilities,"max",{})

    for j in range(2**3):
        j_binary = np.array([int(bit) for bit in format(j, '03b')])
        j_score = our_reward(state,j_binary,corresponding_probabilities,"max",{})

        # Element-wise AND / OR of the two membership vectors.
        intersection = i_binary*j_binary
        union = np.minimum(i_binary+j_binary,1)

        intersection_score = our_reward(state,intersection,corresponding_probabilities,"max",{})
        union_score = our_reward(state,union,corresponding_probabilities,"max",{})

        assert i_score + j_score >= intersection_score + union_score

In [163]:
# Debug view of the individual marginal-contribution terms for item 0: for every
# sampled selection that excludes it, print the reward with and without the item.
i = 0
a = []
for j in range(len(combinations)):
    combo = combinations[j]
    if combo[i] != 0:
        continue
    with_item = np.array([1 if idx == i else val for idx, val in enumerate(combo)])
    ours = our_reward(state,with_item,corresponding_probabilities,"max",{})
    theirs = scores[j]
    print("Ours {} Theirs {} Combo {}".format(ours,theirs,combo))
    a.append(ours-theirs)
np.mean(a)

Ours 1000 Theirs 0 Combo [0 0 0]
Ours 1000 Theirs 0 Combo [0 0 0]
Ours 1007 Theirs 7 Combo [0 1 0]
Ours 1000 Theirs 0 Combo [0 0 0]
Ours 1000 Theirs 10 Combo [0 0 1]
Ours 1007 Theirs 7 Combo [0 1 0]
Ours 1000 Theirs 0 Combo [0 0 0]
Ours 1007 Theirs 7 Combo [0 1 0]
Ours 1000 Theirs 10 Combo [0 0 1]
Ours 1000 Theirs 0 Combo [0 0 0]
Ours 1000 Theirs 0 Combo [0 0 0]
Ours 1000 Theirs 0 Combo [0 0 0]
Ours 1007 Theirs 7 Combo [0 1 0]
Ours 1000 Theirs 10 Combo [0 0 1]
Ours 1000 Theirs 10 Combo [0 0 1]
Ours 1000 Theirs 0 Combo [0 0 0]
Ours 1007 Theirs 7 Combo [0 1 0]
Ours 1007 Theirs 7 Combo [0 1 0]
Ours 1000 Theirs 0 Combo [0 0 0]
Ours 1007 Theirs 7 Combo [0 1 0]
Ours 1007 Theirs 7 Combo [0 1 0]
Ours 1000 Theirs 10 Combo [0 0 1]
Ours 1000 Theirs 0 Combo [0 0 0]
Ours 1000 Theirs 10 Combo [0 0 1]
Ours 1007 Theirs 7 Combo [0 1 0]
Ours 1000 Theirs 0 Combo [0 0 0]
Ours 1000 Theirs 0 Combo [0 0 0]
Ours 1007 Theirs 7 Combo [0 1 0]
Ours 1007 Theirs 7 Combo [0 1 0]
Ours 1000 Theirs 10 Combo [0 0 1]
Our

996.7483296213809

In [37]:
# Closed-form guess for each item: ((i+1) * value_i - sum of the values before it) / number of items.
guess = []
for i, value in enumerate(max_values):
    earlier_total = sum(max_values[:i])
    guess.append(((i+1)*value - earlier_total)/len(max_values))
guess

[0.08, 0.08, 0.45999999999999996]

In [38]:
# NOTE(review): hand-typed ratio; appears to be the smallest guess (0.08) over the
# largest guess (0.46) times the discount 0.9 — compute from `guess` and `discount` instead.
0.08/(0.46*0.9)

0.1932367149758454

In [39]:
# ratio = (0,(0,0))
# for i in range(len(max_values)):
#     for j in range(len(max_values)):
#         if i==j:
#             continue 
#         top = max(max_values[i],max_values[j])
#         bottom = guess[i]+guess[j] 

#         if top/bottom > ratio[0]:
#             ratio = (top/bottom,(i,j))
# ratio

In [40]:
# def f(s):
#     if len(s) == 0:
#         return 0
#     return max([max_values[i] for i in s])

# K = budget 
# discount = 0.9
# N = len(max_values)
# min_ratio = 1
# best_choice = []

# for S in range(2**N):
#     for S_prime in range(2**N):
#         S_binary = [int(j) for j in bin(S)[2:].zfill(N)]
#         S_index = [i for i in range(len(S_binary)) if S_binary[i] == 1]

#         S_prime_binary = [int(j) for j in bin(S_prime)[2:].zfill(N)]
#         S_prime_index = [i for i in range(len(S_prime_binary)) if S_prime_binary[i] == 1]

#         if not (len(S_index) == len(S_prime_index) == K):
#             continue 

#         min_S_value = min(shapley_indices[i] for i in S_index)
#         S_prime_intersection = [i for i in range(len(S_prime_binary)) if S_binary[i]*S_prime_binary[i] == 1]
#         S_prime_exclusive = [i for i in range(len(S_prime_binary)) if S_binary[i] == 0 and S_prime_binary[i] == 1]
#         b = [min(min_S_value/(discount*shapley_indices[i]),1) for i in S_prime_exclusive]     


#         top = (1/(1-discount))*f(S_index)
#         bottom = f(S_prime_intersection)

#         if S_index == [0,1] and S_prime_index == [1,3]:
#             print("Bottom is {}".format(bottom))

#         if len(S_prime_exclusive) > 0:
#             for i in range(2**len(S_prime_exclusive)):
#                 binary_val = [int(j) for j in bin(i)[2:].zfill(len(S_prime_exclusive))]
#                 prob = np.prod([b[j] if binary_val[j] == 1 else (1-b[j]) for j in range(len(binary_val))])
#                 bottom += (1/(1-discount)-1)*prob*f(S_prime_intersection + [S_prime_exclusive[j] for j in range(len(binary_val)) if binary_val[j] == 1])
#                 if S_index == [0,1] and S_prime_index == [1,3]:
#                     print("Prob is {}".format((1/(1-discount)-1)*prob*f(S_prime_intersection + [S_prime_exclusive[j] for j in range(len(binary_val)) if binary_val[j] == 1])))

#         else:
#             bottom = (1/(1-discount))*f(S_prime_intersection)

#         approx_ratio = top/bottom 
#         if min_ratio > approx_ratio:
#             best_choice = (S_index,S_prime_index)
#             print("Top {} Bottom {}".format(top,bottom))
#         min_ratio = min(approx_ratio,min_ratio)
# print("Min Ratio {} with {}".format(min_ratio,best_choice))
# lower_bound = max(max_values[:budget])/sum(shapley_indices[:budget])
# print("Lower bound {}".format(lower_bound))

In [41]:
# best = (1,(0,0))
# for i in range(len(max_values)):
#     for j in range(len(max_values)):
#         if i == j:
#             continue 
#         top = max(max_values[i],max_values[j])
#         bottom = shapley_indices[i]+shapley_indices[j]
#         if top/bottom < best[0]:
#             best = (top/bottom,(i,j))
# best

In [42]:
# shapley_indices

## Index Policies

In [43]:
# The policy cells below are each evaluated on this single configured seed.
seed_list=[seed]

In [44]:
# all_diffs = []
# for seed in range(40,50):
#     seed_list = [seed]
#     policy = mcts_shapley_policy
#     name = "mcts_shapley"

#     rewards, memory, simulator = run_multi_seed(seed_list,policy,is_mcts=True,test_iterations=400,test_length=n_episodes*episode_len)
#     results['{}_reward'.format(name)] = rewards['reward']
#     results['{}_match'.format(name)] =  rewards['match'] 
#     results['{}_active'.format(name)] = rewards['active_rate']
#     results['{}_time'.format(name)] =  rewards['time']
#     rew_whittle = np.mean(rewards['reward'])
#     policy = q_iteration_policy
#     per_epoch_function = q_iteration_custom_epoch()
#     name = "optimal"

#     print("Running optimal")

#     rewards, memory, simulator = run_multi_seed(seed_list,policy,per_epoch_function=per_epoch_function,test_length=n_episodes*episode_len)
#     results['{}_reward'.format(name)] = rewards['reward']
#     results['{}_match'.format(name)] =  rewards['match'] 
#     results['{}_active'.format(name)] = rewards['active_rate']
#     results['{}_time'.format(name)] =  rewards['time']
#     rew_q = np.mean(rewards['reward'])

#     diff = rew_whittle/rew_q
#     all_diffs.append(diff)

In [45]:
# pair_amounts = np.zeros((4,4))
# for i in range(4):
#     for j in range(4):
#         pair_amounts[i,j] = len(simulator.match_probability_list[simulator.agent_idx][i].union(simulator.match_probability_list[simulator.agent_idx][j]))
# a = [len(i) for i in simulator.match_probability_list[simulator.agent_idx]]
# a = [(i,a[i]) for i in range(len(a))]
# a = sorted(a,key = lambda k: k[1],reverse=True)
# max_pair = 1
# while max_pair < len(a)  and a[1][1] == a[max_pair][1]:
#     max_pair += 1
# max_pair -= 1

# found_max_pair = 0
# for i in range(max_pair+1):
#     for j in range(max_pair+1):
#         found_max_pair = max(pair_amounts[a[i][0],a[j][0]],found_max_pair)
# found_max_pair/np.max(pair_amounts)

In [56]:
# Evaluate whittle_policy; metrics are stored under the "linear_whittle" key prefix.
policy = whittle_policy
name = "linear_whittle"

rewards, memory, simulator = run_multi_seed(seed_list,policy,test_length=n_episodes*episode_len)
results.update({
    '{}_reward'.format(name): rewards['reward'],
    '{}_match'.format(name): rewards['match'],
    '{}_active'.format(name): rewards['active_rate'],
    '{}_time'.format(name): rewards['time'],
})
print(np.mean(rewards['reward']))

acting should always be good! (0, 1) 0.108 < 0.183
good start state should always be good! 0.380 < 0.508
good start state should always be good! 0.506 < 0.760
cohort [87 53 47]
State [1 1 0] Action [1 1 0] Reward 0.24
WI [0.24000043 0.24000043 0.09486033]
State [1 1 0] Action [1 1 0] Reward 0.24
WI [0.24000043 0.24000043 0.09486033]
State [1 1 0] Action [1 1 0] Reward 0.24
WI [0.24000043 0.24000043 0.09486033]
State [1 1 0] Action [1 1 0] Reward 0.24
WI [0.24000043 0.24000043 0.09486033]
State [1 1 0] Action [1 1 0] Reward 0.24
WI [0.24000043 0.24000043 0.09486033]
State [1 1 0] Action [1 1 0] Reward 0.24
WI [0.24000043 0.24000043 0.09486033]
State [1 1 0] Action [1 1 0] Reward 0.24
WI [0.24000043 0.24000043 0.09486033]
State [1 1 0] Action [1 1 0] Reward 0.24
WI [0.24000043 0.24000043 0.09486033]
State [1 1 0] Action [1 1 0] Reward 0.24
WI [0.24000043 0.24000043 0.09486033]
State [1 1 0] Action [1 1 0] Reward 0.24
WI [0.24000043 0.24000043 0.09486033]
State [1 1 0] Action [1 1 0] Rewa

In [57]:
# Evaluate q_iteration_policy (stored as "optimal"), but only on instances with
# at most four volunteers in total.
if n_arms * volunteers_per_arm <= 4:
    policy = q_iteration_policy
    per_epoch_function = q_iteration_custom_epoch()
    name = "optimal"

    rewards, memory, simulator = run_multi_seed(seed_list,policy,per_epoch_function=per_epoch_function,test_length=n_episodes*episode_len)
    results.update({
        '{}_reward'.format(name): rewards['reward'],
        '{}_match'.format(name): rewards['match'],
        '{}_active'.format(name): rewards['active_rate'],
        '{}_time'.format(name): rewards['time'],
    })
    print(np.mean(rewards['reward']))

acting should always be good! (0, 1) 0.108 < 0.183
good start state should always be good! 0.380 < 0.508
good start state should always be good! 0.506 < 0.760
cohort [87 53 47]
State [1 1 0] Action [0 1 1] Reward 0.24
State [1 1 0] Action [0 1 1] Reward 0.24
State [1 1 0] Action [0 1 1] Reward 0.24
State [1 1 1] Action [0 0 1] Reward 0.62
State [1 1 0] Action [0 1 1] Reward 0.24
State [1 1 0] Action [0 1 1] Reward 0.24
State [1 1 1] Action [0 0 1] Reward 0.62
State [1 1 0] Action [0 1 1] Reward 0.24
State [1 1 1] Action [0 0 1] Reward 0.62
State [1 1 1] Action [0 0 1] Reward 0.62
State [1 1 0] Action [0 1 1] Reward 0.24
State [1 1 0] Action [0 1 1] Reward 0.24
State [1 1 0] Action [0 1 1] Reward 0.24
State [1 1 1] Action [0 0 1] Reward 0.62
State [1 1 0] Action [0 1 1] Reward 0.24
State [1 1 0] Action [0 1 1] Reward 0.24
State [1 1 0] Action [0 1 1] Reward 0.24
State [1 1 0] Action [0 1 1] Reward 0.24
State [1 1 0] Action [0 1 1] Reward 0.24
State [1 1 0] Action [0 1 1] Reward 0.24
Sta

In [68]:
# NOTE(review): hand-typed ratio of two numbers whose origin is not recorded in this
# notebook — presumably rewards copied from two runs; compute it from `results` instead.
4.85/5.07

0.9566074950690334

In [86]:
# Display the `agent_idx` attribute of the simulator returned by the most recent run_multi_seed call.
simulator.agent_idx

[87, 53, 47]

In [58]:
# Evaluate shapley_whittle_custom_policy; metrics are stored under the "shapley_whittle_custom" key prefix.
policy = shapley_whittle_custom_policy
name = "shapley_whittle_custom"

rewards, memory, simulator = run_multi_seed(seed_list,policy,test_length=n_episodes*episode_len)
results.update({
    '{}_reward'.format(name): rewards['reward'],
    '{}_match'.format(name): rewards['match'],
    '{}_active'.format(name): rewards['active_rate'],
    '{}_time'.format(name): rewards['time'],
})
print(np.mean(rewards['reward']))

acting should always be good! (0, 1) 0.108 < 0.183
good start state should always be good! 0.380 < 0.508
good start state should always be good! 0.506 < 0.760
cohort [87 53 47]
[0.07304348 0.08195122 0.45148936]
State [1 1 0] Action [1 1 0] Reward 0.24
State [1 1 0] Action [1 1 0] Reward 0.24
State [1 1 0] Action [1 1 0] Reward 0.24
State [1 1 0] Action [1 1 0] Reward 0.24
State [1 1 0] Action [1 1 0] Reward 0.24
State [1 1 0] Action [1 1 0] Reward 0.24
State [1 1 0] Action [1 1 0] Reward 0.24
State [1 1 0] Action [1 1 0] Reward 0.24
State [1 1 0] Action [1 1 0] Reward 0.24
State [1 1 0] Action [1 1 0] Reward 0.24
State [1 1 0] Action [1 1 0] Reward 0.24
State [1 1 0] Action [1 1 0] Reward 0.24
State [1 1 0] Action [1 1 0] Reward 0.24
State [1 1 0] Action [1 1 0] Reward 0.24
State [1 1 0] Action [1 1 0] Reward 0.24
State [1 1 0] Action [1 1 0] Reward 0.24
State [1 1 0] Action [1 1 0] Reward 0.24
State [1 1 0] Action [1 1 0] Reward 0.24
State [1 1 0] Action [1 1 0] Reward 0.24
State [1 

In [60]:
# Display the hand-set values next to the sampled per-item contribution estimates.
max_values, shapley_indices

([0.24, 0.24, 0.62],
 [0.07778251599147122, 0.08386206896551725, 0.46142857142857135])

In [72]:
# NOTE(review): hand-typed ratio of two numbers whose origin is not recorded in this
# notebook — replace with a computation on named variables.
3.2/1.49

2.1476510067114094

In [152]:
# Evaluate mcts_shapley_policy with 400 MCTS test iterations; metrics are stored under "mcts_shapley".
# NOTE(review): train_iterations is not passed, so run_multi_seed uses its default of 0
# rather than the configured train_iterations value — confirm this is intended.
policy = mcts_shapley_policy
name = "mcts_shapley"

rewards, memory, simulator = run_multi_seed(seed_list,policy,is_mcts=True,test_iterations=400,test_length=20)
results.update({
    '{}_reward'.format(name): rewards['reward'],
    '{}_match'.format(name): rewards['match'],
    '{}_active'.format(name): rewards['active_rate'],
    '{}_time'.format(name): rewards['time'],
})
np.mean(rewards['reward'])

cohort [79]


TypeError: get_attributions() got an unexpected keyword argument 'attribution_method'

In [47]:
# Evaluate mcts_shapley_attributions_policy with 400 MCTS test iterations; metrics are
# stored under "mcts_shapley_attribution".
# NOTE(review): train_iterations is not passed, so run_multi_seed uses its default of 0
# rather than the configured train_iterations value — confirm this is intended.
policy = mcts_shapley_attributions_policy
name = "mcts_shapley_attribution"

rewards, memory, simulator = run_multi_seed(seed_list,policy,is_mcts=True,test_iterations=400,test_length=20)
results.update({
    '{}_reward'.format(name): rewards['reward'],
    '{}_match'.format(name): rewards['match'],
    '{}_active'.format(name): rewards['active_rate'],
    '{}_time'.format(name): rewards['time'],
})
np.mean(rewards['reward'])

acting should always be good! 0.108 < 0.183
good start state should always be good! 0.380 < 0.508
good start state should always be good! 0.506 < 0.760
cohort [20]
Took 2.373187303543091 time for inference and 0.4436161518096924 time for training
acting should always be good! 0.108 < 0.183
good start state should always be good! 0.380 < 0.508
good start state should always be good! 0.506 < 0.760
cohort [42]
Took 2.027259588241577 time for inference and 0.22554779052734375 time for training
acting should always be good! 0.108 < 0.183
good start state should always be good! 0.380 < 0.508
good start state should always be good! 0.506 < 0.760
cohort [0]
Took 1.9015212059020996 time for inference and 0.425368070602417 time for training


28.966393687307782

## Write Data

In [None]:
# Path of the results file, derived from the output folder, this run's random name and
# the seed; `use_date` presumably adds a date component — confirm in rmab.utils.
save_path = get_save_path(out_folder,save_name,seed,use_date=save_with_date)

In [None]:
# presumably removes previously saved results in out_folder that match this run — confirm in rmab.utils
delete_duplicate_results(out_folder,"",results)

In [None]:
# Write the results as JSON. The context manager closes the file deterministically,
# so the data is flushed to disk even if the kernel keeps running afterwards
# (the bare open() used before never closed the handle).
with open('../../results/'+save_path,'w') as results_file:
    json.dump(results,results_file)