In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import random 
import matplotlib.pyplot as plt
import json 
import argparse 
import sys

In [3]:
from rmab.simulator import RMABSimulator, random_valid_transition, random_valid_transition_round_down, synthetic_transition_small_window
from rmab.uc_whittle import UCWhittleBuggy, UCWhittle, NStepMatch, UCWhittleOracle
from rmab.baselines import optimal_whittle, random_policy, optimal_q_iteration
from rmab.fr_dynamics import get_all_transitions
from rmab.utils import get_save_path, delete_duplicate_results


In [4]:
# True when the ipykernel module has been imported into this process, which is
# used below as the signal that the code is running inside a notebook kernel
# rather than as a command-line script.
is_jupyter = 'ipykernel' in sys.modules

In [5]:
# Experiment configuration.
# When run as a script, every hyper-parameter is read from the command line;
# inside a notebook kernel the same names are set inline so they can be edited
# and the cell re-run.
if not is_jupyter:
    parser = argparse.ArgumentParser()
    parser.add_argument('--n_arms', '-N', type=int, default=8,
                        help='num beneficiaries (arms)')
    parser.add_argument('--episode_len', '-H', type=int, default=20,
                        help='episode length')
    parser.add_argument('--n_episodes', '-T', type=int, default=30,
                        help='num episodes')
    parser.add_argument('--budget', '-B', type=int, default=3,
                        help='budget')
    parser.add_argument('--n_epochs', '-E', type=int, default=10,
                        help='number of epochs (num_repeats)')
    parser.add_argument('--discount', '-d', type=float, default=0.9,
                        help='discount factor')
    parser.add_argument('--alpha', '-a', type=float, default=3,
                        help='alpha: for conf radius')
    parser.add_argument('--seed', '-s', type=int, default=42,
                        help='random seed')
    parser.add_argument('--save_name', '-n', type=str, default='results',
                        help='save name')
    parser.add_argument('--match_prob', '-m', type=float, default=0.5,
                        help='match probability')
    # Flag only: the saved file name includes the date when this is passed.
    parser.add_argument('--use_date', action='store_true')

    args = parser.parse_args()

    seed, n_arms, budget = args.seed, args.n_arms, args.budget
    discount, alpha = args.discount, args.alpha
    n_episodes, episode_len, n_epochs = args.n_episodes, args.episode_len, args.n_epochs
    save_name, save_with_date = args.save_name, args.use_date
    match_prob = args.match_prob
else:
    # Notebook defaults (note: smaller n_arms than the script default of 8).
    seed, n_arms, budget = 42, 4, 3
    discount, alpha = 0.9, 3
    n_episodes, episode_len, n_epochs = 30, 20, 10
    save_name, save_with_date = 'results_all_agents', True
    match_prob = 0.5



In [6]:
# Every arm is a two-state, two-action process in this experiment.
n_states = n_actions = 2

In [7]:
# Size of the arm population from which the simulator's cohorts are drawn.
all_population_size = 100 # number of random arms to generate
# One set of transition probabilities per arm in the population
# (generated by rmab.fr_dynamics.get_all_transitions).
all_transitions = get_all_transitions(all_population_size)

In [8]:
# Sanity check on the transition array's dimensions; the recorded run shows
# (100, 2, 2, 2), i.e. one entry per arm followed by three axes of size 2.
# NOTE(review): presumably indexed [arm, state, action, next_state] — confirm
# against get_all_transitions.
all_transitions.shape

(100, 2, 2, 2)

In [9]:
# One integer "feature" per arm in the population: simply its index 0..N-1.
all_features = np.arange(all_population_size)

In [10]:
# Seed both RNGs before building the simulator: the constructor prints the
# cohorts it selects (see the "cohort [...]" output), so seeding first is what
# keeps that selection reproducible — presumably it samples them at random.
np.random.seed(seed)
random.seed(seed)
# Shared simulator on which every policy below is evaluated. It is built with
# the 'match' reward style, using match_prob as the match probability.
simulator = RMABSimulator(all_population_size, all_features, all_transitions,
            n_arms, episode_len, n_epochs, n_episodes, budget, number_states=n_states, reward_style='match',match_probability=match_prob)

acting should always be good! 0.000 < 0.044
acting should always be good! 0.000 < 0.162
acting should always be good! 0.108 < 0.183
good start state should always be good! 0.380 < 0.508
good start state should always be good! 0.506 < 0.760
cohort [83 53 70 45]
cohort [41 39 15 76]
cohort [79 72 62 94]
cohort [92 64 85 36]
cohort [ 7 38 78  2]
cohort [12 59 91 73]
cohort [48 64  7 37]
cohort [18 53 12 32]
cohort [69 64 53 61]
cohort [46 87 15 26]


In [11]:
# Re-seed both RNGs so this policy starts from the same random state as every
# other policy evaluated in this notebook.
np.random.seed(seed)
random.seed(seed)
# Baseline: random policy.
random_rewards = random_policy(simulator, n_episodes, n_epochs)
# Active count normalised by (number of recorded rewards * number of arms).
# NOTE(review): assumes the policy call resets simulator.total_active; if it
# does not, this count carries over from earlier runs — confirm.
random_active_rate = simulator.total_active/(random_rewards.size * n_arms)

first state [0 1 1 1]
instance 0, ep 1, state [1 1 1 1]
instance 0, ep 2, state [0 0 0 1]
instance 0, ep 3, state [1 1 0 1]
instance 0, ep 4, state [0 1 1 1]
instance 0, ep 5, state [0 0 0 1]
instance 0, ep 6, state [0 0 0 1]
instance 0, ep 7, state [0 1 1 0]
instance 0, ep 8, state [1 1 0 1]
instance 0, ep 9, state [1 1 1 1]
instance 0, ep 10, state [0 0 0 0]
instance 0, ep 11, state [0 1 0 0]
instance 0, ep 12, state [0 1 0 1]
instance 0, ep 13, state [0 0 0 0]
instance 0, ep 14, state [1 0 1 0]
instance 0, ep 15, state [0 0 0 1]
instance 0, ep 16, state [0 0 0 0]
instance 0, ep 17, state [1 1 0 1]
instance 0, ep 18, state [0 1 0 0]
instance 0, ep 19, state [0 1 1 1]
instance 0, ep 20, state [0 1 1 0]
instance 0, ep 21, state [0 0 1 0]
instance 0, ep 22, state [1 0 1 1]
instance 0, ep 23, state [0 1 1 1]
instance 0, ep 24, state [0 0 1 1]
instance 0, ep 25, state [0 0 1 0]
instance 0, ep 26, state [1 1 1 1]
instance 0, ep 27, state [0 1 1 0]
instance 0, ep 28, state [0 1 1 0]
instanc

In [12]:
# Re-seed both RNGs so this policy starts from the same random state as every
# other policy evaluated in this notebook.
np.random.seed(seed)
random.seed(seed)
# Whittle-index baseline called with the default reward function.
optimal_reward = optimal_whittle(simulator, n_episodes, n_epochs, discount)
# Active count normalised by (number of recorded rewards * number of arms).
# NOTE(review): assumes the policy call resets simulator.total_active — confirm.
optimal_active_rate = simulator.total_active/(optimal_reward.size*n_arms)

first state [0 1 1 1]
   state [0 1 1 1] state_WI [0.39 0.47 0.47 0.26] sorted [2 1 0]
   state [1 1 1 0] state_WI [0.26 0.47 0.47 0.71] sorted [3 2 1]
   state [0 1 1 1] state_WI [0.39 0.47 0.47 0.26] sorted [2 1 0]
   state [1 1 0 1] state_WI [0.26 0.47 0.58 0.26] sorted [2 1 0]
   state [0 1 0 0] state_WI [0.39 0.47 0.58 0.71] sorted [3 2 1]
   state [1 1 1 1] state_WI [0.26 0.47 0.47 0.26] sorted [2 1 0]
   state [1 1 1 1] state_WI [0.26 0.47 0.47 0.26] sorted [2 1 0]
   state [0 1 1 0] state_WI [0.39 0.47 0.47 0.71] sorted [3 2 1]
   state [0 0 1 1] state_WI [0.39 0.54 0.47 0.26] sorted [1 2 0]
   state [1 1 1 1] state_WI [0.26 0.47 0.47 0.26] sorted [2 1 0]
   state [1 1 1 1] state_WI [0.26 0.47 0.47 0.26] sorted [2 1 0]
   state [1 1 1 0] state_WI [0.26 0.47 0.47 0.71] sorted [3 2 1]
   state [1 0 1 1] state_WI [0.26 0.54 0.47 0.26] sorted [1 2 0]
   state [1 1 0 0] state_WI [0.26 0.47 0.58 0.71] sorted [3 2 1]
   state [1 1 0 1] state_WI [0.26 0.47 0.58 0.26] sorted [2 1 0]
   

In [13]:
# Re-seed both RNGs so this policy starts from the same random state as every
# other policy evaluated in this notebook.
np.random.seed(seed)
random.seed(seed)
# UCWhittle (UCB method) with norm_confidence disabled; reported later under
# the 'fixed_rewards' key.
rewards_without_norm = UCWhittle(simulator, n_episodes, n_epochs, discount, alpha=alpha, method='UCB',norm_confidence=False)
# Active count normalised by (number of recorded rewards * number of arms).
# NOTE(review): assumes the policy call resets simulator.total_active — confirm.
ucw_without_norm_active_rate = simulator.total_active/(rewards_without_norm.size*n_arms)

solving UCWhittle using method: UCB
first state [0 1 1 1]
---------------------------------------------------
0 0  | a  [0 1 1 1]  | s'  [0 1 1 1]  | r  0.875    | WI  [-0. -0. -0. -0.]
instance 0, ep 1, state [1 1 1 1]
instance 0, ep 2, state [0 0 0 1]
instance 0, ep 3, state [1 1 0 1]
instance 0, ep 4, state [0 1 1 1]
instance 0, ep 5, state [0 0 0 1]
---------------------------------------------------
0 100  | a  [0 1 1 1]  | s'  [0 1 1 1]  | r  0.5    | WI  [-0. -0. -0. -0.]
instance 0, ep 6, state [0 0 0 1]
instance 0, ep 7, state [0 1 1 0]
instance 0, ep 8, state [1 1 0 1]
instance 0, ep 9, state [1 1 1 1]
instance 0, ep 10, state [0 0 0 0]
---------------------------------------------------
0 200  | a  [0 1 1 1]  | s'  [0 1 1 1]  | r  0.0    | WI  [-0. -0. -0. -0.]
instance 0, ep 11, state [0 1 0 0]
instance 0, ep 12, state [0 1 0 1]
instance 0, ep 13, state [0 0 0 0]
instance 0, ep 14, state [1 0 1 0]
instance 0, ep 15, state [0 0 0 1]
------------------------------------------

In [14]:
# Re-seed both RNGs so this policy starts from the same random state as every
# other policy evaluated in this notebook.
np.random.seed(seed)
random.seed(seed)
# UCWhittle (UCB method) with norm_confidence enabled; reported later under
# the 'norm_rewards' key.
rewards_with_norm = UCWhittle(simulator, n_episodes, n_epochs, discount, alpha=alpha, method='UCB',norm_confidence=True)
# Active count normalised by (number of recorded rewards * number of arms).
# NOTE(review): assumes the policy call resets simulator.total_active — confirm.
ucw_with_norm_active_rate = simulator.total_active/(rewards_with_norm.size*n_arms)

solving UCWhittle using method: UCB
first state [0 1 1 1]
---------------------------------------------------
0 0  | a  [0 1 1 1]  | s'  [0 1 1 1]  | r  0.875    | WI  [-0. -0. -0. -0.]
instance 0, ep 1, state [1 1 1 1]
instance 0, ep 2, state [0 0 0 1]
instance 0, ep 3, state [1 1 0 1]
instance 0, ep 4, state [0 1 1 1]
instance 0, ep 5, state [0 0 0 1]
---------------------------------------------------
0 100  | a  [1 1 1 0]  | s'  [1 1 1 0]  | r  0.0    | WI  [0.495 0.65  0.595 0.106]
instance 0, ep 6, state [0 0 0 1]
instance 0, ep 7, state [0 1 1 0]
instance 0, ep 8, state [1 1 0 1]
instance 0, ep 9, state [1 1 1 1]
instance 0, ep 10, state [0 0 0 0]
---------------------------------------------------
0 200  | a  [0 1 1 1]  | s'  [0 1 0 1]  | r  0.0    | WI  [0.277 0.422 0.404 0.45 ]
instance 0, ep 11, state [0 1 0 0]
instance 0, ep 12, state [0 1 0 1]
instance 0, ep 13, state [0 0 0 0]
instance 0, ep 14, state [1 0 1 0]
instance 0, ep 15, state [0 0 0 1]
--------------------------

In [15]:
# Re-seed both RNGs so this policy starts from the same random state as every
# other policy evaluated in this notebook.
np.random.seed(seed)
random.seed(seed)
# Whittle-index baseline again, this time with reward_function='matching'.
whittle_approximate_rewards = optimal_whittle(simulator, n_episodes, n_epochs, discount,reward_function='matching')
# Active count normalised by (number of recorded rewards * number of arms).
# NOTE(review): assumes the policy call resets simulator.total_active — confirm.
whittle_approximate_active_rate = simulator.total_active/(whittle_approximate_rewards.size*n_arms)
# Displayed as the cell result: mean reward of this policy.
np.mean(whittle_approximate_rewards)

first state [0 1 1 1]
   state [0 1 1 1] state_WI [0.35 1.   1.   1.  ] sorted [3 2 1]
   state [1 1 1 0] state_WI [1.   1.   1.   0.52] sorted [2 1 0]
   state [0 1 1 0] state_WI [0.35 1.   1.   0.52] sorted [2 1 3]
   state [0 1 0 1] state_WI [0.35 1.   0.54 1.  ] sorted [3 1 2]
   state [0 1 0 1] state_WI [0.35 1.   0.54 1.  ] sorted [3 1 2]
   state [1 1 1 1] state_WI [1. 1. 1. 1.] sorted [3 2 1]
   state [1 1 1 1] state_WI [1. 1. 1. 1.] sorted [3 2 1]
   state [0 1 1 0] state_WI [0.35 1.   1.   0.52] sorted [2 1 3]
   state [0 0 1 1] state_WI [0.35 0.52 1.   1.  ] sorted [3 2 1]
   state [1 1 1 1] state_WI [1. 1. 1. 1.] sorted [3 2 1]
   state [1 1 1 1] state_WI [1. 1. 1. 1.] sorted [3 2 1]
   state [1 1 1 0] state_WI [1.   1.   1.   0.52] sorted [2 1 0]
   state [1 0 1 0] state_WI [1.   0.52 1.   0.52] sorted [2 0 3]
   state [1 0 0 0] state_WI [1.   0.52 0.54 0.52] sorted [0 2 3]
   state [1 0 0 1] state_WI [1.   0.52 0.54 1.  ] sorted [3 0 2]
   state [1 0 1 0] state_WI [1.   0

0.7437916666666666

In [16]:
# Re-seed both RNGs so this policy starts from the same random state as every
# other policy evaluated in this notebook.
np.random.seed(seed)
random.seed(seed)
# UCWhittleOracle variant (UCB method, delta=1e-4) with the 'matching' reward function.
ucw_perfect_rewards = UCWhittleOracle(simulator, n_episodes, n_epochs, discount, alpha=alpha, method='UCB',delta=1e-4,reward_function='matching')
# Active count normalised by (number of recorded rewards * number of arms).
# NOTE(review): assumes the policy call resets simulator.total_active — confirm.
ucw_perfect_active_rate = simulator.total_active/(ucw_perfect_rewards.size*n_arms)
# Displayed as the cell result: mean reward of this policy.
np.mean(ucw_perfect_rewards)


solving UCWhittle using method: UCB
first state [0 1 1 1]
---------------------------------------------------
0 0  | a  [0 1 1 1]  | s'  [0 1 0 0]  | r  0.875    | WI  [-10.   1.   1.   1.]
instance 0, ep 1, state [1 1 1 1]
instance 0, ep 2, state [0 0 0 1]
instance 0, ep 3, state [1 1 0 1]


instance 0, ep 4, state [0 1 1 1]
instance 0, ep 5, state [0 0 0 1]
---------------------------------------------------
0 100  | a  [0 1 1 1]  | s'  [0 1 1 1]  | r  0.5    | WI  [0.353 0.516 0.569 1.   ]
instance 0, ep 6, state [0 0 0 1]
instance 0, ep 7, state [0 1 1 0]
instance 0, ep 8, state [1 1 0 1]
instance 0, ep 9, state [1 1 1 1]
instance 0, ep 10, state [0 0 0 0]
---------------------------------------------------
0 200  | a  [0 1 1 1]  | s'  [0 1 1 1]  | r  0.0    | WI  [0.374 0.516 0.54  0.542]
instance 0, ep 11, state [0 1 0 0]
instance 0, ep 12, state [0 1 0 1]
instance 0, ep 13, state [0 0 0 0]
instance 0, ep 14, state [1 0 1 0]
instance 0, ep 15, state [0 0 0 1]
---------------------------------------------------
0 300  | a  [0 1 1 1]  | s'  [0 1 0 1]  | r  0.5    | WI  [0.353 0.516 0.477 1.   ]
instance 0, ep 16, state [0 0 0 0]
instance 0, ep 17, state [1 1 0 1]
instance 0, ep 18, state [0 1 0 0]
instance 0, ep 19, state [0 1 1 1]
instance 0, ep 20, state [0 1 1 0]
---

0.74075

In [17]:
# Re-seed both RNGs so this policy starts from the same random state as every
# other policy evaluated in this notebook.
np.random.seed(seed)
random.seed(seed)
# UCWhittle (UCB method, delta=1e-4) with the 'matching' reward function.
ucw_match_rewards = UCWhittle(simulator, n_episodes, n_epochs, discount, alpha=alpha, method='UCB',delta=1e-4,reward_function='matching')
# Active count normalised by (number of recorded rewards * number of arms).
# NOTE(review): assumes the policy call resets simulator.total_active — confirm.
ucw_match_active_rate = simulator.total_active/(ucw_match_rewards.size*n_arms)
# Displayed as the cell result: mean reward of this policy.
np.mean(ucw_match_rewards)


solving UCWhittle using method: UCB
first state [0 1 1 1]
---------------------------------------------------
0 0  | a  [0 1 1 1]  | s'  [0 1 1 1]  | r  0.875    | WI  [-0.  1.  1.  1.]
instance 0, ep 1, state [1 1 1 1]
instance 0, ep 2, state [0 0 0 1]
instance 0, ep 3, state [1 1 0 1]
instance 0, ep 4, state [0 1 1 1]
instance 0, ep 5, state [0 0 0 1]
---------------------------------------------------
0 100  | a  [1 1 0 1]  | s'  [0 1 0 1]  | r  0.5    | WI  [0.482 0.535 0.018 1.   ]
instance 0, ep 6, state [0 0 0 1]
instance 0, ep 7, state [0 1 1 0]
instance 0, ep 8, state [1 1 0 1]
instance 0, ep 9, state [1 1 1 1]
instance 0, ep 10, state [0 0 0 0]
---------------------------------------------------
0 200  | a  [1 1 0 1]  | s'  [0 0 0 1]  | r  0.0    | WI  [0.446 0.556 0.226 0.47 ]
instance 0, ep 11, state [0 1 0 0]
instance 0, ep 12, state [0 1 0 1]
instance 0, ep 13, state [0 0 0 0]
instance 0, ep 14, state [1 0 1 0]
instance 0, ep 15, state [0 0 0 1]
--------------------------

0.7401666666666666

In [18]:
# Re-seed both RNGs so this policy starts from the same random state as every
# other policy evaluated in this notebook.
np.random.seed(seed)
random.seed(seed)
# NStepMatch with n_step=0 (UCB method).
zero_step_rewards = NStepMatch(simulator, n_episodes, n_epochs, discount, alpha=alpha,n_step=0, method='UCB')
# Active count normalised by (number of recorded rewards * number of arms).
# NOTE(review): assumes the policy call resets simulator.total_active — confirm.
zero_step_active_rate = simulator.total_active/(zero_step_rewards.size*n_arms)

solving UCWhittle using method: UCB
first state [0 1 1 1]
---------------------------------------------------
0 0  | a  [0 1 1 1]  | s'  [0 1 1 1]  | r  0.875
instance 0, ep 1, state [1 1 1 1]
instance 0, ep 2, state [0 0 0 1]
instance 0, ep 3, state [1 1 0 1]
instance 0, ep 4, state [0 1 1 1]
instance 0, ep 5, state [0 0 0 1]
---------------------------------------------------
0 100  | a  [0 1 1 1]  | s'  [0 1 0 1]  | r  0.5
instance 0, ep 6, state [0 0 0 1]
instance 0, ep 7, state [0 1 1 0]
instance 0, ep 8, state [1 1 0 1]
instance 0, ep 9, state [1 1 1 1]
instance 0, ep 10, state [0 0 0 0]
---------------------------------------------------
0 200  | a  [1 1 1 0]  | s'  [1 1 0 1]  | r  0.0
instance 0, ep 11, state [0 1 0 0]
instance 0, ep 12, state [0 1 0 1]
instance 0, ep 13, state [0 0 0 0]
instance 0, ep 14, state [1 0 1 0]
instance 0, ep 15, state [0 0 0 1]
---------------------------------------------------
0 300  | a  [1 1 0 1]  | s'  [1 1 0 1]  | r  0.5
instance 0, ep 16, sta

In [19]:
# Re-seed both RNGs so this policy starts from the same random state as every
# other policy evaluated in this notebook.
np.random.seed(seed)
random.seed(seed)
# NStepMatch with n_step=1 (UCB method).
one_step_rewards = NStepMatch(simulator, n_episodes, n_epochs, discount, alpha=alpha,n_step=1, method='UCB')
# Active count normalised by (number of recorded rewards * number of arms).
# NOTE(review): assumes the policy call resets simulator.total_active — confirm.
one_step_active_rate = simulator.total_active/(one_step_rewards.size*n_arms)

solving UCWhittle using method: UCB
first state [0 1 1 1]
---------------------------------------------------
0 0  | a  [1 1 1 0]  | s'  [0 1 1 1]  | r  0.75
instance 0, ep 1, state [1 1 1 1]
instance 0, ep 2, state [0 0 0 1]
instance 0, ep 3, state [1 1 0 1]
instance 0, ep 4, state [0 1 1 1]
instance 0, ep 5, state [0 0 0 1]
---------------------------------------------------
0 100  | a  [1 1 0 1]  | s'  [0 1 0 1]  | r  0.5
instance 0, ep 6, state [0 0 0 1]
instance 0, ep 7, state [0 1 1 0]
instance 0, ep 8, state [1 1 0 1]
instance 0, ep 9, state [1 1 1 1]
instance 0, ep 10, state [0 0 0 0]
---------------------------------------------------
0 200  | a  [1 1 0 1]  | s'  [1 0 0 1]  | r  0.0
instance 0, ep 11, state [0 1 0 0]
instance 0, ep 12, state [0 1 0 1]
instance 0, ep 13, state [0 0 0 0]
instance 0, ep 14, state [1 0 1 0]
instance 0, ep 15, state [0 0 0 1]
---------------------------------------------------
0 300  | a  [1 1 0 1]  | s'  [1 0 0 1]  | r  0.5
instance 0, ep 16, stat

In [20]:
# Re-seed both RNGs so this policy starts from the same random state as every
# other policy evaluated in this notebook.
np.random.seed(seed)
random.seed(seed)
# NStepMatch with n_step=-1 (UCB method).
# NOTE(review): the variable name suggests -1 means an unbounded look-ahead — confirm in rmab.uc_whittle.
infinite_step_rewards = NStepMatch(simulator, n_episodes, n_epochs, discount, alpha=alpha,n_step=-1, method='UCB')
# Active count normalised by (number of recorded rewards * number of arms).
# NOTE(review): assumes the policy call resets simulator.total_active — confirm.
infinite_step_active_rate = simulator.total_active/(infinite_step_rewards.size*n_arms)

solving UCWhittle using method: UCB
first state [0 1 1 1]
---------------------------------------------------
0 0  | a  [1 1 1 0]  | s'  [0 1 1 1]  | r  0.75
instance 0, ep 1, state [1 1 1 1]
instance 0, ep 2, state [0 0 0 1]
instance 0, ep 3, state [1 1 0 1]
instance 0, ep 4, state [0 1 1 1]
instance 0, ep 5, state [0 0 0 1]
---------------------------------------------------
0 100  | a  [1 1 0 1]  | s'  [0 1 0 1]  | r  0.5
instance 0, ep 6, state [0 0 0 1]
instance 0, ep 7, state [0 1 1 0]
instance 0, ep 8, state [1 1 0 1]
instance 0, ep 9, state [1 1 1 1]
instance 0, ep 10, state [0 0 0 0]
---------------------------------------------------
0 200  | a  [1 1 0 1]  | s'  [1 0 0 1]  | r  0.0
instance 0, ep 11, state [0 1 0 0]
instance 0, ep 12, state [0 1 0 1]
instance 0, ep 13, state [0 0 0 0]
instance 0, ep 14, state [1 0 1 0]
instance 0, ep 15, state [0 0 0 1]
---------------------------------------------------
0 300  | a  [1 1 0 1]  | s'  [1 0 0 1]  | r  0.5
instance 0, ep 16, stat

In [21]:
# Mean reward of each policy, keyed by the label used in the saved results.
# 'fixed_rewards' / 'norm_rewards' are the UCWhittle runs without / with
# norm_confidence.
mean_rewards = {
    label: np.mean(rewards)
    for label, rewards in (
        ('random_rewards', random_rewards),
        ('optimal_rewards', optimal_reward),
        ('fixed_rewards', rewards_without_norm),
        ('norm_rewards', rewards_with_norm),
        ('zero_step_rewards', zero_step_rewards),
        ('one_step_rewards', one_step_rewards),
        ('infinite_step_rewards', infinite_step_rewards),
        ('ucw_match_rewards', ucw_match_rewards),
        ('ucw_perfect_rewards', ucw_perfect_rewards),
        ('whittle_approximate_rewards', whittle_approximate_rewards),
    )
}

In [22]:
# Active rate of each policy, stored under the same labels as the reward
# summaries so the three dictionaries line up key-for-key.
active_rates = {
    label: np.mean(rate)
    for label, rate in (
        ('random_rewards', random_active_rate),
        ('optimal_rewards', optimal_active_rate),
        ('fixed_rewards', ucw_without_norm_active_rate),
        ('norm_rewards', ucw_with_norm_active_rate),
        ('zero_step_rewards', zero_step_active_rate),
        ('one_step_rewards', one_step_active_rate),
        ('infinite_step_rewards', infinite_step_active_rate),
        ('ucw_match_rewards', ucw_match_active_rate),
        ('ucw_perfect_rewards', ucw_perfect_active_rate),
        ('whittle_approximate_rewards', whittle_approximate_active_rate),
    )
}

In [23]:
# Standard deviation of the rewards of each policy, keyed by the same labels
# as the mean-reward summary.
std_rewards = {
    label: np.std(rewards)
    for label, rewards in (
        ('random_rewards', random_rewards),
        ('optimal_rewards', optimal_reward),
        ('fixed_rewards', rewards_without_norm),
        ('norm_rewards', rewards_with_norm),
        ('zero_step_rewards', zero_step_rewards),
        ('one_step_rewards', one_step_rewards),
        ('infinite_step_rewards', infinite_step_rewards),
        ('ucw_match_rewards', ucw_match_rewards),
        ('ucw_perfect_rewards', ucw_perfect_rewards),
        ('whittle_approximate_rewards', whittle_approximate_rewards),
    )
}

In [24]:
# The Q-iteration baseline is only run for small instances (at most 6 arms).
# NOTE(review): presumably because its cost grows quickly with the number of
# arms — confirm in rmab.baselines.optimal_q_iteration.
if n_arms <= 6:
    # Same seeding as every other policy so the runs are comparable.
    np.random.seed(seed)
    random.seed(seed)
    optimal_match_rewards = optimal_q_iteration(simulator, n_episodes, n_epochs, discount,reward_function='matching')
    # NOTE(review): assumes the policy call resets simulator.total_active — confirm.
    optimal_match_active_rate = simulator.total_active/(optimal_match_rewards.size*n_arms)

    # Add this policy to the three summaries; the key is absent when n_arms > 6.
    mean_rewards['optimal_match_rewards'] = np.mean(optimal_match_rewards)
    active_rates['optimal_match_rewards'] = np.mean(optimal_match_active_rate)
    std_rewards['optimal_match_rewards'] = np.std(optimal_match_rewards)

first state [0 1 1 1]
instance 0, ep 1, state [1 1 1 1]
instance 0, ep 2, state [0 0 0 1]
instance 0, ep 3, state [1 1 0 1]
instance 0, ep 4, state [0 1 1 1]
instance 0, ep 5, state [0 0 0 1]
instance 0, ep 6, state [0 0 0 1]
instance 0, ep 7, state [0 1 1 0]
instance 0, ep 8, state [1 1 0 1]
instance 0, ep 9, state [1 1 1 1]
instance 0, ep 10, state [0 0 0 0]
instance 0, ep 11, state [0 1 0 0]
instance 0, ep 12, state [0 1 0 1]
instance 0, ep 13, state [0 0 0 0]
instance 0, ep 14, state [1 0 1 0]
instance 0, ep 15, state [0 0 0 1]
instance 0, ep 16, state [0 0 0 0]
instance 0, ep 17, state [1 1 0 1]
instance 0, ep 18, state [0 1 0 0]
instance 0, ep 19, state [0 1 1 1]
instance 0, ep 20, state [0 1 1 0]
instance 0, ep 21, state [0 0 1 0]
instance 0, ep 22, state [1 0 1 1]
instance 0, ep 23, state [0 1 1 1]
instance 0, ep 24, state [0 0 1 1]
instance 0, ep 25, state [0 0 1 0]
instance 0, ep 26, state [1 1 1 1]
instance 0, ep 27, state [0 1 1 0]
instance 0, ep 28, state [0 1 1 0]
instanc

In [25]:
# Display the per-policy mean rewards ('optimal_match_rewards' is present only
# when the n_arms <= 6 branch ran).
mean_rewards

{'random_rewards': 0.6314791666666667,
 'optimal_rewards': 0.6417291666666667,
 'fixed_rewards': 0.6403125,
 'norm_rewards': 0.5985625,
 'zero_step_rewards': 0.7333541666666666,
 'one_step_rewards': 0.7305833333333334,
 'infinite_step_rewards': 0.7157083333333333,
 'ucw_match_rewards': 0.7401666666666666,
 'ucw_perfect_rewards': 0.74075,
 'whittle_approximate_rewards': 0.7437916666666666,
 'optimal_match_rewards': 0.7490833333333333}

In [None]:
# Payload written to disk: the three per-policy summaries plus the exact
# configuration that produced them.
data = dict(
    mean_reward=mean_rewards,
    std_reward=std_rewards,
    active_rate=active_rates,
    parameters=dict(
        seed=seed,
        n_arms=n_arms,
        budget=budget,
        discount=discount,
        alpha=alpha,
        n_episodes=n_episodes,
        episode_len=episode_len,
        n_epochs=n_epochs,
        match_prob=match_prob,
    ),
)

In [None]:
# File name for this run's results under the 'matching' experiment, built from
# the save name and seed; use_date is taken from the save_with_date setting.
save_path = get_save_path('matching',save_name,seed,use_date=save_with_date)

In [None]:
# NOTE(review): presumably removes previously saved 'matching' results that
# duplicate this run, so the new file replaces rather than accumulates —
# confirm the exact matching rule in rmab.utils.delete_duplicate_results.
delete_duplicate_results('matching',save_name,data)

In [None]:
# Write the results as JSON under ../results/ (relative to the working
# directory). The context manager guarantees the file is flushed and closed
# even if serialisation fails; the previous bare open() left the handle to be
# closed whenever the garbage collector got to it.
with open('../results/'+save_path, 'w') as results_file:
    json.dump(data, results_file)