In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import random 
import matplotlib.pyplot as plt
import json 
import argparse 
import sys

In [3]:
from rmab.simulator import RMABSimulator, random_valid_transition, random_valid_transition_round_down, synthetic_transition_small_window
from rmab.uc_whittle import UCWhittle, UCWhittleFixed 
from rmab.ucw_value import UCWhittle_value, UCWhittle_value_fixed
from rmab.baselines import optimal_policy, random_policy, WIQL
from rmab.fr_dynamics import get_all_transitions
from rmab.utils import get_save_path, delete_duplicate_results


In [4]:
# True when the 'ipykernel' module has already been imported into this process.
# The next cell uses this flag to pick hard-coded settings instead of parsing command-line arguments.
is_jupyter = 'ipykernel' in sys.modules

In [5]:
# Experiment settings. Outside a notebook kernel they are read from the
# command line; inside one they are fixed to the literals in the else-branch.
if not is_jupyter:
    parser = argparse.ArgumentParser()
    parser.add_argument('--n_arms', '-N', type=int, default=8,
                        help='num beneficiaries (arms)')
    parser.add_argument('--episode_len', '-H', type=int, default=20,
                        help='episode length')
    parser.add_argument('--n_episodes', '-T', type=int, default=30,
                        help='num episodes')
    parser.add_argument('--budget', '-B', type=int, default=3,
                        help='budget')
    parser.add_argument('--n_epochs', '-E', type=int, default=10,
                        help='number of epochs (num_repeats)')
    parser.add_argument('--discount', '-d', type=float, default=0.9,
                        help='discount factor')
    parser.add_argument('--alpha', '-a', type=float, default=3,
                        help='alpha: for conf radius')
    parser.add_argument('--seed', '-s', type=int, default=42,
                        help='random seed')
    parser.add_argument('--save_name', '-n', type=str, default='results',
                        help='save name')
    # Boolean flag: absent -> False, present -> True.
    parser.add_argument('--use_date', action='store_true')

    args = parser.parse_args()

    # Copy the parsed values into the module-level names the later cells read.
    seed = args.seed
    n_arms = args.n_arms
    budget = args.budget
    discount = args.discount
    alpha = args.alpha
    n_episodes = args.n_episodes
    episode_len = args.episode_len
    n_epochs = args.n_epochs
    save_name = args.save_name
    save_with_date = args.use_date
else:
    # Interactive defaults used when the cell runs inside a notebook kernel.
    seed = 42
    n_arms = 8
    budget = 3
    discount = 0.9
    alpha = 3
    n_episodes = 30
    episode_len = 20
    n_epochs = 10
    save_name = 'hyperparameter'
    save_with_date = True



In [6]:
# Both the per-arm state count and the action count are 2.
n_states = n_actions = 2

In [7]:
# Size of the arm population and the transition array generated for it;
# both are handed to RMABSimulator when the simulator is built.
all_population_size = 100 # number of random arms to generate
all_transitions = get_all_transitions(all_population_size)

In [8]:
# Sanity check: display the shape of the generated transition array.
all_transitions.shape

(100, 2, 2, 2)

In [9]:
# One integer per arm, 0 .. all_population_size - 1.
# Presumably these act as arm identifiers inside RMABSimulator — TODO confirm.
all_features = np.arange(all_population_size)

In [15]:
# Re-seed NumPy's and Python's global RNGs before building the simulator,
# presumably so that its random draws are repeatable for a given seed.
np.random.seed(seed)
random.seed(seed)
# Most arguments are positional. NOTE(review): confirm the order against RMABSimulator's signature.
simulator = RMABSimulator(all_population_size, all_features, all_transitions,
            n_arms, episode_len, n_epochs, n_episodes, budget, number_states=n_states)

acting should always be good! 0.000 < 0.044
acting should always be good! 0.000 < 0.162
acting should always be good! 0.108 < 0.183
good start state should always be good! 0.380 < 0.508
good start state should always be good! 0.506 < 0.760
cohort [83 53 70 45 44 39 22 80]
cohort [45 80  5 29 78 35 86 14]
cohort [42 28 84 76 68 46 97 15]
cohort [51 29 14 74 15  3 31 87]
cohort [55  0 94  4 72 26 38  3]
cohort [69 66 55 63 30 61 42 29]
cohort [83 68 66 27 99 32 35 23]
cohort [31  1 98 11 36  4 84 58]
cohort [ 5 41 86 49 97 13 58 42]
cohort [22 71 78 83 30 48 34 29]
Last cohort [22 71 78 83 30 48 34 29]
Transitions [[[[0.90466102 0.09533898]
   [0.52564103 0.47435897]]

  [[0.57692308 0.42307692]
   [0.28274428 0.71725572]]]


 [[[0.8125     0.1875    ]
   [0.32539683 0.67460317]]

  [[0.38095238 0.61904762]
   [0.21077283 0.78922717]]]


 [[[0.54545455 0.45454545]
   [0.27173913 0.72826087]]

  [[0.43478261 0.56521739]
   [0.21621622 0.78378378]]]


 [[[0.79467681 0.20532319]
   [0.47663

In [11]:
# Random-policy baseline. Both global RNGs are reset to the same seed first,
# as is done before every policy run in this notebook.
np.random.seed(seed)
random.seed(seed)
random_rewards = random_policy(simulator, n_episodes, n_epochs)

first state [0 0 1 0 1 1 0 1]
instance 0, ep 1, state [1 1 0 1 0 1 1 0]
instance 0, ep 2, state [0 0 0 0 0 0 1 1]
instance 0, ep 3, state [1 1 0 1 0 0 0 0]
instance 0, ep 4, state [1 0 1 0 1 0 0 0]
instance 0, ep 5, state [1 0 0 1 0 1 1 0]
instance 0, ep 6, state [0 0 0 0 1 0 1 0]
instance 0, ep 7, state [1 1 0 1 0 1 1 1]
instance 0, ep 8, state [1 0 1 0 0 1 1 0]
instance 0, ep 9, state [1 0 1 0 0 1 1 0]
instance 0, ep 10, state [1 1 0 1 0 0 1 1]
instance 0, ep 11, state [1 1 1 0 1 0 1 0]
instance 0, ep 12, state [0 0 0 0 0 1 1 0]
instance 0, ep 13, state [0 1 1 0 0 0 0 0]
instance 0, ep 14, state [0 0 0 1 0 0 0 0]
instance 0, ep 15, state [1 0 1 1 1 1 0 0]
instance 0, ep 16, state [0 0 1 0 0 0 1 0]
instance 0, ep 17, state [1 0 0 1 0 1 0 1]
instance 0, ep 18, state [1 1 0 1 1 0 1 1]
instance 0, ep 19, state [0 1 0 0 1 0 1 1]
instance 0, ep 20, state [1 0 0 0 0 1 1 0]
instance 0, ep 21, state [0 0 0 1 1 0 1 0]
instance 0, ep 22, state [0 1 1 1 1 1 0 0]
instance 0, ep 23, state [1 1 0 0

In [12]:
# Optimal-policy baseline, run from the same seed as the other policies.
# Note the singular name `optimal_reward`; the summary cells refer to it under that name.
np.random.seed(seed)
random.seed(seed)
optimal_reward = optimal_policy(simulator, n_episodes, n_epochs, discount)

first state [0 0 1 0 1 1 0 1]


   state [0 0 1 0 1 1 0 1] state_WI [  0.44   0.84   0.14   0.34 -10.   -10.     0.73 -10.  ] sorted [1 6 0]
   state [1 0 0 0 1 0 0 0] state_WI [  0.43   0.84   0.24   0.34 -10.     0.77   0.73   0.91] sorted [7 1 5]
   state [0 1 0 1 0 1 0 1] state_WI [  0.44   0.33   0.24   0.25 -10.   -10.     0.73 -10.  ] sorted [6 0 1]
   state [1 1 0 1 0 0 1 1] state_WI [  0.43   0.33   0.24   0.25 -10.     0.77 -10.   -10.  ] sorted [5 0 1]
   state [1 1 0 1 0 1 0 1] state_WI [  0.43   0.33   0.24   0.25 -10.   -10.     0.73 -10.  ] sorted [6 0 1]
   state [1 1 0 0 1 0 0 1] state_WI [  0.43   0.33   0.24   0.34 -10.     0.77   0.73 -10.  ] sorted [5 6 0]
   state [1 1 0 0 0 1 1 1] state_WI [  0.43   0.33   0.24   0.34 -10.   -10.   -10.   -10.  ] sorted [0 3 1]
   state [1 1 0 0 0 1 0 1] state_WI [  0.43   0.33   0.24   0.34 -10.   -10.     0.73 -10.  ] sorted [6 0 3]
   state [1 0 0 1 1 0 0 1] state_WI [  0.43   0.84   0.24   0.25 -10.     0.77   0.73 -10.  ] sorted [1 5 6]
   state [0 1 0 0 0

In [13]:
# WIQL baseline, run from the same seed as the other policies.
np.random.seed(seed)
random.seed(seed)
wiql_rewards = WIQL(simulator, n_episodes, n_epochs)

first state [0 0 1 0 1 1 0 1]
instance 0, ep 1, state [1 1 0 1 0 1 1 0]
instance 0, ep 2, state [0 0 0 0 0 0 1 1]
instance 0, ep 3, state [1 1 0 1 0 0 0 0]
instance 0, ep 4, state [1 0 1 0 1 0 0 0]
instance 0, ep 5, state [1 0 0 1 0 1 1 0]
instance 0, ep 6, state [0 0 0 0 1 0 1 0]
instance 0, ep 7, state [1 1 0 1 0 1 1 1]
instance 0, ep 8, state [1 0 1 0 0 1 1 0]
instance 0, ep 9, state [1 0 1 0 0 1 1 0]
instance 0, ep 10, state [1 1 0 1 0 0 1 1]
instance 0, ep 11, state [1 1 1 0 1 0 1 0]
instance 0, ep 12, state [0 0 0 0 0 1 1 0]
instance 0, ep 13, state [0 1 1 0 0 0 0 0]
instance 0, ep 14, state [0 0 0 1 0 0 0 0]
instance 0, ep 15, state [1 0 1 1 1 1 0 0]
instance 0, ep 16, state [0 0 1 0 0 0 1 0]


instance 0, ep 17, state [1 0 0 1 0 1 0 1]
instance 0, ep 18, state [1 1 0 1 1 0 1 1]
instance 0, ep 19, state [0 1 0 0 1 0 1 1]
instance 0, ep 20, state [1 0 0 0 0 1 1 0]
instance 0, ep 21, state [0 0 0 1 1 0 1 0]
instance 0, ep 22, state [0 1 1 1 1 1 0 0]
instance 0, ep 23, state [1 1 0 0 1 0 0 0]
instance 0, ep 24, state [1 1 1 0 0 1 1 0]
instance 0, ep 25, state [0 0 1 1 0 1 0 0]
instance 0, ep 26, state [0 0 1 1 1 0 0 1]
instance 0, ep 27, state [1 0 1 0 0 0 0 0]
instance 0, ep 28, state [1 0 1 1 0 0 0 0]
instance 0, ep 29, state [1 0 1 0 1 0 1 1]
first state [1 0 0 0 1 1 0 0]
instance 1, ep 1, state [1 1 1 0 1 1 1 1]
instance 1, ep 2, state [0 0 1 1 1 1 1 0]
instance 1, ep 3, state [0 1 1 0 1 0 1 0]
instance 1, ep 4, state [0 1 1 1 0 0 1 1]
instance 1, ep 5, state [0 0 1 1 0 0 0 1]
instance 1, ep 6, state [0 1 0 1 0 0 0 1]
instance 1, ep 7, state [0 0 0 0 0 1 1 0]
instance 1, ep 8, state [1 1 0 1 1 1 0 0]
instance 1, ep 9, state [1 0 1 0 0 0 0 1]
instance 1, ep 10, state [1 1 0 0

In [14]:
# UCWhittle with method='extreme', run from the same seed as the other policies.
np.random.seed(seed)
random.seed(seed)
ucw_extreme_rewards = UCWhittle(simulator, n_episodes, n_epochs, discount, alpha=alpha, method='extreme')

solving UCWhittle using method: extreme
first state [0 0 1 0 1 1 0 1]
instance 0, ep 1, state [1 1 0 1 0 1 1 0]
instance 0, ep 2, state [0 0 0 0 0 0 1 1]
instance 0, ep 3, state [1 1 0 1 0 0 0 0]


instance 0, ep 4, state [1 0 1 0 1 0 0 0]
instance 0, ep 5, state [1 0 0 1 0 1 1 0]
---------------------------------------------------
0 100  | a  [0 0 0 0 1 1 0 1]  | s'  [0 0 0 0 0 1 0 0]  | r  1    | WI  [0.898 0.898 0.898 0.9   0.9   0.9   0.898 0.9  ]
instance 0, ep 6, state [0 0 0 0 1 0 1 0]
instance 0, ep 7, state [1 1 0 1 0 1 1 1]
instance 0, ep 8, state [1 0 1 0 0 1 1 0]
instance 0, ep 9, state [1 0 1 0 0 1 1 0]
instance 0, ep 10, state [1 1 0 1 0 0 1 1]
---------------------------------------------------
0 200  | a  [1 0 0 0 0 0 1 1]  | s'  [1 0 0 0 1 0 0 1]  | r  3    | WI  [  0.9     0.898   0.898   0.898   0.898 -10.      0.9     0.899]
instance 0, ep 11, state [1 1 1 0 1 0 1 0]
instance 0, ep 12, state [0 0 0 0 0 1 1 0]
instance 0, ep 13, state [0 1 1 0 0 0 0 0]
instance 0, ep 14, state [0 0 0 1 0 0 0 0]
instance 0, ep 15, state [1 0 1 1 1 1 0 0]
---------------------------------------------------
0 300  | a  [0 0 0 1 1 0 1 0]  | s'  [0 0 0 0 1 0 1 0]  | r  2    | WI  [ 

In [15]:
# UCWhittle with method='UCB', run from the same seed as the other policies.
np.random.seed(seed)
random.seed(seed)
ucw_ucb_rewards = UCWhittle(simulator, n_episodes, n_epochs, discount, alpha=alpha, method='UCB')

solving UCWhittle using method: UCB
first state [0 0 1 0 1 1 0 1]
instance 0, ep 1, state [1 1 0 1 0 1 1 0]
instance 0, ep 2, state [0 0 0 0 0 0 1 1]
instance 0, ep 3, state [1 1 0 1 0 0 0 0]
instance 0, ep 4, state [1 0 1 0 1 0 0 0]
instance 0, ep 5, state [1 0 0 1 0 1 1 0]
---------------------------------------------------
0 100  | a  [0 0 0 1 0 0 1 1]  | s'  [0 0 0 1 0 0 1 1]  | r  3    | WI  [-0.    -0.    -0.     0.016 -0.    -0.    -0.    -0.   ]
instance 0, ep 6, state [0 0 0 0 1 0 1 0]
instance 0, ep 7, state [1 1 0 1 0 1 1 1]
instance 0, ep 8, state [1 0 1 0 0 1 1 0]
instance 0, ep 9, state [1 0 1 0 0 1 1 0]
instance 0, ep 10, state [1 1 0 1 0 0 1 1]
---------------------------------------------------
0 200  | a  [1 1 0 0 0 0 0 1]  | s'  [1 1 0 0 0 1 1 1]  | r  5    | WI  [ 0.001  0.001 -0.    -0.    -0.    -0.    -0.    -0.   ]
instance 0, ep 11, state [1 1 1 0 1 0 1 0]
instance 0, ep 12, state [0 0 0 0 0 1 1 0]
instance 0, ep 13, state [0 1 1 0 0 0 0 0]
instance 0, ep 14, s

In [16]:
# UCWhittleFixed with method='UCB' and norm_confidence=False, run from the same seed as the other policies.
# NOTE(review): in the saved output this run starts from first state [0 1 1 1 1 1 1 1], whereas the
# sibling runs with the same seed start from [0 0 1 0 1 1 0 1] — confirm this cell was executed
# against the same freshly built simulator (Restart Kernel -> Run All).
np.random.seed(seed)
random.seed(seed)
ucw_ucb_rewards_fixed = UCWhittleFixed(simulator, n_episodes, n_epochs, discount, alpha=alpha, method='UCB',norm_confidence=False)

solving UCWhittle using method: UCB
first state [0 1 1 1 1 1 1 1]
instance 0, ep 1, state [0 0 0 1 1 1 0 1]
instance 0, ep 2, state [0 1 1 1 0 0 0 1]
instance 0, ep 3, state [0 0 0 1 0 1 1 0]
instance 0, ep 4, state [1 1 0 1 1 1 1 1]
instance 0, ep 5, state [0 0 0 0 0 1 0 0]
---------------------------------------------------
0 100  | a  [0 0 0 0 0 1 1 1]  | s'  [0 1 0 0 0 1 1 0]  | r  3    | WI  [-0. -0. -0. -0. -0. -0. -0. -0.]
instance 0, ep 6, state [0 1 0 1 0 0 0 0]
instance 0, ep 7, state [1 0 1 0 0 0 0 1]
instance 0, ep 8, state [0 0 0 0 1 1 0 1]
instance 0, ep 9, state [0 1 0 0 0 1 1 1]
instance 0, ep 10, state [0 1 1 0 0 0 1 0]
---------------------------------------------------
0 200  | a  [0 0 1 0 1 0 0 1]  | s'  [1 0 0 0 1 0 0 1]  | r  3    | WI  [-0.    -0.     0.001 -0.     0.005 -0.    -0.    -0.   ]
instance 0, ep 11, state [1 0 1 1 0 1 1 1]
instance 0, ep 12, state [0 0 1 1 0 0 1 0]
instance 0, ep 13, state [1 1 1 1 0 1 1 0]
instance 0, ep 14, state [0 1 1 0 1 1 1 1]
i

In [17]:
# UCWhittleFixed with method='QP', run from the same seed as the other policies.
# The saved output shows a solver licence banner, so this path appears to need an external optimizer — TODO confirm which one.
np.random.seed(seed)
random.seed(seed)
ucw_fixed_qp_rewards = UCWhittleFixed(simulator, n_episodes, n_epochs, discount, alpha=alpha, method='QP')

solving UCWhittle using method: QP
first state [0 0 1 0 1 1 0 1]
Restricted license - for non-production use only - expires 2024-10-28




instance 0, ep 1, state [1 1 0 1 0 1 1 0]
instance 0, ep 2, state [0 0 0 0 0 0 1 1]
instance 0, ep 3, state [1 1 0 1 0 0 0 0]
instance 0, ep 4, state [1 0 1 0 1 0 0 0]
instance 0, ep 5, state [1 0 0 1 0 1 1 0]
---------------------------------------------------
0 100  | a  [0 0 0 0 0 1 1 1]  | s'  [0 0 0 0 1 1 1 1]  | r  4    | WI  [-1. -1. -1. -1. -1. -1. -1. -1.]
instance 0, ep 6, state [0 0 0 0 1 0 1 0]
instance 0, ep 7, state [1 1 0 1 0 1 1 1]
instance 0, ep 8, state [1 0 1 0 0 1 1 0]
instance 0, ep 9, state [1 0 1 0 0 1 1 0]
instance 0, ep 10, state [1 1 0 1 0 0 1 1]
---------------------------------------------------
0 200  | a  [0 0 0 0 0 1 1 1]  | s'  [0 0 0 0 0 0 0 1]  | r  1    | WI  [-1. -1. -1. -1. -1. -1. -1. -1.]
instance 0, ep 11, state [1 1 1 0 1 0 1 0]
instance 0, ep 12, state [0 0 0 0 0 1 1 0]
instance 0, ep 13, state [0 1 1 0 0 0 0 0]
instance 0, ep 14, state [0 0 0 1 0 0 0 0]
instance 0, ep 15, state [1 0 1 1 1 1 0 0]
------------------------------------------------

In [18]:
# Value-based UCWhittle variant (UCWhittle_value_fixed), run from the same seed as the other policies.
np.random.seed(seed)
random.seed(seed)
ucw_fixed_value_rewards = UCWhittle_value_fixed(simulator, n_episodes, n_epochs, discount, alpha=alpha)

solving UCWhittle using method: VALUE-BASED
first state [0 0 1 0 1 1 0 1]
--------------------------
      arm 0 state 0, opt val 7.21, opt p [0.576 0.576 0.889 0.889]
      arm 0 state 1, opt val 8.85, opt p [0.407 0.407 0.932 0.932]
      arm 1 state 0, opt val 7.21, opt p [0. 0. 0. 0.]
      arm 1 state 1, opt val 8.85, opt p [0. 0. 0. 0.]
      arm 2 state 0, opt val 7.21, opt p [0. 0. 0. 0.]
      arm 2 state 1, opt val 8.85, opt p [0. 0. 0. 0.]
      arm 3 state 0, opt val 7.21, opt p [0. 0. 0. 0.]
      arm 3 state 1, opt val 8.85, opt p [0. 0. 0. 0.]
      arm 4 state 0, opt val 7.21, opt p [0. 0. 0. 0.]
      arm 4 state 1, opt val 8.85, opt p [0. 0. 0. 0.]
      arm 5 state 0, opt val 7.21, opt p [0. 0. 0. 0.]
      arm 5 state 1, opt val 8.85, opt p [0. 0. 0. 0.]
      arm 6 state 0, opt val 7.21, opt p [0. 0. 0. 0.]
      arm 6 state 1, opt val 8.85, opt p [0. 0. 0. 0.]
      arm 7 state 0, opt val 7.21, opt p [0. 0. 0. 0.]
      arm 7 state 1, opt val 8.85, opt p [0. 0. 0.



--------------------------
      arm 0 state 0, opt val 0.00, opt p [0. 0. 0. 0.]
      arm 0 state 1, opt val 1.00, opt p [0. 0. 0. 0.]
      arm 1 state 0, opt val 0.00, opt p [0. 0. 0. 0.]
      arm 1 state 1, opt val 1.00, opt p [0. 0. 0. 0.]
      arm 2 state 0, opt val 0.00, opt p [0. 0. 0. 0.]
      arm 2 state 1, opt val 1.00, opt p [0. 0. 0. 0.]
      arm 3 state 0, opt val 0.00, opt p [0. 0. 0. 0.]
      arm 3 state 1, opt val 1.00, opt p [0. 0. 0. 0.]
      arm 4 state 0, opt val 0.00, opt p [0. 0. 0. 0.]
      arm 4 state 1, opt val 1.00, opt p [0. 0. 0. 0.]
      arm 5 state 0, opt val 0.00, opt p [0. 0. 0. 0.]
      arm 5 state 1, opt val 1.00, opt p [0. 0. 0. 0.]
      arm 6 state 0, opt val 0.00, opt p [0. 0. 0. 0.]
      arm 6 state 1, opt val 1.00, opt p [0. 0. 0. 0.]
      arm 7 state 0, opt val 0.00, opt p [0. 0. 0. 0.]
      arm 7 state 1, opt val 1.00, opt p [0. 0. 0. 0.]
--------------------------
      arm 0 state 0, opt val 0.00, opt p [0. 0. 0. 0.]
      arm 0



--------------------------
      arm 0 state 0, opt val 8.51, opt p [0.632 0.632 1.    1.   ]
      arm 0 state 1, opt val 10.01, opt p [0. 0. 1. 1.]
      arm 1 state 0, opt val 8.57, opt p [0.666 0.666 1.    1.   ]
      arm 1 state 1, opt val 10.01, opt p [0. 0. 1. 1.]
      arm 2 state 0, opt val 8.49, opt p [0. 0. 0. 0.]
      arm 2 state 1, opt val 10.01, opt p [0. 0. 0. 0.]
      arm 3 state 0, opt val 8.49, opt p [0.626 0.626 1.    1.   ]
      arm 3 state 1, opt val 10.01, opt p [0. 0. 1. 1.]
      arm 4 state 0, opt val 9.00, opt p [0. 0. 0. 0.]
      arm 4 state 1, opt val 10.01, opt p [0. 0. 0. 0.]
      arm 5 state 0, opt val 9.00, opt p [1. 1. 1. 1.]
      arm 5 state 1, opt val 10.01, opt p [0. 0. 1. 1.]
      arm 6 state 0, opt val 9.00, opt p [0. 0. 0. 0.]
      arm 6 state 1, opt val 10.01, opt p [0. 0. 0. 0.]
      arm 7 state 0, opt val 9.00, opt p [0. 0. 0. 0.]
      arm 7 state 1, opt val 10.01, opt p [0. 0. 0. 0.]
--------------------------
      arm 0 state 0, o

In [19]:
# Mean reward per policy, taken over every entry of each reward array.
mean_rewards = dict(
    random_rewards=np.mean(random_rewards),
    optimal_rewards=np.mean(optimal_reward),
    wiql_rewards=np.mean(wiql_rewards),
    extreme_rewards=np.mean(ucw_extreme_rewards),
    ucb_rewards=np.mean(ucw_ucb_rewards),
    ucb_fixed_rewards=np.mean(ucw_ucb_rewards_fixed),
    qp_fixed_rewards=np.mean(ucw_fixed_qp_rewards),
    value_fixed_rewards=np.mean(ucw_fixed_value_rewards),
)
mean_rewards

{'random_rewards': 3.135440931780366,
 'optimal_rewards': 3.6405990016638934,
 'wiql_rewards': 3.094342762063228,
 'extreme_rewards': 2.9763727121464227,
 'ucb_rewards': 3.098336106489185,
 'ucb_fixed_rewards': 3.2590682196339436,
 'qp_fixed_rewards': 3.007487520798669,
 'value_fixed_rewards': 3.052579034941764}

In [20]:
# Standard deviation of the reward per policy, over every entry of each reward array.
std_rewards = dict(
    random_rewards=np.std(random_rewards),
    optimal_rewards=np.std(optimal_reward),
    wiql_rewards=np.std(wiql_rewards),
    extreme_rewards=np.std(ucw_extreme_rewards),
    ucb_rewards=np.std(ucw_ucb_rewards),
    ucb_fixed_rewards=np.std(ucw_ucb_rewards_fixed),
    qp_fixed_rewards=np.std(ucw_fixed_qp_rewards),
    value_fixed_rewards=np.std(ucw_fixed_value_rewards),
)

In [21]:
def _nonzero_fraction(rewards):
    """Return the share of entries in `rewards` that are not exactly zero.

    Computed as one minus (count of zero entries / total entry count).
    """
    return 1-np.sum(rewards == 0)/rewards.size


# "Match" here is the fraction of recorded rewards that are nonzero, one value per policy.
random_match = _nonzero_fraction(random_rewards)
optimal_match = _nonzero_fraction(optimal_reward)
wiql_match = _nonzero_fraction(wiql_rewards)
ucw_extreme_match = _nonzero_fraction(ucw_extreme_rewards)
ucw_ucb_match = _nonzero_fraction(ucw_ucb_rewards)
ucw_ucb_fixed_match = _nonzero_fraction(ucw_ucb_rewards_fixed)
ucw_fixed_qp_match = _nonzero_fraction(ucw_fixed_qp_rewards)
ucw_fixed_value_match = _nonzero_fraction(ucw_fixed_value_rewards)


In [22]:
# Collect the per-policy nonzero-reward fractions under the keys written to the results file.
match_rates = dict(
    random_match=random_match,
    optimal_match=optimal_match,
    wiql_match=wiql_match,
    extreme_match=ucw_extreme_match,
    ucb_match=ucw_ucb_match,
    fixed_match=ucw_ucb_fixed_match,
    qp_match=ucw_fixed_qp_match,
    value_match=ucw_fixed_value_match,
)
match_rates

{'random_match': 0.9885191347753743,
 'optimal_match': 0.9963394342762063,
 'wiql_match': 0.9828618968386024,
 'extreme_match': 0.986855241264559,
 'ucb_match': 0.9825291181364393,
 'fixed_match': 0.9863560732113145,
 'qp_match': 0.9886855241264559,
 'value_match': 0.9863560732113145}

In [None]:
# Everything persisted for this run: the three summary dictionaries plus the
# settings that produced them.
data = dict(
    mean_reward=mean_rewards,
    std_reward=std_rewards,
    match_rate=match_rates,
    parameters=dict(
        seed=seed,
        n_arms=n_arms,
        budget=budget,
        discount=discount,
        alpha=alpha,
        n_episodes=n_episodes,
        episode_len=episode_len,
        n_epochs=n_epochs,
    ),
)

In [29]:
# Path of the result file for this run; it is later appended to '../results/' when the data is written.
# The helper receives the 'baseline' tag, save_name and seed; use_date presumably adds a date
# component to the path — TODO confirm in rmab.utils.
save_path = get_save_path('baseline',save_name,seed,use_date=save_with_date)

In [None]:
# NOTE(review): presumably removes previously saved 'baseline' results that duplicate this run
# before the new file is written — confirm the matching rule in rmab.utils.
delete_duplicate_results('baseline',save_name,data)

In [64]:
# Write the results dictionary as JSON under ../results/.
# The context manager closes (and therefore flushes) the file handle even if
# serialization raises; the bare open() used before left the handle open.
with open('../results/'+save_path,'w') as results_file:
    json.dump(data, results_file)