In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to the project code take effect.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import random 
import matplotlib.pyplot as plt
import json 
import argparse 
import sys

In [3]:
from rmab.simulator import RMABSimulator, random_valid_transition, random_valid_transition_round_down, synthetic_transition_small_window
from rmab.uc_whittle import UCWhittle
from rmab.ucw_value import UCWhittle_value
from rmab.baselines import optimal_policy, random_policy, WIQL
from rmab.fr_dynamics import get_all_transitions
from rmab.utils import get_save_path, delete_duplicate_results


In [4]:
# True when the ipykernel module has been loaded, i.e. the code is running
# inside a Jupyter kernel rather than as a plain script.
is_jupyter = 'ipykernel' in sys.modules

In [5]:
# Experiment configuration.
# In a notebook kernel the values are hard-coded below; when the file is run
# as a script they come from the command line. Both branches must define the
# same set of names, because later cells read all of them (including
# match_prob, which is stored in the saved parameters).
if is_jupyter:
    seed        = 42
    n_arms      = 8
    budget      = 3
    discount    = 0.9
    alpha       = 3
    n_episodes  = 30
    episode_len = 20
    n_epochs    = 10
    save_name = 'results'
    match_prob = 0.25
    save_with_date = True
else:
    parser = argparse.ArgumentParser()
    parser.add_argument('--n_arms',         '-N', help='num beneficiaries (arms)', type=int, default=8)
    parser.add_argument('--episode_len',    '-H', help='episode length', type=int, default=20)
    parser.add_argument('--n_episodes',     '-T', help='num episodes', type=int, default=30)
    parser.add_argument('--budget',         '-B', help='budget', type=int, default=3)
    parser.add_argument('--n_epochs',       '-E', help='number of epochs (num_repeats)', type=int, default=10)
    parser.add_argument('--discount',       '-d', help='discount factor', type=float, default=0.9)
    parser.add_argument('--alpha',          '-a', help='alpha: for conf radius', type=float, default=3)
    parser.add_argument('--seed',           '-s', help='random seed', type=int, default=42)
    parser.add_argument('--save_name',      '-n', help='save name', type=str, default='results')
    # Previously missing: without it, script runs failed with NameError when
    # the results dictionary referenced match_prob.
    parser.add_argument('--match_prob',     '-m', help='match probability (recorded in the saved parameters)', type=float, default=0.25)
    parser.add_argument('--use_date', action='store_true', help='pass use_date=True when building the save path')

    args = parser.parse_args()

    n_arms      = args.n_arms
    budget      = args.budget
    discount    = args.discount
    alpha       = args.alpha
    seed        = args.seed
    n_episodes  = args.n_episodes
    episode_len = args.episode_len
    n_epochs    = args.n_epochs
    save_name   = args.save_name
    match_prob  = args.match_prob
    save_with_date = args.use_date



In [6]:
# Sizes of the state space and the action space used in this experiment.
n_states, n_actions = 2, 2

In [7]:
# Seed both global RNGs before the first stochastic step, so the generated arm
# population (and anything sampled before the next reseed) is repeatable for a
# given `seed`.
# NOTE(review): assumes get_all_transitions draws from the global numpy/random
# generators -- confirm against rmab.fr_dynamics.
np.random.seed(seed)
random.seed(seed)

all_population_size = 100 # number of random arms to generate
all_transitions = get_all_transitions(all_population_size)

In [8]:
# Display the dimensions of the generated transition array as a sanity check.
# NOTE(review): presumably indexed (arm, state, action, next state) -- confirm
# against rmab.fr_dynamics.
all_transitions.shape

(100, 2, 2, 2)

In [9]:
# Feature array handed to the simulator: the integers 0 .. all_population_size-1.
all_features = np.arange(all_population_size)

In [10]:
# Build the simulator from the full arm population and the experiment settings.
simulator = RMABSimulator(
    all_population_size,
    all_features,
    all_transitions,
    n_arms,
    episode_len,
    n_epochs,
    n_episodes,
    budget,
    number_states=n_states,
)

acting should always be good! 0.000 < 0.044
acting should always be good! 0.000 < 0.162
acting should always be good! 0.108 < 0.183
good start state should always be good! 0.380 < 0.508
good start state should always be good! 0.506 < 0.760
cohort [44 86 57 73 51 88 10 66]
cohort [97 67 28 45 46 60 66 86]
cohort [84 29 74 64 59  7 56 36]
cohort [76  8 39 20 96 48 66 71]
cohort [63 96 60 92 66 95 43 50]
cohort [46 65 69 96  3 22 99 84]
cohort [12 66 30 19 85 70  2 20]
cohort [82 79 23 46 99 74 62 36]
cohort [90 67 31 65 73 10 26 70]
cohort [65 89 93 48 73 91 82 74]


In [13]:
# Reset both global RNGs to `seed` so this run starts from a known RNG state.
np.random.seed(seed)
random.seed(seed)
# Baseline: run the random policy on the simulator. The result is used later
# as a NumPy array (np.mean/np.std, `.size`, elementwise `== 0`).
random_rewards = random_policy(simulator, n_episodes, n_epochs)

first state [0 1 1 0 1 0 1 0]
T is 600
instance 0, ep 1, state [1 0 0 0 0 0 0 1]
instance 0, ep 2, state [1 1 0 0 1 1 0 0]
instance 0, ep 3, state [1 1 1 0 1 1 1 1]
instance 0, ep 4, state [0 0 1 1 1 1 1 1]
instance 0, ep 5, state [1 1 1 1 0 0 0 0]
instance 0, ep 6, state [1 1 1 0 1 1 1 1]
instance 0, ep 7, state [0 0 0 1 0 0 0 1]
instance 0, ep 8, state [1 1 1 0 1 1 1 0]
instance 0, ep 9, state [1 1 1 0 1 0 1 1]
instance 0, ep 10, state [0 1 0 0 0 0 0 1]
instance 0, ep 11, state [0 0 0 0 1 0 0 0]
instance 0, ep 12, state [1 0 1 0 0 1 1 0]
instance 0, ep 13, state [0 1 1 0 1 1 0 0]
instance 0, ep 14, state [1 0 1 1 1 0 0 1]
instance 0, ep 15, state [1 0 1 0 0 1 0 0]
instance 0, ep 16, state [1 0 0 1 0 1 0 1]
instance 0, ep 17, state [0 0 1 0 0 0 0 1]
instance 0, ep 18, state [0 0 0 1 1 1 0 0]
instance 0, ep 19, state [1 1 1 1 0 1 0 0]
instance 0, ep 20, state [1 1 1 1 0 1 1 1]
instance 0, ep 21, state [1 0 1 1 1 0 1 0]
instance 0, ep 22, state [0 1 1 0 0 1 0 1]
instance 0, ep 23, state

In [13]:
# Reset both global RNGs to `seed` so this run starts from a known RNG state.
np.random.seed(seed)
random.seed(seed)
# Baseline: run the optimal policy with the configured discount factor.
# Note the singular variable name (`optimal_reward`), which later cells rely on.
optimal_reward = optimal_policy(simulator, n_episodes, n_epochs, discount)

first state [1 0 1 1 0 0 1 0]
   state [1 0 1 1 0 0 1 0] state_WI [  0.39   1.     0.34   0.3    0.84   0.49 -10.     0.88] sorted [1 7 4]
   state [0 0 0 0 1 0 0 1] state_WI [  0.72   1.     0.49   0.46 -10.     0.49 -10.   -10.  ] sorted [1 0 5]
   state [1 0 0 0 1 1 0 1] state_WI [  0.39   1.     0.49   0.46 -10.     0.55 -10.   -10.  ] sorted [1 5 2]
   state [0 0 0 0 1 0 0 1] state_WI [  0.72   1.     0.49   0.46 -10.     0.49 -10.   -10.  ] sorted [1 0 5]
   state [1 1 0 0 0 0 0 0] state_WI [  0.39   0.18   0.49   0.46   0.84   0.49 -10.     0.88] sorted [7 4 5]
   state [0 1 0 0 0 1 0 0] state_WI [  0.72   0.18   0.49   0.46   0.84   0.55 -10.     0.88] sorted [7 4 0]
   state [1 0 0 0 1 0 0 1] state_WI [  0.39   1.     0.49   0.46 -10.     0.49 -10.   -10.  ] sorted [1 5 2]
   state [1 1 1 0 0 1 0 0] state_WI [  0.39   0.18   0.34   0.46   0.84   0.55 -10.     0.88] sorted [7 4 5]
   state [0 1 0 0 1 1 0 1] state_WI [  0.72   0.18   0.49   0.46 -10.     0.55 -10.   -10.  ] sort

In [14]:
# Reset both global RNGs to `seed` so this run starts from a known RNG state.
np.random.seed(seed)
random.seed(seed)
# Baseline: run WIQL on the same simulator and episode/epoch counts.
wiql_rewards = WIQL(simulator, n_episodes, n_epochs)

first state [1 0 1 1 0 0 1 0]
instance 0, ep 1, state [1 1 1 0 1 0 1 1]
instance 0, ep 2, state [1 0 1 0 0 0 0 0]
instance 0, ep 3, state [1 1 1 1 0 1 0 0]
instance 0, ep 4, state [0 1 0 0 1 1 1 1]
instance 0, ep 5, state [0 0 0 1 1 0 1 1]
instance 0, ep 6, state [1 1 1 0 1 0 0 1]
instance 0, ep 7, state [1 1 0 0 1 1 1 1]
instance 0, ep 8, state [1 1 0 1 1 0 1 0]
instance 0, ep 9, state [0 1 1 0 1 0 1 1]
instance 0, ep 10, state [1 1 0 0 0 0 1 1]
instance 0, ep 11, state [0 1 1 0 0 1 0 0]
instance 0, ep 12, state [0 1 0 1 0 0 1 1]
instance 0, ep 13, state [1 0 0 0 1 1 0 1]
instance 0, ep 14, state [1 1 0 1 0 0 1 0]
instance 0, ep 15, state [1 0 0 0 1 1 0 0]
instance 0, ep 16, state [1 1 0 0 1 1 0 0]
instance 0, ep 17, state [1 0 1 1 0 1 0 1]
instance 0, ep 18, state [0 0 0 1 1 0 0 1]
instance 0, ep 19, state [0 0 0 1 1 1 0 1]
instance 0, ep 20, state [1 0 0 0 1 1 0 1]


instance 0, ep 21, state [1 0 0 1 0 1 1 0]
instance 0, ep 22, state [0 0 0 1 0 0 1 0]
instance 0, ep 23, state [0 0 0 1 0 0 1 0]
instance 0, ep 24, state [0 0 0 1 1 1 0 1]
instance 0, ep 25, state [1 1 0 1 0 0 0 0]
instance 0, ep 26, state [0 1 1 1 0 0 1 1]
instance 0, ep 27, state [0 1 0 0 1 0 1 1]
instance 0, ep 28, state [0 1 0 0 1 0 1 1]
instance 0, ep 29, state [1 0 1 1 1 1 0 0]
first state [0 1 0 1 0 1 0 0]
instance 1, ep 1, state [0 0 0 1 1 0 0 1]
instance 1, ep 2, state [0 0 0 0 0 1 0 1]
instance 1, ep 3, state [0 0 0 0 1 1 0 1]
instance 1, ep 4, state [1 0 1 0 0 0 0 1]
instance 1, ep 5, state [0 0 1 0 1 0 0 1]
instance 1, ep 6, state [0 1 0 1 0 1 1 1]
instance 1, ep 7, state [1 0 1 1 1 1 1 0]
instance 1, ep 8, state [1 1 1 0 0 1 0 1]
instance 1, ep 9, state [1 0 0 1 1 0 0 1]
instance 1, ep 10, state [0 1 0 1 1 0 0 1]
instance 1, ep 11, state [0 0 1 1 0 1 1 0]
instance 1, ep 12, state [0 0 0 0 0 0 1 0]
instance 1, ep 13, state [0 1 0 1 0 0 0 1]
instance 1, ep 14, state [1 1 0 0

In [15]:
# Start from a known RNG state, then run UCWhittle with the 'extreme' method.
np.random.seed(seed)
random.seed(seed)
ucw_extreme_rewards = UCWhittle(
    simulator,
    n_episodes,
    n_epochs,
    discount,
    alpha=alpha,
    method='extreme',
)

solving UCWhittle using method: extreme
first state [1 0 1 1 0 0 1 0]
instance 0, ep 1, state [1 1 1 0 1 0 1 1]
instance 0, ep 2, state [1 0 1 0 0 0 0 0]
instance 0, ep 3, state [1 1 1 1 0 1 0 0]


instance 0, ep 4, state [0 1 0 0 1 1 1 1]
instance 0, ep 5, state [0 0 0 1 1 0 1 1]
---------------------------------------------------
0 100  | a  [1 0 0 0 0 1 0 1]  | s'  [1 1 1 0 0 1 0 1]  | r  5    | WI  [0.9   0.899 0.899 0.899 0.899 0.9   0.899 0.9  ]
instance 0, ep 6, state [1 1 1 0 1 0 0 1]
instance 0, ep 7, state [1 1 0 0 1 1 1 1]
instance 0, ep 8, state [1 1 0 1 1 0 1 0]
instance 0, ep 9, state [0 1 1 0 1 0 1 1]
instance 0, ep 10, state [1 1 0 0 0 0 1 1]
---------------------------------------------------
0 200  | a  [0 0 0 0 1 1 0 1]  | s'  [0 0 0 0 1 1 0 1]  | r  3    | WI  [0.899 0.899 0.899 0.9   0.9   0.9   0.899 0.9  ]
instance 0, ep 11, state [0 1 1 0 0 1 0 0]
instance 0, ep 12, state [0 1 0 1 0 0 1 1]
instance 0, ep 13, state [1 0 0 0 1 1 0 1]
instance 0, ep 14, state [1 1 0 1 0 0 1 0]
instance 0, ep 15, state [1 0 0 0 1 1 0 0]
---------------------------------------------------
0 300  | a  [1 1 0 0 0 1 0 0]  | s'  [0 1 0 0 0 0 0 1]  | r  2    | WI  [  0.9     0.9    

KeyboardInterrupt: 

In [None]:
# Start from a known RNG state, then run UCWhittle with the 'UCB' method.
np.random.seed(seed)
random.seed(seed)
ucw_ucb_rewards = UCWhittle(
    simulator,
    n_episodes,
    n_epochs,
    discount,
    alpha=alpha,
    method='UCB',
)

solving UCWhittle using method: UCB
first state [1 0 1 1 1 0 1 1]
instance 0, ep 1, state [1 0 1 1 1 0 0 0]
instance 0, ep 2, state [0 1 0 1 1 1 1 1]
instance 0, ep 3, state [0 0 1 0 0 1 0 0]
instance 0, ep 4, state [0 0 1 1 1 1 0 1]
instance 0, ep 5, state [1 1 1 1 0 1 0 1]
---------------------------------------------------
0 100  | a  [1 1 0 0 0 0 0 1]  | s'  [0 0 0 0 0 0 0 0]  | r  0    | WI  [ 0.021  0.021 -0.    -0.    -0.    -0.    -0.    -0.   ]
instance 0, ep 6, state [0 0 1 0 1 0 0 0]
instance 0, ep 7, state [1 1 0 0 0 1 1 0]
instance 0, ep 8, state [1 1 0 1 1 1 0 0]
instance 0, ep 9, state [0 0 0 0 1 0 0 0]
instance 0, ep 10, state [1 0 1 0 0 1 1 1]
---------------------------------------------------
0 200  | a  [0 1 0 0 0 0 1 1]  | s'  [1 0 1 0 0 0 1 0]  | r  3    | WI  [-0.     0.106 -0.    -0.    -0.    -0.    -0.    -0.   ]
instance 0, ep 11, state [0 0 1 0 0 1 0 0]
instance 0, ep 12, state [1 0 1 1 0 0 1 1]
instance 0, ep 13, state [0 0 0 1 1 1 1 1]
instance 0, ep 14, s

In [None]:
# Mean reward of each policy, keyed by the labels used in the saved results.
mean_rewards = {
    label: np.mean(rewards)
    for label, rewards in (
        ('random_rewards', random_rewards),
        ('optimal_rewards', optimal_reward),
        ('wiql_rewards', wiql_rewards),
        ('extreme_rewards', ucw_extreme_rewards),
        ('ucb_rewards', ucw_ucb_rewards),
    )
}

In [None]:
# Standard deviation of each policy's rewards, using the same labels as the means.
std_rewards = {
    label: np.std(rewards)
    for label, rewards in (
        ('random_rewards', random_rewards),
        ('optimal_rewards', optimal_reward),
        ('wiql_rewards', wiql_rewards),
        ('extreme_rewards', ucw_extreme_rewards),
        ('ucb_rewards', ucw_ucb_rewards),
    )
}

In [None]:
def _match_rate(rewards):
    """Return one minus the fraction of entries in `rewards` that equal zero."""
    return 1-np.sum(rewards == 0)/rewards.size


# Share of non-zero reward entries for each policy.
random_match = _match_rate(random_rewards)
optimal_match = _match_rate(optimal_reward)
wiql_match = _match_rate(wiql_rewards)
ucw_extreme_match = _match_rate(ucw_extreme_rewards)
ucw_ucb_match = _match_rate(ucw_ucb_rewards)

In [None]:
# Collect the per-policy match rates under the labels used in the saved results.
match_rates = dict(
    random_match=random_match,
    optimal_match=optimal_match,
    wiql_match=wiql_match,
    extreme_match=ucw_extreme_match,
    ucb_match=ucw_ucb_match,
)
match_rates

{'random_match': 0.9858569051580699,
 'optimal_match': 0.9951747088186356,
 'wiql_match': 0.9831946755407653,
 'extreme_match': 0.986522462562396,
 'ucb_match': 0.9770382695507488}

In [None]:
# Everything that gets written to disk: the summary statistics plus the
# parameter values that produced them.
data = dict(
    mean_reward=mean_rewards,
    std_reward=std_rewards,
    match_rate=match_rates,
    parameters=dict(
        seed=seed,
        n_arms=n_arms,
        budget=budget,
        discount=discount,
        alpha=alpha,
        n_episodes=n_episodes,
        episode_len=episode_len,
        n_epochs=n_epochs,
        match_prob=match_prob,
    ),
)

In [29]:
# Get this run's output path from the project helper; it is later appended to
# '../results/' when the JSON file is written.
save_path = get_save_path('matching',save_name,seed,use_date=save_with_date)

In [None]:
# NOTE(review): presumably removes previously saved results that duplicate
# `data` before the new file is written -- confirm in rmab.utils.
delete_duplicate_results('matching',save_name,data)

In [64]:
# Write the results as JSON. The context manager guarantees the file is
# flushed and closed even if serialization raises; the bare open() used
# before left the handle open until garbage collection.
with open('../results/'+save_path, 'w') as results_file:
    json.dump(data, results_file)