In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import random 
import matplotlib.pyplot as plt
import json 
import argparse 
import sys

In [3]:
from rmab.simulator import RMABSimulator, random_valid_transition, random_valid_transition_round_down, synthetic_transition_small_window
from rmab.uc_whittle import UCWhittle
from rmab.ucw_value import UCWhittle_value
from rmab.baselines import optimal_policy, random_policy, WIQL
from rmab.fr_dynamics import get_all_transitions
from rmab.utils import get_save_path, delete_duplicate_results
from rmab.utils import get_ucb_conf


In [4]:
is_jupyter = 'ipykernel' in sys.modules

In [14]:
if is_jupyter: 
    seed        = 42
    n_arms      = 8
    budget      = 3
    discount    = 0.9
    alpha       = 3 
    n_episodes  = 30
    episode_len = 20
    n_epochs    = 10
    save_name = 'hyperparameter'
    save_with_date = True 
else:
    parser = argparse.ArgumentParser()
    parser.add_argument('--n_arms',         '-N', help='num beneficiaries (arms)', type=int, default=8)
    parser.add_argument('--episode_len',    '-H', help='episode length', type=int, default=20)
    parser.add_argument('--n_episodes',     '-T', help='num episodes', type=int, default=30)
    parser.add_argument('--budget',         '-B', help='budget', type=int, default=3)
    parser.add_argument('--n_epochs',       '-E', help='number of epochs (num_repeats)', type=int, default=10)
    parser.add_argument('--discount',       '-d', help='discount factor', type=float, default=0.9)
    parser.add_argument('--alpha',          '-a', help='alpha: for conf radius', type=float, default=3)
    parser.add_argument('--seed',           '-s', help='random seed', type=int, default=42)
    parser.add_argument('--save_name',      '-n', help='save name', type=str, default='results')
    parser.add_argument('--use_date', action='store_true')

    args = parser.parse_args()

    n_arms      = args.n_arms
    budget      = args.budget
    discount    = args.discount
    alpha       = args.alpha 
    seed        = args.seed
    n_episodes  = args.n_episodes
    episode_len = args.episode_len
    n_epochs    = args.n_epochs
    save_name   = args.save_name 
    save_with_date = args.use_date 



In [15]:
n_states = 2
n_actions = 2

In [27]:
all_population_size = 100 # number of random arms to generate
all_transitions = random_valid_transition(all_population_size, n_states, n_actions)

In [28]:
all_features = np.arange(all_population_size)

In [29]:
simulator = RMABSimulator(all_population_size, all_features, all_transitions,
            n_arms, episode_len, n_epochs, n_episodes, budget, number_states=n_states)

acting should always be good! 0.819 < 0.938
acting should always be good! 0.926 < 0.951
acting should always be good! 0.599 < 0.742
acting should always be good! 0.922 < 0.992
acting should always be good! 0.911 < 0.997
acting should always be good! 0.993 < 0.994
acting should always be good! 0.840 < 0.902
acting should always be good! 0.856 < 0.960
acting should always be good! 0.452 < 0.659
acting should always be good! 0.899 < 0.972
acting should always be good! 0.920 < 0.947
acting should always be good! 0.973 < 0.980
acting should always be good! 0.987 < 0.989
acting should always be good! 0.765 < 0.972
cohort [ 5 35 49 18 47  9 51 74]
cohort [ 2 33 97 17 49 74 91 65]
cohort [25 66 30 98 17 55 20 60]
cohort [ 9 30 82 10 98 96 31 29]
cohort [94 93 13 63 85 15  4 50]
cohort [18 89 82 80 71 39 40  3]
cohort [13 79 57 14 47 10 65 60]
cohort [44 91  9 31 76 15 39 59]
cohort [10 29 90 30 96  1 79 69]
cohort [12 71  3 58 92 99 94 96]
Last cohort [12 71  3 58 92 99 94 96]
Transitions [[[[

In [30]:
np.random.seed(seed)
random.seed(seed)
random_rewards = random_policy(simulator, n_episodes, n_epochs)

first state [0 1 1 1 0 1 1 0]
instance 0, ep 1, state [1 0 1 1 0 1 0 0]
instance 0, ep 2, state [1 1 0 0 0 0 0 1]
instance 0, ep 3, state [0 1 0 0 1 1 0 0]
instance 0, ep 4, state [1 1 0 1 1 0 1 0]
instance 0, ep 5, state [0 1 0 1 0 1 1 1]
instance 0, ep 6, state [1 0 0 0 1 0 0 1]
instance 0, ep 7, state [0 1 0 0 0 1 1 0]
instance 0, ep 8, state [0 1 0 0 1 1 1 1]
instance 0, ep 9, state [1 0 1 1 0 1 1 1]
instance 0, ep 10, state [1 0 1 0 0 0 0 1]
instance 0, ep 11, state [0 0 0 0 1 1 0 1]
instance 0, ep 12, state [0 1 0 0 0 1 0 0]
instance 0, ep 13, state [1 0 1 0 1 1 1 1]
instance 0, ep 14, state [1 0 0 1 1 0 0 0]
instance 0, ep 15, state [0 0 0 1 0 0 1 1]
instance 0, ep 16, state [0 1 1 1 1 0 1 0]
instance 0, ep 17, state [0 1 1 1 1 0 1 1]
instance 0, ep 18, state [0 1 1 1 0 0 0 0]
instance 0, ep 19, state [0 0 1 0 0 1 0 1]
instance 0, ep 20, state [0 0 0 1 0 0 1 0]
instance 0, ep 21, state [1 0 1 0 0 1 0 1]
instance 0, ep 22, state [0 0 0 0 0 1 0 0]
instance 0, ep 23, state [0 1 1 1

In [31]:
np.random.seed(seed)
random.seed(seed)
optimal_reward = optimal_policy(simulator, n_episodes, n_epochs, discount)

first state [0 1 1 1 0 1 1 0]


   state [0 1 1 1 0 1 1 0] state_WI [  0.65   0.18   0.17  -0.18 -10.     0.2    0.33   0.38] sorted [0 7 6]
   state [0 1 1 0 1 1 1 0] state_WI [  0.65   0.18   0.17   0.04 -10.     0.2    0.33   0.38] sorted [0 7 6]
   state [1 1 1 1 1 1 1 0] state_WI [  0.33   0.18   0.17  -0.18 -10.     0.2    0.33   0.38] sorted [7 6 0]
   state [1 1 1 1 1 0 1 1] state_WI [  0.33   0.18   0.17  -0.18 -10.     1.     0.33   0.49] sorted [5 7 6]
   state [0 1 1 0 1 1 1 1] state_WI [  0.65   0.18   0.17   0.04 -10.     0.2    0.33   0.49] sorted [0 7 6]
   state [1 1 1 1 1 0 1 1] state_WI [  0.33   0.18   0.17  -0.18 -10.     1.     0.33   0.49] sorted [5 7 6]
   state [1 1 1 1 1 1 1 1] state_WI [  0.33   0.18   0.17  -0.18 -10.     0.2    0.33   0.49] sorted [7 6 0]
   state [1 1 0 1 1 1 1 1] state_WI [  0.33   0.18   0.53  -0.18 -10.     0.2    0.33   0.49] sorted [2 7 6]
   state [0 1 0 0 1 1 1 1] state_WI [  0.65   0.18   0.53   0.04 -10.     0.2    0.33   0.49] sorted [0 2 7]
   state [1 1 1 1 1

In [32]:
np.random.seed(seed)
random.seed(seed)
wiql_rewards = WIQL(simulator, n_episodes, n_epochs)

first state [0 1 1 1 0 1 1 0]
instance 0, ep 1, state [1 0 1 1 0 1 0 0]
instance 0, ep 2, state [1 1 0 0 0 0 0 1]
instance 0, ep 3, state [0 1 0 0 1 1 0 0]
instance 0, ep 4, state [1 1 0 1 1 0 1 0]
instance 0, ep 5, state [0 1 0 1 0 1 1 1]
instance 0, ep 6, state [1 0 0 0 1 0 0 1]
instance 0, ep 7, state [0 1 0 0 0 1 1 0]
instance 0, ep 8, state [0 1 0 0 1 1 1 1]
instance 0, ep 9, state [1 0 1 1 0 1 1 1]
instance 0, ep 10, state [1 0 1 0 0 0 0 1]
instance 0, ep 11, state [0 0 0 0 1 1 0 1]
instance 0, ep 12, state [0 1 0 0 0 1 0 0]
instance 0, ep 13, state [1 0 1 0 1 1 1 1]
instance 0, ep 14, state [1 0 0 1 1 0 0 0]
instance 0, ep 15, state [0 0 0 1 0 0 1 1]
instance 0, ep 16, state [0 1 1 1 1 0 1 0]
instance 0, ep 17, state [0 1 1 1 1 0 1 1]
instance 0, ep 18, state [0 1 1 1 0 0 0 0]
instance 0, ep 19, state [0 0 1 0 0 1 0 1]
instance 0, ep 20, state [0 0 0 1 0 0 1 0]
instance 0, ep 21, state [1 0 1 0 0 1 0 1]
instance 0, ep 22, state [0 0 0 0 0 1 0 0]
instance 0, ep 23, state [0 1 1 1

In [33]:
np.random.seed(seed)
random.seed(seed)
ucw_extreme_rewards = UCWhittle(simulator, n_episodes, n_epochs, discount, alpha=alpha, method='extreme')

solving UCWhittle using method: extreme
first state [0 1 1 1 0 1 1 0]
instance 0, ep 1, state [1 0 1 1 0 1 0 0]


instance 0, ep 2, state [1 1 0 0 0 0 0 1]
instance 0, ep 3, state [0 1 0 0 1 1 0 0]
instance 0, ep 4, state [1 1 0 1 1 0 1 0]
instance 0, ep 5, state [0 1 0 1 0 1 1 1]
---------------------------------------------------
0 100  | a  [0 0 0 0 1 1 1 0]  | s'  [0 1 0 0 0 1 1 0]  | r  3    | WI  [0.898 0.9   0.898 0.9   0.9   0.9   0.9   0.898]
instance 0, ep 6, state [1 0 0 0 1 0 0 1]
instance 0, ep 7, state [0 1 0 0 0 1 1 0]
instance 0, ep 8, state [0 1 0 0 1 1 1 1]
instance 0, ep 9, state [1 0 1 1 0 1 1 1]
instance 0, ep 10, state [1 0 1 0 0 0 0 1]
---------------------------------------------------
0 200  | a  [0 0 0 0 1 1 1 0]  | s'  [0 1 0 1 1 1 1 0]  | r  5    | WI  [0.898 0.9   0.898 0.9   0.9   0.9   0.9   0.898]
instance 0, ep 11, state [0 0 0 0 1 1 0 1]
instance 0, ep 12, state [0 1 0 0 0 1 0 0]
instance 0, ep 13, state [1 0 1 0 1 1 1 1]
instance 0, ep 14, state [1 0 0 1 1 0 0 0]
instance 0, ep 15, state [0 0 0 1 0 0 1 1]
---------------------------------------------------
0 300 

In [37]:
np.random.seed(seed)
random.seed(seed)
ucw_ucb_rewards = UCWhittle(simulator, n_episodes, n_epochs, discount, alpha=alpha, method='UCB')

solving UCWhittle using method: UCB
first state [0 1 1 1 0 1 1 0]
instance 0, ep 1, state [1 0 1 1 0 1 0 0]
instance 0, ep 2, state [1 1 0 0 0 0 0 1]
instance 0, ep 3, state [0 1 0 0 1 1 0 0]
instance 0, ep 4, state [1 1 0 1 1 0 1 0]
instance 0, ep 5, state [0 1 0 1 0 1 1 1]
---------------------------------------------------
0 100  | a  [0 0 0 0 0 1 1 1]  | s'  [0 1 0 0 0 1 1 0]  | r  3    | WI  [-0. -0. -0. -0. -0. -0. -0. -0.]
instance 0, ep 6, state [1 0 0 0 1 0 0 1]
instance 0, ep 7, state [0 1 0 0 0 1 1 0]
instance 0, ep 8, state [0 1 0 0 1 1 1 1]
instance 0, ep 9, state [1 0 1 1 0 1 1 1]
instance 0, ep 10, state [1 0 1 0 0 0 0 1]
---------------------------------------------------
0 200  | a  [0 0 1 0 0 0 1 1]  | s'  [0 1 1 0 1 1 1 1]  | r  6    | WI  [-0.    -0.     0.059 -0.    -0.    -0.    -0.    -0.   ]
instance 0, ep 11, state [0 0 0 0 1 1 0 1]
instance 0, ep 12, state [0 1 0 0 0 1 0 0]
instance 0, ep 13, state [1 0 1 0 1 1 1 1]
instance 0, ep 14, state [1 0 0 1 1 0 0 0]
i

In [38]:
mean_rewards = {'random_rewards': np.mean(random_rewards), 
 'optimal_rewards': np.mean(optimal_reward), 
 'wiql_rewards': np.mean(wiql_rewards), 
 'extreme_rewards': np.mean(ucw_extreme_rewards), 
 'ucb_rewards': np.mean(ucw_ucb_rewards)}

In [39]:
mean_rewards

{'random_rewards': 6.0108153078203,
 'optimal_rewards': 6.637104825291181,
 'wiql_rewards': 5.667720465890183,
 'extreme_rewards': 6.026622296173045,
 'ucb_rewards': 6.3041597337770385}