In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
class ParallelFactoredEnv():
    def __init__(self, k, d, num_trials, sigma=0.01, min_expected=0.3, max_expected=1, seed=0):
        self.d = d
        self.num_trials = num_trials
        self.sigma=sigma
        self.d_vect = np.linspace(0, d-1, d, dtype=int)
        self.seed = seed
        np.random.seed(self.seed)
        self.avg_reward = np.random.uniform(min_expected, max_expected, (self.num_trials, self.d, k))
    
    def step(self, trial, action):
        return self.avg_reward[trial, self.d_vect, action] + np.random.normal(0, self.sigma, self.d)

    def get_expected(self, trial):
        return self.avg_reward[trial, :, :]

In [3]:
class FactoredUCBAgent():
    """
    This class implements the FRB algorithm in its anytime version
    """
    def __init__(self, n_arms_vect, dim, sigma, max_reward=1, exploration_alpha=4):
        self.n_arms_vect = n_arms_vect
        self.dim = dim
        assert self.dim == self.n_arms_vect.shape[0]
        self.max_reward = max_reward
        self.sigma = sigma
        self.exploration_alpha = exploration_alpha
        self.reset()

    def reset(self):
        self.t = 1
        self.last_pull = None
        self.avg_reward = []
        self.n_pulls = []
        for size in self.n_arms_vect:
            self.avg_reward.append(np.zeros(size))
            self.n_pulls.append(np.zeros(size, dtype=int))
        return self

    def pull_arm(self):
        self.last_pull = -1 * np.ones(self.dim, dtype=int)
        for i in range(self.dim):
            ucb1 = [self.avg_reward[i][a] + self.max_reward * self.sigma * np.sqrt(
                self.exploration_alpha * np.log(self.t) / self.n_pulls[i][a]) for a in range(self.n_arms_vect[i])]
            self.last_pull[i] = int(np.argmax(ucb1))
            self.n_pulls[i][self.last_pull[i]] = self.n_pulls[i][self.last_pull[i]] + 1
        return self.last_pull

    def update(self, observations):
        self.t += 1
        for i in range(self.dim):
            self.avg_reward[i][self.last_pull[i]] = (
                self.avg_reward[i][self.last_pull[i]] *
                (self.n_pulls[i][self.last_pull[i]] - 1) + observations[i]
            ) / (self.n_pulls[i][self.last_pull[i]])

In [4]:
class FtrackAgent():
    """
    This class implements F-track
    """
    def __init__(self, k, d, sigma, T, c):
        self.k = k
        self.d = d
        self.sigma = sigma
        self.T = T
        self.c = c
        self.N0 = 10 * int(np.ceil(np.sqrt(np.log(T))))
        self.eps = np.sqrt(2 * (sigma ** 2) * self._ft(1/np.log(T), c) / self.N0)
        self.exploration_alpha = 4
        self.schedule = False
        self.reset()

    def reset(self):
        self.t = 0
        self.last_pull = None
        self.avg_reward = np.zeros((self.d, self.k))
        self.n_pulls = np.zeros((self.d, self.k), dtype=int)
        # self.n_arms_vect = self.k * np.ones(self.d, dtype=int)
        # for size in self.n_arms_vect:
        #     self.avg_reward.append(np.zeros(size))
        #     self.n_pulls.append(np.zeros(size, dtype=int))
        return self

    def pull_arm(self):
        if(self.t < self.N0*self.k): 
            self.last_pull = (self.t % self.k) * np.ones(self.d, dtype=int)
        else:
            if self.schedule == False:
                self._create_schedule()
        
            if np.max(np.abs(self.avg_rewards_warmup-self.avg_reward)) <= 2 * self.eps:
                self._pull_arm_ftrack()
            else:
                self._pull_arm_fucb()
            
        for i in range(self.d):
            self.n_pulls[i, self.last_pull[i]] = self.n_pulls[i, self.last_pull[i]] + 1

        return self.last_pull
    
    def _pull_arm_ftrack(self):
        finished = self.action_vects_num == self.action_vects_num_pulled
        self.action_vects_num_pulled[finished] = np.inf
        to_pull = np.argmin(self.action_vects_num_pulled)
        self.last_pull = self.action_vects[to_pull]
        self.action_vects_num_pulled[to_pull] = self.action_vects_num_pulled[to_pull] + 1
    
    def _pull_arm_fucb(self):
        self.last_pull = -1 * np.ones(self.d, dtype=int)
        for i in range(self.d):
            ucb1 = [self.avg_reward[i, a] + self.sigma * np.sqrt(
                self.exploration_alpha * np.log(self.t) / self.n_pulls[i, a]) for a in range(self.k)]
            self.last_pull[i] = int(np.argmax(ucb1))
            # self.n_pulls[i, self.last_pull[i]] = self.n_pulls[i, self.last_pull[i]] + 1

    def update(self, observations):
        self.t += 1
        for i in range(self.d):
            self.avg_reward[i, self.last_pull[i]] = (
                self.avg_reward[i, self.last_pull[i]] *
                (self.n_pulls[i, self.last_pull[i]] - 1) + observations[i]
            ) / (self.n_pulls[i, self.last_pull[i]])
    
    def _ft(self, delta, c):
        return (1 + 1 / np.log(self.T)) * (c * np.log(np.log(self.T)) + np.log(1/delta))
    
    def _create_schedule(self):
        self.avg_rewards_warmup = np.copy(self.avg_reward)
        max_val = np.max(self.avg_rewards_warmup, axis=1).reshape(self.d, -1)
        max_idx = np.argmax(self.avg_rewards_warmup, axis=1)
        deltas = max_val - self.avg_rewards_warmup
        self.pulls_todo = np.zeros((self.d, self.T - self.N0*self.k))
        ft = self._ft(1/self.T, self.c)
        for i in range(self.d):
            self.pulls_todo[i, :] = max_idx[i]
            N = np.ceil(2 * (self.sigma ** 2) * ft / (deltas[i, :] ** 2)).astype(int)
            order = np.argsort(N)
            N_ordered = N[order]
            counter = 0
            for j in range(self.k-1):
                self.pulls_todo[i, counter:counter + N_ordered[j]] = order[j]
                counter += N_ordered[j]
        self.action_vects = []
        self.action_vects_num = []
        for i in range(self.T - self.N0*self.k):
            if (i == 0):
                self.action_vects.append(self.pulls_todo[:, 0])
                self.action_vects_num.append(1)
            if np.array_equal(self.pulls_todo[:, i], self.action_vects[-1]):
                self.action_vects_num[-1] = self.action_vects_num[-1] + 1
            else:
                self.action_vects.append(self.pulls_todo[:, i])
                self.action_vects_num.append(1)
        self.action_vects_num_pulled = list(np.zeros(len(self.action_vects_num)))     
        self.schedule = True
        

In [5]:
# -*- coding: latin-1 -*-
import numpy as np
import math
from tqdm import tqdm
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
import tikzplotlib as tkz
import warnings
warnings.filterwarnings("ignore")

import os, sys

_, filename = os.path.split(os.getcwd())
if filename == 'notebooks':
    old_dir = os.getcwd()
    os.chdir('../')
    print('Moving Current Directory from ' + old_dir + ' to ' + os.getcwd())
else:
    print('Current Directory is ' + os.getcwd())

sys.path.append('./')

from FRB.agents import FactoredUCBAgent
from FRB.env import FactoredEnv
from FRB.utils import get_pulled_expected, compute_max_expected, create_action_matrix, get_sigma_square_eq_max

# Body of the script

# BASIC SETTING FOR EXPERIMENTS
fucb = '\\JPAalgnameshort'
ftrack = '\\FTrack'
algs = [fucb, ftrack]
checkpoints = [1000, 5000, 10000]
n_trials = 50
seed = 0
k_list = [3]
d_list = [2]
# k_list = [int(sys.argv[1])]
# d_list = [int(sys.argv[2])]
T = 10000 #int(sys.argv[3])
bounded_list = [False]
do_subsampling = True

# OVERRIDE FOR TESTING PURPOSE TO SPEED UP THE RUNS
T = 10000
checkpoints = [1000, 2000, 5000]
bounded_list = [False]
# algs = [fucb, tea]
n_trials = 4
k_list = [3]
d_list = [2]
do_subsampling = True
    
result_table = {}
# out_folder = str(sys.argv[4])

# ht_mult = float(sys.argv[5])
_sigma = 0.05 #float(sys.argv[6])

for bounded in bounded_list:

    result_table[bounded] = {}
    
    if bounded: 
        sigma = 0.5 # fixed for bernoulli
    else:
        sigma = _sigma
    
    for d in d_list:

        result_table[bounded][d] = {}

        for k in k_list:

            # out_path = out_folder + 'out' + str(k) + '_' + str(d) + '.txt'

            # result_table[bounded][d][k] = {}

            arms_vect = k * np.ones(d, dtype=int)

            # F-UCB INIT
            agent_factored = FactoredUCBAgent(arms_vect, d, sigma)

            agent_stellina = FtrackAgent(k, d, sigma, T, c=2.5)
            
            mean_cum_expected_regret = {}
            std_cum_expected_regret = {}
            
            for alg in algs:

                # result_table[bounded][d][k][alg] = {}

                env = FactoredEnv(arms_vect, d, sigma=sigma, bounded=bounded)

                inst_expected_regret = np.zeros((n_trials, T))
                
                # for trial_i in range(n_trials):
                for trial_i in tqdm(range(n_trials)):
                
                    vals_expected = env.get_expected()
                    max_expected = compute_max_expected(vals_expected)

                    for t in range(T):

                        if alg == ftrack:
                            action = agent_stellina.pull_arm()
                            agent_stellina.update(env.step(action))
                        elif alg == fucb:
                            action = agent_factored.pull_arm()
                            agent_factored.update(env.step(action))
                        else:
                            raise ValueError('Error in selecting algorithm')

                        inst_expected_regret[trial_i, t] = max_expected - get_pulled_expected(
                            vals_expected, action)
                    
                    # I reset all the agents, becuase i do not know which one 
                    # i am using for the sake of simplicity
                    agent_factored.reset()
                    agent_stellina.reset()
                
                # maybe replace with cumsum with correct axis
                cum_expected_regret = np.zeros(inst_expected_regret.shape)
                cum_expected_regret[:, 0] = inst_expected_regret[:, 0]
                for i in range(1, T):
                    cum_expected_regret[:, i] = inst_expected_regret[:, i] + cum_expected_regret[:, i-1]

                mean_cum_expected_regret[alg] = np.mean(cum_expected_regret, axis=0)
                std_cum_expected_regret[alg] = np.std(cum_expected_regret, axis=0) / np.sqrt(n_trials)

            plt.figure()
            if do_subsampling:
                subsample = 50
                assert T % subsample == 0
                x_plt = np.linspace(0, T-1, int(T/subsample), dtype=int)
            else:
                x_plt = np.linspace(0, T-1, T, dtype=int)
            for alg in algs:
                plt.plot(x_plt, mean_cum_expected_regret[alg][x_plt], 
                         label=alg)
                plt.fill_between(x_plt, 
                                 mean_cum_expected_regret[alg][x_plt] - std_cum_expected_regret[alg][x_plt], 
                                 mean_cum_expected_regret[alg][x_plt] + std_cum_expected_regret[alg][x_plt], 
                                 alpha=0.3)
            plt.legend()
            plt.xlabel('Rounds')
            plt.ylabel('Regret')
            plt.title('bounded={} k={} d={} $\sigma$={}'.format(bounded, k, d, sigma))

Moving Current Directory from /home/simone/Projects/research/FRB/notebooks to /home/simone/Projects/research/FRB


100%|██████████| 4/4 [00:02<00:00,  1.69it/s]
  0%|          | 0/4 [00:00<?, ?it/s]


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()