In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
class ParallelFactoredEnv():
    """
    Batch of independent factored bandit instances with Gaussian noise.

    For every trial a table of expected rewards of shape (d, k) is sampled
    uniformly in [min_expected, max_expected). Sampling happens after seeding
    NumPy's global random generator with `seed`, so the tables are
    reproducible; note that this also reseeds the global generator for any
    other code using it.
    """
    def __init__(self, k, d, num_trials, sigma=0.01, min_expected=0.3, max_expected=1, seed=0):
        self.d = d
        self.num_trials = num_trials
        self.sigma = sigma
        # Row index 0..d-1, used to pick one arm per dimension in `step`.
        self.d_vect = np.arange(d, dtype=int)
        self.seed = seed
        np.random.seed(self.seed)
        table_shape = (self.num_trials, self.d, k)
        self.avg_reward = np.random.uniform(
            low=min_expected, high=max_expected, size=table_shape)

    def step(self, trial, action):
        """
        Return one noisy reward per dimension.

        `action` holds, for each of the d dimensions, the index of the arm
        played in that dimension; the result is the expected reward of those
        arms in `trial` plus zero-mean Gaussian noise of std `sigma`.
        """
        expected = self.avg_reward[trial, self.d_vect, action]
        noise = np.random.normal(loc=0, scale=self.sigma, size=self.d)
        return expected + noise

    def get_expected(self, trial):
        """Return the (d, k) table of expected rewards of `trial`."""
        return self.avg_reward[trial]

In [None]:
class FactoredUCBAgent():
    """
    This class implements the FRB algorithm in its anytime version.

    Every dimension keeps its own per-arm running mean and pull counter and
    selects its arm independently through an upper confidence bound of the
    form ``mean + max_reward * sigma * sqrt(alpha * log(t) / n_pulls)``.

    Usage protocol: call `pull_arm()` and then `update(observations)` exactly
    once per round. The pull counters are incremented inside `pull_arm`, and
    `update` relies on that when it refreshes the running means.
    """
    def __init__(self, n_arms_vect, dim, sigma, max_reward=1, exploration_alpha=4):
        """
        Args:
            n_arms_vect: 1-D sequence with the number of arms of each dimension.
            dim: number of dimensions; must equal ``len(n_arms_vect)``.
            sigma: scale factor of the exploration bonus.
            max_reward: additional multiplicative factor of the bonus.
            exploration_alpha: constant multiplying ``log(t)`` in the bonus.

        Raises:
            ValueError: if `n_arms_vect` is not 1-D with `dim` entries.
        """
        # Accept lists/tuples as well as arrays.
        self.n_arms_vect = np.asarray(n_arms_vect, dtype=int)
        self.dim = dim
        # Explicit exception instead of `assert`, which is removed under -O.
        if self.n_arms_vect.ndim != 1 or self.n_arms_vect.shape[0] != self.dim:
            raise ValueError(
                "n_arms_vect must be one-dimensional with `dim` entries")
        self.max_reward = max_reward
        self.sigma = sigma
        self.exploration_alpha = exploration_alpha
        self.reset()

    def reset(self):
        """Clear all statistics and restart the round counter; returns self."""
        self.t = 1
        self.last_pull = None
        self.avg_reward = [np.zeros(size) for size in self.n_arms_vect]
        self.n_pulls = [np.zeros(size, dtype=int) for size in self.n_arms_vect]
        return self

    def pull_arm(self):
        """
        Choose one arm per dimension and return them as an int array of
        length `dim`. The pull counter of every chosen arm is incremented.
        """
        self.last_pull = -1 * np.ones(self.dim, dtype=int)
        scale = self.max_reward * self.sigma
        log_term = self.exploration_alpha * np.log(self.t)
        for i in range(self.dim):
            counts = self.n_pulls[i]
            # Arms never pulled get an infinite bonus so they are tried first
            # (lowest index first). This avoids dividing by zero and the
            # resulting nan/inf RuntimeWarnings.
            bonus = np.full(counts.shape, np.inf)
            pulled = counts > 0
            bonus[pulled] = scale * np.sqrt(log_term / counts[pulled])
            arm = int(np.argmax(self.avg_reward[i] + bonus))
            self.last_pull[i] = arm
            counts[arm] += 1
        return self.last_pull

    def update(self, observations):
        """
        Fold the observed rewards of the last pull into the running means.

        Args:
            observations: one reward per dimension, indexed like the action
                returned by the preceding `pull_arm` call.
        """
        self.t += 1
        for i in range(self.dim):
            arm = self.last_pull[i]
            # n_pulls already counts the current pull, so this is the usual
            # incremental mean: new = old + (x - old) / n.
            count = self.n_pulls[i][arm]
            self.avg_reward[i][arm] += (observations[i] - self.avg_reward[i][arm]) / count

In [None]:
class FtrackAgent():
    """
    This class implements F-track.

    The agent works in two phases:

    1. Warm-up: for the first ``N0 * k`` rounds the arms are played
       round-robin (the same arm index in every dimension), so each arm of
       each dimension is observed ``N0`` times.
    2. Tracking: from the warm-up means a per-dimension plan is built that
       plays every empirically suboptimal arm ``ceil(2 sigma^2 f(T) / gap^2)``
       times and the empirically best arm for the rest of the horizon. The
       per-dimension plans are stacked into action vectors, run-length
       encoded, and the resulting segments are played shortest-first.

    Usage protocol: call `pull_arm()` and then `update(observations)` exactly
    once per round; pull counters are incremented inside `pull_arm`.

    NOTE(review): the formulas take ``log(log(T))``, so `T` is presumably
    expected to be well above e - confirm against the reference algorithm.
    """
    def __init__(self, k, d, sigma, T, c):
        """
        Args:
            k: number of arms in every dimension.
            d: number of dimensions.
            sigma: noise scale used in the confidence computations.
            T: horizon (total number of rounds).
            c: constant multiplying ``log(log(T))`` in `_ft`.
        """
        self.k = k
        self.d = d
        self.sigma = sigma
        self.T = T
        self.c = c
        # np.ceil returns a float; N0 is a pull count that is later used in
        # array shapes and ranges, so it has to be an integer.
        self.N0 = int(10 * np.ceil(np.sqrt(np.log(T))))
        self.eps = np.sqrt(2 * (sigma ** 2) * self._ft(1 / np.log(T), c) / self.N0)
        self.schedule = False
        self.reset()

    def reset(self):
        """Clear statistics and any previously built schedule; returns self."""
        self.t = 0
        self.last_pull = None
        # Force the tracking schedule to be rebuilt from the next warm-up.
        self.schedule = False
        self.avg_reward = np.zeros((self.d, self.k))
        self.n_pulls = np.zeros((self.d, self.k), dtype=int)
        return self

    def pull_arm(self):
        """
        Return the action vector (int array of length `d`) for this round and
        increment the pull counters of the selected arms.
        """
        if self.t < self.N0 * self.k:
            # Warm-up: same arm index in every dimension, round-robin.
            self.last_pull = np.full(self.d, self.t % self.k, dtype=int)
        else:
            if not self.schedule:
                self._create_schedule()
            pending = self.action_vects_num_pulled < self.action_vects_num
            if pending.any():
                # Among unfinished segments play the one with the fewest
                # planned pulls; finished ones are masked out with inf.
                candidates = np.where(pending, self.action_vects_num, np.inf)
                to_pull = int(np.argmin(candidates))
                self.last_pull = self.action_vects[to_pull].copy()
                self.action_vects_num_pulled[to_pull] += 1
            else:
                # Schedule empty or exhausted (rounds beyond T): play the
                # empirically best arm of every dimension.
                self.last_pull = np.argmax(self.avg_reward, axis=1)
        self.n_pulls[np.arange(self.d), self.last_pull] += 1
        return self.last_pull

    def update(self, observations):
        """
        Fold the observed rewards of the last pull into the running means.

        Args:
            observations: one reward per dimension, indexed like the action
                returned by the preceding `pull_arm` call.
        """
        self.t += 1
        rows = np.arange(self.d)
        arms = self.last_pull
        rewards = np.asarray(observations, dtype=float)
        # n_pulls already counts the current pull: new = old + (x - old) / n.
        current = self.avg_reward[rows, arms]
        self.avg_reward[rows, arms] = current + (rewards - current) / self.n_pulls[rows, arms]

    def _ft(self, delta, c):
        """Exploration function (1 + 1/log T) * (c log log T + log(1/delta))."""
        return (1 + 1 / np.log(self.T)) * (c * np.log(np.log(self.T)) + np.log(1 / delta))

    def _create_schedule(self):
        """
        Build the post-warm-up plan from the current empirical means.

        Sets `pulls_todo` (d x horizon int array of planned arms),
        `action_vects` (distinct consecutive action vectors),
        `action_vects_num` (planned pulls per vector) and
        `action_vects_num_pulled` (pulls done so far), and marks the schedule
        as built.
        """
        horizon = max(int(self.T) - self.N0 * self.k, 0)
        self.avg_rewards_warmup = np.copy(self.avg_reward)
        # keepdims so the per-dimension maximum broadcasts across the arms.
        max_val = np.max(self.avg_rewards_warmup, axis=1, keepdims=True)
        max_idx = np.argmax(self.avg_rewards_warmup, axis=1)
        deltas = max_val - self.avg_rewards_warmup
        # Default plan: the empirically best arm for the whole horizon.
        self.pulls_todo = np.repeat(max_idx[:, np.newaxis], horizon, axis=1).astype(int)
        ft = self._ft(1 / self.T, self.c)
        for i in range(self.d):
            gaps = deltas[i]
            # Zero-gap (empirically best) arms need "infinitely many" pulls.
            needed = np.full(self.k, np.inf)
            suboptimal = gaps > 0
            needed[suboptimal] = 2 * (self.sigma ** 2) * ft / (gaps[suboptimal] ** 2)
            order = np.argsort(needed, kind="stable")
            lengths = np.clip(np.ceil(needed), 0, horizon).astype(int)
            counter = 0
            # The arm needing the most pulls is skipped: it keeps the default.
            for arm in order[:-1]:
                if counter >= horizon:
                    break
                self.pulls_todo[i, counter:counter + lengths[arm]] = arm
                counter += lengths[arm]
        # Run-length encode consecutive identical columns of the plan.
        if horizon > 0:
            changed = np.any(self.pulls_todo[:, 1:] != self.pulls_todo[:, :-1], axis=0)
            starts = np.concatenate(([0], np.flatnonzero(changed) + 1))
            run_lengths = np.diff(np.concatenate((starts, [horizon])))
        else:
            starts = np.zeros(0, dtype=int)
            run_lengths = np.zeros(0, dtype=int)
        self.action_vects = [self.pulls_todo[:, s].copy() for s in starts]
        self.action_vects_num = run_lengths.astype(int)
        self.action_vects_num_pulled = np.zeros(len(self.action_vects), dtype=int)
        self.schedule = True
        