In [171]:
import numpy as np
import matplotlib as plt
from tqdm import tqdm

%matplotlib inline

In [165]:
class MultiArmedBanditEnvironment:
    """Bernoulli multi-armed bandit environment.

    Pulling arm ``i`` pays a reward of 1 with probability ``arm_probs[i]``
    and 0 otherwise.

    Parameters
    ----------
    arm_probs : sequence of float
        Success probability of each arm; every entry must lie in [0, 1].
    rng : optional
        Source of randomness exposing ``binomial(n=..., p=...)``, e.g.
        ``np.random.default_rng(seed)``. When omitted, the global
        ``np.random`` state is used, so ``np.random.seed`` still controls
        the draws.

    Raises
    ------
    ValueError
        If ``arm_probs`` is empty, is not one-dimensional, or contains a
        value outside [0, 1] (NaN included).
    """

    def __init__(self, arm_probs, rng=None):
        # Validate up front so a bad probability fails here rather than on
        # the first pull of the offending arm.
        probs = np.asarray(arm_probs, dtype=float)
        if probs.ndim != 1 or probs.size == 0:
            raise ValueError(
                "arm_probs must be a non-empty 1-D sequence of probabilities"
            )
        # Phrased as "not all in range" so that NaN entries are rejected too.
        if not np.all((probs >= 0.0) & (probs <= 1.0)):
            raise ValueError(
                f"arm_probs entries must all lie in [0, 1], got {arm_probs!r}"
            )

        # Keep the caller's object untouched so the attribute type is unchanged.
        self.arm_probs = arm_probs
        self._rng = np.random if rng is None else rng

    def take_action(self, arm_i):
        """Pull arm ``arm_i`` and return the sampled reward (0 or 1)."""
        return self._rng.binomial(n=1, p=self.arm_probs[arm_i])

In [173]:
class ThompsonAgent:
    """Thompson-sampling agent for a bandit with 0/1 rewards.

    Each arm keeps a Beta(alpha, beta) distribution that starts at
    Beta(1, 1). A reward of 1 increments the arm's alpha, a reward of 0
    increments its beta.

    Parameters
    ----------
    n_arms : int
        Number of arms; must be at least 1.
    rng : optional
        Source of randomness exposing ``beta(a=..., b=...)``, e.g.
        ``np.random.default_rng(seed)``. When omitted, the global
        ``np.random`` state is used, so ``np.random.seed`` still controls
        the draws.

    Raises
    ------
    ValueError
        If ``n_arms`` is smaller than 1.
    """

    def __init__(self, n_arms, rng=None):
        # Without this check a zero-arm agent is only rejected later, inside
        # np.argmax, with an error that does not mention n_arms.
        if n_arms < 1:
            raise ValueError(f"n_arms must be at least 1, got {n_arms}")

        self.beta_alpha = [1] * n_arms  # beta dist alpha parameter, per arm
        self.beta_beta = [1] * n_arms   # beta dist beta parameter, per arm
        self._rng = np.random if rng is None else rng

    @property
    def reward(self):
        """Total reward collected so far.

        Every reward of 1 adds one to some alpha, so the total is the sum of
        the alphas minus the initial 1 each arm started with.
        """
        return np.sum(self.beta_alpha) - len(self.beta_alpha)

    def update(self, arm_i, reward):
        """Record the ``reward`` (0 or 1) observed after pulling ``arm_i``.

        Raises
        ------
        ValueError
            If ``reward`` is neither 0 nor 1.
        """
        if reward == 1:
            self.beta_alpha[arm_i] += 1
        elif reward == 0:
            self.beta_beta[arm_i] += 1
        else:
            raise ValueError(f"Unexpected reward value {reward}")

    def choose_action(self):
        """Draw one sample per arm from its Beta distribution; return the argmax index."""
        beta_samples = self._rng.beta(a=self.beta_alpha, b=self.beta_beta)
        return np.argmax(beta_samples)

## Setup:

Each arm pays a reward of 1 with the following probability (and 0 otherwise):

In [174]:
# Success probability of each arm, indexed by arm number.
# Arm 1 (p = 0.5) has the highest payout probability.
arm_probs = [
    0.3,
    0.5,
    0.4,
    0.45,
    0.3,
    0.35,
]

We will play `n_games` independent games, each lasting `n_rounds` rounds.

In [203]:
# Experiment size: how many independent games to play, and how many
# arm pulls (rounds) each game lasts.
n_games = 30
n_rounds = 1_000

In [204]:
# Seed the global NumPy random state so that re-running this cell (or
# Restart & Run All) reproduces the same rewards. By default both the
# environment and the agent draw from the np.random global state.
SEED = 0
np.random.seed(SEED)

env = MultiArmedBanditEnvironment(arm_probs=arm_probs)

rewards_list = []  # total reward collected in each game

# Loop variables are named rather than `_`: in a notebook, rebinding `_`
# shadows the name IPython uses for the last cell output.
for game_i in tqdm(range(n_games)):

    # Every game starts from a fresh agent with untouched priors.
    agent = ThompsonAgent(n_arms=len(arm_probs))

    for round_i in range(n_rounds):

        choosen_arm = agent.choose_action()
        reward = env.take_action(arm_i=choosen_arm)
        agent.update(arm_i=choosen_arm, reward=reward)

    rewards_list.append(agent.reward)

100%|██████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 40.78it/s]


In [205]:
# Average total reward per game across all games played.
np.asarray(rewards_list).mean()

462.1666666666667