In [213]:
import random
import numpy as np
# Payout table shared by every policy below: an arm's play() yields a
# position k, and the reward credited for that pull is rewards[k].
rewards = list(range(1, 6))
class Bandit:
    """One arm of a multi-armed bandit with a fixed categorical payout law.

    Attributes:
        p: outcome probabilities, passed unchanged to
            ``np.random.multinomial`` on every pull.
        index: identifier stored as given; the policy functions in this
            notebook use it to index their per-arm bookkeeping lists.
    """

    def __init__(self, p, index):
        self.index = index
        self.p = p

    def play(self):
        """Pull the arm once and return the position of the drawn outcome."""
        # A single multinomial trial yields a one-hot vector; the position of
        # its only non-zero entry is the sampled outcome.
        one_hot = np.random.multinomial(1, self.p)
        return one_hot.argmax()

def naive(bandits, T):
    """Baseline policy: pull a uniformly random arm on each of ``T`` rounds.

    Args:
        bandits: arms to draw from; each needs ``play()`` (returning a
            position in the module-level ``rewards`` list) and an ``index``
            attribute locating its slot in the returned play counts.
        T: number of rounds to play.

    Returns:
        A pair ``(average reward per round, play counts per arm index)``.
    """
    plays_per_arm = [0 for _ in bandits]
    reward_sum = 0
    for _ in range(T):
        arm = np.random.choice(bandits)
        reward_sum += rewards[arm.play()]
        plays_per_arm[arm.index] += 1

    mean_reward = reward_sum / T
    return mean_reward, plays_per_arm

In [214]:
# Three arms over the five reward values 1..5. With those rewards the
# per-pull expectations are 50/18 (~2.78), 52/18 (~2.89) and 20/6 (~3.33),
# so b3 is the best arm.
b1 = Bandit(p=[1/6, 1/3, 1/6, 2/9, 1/9], index=0)
b2 = Bandit(p=[2/9, 1/9, 1/3, 2/9, 1/9], index=1)
b3 = Bandit(p=[1/6, 1/6, 1/6, 1/6, 1/3], index=2)

In [215]:
# Baseline run: 1000 rounds of uniformly random arm selection over the three
# arms. No seed is set in the visible cells, so the echoed numbers vary
# between runs.
naive([b1, b2, b3], 1000)

(2.999, [335, 335, 330])

In [216]:
def egreedy(bandits, T, e=0.1):
    """Play ``T`` rounds with an epsilon-greedy policy.

    Each round, the arm with the highest empirical mean reward so far is
    pulled with probability ``1 - e``; the remaining probability ``e`` is
    split evenly among the other arms. Arms that have not been pulled yet
    count as having mean reward 0, and ties go to the earliest arm in
    ``bandits``. With a single arm, that arm is pulled every round.

    Args:
        bandits: arms to choose from; each needs ``play()`` (returning a
            position in the module-level ``rewards`` list) and an ``index``
            attribute locating its slot in the per-arm statistics.
        T: number of rounds to play.
        e: total exploration probability per round.

    Returns:
        A pair ``(average reward per round, play counts per arm index)``.
        The average is ``0.0`` when ``T`` is 0.
    """
    n_arms = len(bandits)
    total_reward = 0
    bandit_rewards = [0] * n_arms
    bandit_plays = [0] * n_arms
    for _ in range(T):
        # Empirical means listed in the same order as `bandits`, so the
        # argmax below is a position in `bandits` (and in `probs`) even when
        # an arm's `index` differs from its position in the list.
        avg_rewards = [
            bandit_rewards[b.index] / bandit_plays[b.index] if bandit_plays[b.index] else 0
            for b in bandits
        ]
        best = int(np.argmax(avg_rewards))
        if n_arms == 1:
            # Nothing to explore; also avoids dividing e by zero.
            probs = [1.0]
        else:
            probs = [e / (n_arms - 1)] * n_arms
            probs[best] = 1 - e
        # Tolerant check: the exact float sum can differ from 1 by rounding.
        assert np.isclose(sum(probs), 1.0)
        bandit = bandits[np.random.choice(n_arms, p=probs)]
        reward = rewards[bandit.play()]
        total_reward += reward
        bandit_rewards[bandit.index] += reward
        bandit_plays[bandit.index] += 1

    average = total_reward / T if T else 0.0
    return average, bandit_plays

In [223]:
# Epsilon-greedy over the same three arms for 50000 rounds, using the
# default exploration rate e=0.1.
egreedy([b1, b2, b3], 50000)

(3.28786, [2508, 2431, 45061])

In [218]:
def thompson_sampling(bandits, T):
    """Play ``T`` rounds with Thompson sampling under Dirichlet posteriors.

    Every arm starts with a flat Dirichlet(1, ..., 1) belief over the
    ``len(rewards)`` possible outcomes. Each round, one outcome distribution
    is sampled per arm from its current belief, the arm whose sampled
    distribution has the highest expected reward is pulled, and the observed
    outcome increments that arm's Dirichlet count.

    Args:
        bandits: arms to choose from; each needs ``play()`` (returning a
            position in the module-level ``rewards`` list) and an ``index``
            attribute locating its slot in the per-arm statistics.
        T: number of rounds to play.

    Returns:
        A pair ``(average reward per round, play counts per arm index)``.
        The average is ``0.0`` when ``T`` is 0.
    """
    n_outcomes = len(rewards)
    total_reward = 0
    bandit_priors = [[1] * n_outcomes for _ in bandits]
    bandit_plays = [0] * len(bandits)
    for _ in range(T):
        # Score each arm by the expected reward of its sampled distribution
        # (sum_k p_k * rewards[k]), not by its single most likely reward:
        # an arm can have a low modal reward and still the highest mean.
        # Beliefs are looked up via `index`, matching the update below.
        sampled_means = [
            np.dot(np.random.dirichlet(bandit_priors[b.index]), rewards)
            for b in bandits
        ]
        bandit = bandits[int(np.argmax(sampled_means))]
        reward_index = bandit.play()
        total_reward += rewards[reward_index]
        bandit_plays[bandit.index] += 1
        bandit_priors[bandit.index][reward_index] += 1

    average = total_reward / T if T else 0.0
    return average, bandit_plays

In [219]:
# Illustration: a single draw from the flat Dirichlet(1, 1, 1, 1, 1), the
# same initial belief thompson_sampling gives each arm. The draw is a
# length-5 vector of probabilities.
np.random.dirichlet([1, 1, 1, 1, 1])

array([0.24933773, 0.4235967 , 0.03994664, 0.14829385, 0.13882507])

In [224]:
# Thompson sampling over the same three arms for 50000 rounds, the same
# horizon as the epsilon-greedy run so the two results can be compared.
thompson_sampling([b1, b2, b3], 50000)

(3.32982, [70, 80, 49850])