In [136]:
import numpy as np

In [137]:
# Thompson Sampling with a Beta-Binomial model -- setup.
# Suited to hit-or-miss arms, where every pull pays out either 0 or 1.
num_trials = 10_000

# True (hidden) hit probability of each arm; an arm's id is its index here.
payouts = [0.023, 0.03, 0.029, 0.001, 0.05, 0.06, 0.0234, 0.035, 0.01, 0.11]
num_k = len(payouts)
variants = list(range(num_k))

# Per-trial history buffers, one slot per trial, all starting at zero.
thetas, thetasregret, reward_i, choice_i = (
    np.zeros(num_trials) for _ in range(4)
)

# Beta(1, 1) prior for every arm: `a` accumulates hits, `b` accumulates misses.
a, b = np.ones(num_k), np.ones(num_k)

In [138]:
# Thompson Sampling Beta Binomial
#
# Every arm k carries a Beta(a[k], b[k]) posterior over its hit probability.
# On each trial we draw one sample per arm from its posterior, play the arm
# with the largest draw, observe a 0/1 reward and update that arm's posterior.
#
# NOTE: `a` and `b` are updated in place, so re-running this cell continues
# from the current posterior. Re-run the setup cell first to restart from the
# Beta(1, 1) prior.

# Expected reward of the best arm; it never changes, so compute it once
# instead of inside the loop.
best_payout = max(payouts)

total_reward = 0
for i in range(num_trials):
    # Choose k: one posterior draw per arm, play the arm with the highest draw
    theta = np.random.beta(a, b)
    k = variants[np.argmax(theta)]

    # Reward (flag hit_or_miss) (In a real scenario payout is unknown and you make a choice here)
    reward = np.random.binomial(1, p=payouts[k])

    # Update
    a[k] += reward
    b[k] += 1 - reward  # i.e. only increment b when it's a swing and a miss. 1 - 0 = 1, 1 - 1 = 0

    thetas[i] = theta[k]
    # Per-trial regret: expected reward given up by playing arm k instead of
    # the best arm. (The earlier `np.max(thetas) - theta[k]` rescanned the
    # whole history buffer every trial and compared posterior draws across
    # different trials, which is not a regret.)
    thetasregret[i] = best_payout - payouts[k]

    choice_i[i] = k
    reward_i[i] = reward
    total_reward += reward

In [139]:
# Posterior Beta parameters per arm: first row is `a`, second row is `b`.
print(a, b, sep='\n')

[  8.   5.   4.   1.  14.  52.   2.  20.   1. 875.]
[ 181.  130.  108.   62.  235.  631.   87.  321.   68. 7215.]


In [140]:
# The mean of Beta(a, b) is a / (a + b); report the arm whose posterior mean is highest.
posterior_mean = a / (a + b)
print(f'Best Choice: {np.argmax(posterior_mean)}')

Best Choice: 9


In [161]:
# Upper Confidence Bound (UCB) -- setup. Use for continuous rewards.
num_trials = 1_000_000

# Each arm's payout is the (mean, standard deviation) of a normal reward.
payouts = [(55, 5), (70, 10), (70, 3)]
num_k = len(payouts)
variants = list(range(num_k))

# Per-trial history buffers, one slot per trial, all starting at zero.
thetas, thetasregret, reward_i, choice_i = (
    np.zeros(num_trials) for _ in range(4)
)

# Running mean reward per arm, and how many times each arm has been pulled.
# Pull counts start at one because every arm is played once to seed its mean.
mean = np.zeros(num_k)
pulls_per_k = np.ones(num_k)

In [162]:
# UCB
#
# Upper Confidence Bound: on every step play the arm with the highest
# optimistic score  q[k] = mean[k] + sqrt(2 * ln(step) / pulls_per_k[k]).
# The bonus term shrinks as an arm is pulled more often, so rarely tried arms
# keep getting revisited until their estimate is trustworthy.
#
# NOTE(review): this bonus is the classic UCB1 form, which is usually stated
# for rewards on a unit scale. The payouts here are much larger, so the bonus
# may need scaling to explore enough -- confirm.

# Play each action once to get starting mean.
# The loop variable is `arm` rather than `a` so it does not overwrite the
# Beta posterior array `a` used by the Thompson sampling cells.
for arm in variants:
    mean[arm] = np.random.normal(payouts[arm][0], payouts[arm][1])

total_reward = 0
for i in range(1, num_trials):
    # Choose k: pick by the upper confidence bound `q`, not the bare mean
    # (selecting on `mean` alone is purely greedy and never explores).
    q = mean + np.sqrt((2*np.log(i))/pulls_per_k)
    k = variants[np.argmax(q)]
    pulls_per_k[k] += 1

    # Reward (In a real scenario payout is unknown and you make a choice here)
    reward = np.random.normal(payouts[k][0], payouts[k][1])

    # Update: incremental mean over this arm's OWN pulls. Dividing by the
    # global step `i` would shrink every update towards zero and leave the
    # estimate stuck near its first sample.
    prev_mean = mean[k]
    mean[k] = prev_mean + (reward - prev_mean)/pulls_per_k[k]

    # Bookkeeping (slot 0 of the history buffers is unused; the loop starts at 1)
    choice_i[i] = k
    reward_i[i] = reward
    total_reward += reward

In [163]:
# Display the running mean reward estimate held for each arm after the UCB run.
mean

array([57.14733201, 66.20149998, 69.99981034])

In [164]:
# Report the arm whose estimated mean reward is highest.
print(f'Best Choice: {mean.argmax()}')

Best Choice: 2
