In [89]:
import numpy as np

In [90]:
# Thompson Sampling Beta Binomial (used for hit or miss arms - binomial)
num_trials = 10000
thetas = np.zeros(num_trials)
thetasregret = np.zeros(num_trials)
reward_i = np.zeros(num_trials)
choice_i = np.zeros(num_trials)
variants = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
payouts = [0.023, 0.03, 0.029, 0.001, 0.05, 0.06, 0.0234, 0.035, 0.01, 0.11]
num_k = len(variants)
a = np.ones(num_k)
b = np.ones(num_k)

total_reward = 0
for i in range(num_trials):
    # Choose k
    theta = np.random.beta(a, b)
    k = variants[np.argmax(theta)]
    
    # Reward (flag hit_or_miss) (In a real scenario payout is unknown and you make a choice here)
    reward = np.random.binomial(1, p=payouts[k])
    
    # Update
    a[k] += reward
    b[k] += 1 - reward # i.e. only increment b when it's a swing and a miss. 1 - 0 = 1, 1 - 1 = 0

    thetas[i] = theta[k]
    thetasregret[i] = np.max(thetas) - theta[k] 
    
    choice_i[i] = k
    reward_i[i] = reward
    total_reward += reward

In [91]:
print(a)
print(b)

[4.00e+00 3.00e+00 7.00e+00 1.00e+00 1.70e+01 1.40e+01 4.00e+00 6.00e+00
 1.00e+00 1.04e+03]
[ 104.   90.  136.   55.  221.  221.  106.  121.   58. 7811.]


In [92]:
print(f'Best Choice: {np.argmax(a/(a+b))}')

Best Choice: 9


In [93]:
# UCB1 (Use for Continuous rewards)
num_trials = 1000000
thetas = np.zeros(num_trials)
thetasregret = np.zeros(num_trials)
reward_i = np.zeros(num_trials)
choice_i = np.zeros(num_trials)
variants = [0, 1, 2]
payouts = [(55, 5), (70, 10), (70, 3)]
num_k = len(variants)
mean = np.zeros(num_k)
pulls_per_k = np.ones(num_k)

# Play each action once to get starting mean
for a in variants:
    mean[a] = np.random.normal(payouts[a][0], payouts[a][1])
    
total_reward = 0
for i in range(1, num_trials):
    # Choose k
    q = mean + np.sqrt((2*np.log(i))/pulls_per_k)
    k = variants[np.argmax(q)]
    pulls_per_k[k] += 1
    
    # Reward (In a real scenario payout is unknown and you make a choice here)
    reward = np.random.normal(payouts[k][0], payouts[k][1])
    
    # Update
    prev_mean = mean[k]
    mean[k] = prev_mean + (reward - prev_mean)/(i)

In [94]:
mean

array([63.43240845, 65.04810104, 70.00278722])

In [95]:
np.sqrt((2*np.log(i))/pulls_per_k)

array([5.25652158, 3.71692205, 0.00525652])

In [96]:
print(f'Best Choice: {np.argmax(mean)}')

Best Choice: 2


In [97]:
#UCB-BV2 (UCB but where arms have cost and you have a budget)
# Note: when payout has a quite similiar expectations, it will choose the payout with a more consistent return
budget = 10000
variants = [0, 1]
payouts = [(1, 2, 5), (1.1, 10)]
probs = [(.8, .19, .01), (.98, .02)] #E[x] = 1.23,1.278
cost = [.80, 1] # Note: this can be a continuous process but here I made it static

num_k = len(variants)
x_sum = np.zeros(num_k)
x_bar = np.zeros(num_k)
c_sum = np.zeros(num_k)
c_bar = np.zeros(num_k)
n_pulls = np.ones(num_k)
t = 0
total_reward = 0
total_cost = 0

# init by pulling each arm
for a in variants:
    t += 1
    r, c = np.random.choice(payouts[a], p=probs[a]), cost[a]
    
    x_sum[a] += r
    x_bar[a] = r
    c_sum[a] += c
    c_bar[a] = c
    
while total_cost < budget:
    t += 1
    
    # Choose k
    lam_t = min(c_bar)
    d = x_bar/c_bar + (1/lam_t)*(1+(1/(lam_t-np.sqrt(np.log(t-1)/n_pulls))))*np.sqrt(np.log(t-1)/n_pulls)
    k = variants[np.argmax(d)]
    
    # Reward
    r, c = np.random.choice(payouts[k], p=probs[k]), cost[k]
    
    # Update
    n_pulls[k] += 1
    x_bar[k] = x_sum[k]/n_pulls[k]
    x_sum[k] += r
    c_bar[k] = c_sum[k]/n_pulls[k]
    c_sum[k] += c
    total_reward += r
    total_cost += c
    

In [98]:
print(f'Best Arm: {np.argmax(x_bar)}')
print(f'Average Return per {np.argmax(x_bar)} Pull: {x_bar[np.argmax(x_bar)]}')
print(f'Average Cost per {np.argmax(x_bar)} Pull: {c_bar[np.argmax(x_bar)]}')
print(f'Total Return: {total_reward:.2f}')
print(f'Total Cost: {total_cost:.2f}')
print(f'Net: {total_reward-total_cost:.2f}')

Best Arm: 0
Average Return per 0 Pull: 1.2302152516603986
Average Cost per 0 Pull: 0.7999359846362865
Total Return: 15378.40
Total Cost: 10000.80
Net: 5377.60
