In [1]:
#####
# Bandits computational example -- exploration settings
#
# eps1 and eps2 are the two exploration probabilities that the epsilon-greedy
# and epsilon-decreasing runs further down in this file are compared under.
eps1, eps2 = 0.1, 0.5

# First we define our bandits
###


0.5

In [2]:
"""
    eps_greedy(eps, revenue)

Run an epsilon-greedy bandit policy over the periods (columns) of `revenue`.

# Arguments
- `eps`: exploration probability. In each period a uniformly random arm is pulled
  when `rand() <= eps`; otherwise the arm with the best average reward so far is pulled.
- `revenue`: matrix whose entry `[arm, t]` is the reward collected when `arm` is pulled
  in period `t`. The number of arms and the horizon are taken from its size.

# Returns
A tuple `(total, record_pulls)`: the sum of all collected rewards and the vector of
arm indices pulled, one per period.
"""
function eps_greedy(eps, revenue)
    n_arms, horizon = size(revenue)
    counts = zeros(n_arms)   # Number of times each assortment is picked so far
    rewards = zeros(n_arms)  # Rewards from each assortment so far
    record_pulls = Int[]     # Which decisions were made

    for t in 1:horizon
        if rand() <= eps
            next_arm = rand(1:n_arms)
        else
            # An arm that was never pulled has no average yet (0/0 would be NaN).
            # Rank it as Inf so every arm is tried once before we start exploiting;
            # findmax returns the first maximal entry, so ties go to the lowest index.
            averages = [counts[a] == 0 ? Inf : rewards[a] / counts[a] for a in 1:n_arms]
            next_arm = findmax(averages)[2]
        end
        counts[next_arm] += 1
        rewards[next_arm] += revenue[next_arm, t]
        push!(record_pulls, next_arm)
    end
    return sum(rewards), record_pulls
end

###
"""
    eps_decreasing(eps, revenue)

Run an epsilon-greedy bandit policy whose exploration probability decays over time.

# Arguments
- `eps`: initial exploration probability. In each period a uniformly random arm is
  pulled when `rand() <= eps`; otherwise the arm with the best average reward so far
  is pulled. After every period `eps` is multiplied by 0.99.
- `revenue`: matrix whose entry `[arm, t]` is the reward collected when `arm` is pulled
  in period `t`. The number of arms and the horizon are taken from its size.

# Returns
A tuple `(total, record_pulls)`: the sum of all collected rewards and the vector of
arm indices pulled, one per period.
"""
function eps_decreasing(eps, revenue)
    n_arms, horizon = size(revenue)
    counts = zeros(n_arms)   # Number of times each assortment is picked so far
    rewards = zeros(n_arms)  # Rewards from each assortment so far
    record_pulls = Int[]     # Which decisions were made

    for t in 1:horizon
        if rand() <= eps
            next_arm = rand(1:n_arms)
        else
            # An arm that was never pulled has no average yet (0/0 would be NaN).
            # Rank it as Inf so every arm is tried once before we start exploiting;
            # findmax returns the first maximal entry, so ties go to the lowest index.
            averages = [counts[a] == 0 ? Inf : rewards[a] / counts[a] for a in 1:n_arms]
            next_arm = findmax(averages)[2]
        end
        counts[next_arm] += 1
        rewards[next_arm] += revenue[next_arm, t]
        push!(record_pulls, next_arm)
        eps *= 0.99 # Epsilon decreases by 1% each time
    end
    return sum(rewards), record_pulls
end

###
"""
    thompson(estimate, prev_weeks, revenue; sigma=100.0)

Run Thompson sampling with a normal prior on each arm's mean reward.

# Arguments
- `estimate`: prior mean reward of each arm (a vector, or a one-column matrix).
- `prev_weeks`: number of past observations the prior mean is worth; it expresses how
  sure we are of `estimate`. Must be positive.
- `revenue`: matrix whose entry `[arm, t]` is the reward collected when `arm` is pulled
  in period `t`. The number of arms and the horizon are taken from its size.

# Keywords
- `sigma`: standard deviation of a single reward observation (default `100.0`).

# Returns
A tuple `(total, record_pulls)`: the sum of all collected rewards and the vector of
arm indices pulled, one per period.

# Throws
- `ArgumentError` if `prev_weeks` is not positive (the posterior would be undefined
  for arms that have not been pulled yet).
"""
function thompson(estimate, prev_weeks, revenue; sigma=100.0)
    prev_weeks > 0 ||
        throw(ArgumentError("prev_weeks must be positive, got $prev_weeks"))

    n_arms, horizon = size(revenue)
    # Flatten so that a one-column matrix estimate still yields a plain integer arm index.
    prior_mean = vec(estimate)
    counts = zeros(n_arms)
    rewards = zeros(n_arms)
    record_pulls = Int[] # Which decisions were made

    for t in 1:horizon
        # Posterior expression here: https://en.wikipedia.org/wiki/Conjugate_prior
        # (normal likelihood with known variance sigma^2; the prior counts as
        # prev_weeks observations). With n = prev_weeks + counts effective observations:
        #   posterior mean = (prev_weeks * estimate + rewards) / n
        #   posterior std  = sigma / sqrt(n)
        n_obs = prev_weeks .+ counts
        post_mean = (prior_mean .* prev_weeks .+ rewards) ./ n_obs
        post_std = sigma ./ sqrt.(n_obs)
        sampled_parameters = post_mean .+ post_std .* randn(n_arms)
        # Since for a normal distribution, the expected reward is just the value of
        # the mean parameter
        next_arm = findmax(sampled_parameters)[2]
        counts[next_arm] += 1
        rewards[next_arm] += revenue[next_arm, t]
        push!(record_pulls, next_arm)
    end
    return sum(rewards), record_pulls
end

###



thompson (generic function with 1 method)

In [3]:
# Running revenue totals, one slot per algorithm configuration, summed over scenarios
total_revenue = fill(0.0, 6);


6-element Array{Float64,1}:
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0

In [4]:
# Simulate 100 scenarios and add each algorithm's total revenue to total_revenue.
# NOTE(review): srand and mean(A, 2) are the pre-1.0 Julia spellings this file is
# written against; confirm the target Julia version before changing them.
for scenario in 1:100
    # True underlying data, generated only to compare the algorithms. The decision
    # maker never observes these values when making decisions.
    srand(scenario)
    mu = 100 * randn(5) .+ 500            # Mean revenue of each arm
    revenue = 100 * randn(5, 52) .+ mu    # Normal revenue around normally distributed means

    estimate1 = 100 * randn(5, 1) .+ mu           # One-week estimate
    estimate2 = mean(100 * randn(5, 12) .+ mu, 2) # 12-week estimate

    # Each entry is a (score, record_pulls) pair, in the same order as the slots of
    # total_revenue. To see which arms an algorithm pulled, look at its record,
    # e.g. runs[1][2] for epsilon-greedy with eps1.
    runs = (
        eps_greedy(eps1, revenue),
        eps_greedy(eps2, revenue),
        eps_decreasing(eps1, revenue),
        eps_decreasing(eps2, revenue),
        thompson(estimate1, 1, revenue),
        thompson(estimate2, 12, revenue)
    )

    total_revenue += [score for (score, record) in runs]
end



In [None]:
# Report each configuration's revenue averaged over the 100 simulated scenarios.
# The labels are listed in the same order as the slots of total_revenue.
for (slot, label) in enumerate((
    "Epsilon-greedy, epsilon = $eps1: Average revenue is ",
    "Epsilon-greedy, epsilon = $eps2: Average revenue is ",
    "Epsilon-decreasing, epsilon = $eps1: Average revenue is ",
    "Epsilon-decreasing, epsilon = $eps2: Average revenue is ",
    "Thompson sampling, one-week estimate to obtain prior: Average revenue is ",
    "Thompson sampling, twelve-week estimate to obtain prior: Average revenue is "
))
    println(label, total_revenue[slot]/100 )
end