In [1]:
import numpy
from bandit import *

In [2]:
# Number of arms of the bandit. Author's note: in general (as expected), the
# average value/step we predict increases with n.
n = 10

# The run length is stepsize * epochs = 50 * 10, for a total of 500 steps.
stepsize = 50
epochs = 10

# Number of samples the reported figures are averaged over (2000 in the text).
samples = 1000

In [3]:
# Test the eps-greedy bandit once per epsilon, from 0 up to 1.0, with
# learn_type='sample-average'. `epsilons` is reused by later cells.
epsilons = [0, 0.01, 0.05, 0.1, 0.2, 0.5, 0.7, 1.0]
for eps in epsilons:
    eps_greedy_bandit.test(
        n,
        stepsize=stepsize,
        samples=samples,
        epochs=epochs,
        verbose=0,
        eps=eps,
        learn_type='sample-average',
    )

average (over samples) optimal value/step: 1.527395
epsilon: 0
After 500 steps, we (on average) predict a value/step of 1.033509, (67.7% optimal)
average (over samples) optimal value/step: 1.550446
epsilon: 0.01
After 500 steps, we (on average) predict a value/step of 1.226887, (79.1% optimal)
average (over samples) optimal value/step: 1.569618
epsilon: 0.05
After 500 steps, we (on average) predict a value/step of 1.414762, (90.1% optimal)
average (over samples) optimal value/step: 1.537605
epsilon: 0.1
After 500 steps, we (on average) predict a value/step of 1.341445, (87.2% optimal)
average (over samples) optimal value/step: 1.571179
epsilon: 0.2
After 500 steps, we (on average) predict a value/step of 1.239833, (78.9% optimal)
average (over samples) optimal value/step: 1.519340
epsilon: 0.5
After 500 steps, we (on average) predict a value/step of 0.759143, (50.0% optimal)
average (over samples) optimal value/step: 1.535226
epsilon: 0.7
After 500 steps, we (on average) predict a valu

In [4]:
# Re-test the better-performing bandits (the four smallest epsilons:
# 0, 0.01, 0.05 and 0.1) with optimistic initial values switched on.
# Value estimates still use learn_type='sample-average'.
for eps in epsilons[:4]:
    eps_greedy_bandit.test(
        n,
        stepsize=stepsize,
        samples=samples,
        epochs=epochs,
        verbose=0,
        eps=eps,
        learn_type='sample-average',
        optimistic=True,
    )

# observation: once optimistic initial values are used, it is no longer useful to keep choosing actions probabilistically (eps > 0); an (almost) fully greedy approach works best in this case.

average (over samples) optimal value/step: 1.576145
epsilon: 0
After 500 steps, we (on average) predict a value/step of 1.466252, (93.0% optimal)
average (over samples) optimal value/step: 1.527741
epsilon: 0.01
After 500 steps, we (on average) predict a value/step of 1.420591, (93.0% optimal)
average (over samples) optimal value/step: 1.527894
epsilon: 0.05
After 500 steps, we (on average) predict a value/step of 1.393965, (91.2% optimal)
average (over samples) optimal value/step: 1.578729
epsilon: 0.1
After 500 steps, we (on average) predict a value/step of 1.389156, (88.0% optimal)


In [5]:
# Re-test the better-performing bandits (the four smallest epsilons) with
# optimistic initial values, this time with learn_type='constant-rate' and a
# constant learning rate alpha of 0.1.
for eps in epsilons[:4]:
    eps_greedy_bandit.test(
        n,
        stepsize=stepsize,
        samples=samples,
        epochs=epochs,
        verbose=0,
        eps=eps,
        learn_type='constant-rate',
        alpha=0.1,
        optimistic=True,
    )

# observation: again, an all-out greedy approach is best.

average (over samples) optimal value/step: 1.532612
epsilon: 0
After 500 steps, we (on average) predict a value/step of 1.498202, (97.8% optimal)
average (over samples) optimal value/step: 1.524053
epsilon: 0.01
After 500 steps, we (on average) predict a value/step of 1.478753, (97.0% optimal)
average (over samples) optimal value/step: 1.545518
epsilon: 0.05
After 500 steps, we (on average) predict a value/step of 1.479365, (95.7% optimal)
average (over samples) optimal value/step: 1.513763
epsilon: 0.1
After 500 steps, we (on average) predict a value/step of 1.405776, (92.9% optimal)


In [7]:
# Test the UCB bandit once per value of c, with the sample mean as the
# learning type (learn_type='sample-average'). `cs` is reused by a later cell.
cs = [0.3, 0.5, 0.8, 1, 1.2]
for c in cs:
    # `optimistic` is deliberately not passed: we don't need it when we're
    # using a UCB bandit.
    ucb_bandit.test(
        n,
        stepsize=stepsize,
        samples=samples,
        epochs=epochs,
        verbose=0,
        learn_type='sample-average',
        c=c,
    )

# observation: this takes much longer to complete because of the increased calculation (more than double), but the accuracy is better than everything else.
# We are able to match the shape of the curve given in the text.

average (over samples) optimal value/step: 1.546293
c: 0.3
After 500 steps, we (on average) predict a value/step of 1.480778, (95.8% optimal)
average (over samples) optimal value/step: 1.530956
c: 0.5
After 500 steps, we (on average) predict a value/step of 1.505425, (98.3% optimal)
average (over samples) optimal value/step: 1.561422
c: 0.8
After 500 steps, we (on average) predict a value/step of 1.546213, (99.0% optimal)
average (over samples) optimal value/step: 1.549962
c: 1
After 500 steps, we (on average) predict a value/step of 1.525886, (98.4% optimal)
average (over samples) optimal value/step: 1.554072
c: 1.2
After 500 steps, we (on average) predict a value/step of 1.524412, (98.1% optimal)


In [8]:
# Test the UCB bandit over the same values of c (the `cs` list from the
# earlier UCB cell), with learn_type='constant-rate' and alpha = 0.1.
for c in cs:
    # `optimistic` is deliberately not passed: we don't need it when we're
    # using a UCB bandit.
    ucb_bandit.test(
        n,
        stepsize=stepsize,
        samples=samples,
        epochs=epochs,
        verbose=0,
        learn_type='constant-rate',
        alpha=0.1,
        c=c,
    )

# observation: not as good as the sample-average UCB, but still pretty good. The curve has the same shape as that of the sample-average UCB.

average (over samples) optimal value/step: 1.525328
c: 0.3
After 500 steps, we (on average) predict a value/step of 1.462743, (95.9% optimal)
average (over samples) optimal value/step: 1.572760
c: 0.5
After 500 steps, we (on average) predict a value/step of 1.518459, (96.5% optimal)
average (over samples) optimal value/step: 1.522488
c: 0.8
After 500 steps, we (on average) predict a value/step of 1.432741, (94.1% optimal)
average (over samples) optimal value/step: 1.524126
c: 1
After 500 steps, we (on average) predict a value/step of 1.416996, (93.0% optimal)
average (over samples) optimal value/step: 1.544558
c: 1.2
After 500 steps, we (on average) predict a value/step of 1.415037, (91.6% optimal)
