### 2.3 The 10-armed Testbed

In [72]:
import numpy as np
np.random.seed(0)

class Bandit:
    """Bandit arm having true reward value determined by normal distribution
    """
    def __init__(self, true_value):
        print("Create a bandit machine with true value: ", true_value)
        self.mean = true_value
    
    def play(self):
        return np.random.normal(self.mean, 1)

    
def update_best(history, a, best):
    new_score_a = sum(history[a])/len(history[a])
    best_score = sum(history[best[0]])/len(history[best[0]])
    if new_score_a > best_score:
        return (a, new_score_a)
    else:
        return best

# prepare bandit machines
num_actions = 10
bandits = []
mean, sigma = 0, 1
true_value = np.random.normal(mean, sigma, num_actions)
print("The best machine for this experiment is ", np.argmax(true_value))
for v in true_value:
    bandits.append(Bandit(v))

num_iteration = 1000
epsilon = 0.1
history = np.empty((10,0))
history = []
for i in range(num_actions):
    history.append([])

best = (0, -float('inf')) # maximum reward holder
for i in range(num_iteration):

    p = np.random.random()
    if p < epsilon:
        #explore
        a = np.random.randint(num_actions)
    else:
        # exploit
        a = best[0]
    
    reward = bandits[a].play()
    #print("Selected #{} and got reward {}".format(a, reward))
    history[a].append(reward)
    best = update_best(history, a, best)

print("The best machine is {} with expected reward {}".format(best[0], best[1]))

The best machine for this experiment is  3
Create a bandit machine with true value:  1.764052345967664
Create a bandit machine with true value:  0.4001572083672233
Create a bandit machine with true value:  0.9787379841057392
Create a bandit machine with true value:  2.240893199201458
Create a bandit machine with true value:  1.8675579901499675
Create a bandit machine with true value:  -0.977277879876411
Create a bandit machine with true value:  0.9500884175255894
Create a bandit machine with true value:  -0.1513572082976979
Create a bandit machine with true value:  -0.10321885179355784
Create a bandit machine with true value:  0.41059850193837233
The best machine is 3 with expected reward 1.857216013357003
