In [17]:
from __future__ import print_function, division
from builtins import range
import matplotlib.pyplot as plt
import numpy as np

# Number of rounds played in a single experiment() run.
NUM_TRIALS = 10000
# Exploration probability: chance per round of choosing a bandit at random
# instead of the one with the highest current estimate.
EPS = 0.1
# True win rate of each bandit; experiment() creates one Bandit per entry.
BANDIT_PROBABILITES = [0.2, 0.5, 0.75]

In [18]:
# Scratch cell: bare NumPy expressions, none of which is assigned to a name.
# Only the value of the last expression is shown as the cell output.

# Array of NUM_TRIALS zeros.
np.zeros(NUM_TRIALS)
# One uniform random float in [0, 1).
np.random.random() 
# Index of the first occurrence of the maximum value (3 for this list).
np.argmax([1,1,1,2,2,2,])
# Array of NUM_TRIALS ones.
np.ones(NUM_TRIALS)

array([1., 1., 1., ..., 1., 1., 1.])

In [22]:
class Bandit:
    """A single slot-machine arm with a fixed win probability.

    The instance keeps a running mean of the rewards handed to ``update``
    so that callers can compare arms by their estimated win rate.
    """

    def __init__(self, p):
        """Remember the true win rate and reset the running statistics.

        Args:
            p: the win rate, i.e. the threshold a uniform draw is compared
               against in ``pull``.
        """
        self.p = p
        # Running mean of every reward passed to update() so far.
        self.p_estimate = 0
        # Number of update() calls so far (becomes a float after the first).
        self.N = 0

    def pull(self):
        """Draw one uniform sample and report whether it falls below ``p``."""
        draw = np.random.random()
        return draw < self.p

    def update(self, x):
        """Fold the reward ``x`` into the running mean ``p_estimate``.

        Args:
            x: the reward observed for the latest pull.
        """
        self.N = self.N + 1.
        # Rebuild the previous reward total from the old mean, add the new
        # reward, then divide by the new sample count.
        previous_total = (self.N - 1) * self.p_estimate
        self.p_estimate = (previous_total + x) / self.N

In [None]:
def experiment():
    """Run an epsilon-greedy bandit simulation, then print and plot results.

    One Bandit is created per entry of BANDIT_PROBABILITES. For each of the
    NUM_TRIALS rounds the function either explores (with probability EPS,
    picking a bandit uniformly at random) or exploits (picking the bandit
    with the highest current ``p_estimate``), pulls that bandit, records the
    reward and updates the bandit's estimate.

    After all rounds it prints each bandit's estimate and the summary
    counters once, then plots the running win rate against the best true
    win rate.

    Returns:
        None. All results are printed or plotted.
    """
    bandits = [Bandit(p) for p in BANDIT_PROBABILITES]

    rewards = np.zeros(NUM_TRIALS)
    num_times_explored = 0
    num_times_exploited = 0
    num_optimal = 0

    # Index of the bandit with the highest true win rate. This must be a
    # list: np.argmax treats a generator as a single opaque object and
    # would always return 0.
    optimal_j = np.argmax([b.p for b in bandits])
    print("optimal j:", optimal_j)

    for i in range(NUM_TRIALS):

        # Use epsilon-greedy to select the next bandit
        if np.random.random() < EPS:
            num_times_explored += 1
            j = np.random.randint(len(bandits))
        else:
            num_times_exploited += 1
            j = np.argmax([b.p_estimate for b in bandits])

        if j == optimal_j:
            num_optimal += 1

        # Pull the chosen arm, log the reward, and refine that arm's estimate.
        x = bandits[j].pull()
        rewards[i] = x
        bandits[j].update(x)

    # Everything below is reported once, after the simulation has finished.
    for b in bandits:
        print("mean estimate:", b.p_estimate)

    # print total reward
    print("total reward earned:", rewards.sum())
    print("overall win rate:", rewards.sum() / NUM_TRIALS)
    print("num_times_explored:", num_times_explored)
    print("num_times_exploited:", num_times_exploited)
    print("num times selected optimal bandit:", num_optimal)

    # Running win rate after each trial: cumulative reward / trials so far.
    cumulative_rewards = np.cumsum(rewards)
    win_rates = cumulative_rewards / (np.arange(NUM_TRIALS) + 1)

    plt.plot(win_rates, label="running win rate")
    # Horizontal reference line at the best achievable win rate.
    plt.plot(np.ones(NUM_TRIALS) * np.max(BANDIT_PROBABILITES),
             label="best true win rate")
    plt.xlabel("trial")
    plt.ylabel("win rate")
    plt.legend()
    plt.show()
        
# Entry point: run the simulation only when this code executes as the main
# module, not when it is imported from elsewhere.
if __name__ == "__main__":
    experiment()