In [1]:
import numpy as np
import random


class MultiArmedBandit(object):
    '''Multi-armed bandit game for the RL challenge.

    Every instance is one independent RL setup that an agent has to play.

    Properties:
        n_arms (int):
            Number of arms (between 2 and 10).
        n_trials (int):
            Number of available arm pulls (between 10 and 500).
        account (float):
            Running sum of every reward returned by pull, gains and losses
            alike.

    Methods:
        pull(arm):
            Draw and return a reward for the chosen arm. The reward is also
            added to the total exposed through the account property. Once
            n_trials pulls have been made the bandit is exhausted and pull
            raises StopIteration.

    How are points distributed? At construction each arm receives its own
    mean and standard deviation, both drawn uniformly from [0, 100). Pulling
    an arm draws a reward from the normal (Gaussian) distribution with that
    arm's preset mean and standard deviation, which are not exposed to the
    user.

    The goal of the challenge is to write an algorithm that effectively picks
    the most beneficial arms. The best algorithm is the one with the highest
    total amount collected over 1_000_000 experiments.
    '''

    def __init__(self):
        # Game size comes from the stdlib RNG, payoff parameters from NumPy's
        # global RNG; the draw order is kept fixed so seeded runs reproduce.
        arm_count = random.randint(2, 10)
        trial_count = random.randint(10, 500)
        self._n_arms = arm_count
        self._n_trials = trial_count

        # One (mean, std) pair per arm, each scaled from [0, 1) up to [0, 100).
        self._payoff_mean = 100 * np.random.random(arm_count)
        self._payoff_std = 100 * np.random.random(arm_count)

        self._current_trial = 0
        self._account = 0

    @property
    def n_arms(self):
        '''Number of arms this bandit offers (read-only).'''
        return self._n_arms

    @property
    def n_trials(self):
        '''Total number of pulls this bandit allows (read-only).'''
        return self._n_trials

    @property
    def account(self):
        '''Sum of all rewards handed out so far (read-only).'''
        return self._account

    def pull(self, arm):
        '''Pull one arm and return the reward it pays out.

        Args:
            arm: Index of the arm to pull; used to index the per-arm
                mean/std arrays.

        Returns:
            The reward sampled from the arm's normal distribution.

        Raises:
            StopIteration: If all n_trials pulls have already been used.
        '''
        if self._current_trial < self._n_trials:
            payout = np.random.normal(self._payoff_mean[arm], self._payoff_std[arm])
            # State is only touched after the draw succeeded.
            self._current_trial += 1
            self._account += payout
            return payout
        raise StopIteration('this bandit is already exhausted')

In [2]:
# Explore-then-commit strategy, evaluated on 100 bandits: sample every arm a
# few times, re-sample the arms whose rewards look noisy, then spend all the
# remaining pulls on the arm with the best estimated mean reward.
from statistics import mean, stdev  # hoisted: used to be re-imported on every pass

my_results = []
for _experiment in range(100):
    # NOTE(review): reseeding `random` on every pass pins n_arms/n_trials to the
    # same values each time, while the payoff parameters come from np.random,
    # which this cell never seeds. Kept as in the original - confirm intended.
    random.seed(3)
    mab = MultiArmedBandit()
    print(f'Number of arms: {mab.n_arms}')
    print(f'Number of trials: {mab.n_trials}')
    print('......................................')

    # Pulls per arm in one exploration batch: roughly a tenth of the budget
    # split evenly across the arms. Clamped to at least 1 so every arm gets an
    # estimate; the unclamped value could be 0 for small budgets, which left
    # the sample lists empty and made stdev() raise StatisticsError.
    test_range = max(1, mab.n_trials // (10 * mab.n_arms))
    print(test_range)

    means = []
    used_trials = 0

    for arm in range(mab.n_arms):
        samples = [mab.pull(arm) for _ in range(test_range)]
        used_trials += test_range

        # The estimate counts as noisy when the sample spread is not smaller
        # than the average reward magnitude. stdev() needs at least two data
        # points, so a single-sample batch is treated as noisy as well.
        noisy = len(samples) < 2 or stdev(samples) >= np.mean(np.abs(samples))

        # A second batch is only affordable if the arms that have not been
        # sampled yet can still get their first batch afterwards.
        reserved = (mab.n_arms - arm - 1) * test_range
        affordable = mab.n_trials - used_trials - reserved >= test_range

        if noisy and affordable:
            samples += [mab.pull(arm) for _ in range(test_range)]
            used_trials += test_range

        # Rank arms by their signed mean reward: the account adds up signed
        # rewards, and averaging absolute values would favour high-variance
        # arms whose true mean is low.
        means.append(np.mean(samples))

    # Commit: every pull that is left goes to the best-looking arm.
    idx = means.index(max(means))
    for _ in range(mab.n_trials - used_trials):
        mab.pull(idx)

    print(f'Total points gathered = {mab.account}')
    my_results.append(mab.account)
    print('......................................')
print(np.mean(my_results))

Number of arms: 5
Number of trials: 313
......................................
6
Total points gathered = 23620.515435071306
......................................
Number of arms: 5
Number of trials: 313
......................................
6
Total points gathered = 24452.205267143214
......................................
Number of arms: 5
Number of trials: 313
......................................
6
Total points gathered = 18381.023000848378
......................................
Number of arms: 5
Number of trials: 313
......................................
6
Total points gathered = 3821.880913105997
......................................
Number of arms: 5
Number of trials: 313
......................................
6
Total points gathered = 22737.790853222534
......................................
Number of arms: 5
Number of trials: 313
......................................
6
Total points gathered = 25457.142952693288
......................................
Number of arms: 5
Numbe

Total points gathered = 20549.73214015871
......................................
Number of arms: 5
Number of trials: 313
......................................
6
Total points gathered = 27739.28402354945
......................................
Number of arms: 5
Number of trials: 313
......................................
6
Total points gathered = 12646.788450715112
......................................
Number of arms: 5
Number of trials: 313
......................................
6
Total points gathered = 22691.23196896858
......................................
Number of arms: 5
Number of trials: 313
......................................
6
Total points gathered = 23373.01844845641
......................................
Number of arms: 5
Number of trials: 313
......................................
6
Total points gathered = 18383.993690815245
......................................
Number of arms: 5
Number of trials: 313
......................................
6
Total points gathered = 28

In [4]:
# Baseline for comparison: on 100 bandits, spend every available pull on an
# arm picked uniformly at random, and average the final accounts.
random_results = []
for run in range(100):
    # The stdlib RNG is reseeded on each pass; it drives both the bandit size
    # and the arm choices made below.
    random.seed(3)
    mab = MultiArmedBandit()
    print(f'Number of arms: {mab.n_arms}')
    print(f'Number of trials: {mab.n_trials}')
    print('Random pulling...')

    # Use up the whole pull budget, one uniformly chosen arm at a time.
    pulls_left = mab.n_trials
    while pulls_left > 0:
        mab.pull(random.randrange(mab.n_arms))
        pulls_left -= 1

    print(f'Total points gathered = {mab.account}')
    random_results.append(mab.account)
    print('......................................')
print(np.mean(random_results))

Number of arms: 5
Number of trials: 313
Random pulling...
Total points gathered = 16656.277724055803
......................................
Number of arms: 5
Number of trials: 313
Random pulling...
Total points gathered = 8324.15435598068
......................................
Number of arms: 5
Number of trials: 313
Random pulling...
Total points gathered = 19252.44527206623
......................................
Number of arms: 5
Number of trials: 313
Random pulling...
Total points gathered = 20420.331528330455
......................................
Number of arms: 5
Number of trials: 313
Random pulling...
Total points gathered = 13649.602704278781
......................................
Number of arms: 5
Number of trials: 313
Random pulling...
Total points gathered = 13945.122524414392
......................................
Number of arms: 5
Number of trials: 313
Random pulling...
Total points gathered = 10606.316989876474
......................................
Number of arms: 5
Numbe

Total points gathered = 18214.322167059545
......................................
Number of arms: 5
Number of trials: 313
Random pulling...
Total points gathered = 16010.601235373728
......................................
Number of arms: 5
Number of trials: 313
Random pulling...
Total points gathered = 12390.825088597432
......................................
Number of arms: 5
Number of trials: 313
Random pulling...
Total points gathered = 15378.187311696729
......................................
Number of arms: 5
Number of trials: 313
Random pulling...
Total points gathered = 6124.550200692226
......................................
Number of arms: 5
Number of trials: 313
Random pulling...
Total points gathered = 13707.515820414374
......................................
Number of arms: 5
Number of trials: 313
Random pulling...
Total points gathered = 19701.492713575622
......................................
Number of arms: 5
Number of trials: 313
Random pulling...
Total points gathered

In [5]:
# Once every available pull has been spent the bandit is exhausted: any
# further pull raises StopIteration, whose message is caught and shown here.
try:
    mab.pull(0)
except StopIteration as exhausted:
    print(exhausted)

this bandit is already exhausted
