In [1]:
import numpy as np
import random

class MultiArmedBandit(object):
    '''Multi armed bandint game for RL challange.
    
    Instance of this class represents single RL setup that agent needs to 
    face. 
    
    Properties:
        n_arms (int):
            Number of arms (between 2 and 10).
        n_trials (int):
            Number of available arm pulls (between 10 and 500).
        account (float):
            Sum of all gathered and lost points. 
            
    Methods:
        pull(arm):
            Return a reward for chosen arm. Reward is cached in the instance
            and accesible as account property. Note that after performing 
            n_trials pulls, bandit gets exhausted and pull raises StopIteration
            exception.
            
    How points are distributed? Each arm is instantiated with random mean and 
    standard deviation (both drawn from uniform distribution [0, 100]). Pulled 
    arm draws random reward from normal (Gaussian distribution) with preset 
    (but unknow to user) mean and standard deviation. 
    
    Goal of the challange is to write algorithm that effectively chooses most
    beneficial arms. Best algorithm will be the one with highest total money 
    collected for 1_000_000 experiments.
    '''
    
    def __init__(self):
        self._n_arms = random.randint(2, 10)
        self._n_trials = random.randint(10, 500)
        
        self._payoff_mean = np.random.random((self._n_arms, )) * 100 
        self._payoff_std = np.random.random((self._n_arms, )) * 100
        
        self._account = 0
        self._current_trial = 0
    
    @property 
    def n_arms(self):
        return self._n_arms
    
    @property 
    def n_trials(self):
        return self._n_trials
    
    @property
    def account(self):
        return self._account 
    
    def pull(self, arm):
        if self._current_trial >= self._n_trials:
            raise StopIteration('This bandit is already exhausted.')
        else:
            reward = np.random.normal(self._payoff_mean[arm], self._payoff_std[arm])
            self._account += reward
            self._current_trial += 1
            return reward

In [2]:
''' A solution based on the standard deviation and mean value in the first few iterations '''

my_results = []
for i in range(100):
    random.seed(3)
    mab = MultiArmedBandit()
    #print(f'Number of arms: {mab.n_arms}')
    #print(f'Number of trials: {mab.n_trials}')
    #print('......................................')

    test_range = int(mab.n_trials/(10*mab.n_arms))
    from statistics import mean, stdev

    means = []
    used_trials = mab.n_arms*test_range

    for i in range(mab.n_arms):
        current_list = [mab.pull(i) for j in range(test_range)]
        if stdev(current_list) < np.mean(np.abs(current_list)):
            # Good result: stdev(current_list) < np.mean(np.abs(current_list))
            curr_mean = np.mean(np.abs(current_list))
            means.append(curr_mean)
        else:
            # Bad result, try again (stdev(current_list) < np.mean(np.abs(current_list))).
            current_list = current_list + [mab.pull(i) for j in range(test_range)]
            curr_mean = np.mean(np.abs(current_list))
            means.append(curr_mean)
            used_trials = used_trials + test_range
            
    idx = means.index(max(means))
    for _ in range(mab.n_trials-used_trials):
        mab.pull(idx)
    
    #print(f'Total points gathered = {mab.account}')
    my_results.append(mab.account)
    #print('......................................')
print(f'The average reward with my method is {np.mean(my_results)}.' )

The average reward with my method is 23421.18590065838.


In [3]:
''' Random choice of arms - for comparison '''
random_results = []
for i in range(100):
    random.seed(3)
    mab = MultiArmedBandit()
    #print(f'Number of arms: {mab.n_arms}')
    #print(f'Number of trials: {mab.n_trials}')
    #print('Random pulling...')
    for _ in range(mab.n_trials):
        mab.pull(random.randint(0, mab.n_arms - 1))

    #print(f'Total points gathered = {mab.account}')
    random_results.append(mab.account)
print(f'The average reward when pulling randomly is {np.mean(random_results)}.' )

The average reward when pulling randomly is 15823.839279220112.


In [4]:
# If all trials were used, bandit gets exhausted and StopIteration is raised
try:
    mab.pull(0)
except StopIteration as ex:
    print(ex)

This bandit is already exhausted.
