# Reinforcement learning: Bandits, greediness and optimism

In [5]:
import numpy as np

class Bandit:
    def __init__(self, mean, var=1):
        self.mean = mean
        self.var = var
        
    def draw(self):
        return np.random.normal(loc=self.mean, scale=np.sqrt(self.var))
    
    
bandit0 = Bandit(0)
bandit1 = Bandit(1)
bandit2 = Bandit(2)

bandits = [bandit0, bandit1, bandit2]

## Greedy: totally, epsilon, optimistic starting values

In [6]:
def play_greedy(bandits, n_times, epsilon=0, start_expectation=0, rate=1):
    n_bandits = len(bandits)
    expectations = [start_expectation] * n_bandits
    values = {i: [] for i in range(n_bandits)}
    if start_expectation > 0:
        values = {i: [(-1, start_expectation, True)] for i in range(n_bandits)}
    for i in range(n_times):
        exploitation = True
        if np.random.uniform() > epsilon:
            max_expectation = max(expectations)
            max_elements_inds = [ind for ind, expectation in enumerate(expectations) if expectation == max_expectation]
            ind = np.random.choice(max_elements_inds)
        else:
            exploitation = False
            ind = np.random.choice(range(n_bandits))
        drawn_value = bandits[ind].draw()
        values[ind].append((i, drawn_value, exploitation))
        expectations[ind] = np.mean([v[1] for v in values[ind]])
        epsilon *= rate
    return values, expectations

In [7]:
values, expectations = play_greedy(bandits, 100, epsilon=.1)

In [8]:
values

{0: [(0, 0.518368678718732, True),
  (1, 0.4179136773165176, True),
  (2, -1.5155707150686213, True),
  (53, -1.742410132844397, False)],
 1: [(3, 0.7133951276241697, True),
  (4, 0.9789391781996101, True),
  (5, -0.10155577483718559, True),
  (6, 0.5610075558267564, True),
  (7, 2.5538749038837807, True),
  (8, 2.2804775806389563, True),
  (9, 0.315853177640287, True),
  (10, 1.6899454084078076, True),
  (11, 0.06355614566200218, True),
  (12, 0.19635567447333724, True),
  (13, 0.04290867061069459, True),
  (47, 0.5694019803448793, False),
  (49, 0.4231884148158679, False),
  (55, 1.9394613219485746, False),
  (61, 1.5654901805492558, False),
  (91, 0.8246961292577035, False)],
 2: [(14, 1.5840742361733915, False),
  (15, 1.7527076248644091, True),
  (16, 0.15720672780640843, True),
  (17, 4.114251304381272, True),
  (18, 0.5328633961093829, True),
  (19, 2.8279929399129493, True),
  (20, 0.9833882083443748, True),
  (21, 2.5687132538793582, True),
  (22, 2.770342685474115, True),
  (

In [34]:
from copy import deepcopy


def get_ucb(estimate, n_total, n_pulled):
    return estimate + np.sqrt(2 * np.log(n_total) / n_pulled)


def get_new_estimate(estimate, n_pulled, pulled_value):
    return (estimate * n_pulled + pulled_value) / (n_pulled + 1)


def play_ucb(bandits, n_times):
    # Start with playing each bandit once
    pulled_values, results = [], []
    for bandit in bandits:
        pulled_value = bandit.draw()
        pulled_values.append(pulled_value)
    N = len(bandits)
    res = {i: {'n_pulled': 1, 'estimated_mean': pulled_values[i]} for i in range(len(bandits))}
    results.append(deepcopy(res))
    for i in range(n_times - len(bandits)):
        ucbs = np.array([get_ucb(v['estimated_mean'], N, v['n_pulled']) for v in res.values()])
        print(f'DEBUG: round {i}')
        print(f'DEBUG: ucbs: {ucbs}')
        selected_bandit = np.argmax(ucbs)
        print(f'DEBUG: selected_bandit: {selected_bandit}')
        pulled_value = bandits[selected_bandit].draw()
        new_estimate = get_new_estimate(res[selected_bandit]['estimated_mean'], res[selected_bandit]['n_pulled'],  pulled_value)
        res[selected_bandit]['estimated_mean'] = new_estimate
        res[selected_bandit]['n_pulled'] += 1
        results.append(deepcopy(res))
        print(f'res: {res}')
        N += 1
    return results

In [35]:
results_ucb = play_ucb(bandits, 100)
print(results_ucb)

DEBUG: round 0
DEBUG: ucbs: [1.66593878 2.1507083  4.15282311]
DEBUG: selected_bandit: 2
res: {0: {'n_pulled': 1, 'estimated_mean': 0.1836349729126025}, 1: {'n_pulled': 1, 'estimated_mean': 0.6684044908869098}, 2: {'n_pulled': 2, 'estimated_mean': 2.0433030149753435}}
DEBUG: round 1
DEBUG: ucbs: [1.8487442  2.33351371 3.22071304]
DEBUG: selected_bandit: 2
res: {0: {'n_pulled': 1, 'estimated_mean': 0.1836349729126025}, 1: {'n_pulled': 1, 'estimated_mean': 0.6684044908869098}, 2: {'n_pulled': 3, 'estimated_mean': 2.16456111449142}}
DEBUG: round 2
DEBUG: ucbs: [1.97775755 2.46252707 3.20039827]
DEBUG: selected_bandit: 2
res: {0: {'n_pulled': 1, 'estimated_mean': 0.1836349729126025}, 1: {'n_pulled': 1, 'estimated_mean': 0.6684044908869098}, 2: {'n_pulled': 4, 'estimated_mean': 1.9129539708886703}}
DEBUG: round 3
DEBUG: ucbs: [2.07665345 2.56142296 2.85946321]
DEBUG: selected_bandit: 2
res: {0: {'n_pulled': 1, 'estimated_mean': 0.1836349729126025}, 1: {'n_pulled': 1, 'estimated_mean': 0.668

In [29]:
labels = list(results_ucb.keys())
men_means = [20, 34, 30, 35, 27]
women_means = [25, 32, 34, 20, 25]

x = np.arange(len(labels))  # the label locations
width = 0.35  # the width of the bars

fig, ax = plt.subplots()
rects1 = ax.bar(x - width/2, men_means, width, label='Men')
rects2 = ax.bar(x + width/2, women_means, width, label='Women')

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Scores')
ax.set_title('Scores by group and gender')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()


def autolabel(rects):
    """Attach a text label above each bar in *rects*, displaying its height."""
    for rect in rects:
        height = rect.get_height()
        ax.annotate('{}'.format(height),
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')


autolabel(rects1)
autolabel(rects2)

fig.tight_layout()

plt.show()

4.666666666666667