In [1]:
from dataclasses import dataclass, field
from random import choice, gauss, random, randrange
from typing import Callable, Dict, Tuple

import numpy as np
import scipy.stats as st

In [2]:
@dataclass(frozen=True)
class Normal:
    """Immutable sampler for a Gaussian distribution.

    Calling an instance draws one value from ``gauss(mu, sigma)``.
    """
    mu: float = 0  # mean handed to gauss()
    sigma: float = 1  # standard deviation handed to gauss()

    def __call__(self):
        """Draw and return a single random sample."""
        sample = gauss(self.mu, self.sigma)
        return sample


# Shared sampler with the default parameters (mu=0, sigma=1).
standard_normal = Normal()

In [3]:
@dataclass
class Bandit:
    """A multi-armed bandit: a fixed collection of reward-producing arms.

    Each arm is a zero-argument callable that returns a reward when invoked.
    """
    # Variable-length tuple of arms. The trailing ``...`` matters: a bare
    # ``Tuple[X]`` would describe a tuple holding exactly one arm.
    arms: Tuple[Callable[[], float], ...]

    def __len__(self):
        """Return the number of arms."""
        return len(self.arms)

    def actions(self):
        """Return a list of available actions (arms that can be pulled)."""
        return list(range(len(self)))

    def pull(self, arm):
        """Invoke the arm at index ``arm`` and return the reward it produces."""
        return self.arms[arm]()

    def maximum_expected_value(self):
        """The expected value of the optimum policy.

        Reads the ``mu`` attribute of every arm, so each arm must expose one
        (plain callables without ``mu`` raise ``AttributeError``). Raises
        ``ValueError`` when there are no arms.
        """
        return max(dist.mu for dist in self.arms)

In [4]:
def gaussian_bandit(k=10):
    """Build a k-armed bandit with Gaussian rewards.

    Each arm is a ``Normal`` with the default sigma whose mean is drawn once,
    at construction time, from ``standard_normal``.
    """
    arms = []
    for _ in range(k):
        mean = standard_normal()
        arms.append(Normal(mean))
    return Bandit(tuple(arms))

In [5]:
# Smoke test: build a small bandit and check its basic interface.
bandit = gaussian_bandit(3)
assert isinstance(bandit.pull(1), float)
assert bandit.actions() == list(range(3))
bandit

Bandit(arms=(Normal(mu=1.3669200597511832, sigma=1), Normal(mu=1.0150586203888627, sigma=1), Normal(mu=-1.2605233254415784, sigma=1)))

In [6]:
@dataclass
class EpsilonGreedyPolicy:
    """Epsilon-greedy agent for a bandit.

    On each pull, a draw from ``random()`` below ``epsilon`` selects a random
    arm; otherwise the arm with the highest running reward estimate is
    selected. Estimates are sample averages, updated incrementally.
    """
    epsilon: float  # exploration threshold compared against random()
    bandit: Bandit  # environment supplying actions and rewards
    expected_reward: Dict[int, float] = field(init=False)  # per-action running mean reward
    visit_counts: Dict[int, int] = field(init=False)  # per-action number of pulls
    total_reward: float = field(init=False, default=0)  # sum of every reward received

    def __post_init__(self):
        """Start every action with a zero estimate and a zero pull count."""
        available = self.bandit.actions()
        self.expected_reward = dict.fromkeys(available, 0)
        self.visit_counts = dict.fromkeys(available, 0)

    def pull(self):
        """Select one action, pull it, and record the resulting reward."""
        estimates = self.expected_reward
        # Explore with a random lever, or exploit the lever with the highest
        # expected reward (ties go to the first such key in the dict).
        action = (
            randrange(len(self.bandit))
            if random() < self.epsilon
            else max(estimates, key=estimates.get)
        )
        reward = self.bandit.pull(action)
        self.update(action, reward)
        self.total_reward += reward

    def update(self, action, reward):
        """Fold ``reward`` into the running average kept for ``action``."""
        pulls = self.visit_counts[action] + 1
        self.visit_counts[action] = pulls
        step = 1 / pulls  # a 1/n step size yields the plain sample average
        previous = self.expected_reward[action]
        self.expected_reward[action] = previous + step * (reward - previous)

    def trial(self, attempts=1000):
        """Pull ``attempts`` times and return the accumulated total reward."""
        for _attempt in range(attempts):
            self.pull()
        return self.total_reward



In [7]:
def evaluate(epsilon, attempts=1000, runs=1000, confidence=0.95):
    """Estimate the mean total reward of an epsilon-greedy policy.

    Runs ``runs`` independent trials, each against a fresh bandit from
    ``gaussian_bandit()``, and summarises the total rewards collected.

    Args:
        epsilon: Exploration parameter passed to ``EpsilonGreedyPolicy``.
        attempts: Number of pulls performed in each trial.
        runs: Number of independent trials.
        confidence: Confidence level of the reported interval.

    Returns:
        A ``(mean, half_width)`` pair: the mean total reward over all runs and
        half the width of the Student-t confidence interval around that mean.
    """
    returns = []
    for _ in range(runs):
        policy = EpsilonGreedyPolicy(epsilon, bandit=gaussian_bandit())
        policy.trial(attempts)
        returns.append(policy.total_reward)
    # Computed once and reused for both the interval centre and the result.
    mean_return = np.mean(returns)
    # The confidence level is passed positionally because SciPy renamed the
    # keyword for this argument (``alpha`` -> ``confidence``) between releases.
    ci_low, ci_high = st.t.interval(confidence, runs - 1, loc=mean_return, scale=st.sem(returns))
    ci_size = (ci_high - ci_low) / 2
    return mean_return, ci_size

In [8]:
# Coarse sweep over exploration rates, from always-random (1) to purely greedy (0).
performance = {epsilon: evaluate(epsilon) for epsilon in (1, 0.3, 0.1, 0.03, 0.01, 0)}
performance

{1: (-13.124761493384547, 20.469862193861296),
 0.3: (1051.1372979983241, 28.965340322606608),
 0.1: (1314.9651299899601, 33.06312912690828),
 0.03: (1316.2716726101148, 35.37608767592974),
 0.01: (1152.827637286691, 36.45293734946176),
 0: (1033.2359387480142, 36.41430901717604)}

In [9]:
# Finer sweep: 8 evenly spaced exploration rates in [0.03, 0.1], with 10000 runs each.
# Note that this rebinds `performance`, replacing the coarse-sweep results.
performance = {epsilon: evaluate(epsilon, runs=10000) for epsilon in np.linspace(0.03, 0.1, num=8)}
performance

NameError: name 'np' is not defined