# Multi-armed Bandits

The value $q_*(a)$ of an action $a$ is the expected reward $R_t$ given that $a$ is the action selected at time step $t$:

\begin{align}
q_*(a) := \mathbb{E}\left[R_t \mid A_t=a \right]
\end{align}

The estimated action-value function at time step $t$ is defined as the sample average
\begin{align}
Q_t(a) := \frac{\text{sum of rewards when $a$ taken prior to $t$}}{\text{number of times $a$ taken prior to $t$}}
\end{align}

### Multi-Armed Bandits as DM Environment

In [6]:
# Notebook shell command: install the packages imported below (dm_env, numpy)
# into the per-user site-packages directory (--user).
!pip install --user dm_env numpy

In [22]:
import dm_env
from dm_env import specs
import numpy as np
from dataclasses import dataclass

@dataclass
class BaditArm:
    """A single bandit arm that pays out normally distributed rewards.

    Calling the instance draws one sample from the arm's random source.
    """

    # Location (centre) of the normal reward distribution.
    mean: float
    # Scale (standard deviation) of the normal reward distribution.
    sigma: float
    # Random state the samples are drawn from.
    rng: np.random.RandomState

    def __call__(self):
        """Return one reward sampled from Normal(mean, sigma) using ``rng``."""
        sampler = self.rng
        return sampler.normal(loc=self.mean, scale=self.sigma)

class MultiArmedBandit(dm_env.Environment):
    """A k-armed bandit with Gaussian rewards, exposed as a ``dm_env`` environment.

    The environment has no meaningful state: every observation is the constant
    array ``np.zeros(1)``. An action is the index of the arm to pull, and the
    reward is one sample from that arm's normal distribution.
    """

    def __init__(self, k: int = 10, seed: int = 1):
        """Create ``k`` arms with randomly drawn means and standard deviations.

        Args:
            k: Number of arms.
            seed: Seed for the random state that both draws the arm parameters
                and is shared by all arms for reward sampling.
        """
        rng = np.random.RandomState(seed)
        # Each row (a, b) is uniform on [0, 1): `a` is rescaled to a mean in
        # [-2, 2) and `b` is used directly as the arm's standard deviation.
        self._arms = [
            BaditArm(mean=a * 4 - 2.0, sigma=b, rng=rng)
            for a, b in rng.random_sample((k, 2))
        ]

    def reset(self) -> dm_env.TimeStep:
        """Start a new episode and return the first ``TimeStep``."""
        return dm_env.restart(np.zeros(1))

    def step(self, action: int) -> dm_env.TimeStep:
        """Pull arm ``action`` and return the resulting ``TimeStep``.

        Args:
            action: Index of the arm to pull.

        Returns:
            A mid-episode ``TimeStep`` whose ``reward`` is the sampled payout
            and whose ``observation`` is the constant ``np.zeros(1)``.

        Raises:
            IndexError: If ``action`` is outside the range of available arms.
        """
        reward = self._arms[action]()
        # Wrap the sampled reward in a TimeStep so the method honours its
        # declared return type and the dm_env.Environment contract.
        return dm_env.transition(reward=reward, observation=np.zeros(1))

    def observation_spec(self) -> specs.BoundedArray:
        """Describe the observation array returned in every ``TimeStep``."""
        return specs.BoundedArray(
            shape=(1,),
            # np.float_ was removed in NumPy 2.0; np.float64 is the same type.
            dtype=np.float64,
            name="multi-armed-bandit",
            minimum=0,
            maximum=len(self._arms) - 1,
        )

    def action_spec(self) -> specs.DiscreteArray:
        """Describe the valid actions: one integer index per arm."""
        return specs.DiscreteArray(
            dtype=int, num_values=len(self._arms), name="action"
        )



In [23]:
# Build a 4-armed bandit; the seed is left at its default, so the arm
# parameters are the same on every run.
multi_armed_bandit = MultiArmedBandit(4)