In [None]:
import numpy as np
from scipy.special import softmax
from numpy.typing import NDArray
import matplotlib.pyplot as plt

In [None]:
class KArmedGaussianBandit():
    def __init__(self, k, mu : NDArray | None = None, std : NDArray | None = None):
        self.k = k
        self.mu = mu if mu is not None else np.zeros(k)
        self.std = std if std is not None else np.ones(k)

    def pull(self, arm):
        return np.random.normal(self.mu[arm], self.std[arm])

class BanditAgent():
    def __init__(
            self,
            bandit : KArmedGaussianBandit,
            q_initial : NDArray | None = None,
            strategy : str = "epsilon-greedy",
            epsilon : float = 0.1
        ):
        self.bandit = bandit
        self.n = np.zeros(bandit.k)
        self.strategy = strategy
        self.epsilon = epsilon
        self.total_reward = 0.0
        if q_initial is not None:
            if len(q_initial) == bandit.k:
                self.q_estimates = q_initial
            else:
                raise ValueError(f"Expected q_initial of length {bandit.k}, got {len(q_initial)}")
        else:
            self.q_estimates = np.zeros(bandit.k)

    def play_episode(self):
        k = self.select_arm()
        reward = self.bandit.pull(k)
        self.total_reward += reward

        self.update_estimates(k, reward)

    def select_arm(self):
        if self.strategy == "epsilon-greedy":
            return self._select_arm_epsilon_greedy()
        elif self.strategy == "ucb":
            return self._select_arm_ucb()
        else:
            raise ValueError(f"Unknown strategy: {self.strategy}")

    def _select_arm_epsilon_greedy(self):
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.bandit.k)
        else:
            return np.argmax(self.q_estimates).item()

    def _select_arm_ucb(self):
        total_counts = self.n.sum()
        if total_counts == 0:
            return np.random.choice(self.bandit.k)
        ucb_values = self.q_estimates + np.sqrt(2 * np.log(total_counts) / (self.n + 1e-5))
        return np.argmax(ucb_values).item()

    def update_estimates(self, k, reward):
        self.q_estimates[k] += (reward - self.q_estimates[k]) / (self.n[k] + 1)
        self.n[k] += 1

    def get_average_reward(self):
        return self.total_reward / self.n.sum()

    def print_estimates(self):
        print("Estimated action values:")
        for i in range(self.bandit.k):
            print(f"Arm {i}: {self.q_estimates[i]:.2f} (n={self.n[i]})")


class GradientBanditAgent():
    """Gradient-bandit agent: a softmax policy over learned arm preferences.

    The agent keeps one numerical preference per arm, samples arms from the
    softmax of those preferences, and moves the preferences along the
    reward-minus-baseline signal, where the baseline is the running average
    of all rewards received so far.
    """

    def __init__(
            self,
            bandit : "KArmedGaussianBandit",
            alpha : float = 0.1
        ):
        """Set up the agent.

        Args:
            bandit: Environment to play; must expose ``k`` and ``pull(arm)``.
            alpha: Step size for the preference update.
        """
        self.bandit = bandit
        self.n = 0  # number of pulls made so far
        self.alpha = alpha
        self.average_reward = 0.0  # running mean reward, used as baseline

        self.preferences = np.zeros(bandit.k)

    def play_episode(self):
        """Sample an arm, pull it, and update the baseline and preferences."""
        k = self.select_arm()
        reward = self.bandit.pull(k)

        # Advance the pull counter before the incremental-mean step. Without
        # this the divisor stays at 1, the "average" is overwritten by the
        # latest reward, and (reward - average_reward) is always 0, so the
        # preferences would never change.
        self.n += 1
        self.average_reward += (reward - self.average_reward) / self.n

        self.update_preferences(k, reward)

    def select_arm(self):
        """Sample an arm index from the softmax of the current preferences."""
        probs = softmax(self.preferences)
        return np.random.choice(self.bandit.k, p=probs)

    def update_preferences(self, k, reward):
        """Apply one gradient-bandit step for pulled arm ``k``.

        The pulled arm's preference moves by
        ``alpha * (reward - average_reward) * (1 - pi[k])`` and every other
        arm ``a`` moves by ``-alpha * (reward - average_reward) * pi[a]``,
        where ``pi`` is the softmax of the current preferences.
        """
        kronecker_delta = np.zeros(self.bandit.k)
        kronecker_delta[k] = 1.0
        self.preferences = (
            self.preferences + self.alpha * (reward - self.average_reward)
            * (kronecker_delta - softmax(self.preferences))
        )

    def get_average_reward(self):
        """Return the running mean reward over all pulls (0.0 before any)."""
        return self.average_reward


In [94]:
# Seed the global NumPy RNG so the experiment is reproducible on
# Restart & Run All (both the bandit and the agent draw from np.random).
np.random.seed(42)

# Mean reward of each arm, passed to the bandit as `mu`.
rewards = np.array([
    1.0, 0.5, 0.2, 0.8, 0.3, 0.9, 0.4, 0.6, 0.7, 0.1
])
# Derive the arm count and the unit standard deviations from `rewards`
# so the three stay in sync if the list of means is edited.
bandit = KArmedGaussianBandit(k=len(rewards), mu=rewards, std=np.ones_like(rewards))

q_initial = np.zeros_like(rewards)  # Initial estimates for each arm

agent = BanditAgent(bandit, q_initial=q_initial, strategy="epsilon-greedy", epsilon=0.1)
agent.print_estimates()

Estimated action values:
Arm 0: 0.00 (n=0.0)
Arm 1: 0.00 (n=0.0)
Arm 2: 0.00 (n=0.0)
Arm 3: 0.00 (n=0.0)
Arm 4: 0.00 (n=0.0)
Arm 5: 0.00 (n=0.0)
Arm 6: 0.00 (n=0.0)
Arm 7: 0.00 (n=0.0)
Arm 8: 0.00 (n=0.0)
Arm 9: 0.00 (n=0.0)


In [95]:
# Let the agent play a fixed number of pulls, then show what it learned.
n_pulls = 10000
for _step in range(n_pulls):
    agent.play_episode()
agent.print_estimates()

Estimated action values:
Arm 0: 1.00 (n=8472.0)
Arm 1: 0.61 (n=100.0)
Arm 2: 0.30 (n=115.0)
Arm 3: 0.69 (n=162.0)
Arm 4: 0.22 (n=84.0)
Arm 5: 0.89 (n=640.0)
Arm 6: 0.33 (n=114.0)
Arm 7: 0.66 (n=113.0)
Arm 8: 0.75 (n=95.0)
Arm 9: 0.03 (n=105.0)


In [96]:
# Mean reward per pull over the whole run (total reward / total pulls).
agent.get_average_reward()

np.float64(0.9426117986341529)