In [None]:
import numpy as np


class Agent:
    """Epsilon-greedy agent for a multi-armed bandit problem.

    The agent keeps a running (sample-average) estimate of the reward of every
    arm. Its very first action is chosen uniformly at random; afterwards it
    picks a uniformly random arm when a fresh draw from ``np.random.rand()``
    falls below ``eps`` and otherwise plays the arm with the highest current
    estimate.

    Attributes:
        number_of_bandits: Number of arms the agent can choose from.
        eps: Exploration threshold compared against a uniform [0, 1) draw.
        step_number: Number of rewards recorded so far.
        rewards: Per-arm running mean of the observed rewards (starts at 0).
        actions_selections_count: Per-arm count of how often the arm was played.
        mean_reward: Running mean of every reward received so far.
    """

    def __init__(self, number_of_bandits: int, eps: float):
        """Initialise an agent with all estimates and counters at zero.

        Args:
            number_of_bandits: Number of arms available.
            eps: Exploration threshold used by ``next_action``.
        """
        self.number_of_bandits = number_of_bandits
        self.eps = eps
        self.step_number = 0
        self.rewards = np.zeros(number_of_bandits)
        self.actions_selections_count = np.zeros(number_of_bandits)
        # Float from the start so the attribute has one type for its whole life.
        self.mean_reward = 0.0

    def next_action(self) -> int:
        """Choose the arm to play next.

        Returns:
            Index of the chosen arm as a plain Python ``int``.
        """
        # The `or` short-circuits: on the very first step no epsilon draw is
        # consumed, exactly as in a separate "first step" branch.
        explore = self.step_number == 0 or np.random.rand() < self.eps
        if explore:
            return int(np.random.choice(self.number_of_bandits))
        # np.argmax yields a NumPy integer (first index on ties); convert it so
        # every path honours the annotated ``int`` return type.
        return int(np.argmax(self.rewards))

    def update_reward(self, action: int, reward: float) -> None:
        """Record the reward obtained for playing ``action``.

        Args:
            action: Index of the arm that was played.
            reward: Reward received for that play.
        """
        self.step_number += 1
        self.actions_selections_count[action] += 1

        # Incremental mean update: new = old + (sample - old) / n.
        times_played = self.actions_selections_count[action]
        self.rewards[action] += (reward - self.rewards[action]) / times_played
        self.mean_reward += (reward - self.mean_reward) / self.step_number


In [None]:
import typing
import numpy as np


class Environment:
    """Bandit testbed whose arms pay out normally distributed rewards.

    One hidden mean per arm is sampled at construction time from a normal
    distribution with mean 0 and standard deviation 1. Playing an arm returns
    a sample from a normal distribution centred on that arm's hidden mean,
    again with standard deviation 1.

    Attributes:
        number_of_bandits: Number of arms in the testbed.
        mu: Array holding the hidden mean reward of every arm.
    """

    def __init__(self, number_of_bandits: int):
        """Build the testbed and draw the hidden mean of each arm."""
        self.number_of_bandits = number_of_bandits
        self.mu = np.random.normal(loc=0, scale=1, size=number_of_bandits)

    def perform_action(self, action: int) -> float:
        """Play the arm at index ``action`` and return the sampled reward."""
        arm_mean = self.mu[action]
        return np.random.normal(loc=arm_mean, scale=1)


In [None]:
import typing

import numpy as np
from tqdm import tqdm


def run_exeriment(number_of_bandits: int, eps: float, number_of_iterations: int) -> typing.List[float]:
    """Play one fresh agent against one fresh environment.

    Args:
        number_of_bandits: Number of arms in the environment.
        eps: Exploration parameter handed to the agent.
        number_of_iterations: How many actions the agent takes.

    Returns:
        The agent's running mean reward after each iteration, in order.
    """
    learner = Agent(number_of_bandits, eps)
    testbed = Environment(number_of_bandits)

    def play_one_step() -> float:
        # Choose an arm, observe its reward, feed it back to the agent.
        chosen_arm = learner.next_action()
        learner.update_reward(chosen_arm, testbed.perform_action(chosen_arm))
        return learner.mean_reward

    return [play_one_step() for _ in range(number_of_iterations)]


# Experiment configuration.
number_of_bandits = 10
number_of_iterations = 1000
number_of_episodes = 1000
random_seed = 42

# Agent and Environment both draw from NumPy's global RNG (np.random.*), so
# seeding it here makes a fresh "Restart & Run All" reproduce the same curves.
np.random.seed(random_seed)

# Exploration rates to compare, each with its per-iteration running average
# of the mean reward across episodes.
epsilons = (0, 0.01, 0.1, 0.5)
average_rewards = {eps: np.zeros(number_of_iterations) for eps in epsilons}

for episode in tqdm(range(number_of_episodes)):
    for eps in epsilons:
        episode_rewards = np.asarray(
            run_exeriment(number_of_bandits, eps, number_of_iterations)
        )
        # Incremental mean over episodes: avg += (sample - avg) / n.
        average_rewards[eps] += (episode_rewards - average_rewards[eps]) / (episode + 1)

# Expose the results under the individual names used by downstream cells.
eps_0_rewards = average_rewards[0]
eps_001_rewards = average_rewards[0.01]
eps_01_rewards = average_rewards[0.1]
eps_05_rewards = average_rewards[0.5]


import matplotlib.pyplot as plt

# Labels are raw strings: "\e" inside an ordinary string literal is an invalid
# escape sequence that current Python versions warn about.
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(eps_0_rewards, label=r"$\epsilon=0$ (greedy)")
ax.plot(eps_001_rewards, label=r"$\epsilon=0.01$")
ax.plot(eps_01_rewards, label=r"$\epsilon=0.1$")
ax.plot(eps_05_rewards, label=r"$\epsilon=0.5$")
ax.legend(bbox_to_anchor=(1.3, 0.5))
ax.set_xlabel("Iterations")
ax.set_ylabel("Average Reward")
# Only the epsilon is math; "-greedy" stays outside the $...$ span so it is
# typeset as regular text rather than as a minus sign and italic variables.
ax.set_title(r"Average $\epsilon$-greedy Rewards after 1000 episodes")
plt.show()