In [1]:
import numpy as np
import matplotlib.pyplot as plt

# Initialize bandit with k arms
class Bandit:
    """A k-armed bandit whose arms pay normally distributed rewards.

    Attributes:
        k_arms: Number of arms.
        q_star: Array of length ``k_arms`` with each arm's true mean reward,
            sampled once at construction from a normal distribution with
            mean 0 and standard deviation 1.
        optimal_action: Index of the largest entry of ``q_star``.
    """

    def __init__(self, k_arms=10):
        """Sample the true action values and record the best arm.

        Args:
            k_arms: Number of arms to create (default 10).
        """
        self.k_arms = k_arms
        # Drawn from the global NumPy RNG; seed np.random for repeatable runs.
        self.q_star = np.random.normal(loc=0, scale=1, size=self.k_arms)
        self.optimal_action = np.argmax(self.q_star)

    def get_reward(self, action):
        """Draw one noisy reward for pulling ``action``.

        Args:
            action: Index into ``q_star`` of the arm being pulled.

        Returns:
            A sample from a normal distribution centred on the arm's true
            value ``q_star[action]`` with standard deviation 1.
        """
        true_value = self.q_star[action]
        return np.random.normal(loc=true_value, scale=1)


# Greedy action selection method
$$A_t \doteq \operatorname*{argmax}_a Q_t(a)$$

$\operatorname{argmax}_a$ denotes the action $a$ for which the expression that follows is maximized.

In [2]:
class GreedyAgent:
    """Agent that always picks the arm with the highest estimated value.

    Attributes:
        k_arms: Number of arms the agent can choose from.
        Q: Array of length ``k_arms`` with the current value estimate of each
            arm; every estimate starts at 0.
        N: Array of length ``k_arms`` counting how many times each arm has
            been updated; every count starts at 0.
    """

    def __init__(self, k_arms=10):
        """Create an agent with zeroed estimates and counts.

        Args:
            k_arms: Number of arms (default 10).
        """
        self.k_arms = k_arms
        self.Q = np.zeros(k_arms)
        self.N = np.zeros(k_arms)

    def choose_action(self):
        """Return the index of the arm with the largest estimate in ``Q``.

        ``np.argmax`` returns the first such index when several estimates
        are tied, so a fresh agent picks arm 0.
        """
        return np.argmax(self.Q)

    def update_values(self, action, reward):
        """Fold one observed reward into the estimate for ``action``.

        Increments the arm's count, then moves its estimate toward ``reward``
        by ``1 / N[action]`` of the difference, which keeps ``Q[action]``
        equal to the running mean of the rewards seen for that arm.

        Args:
            action: Index of the arm that was pulled.
            reward: Reward observed for that pull.
        """
        # The count must be bumped first: the step size uses the new count.
        self.N[action] += 1
        pulls = self.N[action]
        estimate = self.Q[action]
        error = reward - estimate
        self.Q[action] = estimate + error / pulls

# $\epsilon$-greedy methods
These methods behave greedily most of the time, but with small probability $\epsilon$ they instead select randomly from among all the actions with equal probability.