In [5]:
import numpy as np
import matplotlib.pyplot as plt 

from bandit import Bandits_one, Bandits_two, Bandits_three

AttributeError: module 'matplotlib' has no attribute 'get_data_path'

Epsilon-greedy algorithm implementation

In [6]:
def epsilon_greedy(mab_env, T=1000, epsilon=0.1):
    """Play a multi-armed bandit for ``T`` steps with a fixed-epsilon greedy policy.

    On every step the agent draws a uniform random number; if it is below
    ``epsilon`` a uniformly random arm is pulled, otherwise the arm with the
    highest current estimate is pulled (``np.argmax`` resolves ties in favour
    of the lowest index). Estimates are running sample means.

    Parameters
    ----------
    mab_env : object
        Environment exposing ``k`` (number of arms) and ``step(action)``
        returning the 5-tuple ``(observation, reward, terminated, truncated,
        info)``. Only ``reward`` is used here.
    T : int
        Number of pulls (time steps) to perform.
    epsilon : float
        Probability of exploring on each step.

    Returns
    -------
    tuple
        ``(estimated_exp_reward, num_actions, actions, rewards)``: per-arm
        mean estimates, per-arm pull counts, the arm chosen at each step and
        the reward observed at each step.
    """
    n_arms = mab_env.k

    # Per-arm statistics start at zero: no pulls and a zero mean estimate.
    q_hat = np.zeros(n_arms, dtype=float)
    pulls = np.zeros(n_arms, dtype=int)

    # Per-step history.
    chosen = np.zeros(T, dtype=int)
    payoff = np.zeros(T, dtype=float)

    for step in range(T):
        # The exploration coin is always flipped first; the random arm is
        # only drawn when exploring, so the RNG is consumed in that order.
        exploring = np.random.rand() < epsilon
        arm = np.random.randint(n_arms) if exploring else int(np.argmax(q_hat))

        _obs, reward, _terminated, _truncated, _info = mab_env.step(arm)

        # Running mean: new = old + (sample - old) / n
        pulls[arm] += 1
        q_hat[arm] = q_hat[arm] + (reward - q_hat[arm]) / pulls[arm]

        chosen[step] = arm
        payoff[step] = reward

    return q_hat, pulls, chosen, payoff


Decaying Epsilon Greedy

In [8]:
def decaying_epsilon_greedy(mab_env, T = 1000, epsilon = 0.1, alpha = 0.1):
    """Epsilon-greedy bandit agent whose exploration rate decays geometrically.

    Identical to plain epsilon-greedy except that, after every step, the
    exploration probability is multiplied by ``alpha``. With the default
    ``alpha = 0.1`` the rate shrinks tenfold per step.

    Parameters
    ----------
    mab_env : object
        Environment exposing ``k`` (number of arms) and ``step(action)``
        returning the 5-tuple ``(observation, reward, terminated, truncated,
        info)``. Only ``reward`` is used here.
    T : int
        Number of pulls (time steps) to perform.
    epsilon : float
        Exploration probability used on the first step.
    alpha : float
        Multiplicative decay applied to the exploration probability after
        each step.

    Returns
    -------
    tuple
        ``(estimated_exp_reward, num_actions, actions, rewards)``: per-arm
        mean estimates, per-arm pull counts, the arm chosen at each step and
        the reward observed at each step.
    """
    n_arms = mab_env.k

    q_hat = np.zeros(n_arms, dtype=float)   # running mean reward per arm
    pulls = np.zeros(n_arms, dtype=int)     # number of pulls per arm
    chosen = np.zeros(T, dtype=int)         # arm played at each step
    payoff = np.zeros(T, dtype=float)       # reward received at each step

    explore_prob = epsilon
    for step in range(T):
        # Flip the exploration coin first; draw a random arm only if needed.
        if np.random.rand() >= explore_prob:
            arm = int(np.argmax(q_hat))
        else:
            arm = np.random.randint(n_arms)

        _obs, reward, _terminated, _truncated, _info = mab_env.step(arm)

        # Running mean: new = old + (sample - old) / n
        pulls[arm] += 1
        q_hat[arm] = q_hat[arm] + (reward - q_hat[arm]) / pulls[arm]

        chosen[step] = arm
        payoff[step] = reward

        # Geometric decay of the exploration rate for the next step.
        explore_prob = explore_prob * alpha

    return q_hat, pulls, chosen, payoff


UCB (Upper Confidence Bound) algorithm

In [9]:
def UCB(mab_env, T= 1000, c = 2):
    """Play a multi-armed bandit for ``T`` steps with the UCB rule.

    At step ``t`` (counted from 1) each arm that has been pulled at least
    once is scored as ``estimate + c * sqrt(ln(t) / pulls)``; arms that have
    never been pulled score ``+inf`` so each one is tried before any arm is
    repeated. The highest-scoring arm is pulled (``np.argmax`` resolves ties
    in favour of the lowest index).

    Parameters
    ----------
    mab_env : object
        Environment exposing ``k`` (number of arms) and ``step(action)``
        returning the 5-tuple ``(observation, reward, terminated, truncated,
        info)``. Only ``reward`` is used here.
    T : int
        Number of pulls (time steps) to perform.
    c : float
        Weight of the exploration bonus.

    Returns
    -------
    tuple
        ``(estimated_exp_reward, num_actions, actions, rewards)``: per-arm
        mean estimates, per-arm pull counts, the arm chosen at each step and
        the reward observed at each step.
    """
    n_arms = mab_env.k

    q_hat = np.zeros(n_arms, dtype=float)   # running mean reward per arm
    pulls = np.zeros(n_arms, dtype=int)     # number of pulls per arm
    chosen = np.zeros(T, dtype=int)         # arm played at each step
    payoff = np.zeros(T, dtype=float)       # reward received at each step

    for idx in range(T):
        t = idx + 1  # 1-based step counter used inside the log term

        # Start every arm at +inf, then overwrite the arms that already
        # have at least one pull with their finite confidence bound.
        scores = np.full(n_arms, np.inf, dtype=float)
        tried = pulls > 0
        scores[tried] = q_hat[tried] + c * np.sqrt(np.log(t) / pulls[tried])

        arm = int(np.argmax(scores))
        _obs, reward, _terminated, _truncated, _info = mab_env.step(arm)

        # Running mean: new = old + (sample - old) / n
        pulls[arm] += 1
        q_hat[arm] = q_hat[arm] + (reward - q_hat[arm]) / pulls[arm]

        chosen[idx] = arm
        payoff[idx] = reward

    return q_hat, pulls, chosen, payoff

One run of T = 1000 steps for each algorithm

In [11]:
# Run each bandit algorithm once for T steps and print the arm whose
# estimated mean reward is highest, alongside the environment's reported
# optimal arm.
# NOTE(review): no random seed is set in the visible cells, and the
# epsilon-greedy variants draw from np.random, so printed results may differ
# between runs.
T = 1000  # number of steps (pulls) passed to every algorithm below

# epsilon-greedy with a fixed exploration rate of 0.1
env = Bandits_one()
estimated_reward, num_actions, actions, rewards = epsilon_greedy(env, T=T, epsilon=0.1)
print("Epsilon-greedy best arm:", np.argmax(estimated_reward))

# decaying epsilon-greedy: starts at epsilon=1.0 and multiplies it by 0.995 each step
# Q*/N*/A*/R* = estimates / pull counts / actions / rewards (suffix d = decaying)
env = Bandits_one()
Qd, Nd, Ad, Rd = decaying_epsilon_greedy(env, T=T, epsilon=1.0, alpha=0.995)
print("Decaying epsilon-greedy best arm:", np.argmax(Qd))

# UCB with exploration weight c=2.0 (suffix u = UCB)
env = Bandits_one()
Qu, Nu, Au, Ru = UCB(env, T=T, c=2.0)
print("UCB best arm:", np.argmax(Qu))

# true optimal, read from a newly constructed environment
# NOTE(review): assumes every Bandits_one() instance has the same true means
# as the instances used above — confirm against the bandit module.
env = Bandits_one()
print("True optimal arm:", env.get_optimal_action(), "True means:", env.means)


NameError: name 'Bandits_one' is not defined