In [1]:
import numpy as np
from utils import *
import matplotlib.pyplot as plt
from tqdm import tqdm

In [2]:
def run_env(env, agent, timesteps=1000, averaging_steps=100, progress_bar=True):
    """Run ``agent`` against ``env`` and return the mean reward of the final steps.

    Both objects are reset first. On every step the agent chooses an action
    with ``agent.act()``, the environment answers with a reward from
    ``env.step(action)``, and the agent learns via ``agent.update(action, reward)``.

    Parameters
    ----------
    env : object
        Must provide ``reset()`` and ``step(action)``; ``step`` returns the reward.
    agent : object
        Must provide ``reset()``, ``act()`` and ``update(action, reward)``.
    timesteps : int
        Total number of interaction steps.
    averaging_steps : int
        Size of the trailing window whose rewards are averaged. If it exceeds
        ``timesteps`` the whole run is averaged.
    progress_bar : bool
        When true, the step loop is wrapped in ``tqdm``.

    Returns
    -------
    float
        Mean of the rewards received during the last ``averaging_steps`` steps
        (``0`` when that window is empty).
    """
    env.reset()
    agent.reset()
    mean_reward = 0

    # Index of the first step inside the averaging window; clamped at 0 so a
    # window longer than the run still starts its sample count at 1.
    window_start = max(0, timesteps - averaging_steps)

    iterator = tqdm(range(timesteps)) if progress_bar else range(timesteps)
    for i in iterator:
        action = agent.act()
        reward = env.step(action)
        agent.update(action, reward)

        # `>=`, not `==`: every step of the window has to feed the running
        # mean, otherwise only the first window reward would be returned.
        if i >= window_start:
            n_seen = i - window_start + 1
            mean_reward += (reward - mean_reward) / n_seen

    return mean_reward

In [3]:
def mean_alpha_fun(x):
    """Return the step size ``1/x``; handed to ``QAgent`` as ``alpha`` below.

    NOTE(review): presumably ``x`` is a visit count, which would make this a
    sample-average update -- confirm against ``QAgent`` in ``utils``.
    """
    return 1/x


# Experiment settings, named so they can be tuned in one place.
k = 10                 # passed as `k` to every agent and to KBandits
N_RUNS = 100           # independent runs averaged per hyperparameter value
TIMESTEPS = 1000       # steps per run
AVERAGING_STEPS = 100  # trailing window forwarded to run_env

# label -> (hyperparameter grid, factory building an agent from one grid value)
spec = {'eps-greedy-mean': (np.linspace(1/128, 1/4, 6), lambda cur_eps: QAgent(alpha=mean_alpha_fun, k=k, eps=cur_eps)),
        'gradient-bandit': (np.linspace(1/32, 3, 6), lambda cur_alpha: GradientBandits(k=k, alpha=cur_alpha)),
        'UCB': (np.linspace(1/16, 4, 6), lambda cur_c: UCBAgent(k=k, c=cur_c)),
        'eps-greedy-const': (np.linspace(1/128, 1/4, 6), lambda cur_eps: QAgent(alpha=0.1, k=k, eps=cur_eps)),
        'greedy-optimistic': (np.linspace(1/4, 4, 6), lambda Q_0: QAgent(alpha=0.1, k=k, initial_Q=np.full(k, Q_0)))}

# One result list per label, derived from `spec` so the two key sets cannot drift apart.
mean_rewards = {label: [] for label in spec}

# NOTE(review): the recorded output of this cell stops at the first 'UCB' value with
# "AttributeError: 'UCBAgent' object has no attribute 'alpha'". The cause lies in
# UCBAgent (utils), which is not visible here -- fix it there before relying on the plot.
for agent_desc, (space, agent_fun) in spec.items():
    for val in space:

        # The same agent/env pair is reused for all runs; run_env resets both each time.
        agent = agent_fun(val)
        env = KBandits(k=k, stationary=True)
        mean_mean_rw = 0

        for i in tqdm(range(N_RUNS), desc=f'{agent_desc} @ {val:.4g}'):
            mean_rw = run_env(env, agent, timesteps=TIMESTEPS, averaging_steps=AVERAGING_STEPS, progress_bar=False)
            # Incremental mean over the runs completed so far.
            mean_mean_rw += 1/(i+1)*(mean_rw - mean_mean_rw)

        mean_rewards[agent_desc].append(mean_mean_rw)

100%|██████████| 100/100 [00:01<00:00, 96.07it/s]
100%|██████████| 100/100 [00:01<00:00, 91.50it/s]
100%|██████████| 100/100 [00:01<00:00, 98.42it/s]
100%|██████████| 100/100 [00:01<00:00, 96.87it/s]
100%|██████████| 100/100 [00:01<00:00, 97.96it/s]
100%|██████████| 100/100 [00:01<00:00, 98.31it/s]
100%|██████████| 100/100 [00:06<00:00, 15.41it/s]
100%|██████████| 100/100 [00:06<00:00, 15.23it/s]
100%|██████████| 100/100 [00:06<00:00, 15.04it/s]
100%|██████████| 100/100 [00:06<00:00, 15.04it/s]
100%|██████████| 100/100 [00:06<00:00, 15.49it/s]
100%|██████████| 100/100 [00:06<00:00, 15.82it/s]
  0%|          | 0/100 [00:00<?, ?it/s]


AttributeError: 'UCBAgent' object has no attribute 'alpha'

In [None]:
# Plot, for every agent family in `spec`, the averaged reward against its hyperparameter grid.
fig, ax = plt.subplots()
for agent_desc, (space, _) in spec.items():
    ax.plot(space, mean_rewards[agent_desc], label=agent_desc)

ax.grid()
ax.legend()
# The x-axis is shared by different hyperparameters (one per `spec` entry), hence the generic label.
ax.set_xlabel('Hyperparameter value')
ax.set_ylabel('Average reward over last 100 steps')
ax.set_title('Bandit agents: parameter study')
plt.show()
