## Imports

Import the necessary packages: 
* `matplotlib.pyplot` and `seaborn` for plots
* `numpy`
* `gym` is needed to build the environments
* `tqdm` for progress bars
* `rl_agents` necessary classes

In [None]:
import gym
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
# from rl_agents.runners import simple_tab_runner
from rl_agents.agents import QLearningAgent, SarsaAgent, ExpectedSarsaAgent
from rl_agents.agents.functions import QMatrixFunction
from rl_agents.agents.policies import EGreedyPolicy, EDecreasePolicy, BoltzmanPolicy

In [None]:
# Seaborn theme for every figure below: "darkgrid" style and a (6, 4) default figure size.
sns.set(style="darkgrid", rc={"figure.figsize": (6, 4)})

In [None]:
def cumulative_avg(vector):
    """Return the running mean of a one-dimensional sequence.

    Element ``i`` of the result is the mean of ``vector[0]`` through
    ``vector[i]``.

    Parameters
    ----------
    vector : array_like
        One-dimensional sequence of numbers (NumPy array, list, tuple, ...).

    Returns
    -------
    numpy.ndarray
        Float array with the same length as ``vector``. An empty input
        yields an empty array.
    """
    values = np.asarray(vector, dtype=float)
    # Running sum divided by the count of elements seen so far (1, 2, ..., n).
    return np.cumsum(values) / np.arange(1, values.size + 1)

## Define the simulator.

The code below is almost the same as in `rl_agents.runners.simple_tab_runner`; the only difference is that the `while` loop is replaced with a `for` loop, to avoid an episode that runs for too long. We define it here just to show how you can create your own setup:

In [None]:
def simple_tab_runner(env, agent, n_episodes, max_steps=200):
    """Train ``agent`` on ``env`` and return the total reward of each episode.

    Every episode starts with ``env.reset()`` and runs until the environment
    reports ``done`` or ``max_steps`` steps have been taken, whichever comes
    first. After every step the agent learns from the transition and its
    policy is updated.

    Parameters
    ----------
    env
        Environment exposing ``reset()`` and a ``step(action)`` that returns
        the 4-tuple ``(next_obs, reward, done, info)``.
    agent
        Agent exposing ``predict(obs)``, ``learn(obs, action, reward,
        next_obs)`` and ``policy.update()``.
    n_episodes : int
        Number of episodes to run.
    max_steps : int, optional
        Upper bound on the number of steps per episode, so that an episode
        cannot run for too long. Defaults to 200.

    Returns
    -------
    numpy.ndarray
        Array of length ``n_episodes`` holding the summed reward of each
        episode.
    """
    rewards = np.zeros(n_episodes)
    for episode in tqdm(range(n_episodes)):
        obs = env.reset()
        episode_reward = 0
        for _ in range(max_steps):
            action = agent.predict(obs)
            next_obs, reward, done, _info = env.step(action)
            agent.learn(obs, action, reward, next_obs)
            # The policy is updated once per step, not once per episode.
            agent.policy.update()
            obs = next_obs
            episode_reward += reward
            if done:
                break
        rewards[episode] = episode_reward
    return rewards

### Evaluation runner:
We also define a runner for testing the agent. In this setting we pass the flag `eval=True` to `agent.predict()` so that the agent follows a greedy policy.

In [None]:
def simple_tab_runner_eval(env, agent, n_episodes, max_steps=100):
    """Evaluate ``agent`` on ``env`` without learning and return episode rewards.

    Actions are requested with ``agent.predict(obs, eval=True)``, which is
    meant to select the agent's greedy policy. Neither ``agent.learn`` nor
    ``agent.policy.update`` is called, so this function itself does not train
    the agent.

    Parameters
    ----------
    env
        Environment exposing ``reset()`` and a ``step(action)`` that returns
        the 4-tuple ``(next_obs, reward, done, info)``.
    agent
        Agent exposing ``predict(obs, eval=True)``.
    n_episodes : int
        Number of evaluation episodes to run.
    max_steps : int, optional
        Upper bound on the number of steps per episode. Defaults to 100.

    Returns
    -------
    numpy.ndarray
        Array of length ``n_episodes`` holding the summed reward of each
        episode.
    """
    rewards = np.zeros(n_episodes)
    for episode in tqdm(range(n_episodes)):
        obs = env.reset()
        episode_reward = 0
        for _ in range(max_steps):
            action = agent.predict(obs, eval=True)
            obs, reward, done, _info = env.step(action)
            episode_reward += reward
            if done:
                break
        rewards[episode] = episode_reward
    return rewards

## 0. Just a demonstration:

In [None]:
# Build the environment and one agent per algorithm, all with the same
# fixed hyperparameters and a zero-initialised Q function.
env = gym.make('CliffWalking-v0')
agentQ, agentS, agentE = (
    agent_cls(
        n_states=env.nS,
        n_actions=env.nA,
        alpha=0.5,
        gamma=0.9,
        q_func_kwargs={"method": "zeros"},
    )
    for agent_cls in (QLearningAgent, SarsaAgent, ExpectedSarsaAgent)
)

# Train every agent for 1000 episodes; both seeds are reset before each run
# so that each agent starts from the same random state.
results = []
for agent in (agentQ, agentS, agentE):
    env.seed(60)
    np.random.seed(40)
    results.append(simple_tab_runner(env, agent, 1000))

In [None]:
# Plot the running mean of the per-episode reward, one curve per entry of `results`.
for rewards, label in zip(results, ["QLearning", "Sarsa", "Expected-Sarsa"]):
    plt.plot(cumulative_avg(rewards), label=label)
# Label the axes explicitly: the curves are cumulative averages, not raw episode totals.
plt.xlabel('Episode')
plt.ylabel('Cumulative average reward')
plt.legend()
plt.title('Total Reward by Episodes')
plt.show()

In [None]:
# Evaluation pass: 10 episodes per agent with the evaluation runner,
# resetting both seeds before each agent's run.
results = []
for agent in (agentQ, agentS, agentE):
    env.seed(123)
    np.random.seed(457)
    results.append(simple_tab_runner_eval(env, agent, 10))

In [None]:
# Plot the running mean of the per-episode evaluation reward, one curve per entry of `results`.
for rewards, label in zip(results, ["QLearning", "Sarsa", "Expected-Sarsa"]):
    plt.plot(cumulative_avg(rewards), label=label)
# Label the axes explicitly: the curves are cumulative averages, not raw episode totals.
plt.xlabel('Episode')
plt.ylabel('Cumulative average reward')
plt.legend()
plt.title('Total Reward by Episodes: Evaluating')
plt.show()

### Conclusions:

This is just one experiment, without a hyperparameter study or even a Monte Carlo simulation, so we can only draw tentative conclusions rather than state anything definitive. Still, comparing the training and evaluation plots, we can see that:

* Q-Learning has the worst performance during training, with Expected-Sarsa being the best option. Q-Learning uses the greedy action for the next state in its TD update, while Sarsa and Expected-Sarsa take the exploration policy into account, so they perform better at this stage.
* Q-Learning performs better than both Expected-Sarsa and Sarsa in the evaluation stage, where the policy is switched to a greedy one. Since both Sarsa and Expected-Sarsa take the exploration policy into account when learning, their performance decreases when the policy changes to greedy.

## 1. A hyperparameter study

In this initial study, we want to find the best alpha and gamma for each algorithm.
We will use an epsilon-decreasing policy.

In [None]:
# Illustrate the epsilon schedule: record epsilon after each of 1000 policy updates.
policy = EDecreasePolicy(0.9, 0.01, 0.995)
epsilons = []
for _ in range(1000):
    policy.update()
    epsilons.append(policy.epsilon)
plt.plot(epsilons)
# The x axis counts calls to policy.update(), not episodes.
plt.xlabel("Number of policy updates")
plt.ylabel("Epsilon")
plt.title("Epsilon Decrease")
plt.show()

In [None]:
def grid_search(env, Agent, alphas, gammas, methods, n_episodes,
                n_montecarlo=10, max_steps=100, policy_factory=None):
    """Score every (alpha, gamma, method) combination for one agent class.

    For each combination, ``n_montecarlo`` independent runs are performed.
    Each run builds a fresh agent, trains it for ``n_episodes`` episodes of at
    most ``max_steps`` steps and records the average reward per episode. The
    score of a combination is the mean of that value over the runs.

    Parameters
    ----------
    env
        Environment exposing ``nS``, ``nA``, ``seed()``, ``reset()`` and a
        ``step(action)`` returning ``(next_obs, reward, done, info)``.
    Agent
        Agent class, called with ``n_states``, ``n_actions``, ``alpha``,
        ``gamma``, ``policy`` and ``q_func_kwargs`` keyword arguments.
    alphas, gammas, methods : iterable
        Candidate values for ``alpha``, ``gamma`` and the ``"method"`` entry
        of ``q_func_kwargs``.
    n_episodes : int
        Training episodes per run.
    n_montecarlo : int, optional
        Number of runs per combination. Defaults to 10.
    max_steps : int, optional
        Upper bound on the steps per episode. Defaults to 100.
    policy_factory : callable, optional
        Zero-argument callable returning a new policy for each run. Defaults
        to ``EDecreasePolicy(0.9, 0.01, 0.99)``.

    Returns
    -------
    dict
        Maps each ``(alpha, gamma, method)`` tuple to its score.
    """
    def _default_policy():
        # NOTE(review): the decay here is 0.99 while other cells of this
        # notebook build EDecreasePolicy with 0.995 -- confirm the mismatch
        # is intended.
        return EDecreasePolicy(0.9, 0.01, 0.99)

    make_policy = _default_policy if policy_factory is None else policy_factory

    output = {}
    param_list = [(alpha, gamma, method)
                  for alpha in alphas
                  for gamma in gammas
                  for method in methods]
    for alpha, gamma, method in tqdm(param_list):
        total_mc_reward = 0
        for mc in range(n_montecarlo):  # Monte Carlo loop
            # A fresh agent per run: this redoes the (possibly "random")
            # Q initialisation and restarts the epsilon schedule.
            agent = Agent(n_states=env.nS, n_actions=env.nA, alpha=alpha,
                          gamma=gamma, policy=make_policy(),
                          q_func_kwargs={"method": method})
            total_reward = 0
            # This guarantees that each parameter combination experiences
            # the same conditions for the same Monte Carlo run.
            # NOTE(review): the agent is built before the seeds are set, so a
            # "random" initialisation is presumably not covered by this seed
            # -- confirm that is intended.
            np.random.seed(mc*123)
            env.seed(mc*456)
            for _episode in range(n_episodes):
                obs = env.reset()
                for _step in range(max_steps):
                    action = agent.predict(obs)
                    next_obs, reward, done, _info = env.step(action)
                    agent.learn(obs, action, reward, next_obs)
                    agent.policy.update()
                    obs = next_obs
                    total_reward += reward
                    if done:
                        break
            # Average reward per episode for this run.
            total_mc_reward += total_reward/n_episodes
        output[(alpha, gamma, method)] = total_mc_reward/n_montecarlo
    return output

### Run simulation:

In [None]:
# Hyperparameter grid: 5 evenly spaced values from 0.1 to 0.9 for alpha and for gamma,
# combined with three values for the Q-function "method" option.
alphas = np.linspace(0.1, 0.9, 5)
gammas = np.linspace(0.1, 0.9, 5)
methods = ["zeros", "ones", "random"]
n_episodes = 500
# One grid search per algorithm, run one after the other on the same environment.
# Each result is a dict keyed by (alpha, gamma, method).
ql_dict = grid_search(env, QLearningAgent, alphas, gammas, methods, n_episodes)
sarsa_dict = grid_search(env, SarsaAgent, alphas, gammas, methods, n_episodes)
esarsa_dict = grid_search(env, ExpectedSarsaAgent, alphas, gammas, methods, n_episodes)

In [None]:
# Report the highest-scoring (alpha, gamma, method) combination of each grid search.
for algo_name, algo_scores in [("Q-learning", ql_dict),
                               ("Sarsa", sarsa_dict),
                               ("Expected-Sarsa", esarsa_dict)]:
    # Look the best key up once and reuse it for both the key and its value.
    best_params = max(algo_scores, key=algo_scores.get)
    print("Best combination for {}:".format(algo_name), best_params, "Value:", algo_scores[best_params])

## 2. Comparison of the basic tabular methods.

Now that we have found the best hyperparameters, we can compare the different algorithms:

In [None]:
# Rebuild each agent with the best-scoring (alpha, gamma, method) of its own grid search.
alpha, gamma, method = max(ql_dict, key=ql_dict.get)
agentQ = QLearningAgent(
    n_states=env.nS,
    n_actions=env.nA,
    alpha=alpha,
    gamma=gamma,
    policy=EDecreasePolicy(0.9, 0.01, 0.995),
    q_func_kwargs={"method": method},
)

alpha, gamma, method = max(sarsa_dict, key=sarsa_dict.get)
agentS = SarsaAgent(
    n_states=env.nS,
    n_actions=env.nA,
    alpha=alpha,
    gamma=gamma,
    policy=EDecreasePolicy(0.9, 0.01, 0.995),
    q_func_kwargs={"method": method},
)

alpha, gamma, method = max(esarsa_dict, key=esarsa_dict.get)
agentE = ExpectedSarsaAgent(
    n_states=env.nS,
    n_actions=env.nA,
    alpha=alpha,
    gamma=gamma,
    policy=EDecreasePolicy(0.9, 0.01, 0.995),
    q_func_kwargs={"method": method},
)

# Train every agent for 1000 episodes, resetting both seeds before each run.
results = []
for agent in (agentQ, agentS, agentE):
    env.seed(789)
    np.random.seed(159)
    results.append(simple_tab_runner(env, agent, 1000))

In [None]:
# Plot the running mean of the per-episode reward, one curve per entry of `results`.
for rewards, label in zip(results, ["QLearning", "Sarsa", "Expected-Sarsa"]):
    plt.plot(cumulative_avg(rewards), label=label)
# Label the axes explicitly: the curves are cumulative averages, not raw episode totals.
plt.xlabel('Episode')
plt.ylabel('Cumulative average reward')
plt.legend()
plt.title('Total Reward by Episodes')
plt.show()

In [None]:
# Evaluation pass: 10 episodes per agent with the evaluation runner,
# resetting both seeds before each agent's run.
results = []
for agent in [agentQ, agentS, agentE]:
    env.seed(123)
    np.random.seed(457)
    results.append(simple_tab_runner_eval(env, agent, 10))

# Plot the running mean of the per-episode evaluation reward for each agent.
for rewards, label in zip(results, ["QLearning", "Sarsa", "Expected-Sarsa"]):
    plt.plot(cumulative_avg(rewards), label=label)
# Label the axes explicitly: the curves are cumulative averages, not raw episode totals.
plt.xlabel('Episode')
plt.ylabel('Cumulative average reward')
plt.legend()
plt.title('Total Reward by Episodes: Evaluating')
plt.show()