# Deep Q-Networks

## Tabular Q-Learning
By *Tabular Q-Learning* we mean a set of *model-free* techniques that work in discrete (or discretized) environments with a small number of states and actions and maintain a table of Q-values. In contrast to the Q-Value Learning from the last chapter, here we don't explicitly model the environment.

After getting an experience $(s, a, r, s')$ we perform a blending Bellman approximation update with learning rate $\alpha$ and discount factor $\gamma$:
$$
Q(s, a) \gets (1 - \alpha) Q(s, a) + \alpha (r + \gamma \max_{a'} Q(s', a'))
$$
which can be reformulated in terms of *Temporal Difference learning (TD learning)* as
$$
Q(s, a) \gets Q(s, a) + \alpha (r + \gamma \max_{a'} Q(s', a') - Q(s, a)) = Q(s, a) + \alpha \delta(s, a, r, s')
$$
where
* $\delta(s, a, r, s') = r + \gamma \max_{a'} Q(s', a') - Q(s, a)$ is called the *TD error* and
* $r + \gamma \max_{a'} Q(s', a')$ is the *TD target*

Finally, for efficiency reasons we don't actually have to construct the full Q table, as we don't really care about states that we've never experienced. So we'll estimate values only for the states that we've seen and iterate over a smaller set.

In [1]:
import collections
from functools import partial
from typing import Callable, NamedTuple, Tuple

import gym
import numpy as np
from tensorboardX import SummaryWriter


class Experience(NamedTuple):
    """A single environment transition (s, a, r, s')."""

    state: int  # state the action was applied in
    action: int  # action that was applied
    reward: float  # reward received for the transition
    next_state: int  # state observed after applying the action


class Agent:
    """Tabular Q-Learning agent backed by a sparse Q table.

    Q values are stored in ``self.values`` keyed by ``(state, action)``.
    Only pairs that have actually been updated are stored; every other
    pair is treated as having value 0.0.

    Args:
        n_actions: Number of discrete actions, indexed ``0 .. n_actions - 1``.
        alpha: Learning rate used to blend the TD error into the table.
        gamma: Discount factor applied to the value of the next state.
    """

    def __init__(
        self,
        n_actions: int,
        alpha: float,
        gamma: float,
    ) -> None:
        self.n_actions = n_actions
        self.alpha = alpha
        self.gamma = gamma
        # Missing entries read as 0.0 (the default of ``float()``).
        self.values = collections.defaultdict(float)

    def _q_values(self, s: int) -> list:
        """Returns the Q values of all actions in state ``s``.

        Uses ``dict.get`` instead of indexing so that merely looking at a
        state does not insert zero entries into the table.
        """
        return [self.values.get((s, a), 0.0) for a in range(self.n_actions)]

    def policy(self, s: int) -> int:
        """Returns the best known action (lowest index wins on ties)"""
        # np.argmax yields a NumPy integer; convert to a plain Python int
        # to honour the declared return type.
        return int(np.argmax(self._q_values(s)))

    def __iadd__(self, e: "Experience") -> "Agent":
        """Learns from one experience via ``agent += experience``.

        Applies the TD update
        ``Q(s, a) += alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))``
        and returns the agent itself, as required for in-place operators.
        """
        q_max = max(self._q_values(e.next_state))
        td_target = e.reward + self.gamma * q_max
        key = (e.state, e.action)
        td_error = td_target - self.values[key]
        self.values[key] += self.alpha * td_error
        return self


def explore(env: gym.Env, state: int) -> Tuple[Experience, int]:
    """
    Takes one randomly sampled action in the given environment.

    Returns a pair of the recorded experience (s, a, r, s') and the state
    to continue from: the observed next state, or the initial state of a
    freshly reset environment when the step ended the episode.
    """
    chosen = env.action_space.sample()
    observation, reward, finished, _info = env.step(chosen)
    transition = Experience(state, chosen, reward, observation)
    if finished:
        return transition, env.reset()
    return transition, observation


def evaluate(
    env: gym.Env,
    n_episodes: int,
    policy: Callable[[int], int],
) -> float:
    """
    Plays n complete episodes in the given environment, always choosing
    the action proposed by the given policy, and returns the mean
    non-discounted reward per episode.
    """
    cumulative = 0.0

    for _ in range(n_episodes):
        observation = env.reset()

        # Every episode takes at least one step, so loop until the
        # environment reports termination.
        while True:
            observation, gain, finished, _info = env.step(policy(observation))
            cumulative += gain
            if finished:
                break

    return cumulative / n_episodes


def train(
    env: gym.Env,
    eval_episodes: int = 20,
    alpha: float = 0.2,
    gamma: float = 0.9,
    solution_bound: float = 0.8,
    max_iters: int = 5_000,
) -> None:
    """
    Trains a Q-Learning agent from randomly sampled experience.

    Each iteration samples one random transition from ``env``, lets the
    agent learn from it, and evaluates the greedy policy on a separate
    copy of the environment. Training stops as soon as the mean evaluation
    reward reaches ``solution_bound`` or after ``max_iters`` iterations,
    whichever comes first. The latest and best mean rewards are logged as
    the "reward" and "best_reward" scalars, and a summary line is printed
    at the end.

    Args:
        env: Environment used for exploration; must expose ``spec.id`` so
            that a second instance can be created for evaluation.
        eval_episodes: Number of episodes played per policy evaluation.
        alpha: Learning rate of the agent.
        gamma: Discount factor of the agent.
        solution_bound: Mean evaluation reward at which training stops.
        max_iters: Upper bound on the number of training iterations.
    """
    with SummaryWriter(comment="-q-learning") as writer:

        # Bind experience sampling and policy evaluation to environments
        #  - Note: We use a copy of the env. for testing so that evaluation
        #    episodes do not disturb the ongoing exploration episode.
        eval_env = gym.make(env.spec.id)
        explore_env = partial(explore, env)
        eval_policy = partial(evaluate, eval_env, eval_episodes)

        # Initialize the Q-Learning agent
        agent = Agent(env.action_space.n, alpha, gamma)

        i = 0
        reward = 0.0
        best_reward = 0.0

        try:
            # Initialize the environment
            state = env.reset()

            while reward < solution_bound and i < max_iters:
                i += 1

                # Sample new experience from the environment
                # and pass it to the agent to learn from it.
                experience, state = explore_env(state)
                agent += experience

                # Evaluate current policy. The result must be stored in
                # `reward`, otherwise the loop condition never sees it and
                # training always runs for the full `max_iters`.
                reward = eval_policy(agent.policy)
                best_reward = max(reward, best_reward)

                # Record metrics
                writer.add_scalar("reward", reward, i)
                writer.add_scalar("best_reward", best_reward, i)
        finally:
            # The evaluation environment is owned by this function.
            eval_env.close()

    # NOTE: this line is also printed when `max_iters` was reached without
    # hitting `solution_bound`.
    print(f"Solved in {i} iterations with best reward: {best_reward:.3f}")


# Run Q-Learning in FrozenLake, using the default hyperparameters of `train`
train(env=gym.make("FrozenLake-v0"))

Solved in 5000 iterations with best reward: 0.750
