# Capítulo 7 - DQN


Você pode rodar este notebook no Colab ou localmente. Para abrir diretamente no Colab, basta clicar no link abaixo.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pablo-sampaio/rl_facil/blob/main/cap07/cap07-main.ipynb) 

Para rodar localmente, primeiro, baixe todo o repositório do github: https://github.com/pablo-sampaio/rl_facil.

## 1. Configurações Iniciais

### Configurações Dependentes do Sistema

Rode a célula abaixo, mesmo sem estar no Colab.

In [1]:
import sys
from IPython.display import clear_output

# Colab-only setup: runs only when the `google.colab` module is loaded.
# Installs dependencies, starts a virtual display, clones the course
# repository and moves this chapter's files into /content.
if 'google.colab' in sys.modules:
    !pip install gym
    !pip install gym[box2d]
    !pip install gym[atari]
    !pip install opencv-python
    !pip install autorom[accept-rom-license]
    !pip install tensorboardX

    # needed to save videos
    !apt-get install -y xvfb x11-utils
    !pip install pyvirtualdisplay==0.2.*
    !apt-get install ffmpeg

    # Start a non-visible 1400x900 display (presumably so rendering works
    # on a machine without a screen -- TODO confirm).
    from pyvirtualdisplay import Display
    global display
    display = Display(visible=False, size=(1400, 900))
    _ = display.start()

    !git clone https://github.com/pablo-sampaio/rl_facil
    # Wipe the installation/clone output from the cell.
    clear_output()

    # Put the chapter's modules next to the notebook so they can be imported.
    !mv /content/rl_facil/cap07/* /content/


### Configurações para Exibir Video

In [2]:
# ideias adaptadas de : https://www.anyscale.com/blog/an-introduction-to-reinforcement-learning-with-openai-gym-rllib-and-google
from base64 import b64encode
from IPython.display import HTML
from gym.wrappers.monitoring.video_recorder import VideoRecorder

def render_mp4(videopath: str) -> "HTML":
    """
    Build an inline HTML video player for the MP4 file at ``videopath``.

    The file is read in binary mode, base64-encoded and embedded as a
    ``data:video/mp4`` URI inside a ``<video>`` tag, so the returned markup
    carries the video bytes itself.

    Args:
        videopath: Path of the MP4 file to embed.

    Returns:
        An ``HTML`` display object wrapping the ``<video>`` markup.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the file handle even if reading fails.
    with open(videopath, 'rb') as video_file:
        mp4 = video_file.read()
    base64_encoded_mp4 = b64encode(mp4).decode()
    html_code = f'<video width=400 controls><source src="data:video/mp4;' \
                f'base64,{base64_encoded_mp4}" type="video/mp4"></video>'
    return HTML(html_code)

## 2. Imports e Definições Usadas pelo DQN

Código adaptado do código explicado no livro de M. Lapan, cap. 6.

### Imports

In [5]:
import dqn_models
from atari_wrappers import *

import argparse
import time
from datetime import datetime
import numpy as np
import collections

import torch
import torch.nn as nn
import torch.optim as optim

from tensorboardX import SummaryWriter

clear_output()

### Classes Auxiliares

In [23]:
# One environment transition, in the field order used by the replay buffer:
# (state, action, reward, done, new_state).
Experience = collections.namedtuple('Experience', field_names=['state', 'action', 'reward', 'done', 'new_state'])


class DQNExperienceBuffer:
    """Bounded store of transitions with uniform sampling without replacement.

    Items live in a ``collections.deque`` created with ``maxlen=capacity``,
    so once ``capacity`` items are held every new one evicts the oldest.
    """

    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

    def append(self, s1, a, r, done, s2):
        """Store the transition ``(s1, a, r, done, s2)`` as an ``Experience``."""
        self.buffer.append(Experience(s1, a, r, done, s2))

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct stored transitions at random.

        Returns:
            A 5-tuple of NumPy arrays ``(states, actions, rewards, dones,
            next_states)``; ``rewards`` is ``float32`` and ``dones`` is
            ``uint8``, the other arrays keep the dtype NumPy infers.
        """
        # Distinct positions in the deque (replace=False).
        chosen = np.random.choice(len(self.buffer), batch_size, replace=False)
        transitions = [self.buffer[position] for position in chosen]

        # Turn the list of 5-field records into five per-field sequences.
        states, actions, rewards, dones, next_states = zip(*transitions)

        batch = (
            np.array(states),
            np.array(actions),
            np.array(rewards, dtype=np.float32),
            np.array(dones, dtype=np.uint8),
            np.array(next_states),
        )
        return batch


class DQNAgent:
    """Epsilon-greedy actor that steps an environment and fills a replay buffer.

    The environment is used through ``reset()`` (returning the observation),
    ``step(action)`` (returning a 4-tuple ``(obs, reward, done, info)``) and
    ``action_space.sample()``. The buffer is used through
    ``append(state, action, reward, done, new_state)``.
    """

    def __init__(self, env, exp_buffer):
        self.env = env
        self.exp_buffer = exp_buffer
        self.state = self.env.reset()
        # Reward accumulated in the episode currently being played.
        self.total_reward = 0.0

    def play_step(self, net, epsilon=0.0, device="cpu"):
        """Play one environment step and record the transition.

        Args:
            net: Callable mapping a ``float32`` batch of states to Q-values
                with one column per action.
            epsilon: Probability of taking a random action instead of the
                greedy one.
            device: Device the state tensor is moved to before calling ``net``.

        Returns:
            The total reward of the episode if this step ended it (the
            environment is then reset), otherwise ``None``.
        """
        done_reward = None

        if np.random.random() < epsilon:
            action = self.env.action_space.sample()
        else:
            # np.asarray instead of np.array(..., copy=False): wrapping the
            # state in a list always needs a copy, and NumPy 2 raises for
            # copy=False in that case.
            state_a = np.asarray([self.state])
            state_v = torch.tensor(state_a, dtype=torch.float32).to(device)
            # Action selection needs no gradients; skip building the graph.
            with torch.no_grad():
                q_vals_v = net(state_v)
            _, act_v = torch.max(q_vals_v, dim=1)
            action = int(act_v.item())

        # do step in the environment
        new_state, reward, is_done, _ = self.env.step(action)
        self.total_reward += reward

        self.exp_buffer.append(self.state, action, reward, is_done, new_state)
        self.state = new_state
        if is_done:
            # Episode finished: report its return and start a new one.
            done_reward = self.total_reward
            self.state = self.env.reset()
            self.total_reward = 0.0
        return done_reward

### Função de Perda (*Loss Function*)

In [25]:
def calc_loss(batch, net, tgt_net, gamma, device="cpu"):
    """Compute the one-step TD mean-squared-error loss for a batch.

    The prediction is ``net(states)`` taken at the actions that were played;
    the target is ``rewards + gamma * max_a tgt_net(next_states)``, with the
    bootstrap term zeroed for transitions flagged as done. No gradient flows
    through the target.

    Args:
        batch: 5-tuple ``(states, actions, rewards, dones, next_states)`` of
            array-likes with one entry per transition.
        net: Network being trained; maps states to per-action Q-values.
        tgt_net: Network used to evaluate the next states.
        gamma: Discount factor applied to the next-state values.
        device: Device on which the tensors are placed.

    Returns:
        A scalar tensor holding the MSE between predictions and targets.
    """
    states, actions, rewards, dones, next_states = batch

    states_v = torch.tensor(states, dtype=torch.float32).to(device)
    next_states_v = torch.tensor(next_states, dtype=torch.float32).to(device)
    actions_v = torch.tensor(actions, dtype=torch.int64).to(device)
    # Pin float32 so a float64 rewards array cannot promote the target to a
    # dtype different from the network output.
    rewards_v = torch.tensor(rewards, dtype=torch.float32).to(device)
    done_mask = torch.tensor(dones, dtype=torch.bool).to(device)

    # Q(s, a) for the action actually taken in each transition.
    state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)

    # The target is a constant for the optimiser: evaluate it without
    # recording an autograd graph.
    with torch.no_grad():
        next_state_values = tgt_net(next_states_v).max(1)[0]
        # Terminal transitions have no future value to bootstrap from.
        next_state_values = next_state_values.masked_fill(done_mask, 0.0)

    expected_state_action_values = next_state_values * gamma + rewards_v
    return nn.functional.mse_loss(state_action_values, expected_state_action_values)

## 2. Função Principal do DQN

Esta é a função que faz o aprendizado. (Porém, o DQN é uma solução que inclui também as ideias dos wrappers.)

In [19]:
def DQN_TRAIN(env, env_name, qnet, qnet_lr, tgt_qnet, target_update_freq, gamma, replay_size, batch_size, epsilon_f, epsilon_decay_last_step, MEAN_REWARD_BOUND):
    """Train ``qnet`` with DQN until the mean episode reward passes a bound.

    Each loop iteration plays one environment step with an epsilon-greedy
    policy. Once the replay buffer holds ``replay_size`` transitions, every
    iteration also performs one Adam step on a sampled batch. Per-episode
    statistics are printed and written to a ``SummaryWriter``; the weights of
    ``qnet`` are saved whenever the 100-episode mean reward improves.

    Args:
        env: Environment to interact with.
        env_name: Name used in the writer comment and checkpoint file name.
        qnet: Network being trained (moved to CUDA when available).
        qnet_lr: Learning rate for the Adam optimiser.
        tgt_qnet: Target network, overwritten with ``qnet``'s weights
            whenever the frame index is a multiple of ``target_update_freq``.
        target_update_freq: Number of frames between target-network syncs.
        gamma: Discount factor passed to ``calc_loss``.
        replay_size: Replay buffer capacity; also the fill level required
            before optimisation starts.
        batch_size: Number of transitions sampled per optimisation step.
        epsilon_f: Lower bound for epsilon.
        epsilon_decay_last_step: Frame at which the linear decay of epsilon
            from 1.0 would reach 0 (epsilon is clamped at ``epsilon_f``).
        MEAN_REWARD_BOUND: Training stops once the mean reward of the last
            100 episodes exceeds this value.

    Returns:
        None. The trained weights remain in ``qnet``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    qnet = qnet.to(device)
    tgt_qnet = tgt_qnet.to(device)

    writer = SummaryWriter(comment="-" + env_name)
    # try/finally so the writer is always closed, including when a long
    # training run is interrupted or fails midway.
    try:
        print(qnet)

        buffer = DQNExperienceBuffer(replay_size)
        agent = DQNAgent(env, buffer)

        optimizer = optim.Adam(qnet.parameters(), lr=qnet_lr)
        total_rewards = []
        frame_idx = 0
        # Frame index and wall-clock time at the end of the previous episode,
        # used to measure speed in frames per second.
        ts_frame = 0
        ts = time.time()
        best_mean_reward = None
        start_time_str = datetime.now().strftime("%Y-%m-%d,%H-%M-%S")

        while True:
            frame_idx += 1
            # Linear decay from 1.0, clamped at epsilon_f.
            epsilon = max(epsilon_f, 1.0 - frame_idx / epsilon_decay_last_step)

            reward = agent.play_step(qnet, epsilon, device=device)
            if reward is not None:
                # An episode just ended: log its statistics.
                total_rewards.append(reward)
                now = time.time()
                elapsed = now - ts
                if elapsed == 0:
                    # No measurable time elapsed; sentinel value kept as-is.
                    speed = float("-inf")
                else:
                    speed = (frame_idx - ts_frame) / elapsed
                ts_frame = frame_idx
                ts = now
                mean_reward = np.mean(total_rewards[-100:])
                print("%d: done %d games, mean reward %.3f, eps %.2f, speed %.2f f/s" % (
                    frame_idx, len(total_rewards), mean_reward, epsilon,
                    speed
                ))
                writer.add_scalar("epsilon", epsilon, frame_idx)
                writer.add_scalar("speed", speed, frame_idx)
                writer.add_scalar("reward_100", mean_reward, frame_idx)
                writer.add_scalar("reward", reward, frame_idx)
                if best_mean_reward is None or best_mean_reward < mean_reward:
                    torch.save(qnet.state_dict(), env_name + "-" + start_time_str + "-best.dat")
                    # IDEA: also save the iteration so training can be resumed
                    if best_mean_reward is not None:
                        print("Best mean reward updated %.3f -> %.3f, model saved" % (best_mean_reward, mean_reward))
                    best_mean_reward = mean_reward
                if mean_reward > MEAN_REWARD_BOUND:
                    print("Solved in %d frames!" % frame_idx)
                    break

            # Keep collecting experience until the buffer is full.
            if len(buffer) < replay_size:
                continue

            if frame_idx % target_update_freq == 0:
                tgt_qnet.load_state_dict(qnet.state_dict())

            optimizer.zero_grad()
            batch = buffer.sample(batch_size)
            loss_t = calc_loss(batch, qnet, tgt_qnet, gamma, device=device)
            loss_t.backward()
            optimizer.step()
    finally:
        writer.close()


## 3. Rodando em Ambientes Simples

In [20]:
# Environment id and the mean-reward threshold passed to DQN_TRAIN as
# MEAN_REWARD_BOUND (training stops once the 100-episode mean exceeds it).
ENV_NAME = "MountainCar-v0"
REWARD_BOUND = -110
# Alternative environment (disabled):
#ENV_NAME = "CartPole-v0"
#REWARD_BOUND = 200

GAMMA = 0.999              # discount factor
REPLAY_SIZE = 2000         # replay buffer capacity; training starts when it is full
BATCH_SIZE = 32            # transitions sampled per optimisation step
LEARNING_RATE = 0.001      # Adam learning rate
SYNC_TARGET_FRAMES = 250   # frames between target-network weight copies

# Epsilon decays linearly from 1.0 over this many frames, clamped at EPSILON_FINAL.
EPSILON_DECAY_LAST_FRAME = 80000
EPSILON_FINAL = 0.02

In [21]:
# NOTE(review): `gym` is never imported by name in this notebook; presumably
# it is brought in by `from atari_wrappers import *` -- TODO confirm.
env = gym.make(ENV_NAME)

# Online and target networks with identical architecture; the arguments look
# like (input size, hidden layer sizes, number of actions) -- verify against
# dqn_models.MLP.
qnet = dqn_models.MLP(env.observation_space.shape[0], [128,256], env.action_space.n)
qtarget = dqn_models.MLP(env.observation_space.shape[0], [128,256], env.action_space.n)

In [None]:
# Run training with the current `env`, `qnet`, `qtarget` and the
# hyperparameter globals; returns when the mean-reward bound is exceeded.
DQN_TRAIN(
    env = env, 
    env_name = ENV_NAME, 
    qnet = qnet,
    qnet_lr = LEARNING_RATE,
    tgt_qnet = qtarget, 
    target_update_freq = SYNC_TARGET_FRAMES,
    gamma = GAMMA, 
    replay_size = REPLAY_SIZE, 
    batch_size = BATCH_SIZE, 
    epsilon_f = EPSILON_FINAL, 
    epsilon_decay_last_step = EPSILON_DECAY_LAST_FRAME, 
    MEAN_REWARD_BOUND = REWARD_BOUND)

MLP(
  (layers): Sequential(
    (0): Linear(in_features=2, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=256, bias=True)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=3, bias=True)
  )
)
200: done 1 games, mean reward -200.000, eps 1.00, speed 11882.72 f/s
400: done 2 games, mean reward -200.000, eps 0.99, speed 12992.70 f/s
600: done 3 games, mean reward -200.000, eps 0.99, speed 18417.88 f/s
800: done 4 games, mean reward -200.000, eps 0.99, speed 17399.78 f/s
1000: done 5 games, mean reward -200.000, eps 0.99, speed 21983.88 f/s
1200: done 6 games, mean reward -200.000, eps 0.98, speed 15990.48 f/s
1400: done 7 games, mean reward -200.000, eps 0.98, speed 18938.90 f/s
1600: done 8 games, mean reward -200.000, eps 0.98, speed 16884.94 f/s
1800: done 9 games, mean reward -200.000, eps 0.98, speed 16850.68 f/s
2000: done 10 games, mean reward -200.000, eps 0.97, speed 17729.28 f/s
2200: done 11 games, mean reward -200.000, e

In [None]:
# Launch TensorBoard reading logs from the "runs" directory (presumably where
# SummaryWriter writes by default -- TODO confirm).
!tensorboard --logdir runs

In [None]:
# Runs a few tests with the model DETERMINISTICALLY and saves the video to a file
# (the 0.0 and 5 are presumably epsilon and number of episodes -- verify
# against dqn_models.test_Qpolicy).
video = VideoRecorder(env, "politica-treinada.mp4")
dqn_models.test_Qpolicy(env, qnet, 0.0, 5, render=False, videorec=video)

In [None]:
# Embed the recorded video in the notebook output.
render_mp4("politica-treinada.mp4")

## 4. Rodando no Jogo Pong (Atari)

In [None]:
# Hyperparameters for the Atari Pong run; they rebind the same globals used
# by the simple-environment section.
ENV_NAME = "PongNoFrameskip-v4"
REWARD_BOUND = 15 #19.5

GAMMA = 0.99
BATCH_SIZE = 32
REPLAY_SIZE = 10000
LEARNING_RATE = 3e-4 #1e-4
SYNC_TARGET_FRAMES = 1000

# Epsilon decays linearly from 1.0 over this many frames, clamped at EPSILON_FINAL.
EPSILON_DECAY_LAST_FRAME = 10**5
EPSILON_FINAL = 0.02

In [None]:
# Build the Pong environment and apply the preprocessing wrappers in order.
# NOTE(review): `gym` and the wrapper classes are presumably provided by
# `from atari_wrappers import *` -- TODO confirm.
env = gym.make(ENV_NAME)
env = MaxAndSkipEnv(env)
env = FireResetEnv(env)
env = ProcessFrame84(env)
env = ImageToPyTorch(env)
env = BufferWrapper(env, 4)

# The module is imported as `dqn_models`, and the training cell passes
# `qnet` / `qtarget`, so the Pong networks are bound to those names.
# NOTE(review): assumes `DQNNet` is defined in dqn_models -- TODO confirm.
qnet = dqn_models.DQNNet(env.observation_space.shape, env.action_space.n)
qtarget = dqn_models.DQNNet(env.observation_space.shape, env.action_space.n)

# Also exposed as `net` / `tgt_net`.
net, tgt_net = qnet, qtarget

In [None]:
# Run training with the current `env`, `qnet`, `qtarget` and the
# hyperparameter globals.
# NOTE(review): `qnet` / `qtarget` must be bound to networks that match this
# environment before running -- confirm the preceding cell defines them.
DQN_TRAIN(
    env = env, 
    env_name = ENV_NAME, 
    qnet = qnet,
    qnet_lr = LEARNING_RATE,
    tgt_qnet = qtarget, 
    target_update_freq = SYNC_TARGET_FRAMES,
    gamma = GAMMA, 
    replay_size = REPLAY_SIZE, 
    batch_size = BATCH_SIZE, 
    epsilon_f = EPSILON_FINAL, 
    epsilon_decay_last_step = EPSILON_DECAY_LAST_FRAME, 
    MEAN_REWARD_BOUND = REWARD_BOUND)

In [None]:
# Runs a few tests with the model DETERMINISTICALLY and saves the video to a file.
# Uses the `env` and `qnet` globals that the training cell was given, and the
# same `dqn_models.test_Qpolicy` call as the simple-environment section.
video = VideoRecorder(env, "politica-treinada.mp4")
dqn_models.test_Qpolicy(env, qnet, 0.0, 5, render=False, videorec=video)