# References
- [A Deeper Look at Experience Replay](https://arxiv.org/abs/1712.01275)
- [Playing Atari with Deep Reinforcement Learning](https://arxiv.org/abs/1312.5602)
- [Human Level Control Through Deep Reinforcement Learning](https://deepmind.com/research/publications/human-level-control-through-deep-reinforcement-learning)
- [Double Q-Learning](https://papers.nips.cc/paper/2010/file/091d584fced301b442654dd8c23b3fc9-Paper.pdf)
- [Deep Reinforcement Learning with Double Q-learning](https://arxiv.org/abs/1509.06461)
- [Addressing Function Approximation Error in Actor-Critic Methods](https://arxiv.org/abs/1802.09477)
- [Rainbow: Combining Improvements in Deep Reinforcement Learning](https://arxiv.org/abs/1710.02298)
- [PyTorch Reinforcement Learning (DQN) Tutorial](https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html)

# Import

In [4]:
import time
import random
import math
from collections import namedtuple, deque
from itertools import count, product
import numpy as np
import matplotlib.pyplot as plt

import gym
from gym import logger
logger.set_level(gym.logger.DISABLED)
from replay_buffer import ReplayBuffer

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

# CartPole Environment

In [5]:
env = gym.make('CartPole-v1')

# Replay Buffer

In [6]:
Transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state', 'done'))

class ReplayBuffer:
    def __init__(self, buffer_size):
        self.buffer_size = int(buffer_size)
        self.buffer = []
        self.index = 0
    
    def __len__(self):
        return len(self.buffer)
    
    def add(self, state, action, reward, next_state, done):
        if len(self.buffer) < self.buffer_size:
            self.buffer.append(None)
        self.buffer[self.index] = Transition(state, action, reward, next_state, done)
        self.index = (self.index + 1) % self.buffer_size
    
    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

# Neural Network

In [25]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class DQN(nn.Module):
    def __init__(self):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(4, 16)
        self.fc2 = nn.Linear(16, 16)
        self.fc3 = nn.Linear(16, 2)
    
    def forward(self, x):
        x = x.to(device)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


# DQN Agents

In [28]:
class Agent:
    def __init__(self, config, nn):
        self.gamma = config["gamma"]
        self.batch_size = config["batch_size"]
        self.replay_buffer = ReplayBuffer(config["buffer_size"])
        self.n_gradient_steps = config["n_gradient_steps"]
        self.n_actions = config["n_actions"]
        self.yield_epsilon = self.epsilon_generator(config["epsilon_max"], config["epsilon_min"], config["epsilon_decay"])
        self.epsilon = config["epsilon_max"]
        self.nn = nn
        self.criterion = torch.nn.MSELoss()
        self.optimizer = optim.Adam(self.nn.parameters(), lr=config["learning_rate"])
    
    def epsilon_generator(self, epsilon_max, epsilon_min, epsilon_decay):
        step = 0
        while True:
            epsilon = epsilon_min + (epsilon_max - epsilon_min) * math.exp(-1. * step / epsilon_decay)
            yield epsilon
            step += 1
    
    def epsilon_greedy_action(self, state):
        self.epsilon = next(self.yield_epsilon)
        if random.random() > self.epsilon:
            with torch.no_grad():
                return torch.argmax(self.nn(torch.Tensor(state))).item()
        else:
            return torch.tensor(random.randrange(self.n_actions), device=device, dtype=torch.long).item()
    
    def greedy_action(self, state):
        with torch.no_grad():
            return torch.argmax(self.nn(torch.Tensor(state))).item()
    
    def gradient_step(self):
        if len(self.replay_buffer) < self.batch_size:
            return
        
        transitions = self.replay_buffer.sample(self.batch_size)
        batch = Transition(*zip(*transitions))
        
        state_batch = torch.stack(batch.state)
        action_batch = torch.stack(batch.action)
        reward_batch = torch.stack(batch.reward)
        next_state_batch = torch.stack(batch.next_state)
        done_batch = torch.stack(batch.done)
        
        state_action_values = self.nn(state_batch).gather(1, action_batch.unsqueeze(1))
        next_state_action_values = self.nn(next_state_batch).max(1)[0].detach()
        expected_state_action_values = reward_batch + self.gamma * next_state_action_values * (1-done_batch)
        
        loss = self.criterion(state_action_values, expected_state_action_values.unsqueeze(1))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
    
    def train(self, env, n_episodes):
        for i_episode in range(1, n_episodes+1):
            episode_return = 0
            state = env.reset()
            for t in count():
                action = self.epsilon_greedy_action(state)
                next_state, reward, done, _ = env.step(action)
                episode_return += reward
                
                self.replay_buffer.add(torch.Tensor(state), torch.tensor(action, dtype=torch.long), torch.tensor(reward, dtype=torch.float), torch.Tensor(next_state), torch.tensor(done, dtype=torch.long))
                state = next_state
                
                for _ in range(self.n_gradient_steps):
                    self.gradient_step()
                
                if done:
                    print("Episode {:4d} : {:4d} steps | epsilon = {:4.2f} | return = {:.1f}".format(i_episode, t+1, self.epsilon, episode_return))
                    break
    
    def test(self, env, step_max):
        episode_return = 0
        state = env.reset()
        for t in count():
            env.render()
            action = self.greedy_action(state)
            next_state, reward, done, _ = env.step(action)
            episode_return += reward
            
            state = next_state

            if done or t+1 >= step_max:
                env.close()
                return episode_return
    
    def save_model(self):
        return

config = {"gamma": 0.95,
          "batch_size": 128,
          "buffer_size": 1e6,
          "n_gradient_steps": 8,
          "n_actions": 2,
          "learning_rate": 0.001,
          "epsilon_max": 1.,
          "epsilon_min": 0.1,
          "epsilon_decay": 2000}

dqn = DQN()
agent = Agent(config, dqn)

In [33]:
agent.train(env, 300)

Episode    1 :   49 steps | epsilon = 0.16 | return = 49.0
Episode    2 :  500 steps | epsilon = 0.15 | return = 500.0


KeyboardInterrupt: 

In [34]:
agent.test(env, 1000)

220.0

# CartPole Swing Up Environment

In [5]:
class CartPoleSwingUp(gym.Wrapper):
    def __init__(self, env, **kwargs):
        super(CartPoleSwingUp, self).__init__(env, **kwargs)
        self.theta_dot_threshold = 4*np.pi

    def reset(self):
        self.env.env.state = [0, 0, np.pi, 0] + super().reset()
        self.env.env.steps_beyond_done = None
        return np.array(self.env.env.state)

    def step(self, action):
        state, reward, done, _ = super().step(action)
        x, x_dot, theta, theta_dot = state
        
        done = x < -self.x_threshold \
               or x > self.x_threshold \
               or theta_dot < -self.theta_dot_threshold \
               or theta_dot > self.theta_dot_threshold
        
        if done:
            # game over
            reward = -10.
            if self.steps_beyond_done is None:
                self.steps_beyond_done = 0
            else:
                logger.warn("You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.")
                self.steps_beyond_done += 1
        else:
            if -self.theta_threshold_radians < theta and theta < self.theta_threshold_radians:
                # pole upright
                reward = 1.
            else:
                # pole swinging
                reward = 0.

        return np.array(self.state), reward, done, {}

env = CartPoleSwingUp(gym.make('CartPole-v1'))

In [6]:
config = {"gamma": 0.99,
          "batch_size": 128,
          "buffer_size": 1e6,
          "n_gradient_steps": 8,
          "n_actions": 2,
          "learning_rate": 0.001,
          "epsilon_max": 1.,
          "epsilon_min": 0.01,
          "epsilon_decay": 100000}

agent = Agent(config, dqn)

In [None]:
agent.train(env, 1000)

  self.replay_buffer.add(torch.Tensor(state), torch.tensor(action, dtype=torch.long), torch.tensor(reward, dtype=torch.float), torch.Tensor(next_state), torch.tensor(done, dtype=torch.long))


Episode    1 :  245 steps | epsilon = 1.00 | return = -10.0
Episode    2 :  120 steps | epsilon = 1.00 | return = -10.0
Episode    3 :  183 steps | epsilon = 0.99 | return = -10.0
Episode    4 :  173 steps | epsilon = 0.99 | return = -10.0
Episode    5 :  128 steps | epsilon = 0.99 | return = -10.0
Episode    6 :  197 steps | epsilon = 0.99 | return = -10.0
Episode    7 :  450 steps | epsilon = 0.99 | return = -10.0
Episode    8 :  352 steps | epsilon = 0.98 | return = -10.0
Episode    9 :   93 steps | epsilon = 0.98 | return = -10.0
Episode   10 :  123 steps | epsilon = 0.98 | return = -10.0
Episode   11 :  323 steps | epsilon = 0.98 | return = -10.0
Episode   12 :  137 steps | epsilon = 0.98 | return = -10.0
Episode   13 :  135 steps | epsilon = 0.97 | return = -10.0
Episode   14 :  166 steps | epsilon = 0.97 | return = -10.0
Episode   15 :  293 steps | epsilon = 0.97 | return = -10.0
Episode   16 :  101 steps | epsilon = 0.97 | return = -10.0
Episode   17 :   72 steps | epsilon = 0.

Episode  138 :  165 steps | epsilon = 0.74 | return = -10.0
Episode  139 :  236 steps | epsilon = 0.74 | return = -10.0
Episode  140 :   63 steps | epsilon = 0.74 | return = -10.0
Episode  141 :  353 steps | epsilon = 0.74 | return = -10.0
Episode  142 : 1143 steps | epsilon = 0.73 | return = -3.0
Episode  143 :  115 steps | epsilon = 0.73 | return = -10.0
Episode  144 :  106 steps | epsilon = 0.73 | return = -10.0
Episode  145 :  131 steps | epsilon = 0.73 | return = -10.0
Episode  146 :  520 steps | epsilon = 0.73 | return = -10.0
Episode  147 :  351 steps | epsilon = 0.72 | return = -10.0
Episode  148 :  207 steps | epsilon = 0.72 | return = -10.0
Episode  149 :  262 steps | epsilon = 0.72 | return = -10.0
Episode  150 :  241 steps | epsilon = 0.72 | return = -10.0
Episode  151 :  635 steps | epsilon = 0.71 | return = -10.0
Episode  152 :  343 steps | epsilon = 0.71 | return = -10.0
Episode  153 :  262 steps | epsilon = 0.71 | return = -10.0
Episode  154 :  168 steps | epsilon = 0.7

Episode  276 : 2099 steps | epsilon = 0.37 | return = -6.0
Episode  277 : 1035 steps | epsilon = 0.36 | return = -6.0
Episode  278 :  689 steps | epsilon = 0.36 | return = -2.0
Episode  279 : 4218 steps | epsilon = 0.34 | return = 5.0
Episode  280 : 1728 steps | epsilon = 0.34 | return = -10.0
Episode  281 :  638 steps | epsilon = 0.34 | return = -4.0
Episode  282 : 1643 steps | epsilon = 0.33 | return = 4.0
Episode  283 : 5695 steps | epsilon = 0.31 | return = 68.0
Episode  284 : 1704 steps | epsilon = 0.31 | return = 0.0
Episode  285 : 4465 steps | epsilon = 0.30 | return = 7.0
Episode  286 :  722 steps | epsilon = 0.29 | return = -10.0
Episode  287 :  578 steps | epsilon = 0.29 | return = -10.0
Episode  288 : 4293 steps | epsilon = 0.28 | return = 13.0
Episode  289 :  655 steps | epsilon = 0.28 | return = -5.0
Episode  290 : 2271 steps | epsilon = 0.27 | return = 5.0
Episode  291 : 1015 steps | epsilon = 0.27 | return = -10.0
Episode  292 : 1270 steps | epsilon = 0.27 | return = 0.0

Episode  415 : 1864 steps | epsilon = 0.08 | return = 8.0
Episode  416 : 1408 steps | epsilon = 0.08 | return = -10.0
Episode  417 : 3166 steps | epsilon = 0.08 | return = 6.0
Episode  418 :  605 steps | epsilon = 0.08 | return = -1.0
Episode  419 : 1475 steps | epsilon = 0.07 | return = 6.0
Episode  420 : 4484 steps | epsilon = 0.07 | return = 16.0
Episode  421 :  548 steps | epsilon = 0.07 | return = -6.0
Episode  422 : 1040 steps | epsilon = 0.07 | return = -2.0
Episode  423 : 5540 steps | epsilon = 0.07 | return = -3.0
Episode  424 :  888 steps | epsilon = 0.07 | return = -10.0
Episode  425 : 1341 steps | epsilon = 0.07 | return = -10.0
Episode  426 :  860 steps | epsilon = 0.07 | return = -10.0
Episode  427 : 3490 steps | epsilon = 0.06 | return = -10.0
Episode  428 :  722 steps | epsilon = 0.06 | return = -4.0
Episode  429 : 3040 steps | epsilon = 0.06 | return = -10.0
Episode  430 : 3315 steps | epsilon = 0.06 | return = -10.0
Episode  431 : 1100 steps | epsilon = 0.06 | return 

In [1]:
agent.test(env, 1000)

NameError: name 'agent' is not defined