# DQN implementation

https://www.datascienceassn.org/sites/default/files/Human-level%20Control%20Through%20Deep%20Reinforcement%20Learning.pdf

Useful sites:

https://github.com/openai/baselines/tree/master/baselines
https://github.com/DavidJanz/successor_uncertainties_atari
https://github.com/davidreiman/pytorch-atari-dqn
https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html

https://github.com/roclark/openai-gym-pytorch

cartpole

https://github.com/Rowing0914/TF_RL/blob/master/tf_rl/env/cartpole_pixel.py


In [30]:
from collections import deque

import torch
from torch import nn
from torch.optim import Adam
import torch.nn.functional as F

import random
import torchvision
import gym

from tqdm import tqdm

In [31]:
import sys
sys.path.append('../../')
sys.path.append('../../stochastic_control/neural_rl/')

import numpy as np
import stochastic_control
from atari_env_torch import make_atari


In [32]:
from IPython.display import clear_output

In [33]:
class DQN_Network(nn.Module):
    ''' A Deepmind-type DQN network.

    Three convolutional layers followed by two fully connected layers.
    The input must have 4 channels; the first linear layer expects
    3136 = 64*7*7 features, which is what the convolutions produce for
    84x84 inputs.

    Args:
        action_size: number of network outputs (one value per action).
    '''
    def __init__(self,action_size=6):
        super(DQN_Network, self).__init__()
        # Keep the constructor arguments so clone() can rebuild the network.
        self._args = (action_size,)
        self.dqn_model = nn.Sequential(
             nn.Conv2d(4, 32, 8, 4),
             nn.ReLU(),
             nn.Conv2d(32, 64, 4, 2),
             nn.ReLU(),
             nn.Conv2d(64, 64, 3, 1),
             nn.ReLU(),
             nn.Flatten(),
             nn.Linear(3136,1024),
             nn.ReLU(),
             nn.Linear(1024, action_size) )

    def forward(self, x):
        ''' Return the network output for a batch of inputs x.
        '''
        return self.dqn_model(x)

    def clone(self):
        ''' Return a new, independent network with the same weights.

        The copy is built with type(self) rather than a hard-coded class
        name, so subclasses clone to their own type.
        '''
        clone = type(self)(*self._args)
        clone.load_state_dict(self.state_dict())
        return clone

In [148]:
''' to do:

Need to add target parameters etc. to DQN below
epsilon decay
check everything
'''

class DQN():
    ''' Deep Q-learning agent with a replay memory, a target network and
    epsilon-greedy exploration.

    Args:
        discount: discount factor applied to the bootstrapped next-state value.
        action_size: number of discrete actions (used for random exploration).
        lr: learning rate passed to Adam.
        neural_network: the Q-network. It is called on batches of states and
            must provide clone(), parameters(), state_dict() and
            load_state_dict().
        batch_size: number of transitions sampled per learning step.
        memory_size: capacity of the replay memory (a bounded deque).
        target_steps: number of update() calls between target-network syncs.
        epsilon_steps: epsilon is lowered by 1/epsilon_steps per update()
            once the memory is full.
        epsilon_final: lower bound for epsilon.
        epsilon_start: initial epsilon.
    '''
    def __init__(self,
                 discount,
                 action_size,
                 lr,
                 neural_network,
                 batch_size,
                 memory_size,
                 target_steps,
                 epsilon_steps,
                 epsilon_final,
                 epsilon_start=1.):

        # MDP parameters
        self.disc = discount
        self.action_size = action_size

        # Network optimization
        self.q_fn = neural_network
        self.lr = lr
        self.q_fn_target = self.q_fn.clone()
        self.optimizer = Adam(self.q_fn.parameters(), lr=self.lr)
        self.batch_size = batch_size

        # Training parameters
        self.memory = deque(maxlen=memory_size)
        self.eps = epsilon_start
        self.eps_final = epsilon_final
        self.epsilon_steps = epsilon_steps
        self.target_steps = target_steps

        # Internal parameters
        self.target_counter = 0
        self.memory_full = False

    def update(self,sarsd):
        ''' Store one transition and advance the training schedule.

        sarsd is a (state, action, reward, next_state, done) tuple.
        Learning and epsilon decay only start once the memory is full.
        '''
        # 1. memorize sarsd
        self.memorize(sarsd)

        # 2. learn a batch once the memory has reached its own capacity
        #    (read from the deque, not from a module-level global)
        if len(self.memory) == self.memory.maxlen:
            self.learn(self.batch(self.batch_size))
            if not self.memory_full:
                self.memory_full = True
                print('Memory now full!')

        # 3. update counter and epsilon (once memory full)
        self.target_counter += 1
        if self.memory_full:
            self._eps_update()

        # 4. sync the target network every target_steps updates
        if self.target_counter >= self.target_steps:
            self._target_update()

    def policy(self, state, epsilon=None):
        ''' Return an epsilon-greedy action index for a single state.

        Uses self.eps unless an explicit epsilon is given.
        '''
        eps = self.eps if epsilon is None else epsilon
        if np.random.rand() < eps:
            action = np.random.randint(self.action_size)
        else:
            # Action selection needs no gradients.
            with torch.no_grad():
                action = self.q_fn(state).max(1)[1].item()
        return action

    def learn(self,batch):
        ''' Take one optimizer step on the loss of a collated batch.
        '''
        self.optimizer.zero_grad()
        loss = self.loss(batch)
        loss.backward()
        self.optimizer.step()

    def save(self, path='./checkpoints/tensor.pt'):
        ''' Write the Q-network weights and current epsilon to path.
        '''
        import os
        # torch.save does not create missing directories.
        os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
        torch.save({
            'q_fn': self.q_fn.state_dict(),
            'eps': self.eps
            }, path)

    def load(self, path='./checkpoints/tensor.pt'):
        ''' Restore the state written by save() and re-sync the target network.
        '''
        checkpoint = torch.load(path)
        self.q_fn.load_state_dict(checkpoint['q_fn'])
        self.eps = checkpoint['eps']
        self._target_update()

    def memorize(self, sarsd):
        ''' Append a transition; the bounded deque drops the oldest when full.
        '''
        self.memory.append(sarsd)

    def batch(self,batch_size):
        ''' Sample batch_size distinct transitions and collate them.

        Returns (states, actions, rewards, next_states, dones). States are
        concatenated with torch.cat, so each stored state must already be a
        tensor with a leading batch dimension. Actions are a LongTensor of
        shape (batch_size, 1); rewards and dones are 1-D FloatTensors.
        '''
        idx=np.random.choice(len(self.memory), batch_size, replace=False)
        samples = [self.memory[i] for i in idx]
        s_batch = torch.cat([t[0] for t in samples])
        a_batch = torch.LongTensor([[t[1]] for t in samples])
        r_batch = torch.FloatTensor([t[2] for t in samples])
        ns_batch = torch.cat([t[3] for t in samples])
        d_batch = torch.FloatTensor([t[4] for t in samples])
        return (s_batch, a_batch, r_batch, ns_batch, d_batch)

    def _eps_update(self):
        ''' Lower epsilon by 1/epsilon_steps, never below eps_final.
        '''
        self.eps = max(self.eps - 1/self.epsilon_steps,self.eps_final)

    def loss(self,batch):
        ''' Smooth-L1 loss between Q(s, a) and the one-step target
        r + (1 - d) * discount * max_a' Q_target(ns, a').
        '''
        s, a, r, ns, d = batch
        with torch.no_grad():
            next_value = self.q_fn_target(ns).max(1)[0]
        target = r + (1-d) * self.disc * next_value
        # squeeze(1) keeps the batch dimension even for a batch of one.
        prediction = self.q_fn(s).gather(1,a).squeeze(1)
        return F.smooth_l1_loss(prediction, target)

    def _target_update(self):
        ''' Copy the online weights into the target network and reset the counter.
        '''
        self.q_fn_target.load_state_dict(self.q_fn.state_dict())
        self.target_counter = 0
        

        

## Atari Example

In [71]:
# Build the Pong environment with make_atari, imported from atari_env_torch.
env = make_atari('PongNoFrameskip-v4')

In [72]:
# --- MDP ---
discount = 0.99
action_size = env.action_space.n

# --- Q-network and optimiser ---
dqn_net = DQN_Network()
lr = 5e-5
batch_size = 32

# --- Replay memory, target sync and exploration schedule ---
target_steps = 10 ** 4
memory_size = 10 ** 6
epsilon_steps = 10 ** 6
epsilon_final = 0.1

# Length of the training run
total_steps = 5 * 10 ** 7

In [17]:
# --- MDP ---
discount = 0.99
action_size = env.action_space.n

# --- Q-network and optimiser ---
dqn_net = DQN_Network()
lr = 5e-5
batch_size = 32

# --- Replay memory, target sync and exploration schedule ---
target_steps = 10 ** 4
memory_size = 10 ** 4
epsilon_steps = 10 ** 5
epsilon_final = 0.05

# Length of the training run
total_steps = 10 ** 6

In [18]:
# Fresh Q-network built with DQN_Network's default action_size.
dqn_net = DQN_Network()

In [19]:
# Assemble the agent; keywords make the argument order explicit.
dqn = DQN(
    discount=discount,
    action_size=action_size,
    lr=lr,
    neural_network=dqn_net,
    batch_size=batch_size,
    memory_size=memory_size,
    target_steps=target_steps,
    epsilon_steps=epsilon_steps,
    epsilon_final=epsilon_final,
)

In [87]:
# Training loop
s = env.reset()
for _ in tqdm(range(total_steps)):
    a = dqn.policy(s)
    ns, r, d, _ = env.step(a)
    sarsd = (s,a,r,ns,float(d))
    dqn.update(sarsd)
    if d:
        s = env.reset()
    else:
        s = ns  

 10%|▉         | 984/10000 [00:01<00:12, 717.38it/s]

Memory now full!


100%|██████████| 10000/10000 [13:44<00:00, 12.13it/s]


## CartPole Easy Example

In [123]:
# Switch to gym's CartPole-v0 environment for the experiments below.
env = gym.make('CartPole-v0')

In [124]:
class CartPole_Net(nn.Module):
    ''' A CartPole network.

    A fully connected network with three hidden layers of 32 units that
    maps a 4-dimensional input to one value per action.

    Args:
        action_size: number of network outputs (one value per action).
    '''
    def __init__(self,action_size=2):
        super(CartPole_Net, self).__init__()
        # Keep the constructor arguments so clone() can rebuild the network.
        self._args = (action_size,)
        self.dqn_model = nn.Sequential(
             nn.Linear(4, 32),
             nn.ReLU(),
             nn.Linear(32, 32),
             nn.ReLU(),
             nn.Linear(32, 32),
             nn.ReLU(),
             nn.Linear(32, action_size) )

    def forward(self, x):
        ''' Return the network output for a batch of inputs x.
        '''
        return self.dqn_model(x)

    def clone(self):
        ''' Return a new, independent network with the same weights.

        The copy is built with type(self) rather than a hard-coded class
        name, so subclasses clone to their own type.
        '''
        clone = type(self)(*self._args)
        clone.load_state_dict(self.state_dict())
        return clone

In [125]:
def cartpole_sarsd(s,a,r,ns,d):
    ''' Normalise one CartPole transition.

    numpy states become float tensors with a leading batch dimension
    (anything else is passed through untouched), the done flag becomes a
    float, and that float is subtracted from the reward.
    '''
    def _as_row_tensor(obs):
        if isinstance(obs, np.ndarray):
            return torch.from_numpy(obs).float().unsqueeze(0)
        return obs

    done = float(d)
    return (_as_row_tensor(s), a, r - done, _as_row_tensor(ns), done)

In [126]:
# --- MDP ---
discount = 0.99
action_size = env.action_space.n

# --- Q-network and optimiser ---
dqn_net = CartPole_Net()
lr = 1e-2
batch_size = 32

# --- Replay memory, target sync and exploration schedule ---
target_steps = 10 ** 3
memory_size = 10 ** 4
epsilon_steps = 10 ** 4
epsilon_final = 0.1

# Length of the training run
total_steps = 10 ** 4

In [127]:
# Assemble the CartPole agent; keywords make the argument order explicit.
dqn = DQN(
    discount=discount,
    action_size=action_size,
    lr=lr,
    neural_network=dqn_net,
    batch_size=batch_size,
    memory_size=memory_size,
    target_steps=target_steps,
    epsilon_steps=epsilon_steps,
    epsilon_final=epsilon_final,
)

In [128]:
# Training loop
s = torch.tensor(env.reset()).float().unsqueeze(0)
for _ in tqdm(range(total_steps)):
    a = dqn.policy(s)
    ns, r, d, _ = env.step(a)
    sarsd = (s,a,r,ns,d) = cartpole_sarsd(s,a,r,ns,d)
    dqn.update(sarsd)
    if d:
        s = torch.tensor(env.reset()).float().unsqueeze(0)
    else:
        s = ns

100%|██████████| 10000/10000 [00:00<00:00, 22587.38it/s]

Memory now full!





In [86]:
# Play one rendered episode with the current policy, printing the
# per-step value r - d.
s = torch.tensor(env.reset()).float().unsqueeze(0)
d = False
while not d:
    a = dqn.policy(s)
    ns, r, d, _ = env.step(a)
    print(r-d)
    env.render()
    # Advance to the observed next state; without this the policy would be
    # queried on the initial state at every step.
    s = torch.tensor(ns).float().unsqueeze(0)

1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0


## Learning Rewards and Parameter Tuning

In [149]:
def loss_reward(self,batch):
    ''' Smooth-L1 loss for regressing Q(s, a) onto r - d alone.

    There is no bootstrapping: the next-state entry of the batch is
    ignored. self only needs to provide q_fn.
    '''
    states, actions, rewards, _, dones = batch
    q_taken = self.q_fn(states).gather(1, actions).squeeze()
    return F.smooth_l1_loss(q_taken, rewards - dones)

In [150]:
# --- MDP ---
discount = 0.99
action_size = env.action_space.n

# --- Q-network and optimiser ---
dqn_net = CartPole_Net()
lr = 1e-2
batch_size = 32

# Length of the data-collection run
total_steps = 10 ** 4

# --- Schedule: target sync interval and memory capacity both equal the
# run length; epsilon_final = 1. keeps epsilon at its starting value ---
target_steps = memory_size = total_steps
epsilon_steps = batch_size
epsilon_final = 1.

#total_steps = int(1e5)

In [151]:
# Assemble the agent for the reward-learning experiment; keywords make
# the argument order explicit.
dqn = DQN(
    discount=discount,
    action_size=action_size,
    lr=lr,
    neural_network=dqn_net,
    batch_size=batch_size,
    memory_size=memory_size,
    target_steps=target_steps,
    epsilon_steps=epsilon_steps,
    epsilon_final=epsilon_final,
)

In [152]:
# Training loop
s = torch.tensor(env.reset()).float().unsqueeze(0)
for _ in tqdm(range(total_steps)):
    a = dqn.policy(s)
    ns, r, d, _ = env.step(a)
    sarsd = (s,a,r,ns,d) = cartpole_sarsd(s,a,r,ns,d)
    dqn.memorize(sarsd)
    if d:
        s = torch.tensor(env.reset()).float().unsqueeze(0)
    else:
        s = ns

100%|██████████| 10000/10000 [00:01<00:00, 9736.91it/s]


In [153]:
# Number of transitions currently held in the replay memory.
len(dqn.memory)

10000

In [154]:
# One learning step on a freshly sampled batch of 32 transitions.
# (The loop header below is disabled, so this runs once.)
#for _ in range(1000):
batch = dqn.batch(32)
dqn.learn(batch)

In [111]:
# One manual optimizer step using the reward-only loss instead of DQN.loss.
dqn.optimizer.zero_grad()
# DQN exposes its sampler as `batch`; there is no `_batch` method.
batch = dqn.batch(batch_size)
loss = loss_reward(dqn,batch)
loss.backward()
dqn.optimizer.step()

In [158]:
# Collate every stored transition (sampled without replacement) into one batch.
training_data = dqn.batch(len(dqn.memory))

## you are here!

In [160]:
# Second element of the tuple returned by DQN.batch: the action tensor.
training_data[1]

tensor([[0],
        [1],
        [0],
        ...,
        [0],
        [1],
        [1]])

In [93]:
# Training loop
s = torch.tensor(env.reset()).float().unsqueeze(0)
for _ in tqdm(range(total_steps)):
    a = np.random.randint(2)
    ns, r, d, _ = env.step(a)
    sarsd = (s,a,r,ns,d) = cartpole_sarsd(s,a,r,ns,d)
    dqn.learn(sarsd)
    if d:
        s = torch.tensor(env.reset()).float().unsqueeze(0)
    else:
        s = ns

  0%|          | 0/100000 [00:00<?, ?it/s]


TypeError: gather() received an invalid combination of arguments - got (int, int), but expected one of:
 * (name dim, Tensor index, *, bool sparse_grad)
 * (int dim, Tensor index, *, bool sparse_grad)


# Debug env 

## Debugging

### Define Neural network