# DQN implementation

https://www.datascienceassn.org/sites/default/files/Human-level%20Control%20Through%20Deep%20Reinforcement%20Learning.pdf

useful sites

https://github.com/openai/baselines/tree/master/baselines
https://github.com/DavidJanz/successor_uncertainties_atari
https://github.com/davidreiman/pytorch-atari-dqn
https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html

https://github.com/roclark/openai-gym-pytorch

cartpole

https://github.com/Rowing0914/TF_RL/blob/master/tf_rl/env/cartpole_pixel.py


In [66]:
from collections import deque

import torch
from torch import nn
from torch.optim import Adam
import torch.nn.functional as F

import random
import torchvision
import gym

from tqdm import tqdm

In [67]:
import sys
sys.path.append('../../')
sys.path.append('../../stochastic_control/neural_rl/')

import numpy as np
import stochastic_control
from atari_env_torch import make_atari


In [68]:
from IPython.display import clear_output

In [69]:
class DQN_Network(nn.Module):
    """Convolutional Q-network in the style of the DeepMind DQN.

    Three conv + ReLU stages feed a flatten layer and a two-layer dense head
    that emits one value per action. The first dense layer expects 3136
    flattened features per sample.

    Args:
        action_size: number of outputs of the final linear layer.
    """

    def __init__(self, action_size=6):
        super().__init__()
        # Remember the constructor arguments so clone() can rebuild the net.
        self._args = (action_size,)

        feature_layers = [
            nn.Conv2d(4, 32, 8, 4),
            nn.ReLU(),
            nn.Conv2d(32, 64, 4, 2),
            nn.ReLU(),
            nn.Conv2d(64, 64, 3, 1),
            nn.ReLU(),
        ]
        head_layers = [
            nn.Linear(3136, 1024),
            nn.ReLU(),
            nn.Linear(1024, action_size),
        ]
        self.dqn_model = nn.Sequential(*feature_layers, nn.Flatten(), *head_layers)

    def forward(self, x):
        """Return the per-action outputs of the network for input ``x``."""
        return self.dqn_model(x)

    def clone(self):
        """Return a new DQN_Network with the same arguments and a copy of the weights."""
        twin = DQN_Network(*self._args)
        twin.load_state_dict(self.state_dict())
        return twin

In [70]:
''' to do:

Need to add target parameters etc.. to DQN below
epsilon decay
check everything
'''

class DQN():
    """Deep Q-learning agent with replay memory, a target network and
    an epsilon-greedy policy.

    Transitions are ``(state, action, reward, next_state, done)`` tuples.
    Learning only starts once the replay memory has reached its capacity;
    from then on every call to ``update`` performs one gradient step and one
    epsilon decay step.

    Args:
        discount: discount factor applied to the bootstrapped target.
        action_size: number of discrete actions (used for random actions).
        lr: learning rate of the Adam optimizer.
        neural_network: Q-network; must be an ``nn.Module`` exposing
            ``clone()`` (used to build the target network).
        batch_size: number of transitions sampled per learning step.
        memory_size: capacity of the replay memory.
        target_steps: the target network is refreshed once more than this
            many updates have happened since the last refresh.
        epsilon_steps: epsilon decreases by ``1 / epsilon_steps`` per update
            once the memory is full.
        epsilon_final: lower bound for epsilon.
        epsilon_start: initial epsilon.
    """

    def __init__(self,
                 discount,
                 action_size,
                 lr,
                 neural_network,
                 batch_size,
                 memory_size,
                 target_steps,
                 epsilon_steps,
                 epsilon_final,
                 epsilon_start=1.):

        # MDP parameters
        self.disc = discount
        self.action_size = action_size

        # Network optimization
        self.q_fn = neural_network
        self.lr = lr
        self.q_fn_target = self.q_fn.clone()
        self.optimizer = Adam(self.q_fn.parameters(), lr=self.lr)
        self.batch_size = batch_size

        # Training parameters
        self.memory = deque(maxlen=memory_size)
        self.eps = epsilon_start
        self.eps_final = epsilon_final
        self.epsilon_steps = epsilon_steps
        self.target_steps = target_steps

        # Internal parameters
        self.target_counter = 0
        self.memory_full = False

    def update(self, sarsd):
        """Store one transition and, once the memory is full, learn from a batch.

        Args:
            sarsd: ``(state, action, reward, next_state, done)`` tuple.
        """
        # 1. memorize sarsd
        self._memorize(sarsd)

        # 2. learn a batch once the replay memory has reached capacity.
        # The capacity is read from the deque itself; the original compared
        # against a module-level ``memory_size`` that only exists in the
        # notebook session.
        if len(self.memory) == self.memory.maxlen:
            self.learn(self.batch_size)
            if not self.memory_full:
                self.memory_full = True
                print('Memory now full!')

        # 3. update counter and epsilon (once memory full)
        self.target_counter += 1
        if self.memory_full:
            self._eps_update()

        # 4. refresh the target network when due (_target_update resets the counter)
        if self.target_counter > self.target_steps:
            self._target_update()

    def policy(self, state, epsilon=None):
        """Return an epsilon-greedy action for ``state``.

        Args:
            state: batched input accepted by the Q-network (one sample).
            epsilon: exploration probability; defaults to the agent's
                current epsilon when ``None``.
        """
        eps = self.eps if epsilon is None else epsilon
        if np.random.rand() < eps:
            return np.random.randint(self.action_size)
        # Action selection needs no gradients; skip building the autograd graph.
        with torch.no_grad():
            return self.q_fn(state).max(1)[1].item()

    def learn(self, batch_size):
        """Run one optimizer step on a batch sampled from the replay memory."""
        self.optimizer.zero_grad()
        batch = self._batch(batch_size)
        loss = self._loss(batch)
        loss.backward()
        self.optimizer.step()

    def save(self):
        """Write the Q-network weights and current epsilon to ./checkpoints/tensor.pt.

        The ``./checkpoints`` directory is not created here.
        """
        torch.save({
            'q_fn': self.q_fn.state_dict(),
            'eps': self.eps
            }, './checkpoints/tensor.pt')

    def load(self):
        """Placeholder: restoring a checkpoint is not implemented yet."""
        pass

    def _memorize(self, sarsd):
        """Append a transition; the deque drops the oldest one when full."""
        self.memory.append(sarsd)

    def _batch(self, batch_size):
        """Sample ``batch_size`` transitions (with replacement) as tensors.

        Returns:
            ``(states, actions, rewards, next_states, dones)`` where states
            are concatenated along dim 0 and actions have shape
            ``(batch_size, 1)`` so they can index ``gather``.
        """
        idx = np.random.choice(len(self.memory), batch_size)
        s_batch = torch.cat([self.memory[i][0] for i in idx])
        a_batch = torch.LongTensor([[self.memory[i][1]] for i in idx])
        r_batch = torch.FloatTensor([self.memory[i][2] for i in idx])
        ns_batch = torch.cat([self.memory[i][3] for i in idx])
        d_batch = torch.FloatTensor([self.memory[i][4] for i in idx])
        return (s_batch, a_batch, r_batch, ns_batch, d_batch)

    def _eps_update(self):
        """Decrease epsilon by ``1 / epsilon_steps``, floored at ``eps_final``."""
        self.eps = max(self.eps - 1/self.epsilon_steps, self.eps_final)

    def _loss(self, batch):
        """Return the smooth-L1 loss between Q(s, a) and the bootstrapped target."""
        s, a, r, ns, d = batch
        # The target is a constant w.r.t. the optimized parameters.
        with torch.no_grad():
            next_value = self.q_fn_target(ns).max(1)[0]
        target = r + (1 - d) * self.disc * next_value
        # squeeze(1) keeps the batch dimension even for a batch of one, so the
        # prediction always has the same shape as the target.
        prediction = self.q_fn(s).gather(1, a).squeeze(1)
        return F.smooth_l1_loss(prediction, target)

    def _target_update(self):
        """Replace the target network by a fresh clone and reset the counter."""
        self.q_fn_target = self.q_fn.clone()
        self.target_counter = 0
        

        

## Atari Example

In [71]:
# Build the Pong environment with make_atari (imported from the
# project-local atari_env_torch module).
env = make_atari('PongNoFrameskip-v4')

In [72]:
# MDP parameters
discount = 0.99                    # discount factor handed to DQN
action_size = env.action_space.n   # number of discrete actions of the env

# Network optimization
# Size the output layer from the environment; DQN_Network() alone falls back
# to its default of 6 actions, which is only right for some games.
dqn_net = DQN_Network(action_size)
lr = 5e-5
batch_size = 32

# Training parameters
target_steps = int(1e4)    # updates between target-network refreshes
memory_size = int(1e6)     # replay-memory capacity
epsilon_steps = int(1e6)   # epsilon decreases by 1/epsilon_steps per update
epsilon_final = 0.1

total_steps = int(5e7)     # environment steps in the training loop

In [84]:
# MDP parameters
discount = 0.99                    # discount factor handed to DQN
action_size = env.action_space.n   # number of discrete actions of the env

# Network optimization
# Size the output layer from the environment; DQN_Network() alone falls back
# to its default of 6 actions, which is only right for some games.
dqn_net = DQN_Network(action_size)
lr = 5e-5
batch_size = 32

# Training parameters (small values for a quick smoke run)
target_steps = int(1e3)    # updates between target-network refreshes
memory_size = int(1e3)     # replay-memory capacity
epsilon_steps = int(1e3)   # epsilon decreases by 1/epsilon_steps per update
epsilon_final = 0.1

total_steps = int(1e4)     # environment steps in the training loop

In [85]:
# Fresh network whose output width matches the environment's action count
# (DQN_Network() alone would silently default to 6 actions).
dqn_net = DQN_Network(action_size)

In [86]:
# Assemble the agent; keyword arguments mirror the DQN constructor signature.
dqn = DQN(
    discount=discount,
    action_size=action_size,
    lr=lr,
    neural_network=dqn_net,
    batch_size=batch_size,
    memory_size=memory_size,
    target_steps=target_steps,
    epsilon_steps=epsilon_steps,
    epsilon_final=epsilon_final,
)

In [87]:
# Training loop: act, store the transition, learn, and restart on episode end.
s = env.reset()
for _ in tqdm(range(total_steps)):
    a = dqn.policy(s)
    ns, r, d, _ = env.step(a)
    # `done` is stored as a float so it can be used arithmetically in the loss.
    sarsd = (s, a, r, ns, float(d))
    dqn.update(sarsd)
    # Start a new episode when the current one has finished.
    s = env.reset() if d else ns

 10%|▉         | 984/10000 [00:01<00:12, 717.38it/s]

Memory now full!


100%|██████████| 10000/10000 [13:44<00:00, 12.13it/s]


## CartPole Easy Example

In [None]:
# The registered Gym id is 'CartPole-v0'; 'Cartpol-v0' is not a valid environment id.
env = gym.make('CartPole-v0')

# Debug env 

In [52]:
dqn.save()

In [53]:
import os
print(os.getcwd())

/Users/mbbssnw3/Dropbox (The University of Manchester)/ACTIVE/COURSE--OptContFinance/code/stochastic_control/examples/6_Function_Approximation


In [None]:
self = dqn

In [202]:
s = env.reset().torch().unsqueeze(0)
d=False
a = dqn.policy(s)
print(a)
ns, r, d, _ = env.step(a)
sarsd = (s,a,r,ns.torch().unsqueeze(0),float(d))

5


In [13]:
if 1.:
    print('here')

here


In [64]:
x=5
y=6
f"Auto Hella Restart Report {x}"

'Auto Hella Restart Report 5'

In [65]:
"Auto Hella Restart Report {x}"

'Auto Hella Restart Report {x}'

In [205]:
# 1. memorize sarsd
self._memorize(sarsd)

In [206]:
self.learn(self.batch_size)

In [207]:
self.memory[0][4]

0.0

In [208]:
#print('1')

# 2. learn a batch
# Use the deque's own capacity: the notebook-global memory_size may have been
# redefined since `self` was constructed.
if len(self.memory) == self.memory.maxlen:
    self.learn(self.batch_size)
    if self.memory_full is not True:
        self.memory_full = True
        print('Memory now full!')

#print('2')

# 3. update epsilon if episode done
# NOTE(review): `s` is bound twice in this unpacking, so it ends up holding
# the next state.
s, a, r, s, d = sarsd
self._eps_update()
self.target_counter += 1

#print('3')


# 4. update target if update required
# The attribute set in DQN.__init__ is `target_steps`; `target_update` does
# not exist on the agent.
if self.target_counter > self.target_steps:
    self.target_counter = 0
    self._target_update()

In [209]:
batch_size = self.batch_size
idx=np.random.choice(len(self.memory), batch_size)
s_batch = torch.cat([self.memory[i][0] for i in idx])
a_batch = torch.LongTensor([[self.memory[i][1]] for i in idx])
r_batch = torch.FloatTensor([self.memory[i][2] for i in idx])
ns_batch = torch.cat([self.memory[i][3] for i in idx])
d_batch = torch.FloatTensor([self.memory[i][4] for i in idx])

In [210]:
np.random.choice(len(self.memory), batch_size)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [211]:
dqn_net(s)

tensor([[ 0.0028,  0.0199, -0.0323,  0.0039,  0.0250, -0.0079]],
       grad_fn=<AddmmBackward>)

In [212]:
state[0]

tensor([[[0.2039, 0.2039, 0.2039,  ..., 0.3412, 0.3412, 0.3412],
         [0.3412, 0.3412, 0.3412,  ..., 0.3412, 0.3412, 0.3412],
         [0.3412, 0.3412, 0.3412,  ..., 0.3412, 0.3412, 0.3412],
         ...,
         [0.9255, 0.9255, 0.9255,  ..., 0.9255, 0.9255, 0.9255],
         [0.9255, 0.9255, 0.9255,  ..., 0.9255, 0.9255, 0.9255],
         [0.9255, 0.9255, 0.9255,  ..., 0.9255, 0.9255, 0.9255]],

        [[0.2039, 0.2039, 0.2039,  ..., 0.3412, 0.3412, 0.3412],
         [0.3412, 0.3412, 0.3412,  ..., 0.3412, 0.3412, 0.3412],
         [0.3412, 0.3412, 0.3412,  ..., 0.3412, 0.3412, 0.3412],
         ...,
         [0.9255, 0.9255, 0.9255,  ..., 0.9255, 0.9255, 0.9255],
         [0.9255, 0.9255, 0.9255,  ..., 0.9255, 0.9255, 0.9255],
         [0.9255, 0.9255, 0.9255,  ..., 0.9255, 0.9255, 0.9255]],

        [[0.2039, 0.2039, 0.2039,  ..., 0.3412, 0.3412, 0.3412],
         [0.3412, 0.3412, 0.3412,  ..., 0.3412, 0.3412, 0.3412],
         [0.3412, 0.3412, 0.3412,  ..., 0.3412, 0.3412, 0.

In [213]:
state = s

In [214]:
self.q_fn(state).max(1)[1]

tensor([4])

In [215]:
self.policy(s,epsilon=0.)

4

In [216]:
# Roll out two episodes, feeding every transition to the agent.
for i in tqdm(range(2)):
    s = env.reset().torch().unsqueeze(0)
    d = False
    while not d:
        # Store the action that is actually executed. Previously a constant
        # 0 was recorded while env.step received dqn.policy(s), so the replay
        # memory paired rewards with the wrong action.
        a = dqn.policy(s)
        ns, r, d, _ = env.step(a)
        next_s = ns.torch().unsqueeze(0)
        sarsd = (s, a, r, next_s, float(d))
        dqn.update(sarsd)
        s = next_s

100%|██████████| 2/2 [00:02<00:00,  1.21s/it]


In [68]:
dqn.update(sarsd)

In [217]:
d

True

In [658]:
s,a,r,ns,d = dqn._batch(1)

In [661]:
ns[0]

tensor([[[0.2039, 0.2039, 0.2039,  ..., 0.3412, 0.3412, 0.3412],
         [0.3412, 0.3412, 0.3412,  ..., 0.3412, 0.3412, 0.3412],
         [0.3412, 0.3412, 0.3412,  ..., 0.3412, 0.3412, 0.3412],
         ...,
         [0.9255, 0.9255, 0.9255,  ..., 0.9255, 0.9255, 0.9255],
         [0.9255, 0.9255, 0.9255,  ..., 0.9255, 0.9255, 0.9255],
         [0.9255, 0.9255, 0.9255,  ..., 0.9255, 0.9255, 0.9255]],

        [[0.2039, 0.2039, 0.2039,  ..., 0.3412, 0.3412, 0.3412],
         [0.3412, 0.3412, 0.3412,  ..., 0.3412, 0.3412, 0.3412],
         [0.3412, 0.3412, 0.3412,  ..., 0.3412, 0.3412, 0.3412],
         ...,
         [0.9255, 0.9255, 0.9255,  ..., 0.9255, 0.9255, 0.9255],
         [0.9255, 0.9255, 0.9255,  ..., 0.9255, 0.9255, 0.9255],
         [0.9255, 0.9255, 0.9255,  ..., 0.9255, 0.9255, 0.9255]],

        [[0.2039, 0.2039, 0.2039,  ..., 0.3412, 0.3412, 0.3412],
         [0.3412, 0.3412, 0.3412,  ..., 0.3412, 0.3412, 0.3412],
         [0.3412, 0.3412, 0.3412,  ..., 0.3412, 0.3412, 0.

In [651]:
r+ (1-d)* self.disc * self.q_fn(ns).max(1)[0].detach()

tensor([0.0277, 0.0274])

In [670]:
s = ns.torch().unsqueeze(0)

In [None]:
s = env.reset().torch().unsqueeze(0)

In [195]:
self.q_fn(s).max(1)[1].item()

1

In [641]:
s, a, r, ns, d = dqn._batch(2)
target = r+ (1-d)* self.disc * self.q_fn(ns).max(1)[0].detach()
prediction = self.q_fn(s).gather(1,a)


In [587]:
self.q_fn(s).gather(1,a).squeeze()

tensor([0.0273, 0.0273, 0.0270, 0.0274], grad_fn=<SqueezeBackward0>)

In [588]:
r + (1-d)* self.disc * self.q_fn(ns).max(1)[0].detach()

tensor([0.0272, 0.0272, 0.0267, 0.0272])

In [604]:
self.q_fn(s).gather(1,a).squeeze()

tensor([-0.0155, -0.0155], grad_fn=<SqueezeBackward0>)

In [590]:
dqn._loss(batch)

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [525]:
a_ = torch.from_numpy(np.array([a[i] for i in range(len(a))]))

In [526]:

qs = self.q_fn(s)

In [304]:
states = torch.cat([self.memory[i][0] for i in idx])
next_states = torch.cat([self.memory[i][3] for i in idx])

In [315]:
s_batch = torch.cat([self.memory[i][0] for i in idx])
a_batch = [self.memory[i][1] for i in idx]
r_batch = [self.memory[i][2] for i in idx]
ns_batch = torch.cat([self.memory[i][3] for i in idx])
d_batch = [self.memory[i][4] for i in idx]

In [566]:
import math
x = torch.linspace(-math.pi, math.pi, 10)
y = torch.sin(x)

In [567]:
y

tensor([ 8.7423e-08, -6.4279e-01, -9.8481e-01, -8.6603e-01, -3.4202e-01,
         3.4202e-01,  8.6603e-01,  9.8481e-01,  6.4279e-01, -8.7423e-08])

In [302]:
torch.cat([self.memory[i][0] for i in idx])

tensor([[[[0.2039, 0.2039, 0.2039,  ..., 0.3412, 0.3412, 0.3412],
          [0.3412, 0.3412, 0.3412,  ..., 0.3412, 0.3412, 0.3412],
          [0.3412, 0.3412, 0.3412,  ..., 0.3412, 0.3412, 0.3412],
          ...,
          [0.9255, 0.9255, 0.9255,  ..., 0.9255, 0.9255, 0.9255],
          [0.9255, 0.9255, 0.9255,  ..., 0.9255, 0.9255, 0.9255],
          [0.9255, 0.9255, 0.9255,  ..., 0.9255, 0.9255, 0.9255]],

         [[0.2039, 0.2039, 0.2039,  ..., 0.3412, 0.3412, 0.3412],
          [0.3412, 0.3412, 0.3412,  ..., 0.3412, 0.3412, 0.3412],
          [0.3412, 0.3412, 0.3412,  ..., 0.3412, 0.3412, 0.3412],
          ...,
          [0.9255, 0.9255, 0.9255,  ..., 0.9255, 0.9255, 0.9255],
          [0.9255, 0.9255, 0.9255,  ..., 0.9255, 0.9255, 0.9255],
          [0.9255, 0.9255, 0.9255,  ..., 0.9255, 0.9255, 0.9255]],

         [[0.2039, 0.2039, 0.2039,  ..., 0.3412, 0.3412, 0.3412],
          [0.3412, 0.3412, 0.3412,  ..., 0.3412, 0.3412, 0.3412],
          [0.3412, 0.3412, 0.3412,  ..., 0

In [263]:
batch = torch.from_numpy (dqn._batch(25))

ValueError: only one element tensors can be converted to Python scalars

In [266]:
batch[0]

(tensor([[[[0.2039, 0.2039, 0.2039,  ..., 0.3412, 0.3412, 0.3412],
           [0.3412, 0.3412, 0.3412,  ..., 0.3412, 0.3412, 0.3412],
           [0.3412, 0.3412, 0.3412,  ..., 0.3412, 0.3412, 0.3412],
           ...,
           [0.9255, 0.9255, 0.9255,  ..., 0.9255, 0.9255, 0.9255],
           [0.9255, 0.9255, 0.9255,  ..., 0.9255, 0.9255, 0.9255],
           [0.9255, 0.9255, 0.9255,  ..., 0.9255, 0.9255, 0.9255]],
 
          [[0.2039, 0.2039, 0.2039,  ..., 0.3412, 0.3412, 0.3412],
           [0.3412, 0.3412, 0.3412,  ..., 0.3412, 0.3412, 0.3412],
           [0.3412, 0.3412, 0.3412,  ..., 0.3412, 0.3412, 0.3412],
           ...,
           [0.9255, 0.9255, 0.9255,  ..., 0.9255, 0.9255, 0.9255],
           [0.9255, 0.9255, 0.9255,  ..., 0.9255, 0.9255, 0.9255],
           [0.9255, 0.9255, 0.9255,  ..., 0.9255, 0.9255, 0.9255]],
 
          [[0.2039, 0.2039, 0.2039,  ..., 0.3412, 0.3412, 0.3412],
           [0.3412, 0.3412, 0.3412,  ..., 0.3412, 0.3412, 0.3412],
           [0.3412, 0.34

In [241]:
a =np.array([[1, 4, 5], [4, 6, 8], [8, 3, 10]])

In [243]:
a[:,1]

array([4, 6, 3])

In [285]:
loss_fn = torch.nn.MSELoss(reduction='sum')

In [297]:
loss = loss_fn(y_pred, y_pred)

In [298]:
loss.backward()

RuntimeError: Trying to backward through the graph a second time, but the saved intermediate results have already been freed. Specify retain_graph=True when calling .backward() or autograd.grad() the first time.

In [211]:
env.close()

In [200]:
env = gym.make('CartPole-v0').unwrapped

In [201]:
state = env.reset()

In [202]:
screen = env.render(mode='rgb_array')

In [212]:
env = CartPole_Pixel(gym.make('CartPole-v0'))

In [213]:
state = env.reset()

In [196]:
from threading import Event, Thread


class RenderThread(Thread):
    """
    Worker thread that renders an environment to an RGB array on request.

    Original Code:
        https://github.com/tqjxlm/Simple-DQN-Pytorch/blob/master/Pytorch-DQN-CartPole-Raw-Pixels.ipynb
    Data:
        - Observation: 3 x 400 x 600
    Usage:
        1. call env.step() or env.reset() to update env state
        2. call begin_render() to schedule a rendering task (non-blocking)
        3. call get_screen() to get the latest scheduled result (block main thread if rendering not done)
    Sample Code:
    ```python
        # A simple test
        env = gym.make('CartPole-v0').unwrapped
        renderer = RenderThread(env)
        renderer.start()
        env.reset()
        renderer.begin_render()
        for i in range(100):
            screen = renderer.get_screen() # Render the screen
            env.step(env.action_space.sample()) # Select and perform an action
            renderer.begin_render()
            print(screen)
            print(screen.shape)
        renderer.stop()
        renderer.join()
        env.close()
    ```
    """

    def __init__(self, env):
        super(RenderThread, self).__init__(target=self.render)
        self._stop_event = Event()    # set once the thread should exit
        self._state_event = Event()   # set when a new render is requested
        self._render_event = Event()  # set when a rendered frame is ready
        self.env = env
        # Latest rendered frame; None until the first render has completed.
        self.screen = None

    def stop(self):
        """
        Stops the threads
        :return:
        """
        self._stop_event.set()
        # Wake the worker in case it is waiting for a render request, so that
        # it can observe the stop flag and exit.
        self._state_event.set()

    def stopped(self):
        """
        Check if the thread has been stopped
        :return:
        """
        return self._stop_event.is_set()

    def begin_render(self):
        """
        Start rendering the screen
        :return:
        """
        self._state_event.set()

    def get_screen(self):
        """
        get and output the screen image (blocks until a requested render is done)
        :return: the frame returned by env.render(mode='rgb_array')
        """
        self._render_event.wait()
        self._render_event.clear()
        return self.screen

    def render(self):
        """Thread body: render one frame per begin_render() until stopped."""
        while not self.stopped():
            self._state_event.wait()
            self._state_event.clear()
            # stop() also sets the state event; do not render one more frame
            # when the wake-up was a shutdown request.
            if self.stopped():
                break
            self.screen = self.env.render(mode='rgb_array')
            self._render_event.set()

In [197]:

import numpy as np
import os, math

os.environ.setdefault('PATH', '')
from collections import deque
import gym
from gym import spaces

# for those who installed ROS on local env
import sys

try:
    sys.path.remove('/opt/ros/kinetic/lib/python2.7/dist-packages')
except ValueError:
    # list.remove raises ValueError when the entry is absent (no ROS install).
    pass

import cv2

cv2.ocl.setUseOpenCL(False)

"""
Wrapper for Cartpole
This is to change the reward at the terminal state because originally it is set as 1.0
check here: https://github.com/openai/gym/blob/master/gym/envs/classic_control/cartpole.py
"""


class CartPole_Pixel(gym.Wrapper):
    """
    Gym wrapper that replaces CartPole observations by rendered pixels.

    Each rendered frame is converted to grayscale, resized to
    ``width`` x ``height`` (400 x 400) and given a trailing channel axis.
    The reward of a terminal step is overwritten with -1.0.
    """

    def __init__(self, env):
        self.width = self.height = 400

        super().__init__(env)
        self.env = env.unwrapped
        # self.env.seed(123)  # fix the randomness for reproducibility purpose

        # Start a background thread that produces the raw images.
        #from tf_rl.env.cartpole_pixel import RenderThread
        self.renderer = RenderThread(env)
        self.renderer.start()

    def _pre_process(self, frame):
        """Grayscale, resize and append a channel axis to a rendered frame."""
        grey = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        resized = cv2.resize(grey, (self.width, self.height), interpolation=cv2.INTER_AREA)
        return np.expand_dims(resized, -1)

    def step(self, ac):
        """Step the underlying env and return the rendered frame as observation."""
        _, reward, done, info = self.env.step(ac)
        self.renderer.begin_render()  # schedule a render of the new state
        observation = self._pre_process(self.renderer.get_screen())
        # A terminal step is penalised with -1.0.
        return observation, (-1.0 if done else reward), done, info

    def reset(self, **kwargs):
        """Reset the underlying env and return the first rendered frame."""
        self.env.reset()
        self.renderer.begin_render()  # schedule a render of the initial state
        screen = self.renderer.get_screen()
        return self._pre_process(screen)

    def close(self):
        """Stop and join the render thread, then close the underlying env."""
        self.renderer.stop()
        self.renderer.join()
        if not self.env:
            return None
        return self.env.close()

## Debugging

### Define Neural network

In [171]:
action_size = 4
dqn_model = nn.Sequential(
    nn.Conv2d(4, 32, 8, 4),
    nn.ReLU(),
    nn.Conv2d(32, 64, 4, 2),
    nn.ReLU(),
    nn.Conv2d(64, 64, 3, 1),
    nn.ReLU(),
    # Flatten everything except the batch dimension. nn.Flatten(0, 1) merged
    # the batch and channel dimensions instead, leaving a trailing size of 7
    # that nn.Linear(3136, ...) cannot consume.
    nn.Flatten(),
    nn.Linear(3136, 1024),
    nn.ReLU(),
    nn.Linear(1024, action_size)
)


# do you need to flatten?
# Yes: for 4x84x84 inputs the conv stack yields (N, 64, 7, 7), and nn.Linear
# acts on the last dimension only, so it must be reshaped to (N, 3136).

In [172]:
dqn = dqn_model()

TypeError: forward() missing 1 required positional argument: 'input'

In [121]:
action_size = 4
dqn_model = nn.Sequential(
    nn.Conv2d(4, 32, 8, 4),
    nn.ReLU(),
    nn.Conv2d(32, 64, 4, 2),
    nn.ReLU(),
    nn.Conv2d(64, 64, 3, 1),
    nn.ReLU(),
    # Flatten everything except the batch dimension. nn.Flatten(0, 1) merged
    # the batch and channel dimensions instead, leaving a trailing size of 7
    # that nn.Linear(3136, ...) cannot consume.
    nn.Flatten(),
    nn.Linear(3136, 1024),
    nn.ReLU(),
    nn.Linear(1024, action_size)
)


# do you need to flatten?
# Yes: for 4x84x84 inputs the conv stack yields (N, 64, 7, 7), and nn.Linear
# acts on the last dimension only, so it must be reshaped to (N, 3136).

In [11]:
state.torch()[5]

IndexError: index 5 is out of bounds for dimension 0 with size 4

In [12]:
# Positional arguments for each nn.Conv2d layer created by ConvRelu:
# (in_channels, out_channels, kernel_size, stride).
_default_conv_sizes = [(4, 32, 8, 4),
                       (32, 64, 4, 2),
                       (64, 64, 3, 1)]


# Input width of the first linear layer that Q builds after the conv stack.
_default_feature_size = 3136
hidden_size = 1024

In [13]:
x

NameError: name 'x' is not defined

In [14]:
class NeuralNetwork(nn.Module):
    """Fully-connected classifier: flatten, then 784 -> 512 -> 512 -> 10.

    Every linear layer, including the last one, is followed by a ReLU.
    """

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()

        widths = (28 * 28, 512, 512, 10)
        stack = []
        for n_in, n_out in zip(widths, widths[1:]):
            stack.append(nn.Linear(n_in, n_out))
            stack.append(nn.ReLU())
        self.linear_relu_stack = nn.Sequential(*stack)

    def forward(self, x):
        """Flatten ``x`` per sample and run it through the dense stack."""
        return self.linear_relu_stack(self.flatten(x))

In [15]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Using {} device'.format(device))

NameError: name 'torch' is not defined

In [16]:
model = NeuralNetwork().to(device)
print(model)

NameError: name 'device' is not defined

In [17]:
X = torch.rand(1, 28, 28, device=device)
logits = model(X)
pred_probab = nn.Softmax(dim=1)(logits)
y_pred = pred_probab.argmax(1)
print(f"Predicted class: {y_pred}")

NameError: name 'torch' is not defined

In [18]:
logits

NameError: name 'logits' is not defined

In [19]:
pred_probab

NameError: name 'pred_probab' is not defined

In [20]:
from collections import OrderedDict

from torch import nn
def flatten(x):
    """Return a view of ``x`` with all dimensions after the first merged."""
    leading = x.size(0)
    return x.view(leading, -1)


class FlattenModule(nn.Module):
    """Module wrapper around ``flatten`` so it can be placed in a Sequential."""

    def forward(self, input):
        flat = flatten(input)
        return flat


class ConvRelu(nn.Sequential):
    """Sequential stack of Conv2d + ReLU pairs followed by a flatten module.

    Args:
        conv_sizes: iterable of argument tuples, each unpacked into
            ``nn.Conv2d``. Sub-modules are named ``conv{i}`` / ``relu{i}``
            and the final module is named ``flatten``.
    """

    def __init__(self, conv_sizes):
        stages = OrderedDict()
        for idx, spec in enumerate(conv_sizes):
            stages.update({
                f'conv{idx}': nn.Conv2d(*spec),
                f'relu{idx}': nn.ReLU(),
            })
        stages['flatten'] = FlattenModule()

        super().__init__(stages)


class LinearRelu(nn.Sequential):
    """Sequential stack of Linear + ReLU pairs.

    Args:
        layer_sizes: sequence of widths; consecutive entries give the input
            and output size of each linear layer. Sub-modules are named
            ``lin{i}`` / ``relu{i}``.
    """

    def __init__(self, layer_sizes):
        stack = OrderedDict()

        # Pair every width with its successor: (in, out) per linear layer.
        fan_ins = [layer_sizes[0], *layer_sizes[1:-1]]
        fan_outs = layer_sizes[1:]
        for idx, (n_in, n_out) in enumerate(zip(fan_ins, fan_outs)):
            stack[f'lin{idx}'] = nn.Linear(n_in, n_out)
            stack[f'relu{idx}'] = nn.ReLU()

        super().__init__(stack)
        
        
def init_weights_xavier(model):
    """Apply Xavier-uniform init to every parameter whose name contains '.weight'."""
    weight_tensors = (
        tensor
        for name, tensor in model.named_parameters()
        if '.weight' in name
    )
    for tensor in weight_tensors:
        nn.init.xavier_uniform_(tensor)
            

class CloneMixin:
    """Mixin adding ``clone()`` to modules that record their constructor
    arguments in ``self.__args__``."""

    def clone(self):
        """Rebuild the object from ``__args__`` and copy its state dict."""
        # noinspection PyArgumentList
        duplicate = type(self)(*self.__args__)
        duplicate.load_state_dict(self.state_dict())
        return duplicate

class Q(nn.Module, CloneMixin):
    """Q-network split into a conv featuriser, a dense embedding and a linear head.

    Args:
        action_size: number of outputs of the final linear layer.
        hidden_size: width of the embedding produced by ``post_featuriser``.
        bias_out: whether the final linear layer has a bias term.
    """

    def __init__(self, action_size, hidden_size, bias_out):
        super().__init__()
        # Constructor arguments, consumed by CloneMixin.clone().
        self.__args__ = (action_size, hidden_size, bias_out)

        self.featuriser = ConvRelu(_default_conv_sizes)
        self.post_featuriser = LinearRelu([_default_feature_size, hidden_size])
        self.linear = nn.Linear(hidden_size, action_size, bias=bias_out)

        init_weights_xavier(self)

    def forward(self, x):
        """Return the action values for a batch of states ``x``."""
        hidden = self.featuriser(x)
        embedding = self.post_featuriser(hidden)
        return self.linear(embedding)

    def q_fn(self, *, state=None, state_embedding=None):
        """Return action values from a state or from a precomputed embedding.

        If ``state`` is given it takes precedence and the embedding is
        recomputed from it.
        """
        if state is not None:
            state_embedding = self.global_embedding(state=state)
        return self.linear(state_embedding)

    def global_embedding(self, *, state=None, state_features=None):
        """Return the dense embedding from a state or from conv features.

        If ``state`` is given it takes precedence and the conv features are
        recomputed from it.
        """
        if state is not None:
            state_features = self.featuriser(state)
        return self.post_featuriser(state_features)

    def get_global_embedding_dim(self):
        """Return the width of the embedding fed to the final linear layer."""
        # post_featuriser is an nn.Sequential and has no `out_features`
        # attribute; the embedding width equals the head's input width.
        return self.linear.in_features

In [53]:
env.action_space

Discrete(4)

In [55]:
env.observation_space.shape[0]

84

In [29]:
q_fn = Q(6,1024,False)

In [51]:
q_fn

Q(
  (featuriser): ConvRelu(
    (conv0): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))
    (relu0): ReLU()
    (conv1): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
    (relu1): ReLU()
    (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (relu2): ReLU()
    (flatten): FlattenModule()
  )
  (post_featuriser): LinearRelu(
    (lin0): Linear(in_features=3136, out_features=1024, bias=True)
    (relu0): ReLU()
  )
  (linear): Linear(in_features=1024, out_features=6, bias=False)
)

In [36]:
state, ep_reward, terminal = env.reset(), 0, False

In [38]:
state.torch()

tensor([[[0.2039, 0.2039, 0.2039,  ..., 0.3412, 0.3412, 0.3412],
         [0.3412, 0.3412, 0.3412,  ..., 0.3412, 0.3412, 0.3412],
         [0.3412, 0.3412, 0.3412,  ..., 0.3412, 0.3412, 0.3412],
         ...,
         [0.9255, 0.9255, 0.9255,  ..., 0.9255, 0.9255, 0.9255],
         [0.9255, 0.9255, 0.9255,  ..., 0.9255, 0.9255, 0.9255],
         [0.9255, 0.9255, 0.9255,  ..., 0.9255, 0.9255, 0.9255]],

        [[0.2039, 0.2039, 0.2039,  ..., 0.3412, 0.3412, 0.3412],
         [0.3412, 0.3412, 0.3412,  ..., 0.3412, 0.3412, 0.3412],
         [0.3412, 0.3412, 0.3412,  ..., 0.3412, 0.3412, 0.3412],
         ...,
         [0.9255, 0.9255, 0.9255,  ..., 0.9255, 0.9255, 0.9255],
         [0.9255, 0.9255, 0.9255,  ..., 0.9255, 0.9255, 0.9255],
         [0.9255, 0.9255, 0.9255,  ..., 0.9255, 0.9255, 0.9255]],

        [[0.2039, 0.2039, 0.2039,  ..., 0.3412, 0.3412, 0.3412],
         [0.3412, 0.3412, 0.3412,  ..., 0.3412, 0.3412, 0.3412],
         [0.3412, 0.3412, 0.3412,  ..., 0.3412, 0.3412, 0.

In [40]:
84*84

7056

In [41]:
env = make_atari('PongNoFrameskip-v4')

In [42]:
env.action_space.n

6

In [43]:
state = env.reset()

In [99]:
state.torch()

AttributeError: 'numpy.ndarray' object has no attribute 'torch'

In [100]:
model = nn.Sequential(
          nn.Conv2d(4,20,5),
          nn.ReLU(),
          nn.Conv2d(20,64,5),
          nn.ReLU()
        )

In [101]:
model(state.torch())

AttributeError: 'numpy.ndarray' object has no attribute 'torch'

In [64]:
state.torch().shape

torch.Size([4, 84, 84])

In [114]:
class Net(nn.Module):
    """Small CNN classifier for 1x28x28 inputs producing 10 log-probabilities."""

    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        # Channel-wise dropout for the 4-D conv feature maps.
        self.dropout1 = nn.Dropout2d(0.25)
        # Element-wise dropout for the 2-D (batch, features) activations after
        # fc1; Dropout2d is deprecated for 2-D inputs.
        self.dropout2 = nn.Dropout(0.5)
        # 9216 = 64 channels * 12 * 12 after two 3x3 convs and a 2x2 max-pool
        # on a 28x28 input.
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    # x represents our data
    def forward(self, x):
        """Return log-softmax class scores of shape (batch, 10)."""
        # Two conv stages, each followed by a rectified-linear activation
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))

        # Run max pooling over x
        x = F.max_pool2d(x, 2)
        # Pass data through dropout1
        x = self.dropout1(x)
        # Flatten x with start_dim=1
        x = torch.flatten(x, 1)
        # Pass data through fc1
        x = F.relu(self.fc1(x))
        x = self.dropout2(x)
        x = self.fc2(x)

        # Apply log-softmax over the class dimension
        output = F.log_softmax(x, dim=1)
        return output

In [None]:
model = nn.Sequential(
          nn.Conv2d(4,20,5),
          nn.ReLU(),
          nn.Conv2d(20,64,5),
          nn.ReLU()
        )

In [77]:

import torchvision.transforms as transforms
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


Exception ignored in: <function tqdm.__del__ at 0x7f9363360040>
Traceback (most recent call last):
  File "/Users/mbbssnw3/anaconda3/envs/sc/lib/python3.8/site-packages/tqdm/std.py", line 1145, in __del__
    self.close()
  File "/Users/mbbssnw3/anaconda3/envs/sc/lib/python3.8/site-packages/tqdm/notebook.py", line 275, in close
    self.disp(bar_style='danger')
AttributeError: 'tqdm' object has no attribute 'disp'
Exception ignored in: <function tqdm.__del__ at 0x7f9363360040>
Traceback (most recent call last):
  File "/Users/mbbssnw3/anaconda3/envs/sc/lib/python3.8/site-packages/tqdm/std.py", line 1145, in __del__
    self.close()
  File "/Users/mbbssnw3/anaconda3/envs/sc/lib/python3.8/site-packages/tqdm/notebook.py", line 275, in close
    self.disp(bar_style='danger')
AttributeError: 'tqdm' object has no attribute 'disp'


ImportError: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

In [76]:
from ipywidgets import IProgress

ImportError: cannot import name 'IProgress' from 'ipywidgets' (/Users/mbbssnw3/anaconda3/envs/sc/lib/python3.8/site-packages/ipywidgets/__init__.py)

In [78]:
class DeepQNetwork(nn.Module):
    """Two-conv-layer Q-network with a 256-unit hidden dense layer.

    Args:
        num_frames: number of input channels (stacked frames).
        num_actions: number of outputs of the final linear layer.
    """

    def __init__(self, num_frames, num_actions):
        super().__init__()
        self.num_frames = num_frames
        self.num_actions = num_actions

        # Convolutional feature extractor
        self.conv1 = nn.Conv2d(in_channels=num_frames, out_channels=16,
                               kernel_size=8, stride=4, padding=2)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32,
                               kernel_size=4, stride=2, padding=1)

        # Dense head; fc1 expects 3200 flattened features per sample
        self.fc1 = nn.Linear(in_features=3200, out_features=256)
        self.fc2 = nn.Linear(in_features=256, out_features=num_actions)

        # Shared activation
        self.relu = nn.ReLU()

    def flatten(self, x):
        """Merge all dimensions after the first into one."""
        return x.view(x.size(0), -1)

    def forward(self, x):
        """Return one value per action for a batched input ``x``."""
        features = self.relu(self.conv1(x))
        features = self.relu(self.conv2(features))
        hidden = self.relu(self.fc1(self.flatten(features)))
        return self.fc2(hidden)

In [81]:
state.torch()

torch.Size([4, 84, 84])

In [83]:
DQN = DeepQNetwork(4,6)

In [85]:
DQN(state.torch())

RuntimeError: Expected 4-dimensional input for 4-dimensional weight [16, 4, 8, 8], but got 3-dimensional input of size [4, 84, 84] instead

In [102]:
env = gym.make('BreakoutDeterministic-v4')

In [111]:
from skimage.color import rgb2grey
from skimage.transform import rescale

def process(state):
    """Convert a raw RGB frame into a (1, 1, H, W) float tensor.

    Rows 35..194 of the frame are kept, converted to grayscale and rescaled
    by a factor of 0.5 before batch and channel axes are prepended.
    """
    # `rgb2grey` is a deprecated alias in scikit-image; `rgb2gray` is the
    # supported name for the same conversion.
    from skimage.color import rgb2gray

    state = rgb2gray(state[35:195, :, :])
    state = rescale(state, scale=0.5)
    state = state[np.newaxis, np.newaxis, :, :]
    return torch.tensor(state, dtype=torch.float)

Exception ignored in: <function tqdm.__del__ at 0x7f9363360040>
Traceback (most recent call last):
  File "/Users/mbbssnw3/anaconda3/envs/sc/lib/python3.8/site-packages/tqdm/std.py", line 1145, in __del__
    self.close()
  File "/Users/mbbssnw3/anaconda3/envs/sc/lib/python3.8/site-packages/tqdm/notebook.py", line 275, in close
    self.disp(bar_style='danger')
AttributeError: 'tqdm' object has no attribute 'disp'
Exception ignored in: <function tqdm.__del__ at 0x7f9363360040>
Traceback (most recent call last):
  File "/Users/mbbssnw3/anaconda3/envs/sc/lib/python3.8/site-packages/tqdm/std.py", line 1145, in __del__
    self.close()
  File "/Users/mbbssnw3/anaconda3/envs/sc/lib/python3.8/site-packages/tqdm/notebook.py", line 275, in close
    self.disp(bar_style='danger')
AttributeError: 'tqdm' object has no attribute 'disp'
Exception ignored in: <function tqdm.__del__ at 0x7f9363360040>
Traceback (most recent call last):
  File "/Users/mbbssnw3/anaconda3/envs/sc/lib/python3.8/site-pack

In [112]:
state0 = env.reset()
state = process(state0)

  state = rgb2grey(state[35:195, :, :])


In [113]:
state = torch.cat([state, state0], 1)

TypeError: expected Tensor as element 1 in argument 0, but got numpy.ndarray

In [91]:
num_frames = 4
while state.size()[1] < num_frames:
    action = 1 # Fire

    new_frame, reward, done, info = env.step(action)
    new_frame = self.process(new_frame)

    state = torch.cat([state, new_frame], 1)

TypeError: 'int' object is not callable

In [153]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """Small CNN for 4-channel inputs producing 10 log-probabilities.

    Args:
        input_size: ``(height, width)`` of the input images. The default
            ``(28, 28)`` reproduces the original 9216-feature dense layer;
            pass ``(84, 84)`` for stacked Atari frames.
    """

    def __init__(self, input_size=(28, 28)):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(4, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        # Channel-wise dropout for the 4-D conv feature maps.
        self.dropout1 = nn.Dropout2d(0.25)
        # Element-wise dropout for the 2-D (batch, features) activations after
        # fc1; Dropout2d is deprecated for 2-D inputs.
        self.dropout2 = nn.Dropout(0.5)

        # Each 3x3 stride-1 conv trims 2 pixels per side length and the 2x2
        # max-pool halves it, so fc1 must match the resulting feature count
        # instead of a hard-coded 9216 (valid for 28x28 only).
        height, width = input_size
        pooled_h = (height - 4) // 2
        pooled_w = (width - 4) // 2
        self.fc1 = nn.Linear(64 * pooled_h * pooled_w, 128)
        self.fc2 = nn.Linear(128, 10)

    # x represents our data
    def forward(self, x):
        """Return log-softmax scores of shape (batch, 10)."""
        # Two conv stages, each followed by a rectified-linear activation
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))

        # Run max pooling over x
        x = F.max_pool2d(x, 2)
        # Pass data through dropout1
        x = self.dropout1(x)
        # Flatten x with start_dim=1
        x = torch.flatten(x, 1)
        # Pass data through fc1
        x = F.relu(self.fc1(x))
        x = self.dropout2(x)
        x = self.fc2(x)

        # Apply log-softmax over the class dimension
        output = F.log_softmax(x, dim=1)
        return output

In [154]:
state.shape

AttributeError: 'LazyFrames' object has no attribute 'shape'

In [155]:
my_nn = Net()

In [156]:
random_data = torch.rand((1, 1, 28, 28))
my_nn(random_data)

RuntimeError: Given groups=1, weight of size [32, 4, 3, 3], expected input[1, 1, 28, 28] to have 4 channels, but got 1 channels instead

In [157]:
env = make_atari('BreakoutNoFrameskip-v4')

state = env.reset()

statey = state.torch()

statey.unsqueeze(0).shape

In [161]:
my_nn(statey.unsqueeze(0))

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x102400 and 9216x128)

In [162]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """Single 3x3 conv layer (4 -> 32 channels) followed by ReLU and a
    log-softmax over the channel dimension."""

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(4, 32, 3, 1)

    # x represents our data
    def forward(self, x):
        """Return log-softmax (dim=1) of the rectified conv features."""
        activated = F.relu(self.conv1(x))
        return F.log_softmax(activated, dim=1)

In [163]:
env = make_atari('BreakoutNoFrameskip-v4')

state = env.reset()

statey = state.torch()

statey.unsqueeze(0).shape

torch.Size([1, 4, 84, 84])

In [164]:
my_nn = Net()

In [168]:
x =my_nn(statey.unsqueeze(0))

In [170]:
x.shape

torch.Size([1, 32, 82, 82])