### pytorch basics

In [0]:
import torch.nn as nn
import torch
# Display the installed PyTorch build so the recorded outputs below can be
# matched to the version that produced them.
torch.__version__

'1.5.0+cu101'

In [0]:
# Use the GPU when one is present, otherwise fall back to the CPU so this
# cell also runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Feed-forward (fully connected) layer with 2 inputs and 5 outputs
l = nn.Linear(2, 5).to(device)
# One float32 sample with 2 features, created directly on the layer's device
v = torch.tensor([1.0, 2.0], device=device)
l(v)

tensor([-0.5842, -0.4360, -0.0710, -0.3440,  0.1779], device='cuda:0',
       grad_fn=<AddBackward0>)

In [0]:
# Create a sequential network: Linear -> ReLU -> Linear -> Dropout -> Softmax
# (two linear layers, five modules in total).
# Softmax(dim=1) normalises along dimension 1, i.e. across the 2 outputs of
# every row of a (batch, features) input.
# Use the GPU when one is present, otherwise the CPU, so the cell runs anywhere.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
s = nn.Sequential(
      nn.Linear(2, 5),
      nn.ReLU(),
      nn.Linear(5, 2),
      nn.Dropout(0.3),
      nn.Softmax(dim=1)
).to(device)
s

Sequential(
  (0): Linear(in_features=2, out_features=5, bias=True)
  (1): ReLU()
  (2): Linear(in_features=5, out_features=2, bias=True)
  (3): Dropout(p=0.3, inplace=False)
  (4): Softmax(dim=1)
)

In [0]:
# Batch of two float32 samples, placed on whatever device the network `s`
# lives on (a hard-coded .cuda() fails on CPU-only machines).
s(torch.tensor([[1.0, 2.0], [2.0, 3.0]], device=next(s.parameters()).device))

tensor([[0.1891, 0.8109],
        [0.1830, 0.8170]], device='cuda:0', grad_fn=<SoftmaxBackward>)

In [0]:
# Custom layers
class CustomModule(nn.Module):
    """Small fully connected classifier wrapped in a single ``nn.Sequential``.

    Layer stack: Linear(num_input, 5) -> ReLU -> Linear(5, 20) -> ReLU ->
    Linear(20, num_classes) -> Dropout(dropout_prob) -> Softmax(dim=1).

    Args:
        num_input: number of input features of the first linear layer.
        num_classes: number of outputs of the last linear layer.
        dropout_prob: probability passed to ``nn.Dropout`` (default 0.3).
    """

    def __init__(self, num_input, num_classes, dropout_prob=0.3):
        super().__init__()
        hidden_small, hidden_large = 5, 20
        # Layers are instantiated in the order they are applied.
        layers = [
            nn.Linear(num_input, hidden_small),
            nn.ReLU(),
            nn.Linear(hidden_small, hidden_large),
            nn.ReLU(),
            nn.Linear(hidden_large, num_classes),
            nn.Dropout(p=dropout_prob),
            nn.Softmax(dim=1),
        ]
        self.pipe = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the whole pipeline and return its output."""
        probabilities = self.pipe(x)
        return probabilities

# Use the GPU when one is present, otherwise the CPU, so the cell runs anywhere.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# 2 input features, 3 output classes
net = CustomModule(2, 3).to(device)
# Batch containing a single float32 sample, created on the model's device
v = torch.tensor([[2.0, 3.0]], device=device)
out = net(v)
print(net)
print(out)

CustomModule(
  (pipe): Sequential(
    (0): Linear(in_features=2, out_features=5, bias=True)
    (1): ReLU()
    (2): Linear(in_features=5, out_features=20, bias=True)
    (3): ReLU()
    (4): Linear(in_features=20, out_features=3, bias=True)
    (5): Dropout(p=0.3, inplace=False)
    (6): Softmax(dim=1)
  )
)
tensor([[0.3441, 0.3813, 0.2746]], device='cuda:0', grad_fn=<SoftmaxBackward>)


**Normal Training loop:**
  
```python
for x, y in iter(batches):
    x = torch.tensor(x)
    y = torch.tensor(y)
    out = network(x)
    loss = loss_function(out, y)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
```



### Deep Q-learning

In [0]:
import gym
import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
import collections
import numpy as np
import time
import torch.optim as optim

**DQN training**

**Algorithm Source : https://web.stanford.edu/class/psych209/Readings/MnihEtAlHassibis15NatureControlDeepRL.pdf**

1.   Initialize the parameters $Q(s, a)$ and $\hat{Q}(s, a)$ with random weights, $\epsilon$=1.0 and empty replay buffer.
2.   With probability $\epsilon$ select a random action $a$, otherwise $a = \arg\max_a Q(s, a)$
3.   Execute action a in the environment and observe reward, state $s'$
4.   Store transition $(s, a, r, s')$ in replay buffer.
5.   Sample a random mini-batch of transitions from the replay buffer.
6.   For every transition in the mini-batch, calculate the target: $y = r$ if the episode ended at this step, otherwise $y = r + \gamma \max_{a'} \hat{Q}(s', a')$
7.   Loss = $(Q(s, a) - y)^2$
8.   Update the $Q(s, a)$ using SGD algorithm, minimizing loss. 
9.   Every N steps, copy weights from $Q$ to $\hat{Q}$
10.  Repeat from step 2. until converged.


#### Pre Process frames

In [0]:
# TODO: convert the frames to 84x84, as in the original paper.
# Planned implementation: torchvision.transforms, see the link below
# https://pytorch.org/docs/stable/torchvision/transforms.html


#### Network

In [0]:
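# Convolutional network
# The size of the flattened conv output depends on the input shape, so the
# helper method get_conv_out measures it by passing a dummy zero tensor through the conv layers.
# This dummy call happens once, at model creation.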

class DQN(nn.Module):
    """Convolutional Q-network: three conv layers followed by two linear layers.

    Args:
        input_shape: shape of one input sample without the batch dimension;
            ``input_shape[0]`` is used as the number of input channels.
        n_actions: number of outputs of the final linear layer.
    """

    def __init__(self, input_shape, n_actions):
        super().__init__()
        in_channels = input_shape[0]
        conv_layers = [
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        ]
        self.conv = nn.Sequential(*conv_layers)

        # The first linear layer needs the flattened conv output size, which
        # is measured once here with a dummy forward pass.
        flat_features = self.get_conv_out(input_shape)
        head_layers = [
            nn.Linear(flat_features, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions),
        ]
        self.fc = nn.Sequential(*head_layers)

    def get_conv_out(self, shape):
        """Return how many elements the conv stack outputs for one zero sample of ``shape``."""
        probe = torch.zeros(1, *shape)
        return int(self.conv(probe).numel())

    def forward(self, x):
        """Apply the conv stack, flatten per sample, then apply the linear head."""
        batch = x.size()[0]
        features = self.conv(x)
        # view() reshapes to (batch, -1) without copying the underlying data
        flat = features.view(batch, -1)
        return self.fc(flat)

#### Training 

In [0]:
# Hyperparameters

gamma = 0.99                              # Discount factor used in the Bellman target
batch_size = 32                           # Number of transitions sampled from the replay buffer per update
replay_size = 10_000                      # Max capacity of the replay buffer
replay_start_size = 10_000                # Transitions the buffer must hold before training updates start
learning_rate = 1e-4                      # Learning rate for the optimizer
sync_target = 1_000                       # Frames between copies of the main network's weights into the target network
epsilon_start = 1.0                       # Initial exploration rate: every action is random
epsilon_final = 0.01                      # Lowest exploration rate reached by the decay
epsilon_decay_last_frame = 300_000        # epsilon = max(epsilon_final, epsilon_start - frame / epsilon_decay_last_frame): linear decay that bottoms out at epsilon_final (0.01) by this frame

In [0]:
# Experience Buffer
# One stored transition: (state, action, reward, done, new_state)
Experience = collections.namedtuple(
    'Experience',
    field_names=['state', 'action', 'reward', 'done', 'new_state'],
)


class ExperienceBuffer:
    """Fixed-capacity store of experiences backed by ``collections.deque``."""

    def __init__(self, capacity):
        # deque(maxlen=...) drops the oldest entry once `capacity` is reached
        self.buffer = collections.deque(maxlen=capacity)

    def __len__(self):
        """Number of experiences currently stored."""
        stored = len(self.buffer)
        return stored

    def append(self, experience):
        """Add one experience at the newest end of the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct experiences uniformly at random.

        Returns a 5-tuple of numpy arrays
        ``(states, actions, rewards, dones, next_states)``, one array per
        field, aligned by position.
        """
        chosen = np.random.choice(len(self.buffer), batch_size, replace=False)
        picked = [self.buffer[position] for position in chosen]
        # Transpose the list of 5-field records into 5 per-field tuples
        states, actions, rewards, dones, next_states = zip(*picked)
        fields = (states, actions, rewards, dones, next_states)
        return tuple(np.array(field) for field in fields)

In [0]:
# Agent class
class Agent:
    """Plays an environment step by step and records every transition.

    Args:
        env: environment object. The code calls ``env.reset()`` (its return
            value is used as the state), ``env.action_space.sample()`` and
            ``env.step(action)``, whose result is unpacked as the 4-tuple
            ``(new_state, reward, is_done, info)``.
        exp_buffer: object with an ``append`` method that receives one
            ``Experience`` per step.
    """

    def __init__(self, env, exp_buffer):
        self.env = env
        self.exp_buffer = exp_buffer
        self._reset()

    def _reset(self):
        """Start a new episode: reset the environment and the running reward."""
        self.state = self.env.reset()
        self.total_reward = 0.0

    def step(self, net, epsilon, device=None):
        """Play one epsilon-greedy step and store the transition.

        Args:
            net: Q-network; only called when the greedy branch is taken.
            epsilon: probability of choosing a random action.
            device: device for the state tensor. Defaults to the device of
                ``net``'s first parameter ("cuda" if ``net`` has none).

        Returns:
            The episode's total reward if this step ended the episode,
            otherwise 0.
            NOTE(review): an episode whose total reward is exactly 0 cannot
            be told apart from "episode not finished" through this value.
        """
        done_reward = 0
        # Epsilon-greedy action selection
        if np.random.random() < epsilon:
            action = self.env.action_space.sample()
        else:
            if device is None:
                first_param = next(net.parameters(), None)
                device = first_param.device if first_param is not None else torch.device("cuda")
            state_a = np.array([self.state])
            state_v = torch.tensor(state_a).to(device)
            # Acting needs no gradients; skip building the autograd graph
            with torch.no_grad():
                q_val = net(state_v)
            _, act_v = torch.max(q_val, dim=1)
            action = int(act_v.item())

        # Whichever way the action was chosen, execute it and add the
        # resulting transition to the replay buffer.
        new_state, reward, is_done, _ = self.env.step(action)
        self.total_reward += reward
        exp = Experience(self.state, action, reward, is_done, new_state)
        self.exp_buffer.append(exp)
        self.state = new_state
        if is_done:
            done_reward = self.total_reward
            self._reset()
        return done_reward

In [0]:
# Loss Calculation
def calc_loss(batch, net, target, device=None, discount=None):
    """Mean-squared Bellman error of ``net`` on a batch of transitions.

    Args:
        batch: 5-tuple ``(states, actions, rewards, dones, next_states)`` of
            array-likes aligned by position.
        net: network whose Q-values are trained.
        target: network used to evaluate the next states; no gradient flows
            into it.
        device: device for the batch tensors. Defaults to the device of
            ``net``'s first parameter ("cuda" if ``net`` has none).
        discount: discount factor; defaults to the module-level ``gamma``.

    Returns:
        Scalar tensor: MSE between Q(s, a) and ``r + discount * max_a' Q_target(s', a')``,
        where the second term is zeroed for transitions flagged as done.
    """
    if discount is None:
        discount = gamma
    if device is None:
        first_param = next(net.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cuda")

    states, actions, rewards, dones, next_states = batch

    states_v = torch.tensor(np.array(states)).to(device)
    next_states_v = torch.tensor(np.array(next_states)).to(device)
    # gather() needs int64 indices; numpy's default int is platform dependent
    actions_v = torch.tensor(np.asarray(actions), dtype=torch.int64).to(device)
    # numpy turns Python floats into float64; cast so the target matches the
    # float32 network output inside the MSE
    rewards_v = torch.tensor(np.asarray(rewards), dtype=torch.float32).to(device)
    done_mask = torch.tensor(np.asarray(dones, dtype=np.bool_), dtype=torch.bool).to(device)

    # Q(s, a) for the action actually taken in each transition
    state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)

    # The target network is only evaluated, never trained through this loss
    with torch.no_grad():
        next_state_values = target(next_states_v).max(1)[0]
        # No future value after a terminal transition
        next_state_values[done_mask] = 0.0

    # Bellman approximation
    expected_state_action_values = next_state_values * discount + rewards_v

    return nn.MSELoss()(state_action_values, expected_state_action_values)


In [0]:
# Dummy loop [Change this to fit the actual problem]

"""
Load the env
print observation space
print the action space
"""


# NOTE: `env` must have been created by the "Load the env" step above.

# Use the GPU when one is present, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Number of discrete actions; a Discrete space has shape (), so `.shape`
# cannot be used to size the network's output layer.
n_actions = env.action_space.n

# main network
net = DQN(env.observation_space.shape, n_actions).to(device)
# Target network, started as an exact copy of the main network
target = DQN(env.observation_space.shape, n_actions).to(device)
target.load_state_dict(net.state_dict())
# Replay buffer
buffer = ExperienceBuffer(replay_size)
# agent
agent = Agent(env, buffer)
# epsilon
epsilon = epsilon_start
# optimizer
optimizer = optim.Adam(net.parameters(), lr=learning_rate)
# Total reward of every finished episode
total_rewards = []
# index (frame counter) used for reducing the epsilon and syncing the target
index = 0
# Frame budget that ends the dummy loop [change this to fit the actual problem]
max_frames = 1_000_000

while index < max_frames:
    index += 1
    epsilon = max(epsilon_final, epsilon_start - index / epsilon_decay_last_frame)
    reward = agent.step(net, epsilon)
    # The newest stored transition tells whether this step closed an episode;
    # only then is `reward` the episode's total reward.
    if len(buffer) > 0 and buffer.buffer[-1].done:
        total_rewards.append(reward)
    # Wait until the buffer holds enough transitions before training
    if len(buffer) < replay_start_size:
        continue
    if index % sync_target == 0:
        # Load the weights of the main network on the target network
        target.load_state_dict(net.state_dict())
    optimizer.zero_grad()
    batch = buffer.sample(batch_size)
    loss_t = calc_loss(batch, net, target)
    loss_t.backward()
    optimizer.step()