In [46]:
# Libraries Needed
!pip install wandb -q
!pip install gym
!apt-get install python-opengl -y
!apt install xvfb -y
!pip install pyvirtualdisplay
!pip install piglet
!apt-get install x11-utils

Reading package lists... Done
Building dependency tree       
Reading state information... Done
python-opengl is already the newest version (3.1.0+dfsg-1).
0 upgraded, 0 newly installed, 0 to remove and 25 not upgraded.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
xvfb is already the newest version (2:1.19.6-1ubuntu4.4).
0 upgraded, 0 newly installed, 0 to remove and 25 not upgraded.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
x11-utils is already the newest version (7.7+3build1).
0 upgraded, 0 newly installed, 0 to remove and 25 not upgraded.


### Run Settings

In [0]:
# Log training metrics to Weights & Biases (wandb) when True
log_metrics = True

# Request CUDA; the "Cuda Setup" cell resets this to False when no CUDA device is available
use_cuda = True 

# Set to True when the notebook runs on Google Colab
use_colab = True

### Import Libraries

In [0]:
# Import Libraries
import wandb

import configparser
import gym
from gym import wrappers
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd
import torch.nn.functional as F

from collections import deque

import random

### Google Colab Setup

In [49]:
# Mount Google Drive only when the Colab flag from the run settings is enabled.
if use_colab:
  # Imported here because the google.colab package is only available on Colab runtimes
  from google.colab import drive
  drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


### Variable Setup

In [0]:
# Root folder of the project on the mounted drive (kept with its trailing slash).
project_dir = "/content/gdrive/My Drive/Colab Notebooks/Paper Implementation/Playing-Atari-with-Deep-Reinforcement-Learning/"
# project_dir already ends with "/", so the suffixes must not start with one
# (otherwise the derived paths contain "//").
config_ini = project_dir + 'config.ini'
video_dir = project_dir + 'videos'



### Metric Tracking Setup

In [51]:
  # Wandb Login
if log_metrics:
  # Credentials and project coordinates live in config.ini, not in the notebook.
  config = configparser.ConfigParser()
  # ConfigParser.read() silently skips files it cannot open and returns the list
  # of files it actually parsed, so an empty result means the settings are missing.
  if not config.read(config_ini):
    raise FileNotFoundError("Could not read wandb settings from " + config_ini)

  login = config['WANDB']['login']
  entity = config['WANDB']['entity']
  project = config['WANDB']['project']

  # Pass the API key by keyword: the first positional parameter of wandb.login
  # has been `anonymous`, not the key.
  wandb.login(key=login)
  wandb.init(entity=entity, project=project, anonymous='never')

### Cuda Setup

In [0]:
# CUDA was requested: verify a device is really there, otherwise fall back to CPU.
if use_cuda:
  if not torch.cuda.is_available():
    print ("WARNING: cuda is not available")
    use_cuda = False

### Colab Display Setup

In [0]:
# Start a 1400x900 virtual (non-visible) display via pyvirtualdisplay.
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()
import os
# This code creates a virtual display to draw game images on.
# If you are running locally, just ignore it
# Fallback: when DISPLAY is unset or empty, run the xvfb helper script and point DISPLAY at :1.
# NOTE(review): `../xvfb` is not defined anywhere in this notebook — confirm the script exists
# relative to the working directory before relying on this branch.
if type(os.environ.get("DISPLAY")) is not str or len(os.environ.get("DISPLAY"))==0:
    !bash ../xvfb start
    %env DISPLAY=:1

## Replay Buffer

In [0]:
class ReplayBuffer(object):
  """Fixed-capacity store of (state, action, reward, next_state, done) transitions.

  Backed by a deque with ``maxlen=size``: once full, adding a transition
  discards the oldest one.
  """

  def __init__(self, size):
    """Create an empty buffer that keeps at most ``size`` transitions."""
    self.replayBuffer = deque(maxlen=size)

  def add(self, current_state, action, reward, next_state, done):
    """Append one transition to the buffer."""
    # Store the argument itself; referring to a bare `state` here would pick up
    # whatever global of that name exists (or raise NameError).
    self.replayBuffer.append((current_state, action, reward, next_state, done))

  def sample(self, numSamples):
    """Return ``numSamples`` transitions drawn without replacement.

    The result is five tuples (states, actions, rewards, next_states, dones),
    each of length ``numSamples``.

    Raises:
      ValueError: if ``numSamples`` exceeds the number of stored transitions
        (raised by ``random.sample``).
    """
    transitions = random.sample(self.replayBuffer, numSamples)
    state, action, reward, next_state, done = zip(*transitions)
    return state, action, reward, next_state, done

  def __len__(self):
    """Number of transitions currently stored."""
    return len(self.replayBuffer)

### Epsilon Decay
- Decaying epsilon per episode
- Using a class so that different epsilon decay methods could be swapped in and out



In [0]:
class EpsilonDecay(object):
  """Multiplicative epsilon schedule with a lower bound.

  Args:
    start_epsilon: initial exploration rate, also the value restored by ``reset()``.
    final_epsilon: floor that decay never goes below.
    decay_epsilon: factor applied to the current epsilon on each ``decay()`` call.
  """

  def __init__(self, start_epsilon, final_epsilon, decay_epsilon):
    self.start_epsilon = start_epsilon
    self.final_epsilon = final_epsilon
    self.decay_epsilon = decay_epsilon
    self.current_epsilon = start_epsilon

  def decay(self):
    """Apply one decay step and return the resulting epsilon.

    Epsilon is only changed while it is above ``final_epsilon``, and the
    result is clamped so a decay step cannot undershoot the floor.
    """
    if self.current_epsilon > self.final_epsilon:
      decayed = self.current_epsilon * self.decay_epsilon
      self.current_epsilon = max(decayed, self.final_epsilon)
    return self.current_epsilon

  def get_epsilon(self):
    """Return the current epsilon without modifying it."""
    return self.current_epsilon

  def reset(self, new_epsilon=None):
    """Set epsilon to ``new_epsilon``, or back to ``start_epsilon`` when omitted.

    Returns the new current epsilon.
    """
    if new_epsilon is None:
      self.current_epsilon = self.start_epsilon
    else:
      self.current_epsilon = new_epsilon

    return self.current_epsilon

## Deep Q Network

In [56]:
class testDQN(nn.Module):
    """Fully connected Q-network with an epsilon-greedy policy and a one-step Q-learning update.

    Args:
        num_inputs: length of the flat observation vector.
        num_actions: number of discrete actions (size of the output layer).
        replay_buffer: object providing ``add(...)``, ``sample(n)`` and ``__len__``.
        epsilon: object providing ``get_epsilon()`` for the exploration rate.
        gamma: discount factor applied to the bootstrapped next-state value.
        learning_rate: learning rate of the Adam optimizer.
    """

    def __init__(self, num_inputs, num_actions, replay_buffer, epsilon, gamma, learning_rate):
        super(testDQN, self).__init__()
        self.replay_buffer = replay_buffer # Replay Buffer
        self.num_actions = num_actions
        self.epsilon = epsilon
        self.gamma = gamma

        # Size the network from the constructor arguments, not from a global `env`,
        # so the class works for whatever sizes the caller passes in.
        self.layers = nn.Sequential(
            nn.Linear(num_inputs, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, num_actions)
        )

        self.optimizer = optim.Adam(self.layers.parameters(), lr=learning_rate)

    def _to_tensor(self, data, dtype):
        """Convert array-like ``data`` to a tensor of ``dtype`` on the model's device."""
        # Follow the parameters' device instead of a global flag, so inputs always
        # live where the weights are (CPU or CUDA).
        device = next(self.parameters()).device
        return torch.as_tensor(np.asarray(data), dtype=dtype, device=device)

    def forward(self, x):
        """Return the Q-values of every action for input ``x``."""
        return self.layers(x)

    def action(self, state):
        """Pick an action index epsilon-greedily for a single observation."""
        if np.random.rand() <= self.epsilon.get_epsilon():
            return random.randrange(self.num_actions)
        # Greedy Action (no gradient needed for action selection)
        with torch.no_grad():
            q_values = self.forward(self._to_tensor(state, torch.float32))
        return int(q_values.argmax(dim=-1).item())

    def memorize(self, state, action, reward, next_state, done):
        """Store one transition in the replay buffer."""
        self.replay_buffer.add(state, action, reward, next_state, done)

    def learn(self, batch):
        """Run one gradient step on ``batch`` sampled transitions.

        Does nothing while the replay buffer holds fewer than ``batch`` transitions.
        """
        if len(self.replay_buffer) < batch:
            return
        state, action, reward, next_state, done = self.replay_buffer.sample(batch)
        state      = self._to_tensor(state, torch.float32)
        action     = self._to_tensor(action, torch.int64)
        # Rewards and done flags are kept as floats: an integer tensor cannot
        # represent fractional rewards and mixes dtypes in the target below.
        reward     = self._to_tensor(reward, torch.float32)
        next_state = self._to_tensor(next_state, torch.float32)
        done       = self._to_tensor(done, torch.float32)

        # Get Q Values of State-Action pair
        q_values = self.forward(state).gather(1, action.unsqueeze(1)).squeeze(1)

        # The target is treated as a constant, so build it without tracking gradients
        with torch.no_grad():
            # Get Max Q Values of Next State
            next_q_values = self.forward(next_state).max(1)[0]
            # Calculate Expected Q Value (If done, we do not have a next q value so it needs to be 0)
            expected_q_values = reward + self.gamma * next_q_values * (1.0 - done)

        # Fit Model
        loss = ((q_values - expected_q_values) ** 2).mean()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # Only talk to wandb when metric tracking is enabled (wandb.init is only
        # called in that case); log a plain float rather than a graph-attached tensor.
        if log_metrics:
            wandb.log({"Loss": loss.item()}, commit=False)


 class DQN(nn.Module):
    def __init__(self, num_inputs, num_actions, replay_buffer, epsilon, gamma, learning_rate):
        super(testDQN, self).__init__()
        self.replay_buffer = replay_buffer # Replay Buffer
        self.num_actions = num_actions
        self.epsilon = epsilon
        self.gamma = gamma
    
        self.features = nn.Sequential(
            nn.Conv2d(env.observation_space.shape[0], 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU()
        )
        
        self.fc = nn.Sequential(
            nn.Linear(self.feature_size(), 512),
            nn.ReLU(),
            nn.Linear(512, env.action_space.n)
        )

        self.optimizer = optim.Adam(self.layers.parameters(), lr=learning_rate)

    def feature_size(self):
        return self.features(autograd.Variable(torch.zeros(1, *self.input_shape))).view(1, -1).size(1)
        
    def forward(self, x):
      return self.layers(x)

    def action(self, state):
        if np.random.rand() <= self.epsilon.get_epsilon():  
            return random.randrange(self.num_actions)
        # Greedy Action
        state = torch.cuda.FloatTensor(state) if use_cuda else torch.FloatTensor(state)
        q_values = self.forward(state)
        values, indices = q_values.max(0)
        return indices.item()

    def memorize(self, state, action, reward, next_state, done):
        self.replay_buffer.add(state, action, reward, next_state, done)

    def learn(self, batch):
        if len(self.replay_buffer) < batch:
            return
        state, action, reward, next_state, done = self.replay_buffer.sample(batch)
        state      = torch.cuda.FloatTensor(state) if use_cuda else torch.FloatTensor(state)
        action     = torch.cuda.LongTensor(action) if use_cuda else torch.LongTensor(action)
        reward     = torch.cuda.LongTensor(reward) if use_cuda else torch.LongTensor(reward)
        next_state = torch.cuda.FloatTensor(next_state) if use_cuda else torch.FloatTensor(next_state) 
        done       = torch.cuda.LongTensor(done) if use_cuda else torch.LongTensor(done)

        # Get Q values of each state
        q_values      = self.forward(state)
        next_q_values = self.forward(next_state)

        # Get Q Values of State-Action pair
        q_values = q_values.gather(1, action.unsqueeze(1)).squeeze(1)
        # Get Max Q Values of Next State
        next_q_values = next_q_values.max(1)[0]
        # Calculate Expected Q Value (If done, we do not have a next q value so it needs to be 0)
        expected_q_values = reward + self.gamma * next_q_values * (1 - done)
        # Fit Model
        loss = ((q_values - expected_q_values.data) ** 2).mean()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        wandb.log({"Loss": loss}, commit=False)       


'''class DQN(nn.Module):
  def __init__(self, numInputs, numActions):
    self.layers == nn.Sequential(
        # 16 8x8 filters, stride 4 -> rectifier nonlinearity
        nn.Conv2d(numInputs, 32, kernel_size=(8,8), stride=4),
        nn.ReLu(),
        # 32 4x4 filter, stride 2 -> rectifier nonlinearity
        nn.Conv2d(32,64, kernel_size=(4,4), stride=2),
        nn.ReLU(),
        # fully connected, 256 rectifier
        nn.Linear(64, 256),
        nn.ReLU(),
        # fully connected  
        nn.Linear(256, numActions)
    )
        
  def forward(self, x):
    x = self.layers(x)
    return x
  
  #def act(self, state, epsilon):
    #if random.random() > epsilon):'''
    

'class DQN(nn.Module):\n  def __init__(self, numInputs, numActions):\n    self.layers == nn.Sequential(\n        # 16 8x8 filters, stride 4 -> rectifier nonlinearity\n        nn.Conv2d(numInputs, 32, kernel_size=(8,8), stride=4),\n        nn.ReLu(),\n        # 32 4x4 filter, stride 2 -> rectifier nonlinearity\n        nn.Conv2d(32,64, kernel_size=(4,4), stride=2),\n        nn.ReLU(),\n        # fully connected, 256 rectifier\n        nn.Linear(64, 256),\n        nn.ReLU(),\n        # fully connected  \n        nn.Linear(256, numActions)\n    )\n        \n  def forward(self, x):\n    x = self.layers(x)\n    return x\n  \n  #def act(self, state, epsilon):\n    #if random.random() > epsilon):'

In [68]:
# Scratch check: build the same three-layer convolutional stack used by DQN and
# print the output shape it produces for one random 224x224 input.
# NOTE(review): this cell reads `env`, which is created in the "Cart Pole Environment"
# cell further down — it only works if that cell has already been executed.
features = nn.Sequential(
            nn.Conv2d(env.observation_space.shape[0], 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU()
        )

# One sample with as many channels as the observation's first dimension
x = torch.randn(1, env.observation_space.shape[0], 224, 224)
print (features(x).size())

torch.Size([1, 64, 24, 24])


In [70]:
# Scratch arithmetic for flattened conv-output sizes.
# Flattened size of the [1, 64, 24, 24] output printed above (value is not displayed:
# only the last expression of a cell is echoed).
64*24*24
# Flattened size for a 7x7 feature map with 64 channels; this is the value the cell displays.
7 * 7 * 64

3136

### Cart Pole Environment

In [0]:
# Build CartPole and wrap it in a Monitor that writes a video to video_dir for
# every 500th episode (episode ids 0, 500, 1000, ...); force=True is passed so
# existing monitor output in that directory does not block the run.
env = gym.wrappers.Monitor(
    gym.make("CartPole-v0"),
    video_dir,
    video_callable=lambda episode_id: episode_id % 500 == 0,
    force=True,
)


### Algorithm 1 - Deep Q-Learning with Experience Replay

In [58]:
N = 1000 # Replay Buffer Capacity
M = 10000  # Number of Episodes
T = 1000  # Maximum number of steps per episode
LR = 0.00001 # Learning Rate handed to the model's optimizer
MB = 32 # Mini batch of samples
GAMMA = 0.99 # Discount factor

# Initialize replay memory D to capacity N
replay_buffer = ReplayBuffer(N)
# Initialize action-value function Q with random weights
epsilon = EpsilonDecay(1.0, 0.01, 0.996)
model = testDQN(env.observation_space.shape[0], env.action_space.n, replay_buffer, epsilon, GAMMA, LR)
if use_cuda:
  model = model.cuda()

# try/finally so the environment (and its video monitor) is closed even when
# training is interrupted from the keyboard.
try:
  # for episode = 1, M do
  for episode in range(M):
    # Initialize sequence s1 = {x} and preprocessed sequenced
    state = env.reset()
    episode_reward = 0
    # for t = 1, T do
    for t in range(T):
      # With probability e select a random action a otherwise select a = max(Q)
      action = model.action(state)
      # Execute action in emulator and observe reward and image
      next_state, reward, done, _ = env.step(action)
      # Store transition in D
      model.memorize(state, action, reward, next_state, done)
      # Set S(t+1) = st, at, x(t+1) and preprocess
      state = next_state
      episode_reward += reward
      # Sample Random minibatch of transitions from D
      model.learn(MB)
      if done:
        break
    # Epsilon is decayed once per episode, as described in the Epsilon Decay section
    epsilon.decay()
    # Only log when metric tracking is enabled (wandb.init is only called in that case).
    # This call commits the step, including any loss logged with commit=False.
    if log_metrics:
      wandb.log({"Epsilon": epsilon.get_epsilon(), "Reward": episode_reward})
finally:
  env.close()

KeyboardInterrupt: ignored

### Hyperparameters

#### Grid Search, Random Search, Bayesian Optimization, Hyperband
#### [Parameter Prioritization](https://www.wandb.com/articles/running-hyperparameter-sweeps-to-pick-the-best-model-using-w-b)

### learning rate, loss function, layer_size
### weight initialization, model depth, layer params, weight of regularization
### optimizer choice, optimize params, batch size, nonlinearity



In [0]:
## If you want to see while running - super slow
#

#plt.imshow(env.render('rgb_array'))
#ipythondisplay.clear_output(wait=True)
#ipythondisplay.display(plt.gcf())