<a href="https://colab.research.google.com/github/mohantyk/deep-rl/blob/master/Chap6_DeepQNetworks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tensorboardX



In [2]:
import gym
from collections import defaultdict
from tensorboardX import SummaryWriter
import math

In [3]:
%load_ext tensorboard

# FrozenLake Q-Learning

In [4]:
ENV_NAME = 'FrozenLake-v0'  # Gym environment id used by the tabular Q-learning section
GAMMA = 0.9  # Discount factor applied to the best value of the next state
ALPHA = 0.2  # Learning rate: weight of the new estimate when blending into the old value
TEST_EPISODES = 20  # Number of greedy test episodes averaged after every update

In [5]:
class Agent:
  '''
  Tabular Q-learning agent.
  Keeps a table of action values keyed by (state, action) and owns one
  environment instance that it explores with random actions.
  '''
  def __init__(self):
    self.env = gym.make(ENV_NAME)
    # Q-table keyed by (state, action); unseen pairs read as 0.0
    self.values = defaultdict(float)
    self.state = self.env.reset()

  def sample_env(self):
    '''
    Take one random action in the agent's own environment.
    Returns the transition as a (s, a, r, s') tuple. When the step ends the
    episode the environment is reset for the next call, but the returned s'
    is still the state produced by the step.
    '''
    prev_state = self.state
    action = self.env.action_space.sample()
    next_state, reward, finished, _ = self.env.step(action)
    if finished:
      self.state = self.env.reset()
    else:
      self.state = next_state
    return (prev_state, action, reward, next_state)

  def best_value_and_action(self, state):
    '''
    Returns (best_value, best_action) for the given state.
    Ties are resolved in favour of the lowest action index.
    '''
    top_value, top_action = -math.inf, None
    for candidate in range(self.env.action_space.n):
      q = self.values[(state, candidate)]
      if q > top_value:
        top_value, top_action = q, candidate
    return top_value, top_action

  def value_update(self, s, a, r, next_s):
    '''Blend the one-step target for (s, a) into the table, given a (s, a, r, s') occurrence'''
    previous = self.values[(s, a)]
    next_best, _ = self.best_value_and_action(next_s)
    target = r + GAMMA*next_best
    self.values[(s, a)] = previous*(1-ALPHA) + target * ALPHA

  def play_episode(self, env):
    '''
    Play one full episode in env, always taking the best known action.
    Returns the total (undiscounted) reward of the episode.
    '''
    episode_reward = 0.0
    state = env.reset()
    while True:
      _, action = self.best_value_and_action(state)
      state, reward, finished, _ = env.step(action)
      episode_reward += reward
      if finished:
        return episode_reward

In [6]:
test_env = gym.make(ENV_NAME)  # Separate environment, used only for greedy test episodes
agent = Agent()  # Builds its own environment for random exploration
writer = SummaryWriter(comment='-q-learning')  # Logs the 'reward' scalar written by the training loop

In [7]:
# Tabular Q-learning loop: one random environment step and one table update
# per iteration, followed by a greedy evaluation on the separate test_env.
iter_no = 0
best_reward = 0.0
while True:
  iter_no += 1
  # Explore with a random action and learn from the observed transition
  state, action, reward, next_state = agent.sample_env()
  agent.value_update(state, action, reward, next_state)

  # Evaluate the current greedy policy: mean total reward over TEST_EPISODES episodes
  test_reward = 0.0
  for _ in range(TEST_EPISODES):
    test_reward += agent.play_episode(test_env)
  test_reward /= TEST_EPISODES
  writer.add_scalar('reward', test_reward, iter_no)
  if test_reward > best_reward:
    print(f'{iter_no}: Best reward updated from {best_reward} -> {test_reward}')
    best_reward = test_reward
  # Stop as soon as the averaged test reward exceeds 0.8
  if test_reward > 0.8:
    print(f'Solved in {iter_no} iterations')
    break
writer.close()

400: Best reward updated from 0.0 -> 0.6
3096: Best reward updated from 0.6 -> 0.65
3124: Best reward updated from 0.65 -> 0.7
8088: Best reward updated from 0.7 -> 0.75
8108: Best reward updated from 0.75 -> 0.85
Solved in 8108 iterations


# Deep Q-learning

## Wrappers

In [8]:
import cv2
import numpy as np
from collections import deque, namedtuple

In [9]:
class FireResetEnv(gym.Wrapper):
  '''
  Presses FIRE (action 1) and then action 2 right after every reset.
  Requires an environment whose action 1 is 'FIRE' and that exposes more
  than 3 actions; both conditions are asserted at construction time.
  '''
  def __init__(self, env=None):
    super().__init__(env)
    action_meanings = env.unwrapped.get_action_meanings()
    assert action_meanings[1] == 'FIRE'
    assert len(action_meanings) > 3

  def step(self, action):
    '''Pass the action straight through to the wrapped environment'''
    return self.env.step(action)

  def reset(self):
    '''
    Reset underlying environment, then press FIRE and take action 2.
    If either of these steps ends the episode, the environment is reset again.
    Returns the latest observation.
    '''
    # reset() returns only the observation, unlike step() which returns a
    # 4-tuple, so it must not be unpacked into (obs, reward, done, info).
    obs = self.env.reset()
    # Action 1 is FIRE (asserted in __init__). Action 2 is presumably RIGHT in
    # the standard Atari action set -- NOTE(review): not checked here, confirm.
    for action in (1, 2):
      obs, _, is_done, _ = self.env.step(action)
      if is_done:
        obs = self.env.reset()
    return obs
    

In [10]:
class MaxAndSkipEnv(gym.Wrapper):
  '''
  Repeats same action for `skip` frames.
  Takes max of pixels over the last 2 frames.
  '''
  def __init__(self, env=None, skip=4):
    '''
    env: environment to wrap
    skip: number of consecutive frames each action is repeated for (>= 1)
    Raises ValueError if skip is smaller than 1.
    '''
    super().__init__(env)
    if skip < 1:
      # With no inner step, step() would have nothing to return
      raise ValueError(f'skip must be at least 1, got {skip}')
    self._obs_buffer = deque(maxlen=2) # To take max of pixels
    self._skip = skip # Honour the argument instead of a hard-coded 4

  def step(self, action):
    '''
    Take same action for skip frames, summing the rewards.
    Stops early if the episode ends.
    When returning observation, take maximum of pixels over the buffered
    frames (at most the 2 most recent ones).
    '''
    total_reward = 0.0
    for _ in range(self._skip):
      obs, reward, is_done, info = self.env.step(action)
      self._obs_buffer.append(obs)
      total_reward += reward
      if is_done:
        break
    max_frame = np.max(np.stack(self._obs_buffer), axis=0)
    return max_frame, total_reward, is_done, info

  def reset(self):
    '''Clear the frame buffer, reset the environment and buffer its first frame'''
    self._obs_buffer.clear()
    obs = self.env.reset()
    self._obs_buffer.append(obs)
    return obs

In [11]:
class ProcessFrame84(gym.ObservationWrapper):
  '''
  Turns RGB frames of 210x160 or 250x160 pixels into 84x84x1 grayscale
  uint8 observations.
  '''
  def __init__(self, env=None):
    super().__init__(env)
    self.observation_space = gym.spaces.Box(
        low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)

  def observation(self, obs):
    return ProcessFrame84.process(obs)

  @staticmethod
  def process(frame):
    '''
    Convert one raw frame to a (84, 84, 1) uint8 grayscale image.
    Raises ValueError if the frame size matches neither supported resolution.
    '''
    # The frame height is recognised from the total number of elements
    for height in (210, 250):
      if frame.size == height * 160 * 3:
        rgb = np.reshape(frame, (height, 160, 3)).astype(np.float32)
        break
    else:
      raise ValueError('Unknown resolution')

    # Weighted sum of the three colour channels gives the grayscale image
    red, green, blue = rgb[:, :, 0], rgb[:, :, 1], rgb[:, :, 2]
    gray = red * 0.299 + green * 0.587 + blue * 0.114
    # Shrink to 110 rows x 84 columns, then keep rows 18..101 for a square 84x84 crop
    shrunk = cv2.resize(gray, (84, 110), interpolation=cv2.INTER_AREA)
    cropped = shrunk[18:102, :]
    return np.reshape(cropped, (84, 84, 1)).astype(np.uint8)

In [12]:
class BufferWrapper(gym.ObservationWrapper):
  '''
  Buffers N frames into a single observation
  Needed for obtaining info about dynamics
  '''
  def __init__(self, env, n_steps, dtype=np.float32):
    '''
    env: environment to wrap
    n_steps: how many times the wrapped observation space is repeated along axis 0
    dtype: dtype of the stacked observation space (and so of the buffer)
    '''
    super().__init__(env)
    self.dtype = dtype
    old_space = env.observation_space
    self.observation_space = gym.spaces.Box(old_space.low.repeat(n_steps, axis=0),
                                            old_space.high.repeat(n_steps, axis=0),
                                            dtype=dtype)
    # Create the buffer up front so observation() also works before the first reset()
    self.buffer = np.zeros_like(self.observation_space.low)

  def reset(self):
    '''Start from an all-zero buffer and push the first observation into it'''
    self.buffer = np.zeros_like(self.observation_space.low)
    return self.observation(self.env.reset())

  def observation(self, obs):
    '''
    Remove oldest observation. Append new observation at end of buffer.
    Returns a copy of the buffer.
    '''
    self.buffer[:-1] = self.buffer[1:]
    self.buffer[-1] = obs
    # The buffer is updated in place on every step. Handing out the buffer
    # itself would make every previously returned observation (e.g. states
    # kept in a replay buffer) silently change, so return a snapshot.
    return self.buffer.copy()

In [13]:
class ImageToPyTorch(gym.ObservationWrapper):
  '''
  Moves the channel axis to the front: height/width/channel observations
  become channel/height/width, the layout PyTorch convnets expect.
  '''
  def __init__(self, env):
    super().__init__(env)
    hwc = env.observation_space.shape
    # Same axis permutation as observation(): axis 2 goes first
    chw = tuple(hwc[axis] for axis in (2, 0, 1))
    self.observation_space = gym.spaces.Box(
        low=0.0, high=1.0, shape=chw, dtype=np.float32)

  def observation(self, obs):
    return np.moveaxis(obs, source=2, destination=0)

In [14]:
class ScaledFloatFrame(gym.ObservationWrapper):
  '''
  Converts observations to float32 and divides them by 255,
  which maps values in 0..255 to the [0,1] range
  '''
  def observation(self, obs):
    as_float = np.array(obs).astype(np.float32)
    return as_float / 255.0

In [15]:
def make_env(env_name):
  '''
  Create the gym environment env_name and apply the wrapper chain.
  Order, innermost first: MaxAndSkipEnv, FireResetEnv, ProcessFrame84,
  ImageToPyTorch, BufferWrapper (4 frames), ScaledFloatFrame.
  '''
  raw_env = gym.make(env_name)
  frames = ProcessFrame84(FireResetEnv(MaxAndSkipEnv(raw_env)))
  stacked = BufferWrapper(ImageToPyTorch(frames), 4)
  return ScaledFloatFrame(stacked)

## DQN model

In [16]:
import torch
import torch.nn as nn
import time

In [17]:
class DQN(nn.Module):
  '''
  Convolutional Q-network: three conv layers followed by two fully
  connected layers producing one value per action.
  '''
  def __init__(self, input_shape, num_actions):
    '''
    input_shape: shape of one observation, channels first (no batch dim)
    num_actions: number of outputs of the last layer
    '''
    super().__init__()
    conv_layers = [
        nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),
        nn.ReLU(),
        nn.Conv2d(32, 64, kernel_size=4, stride=2),
        nn.ReLU(),
        nn.Conv2d(64, 64, kernel_size=2, stride=1),
        nn.ReLU(),
    ]
    self.conv = nn.Sequential(*conv_layers)
    # The first linear layer needs the flattened size of the conv output,
    # which depends on input_shape; it is measured once, here.
    flat_size = self._get_conv_out(input_shape)
    self.fc = nn.Sequential(
        nn.Linear(flat_size, 512),
        nn.ReLU(),
        nn.Linear(512, num_actions),
    )

  def _get_conv_out(self, input_shape):
    '''Number of values the conv stack emits for a single observation'''
    # Push one all-zero observation through the conv stack and count the outputs
    probe = torch.zeros(1, *input_shape)
    return self.conv(probe).numel()

  def forward(self, x):
    '''x: batch of observations, batch dim first. Returns one row of action values per observation'''
    features = self.conv(x)
    # Flatten everything except the batch dimension before the linear layers
    flat = features.view(x.shape[0], -1)
    return self.fc(flat)

## Training

In [18]:
DEFAULT_ENV_NAME = 'PongNoFrameskip-v4'  # Gym environment id passed to make_env
MEAN_REWARD_BOUND = 19.0  # Training stops once the mean reward of the last 100 episodes exceeds this
# Training Parameters
GAMMA = 0.99  # Discount factor used by calc_loss (rebinds the GAMMA of the FrozenLake section)
BATCH_SIZE = 32  # Number of transitions sampled from the replay buffer per training step
REPLAY_SIZE = 10000 # Maximum capacity of replay buffer
REPLAY_START_SIZE = 10000 # Number of buffered transitions required before training starts
LEARNING_RATE = 1e-4 # Adam learning rate
SYNC_TARGET_FRAMES = 1000 # How often (in frames) to sync weights from the training model to the target model
# Epsilon Decay parameters
EPSILON_START = 1.0  # Epsilon at frame 0
EPSILON_FINAL = 0.1  # Lower bound epsilon is clamped to
EPSILON_DECAY_LAST_FRAME = 150000 # Epsilon decays linearly by frame_idx / this value
# Device
DEVICE = 'cuda'  # Device string handed to torch.device


In [19]:
# One stored transition; the field order is relied upon by ExperienceBuffer.sample()
Experience = namedtuple(
    'Experience',
    ('state', 'action', 'reward', 'done', 'new_state'),
)


class ExperienceBuffer:
  '''
  Fixed-capacity replay buffer.
  Once capacity is reached, appending drops the oldest experience.
  '''
  def __init__(self, capacity):
    self.buffer = deque(maxlen=capacity)

  def __len__(self):
    return len(self.buffer)

  def append(self, experience):
    self.buffer.append(experience)

  def sample(self, batch_size):
    '''
    Draw batch_size distinct experiences uniformly at random.
    Returns five arrays (states, actions, rewards, dones, next_states),
    each with one entry per drawn experience; rewards are float32 and
    dones are uint8.
    '''
    chosen = np.random.choice(len(self.buffer), batch_size, replace=False)
    picked = [self.buffer[position] for position in chosen]
    # Transpose the list of 5-field records into five per-field tuples
    states, actions, rewards, dones, next_states = zip(*picked)
    return (
        np.array(states),
        np.array(actions),
        np.array(rewards, dtype=np.float32),
        np.array(dones, dtype=np.uint8),
        np.array(next_states),
    )

In [20]:
# Small code snippet to demonstrate what is happening in sample() above
a = [(1,2,3), (10,20,30), (100,200,300)]  # Three records with three fields each
indices = [0, 1, 2]  # Positions of the records to pick
# zip(*records) transposes the picked records: one tuple per field position
x, y, z = zip(*[a[idx] for idx in indices])
x  # First field of every picked record; shown as the cell output

(1, 10, 100)

In [21]:
class Agent:
  '''
  Plays an environment one step at a time with an epsilon-greedy policy
  and stores every transition in the given experience buffer.
  '''
  def __init__(self, env, exp_buffer):
    '''
    env: environment to act in (reset(), step(), action_space.sample() are used)
    exp_buffer: buffer whose append() receives one Experience per step
    '''
    self.env = env
    self.exp_buffer = exp_buffer
    self._reset()

  def _reset(self):
    '''Start a new episode: reset the environment and the running reward'''
    self.state = self.env.reset()
    self.total_reward = 0.0

  @torch.no_grad()  # Inference only: no autograd graph is built while acting
  def play_step(self, net, epsilon=0.0, device='cuda'):
    '''
    Take one epsilon-greedy step and record it in the experience buffer.
    net: network mapping a batch of states to action values
    epsilon: probability of taking a random action instead of the greedy one
    device: device the state tensor is moved to before calling net
    Returns the total reward of the episode if this step finished it,
    otherwise None.
    '''
    done_reward = None # Reward only after episode is complete
    # Choose action e-greedily
    if np.random.random() < epsilon:
      action = self.env.action_space.sample()
    else:
      # Wrap the state in a list to add the batch dimension. No copy=False
      # here: building an array from a list always needs a copy, which
      # NumPy >= 2.0 rejects with a ValueError when copy=False is given.
      obs = np.array([self.state])
      obs_v = torch.tensor(obs).to(device)
      q_vals = net(obs_v)
      _, max_idx = torch.max(q_vals, dim=1)
      action = max_idx.item() # max_idx is a tensor
    # Take step
    new_state, reward, done, _ = self.env.step(action)
    self.total_reward += reward
    # Save experience to replay buffer
    exp = Experience(self.state, action, reward, done, new_state)
    self.exp_buffer.append(exp)
    # Update state, handle end of episode
    self.state = new_state
    if done:
      done_reward = self.total_reward
      self._reset()
    return done_reward


In [22]:
def calc_loss(batch, net, tgt_net, device='cuda', gamma=None):
  '''
  Mean squared error between predicted and target action values.
  batch: tuple of arrays (output of ExperienceBuffer.sample())
  net: network we are training
  tgt_net: target network (for target estimates)
  device: device the batch tensors are moved to
  gamma: discount factor; None means use the module-level GAMMA
  Returns a scalar loss tensor. Gradients reach net only, never tgt_net.
  '''
  if gamma is None:
    gamma = GAMMA
  states, actions, rewards, dones, next_states = batch # Each is numpy array
  # Convert to tensors on appropriate device
  states_v = torch.tensor(states).to(device)
  next_states_v = torch.tensor(next_states).to(device)
  # gather() needs int64 indices; numpy may deliver int32 (e.g. on Windows)
  actions_v = torch.tensor(actions, dtype=torch.int64).to(device)
  rewards_v = torch.tensor(rewards).to(device)
  done_mask = torch.tensor(dones, dtype=torch.bool).to(device)
  # Predict values of the actions that were actually taken
  state_action_vals = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)
  # Target estimates are constants for the optimizer: build them without autograd
  with torch.no_grad():
    next_state_vals, _ = tgt_net(next_states_v).max(1) # ignore max indices
    next_state_vals[done_mask] = 0.0 # if episode is done, only need to add reward
  expected_state_vals = rewards_v + gamma * next_state_vals
  # Calculate loss
  return nn.MSELoss()(state_action_vals, expected_state_vals)

In [23]:
device = torch.device(DEVICE)
env = make_env(DEFAULT_ENV_NAME)  # Wrapped environment (see make_env for the wrapper chain)
# Two networks with identical architecture: net is trained, tgt_net provides
# the target estimates in calc_loss and is overwritten from net periodically.
net = DQN(env.observation_space.shape, env.action_space.n).to(device)
tgt_net = DQN(env.observation_space.shape, env.action_space.n).to(device)
writer = SummaryWriter(comment='-'+DEFAULT_ENV_NAME)
print(net)

DQN(
  (conv): Sequential(
    (0): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))
    (1): ReLU()
    (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
    (3): ReLU()
    (4): Conv2d(64, 64, kernel_size=(2, 2), stride=(1, 1))
    (5): ReLU()
  )
  (fc): Sequential(
    (0): Linear(in_features=4096, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=6, bias=True)
  )
)


In [24]:
# Create Agent
replay_buffer = ExperienceBuffer(REPLAY_SIZE)  # Holds at most REPLAY_SIZE transitions
epsilon = EPSILON_START  # Recomputed from frame_idx on every frame of the training loop
agent = Agent(env, replay_buffer)  # Resets env once on construction

In [None]:
# Create optimizer and counters
optimizer = torch.optim.Adam(net.parameters(), lr=LEARNING_RATE)
total_rewards = []  # Total reward of every finished episode
frame_idx = 0  # Number of environment steps played so far
ts_frame = 0  # frame_idx at the end of the previous episode (for the speed estimate)
ts = time.time()  # Wall-clock time at the end of the previous episode
best_mean_reward = -math.inf

# Main training loop: one environment step per iteration and, once the replay
# buffer holds REPLAY_START_SIZE transitions, one optimizer step per iteration.
while True:
  frame_idx += 1
  # Linear decay from EPSILON_START, clamped at EPSILON_FINAL
  epsilon = max(EPSILON_FINAL, EPSILON_START - frame_idx / EPSILON_DECAY_LAST_FRAME)

  # play_step returns the episode's total reward when it ends, otherwise None
  reward = agent.play_step(net, epsilon, device)
  # If episode finished, calculate a bunch of indicators
  if reward is not None: 
    total_rewards.append(reward)
    # Frames per second since the previous episode ended
    speed = (frame_idx - ts_frame)/(time.time() - ts)
    ts_frame = frame_idx
    ts = time.time()
    # Mean total reward over (up to) the last 100 episodes
    mean_reward = np.mean(total_rewards[-100:])
    print(f'{frame_idx}: done {len(total_rewards)} games, reward {mean_reward:.3f}, epsilon {epsilon:.2f} speed {speed:.2f} f/s')

    writer.add_scalar('epsilon', epsilon, frame_idx)
    writer.add_scalar('speed', speed, frame_idx)
    writer.add_scalar('reward_100', mean_reward, frame_idx)
    writer.add_scalar('reward', reward, frame_idx)

    if mean_reward > best_mean_reward:
      print(f'Best reward updated {best_mean_reward:.3f} -> {mean_reward:.3f}')
      best_mean_reward = mean_reward

    # Stop training once the running mean exceeds the bound
    if mean_reward > MEAN_REWARD_BOUND:
      print(f'Solved in {frame_idx} frames!')
      break

  # Wait for minimum buffer size before starting to train
  if len(replay_buffer) < REPLAY_START_SIZE:
    continue
  # Sync target net and training net 
  if frame_idx % SYNC_TARGET_FRAMES == 0:
    tgt_net.load_state_dict(net.state_dict())
  # Train networks: one gradient step on a freshly sampled batch
  optimizer.zero_grad()
  batch = replay_buffer.sample(BATCH_SIZE)
  loss_t = calc_loss(batch, net, tgt_net, device)
  loss_t.backward()
  optimizer.step()
  
writer.close()

944: done 1 games, reward -20.000, epsilon 0.99 speed 672.52 f/s
Best reward updated -inf -> -20.000
1960: done 2 games, reward -20.000, epsilon 0.99 speed 676.55 f/s
2994: done 3 games, reward -20.333, epsilon 0.98 speed 663.25 f/s
3964: done 4 games, reward -20.250, epsilon 0.97 speed 671.52 f/s
5051: done 5 games, reward -20.000, epsilon 0.97 speed 669.19 f/s
5931: done 6 games, reward -20.167, epsilon 0.96 speed 655.17 f/s
7006: done 7 games, reward -20.000, epsilon 0.95 speed 656.33 f/s
7787: done 8 games, reward -20.125, epsilon 0.95 speed 666.68 f/s
8671: done 9 games, reward -20.222, epsilon 0.94 speed 640.86 f/s
9451: done 10 games, reward -20.300, epsilon 0.94 speed 633.75 f/s
10273: done 11 games, reward -20.364, epsilon 0.93 speed 230.93 f/s
11160: done 12 games, reward -20.333, epsilon 0.93 speed 100.38 f/s
11970: done 13 games, reward -20.385, epsilon 0.92 speed 104.04 f/s
12965: done 14 games, reward -20.286, epsilon 0.91 speed 102.19 f/s
13746: done 15 games, reward -20