In [1]:
%matplotlib inline

import gym
from gym.wrappers import Monitor
import itertools
import numpy as np
import os
import random
import sys
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as transforms
from PIL import Image
from torch.utils.tensorboard import SummaryWriter


if "../" not in sys.path:
    sys.path.append("../")

from lib import plots
from collections import deque, namedtuple

In [2]:
# Build the 'Breakout-v0' environment from the gym registry. The resulting
# `env` is reused by the smoke-test cell and by the training cell further down.
env = gym.envs.make('Breakout-v0')

In [3]:
# Action ids accepted by the Atari environment; the Q-network emits one value
# per entry of this list.
VALID_ACTIONS = [
    0,  # noop
    1,  # fire
    2,  # left
    3,  # right
]

In [4]:
class StateProcessor:
    """
    Processes raw Atari images: converts them to grayscale, crops the playing
    field and downsamples the result to 84x84.

    The returned frame keeps raw 0-255 intensities (uint8). Scaling to [0, 1]
    is left to the consumer so that it happens exactly once.
    """
    # Region kept before downsampling: rows 34..193 and columns 0..159.
    CROP_TOP = 34
    CROP_LEFT = 0
    CROP_HEIGHT = 160
    CROP_WIDTH = 160
    # Spatial size of the processed frame.
    OUTPUT_SIZE = (84, 84)
    # ITU-R 601 luma weights for the R, G and B channels.
    LUMA_WEIGHTS = (0.299, 0.587, 0.114)

    def process(self, state):
        """
        Args:
            state: A [height, width, 3] RGB frame with values in 0-255,
              e.g. a [210, 160, 3] Atari observation.

        Returns:
            A uint8 torch tensor of shape [84, 84] holding grayscale values
            in 0-255.

        Raises:
            ValueError: If `state` is not a 3-dimensional array whose last
              axis has exactly 3 channels.
        """
        # Converting to float32 copies integer input, so the caller's frame
        # is never modified; no in-place operation is used below either.
        frame = torch.as_tensor(np.asarray(state), dtype=torch.float32)
        if frame.dim() != 3 or frame.size(-1) != 3:
            raise ValueError(
                "expected an RGB frame of shape [height, width, 3], got {}".format(
                    tuple(frame.shape)))

        # Weighted sum over the channel axis. The axes are never reshuffled,
        # so the spatial layout of the image is preserved.
        luma = torch.tensor(self.LUMA_WEIGHTS, dtype=torch.float32)
        gray = (frame * luma).sum(dim=-1)

        bottom = self.CROP_TOP + self.CROP_HEIGHT
        right = self.CROP_LEFT + self.CROP_WIDTH
        cropped = gray[self.CROP_TOP:bottom, self.CROP_LEFT:right]

        # F.interpolate expects [batch, channels, height, width]; add the two
        # leading singleton axes for the call and strip them afterwards.
        resized = F.interpolate(cropped[None, None],
                                size=self.OUTPUT_SIZE,
                                mode="nearest")[0, 0]
        return resized.round().clamp(0, 255).to(torch.uint8)

In [5]:
class Estimator(nn.Module):
    """
    Q-Value Estimator neural network.

    Three convolutional layers followed by two linear layers map a stack of
    four 84x84 frames to one value per entry of VALID_ACTIONS.

    This network is used for both the Q-Network and the Target Network.
    """
    def __init__(self, summaries_dir=None):
        """
        Args:
          summaries_dir: Optional parent directory for TensorBoard event
            files. When given, a SummaryWriter is opened on
            <summaries_dir>/summaries_pytorch. When omitted, no summaries are
            written and `summary_writer` stays None.
        """
        super(Estimator, self).__init__()
        self.summary_writer = None
        # Number of update() calls that have been logged; used as the
        # TensorBoard step so successive updates form a curve.
        self.summary_step = 0
        self.conv1 = nn.Conv2d(in_channels=4, out_channels=32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1)
        # An 84x84 input shrinks to 20x20, 9x9 and finally 7x7 across the
        # three convolutions, hence 64 * 7 * 7 flattened features.
        self.linear1 = nn.Linear(64*7*7, 512)
        self.linear2 = nn.Linear(512, len(VALID_ACTIONS))

        if summaries_dir:
            summary_dir = os.path.join(summaries_dir, "summaries_{}".format('pytorch'))
            os.makedirs(summary_dir, exist_ok=True)
            # Write into the directory created above rather than the
            # SummaryWriter default location.
            self.summary_writer = SummaryWriter(log_dir=summary_dir)

    @staticmethod
    def _prepare_states(s):
        """
        Converts a batch of 0-255 states to a float32 tensor scaled by 1/255.

        The division is out-of-place on purpose: a float32 NumPy input can
        share memory with the tensor built from it, and an in-place division
        would then rescale the caller's array on every call.
        """
        return torch.as_tensor(s, dtype=torch.float32) / 255.0

    def forward(self, x):
        """
        Runs the network on an already scaled float tensor.

        Args:
          x: Tensor of shape [batch_size, 4, 84, 84]

        Returns:
          Tensor of shape [batch_size, NUM_VALID_ACTIONS]
        """
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = torch.flatten(x, 1)
        x = F.relu(self.linear1(x))
        x = self.linear2(x)
        return x

    def predict(self, s):
        """
        Predicts action values.

        Args:
          s: State input of shape [batch_size, 4, 84, 84] with values in
            0-255. The input is left unmodified.

        Returns:
          Tensor of shape [batch_size, NUM_VALID_ACTIONS] containing the estimated
          action values.
        """
        return self.forward(self._prepare_states(s))

    def update(self, s, a, y, optimizer):
        """
        Updates the estimator towards the given targets.

        Args:
          s: State input of shape [batch_size, 4, 84, 84] with values in
            0-255. The input is left unmodified.
          a: Chosen actions of shape [batch_size]
          y: Targets of shape [batch_size]
          optimizer: Optimizer holding this estimator's parameters. Its
            gradients are cleared before the backward pass and it is stepped
            once afterwards.

        Returns:
          The calculated loss on the batch.
        """
        # gather() requires int64 indices.
        self.actions_pl = torch.as_tensor(a, dtype=torch.int64)
        # Targets are constants for this step; never backpropagate into them.
        self.y_pl = torch.as_tensor(y, dtype=torch.float32).detach()

        states = self._prepare_states(s)

        # Gradients accumulate across backward() calls unless cleared.
        optimizer.zero_grad()
        self.preds = self.forward(states)
        # Pick, for every row, the prediction of the action that was taken.
        self.action_predictions = self.preds.gather(
            1, self.actions_pl.reshape(-1, 1)).squeeze(1)

        self.losses = (self.action_predictions - self.y_pl)**2
        self.loss = self.losses.mean()
        self.loss.backward()
        optimizer.step()

        # summaries for tensorboard
        if self.summary_writer is not None:
            step = self.summary_step
            self.summary_writer.add_scalar('Loss', self.loss.item(), step)
            self.summary_writer.add_histogram('losses', self.losses.detach(), step)
            self.summary_writer.add_histogram('q_values_hist', self.preds.detach(), step)
            self.summary_writer.add_scalar('q_values_max', self.preds.max().item(), step)
            self.summary_step += 1
        return self.loss.item()

In [6]:
# Stand-alone estimator (no summaries directory) and its RMSprop optimizer.
model = Estimator()
optimizer = optim.RMSprop(params=model.parameters(),
                          lr=0.00025, alpha=0.99, momentum=0.0, eps=1e-6)

In [7]:
# Smoke test: run one environment frame through the state processor and the
# estimator, printing the shape at every stage.
sp = StateProcessor()


# Example observation batch
observation = env.reset()
print(observation.shape)

observation_p = sp.process(observation)
print(observation_p.shape)
# Repeat the processed frame four times to form one stacked network input.
observation = np.stack([observation_p] * 4, axis=0)
print(observation.shape)
# Duplicate the stacked input to obtain a batch of two.
observations = np.array([observation] * 2)
print(observations.shape)

# Test Prediction
print(type(observations))
print(model.predict(observations))

# Test training step
y = np.array([10.0, 10.0])
a = np.array([1, 3])
print(model.update(observations, a, y,optimizer))
print('y')

(210, 160, 3)
torch.Size([84, 84])
(4, 84, 84)
(2, 4, 84, 84)
<class 'numpy.ndarray'>
tensor([[-0.0360,  0.0058, -0.0107, -0.0361],
        [-0.0360,  0.0058, -0.0107, -0.0361]], grad_fn=<AddmmBackward>)
100.30374145507812
y


In [8]:
def copy_model_parameters(model_1, model_2):
    """
    Copies the model parameters of one estimator to another.

    Args:
      model_1: Estimator to copy the parameters from
      model_2: Estimator to copy the parameters to; it is updated in place
    """
    source_state = model_1.state_dict()
    model_2.load_state_dict(source_state)

In [9]:
def make_epsilon_greedy_policy(estimator, nA):
    """
    Builds an epsilon-greedy policy on top of a Q-function approximator.

    Args:
        estimator: An estimator whose `predict` returns q values for a batch
          of states
        nA: Number of actions in the environment.

    Returns:
        A function taking (observation, epsilon) and returning the
        probabilities for each action as a numpy array of length nA.

    """
    def policy_fn(observation, epsilon):
        # The estimator works on batches, so wrap the single observation in a
        # batch of one and read back its only row.
        batch = np.expand_dims(observation, 0)
        q_values = estimator.predict(batch)[0].detach().numpy()
        greedy_action = np.argmax(q_values)

        # Every action gets epsilon / nA; the greedy one gets the remainder.
        probabilities = np.ones(nA, dtype=float) * epsilon / nA
        probabilities[greedy_action] += (1.0 - epsilon)
        return probabilities
    return policy_fn

In [20]:
def deep_q_learning(env,
                    q_estimator,
                    target_estimator,
                    state_processor,
                    num_episodes,
                    experiment_dir,
                    optimizer,
                    replay_memory_size=500000,
                    replay_memory_init_size=50000,
                    update_target_estimator_every=10000,
                    discount_factor=0.99,
                    epsilon_start=1.0,
                    epsilon_end=0.1,
                    epsilon_decay_steps=500000,
                    batch_size=32,
                    record_video_every=50):
    """
    Q-Learning algorithm for off-policy TD control using Function Approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.

    Targets are built the Double-Q way: the next action is chosen with
    `q_estimator` and its value is read from `target_estimator`.

    This function is a generator; training advances one episode per
    iteration.

    Args:
        env: OpenAI environment
        q_estimator: Estimator object used for the q values
        target_estimator: Estimator object used for the targets
        state_processor: A StateProcessor object
        num_episodes: Number of episodes to run for
        experiment_dir: Directory in which the "checkpoints" and
          "pytorch_monitor" sub-directories are created
        optimizer: Optimizer passed to `q_estimator.update`
        replay_memory_size: Size of the replay memory
        replay_memory_init_size: Number of experiences to collect when
          initializing the replay memory.
        update_target_estimator_every: Copy parameters from the Q estimator to the
          target estimator every N steps
        discount_factor: Gamma discount factor
        epsilon_start: Chance to sample a random action when taking an action.
          Epsilon is decayed over time and this is the start value
        epsilon_end: The final minimum value of epsilon after decaying is done
        epsilon_decay_steps: Number of steps to decay epsilon over
        batch_size: Size of batches to sample from the replay memory
        record_video_every: Record a video every N episodes

    Yields:
        After every episode, a tuple (total_t, stats) where stats is an
        EpisodeStats object with two numpy arrays, episode_lengths and
        episode_rewards, covering the episodes completed so far.

    Returns:
        The EpisodeStats object for all episodes (delivered as the
        generator's return value).
    """
    Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])

    def _initial_state(environment):
        # A fresh episode has a single frame, so it is repeated to fill the
        # four-frame stack.
        frame = state_processor.process(environment.reset())
        return np.stack([frame] * 4, axis=0)

    def _advance_state(current, raw_frame):
        # Drop the oldest frame and append the newly observed one.
        frame = state_processor.process(raw_frame)
        return np.append(current[1:, :, :], np.expand_dims(frame, 0), axis=0)

    # Summaries are optional: an estimator built without a summaries
    # directory has no writer, and training must still run.
    summary_writer = getattr(q_estimator, "summary_writer", None)

    # The replay memory
    replay_memory = []

    # Keeps track of useful statistics
    stats = plots.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    # Create directories for checkpoints and summaries
    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    checkpoint_path = os.path.join(checkpoint_dir, "model")
    monitor_path = os.path.join(experiment_dir, "pytorch_monitor")
    print(monitor_path)
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    if not os.path.exists(monitor_path):
        print("created monitor dir")
        os.makedirs(monitor_path)

    # Resume from a previous checkpoint if there is one
    if os.path.exists(checkpoint_path):
        print("Loading model checkpoint {}...\n".format(checkpoint_path))
        q_estimator.load_state_dict(torch.load(checkpoint_path))

    # Global step counter
    total_t = 0

    # The epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

    # The policy we're following
    policy = make_epsilon_greedy_policy(q_estimator, len(VALID_ACTIONS))

    # Populate the replay memory with initial experience
    print("Populating replay memory...")
    state = _initial_state(env)

    for _ in range(replay_memory_init_size):
        action_probs = policy(state, epsilons[min(total_t, epsilon_decay_steps-1)])
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        next_state, reward, done, _ = env.step(action)
        next_state = _advance_state(state, next_state)
        replay_memory.append(Transition(state, action, reward, next_state, done))
        if done:
            # The episode is over: start a new one. Re-processing the old
            # stacked state here would neither reset the environment nor
            # produce a valid frame.
            state = _initial_state(env)
        else:
            state = next_state

    print(len(replay_memory))
    env = Monitor(env,
                  directory=monitor_path,
                  resume=True,
                  video_callable=lambda count: count % record_video_every == 0)

    try:
        for i_episode in range(num_episodes):
            # Save a checkpoint at the start of every episode
            torch.save(q_estimator.state_dict(), checkpoint_path)

            # reset env
            state = _initial_state(env)
            loss = None

            for t in itertools.count():
                # get epsilon for time step
                epsilon = epsilons[min(total_t, epsilon_decay_steps-1)]

                # Epsilon changes per step, so it is logged against the
                # global step rather than the episode index.
                if summary_writer is not None:
                    summary_writer.add_scalar(scalar_value=epsilon, tag="epsilon", global_step=total_t)

                if total_t % update_target_estimator_every == 0:
                    copy_model_parameters(q_estimator, target_estimator)

                # Print out which step we're on, useful for debugging.
                print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format(
                    t, total_t, i_episode + 1, num_episodes, loss), end="")
                sys.stdout.flush()

                # take a step in environment
                action_probs = policy(state, epsilon)
                action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
                next_state, reward, done, _ = env.step(action)
                next_state = _advance_state(state, next_state)

                # If our replay memory is full, pop the first element.
                # ">=" keeps the buffer from growing when it was initialised
                # beyond its nominal capacity.
                if len(replay_memory) >= replay_memory_size:
                    replay_memory.pop(0)

                replay_memory.append(Transition(state, action, reward, next_state, done))

                # Update statistics
                stats.episode_rewards[i_episode] += reward
                stats.episode_lengths[i_episode] = t

                # random.sample raises if asked for more items than exist, so
                # learning starts once a full batch is available.
                if len(replay_memory) >= batch_size:
                    sample = random.sample(replay_memory, batch_size)
                    states_batch, actions_batch, reward_batch, next_states_batch, done_batch = map(np.array, zip(*sample))

                    # The targets are constants for the update, so no
                    # autograd graph is needed while computing them.
                    with torch.no_grad():
                        q_values_next = q_estimator.predict(next_states_batch).cpu().numpy()
                        q_values_next_target = target_estimator.predict(next_states_batch).cpu().numpy()

                    # Choose the action with the online network, evaluate it
                    # with the target network.
                    best_actions = np.argmax(q_values_next, axis=1)
                    not_done = np.invert(done_batch).astype(np.float32)
                    targets_batch = reward_batch + not_done * \
                        discount_factor * q_values_next_target[np.arange(batch_size), best_actions]

                    # perform the gradient step
                    loss = q_estimator.update(states_batch, actions_batch, targets_batch, optimizer)

                if done:
                    break

                state = next_state
                total_t += 1

            # Per-episode summaries for tensorboard
            if summary_writer is not None:
                summary_writer.add_scalar(scalar_value=stats.episode_rewards[i_episode], tag="episode_reward", global_step=i_episode)
                summary_writer.add_scalar(scalar_value=stats.episode_lengths[i_episode], tag="episode_length", global_step=i_episode)
                summary_writer.flush()

            yield total_t, plots.EpisodeStats(
                episode_lengths=stats.episode_lengths[:i_episode+1],
                episode_rewards=stats.episode_rewards[:i_episode+1])
    finally:
        # Close the Monitor wrapper (and the environment it wraps), also
        # when the caller stops iterating early or an error occurs.
        env.close()
    return stats

In [22]:
# Experiment set-up: state processor and output directory.
state_processor = StateProcessor()
experiment_dir = os.path.abspath("./DobleDQN_experiments/{}".format(env.spec.id))

# Online network (writes summaries below experiment_dir) and its optimizer.
q_estimator = Estimator(summaries_dir=experiment_dir)
optimizer = optim.RMSprop(params=q_estimator.parameters(),
                          lr=0.00025, alpha=0.99, momentum=0.0, eps=1e-6)

# Target network; built without a summaries directory.
target_estimator = Estimator()

# deep_q_learning yields once per finished episode.
for t, stats in deep_q_learning(
        env,
        q_estimator=q_estimator,
        target_estimator=target_estimator,
        state_processor=state_processor,
        num_episodes=5,
        experiment_dir=experiment_dir,
        optimizer=optimizer,
        replay_memory_size=500000,
        replay_memory_init_size=500,
        update_target_estimator_every=100,
        discount_factor=0.99,
        epsilon_start=1.0,
        epsilon_end=0.1,
        epsilon_decay_steps=500000,
        batch_size=32):
    print("\nEpisode Reward: {}".format(stats.episode_rewards[-1]))