# Prioritized Experience Replay DQN, Deep Q Learning

In [None]:
import os
import random
import numpy as np
import matplotlib.pyplot as plt

from collections import namedtuple, deque

In [None]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

In [None]:
import warnings
# Ignore every UserWarning from here on so such warnings do not clutter cell output.
warnings.simplefilter('ignore', UserWarning)

In [None]:
# Apply matplotlib's 'ggplot' style sheet to the figures drawn later in this notebook.
plt.style.use('ggplot')

## Set Configs

In [None]:
# Use the GPU when CUDA is available; otherwise run everything on the CPU.
is_cuda = torch.cuda.is_available()
device = torch.device('cuda' if is_cuda else 'cpu')

In [None]:
# Hyperparameters shared by the replay buffer, the agent and the training loop.
BUFFER_SIZE = 100_000  # capacity handed to the replay buffer
BATCH_SIZE = 64        # number of experiences drawn per learning step
GAMMA = 0.99           # discount applied to the bootstrapped next-state value
TAU = 0.001            # interpolation factor for the soft update of target parameters
LR = 0.0005            # learning rate of the Adam optimizer
PRINT_EVERY = 100      # episodes between persistent progress lines
UPDATE_EVERY = 4       # stored transitions between two learning attempts

## Set Environment

In [None]:
# Create the environment, strip its wrappers via `.unwrapped`, and seed it.
ENV_NAME = 'LunarLander-v2'
env = gym.make(ENV_NAME).unwrapped
env.seed(90);

In [None]:
print('Environment Display:')
env.reset()  # start from a new initial state before drawing a frame
env.render()

# Report the observation and action space definitions of the environment.
for template, space in (
    ('State space {}', env.observation_space),
    ('Action space {}', env.action_space),
):
    print(template.format(space))

## Build DQN Architecture

In [None]:
class DQN(nn.Module):
    """Fully connected Q-network: a state vector in, one value per action out."""

    def __init__(self, state_size, action_size, seed, fcl_units=64, fc2_units=64):
        """Seed torch's RNG and create the three linear layers.

        Args:
            state_size: number of input features of the first layer.
            action_size: number of outputs of the last layer.
            seed: value passed to ``torch.manual_seed`` before the layers are built.
            fcl_units: output width of the first layer.
            fc2_units: output width of the second layer.
        """
        super().__init__()

        # Seeding happens before layer creation, so equal seeds yield equal initial weights.
        self.seed = torch.manual_seed(seed)

        self.fc1_layer = nn.Linear(in_features=state_size, out_features=fcl_units)
        self.fc2_layer = nn.Linear(in_features=fcl_units, out_features=fc2_units)
        self.fc3_layer = nn.Linear(in_features=fc2_units, out_features=action_size)

    def forward(self, state):
        """Return the action values computed for ``state``."""
        hidden = F.relu(self.fc1_layer(state))
        hidden = F.relu(self.fc2_layer(hidden))
        # The output layer has no activation: its raw outputs are the action values.
        return self.fc3_layer(hidden)

## Set Prioritized Replay Buffer

In [None]:
class PrioritizedReplayBuffer(object):
    """Fixed-size ring buffer of experience tuples with priority-based sampling.

    Every new experience is stored with the largest priority currently in the
    buffer (1.0 for the very first one). ``sample`` draws indices with
    probability proportional to ``priority ** PER_a`` and returns
    importance-sampling weights scaled by the largest weight in the batch.
    """

    def __init__(self, capacity):
        """Create an empty buffer holding at most ``capacity`` experiences."""

        self.PER_e = 0.01  # small offset exposed to callers; not used inside this class
        self.PER_a = 0.6   # exponent turning priorities into sampling probabilities
        self.PER_b = 0.4   # exponent of the importance-sampling weights

        self.capacity = capacity
        self.memory = []
        self.data_pointer = 0  # slot written by the next add(); wraps at capacity
        self.priorities = np.zeros((capacity,), dtype=np.float32)

    def add(self, state, action, reward, next_state, done):
        """Store one experience, overwriting the oldest slot once the buffer is full."""

        # A leading axis is added so that sample() can concatenate states into a batch.
        state = np.expand_dims(state, 0)
        next_state = np.expand_dims(next_state, 0)

        max_priority = self.priorities.max() if self.memory else 1.0

        experience = (state, action, reward, next_state, done)
        if len(self.memory) < self.capacity:
            self.memory.append(experience)
        else:
            self.memory[self.data_pointer] = experience

        self.priorities[self.data_pointer] = max_priority
        self.data_pointer = (self.data_pointer + 1) % self.capacity

    def sample(self, batch_size=None):
        """Draw a prioritized batch of experiences (with replacement).

        Args:
            batch_size: number of experiences to draw; defaults to the
                module-level ``BATCH_SIZE`` when omitted.

        Returns:
            A tuple ``(indices, (states, actions, rewards, next_states, dones),
            weights)``. ``indices`` is the NumPy array of sampled buffer
            positions; the remaining items are tensors placed on the
            module-level ``device``.
        """

        if batch_size is None:
            batch_size = BATCH_SIZE

        total = len(self.memory)
        # Only the first `total` slots hold real priorities until the buffer is full.
        probs = self.priorities[:total] ** self.PER_a
        probs /= probs.sum()

        indices = np.random.choice(total, batch_size, p=probs)
        samples = [self.memory[idx] for idx in indices]

        # Importance-sampling weights, scaled so the largest weight in the batch is 1.
        weights = (total * probs[indices]) ** (-self.PER_b)
        weights /= weights.max()
        weights = weights.astype(np.float32)

        states, actions, rewards, next_states, dones = zip(*samples)

        states = torch.from_numpy(np.concatenate(states)).float().to(device)
        next_states = torch.from_numpy(np.concatenate(next_states)).float().to(device)

        # Per-experience scalars become column vectors of shape (batch, 1).
        actions = torch.from_numpy(np.expand_dims(actions, axis=1)).long().to(device)
        rewards = torch.from_numpy(np.expand_dims(rewards, axis=1)).float().to(device)
        dones = torch.from_numpy(
            np.expand_dims(dones, axis=1).astype(np.uint8)
        ).float().to(device)
        weights = torch.from_numpy(np.expand_dims(weights, axis=1)).float().to(device)

        return indices, (states, actions, rewards, next_states, dones), weights

    def update_batch(self, batch_indices, batch_priorities):
        """Overwrite the priorities stored at ``batch_indices``.

        ``batch_priorities`` may be flat or column-shaped, e.g. ``(batch, 1)``.
        It is flattened first so each slot receives a true scalar rather than
        a one-element array, whose implicit conversion newer NumPy deprecates.
        """

        flat_priorities = np.asarray(batch_priorities, dtype=np.float32).reshape(-1)
        for batch_id, priority in zip(batch_indices, flat_priorities):
            self.priorities[batch_id] = priority

    def __len__(self):
        """Return the number of experiences currently stored."""
        return len(self.memory)

## Define PER DQN Agent

In [None]:
class PER_DQNAgent():
    """Agent that acts epsilon-greedily and learns from a prioritized replay buffer.

    The online network selects the best next action and the target network
    evaluates it; the target network follows the online one via soft updates.
    """

    def __init__(self, state_size, action_size, seed):
        """Build both networks, the optimizer and the replay buffer.

        Args:
            state_size: size of a state vector (input width of the networks).
            action_size: number of discrete actions (output width of the networks).
            seed: seeds Python's ``random`` module and is forwarded to both networks.
        """

        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed first and keep the value itself.
        random.seed(seed)
        self.seed = seed

        # Q-Network
        self.dqn_net = DQN(state_size, action_size, seed).to(device)
        self.target_net = DQN(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.dqn_net.parameters(), lr=LR)

        # Replay Buffer
        self.buffer = PrioritizedReplayBuffer(BUFFER_SIZE)
        self.time_step = 0  # counts stored transitions modulo UPDATE_EVERY

    def memorize(self, state, action, reward, next_state, done):
        """Store one transition and, every UPDATE_EVERY calls, run a learning step."""

        self.buffer.add(state, action, reward, next_state, done)

        self.time_step = (self.time_step + 1) % UPDATE_EVERY
        if self.time_step != 0:
            return

        # Learn only once the buffer holds more than one batch worth of samples.
        if len(self.buffer) > BATCH_SIZE:
            batch_indices, experiences, weights = self.buffer.sample()
            self.learn(batch_indices, experiences, weights, GAMMA)

    def act(self, state, eps=0.):
        """Return an action for ``state`` using epsilon-greedy selection.

        With probability ``eps`` a uniformly random action is returned and the
        network is not evaluated; otherwise the action with the highest value
        under the online network is returned.
        """

        # Roll first so that exploratory steps skip the forward pass entirely.
        if random.random() > eps:
            state = torch.from_numpy(state).float().unsqueeze(0).to(device)
            self.dqn_net.eval()
            with torch.no_grad():
                action_values = self.dqn_net(state)
            self.dqn_net.train()
            return np.argmax(action_values.cpu().data.numpy())

        return random.choice(np.arange(self.action_size))

    def learn(self, batch_indices, experiences, weights, gamma):
        """Run one optimisation step on a sampled batch and refresh its priorities.

        Args:
            batch_indices: buffer positions of the sampled experiences.
            experiences: tuple ``(states, actions, rewards, next_states, dones)``.
            weights: importance-sampling weights multiplied into the squared error.
            gamma: discount factor applied to the next-state value.
        """

        states, actions, rewards, next_states, dones = experiences

        with torch.no_grad():
            # The online network picks the next action ...
            _, action_max = self.dqn_net(next_states).max(1)
            # ... and the target network supplies the value of that action.
            Q_target_next = self.target_net(next_states).gather(1, action_max.unsqueeze(1))
            # Terminal transitions (dones == 1) contribute the reward only.
            Q_target = rewards + (gamma * Q_target_next * (1 - dones))

        # get expected Q values from dqn network
        Q_expected = self.dqn_net(states).gather(1, actions)

        # Importance-weighted squared error per sample.
        weighted_errors = torch.pow((Q_expected - Q_target), 2) * weights

        # NOTE(review): priorities are the weighted squared error plus PER_e,
        # not the absolute TD error; kept as in the original implementation.
        priorities = weighted_errors.detach() + self.buffer.PER_e

        loss = torch.mean(weighted_errors)

        # minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Flatten (batch, 1) -> (batch,) so the buffer receives one scalar per index.
        self.buffer.update_batch(batch_indices, priorities.cpu().numpy().reshape(-1))

        # update target network
        self.soft_update(self.dqn_net, self.target_net, TAU)

    def soft_update(self, dqn_net, target_net, tau):
        """Blend online parameters into the target network in place.

        Each target parameter becomes ``tau * online + (1 - tau) * target``.
        """

        for dqn_param, target_param in zip(dqn_net.parameters(), target_net.parameters()):
            target_param.data.copy_(tau*dqn_param.data + (1.0-tau) * target_param.data)

In [None]:
# Size the agent's networks from the environment's observation and action spaces.
agent = PER_DQNAgent(
    state_size=env.observation_space.shape[0],
    action_size=env.action_space.n,
    seed=90,
)

In [None]:
# Preview the untrained agent: render a single episode, capped at 200 steps.
state = env.reset()
for time_step in range(200):

    env.render()

    # eps keeps its default of 0, so the agent always takes its greedy action
    action = agent.act(state)

    next_state, reward, done, _ = env.step(action)
    state = next_state
    if done:
        break

env.close()

## Train The Agent

In [None]:
def train_agent(num_episodes, max_time=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Train the module-level ``agent`` on the module-level ``env``.

    Args:
        num_episodes: number of episodes to run.
        max_time: maximum number of steps per episode.
        eps_start: initial epsilon for epsilon-greedy action selection.
        eps_end: lower bound on epsilon.
        eps_decay: factor epsilon is multiplied by after every episode.

    Returns:
        The list of per-episode scores (sum of rewards), one entry per episode.

    Side effects:
        Prints progress. Whenever the mean over the last (up to) 100 scores is
        at least 200, the online network's weights are written to
        ``./agents/DDQN_<ENV_NAME>.pth``; the "solved" message is printed only
        the first time that happens.
    """

    scores = []
    scores_window = deque(maxlen=100)
    eps = eps_start
    solved_reported = False

    for i_episode in range(1, num_episodes+1):
        state = env.reset()
        score = 0
        for time_step in range(max_time):
            action = agent.act(state, eps)
            next_state, reward, done, _ = env.step(action)

            agent.memorize(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break

        scores_window.append(score)
        scores.append(score)
        eps = max(eps_end, eps_decay*eps)

        # Compute the running average once per episode and reuse it below.
        average_score = np.mean(scores_window)

        # '\r' rewrites the same line; every PRINT_EVERY episodes the line is kept.
        print(f'\rEpisode: {i_episode}, Average Score: {average_score:.2f}', end='')
        if i_episode % PRINT_EVERY == 0:
            print(f'\rEpisode: {i_episode}, Average Score: {average_score:.2f}')

        if average_score >= 200.0:
            if not solved_reported:
                # Clamp at 0 so an early success never reports a negative episode count.
                print(f'\nEnvironment solved in {max(i_episode-100, 0):d} episodes! Average Score: {average_score:.2f}')
                solved_reported = True
            # Training continues; the checkpoint is refreshed on every qualifying episode.
            os.makedirs('./agents/', exist_ok=True)
            torch.save(agent.dqn_net.state_dict(), f'./agents/DDQN_{ENV_NAME}.pth')

    print('Training completed.')
    return scores

In [None]:
# Run the full training schedule and keep the per-episode scores for plotting.
scores = train_agent(
    num_episodes=2000,
    max_time=1000,
)

## Evaluate The Agent

In [None]:
# Draw the score of every training episode, save the figure, then show it.
plt.figure(figsize=(10, 5))
plt.plot(np.arange(len(scores)), scores, color='green')
plt.xlabel('Num of episodes')
plt.ylabel('Score')

# The output directory must exist before savefig writes into it.
if not os.path.exists('./images/'):
    os.makedirs('./images/')

plt.savefig('./images/plot_of_agent_evaluation.png')
plt.show()

## Watch The Smart Agent

In [None]:
# load the weights of the smart agent; map_location places the stored tensors on the
# device in use, so a checkpoint written on a GPU also loads on a CPU-only machine
agent.dqn_net.load_state_dict(torch.load(f'./agents/DDQN_{ENV_NAME}.pth', map_location=device));

In [None]:
num_of_episodes = 20

# Replay the loaded policy: render every step, at most 200 steps per episode.
for i_episode in range(1, num_of_episodes+1):

    state = env.reset()
    for time_step in range(200):

        env.render()

        # eps keeps its default of 0, so the agent always takes its greedy action
        action = agent.act(state)

        next_state, reward, done, _ = env.step(action)
        state = next_state
        if done:
            break

env.close()

---