# Train the CartPole balancing problem / Atari problems in OpenAI Gym using a Dueling DQN network with prioritized experience replay backed by a sum tree

In [1]:
#Initialize
import math, random
import gym
import cv2
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd
import torch.autograd as Variable
import torch.nn.functional as F
from collections import deque

# CartPole environment used by the first training loop in this file.
env = gym.make("CartPole-v0")

# Exploration rate: starts at 1.0 and is multiplied by `decay` after each
# episode while it is still above `epsilonMin`.
epsilon = 1.0
epsilonMin = 0.01
decay = 0.999
episodes = 500  # number of training episodes
batch_size = 32  # transitions drawn from replay memory per update
gamma = 0.99  # discount factor applied to the next-state value in the TD target
goal_steps = 200  # not referenced by the code visible in this file

In [2]:
# Network
class DuelingDQN(nn.Module):
    """Fully connected dueling Q-network.

    A shared hidden layer feeds two heads: an advantage head with one output
    per action and a value head with a single output. The heads are combined
    as ``Q = V + A - mean(A)``, where the mean is taken over the actions of
    each sample.

    Args:
        num_inputs: Size of the observation vector.
        num_outputs: Number of discrete actions.
    """

    def __init__(self, num_inputs, num_outputs):
        super(DuelingDQN, self).__init__()
        self.fc1 = nn.Linear(num_inputs, 128)

        self.a1 = nn.Linear(128, 128)
        self.a2 = nn.Linear(128, num_outputs)

        self.val1 = nn.Linear(128, 128)
        self.val2 = nn.Linear(128, 1)

    def forward(self, x):
        """Return Q-values for ``x``.

        Args:
            x: Float tensor whose last dimension is ``num_inputs``; both a
                single observation and a batch of observations are accepted.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of ``num_outputs``.
        """
        x = F.relu(self.fc1(x))

        adv = F.relu(self.a1(x))
        adv = self.a2(adv)

        val = F.relu(self.val1(x))
        val = self.val2(val)

        # Average over the action axis only. A global mean would mix the
        # advantages of every sample in the batch, making one sample's
        # Q-values depend on the rest of the batch.
        return val + adv - adv.mean(dim=-1, keepdim=True)

    def act(self, state, epsilon):
        """Choose an action epsilon-greedily.

        Args:
            state: A single observation (array-like of length ``num_inputs``).
            epsilon: Probability of picking a uniformly random action.

        Returns:
            The chosen action index as an ``int``.
        """
        if random.random() > epsilon:
            # Inference only: no autograd graph is needed here.
            with torch.no_grad():
                state = torch.FloatTensor(state).unsqueeze(0)
                q_value = self.forward(state)
            return int(q_value.max(1)[1].item())
        # Draw from this network's own action count rather than a global
        # environment, which may have been rebound to a different task.
        return random.randrange(self.a2.out_features)

In [3]:
# model1 is the network that is trained and used to pick actions; model2 is a
# second copy used to evaluate the next-state value in the TD target.
model1 = DuelingDQN(env.observation_space.shape[0], env.action_space.n)
model2 = DuelingDQN(env.observation_space.shape[0], env.action_space.n)

# Only model1's parameters are optimized; model2 is refreshed through sync().
optimizer = optim.Adam(model1.parameters())
def sync(model1, model2):
    """Overwrite ``model2``'s weights with a copy of ``model1``'s state dict."""
    source_weights = model1.state_dict()
    model2.load_state_dict(source_weights)

# Start with both networks holding identical weights.
sync(model1, model2)

In [4]:
class SumTree:
    """Binary sum tree stored in a flat array.

    The last ``capacity`` entries of ``tree`` are leaves holding one priority
    each; every internal node holds the sum of its two children, so
    ``tree[0]`` is the total priority. ``data`` is a ring buffer of the
    payloads that belong to the leaves.

    Args:
        capacity: Maximum number of stored items (must be at least 1).

    Raises:
        ValueError: If ``capacity`` is smaller than 1.
    """

    def __init__(self, capacity):
        if capacity < 1:
            raise ValueError("capacity must be a positive integer")
        self.capacity = capacity
        self.tree = np.zeros(2*capacity - 1)
        self.data = np.zeros(capacity, dtype=object)
        # Next slot of the ring buffer to be overwritten. Kept per instance
        # so that separate trees never share a cursor.
        self.write = 0

    def _propagate(self, idx, change):
        """Add ``change`` to every ancestor of node ``idx`` up to the root."""
        # Looping from the node itself means the root (idx == 0) has no
        # ancestors to touch, which keeps a capacity-1 tree correct.
        while idx != 0:
            idx = (idx - 1)//2
            self.tree[idx] += change

    def _retrieve(self, idx, s):
        """Descend from node ``idx`` to the leaf covering cumulative sum ``s``."""
        while True:
            left = 2*idx + 1
            if left >= len(self.tree):
                # No children: idx is a leaf.
                return idx
            if s <= self.tree[left]:
                idx = left
            else:
                # Skip the mass of the left subtree and continue on the right.
                s -= self.tree[left]
                idx = left + 1

    def total(self):
        """Return the sum of all stored priorities."""
        return self.tree[0]

    def add(self, p, data):
        """Store ``data`` with priority ``p``, overwriting the oldest slot when full."""
        idx = self.write + self.capacity - 1

        self.data[self.write] = data
        self.update(idx, p)

        self.write += 1
        if self.write >= self.capacity:
            self.write = 0

    def update(self, idx, p):
        """Set the priority of tree node ``idx`` to ``p`` and fix the sums above it."""
        change = p - self.tree[idx]
        self.tree[idx] = p
        self._propagate(idx, change)

    def get(self, s):
        """Return ``(tree_index, priority, data)`` for the leaf covering ``s``.

        ``s`` is a cumulative-priority value, normally drawn from
        ``[0, total()]``.
        """
        idx = self._retrieve(0, s)
        dataIdx = idx - self.capacity + 1
        return (idx, self.tree[idx], self.data[dataIdx])

In [5]:
# Replay memory
class Memory:
    """Prioritized replay memory backed by a ``SumTree``.

    Transitions are stored with priority ``(error + e) ** a`` and sampled
    with probability proportional to that priority.

    Args:
        capacity: Maximum number of transitions kept in the tree.
    """

    # Kept for backward compatibility; not used by any method of this class.
    samples = []

    def __init__(self, capacity):
        self.tree = SumTree(capacity)
        self.e = 0.01  # added to the error before exponentiation
        self.a = 0.6   # exponent applied to (error + e)

    def _getPriority(self, error):
        """Convert an absolute TD error into a sampling priority."""
        return (error + self.e)**self.a

    def add(self, error, sample):
        """Store ``sample`` with the priority derived from ``error``."""
        p = self._getPriority(error)
        self.tree.add(p, sample)

    def sample(self, n):
        """Draw ``n`` transitions by stratified proportional sampling.

        The total priority mass is split into ``n`` equal segments and one
        value is drawn uniformly from each segment.

        Returns:
            List of ``(tree_index, data)`` pairs, one per segment.
        """
        batch = []
        segment = self.tree.total()/n

        for i in range(n):
            # Segment i covers [segment * i, segment * (i + 1)].
            a = segment * i
            b = segment * (i + 1)
            s = random.uniform(a, b)
            (idx, p, data) = self.tree.get(s)
            batch.append((idx, data))

        return batch

    def update(self, idx, error):
        """Re-prioritize the transition at tree index ``idx`` using ``error``."""
        p = self._getPriority(error)
        self.tree.update(idx, p)

In [6]:
# Estimating error
def get_error(state, action, reward, next_state, done):
    """Return the absolute TD error for a batch of transitions.

    The target takes the next action from ``model1`` (arg-max of its
    next-state Q-values) and evaluates that action with ``model2``.

    Args:
        state: Batch of observations (sequence of arrays).
        action: Batch of integer action indices.
        reward: Batch of rewards.
        next_state: Batch of successor observations.
        done: Batch of 0/1 terminal flags.

    Returns:
        1-D tensor of ``|Q(s, a) - target|``, differentiable with respect to
        ``model1``'s prediction ``Q(s, a)`` only.
    """
    state = torch.FloatTensor(np.float32(state))
    next_state = torch.FloatTensor(np.float32(next_state))
    action = torch.LongTensor(action)
    reward = torch.FloatTensor(reward)
    done = torch.FloatTensor(done)

    q_values = model1(state)
    q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)

    # The target is a constant for the loss: computing it without autograd
    # avoids back-propagating through model2 and piling up unused gradients
    # on its parameters.
    with torch.no_grad():
        next_q_values = model1(next_state)
        next_q_values2 = model2(next_state)
        best_next_action = torch.max(next_q_values, 1)[1].unsqueeze(1)
        next_q_value = next_q_values2.gather(1, best_next_action).squeeze(1)
        expected_q_value = reward + gamma * next_q_value * (1 - done)

    error = abs(q_value - expected_q_value)
    return error

In [None]:
# CartPole training loop: each episode collects transitions into the
# prioritized replay memory, then (from the fifth episode on) performs one
# gradient step on a sampled batch.
memory = Memory(10000)

for idx in range(episodes):
    state = env.reset()
    total_reward = 0
    done = False
    while not done:
        action = int(model1.act(state, epsilon))
        next_state, reward, done, _ = env.step(action)
        # Transitions keep the raw observations so that everything stored in
        # the replay memory has one consistent type.
        exp = state, action, reward, next_state, done

        # Initial priority = |TD error| of the fresh transition. This is
        # bookkeeping only, so no autograd graph is built.
        with torch.no_grad():
            state_t = torch.FloatTensor(np.float32(state))
            next_state_t = torch.FloatTensor(np.float32(next_state))
            q_value = model1(state_t)[action]
            a = int(torch.max(model1(next_state_t), 0)[1])
            next_q_value = model2(next_state_t)[a]
            expected_q_value = reward + gamma * next_q_value * (1.0 - float(done))
            error = float(abs(q_value - expected_q_value))

        memory.add(error, exp)
        state = next_state
        total_reward += reward

    print("Episode = " + str(idx) + " , Score = " + str(total_reward))

    if epsilon > epsilonMin:
        epsilon *= decay

    # Refresh the target network every 100 episodes.
    if idx % 100 == 0:
        sync(model1, model2)

    if idx > 3:
        batch = memory.sample(batch_size)
        tree_indices = [entry[0] for entry in batch]
        transitions = [entry[1] for entry in batch]

        state = [np.array(t[0]) for t in transitions]
        action = np.array([t[1] for t in transitions])
        reward = np.array([t[2] for t in transitions])
        next_state = np.array([t[3] for t in transitions])
        # 0/1 terminal flags, sized by the batch actually drawn.
        d = [1 if t[4] else 0 for t in transitions]

        error = get_error(state, action, reward, next_state, d)
        loss = error.pow(2).mean()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Prioritized replay: write the fresh TD errors back so the sampled
        # transitions are re-weighted for future draws.
        for tree_idx, td_error in zip(tree_indices, error.detach().numpy()):
            memory.update(tree_idx, float(td_error))

# For Atari Problems

In [7]:
# Preprocess reduces dimension and converts frame of 210x160x3 to 84x84
def preprocess(observation):
    """Turn a colour frame into an 84x84 binary image.

    The frame is resized to 84 columns by 110 rows, converted to grayscale,
    cropped to rows 26..109 and thresholded so that every pixel above 1
    becomes 255 and the rest 0.

    Args:
        observation: Colour frame as returned by the Gym environment
            (assumed RGB channel order, as Gym Atari environments document).

    Returns:
        ``(84, 84)`` array containing only 0 and 255.
    """
    # cv2.resize takes (width, height).
    resized = cv2.resize(observation, (84, 110))
    # Gym frames are RGB, so use the RGB conversion; the BGR code would swap
    # the red and blue luminance weights.
    gray = cv2.cvtColor(resized, cv2.COLOR_RGB2GRAY)
    cropped = gray[26:110, :]
    _, binary = cv2.threshold(cropped, 1, 255, cv2.THRESH_BINARY)
    # Fails loudly if the crop did not produce exactly 84x84 pixels.
    return np.reshape(binary, (84, 84))

In [8]:
# Stack 4 frames (4 frames used to give idea of motion) to create data set
# Number of consecutive frames combined into one network input.
stack_size = 4
# Frame buffer pre-filled with blank frames. `np.int` was only an alias of the
# builtin `int` and no longer exists in current NumPy, so use `int` directly.
stacked_frames = deque(
    [np.zeros((84, 84), dtype=int) for _ in range(stack_size)],
    maxlen=stack_size,
)

def stack_frames(stacked_frames, state, is_new_episode):
    """Push the preprocessed ``state`` onto the frame buffer and stack it.

    Args:
        stacked_frames: Deque holding the most recent preprocessed frames.
            It is mutated in place unless ``is_new_episode`` is true.
        state: Raw frame passed to ``preprocess``.
        is_new_episode: When true, a new deque is created and filled with
            ``stack_size`` copies of the first frame.

    Returns:
        ``(stacked_state, stacked_frames)`` where ``stacked_state`` stacks the
        buffered frames along axis 0 and ``stacked_frames`` is the deque to
        pass to the next call.
    """
    frame = preprocess(state)
    if is_new_episode:
        # Fill the buffer directly with the first frame. Building blank
        # frames first is pointless (they are evicted immediately) and relied
        # on `np.int`, which current NumPy no longer provides.
        stacked_frames = deque([frame] * stack_size, maxlen=stack_size)
    else:
        stacked_frames.append(frame)
    stacked_state = np.stack(stacked_frames, axis=0)
    return stacked_state, stacked_frames

In [9]:
# Conv. neural network for training
class DuelingCnnDQN(nn.Module):
    """Convolutional dueling Q-network for stacked image frames.

    Three convolution layers feed an advantage head (one output per action)
    and a value head (single output), combined as ``Q = V + A - mean(A)``
    with the mean taken over the actions of each sample.

    Args:
        input_shape: Number of input channels (stacked frames).
        num_actions: Number of discrete actions.
    """

    def __init__(self, input_shape, num_actions):
        super(DuelingCnnDQN, self).__init__()
        self.conv1 = nn.Conv2d(input_shape, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)

        # 7*7*64 is the flattened conv output for 84x84 inputs.
        self.adv1 = nn.Linear(7*7*64, 512)
        self.adv2 = nn.Linear(512, num_actions)

        self.val1 = nn.Linear(7*7*64, 512)
        self.val2 = nn.Linear(512, 1)

    def forward(self, x):
        """Return Q-values of shape ``(batch, num_actions)``.

        Args:
            x: Float tensor of shape ``(batch, input_shape, 84, 84)``.
        """
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = x.view(x.size(0), -1)

        advantage = F.relu(self.adv1(x))
        advantage = self.adv2(advantage)

        value = F.relu(self.val1(x))
        value = self.val2(value)

        # Average over the action axis only; a global mean would make each
        # sample's Q-values depend on the rest of the batch.
        return value + advantage - advantage.mean(dim=1, keepdim=True)

    def act(self, state, epsilon):
        """Choose an action epsilon-greedily.

        Args:
            state: A single stacked observation of shape
                ``(input_shape, 84, 84)``.
            epsilon: Probability of picking a uniformly random action.

        Returns:
            The chosen action index as an ``int``.
        """
        if random.random() > epsilon:
            # Inference only: no autograd graph is needed here.
            with torch.no_grad():
                state = torch.FloatTensor(state).unsqueeze(0)
                q_value = self.forward(state)
            return int(q_value.max(1)[1].item())
        # Draw from this network's own action count rather than whichever
        # environment the global `env` currently points to.
        return random.randrange(self.adv2.out_features)

In [10]:
# Switch to the Atari environment; this rebinds the global `env`.
env = gym.make('SpaceInvaders-v0')
state = env.reset()
# First observation of an episode, so the frame buffer is rebuilt from it.
state, stacked_frames = stack_frames(stacked_frames, state, True)
input_size = state.shape[0]  # size of axis 0, along which the frames are stacked
action_size = env.action_space.n

In [11]:
# Two CNN networks for the Atari task; this rebinds the global names
# model1 and model2.
model1 = DuelingCnnDQN(input_size,action_size)
model2  = DuelingCnnDQN(input_size,action_size)

def sync(model1, model2):
    """Load a snapshot of ``model1``'s state dict into ``model2``."""
    snapshot = model1.state_dict()
    model2.load_state_dict(snapshot)

# Give both CNN networks identical starting weights.
sync(model1, model2)