# Train Space invaders Balancing problem  in OpenAI Env using DQN and Double DQN

In [1]:
# Also can be used for other atari environments
#Initialize
import math, random
import gym
import numpy as np
import torch
import cv2
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque

env = gym.make('SpaceInvaders-v0')
action_size = env.action_space.n

episodes = 500
batch_size = 32
gamma = 0.99
batch_size = 64

epsilon = 1.0
epsilonMin = 0.01
decay = 0.00001
gamma = 0.9

  result = entry_point.load(False)


In [2]:
# Preprocess reduces dimension and converts frame of 210x160x3 to 84x84
def preprocess(observation):
    observation = cv2.cvtColor(cv2.resize(observation, (84, 110)), cv2.COLOR_BGR2GRAY)
    observation = observation[26:110,:]
    ret, observation = cv2.threshold(observation, 1, 255, cv2.THRESH_BINARY)
    return np.reshape(observation, (84, 84))

In [3]:
# Stack 4 frames (4 frames used to give idea of motion) to create data set
stack_size = 4
stacked_frames  =  deque([np.zeros((84,84), dtype=np.int) for i in range(stack_size)], maxlen=4)

def stack_frames(stacked_frames, state, is_new_episode):
    frame = preprocess(state)
    if is_new_episode:
        stacked_frames = deque([np.zeros((84,84), dtype=np.int) for i in range(stack_size)], maxlen=4)
        
        stacked_frames.append(frame)
        stacked_frames.append(frame)
        stacked_frames.append(frame)
        stacked_frames.append(frame)
        
        stacked_state = np.stack(stacked_frames, axis=0)
        
    else:
        stacked_frames.append(frame)
        stacked_state = np.stack(stacked_frames, axis=0)
    return stacked_state, stacked_frames

In [4]:
class ReplayBuffer(object):
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)
    
    def push(self, state, action, reward, next_state, done):
        state      = np.expand_dims(state, 0)
        next_state = np.expand_dims(next_state, 0)
            
        self.buffer.append((state, action, reward, next_state, done))
    
    def sample(self, batch_size):
        state, action, reward, next_state, done = zip(*random.sample(self.buffer, batch_size))
        return np.concatenate(state), action, reward, np.concatenate(next_state), done
    
    def __len__(self):
        return len(self.buffer)

In [5]:
# Conv. neural network for training
class DQN(nn.Module):
    def __init__(self, input_shape, num_actions):
        super(DQN, self).__init__()
        self.conv1 = nn.Conv2d(input_shape, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        self.fc4 = nn.Linear(7*7*64, 512)
        self.fc5 = nn.Linear(512, num_actions)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = F.relu(self.fc4(x.view(x.size(0), -1)))
        return self.fc5(x)
    
    def act(self, state, epsilon):
        if random.random() > epsilon:
            state = torch.FloatTensor(np.float32(state)).unsqueeze(0)
            q_value = self.forward(state)
            action  = q_value.max(1)[1].data[0].numpy()
        else:
            action = env.action_space.sample()
        return action

In [6]:
state = env.reset()
state, stacked_frames = stack_frames(stacked_frames, state, True)
input_size = state.shape[0]

In [7]:
model1 = DQN(input_size, action_size)
# For Double DQN
# model2 = DQN(input_size, action_size)
# sync both networks
# def sync(model1, model2):
    # model2.load_state_dict(model1.state_dict())

# sync(model1, model2)

In [8]:
# Training
optimizer = optim.Adam(model1.parameters())
memory = ReplayBuffer(1000)

for idx in range(episodes):
    state = env.reset()
    total_reward = 0
    state, stacked_frames = stack_frames(stacked_frames, state, True)
    done = False
    while not done:
        action = int(model1.act(state, epsilon))
        next_state, reward, done, _ = env.step(action)

        next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
        memory.push(state, action, reward, next_state, done)
        state = next_state
        total_reward += reward
        
        if done:
            done = False
            print("Episode = " + str(idx) + " , Score = " + str(total_reward))
            break
    
    if epsilon > epsilonMin:
        epsilon *= decay
    
    # For Double DQN
    # if idx % 100 == 0:
        # sync(model1, model2)
        
    if len(memory) > batch_size:
        state, action, reward, next_state, done = memory.sample(batch_size)

        state = torch.FloatTensor(np.float32(state))
        next_state = torch.FloatTensor(np.float32(next_state))
        action = torch.LongTensor(action)
        print(state.shape)
        reward = torch.FloatTensor(reward)
        done = torch.FloatTensor(done)
        q_values = model1(state)
        next_q_values = model1(next_state)
        # For Double DQN
        # next_q_values2 = model2(next_state)
        
        q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)
        next_q_value = next_q_values.max(1)[0]
        # For Double DQN (comment above line)
        # next_q_value = next_q_values2.gather(1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1)
        expected_q_value = reward + gamma * next_q_value * (1 - done)
        loss = (q_value - expected_q_value.data).pow(2).mean()
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

Episode = 0 , Score = 120.0
torch.Size([64, 4, 84, 84])
Episode = 1 , Score = 0.0
torch.Size([64, 4, 84, 84])
Episode = 2 , Score = 0.0
torch.Size([64, 4, 84, 84])
Episode = 3 , Score = 15.0
torch.Size([64, 4, 84, 84])
Episode = 4 , Score = 145.0
torch.Size([64, 4, 84, 84])


KeyboardInterrupt: 

In [None]:
# Test
for i in range(5):
    state = env.reset()
    done = False
    total = 0
    state, stacked_frames = stack_frames(stacked_frames, state, True)

    while not done:
        env.render()
        action = int(model.act(state, epsilon))
        next_state, reward, done, _ = env.step(action)

        next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
        state = next_state
        total += reward
        
        if done:
            done = False
            print("Episode = " + str(i) + " , Score = " + str(total_reward))
            break