In [1]:
import gym
import gym.spaces
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
import torchvision.datasets as dsets
from torch.autograd import Variable

# Below are just for debugging
import pickle
import os
import sys
import gc
import psutil

In [2]:
env = gym.make('SpaceInvaders-v0').unwrapped
#env = gym.make('MontezumaRevenge-v0').unwrapped
print(env.observation_space)
print(env.action_space)

[2018-08-31 16:51:33,292] Making new env: SpaceInvaders-v0


Box(210, 160, 3)
Discrete(6)


In [3]:
# set up matplotlib
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

plt.ion()

# if gpu is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory(object):

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        """Saves a transition."""
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

#### Output formula
$O = \frac{W-K+2P}{S}+1$
    - O: Output height/width
    - W: Input height/length
    - K: filter size (kernel size)
    - P: Padding
    - S: Stride
    
<!--
conv1:    3 x (206, 156)
maxpool1: 3 x (103, 78)
conv2: 3 x (99, 74)
maxpool2: 3 x (49, 37)
conv3: 3 x (45, 33)
-->

In [5]:
class DQN(nn.Module):

    def __init__(self):
        super(DQN, self).__init__()
        #self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=5, stride=1)
        #self.bn1 = nn.BatchNorm2d(16)
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=8, stride=4)
        #self.bn1 = nn.BatchNorm2d(32)
        
        #self.maxpool1 = nn.MaxPool2d(kernel_size=2)
        
        #self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=5, stride=1)
        #self.bn2 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2)
        #self.bn2 = nn.BatchNorm2d(64)
        
        #self.maxpool2 = nn.MaxPool2d(kernel_size=2)
            
        #self.conv3 = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=5, stride=1)
        #self.bn3 = nn.BatchNorm2d(32)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1)
        #self.bn3 = nn.BatchNorm2d(64)
        
        #self.head = nn.Linear(47520, 6)
        #self.head = nn.Linear(34848, 6)  # with maxpool1 and 2
        #self.fc = nn.Linear(700928, 6)
        #self.fc = nn.Linear(16384, 6)
        self.fc1 = nn.Linear(in_features=16384, out_features=512)
        self.fc2 = nn.Linear(in_features=512, out_features=6)  # out_features should be parametrized to env.action_space
        
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.conv1(x))
        #x = self.maxpool1(x)
        x = self.relu(self.conv2(x))
        #x = self.maxpool2(x)
        x = self.relu(self.conv3(x))
        x = self.fc1(x.view(x.size(0), -1))
        x = self.relu(x)
        x = self.fc2(x)
        return x

In [6]:
resize = T.Compose([T.ToPILImage(),
                    T.Resize(40, interpolation=Image.CUBIC),
                    T.ToTensor()])

def get_screen():
    screen = env.render(mode='rgb_array').transpose(
        (2, 0, 1))
    #screen = env.render(mode='rgb_array')
    screen = np.ascontiguousarray(screen, dtype=np.float32)
    screen = torch.from_numpy(screen)
    #print("The shape of original screen is {}".format(screen.size()))
    #screen = screen[30:200,:]
    #return resize(screen).unsqueeze(0).to(device)
    return screen.to(device)

In [7]:
BATCH_SIZE = 32
GAMMA = 0.99
EPS_START = 1.0
EPS_END = 0.1
EPS_DECAY = 1000000
TARGET_UPDATE = 10000
LR = 0.00025

policy_net = DQN().to(device)
target_net = DQN().to(device)
policy_net.load_state_dict(torch.load('invaders_policy.pt'))
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.RMSprop(policy_net.parameters(), lr=LR)
memory = ReplayMemory(4500)  # Original paper used 1000000...

steps_done = 0

In [8]:
def get_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * steps_done / EPS_DECAY)
    #print("eps_threshold is {}".format(eps_threshold))
    steps_done += 1
    if sample > eps_threshold:
        with torch.no_grad():
            #a = policy_net(state)
            # a looks like this: tensor([[ 0.5946,  0.0787,  0.4172,  0.4470, -0.3384,  0.0064]], device='cuda:0')
            return policy_net(state).max(1)[1]
    else:
        return torch.tensor([[random.randrange(6)]], device=device, dtype=torch.long)

In [9]:
def optimize_model():
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch (see http://stackoverflow.com/a/19343/3343043 for
    # detailed explanation).
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                          batch.next_state)), device=device, dtype=torch.uint8)

    non_final_next_states = torch.cat([s for s in batch.next_state
                                                if s is not None])

    state_batch = torch.cat(batch.state)
    action_batch = torch.tensor(batch.action, device=device)
    action_batch = action_batch.view(BATCH_SIZE,-1)
    reward_batch = torch.cat(batch.reward)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken
    Q_s_a = policy_net(state_batch)

    actions = Q_s_a.gather(1, action_batch)
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    
    #non_final_next_states = non_final_next_states.view(BATCH_SIZE, 3, 210, 160)
    non_final_next_states = non_final_next_states.view(BATCH_SIZE, 3, 160, 160)
    target = target_net(non_final_next_states)
    next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach()
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss
    loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()
    
    
    # For debugging
    #cpuStats()
    #memReport()

In [10]:
env.reset()
s = get_screen()

In [11]:
def trim_screen(s):
    ### trim a screen a bit
    s = s[:,35:195,:]
    return s

def memReport():
    for obj in gc.get_objects():
        if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):
            print(type(obj), obj.size())
    
def cpuStats():
        print(sys.version)
        print(psutil.cpu_percent())
        print(psutil.virtual_memory())
        pid = os.getpid()
        py = psutil.Process(pid)
        memoryUse = py.memory_info()[0] / 2. ** 30
        print('memory GB:', memoryUse)

In [None]:
NUM_EPOCHS = 200

for i in range(NUM_EPOCHS):
    env.reset()
    total_reward = 0
    done = False
    while not done:
        state = get_screen()
        state = trim_screen(state)
        action = get_action(state.unsqueeze_(0))
        env.render()
        _, reward, done, info =  env.step(action.item())
        reward = torch.tensor([reward], device=device)
        total_reward += reward
        next_state = get_screen()
        next_state = trim_screen(next_state)
        #print("S:{} A:{} S':{} R:{}".format(state, action.item(), next_state, reward))
        memory.push(state, action.item(), next_state, reward)
        
        optimize_model()
      
    #steps_done = 0
    if (i % 10 == 0):
        print("{} epochs has finished".format(i))
        print("Got reward {}".format(total_reward))

0 epochs has finished
Got reward tensor([ 120.], device='cuda:0')
10 epochs has finished
Got reward tensor([ 105.], device='cuda:0')
20 epochs has finished
Got reward tensor([ 140.], device='cuda:0')
30 epochs has finished
Got reward tensor([ 210.], device='cuda:0')
40 epochs has finished
Got reward tensor([ 440.], device='cuda:0')
50 epochs has finished
Got reward tensor([ 110.], device='cuda:0')
60 epochs has finished
Got reward tensor([ 90.], device='cuda:0')
70 epochs has finished
Got reward tensor([ 155.], device='cuda:0')
80 epochs has finished
Got reward tensor([ 30.], device='cuda:0')
90 epochs has finished
Got reward tensor([ 545.], device='cuda:0')
100 epochs has finished
Got reward tensor([ 100.], device='cuda:0')
110 epochs has finished
Got reward tensor([ 105.], device='cuda:0')
120 epochs has finished
Got reward tensor([ 90.], device='cuda:0')
130 epochs has finished
Got reward tensor([ 210.], device='cuda:0')
140 epochs has finished
Got reward tensor([ 250.], device='cud

In [None]:
torch.save(policy_net.state_dict(), "./invaders_policy.pt")

In [None]:
# Just a test of trimming
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
s = get_screen()
s = s[:,35:195,:]
print(s.shape)
s = s.cpu()
res = T.ToPILImage()(s)
res.show()