## Import PyTorch

In [1]:
import torch

## Define a tensor

In [2]:
dtype = torch.FloatTensor if not torch.cuda.is_available() else torch.cuda.FloatTensor

## Define model

In [3]:
import torch.nn as nn
import torch.nn.functional as F

input_size = 80 * 80
hidden_size = 200

class PolicyGradient(nn.Module):
    
    def __init__(self):
        super(PolicyGradient, self).__init__()
        self.hidden = nn.Linear(input_size, hidden_size)
        self.out = nn.Linear(hidden_size, 3)
        if torch.cuda.is_available(): self.cuda()

    def forward(self, x):
        h = F.relu(self.hidden(x))
        logits = F.relu(self.out(h))
        probabilities = F.softmax(logits)
        return probabilities
    
    def reset(self):
        pass
    
policy = PolicyGradient()
print(policy)

PolicyGradient (
  (hidden): Linear (6400 -> 200)
  (out): Linear (200 -> 3)
)


## Prepare gym env

In [4]:
import gym
env = gym.make("Pong-v0")
observation = env.reset()
prev_x = None # used in computing the difference frame

## Image preprocessing


In [5]:
import numpy as np

def preprocess(I):
  """ preprocess 210x160x3 uint8 frame into 6400 (80x80) 1D float vector """
  I = I[35:195] # crop
  I = I[::2,::2,0] # downsample by factor of 2
  I[I == 144] = 0 # erase background (background type 1)
  I[I == 109] = 0 # erase background (background type 2)
  I[I != 0] = 1 # everything else (paddles, ball) just set to 1
  return I.astype(np.float).ravel()

## Test if preprocessing and model works

In [6]:
from torch.autograd import Variable

preprocessed_observation = preprocess(observation)
print(preprocessed_observation)
print(preprocessed_observation.shape)

in_state = Variable(torch.from_numpy(np.zeros(len(preprocessed_observation))).type(dtype).unsqueeze(0))

output = policy.forward(in_state)
print(output.data[0])

[0. 0. 0. ... 0. 0. 0.]
(6400,)

 0.3292
 0.3292
 0.3417
[torch.cuda.FloatTensor of size 3 (GPU 0)]

