## Import PyTorch

In [None]:
import torch

## Define the tensor type (CPU or CUDA)

In [None]:
# Float tensor type used throughout: the CUDA variant when a GPU is available,
# the CPU variant otherwise.
dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor

## Define model

In [None]:
import torch.nn as nn
import torch.nn.functional as F

input_size = 80 * 80  # length of the flattened input vector fed to the network
hidden_size = 200  # number of units in the single hidden layer


class PolicyGradient(nn.Module):
    """Two-layer fully connected policy network with three output probabilities.

    The network maps an input of size ``input_size`` (last dimension) through
    one ReLU hidden layer to a probability distribution over 3 classes.

    Attributes:
        hidden: linear layer ``input_size -> hidden_size``.
        out: linear layer ``hidden_size -> 3``.
        actions: plain Python list that callers append sampled actions to;
            cleared by :meth:`reset`.
    """

    def __init__(self):
        super(PolicyGradient, self).__init__()
        self.hidden = nn.Linear(input_size, hidden_size)
        self.out = nn.Linear(hidden_size, 3)
        # Move the parameters to the GPU when one is available.
        if torch.cuda.is_available():
            self.cuda()
        self.actions = []

    def forward(self, x):
        """Return class probabilities for ``x``.

        Args:
            x: float tensor whose last dimension has size ``input_size``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size 3 that sums to 1.
        """
        h = F.relu(self.hidden(x))
        # NOTE(review): ReLU on the logits clamps them to be non-negative
        # before the softmax; this is unusual for a policy head but is kept
        # so that existing checkpoints produce the same outputs.
        logits = F.relu(self.out(h))
        # Normalise over the class dimension explicitly. The implicit-dim form
        # is deprecated and picks dim 0 for 3-D inputs.
        return F.softmax(logits, dim=-1)

    def reset(self):
        """Forget all actions recorded in ``self.actions``."""
        self.actions = []


policy = PolicyGradient()
print(policy)

## Prepare gym env

In [None]:
import gym
# Create the Pong environment and fetch the initial frame; `env` and
# `observation` are used by the later cells.
env = gym.make("Pong-v0")
observation = env.reset()
# NOTE(review): prev_x is not referenced again in the visible code; get_action
# keeps the previous frame in get_action.prev_state instead.
prev_x = None # used in computing the difference frame

## Image preprocessing


In [None]:
import numpy as np

def preprocess(I):
    """Convert a 210x160x3 uint8 frame into a flat binary float vector.

    The frame is cropped to rows 35..194, downsampled by a factor of 2 in both
    spatial directions, and reduced to its first colour channel. Pixels equal
    to 0, 144 or 109 (the two background colours) become 0.0; every other
    pixel (paddles, ball) becomes 1.0.

    Args:
        I: array of shape (210, 160, 3). It is left unmodified.

    Returns:
        1-D ``float64`` array of length 6400 (80x80).
    """
    # One slicing step does crop + downsample + channel selection. This is a
    # view, so it must not be written to or the caller's frame would change.
    frame = I[35:195:2, ::2, 0]
    foreground = (frame != 0) & (frame != 144) & (frame != 109)
    # np.float64 is what the removed `np.float` alias used to resolve to.
    return foreground.astype(np.float64).ravel()

## Test that preprocessing and the model work

In [None]:
from torch.autograd import Variable

# Preprocess the initial frame and show the resulting vector and its shape.
preprocessed_observation = preprocess(observation)
print(preprocessed_observation)
print(preprocessed_observation.shape)

# Smoke-test the network with an all-zero input of the same length as a
# preprocessed frame, with a leading batch dimension of 1.
in_state = Variable(torch.from_numpy(np.zeros(len(preprocessed_observation))).type(dtype).unsqueeze(0))

# Call the module itself rather than .forward() so that any registered hooks run.
output = policy(in_state)
print(output.data[0])

## Getting action

In [None]:
def get_action(observation):
    """Sample an action for the given frame from the policy network.

    The network input is the difference between this preprocessed frame and
    the previous one. The previous frame is kept in the function attribute
    ``get_action.prev_state``; set it to ``None`` to start a new episode.
    The sampled action tensor is also appended to ``policy.actions``.

    Args:
        observation: raw frame accepted by ``preprocess``.

    Returns:
        int: the sampled class index plus one (classes 0..2 -> action ids 1..3).
    """
    current_state = preprocess(observation)
    # Work even if nobody initialised the attribute before the first call.
    prev_state = getattr(get_action, "prev_state", None)
    if prev_state is None:
        # First frame of an episode: the difference is all zeros.
        prev_state = current_state
    diff_state = current_state - prev_state
    get_action.prev_state = current_state
    var_state = Variable(torch.from_numpy(diff_state).type(dtype).unsqueeze(0))
    # Call the module rather than .forward() so that registered hooks run.
    probabilities = policy(var_state)
    # Draw exactly one sample; num_samples is spelled out because newer
    # PyTorch versions have no default for it.
    out_action = probabilities.multinomial(1)
    policy.actions.append(out_action)
    # Convert to a plain int so the environment never receives a tensor.
    return int(out_action.data[0, 0]) + 1  # Pong specific
    

## Test getting action

In [None]:
# Sample 100 actions for the same initial frame and show what gets recorded.
get_action.prev_state = None
for i in range(100):
    action = get_action(observation)
    print(action)
    print(policy.actions)

# Discard the samples recorded by this smoke test so they are not later paired
# with rewards from a real episode, and forget the remembered frame.
policy.reset()
get_action.prev_state = None

## Prepare optimizer

In [None]:
import torch.optim as optim

# RMSprop hyper-parameters.
learning_rate = 1e-3
weight_decay = 1e-3

# Optimise every parameter of the policy network, starting from cleared gradients.
optimizer = optim.RMSprop(policy.parameters(), lr=learning_rate, weight_decay=weight_decay)
optimizer.zero_grad()

## Discounting reward

In [None]:
gamma = 0.99  # per-step discount factor


def discount_rewards(rewards):
    """Return the discounted return for every step of a reward sequence.

    The sequence is walked backwards, accumulating
    ``running = gamma * running + reward``. Whenever a non-zero reward is
    met, the accumulator is cleared first, so returns never carry over
    across a non-zero reward (i.e. between lost balls).

    Args:
        rewards: sequence of per-step numeric rewards.

    Returns:
        list: discounted returns, same length and order as ``rewards``.
    """
    running = 0
    discounted = []
    for reward in reversed(rewards):
        if reward != 0:
            running = 0  # reset the sum between lost balls
        running = gamma * running + reward
        discounted.append(running)
    # Built back-to-front with O(1) appends; flip once instead of insert(0, ...).
    discounted.reverse()
    return discounted

In [None]:
# Sanity check: with gamma = 0.99, a single reward of 1 at the last step should
# decay backwards to roughly [0.970299, 0.9801, 0.99, 1].
dummy_rewards = [0,0,0,1]
out_rewards = discount_rewards(dummy_rewards)
print(out_rewards)

## Main training loop

In [None]:
import os
from datetime import datetime

import torch.autograd as autograd

rewards = []  # per-step rewards of the episode in progress
reward_sum = 0  # total reward of the episode in progress
running_reward = None  # running mean of episode totals; None until the first episode ends
batch_size = 8  # episodes whose gradients are accumulated per optimizer step
num_episodes = 0

# Start clean: drop actions recorded by earlier cells so that policy.actions
# lines up one-to-one with `rewards`, and forget any remembered frame.
policy.reset()
get_action.prev_state = None

# Runs until interrupted.
while True:
    action = get_action(observation)
    observation, reward, done, _ = env.step(action)
    rewards.append(reward)
    reward_sum += reward

    if not done:
        continue

    num_episodes += 1

    # Discounted, standardised returns, scaled down so that the gradients
    # accumulated over `batch_size` episodes average rather than add up.
    discounted_rewards = discount_rewards(rewards)
    rewards_tensor = dtype(discounted_rewards)
    rewards_tensor = (rewards_tensor - rewards_tensor.mean()) / (rewards_tensor.std() + np.finfo(np.float32).eps)
    rewards_tensor = rewards_tensor / batch_size
    print(rewards_tensor)

    # NOTE(review): .reinforce() belongs to the legacy stochastic-function API,
    # which newer PyTorch releases no longer provide. Porting this step to
    # log-probabilities (e.g. torch.distributions.Categorical) also requires
    # get_action to record log-probs, so it is left unchanged here.
    for sampled_action, scaled_return in zip(policy.actions, rewards_tensor):
        sampled_action.reinforce(scaled_return)

    autograd.backward(policy.actions, [None] * len(policy.actions))

    if num_episodes % batch_size == 0:
        optimizer.step()
        optimizer.zero_grad()
        print("Updated parameters")

    # Prepare the next episode.
    policy.reset()
    observation = env.reset()
    get_action.prev_state = None

    # With factor 1/n this is the cumulative mean of all episode totals so far.
    reward_factor = 1 / num_episodes
    if running_reward is None:
        running_reward = reward_sum
    else:
        running_reward = running_reward * (1 - reward_factor) + reward_sum * reward_factor
    print('{:>5} | {} | Episode reward total was {:d}. Running mean: {:.5f}' \
        .format(num_episodes, datetime.now().strftime('%H:%M:%S'),
                int(reward_sum), running_reward))

    if num_episodes % 25 == 0:
        directory = 'models'.rstrip('/')
        if directory:
            # torch.save does not create missing directories.
            os.makedirs(directory, exist_ok=True)

        path = "{}/model_rr_{:.3f}_epi_{}.pt".format(
            directory, running_reward, num_episodes)
        torch.save(policy.state_dict(), path)
        print("### Saved model: {} ###".format(path))

    # Per-episode accumulators must not leak into the next episode; otherwise
    # returns are misaligned with actions and the printed total is cumulative.
    rewards = []
    reward_sum = 0
