In [15]:
import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import torchvision.transforms as T

In [16]:
# Environment: take the underlying CartPole-v0 env object via `.unwrapped`.
env = gym.make('CartPole-v0').unwrapped

# Plotting: import IPython's display helper only when an inline backend is
# active, then switch matplotlib to interactive mode.
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display
plt.ion()

# Tensor type aliases: CUDA variants when a GPU is available, CPU otherwise.
use_cuda = torch.cuda.is_available()
if use_cuda:
    FloatTensor = torch.cuda.FloatTensor
    LongTensor = torch.cuda.LongTensor
    ByteTensor = torch.cuda.ByteTensor
else:
    FloatTensor = torch.FloatTensor
    LongTensor = torch.LongTensor
    ByteTensor = torch.ByteTensor
Tensor = FloatTensor  # default floating-point tensor type used below

WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.


In [17]:
# One replay-memory record: (state, action, next_state, reward).
Transition = namedtuple(
    'Transition',
    ['state', 'action', 'next_state', 'reward'],
)

In [18]:
# Network dimensions consumed by create_model().
INPUT_SIZE = 4
HIDDEN_SIZE = 64
OUTPUT_SIZE = 2

# Optimisation settings.
BATCH_SIZE = 128    # transitions drawn per optimize_model() call
GAMMA = 0.95        # multiplier applied to next-state values in the Q target
TARGET_UPDATE = 10  # not referenced in the visible cells; presumably the
                    # target-network sync interval -- TODO confirm

# Epsilon-greedy schedule used by policy(): the threshold decays
# exponentially from EPS_START towards EPS_END, with EPS_DECAY as the
# time constant (in policy() calls).
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200

In [19]:
def create_model(input_size=None, hidden_size=None, output_size=None):
    """Build a one-hidden-layer MLP: Linear -> ReLU -> Linear.

    Args:
        input_size: number of input features; defaults to INPUT_SIZE.
        hidden_size: width of the hidden layer; defaults to HIDDEN_SIZE.
        output_size: number of outputs; defaults to OUTPUT_SIZE.

    Returns:
        A newly constructed torch.nn.Sequential.
    """
    # Defaults are resolved at call time (not at definition time) so that
    # re-running the constants cell takes effect without redefining this
    # function.
    if input_size is None:
        input_size = INPUT_SIZE
    if hidden_size is None:
        hidden_size = HIDDEN_SIZE
    if output_size is None:
        output_size = OUTPUT_SIZE

    return torch.nn.Sequential(
        torch.nn.Linear(input_size, hidden_size),
        torch.nn.ReLU(),
        torch.nn.Linear(hidden_size, output_size)
    )

In [20]:
# Two networks with the same architecture. target_net starts out as an exact
# copy of policy_net's weights and is put in eval mode.
policy_net = create_model()
target_net = create_model()
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# Move both networks to the GPU when one is available.
if use_cuda:
    policy_net, target_net = policy_net.cuda(), target_net.cuda()

In [21]:
# deque backs the bounded replay memory created in a later cell.
from collections import deque

# RMSprop with its default hyper-parameters, over policy_net's parameters
# only; target_net is not handed to the optimizer.
optimizer = optim.RMSprop(policy_net.parameters())

In [22]:
steps_done = 0  # number of policy() calls so far; drives the epsilon decay


def policy(state):
    """Choose an action for `state` with an epsilon-greedy rule.

    The exploration threshold decays exponentially from EPS_START towards
    EPS_END as `steps_done` grows (time constant EPS_DECAY). Every call
    increments the module-level `steps_done`.

    Args:
        state: tensor fed to policy_net; assumed to be a single-row batch
            of shape (1, INPUT_SIZE) -- TODO confirm against callers.

    Returns:
        A 1x1 LongTensor holding the chosen action index.
    """
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1

    if sample > eps_threshold:
        # Exploit: index of the largest network output along dim 1.
        # volatile=True because no gradients are needed for action selection.
        q_values = policy_net(Variable(state, volatile=True).type(FloatTensor))
        return q_values.data.max(1)[1].view(1, 1)

    # Explore: uniform over every action the network can output, rather than
    # a hard-coded 2, so this stays in sync with OUTPUT_SIZE.
    return LongTensor([[random.randrange(OUTPUT_SIZE)]])

In [23]:
def optimize_model():
    """Run one optimisation step of policy_net on a batch drawn from `memory`.

    Returns immediately while `memory` holds fewer than BATCH_SIZE
    transitions. Each stored Transition must carry tensors that torch.cat can
    join for `state`, `action` and `reward`, and a `next_state` that is
    either such a tensor or None for a terminal transition.

    Side effects: updates policy_net's parameters in place via `optimizer`.
    """
    if len(memory) < BATCH_SIZE:
        return
    # NOTE: random.choices samples WITH replacement, so a batch may contain
    # the same transition more than once.
    transitions = random.choices(memory, k=BATCH_SIZE)
    # Transpose the batch (see http://stackoverflow.com/a/19343/3343043 for
    # detailed explanation): list of Transitions -> Transition of tuples.
    batch = Transition(*zip(*transitions))

    state_batch = Variable(torch.cat(batch.state))
    action_batch = Variable(torch.cat(batch.action))
    reward_batch = Variable(torch.cat(batch.reward))

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken. Result has shape (BATCH_SIZE, 1).
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states; terminal states keep value 0.
    next_state_values = Variable(torch.zeros(BATCH_SIZE).type(Tensor))
    non_final_next = [s for s in batch.next_state if s is not None]
    # Guard: torch.cat on an empty list raises, which would happen whenever
    # every sampled transition is terminal.
    if non_final_next:
        non_final_mask = ByteTensor(tuple(s is not None
                                          for s in batch.next_state))
        non_final_next_states = Variable(torch.cat(non_final_next),
                                         volatile=True)
        next_state_values[non_final_mask] = \
            target_net(non_final_next_states).max(1)[0]

    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch
    # Re-wrap the raw data to undo volatility (which was used to prevent
    # unnecessary gradients), and add a trailing axis so the target has the
    # same (BATCH_SIZE, 1) shape as state_action_values. Without it the loss
    # compares (BATCH_SIZE, 1) against (BATCH_SIZE,), which newer PyTorch
    # versions broadcast to (BATCH_SIZE, BATCH_SIZE).
    expected_state_action_values = Variable(
        expected_state_action_values.data.unsqueeze(1))

    # Compute Huber loss
    loss = F.smooth_l1_loss(state_action_values, expected_state_action_values)

    # Optimize the model, clamping each gradient element to [-1, 1].
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()

In [25]:
# Seed the replay memory with at least BATCH_SIZE transitions gathered by
# acting uniformly at random.
#
# Transitions are stored in the format optimize_model() consumes (it calls
# torch.cat on state/action/reward and treats next_state=None as terminal):
#   state      -> FloatTensor with a leading batch axis of size 1
#   action     -> 1x1 LongTensor
#   next_state -> same layout as state, or None when the episode ended
#   reward     -> 1-element Tensor

memory = deque(maxlen=BATCH_SIZE*5)
remaining_count = BATCH_SIZE

for episode in count(1):
    # Checked once per episode, so the episode in progress always finishes
    # and the memory may end up with slightly more than BATCH_SIZE entries.
    if remaining_count <= 0:
        break

    state = torch.from_numpy(env.reset()[np.newaxis, ...]).type(FloatTensor)
    done = False

    while not done:
        # Step the environment with a plain int, then wrap it for storage.
        action_index = random.randrange(OUTPUT_SIZE)
        observation, step_reward, done, _ = env.step(action_index)

        action = LongTensor([[action_index]])
        reward = Tensor([step_reward])
        if done:
            next_state = None
        else:
            next_state = torch.from_numpy(
                observation[np.newaxis, ...]).type(FloatTensor)

        memory.append(Transition(state, action, next_state, reward))
        remaining_count -= 1
        state = next_state

In [62]:
# Scratch cell: policy() unrolled step by step so each intermediate value
# can be inspected. Only the greedy path is exercised here.

# Reset the environment and turn the observation into a single-row
# FloatTensor (np.newaxis adds the leading axis).
state_array = env.reset()
state = torch.from_numpy(state_array[np.newaxis, ...]).type(FloatTensor)

# Pick an action the way policy() does.
# NOTE: `sample` and `eps_threshold` are computed but never compared in this
# cell, and `steps_done` is not incremented.
sample = random.random()
eps_threshold = EPS_END + (EPS_START - EPS_END) * \
    math.exp(-1. * steps_done / EPS_DECAY)
# volatile=True: no gradients are needed for action selection.
state_variable = Variable(state, volatile=True).type(FloatTensor)
net_value_output = policy_net(state_variable)
# max over dim 1; element [1] of the result is used as the action index.
max_value_output = net_value_output.data.max(1)
action = max_value_output[1].view(1, 1)

In [63]:
# Display the 1x1 action tensor produced by the previous cell.
action


 0
[torch.cuda.LongTensor of size 1x1 (GPU 0)]