In [1]:
import gym

import torch
from torch import nn, optim
from torch.autograd import Variable

from collections import deque
import numpy as np
import random
from operator import itemgetter


In [2]:
env = gym.make('CartPole-v0')

[2017-06-19 02:01:47,916] Making new env: CartPole-v0


In [3]:
BATCH_SIZE = 64
EPOCHES = 100
EPSILON = 0.9
GAMMA = 0.9
MEMORY_CAPACITY = 2000
TARGET_REPLACE_ITER = 100

N_STATES = env.observation_space.shape[0]
N_ACTIONS = env.action_space.n

In [4]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Sequential(
            nn.Linear(N_STATES, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
        )
        # here is the change for dueling DQN
        self.value_fc = nn.Sequential(
            nn.Linear(256, 1)
        )
        self.advantage_fc = nn.Sequential(
            nn.Linear(256, N_ACTIONS)
        )
    def forward(self, x):
        out = self.fc1(x)
        # here is the change for dueling DQN
        out_v = self.value_fc(out)
        out_a = self.advantage_fc(out)
        out = out_v.expand_as(out_a) + (out_a - out_a.mean(1).expand_as(out_a))
        return out


In [5]:
class DQN(object):
    def __init__(self):
        self.eval_net = Net()
        self.target_net = Net()
        if torch.cuda.is_available():
            self.eval_net = self.eval_net.cuda()
            self.target_net = self.target_net.cuda()
        self.memory = deque(maxlen=MEMORY_CAPACITY)
        self.criterion = nn.MSELoss()
        self.optimizer = optim.Adam(self.eval_net.parameters(), lr=0.01)
    def choose_action(self, state):
        self.eval_net.eval()
        x = Variable(torch.FloatTensor(state).unsqueeze(0), volatile=True)
        if torch.cuda.is_available():
            x = x.cuda()
        if np.random.uniform() < EPSILON:
            action_values = self.eval_net(x)
            action = torch.max(action_values, -1)[1].data[0, 0]
        else:
            action = np.random.randint(0, N_ACTIONS)
        self.eval_net.train()
        return action
    def refresh_target_net(self):
        self.target_net.load_state_dict(self.eval_net.state_dict())
    def learn(self):
        train_data = random.sample(self.memory, BATCH_SIZE)
        states = Variable(torch.FloatTensor(list(map(itemgetter(0), train_data))))
        actions = Variable(torch.LongTensor(list(map(itemgetter(1), train_data))))
        rewards = Variable(torch.FloatTensor(list(map(itemgetter(2), train_data))))
        next_states = Variable(torch.FloatTensor(list(map(itemgetter(3), train_data))))
        if torch.cuda.is_available():
            states = states.cuda()
            actions = actions.cuda()
            rewards = rewards.cuda()
            next_states = next_states.cuda()
        q_eval = self.eval_net(states).gather(1, actions.unsqueeze(1))
        q_next = self.target_net(next_states).detach()
        # here is the change for double DQN
        q_eval4next = self.eval_net(next_states).max(-1)[1]
        q_target = rewards + GAMMA * q_next.gather(1, q_eval4next)
#         q_target = rewards + GAMMA * q_next.max(-1)[0]
        loss = self.criterion(q_eval, q_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
    def remember(self, state, action, reward, next_state):
        self.memory.append((state, action, reward, next_state))

dqn = DQN()

In [6]:
from IPython.core.debugger import set_trace

env = env.unwrapped

for epoch in range(EPOCHES):
    print('*'*10)
    print('Epoch: {}'.format(epoch))
    state = env.reset()
    done = False
    steps = 0
    scores = 0.0
    while not done:
        action = dqn.choose_action(state)
        next_state, reward, done, info = env.step(action)
        scores += reward

        # Revise reward
        x, x_dot, theta, theta_dot = next_state
        r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
        r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
        reward = r1 + r2

        dqn.remember(state, action, reward, next_state)
        if len(dqn.memory) > BATCH_SIZE:
#             set_trace()
            dqn.learn()
        if steps%TARGET_REPLACE_ITER == 0:
            dqn.refresh_target_net()
        state = next_state
        steps += 1
    print('Steps: {}'.format(steps))
    print('Scores: {}'.format(scores))

**********
Epoch: 0
Steps: 58
Scores: 58.0
**********
Epoch: 1
Steps: 20
Scores: 20.0
**********
Epoch: 2
Steps: 102
Scores: 102.0
**********
Epoch: 3
Steps: 75
Scores: 75.0
**********
Epoch: 4
Steps: 212
Scores: 212.0
**********
Epoch: 5
Steps: 170
Scores: 170.0
**********
Epoch: 6
Steps: 148
Scores: 148.0
**********
Epoch: 7
Steps: 216
Scores: 216.0
**********
Epoch: 8
Steps: 246
Scores: 246.0
**********
Epoch: 9
Steps: 256
Scores: 256.0
**********
Epoch: 10
Steps: 234
Scores: 234.0
**********
Epoch: 11
Steps: 562
Scores: 562.0
**********
Epoch: 12
Steps: 1859
Scores: 1859.0
**********
Epoch: 13
Steps: 385
Scores: 385.0
**********
Epoch: 14
Steps: 510
Scores: 510.0
**********
Epoch: 15
Steps: 436
Scores: 436.0
**********
Epoch: 16
Steps: 1929
Scores: 1929.0
**********
Epoch: 17
Steps: 1969
Scores: 1969.0
**********
Epoch: 18
Steps: 397
Scores: 397.0
**********
Epoch: 19
Steps: 904
Scores: 904.0
**********
Epoch: 20
Steps: 774
Scores: 774.0
**********
Epoch: 21
Steps: 369
Scores: 369.