In [1]:
import gym

import torch
from torch import nn, optim
from torch.autograd import Variable
from torch import autograd
import torch.nn.functional as F

from collections import deque
import numpy as np
import random
from operator import itemgetter


In [2]:
# Build the CartPole-v0 environment that the later cells query for its
# observation/action sizes and step through during training.
env = gym.make('CartPole-v0')

[2017-06-19 07:33:12,529] Making new env: CartPole-v0


In [3]:
# --- Training hyper-parameters -------------------------------------------
# BATCH_SIZE is not referenced by any cell visible in this notebook.
BATCH_SIZE = 64
# Number of training episodes: the training loop runs one episode per epoch.
EPOCHES = 300
# EPSILON = 0.9
# Discount factor used when accumulating per-step rewards into returns.
GAMMA = 0.99
# Step size handed to the Adam optimizer.
LEARNING_RATE = 3e-4
# MEMORY_CAPACITY = 2000
# TARGET_REPLACE_ITER = 100

# --- Network dimensions, read from the environment ------------------------
# First entry of the observation shape; used as the policy net's input width.
N_STATES = env.observation_space.shape[0]
# Size of the action space; used as the policy net's output width.
N_ACTIONS = env.action_space.n

In [4]:
class PolicyNet(nn.Module):
    """Feed-forward policy network: state vector in, action probabilities out.

    Architecture: Linear(n_states, 512) -> ReLU -> Linear(512, 256) -> ReLU
    -> Linear(256, n_actions) -> Softmax over the last dimension.

    Args:
        n_states: Width of the input state vector. When omitted, the
            module-level ``N_STATES`` is used.
        n_actions: Number of output actions. When omitted, the module-level
            ``N_ACTIONS`` is used.
    """

    def __init__(self, n_states=None, n_actions=None):
        super(PolicyNet, self).__init__()
        # Fall back to the notebook-level constants lazily, so `PolicyNet()`
        # behaves as before while explicit sizes make the class reusable.
        if n_states is None:
            n_states = N_STATES
        if n_actions is None:
            n_actions = N_ACTIONS
        # The attribute name `fc1` is kept so existing state_dict keys still load.
        self.fc1 = nn.Sequential(
            nn.Linear(n_states, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, n_actions),
            # Normalise explicitly over the action axis. Leaving `dim` unset is
            # deprecated and, for 3-D inputs, normalises over dim 0 instead of
            # over the actions.
            nn.Softmax(dim=-1),
        )

    def forward(self, x):
        """Return action probabilities for ``x``.

        Args:
            x: Float tensor whose last dimension has size ``n_states``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size ``n_actions`` that sums to 1.
        """
        return self.fc1(x)


In [5]:
class DQN(object):
    """Monte-Carlo policy-gradient (REINFORCE) agent.

    The class name is kept for backward compatibility, but this is not a deep
    Q-network: actions are sampled from the policy network's output
    distribution, and the network is updated once per episode from the
    normalised discounted returns.

    Usage per episode: call ``choose_action`` for every step, append the
    reward received for that step to ``self.rewards``, then call ``learn``.

    Args:
        policy_net: Module mapping a ``(1, n_states)`` float tensor to a
            ``(1, n_actions)`` tensor of action probabilities. Defaults to
            ``PolicyNet()``.
        gamma: Discount factor. Defaults to the module-level ``GAMMA``.
        learning_rate: Adam step size. Defaults to the module-level
            ``LEARNING_RATE``.
    """

    def __init__(self, policy_net=None, gamma=None, learning_rate=None):
        # Notebook-level defaults are resolved lazily so `DQN()` keeps working.
        self.policy_net = PolicyNet() if policy_net is None else policy_net
        self.gamma = GAMMA if gamma is None else gamma
        lr = LEARNING_RATE if learning_rate is None else learning_rate

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.policy_net = self.policy_net.to(self.device)

        self.actions = []    # sampled action indices of the current episode
        self.log_probs = []  # log pi(a_t | s_t), kept attached to the graph
        self.rewards = []    # per-step rewards, appended by the caller
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=lr)

    @staticmethod
    def _discounted_returns(rewards, gamma):
        """Return ``[G_0, ..., G_T]`` where ``G_t = r_t + gamma * G_{t+1}``."""
        returns = []
        running = 0.0
        for r in reversed(rewards):
            running = r + gamma * running
            returns.append(running)
        # Built back-to-front with O(1) appends, then flipped once.
        returns.reverse()
        return returns

    def choose_action(self, state):
        """Sample an action for ``state`` and record its log-probability.

        Args:
            state: Sequence or array of ``n_states`` floats.

        Returns:
            int: Index of the sampled action.
        """
        self.policy_net.eval()
        x = torch.as_tensor(state, dtype=torch.float32, device=self.device).unsqueeze(0)
        probs = self.policy_net(x)
        # Sample from the categorical distribution (not an argmax), so the
        # policy keeps exploring.
        dist = torch.distributions.Categorical(probs=probs)
        sampled = dist.sample()
        self.log_probs.append(dist.log_prob(sampled).squeeze(0))
        self.policy_net.train()
        action = int(sampled.item())
        self.actions.append(action)
        return action

    def learn(self):
        """Run one REINFORCE update from the recorded episode, then clear it.

        Does nothing when no step has been recorded.

        Raises:
            ValueError: If the number of recorded rewards does not match the
                number of sampled actions.
        """
        if len(self.rewards) != len(self.log_probs):
            raise ValueError(
                'expected one reward per sampled action, got {} rewards for '
                '{} actions'.format(len(self.rewards), len(self.log_probs)))
        if not self.log_probs:
            return

        returns = torch.tensor(
            self._discounted_returns(self.rewards, self.gamma),
            dtype=torch.float32, device=self.device)
        returns = returns - returns.mean()
        # The sample std of a single value is NaN and would poison every
        # weight, so only rescale when the episode has more than one step.
        if returns.numel() > 1:
            returns = returns / (returns.std() + torch.finfo(torch.float32).eps)

        # Score-function estimator: minimise -sum_t log pi(a_t | s_t) * G_t.
        loss = -(torch.stack(self.log_probs) * returns).sum()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        del self.actions[:]
        del self.log_probs[:]
        del self.rewards[:]

# Module-level agent instance; the training cell calls choose_action()/learn()
# on it and appends to its `rewards` list.
dqn = DQN()

In [6]:
from IPython.core.debugger import set_trace

# Work directly on the underlying environment object.
# NOTE(review): logged episodes run past 200 steps, so this presumably drops
# gym's default episode-length cap -- confirm unbounded episodes are intended.
env = env.unwrapped

for epoch in range(EPOCHES):
    # Play one complete episode, feeding every reward to the agent.
    state, done = env.reset(), False
    steps, scores = 0, 0.0
    while not done:
        action = dqn.choose_action(state)
        next_state, reward, done, info = env.step(action)
        dqn.rewards.append(reward)
        scores += reward
        steps += 1
        state = next_state

    # One policy update per finished episode.
    dqn.learn()

    # Report progress only on every tenth epoch.
    if epoch % 10 != 0:
        continue
    print('\n'.join([
        '*' * 10,
        'Epoch: {}'.format(epoch),
        'Steps: {}'.format(steps),
        'Scores: {}'.format(scores),
    ]))

**********
Epoch: 0
Steps: 20
Scores: 20.0
**********
Epoch: 10
Steps: 19
Scores: 19.0
**********
Epoch: 20
Steps: 25
Scores: 25.0
**********
Epoch: 30
Steps: 66
Scores: 66.0
**********
Epoch: 40
Steps: 13
Scores: 13.0
**********
Epoch: 50
Steps: 25
Scores: 25.0
**********
Epoch: 60
Steps: 27
Scores: 27.0
**********
Epoch: 70
Steps: 39
Scores: 39.0
**********
Epoch: 80
Steps: 124
Scores: 124.0
**********
Epoch: 90
Steps: 43
Scores: 43.0
**********
Epoch: 100
Steps: 92
Scores: 92.0
**********
Epoch: 110
Steps: 30
Scores: 30.0
**********
Epoch: 120
Steps: 103
Scores: 103.0
**********
Epoch: 130
Steps: 46
Scores: 46.0
**********
Epoch: 140
Steps: 132
Scores: 132.0
**********
Epoch: 150
Steps: 191
Scores: 191.0
**********
Epoch: 160
Steps: 152
Scores: 152.0
**********
Epoch: 170
Steps: 435
Scores: 435.0
**********
Epoch: 180
Steps: 129
Scores: 129.0
**********
Epoch: 190
Steps: 474
Scores: 474.0
**********
Epoch: 200
Steps: 95
Scores: 95.0
**********
Epoch: 210
Steps: 127
Scores: 127.0
***