In [1]:
import gym

import torch
from torch import nn, optim
from torch.autograd import Variable
from torch import autograd
import torch.nn.functional as F

from collections import deque
import numpy as np
import random
from operator import itemgetter


In [2]:
# Create the CartPole-v0 environment that the rest of the notebook trains on.
env = gym.make('CartPole-v0')

[2017-07-25 13:30:46,286] Making new env: CartPole-v0


In [3]:
# Hyper-parameters and environment dimensions shared by the cells below.
# NOTE(review): BATCH_SIZE is not referenced by any code visible in this
# notebook export -- confirm whether it is still needed.
BATCH_SIZE = 64
# Number of episodes the training loop runs (one learn() call per episode).
EPOCHES = 1000
# EPSILON = 0.9
# Discount factor applied when accumulating per-step rewards into returns.
GAMMA = 0.99
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 3e-4
# NOTE(review): the commented-out constants in this cell (EPSILON,
# MEMORY_CAPACITY, TARGET_REPLACE_ITER) are unused here; they look like
# leftovers from a replay-buffer/target-network setup -- confirm and remove.
# MEMORY_CAPACITY = 2000
# TARGET_REPLACE_ITER = 100

# First dimension of the observation space shape (network input width).
N_STATES = env.observation_space.shape[0]
# Size of the action space as reported by `action_space.n` (actor output width).
N_ACTIONS = env.action_space.n

In [4]:
class PolicyNet(nn.Module):
    """Actor-critic network with a shared trunk and two heads.

    ``fc1`` is a two-layer MLP trunk (with batch normalisation after the
    first layer). ``actor`` maps the trunk features to a probability
    distribution over actions; ``critic`` maps them to a scalar state value.

    Args:
        n_states: Width of the observation vector. When omitted, the
            module-level ``N_STATES`` is used.
        n_actions: Number of discrete actions. When omitted, the module-level
            ``N_ACTIONS`` is used.
    """

    def __init__(self, n_states=None, n_actions=None):
        super(PolicyNet, self).__init__()
        # Fall back to the notebook-level globals only when the caller did not
        # supply explicit sizes, so ``PolicyNet()`` keeps working unchanged.
        if n_states is None:
            n_states = N_STATES
        if n_actions is None:
            n_actions = N_ACTIONS
        self.fc1 = nn.Sequential(
            nn.Linear(n_states, 512),
            # BatchNorm1d needs more than one sample per batch in training
            # mode; single-observation callers must put the net in eval().
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
        )
        self.actor = nn.Sequential(
            nn.Linear(256, n_actions),
            # Normalise over the action axis explicitly; relying on the
            # implicit dimension choice is deprecated.
            nn.Softmax(dim=-1),
        )
        self.critic = nn.Sequential(
            nn.Linear(256, 1)
        )

    def forward(self, x):
        """Run the trunk once and evaluate both heads.

        Args:
            x: Float tensor whose last dimension is the observation width.

        Returns:
            Tuple ``(actor_out, critic_out)``: action probabilities (last
            dimension sums to 1) and the state-value estimate (last dimension
            of size 1).
        """
        out = self.fc1(x)
        actor_out = self.actor(out)
        critic_out = self.critic(out)
        return actor_out, critic_out

    
class ActorNet(nn.Module):
    """Stand-alone policy network: observation -> action probabilities.

    Args:
        n_states: Width of the observation vector. When omitted, the
            module-level ``N_STATES`` is used.
        n_actions: Number of discrete actions. When omitted, the module-level
            ``N_ACTIONS`` is used.
    """

    def __init__(self, n_states=None, n_actions=None):
        super(ActorNet, self).__init__()
        # Explicit sizes take precedence; the globals are only a default so
        # that ``ActorNet()`` behaves as before.
        if n_states is None:
            n_states = N_STATES
        if n_actions is None:
            n_actions = N_ACTIONS
        self.fc1 = nn.Sequential(
            nn.Linear(n_states, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU()
        )
        self.fc2 = nn.Sequential(
            nn.Linear(256, n_actions),
            # Softmax over the action axis; the implicit-dimension form is
            # deprecated and emits a warning.
            nn.Softmax(dim=-1)
        )

    def forward(self, x):
        """Return action probabilities for ``x``.

        Args:
            x: Float tensor whose last dimension is the observation width.

        Returns:
            Tensor of probabilities whose last dimension has one entry per
            action and sums to 1.
        """
        out = self.fc1(x)
        out = self.fc2(out)
        return out

class CriticNet(nn.Module):
    """Stand-alone value network: observation -> scalar state value.

    Args:
        n_states: Width of the observation vector. When omitted, the
            module-level ``N_STATES`` is used.
    """

    def __init__(self, n_states=None):
        super(CriticNet, self).__init__()
        # The global is only a default, so ``CriticNet()`` behaves as before
        # while explicit sizes allow reuse with other environments.
        if n_states is None:
            n_states = N_STATES
        self.fc1 = nn.Sequential(
            nn.Linear(n_states, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU()
        )
        self.fc2 = nn.Sequential(
            nn.Linear(256, 1)
        )

    def forward(self, x):
        """Return the value estimate for ``x``.

        Args:
            x: Float tensor whose last dimension is the observation width.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size 1.
        """
        out = self.fc1(x)
        out = self.fc2(out)
        return out

In [None]:
class DQN(object):
    """Monte-Carlo actor-critic agent (REINFORCE with a learned baseline).

    Despite its name this class does not implement a Deep Q-Network: it
    samples actions from the actor head of the policy network, stores the
    log-probability and value estimate of every step, and performs one
    policy-gradient update per finished episode in :meth:`learn`.

    The update is expressed with ``torch.distributions.Categorical`` and an
    explicit loss, replacing the stochastic-node API (``multinomial()`` without
    a sample count, ``Variable.reinforce`` and ``autograd.backward`` on sampled
    actions) that no longer exists in current PyTorch.

    Args:
        policy_net: Module mapping a ``(1, n_states)`` float tensor to
            ``(action_probs, state_value)``. Defaults to ``PolicyNet()``.
        gamma: Discount factor. Defaults to the module-level ``GAMMA``.
        learning_rate: Adam learning rate. Defaults to the module-level
            ``LEARNING_RATE``.

    Attributes:
        actions: Per-step ``(log_prob, state_value)`` tensor pairs recorded by
            :meth:`choose_action` for the current episode.
        rewards: Per-step rewards; the caller appends one entry after every
            environment step.
    """

    def __init__(self, policy_net=None, gamma=None, learning_rate=None):
        # Resolve the device once so inputs always land next to the weights.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        if policy_net is None:
            policy_net = PolicyNet()
        self.policy_net = policy_net.to(self.device)
        self.gamma = GAMMA if gamma is None else gamma
        if learning_rate is None:
            learning_rate = LEARNING_RATE
        self.actions = []
        self.rewards = []
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=learning_rate)

    def choose_action(self, state):
        """Sample an action for ``state`` and record what ``learn`` needs.

        Args:
            state: Single observation (array-like of floats).

        Returns:
            The sampled action index as a plain Python ``int``.
        """
        # eval() is required for the batch-of-one forward pass because
        # PolicyNet contains BatchNorm1d. NOTE(review): since the only forward
        # passes happen here, the batch-norm running statistics are never
        # updated -- confirm that this is intended.
        self.policy_net.eval()
        x = torch.as_tensor(state, dtype=torch.float32, device=self.device).unsqueeze(0)
        probs, state_value = self.policy_net(x)
        # Sample from the policy (not argmax) so exploration comes for free.
        dist = torch.distributions.Categorical(probs=probs)
        action = dist.sample()
        # Keep the graph-attached log-prob and value for the episode update.
        self.actions.append((dist.log_prob(action), state_value))
        self.policy_net.train()
        return int(action.item())

    def learn(self):
        """Run one policy/value update from the recorded episode, then reset.

        Rewards are turned into discounted returns and standardised; the
        policy term is weighted by ``return - value`` (baseline detached) and
        the critic is regressed onto the returns with a smooth-L1 loss summed
        over steps. Does nothing except clear the buffers when no complete
        step has been recorded.
        """
        # Pair steps the way zip() would: extra entries on either side are ignored.
        n_steps = min(len(self.actions), len(self.rewards))
        if n_steps == 0:
            del self.actions[:]
            del self.rewards[:]
            return

        # Discounted returns, accumulated back-to-front in O(n).
        discounted = []
        running = 0.0
        for r in reversed(self.rewards):
            running = r + self.gamma * running
            discounted.append(running)
        discounted.reverse()

        returns = torch.tensor(discounted, dtype=torch.float32, device=self.device)
        returns = returns - returns.mean()
        # std() of a single element is NaN; a one-step episode is only centred.
        if returns.numel() > 1:
            returns = returns / (returns.std() + torch.finfo(torch.float32).eps)
        returns = returns[:n_steps]

        steps = self.actions[:n_steps]
        log_probs = torch.cat([log_prob.reshape(-1) for log_prob, _ in steps])
        values = torch.cat([value.reshape(-1) for _, value in steps])

        # The baseline must not receive gradient through the policy term.
        advantages = returns - values.detach()
        policy_loss = -(log_probs * advantages).sum()
        value_loss = F.smooth_l1_loss(values, returns, reduction='sum')

        self.optimizer.zero_grad()
        (policy_loss + value_loss).backward()
        self.optimizer.step()
        del self.actions[:]
        del self.rewards[:]

# Module-level agent instance driven by the training loop below.
dqn = DQN()

In [None]:
from IPython.core.debugger import set_trace

# Use the bare environment without the wrappers gym.make applied.
# NOTE(review): for CartPole this presumably drops the episode time limit,
# which is why the inner loop below needs its own upper bound -- confirm.
env = env.unwrapped

# Hard cap on episode length: without it a policy that never drops the pole
# would keep one episode (and its stored autograd graphs) growing forever.
MAX_EPISODE_STEPS = 10000

for epoch in range(EPOCHES):
    reset_result = env.reset()
    # Newer gym releases return (observation, info) from reset(); older ones
    # return just the observation.
    if isinstance(reset_result, tuple) and len(reset_result) == 2:
        state = reset_result[0]
    else:
        state = reset_result
    done = False
    steps = 0
    scores = 0.0
    while not done and steps < MAX_EPISODE_STEPS:
        action = dqn.choose_action(state)
        step_result = env.step(action)
        if len(step_result) == 5:
            # Newer API: separate terminated / truncated flags.
            next_state, reward, terminated, truncated, info = step_result
            done = terminated or truncated
        else:
            next_state, reward, done, info = step_result
        scores += reward

        # The agent pairs this reward with the action it just recorded.
        dqn.rewards.append(reward)

        state = next_state
        steps += 1
    # One policy update per finished (or capped) episode.
    dqn.learn()
    if epoch % 10 == 0:
        print('*'*10)
        print('Epoch: {}'.format(epoch))
        print('Steps: {}'.format(steps))
        print('Scores: {}'.format(scores))

**********
Epoch: 0
Steps: 24
Scores: 24.0
**********
Epoch: 10
Steps: 17
Scores: 17.0
**********
Epoch: 20
Steps: 13
Scores: 13.0
**********
Epoch: 30
Steps: 10
Scores: 10.0
**********
Epoch: 40
Steps: 52
Scores: 52.0
**********
Epoch: 50
Steps: 53
Scores: 53.0
**********
Epoch: 60
Steps: 26
Scores: 26.0
**********
Epoch: 70
Steps: 45
Scores: 45.0
**********
Epoch: 80
Steps: 27
Scores: 27.0
**********
Epoch: 90
Steps: 23
Scores: 23.0
**********
Epoch: 100
Steps: 38
Scores: 38.0
**********
Epoch: 110
Steps: 86
Scores: 86.0
**********
Epoch: 120
Steps: 20
Scores: 20.0
**********
Epoch: 130
Steps: 126
Scores: 126.0
**********
Epoch: 140
Steps: 92
Scores: 92.0
**********
Epoch: 150
Steps: 123
Scores: 123.0
**********
Epoch: 160
Steps: 149
Scores: 149.0
**********
Epoch: 170
Steps: 42
Scores: 42.0
**********
Epoch: 180
Steps: 198
Scores: 198.0
**********
Epoch: 190
Steps: 90
Scores: 90.0
