# Train an agent on the CartPole balancing problem in the OpenAI Gym environment using DQN and Double DQN

In [7]:
#Initialize
import math, random
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque

# Gym environment that the training loop resets and steps.
env = gym.make("CartPole-v0")

# Epsilon-greedy exploration rate: DQN.act takes a random action unless
# random.random() exceeds this value. Starts fully exploratory.
epsilon = 1.0
# Floor for epsilon: decay stops once epsilon is no longer above this value.
epsilonMin = 0.01
# Multiplicative factor applied to epsilon once per episode.
epsilonDecay = 0.999
# Number of episodes the training loop runs.
episodes = 1000
# Number of transitions sampled from the replay buffer for each gradient step.
batch_size = 32
# Discount factor applied to the bootstrapped next-state Q-value.
gamma = 0.99
# Upper bound on environment steps taken within a single episode.
goal_steps = 200

In [8]:
# Create replay buffer
class ReplayBuffer(object):
    """Bounded FIFO store of (state, action, reward, next_state, done) transitions."""

    def __init__(self, capacity):
        """Create an empty buffer that keeps at most ``capacity`` transitions."""
        # A deque with maxlen silently drops the oldest entry once full.
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one transition, giving both states a leading axis of size 1."""
        transition = (
            np.expand_dims(state, 0),
            action,
            reward,
            np.expand_dims(next_state, 0),
            done,
        )
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns a 5-tuple: states and next states are stacked along the
        leading axis into arrays, while actions, rewards and done flags are
        returned as plain tuples in the same order.
        """
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        stacked_states = np.concatenate(states)
        stacked_next_states = np.concatenate(next_states)
        return stacked_states, actions, rewards, stacked_next_states, dones

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

In [9]:
# Neural network using pytorch
class DQN(nn.Module):
    """Fully connected Q-network with an epsilon-greedy action selector.

    Args:
        num_inputs: Length of the observation vector fed to the first layer.
        num_actions: Number of discrete actions; the network outputs one
            Q-value per action.
    """

    def __init__(self, num_inputs, num_actions):
        super(DQN, self).__init__()

        # Size the network from the constructor arguments instead of reaching
        # for a module-level environment, so the class works for any
        # observation/action dimensions and without a global `env`.
        self.num_inputs = int(num_inputs)
        self.num_actions = int(num_actions)

        self.layers = nn.Sequential(
            nn.Linear(self.num_inputs, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, self.num_actions)
        )

    def forward(self, x):
        """Return the Q-values for a batch of observations ``x``."""
        return self.layers(x)

    def act(self, state, epsilon):
        """Choose an action for a single observation with epsilon-greedy policy.

        Args:
            state: One observation (array-like of length ``num_inputs``).
            epsilon: Exploration rate; a uniformly random action is taken
                unless ``random.random()`` exceeds it.

        Returns:
            The selected action index as a Python ``int``.
        """
        if random.random() > epsilon:
            # Action selection is inference only, so skip autograd bookkeeping.
            with torch.no_grad():
                state = torch.FloatTensor(state).unsqueeze(0)
                q_value = self.forward(state)
            # Index of the largest Q-value in the single-row batch.
            return int(q_value.max(1)[1].item())
        # Explore: uniform draw over the action indices the network outputs.
        return random.randrange(self.num_actions)

In [10]:
# For DQN
# Q-network used by the training loop both to pick actions and to compute
# the bootstrap target; it is given the observation length and action count
# of the environment.
model1 = DQN(env.observation_space.shape[0], env.action_space.n)

# For Double DQN
# Uncomment the lines below to create a second network (model2) whose weights
# are copied from model1 by sync().
# model2 = DQN(env.observation_space.shape[0], env.action_space.n)
# sync both networks
# def sync(model1, model2):
    # model2.load_state_dict(model1.state_dict())

# sync(model1, model2)

In [11]:
# Training
# Adam with default settings optimises model1; the replay buffer keeps the
# 1000 most recent transitions.
optimizer = optim.Adam(model1.parameters())
memory = ReplayBuffer(1000)

for idx in range(episodes):
    # ---- Roll out one episode with the current epsilon-greedy policy ----
    total_reward = 0
    state = env.reset()
    for step in range(goal_steps):
        # int() normalises whatever scalar type act() returns before stepping.
        action = int(model1.act(state, epsilon))
        next_state, reward, done, _ = env.step(action)
        memory.push(state, action, reward, next_state, done)

        state = next_state
        total_reward += reward

        if done:
            print("Episode = " + str(idx) + " , Score = " + str(total_reward))
            break

    # Decay exploration once per episode until the floor is reached.
    if epsilon > epsilonMin:
        epsilon *= epsilonDecay

    # For Double DQN
    # if idx % 100 == 0:
        # sync(model1, model2)

    # ---- One gradient step on a random minibatch ----
    # NOTE(review): this update runs once per episode, not once per
    # environment step; confirm that is intended, since it makes learning slow.
    if len(memory) > batch_size:
        # Batch variables get their own names so the rollout variables above
        # (state, action, reward, next_state, done) are not overwritten.
        (batch_state, batch_action, batch_reward,
         batch_next_state, batch_done) = memory.sample(batch_size)

        batch_state = torch.FloatTensor(np.float32(batch_state))
        batch_next_state = torch.FloatTensor(np.float32(batch_next_state))
        batch_action = torch.LongTensor(batch_action)
        batch_reward = torch.FloatTensor(batch_reward)
        batch_done = torch.FloatTensor(batch_done)

        # Q(s, a) for the actions that were actually taken.
        q_values = model1(batch_state)
        q_value = q_values.gather(1, batch_action.unsqueeze(1)).squeeze(1)

        # The bootstrap target is treated as a constant, so build it without
        # recording an autograd graph.
        with torch.no_grad():
            next_q_values = model1(batch_next_state)
            # For Double DQN
            # next_q_values2 = model2(batch_next_state)

            next_q_value = next_q_values.max(1)[0]
            # For Double DQN (comment above line)
            # next_q_value = next_q_values2.gather(1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1)

            # (1 - done) removes the bootstrap term for terminal transitions.
            expected_q_value = batch_reward + gamma * next_q_value * (1 - batch_done)

        # Mean squared TD error between predicted and target Q-values.
        loss = F.mse_loss(q_value, expected_q_value)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

Episode = 0 , Score = 22.0
Episode = 1 , Score = 25.0
Episode = 2 , Score = 15.0
Episode = 3 , Score = 13.0
Episode = 4 , Score = 18.0
Episode = 5 , Score = 28.0
Episode = 6 , Score = 17.0
Episode = 7 , Score = 15.0
Episode = 8 , Score = 13.0
Episode = 9 , Score = 31.0
Episode = 10 , Score = 13.0
Episode = 11 , Score = 20.0
Episode = 12 , Score = 32.0
Episode = 13 , Score = 10.0
Episode = 14 , Score = 19.0
Episode = 15 , Score = 17.0
Episode = 16 , Score = 36.0
Episode = 17 , Score = 24.0
Episode = 18 , Score = 13.0
Episode = 19 , Score = 21.0
Episode = 20 , Score = 12.0
Episode = 21 , Score = 15.0
Episode = 22 , Score = 28.0
Episode = 23 , Score = 14.0
Episode = 24 , Score = 40.0
Episode = 25 , Score = 10.0
Episode = 26 , Score = 38.0
Episode = 27 , Score = 13.0
Episode = 28 , Score = 15.0
Episode = 29 , Score = 14.0
Episode = 30 , Score = 13.0
Episode = 31 , Score = 16.0
Episode = 32 , Score = 32.0
Episode = 33 , Score = 10.0
Episode = 34 , Score = 30.0
Episode = 35 , Score = 20.0
Ep

Episode = 321 , Score = 58.0
Episode = 322 , Score = 44.0
Episode = 323 , Score = 28.0
Episode = 324 , Score = 27.0
Episode = 325 , Score = 13.0
Episode = 326 , Score = 33.0
Episode = 327 , Score = 35.0
Episode = 328 , Score = 19.0
Episode = 329 , Score = 13.0
Episode = 330 , Score = 44.0
Episode = 331 , Score = 19.0
Episode = 332 , Score = 29.0
Episode = 333 , Score = 17.0
Episode = 334 , Score = 16.0
Episode = 335 , Score = 12.0
Episode = 336 , Score = 51.0
Episode = 337 , Score = 46.0
Episode = 338 , Score = 18.0
Episode = 339 , Score = 15.0
Episode = 340 , Score = 23.0
Episode = 341 , Score = 14.0
Episode = 342 , Score = 41.0
Episode = 343 , Score = 17.0
Episode = 344 , Score = 18.0
Episode = 345 , Score = 22.0
Episode = 346 , Score = 18.0
Episode = 347 , Score = 12.0
Episode = 348 , Score = 43.0
Episode = 349 , Score = 23.0
Episode = 350 , Score = 18.0
Episode = 351 , Score = 22.0
Episode = 352 , Score = 29.0
Episode = 353 , Score = 90.0
Episode = 354 , Score = 41.0
Episode = 355 

Episode = 623 , Score = 50.0
Episode = 624 , Score = 55.0
Episode = 625 , Score = 48.0
Episode = 626 , Score = 96.0
Episode = 627 , Score = 36.0
Episode = 628 , Score = 50.0
Episode = 629 , Score = 70.0
Episode = 630 , Score = 32.0
Episode = 631 , Score = 21.0
Episode = 632 , Score = 17.0
Episode = 633 , Score = 11.0
Episode = 634 , Score = 20.0
Episode = 635 , Score = 30.0
Episode = 636 , Score = 18.0
Episode = 637 , Score = 22.0
Episode = 638 , Score = 21.0
Episode = 639 , Score = 16.0
Episode = 640 , Score = 18.0
Episode = 641 , Score = 44.0
Episode = 642 , Score = 30.0
Episode = 643 , Score = 21.0
Episode = 644 , Score = 21.0
Episode = 645 , Score = 17.0
Episode = 646 , Score = 37.0
Episode = 647 , Score = 16.0
Episode = 648 , Score = 21.0
Episode = 649 , Score = 26.0
Episode = 650 , Score = 48.0
Episode = 651 , Score = 17.0
Episode = 652 , Score = 12.0
Episode = 653 , Score = 21.0
Episode = 654 , Score = 30.0
Episode = 655 , Score = 26.0
Episode = 656 , Score = 35.0
Episode = 657 

Episode = 912 , Score = 158.0
Episode = 913 , Score = 42.0
Episode = 914 , Score = 128.0
Episode = 915 , Score = 113.0
Episode = 916 , Score = 67.0
Episode = 917 , Score = 124.0
Episode = 918 , Score = 107.0
Episode = 919 , Score = 128.0
Episode = 920 , Score = 17.0
Episode = 921 , Score = 43.0
Episode = 922 , Score = 120.0
Episode = 923 , Score = 63.0
Episode = 924 , Score = 144.0
Episode = 925 , Score = 124.0
Episode = 926 , Score = 169.0
Episode = 927 , Score = 104.0
Episode = 928 , Score = 17.0
Episode = 929 , Score = 98.0
Episode = 930 , Score = 56.0
Episode = 931 , Score = 53.0
Episode = 932 , Score = 45.0
Episode = 933 , Score = 22.0
Episode = 934 , Score = 44.0
Episode = 935 , Score = 90.0
Episode = 936 , Score = 61.0
Episode = 937 , Score = 46.0
Episode = 938 , Score = 38.0
Episode = 939 , Score = 37.0
Episode = 940 , Score = 59.0
Episode = 941 , Score = 46.0
Episode = 942 , Score = 136.0
Episode = 943 , Score = 164.0
Episode = 944 , Score = 155.0
Episode = 945 , Score = 169.0