In [22]:
# Let's make this notebook compatible for Python 2 and 3
from __future__ import division, print_function

# Import libraries
import pandas as pd
import numpy as np
import os
import math
import progressbar
import gym
import random

# for visualization
import matplotlib.pyplot as plt

# to import module from parent directory
import sys
sys.path.append('..')

# Dataset API from sklearn
from sklearn import datasets

In [23]:
# Import helper functions
from utils import mean_squared_error, train_test_split, Plot
from utils import standardize, to_categorical, accuracy_score
from utils.misc import bar_widgets

from deep_learning.loss_functions import SquareLoss, CrossEntropy
from deep_learning.optimizers import Adam
from deep_learning.layers import Dense, Activation
from deep_learning.neural_network import NeuralNetwork

In [24]:
class DeepQNetwork():
    """Q-Learning with deep neural network to learn the control policy. 
    Uses a deep neural network model to predict the expected utility (Q-value) of executing an action in a given state. 

    Reference: https://arxiv.org/abs/1312.5602
    Parameters:
    -----------
    env_name: string
        The environment that the agent will explore. 
        Check: https://gym.openai.com/envs
    epsilon: float
        The epsilon-greedy value. The probability that the agent should select a random action instead of
        the action that will maximize the expected utility. 
    gamma: float
        Determines how much the agent should consider future rewards. 
    decay_rate: float
        The rate of decay for the epsilon value after each epoch.
    min_epsilon: float
        The value which epsilon will approach as the training progresses.
    """
    def __init__(self, env_name='CartPole-v1', epsilon=1, gamma=0.9, decay_rate=0.005, min_epsilon=0.1):
        self.epsilon = epsilon
        self.gamma = gamma
        self.decay_rate = decay_rate
        self.min_epsilon = min_epsilon
        self.memory_size = 300
        self.memory = []

        # Initialize the environment
        self.env = gym.make(env_name)
        self.n_states = self.env.observation_space.shape[0]
        self.n_actions = self.env.action_space.n
    
    def set_model(self, model):
        self.model = model(n_inputs=self.n_states, n_outputs=self.n_actions)

    def _select_action(self, state):
        if np.random.rand() < self.epsilon:
            # Choose action randomly
            action = np.random.randint(self.n_actions)
        else:
            # Take action with highest predicted utility given state
            action = np.argmax(self.model.predict(state), axis=1)[0]

        return action

    def _memorize(self, state, action, reward, new_state, done):
        self.memory.append((state, action, reward, new_state, done))
        # Make sure we restrict memory size to specified limit
        if len(self.memory) > self.memory_size:
            self.memory.pop(0)

    def _construct_training_set(self, replay):
        # Select states and new states from replay
        states = np.array([a[0] for a in replay])
        new_states = np.array([a[3] for a in replay])

        # Predict the expected utility of current state and new state
        Q = self.model.predict(states)
        Q_new = self.model.predict(new_states)

        replay_size = len(replay)
        X = np.empty((replay_size, self.n_states))
        y = np.empty((replay_size, self.n_actions))
        
        # Construct training set
        for i in range(replay_size):
            state_r, action_r, reward_r, new_state_r, done_r = replay[i]

            target = Q[i]
            target[action_r] = reward_r
            # If we're done the utility is simply the reward of executing action a in
            # state s, otherwise we add the expected maximum future reward as well
            if not done_r:
                target[action_r] += self.gamma * np.amax(Q_new[i])

            X[i] = state_r
            y[i] = target

        return X, y

    def train(self, n_epochs=500, batch_size=32):
        max_reward = 0

        for epoch in range(n_epochs):
            state = self.env.reset()
            total_reward = 0

            epoch_loss = []
            while True:

                action = self._select_action(state)
                # Take a step
                new_state, reward, done, _ = self.env.step(action)

                self._memorize(state, action, reward, new_state, done)

                # Sample replay batch from memory
                _batch_size = min(len(self.memory), batch_size)
                replay = random.sample(self.memory, _batch_size)

                # Construct training set from replay
                X, y = self._construct_training_set(replay)

                # Learn control policy
                loss = self.model.train_on_batch(X, y)
                epoch_loss.append(loss)

                total_reward += reward
                state = new_state

                if done: break
            
            epoch_loss = np.mean(epoch_loss)

            # Reduce the epsilon parameter
            self.epsilon = self.min_epsilon + (1.0 - self.min_epsilon) * np.exp(-self.decay_rate * epoch)
            
            max_reward = max(max_reward, total_reward)

            print ("%d [Loss: %.4f, Reward: %s, Epsilon: %.4f, Max Reward: %s]" % (epoch, epoch_loss, total_reward, self.epsilon, max_reward))

        print ("Training Finished")

    def play(self, n_epochs):
        # self.env = gym.wrappers.Monitor(self.env, '/tmp/cartpole-experiment-1', force=True)
        for epoch in range(n_epochs):
            state = self.env.reset()
            total_reward = 0
            while True:
                self.env.render()
                action = np.argmax(self.model.predict(state), axis=1)[0]
                state, reward, done, _ = self.env.step(action)
                total_reward += reward
                if done: break
            print ("%d Reward: %s" % (epoch, total_reward))


In [None]:
def main():
    dqn = DeepQNetwork(env_name='CartPole-v1',
                        epsilon=0.9, 
                        gamma=0.8, 
                        decay_rate=0.005, 
                        min_epsilon=0.1)

    # Model builder
    def model(n_inputs, n_outputs):    
        clf = NeuralNetwork(optimizer=Adam(), loss=SquareLoss)
        clf.add(Dense(64, input_shape=(n_inputs,)))
        clf.add(Activation('relu'))
        clf.add(Dense(n_outputs))
        return clf

    dqn.set_model(model)

    print ()
    dqn.model.summary(name="Deep Q-Network")

    dqn.train(n_epochs=500)
    dqn.play(n_epochs=100)

if __name__ == "__main__":
    main()


+----------------+
| Deep Q-Network |
+----------------+
Input Shape: (4,)
+-------------------+------------+--------------+
| Layer Type        | Parameters | Output Shape |
+-------------------+------------+--------------+
| Dense             | 320        | (64,)        |
| Activation (ReLU) | 0          | (64,)        |
| Dense             | 130        | (2,)         |
+-------------------+------------+--------------+
Total Parameters: 450

0 [Loss: 0.1209, Reward: 37.0, Epsilon: 1.0000, Max Reward: 37.0]
1 [Loss: 0.1102, Reward: 9.0, Epsilon: 0.9955, Max Reward: 37.0]
2 [Loss: 0.1045, Reward: 29.0, Epsilon: 0.9910, Max Reward: 37.0]
3 [Loss: 0.1043, Reward: 19.0, Epsilon: 0.9866, Max Reward: 37.0]
4 [Loss: 0.1273, Reward: 12.0, Epsilon: 0.9822, Max Reward: 37.0]
5 [Loss: 0.1235, Reward: 27.0, Epsilon: 0.9778, Max Reward: 37.0]
6 [Loss: 0.1093, Reward: 20.0, Epsilon: 0.9734, Max Reward: 37.0]
7 [Loss: 0.1192, Reward: 61.0, Epsilon: 0.9690, Max Reward: 61.0]
8 [Loss: 0.1038, Reward:

116 [Loss: 0.0343, Reward: 68.0, Epsilon: 0.6039, Max Reward: 105.0]
117 [Loss: 0.0491, Reward: 47.0, Epsilon: 0.6014, Max Reward: 105.0]
118 [Loss: 0.0445, Reward: 37.0, Epsilon: 0.5989, Max Reward: 105.0]
119 [Loss: 0.0530, Reward: 11.0, Epsilon: 0.5964, Max Reward: 105.0]
120 [Loss: 0.0301, Reward: 26.0, Epsilon: 0.5939, Max Reward: 105.0]
121 [Loss: 0.0423, Reward: 73.0, Epsilon: 0.5915, Max Reward: 105.0]
122 [Loss: 0.0406, Reward: 37.0, Epsilon: 0.5890, Max Reward: 105.0]
123 [Loss: 0.0341, Reward: 95.0, Epsilon: 0.5866, Max Reward: 105.0]
124 [Loss: 0.0242, Reward: 45.0, Epsilon: 0.5841, Max Reward: 105.0]
125 [Loss: 0.0282, Reward: 55.0, Epsilon: 0.5817, Max Reward: 105.0]
126 [Loss: 0.0276, Reward: 46.0, Epsilon: 0.5793, Max Reward: 105.0]
127 [Loss: 0.0284, Reward: 50.0, Epsilon: 0.5769, Max Reward: 105.0]
128 [Loss: 0.0311, Reward: 28.0, Epsilon: 0.5746, Max Reward: 105.0]
129 [Loss: 0.0406, Reward: 46.0, Epsilon: 0.5722, Max Reward: 105.0]
130 [Loss: 0.0345, Reward: 63.0, E

236 [Loss: 0.0045, Reward: 319.0, Epsilon: 0.3766, Max Reward: 434.0]
237 [Loss: 0.0066, Reward: 174.0, Epsilon: 0.3752, Max Reward: 434.0]
238 [Loss: 0.0087, Reward: 116.0, Epsilon: 0.3738, Max Reward: 434.0]
239 [Loss: 0.0061, Reward: 292.0, Epsilon: 0.3724, Max Reward: 434.0]
240 [Loss: 0.0067, Reward: 227.0, Epsilon: 0.3711, Max Reward: 434.0]
241 [Loss: 0.0076, Reward: 139.0, Epsilon: 0.3697, Max Reward: 434.0]
242 [Loss: 0.0088, Reward: 213.0, Epsilon: 0.3684, Max Reward: 434.0]
243 [Loss: 0.0082, Reward: 120.0, Epsilon: 0.3670, Max Reward: 434.0]
244 [Loss: 0.0064, Reward: 18.0, Epsilon: 0.3657, Max Reward: 434.0]
245 [Loss: 0.0096, Reward: 373.0, Epsilon: 0.3644, Max Reward: 434.0]
246 [Loss: 0.0047, Reward: 314.0, Epsilon: 0.3631, Max Reward: 434.0]
247 [Loss: 0.0110, Reward: 51.0, Epsilon: 0.3618, Max Reward: 434.0]
248 [Loss: 0.0133, Reward: 248.0, Epsilon: 0.3604, Max Reward: 434.0]
249 [Loss: 0.0032, Reward: 192.0, Epsilon: 0.3591, Max Reward: 434.0]
250 [Loss: 0.0076, Rew