In [None]:
"""
This notebook contains an implementation of the double Q-learning algorithm.

N.B. This is an untested and incomplete implementation.

Author: Noorvir Aulakh
Date: 10/03/2017
"""


import csv
import gym
import time
import datetime
import collections

import numpy as np
import tensorflow as tf

import saver
import agent_test
from notbook_loader import *

from target_network_q_learning import build_graph


# CartPole environment instance passed to train() by the training cell below.
env = gym.make('CartPole-v0')
# Step limit set to 301, one more than the 300 steps the training loop takes per
# episode. NOTE(review): presumably so that a `done` flag seen during training
# never comes from the time limit -- confirm against gym's time-limit wrapper.
env._max_episode_steps = 301
    
# ==================================================================================================
# Parameters

isTRAIN = True              # Train/test switch. NOTE(review): not read by any cell visible here

EPSILON = 0.05              # Exploration probability (chance of taking a random action each step)
GAMMA = 0.99                # Discount factor
MAX_ITER = 2000             # Number of training episodes per experiment
LEARNING_RATE = 10 ** -4    # Step-size for the RMSProp optimiser
EX_BUFFER_SIZE = 150000     # Size of experience replay buffer (not used by the visible training loop)
BATCH_SIZE = 100            # Mini-batch size (not used: the visible loop trains on one transition at a time)

NUM_TEST_RUNS = 10          # Number of runs to evaluate performance on (passed to agent_test.test)
NUM_EXPERIMENTS = 5         # Number of experiments to average over (repetitions of the training run)
# ==================================================================================================

# ==================================================================================================
# Save Options
LOG_FOLDER = './logs/'      # Folder the CSV loss/evaluation logs are written to
MODEL_FOLDER = './models/'  # Folder the test cell loads the checkpoint from

# Checkpoint file (inside MODEL_FOLDER) restored by the test cell
LOAD_MODEL_FILENAME = 'double_q_learning.ckpt'
# ==================================================================================================

In [None]:
def double_q_learning(env):
    """Train a Q-network on `env` with a double Q-learning update rule.

    Two copies of the network returned by `build_graph` are created: a main
    network, trained after every environment step, and a target network whose
    weights are overwritten with the main network's weights every 5 episodes.
    For each transition the main network picks the greedy next action and the
    target network supplies the value of that action.

    Actions are epsilon-greedy with probability EPSILON. The reward used for
    learning is 0 on every step and -1 on the step where the environment
    reports `done`; the environment's own reward is ignored. Each episode is
    capped at 300 steps and MAX_ITER episodes are run.

    Logging and checkpoints:
        * One row (episode, total_steps, loss) per episode is written to the
          module-global `csv_loss_file`, and one evaluation row every 20
          episodes to `csv_eval_file`, when `train` has opened those files.
          When they are missing or closed the rows are only printed.
        * The model is saved through `saver.save_model` every 500 episodes
          and once more after the last episode, under the module-global
          `model_filename` (default 'double_q_learning' when unset).

    The default TensorFlow graph is reset on exit, including when training
    is interrupted, so the function can be called again in the same kernel.

    Args:
        env: Environment object; only `reset()`, `step(action)` and
            `action_space.sample()` are used. The Q-target placeholder is
            shaped [1, 2], i.e. one state at a time and two action values.
    """

    def _csv_writer_for(global_name):
        # `train` binds its open log files as module globals. Return None
        # rather than failing when this function is run on its own or after
        # those files have been closed.
        handle = globals().get(global_name)
        if handle is None or getattr(handle, 'closed', True):
            return None
        return csv.writer(handle)

    loss_writer = _csv_writer_for('csv_loss_file')
    eval_writer = _csv_writer_for('csv_eval_file')
    checkpoint_name = globals().get('model_filename', 'double_q_learning')

    try:
        with tf.variable_scope('DQN_main'):
            X_main, weights_main, DQN_main = build_graph()
            W1, b1, W2, b2 = weights_main

        with tf.variable_scope('DQN_target'):
            X_target, weights_target, DQN_target = build_graph()
            W1_t, b1_t, W2_t, b2_t = weights_target

        # Ops that copy every main-network weight into the target network
        updateOp = [W1_t.assign(W1), b1_t.assign(b1), W2_t.assign(W2), b2_t.assign(b2)]

        Q_target = tf.placeholder(tf.float32, [1, 2])

        loss = 0.5 * tf.reduce_sum(tf.square(Q_target - DQN_main), axis=1)
        trainer = tf.train.RMSPropOptimizer(LEARNING_RATE).minimize(loss)

        with tf.Session() as sess:

            total_steps = 0
            tf.global_variables_initializer().run()

            for episode in range(MAX_ITER):

                state = env.reset()
                reward = 0
                is_terminal_state = 0

                for _ in range(300):

                    Q_val = sess.run(DQN_main, feed_dict={X_main: [state]})
                    # Start from the current predictions so that only the
                    # entry of the action actually taken contributes to the loss
                    target = np.copy(Q_val[0])
                    action = np.argmax(Q_val)

                    # Explore with probability EPSILON
                    if np.random.uniform() < EPSILON:
                        action = env.action_space.sample()

                    n_state, _, is_done, _ = env.step(action)

                    if is_done:
                        reward = -1
                        is_terminal_state = 1

                    # Choose action at next state with the primary network
                    Q_nVal = sess.run(DQN_main, feed_dict={X_main: [n_state]})
                    n_action = np.argmax(Q_nVal)

                    # Use target network to calculate Q-value for this action
                    Q_nVal_t = sess.run(DQN_target, feed_dict={X_target: [n_state]})

                    # No bootstrapping from the next state once the episode has ended
                    target[action] = (reward + (1 - is_terminal_state) * GAMMA
                                      * Q_nVal_t[0][n_action])
                    _, c_loss = sess.run([trainer, loss], feed_dict={X_main: [state],
                                                                     Q_target: [target]})

                    state = np.copy(n_state)
                    total_steps += 1

                    if is_done:
                        break

                # Sync the target network with the main network every 5 episodes
                if (episode + 1) % 5 == 0:
                    sess.run(updateOp)

                # Log the loss of the episode's last update to the CSV file
                data = [episode + 1, total_steps, c_loss[0]]
                if loss_writer is not None:
                    loss_writer.writerow(data)

                if (episode + 1) % 20 == 0:
                    print(data)

                    # Evaluate performance and log to CSV file
                    eval_stats = agent_test.test(X_main, DQN_main, sess, NUM_TEST_RUNS)
                    data = data + ['%.4f' % elem for elem in list(eval_stats)]
                    if eval_writer is not None:
                        eval_writer.writerow(data)
                    print('Evaluation:')
                    print(data)

                if (episode + 1) % 500 == 0:
                    saver.save_model(sess, checkpoint_name)

            saver.save_model(sess, checkpoint_name)
    finally:
        # Always clear the graph so a later call (e.g. the next experiment, or
        # a re-run after an interrupt) starts from an empty default graph.
        tf.reset_default_graph()

    print('Training complete!')

In [None]:
def train(env):
    """Run NUM_EXPERIMENTS training runs of `double_q_learning` with CSV logs.

    Creates LOG_FOLDER if needed, opens two timestamped CSV files in it (a
    loss log and an evaluation log), writes their header rows, and then calls
    `double_q_learning(env)` NUM_EXPERIMENTS times while the files are open.

    Side effects:
        Binds three module globals: `model_filename` (the timestamped name
        that `double_q_learning` passes to `saver.save_model`) and the open
        file objects `csv_loss_file` / `csv_eval_file`. Both files are closed
        when this function returns or raises.

    Args:
        env: Environment object forwarded unchanged to `double_q_learning`.
    """
    import os

    # ==============================================================================================
    # Initialise Log writer
    # ==============================================================================================
    global model_filename
    global csv_loss_file, csv_eval_file

    # One timestamp shared by both log files and the checkpoint name
    ts = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d--%H%M-%S')
    csv_loss_filename = LOG_FOLDER + 'double_q_learning' + '_' + 'loss' + '_' + ts + '.csv'
    csv_eval_filename = LOG_FOLDER + 'double_q_learning' + '_' + 'eval' + '_' + ts + '.csv'

    model_filename = 'double_q_learning' + '_' + ts

    csv_loss_header = ['episode', 'total_steps', 'loss']
    csv_eval_header = ['episode', 'total_steps', 'loss', 'reward_mean', 'reward_stddev',
                       'episode_length_mean', 'episode_length_stddev']

    # open(..., 'w') does not create missing directories
    if LOG_FOLDER:
        os.makedirs(LOG_FOLDER, exist_ok=True)

    # newline='' lets the csv module control line endings itself; without it
    # every row is followed by a blank line on Windows.
    with open(csv_loss_filename, 'w', newline='') as csv_loss_file, \
            open(csv_eval_filename, 'w', newline='') as csv_eval_file:
        # Write headers to the CSV files
        csv.writer(csv_loss_file).writerow(csv_loss_header)
        csv.writer(csv_eval_file).writerow(csv_eval_header)

        for experiment in range(NUM_EXPERIMENTS):
            print('Experiment number: ', experiment)
            double_q_learning(env)

In [None]:
# Train Agent
# Runs the full set of training experiments on the CartPole environment created
# in the first cell; CSV logs are written under LOG_FOLDER.
train(env)

In [None]:
# Test Agent
# Rebuild the network, restore the saved checkpoint into it and run 10 rendered
# evaluation runs.
filename = MODEL_FOLDER + LOAD_MODEL_FILENAME

with tf.Session() as sess:
    X, _, DQN = build_graph()
    # Variables must be initialised before the checkpoint values are loaded over them
    sess.run(tf.global_variables_initializer())
    saver.load_model(sess, filename)
    agent_test.test(X, DQN, sess, 10, render=True)

