In [None]:
"""
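This notebook contains an implementation of Target Network Q-Learning: a second copy of the
Q-network, synchronised with the trained network only periodically, supplies the bootstrap
Q-values. It is a trick employed to make Q-learning work better.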

Author: Noorvir Aulakh
Date: 07/03/2017
"""

import collections
import csv
import datetime
import os
import time

import gym
import numpy as np
import tensorflow as tf

import saver
import agent_test
from notbook_loader import *

# Environment shared by the training cell (passed to train) in this notebook.
env = gym.make('CartPole-v0')
# Override the episode step cap on the environment object. The training loop below stops after
# 300 steps per episode; presumably 301 is chosen so the environment's own limit never ends a
# training episode -- TODO confirm.
env._max_episode_steps = 301

# ==================================================================================================
# Parameters

# NOTE(review): not referenced by any cell visible in this notebook.
isTRAIN = True

EPSILON = 0.05              # Exploration probability
GAMMA = 0.99                # Discount factor
MAX_ITER = 2000             # Number of training episodes per experiment
LEARNING_RATE = 10 ** -4    # Step-size for optimiser
EX_BUFFER_SIZE = 150000     # Size of experience replay buffer
BATCH_SIZE = 100            # Max. number of transitions sampled from the buffer per training step

NUM_TEST_RUNS = 10          # Number of runs to evaluate performance on
NUM_EXPERIMENTS = 5         # Number of independent training runs performed by train()
# ==================================================================================================

# ==================================================================================================
# Save Options
LOG_FOLDER = './logs/'      # Folder in which train() creates its CSV log files
MODEL_FOLDER = './models/'  # Folder the evaluation cell loads its checkpoint from

# Checkpoint file name (inside MODEL_FOLDER) loaded by the evaluation cell
LOAD_MODEL_FILENAME = 'target_network_q_learning.ckpt'
# ==================================================================================================


In [None]:
def build_graph():
    """
    Build a Q-network with one hidden ReLU layer in the current variable scope.

    The network maps a 4-dimensional state to one value per action (2 actions):
    Q(s) = relu(s . W1 + b1) . W2 + b2, with 100 hidden units. All variables are created with
    tf.get_variable, so the caller must wrap the call in a distinct tf.variable_scope (or use a
    fresh graph) when building more than one copy.

    The output bias b2 is added to the output layer. It was previously created and returned in
    `weights` but left out of the output expression, so it never influenced the Q-values.
    NOTE(review): checkpoints trained before this fix hold an untrained b2; re-train or verify
    such models before relying on them.

    :return: tuple (X, weights, Q) where X is the [None, 4] float32 state placeholder, weights is
             the list [W1, b1, W2, b2] and Q is the [None, 2] output tensor.
    """
    xavier = tf.contrib.layers.xavier_initializer

    X = tf.placeholder(tf.float32, [None, 4], name='X')

    W1 = tf.get_variable('W1', [4, 100], initializer=xavier())
    b1 = tf.get_variable('b1', [100], initializer=xavier())

    W2 = tf.get_variable('W2', [100, 2], initializer=xavier())
    b2 = tf.get_variable('b2', [2], initializer=xavier())

    hidden = tf.nn.relu(tf.matmul(X, W1) + b1)
    q_values = tf.matmul(hidden, W2) + b2

    return X, [W1, b1, W2, b2], q_values



In [None]:
def target_net_q_learning(env):
    """
    Train a Q-network on `env` using experience replay and a periodically synced target network.

    Two copies of the network from `build_graph` are created: 'DQN_main', which is optimised and
    used to choose actions, and 'DQN_target', which supplies the Q-values of the next state for
    the learning target. The target copy is overwritten with the main weights every 5 episodes.

    The reward returned by the environment is ignored: a transition is stored with reward -1 if
    it ended the episode and 0 otherwise. A stored reward of -1 is later used to recognise
    terminal transitions, whose next-state value is dropped from the target.

    Every 5 episodes the latest loss is printed, every 20 episodes `agent_test.test` is run and
    its results printed, and the model is saved through `saver.save_model` every 500 episodes
    and once more when training ends. The default graph is reset on exit, including when
    training is interrupted by an exception, so the function (or notebook cell) can be re-run.

    Relies on the module-level global `model_filename`, which `train` sets before calling this.

    :param env: Gym-style environment; `reset()`, `step(action)` and `action_space.sample()`
                are used.
    :return: None
    """
    max_steps = 300     # Upper bound on the number of steps taken in one training episode

    try:
        with tf.variable_scope('DQN_main'):
            X_main, weights_main, DQN_main = build_graph()

        with tf.variable_scope('DQN_target'):
            X_target, weights_target, DQN_target = build_graph()

        # Copies each main-network variable onto its target-network counterpart
        updateOp = [w_target.assign(w_main)
                    for w_main, w_target in zip(weights_main, weights_target)]

        Q_target = tf.placeholder(tf.float32, [None, 1])
        actions = tf.placeholder(tf.float32, [None, 2])     # One-hot encoding of actions taken

        # Q-value of the action actually taken: mask the network output with the one-hot action
        # and sum over the action axis. tf.multiply keeps the computation inside the graph
        # (np.multiply on tensors only works through NumPy's object-array fallback), and the -1
        # in the reshape lets the batch dimension follow whatever is fed in.
        Q_taken = tf.reshape(tf.reduce_sum(tf.multiply(actions, DQN_main), axis=1), [-1, 1])
        delta = Q_target - Q_taken

        loss = tf.reduce_mean(0.5 * tf.square(delta), axis=0)
        trainer = tf.train.RMSPropOptimizer(LEARNING_RATE).minimize(loss)

        ex_replay_buf = collections.deque(maxlen=EX_BUFFER_SIZE)

        with tf.Session() as sess:

            total_steps = 0
            tf.global_variables_initializer().run()

            for episode in range(MAX_ITER):

                state = env.reset()

                for t in range(max_steps):

                    Q_val = sess.run(DQN_main, feed_dict={X_main: [state]})
                    action = np.argmax(Q_val)

                    # Explore with probability EPSILON
                    if np.random.uniform() < EPSILON:
                        action = env.action_space.sample()

                    n_state, _, is_done, _ = env.step(action)

                    # Environment reward is ignored: -1 on the terminating step, 0 otherwise
                    reward = -1 if is_done else 0

                    # Save experience to experience-replay buffer
                    ex_replay_buf.append([state, action, reward, n_state])

                    # =============================================================================
                    # Train from experience buffer
                    # =============================================================================
                    # Until the buffer holds BATCH_SIZE transitions, train on all of them
                    C_BATCH_SIZE = min(len(ex_replay_buf), BATCH_SIZE)
                    sample_idx = np.random.choice(len(ex_replay_buf), C_BATCH_SIZE,
                                                  replace=False)
                    batch = [ex_replay_buf[i] for i in sample_idx]

                    batch_s_t = [m[0] for m in batch]       # State at time-step t
                    batch_action = [m[1] for m in batch]    # Action taken at time-step t
                    batch_reward = [m[2] for m in batch]    # Reward for taking action a at time t
                    batch_s_tn = [m[3] for m in batch]      # State at time-step t+1

                    batch_reward = np.reshape(np.array(batch_reward), [C_BATCH_SIZE, 1])

                    # Q-value for next state, taken from the target network
                    Q_nVal = sess.run(DQN_target, feed_dict={X_target: batch_s_tn})

                    # Choose max of Q-value at state t+1 (as a column vector)
                    Q_tn_max = np.reshape(np.amax(Q_nVal, axis=1), [C_BATCH_SIZE, 1])

                    # One-hot encoding of actions chosen
                    batch_actions = np.zeros([C_BATCH_SIZE, 2])
                    batch_actions[np.arange(C_BATCH_SIZE), batch_action] = 1

                    # If the episode has ended, the total future reward should be zero:
                    # the mask is 0 for terminal transitions (reward == -1) and 1 otherwise
                    not_terminal = np.ones([C_BATCH_SIZE, 1])
                    not_terminal[np.where(batch_reward == -1)] = 0

                    batch_target_val = batch_reward + np.multiply(not_terminal, GAMMA * Q_tn_max)

                    _, c_loss = sess.run([trainer, loss],
                                         feed_dict={Q_target: batch_target_val,
                                                    X_main: batch_s_t,
                                                    actions: batch_actions})

                    total_steps += 1
                    state = np.copy(n_state)

                    if is_done:
                        break

                # Sync the target network with the main network every 5 episodes
                if (episode + 1) % 5 == 0:
                    sess.run(updateOp)

                # Loss of the last training step of this episode (CSV logging is disabled)
                data = [episode + 1, total_steps, c_loss[0]]
                # helpers.log(data, csv_loss_file)

                if (episode + 1) % 5 == 0:
                    print(data)

                # Evaluate performance (CSV logging is disabled)
                if (episode + 1) % 20 == 0:
                    eval_stats = agent_test.test(X_main, DQN_main, sess, NUM_TEST_RUNS)
                    data = [episode + 1, total_steps, c_loss[0]] + \
                           ['%.4f' % elem for elem in list(eval_stats)]
                    # helpers.log(data, csv_eval_file)
                    print('Evaluation:')
                    print(data)

                if (episode + 1) % 500 == 0:
                    saver.save_model(sess, model_filename)

            saver.save_model(sess, model_filename)
    finally:
        # Always clear the graph, so the variables created above do not clash with the next
        # run even when training was interrupted
        tf.reset_default_graph()

    print('Training complete!')


In [None]:
def train(env):
    """
    Create the CSV log files and run NUM_EXPERIMENTS training runs on `env`.

    A time-stamped loss log and evaluation log are created in LOG_FOLDER (the folder is created
    first if it does not exist) and their header rows are written. Both files stay open while
    `target_net_q_learning` runs and are closed afterwards.

    Sets the module-level globals `model_filename` (read by `target_net_q_learning` when saving)
    and `csv_loss_file` / `csv_eval_file` (the open log file objects).

    NOTE(review): all experiments share the same `model_filename`; presumably each run
    overwrites the previous run's checkpoint -- confirm against saver.save_model.

    :param env: environment handed unchanged to `target_net_q_learning`.
    :return: None
    """

    # ==============================================================================================
    # Initialise Log writer
    # ==============================================================================================
    global model_filename
    global csv_loss_file, csv_eval_file

    run_name = 'target_network_q_learning'
    ts = datetime.datetime.now().strftime('%Y-%m-%d--%H%M-%S')

    # Opening the log files fails if the folder is missing, so create it up front
    if LOG_FOLDER and not os.path.isdir(LOG_FOLDER):
        os.makedirs(LOG_FOLDER)

    csv_loss_filename = os.path.join(LOG_FOLDER, run_name + '_' + 'loss' + '_' + ts + '.csv')
    csv_eval_filename = os.path.join(LOG_FOLDER, run_name + '_' + 'eval' + '_' + ts + '.csv')

    model_filename = run_name + '_' + ts

    csv_loss_header = ['episode', 'total_steps', 'loss']
    csv_eval_header = ['episode', 'total_steps', 'loss', 'reward_mean', 'reward_stddev',
                       'episode_length_mean', 'episode_length_stddev']

    with open(csv_loss_filename, 'w') as csv_loss_file, \
            open(csv_eval_filename, 'w') as csv_eval_file:
        # Write headers to the CSV files
        csv.writer(csv_loss_file).writerow(csv_loss_header)
        csv.writer(csv_eval_file).writerow(csv_eval_header)

        for experiment in range(NUM_EXPERIMENTS):
            print('Experiment number: ', experiment)
            target_net_q_learning(env)


In [None]:
# Training cell: runs the full training procedure (NUM_EXPERIMENTS runs) on the module-level `env`.
if __name__ == "__main__":
    train(env)

In [None]:
# Evaluation cell: rebuild the network, load a saved model and run rendered test episodes.
if __name__ == "__main__":

    filename = MODEL_FOLDER + LOAD_MODEL_FILENAME

    # Start from an empty graph: build_graph() creates its variables with tf.get_variable, so
    # re-running this cell on a graph that already holds them would raise a ValueError.
    tf.reset_default_graph()

    # NOTE(review): training builds the network inside the 'DQN_main' variable scope, whereas
    # here it is built without a scope -- confirm that saver.load_model maps the names.
    X, _, DQN = build_graph()

    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        saver.load_model(sess, filename)
        agent_test.test(X, DQN, sess, 10, render=True)