In [None]:
"""
This file contains an implementation of vanilla Q-learning.

Author: Noorvir Aulakh
Date: 04/03/2017
"""

import datetime
import time
import csv

import numpy as np
import tensorflow as tf
import gym

import saver
import agent_test

# ==================================================================================================
# Parameters

EPSILON = 0.05                  # Probability of taking a random action instead of the greedy one
GAMMA = 0.99                    # Discount factor
MAX_ITER = 2000                 # Number of training episodes per call to online_q_learning()
LEARNING_RATE = 10**-4          # Step-size for the RMSProp optimiser

NUM_TEST_RUNS = 10              # Number of runs passed to agent_test.test() at each evaluation
NUM_EXPERIMENTS = 100            # Number of experiments to average over
# NOTE(review): NUM_EXPERIMENTS is not referenced in the visible code; train() currently runs
# 3 experiments. Confirm which value is intended.
# ==================================================================================================

# ==================================================================================================
# Save Options
LOG_FOLDER = './logs/'          # Prefix of the CSV log paths built in train()
MODEL_FOLDER = './models/'      # Prefix of the checkpoint path built in the test cell

LOAD_MODEL_FILENAME = 'online_q_learning.ckpt'   # Checkpoint file name loaded by the test cell
# ==================================================================================================

In [None]:
def build_graph(num_hidden=100, state_dim=4, num_actions=2, keep_prob=0.8):
    """
    Create the tensorflow model used as Q-function approximator: a hidden layer (affine
    transformation + ReLU, followed by dropout) and a linear output layer with one output per
    action.

    :param num_hidden: number of units in the hidden layer.
    :param state_dim: length of the state vector fed to the network (defaults to 4).
    :param num_actions: number of network outputs, one Q-value per action (defaults to 2).
    :param keep_prob: keep probability handed to tf.nn.dropout on the hidden activations. Pass 1.0
                      to build a graph without dropout.
    :return: tuple (X, Q): X is the float32 input placeholder of shape [1, state_dim] named 'X',
             Q is the output tensor of shape [1, num_actions].
    """
    # Variables are created in a fixed order (W1, b1, W2, b2) so their auto-generated names stay
    # stable between runs.
    X = tf.placeholder(tf.float32, [1, state_dim], name='X')
    W1 = tf.Variable(tf.random_normal([state_dim, num_hidden], stddev=0.01))
    b1 = tf.Variable(tf.random_normal([num_hidden], stddev=0.01))

    W2 = tf.Variable(tf.random_normal([num_hidden, num_actions], stddev=0.01))
    b2 = tf.Variable(tf.random_normal([num_actions], stddev=0.01))

    # Both biases take part in the forward pass; without these additions they would be variables
    # that never receive a gradient.
    hidden = tf.nn.relu(tf.matmul(X, W1) + b1)

    # NOTE: dropout is an unconditional part of the graph, so it is also applied whenever the
    # returned tensor is evaluated for action selection or testing.
    hidden = tf.nn.dropout(hidden, keep_prob=keep_prob)

    return X, tf.matmul(hidden, W2) + b2

In [None]:
def _append_csv_row(csv_file, row):
    """Write `row` to `csv_file` when it is an open file object; return whether a row was written."""
    # Anything without a `closed` attribute (e.g. a placeholder string) or an already closed
    # handle is silently skipped so that logging never aborts training.
    if csv_file is None or getattr(csv_file, 'closed', True):
        return False
    csv.writer(csv_file).writerow(row)
    csv_file.flush()
    return True


def online_q_learning(env, num_hidden, csv_loss_file, csv_eval_file, model_filename,
                      max_steps=300):
    """
    Implement an online Q learning algorithm with a small neural net for function approximation.

    For MAX_ITER episodes the agent acts epsilon-greedily, and after every single environment step
    performs one RMSProp update towards the one-step target
    reward + GAMMA * max_a Q(next_state, a). The reward returned by the environment is ignored:
    the reward used in the target is 0 for every step and -1 on the step that ends the episode.

    :param env: environment exposing reset(), step(action) (returning a 4-tuple whose first and
                third items are the next state and the done flag) and action_space.sample().
    :param num_hidden: number of hidden units forwarded to build_graph().
    :param csv_loss_file: file object receiving one [episode, total_steps, loss] row per episode.
                          Rows are only written while the file is open.
    :param csv_eval_file: file object receiving the same three columns followed by the values
                          returned by agent_test.test(), formatted with '%.4f'. Rows are only
                          written while the file is open.
    :param model_filename: name handed to saver.save_model() every 500 episodes and once more
                           after the last episode.
    :param max_steps: maximum number of environment steps per episode (at least 1).
    :return: None
    :raises ValueError: if max_steps is smaller than 1.
    """
    if max_steps < 1:
        # The per-episode log reads the loss of the last update, so at least one step is required.
        raise ValueError('max_steps must be at least 1')

    Q_target = tf.placeholder(tf.float32, [1, 2])

    # Get computation graph
    X, DQN = build_graph(num_hidden=num_hidden)

    loss = 0.5 * tf.reduce_sum(tf.square(Q_target - DQN), axis=1)
    trainer = tf.train.RMSPropOptimizer(LEARNING_RATE).minimize(loss)

    try:
        with tf.Session() as sess:

            total_steps = 0
            tf.global_variables_initializer().run()

            for episode in range(MAX_ITER):

                state = env.reset()
                reward = 0
                is_terminal_state = 0

                for step_num in range(max_steps):

                    Q_val = sess.run(DQN, feed_dict={X: [state]})

                    # Start from the current prediction so that only the entry of the chosen
                    # action is changed in the regression target.
                    target = np.copy(Q_val[0])
                    action = np.argmax(Q_val)

                    # Explore with probability EPSILON
                    if np.random.uniform() < EPSILON:
                        action = env.action_space.sample()

                    n_state, _, is_done, _ = env.step(action)

                    if is_done:
                        reward = -1
                        is_terminal_state = 1

                    # Get Q-values over the next state
                    Q_nVal = sess.run(DQN, feed_dict={X: [n_state]})

                    # Calculate target Q-value (no bootstrapping from a terminal state)
                    target[action] = reward + (1 - is_terminal_state) * GAMMA * np.max(Q_nVal)

                    _, c_loss = sess.run([trainer, loss],
                                         feed_dict={X: [state], Q_target: [target]})

                    state = np.copy(n_state)
                    total_steps += 1

                    if is_done:
                        break

                # Loss of the last update of this episode
                loss_row = [episode + 1, total_steps, c_loss[0]]
                print(loss_row)
                _append_csv_row(csv_loss_file, loss_row)

                # Evaluate performance and log to CSV file
                eval_stats = list(agent_test.test(X, DQN, sess, NUM_TEST_RUNS))
                eval_row = loss_row + ['%.4f' % elem for elem in eval_stats]
                print('Evaluation: \n')
                print(eval_row)
                _append_csv_row(csv_eval_file, eval_row)

                if (episode + 1) % 500 == 0:
                    saver.save_model(sess, model_filename)

            saver.save_model(sess, model_filename)
    finally:
        # Always leave a clean default graph behind, even when training fails, so that the next
        # call does not build on top of stale nodes. The session is already closed at this point.
        tf.reset_default_graph()

    print('Training complete!')

In [None]:
def train(env, num_experiments=3):
    """
    Create time-stamped CSV log files and run several independent online Q-learning experiments.

    Two files are created in LOG_FOLDER, 'online_q_learning_loss_<ts>.csv' and
    'online_q_learning_eval_<ts>.csv', and each receives a header row. Both files stay open for
    the whole duration of the experiments and their handles are passed to online_q_learning().

    :param env: environment forwarded unchanged to online_q_learning().
    :param num_experiments: number of times online_q_learning() is run (defaults to 3).
    :return: None
    """
    import os   # local import: only needed here to make sure the log folder exists

    # ==============================================================================================
    # Initialise Log writer
    # ==============================================================================================

    t = time.time()
    ts = datetime.datetime.fromtimestamp(t).strftime('%Y-%m-%d--%H%M-%S')
    # NOTE(review): every experiment receives the same model name, so later experiments presumably
    # overwrite the model saved by earlier ones - confirm against saver.save_model().
    model_filename = 'online_q_learning' + '_' + ts

    csv_loss_filename = LOG_FOLDER + 'online_q_learning' + '_' + 'loss' + '_' + ts + '.csv'
    csv_eval_filename = LOG_FOLDER + 'online_q_learning' + '_' + 'eval' + '_' + ts + '.csv'

    csv_loss_header = ['episode', 'total_steps', 'loss']
    csv_eval_header = ['episode', 'total_steps', 'loss', 'reward_mean', 'reward_stddev',
                       'episode_length_mean', 'episode_length_stddev']

    # open() does not create missing directories
    if not os.path.isdir(LOG_FOLDER):
        os.makedirs(LOG_FOLDER)

    with open(csv_loss_filename, 'w') as csv_loss_file, \
            open(csv_eval_filename, 'w') as csv_eval_file:

        # Write headers to CSV files and push them to disk before the long-running training starts
        csv.writer(csv_loss_file).writerow(csv_loss_header)
        csv.writer(csv_eval_file).writerow(csv_eval_header)
        csv_loss_file.flush()
        csv_eval_file.flush()

        # The experiments run inside the `with` block so that the handles passed on are still open.
        for experiment in range(num_experiments):
            print('Experiment number: ', experiment)
            online_q_learning(env, 100, csv_loss_file, csv_eval_file, model_filename)

In [None]:
# Run training: create the CartPole-v0 environment and hand it to train().
# `env` is bound at module level and is reused by the test cell further down.
if __name__ == "__main__":
    env = gym.make('CartPole-v0')
    env._max_episode_steps = 301        # hack: keep the environment from ending an episode on the 300th step, the per-episode cap of the training loop

    train(env)

In [None]:
# Test network: rebuild the graph, restore a saved model and run it on the environment.
# Relies on `env` created by the "Run training" cell above.
if __name__ == "__main__":

    # NOTE(review): train() names its models 'online_q_learning_<timestamp>', whereas this loads
    # MODEL_FOLDER + LOAD_MODEL_FILENAME - confirm that this checkpoint exists.
    filename = MODEL_FOLDER + LOAD_MODEL_FILENAME

    with tf.Session() as sess:
        # Build the same architecture that was trained (100 hidden units), initialise all
        # variables, then load the saved model on top of them.
        X, DQN = build_graph(num_hidden=100)
        tf.global_variables_initializer().run()
        saver.load_model(sess, filename)
        # NOTE(review): training evaluates with agent_test.test(X, DQN, sess, NUM_TEST_RUNS); here
        # saver.test() is called with an extra leading `env` argument - confirm that saver
        # provides test() with this signature.
        saver.test(env, X, DQN, sess, 10, render=True)