In [None]:
"""
This file experiments with a Q-learning policy trained on randomly sampled roll-outs.

Author: Noorvir Aulakh
Date: 01/03/2017
"""

import csv
import datetime
import os
import time

import gym
import numpy as np
import tensorflow as tf

import agent_test
import saver


# CartPole environment shared by all cells below. The step cap is overridden to 301 by writing
# the attribute directly; collect_episodes takes at most 300 steps per roll-out, so presumably
# this keeps the time limit from ending a roll-out early - TODO confirm.
env = gym.make("CartPole-v0")
env._max_episode_steps = 301

    
# ==================================================================================================
# Parameters

# Model selection:
#   1 - linear model with one output per action
#   2 - hidden layer (linear + ReLU) followed by a linear layer with one output per action
PART_NUM = 1

# Candidate optimiser step-sizes; LEARNING_RATE_INDEX picks the entry that is actually used
learning_rates = [10 ** -5, 10 ** -4, 10 ** -3, 10 ** -2, 10 ** -1, 0.5]
LEARNING_RATE_INDEX = 5
LEARNING_RATE = learning_rates[LEARNING_RATE_INDEX]

# Q-learning settings
GAMMA = 0.99                    # discount factor
MAX_ITER = 2000                 # number of training epochs
NUM_TRAINING_SAMPLES = 2000     # number of random roll-outs collected for training
BATCH_SIZE = 25                 # transitions per gradient step

# Evaluation settings
NUM_TEST_RUNS = 10              # runs used for each performance evaluation
NUM_EXPERIMENTS = 10            # independent experiments to average over
# ==================================================================================================

# ==================================================================================================
# Save Options
LOG_FOLDER = './logs/'
MODEL_FOLDER = './models/'
LOAD_MODEL_FILENAME = 'random_q_learning_part1.ckpt'
# ==================================================================================================

In [None]:
def collect_episodes(env, num_episodes=None, max_steps=300):
    """
    Roll out a uniform-random policy and record every transition it produces.

    The reward returned by the environment is discarded. Each stored transition gets reward 0,
    except the one on which the environment reports the episode as done, which gets -1.
    :param env: environment providing reset(), step(action) and action_space.sample()
    :param num_episodes: number of episodes to roll out; None means NUM_TRAINING_SAMPLES
    :param max_steps: maximum number of steps taken within a single episode
    :return: transitions - flat list with one [state, action, reward, next_state] entry per
             step taken, over all episodes
    """

    if num_episodes is None:
        num_episodes = NUM_TRAINING_SAMPLES

    transitions = []  # one entry per environment step, not per episode

    for _episode in range(num_episodes):

        state = env.reset()
        for _step in range(max_steps):

            action = env.action_space.sample()
            next_state, _, is_done, _ = env.step(action)

            # Only the transition that ends the episode is penalised
            reward = -1 if is_done else 0
            transitions.append([state, action, reward, next_state])

            if is_done:
                break

            state = next_state

    return transitions


In [None]:
def build_graph(graph_type, init_type='xavier', num_hidden=100, bias=False, dropout=True,
                keep_prob=0.7):
    """
    Create tensorflow model: 1) one linear layer with one output per action, 2) a hidden layer
    - linear transformation + ReLU - followed by a linear layer with one output per action.

    Both models take a [None, 4] float32 input and produce two outputs per input row.
    :param graph_type: 1 for linear, 2 for hidden layer graph
    :param init_type: initialiser for the type-2 variables: 'truncated_normal', 'random_normal'
                      or 'xavier'. Ignored for graph type 1.
    :param num_hidden: number of units in the hidden layer (graph type 2 only)
    :param bias: if True, add the bias terms b1 and b2 to the hidden and output layers. The bias
                 variables are created either way.
    :param dropout: if True, apply dropout to the hidden activations. The dropout op is part of
                    the returned graph, so it is used wherever the output tensor is evaluated.
    :param keep_prob: keep probability handed to tf.nn.dropout
    :return: (X, output) - the input placeholder and the output tensor, or -1 if graph_type is
             not recognised
    :raises ValueError: if graph_type is 2 and init_type is not one of the supported names
    """

    if graph_type == 1:

        W = tf.Variable(tf.truncated_normal([4, 2]), name='W')
        X = tf.placeholder(tf.float32, [None, 4], name='X')

        return X, tf.matmul(X, W)

    if graph_type == 2:

        # Reject an unknown initialiser before any node is added to the graph. Otherwise the
        # failure would surface later as an unbound local variable.
        if init_type not in ('truncated_normal', 'random_normal', 'xavier'):
            raise ValueError('Unknown init-type specified: %r' % (init_type,))

        X = tf.placeholder(tf.float32, [None, 4], name='X')

        if init_type == 'truncated_normal':
            initializer = tf.truncated_normal_initializer(stddev=0.01)
        elif init_type == 'random_normal':
            initializer = tf.random_normal_initializer(stddev=0.01)
        else:
            initializer = tf.contrib.layers.xavier_initializer()

        W1 = tf.get_variable('W1', [4, num_hidden], initializer=initializer)
        b1 = tf.get_variable('b1', [num_hidden], initializer=initializer)

        W2 = tf.get_variable('W2', [num_hidden, 2], initializer=initializer)
        b2 = tf.get_variable('b2', [2], initializer=initializer)

        # Hidden layer: linear (+ optional bias) -> ReLU -> optional dropout
        hidden_l = tf.matmul(X, W1)
        if bias:
            hidden_l = hidden_l + b1
        hidden_l = tf.nn.relu(hidden_l)
        if dropout:
            hidden_l = tf.nn.dropout(hidden_l, keep_prob=keep_prob)

        # Output layer: linear (+ optional bias), one value per action
        output_l = tf.matmul(hidden_l, W2)
        if bias:
            output_l = output_l + b2

        return X, output_l

    # Kept for backward compatibility. Callers that unpack the result will fail on this value.
    print('Unknown graph-type specified')
    return -1


In [None]:
def random_q_learning(env):
    """
    Implement batch Q-learning using random experience alone.

    Collects transitions with collect_episodes, builds the graph selected by PART_NUM and runs
    MAX_ITER epochs of mini-batch RMSProp updates on the squared TD error. The loss is printed
    every 10 epochs, agent_test.test is called every 50 epochs and the model is saved through
    saver.save_model every 500 epochs and once more after the last epoch.

    Reads the module-level ``model_filename``, which train() sets before calling this function.
    The default graph is reset before returning, also when training raises.
    :param env: gym environment used to collect the random roll-outs
    :return: None
    """

    episodes = collect_episodes(env)

    try:
        Q_target = tf.placeholder(tf.float32, [None, 1])
        actions = tf.placeholder(tf.float32, [None, 2])

        # Get computation graph
        X, DQN = build_graph(PART_NUM)

        # Q-value of the action actually taken, picked out with the one-hot action mask.
        # Reshaping with -1 keeps the graph independent of the batch size.
        Q_taken = tf.reshape(tf.reduce_sum(tf.multiply(actions, DQN), axis=1), [-1, 1])
        delta = Q_target - Q_taken
        loss = tf.reduce_mean(0.5 * tf.square(delta), axis=0)

        trainer = tf.train.RMSPropOptimizer(LEARNING_RATE).minimize(loss)

        print('Starting training... \n')

        with tf.Session() as sess:

            total_steps = 0
            tf.global_variables_initializer().run()

            for episode in range(MAX_ITER):

                np.random.shuffle(episodes)

                # NOTE(review): `episodes` holds one entry per environment step, but only the
                # first NUM_TRAINING_SAMPLES entries of the shuffled list are visited in each
                # epoch - confirm that this sub-sampling is intended.
                for batch_num in range(NUM_TRAINING_SAMPLES // BATCH_SIZE):

                    # Select batch
                    mb_st = batch_num * BATCH_SIZE
                    batch = episodes[mb_st:mb_st + BATCH_SIZE]
                    batch_s_t = [m[0] for m in batch]       # State at time-step t
                    batch_action = [m[1] for m in batch]    # Action taken at time-step t
                    batch_reward = [m[2] for m in batch]    # Reward for taking action a at time t
                    batch_s_tn = [m[3] for m in batch]      # State at time-step t+1

                    batch_reward = np.reshape(np.array(batch_reward), [-1, 1])

                    # Max over actions of the Q-value at state t+1, kept as a column vector
                    Q_nVal = sess.run(DQN, feed_dict={X: batch_s_tn})
                    Q_nVal_max = np.amax(Q_nVal, axis=1, keepdims=True)

                    # One-hot encoding of actions chosen
                    batch_actions = np.eye(2)[batch_action]

                    # A reward of -1 marks the transition that ended an episode; for those the
                    # future reward is zero, so the bootstrap term is masked out
                    not_terminal = (batch_reward != -1).astype(np.float64)

                    batch_target_val = batch_reward + not_terminal * (GAMMA * Q_nVal_max)

                    _, c_loss = sess.run([trainer, loss],
                                         feed_dict={Q_target: batch_target_val,
                                                    X: batch_s_t,
                                                    actions: batch_actions})

                    total_steps += 1

                # Report the loss of the last mini-batch of this epoch
                if (episode + 1) % 10 == 0:
                    data = [episode + 1, total_steps, c_loss[0]]
                    print(data)

                # Evaluate performance and report it together with the loss
                if (episode + 1) % 50 == 0:
                    eval_stats = list(agent_test.test(X, DQN, sess, NUM_TEST_RUNS))
                    data = [episode + 1, total_steps, c_loss[0]] + ['%.4f' % elem
                                                                    for elem in eval_stats]
                    print('Evaluation:')
                    print(data)

                if (episode + 1) % 500 == 0:
                    saver.save_model(sess, model_filename)

            saver.save_model(sess, model_filename)
    finally:
        # Always drop the graph so the next experiment (or a re-run of the cell after a failure)
        # starts from an empty default graph
        tf.reset_default_graph()

    print('Training complete!')

In [None]:
def train(env):
    """
    Set up the log files and run NUM_EXPERIMENTS training runs of random_q_learning.

    Sets the module-level ``model_filename`` (read by random_q_learning when it saves the model)
    and binds the module-level ``csv_loss_file`` / ``csv_eval_file`` to the log files, which are
    open for the duration of the runs. Only the header rows are written to the CSV files here.
    :param env: gym environment passed on to random_q_learning
    :return: None
    """
    # ==============================================================================================
    # Initialise Log writer
    # ==============================================================================================

    global model_filename
    global csv_loss_file, csv_eval_file

    # Create the log folder if needed so that opening the CSV files works on a fresh checkout
    if not os.path.isdir(LOG_FOLDER):
        os.makedirs(LOG_FOLDER)

    ts = datetime.datetime.now().strftime('%Y-%m-%d--%H%M-%S')
    run_prefix = 'A3_lin_' + str(LEARNING_RATE_INDEX)
    csv_loss_filename = LOG_FOLDER + run_prefix + '_loss_' + ts + '.csv'
    csv_eval_filename = LOG_FOLDER + run_prefix + '_eval_' + ts + '.csv'

    model_filename = run_prefix + '_' + ts

    csv_loss_header = ['episode', 'total_steps', 'loss']
    csv_eval_header = ['episode', 'total_steps', 'loss', 'reward_mean', 'reward_stddev',
                       'episode_length_mean', 'episode_length_stddev']

    # newline='' lets the csv module control line endings itself, as its documentation requires
    with open(csv_loss_filename, 'w', newline='') as csv_loss_file, \
            open(csv_eval_filename, 'w', newline='') as csv_eval_file:

        # Write headers to CSV files
        csv.writer(csv_loss_file).writerow(csv_loss_header)
        csv.writer(csv_eval_file).writerow(csv_eval_header)

        for experiment in range(NUM_EXPERIMENTS):
            print('Experiment number: ', experiment)
            random_q_learning(env)


In [None]:
# Training entry point: runs every experiment on the shared CartPole environment.
if __name__ == '__main__':
    train(env)

In [None]:
# Evaluation entry point: rebuild the graph, load a stored model and run it on the environment.
if __name__ == '__main__':
    filename = '{}{}'.format(MODEL_FOLDER, LOAD_MODEL_FILENAME)

    with tf.Session() as sess:

        # Any PART_NUM other than 1 selects the hidden-layer graph
        X, DQN = build_graph(1 if PART_NUM == 1 else 2)

        # Variables are initialised first; load_model presumably then restores the stored
        # values over them - TODO confirm against the saver module
        tf.global_variables_initializer().run()
        saver.load_model(sess, filename)

        # NOTE(review): training evaluates through agent_test.test(X, DQN, sess, n) while this
        # cell calls saver.test with a different argument list - confirm that saver.test exists
        saver.test(env, X, DQN, sess, 100, render=True)
