# In [None]:
"""
An implementation for training a DQN agent to play atari games. Doesn't quite solve the games yet; 
it needs a bit more fine tuning. Some training tricks (experience replay, target networking etc.) 
are probably a good place to start.

Author: Noorvir Aulakh
Date: 15/03/2017
"""


import csv
import datetime

import numpy as np
import tensorflow as tf
import time
import os

import gym
from collections import deque
import matplotlib.pyplot as plt
from skimage.viewer import ImageViewer
from skimage.transform import resize
from skimage.color import rgb2gray
from skimage import img_as_ubyte
from skimage import exposure
import warnings

# Gym environment ids that can be selected with GAME_NUM
games = ['Pong-v3', 'MsPacman-v3', 'Boxing-v3']

# ==================================================================================================
# Parameters

# True: train a new agent and write logs/models.
# False: load MODEL_FOLDER + LOAD_MODEL_FILENAME and run the evaluation loop with rendering.
isTRAIN = True
# Problem subsection to run (1, 2 or 3)
# Part 1 - Report score and frame counts under a random policy, and average mean and stddev
# Part 2 -
# Part 3 -
# NOTE(review): PART_NUM is not read anywhere in the visible part of this file - confirm whether it
# is still needed.
PART_NUM = 2
GAME_NUM = 0  # Index into `games` (0-based): 0 - Pong, 1 - MsPacman, 2 - Boxing

EX_BUFFER_SIZE = 100000  # Size of experience replay buffer
EPSILON = 0.1  # Exploration parameter
GAMMA = 0.99  # Discount factor
NUM_ITER = 10**6               # Number of environment steps to train for (compared to total_steps)
LEARNING_RATE = 10 ** -3  # Step-size for optimiser
BATCH_SIZE = 32  # Maximum number of transitions sampled from the buffer per training step

# Maximum length of the deque that the logged reward average is computed over
TRAINING_MOVING_AVG_ARR_LEN = 5000
# ==================================================================================================

# ==================================================================================================
# Save Options
LOG_FOLDER = './logs/part_b/'  # Folder the loss/eval CSV files are written to
MODEL_FOLDER = './models/part_b/'  # Folder model checkpoints are written to / read from
LOAD_MODEL_FILENAME = ''  # File inside MODEL_FOLDER to restore when isTRAIN is False
# ==================================================================================================

# The environment is created at import time and is the default `env` of AtariSan
game_env = gym.make(games[GAME_NUM])

# Show the number of discrete actions and the observation space of the chosen game
print(game_env.action_space.n)
print(game_env.observation_space)


class AtariSan:
    def __init__(self, env=game_env, experience_buff_size=EX_BUFFER_SIZE, epsilon=EPSILON,
                 gamma=GAMMA, num_iter=NUM_ITER, learning_rate=LEARNING_RATE,
                 batch_size=BATCH_SIZE):
        """
        Store the hyper-parameters and build the main and the target Q-network.

        :param env: environment to act in; ``env.action_space.n`` fixes the number of outputs
        :param experience_buff_size: maximum number of transitions kept in the replay buffer
        :param epsilon: exploration parameter, stored as ``self.epsilon``
        :param gamma: discount factor, stored as ``self.gamma``
        :param num_iter: number of environment steps the training loop runs for
        :param learning_rate: optimiser step size, stored as ``self.learning_rate``
        :param batch_size: maximum number of transitions sampled per training step
        """

        # Environment handle; the action count is needed by build_graph() below
        self.env = env
        self.action_space = env.action_space.n

        # Hyper-parameters
        self.epsilon = epsilon
        self.gamma = gamma
        self.num_iter = num_iter
        self.learning_rate = learning_rate
        self.batch_size = batch_size

        # Replay buffer: the deque silently drops the oldest transition once it is full
        self.experience_buff_size = experience_buff_size
        self.experience_buff = deque(maxlen=experience_buff_size)

        # Current / next stacked-frame state, filled in while training
        self.state = []
        self.next_state = []

        # Two copies of the same architecture, kept apart by their variable scopes
        with tf.variable_scope('nn_main'):
            self.X_main, main_vars, self.nn_main = self.build_graph()
        (self.W_conv1, self.b_conv1, self.W_conv2, self.b_conv2,
         self.W_fcl, self.b_fcl, self.W_lin, self.b_lin) = main_vars

        with tf.variable_scope('nn_target'):
            self.X_target, target_vars, self.nn_target = self.build_graph()
        (self.W_Tconv1, self.b_Tconv1, self.W_Tconv2, self.b_Tconv2,
         self.W_Tfcl, self.b_Tfcl, self.W_Tlin, self.b_Tlin) = target_vars

        # Running this list of ops copies every main-network variable into its target counterpart
        self.update_target_net = [target_var.assign(main_var)
                                  for target_var, main_var in zip(target_vars, main_vars)]

    def pre_process(self, observation):
        """
        Convert one raw frame into a small 8-bit greyscale image.

        The frame is converted to greyscale, rows 35..194 are kept, the crop is resized to 28x28,
        its intensity range is stretched and the result is converted with ``img_as_ubyte``.
        Stacking several frames into a state is done by the caller, not here.

        :param observation: colour frame as returned by the environment
        :return: the processed 28x28 frame
        """

        # The skimage conversions emit warnings (silenced here on purpose)
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')

            grey = rgb2gray(observation)
            # NOTE(review): rows 35..194 are presumably the play area of the screen - confirm
            cropped = grey[35:195, :]
            small = resize(cropped, (28, 28))
            stretched = exposure.rescale_intensity(small)
            frame = img_as_ubyte(stretched)

        return frame

    def build_graph(self):
        """
        Build one copy of the Q-network inside the caller's variable scope.

        Layers: 6x6 convolution with stride 2 and 16 channels + ReLU, 4x4 convolution with
        stride 2 and 32 channels + ReLU, flatten, fully connected layer with 256 units + ReLU and
        a final linear layer with one output per action.

        All weights and biases are drawn from a truncated normal with the same small standard
        deviation. Previously the fully connected weights and three of the biases fell back to
        TensorFlow's default stddev of 1.0, which is far too large for a 1568-input layer.

        :return: tuple ``(X, weights, output_l)``: the ``[None, 28, 28, 4]`` input placeholder,
                 the list ``[W_conv1, b_conv1, W_conv2, b_conv2, W_fcl, b_fcl, W_lin, b_lin]`` and
                 the ``[None, action_space]`` output tensor
        """

        init_stddev = 0.01

        def new_variable(shape, name):
            # Every parameter uses the same initialiser; only shape and name differ
            return tf.Variable(tf.truncated_normal(shape, stddev=init_stddev), name=name)

        X = tf.placeholder(tf.float32, [None, 28, 28, 4], name='X')

        # Initialise weights
        W_conv1 = new_variable([6, 6, 4, 16], 'W_conv1')
        b_conv1 = new_variable([16], 'b_conv1')

        W_conv2 = new_variable([4, 4, 16, 32], 'W_conv2')
        b_conv2 = new_variable([32], 'b_conv2')

        # Two stride-2 'SAME' convolutions shrink 28x28 -> 14x14 -> 7x7
        W_fcl = new_variable([7 * 7 * 32, 256], 'W_fcl')
        b_fcl = new_variable([256], 'b_fcl')

        W_lin = new_variable([256, self.action_space], 'W_lin')
        b_lin = new_variable([self.action_space], 'b_lin')

        # Convolution layers
        conv_l1 = tf.nn.relu(tf.nn.conv2d(X, W_conv1, strides=[1, 2, 2, 1],
                                          padding='SAME') + b_conv1, name='conv_1l')

        conv_l2 = tf.nn.relu(tf.nn.conv2d(conv_l1, W_conv2, strides=[1, 2, 2, 1],
                                          padding='SAME') + b_conv2, name='conv_2l')

        # Fully connected layer
        fcl_input = tf.reshape(conv_l2, [-1, 7 * 7 * 32])
        fcl = tf.nn.relu(tf.matmul(fcl_input, W_fcl) + b_fcl, name='fcl')

        # Linear layer: one state-action value per action
        output_l = tf.matmul(fcl, W_lin) + b_lin

        weights = [W_conv1, b_conv1, W_conv2, b_conv2, W_fcl, b_fcl, W_lin, b_lin]

        return X, weights, output_l

    def train(self):
        """
        Train the main Q-network with epsilon-greedy exploration, experience replay and a
        periodically synchronised target network.

        After every environment step the transition is stored in the replay buffer and one
        optimiser step is taken on a random mini-batch. The loss is logged every 100 steps; every
        50000 steps the agent is evaluated with ``self.test`` and the model is saved. Uses the
        module-level ``log`` and ``save_model`` helpers.

        :return: None
        """

        target_update_interval = 5000  # Steps between copies of the main weights into the target
        log_interval = 100             # Steps between loss log entries
        eval_interval = 50000          # Steps between evaluations / model saves

        # Define training-specific placeholders
        Q_target = tf.placeholder(tf.float32, [None, 1])
        # One-hot encoding of the action taken in each sampled transition
        actions = tf.placeholder(tf.float32, [None, self.action_space])

        # Q(s_t, a_t): the one-hot mask keeps only the output of the action that was taken.
        # This must be a TensorFlow op - NumPy functions are not meant for symbolic tensors.
        Q_taken = tf.reshape(tf.reduce_sum(tf.multiply(actions, self.nn_main), axis=1), [-1, 1])
        delta = Q_target - Q_taken

        # Averaged over the batch axis only, so the fetched loss has shape [1]
        loss = tf.reduce_mean(0.5 * tf.square(delta), axis=0)
        trainer = tf.train.RMSPropOptimizer(self.learning_rate).minimize(loss)

        print('Starting training...\n')

        with tf.Session() as sess:

            episode = 1
            total_steps = 1
            sess.run(tf.global_variables_initializer())
            # The two networks are initialised independently; make them identical before the
            # target network produces its first bootstrap target
            sess.run(self.update_target_net)

            # Holds the running discounted return of the current episode, appended every step;
            # its mean is written to the loss log
            ep_discounted_reward_mavg_arr = deque(maxlen=TRAINING_MOVING_AVG_ARR_LEN)
            while total_steps <= self.num_iter:

                step_num = 0
                ep_discounted_reward = 0

                # Create initial state by stacking the first frame four times
                current_frame = self.pre_process(self.env.reset())
                self.state = np.stack((current_frame,) * 4, axis=2)

                while True:

                    self.state = np.reshape(self.state, [1, 28, 28, 4])

                    # Epsilon-greedy: explore with probability epsilon, otherwise act greedily
                    if np.random.uniform() < self.epsilon:
                        action = self.env.action_space.sample()
                    else:
                        Q_val = sess.run(self.nn_main, feed_dict={self.X_main: self.state})
                        action = np.argmax(Q_val)

                    next_observation, reward, is_done, _ = self.env.step(action)
                    next_frame = self.pre_process(next_observation)
                    # Drop the oldest frame and append the newest one
                    self.next_state = np.append(self.state[0, :, :, 1:],
                                                np.reshape(next_frame, [28, 28, 1]), axis=2)

                    reward = self._clip_reward(reward)

                    ep_discounted_reward += (self.gamma ** step_num) * reward
                    ep_discounted_reward_mavg_arr.append(ep_discounted_reward)

                    # Save experience to experience-replay buffer
                    experience = [self.state[0], action, reward, is_done, self.next_state]
                    self.experience_buff.append(experience)

                    # ==============================================================================
                    # Train from experience buffer
                    # ==============================================================================

                    # Until the buffer holds a full batch, train on everything collected so far
                    c_batch_size = min(len(self.experience_buff), self.batch_size)
                    sampled = np.random.choice(len(self.experience_buff), c_batch_size,
                                               replace=False)
                    batch = [self.experience_buff[i] for i in sampled]

                    batch_s_t, batch_actions, batch_target_val = self._build_targets(sess, batch)

                    _, c_loss = sess.run([trainer, loss], feed_dict={Q_target: batch_target_val,
                                                                     self.X_main: batch_s_t,
                                                                     actions: batch_actions})

                    self.state = np.copy(self.next_state)

                    # Save loss
                    if total_steps % log_interval == 0:
                        data = [episode, total_steps, c_loss[0],
                                np.mean(ep_discounted_reward_mavg_arr)]
                        log(data, 'loss')
                        print(data)

                    # Evaluate agent and save performance
                    env_was_reused = False
                    if total_steps % eval_interval == 0:
                        data = [episode, total_steps, c_loss[0]] + ['%.4f' % elem for elem in
                                                                    list(self.test(sess))]
                        log(data, 'eval')

                        print('Evaluation:\n')
                        print(data)

                        save_model(sess)

                        # test() plays its episodes on self.env, so the episode that was being
                        # trained on no longer exists; it is abandoned below
                        env_was_reused = True

                    # Counted in total steps: step_num restarts at 0 with every episode, which
                    # would trigger a sync at the start of each episode instead
                    if total_steps % target_update_interval == 0:
                        sess.run(self.update_target_net)

                    step_num += 1
                    total_steps += 1

                    if is_done or env_was_reused:
                        break
                episode += 1
        print('Finished training!!\n')

    @staticmethod
    def _clip_reward(reward):
        """Clip a raw reward to -1, 0 or 1 using the thresholds -0.5 and 0.5."""
        if reward <= -0.5:
            return -1
        if reward > 0.5:
            return 1
        return 0

    def _build_targets(self, sess, batch):
        """Turn sampled transitions into network inputs, one-hot actions and Q-learning targets."""
        n = len(batch)

        states = np.array([m[0] for m in batch])        # State at time-step t
        action_idx = [m[1] for m in batch]              # Action taken at time-step t
        rewards = np.reshape(np.array([m[2] for m in batch], dtype=np.float32), [n, 1])
        done = np.array([m[3] for m in batch], dtype=bool)  # Did the episode end at t+1?
        next_states = np.array([m[4] for m in batch])   # State at time-step t+1

        # Max over actions of the target network's Q-values at state t+1
        Q_next = sess.run(self.nn_target, feed_dict={self.X_target: next_states})
        Q_next_max = np.reshape(np.amax(Q_next, axis=1), [n, 1])

        # One-hot encoding of actions chosen
        one_hot = np.zeros([n, self.action_space])
        one_hot[np.arange(n), action_idx] = 1

        # If the episode has ended there is no future reward, i.e. Q(s_t+1) = 0
        not_terminal = np.ones([n, 1])
        not_terminal[done] = 0

        targets = rewards + self.gamma * not_terminal * Q_next_max

        return states, one_hot, targets


    def test(self, sess, render=False, num_episodes=100):
        """
        Evaluate the greedy policy of the main network on ``self.env``.

        Each episode is played to the end with the action of highest predicted Q-value. Rewards
        are clipped to -1/0/1 in the same way as during training and accumulated with discount
        ``self.gamma``.

        Note that the episodes are played on ``self.env`` itself, so any episode that was in
        progress on that environment is lost.

        :param sess: session holding the network variables
        :param render: if True, render the environment before every step
        :param num_episodes: number of episodes to average over (default 100)
        :return: tuple ``(mean discounted reward, stddev of discounted reward,
                 mean episode length, stddev of episode length)``
        :raises ValueError: if ``num_episodes`` is smaller than 1
        """

        if num_episodes < 1:
            raise ValueError('num_episodes must be at least 1')

        # List of episode lengths for each episode (number of time-steps)
        ep_length = []
        # Discounted reward of each episode, averaged over all runs at the end
        c_reward = []

        for _ in range(num_episodes):

            step_num = 0
            ep_c_reward = 0

            # Create initial state by stacking the first frame four times
            current_frame = self.pre_process(self.env.reset())
            state = np.stack((current_frame,) * 4, axis=2)

            while True:

                state = np.reshape(state, [1, 28, 28, 4])

                if render:
                    self.env.render()

                q_values = sess.run(self.nn_main, feed_dict={self.X_main: state})
                action = np.argmax(q_values)
                next_observation, reward, is_done, _ = self.env.step(action)

                # Clip rewards
                if reward <= -0.5:
                    reward = -1
                elif reward > 0.5:
                    reward = 1
                else:
                    reward = 0

                ep_c_reward += (self.gamma ** step_num) * reward

                # Drop the oldest frame and append the newest one
                next_frame = self.pre_process(next_observation)
                state = np.append(state[0, :, :, 1:], np.reshape(next_frame, [28, 28, 1]),
                                  axis=2)
                step_num += 1

                if is_done:
                    # step_num already counts the step that ended the episode
                    ep_length.append(step_num)
                    c_reward.append(ep_c_reward)
                    break

        return np.mean(c_reward), np.std(c_reward), np.mean(ep_length), np.std(ep_length)

# In [None]:
def log(row, file):
    """
    Append one row to one of the two training CSV logs.

    The rows go to the module-level file objects ``csv_loss_file`` / ``csv_eval_file``, which must
    already be open when this is called.

    :param row: sequence of values written as a single CSV row
    :param file: ``'loss'`` selects the loss log; any other value selects the evaluation log
    :return: None
    """
    destination = csv_loss_file if file == 'loss' else csv_eval_file
    csv.writer(destination).writerow(row)


def save_model(sess):
    """
    Save the variables of the session's graph to ``MODEL_FOLDER + model_filename``.

    ``model_filename`` is a module-level name that has to be set before this is called.
    ``MODEL_FOLDER`` is created first if it does not exist yet.

    :param sess: session whose variables are saved
    :return: None
    """
    if not os.path.exists(MODEL_FOLDER):
        print('Creating path where to save model: ' + MODEL_FOLDER)

        # makedirs rather than mkdir: MODEL_FOLDER is a nested path and mkdir fails when the
        # parent folder is missing as well
        os.makedirs(MODEL_FOLDER)

    print('Saving model at: ' + model_filename)
    saver = tf.train.Saver(write_version=tf.train.SaverDef.V1)
    saver.save(sess, MODEL_FOLDER + model_filename)
    print('Model successfully saved.\n')


def load_model(sess, filename):
    """
    Restore saved variables into a session if the given file exists.

    :param sess: session to restore the variables into
    :param filename: path of the saved model
    :return: True if the model was restored, False if ``filename`` does not exist
    """
    # Nothing to restore: report it and let the caller decide what to do
    if not os.path.exists(filename):
        print('Model file <<' + filename + '>> does not exists!')
        return False

    print('\nLoading save model from: ' + filename)
    tf.train.Saver().restore(sess, filename)
    print('Model successfully loaded.\n')
    return True


if __name__ == '__main__':

    game = AtariSan()

    if isTRAIN:
        # model_filename, csv_loss_file and csv_eval_file are assigned at module level on purpose:
        # save_model() and log() read them as globals

        # Time-stamp that makes the log and model names of this run unique
        ts = datetime.datetime.now().strftime('%Y%m%d - %H%M-%S')
        game_name = games[GAME_NUM][:-3]  # Environment id without its '-v3' suffix

        csv_loss_filename = LOG_FOLDER + game_name + '_' + 'loss' + '_' + ts + '.csv'
        csv_eval_filename = LOG_FOLDER + game_name + '_' + 'eval' + '_' + ts + '.csv'

        model_filename = game_name + '_' + ts

        # Column names; they have to match the rows written by AtariSan.train()
        csv_loss_header = ['episode', 'total_steps', 'loss', 'discounted_reward_mavg']
        csv_eval_header = ['episode', 'total_steps', 'loss', 'reward_mean', 'reward_stddev',
                           'episode_length_mean', 'episode_length_stddev']

        # open() does not create missing folders
        if not os.path.exists(LOG_FOLDER):
            print('Creating path where to save logs: ' + LOG_FOLDER)
            os.makedirs(LOG_FOLDER)

        with open(csv_loss_filename, 'w') as csv_loss_file, \
                open(csv_eval_filename, 'w') as csv_eval_file:
            # Write headers to CSV files
            csv.writer(csv_loss_file).writerow(csv_loss_header)
            csv.writer(csv_eval_file).writerow(csv_eval_header)

            game.train()

    else:
        filename = MODEL_FOLDER + LOAD_MODEL_FILENAME

        with tf.Session() as sess:
            # Only evaluate when the variables were actually restored; otherwise the session
            # holds uninitialised variables
            if load_model(sess, filename):
                game.test(sess, render=True)

