In [None]:
import tensorflow as tf
import random
import time
import numpy as np
from collections import deque

from environment_creation import create_environment
import pg_network as pg
from frame_preprocessing import preprocess_frame

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Build the game handle and the list that maps an action index to the action
# passed to game.make_action().
game, possible_actions = create_environment()

# Number of consecutive frames stacked into one network input.
stack_size = 4

# Rolling window of the most recent frames, pre-filled with blank 84x84 frames.
# The builtin `int` replaces the `np.int` alias (deprecated in NumPy 1.20 and
# removed in 1.24); the resulting dtype is identical. The window length follows
# `stack_size` instead of repeating the literal 4.
stacked_frames = deque([np.zeros((84, 84), dtype=int) for _ in range(stack_size)], maxlen=stack_size)

In [None]:
def stack_frames(stacked_frames, state, is_new_episode):
    """Preprocess a raw frame and merge it into the rolling frame stack.

    Args:
        stacked_frames: deque holding the most recent preprocessed frames.
            It is only read/updated when ``is_new_episode`` is False; a new
            episode always starts from a fresh deque.
        state: raw frame handed to ``preprocess_frame``.
        is_new_episode: True for the first frame of an episode.

    Returns:
        (stacked_state, stacked_frames): the frames stacked along axis 2 and
        the deque that must be passed to the next call.
    """
    # assumes preprocess_frame returns a 2-D array (84x84 in this notebook) — TODO confirm
    frame = preprocess_frame(state)

    if is_new_episode:
        # A new episode has no history yet, so every slot of the stack is the
        # first frame. The previous `for i in (0,4)` looped over a 2-tuple and
        # therefore left half of the stack filled with blank frames.
        stacked_frames = deque([frame for _ in range(stack_size)], maxlen=stack_size)
    else:
        # Appending to a full deque automatically drops the oldest frame.
        stacked_frames.append(frame)

    # Build the stacked state with the frame index as the last axis.
    stacked_state = np.stack(stacked_frames, axis=2)
    return stacked_state, stacked_frames

In [None]:
def discount_and_normalize_rewards(episode_rewards, gamma):
    """Compute discounted returns for one episode and standardize them.

    Args:
        episode_rewards: sequence of per-step rewards for a single episode.
        gamma: discount rate applied to future rewards.

    Returns:
        Float array with one entry per step: the discounted return from that
        step onwards, shifted to zero mean and, when the returns are not all
        equal, scaled to unit standard deviation.
    """
    # Force a float buffer: a plain np.zeros_like would inherit an integer
    # dtype from integer rewards and silently truncate the discounted values.
    discounted_episode_rewards = np.zeros_like(episode_rewards, dtype=np.float64)
    cumulative = 0.0
    # Walk backwards so each step reuses the return of the step after it.
    for i in reversed(range(len(episode_rewards))):
        cumulative = cumulative * gamma + episode_rewards[i]
        discounted_episode_rewards[i] = cumulative

    if discounted_episode_rewards.size == 0:
        # Nothing to normalize; avoids mean/std of an empty array.
        return discounted_episode_rewards

    centered = discounted_episode_rewards - np.mean(discounted_episode_rewards)
    std = np.std(discounted_episode_rewards)
    if std > 0:
        return centered / std
    # All returns are equal (e.g. a one-step episode): dividing by a zero std
    # would yield NaN, so return the centered values (all zeros) instead.
    return centered

In [None]:
# ---------------------------------------------------------------------------
# Environment hyperparameters
# ---------------------------------------------------------------------------
stack_size = 4  # how many frames are stacked together
state_size = [84, 84, 4]  # network input: a stack of 4 frames, each 84x84
action_size = game.get_available_buttons_size()  # number of buttons the game exposes

# ---------------------------------------------------------------------------
# Deep learning model hyperparameters
# ---------------------------------------------------------------------------
gamma = 0.95  # discount rate
learning_rate = 0.002
batch_size = 1000
num_epochs = 1000

# The training cell only runs when this flag is True.
training = True

# Show the number of available actions as the cell output.
action_size

In [None]:
class PGNetwork:
    """Convolutional policy network trained with a policy-gradient loss.

    Inside ``tf.variable_scope(name)`` the constructor builds three
    conv -> batch-norm -> ELU stages, a 512-unit dense layer, a linear layer
    producing one logit per action, the softmax action distribution, the
    reward-weighted cross-entropy loss and an RMSProp training op.

    Args:
        state_size: per-sample input shape, unpacked after the batch
            dimension (this notebook uses [84, 84, 4]).
        action_size: number of actions; sets the width of both the `actions`
            placeholder and the logits layer.
        learning_rate: learning rate handed to the RMSProp optimizer.
        name: variable scope under which the graph is created.
    """

    def __init__(self, state_size, action_size, learning_rate, name='PGNetwork'):
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate

        with tf.variable_scope(name):
            with tf.name_scope("inputs"):
                # *state_size unpacks the per-sample shape, i.e. [None, 84, 84, 4]
                # for the state_size used in this notebook.
                self.inputs_ = tf.placeholder(tf.float32, [None, *state_size], name="inputs_")
                # One label row of length action_size per step
                # (presumably one-hot — verify against possible_actions).
                self.actions = tf.placeholder(tf.int32, [None, action_size], name="actions")
                self.discounted_episode_rewards_ = tf.placeholder(tf.float32, [None, ], name="discounted_episode_rewards_")

                # Placeholder that exists only so the mean reward can be
                # written to TensorBoard.
                self.mean_reward_ = tf.placeholder(tf.float32, name="mean_reward")

            # Output shapes noted below hold for an 84x84x4 input.
            with tf.name_scope("conv1"):
                self.conv1, self.conv1_batchnorm, self.conv1_out = self._conv_block(
                    self.inputs_, filters=32, kernel_size=[8, 8], strides=[4, 4], index=1)
                ## --> [20, 20, 32]

            with tf.name_scope("conv2"):
                self.conv2, self.conv2_batchnorm, self.conv2_out = self._conv_block(
                    self.conv1_out, filters=64, kernel_size=[4, 4], strides=[2, 2], index=2)
                ## --> [9, 9, 64]

            with tf.name_scope("conv3"):
                self.conv3, self.conv3_batchnorm, self.conv3_out = self._conv_block(
                    self.conv2_out, filters=128, kernel_size=[4, 4], strides=[2, 2], index=3)
                ## --> [3, 3, 128]

            with tf.name_scope("flatten"):
                self.flatten = tf.layers.flatten(self.conv3_out)
                ## --> [1152]

            with tf.name_scope("fc1"):
                self.fc = tf.layers.dense(inputs=self.flatten,
                                          units=512,
                                          activation=tf.nn.elu,
                                          kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                          name="fc1")

            with tf.name_scope("logits"):
                # One logit per action. This was hard-coded to 3, which only
                # matched the `actions` placeholder when action_size == 3.
                self.logits = tf.layers.dense(inputs=self.fc,
                                              kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                              units=action_size,
                                              activation=None)

            with tf.name_scope("softmax"):
                self.action_distribution = tf.nn.softmax(self.logits)

            with tf.name_scope("loss"):
                # Cross entropy between the logits and the chosen-action labels
                # gives the negative log-probability of the action taken;
                # weighting it by the discounted return yields the
                # policy-gradient loss.
                self.neg_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.logits, labels=self.actions)
                self.loss = tf.reduce_mean(self.neg_log_prob * self.discounted_episode_rewards_)

            with tf.name_scope("train"):
                self.train_opt = tf.train.RMSPropOptimizer(self.learning_rate).minimize(self.loss)

    @staticmethod
    def _conv_block(inputs, filters, kernel_size, strides, index):
        """Build one conv -> batch-norm -> ELU stage and return all three tensors.

        Layer and op names are derived from `index` ("conv1", "batch_norm1",
        "conv1_out", ...) so variable names stay the same as before the
        stages were factored out.
        """
        conv = tf.layers.conv2d(inputs=inputs,
                                filters=filters,
                                kernel_size=kernel_size,
                                strides=strides,
                                padding="VALID",
                                kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                name="conv{}".format(index))

        # NOTE(review): batch norm is always built with training=True, so the
        # same batch-statistics path is used when sampling actions — confirm
        # this is intended.
        batchnorm = tf.layers.batch_normalization(conv,
                                                  training=True,
                                                  epsilon=1e-5,
                                                  name='batch_norm{}'.format(index))

        out = tf.nn.elu(batchnorm, name="conv{}_out".format(index))
        return conv, batchnorm, out

In [None]:
# Clear the default graph before building the network.
tf.reset_default_graph()

# Instantiate the policy network from the hyperparameters defined above.
policy_gradient_net = PGNetwork(
    state_size=state_size,
    action_size=action_size,
    learning_rate=learning_rate,
)

In [None]:
# Op that initializes every global variable currently in the graph.
init = tf.global_variables_initializer()

# Open the session used by the rest of the notebook and run the initializer.
sess = tf.Session()
sess.run(init)

In [54]:
# Set up the TensorBoard writer.
writer = tf.summary.FileWriter("/tensorboard/pg/test")

# Loss of the network *instance*; the PGNetwork class itself has no `loss`
# attribute, so `PGNetwork.loss` raised AttributeError.
tf.summary.scalar("Loss", policy_gradient_net.loss)

# Mean reward of the batch, fed through the dedicated placeholder.
tf.summary.scalar("Reward_mean", policy_gradient_net.mean_reward_)

# Single op that evaluates every summary registered above; the training loop
# runs it and hands the result to `writer`.
write_op = tf.summary.merge_all()

In [None]:
# Now it is time to train the agent.

In [None]:
def make_batch(batch_size, stacked_frames):
    """Play whole episodes until more than `batch_size` steps are collected.

    Actions are sampled from the current policy (`policy_gradient_net`, run
    through `sess`). Episodes are never cut short, so the stop condition is
    only checked when an episode finishes and the batch may hold more than
    `batch_size` steps.

    Args:
        batch_size: number of steps that must be exceeded before stopping.
        stacked_frames: frame deque handed to `stack_frames`.

    Returns:
        Tuple of
        - stacked array of the states that were fed to the network,
        - stacked array of the actions taken,
        - concatenated raw rewards of every step in the batch,
        - concatenated discounted-and-normalized returns (per episode),
        - number of episodes played.
    """
    states, actions, rewards_of_episode, rewards_of_batch, discounted_rewards = [], [], [], [], []

    # Keep track of how many episodes the batch contains.
    episode_num = 1

    # Launch a new episode and build its first stacked state.
    game.new_episode()
    state = game.get_state().screen_buffer
    state, stacked_frames = stack_frames(stacked_frames, state, True)

    while True:
        # Run the state through the policy network instance (the PGNetwork
        # class has no graph tensors attached to it).
        action_probability_distribution = sess.run(
            policy_gradient_net.action_distribution,
            feed_dict={policy_gradient_net.inputs_: state.reshape(1, *state_size)})

        # Stochastic policy: sample an action index according to the predicted
        # probabilities. `p` belongs to np.random.choice, not to range().
        action_index = np.random.choice(
            range(action_probability_distribution.shape[1]),
            p=action_probability_distribution.ravel())
        action = possible_actions[action_index]

        # Perform the action.
        reward = game.make_action(action)
        done = game.is_episode_finished()

        # Store the transition.
        states.append(state)
        actions.append(action)
        rewards_of_episode.append(reward)

        if done:
            # The blank terminal frame that used to be stacked here was never
            # read: the stack is rebuilt for the next episode or the loop ends.
            rewards_of_batch.append(rewards_of_episode)

            # Discounted returns of the finished episode.
            discounted_rewards.append(discount_and_normalize_rewards(rewards_of_episode, gamma))

            # Monte Carlo: only complete episodes are used, so the batch-size
            # check happens here rather than at every step. `states` holds one
            # entry per reward collected so far.
            if len(states) > batch_size:
                break

            # Reset the per-episode store and start the next episode.
            rewards_of_episode = []
            episode_num += 1

            game.new_episode()
            state = game.get_state().screen_buffer
            state, stacked_frames = stack_frames(stacked_frames, state, True)
        else:
            # Not done: the next state becomes the current one.
            next_state = game.get_state().screen_buffer
            next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
            state = next_state

    # Return only after the loop; returning from inside it ended the batch
    # after a single step, before any episode reward had been recorded.
    return (np.stack(np.array(states)),
            np.stack(np.array(actions)),
            np.concatenate(rewards_of_batch),
            np.concatenate(discounted_rewards),
            episode_num)

- Create the Neural Network
- Init the weights
- Init the environment
- maxReward = 0 #keep track of max reward
- for epochs in range(num_epochs):
    - get batches
    - optimize

In [None]:
# Running statistics collected across epochs.
allRewards = []

total_rewards = 0
maximumRewardsRecorded = 0
mean_reward_total = []
epoch = 1
average_reward = []

saver = tf.train.Saver()

if training:
    # To resume from a checkpoint, uncomment:
    # saver.restore(sess, "./models/model.ckpt")

    while epoch < num_epochs + 1:
        # Gather a batch of complete episodes played with the current policy.
        states_mb, actions_mb, rewards_of_batch, discounted_rewards_mb, nb_episodes_mb = make_batch(batch_size, stacked_frames)

        ### Analytics
        # Total reward of the batch.
        total_reward_of_that_batch = np.sum(rewards_of_batch)
        allRewards.append(total_reward_of_that_batch)

        # Mean reward per episode in the batch.
        mean_reward_of_that_batch = np.divide(total_reward_of_that_batch, nb_episodes_mb)
        mean_reward_total.append(mean_reward_of_that_batch)

        # Average of the per-batch means over all epochs so far.
        average_reward_of_all_training = np.divide(np.sum(mean_reward_total), epoch)

        # Highest batch total seen so far.
        maximumRewardsRecorded = np.amax(allRewards)

        print("==========================================")
        print("Epoch: ", epoch, "/", num_epochs)
        print("-----------")
        print("Number of training episodes: {}".format(nb_episodes_mb))
        print("Total reward: {}".format(total_reward_of_that_batch))
        print("Mean Reward of that batch {}".format(mean_reward_of_that_batch))
        print("Average Reward of all training: {}".format(average_reward_of_all_training))
        # The name printed here was misspelled (`maximumRewardRecorded`) and
        # raised NameError on the first epoch.
        print("Max reward for a batch so far: {}".format(maximumRewardsRecorded))

        # Inputs shared by the training step and the summary step; the shape
        # follows state_size instead of repeating the literal 84, 84, 4.
        train_feed = {
            policy_gradient_net.inputs_: states_mb.reshape((len(states_mb), *state_size)),
            policy_gradient_net.actions: actions_mb,
            policy_gradient_net.discounted_episode_rewards_: discounted_rewards_mb,
        }

        # Feedforward, gradient and backpropagation.
        loss_, _ = sess.run([policy_gradient_net.loss, policy_gradient_net.train_opt], feed_dict=train_feed)
        print("Training Loss: {}".format(loss_))

        # Write the TensorBoard summaries; the summary step additionally needs
        # the mean reward placeholder.
        summary_feed = dict(train_feed)
        summary_feed[policy_gradient_net.mean_reward_] = mean_reward_of_that_batch
        summary = sess.run(write_op, feed_dict=summary_feed)

        writer.add_summary(summary, epoch)
        writer.flush()

        # Save the model every 10 epochs.
        if epoch % 10 == 0:
            saver.save(sess, "./models/model.ckpt")
            print("Model saved")
        epoch += 1