In [11]:
import tensorflow as tf
import random
import time
import numpy as np
from collections import deque

from environment_creation import create_environment
import policy_gradient_network as pgn
from frame_preprocessing import preprocess_frame
from frames_stacking import stack_frames 

import warnings
warnings.filterwarnings('ignore')

In [12]:
# Build the game and the list of actions the agent may pick from
# (both come from the project helper `create_environment`).
game, possible_actions = create_environment()

# Number of consecutive frames stacked together to form one network input.
stack_size = 4

# Start with a deque of blank 84x84 frames. The builtin `int` is used instead of
# the `np.int` alias, which was only ever an alias for it and has been removed
# from recent NumPy releases. `maxlen` follows `stack_size` so the two cannot drift apart.
stacked_frames = deque([np.zeros((84, 84), dtype=int) for _ in range(stack_size)], maxlen=stack_size)

In [13]:
def discount_and_normalize_rewards(episode_rewards, gamma):
    """Turn the per-step rewards of one episode into normalized discounted returns.

    For every step ``i`` the discounted return is
    ``G_i = r_i + gamma * r_{i+1} + gamma**2 * r_{i+2} + ...``.
    The returns are then shifted to zero mean and, when possible, scaled to
    unit standard deviation.

    Args:
        episode_rewards: sequence (list or array) of numeric rewards, one per step.
        gamma: discount factor applied to future rewards.

    Returns:
        A float64 NumPy array with the same length as ``episode_rewards``.
        An empty input yields an empty array. When all discounted returns are
        identical (e.g. a single-step episode) the result is all zeros instead
        of NaN/inf.
    """
    # Always work in floating point: an integer reward list would otherwise make
    # the accumulator array integer-typed and silently truncate the returns.
    rewards = np.asarray(episode_rewards, dtype=np.float64)
    discounted_episode_rewards = np.zeros_like(rewards)

    if rewards.size == 0:
        return discounted_episode_rewards

    # Walk backwards so each step reuses the already discounted future return.
    cumulative = 0.0
    for i in reversed(range(len(rewards))):
        cumulative = cumulative * gamma + rewards[i]
        discounted_episode_rewards[i] = cumulative

    discounted_episode_rewards -= np.mean(discounted_episode_rewards)

    # Guard against division by zero: with no spread there is nothing to scale.
    std = np.std(discounted_episode_rewards)
    if std > 0:
        discounted_episode_rewards /= std

    return discounted_episode_rewards

In [14]:
####################################
### Environment hyperparameters
state_size = [84, 84, 4] # network input: a stack of 4 frames, each 84x84
action_size = game.get_available_buttons_size() # number of buttons the game exposes (3 here: turn left, turn right, go forward)
stack_size = 4 # how many frames are stacked together (same value as set when the frame deque was created)

### Deep learning model hyperparameters
learning_rate = 0.002 # passed to the PGNetwork constructor
num_epochs = 1000 # number of training iterations; each one collects one batch and runs one optimizer step

batch_size = 2000 # make_batch stops at the first episode end where more than this many steps were collected
gamma = 0.95 # discount rate applied to future rewards

training = True # when False the training loop cell does nothing
######################################
# Display the action count as the cell output (sanity check).
action_size

3

In [15]:
# Start from an empty default graph so that re-running this cell does not
# add a second copy of the network to the graph built by an earlier run.
tf.reset_default_graph()

# Build the policy-gradient network from the project module `policy_gradient_network`.
# Later cells rely on its attributes: inputs_, actions, discounted_episode_rewards_,
# mean_reward_, action_distribution, loss and train_opt.
PGNetwork = pgn.PGNetwork(state_size, action_size, learning_rate)

In [16]:
# Open the TF1 session shared by make_batch and the training loop.
# NOTE(review): the session is never closed in the visible cells — call sess.close() when done.
sess = tf.Session()
# Initialize every variable of the graph built above; must run before any other sess.run.
init = tf.global_variables_initializer()
sess.run(init)

In [17]:
# Set up the TensorBoard writer.
# NOTE(review): this is an absolute path at the filesystem root; a relative
# directory (e.g. "./tensorboard/pg/test") is probably what was intended — confirm.
writer = tf.summary.FileWriter("/tensorboard/pg/test")

# Scalar summary for the training loss.
tf.summary.scalar("Loss", PGNetwork.loss)

# Scalar summary for the mean episode reward of a batch; the value is fed in
# through PGNetwork.mean_reward_ by the training loop.
tf.summary.scalar("Reward_mean", PGNetwork.mean_reward_ )

# Single op that evaluates every summary registered so far.
write_op = tf.summary.merge_all()

In [18]:
#now is the time to train the agent

Here we'll create batches.
Each batch contains whole episodes; how many depends on how many rewards (steps) each episode yields. For instance, if every episode has only 10 rewards, a batch can hold about batch_size/10 episodes.

    Make a batch
        For each step:
            Choose action a
            Perform action a
            Store s, a, r
            If done:
                Calculate sum reward
                Calculate gamma Gt



In [19]:
def make_batch(batch_size, stacked_frames):
    """Play whole episodes with the current policy until enough steps are collected.

    Actions are sampled from the network's action distribution (stochastic
    policy). Because the returns are Monte Carlo estimates, the batch is only
    closed at an episode boundary: collection stops at the first episode end
    where more than ``batch_size`` steps have been gathered.

    Relies on the notebook globals ``game``, ``sess``, ``PGNetwork``,
    ``possible_actions``, ``state_size``, ``stack_size`` and ``gamma``.

    Args:
        batch_size: minimum number of steps to exceed before stopping.
        stacked_frames: frame deque handed to ``stack_frames`` for the first state.

    Returns:
        A tuple ``(states, actions, rewards, discounted_rewards, episode_num)``:
        the stacked states and chosen actions as arrays with one row per step,
        the raw rewards of all episodes concatenated, the per-episode
        discounted-and-normalized rewards concatenated, and the number of
        episodes played.
    """
    states, actions, rewards_of_episode, rewards_of_batch, discounted_rewards = [], [], [], [], []

    # Number of episodes in this batch (needed for the average reward per episode).
    episode_num = 1

    # Launch a new episode and build its first stacked state.
    game.new_episode()
    state = game.get_state().screen_buffer
    state, stacked_frames = stack_frames(stacked_frames, state, True, stack_size)

    while True:
        # Run the state through the policy to get the action probabilities.
        action_probability_distribution = sess.run(
            PGNetwork.action_distribution,
            feed_dict={PGNetwork.inputs_: state.reshape(1, *state_size)})

        # Stochastic policy: sample an action according to its probability
        # rather than always taking the most likely one.
        action = np.random.choice(range(action_probability_distribution.shape[1]),
                                  p=action_probability_distribution.ravel())
        action = possible_actions[action]

        # Perform the action.
        reward = game.make_action(action)
        done = game.is_episode_finished()

        # Store the transition.
        states.append(state)
        actions.append(action)
        rewards_of_episode.append(reward)

        if done:
            # The episode ended, so there is no real next state: push a blank frame.
            # The builtin `int` replaces the removed `np.int` alias.
            next_state = np.zeros((84, 84), dtype=int)
            next_state, stacked_frames = stack_frames(stacked_frames, next_state, False, stack_size)

            # Record the finished episode's rewards and its discounted returns.
            rewards_of_batch.append(rewards_of_episode)
            discounted_rewards.append(discount_and_normalize_rewards(rewards_of_episode, gamma))

            # One state is stored per reward, and every stored step now belongs
            # to a finished episode, so len(states) equals the total number of
            # rewards in the batch without concatenating them.
            if len(states) > batch_size:
                break

            # Reset the per-episode store and start the next episode.
            rewards_of_episode = []
            episode_num += 1

            game.new_episode()
            state = game.get_state().screen_buffer
            state, stacked_frames = stack_frames(stacked_frames, state, True, stack_size)

        else:
            # Not done: the next state becomes the current state.
            next_state = game.get_state().screen_buffer
            next_state, stacked_frames = stack_frames(stacked_frames, next_state, False, stack_size)
            state = next_state

    return (np.array(states),
            np.array(actions),
            np.concatenate(rewards_of_batch),
            np.concatenate(discounted_rewards),
            episode_num)

- Create the Neural Network
- Init the weights
- Init the environment
- maxReward = 0 #keep track of max reward
- for epochs in range(num_epochs):
    - get batches
    - optimize

In [None]:
# Total reward of every batch seen so far
allRewards = []

total_rewards = 0
maximumRewardRecorded = 0
# Mean episode reward of every batch seen so far
mean_reward_total = []
epoch = 1
average_reward = []

# Saver used to checkpoint the model during training
saver = tf.train.Saver()

if training:
    # To resume from a previous checkpoint, uncomment:
    #saver.restore(sess, "./models/model.ckpt")

    while epoch < num_epochs + 1:
        # Gather training data: whole episodes played with the current policy
        states_mb, actions_mb, rewards_of_batch, discounted_rewards_mb, nb_episodes_mb = make_batch(batch_size, stacked_frames)

        ### This part is only used for analytics
        # Total reward of the batch
        total_reward_of_that_batch = np.sum(rewards_of_batch)
        allRewards.append(total_reward_of_that_batch)

        # Mean reward per episode: total reward of the batch / number of episodes in it
        mean_reward_of_that_batch = np.divide(total_reward_of_that_batch, nb_episodes_mb)
        mean_reward_total.append(mean_reward_of_that_batch)

        # Average of the per-batch mean rewards over all epochs so far
        average_reward_of_all_training = np.divide(np.sum(mean_reward_total), epoch)

        # Highest total batch reward recorded so far
        maximumRewardRecorded = np.amax(allRewards)

        print("==========================================")
        print("Epoch: ", epoch, "/", num_epochs)
        print("-----------")
        print("Number of training episodes: {}".format(nb_episodes_mb))
        print("Total reward: {}".format(total_reward_of_that_batch))
        print("Mean Reward of that batch {}".format(mean_reward_of_that_batch))
        print("Average Reward of all training: {}".format(average_reward_of_all_training))
        print("Max reward for a batch so far: {}".format(maximumRewardRecorded))

        # Build the feed once; the input shape follows state_size instead of a
        # hard-coded copy of it, matching how make_batch reshapes single states.
        train_feed = {PGNetwork.inputs_: states_mb.reshape(len(states_mb), *state_size),
                      PGNetwork.actions: actions_mb,
                      PGNetwork.discounted_episode_rewards_: discounted_rewards_mb}

        # Feedforward, gradient and backpropagation
        loss_, _ = sess.run([PGNetwork.loss, PGNetwork.train_opt], feed_dict=train_feed)

        print("Training Loss: {}".format(loss_))

        # Write TF summaries: same inputs plus the batch's mean reward.
        # This runs after the optimizer step, as before.
        summary_feed = dict(train_feed)
        summary_feed[PGNetwork.mean_reward_] = mean_reward_of_that_batch
        summary = sess.run(write_op, feed_dict=summary_feed)

        writer.add_summary(summary, epoch)
        writer.flush()

        # Save the model every 10 epochs
        if epoch % 10 == 0:
            saver.save(sess, "./models/model.ckpt")
            print("Model saved")
        epoch += 1

Epoch:  1 / 1000
-----------
Number of training episodes: 4
Total reward: 1840.0
Mean Reward of that batch 460.0
Average Reward of all training: 460.0
Max reward for a batch so far: 1840.0
Training Loss: -0.0076517038978636265
Epoch:  2 / 1000
-----------
Number of training episodes: 4
Total reward: 1680.0
Mean Reward of that batch 420.0
Average Reward of all training: 440.0
Max reward for a batch so far: 1840.0
Training Loss: -0.010891149751842022
Epoch:  3 / 1000
-----------
Number of training episodes: 4
Total reward: 1840.0
Mean Reward of that batch 460.0
Average Reward of all training: 446.6666666666667
Max reward for a batch so far: 1840.0
Training Loss: -0.014711610041558743
Epoch:  4 / 1000
-----------
Number of training episodes: 4
Total reward: 1968.0
Mean Reward of that batch 492.0
Average Reward of all training: 458.0
Max reward for a batch so far: 1968.0
Training Loss: -0.0057493350468575954
Epoch:  5 / 1000
-----------
Number of training episodes: 4
Total reward: 1904.0
M

Training Loss: 0.013952933251857758
Model saved
Epoch:  31 / 1000
-----------
Number of training episodes: 4
Total reward: 1712.0
Mean Reward of that batch 428.0
Average Reward of all training: 485.51397849462364
Max reward for a batch so far: 2360.0
Training Loss: -0.13070173561573029
Epoch:  32 / 1000
-----------
Number of training episodes: 4
Total reward: 2064.0
Mean Reward of that batch 516.0
Average Reward of all training: 486.4666666666667
Max reward for a batch so far: 2360.0
Training Loss: 0.007631264626979828
Epoch:  33 / 1000
-----------
Number of training episodes: 3
Total reward: 1780.0
Mean Reward of that batch 593.3333333333334
Average Reward of all training: 489.70505050505056
Max reward for a batch so far: 2360.0
Training Loss: 0.036599233746528625
Epoch:  34 / 1000
-----------
Number of training episodes: 4
Total reward: 2320.0
Mean Reward of that batch 580.0
Average Reward of all training: 492.3607843137256
Max reward for a batch so far: 2360.0
Training Loss: -0.0027