In [52]:
import tensorflow as tf      # Deep Learning library
import numpy as np           # Handle matrices
from vizdoom import *        # Doom Environment

import random                # Handling random number generation
import time                  # Handling time calculation
from skimage import transform# Help us to preprocess the frames

import environment_creation as ec
import frame_preprocessing as fp
import frames_stacking as fs
import memory as mem

from collections import deque# Ordered collection with ends
import matplotlib.pyplot as plt # Display graphs

import warnings # This ignore all the warning messages that are normally printed during the training because of skiimage
warnings.filterwarnings('ignore')

In [40]:
game, possible_actions = ec.create_environment()

In [41]:
#frame preprocessing

In [42]:
#stack_frames
stack_size = 4
stacked_frames = deque([np.zeros((84, 84), dtype = np.int) for i in range(stack_size)], maxlen = 4)

In [43]:
#model hyperparameters
state_size = [84, 84, 4]
action_size = game.get_available_buttons_size()
learning_rate = 0.0002

#training hyperparameters
total_episodes = 500
max_steps = 100
batch_size = 64

#exploration/epsilon greedy strategy parameters
explore_start = 1.0
explore_stop = 0.01
decay_rate = 0.0001

#q leraning hyperparameters
gamma = 0.95 #discount rate

###memory hyperparameters
pretrain_length = batch_size #number of experiences stored in memory when initialized for the first time
memory_size = 100000 #no of experiences memory can keep

training = True
episode_render = False

stacked_frames = deque([np.zeros((84, 84), dtype = np.int) for i in range(stack_size)], maxlen = 4)

In [44]:
class DeepQNetwork:
    def __init__(self, state_size, action_size, learning_rate, name = "DeepQNetwork"):
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate
        
        with tf.variable_scope(name):
            #here we create the placeholders
            
            #[None, 84, 84, 4]
            self.inputs_ = tf.placeholder(tf.float32, [None, *state_size], name="inputs")
            self.actions_ = tf.placeholder(tf.float32, [None, 3], name = "actions_")
        
            #target_Q is the R(s, a) + ymax Q^(s', a')
            self.target_Q = tf.placeholder(tf.float32, [None], name = "targetQ")
            
            # Input is 84x84x4
            self.conv1 = tf.layers.conv2d(inputs = self.inputs_,
                                         filters = 32,
                                         kernel_size = [8,8],
                                         strides = [4,4],
                                         padding = "VALID",
                                          kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                         name = "conv1")
            
            self.conv1_batchnorm = tf.layers.batch_normalization(self.conv1,
                                                   training = True,
                                                   epsilon = 1e-5,
                                                     name = 'batch_norm1')
            
            self.conv1_out = tf.nn.elu(self.conv1_batchnorm, name="conv1_out")
            ## --> [20, 20, 32]
            
            #second
            self.conv2 = tf.layers.conv2d(inputs = self.conv1_out,
                                 filters = 64,
                                 kernel_size = [4,4],
                                 strides = [2,2],
                                 padding = "VALID",
                                kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                 name = "conv2")
        
            self.conv2_batchnorm = tf.layers.batch_normalization(self.conv2,
                                                   training = True,
                                                   epsilon = 1e-5,
                                                     name = 'batch_norm2')

            self.conv2_out = tf.nn.elu(self.conv2_batchnorm, name="conv2_out")
            ## --> [9, 9, 64]
            
            #thirds
            self.conv3 = tf.layers.conv2d(inputs = self.conv2_out,
                                 filters = 128,
                                 kernel_size = [4,4],
                                 strides = [2,2],
                                 padding = "VALID",
                                kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                 name = "conv3")
        
            self.conv3_batchnorm = tf.layers.batch_normalization(self.conv3,
                                                   training = True,
                                                   epsilon = 1e-5,
                                                     name = 'batch_norm3')

            self.conv3_out = tf.nn.elu(self.conv3_batchnorm, name="conv3_out")
            ## --> [3, 3, 128]
            
            self.flatten = tf.layers.flatten(self.conv3_out)
            ## --> [1152]
            
            self.fc = tf.layers.dense(inputs = self.flatten,
                                      units = 512,
                                      activation = tf.nn.elu,
                                      kernel_initializer = tf.contrib.layers.xavier_initializer(),
                                      name = 'fc1')
            
            self.output = tf.layers.dense(inputs = self.fc, 
                                           kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                          units = 3, 
                                        activation=None)
            
            #Q is our predicted Q value
            self.Q = tf.reduce_sum(tf.multiply(self.output, self.actions_), axis = 1)
            
            #loss is the difference between predicted Q value and Q_target
            # sum(q_target - q)^2
            self.loss = tf.reduce_mean(tf.square(self.target_Q - self.Q))
            
            self.optimizer = tf.train.RMSPropOptimizer(self.learning_rate).minimize(self.loss)

In [45]:
tf.reset_default_graph()

DeepQNetwork = DeepQNetwork(state_size, action_size, learning_rate)

In [49]:
stack_size = 4 # We stack 4 frames

# Initialize deque with zero-images one array for each image
stacked_frames  =  deque([np.zeros((84,84), dtype=np.int) for i in range(stack_size)], maxlen=4) 

def stack_frames(stacked_frames, state, is_new_episode):
    # Preprocess frame
    frame = fp.preprocess_frame(state)
    
    if is_new_episode:
        # Clear our stacked_frames
        stacked_frames = deque([np.zeros((84,84), dtype=np.int) for i in range(stack_size)], maxlen=4)
        
        # Because we're in a new episode, copy the same frame 4x
        stacked_frames.append(frame)
        stacked_frames.append(frame)
        stacked_frames.append(frame)
        stacked_frames.append(frame)
        
        # Stack the frames
        stacked_state = np.stack(stacked_frames, axis=2)
        
    else:
        # Append frame to deque, automatically removes the oldest frame
        stacked_frames.append(frame)

        # Build the stacked state (first dimension specifies different frames)
        stacked_state = np.stack(stacked_frames, axis=2) 
    
    return stacked_state, stacked_frames

In [56]:
class Memory():
    def __init__(self, max_size):
        self.buffer = deque(maxlen = max_size)
    
    def add(self, experience):
        self.buffer.append(experience)
    
    def sample(self, batch_size):
        buffer_size = len(self.buffer)
        index = np.random.choice(np.arange(buffer_size),
                                size = batch_size,
                                replace = False)
        
        return [self.buffer[i] for i in index]

In [57]:
#create the memory class
#instantiate the memory
memory = Memory(max_size = memory_size)

#now dealing with the empty memory problem - prepopulating memory by taking random actions
#and storing experience (state, action, reward, new_state)

game.new_episode()
for i in range(pretrain_length):
    #if it's the first step
    if i==0:
        state = game.get_state().screen_buffer
        state, stacked_frames = stack_frames(stacked_frames, state, True)
    #take random action
    action = random.choice(possible_actions)
   
    #get rewardd
    reward = game.make_action(action)
    
    #look if episode is finished
    done = game.is_episode_finished()
    
    if done:
        next_state = np.zeros(state.shape)
        
        #add experience to memory
        memory.add((state, action, reward, next_state, done))
        
        #start a new episode
        game.new_episode()
        
        state = game.get_state().screen_buffer
        state, stacked_frames =  stack_frames(stacked_frames, state, True)
    else:
        next_state = game.get_state().screen_buffer
        next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
        
        #add experience to memory
        memory.add((state, action, reward, next_state, done))
        
        #our state is now the next state
        state = next_state    

In [58]:
#setup tensorboard writer
writer = tf.summary.FileWriter("/tensorboard/dqn/1")

tf.summary.scalar("Loss", DeepQNetwork.loss)

write_op = tf.summary.merge_all()

In [59]:
#init the weights
#init the environment
#init decay rate

#the following function does the epsilon-greedy strategy part - predicts action
def predict_action(explore_start, explore_top, decay_rate, decay_step, state, actions):
    exp_exp_tradeoff = np.random.rand()
    
    explore_probability = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step)
    
    if(explore_probability > exp_exp_tradeoff):
        action = random.choice(possible_actions)
    else:
        #get action from q-network (exploitation)
        Qs = sess.run(DeepQNetwork.output, feed_dict = {DeepQNetwork.inputs_: state.reshape((1, *state.shape))})
        
        #take the biggest Q value (= the best action)
        choice = np.argmax(Qs)
        action = possible_actions[int(choice)]
    
    return action, explore_probability

In [64]:
saver = tf.train.Saver()

if training == True:
    with tf.Session() as sess:
        #init the variables
        sess.run(tf.global_variables_initializer())
        
        #init the decay rate that reduces the epsilon
        decay_step = 0
        
        game.init()
        for episode in range(total_episodes):
            step = 0
            
            episode_rewards = []
            
            game.new_episode()
            state = game.get_state().screen_buffer
            state, stacked_frames = stack_frames(stacked_frames, state, True)
            
            while step < max_steps:
                step+=1                
                decay_step +=1
                
                #predict the action to take and take it
                action, explore_probability = predict_action(explore_start, explore_stop, decay_rate, decay_step, state, possible_actions)
                
                reward = game.make_action(action)
                done = game.is_episode_finished()
                
                episode_rewards.append(reward)
                
                if done:
                    next_state = np.zeros((84, 84), dtype = np.int)
                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
                    
                    step = max_steps
                    
                    total_reward = np.sum(episode_rewards)
                    
                    print('Episode: {}'.format(episode), 'Total reward: {}'.format(total_reward), 'Training loss: {:.4f}'.format(loss),
                                                       'Explore P: {:.4f}'.format(explore_probability))
                    
                    memory.add((state, action, reward, next_state, done))
                else:
                    next_state = game.get_state().screen_buffer                    
                    next_state, stacked_frames = stack_frames(stacked_frames,next_state, False)
                    
                    memory.add((state, action, reward, next_state, done))                   
                    state = next_state
                
                #learning part
                #obtain random minibatch from memory
                batch = memory.sample(batch_size)
                states_mb = np.array([each[0] for each in batch], ndmin = 3)
                actions_mb = np.array([each[1] for each in batch])
                rewards_mb = np.array([each[2] for each in batch])
                next_states_mb = np.array([each[3] for each in batch])
                dones_mb = np.array([each[4] for each in batch])
                
                target_Qs_batch = []
                
                Qs_next_state_vals = sess.run(DeepQNetwork.output, feed_dict = {DeepQNetwork.inputs_: next_states_mb})
                
                #set q_target = r if the episode ends at s+1, otherwise q_target = r + gamma*maxQ(s', a')
                
                for i in range(0, len(batch)):
                    terminal = dones_mb[i]
                    
                    if terminal:
                        target_Qs_batch.append(rewards_mb[i])
                    else:
                        target = rewards_mb[i] + gamma*np.max(Qs_next_state_vals[i])
                        target_Qs_batch.append(target)
                        
                targets_mb = np.array([each for each in target_Qs_batch])
                
                loss, _ = sess.run([DeepQNetwork.loss, DeepQNetwork.optimizer],
                                  feed_dict = {DeepQNetwork.inputs_: states_mb,
                                              DeepQNetwork.target_Q: targets_mb, 
                                              DeepQNetwork.actions_: actions_mb})
                
                #write tensorflow summaries
                summary = sess.run(write_op, feed_dict = {DeepQNetwork.inputs_: states_mb,
                                                         DeepQNetwork.target_Q: targets_mb,
                                                         DeepQNetwork.actions_: actions_mb})
                writer.add_summary(summary, episode)
                writer.flush()
                
            if episode % 5 == 0:
                save_path = saver.save(sess, "./models/model.ckpt")
                print("Model saved.")              
                

Model saved.
Episode: 2 Total reward: 94.0 Training loss: 5.3815 Explore P: 1.0000
Episode: 4 Total reward: 90.0 Training loss: 123.9396 Explore P: 1.0000
Model saved.
Episode: 6 Total reward: 89.0 Training loss: 6.9530 Explore P: 1.0000
Episode: 7 Total reward: 95.0 Training loss: 17.1589 Explore P: 1.0000
Episode: 8 Total reward: 75.0 Training loss: 22.6181 Explore P: 1.0000
Episode: 10 Total reward: 95.0 Training loss: 16.6917 Explore P: 1.0000
Model saved.
Episode: 11 Total reward: 94.0 Training loss: 6.1417 Explore P: 1.0000
Episode: 13 Total reward: 92.0 Training loss: 8.4667 Explore P: 1.0000
Episode: 15 Total reward: 95.0 Training loss: 7.5991 Explore P: 1.0000
Model saved.
Episode: 17 Total reward: -23.0 Training loss: 5.8741 Explore P: 1.0000
Episode: 19 Total reward: -16.0 Training loss: 6.7965 Explore P: 1.0000
Model saved.
Episode: 21 Total reward: 71.0 Training loss: 9.0128 Explore P: 1.0000
Episode: 22 Total reward: 92.0 Training loss: 5.6467 Explore P: 1.0000
Episode: 2

Episode: 188 Total reward: 90.0 Training loss: 6.1995 Explore P: 1.0000
Episode: 189 Total reward: 94.0 Training loss: 2.7730 Explore P: 1.0000
Episode: 190 Total reward: 17.0 Training loss: 6.4069 Explore P: 1.0000
Model saved.
Episode: 191 Total reward: 95.0 Training loss: 2.9121 Explore P: 1.0000
Model saved.
Episode: 197 Total reward: 91.0 Training loss: 2.9257 Explore P: 1.0000
Episode: 198 Total reward: 92.0 Training loss: 4.3747 Explore P: 1.0000
Episode: 199 Total reward: 93.0 Training loss: 2.3052 Explore P: 1.0000
Episode: 200 Total reward: 94.0 Training loss: 5.2764 Explore P: 1.0000
Model saved.
Episode: 202 Total reward: 94.0 Training loss: 4.3524 Explore P: 1.0000
Model saved.
Episode: 206 Total reward: 94.0 Training loss: 3.7801 Explore P: 1.0000
Episode: 208 Total reward: 19.0 Training loss: 4.1573 Explore P: 1.0000
Episode: 210 Total reward: 94.0 Training loss: 3.5854 Explore P: 1.0000
Model saved.
Episode: 211 Total reward: 95.0 Training loss: 8.3586 Explore P: 1.0000

Episode: 362 Total reward: -23.0 Training loss: 4.9420 Explore P: 1.0000
Episode: 363 Total reward: 24.0 Training loss: 2.8818 Explore P: 1.0000
Episode: 365 Total reward: 94.0 Training loss: 5.8920 Explore P: 1.0000
Model saved.
Episode: 366 Total reward: 94.0 Training loss: 1.4393 Explore P: 1.0000
Model saved.
Episode: 371 Total reward: 95.0 Training loss: 2.3773 Explore P: 1.0000
Episode: 373 Total reward: 92.0 Training loss: 3.1392 Explore P: 1.0000
Episode: 374 Total reward: 95.0 Training loss: 2.3877 Explore P: 1.0000
Episode: 375 Total reward: 92.0 Training loss: 3.9482 Explore P: 1.0000
Model saved.
Episode: 377 Total reward: 27.0 Training loss: 2.3631 Explore P: 1.0000
Episode: 378 Total reward: 69.0 Training loss: 2.0178 Explore P: 1.0000
Model saved.
Episode: 381 Total reward: 72.0 Training loss: 3.4088 Explore P: 1.0000
Model saved.
Episode: 386 Total reward: 95.0 Training loss: 4.4144 Explore P: 1.0000
Episode: 387 Total reward: 94.0 Training loss: 3.1238 Explore P: 1.000

In [69]:
with tf.Session() as sess:
    
    game, possible_actions = ec.create_environment()
    
    totalScore = 0
    
    # Load the model
    saver.restore(sess, "./models/model.ckpt")
    game.init()
    for i in range(1):
        
        done = False
        
        game.new_episode()
        
        state = game.get_state().screen_buffer
        state, stacked_frames = stack_frames(stacked_frames, state, True)
            
        while not game.is_episode_finished():
            # Take the biggest Q value (= the best action)
            Qs = sess.run(DeepQNetwork.output, feed_dict = {DeepQNetwork.inputs_: state.reshape((1, *state.shape))})
            
            # Take the biggest Q value (= the best action)
            choice = np.argmax(Qs)
            action = possible_actions[int(choice)]
            
            game.make_action(action)
            done = game.is_episode_finished()
            score = game.get_total_reward()
            
            if done:
                break  
                
            else:
                print("else")
                next_state = game.get_state().screen_buffer
                next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
                state = next_state
                
        score = game.get_total_reward()
        print("Score: ", score)
    game.close()

INFO:tensorflow:Restoring parameters from ./models/model.ckpt
else
else
else
else
else
else
else
else
else
else
else
else
else
else
else
else
else
else
else
else
else
else
Score:  73.0
