# Doom with Deep Q-Learning
Based on [Thomas Simomini's example](https://github.com/simoninithomas/Deep_reinforcement_learning_Course)

## Step 1: Import the libraries

In [1]:
import tensorflow as tf
import numpy as np
from vizdoom import *

import random
import time
from skimage import transform

from collections import deque
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

import time

  from ._conv import register_converters as _register_converters


## Step 2: Create the environmnet

Doom environment takes:

- A configuration file that handle all the options (size of the frame, possible actions...)
- A scenario file: that generates the correct scenario (in our case basic but you're invited to try other scenarios).

Note: We have 3 possible actions [[0,0,1], [1,0,0], [0,1,0]]

- A monster is spawned randomly somewhere along the opposite wall.
- Player can only go left/right and shoot.
- 1 hit is enough to kill the monster.
- Episode finishes when monster is killed or on timeout (300).

REWARDS:

- +101 for killing the monster
- -5 for missing
- Episode ends after killing the monster or on timeout.
- living reward = -1



In [2]:
def create_environment():
    game = DoomGame()
    
    # Scenarios path
    s_path = "C:\\Users\\Miguel\\AppData\\Local\\conda\\conda\\envs\\gym\\Lib\\site-packages\\vizdoom\\scenarios\\"
    # Load config
    game.load_config(s_path + "basic.cfg")
    # Load scenario
    game.set_doom_scenario_path(s_path + "basic.wad")
    
    game.set_screen_format(ScreenFormat.GRAY8)
    
    game.init()
    
    # Possible actions
    left = [1, 0, 0]
    right = [0, 1, 0]
    shoot = [0, 0, 1]
    possible_actions = [left, right, shoot]
    
    return game, possible_actions

def test_environment():
    game = DoomGame()
    game.load_config("basic.cfg")
    game.set_doom_scenario_path("basic.wad")
    game.init()
    shoot = [0, 0, 1]
    left = [1, 0, 0]
    right = [0, 1, 0]
    actions = [shoot, left, right]
    
    episodes = 10
    for i in range(episodes):
        game.new_episode()
        while not game.is_episode_finished():
            state = game.get_state()
            img = state.screen_buffer
            misc = state.game_variables
            action = random.choice(actions)
            print("Action taken:", action)
            reward = game.make_action(action)
            print("\treward:", reward)
            time.sleep(0.02)
        print("Result:", game.get_total_reward())
        time.sleep(2)
    game.close()
    

In [3]:
game, possible_actions = create_environment()

## Step 3: Define the preprocessing functions
### preprocess_frame
Used to reduce the complexity of our states and thus reduce computation time needed for training

In [4]:
def preprocess_frame(frame):
    # Grayscale each frame (already done on config file)
    # Crop the screen
    cropped_frame = frame[30:-10, 30:-30]
    
    # Normalize pixel values
    normalized_frame = cropped_frame/255.0
    
    # Resize preprocessed frame
    preprocessed_frame = transform.resize(normalized_frame, [84,84])
    
    return preprocessed_frame

### stack_frames
Give a sense of motion to the NN

In [5]:
stack_size = 4

stacked_frames = deque([np.zeros((84,84), dtype=np.int) for i in range(stack_size)], maxlen=4)

def stack_frames(stacked_frames, state, is_new_episode):
    # Preprocess frame
    frame = preprocess_frame(state)
    
    if is_new_episode:
        # Clear our stacked_frames
        stacked_frames = deque([np.zeros((84,84), dtype=np.int) for i in range(stack_size)], maxlen=4)

        # Because we're in a new episode, copy the same frame 4x
        for i in range(4):
            stacked_frames.append(frame)
        
        # Stack the frames
        stacked_state = np.stack(stacked_frames, axis=2)
        
    else:
        # Append frame to deque, automatically removes the oldest frame
        stacked_frames.append(frame)
        
        # Build the stacked state (first dimension specifies different frames)
        stacked_state = np.stack(stacked_frames, axis=2)
        
    return stacked_state, stacked_frames


## Step 4: Set up our hyperparameters


In [15]:
### Model hyperparameters
state_size = [84,84,4]
action_size = game.get_available_buttons_size()
learning_rate = 0.002

### Training hyperparameters
total_episodes = 500
max_steps = 100
batch_size = 64

# Exploration parameters for epsilon greedy strategy
explore_start = 1.0
explore_stop = 0.01
decay_rate = 0.0001

# Q-learning hyperparameters
gamma = 0.95                    # Discount rate

### Memory hyperparameters
pretrain_length = batch_size    # Number of exp stored in memory when init
memory_size = 1000000           # Number of experiences memory can keep

### This to FALSE to just see the trained agent
training = True

### This to TRUE to render the environment
episode_render = False

## Step 5: Create the Deep Q-learning Neural Network model

In [7]:
class DQNetwork:
    def __init__(self, state_size, action_size, learning_rate, name='DQNetwork'):
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate
        
        with tf.variable_scope(name):
            # Create placeholders
            # [None, 84, 84, 4]
            self.inputs_ = tf.placeholder(tf.float32, [None, *state_size], name="inputs")
            self.actions_ = tf.placeholder(tf.float32, [None, 3], name="actons_")
            
            self.target_Q = tf.placeholder(tf.float32, [None], name="target")
            
            # First convnet: CNN, BatchNormalization, ELU
            # Input is 84x84x84
            self.conv1 = tf.layers.conv2d(inputs = self.inputs_, 
                                          filters = 32,
                                          kernel_size = [8,8],
                                          strides = [4,4],
                                          padding = "VALID",
                                          kernel_initializer = tf.contrib.layers.xavier_initializer_conv2d(),
                                          name = "conv1")
            
            self.conv1_batchnorm = tf.layers.batch_normalization(self.conv1,
                                                                 training = True,
                                                                 epsilon = 1e-5,
                                                                 name = 'batch_norm1')
            
            self.conv1_out = tf.nn.elu(self.conv1_batchnorm, name="conv1_out")
            # --> [20, 20, 32]
            
            # Second convnet: CNN, BatchNormalization, ELU
            self.conv2 = tf.layers.conv2d(inputs = self.conv1_out,
                                          filters = 64,
                                          kernel_size = [4,4],
                                          strides = [2,2],
                                          padding = "VALID",
                                          kernel_initializer = tf.contrib.layers.xavier_initializer_conv2d(),
                                          name = "conv2")
            
            self.conv2_batchnorm = tf.layers.batch_normalization(self.conv2,
                                                                 training = True,
                                                                 epsilon = 1e-5,
                                                                 name = 'batch_norm2')
            
            self.conv2_out = tf.nn.elu(self.conv2_batchnorm, name="conv2_out")
            # --> [9, 9, 64]
            
            # Third convnet: CNN, BatchNormalization, ELU
            self.conv3 = tf.layers.conv2d(inputs = self.conv2_out,
                                          filters = 128,
                                          kernel_size = [4,4],
                                          strides = [2,2],
                                          padding = "VALID",
                                          kernel_initializer = tf.contrib.layers.xavier_initializer_conv2d(),
                                          name = "conv3")
            
            self.conv3_batchnorm = tf.layers.batch_normalization(self.conv3,
                                                                 training = True,
                                                                 epsilon = 1e-5,
                                                                 name = 'batch_norm3')
            
            self.conv3_out = tf.nn.elu(self.conv3_batchnorm, name="conv3_out")
            # --> [3, 3, 128]
            
            self.flatten = tf.layers.flatten(self.conv3_out)
            # --> [1152]
            
            self.fc = tf.layers.dense(inputs = self.flatten,
                                      units = 512,
                                      activation = tf.nn.elu,
                                      kernel_initializer = tf.contrib.layers.xavier_initializer(),
                                      name = "fc1")
            
            self.output = tf.layers.dense(inputs = self.fc,
                                          kernel_initializer = tf.contrib.layers.xavier_initializer(),
                                          units = 3,
                                          activation = None)
            
            # Q is our predicted Q value
            self.Q = tf.reduce_sum(tf.multiply(self.output, self.actions_), axis=1)
            
            self.loss = tf.reduce_mean(tf.square(self.target_Q - self.Q))
            
            self.optimizer = tf.train.RMSPropOptimizer(self.learning_rate).minimize(self.loss)          
            

In [8]:
# Reset the graph
tf.reset_default_graph()

# Instantiate the DQNetwork
DQNetwork = DQNetwork(state_size, action_size, learning_rate)

# Step 6: Experience Replay

In [9]:
class Memory():
    def __init__(self, max_size):
        self.buffer = deque(maxlen = max_size)
        
    def add(self, experience):
        self.buffer.append(experience)
        
    def sample(self, batch_size):
        buffer_size = len(self.buffer)
        index = np.random.choice(np.arange(buffer_size),
                                size = batch_size,
                                replace = False)
        
        return [self.buffer[i] for i in index]

Deal with empty memory problem

In [10]:
# Instantiate memory
memory = Memory(max_size = memory_size)

# Render the environment
game.new_episode()

for i in range(pretrain_length):
    # If it's the first step
    if i == 0:
        # First we need a state
        state = game.get_state().screen_buffer
        state, stacked_frames = stack_frames(stacked_frames, state, True)
    
    # Random action
    action = random.choice(possible_actions)
    
    # Get the rewards
    reward = game.make_action(action)
    
    # Look if the episode is finished
    done = game.is_episode_finished()
    
    # If we're dead
    if done:
        # We finished the episode
        next_state = np.zeros(state.shape)
        
        # Add experience to memory
        memory.add((state, action, reward, next_state, done))
        
        # Start a new episode
        game.new_episode()
        
        # First we need a state
        state = game.get_state().screen_buffer
        
        # Stack the frames
        state, stacked_frames = stack_frames(stacked_frames, state, True)
        
    else:
        # Get the next state
        next_state = game.get_state().screen_buffer
        next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
        
        # Add experience to memory
        memory.add((state, action, reward, next_state, done))
        
        # Our state is now the next_state
        state = next_state

## Step 7: Set up Tensorboard

In [11]:
# Setup TensorBoard Writer
writer = tf.summary.FileWriter("/tensorboard/dqn/1")

## Loses
tf.summary.scalar("Loss", DQNetwork.loss)

write_op = tf.summary.merge_all()

## Step 8: Train the Agent

- Initialize weights, environment and decay rate
- For episode in episode_max
 - Make new episode
 - Step to 0
 -  Observe first state s_0
 
 - While step < max_steps
  - Increase decay_rate
  - With epsilon select randoma ction a_t, otherwise select a_t = argmax Q
  - Execute action a_t and observe reward in new state
  - Store transition
  - Sample random mini-batch from D
  - Set $\hat{Q} = r$ if the episode ends at $+1$, otherwise set $\hat{Q} = r + \gamma \max_{a'}{Q(s', a')}$
  - Make a gradient descent step with loss $(\hat{Q} - Q(s, a))^2$

In [12]:
def predict_action(explore_start, explore_stop, decay_rate, decay_step, state, action):
    # Epsilon greedy strategy
    exp_exp_tradeoff = np.random.rand()
    
    explore_probability = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step)
    
    if (explore_probability > exp_exp_tradeoff):
        action = random.choice(possible_actions)
        
    else:
        # Estimate Q values state
        Qs = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs_: state.reshape((1, *state.shape))})
        
        # Take the biggest Q value
        choice = np.argmax(Qs)
        action = possible_actions[int(choice)]
        
    return action, explore_probability

In [18]:
saver = tf.train.Saver()

if training:
    with tf.Session() as sess:
        # initialize the variables
        sess.run(tf.global_variables_initializer())
        
        # init decay rate
        decay_step = 0
        
        # init the game
        game.init()
        
        for episode in range(total_episodes):
            step = 0
            episode_rewards = []
            game.new_episode()
            state = game.get_state().screen_buffer
            
            state, stacked_frames = stack_frames(stacked_frames, state, True)
            
            while step < max_steps:
                step += 1
                action, explore_probability = predict_action(explore_start, explore_stop, decay_rate, decay_step, state, possible_actions)
                reward = game.make_action(action)
                done = game.is_episode_finished()
                episode_rewards.append(reward)
                
                if done:
                    next_state = np.zeros((84,84), dtype=np.int)
                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
                    
                    step = max_steps
                    
                    total_reward = np.sum(episode_rewards)
                    
                    print('Episode: {}'.format(episode),
                              'Total reward: {}'.format(total_reward),
                              'Training loss: {:.4f}'.format(loss),
                              'Explore P: {:.4f}'.format(explore_probability))

                    memory.add((state, action, reward, next_state, done))
                    
                else:
                    next_state = game.get_state().screen_buffer
                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
                    memory.add((state, action, reward, next_state, done))
                    state = next_state
                    
                ## LEARNING
                # Mini batch from memory
                batch = memory.sample(batch_size)
                states_mb = np.array([each[0] for each in batch], ndmin=3)
                actions_mb = np.array([each[1] for each in batch])
                rewards_mb = np.array([each[2] for each in batch])
                next_states_mb = np.array([each[3] for each in batch], ndmin=3)
                dones_mb = np.array([each[4] for each in batch])
                
                target_Qs_batch = []
                
                Qs_next_state = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs_: next_states_mb})
                
                for i in range(0, len(batch)):
                    terminal = dones_mb[i]
                    if terminal:
                        target_Qs_batch.append(rewards_mb[i])
                    else:
                        target = rewards_mb[i] + gamma * np.max(Qs_next_state[i])
                        target_Qs_batch.append(target)
                
                targets_mb = np.array([each for each in target_Qs_batch])
                
                loss, _ = sess.run([DQNetwork.loss, DQNetwork.optimizer],
                                  feed_dict={DQNetwork.inputs_: states_mb,
                                            DQNetwork.target_Q: targets_mb,
                                            DQNetwork.actions_: actions_mb})
                
                summary = sess.run(write_op, feed_dict={DQNetwork.inputs_: states_mb,
                                                       DQNetwork.target_Q: targets_mb,
                                                       DQNetwork.actions_: actions_mb})
                writer.add_summary(summary, episode)
                writer.flush()
                
            # Save model every 5 episodes
            if episode % 5 == 0:
                save_path = saver.save(sess, "./models/model.ckpt")
                print("Model Saved")

Episode: 0 Total reward: 31.0 Training loss: 165.3071 Explore P: 1.0000
Model Saved
Episode: 4 Total reward: 89.0 Training loss: 251.6142 Explore P: 1.0000
Episode: 5 Total reward: 44.0 Training loss: 12.7884 Explore P: 1.0000
Model Saved
Episode: 7 Total reward: 94.0 Training loss: 7.5907 Explore P: 1.0000
Episode: 8 Total reward: 66.0 Training loss: 19.4421 Explore P: 1.0000
Episode: 9 Total reward: 94.0 Training loss: 15.6159 Explore P: 1.0000
Episode: 10 Total reward: 95.0 Training loss: 72.6078 Explore P: 1.0000
Model Saved
Episode: 11 Total reward: 95.0 Training loss: 26.8368 Explore P: 1.0000


KeyboardInterrupt: 

## Step 9: Watch the agent play

In [27]:
with tf.Session() as sess:
    game, possible_actions = create_environment()
    totalScore = 0
    # Load the model
    saver.restore(sess, "./models/model.ckpt")
    game.init()
    for i in range(1):
        done = False
        game.new_episode()
        state = game.get_state().screen_buffer
        state, stacked_frames = stack_frames(stacked_frames, state, True)
        
        while not game.is_episode_finished():
            # Take the biggest Q value
            Qs = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs_: state.reshape((1, *state.shape))})
            choice = np.argmax(Qs)
            action = possible_actions[int(choice)]
            
            game.make_action(action)
            done = game.is_episode_finished()
            score = game.get_total_reward()
            
            if done:
                break
            else:
                print('else')
                #time.sleep(2)
                next_state = game.get_state().screen_buffer
                next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
                
            score = game.get_total_reward()
            print("Score: ", score)
        game.close()
            

INFO:tensorflow:Restoring parameters from ./models/model.ckpt
else
Score:  -1.0
else
Score:  -2.0
else
Score:  -3.0
else
Score:  -4.0
else
Score:  96.0
