In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import random

from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras import layers
from doom_src import utilities
from collections import deque
from time import time
from vizdoom import *

First, let's load the config file and create a new game instance.

In [None]:
# Read the run settings from the JSON config file. The resulting `config`
# is handed to the environment factory and to the training routine below.
config = utilities.get_config('configs/config.json')

In [None]:
# Create the game instance and the collection of actions the agent may take.
# NOTE(review): the exact structure of `actions` is defined in
# doom_src.utilities.create_environment -- it is indexed by integer below.
game, actions = utilities.create_environment(config)

Next, we define the DQN model.

In [None]:
class DQN_D():
    """
    Deep-Q Network used to play Doom.

    The network is a Keras ``Sequential`` model: three convolutional blocks
    (convolution -> batch normalisation -> ReLU), a 512-unit dense layer and
    a linear output layer producing one Q-value per action.

    Attributes:
        model: the compiled Keras model (MSE loss, Adam optimizer).
        op:    the Adam optimizer instance used to compile the model.
        board: a TensorBoard callback logging to ``logs/<unix timestamp>``.
    """

    def __init__(self, config, n_actions=3):
        """
        Build and compile the network.

        Args:
            config: mapping providing ``'frame_size'``, ``'stack_size'`` and
                ``'learning_rate'``. The input shape is
                ``(frame_size, frame_size, stack_size)``.
            n_actions: number of Q-values (output units). Defaults to 3,
                the value that used to be hard-coded.
        """
        frame_size = config['frame_size']
        stack_size = config['stack_size']
        learning_rate = config['learning_rate']

        # (filters, kernel_size, strides) for each convolutional block
        conv_specs = [
            (32, (8, 8), (4, 4)),
            (64, (4, 4), (2, 2)),
            (64, (4, 4), (1, 1)),
        ]

        model = tf.keras.Sequential()

        for index, (filters, kernel_size, strides) in enumerate(conv_specs):
            # Only the first layer of a Sequential model declares the input shape
            extra = {}
            if index == 0:
                extra['input_shape'] = (frame_size, frame_size, stack_size)

            model.add(
                layers.Convolution2D(
                    filters=filters,
                    kernel_size=kernel_size,
                    strides=strides,
                    padding='valid',
                    **extra
                )
            )
            model.add(layers.BatchNormalization(epsilon=1e-5))
            # Without a non-linearity the conv/batch-norm stack is just an
            # affine map, so stacking several blocks would add no capacity.
            model.add(layers.Activation('relu'))

        # Flatten before passing to dense layers
        model.add(layers.Flatten())

        # Dense layer 1
        model.add(layers.Dense(units=512, activation='relu'))

        # Output layer: one Q-value per action. It must be linear because
        # Q-values can be negative; a ReLU here would clip them at zero.
        model.add(layers.Dense(units=n_actions, activation='linear'))

        # `learning_rate` is the supported keyword; `lr` is a deprecated alias
        self.op = tf.keras.optimizers.Adam(learning_rate=learning_rate)
        model.compile(loss='mse', optimizer=self.op)

        tensorboard = TensorBoard(log_dir="logs/{}".format(time()))

        self.model = model
        self.board = tensorboard

In [None]:
def predict_action(policy_network, epsilon, state, actions):
    """
    Choose an action with the epsilon-greedy rule.

    With probability `epsilon` a uniformly random action index is drawn;
    otherwise the index of the largest Q-value predicted by
    `policy_network.model` for `state` is used.

    Returns:
        (action, index): the selected entry of `actions` and its index.
    """
    explore = np.random.rand(1) < epsilon

    if not explore:
        # The model predicts on batches, so prepend a batch axis of size one
        batched_state = state.reshape((1,) + tuple(state.shape))
        q_values = policy_network.model.predict(batched_state)
        choice = np.argmax(q_values)
    else:
        choice = np.random.randint(len(actions))

    return actions[choice], choice

In [None]:
def _td_targets(q_current, q_next, action_b, reward_b, done_b, gamma):
    """
    Build the regression targets for one Q-learning update.

    Starts from the network's current predictions (so that the loss is zero
    for the actions that were not taken) and overwrites the entry of the
    action actually taken with ``reward + gamma * max_a Q(next_state, a)``.
    The bootstrap term is dropped for terminal transitions.
    """
    targets = np.array(q_current, copy=True)
    rows = np.arange(len(action_b))
    # Boolean mask -> 1.0 for non-terminal transitions, 0.0 for terminal ones
    not_done = np.logical_not(done_b).astype(targets.dtype)
    targets[rows, action_b] = reward_b + gamma * np.max(q_next, axis=1) * not_done
    return targets


def train_net(config, n_episodes, game, actions):
    """
    Train the Q Network.

    Args:
        config: mapping with the keys ``'gamma'``, ``'stack_size'``,
            ``'frame_size'``, ``'pretrain_steps'``, ``'batch_size'``,
            ``'memory_size'``, ``'annealing_steps'``, ``'annealing_stop'``
            and ``'annealing_start'`` (plus whatever ``DQN_D`` reads).
        n_episodes: number of episodes to play.
        game: the game instance; must provide ``new_episode``,
            ``is_episode_finished``, ``get_state`` and ``make_action``.
        actions: indexable collection of actions accepted by
            ``game.make_action``.

    Returns:
        (episode_rewards, episode_losses, policy_net): per-episode summed
        rewards, per-episode summed training losses, and the trained
        ``DQN_D`` instance.
    """

    gamma           = config['gamma']
    stack_size      = config['stack_size']
    frame_size      = config['frame_size']
    pretrain_steps  = config['pretrain_steps']
    batch_size      = config['batch_size']
    memory_size     = config['memory_size']
    annealing_steps = config['annealing_steps']
    annealing_stop  = config['annealing_stop']
    annealing_start = config['annealing_start']

    episode_rewards = []
    episode_losses = []

    # Initialize the DQN
    policy_net = DQN_D(config)

    # Initialize the memory buffer
    memory = utilities.Memory(memory_size)

    # Initialize the linear annealing scheduler
    epsilon = utilities.LinearSchedule(
        annealing_steps,
        annealing_stop,
        annealing_start
    )

    # Initialize the stack of frames.
    # `np.int` was removed from NumPy; the builtin `int` is what it aliased.
    stacked_frames = deque(
        [np.zeros((frame_size, frame_size), dtype=int) for _ in range(stack_size)],
        maxlen=stack_size
    )

    # Fill up the memory buffer
    utilities.pretrain(
        pretrain_steps,
        memory,
        stack_size,
        frame_size,
        stacked_frames,
        game,
        actions
    )

    for episode in range(n_episodes):
        eps = epsilon.value(episode)
        episode_reward = 0.0
        episode_loss = 0.0

        # New episode
        game.new_episode()
        done = game.is_episode_finished()

        # Initial state
        frame = game.get_state().screen_buffer
        state = utilities.stack_frames(
            stacked_frames, frame, True, stack_size, frame_size
        )

        while not done:
            # Action selection phase
            action, action_index = predict_action(policy_net, eps, state, actions)

            reward = game.make_action(action)
            done = game.is_episode_finished()

            # Next state
            if done:  # Episode over: no screen buffer, use a blank frame
                frame = np.zeros(frame.shape)
            else:
                frame = game.get_state().screen_buffer

            next_state = utilities.stack_frames(
                stacked_frames, frame, False, stack_size, frame_size
            )

            memory.add((state, action_index, reward, next_state, done))
            episode_reward += reward

            # Advance the agent: the next decision must be based on the
            # state we just reached, not on the first state of the episode.
            state = next_state

            # Learning step.
            # NOTE(review): assumes memory.sample returns a sequence of the
            # 5-tuples stored via memory.add -- confirm in doom_src.utilities.
            batch = memory.sample(batch_size)
            state_b, action_b, reward_b, next_state_b, done_b = zip(*batch)

            # Convert each column to a properly typed array. Building one
            # object array from the ragged tuples fails on recent NumPy and
            # leaves `done` as Python bools, for which `~` is an integer
            # bitwise-not (-1 / -2) rather than a logical negation.
            state_b = np.stack(state_b)
            next_state_b = np.stack(next_state_b)
            action_b = np.asarray(action_b, dtype=np.int64)
            reward_b = np.asarray(reward_b, dtype=np.float32)
            done_b = np.asarray(done_b, dtype=bool)

            q_next_b = policy_net.model.predict(next_state_b)
            q_current_b = policy_net.model.predict(state_b)

            targets_b = _td_targets(
                q_current_b, q_next_b, action_b, reward_b, done_b, gamma
            )

            episode_loss += policy_net.model.train_on_batch(state_b, targets_b)

        episode_rewards.append(episode_reward)
        episode_losses.append(episode_loss)

    return episode_rewards, episode_losses, policy_net

In [None]:
# Train for 100 episodes and keep the per-episode statistics and the network
episode_rewards, episode_losses, policy_net = train_net(
    config=config,
    n_episodes=100,
    game=game,
    actions=actions,
)

In [None]:
# Summed reward collected in each training episode
fig, ax = plt.subplots()
ax.plot(episode_rewards)
ax.set(title='Reward per episode', xlabel='Episode', ylabel='Total reward');

In [None]:
# Summed training loss (MSE over all update steps) in each training episode
fig, ax = plt.subplots()
ax.plot(episode_losses)
ax.set(title='Loss per episode', xlabel='Episode', ylabel='Summed training loss');