# Dependencies

In [1]:
import tensorflow as tf
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)

    except RuntimeError as e:
        print(e)

In [25]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, Dropout
from tensorflow.keras.optimizers import Adam
import tensorflow.keras as K
import numpy as np

from collections import deque
import itertools as it
import cv2
from vizdoom import *   
import random
import time

# Doom-Ai

## Initial Configuration

In [22]:
def configure_game_training():
    game = DoomGame()
    game.load_config("/home/msi-gtfo/repos/ViZDoom/scenarios/basic.cfg")
    game.set_window_visible(False)
    game.set_render_hud(False)
    game.set_screen_format(vizdoom.ScreenFormat.GRAY8)

    nothing     = [0, 0, 0]
    left        = [1, 0, 0]
    right       = [0, 1, 0]
    shoot       = [0, 0, 1]
    left_shoot  = [1, 0, 1]
    right_shoot = [0, 1, 1]
    possible_actions = [nothing, left, right, shoot, left_shoot, right_shoot]

    return game, possible_actions


## Define Q-Learning Functions

In [20]:
MIN_REPLAY_MEMORY = 1000
MAX_REPLAY_MEMORY = 50000
MINI_BATCH_SIZE   = 64

GAMMA = 0.99
EPSILON = 0.2
EPSILON_DISCOUNT =0.01

resolution = (84,84)

def preprocess(img):
    #img = np.reshape(img,(img.shape[1],img.shape[2],img.shape[0]))
    img = cv2.resize(img, (resolution[1],resolution[0]))
    img = img.astype(np.float32)
    img = img / 255
    return img

def create_model(n_actions):
    model = Sequential()
    model.add(Conv2D(128,3,input_shape=(resolution[0], resolution[1], 4), activation='relu', padding='same'))
    model.add(MaxPooling2D(2,2))
    model.add(Conv2D(128,3, activation='relu', padding='same'))
    model.add(MaxPooling2D(2,2))
    model.add(Flatten())
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(n_actions))

    model.summary()
    opt = Adam(lr=0.001)
    model.compile(opt,'mean_squared_error',['accuracy'])

    return model

class DQNAgent:
    def __init__(self, n_actions, use_latest=False):
        self.model = create_model(n_actions)
        self.replay_memory = deque(maxlen=MAX_REPLAY_MEMORY)
        if use_latest:
            self.load_model()

    def save_model(self):
        self.model.save("model")

    def load_model(self):
        self.model = K.models.load_model("model")

    # (observation space, action, reward, new observation space, done)
    def update_replay_memory(self, transition):
        self.replay_memory.append(transition)

    # Queries main network for Q values given current observation space (environment state)
    def get_qs(self, states):
        states_exp = np.expand_dims(states, axis=0)
        prediction = self.model.predict(states_exp)
        return prediction[0]

    # Trains network every step during episode
    def train(self):

        # Start training only if certain number of samples is already saved
        if len(self.replay_memory) < MIN_REPLAY_MEMORY:
            return
        
        minibatch = random.sample(self.replay_memory, MINI_BATCH_SIZE)

        s1_batch = np.array([d[0] for d in minibatch])
        a_batch = [d[1] for d in minibatch]
        r_batch = [d[2] for d in minibatch]
        s2_batch = np.array([d[3] for d in minibatch])

        Y = []
        s2_qs = self.model.predict(s2_batch)

        # TODO fix the definition of targets

        for i in range(0,MINI_BATCH_SIZE):
            value = 0
            # Check if terminal
            if minibatch[i][4]:
                value = r_batch[i]
            else:
                value = r_batch[i] + GAMMA * np.max(s2_qs[i])
            tmp = np.zeros(len(actions))
            tmp[minibatch[i][1]] = value
            Y.append(tmp)
        
        self.model.fit(s1_batch, np.array(Y), batch_size=MINI_BATCH_SIZE, epochs=1, verbose=0, shuffle=False)


In [24]:
EPOCHS = 50
MAX_STEPS = 100

game, actions = configure_game_training()
agent = DQNAgent(len(actions),use_latest=False)

scores = []

print("---------------Starting training Doom Ai--------------")
game.init()
for epoch in range(EPOCHS):
    epsilon = EPSILON
    print("-> Episode ",epoch)
    game.new_episode()
    
    # First state to predict on as a starting point
    s1 = preprocess(game.get_state().screen_buffer)
    s_t_deque = deque(maxlen=4)
    s_t_deque.append(s1)
    s_t_deque.append(s1)
    s_t_deque.append(s1)
    s_t_deque.append(s1)

    s_t = np.stack(s_t_deque, axis=2)
    step = 0
    while (not game.is_episode_finished()) or step < MAX_STEPS:
        q_t = agent.get_qs(s_t) 

        # Decide if greedy or not
        if random.random() <= epsilon:
            epsilon -= EPSILON_DISCOUNT
            a = random.randint(0, len(actions) - 1) # TODO remove option of pressing left and right at the same time
        else:
            a = np.argmax(q_t)

        # TODO work on epsilon degredation techniques


        # Execute action    
        reward = game.make_action(actions[a],12) #frame repeat ?
        isterminal = game.is_episode_finished()
        s2 = preprocess(game.get_state().screen_buffer) if not isterminal else np.zeros((resolution))
        s_t_deque.append(s2)
        s_t2 = np.stack(s_t_deque, axis=2)

        agent.update_replay_memory([s_t, a, reward, s_t2, isterminal])

        # Update current input of states
        s_t = s_t2
        agent.train()
        step += 1

    if game.is_episode_finished():
        final_reward = game.get_total_reward()
        print("Final Reward: ", final_reward)
        scores.append(final_reward)
    
    if (epoch % 5) == 0:
        agent.save_model()
    

train_scores = np.array(scores)

print("Results: mean: %.1f±%.1f," % (train_scores.mean(), train_scores.std()),
         "min: %.1f," % train_scores.min(), "max: %.1f," % train_scores.max())

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_6 (Conv2D)            (None, 84, 84, 128)       4736      
_________________________________________________________________
max_pooling2d_6 (MaxPooling2 (None, 42, 42, 128)       0         
_________________________________________________________________
conv2d_7 (Conv2D)            (None, 42, 42, 128)       147584    
_________________________________________________________________
max_pooling2d_7 (MaxPooling2 (None, 21, 21, 128)       0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 56448)             0         
_________________________________________________________________
dense_9 (Dense)              (None, 256)               14450944  
_________________________________________________________________
dropout_3 (Dropout)          (None, 256)              

KeyboardInterrupt: 

In [19]:
print("======================================")
print("Training finished. It's time to watch!")

# Reinitialize the game with window visible
game, _ = configure_game_training()
game.set_window_visible(True)
game.set_mode(vizdoom.Mode.ASYNC_PLAYER)
game.init()

for _ in range(5):
    game.new_episode()

    s1 = preprocess(game.get_state().screen_buffer)
    s_t_deque = deque(maxlen=4)
    s_t_deque.append(s1)
    s_t_deque.append(s1)
    s_t_deque.append(s1)
    s_t_deque.append(s1)

    s_t = np.stack(s_t_deque, axis=2)
    while not game.is_episode_finished():
        q_t = agent.get_qs(s_t) 
        a = np.argmax(q_t)

        # Instead of make_action(a, frame_repeat) in order to make the animation smooth
        game.set_action(actions[a])
        for _ in range(12):
            game.advance_action()

        s2 = []
        if not game.is_episode_finished:
            s2 = preprocess(game.get_state().screen_buffer)
        else:
            break
        s_t_deque.append(s2)
        s_t2 = np.stack(s_t_deque, axis=2)

    # Sleep between episodes
    time.sleep(1.0)
    score = game.get_total_reward()
    print("Total score: ", score)

Training finished. It's time to watch!
Total score:  -12.0
Total score:  -12.0
Total score:  -12.0
Total score:  -12.0
Total score:  -12.0


In [26]:
from numba import cuda
cuda.select_device(0)
cuda.close()