In [None]:
%%bash

# Stop at the first failing command so a broken install is noticed right away
# instead of letting the later steps run against a half-installed toolchain.
set -e

# This cell runs without an interactive terminal, so apt must never wait for a
# confirmation prompt: -y answers "yes" and DEBIAN_FRONTEND silences debconf.
export DEBIAN_FRONTEND=noninteractive

apt-get update

# ZDoom dependencies
apt-get install -y build-essential zlib1g-dev libsdl2-dev libjpeg-dev \
    nasm tar libbz2-dev libgtk2.0-dev cmake git libfluidsynth-dev libgme-dev \
    libopenal-dev timidity libwildmidi-dev unzip

# Boost libraries
apt-get install -y libboost-all-dev

# Lua binding dependencies
apt-get install -y liblua5.1-dev

# Build and install ViZDoom from source.
# NOTE(review): the install is unpinned; consider pinning a tag or commit
# (git+https://...@<ref>) so reruns of this notebook stay reproducible.
pip install git+https://github.com/mwydmuch/ViZDoom

In [None]:
import tensorflow as tf
import numpy as np
import cv2
import vizdoom

In [None]:
# Build and start the ViZDoom game instance that the training cell drives.
env = vizdoom.DoomGame()
# Plain string literal: the path has no placeholders, so no f-prefix is needed.
env.set_doom_scenario_path("./drive/My Drive/doom/corridor.wad")
env.set_doom_map("map01")

# Observation format: 320x240 frames in RGB24.
env.set_screen_resolution(vizdoom.ScreenResolution.RES_320X240)
env.set_screen_format(vizdoom.ScreenFormat.RGB24)

# Rendering toggles: only the weapon and screen flashes are drawn.
env.set_render_hud(False)
env.set_render_minimal_hud(False)
env.set_render_crosshair(False)
env.set_render_weapon(True)
env.set_render_messages(False)
env.set_render_screen_flashes(True)

# Two buttons, registered backward first and forward second.
# NOTE(review): the action vectors [[1, 0], [0, 1]] defined below presumably
# follow this registration order - confirm against the ViZDoom documentation.
env.add_available_button(vizdoom.Button.MOVE_BACKWARD)
env.add_available_button(vizdoom.Button.MOVE_FORWARD)

# The only game variable exposed; later code reads it as game_variables[0].
env.add_available_game_variable(vizdoom.GameVariable.POSITION_X)

env.set_episode_timeout(200)

# No game window is opened.
env.set_window_visible(False)
env.set_mode(vizdoom.Mode.PLAYER)

# Debugging aid, disabled by default: env.set_console_enabled(True)

# All configuration above is applied before the game is initialised.
env.init()

# Action vectors passed to env.make_action, indexed by select_action's result.
actions = [[1, 0], [0, 1]]
# Target x-coordinate of each of the seven options (32, 96, ..., 416).
positions = list(range(32, 417, 64))


def select_action(x, option):
    """Pick the index into ``actions`` that moves ``x`` toward an option's target.

    Returns 1 while ``x`` is strictly below ``positions[option]`` and 0 otherwise
    (including when ``x`` equals the target).
    """
    target = positions[option]
    if x < target:
        return 1
    return 0


def option_reached(x, option):
    """Report whether ``x`` is strictly within 5 units of ``positions[option]``."""
    distance = abs(x - positions[option])
    return bool(distance < 5)

In [None]:
class RecurrentHReinforce:
    """Recurrent policy over seven options, trained on return-weighted targets.

    The network maps a sequence of 100x100x3 frames to a softmax over the
    options at every time step. ``update`` fits it on targets that are zero
    everywhere except at the chosen option, where the target is the discounted
    return; with categorical cross-entropy that loss is
    ``-return * log(probability of the chosen option)``.
    """

    NUM_OPTIONS = 7             # width of the softmax output
    CELL_WIDTH = 64             # x-extent of one cell; x // CELL_WIDTH is the current cell
    FRAME_SHAPE = (100, 100, 3)

    def __init__(self):
        """Create empty episode buffers and build the policy network."""
        self.memory = []    # finished episodes, each a (frames, options, rewards) tuple
        self.frames = []    # per-transition buffers of the episode in progress
        self.options = []
        self.rewards = []
        self.discount_rate = 0.99
        self.learning_rate = 0.001
        self.model = self.build_model()

    def build_model(self):
        """Build, compile and return the Keras policy network.

        Two time-distributed convolutions and a flatten feed a 256-unit GRU
        that returns the whole sequence, followed by a softmax layer with one
        unit per option. The model summary is printed as a side effect.
        """
        layers = tf.keras.layers
        model = tf.keras.Sequential()
        model.add(layers.TimeDistributed(
            layers.Conv2D(32, (8, 8), strides=(4, 4), activation='relu'),
            input_shape=(None,) + self.FRAME_SHAPE))
        model.add(layers.TimeDistributed(
            layers.Conv2D(64, (4, 4), strides=(2, 2), activation='relu')))
        model.add(layers.TimeDistributed(layers.Flatten()))
        model.add(layers.GRU(256, return_sequences=True))
        model.add(layers.Dense(self.NUM_OPTIONS, activation='softmax'))

        # `learning_rate` is the supported argument name; the old `lr` alias is
        # deprecated and rejected by recent Keras releases.
        model.compile(loss="categorical_crossentropy",
                      optimizer=tf.keras.optimizers.Adam(
                          learning_rate=self.learning_rate, clipnorm=1.0))
        model.summary()

        return model

    def select_option(self, frame_list, x):
        """Sample an option from the policy, excluding the cell ``x`` lies in.

        Args:
            frame_list: Sequence of frames observed so far in the episode, each
                of shape ``FRAME_SHAPE``.
            x: Current x-position (int or float).

        Returns:
            The sampled option index. The option whose index equals the current
            cell is never returned.
        """
        # x usually arrives as a float, and np.delete only accepts integer
        # indices. Clamping keeps a position outside the corridor from masking
        # the wrong option (negative index) or raising (index too large).
        cell = int(np.clip(x // self.CELL_WIDTH, 0, self.NUM_OPTIONS - 1))
        applicable = np.delete(np.arange(self.NUM_OPTIONS), cell)

        batch = np.reshape(frame_list, (1, len(frame_list)) + self.FRAME_SHAPE) / 255
        # Only the prediction for the most recent time step is used.
        last_step = self.model.predict(batch, verbose=0)[0, -1]
        prob = np.delete(last_step, cell).astype(np.float64)

        total = np.sum(prob)
        if not np.isfinite(total) or total <= 0:
            # Degenerate network output: fall back to a uniform choice.
            return np.random.choice(applicable)
        return np.random.choice(applicable, 1, p=prob / total)[0]

    def store_transition(self, frame, option, reward):
        """Append one (frame, option, reward) transition to the current episode."""
        self.frames.append(frame)
        self.options.append(option)
        self.rewards.append(reward)

    def store_episode(self):
        """Move the current episode into ``memory`` and reset the buffers."""
        self.memory.append((self.frames, self.options, self.rewards))
        self.frames = []
        self.options = []
        self.rewards = []

    def update(self, episode):
        """Run one fit step on the stored episode with index ``episode``.

        The final transition of the episode is not trained on. Episodes that
        leave no transition after that (zero or one stored) are skipped.
        """
        # NOTE(review): dropping the last transition also drops its reward from
        # the returns - confirm this is intended.
        frames, options, rewards = (seq[:-1] for seq in self.memory[episode])

        T = len(frames)
        if T == 0:
            # Nothing to learn from; previously this raised IndexError.
            return

        # Discounted return of every step, accumulated from the end backwards.
        returns = np.zeros(T)
        running = 0.0
        for t in reversed(range(T)):
            running = rewards[t] + self.discount_rate * running
            returns[t] = running

        x = np.reshape(frames, (1, T) + self.FRAME_SHAPE) / 255
        # Target is zero except at the chosen option, where it holds the return.
        y = np.zeros((1, T, self.NUM_OPTIONS))
        for t, (option, ret) in enumerate(zip(options, returns)):
            y[0, t, option] = ret

        self.model.fit(x, y, epochs=1, verbose=0)

    def batch_update(self, start, end):
        """Call ``update`` for every stored episode index in ``[start, end)``."""
        for episode in range(start, end):
            self.update(episode)

In [None]:
def train(start, runs, episodes=10000, random_episodes=1000,
          checkpoint_every=1000, data_dir="./drive/My Drive/doom"):
    """Train a fresh ``RecurrentHReinforce`` agent per run and record episode rewards.

    Args:
        start: Index of the first run to execute. ``0`` creates a new reward
            table; any other value resumes from the table saved in ``data_dir``.
        runs: Exclusive upper bound of the run indices (and the number of rows
            of a newly created reward table).
        episodes: Number of episodes per run.
        random_episodes: Number of initial episodes of each run in which the
            option is drawn uniformly at random instead of from the policy.
        checkpoint_every: The reward table is written to disk each time this
            many episodes of a run have completed.
        data_dir: Directory holding ``mnemonic_rewards.npy`` and the saved models.

    Raises:
        ValueError: If a resumed reward table is smaller than ``runs`` x ``episodes``.
    """
    # np.save appends ".npy" itself, np.load needs it spelled out.
    rewards_file = f"{data_dir}/mnemonic_rewards"

    if start == 0:
        rewards = np.zeros((runs, episodes))
    else:
        rewards = np.load(f"{rewards_file}.npy")
        # Fail now rather than with an IndexError after hours of training.
        if rewards.ndim != 2 or rewards.shape[0] < runs or rewards.shape[1] < episodes:
            raise ValueError(
                f"saved reward table has shape {rewards.shape}, "
                f"need at least ({runs}, {episodes})")

    for run in range(start, runs):
        print("\nRun " + str(run))

        # NOTE(review): the agent keeps every episode's frames in meta.memory for
        # the whole run; memory use grows with the number of episodes.
        meta = RecurrentHReinforce()

        for episode in range(episodes):
            env.new_episode()
            meta_state = env.get_state()
            episode_reward = 0

            # frame_list is the full observation history given to the policy;
            # `frame` is the observation at the start of the current option.
            frame = cv2.resize(meta_state.screen_buffer, (100, 100))
            frame_list = [frame]

            while not env.is_episode_finished():
                position = meta_state.game_variables[0]
                if episode < random_episodes:
                    # Warm-up phase: uniformly random option, current cell included.
                    option = np.random.choice(7)
                else:
                    option = meta.select_option(frame_list, position)
                reached = option_reached(position, option)

                option_reward = 0

                # Step primitive actions until the option's target is reached
                # or the episode ends.
                state = meta_state
                while not env.is_episode_finished() and not reached:
                    action = select_action(state.game_variables[0], option)
                    reward = env.make_action(actions[action])
                    next_state = env.get_state()

                    option_reward += reward
                    episode_reward += reward
                    # next_state is falsy once the episode is over; `state`
                    # then keeps the last valid observation.
                    if next_state:
                        reached = option_reached(next_state.game_variables[0], option)
                        state = next_state

                meta.store_transition(frame, option, option_reward)
                meta_state = state
                frame = cv2.resize(meta_state.screen_buffer, (100, 100))
                frame_list.append(frame)

            meta.store_episode()
            meta.update(episode)

            rewards[run, episode] = episode_reward

            if (episode + 1) % checkpoint_every == 0:
                np.save(rewards_file, rewards)

        # Keep the tail of the run when `episodes` is not a multiple of the
        # checkpoint interval (never triggers with the default arguments).
        if episodes % checkpoint_every != 0:
            np.save(rewards_file, rewards)

        # File name carries the run index and the index of its last episode.
        meta.model.save(f"{data_dir}/mnemonic_{run}_{episodes - 1}.h5")

In [None]:
# Launch training: runs 0 through 9, starting from a fresh reward table.
train(start=0, runs=10)