The following cells are required to display the Gym environment in Colab and to install Gym Retro.
If you run this notebook in a local environment, you can skip these cells.



In [0]:
!apt-get install -y xvfb python-opengl > /dev/null 2>&1

In [0]:
!pip3 install gym pyvirtualdisplay > /dev/null 2>&1

In [0]:
!pip3 install gym-retro

In [0]:
import matplotlib.pyplot as plt
from IPython import display as ipythondisplay
from pyvirtualdisplay import Display
# Start a non-visible 400x300 virtual display; per the note at the top of the
# notebook this is what allows the Gym environment to be displayed in Colab.
display = Display(visible=0, size=(400, 300))
display.start()

Make sure no error message is shown in Colab after running the cells above.

-------------------------------------------------------------------------------

Now import the necessary packages

In [0]:
import random
import gym
import numpy as np
import cv2
import datetime
from collections import deque
from keras.models import Sequential, clone_model
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from keras.optimizers import Adam
from keras import losses
from keras.callbacks import TensorBoard, Callback
import tensorflow as tf
import os
import retro

In [0]:
# Constants shared by the cells below.
# number of episodes looped over by train() and play()
EPISODES = 2000
# number of frames stacked into one state (size of the state's last axis)
FRAME_NUM = 4
# width and height of each preprocessed frame
RESIZE = 80
# stride used when picking frames for a state; train() also samples the game
# status and trains the agent on every SKIP_FRAME-th frame
SKIP_FRAME = 4
# get_action() sets the first button (shooting) on every SHOOT_FRAME-th frame
SHOOT_FRAME = 5
ACTION_SPACE = 2 # move left / right
# upper bound on the frame counter within one training episode
MAX_STEP = 10000
# replay batch size passed to DQNAgent in train()
BATCH_SIZE = 32
# not referenced by the code visible in this notebook; 5000 looks like an
# earlier value - TODO confirm
REWARD_BASE = 1 #5000


In [0]:
# for logging training loss history in keras
class LossHistory(Callback):
    """Keras callback that records the loss reported at the end of every batch.

    The collected values are available in ``self.losses``, which is reset each
    time a training run begins.
    """

    def on_train_begin(self, logs=None):
        """Start a fresh loss list at the beginning of a training run."""
        self.losses = []

    def on_batch_end(self, batch, logs=None):
        """Append the batch loss (``None`` when no ``'loss'`` entry is present)."""
        # ``logs`` defaults to None instead of a shared mutable dict; a missing
        # dict is treated as empty so the lookup cannot fail.
        self.losses.append((logs or {}).get('loss'))

This is the DQN agent class. It defines the operations the agent can perform. You are going to implement the parts marked "Your code here".

In [0]:
# DQN Agent
class DQNAgent:
    """Deep Q-Network agent with a replay memory and a separate target network.

    The agent owns two Keras models of identical architecture: ``model`` (the
    online network that is fitted in ``train``) and ``target_model`` (a copy
    that is refreshed from ``model`` by ``update_target_model``).

    Regions marked "Your code here" are exercise placeholders and are left for
    the reader to complete.
    """

    def __init__(self, state_size, action_size, batch_size = 32):
        """Set up hyper-parameters, the replay memory and both networks.

        Args:
            state_size: input shape of the network; also used to reshape states
                before calling ``predict``.
            action_size: number of discrete actions to choose from.
            batch_size: number of transitions sampled per training step.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=100000)
        self.gamma = 0.9    # discount rate
        self.epsilon_max = 1.0  # initial exploration rate
        self.epsilon_min = 0.1
        self.epsilon_decay = 0.001
        self.learning_rate = 0.001

        self.batch_size = batch_size
        self.log_path = './logs'
        self.model = self._build_model()
        # clone_model() re-creates the layers with freshly initialised weights,
        # so copy the online weights explicitly; otherwise the target network
        # starts out unrelated to the online network.
        self.target_model = clone_model(self.model)
        self.target_model.set_weights(self.model.get_weights())
        self.history = LossHistory()
        self.observe = 0
        self.epsilon = 1.0
        self.tensorboard = TensorBoard(log_dir=self.log_path)
        self.tensorboard.set_model(self.model)

        self.tau_step = 100
        self.tau = 0.125
        self.loss = 0.0


    def _build_model(self):
        """Build and compile the Q-network (remaining layers are an exercise).

        Returns:
            The compiled Keras ``Sequential`` model.
        """
        # Neural Net for Deep-Q learning Model
        model = Sequential()
        model.add(Conv2D(32, (8, 8), strides=4, activation = 'elu', input_shape = self.state_size, padding='valid', kernel_initializer='glorot_normal'))

        ### Your code here #####


        ########################
        model.compile(loss='mean_squared_error', optimizer=Adam(lr=self.learning_rate))

        model.summary()

        return model

    def remember(self, state, action_id, reward, next_state, done):
        """Store one transition in the replay memory (exercise placeholder).

        ``train`` reads each stored element by position in the order
        ``(state, action_id, reward, next_state, done)``.

        Raises:
            NotImplementedError: until the exercise below is completed.
        """
        ### Your code here ###
        # append the parameters as a tuple into the deque self.memory
        # The placeholder raises so that this cell is valid Python and an
        # unfinished exercise fails with a clear message; replace it with your code.
        raise NotImplementedError("remember() is not implemented yet: append the transition to self.memory")

    def move(self, state, decay_step):
        """Choose an action with an epsilon-greedy policy.

        Args:
            state: current state; reshaped to ``(1, *state_size)`` for prediction.
            decay_step: step count driving the exponential decay of epsilon.

        Returns:
            The index of the chosen action.
        """
        # act randomly until there is at least one batch of experience
        if len(self.memory) < self.batch_size:
            return random.randint(0, self.action_size-1)

        # epsilon decays exponentially from epsilon_max towards epsilon_min
        self.epsilon = self.epsilon_min + (self.epsilon_max - self.epsilon_min) * np.exp(-self.epsilon_decay * decay_step)

        if (self.epsilon > np.random.rand()):
            # Make a random action (exploration)
            action_id = random.randint(0,self.action_size-1)
        else:
            # Get action from Q-network (exploitation)
            action_id = np.argmax(self.model.predict(np.reshape(state, (1, *self.state_size))))
        return action_id

    def trained_move(self, state):
        """Return the greedy action of the online network for ``state``."""
        return np.argmax(self.model.predict(np.reshape(state, (1, *self.state_size))))

    def train(self, e):
        """Fit the online network on one random batch from the replay memory.

        Does nothing while the memory holds fewer than ``batch_size`` entries.

        Args:
            e: episode index; not used inside this method.
        """
        if len(self.memory) < self.batch_size:
            return

        batch = random.sample(self.memory, self.batch_size)

        # split each elements from minibatch
        states_mb = np.array([each[0] for each in batch], ndmin=3)
        actions_mb = np.array([each[1] for each in batch])
        rewards_mb = np.array([each[2] for each in batch])
        next_states_mb = np.array([each[3] for each in batch], ndmin=3)
        dones_mb = np.array([each[4] for each in batch])

        # Get Q values for next_state using the target network
        ##### Your code here ####
        Qs_next_state = None
        #########################
        # Baseline targets: only the entry of the action actually taken is
        # overwritten below.
        # NOTE(review): the baseline comes from target_model rather than
        # self.model, so the untouched entries are not guaranteed to give zero
        # error - confirm this is intended.
        targets_mb = np.squeeze(self.target_model.predict(np.reshape(states_mb, (self.batch_size, *self.state_size))))

        # Set Q_target = r if the episode ends at s+1, otherwise set Q_target = r + gamma * maxQ(s', a')
        for i in range(self.batch_size):
            done = dones_mb[i]
            if done:
                targets_mb[i,actions_mb[i]] = rewards_mb[i]
            else:
                targets_mb[i,actions_mb[i]] = rewards_mb[i] + self.gamma * np.max(Qs_next_state[i])

        # self.loss holds whatever fit() returns (the Keras History object)
        self.loss = self.model.fit(states_mb, targets_mb, epochs = 1, verbose = False)


    def update_target_model(self, global_step):
        """Copy the online weights into the target network every ``tau_step`` steps.

        The copy happens only when ``global_step`` is a multiple of ``tau_step``
        and strictly greater than ``tau_step``.
        """
        if global_step % self.tau_step == 0 and global_step > self.tau_step:
            self.target_model.set_weights(self.model.get_weights())
            print("Update Target Network in step {}".format(global_step))

    def load_model(self, name):
        """Load weights from the file ``name`` if it exists.

        Both networks end up with the loaded weights. When the file is missing
        a message is printed and the weights are left untouched.
        """
        if os.path.isfile(name):
            self.model.load_weights(name)
            # keep the target network in step with the restored online network
            self.target_model.set_weights(self.model.get_weights())
            print("Successfully loaded model weights")
        else:
            print("Can't load the model weights!")

    def save_model(self, name):
        """Save the online network's weights to the file ``name``."""
        self.model.save_weights(name)
        print("Successfully saved model weights")


    def named_logs(self, model, logs):
        """Pair ``model.metrics_names`` with the values in ``logs`` as a dict."""
        return dict(zip(model.metrics_names, logs))

    def write_log(self, callback, names, logs, batch_no):
        """Write one scalar summary per ``(name, value)`` pair via ``callback.writer``.

        NOTE(review): ``tf.Summary`` and ``callback.writer`` look like the
        TensorFlow 1.x summary API - confirm against the installed version.
        """
        for name, value in zip(names, logs):
            summary = tf.Summary()
            summary_value = summary.value.add()
            summary_value.simple_value = value
            summary_value.tag = name
            callback.writer.add_summary(summary, batch_no)
            callback.writer.flush()

    

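Next, we define some helper functions that are useful for processing the game state.

You can change the `delta_score` values in `check_status` to adjust the reward given at each step (note that `train()` currently has the line that adds `delta_score` to the score commented out).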

In [0]:
def check_status(respawn, respawn_prev, lives):
    """Classify the player's status from the current and previous respawn values.

    Values above 4 are treated as "alive" and values below 4 as "dead, waiting
    to respawn"; a value of exactly 4 matches no case and keeps the defaults.

    Args:
        respawn: respawn value sampled now.
        respawn_prev: respawn value from the previous sample.
        lives: remaining lives; only inspected when the player was just killed.

    Returns:
        A tuple ``(run, killed, gameover, delta_score)``. ``run`` is False only
        while the player stays dead, ``killed`` is True only on the
        alive-to-dead transition, ``gameover`` is True when killed with no
        lives left, and ``delta_score`` is the per-case reward adjustment
        (currently 0 in every case).
    """
    alive_now = respawn > 4
    dead_now = respawn < 4
    alive_before = respawn_prev > 4
    dead_before = respawn_prev < 4

    # defaults: not dead
    run, killed, delta_score = True, False, 0

    if alive_now and alive_before:
        # still alive; could earn a small reward here
        delta_score = 0 # -0.1 #0.01
    elif dead_now and dead_before:
        # dead and still waiting to respawn
        run = False
        delta_score = 0
    elif alive_now and dead_before:
        # just respawned
        delta_score = 0
    elif dead_now and alive_before:
        # just hit and killed
        killed = True
        delta_score = 0 #-1000

    # losing the last life ends the game; an extra penalty could go here
    # (e.g. delta_score = -10000)
    gameover = bool(killed and lives <= 0)

    return run, killed, gameover, delta_score

def get_action(move, frame_cnt):
    """Translate the agent's move into the 12-entry button list for the game.

    Args:
        move: 0 to move left; any other value moves right.
        frame_cnt: current frame counter, used to time the shooting.

    Returns:
        A list of 12 ints (0 or 1), one per button.
    """
    buttons = [0] * 12

    # shoot on every SHOOT_FRAME-th frame
    if frame_cnt % SHOOT_FRAME == 0:
        buttons[0] = 1

    # exactly one of the two direction buttons is pressed: 6 = left, 7 = right
    direction = 6 if move == 0 else 7
    buttons[direction] = 1
    return buttons


In [0]:
def preprocess_frame(obs):
    """Crop, grey-scale, resize and rescale one raw observation.

    The first 40 rows are dropped, the image is converted to a single channel,
    resized to RESIZE x RESIZE with cubic interpolation and divided by 255.

    NOTE(review): the conversion uses COLOR_BGR2GRAY; confirm the channel order
    of the observations the environment actually returns.
    """
    cropped = obs[40:,:,:]
    gray = cv2.cvtColor(cropped, cv2.COLOR_BGR2GRAY)
    small = cv2.resize(gray, (RESIZE, RESIZE), interpolation=cv2.INTER_CUBIC)
    return small / 255.0

def frames_to_state(frame_queue):
    """Build a (RESIZE, RESIZE, FRAME_NUM) state from the frame queue.

    For each of the FRAME_NUM channels, the frame at the end of a
    SKIP_FRAME-long window and the frame just before it are merged with an
    element-wise maximum.
    """
    state = np.zeros((RESIZE, RESIZE, FRAME_NUM))
    for channel in range(FRAME_NUM):
        # index of the last frame in this channel's window
        newest = (channel + 1) * SKIP_FRAME - 1
        state[:, :, channel] = np.maximum(frame_queue[newest], frame_queue[newest - 1])
    return state

  

In [0]:
def train():
    """Train a DQNAgent on the 'Airstriker-Genesis' retro environment.

    Runs EPISODES episodes of at most MAX_STEP frames each. Every frame is
    rendered in the notebook; on every SKIP_FRAME-th frame the game status is
    checked, a transition is stored in the agent's memory and the agent is
    trained. The weights are loaded from and saved to "./airstriker_dqn.h5"
    (saved after every episode).

    The environment is always closed, even when training is interrupted or
    fails, so that the cell can be run again in the same kernel.
    """
    # create a game environment and initialize it
    env = retro.RetroEnv(game='Airstriker-Genesis', use_restricted_actions=retro.Actions.FILTERED  )
    #env.reset()

    try:
        # obtain state parameters
        state_size = (RESIZE, RESIZE, FRAME_NUM)
        action_size = ACTION_SPACE

        # create game frame container: frames_to_state() indexes up to
        # FRAME_NUM * SKIP_FRAME - 1, so the queue holds exactly that many
        # frames and is pre-filled with blank ones
        queue_len = FRAME_NUM * SKIP_FRAME
        frame_queue = deque(maxlen=queue_len)
        for _ in range(queue_len):
            frame_queue.append(np.zeros((RESIZE, RESIZE)))

        # initialize learning agent
        agent = DQNAgent(state_size, action_size, batch_size = BATCH_SIZE)
        agent.load_model("./airstriker_dqn.h5")

        # NOTE(review): decay_step is maintained below but agent.move() is
        # called with global_step - confirm which counter should drive epsilon.
        decay_step = -BATCH_SIZE


        # begin training episodes
        global_step = 0
        for e in range(EPISODES):
            # initialize the environment for every episode
            frame_0 = env.reset()
            frame_queue.append(preprocess_frame(frame_0))

            # initialize the state (with initial frame)
            state = frames_to_state(frame_queue)

            # frame counter for counting frame
            frame_cnt = 1

            # cumulated score of shooting down enemy
            score = 0
            # no. of hits
            hits = 0

            gameover = False
            killed = False
            respawn = 9
            respawn_prev = respawn
            #if e % 4 == 0:
            decay_step = -BATCH_SIZE


            while frame_cnt < MAX_STEP:
                screen = env.render(mode='rgb_array')
                ##########################
                plt.imshow(screen)
                ipythondisplay.clear_output(wait=True)
                ipythondisplay.display(plt.gcf())
                ########################

                # get action on the current frame using the last known state
                action_id = agent.move(state, global_step )
                action = get_action(action_id, frame_cnt)

                # get next observation based on the action above
                # Note: the reward obtained from env.step() in the game is not the actual reward
                # done is not used
                next_obs, reward, _, info = env.step(action)

                # append new observation into frame_queue
                frame_queue.append(preprocess_frame(next_obs))

                # hit an enemy
                if int(reward) > 0:
                    hits += 1
                    score += 0.05 #1000
                    #score *= 1.5

                # get game state for every SKIP_FRAME-th (e.g. 4) frame
                if frame_cnt % SKIP_FRAME == 0:
                    # check reward for each action taken
                    # accumulated score (reward) to be stored
                    respawn = int(info['gameover'])
                    # check game status and other penalties
                    run_next_state, killed, gameover, delta_score = check_status(respawn, respawn_prev, int(info['lives']))
                    #score += delta_score

                    respawn_prev = respawn

                    if run_next_state:
                        next_state = frames_to_state(frame_queue)

                        # save into agent's memory
                        agent.remember(state, action_id, score, next_state, killed)
                        print("Logging: episode: {}, frame_cnt:{}, score:{}, hits:{}, killed:{}, epsilon:{}, mem_len:{}, global_step:{}".format(e, frame_cnt, score, hits, killed, agent.epsilon, len(agent.memory), global_step))
                        state = next_state

                        decay_step += 1
                    else:
                        frame_queue.append(np.zeros((RESIZE, RESIZE)))
                        next_state = frames_to_state(frame_queue)
                        # save into agent's memory
                        agent.remember(state, action_id, score, next_state, killed)
                        # reset the hit counter when losing one life
                        hits = 0
                        score = 0

                    # get experience replay and train agent's and target's model
                    agent.train(e)
                    agent.update_target_model(global_step)

                    global_step += 1

                    # lost all lives. Restart game immediately
                    if gameover:
                        # now() must be called; formatting the bare method
                        # prints its repr instead of a timestamp
                        print("{} episode: {}/{}, reward: {}, e: {:.2}" .format(datetime.datetime.now(), e, EPISODES, score, agent.epsilon))
                        break

                frame_cnt += 1

            #if e % 10 == 0 and e > 10:
            agent.save_model("./airstriker_dqn.h5")
    finally:
        # always release the emulator, also on KeyboardInterrupt or an error
        env.close()

    ipythondisplay.clear_output(wait=True)
    display.stop()

In [0]:
def play():
    """Let a DQNAgent play 'Airstriker-Genesis' using its greedy policy.

    Loads the weights from "./airstriker_dqn.h5" and plays EPISODES episodes,
    each until check_status() reports game over. The action, reward and info
    of every frame are printed.

    The environment is always closed, even when playing is interrupted or
    fails, so that the cell can be run again in the same kernel.
    """
    # create a game environment and initialize it
    env = retro.RetroEnv(game='Airstriker-Genesis', use_restricted_actions=retro.Actions.FILTERED  )

    try:
        screen = env.render(mode='rgb_array')
        ##########################
        plt.imshow(screen)
        ipythondisplay.clear_output(wait=True)
        ipythondisplay.display(plt.gcf())
        ########################

        # obtain state parameters
        state_size = (RESIZE, RESIZE, FRAME_NUM)
        action_size = ACTION_SPACE

        # create game frame container: frames_to_state() indexes up to
        # FRAME_NUM * SKIP_FRAME - 1, so the queue holds exactly that many
        # frames and is pre-filled with blank ones
        queue_len = FRAME_NUM * SKIP_FRAME
        frame_queue = deque(maxlen=queue_len)
        for _ in range(queue_len):
            frame_queue.append(np.zeros((RESIZE, RESIZE)))

        # initialize the agent with the trained weights
        agent = DQNAgent(state_size, action_size)
        agent.load_model("./airstriker_dqn.h5")

        # begin playing episodes
        for e in range(EPISODES):
            # initialize the environment for every episode
            frame_0 = env.reset()
            frame_queue.append(preprocess_frame(frame_0))

            # initialize the state (with initial frame)
            state = frames_to_state(frame_queue)

            # frame counter for counting frame
            frame_cnt = 1

            # NOTE(review): unlike train(), respawn_prev keeps this initial
            # value for the whole episode - confirm that this is intended.
            respawn_prev = 9

            while True:
                env.render()

                # get action on the current frame using the last known state
                action_id = agent.trained_move(state)
                action = get_action(action_id, frame_cnt)

                # get next observation based on the action above
                next_obs, reward, _, info = env.step(action)
                print(frame_cnt, ": ", action_id, reward, "  ",  info)

                respawn = int(info['gameover'])
                # only the game-over flag of the status is needed here
                _, _, gameover, _ = check_status(respawn, respawn_prev, int(info['lives']))

                # append new observation into frame_queue
                frame_queue.append(preprocess_frame(next_obs))
                state = frames_to_state(frame_queue)

                if gameover:
                    # now() must be called; formatting the bare method prints
                    # its repr instead of a timestamp
                    print("{} episode: {}/{}" .format(datetime.datetime.now(), e, EPISODES))
                    break

                frame_cnt += 1
    finally:
        # always release the emulator, also on KeyboardInterrupt or an error
        env.close()

    ipythondisplay.clear_output(wait=True)
    display.stop()

In [0]:
# Entry point: start training when this cell/script runs as the main module.
if __name__ == "__main__":
    train()