In [None]:
import tensorflow as tf
import numpy as np

In [None]:
# Configure GPU visibility and memory growth; does nothing when no GPU is listed.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Restrict TensorFlow to only use the first listed GPU (index 0)
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')

        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

In [None]:
import os
# Must be set before the game library is imported below; the dummy video
# driver keeps a pop-up game window from appearing.
os.environ["SDL_VIDEODRIVER"] = "dummy"
from ple.games.flappybird import FlappyBird
from ple import PLE

# `game` is the raw FlappyBird instance; `env` is the PLE wrapper used to act
# on it and to read rewards and screens.
game = FlappyBird(pipe_gap = 300)
env = PLE(game, fps=30, display_screen=False)  # environment interface to game
env.reset_game()

In [None]:
# Define Input Size (target shape used by preprocess_screen)
IMG_WIDTH = 84
IMG_HEIGHT = 84
# NOTE(review): NUM_STACK is not referenced by the visible code;
# frames_to_state hard-codes a stack of 4 frames.
NUM_STACK = 4
# For Epsilon-greedy: lower bound applied in Agent.update_parameters
MIN_EXPLORING_RATE = 0.01
# NOTE(review): max_step_forward and BATCH_SIZE are not referenced by the
# visible code - confirm whether they are still needed.
max_step_forward = 10

BATCH_SIZE = 1

In [None]:
class Actor(tf.keras.Model):
    """Policy network: one 256-unit ReLU hidden layer followed by 2 output units.

    The outputs are unnormalised; callers apply the softmax themselves.
    """

    def __init__(self):
        super().__init__()
        self.dense1 = tf.keras.layers.Dense(256, activation='relu')
        self.dense2 = tf.keras.layers.Dense(2)

    def call(self, inputs):
        """Return the output-layer activations for a batch of inputs."""
        hidden = self.dense1(inputs)
        return self.dense2(hidden)

In [None]:
class Critic(tf.keras.Model):
    """Value network: one 256-unit ReLU hidden layer followed by a single output unit."""

    def __init__(self):
        super().__init__()
        self.dense1 = tf.keras.layers.Dense(256, activation='relu')
        self.dense2 = tf.keras.layers.Dense(1)

    def call(self, inputs):
        """Return one scalar output per row of `inputs`."""
        hidden = self.dense1(inputs)
        return self.dense2(hidden)

In [None]:
import copy

class Agent:
    """Advantage actor-critic agent built from an `Actor` and a `Critic` network.

    Attributes:
        name: label given at construction (informational only).
        exploring_rate: probability of taking a uniformly random action.
        discount_factor: reward discount used when building return targets.
        num_action: number of actions sampled from during random exploration.
        actor, critic: the policy and value networks.
        actor_ckpt, critic_ckpt: checkpoint directory paths (strings).
        actor_manager, critic_manager: `tf.train.CheckpointManager` objects
            that write/read checkpoints inside those directories.
    """

    def __init__(self, name, num_action, discount_factor=0.99):
        self.name = name
        self.exploring_rate = 0.1
        self.discount_factor = discount_factor
        self.num_action = num_action
        self.actor = Actor()
        self.critic = Critic()

        checkpoint_dir = './checkpoint'

        # Directory paths; kept as plain strings under their original names.
        self.actor_ckpt = os.path.join(checkpoint_dir, "actor_ckpt")
        self.critic_ckpt = os.path.join(checkpoint_dir, "critic_ckpt")

        # Checkpoint objects track the networks; the managers number the saved
        # files and remember which one is the latest. max_to_keep=None keeps
        # every checkpoint instead of pruning old ones.
        self._actor_checkpoint = tf.train.Checkpoint(model=self.actor)
        self._critic_checkpoint = tf.train.Checkpoint(model=self.critic)
        self.actor_manager = tf.train.CheckpointManager(
            self._actor_checkpoint, self.actor_ckpt,
            max_to_keep=None, checkpoint_name='model')
        self.critic_manager = tf.train.CheckpointManager(
            self._critic_checkpoint, self.critic_ckpt,
            max_to_keep=None, checkpoint_name='model')

    def save_model(self, ep):
        """Write one checkpoint per network, numbered with episode `ep`."""
        self.actor_manager.save(checkpoint_number=ep)
        self.critic_manager.save(checkpoint_number=ep)

    def restore_model(self):
        """Load the most recent actor and critic checkpoints.

        Raises:
            FileNotFoundError: if either checkpoint directory holds no
                checkpoint, so a failed restore is never silently ignored.
        """
        restore_plan = (
            (self._actor_checkpoint, self.actor_manager, self.actor_ckpt),
            (self._critic_checkpoint, self.critic_manager, self.critic_ckpt),
        )
        for checkpoint, manager, directory in restore_plan:
            latest = manager.latest_checkpoint
            if latest is None:
                raise FileNotFoundError(
                    'no checkpoint found in {}'.format(directory))
            checkpoint.restore(latest)

    def td_target(self, memory, next_state, ternimal):
        """Return the discounted return target for every stored step.

        Args:
            memory: object whose `rewards` list holds the per-step rewards.
            next_state: state following the last stored step; only evaluated
                by the critic when the rollout did not end the episode.
            ternimal: truthy when the last stored step ended the episode.

        Returns:
            A list of Python numbers, one per stored reward, oldest first.
        """
        if ternimal:
            v_value = 0.  # terminal
        else:
            # Reduce the critic output to a plain float. This keeps every
            # target a scalar (so `advantage` gets a clean column vector) and
            # treats the bootstrap value as a constant.
            v_value = float(np.asarray(self.critic(next_state)).reshape(-1)[0])

        # Accumulate returns from the last step backwards.
        discounted_rewards = []
        for reward in memory.rewards[::-1]:  # reverse buffer r
            v_value = reward + self.discount_factor * v_value
            discounted_rewards.append(v_value)

        discounted_rewards.reverse()
        return discounted_rewards

    def advantage(self, td_targets, baselines):
        """Return `td_targets - baselines` with the targets as a float32 column."""
        targets = np.asarray(td_targets, dtype=np.float32).reshape(-1, 1)
        return tf.convert_to_tensor(targets, dtype=tf.float32) - baselines

    def loss(self, memory, next_state, ternimal):
        """Return the scalar actor-critic loss for the transitions in `memory`.

        The loss is the mean over steps of
        `0.5 * advantage**2 + cross_entropy * advantage - 0.01 * entropy`,
        where the advantage multiplying the cross-entropy carries no gradient.
        """
        states = np.vstack(memory.states)
        td_target = self.td_target(memory, next_state, ternimal)

        # Drop the trailing unit axis so every per-step term below is a flat
        # vector; mixing a column with a flat vector would broadcast to a
        # square matrix and pair each action with every step's advantage.
        advantage = tf.squeeze(
            self.advantage(td_target, self.critic(states)), axis=-1)

        critic_loss = advantage ** 2

        logits = self.actor(states)
        actions_one_hot = tf.one_hot(
            memory.actions, logits.shape[-1], dtype=tf.float32)

        policy = tf.nn.softmax(logits)
        # Shannon entropy (non-negative); subtracting it from the loss below
        # rewards less deterministic policies.
        entropy = -tf.reduce_sum(policy * tf.math.log(policy + 1e-20), axis=1)

        actor_loss = tf.compat.v1.nn.softmax_cross_entropy_with_logits_v2(
            labels=actions_one_hot, logits=logits)
        actor_loss *= tf.stop_gradient(advantage)
        actor_loss -= 0.01 * entropy

        total_loss = tf.reduce_mean(0.5 * critic_loss + actor_loss)
        return total_loss

    def select_action(self, state):
        """Return an action index for `state`.

        With probability `exploring_rate` the action is drawn uniformly from
        `num_action` choices; otherwise it is sampled from the softmax of the
        actor's output.
        """
        if np.random.rand() < self.exploring_rate:
            return np.random.choice(self.num_action)  # Select a random action

        batch = np.expand_dims(np.array(state), axis=0)
        probabilities = tf.nn.softmax(self.actor(batch)).numpy()[0]
        return np.random.choice(len(probabilities), p=probabilities)

    def get_state_idx(self, state):
        """Return the game-state values as a list with pipe heights made relative.

        The four pipe top/bottom entries have `player_y` subtracted. The input
        dict is copied first, so the caller's dict is left unchanged. Values
        are returned in the dict's own key order.
        """
        # instead of using absolute position of pipe, use relative position
        state = copy.deepcopy(state)
        for key in ('next_next_pipe_bottom_y', 'next_next_pipe_top_y',
                    'next_pipe_bottom_y', 'next_pipe_top_y'):
            state[key] -= state['player_y']

        return list(state.values())

    def update_parameters(self, episode):
        """Set the exploring rate from `episode`, kept within [MIN_EXPLORING_RATE, 0.5]."""
        self.exploring_rate = max(MIN_EXPLORING_RATE, min(0.5, 0.99**((episode) / 30)))

    def shutdown_explore(self):
        """Disable random exploration; actions are then always sampled from the actor."""
        self.exploring_rate = 0

In [None]:
# init agent
num_action = len(env.getActionSet())

# agent for frequently updating
online_agent = Agent('online', num_action)

# agent for slow updating
target_agent = Agent('target', num_action)

# The networks are subclassed Keras models, which only create their variables
# on the first call. Run one forward pass on a real game state so that the
# weight copy below has something to copy; before that, get_weights() returns
# an empty list and the synchronisation silently does nothing.
_probe_state = np.array(
    [online_agent.get_state_idx(game.getGameState())], dtype=np.float32)
for _agent in (online_agent, target_agent):
    _agent.actor(_probe_state)
    _agent.critic(_probe_state)

# synchronize target model's weight with online model's weight
target_agent.actor.set_weights(online_agent.actor.get_weights())
target_agent.critic.set_weights(online_agent.critic.get_weights())

In [None]:
# Single Adam optimizer; train_step applies it to the actor and critic variables together.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
# Running mean of the training loss; read and reset by the training loop when it logs.
average_loss = tf.keras.metrics.Mean(name='loss')

#@tf.function
def train_step(mem, next_state, ternimal):
    """Apply one gradient update to the online agent's actor and critic.

    Args:
        mem: rollout buffer handed to `online_agent.loss`.
        next_state: state following the last stored step.
        ternimal: whether the last stored step ended the episode.
    """
    with tf.GradientTape() as tape:
        step_loss = online_agent.loss(mem, next_state, ternimal)

    # Collected after the forward pass on purpose: the networks create their
    # variables on first call, so reading them earlier could miss some.
    params = [
        *online_agent.actor.trainable_variables,
        *online_agent.critic.trainable_variables,
    ]
    grads = tape.gradient(step_loss, params)
    optimizer.apply_gradients(zip(grads, params))
    average_loss.update_state(step_loss)

In [None]:
import moviepy.editor as mpy

def make_anim(images, fps=60, true_image=False):
    """Build a moviepy `VideoClip` that plays `images` at `fps` frames per second.

    Args:
        images: non-empty sequence of frames (arrays supporting `astype`).
        fps: playback rate; also determines the clip duration.
        true_image: when True frames are only cast to uint8; otherwise they
            are mapped with `(x + 1) / 2 * 255` before the cast.

    Returns:
        The clip, with its `fps` attribute set.

    Raises:
        ValueError: if `images` is empty.
    """
    num_frames = len(images)
    if num_frames == 0:
        raise ValueError("make_anim needs at least one frame")
    duration = num_frames / fps

    def make_frame(t):
        # Clamp to the last frame for timestamps at or past the end of the
        # clip instead of hiding every possible error behind a bare except.
        index = min(int(num_frames / duration * t), num_frames - 1)
        x = images[index]

        if true_image:
            return x.astype(np.uint8)
        return ((x + 1) / 2 * 255).astype(np.uint8)

    clip = mpy.VideoClip(make_frame, duration=duration)
    clip.fps = fps
    return clip

In [None]:
import skimage.transform

def preprocess_screen(screen):
    """Resize `screen` to shape [IMG_WIDTH, IMG_HEIGHT, 1] with skimage and return it."""
    target_shape = [IMG_WIDTH, IMG_HEIGHT, 1]
    return skimage.transform.resize(screen, target_shape)

def frames_to_state(input_frames):
    """Concatenate four frames along the last axis.

    When fewer than four frames are available they are repeated to fill the
    stack: one frame is used four times, two frames twice each, and with
    three frames the newest is used twice. Otherwise the last four are used.
    """
    available = len(input_frames)
    if available == 1:
        stack = [input_frames[0]] * 4
    elif available == 2:
        oldest, newest = input_frames[0], input_frames[1]
        stack = [oldest, oldest, newest, newest]
    elif available == 3:
        stack = list(input_frames) + [input_frames[2]]
    else:
        stack = input_frames[-4:]

    return np.concatenate(stack, axis=-1)

In [None]:
class Memory:
    """Rollout buffer holding parallel lists of states, actions and rewards."""

    def __init__(self):
        self.states, self.actions, self.rewards = [], [], []

    def store(self, state, action, reward):
        """Append one transition to the end of the buffer."""
        for bucket, item in ((self.states, state),
                             (self.actions, action),
                             (self.rewards, reward)):
            bucket.append(item)

    def clear(self):
        """Drop everything stored so far by rebinding fresh, empty lists."""
        self.states, self.actions, self.rewards = [], [], []

In [None]:
def reward_function(reward, state):
    """Rescale the raw environment reward.

    0 becomes 0.1, 1 becomes 100 and -5 becomes -500; any other value is
    returned unchanged. `state` is accepted but not used: an earlier shaping
    term based on state[3] / state[4] is disabled.
    """
    if reward == -5:
        return -500
    if reward == 1:
        return 100
    if reward == 0:
        return 0.1
    return reward

In [None]:
# Longest episode length (in steps) seen at a logging episode; the training
# loop saves a checkpoint whenever a logged episode matches or beats it.
max_t = 0

In [None]:
from IPython.display import Image, display

update_every_iteration = 1000
print_every_episode = 500
save_video_every_episode = 500
NUM_EPISODE = 2000000
NUM_EXPLORE = 20

# The recorded clips are written into movie_f/; create it up front so the
# first video write cannot fail because the directory is missing.
os.makedirs("movie_f", exist_ok=True)

iter_num = 0
# NOTE(review): exploration is switched off here and never re-enabled (the
# update_parameters call below is commented out), so every action is sampled
# from the actor's softmax output - confirm this is intended.
online_agent.shutdown_explore()
mem = Memory()
for episode in range(0, NUM_EPISODE + 1):

    # Reset the environment
    env.reset_game()

    # record frame
    if episode % save_video_every_episode == 0:
        frames = [env.getScreenRGB()]

    # input frame
    # NOTE(review): input_frames is filled on every step but never read by the
    # visible code - confirm whether it can be dropped.
    input_frames = [preprocess_screen(env.getScreenGrayscale())]

    # for every 500 episodes, shutdown exploration to see the performance of greedy action
    if episode % print_every_episode == 0:
        online_agent.shutdown_explore()

    # cumulate reward for this episode
    cum_reward = 0

    t = 0
    mem.clear()
    while not env.game_over():
        state = game.getGameState()
        state = online_agent.get_state_idx(state)

        # feed current state and select an action
        action = online_agent.select_action(state)
        # execute the action and get reward
        reward = env.act(env.getActionSet()[action])
        reward = reward_function(reward, state)
        # record frame
        if episode % save_video_every_episode == 0:
            frames.append(env.getScreenRGB())

        # record input frame
        input_frames.append(preprocess_screen(env.getScreenGrayscale()))

        # cumulate reward
        cum_reward += reward

        # observe the result
        state_prime = game.getGameState()
        state_prime = online_agent.get_state_idx(state_prime)

        terminal = env.game_over()

        # convert Python object to Tensor to prevent graph re-tracing
        train_states = np.array([state])
        train_states_prime = np.array([state_prime])

        train_states_prime = tf.convert_to_tensor(train_states_prime, tf.float32)
        terminal = tf.convert_to_tensor(terminal, tf.bool)

        mem.store(train_states, action, reward)
        # One update after the 26th step (t == 25) and one when the episode
        # ends. The buffer is not cleared in between, so the final update
        # covers the whole episode.
        if t == 25 or terminal:
            train_step(mem, train_states_prime, terminal)

        # Setting up for the next iteration
        state = state_prime
        t += 1
    # update exploring rate
    #online_agent.update_parameters(episode)

    if episode % print_every_episode == 0 and episode > NUM_EXPLORE:
        # keep a checkpoint whenever a logged episode lasts at least as long as the best so far
        if t >= max_t:
            online_agent.save_model(episode)
            max_t = t

        print('max_t: ', max_t)
        print(
            "[{}] time live:{}, cumulated reward: {}, exploring rate: {}, average loss: {}".
            format(episode, t, cum_reward, online_agent.exploring_rate, average_loss.result()))
        average_loss.reset_states()

    if episode % save_video_every_episode == 0:  # for every 500 episode, record an animation
        clip = make_anim(frames, fps=60, true_image=True).rotate(-90)
        clip.write_videofile("movie_f/DQN_demo-{}.webm".format(episode), fps=60)
        display(clip.ipython_display(fps=60, autoplay=1, loop=1, maxduration=120))

In [None]:
# test result: play one episode with exploration off and record it as a video
env.reset_game()
frames = [env.getScreenRGB()]

online_agent.shutdown_explore()
#online_agent.restore_model()
while not env.game_over():
    state = game.getGameState()
    state = online_agent.get_state_idx(state)

    action = online_agent.select_action(state)
    #action = 1
    reward = env.act(env.getActionSet()[action])
    reward = reward_function(reward, state)

    #print(state[3], state[4])
    #print(reward)

    frames.append(env.getScreenRGB())

    state_prime = game.getGameState()
    state_prime = online_agent.get_state_idx(state_prime)
    state = state_prime

# Make sure the output directory exists before writing the clip.
os.makedirs("movie_f", exist_ok=True)

clip = make_anim(frames, fps=60, true_image=True).rotate(-90)
# NOTE: `episode` is the loop variable left over from the training cell, so
# this cell only works after that cell has run.
clip.write_videofile("movie_f/DQN_demo-{}.webm".format(episode), fps=60)
display(clip.ipython_display(fps=60, autoplay=1, loop=1, maxduration=120))

In [None]:
# Sanity check: print the raw game-state dict, then the relative-position
# list that Agent.get_state_idx derives from it.
env.reset_game()

state = game.getGameState()
print(state)
state = online_agent.get_state_idx(state)
print(state)