In [1]:
from ple.games.flappybird import FlappyBird
from ple import PLE
import matplotlib.pyplot as plt
import os
import numpy as np

%matplotlib inline
os.environ["SDL_VIDEODRIVER"] = "dummy"  # this line disable pop-out window
game = FlappyBird()
env = PLE(game, fps=30, display_screen=False)  # environment interface to game
env.reset_game()
# return a dictionary whose key is action description and value is action index
print(game.actions)
# return a list of action index (include None)
print(env.getActionSet())

pygame 2.0.1 (SDL 2.0.14, Python 3.7.9)
Hello from the pygame community. https://www.pygame.org/contribute.html
couldn't import doomish
Couldn't import doom
{'up': 119}
[119, None]


In [2]:
# a dictionary describe state
'''
    player y position.
    players velocity.
    next pipe distance to player
    next pipe top y position
    next pipe bottom y position
    next next pipe distance to player
    next next pipe top y position
    next next pipe bottom y position
'''
game.getGameState()  

{'player_y': 256,
 'player_vel': 0,
 'next_pipe_dist_to_player': 309.0,
 'next_pipe_top_y': 144,
 'next_pipe_bottom_y': 244,
 'next_next_pipe_dist_to_player': 453.0,
 'next_next_pipe_top_y': 160,
 'next_next_pipe_bottom_y': 260}

In [3]:
num_action = len(env.getActionSet())
bucket_range_per_feature = {
  'next_next_pipe_bottom_y': 40,
  'next_next_pipe_dist_to_player': 512,
  'next_next_pipe_top_y': 40,
  'next_pipe_bottom_y': 20,
  'next_pipe_dist_to_player': 20,
  'next_pipe_top_y': 20,
  'player_vel': 4,
  'player_y': 16
}
# init agent
# agent = Agent(bucket_range_per_feature, num_action)

In [4]:
import multiprocessing
import threading

In [5]:
multiprocessing.cpu_count()

12

In [6]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
class ActorCriticModel(keras.Model):
    def __init__(self, state_size, action_size):
        super(ActorCriticModel, self).__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.dense1 = layers.Dense(256, activation='relu')
        self.policy_logits = layers.Dense(action_size)
        self.dense2 = layers.Dense(256, activation='relu')
        self.values = layers.Dense(1)

    def call(self, inputs):
        # Forward pass
        x = self.dense1(inputs)
        logits = self.policy_logits(x)
        v1 = self.dense2(inputs)
        values = self.values(v1)
        return logits, values

In [7]:
# import skimage.transform
import moviepy.editor as mpy
import copy
def preprocess_state(state):
    
    # return  np.expand_dims(np.array(list(state.values()), dtype=np.float32), axis=-1)
    return TA_state(state)

def TA_state(state):
    
    state['next_next_pipe_bottom_y'] -= state['player_y']
    state['next_next_pipe_top_y'] -= state['player_y']
    state['next_pipe_bottom_y'] -= state['player_y']
    state['next_pipe_top_y'] -= state['player_y']
    relative_state = list(state.values())


    # return the state in tensor type, with batch dimension
    relative_state = tf.convert_to_tensor(relative_state, dtype=tf.float32)
    relative_state = tf.expand_dims(relative_state, axis=0)
    
    return relative_state

def record(episode,
           episode_reward,
           worker_idx,
           global_ep_reward,
           result_queue,
           total_loss,
           num_steps):
    """Helper function to store score and print statistics.
    Args:
    episode: Current episode
    episode_reward: Reward accumulated over the current episode
    worker_idx: Which thread (worker)
    global_ep_reward: The moving average of the global reward
    result_queue: Queue storing the moving average of the scores
    total_loss: The total loss accumualted over the current episode
    num_steps: The number of steps the episode took to complete
    """
    if global_ep_reward == 0:
        global_ep_reward = episode_reward
    else:
        global_ep_reward = global_ep_reward * 0.99 + episode_reward * 0.01
    print(
      f"Episode: {episode} | "
      f"Moving Average Reward: {global_ep_reward:0.2f} | "
      f"Episode Reward: {episode_reward:0.2f} | "
      f"True Reward: {(episode_reward - num_steps*0.1):0.2f} |"
      f"Loss: {int(total_loss / float(num_steps) * 1000) / 1000} | "
      f"Steps: {num_steps} | "
      f"Worker: {worker_idx}"
      )
    result_queue.put(global_ep_reward)
    return global_ep_reward


def make_anim(images, fps=60, true_image=False):
    duration = len(images) / fps

    def make_frame(t):
        try:
            x = images[int(len(images) / duration * t)]
        except:
            x = images[-1]

        if true_image:
            return x.astype(np.uint8)
        else:
            return ((x + 1) / 2 * 255).astype(np.uint8)

    clip = mpy.VideoClip(make_frame, duration=duration)
    clip.fps = fps
    return clip

In [8]:
preprocess_state(env.getGameState())

<tf.Tensor: shape=(1, 8), dtype=float32, numpy=
array([[ 256.,    0.,  309., -112.,  -12.,  453.,  -96.,    4.]],
      dtype=float32)>

In [9]:
from queue import Queue
class MasterAgent():
    def __init__(self, start_from_old=False, model_name=None):
        self.game_name = 'FlappyBird'
        save_dir = SAVE_DIR
        self.save_dir = save_dir
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        game = FlappyBird()
        env = PLE(game, fps=30, display_screen=False)
        self.state_size = len(game.getGameState())
        self.action_size = len(env.getActionSet())
        self.opt = tf.compat.v1.train.AdamOptimizer(LR, use_locking=True)
        print(self.state_size, self.action_size)

        self.global_model = ActorCriticModel(self.state_size, self.action_size)  # global network
        self.global_model(tf.convert_to_tensor(np.random.random((1, self.state_size)), dtype=tf.float32))
        if start_from_old:
            model_path = os.path.join(self.save_dir, 'model_{}_final.h5'.format(self.game_name))
            if model_name:
                model_path = os.path.join(self.save_dir, model_name)
            print('Loading model from: {}'.format(model_path))
            self.global_model.load_weights(model_path)
        
    def train(self):
        res_queue = Queue()

        workers = [Worker(self.state_size,
                          self.action_size,
                          self.global_model,
                          self.opt, res_queue,
                          i, game_name=self.game_name,
                          save_dir=self.save_dir) for i in range(multiprocessing.cpu_count())]

        for i, worker in enumerate(workers):
            print("Starting worker {}".format(i))
            worker.start()

        moving_average_rewards = []  # record episode reward to plot
        while True:
            reward = res_queue.get()
            if reward is not None:
                moving_average_rewards.append(reward)
            else:
                break
            
            if Worker.global_episode % 500 == 0:
                with Worker.save_lock:
                    print("Saving checkpoint model to {}, "
                          "episode score: {}".format(self.save_dir, reward))
                    self.global_model.save_weights(
                        os.path.join(self.save_dir,
                        'model_{}.h5'.format(self.game_name))
                    )
        [w.join() for w in workers]

        self.global_model.save_weights(
            os.path.join(self.save_dir,
            'model_{}_final.h5'.format(self.game_name))
        )
        
        plt.plot(moving_average_rewards)
        plt.ylabel('Moving average ep reward')
        plt.xlabel('Step')
        plt.savefig(os.path.join(self.save_dir,
                                 '{} Moving Average.png'.format(self.game_name)))
        plt.show()
        
    def play(self):
        game = FlappyBird(pipe_gap=PIPE_GAP)
        env = PLE(game, fps=30, display_screen=False)
        env.reset_game()
        state = preprocess_state(game.getGameState())
        model = self.global_model
        model_path = os.path.join(self.save_dir, 'model_{}.h5'.format(self.game_name))
        print('Loading model from: {}'.format(model_path))
        model.load_weights(model_path)
        done = False
        step_counter = 0
        reward_sum = 0

        frames = []
        
        try:
            while not done and step_counter < 1000:
                frames.append(env.getScreenRGB())
                policy, value = model(state)
                policy = tf.nn.softmax(policy)
                action = np.argmax(policy)
                
                reward = env.act(env.getActionSet()[action])
                state = preprocess_state(game.getGameState())
                done = env.game_over()
                
                reward_sum += reward
                print(f"{step_counter}. Reward: {reward_sum}, action: {action}, policy: {policy}")
                step_counter += 1
        except KeyboardInterrupt:
              print("Received Keyboard Interrupt. Shutting down.")
        clip = make_anim(frames, fps=60, true_image=True).rotate(-90)
        # clip.write_videofile("movie_f/DQN_demo-{}.webm".format(episode), fps=60)
        display(clip.ipython_display(fps=60, autoplay=1, loop=1, maxduration=120))

In [10]:
class Memory:
    def __init__(self):
        self.states = []
        self.actions = []
        self.rewards = []
    
    def store(self, state, action, reward):
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)
    
    def clear(self):
        self.states = []
        self.actions = []
        self.rewards = []

In [26]:
MAX_EPS = 1000
PIPE_GAP = 100
UPDATE_FREQ = 25
SAVE_DIR = './gap_change'
LR = 1e-5
GAMMA = 0.99

In [27]:
import matplotlib.pyplot as plt
from PIL import Image
class Worker(threading.Thread):
    # Set up global variables across different threads
    global_episode = 0
    # Moving average reward
    global_moving_average_reward = 0
    best_score = 0
    save_lock = threading.Lock()

    def __init__(self,
               state_size,
               action_size,
               global_model,
               opt,
               result_queue,
               idx,
               game_name='FlappyBird',
               save_dir='/tmp'):
        super(Worker, self).__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.result_queue = result_queue
        self.global_model = global_model
        self.opt = opt
        self.local_model = ActorCriticModel(self.state_size, self.action_size)
        self.worker_idx = idx
        self.game_name = game_name
        self.game = FlappyBird(pipe_gap=PIPE_GAP)
        self.env = PLE(self.game, fps=30, display_screen=False)
        self.save_dir = save_dir
        self.ep_loss = 0.0
        
    
    def run(self):
        total_step = 1
        mem = Memory()
        while Worker.global_episode < MAX_EPS:
            self.env.reset_game()
            current_state = preprocess_state(self.game.getGameState())
            mem.clear()
            ep_reward = 0.
            ep_steps = 0
            self.ep_loss = 0

            time_count = 0
            done = False

            while not done:
                # plt.imshow(Image.fromarray(self.env.getScreenRGB()))
                # plt.show()
                logits, _ = self.local_model(current_state)
                probs = tf.nn.softmax(logits)
                # print(probs)
                action = np.random.choice(self.action_size, p=probs.numpy()[0])
                
                # print(time_count, current_state,  action)
                
                reward = self.env.act(self.env.getActionSet()[action])
                new_state = preprocess_state(self.game.getGameState())
                done = self.env.game_over()
                # new_state, reward, done, _ = self.env.act(self.env.getActionSet()[action])
                # if done:
                #     reward = -5
                if not done:
                    alive_bouns = 0.1
                else:
                    alive_bouns = 0.0
                ep_reward += reward + alive_bouns
                mem.store(current_state, action, reward)

                if time_count == UPDATE_FREQ or done:
                  # Calculate gradient wrt to local model. We do so by tracking the
                  # variables involved in computing the loss by using tf.GradientTape
                    with tf.GradientTape() as tape:
                        total_loss = self.compute_loss(done,
                                                   new_state,
                                                   mem,
                                                   GAMMA)
                    self.ep_loss += total_loss
                    # Calculate local gradients
                    grads = tape.gradient(total_loss, self.local_model.trainable_weights)
                    # Push local gradients to global model
                    self.opt.apply_gradients(zip(grads,
                                               self.global_model.trainable_weights))
                    # Update local model with new weights
                    self.local_model.set_weights(self.global_model.get_weights())

                    mem.clear()
                    time_count = 0

                    if done:  # done and print information
                        Worker.global_moving_average_reward = \
                        record(Worker.global_episode, ep_reward, self.worker_idx,
                             Worker.global_moving_average_reward, self.result_queue,
                             self.ep_loss, ep_steps)
                        # We must use a lock to save our model and to print to prevent data races.
                        if ep_reward > Worker.best_score:
                            with Worker.save_lock:
                                print("Saving best model to {}, "
                                      "episode score: {}".format(self.save_dir, ep_reward))
                                self.global_model.save_weights(
                                    os.path.join(self.save_dir,
                                    'model_{}.h5'.format(self.game_name))
                                )
                                Worker.best_score = ep_reward
                        Worker.global_episode += 1
                ep_steps += 1

                time_count += 1
                current_state = new_state
                total_step += 1
        self.result_queue.put(None)
    def compute_loss(self,
                   done,
                   new_state,
                   memory,
                   gamma=0.99):
        # print(new_state, done)
        if done:
            reward_sum = 0.  # terminal
        else:
            _, reward_sum = self.local_model(
                tf.convert_to_tensor([new_state],
                                     dtype=tf.float32))
            # reward_sum = reward_sum.numpy()[0]
        # print(reward_sum)
        # Get discounted rewards
        discounted_rewards = []
        for reward in memory.rewards[::-1]:  # reverse buffer r
            reward_sum = reward + gamma * reward_sum
            discounted_rewards.append(reward_sum)
        discounted_rewards.reverse()

        logits, values = self.local_model(
            tf.convert_to_tensor(np.vstack(memory.states),
                                 dtype=tf.float32))
        # Get our advantages
        advantage = tf.convert_to_tensor(np.array(discounted_rewards)[:, None],
                                dtype=tf.float32) - values
        # Value loss
        value_loss = advantage ** 2

        # Calculate our policy loss
        actions_one_hot = tf.one_hot(memory.actions, self.action_size, dtype=tf.float32)

        policy = tf.nn.softmax(logits)
        entropy = tf.reduce_sum(policy * tf.math.log(policy + 1e-20), axis=1)

        policy_loss = tf.compat.v1.nn.softmax_cross_entropy_with_logits_v2(labels=actions_one_hot,
                                                                 logits=logits)
        policy_loss *= tf.stop_gradient(advantage)
        policy_loss -= 0.01 * entropy
        total_loss = tf.reduce_mean((0.5 * value_loss + policy_loss))
        return total_loss

In [28]:

test_global_model = ActorCriticModel(8, 2)  # global network
# test_global_model(tf.convert_to_tensor(np.random.random((1, 8)), dtype=tf.float32))
optimizer = tf.compat.v1.train.AdamOptimizer(LR, use_locking=True)
q = Queue()
test_worker = Worker(8,
                     2,
                     test_global_model,
                     optimizer, q,
                     0, game_name='FlappyBird',
                      save_dir=SAVE_DIR)

In [29]:
Worker.global_episode = 0

In [None]:

master = MasterAgent(True, model_name='model_FlappyBird_The_Best_One_gap_125.h5')
master.train()

8 2
Loading model from: ./gap_change\model_FlappyBird_The_Best_One_gap_125.h5
Starting worker 0
Starting worker 1
Starting worker 2
Starting worker 3
Starting worker 4
Starting worker 5
Starting worker 6
Starting worker 7
Starting worker 8
Starting worker 9
Starting worker 10
Starting worker 11
Episode: 0 | Moving Average Reward: 20.00 | Episode Reward: 20.00 | True Reward: -1.00 |Loss: 0.524 | Steps: 210 | Worker: 5
Saving best model to ./gap_change, episode score: 20.00000000000007
Episode: 0 | Moving Average Reward: 20.00 | Episode Reward: 20.00 | True Reward: -1.00 |Loss: 1.581 | Steps: 210 | Worker: 2
Saving checkpoint model to ./gap_change, episode score: 20.00000000000007
Episode: 1 | Moving Average Reward: 20.00 | Episode Reward: 20.00 | True Reward: -1.00 |Loss: 0.914 | Steps: 210 | Worker: 6
Saving best model to ./gap_change, episode score: 20.00000000000007
Episode: 2 | Moving Average Reward: 20.00 | Episode Reward: 20.00 | True Reward: -1.00 |Loss: 0.203 | Steps: 210 | Work

Episode: 56 | Moving Average Reward: 29.16 | Episode Reward: 24.60 | True Reward: 0.00 |Loss: 0.043 | Steps: 246 | Worker: 9
Episode: 57 | Moving Average Reward: 30.49 | Episode Reward: 163.00 | True Reward: 29.00 |Loss: 0.016 | Steps: 1340 | Worker: 3
Episode: 58 | Moving Average Reward: 31.82 | Episode Reward: 163.00 | True Reward: 29.00 |Loss: 0.018 | Steps: 1340 | Worker: 7
Episode: 59 | Moving Average Reward: 31.60 | Episode Reward: 10.30 | True Reward: -3.00 |Loss: 0.094 | Steps: 133 | Worker: 10
Episode: 60 | Moving Average Reward: 33.61 | Episode Reward: 232.40 | True Reward: 44.00 |Loss: 0.018 | Steps: 1884 | Worker: 11
Saving best model to ./gap_change, episode score: 232.39999999999276
Episode: 61 | Moving Average Reward: 35.62 | Episode Reward: 234.50 | True Reward: 44.00 |Loss: 0.015 | Steps: 1905 | Worker: 5
Saving best model to ./gap_change, episode score: 234.49999999999264
Episode: 62 | Moving Average Reward: 36.51 | Episode Reward: 124.70 | True Reward: 21.00 |Loss: 0

Episode: 119 | Moving Average Reward: 58.96 | Episode Reward: 10.30 | True Reward: -3.00 |Loss: 0.038 | Steps: 133 | Worker: 9
Episode: 120 | Moving Average Reward: 58.62 | Episode Reward: 24.70 | True Reward: 0.00 |Loss: 0.055 | Steps: 247 | Worker: 6
Episode: 121 | Moving Average Reward: 59.85 | Episode Reward: 181.90 | True Reward: 33.00 |Loss: 0.012 | Steps: 1489 | Worker: 7
Episode: 122 | Moving Average Reward: 61.45 | Episode Reward: 220.20 | True Reward: 41.00 |Loss: 0.013 | Steps: 1792 | Worker: 3
Episode: 123 | Moving Average Reward: 61.09 | Episode Reward: 24.60 | True Reward: 0.00 |Loss: 0.03 | Steps: 246 | Worker: 2
Episode: 124 | Moving Average Reward: 62.29 | Episode Reward: 181.90 | True Reward: 33.00 |Loss: 0.01 | Steps: 1489 | Worker: 10
Episode: 125 | Moving Average Reward: 61.77 | Episode Reward: 10.30 | True Reward: -3.00 |Loss: 0.057 | Steps: 133 | Worker: 7
Episode: 126 | Moving Average Reward: 61.26 | Episode Reward: 10.50 | True Reward: -3.00 |Loss: 0.138 | Step

Episode: 184 | Moving Average Reward: 69.88 | Episode Reward: 239.10 | True Reward: 45.00 |Loss: 0.017 | Steps: 1941 | Worker: 10
Episode: 185 | Moving Average Reward: 70.53 | Episode Reward: 134.40 | True Reward: 23.00 |Loss: 0.017 | Steps: 1114 | Worker: 11
Episode: 186 | Moving Average Reward: 71.45 | Episode Reward: 163.00 | True Reward: 29.00 |Loss: 0.011 | Steps: 1340 | Worker: 8
Episode: 187 | Moving Average Reward: 71.08 | Episode Reward: 34.30 | True Reward: 2.00 |Loss: 0.031 | Steps: 323 | Worker: 2
Episode: 188 | Moving Average Reward: 70.62 | Episode Reward: 24.60 | True Reward: 0.00 |Loss: 0.03 | Steps: 246 | Worker: 5
Episode: 189 | Moving Average Reward: 72.30 | Episode Reward: 239.10 | True Reward: 45.00 |Loss: 0.011 | Steps: 1941 | Worker: 0
Episode: 190 | Moving Average Reward: 71.64 | Episode Reward: 5.70 | True Reward: -4.00 |Loss: 0.116 | Steps: 97 | Worker: 0
Episode: 191 | Moving Average Reward: 74.02 | Episode Reward: 310.60 | True Reward: 60.00 |Loss: 0.012 | S

Episode: 249 | Moving Average Reward: 91.06 | Episode Reward: 91.50 | True Reward: 14.00 |Loss: 0.02 | Steps: 775 | Worker: 4
Episode: 250 | Moving Average Reward: 92.40 | Episode Reward: 224.80 | True Reward: 42.00 |Loss: 0.016 | Steps: 1828 | Worker: 11
Episode: 251 | Moving Average Reward: 92.58 | Episode Reward: 110.70 | True Reward: 18.00 |Loss: 0.027 | Steps: 927 | Worker: 6
Episode: 252 | Moving Average Reward: 93.45 | Episode Reward: 179.20 | True Reward: 33.00 |Loss: 0.016 | Steps: 1462 | Worker: 0
Episode: 253 | Moving Average Reward: 93.43 | Episode Reward: 91.50 | True Reward: 14.00 |Loss: 0.019 | Steps: 775 | Worker: 10
Episode: 254 | Moving Average Reward: 93.52 | Episode Reward: 103.00 | True Reward: 17.00 |Loss: 0.03 | Steps: 860 | Worker: 8
Episode: 255 | Moving Average Reward: 93.22 | Episode Reward: 62.90 | True Reward: 8.00 |Loss: 0.024 | Steps: 549 | Worker: 9
Episode: 256 | Moving Average Reward: 94.06 | Episode Reward: 177.30 | True Reward: 32.00 |Loss: 0.015 | S

Episode: 314 | Moving Average Reward: 77.55 | Episode Reward: 24.60 | True Reward: 0.00 |Loss: 0.03 | Steps: 246 | Worker: 1
Episode: 315 | Moving Average Reward: 77.50 | Episode Reward: 72.60 | True Reward: 10.00 |Loss: 0.032 | Steps: 626 | Worker: 0
Episode: 316 | Moving Average Reward: 77.55 | Episode Reward: 81.80 | True Reward: 12.00 |Loss: 0.024 | Steps: 698 | Worker: 10
Episode: 317 | Moving Average Reward: 77.02 | Episode Reward: 24.60 | True Reward: 0.00 |Loss: 0.024 | Steps: 246 | Worker: 1
Episode: 318 | Moving Average Reward: 77.30 | Episode Reward: 105.80 | True Reward: 17.00 |Loss: 0.019 | Steps: 888 | Worker: 5
Episode: 319 | Moving Average Reward: 77.59 | Episode Reward: 105.80 | True Reward: 17.00 |Loss: 0.018 | Steps: 888 | Worker: 6
Episode: 320 | Moving Average Reward: 77.49 | Episode Reward: 67.50 | True Reward: 9.00 |Loss: 0.031 | Steps: 585 | Worker: 4
Episode: 321 | Moving Average Reward: 76.96 | Episode Reward: 24.60 | True Reward: 0.00 |Loss: 0.026 | Steps: 24

Episode: 379 | Moving Average Reward: 87.87 | Episode Reward: 134.40 | True Reward: 23.00 |Loss: 0.013 | Steps: 1114 | Worker: 8
Episode: 380 | Moving Average Reward: 87.91 | Episode Reward: 91.50 | True Reward: 14.00 |Loss: 0.021 | Steps: 775 | Worker: 5
Episode: 381 | Moving Average Reward: 88.62 | Episode Reward: 158.40 | True Reward: 28.00 |Loss: 0.015 | Steps: 1304 | Worker: 2
Episode: 382 | Moving Average Reward: 89.36 | Episode Reward: 163.00 | True Reward: 29.00 |Loss: 0.013 | Steps: 1340 | Worker: 0
Episode: 383 | Moving Average Reward: 88.95 | Episode Reward: 48.60 | True Reward: 5.00 |Loss: 0.018 | Steps: 436 | Worker: 7
Episode: 384 | Moving Average Reward: 88.59 | Episode Reward: 53.20 | True Reward: 6.00 |Loss: 0.017 | Steps: 472 | Worker: 6
Episode: 385 | Moving Average Reward: 89.05 | Episode Reward: 134.40 | True Reward: 23.00 |Loss: 0.017 | Steps: 1114 | Worker: 1
Episode: 386 | Moving Average Reward: 88.65 | Episode Reward: 48.60 | True Reward: 5.00 |Loss: 0.017 | St

Episode: 445 | Moving Average Reward: 80.12 | Episode Reward: 38.90 | True Reward: 3.00 |Loss: 0.035 | Steps: 359 | Worker: 3
Episode: 446 | Moving Average Reward: 79.38 | Episode Reward: 5.70 | True Reward: -4.00 |Loss: 0.051 | Steps: 97 | Worker: 10
Episode: 447 | Moving Average Reward: 78.78 | Episode Reward: 20.00 | True Reward: -1.00 |Loss: 0.026 | Steps: 210 | Worker: 0
Episode: 448 | Moving Average Reward: 78.20 | Episode Reward: 20.00 | True Reward: -1.00 |Loss: 0.037 | Steps: 210 | Worker: 11
Episode: 449 | Moving Average Reward: 78.04 | Episode Reward: 62.90 | True Reward: 8.00 |Loss: 0.016 | Steps: 549 | Worker: 1
Episode: 450 | Moving Average Reward: 78.75 | Episode Reward: 148.70 | True Reward: 26.00 |Loss: 0.015 | Steps: 1227 | Worker: 7
Episode: 451 | Moving Average Reward: 78.21 | Episode Reward: 24.60 | True Reward: 0.00 |Loss: 0.027 | Steps: 246 | Worker: 11
Episode: 452 | Moving Average Reward: 77.96 | Episode Reward: 53.20 | True Reward: 6.00 |Loss: 0.019 | Steps: 4

Episode: 510 | Moving Average Reward: 74.12 | Episode Reward: 29.70 | True Reward: 1.00 |Loss: 0.04 | Steps: 287 | Worker: 3
Episode: 511 | Moving Average Reward: 73.82 | Episode Reward: 44.00 | True Reward: 4.00 |Loss: 0.033 | Steps: 400 | Worker: 10
Episode: 512 | Moving Average Reward: 73.99 | Episode Reward: 91.50 | True Reward: 14.00 |Loss: 0.018 | Steps: 775 | Worker: 7
Episode: 513 | Moving Average Reward: 74.51 | Episode Reward: 126.10 | True Reward: 22.00 |Loss: 0.021 | Steps: 1041 | Worker: 4
Episode: 514 | Moving Average Reward: 74.64 | Episode Reward: 86.90 | True Reward: 13.00 |Loss: 0.021 | Steps: 739 | Worker: 11
Episode: 515 | Moving Average Reward: 74.42 | Episode Reward: 53.20 | True Reward: 6.00 |Loss: 0.021 | Steps: 472 | Worker: 5
Episode: 516 | Moving Average Reward: 74.59 | Episode Reward: 91.50 | True Reward: 14.00 |Loss: 0.014 | Steps: 775 | Worker: 0
Episode: 517 | Moving Average Reward: 74.33 | Episode Reward: 48.60 | True Reward: 5.00 |Loss: 0.036 | Steps: 4

Episode: 575 | Moving Average Reward: 88.76 | Episode Reward: 20.00 | True Reward: -1.00 |Loss: 0.037 | Steps: 210 | Worker: 7
Episode: 576 | Moving Average Reward: 88.26 | Episode Reward: 38.90 | True Reward: 3.00 |Loss: 0.022 | Steps: 359 | Worker: 8
Episode: 577 | Moving Average Reward: 87.91 | Episode Reward: 53.20 | True Reward: 6.00 |Loss: 0.036 | Steps: 472 | Worker: 9
Episode: 578 | Moving Average Reward: 87.56 | Episode Reward: 53.20 | True Reward: 6.00 |Loss: 0.016 | Steps: 472 | Worker: 10
Episode: 579 | Moving Average Reward: 87.36 | Episode Reward: 67.50 | True Reward: 9.00 |Loss: 0.026 | Steps: 585 | Worker: 7
Episode: 580 | Moving Average Reward: 87.45 | Episode Reward: 96.10 | True Reward: 15.00 |Loss: 0.016 | Steps: 811 | Worker: 4
Episode: 581 | Moving Average Reward: 86.92 | Episode Reward: 34.30 | True Reward: 2.00 |Loss: 0.027 | Steps: 323 | Worker: 10
Episode: 582 | Moving Average Reward: 86.44 | Episode Reward: 38.90 | True Reward: 3.00 |Loss: 0.046 | Steps: 359 

Episode: 639 | Moving Average Reward: 90.50 | Episode Reward: 5.70 | True Reward: -4.00 |Loss: 0.02 | Steps: 97 | Worker: 4
Episode: 640 | Moving Average Reward: 89.79 | Episode Reward: 20.00 | True Reward: -1.00 |Loss: 0.057 | Steps: 210 | Worker: 2
Episode: 641 | Moving Average Reward: 89.71 | Episode Reward: 81.80 | True Reward: 12.00 |Loss: 0.019 | Steps: 698 | Worker: 1
Episode: 642 | Moving Average Reward: 88.97 | Episode Reward: 15.40 | True Reward: -2.00 |Loss: 0.046 | Steps: 174 | Worker: 0
Episode: 643 | Moving Average Reward: 88.23 | Episode Reward: 15.40 | True Reward: -2.00 |Loss: 0.039 | Steps: 174 | Worker: 2
Episode: 644 | Moving Average Reward: 87.88 | Episode Reward: 53.20 | True Reward: 6.00 |Loss: 0.023 | Steps: 472 | Worker: 6
Episode: 645 | Moving Average Reward: 88.73 | Episode Reward: 172.70 | True Reward: 31.00 |Loss: 0.012 | Steps: 1417 | Worker: 11
Episode: 646 | Moving Average Reward: 89.90 | Episode Reward: 205.90 | True Reward: 38.00 |Loss: 0.011 | Steps: 

Episode: 704 | Moving Average Reward: 104.86 | Episode Reward: 10.30 | True Reward: -3.00 |Loss: 0.064 | Steps: 133 | Worker: 4
Episode: 705 | Moving Average Reward: 104.58 | Episode Reward: 77.20 | True Reward: 11.00 |Loss: 0.024 | Steps: 662 | Worker: 0
Episode: 706 | Moving Average Reward: 106.35 | Episode Reward: 282.00 | True Reward: 54.00 |Loss: 0.013 | Steps: 2280 | Worker: 1
Episode: 707 | Moving Average Reward: 105.35 | Episode Reward: 5.70 | True Reward: -4.00 |Loss: 0.047 | Steps: 97 | Worker: 1
Episode: 708 | Moving Average Reward: 104.78 | Episode Reward: 48.60 | True Reward: 5.00 |Loss: 0.026 | Steps: 436 | Worker: 7
Episode: 709 | Moving Average Reward: 104.81 | Episode Reward: 107.70 | True Reward: 18.00 |Loss: 0.019 | Steps: 897 | Worker: 8
Episode: 710 | Moving Average Reward: 104.82 | Episode Reward: 105.80 | True Reward: 17.00 |Loss: 0.013 | Steps: 888 | Worker: 5
Episode: 711 | Moving Average Reward: 105.40 | Episode Reward: 163.00 | True Reward: 29.00 |Loss: 0.013

In [25]:
master.play()

Loading model from: ./gap_change\model_FlappyBird.h5
0. Reward: 0.0, action: 1, policy: [[0.2617155 0.7382845]]
1. Reward: 0.0, action: 1, policy: [[9.8399166e-04 9.9901605e-01]]
2. Reward: 0.0, action: 1, policy: [[4.8766920e-05 9.9995124e-01]]
3. Reward: 0.0, action: 1, policy: [[5.2329101e-06 9.9999475e-01]]
4. Reward: 0.0, action: 1, policy: [[7.8613448e-07 9.9999917e-01]]
5. Reward: 0.0, action: 1, policy: [[2.4033176e-07 9.9999976e-01]]
6. Reward: 0.0, action: 1, policy: [[1.1927051e-07 9.9999988e-01]]
7. Reward: 0.0, action: 1, policy: [[8.421681e-08 9.999999e-01]]
8. Reward: 0.0, action: 1, policy: [[8.507020e-08 9.999999e-01]]
9. Reward: 0.0, action: 1, policy: [[1.193069e-07 9.999999e-01]]
10. Reward: 0.0, action: 1, policy: [[2.4724193e-07 9.9999976e-01]]
11. Reward: 0.0, action: 1, policy: [[7.5164155e-07 9.9999928e-01]]
12. Reward: 0.0, action: 1, policy: [[3.1786888e-06 9.9999678e-01]]
13. Reward: 0.0, action: 1, policy: [[1.9119836e-05 9.9998093e-01]]
14. Reward: 0.0, ac

t:   0%|                                                                                                           | 0/211 [00:00<?, ?it/s, now=None]


180. Reward: 4.0, action: 1, policy: [[4.1576667e-10 1.0000000e+00]]
181. Reward: 4.0, action: 1, policy: [[2.5928755e-09 1.0000000e+00]]
182. Reward: 4.0, action: 1, policy: [[2.5791993e-08 1.0000000e+00]]
183. Reward: 4.0, action: 1, policy: [[4.3399334e-07 9.9999952e-01]]
184. Reward: 4.0, action: 1, policy: [[1.2071287e-05 9.9998796e-01]]
185. Reward: 4.0, action: 1, policy: [[7.1673765e-04 9.9928325e-01]]
186. Reward: 4.0, action: 1, policy: [[0.06227109 0.93772894]]
187. Reward: 4.0, action: 0, policy: [[0.91623324 0.08376674]]
188. Reward: 4.0, action: 1, policy: [[1.8261142e-04 9.9981743e-01]]
189. Reward: 4.0, action: 1, policy: [[2.5057373e-06 9.9999750e-01]]
190. Reward: 4.0, action: 1, policy: [[1.0045767e-07 9.9999988e-01]]
191. Reward: 4.0, action: 0, policy: [[1.000000e+00 9.242429e-22]]
192. Reward: 4.0, action: 0, policy: [[1.0000000e+00 9.2636874e-20]]
193. Reward: 4.0, action: 0, policy: [[1.0000000e+00 6.0330313e-21]]
194. Reward: 4.0, action: 0, policy: [[1.000000

                                                                                                                                                     

Moviepy - Done !
Moviepy - video ready __temp__.mp4


In [None]:
from IPython.display import Image, display

reward_per_epoch = []
lifetime_per_epoch = []
exploring_rates = []
learning_rates = []
print_every_episode = 500
show_gif_every_episode = 5000
NUM_EPISODE = 40000
for episode in range(0, NUM_EPISODE):

    # Reset the environment
    env.reset_game()

    # record frame
    frames = [env.getScreenRGB()]

    # for every 500 episodes, shutdown exploration to see performance of greedy action
    if episode % print_every_episode == 0:
        agent.shutdown_explore()

    # the initial state
    state = game.getGameState()
    action = agent.select_action(state)
    # cumulate reward for this episode
    cum_reward = 0  
    t = 0

    while not env.game_over():

        # execute the action and get reward
        # reward = +1 when pass a pipe, -5 when die
        reward = env.act(env.getActionSet()[action])  

        frames.append(env.getScreenRGB())

        # cumulate reward
        cum_reward += reward

        # observe the result
        state_prime = game.getGameState()  # get next state
        action_prime = agent.select_action(state_prime)

        # update agent
        agent.update_policy(state, action, reward, state_prime, action_prime)

        # Setting up for the next iteration
        action = action_prime
        state = state_prime
        t += 1

    # update exploring_rate and learning_rate
    agent.update_parameters(episode)

    if episode % print_every_episode == 0:
        print("Episode {} finished after {} time steps, cumulated reward: {}, exploring rate: {}, learning rate: {}".format(
            episode,
            t,
            cum_reward,
            agent.exploring_rate,
            agent.learning_rate
        ))
        reward_per_epoch.append(cum_reward)
        exploring_rates.append(agent.exploring_rate)
        learning_rates.append(agent.learning_rate)
        lifetime_per_epoch.append(t)

    # for every 5000 episode, record an animation
    if episode % show_gif_every_episode == 0:
        print("len frames:", len(frames))
        clip = make_anim(frames, fps=60, true_image=True).rotate(-90)
        display(clip.ipython_display(fps=60, autoplay=1, loop=1))