In [1]:
import numpy as np
import matplotlib.pyplot as plt
import moviepy.editor as mpy
import skimage.transform
from IPython.display import Image, display
            
import tensorflow as tf
import tensorflow_probability as tfp
import tensorflow.keras.losses as kls






In [2]:
# Pin TensorFlow to a single GPU and enable on-demand memory allocation.
gpus = tf.config.list_physical_devices("GPU")
if gpus:
    try:
        # Only expose the first physical GPU (index 0) to TensorFlow
        tf.config.set_visible_devices(gpus[0], 'GPU')

        # Memory growth has to be configured identically on every physical GPU
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)

        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as setup_error:
        # Raised when the devices were already initialized before this cell ran
        print(setup_error)
    # Reported both after a successful setup and after a RuntimeError
    print("Tensorflow version " + tf.__version__)

In [3]:
import os

# Select SDL's dummy video driver so that no game window pops up; this is done
# before the ple imports below, matching the original statement order.
os.environ["SDL_VIDEODRIVER"] = "dummy"
from ple.games.flappybird import FlappyBird
from ple import PLE


def _new_flappy_env():
    """Create a FlappyBird game, wrap it in a 30-fps PLE interface without a display, and reset it."""
    flappy = FlappyBird()
    interface = PLE(flappy, fps=30, display_screen=False)  # environment interface to game
    interface.reset_game()
    return flappy, interface


# One environment for collecting training rollouts, one kept separate for evaluation.
game, env = _new_flappy_env()
test_game, test_env = _new_flappy_env()

couldn't import doomish
Couldn't import doom


In [4]:
# Directory that receives the rendered demo videos.
path = './movie_f'
# exist_ok=True keeps this cell safe to re-run and removes the gap between
# checking for the directory and creating it.
os.makedirs(path, exist_ok=True)

In [5]:
# Hyper-parameters shared by the network, the agent and the training loop.
hparas = {
    'image_size': 84,  # side length preprocess_screen resizes each frame to
    'num_stack': 4,  # NOTE(review): not read by the visible code; frames_to_state hard-codes 4 frames
    'action_dim': len(env.getActionSet()),  # width of the actor head (one output per action)
    'hidden_size': 256,  # units of the Dense layer after the conv stack
    'lr': 0.0001,  # Adam learning rate
    'gamma': 0.99,  # passed to compute_gae as `gamma`
    'lambda': 0.95,  # passed to compute_gae as `LAMBDA`
    'clip_val': 0.2,  # PPO ratio is clipped to [1 - clip_val, 1 + clip_val]
    'ppo_epochs': 8,  # passes over the rollout per ppo_update call
    'test_epochs': 1,  # number of test_reward runs averaged per evaluation
    'num_steps': 512,  # environment steps collected per training iteration
    'mini_batch_size': 64,  # samples drawn per gradient step in ppo_iter
    'target_reward': 200,  # NOTE(review): not referenced by the visible code
    'max_episode': 30000,  # upper bound on training iterations
}

In [6]:
# Please do not modify this method
def make_anim(images, fps=60, true_image=False):
    """Build a moviepy ``VideoClip`` that plays ``images`` at ``fps`` frames per second.

    Args:
        images: indexable sequence of frames; each frame must support ``astype``
            (presumably numpy arrays - confirm against callers).
        fps: frames per second; also determines the clip duration ``len(images) / fps``.
        true_image: if True, frames are cast to uint8 unchanged; otherwise each
            frame is mapped through ``(x + 1) / 2 * 255`` before the cast
            (presumably for frames scaled to [-1, 1] - confirm).

    Returns:
        The ``mpy.VideoClip`` with its ``fps`` attribute set.
    """
    duration = len(images) / fps

    def make_frame(t):
        # Map the timestamp t to a frame index; any failure (e.g. an index past
        # the end) falls back to the last frame.
        try:
            x = images[int(len(images) / duration * t)]
        except:
            x = images[-1]

        if true_image:
            return x.astype(np.uint8)
        else:
            return ((x + 1) / 2 * 255).astype(np.uint8)

    clip = mpy.VideoClip(make_frame, duration=duration)
    clip.fps = fps
    
    return clip

In [7]:
def preprocess_screen(screen):
    """Turn a raw game screen into a square float32 frame for the network.

    The screen is rotated by -90 degrees (with the canvas resized to fit), cut
    down to its first 400 rows, and resized to
    ``(image_size, image_size, 1)`` using ``hparas['image_size']``.
    """
    rotated = skimage.transform.rotate(screen, -90, resize=True)
    cropped = rotated[:400, :]
    side = hparas['image_size']
    resized = skimage.transform.resize(cropped, [side, side, 1])
    return resized.astype(np.float32)

def frames_to_state(input_frames):
    """Stack four frames along the last axis to form one network input.

    With four or more frames the most recent four are used. With fewer, frames
    are repeated to fill the four slots:
    1 frame -> [0, 0, 0, 0], 2 frames -> [0, 0, 1, 1], 3 frames -> [0, 1, 2, 2].
    """
    available = len(input_frames)
    if available >= 4:
        chosen = input_frames[-4:]
    else:
        # Which frame index fills each of the four slots when history is short.
        padding_plan = {
            1: (0, 0, 0, 0),
            2: (0, 0, 1, 1),
            3: (0, 1, 2, 2),
        }
        chosen = [input_frames[slot] for slot in padding_plan.get(available, ())]

    return np.concatenate(chosen, axis=-1)

In [8]:
class ActorCriticNetwork(tf.keras.Model):
    """Shared convolutional trunk with a softmax policy head and a scalar value head."""

    def __init__(self, hparas):
        super().__init__()

        # (filters, kernel_size, strides) of the three conv layers, each followed by a ReLU
        conv_specs = [(32, 8, 4), (64, 4, 2), (64, 3, 1)]
        trunk_layers = []
        for n_filters, kernel, stride in conv_specs:
            trunk_layers.append(
                tf.keras.layers.Conv2D(filters=n_filters, kernel_size=kernel, strides=stride)
            )
            trunk_layers.append(tf.keras.layers.ReLU())

        # Embedding: flatten the feature maps and project to hidden_size units
        trunk_layers.append(tf.keras.layers.Flatten())
        trunk_layers.append(tf.keras.layers.Dense(hparas['hidden_size']))
        trunk_layers.append(tf.keras.layers.ReLU())
        self.feature_extractor = tf.keras.Sequential(trunk_layers)

        # Policy head: one softmax probability per action
        self.actor = tf.keras.layers.Dense(hparas['action_dim'], activation='softmax')
        # Value head: a single linear output
        self.critic = tf.keras.layers.Dense(1, activation = None)

    def call(self, input):
        """Return ``(action_probabilities, value)`` for a batch of stacked frames."""
        features = self.feature_extractor(input)
        # The actor head applies softmax, so these are probabilities, not logits.
        action_probs = self.actor(features)
        state_value = self.critic(features)
        return action_probs, state_value

In [9]:
class Agent():
    """PPO agent: owns the actor-critic network, its Adam optimizer and the clipped update.

    Keys read from ``hparas``:
        'gamma', 'lr', 'clip_val' (required);
        'entropy_coef' (optional, default 0.1) - weight of the entropy bonus;
        'value_coef' (optional, default 0.5) - weight applied to the critic MSE.
    The two optional keys default to the values that used to be hard-coded in
    the loss, so existing configurations behave exactly as before.
    """
    def __init__(self, hparas):
        self.gamma = hparas['gamma']
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=hparas['lr'])
        self.actor_critic = ActorCriticNetwork(hparas)
        self.clip_pram = hparas['clip_val']
        self.entropy_coef = hparas.get('entropy_coef', 0.1)
        self.value_coef = hparas.get('value_coef', 0.5)

    def ppo_iter(self, mini_batch_size, states, actions, log_probs, returns, advantage):
        """Yield ``batch_size // mini_batch_size`` random mini-batches of the rollout.

        Indices are drawn with ``np.random.randint``, i.e. with replacement, so a
        sample may appear more than once (or not at all) within one pass. Every
        yielded tuple is ``(states, actions, log_probs, returns, advantage)``
        gathered at the same indices.
        """
        batch_size = states.shape[0]
        for _ in range(batch_size // mini_batch_size):
            rand_ids = tf.convert_to_tensor(np.random.randint(0, batch_size, mini_batch_size), dtype=tf.int32)
            yield tf.gather(states, rand_ids), tf.gather(actions, rand_ids), tf.gather(log_probs, rand_ids), \
             tf.gather(returns, rand_ids), tf.gather(advantage, rand_ids)

    def ppo_update(self, ppo_epochs, mini_batch_size, states, actions, log_probs, discount_rewards, advantages):
        """Run ``ppo_epochs`` passes of clipped-surrogate PPO updates over one rollout.

        Args:
            ppo_epochs: number of passes over the rollout.
            mini_batch_size: samples per gradient step.
            states, actions, log_probs: rollout tensors indexed along axis 0;
                ``log_probs`` are the log-probabilities recorded when acting.
            discount_rewards: value targets for the critic, one per sample.
            advantages: advantage estimates, one per sample.

        Returns:
            ``(total_actor_loss, total_critic_loss)``: the *sums* of the
            per-mini-batch losses over all gradient steps (not averages).
        """
        total_actor_loss = 0
        total_critic_loss = 0
        for _ in range(ppo_epochs):
            for state, action, old_log_probs, reward, advantage in self.ppo_iter(mini_batch_size, states, actions, log_probs, discount_rewards, advantages):
                # Give the targets a trailing axis so they line up with the critic output.
                reward = tf.expand_dims(reward, axis=-1)

                with tf.GradientTape() as tape:
                    prob, value = self.actor_critic(state, training=True)
                    dist = tfp.distributions.Categorical(probs=prob, dtype=tf.float32)
                    entropy = tf.math.reduce_mean(dist.entropy())
                    new_log_probs = dist.log_prob(action)

                    # PPO ratio between the current policy and the one that collected the data
                    ratio = tf.math.exp(new_log_probs - old_log_probs)
                    surr1 = ratio * advantage
                    surr2 = tf.clip_by_value(ratio, 1.0 - self.clip_pram, 1.0 + self.clip_pram) * advantage

                    # Maximize the clipped surrogate plus the entropy bonus
                    actor_loss = tf.math.negative(tf.math.reduce_mean(tf.math.minimum(surr1, surr2))) - self.entropy_coef * entropy
                    critic_loss = self.value_coef * tf.math.reduce_mean(kls.mean_squared_error(reward, value))

                    total_loss = actor_loss + critic_loss

                # single optimizer for the shared trunk and both heads
                grads = tape.gradient(total_loss, self.actor_critic.trainable_variables)
                self.optimizer.apply_gradients(zip(grads, self.actor_critic.trainable_variables))

                total_actor_loss += actor_loss
                total_critic_loss += critic_loss
        return total_actor_loss, total_critic_loss

In [10]:
# https://arxiv.org/pdf/1506.02438.pdf
# Equation 16
def compute_gae(rewards, masks, values, gamma, LAMBDA):
    """Compute GAE-based return targets for one rollout.

    Args:
        rewards: per-step rewards, length T.
        masks: per-step continuation flags (0 cuts the bootstrap and the
            accumulated advantage at that step), length T.
        values: value estimates, length T + 1 (the extra entry is the value
            used to bootstrap after the last step).
        gamma: discount factor.
        LAMBDA: GAE smoothing factor.

    Returns:
        A list of length T whose entry t is ``advantage_t + values[t]``.
    """
    horizon = len(rewards)
    targets = [None] * horizon
    running_advantage = 0
    # Walk backwards so each step can reuse the advantage accumulated after it.
    for t in range(horizon - 1, -1, -1):
        td_error = rewards[t] + gamma * values[t + 1] * masks[t] - values[t]
        running_advantage = td_error + gamma * LAMBDA * masks[t] * running_advantage
        targets[t] = running_advantage + values[t]

    return targets

In [11]:
def test_reward(test_env, agent):
    """Play one greedy episode on ``test_env`` and return the summed reward.

    The environment is reset first; at every step the action with the highest
    policy probability is taken until ``test_env.game_over()`` is true.

    Args:
        test_env: PLE-style environment (``reset_game``, ``game_over``, ``act``,
            ``getActionSet``, ``getScreenGrayscale``).
        agent: object exposing the ``actor_critic`` model.

    Returns:
        Sum of the rewards returned by ``test_env.act`` during the episode.
    """
    total_reward = 0
    # Reset the environment
    test_env.reset_game()
    input_frames = [preprocess_screen(test_env.getScreenGrayscale())]

    while not test_env.game_over():

        state = frames_to_state(input_frames)
        state = tf.expand_dims(state, axis=0)
        # The value estimate is not needed for greedy evaluation.
        prob, _ = agent.actor_critic(state)

        action = np.argmax(prob[0].numpy())
        reward = test_env.act(test_env.getActionSet()[action])
        total_reward += reward

        input_frames.append(preprocess_screen(test_env.getScreenGrayscale()))
        # frames_to_state only reads the four most recent frames, so drop older
        # ones instead of letting the history grow for the whole episode. With
        # four or fewer frames this deletes nothing, so the padding is unchanged.
        del input_frames[:-4]

    return total_reward

In [12]:
# Build the agent and the schedule constants read by the training loop.
agent = Agent(hparas)
max_episode = hparas['max_episode']
test_per_n_episode = 10  # evaluate with test_reward every N training iterations
force_save_per_n_episode = 1000  # always render a demo video every N training iterations
early_stop_reward = 30  # training stops once best_reward reaches this value

start_s = 0  # NOTE(review): not referenced by the visible code
best_reward = -5.0  # best evaluation reward so far; a test run must exceed it to count as a new best

# Checkpoint object covering the network and optimizer state (saving is currently commented out below).
checkpoint = tf.train.Checkpoint(
    actor_critic = agent.actor_critic,
    optimizer = agent.optimizer,
)

# Load from old checkpoint
# checkpoint.restore('ckpt_dir/ckpt-?')




In [13]:
ep_reward = []
total_avgr = []
early_stop = False
avg_rewards_list = []


def _save_and_show_rollout(frames, episode):
    """Write the rollout's RGB frames to ``<path>/Lab15_demo-<episode>.webm`` and display the clip inline."""
    clip = make_anim(frames, fps=60, true_image=True).rotate(-90)
    # os.path.join instead of a hard-coded "\\" so the file lands inside `path` on every OS.
    clip.write_videofile(os.path.join(path, "{}_demo-{}.webm".format('Lab15', episode)), fps=60)
    display(clip.ipython_display(fps=60, autoplay=1, loop=1, maxduration=120))


env.reset_game()

# Each iteration collects hparas['num_steps'] environment steps (possibly spanning
# several games), runs a PPO update on them, and periodically evaluates the agent.
for s in range(0, max_episode):
    rewards = []
    states = []
    actions = []
    log_probs = []
    masks = []
    values = []

    display_frames = [env.getScreenRGB()]
    # NOTE: the frame history restarts here on every iteration, even though the
    # game itself is only reset when it is over.
    input_frames = [preprocess_screen(env.getScreenGrayscale())]

    for step in range(hparas['num_steps']):

        state = frames_to_state(input_frames)
        state = tf.expand_dims(state, axis=0)
        prob, value = agent.actor_critic(state)

        dist = tfp.distributions.Categorical(probs=prob[0], dtype=tf.float32)
        action = dist.sample(1)
        log_prob = dist.log_prob(action)

        # dist.sample(1) yields a one-element tensor; index it explicitly because
        # converting a non-0-d array straight to int is deprecated in NumPy.
        action_index = int(action.numpy()[0])
        reward = env.act(env.getActionSet()[action_index])

        done = env.game_over()

        states.append(state)
        actions.append(action)
        values.append(value[0])
        log_probs.append(log_prob)
        rewards.append(tf.convert_to_tensor(reward, dtype=tf.float32))
        # mask is 0 on terminal steps so compute_gae does not bootstrap across games
        masks.append(tf.convert_to_tensor(1-int(done), dtype=tf.float32))

        display_frames.append(env.getScreenRGB())
        input_frames.append(preprocess_screen(env.getScreenGrayscale()))

        if done:
            env.reset_game()
            input_frames = [preprocess_screen(env.getScreenGrayscale())]

    # Bootstrap from the state reached *after* the last action. Re-using `state`
    # here would feed the critic the observation from before that action.
    # (If the last step was terminal its mask is 0, so this value is ignored.)
    next_state = tf.expand_dims(frames_to_state(input_frames), axis=0)
    _, next_value = agent.actor_critic(next_state)
    values.append(next_value[0])

    returns = compute_gae(rewards, masks, values, hparas['gamma'], hparas['lambda'])

    returns = tf.concat(returns, axis=0)
    log_probs = tf.concat(log_probs, axis=0)
    values = tf.concat(values, axis=0)
    states = tf.concat(states, axis=0)
    actions = tf.concat(actions, axis=0)
    # values holds one extra bootstrap entry, hence the [:-1]
    advantage = returns - values[:-1]

    a_loss, c_loss = agent.ppo_update(hparas['ppo_epochs'], hparas['mini_batch_size'], states, actions, log_probs, returns, advantage)
    print('[Episode %d]  Actor loss: %.5f, Critic loss: %.5f' % (s, a_loss, c_loss))

    # Render the rollout at most once per iteration, even when it is both a new
    # best and a forced-save iteration.
    render_video = False

    if s % test_per_n_episode == 0:
        # test agent hparas['test_epochs'] times to get the average reward
        avg_reward = np.mean([test_reward(test_env, agent) for _ in range(hparas['test_epochs'])])
        print("Test average reward is %.1f, Current best average reward is %.1f\n" % (avg_reward, best_reward))
        avg_rewards_list.append(avg_reward)

        if avg_reward > best_reward:
            best_reward = avg_reward
            # checkpoint.save(file_prefix = './save/checkpoints/ckpt')
            render_video = True

    if s % force_save_per_n_episode == 0:
        print("Save model at episode %d\n" % s)
        # checkpoint.save(file_prefix = './save/checkpoints/ckpt')
        render_video = True

    if render_video:
        _save_and_show_rollout(display_frames, s)

    if best_reward >= early_stop_reward:
        early_stop = True
        break

[Episode 0]  Actor loss: 88.19906, Critic loss: 60.51859
Test average reward is -5.0, Current best average reward is -5.0

Save model at episode 0

Moviepy - Building video movie_f\Lab15_demo-0.webm.
Moviepy - Writing video movie_f\Lab15_demo-0.webm



                                                               

Moviepy - Done !
Moviepy - video ready movie_f\Lab15_demo-0.webm
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                               

Moviepy - Done !
Moviepy - video ready __temp__.mp4




[Episode 1]  Actor loss: 40.25028, Critic loss: 28.02130
[Episode 2]  Actor loss: 25.73141, Critic loss: 22.41962
[Episode 3]  Actor loss: 20.69059, Critic loss: 14.83496
[Episode 4]  Actor loss: 18.46104, Critic loss: 9.18247
[Episode 5]  Actor loss: 5.95439, Critic loss: 7.13567
[Episode 6]  Actor loss: 6.84542, Critic loss: 4.10030
[Episode 7]  Actor loss: -0.39989, Critic loss: 4.03430
[Episode 8]  Actor loss: 1.95171, Critic loss: 3.47525
[Episode 9]  Actor loss: 0.88220, Critic loss: 2.78249
[Episode 10]  Actor loss: 4.28870, Critic loss: 3.40540
Test average reward is -5.0, Current best average reward is -5.0

[Episode 11]  Actor loss: 0.02692, Critic loss: 4.27466
[Episode 12]  Actor loss: -2.42579, Critic loss: 2.45531
[Episode 13]  Actor loss: 0.08835, Critic loss: 2.53183
[Episode 14]  Actor loss: -6.51742, Critic loss: 2.74868
[Episode 15]  Actor loss: -7.64742, Critic loss: 2.11743
[Episode 16]  Actor loss: -2.61524, Critic loss: 2.38879
[Episode 17]  Actor loss: -2.77862,

                                                               

Moviepy - Done !
Moviepy - video ready movie_f\Lab15_demo-100.webm
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                               

Moviepy - Done !
Moviepy - video ready __temp__.mp4




[Episode 101]  Actor loss: -5.24923, Critic loss: 0.81184
[Episode 102]  Actor loss: -2.37628, Critic loss: 0.44638
[Episode 103]  Actor loss: -5.26442, Critic loss: 0.39519
[Episode 104]  Actor loss: -1.10276, Critic loss: 0.56981
[Episode 105]  Actor loss: -7.43058, Critic loss: 0.34454
[Episode 106]  Actor loss: -5.28577, Critic loss: 0.23773
[Episode 107]  Actor loss: -2.38592, Critic loss: 0.31489
[Episode 108]  Actor loss: -4.94691, Critic loss: 0.35054
[Episode 109]  Actor loss: -5.63575, Critic loss: 0.68485
[Episode 110]  Actor loss: -8.83463, Critic loss: 0.34353
Test average reward is -5.0, Current best average reward is -4.0

[Episode 111]  Actor loss: -4.61854, Critic loss: 0.62542
[Episode 112]  Actor loss: -3.36568, Critic loss: 0.28998
[Episode 113]  Actor loss: -6.82374, Critic loss: 0.23479
[Episode 114]  Actor loss: -2.07840, Critic loss: 1.17135
[Episode 115]  Actor loss: -3.94761, Critic loss: 0.89352
[Episode 116]  Actor loss: -6.45633, Critic loss: 0.49486
[Episo

                                                               

Moviepy - Done !
Moviepy - video ready movie_f\Lab15_demo-1000.webm
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                               

Moviepy - Done !
Moviepy - video ready __temp__.mp4




[Episode 1001]  Actor loss: -19.72261, Critic loss: 2.38882
[Episode 1002]  Actor loss: -12.08296, Critic loss: 2.63100
[Episode 1003]  Actor loss: 1.51490, Critic loss: 1.09949
[Episode 1004]  Actor loss: -12.65797, Critic loss: 2.09356
[Episode 1005]  Actor loss: -1.12789, Critic loss: 2.18826
[Episode 1006]  Actor loss: -6.86793, Critic loss: 1.64717
[Episode 1007]  Actor loss: -3.71637, Critic loss: 1.88170
[Episode 1008]  Actor loss: -10.61245, Critic loss: 2.03659
[Episode 1009]  Actor loss: -2.12383, Critic loss: 1.66891
[Episode 1010]  Actor loss: -8.19508, Critic loss: 1.77078
Test average reward is -5.0, Current best average reward is -4.0

[Episode 1011]  Actor loss: -4.28725, Critic loss: 1.11708
[Episode 1012]  Actor loss: -2.17105, Critic loss: 0.85190
[Episode 1013]  Actor loss: -21.02561, Critic loss: 3.67082
[Episode 1014]  Actor loss: -4.78841, Critic loss: 3.57444
[Episode 1015]  Actor loss: 5.45032, Critic loss: 1.35035
[Episode 1016]  Actor loss: -9.95561, Critic l

                                                               

Moviepy - Done !
Moviepy - video ready movie_f\Lab15_demo-1570.webm
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                               

Moviepy - Done !
Moviepy - video ready __temp__.mp4


[Episode 1571]  Actor loss: -26.96129, Critic loss: 2.50345
[Episode 1572]  Actor loss: -4.59469, Critic loss: 3.69755
[Episode 1573]  Actor loss: 0.00617, Critic loss: 2.07112
[Episode 1574]  Actor loss: -18.99827, Critic loss: 3.98375
[Episode 1575]  Actor loss: -10.93923, Critic loss: 2.50339
[Episode 1576]  Actor loss: -16.27178, Critic loss: 2.07895
[Episode 1577]  Actor loss: 4.93931, Critic loss: 6.42846
[Episode 1578]  Actor loss: -15.05919, Critic loss: 6.12017
[Episode 1579]  Actor loss: -11.30005, Critic loss: 3.73282
[Episode 1580]  Actor loss: -10.70456, Critic loss: 5.74243
Test average reward is -5.0, Current best average reward is -2.0

[Episode 1581]  Actor loss: 1.31087, Critic loss: 3.96255
[Episode 1582]  Actor loss: -1.57502, Critic loss: 2.40394
[Episode 1583]  Actor loss: -18.74009, Critic loss: 2.98407
[Episode 1584]  Actor loss: -29.89504, Critic loss: 3.02643
[Episode 1585]  Actor loss: -15.17520, Critic loss: 2.50954
[Episode 1586]  Actor loss: -9.62987, Crit

                                                               

Moviepy - Done !
Moviepy - video ready movie_f\Lab15_demo-1940.webm
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                               

Moviepy - Done !
Moviepy - video ready __temp__.mp4




[Episode 1941]  Actor loss: 9.46634, Critic loss: 5.92111
[Episode 1942]  Actor loss: 2.40057, Critic loss: 3.50691
[Episode 1943]  Actor loss: -27.06128, Critic loss: 5.03619
[Episode 1944]  Actor loss: -4.38633, Critic loss: 4.70622
[Episode 1945]  Actor loss: -14.37513, Critic loss: 5.58792
[Episode 1946]  Actor loss: -31.66286, Critic loss: 5.68223
[Episode 1947]  Actor loss: -10.10772, Critic loss: 7.17157
[Episode 1948]  Actor loss: -37.46720, Critic loss: 9.49116
[Episode 1949]  Actor loss: -21.30279, Critic loss: 10.27897
[Episode 1950]  Actor loss: -13.13332, Critic loss: 6.38609
Test average reward is -5.0, Current best average reward is -1.0

[Episode 1951]  Actor loss: -13.45218, Critic loss: 12.02550
[Episode 1952]  Actor loss: -6.84601, Critic loss: 11.03288
[Episode 1953]  Actor loss: -5.39010, Critic loss: 10.67626
[Episode 1954]  Actor loss: 9.31644, Critic loss: 8.24568
[Episode 1955]  Actor loss: -5.17730, Critic loss: 4.44984
[Episode 1956]  Actor loss: -15.70089, C

                                                               

Moviepy - Done !
Moviepy - video ready movie_f\Lab15_demo-2000.webm
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                               

Moviepy - Done !
Moviepy - video ready __temp__.mp4




[Episode 2001]  Actor loss: -16.96634, Critic loss: 14.65344
[Episode 2002]  Actor loss: -10.75436, Critic loss: 9.71458
[Episode 2003]  Actor loss: -15.14199, Critic loss: 8.04755
[Episode 2004]  Actor loss: 2.67672, Critic loss: 7.34958
[Episode 2005]  Actor loss: 39.04951, Critic loss: 10.18235
[Episode 2006]  Actor loss: -8.01187, Critic loss: 7.09327
[Episode 2007]  Actor loss: -14.50383, Critic loss: 5.13936
[Episode 2008]  Actor loss: -7.85466, Critic loss: 4.49921
[Episode 2009]  Actor loss: 7.35354, Critic loss: 5.25984
[Episode 2010]  Actor loss: 1.88921, Critic loss: 7.35721
Test average reward is -4.0, Current best average reward is -1.0

[Episode 2011]  Actor loss: -0.49063, Critic loss: 4.95285
[Episode 2012]  Actor loss: -9.79781, Critic loss: 9.10437
[Episode 2013]  Actor loss: -15.74097, Critic loss: 9.00692
[Episode 2014]  Actor loss: -27.46825, Critic loss: 7.95064
[Episode 2015]  Actor loss: -18.97445, Critic loss: 3.61297
[Episode 2016]  Actor loss: -3.03425, Criti

                                                               

Moviepy - Done !
Moviepy - video ready movie_f\Lab15_demo-2050.webm
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                               

Moviepy - Done !
Moviepy - video ready __temp__.mp4




[Episode 2051]  Actor loss: -9.30834, Critic loss: 9.09034
[Episode 2052]  Actor loss: -2.56709, Critic loss: 6.72564
[Episode 2053]  Actor loss: 6.00907, Critic loss: 11.23726
[Episode 2054]  Actor loss: 6.08377, Critic loss: 12.05335
[Episode 2055]  Actor loss: -8.31134, Critic loss: 6.36881
[Episode 2056]  Actor loss: -13.93262, Critic loss: 8.47374
[Episode 2057]  Actor loss: -8.19080, Critic loss: 5.85039
[Episode 2058]  Actor loss: -23.87017, Critic loss: 6.88477
[Episode 2059]  Actor loss: 10.87166, Critic loss: 11.24193
[Episode 2060]  Actor loss: -6.86644, Critic loss: 7.96362
Test average reward is -5.0, Current best average reward is 1.0

[Episode 2061]  Actor loss: 13.41834, Critic loss: 6.18598
[Episode 2062]  Actor loss: 7.07413, Critic loss: 7.67318
[Episode 2063]  Actor loss: 6.98372, Critic loss: 4.53992
[Episode 2064]  Actor loss: 2.01966, Critic loss: 8.53182
[Episode 2065]  Actor loss: 1.39473, Critic loss: 3.74712
[Episode 2066]  Actor loss: -3.58575, Critic loss: 

# Report

這次花了蠻多時間在 setup 環境上。一開始不知道為什麼這次 lab 需要的環境好像跟之前不一樣，導致我需要重新下載 library（後來問同學後才發現，其實是有一個 library 我選錯版本了）。在實驗的部分，這次不會像上次的 lab 一樣跑到越後面所需的時間越久，而是每個 episode 都花費一樣多的時間；不過這次 train 起來花的時間比上個 lab 還要久，而且就算 train 了這麼多個 episode，也只能飛過一個柱子。