In [1]:
# import package needed
%matplotlib inline
import matplotlib.pyplot as plt
import os
os.environ["SDL_VIDEODRIVER"] = "dummy"  # this line make pop-out window not appear
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import tensorflow as tf
import numpy as np
import skimage.color
import skimage.transform
from ple.games.flappybird import FlappyBird
from ple import PLE
game = FlappyBird()
env = PLE(game, fps=30, display_screen=False)  # environment interface to game

pygame 1.9.4
Hello from the pygame community. https://www.pygame.org/contribute.html
couldn't import doomish
Couldn't import doom


In [2]:
# define input size
screen_width = 80
screen_height = 80
num_stack = 4

In [3]:
bucket_range_per_feature = {
  'next_next_pipe_bottom_y': 40,
  'next_next_pipe_dist_to_player': 512,
  'next_next_pipe_top_y': 40,
  'next_pipe_bottom_y': 20,
  'next_pipe_dist_to_player': 20,
  'next_pipe_top_y': 20,
  'player_vel': 4,
  'player_y': 16
}

In [4]:
def preprocess(state):
    state = copy.deepcopy(state)
    state['next_next_pipe_bottom_y'] -= state['player_y']
    state['next_next_pipe_top_y'] -= state['player_y']
    state['next_pipe_bottom_y'] -= state['player_y']
    state['next_pipe_top_y'] -= state['player_y']

    # sort to make list converted from dict ordered in alphabet order
    state_key = [k for k, v in sorted(state.items())]

    # do bucketing to decrease state space to speed up training
    state_idx = []
    for key in state_key:
        state_idx.append(int(state[key] / bucket_range_per_feature[key]))
    return np.asarray([tuple(state_idx)])

In [5]:
import math
import copy
from collections import defaultdict
MIN_EXPLORING_RATE = 10e-4


class Agent:

    def __init__(self, name, num_action, t=0, discount_factor=0.99):
        self.exploring_rate = 0.1
        self.discount_factor = discount_factor
        self.num_action = num_action
        self.name = name
        with tf.variable_scope(name):
            self.build_model()

    def build_model(self):

    # input: current screen, selected action and reward
        self.input_state = tf.placeholder(tf.float32, shape=[None,8])
        self.action = tf.placeholder(tf.int32, [None])
        self.reward = tf.placeholder(tf.float32, [None])
        self.is_training = tf.placeholder(tf.bool, shape=[])

        def net(state, reuse=False):
              with tf.variable_scope(
                  "layers",
                  reuse=reuse,
                  initializer=tf.truncated_normal_initializer(stddev=1e-2)):
                dense1 = tf.layers.dense(inputs=state, units=512, activation=tf.nn.relu)
                dense2 = tf.layers.dense(inputs=dense1, units=1024, activation=tf.nn.relu)
                dense3 = tf.layers.dense(inputs=dense2, units=1024, activation=tf.nn.relu)
                dense = tf.layers.dense(inputs=dense3, units=512, activation=tf.nn.relu)
                Q = tf.layers.dense(
                    inputs=dense, units=self.num_action, activation=None)

                return Q

        # optimize
        self.output = net(self.input_state
                         )  # Q(s,a,theta) for all a, shape (batch_size, num_action)
        index = tf.stack([tf.range(tf.shape(self.action)[0]), self.action], axis=1)
        self.esti_Q = tf.gather_nd(
            self.output,
            index)  # Q(s,a,theta) for selected action, shape (batch_size, 1)

        self.max_Q = tf.reduce_max(
            self.output, axis=1)  # max(Q(s',a',theta')), shape (batch_size, 1)
        self.tar_Q = tf.placeholder(tf.float32, [None])

        # loss = E[r+max(Q(s',a',theta'))-Q(s,a,theta)]
        self.loss = tf.reduce_mean(
            tf.square(self.reward + self.discount_factor * self.tar_Q -
                      self.esti_Q))

        optimizer = tf.train.AdamOptimizer(learning_rate=1e-5)
        self.g_gvs = optimizer.compute_gradients(
            self.loss,
            var_list=[v for v in tf.global_variables() if self.name in v.name])
        self.train_op = optimizer.apply_gradients(self.g_gvs)
        self.pred = tf.argmax(
            self.output, axis=1
        )  # select action with highest action-value, only used in inference

    def select_action(self, input_state, sess):
    # epsilon-greedy
        if np.random.rand() < self.exploring_rate:
            action = np.random.choice(num_action)  # Select a random action
        else:
            feed_dict = {
                self.input_state: input_state,
                self.is_training: False,
            }
            action = sess.run(
                self.pred,
                feed_dict=feed_dict)[0]  # Select the action with the highest q
        return action

    def update_policy(self, input_states, actions, rewards, input_states_plum,
                    terminal, target_netwrok):
        # use max_Q estimate from target one to update online one
        feed_dict = {
            target_netwrok.input_state:
                np.array(input_states_plum),
            target_netwrok.is_training:
                True,
        }
        max_Q = sess.run(target_netwrok.max_Q, feed_dict=feed_dict)
        max_Q *= ~np.array(terminal)
        feed_dict = {
            self.input_state: input_states,
            self.tar_Q: max_Q,
            self.action: actions,
            self.reward: rewards,
            self.is_training: True,
        }
        loss, _ = sess.run([self.loss, self.train_op], feed_dict=feed_dict)
        return loss

    def update_parameters(self, episode):
        if self.exploring_rate > MIN_EXPLORING_RATE:
            self.exploring_rate -= (0.1 - MIN_EXPLORING_RATE) / 3000000

    def shutdown_explore(self):
        # make action selection greedy
        self.exploring_rate = 0

In [6]:
def get_update_ops():
  # return operations assign weight to target network
    src_vars = [v for v in tf.global_variables() if 'online' in v.name]
    tar_vars = [v for v in tf.global_variables() if 'target' in v.name]
    update_ops = []
    for src_var, tar_var in zip(src_vars, tar_vars):
        update_ops.append(tar_var.assign(src_var))
    return update_ops


def update_target(update_ops, sess):
    sess.run(update_ops)

In [7]:
# init agent
tf.reset_default_graph()
num_action = len(env.getActionSet())

# agent for frequently updating
online_agent = Agent('online', num_action)

# agent for slow updating
target_agent = Agent('target', num_action)
update_ops = get_update_ops()

In [8]:
class Replay_buffer():

    def __init__(self, buffer_size=50000):
        self.experiences = []
        self.buffer_size = buffer_size

    def add(self, experience):
        if len(self.experiences) >= self.buffer_size:
            self.experiences.pop(0)
        self.experiences.append(experience)

    def sample(self, size):
        """
            sameple experience from buffer
            """
        if size > len(self.experiences):
            experiences_idx = np.random.choice(len(self.experiences), size=size)
        else:
            experiences_idx = np.random.choice(
                len(self.experiences), size=size, replace=False)
        # from all sampled experiences, extract a tuple of (s,a,r,s')
        states = []
        actions = []
        rewards = []
        states_plum = []
        terminal = []
        for i in range(size):
            states.append(self.experiences[experiences_idx[i]][0])
            actions.append(self.experiences[experiences_idx[i]][1])
            rewards.append(self.experiences[experiences_idx[i]][2])
            states_plum.append(self.experiences[experiences_idx[i]][3])
            terminal.append(self.experiences[experiences_idx[i]][4])
        return states, actions, rewards, states_plum, terminal

In [9]:
# init buffer
buffer = Replay_buffer()

In [10]:
def make_anim(images, fps=60, true_image=False):
    duration = len(images) / fps
    import moviepy.editor as mpy

    def make_frame(t):
        try:
            x = images[int(len(images) / duration * t)]
        except:
            x = images[-1]

        if true_image:
            return x.astype(np.uint8)
        else:
            return ((x + 1) / 2 * 255).astype(np.uint8)

    clip = mpy.VideoClip(make_frame, duration=duration)
    clip.fps = fps
    return clip

In [11]:
# init all
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.InteractiveSession(config=config)
sess.run(tf.global_variables_initializer())

In [12]:
import warnings
warnings.filterwarnings("ignore")

from IPython.display import Image, display

update_every_t_step = 3
print_every_episode = 10
save_video_every_episode = 100
NUM_EPISODE = 100
NUM_EXPLORE = 20

# we can redefine origin reward function
reward_values = {
    "positive": 1,  # reward pass a pipe
    "tick": 0.1,  # reward per timestamp
    "loss": -1,  # reward of gameover
}
for episode in range(0, NUM_EPISODE + 1):

  # Reset the environment
    game = FlappyBird()
      # for demo purpose, the following code is trained in the same scene,
    env = PLE(
          game,
          fps=30,
          display_screen=False,
          reward_values=reward_values,
          rng=np.random.RandomState(1))
    env.reset_game()
    env.act(0)  # dummy input to make sure input screen is correct

      # record frame
    if episode % save_video_every_episode == 0:
        frames = [env.getScreenRGB()]

      # for every 500 episodes, shutdown exploration to see performance of greedy action
    if episode % print_every_episode == 0:
        online_agent.shutdown_explore()

    input_states = preprocess(game.getGameState())

      # experience for this episode, store all (s,a,r,s') tuple
    experience = []

      # cumulate reward for this episode
    cum_reward = 0

    t = 0
    while not env.game_over():

        # feed four previous screen, select an action
        action = online_agent.select_action(input_states, sess)

        # execute the action and get reward
        reward = env.act(env.getActionSet()[action])

        # record frame
        if episode % save_video_every_episode == 0:
            frames.append(env.getScreenRGB())

        # cumulate reward
        cum_reward += reward

        # append grayscale screen for this episode
        input_states_plum = preprocess(game.getGameState())

        # append experience for this episode
        buffer.add((input_states, action, reward, input_states_plum,env.game_over()))
        
        t += 1

        # update agent
    if episode > NUM_EXPLORE:
        train_states, train_actions, train_rewards, train_states_plum, terminal = buffer.sample(
            32)
        #print(np.reshape(np.asarray(train_states),[32,8]))
        train_states = np.reshape(np.asarray(train_states),[np.asarray(train_states).shape[0],8])
        train_states_plum = np.reshape(np.asarray(train_states_plum),[np.asarray(train_states_plum).shape[0],8])
        loss = online_agent.update_policy(train_states, train_actions,
                                          train_rewards, train_states_plum,
                                          terminal, target_agent)
    if t % update_every_t_step == 0 and episode > NUM_EXPLORE:
        update_target(update_ops, sess)

      # update explore rating and learning rate
    online_agent.update_parameters(episode)
    target_agent.update_parameters(episode)

    if episode % print_every_episode == 0 and episode > NUM_EXPLORE:
        print(
            "[{}] time live:{}, cumulated reward: {}, exploring rate: {}, loss: {}".
            format(episode, t, cum_reward, target_agent.exploring_rate, loss))

    if episode % save_video_every_episode == 0:  # for every 100 episode, record an animation
        clip = make_anim(frames, fps=60, true_image=True).rotate(-90)
        clip.write_videofile("movie/DQN-{}.webm".format(episode), fps=60)

[MoviePy] >>>> Building video movie/DQN-0.webm
[MoviePy] Writing video movie/DQN-0.webm


 97%|███████████████████████████████████████████████████████████████████████████████▌  | 33/34 [00:00<00:00, 58.63it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-0.webm 

[30] time live:32, cumulated reward: 2.200000000000002, exploring rate: 0.09999897700000011, loss: 0.08501280099153519
[40] time live:32, cumulated reward: 2.200000000000002, exploring rate: 0.09999864700000015, loss: 0.00952146016061306
[50] time live:32, cumulated reward: 2.200000000000002, exploring rate: 0.09999831700000018, loss: 0.034467291086912155
[60] time live:32, cumulated reward: 2.200000000000002, exploring rate: 0.09999798700000022, loss: 0.008838241919875145
[70] time live:32, cumulated reward: 2.200000000000002, exploring rate: 0.09999765700000025, loss: 0.033882517367601395
[80] time live:32, cumulated reward: 2.200000000000002, exploring rate: 0.09999732700000029, loss: 0.007638230919837952
[90] time live:32, cumulated reward: 2.200000000000002, exploring rate: 0.09999699700000032, loss: 0.032956961542367935
[100] time live:32, cumulated reward: 2.200000000000002, exploring rate: 0.09999666700000036, loss:

 97%|███████████████████████████████████████████████████████████████████████████████▌  | 33/34 [00:00<00:00, 60.13it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-100.webm 



In [13]:
from moviepy.editor import *
clip = VideoFileClip("movie/DQN-100.webm")
display(clip.ipython_display(fps=60, autoplay=1, loop=1))

 97%|██████████████████████████████████████████████████████████████████████████████▌  | 33/34 [00:00<00:00, 308.50it/s]


In [14]:
import math
import copy
from collections import defaultdict
MIN_EXPLORING_RATE = 0.01
MIN_LEARNING_RATE = 0.1


class Policy_Gradiebt_Agent:

    def __init__(self, name, num_action, t=0, discount_factor=0.99):
        self.discount_factor = discount_factor
        self.num_action = num_action
        self.name = name
        with tf.variable_scope(name):
            self.build_model()

    def build_model(self):

    # input: current state, selected action and reward
        self.input_state = tf.placeholder(tf.float32, shape=[None,8])
        self.action = tf.placeholder(tf.int32, [None])
        self.reward = tf.placeholder(tf.float32, [None])
        self.is_training = tf.placeholder(tf.bool, shape=[])

        def net(state, reuse=False):
            with tf.variable_scope("layers", reuse=reuse):
                dense_1 = tf.layers.dense(inputs=state, units=512, activation=tf.nn.relu)
                dense_2 = tf.layers.dense(inputs=dense_1, units=1024, activation=tf.nn.relu)
                dense_3 = tf.layers.dense(inputs=dense_2, units=1024, activation=tf.nn.relu)
                self.dense1 = tf.layers.dense(
                    inputs=dense_3, units=512, activation=tf.nn.relu)
                self.dense2 = tf.layers.dense(
                    inputs=self.dense1, units=self.num_action, activation=None)
                return self.dense2

        # optimize
        self.output_logit = net(
            self.input_state
        )  # logit of probility(P(s,a,theta)) for all a, shape (batch_size, num_action)
        index = tf.stack([tf.range(tf.shape(self.action)[0]), self.action], axis=1)
        self.prob = tf.gather_nd(
            tf.nn.softmax(self.output_logit),
            index)  # P(s,a,theta) for selected action, shape (batch_size, 1)

        # loss = E[log(p(s,a))*r]
        # because we want to maximize objective, add negative sign before loss
        self.loss = -tf.reduce_mean(tf.log(self.prob + 0.00000001) * self.reward)
        optimizer = tf.train.AdamOptimizer(learning_rate=1e-6)
        g_gvs = optimizer.compute_gradients(
            self.loss,
            var_list=[v for v in tf.global_variables() if self.name in v.name])
        self.train_op = optimizer.apply_gradients(g_gvs)

        self.pred = tf.multinomial(self.output_logit,
                                   1)  # sample action from distribution

    def select_action(self, input_state, sess):
        feed_dict = {
            self.input_state: input_state,
            self.is_training: False,
        }
        action = sess.run(
            self.pred,
            feed_dict=feed_dict)[0][0]  # sameple action from distribution
        return action

    def update_policy(self, input_states, actions, rewards, input_states_plum):
        feed_dict = {
            self.input_state: input_states,
            self.action: actions,
            self.reward: rewards,
            self.is_training: True,
        }
        loss, _ = sess.run([self.loss, self.train_op], feed_dict=feed_dict)
        return loss

In [15]:
# init agent
tf.reset_default_graph()
# agent for frequently updating
pg_agent = Policy_Gradiebt_Agent('PG_Agent', num_action)
# init all
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.InteractiveSession(config=config)
sess.run(tf.global_variables_initializer())

In [16]:
from IPython.display import Image, display

update_every_episode = 1
print_every_episode = 10
save_video_every_episode = 100
NUM_EPISODE = 100
NUM_EXPLORE = 10
NUM_PASS = 20
reward_values = {
    "positive": 1,
    "tick": 0.1,  # reward per timestamp
    "loss": -1,
}
for episode in range(0, NUM_EPISODE + 1):

  # Reset the environment
    game = FlappyBird()
    env = PLE(
          game,
          fps=30,
          display_screen=False,
          reward_values=reward_values,
          rng=np.random.RandomState(1))
    env.reset_game()
    env.act(0)  # dummy input to make sure input screen is correct

      # record frame
    if episode % save_video_every_episode == 0:
        frames = [env.getScreenRGB()]

      # grayscale input state for this episode
    input_states = preprocess(game.getGameState())

      # cumulate reward for this episode
    cum_reward = 0

    experiences = []
    t = 0
    while not env.game_over():
        # feed four previous screen, select an action
        action = pg_agent.select_action(input_states, sess)

        # execute the action and get reward
        reward = env.act(env.getActionSet()[action])

        # record frame
        if episode % save_video_every_episode == 0:
            frames.append(env.getScreenRGB())

        # cumulate reward
        cum_reward += reward

        # append grayscale screen for this episode
        input_states_plum = preprocess(game.getGameState())

        # append experience for this episode
        experiences.append(
            [input_states, action, reward, input_states_plum])

        t += 1

    def discount_reward(x, discount_rate):
        discounted_r = np.zeros(len(x))
        num_r = len(x)
        for i in range(num_r):
            discounted_r[i] = x[i] * math.pow(discount_rate, i)
        discounted_r = np.cumsum(discounted_r[::-1])
        return discounted_r[::-1]

    rewards = [e[2] for e in experiences]
    discounted_reward = discount_reward(rewards, pg_agent.discount_factor)

      # normalize
    discounted_reward -= np.mean(discounted_reward)
    discounted_reward /= np.std(discounted_reward)
    train_states = []
    train_actions = []
    train_rewards = []
    train_input_states_plum = []
    for i in range(len(experiences)):
        experiences[i][2] = discounted_reward[i]
        train_states.append(experiences[i][0])
        train_actions.append(experiences[i][1])
        train_rewards.append(experiences[i][2])
        train_input_states_plum.append(experiences[i][3])
    #print(np.asarray(train_states).shape[0])
    train_states = np.reshape(np.asarray(train_states),[np.asarray(train_states).shape[0],8])
    train_input_states_plum = np.reshape(np.asarray(train_states),[np.asarray(train_input_states_plum).shape[0],8])
    #train_input_states_plum = np.reshape(np.asarray(train_input_states_plum),[59,8])
    loss = pg_agent.update_policy(train_states, train_actions, train_rewards,
                                    train_input_states_plum)

    if episode % print_every_episode == 0 and episode > NUM_EXPLORE:
        print("[{}] time live:{}, cumulated reward: {}, loss: {}".format(
            episode, t, cum_reward, loss))

    if episode % save_video_every_episode == 0 and episode > NUM_EXPLORE:  # for every 5000 episode, record an animation
        clip = make_anim(frames, fps=60, true_image=True).rotate(-90)
        clip.write_videofile("movie/pg_{}.webm".format(episode), fps=60)
        #display(clip.ipython_display(fps=60, autoplay=1, loop=1))

[20] time live:61, cumulated reward: 5.099999999999994, loss: 0.00023266526113729924
[30] time live:61, cumulated reward: 5.099999999999994, loss: -0.00768717797473073
[40] time live:46, cumulated reward: 3.5999999999999996, loss: 0.012832910753786564
[50] time live:56, cumulated reward: 4.599999999999996, loss: -0.003847088199108839
[60] time live:60, cumulated reward: 4.999999999999995, loss: -0.0014129320625215769
[70] time live:52, cumulated reward: 4.1999999999999975, loss: -8.289630386570934e-06
[80] time live:42, cumulated reward: 3.200000000000001, loss: -0.00023237864661496133
[90] time live:61, cumulated reward: 5.099999999999994, loss: 0.0008553676889277995
[100] time live:72, cumulated reward: 7.19999999999999, loss: -0.005838274955749512
[MoviePy] >>>> Building video movie/pg_100.webm
[MoviePy] Writing video movie/pg_100.webm


 99%|████████████████████████████████████████████████████████████████████████████████▉ | 73/74 [00:00<00:00, 73.83it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_100.webm 



In [17]:
from moviepy.editor import *
clip = VideoFileClip("movie/pg_100.webm")
display(clip.ipython_display(fps=60, autoplay=1, loop=1))

100%|█████████████████████████████████████████████████████████████████████████████████| 74/74 [00:00<00:00, 413.53it/s]


In [18]:
class Actor_critic:

    def __init__(self, name, num_action, discount_factor=0.99):
        self.exploring_rate = 0.1
        self.discount_factor = discount_factor
        self.num_action = num_action
        self.name = name
        with tf.variable_scope(name):
            self.build_model()

    def build_model(self):
    # input: current state, selected action and reward
        self.input_state = tf.placeholder(tf.float32, shape=[None, 8])
        self.action = tf.placeholder(tf.int32, [None])
        self.reward = tf.placeholder(tf.float32, [None])
        self.is_training = tf.placeholder(tf.bool, shape=[])

        def value_net(state, reuse=False):
            with tf.variable_scope(
                "value_net",
                reuse=reuse,
                initializer=tf.truncated_normal_initializer(stddev=1e-2)):
                dense_1 = tf.layers.dense(inputs=state, units=512, activation=tf.nn.relu)
                dense_2 = tf.layers.dense(inputs=dense_1, units=1024, activation=tf.nn.relu)
                dense_3 = tf.layers.dense(inputs=dense_2, units=1024, activation=tf.nn.relu)
                dense = tf.layers.dense(inputs=dense_3, units=512, activation=tf.nn.relu)
                V = tf.layers.dense(inputs=dense, units=1, activation=None)
                return V

        def policy_net(state, reuse=False):
            with tf.variable_scope("policy_net", reuse=reuse):
                dense_1 = tf.layers.dense(inputs=state, units=512, activation=tf.nn.relu)
                dense_2 = tf.layers.dense(inputs=dense_1, units=1024, activation=tf.nn.relu)
                dense_3 = tf.layers.dense(inputs=dense_2, units=1024, activation=tf.nn.relu)

                self.dense1 = tf.layers.dense(
                    inputs=dense_3, units=512, activation=tf.nn.relu)
                self.dense2 = tf.layers.dense(
                    inputs=self.dense1, units=self.num_action, activation=None)
                return self.dense2

        # value
        self.v_output = value_net(
            self.input_state
        )  # Q(s,a,theta) for all a, shape (batch_size, num_action)
        self.tar_V = tf.placeholder(tf.float32, [None])
        self.V_loss = tf.reduce_mean(
            tf.square(self.reward + self.discount_factor * self.tar_V -
                      self.v_output))
        optimizer = tf.train.AdamOptimizer(learning_rate=1e-6)
        g_gvs = optimizer.compute_gradients(
            self.V_loss,
            var_list=[v for v in tf.global_variables() if 'value_net' in v.name])
        self.V_train_op = optimizer.apply_gradients(g_gvs)

        # policy
        self.policy_logit = policy_net(
            self.input_state
        )  # logit of probility(P(s,a,theta)) for all a, shape (batch_size, num_action)
        index = tf.stack([tf.range(tf.shape(self.action)[0]), self.action], axis=1)
        self.prob = tf.gather_nd(
            tf.nn.softmax(self.policy_logit),
            index)  # P(s,a,theta) for selected action, shape (batch_size, 1)

        # loss = E[log(p(s,a))*r]
        self.policy_loss = -tf.reduce_mean(
            tf.log(self.prob + 0.00000001) * self.reward)
        optimizer = tf.train.AdamOptimizer(learning_rate=1e-6)
        g_gvs = optimizer.compute_gradients(
            self.policy_loss,
            var_list=[v for v in tf.global_variables() if 'policy_net' in v.name])
        self.train_op = optimizer.apply_gradients(g_gvs)
        self.pred = tf.multinomial(self.policy_logit,1)  # sample action from distribution

    def select_action(self, input_state, sess):
        feed_dict = {
            self.input_state: input_state,
        }
        action = sess.run(
            self.pred,
            feed_dict=feed_dict)[0][0]  # sameple action from distribution
        return action

    def update_policy(self, input_states, actions, rewards, input_states_plum):
        feed_dict = {
            self.input_state: input_states_plum,
        }
        esti_V = sess.run(self.v_output, feed_dict=feed_dict).flatten()
        td_target = rewards + self.discount_factor * esti_V

        feed_dict = {
            self.input_state: input_states,
        }
        esti_V = sess.run(self.v_output, feed_dict=feed_dict).flatten()
        td_error = td_target - esti_V
        feed_dict = {
            self.input_state: input_states_plum,
        }
        feed_dict = {
            self.input_state: input_states,
            self.tar_V: td_target,
            self.reward: rewards,
        }

        V_loss, _ = sess.run([self.V_loss, self.V_train_op], feed_dict=feed_dict)

        feed_dict = {
            self.input_state: input_states,
            self.action: actions,
            self.reward: td_error,
        }
        policy_loss, _ = sess.run(
            [self.policy_loss, self.train_op], feed_dict=feed_dict)
        return V_loss, policy_loss

    def update_parameters(self, episode):
        if self.exploring_rate > MIN_EXPLORING_RATE:
            self.exploring_rate -= (0.1 - MIN_EXPLORING_RATE) / 3000000

    def shutdown_explore(self):
        # make action selection greedy
        self.exploring_rate = 0

In [19]:
# init agent
tf.reset_default_graph()
# agent for frequently updating
ac_agent = Actor_critic('PG_Agent', num_action)
# init all
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.InteractiveSession(config=config)
sess.run(tf.global_variables_initializer())

In [20]:
from IPython.display import Image, display

update_every_episode = 1
print_every_episode = 10
save_video_every_episode = 100
NUM_EPISODE = 100
NUM_EXPLORE = 0
reward_values = {
    "positive": 1,
    "tick": 0.1,  # reward per timestamp
    "loss": -1,
}
for episode in range(0, NUM_EPISODE + 1):

  # Reset the environment
    game = FlappyBird()
    env = PLE(
          game,
          fps=30,
          display_screen=False,
          reward_values=reward_values,
          rng=np.random.RandomState(1))
    env.reset_game()
    env.act(0)  # dummy input to make sure input screen is correct

      # record frame
    if episode % save_video_every_episode == 0:
        frames = [env.getScreenRGB()]

      # grayscale input screen for this episode
    input_states = preprocess(game.getGameState())

      # cumulate reward for this episode
    cum_reward = 0

    experiences = []
    t = 0
    while not env.game_over():
        # feed four previous screen, select an action
        action = ac_agent.select_action(input_states, sess)

        # execute the action and get reward
        reward = env.act(env.getActionSet()[action])

        # record frame
        if episode % save_video_every_episode == 0:
            frames.append(env.getScreenRGB())

        # cumulate reward
        cum_reward += reward

        # append grayscale screen for this episode
        input_states_plum = preprocess(game.getGameState())

        # append experience for this episode
        experiences.append([input_states, action, reward, input_states_plum])

        t += 1

    def discount_reward(x, discount_rate):
        discounted_r = np.zeros(len(x))
        num_r = len(x)
        for i in range(num_r):
            discounted_r[i] = x[i] * math.pow(discount_rate, i)
        discounted_r = np.cumsum(discounted_r[::-1])
        return discounted_r[::-1]

    rewards = [e[2] for e in experiences]
    discounted_reward = discount_reward(rewards, ac_agent.discount_factor)

      # normalize
    discounted_reward -= np.mean(discounted_reward)
    discounted_reward /= np.std(discounted_reward)
    train_states = []
    train_actions = []
    train_rewards = []
    train_input_states_plum = []
    for i in range(len(experiences)):
        experiences[i][2] = discounted_reward[i]
        train_states.append(experiences[i][0])
        train_actions.append(experiences[i][1])
        train_rewards.append(experiences[i][2])
        train_input_states_plum.append(experiences[i][3])
    train_states = np.reshape(np.asarray(train_states),[np.asarray(train_states).shape[0],8])
    train_input_states_plum = np.reshape(np.asarray(train_states),[np.asarray(train_input_states_plum).shape[0],8])
    loss = ac_agent.update_policy(train_states, train_actions, train_rewards,
                                    train_input_states_plum)

    if episode % print_every_episode == 0 and episode > NUM_EXPLORE:
        print("[{}] time live:{}, cumulated reward: {}, loss: {}".format(
            episode, t, cum_reward, loss))

    if episode % save_video_every_episode == 0 and episode > NUM_EXPLORE:  # for every 5000 episode, record an animation
        clip = make_anim(frames, fps=60, true_image=True).rotate(-90)
        clip.write_videofile("movie/ac_{}.webm".format(episode), fps=60)
        #display(clip.ipython_display(fps=60, autoplay=1, loop=1))

[10] time live:45, cumulated reward: 3.5, loss: (3.9601002, -0.023594368)
[20] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, -0.0032966489)
[30] time live:51, cumulated reward: 4.099999999999998, loss: (3.9601, 0.009605408)
[40] time live:46, cumulated reward: 3.5999999999999996, loss: (3.9601004, -0.04543526)
[50] time live:58, cumulated reward: 4.799999999999995, loss: (3.9601, -0.0177292)
[60] time live:65, cumulated reward: 6.499999999999993, loss: (3.9601, 0.022024155)
[70] time live:53, cumulated reward: 4.299999999999997, loss: (3.9601004, 0.0009068543)
[80] time live:43, cumulated reward: 3.3000000000000007, loss: (3.9601002, -0.014202251)
[90] time live:48, cumulated reward: 3.799999999999999, loss: (3.9601, -0.0017491579)
[100] time live:51, cumulated reward: 4.099999999999998, loss: (3.9601, 0.0049704197)
[MoviePy] >>>> Building video movie/ac_100.webm
[MoviePy] Writing video movie/ac_100.webm


 98%|████████████████████████████████████████████████████████████████████████████████▍ | 52/53 [00:00<00:00, 61.48it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_100.webm 



In [21]:
from moviepy.editor import *
clip = VideoFileClip("movie/ac_100.webm")
display(clip.ipython_display(fps=60, autoplay=1, loop=1))

100%|█████████████████████████████████████████████████████████████████████████████████| 53/53 [00:00<00:00, 335.55it/s]


# Brief Report

此次作業是將screen input改成上一次lab preprocess好的game state。看了討論區的作業討論後，決定將全部的CNN改成fully connected方便實作。由於feature是事先定義好的，但真實的screen的情況會比定義好的複雜很多，故一開始助教的CNN將4個frame的image pixel抽取特徵出來後再去產生Q值會是比較好的。
