# Deep Q-Network

In [1]:
# import package needed
%matplotlib inline
import matplotlib.pyplot as plt
import os
os.environ["SDL_VIDEODRIVER"] = "dummy"  # this line make pop-out window not appear
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import tensorflow as tf
import numpy as np
import skimage.color
import skimage.transform
from ple.games.flappybird import FlappyBird
from ple import PLE
game = FlappyBird()
env = PLE(game, fps=30, display_screen=False)  # environment interface to game

pygame 1.9.4
Hello from the pygame community. https://www.pygame.org/contribute.html
couldn't import doomish
Couldn't import doom


In [2]:
# define input size
screen_width = 80
screen_height = 80
num_stack = 4

In [3]:
'''
def preprocess(screen):
  #screen = skimage.color.rgb2gray(screen)
  screen = skimage.transform.resize(screen, [screen_width, screen_height])
  return screen
'''

bucket_range_per_feature = {
  'next_next_pipe_bottom_y': 40,
  'next_next_pipe_dist_to_player': 512,
  'next_next_pipe_top_y': 40,
  'next_pipe_bottom_y': 20,
  'next_pipe_dist_to_player': 20,
  'next_pipe_top_y': 20,
  'player_vel': 4,
  'player_y': 16
}

def preprocess(state):
    # instead of using absolute position of pipe, use relative position
    state = copy.deepcopy(state)
    state['next_next_pipe_bottom_y'] -= state['player_y']
    state['next_next_pipe_top_y'] -= state['player_y']
    state['next_pipe_bottom_y'] -= state['player_y']
    state['next_pipe_top_y'] -= state['player_y']

    # sort to make list converted from dict ordered in alphabet order
    state_key = [k for k, v in sorted(state.items())]

    # do bucketing to decrease state space to speed up training
    state_idx = []
    for key in state_key:
      state_idx.append(int(state[key] / bucket_range_per_feature[key]))
    result = np.asarray([tuple(state_idx)])
    return result

In [4]:
import math
import copy
from collections import defaultdict
MIN_EXPLORING_RATE = 10e-4


class Agent:

  def __init__(self, name, num_action, t=0, discount_factor=0.99):
    self.exploring_rate = 0.1
    self.discount_factor = discount_factor
    self.num_action = num_action
    self.name = name
    
    with tf.variable_scope(name):
      self.build_model()

  def build_model(self):

    # input: current screen, selected action and reward
    #self.input_screen = tf.placeholder(tf.float32, shape=[None, screen_width, screen_height, num_stack])
    self.input_state = tf.placeholder(tf.float32, [None,8])
    self.action = tf.placeholder(tf.int32, [None])
    self.reward = tf.placeholder(tf.float32, [None])
    self.is_training = tf.placeholder(tf.bool, shape=[])

    def net(state, reuse=False):
      with tf.variable_scope(
          "layers",
          reuse=reuse,
          initializer=tf.truncated_normal_initializer(stddev=1e-2)):
        '''
        conv1 = tf.layers.conv2d(
            inputs=screen,
            filters=32,
            kernel_size=[8, 8],
            strides=[4, 4],
            padding='SAME',
            activation=tf.nn.relu)
        pool1 = tf.layers.max_pooling2d(
            conv1, pool_size=[2, 2], strides=[2, 2], padding='SAME')

        conv2 = tf.layers.conv2d(
            inputs=pool1,
            filters=64,
            kernel_size=[4, 4],
            strides=[2, 2],
            padding='SAME',
            activation=tf.nn.relu)
        conv3 = tf.layers.conv2d(
            inputs=conv2,
            filters=64,
            kernel_size=[3, 3],
            strides=[1, 1],
            padding='SAME',
            activation=tf.nn.relu)
        flat = tf.contrib.layers.flatten(conv3)'''
        dense = tf.layers.dense(inputs=state, units=128, activation=tf.nn.relu)
        dense = tf.layers.dense(inputs=dense, units=128, activation=tf.nn.relu)
        dense = tf.layers.dense(inputs=dense, units=128, activation=tf.nn.relu)
        dense = tf.layers.dense(inputs=dense, units=128, activation=tf.nn.relu)
        Q = tf.layers.dense(
            inputs=dense, units=self.num_action, activation=None)

        return Q

    # optimize
    self.output = net(self.input_state
                     )  # Q(s,a,theta) for all a, shape (batch_size, num_action)
    index = tf.stack([tf.range(tf.shape(self.action)[0]), self.action], axis=1)
    self.esti_Q = tf.gather_nd(
        self.output,
        index)  # Q(s,a,theta) for selected action, shape (batch_size, 1)

    self.max_Q = tf.reduce_max(
        self.output, axis=1)  # max(Q(s',a',theta')), shape (batch_size, 1)
    self.tar_Q = tf.placeholder(tf.float32, [None])

    # loss = E[r+max(Q(s',a',theta'))-Q(s,a,theta)]
    self.loss = tf.reduce_mean(
        tf.square(self.reward + self.discount_factor * self.tar_Q -
                  self.esti_Q))

    optimizer = tf.train.AdamOptimizer(learning_rate=1e-5)
    self.g_gvs = optimizer.compute_gradients(
        self.loss,
        var_list=[v for v in tf.global_variables() if self.name in v.name])
    self.train_op = optimizer.apply_gradients(self.g_gvs)
    self.pred = tf.argmax(
        self.output, axis=1
    )  # select action with highest action-value, only used in inference

  def select_action(self, input_state, sess):
    # epsilon-greedy
    if np.random.rand() < self.exploring_rate:
      action = np.random.choice(num_action)  # Select a random action
    else:
      #input_screen = np.array(input_screen).transpose([1, 2, 0])
      feed_dict = {
          self.input_state: input_state,
          self.is_training: False,
      }
      action = sess.run(
          self.pred,
          feed_dict=feed_dict)[0]  # Select the action with the highest q
    return action

  def update_policy(self, input_state, actions, rewards, input_state_plum,
                    terminal, target_netwrok):
    # use max_Q estimate from target one to update online one
    feed_dict = {
        target_netwrok.input_state:
            input_state_plum,
        target_netwrok.is_training:
            True,
    }
    max_Q = sess.run(target_netwrok.max_Q, feed_dict=feed_dict)
    max_Q *= ~np.array(terminal)
    feed_dict = {
        self.input_state: input_state,
        self.tar_Q: max_Q,
        self.action: actions,
        self.reward: rewards,
        self.is_training: True,
    }
    loss, _ = sess.run([self.loss, self.train_op], feed_dict=feed_dict)
    return loss

  def update_parameters(self, episode):
    if self.exploring_rate > MIN_EXPLORING_RATE:
      self.exploring_rate -= (0.1 - MIN_EXPLORING_RATE) / 3000000

  def shutdown_explore(self):
    # make action selection greedy
    self.exploring_rate = 0

In [5]:
def get_update_ops():
  # return operations assign weight to target network
  src_vars = [v for v in tf.global_variables() if 'online' in v.name]
  tar_vars = [v for v in tf.global_variables() if 'target' in v.name]
  update_ops = []
  for src_var, tar_var in zip(src_vars, tar_vars):
    update_ops.append(tar_var.assign(src_var))
  return update_ops


def update_target(update_ops, sess):
  sess.run(update_ops)



In [6]:
# init agent
tf.reset_default_graph()
num_action = len(env.getActionSet())

# agent for frequently updating
online_agent = Agent('online', num_action)

# agent for slow updating
target_agent = Agent('target', num_action)
update_ops = get_update_ops()

In [7]:
class Replay_buffer():

  def __init__(self, buffer_size=50000):
    self.experiences = []
    self.buffer_size = buffer_size

  def add(self, experience):
    if len(self.experiences) >= self.buffer_size:
      self.experiences.pop(0)
    self.experiences.append(experience)

  def sample(self, size):
    """
        sameple experience from buffer
        """
    if size > len(self.experiences):
      experiences_idx = np.random.choice(len(self.experiences), size=size)
    else:
      experiences_idx = np.random.choice(
          len(self.experiences), size=size, replace=False)
    # from all sampled experiences, extract a tuple of (s,a,r,s')
    states = []
    actions = []
    rewards = []
    states_plum = []
    terminal = []
    for i in range(size):
      states.append(self.experiences[experiences_idx[i]][0])
      actions.append(self.experiences[experiences_idx[i]][1])
      rewards.append(self.experiences[experiences_idx[i]][2])
      states_plum.append(self.experiences[experiences_idx[i]][3])
      terminal.append(self.experiences[experiences_idx[i]][4])
    return states, actions, rewards, states_plum, terminal

In [8]:
# init buffer
buffer = Replay_buffer()

In [9]:
def make_anim(images, fps=60, true_image=False):
  duration = len(images) / fps
  import moviepy.editor as mpy

  def make_frame(t):
    try:
      x = images[int(len(images) / duration * t)]
    except:
      x = images[-1]

    if true_image:
      return x.astype(np.uint8)
    else:
      return ((x + 1) / 2 * 255).astype(np.uint8)

  clip = mpy.VideoClip(make_frame, duration=duration)
  clip.fps = fps
  return clip

In [10]:
# init all
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.InteractiveSession(config=config)
sess.run(tf.global_variables_initializer())

In [None]:
from IPython.display import Image, display

update_every_t_step = 3
print_every_episode = 200
save_video_every_episode = 200
NUM_EPISODE = 20000
NUM_EXPLORE = 20

# we can redefine origin reward function
reward_values = {
    "positive": 1,  # reward pass a pipe
    "tick": 0.1,  # reward per timestamp
    "loss": -1,  # reward of gameover
}
for episode in range(0, NUM_EPISODE + 1):

  # Reset the environment
  game = FlappyBird()
  # for demo purpose, the following code is trained in the same scene,
  env = PLE(
      game,
      fps=30,
      display_screen=False,
      reward_values=reward_values,
      rng=np.random.RandomState(1))
  env.reset_game()
  env.act(0)  # dummy input to make sure input screen is correct

  # record frame
  if episode % save_video_every_episode == 0:
    frames = [env.getScreenRGB()]

  # for every 500 episodes, shutdown exploration to see performance of greedy action
  if episode % print_every_episode == 0:
    online_agent.shutdown_explore()

  # grayscale input screen for this episode
  #input_screens = [preprocess(env.getScreenGrayscale())] * 4

  input_states = preprocess(game.getGameState())

  # experience for this episode, store all (s,a,r,s') tuple
  experience = []

  # cumulate reward for this episode
  cum_reward = 0 
    
  t = 0
  while not env.game_over():

    # feed four previous screen, select an action
    action = online_agent.select_action(input_states, sess)

    # execute the action and get reward
    reward = env.act(env.getActionSet()[action])

    # record frame
    if episode % save_video_every_episode == 0:
      frames.append(env.getScreenRGB())

    # cumulate reward
    cum_reward += reward

    # append grayscale screen for this episode
    #input_screens.append(preprocess(env.getScreenGrayscale()))

    input_states_plum = preprocess(game.getGameState())
    
    # append experience for this episode
    buffer.add((input_states, action, reward, input_states_plum,
                env.game_over()))
    t += 1

    # update agent
  if episode > NUM_EXPLORE:
    train_states, train_actions, train_rewards, train_states_plum, terminal = buffer.sample(32)
    loss = online_agent.update_policy(np.squeeze(train_states), train_actions,
                                      train_rewards, np.squeeze(train_states_plum),
                                      terminal, target_agent)
  if t % update_every_t_step == 0 and episode > NUM_EXPLORE:
    update_target(update_ops, sess)

  # update explore rating and learning rate
  online_agent.update_parameters(episode)
  target_agent.update_parameters(episode)

  if episode % print_every_episode == 0 and episode > NUM_EXPLORE:
    print(
        "[{}] time live:{}, cumulated reward: {}, exploring rate: {}, loss: {}".
        format(episode, t, cum_reward, target_agent.exploring_rate, loss))

  if episode % save_video_every_episode == 0:  # for every 100 episode, record an animation
    clip = make_anim(frames, fps=60, true_image=True).rotate(-90)
    clip.write_videofile("movie/DQN-{}.webm".format(episode), fps=60)

[MoviePy] >>>> Building video movie/DQN-0.webm
[MoviePy] Writing video movie/DQN-0.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:00<00:00, 77.55it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-0.webm 

[200] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09999336700000071, loss: 0.03457795828580856
[MoviePy] >>>> Building video movie/DQN-200.webm
[MoviePy] Writing video movie/DQN-200.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 51.17it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-200.webm 

[400] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0999867670000014, loss: 0.0056176502257585526
[MoviePy] >>>> Building video movie/DQN-400.webm
[MoviePy] Writing video movie/DQN-400.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 47.95it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-400.webm 

[600] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0999801670000021, loss: 0.0004317938582971692
[MoviePy] >>>> Building video movie/DQN-600.webm
[MoviePy] Writing video movie/DQN-600.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 49.66it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-600.webm 

[800] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0999735670000028, loss: 0.000228092132601887
[MoviePy] >>>> Building video movie/DQN-800.webm
[MoviePy] Writing video movie/DQN-800.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 47.83it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-800.webm 

[1000] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0999669670000035, loss: 0.030442040413618088
[MoviePy] >>>> Building video movie/DQN-1000.webm
[MoviePy] Writing video movie/DQN-1000.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 46.39it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-1000.webm 

[1200] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0999603670000042, loss: 0.030554236844182014
[MoviePy] >>>> Building video movie/DQN-1200.webm
[MoviePy] Writing video movie/DQN-1200.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 46.73it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-1200.webm 

[1400] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0999537670000049, loss: 0.030377186834812164
[MoviePy] >>>> Building video movie/DQN-1400.webm
[MoviePy] Writing video movie/DQN-1400.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 48.13it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-1400.webm 

[1600] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0999471670000056, loss: 0.0002821034286171198
[MoviePy] >>>> Building video movie/DQN-1600.webm
[MoviePy] Writing video movie/DQN-1600.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 49.44it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-1600.webm 

[1800] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0999405670000063, loss: 0.0002171500527765602
[MoviePy] >>>> Building video movie/DQN-1800.webm
[MoviePy] Writing video movie/DQN-1800.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 49.86it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-1800.webm 

[2000] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09993396700000699, loss: 0.030475247651338577
[MoviePy] >>>> Building video movie/DQN-2000.webm
[MoviePy] Writing video movie/DQN-2000.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 48.74it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-2000.webm 

[2200] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09992736700000769, loss: 0.030444839969277382
[MoviePy] >>>> Building video movie/DQN-2200.webm
[MoviePy] Writing video movie/DQN-2200.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 53.99it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-2200.webm 

[2400] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09992076700000839, loss: 0.06068011000752449
[MoviePy] >>>> Building video movie/DQN-2400.webm
[MoviePy] Writing video movie/DQN-2400.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 54.71it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-2400.webm 

[2600] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09991416700000909, loss: 0.030498284846544266
[MoviePy] >>>> Building video movie/DQN-2600.webm
[MoviePy] Writing video movie/DQN-2600.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 49.92it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-2600.webm 

[2800] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09990756700000979, loss: 0.00023315238649956882
[MoviePy] >>>> Building video movie/DQN-2800.webm
[MoviePy] Writing video movie/DQN-2800.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 49.04it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-2800.webm 

[3000] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09990096700001048, loss: 0.030494121834635735
[MoviePy] >>>> Building video movie/DQN-3000.webm
[MoviePy] Writing video movie/DQN-3000.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 50.39it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-3000.webm 

[3200] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09989436700001118, loss: 0.00035538000520318747
[MoviePy] >>>> Building video movie/DQN-3200.webm
[MoviePy] Writing video movie/DQN-3200.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 50.11it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-3200.webm 

[3400] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09988776700001188, loss: 0.030465753749012947
[MoviePy] >>>> Building video movie/DQN-3400.webm
[MoviePy] Writing video movie/DQN-3400.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 54.06it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-3400.webm 

[3600] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09988116700001258, loss: 0.061158858239650726
[MoviePy] >>>> Building video movie/DQN-3600.webm
[MoviePy] Writing video movie/DQN-3600.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 47.41it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-3600.webm 

[3800] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09987456700001328, loss: 0.0003860170254483819
[MoviePy] >>>> Building video movie/DQN-3800.webm
[MoviePy] Writing video movie/DQN-3800.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 50.13it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-3800.webm 

[4000] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09986796700001398, loss: 0.0003036668640561402
[MoviePy] >>>> Building video movie/DQN-4000.webm
[MoviePy] Writing video movie/DQN-4000.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 55.10it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-4000.webm 

[4200] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09986136700001468, loss: 0.030449239537119865
[MoviePy] >>>> Building video movie/DQN-4200.webm
[MoviePy] Writing video movie/DQN-4200.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 53.87it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-4200.webm 

[4400] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09985476700001537, loss: 0.0003007449849974364
[MoviePy] >>>> Building video movie/DQN-4400.webm
[MoviePy] Writing video movie/DQN-4400.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 53.10it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-4400.webm 

[4600] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09984816700001607, loss: 0.030521037057042122
[MoviePy] >>>> Building video movie/DQN-4600.webm
[MoviePy] Writing video movie/DQN-4600.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 53.78it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-4600.webm 

[4800] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09984156700001677, loss: 0.00024748884607106447
[MoviePy] >>>> Building video movie/DQN-4800.webm
[MoviePy] Writing video movie/DQN-4800.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 54.08it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-4800.webm 

[5000] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09983496700001747, loss: 0.00025932438438758254
[MoviePy] >>>> Building video movie/DQN-5000.webm
[MoviePy] Writing video movie/DQN-5000.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 51.84it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-5000.webm 

[5200] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09982836700001817, loss: 0.00020141879213042557
[MoviePy] >>>> Building video movie/DQN-5200.webm
[MoviePy] Writing video movie/DQN-5200.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 48.94it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-5200.webm 

[5400] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09982176700001887, loss: 0.00026122538838535547
[MoviePy] >>>> Building video movie/DQN-5400.webm
[MoviePy] Writing video movie/DQN-5400.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 49.15it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-5400.webm 

[5600] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09981516700001956, loss: 0.0003006410552188754
[MoviePy] >>>> Building video movie/DQN-5600.webm
[MoviePy] Writing video movie/DQN-5600.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 45.29it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-5600.webm 

[5800] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09980856700002026, loss: 0.06080994755029678
[MoviePy] >>>> Building video movie/DQN-5800.webm
[MoviePy] Writing video movie/DQN-5800.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 48.26it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-5800.webm 

[6000] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09980196700002096, loss: 0.00030517642153427005
[MoviePy] >>>> Building video movie/DQN-6000.webm
[MoviePy] Writing video movie/DQN-6000.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 45.84it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-6000.webm 

[6200] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09979536700002166, loss: 0.030479973182082176
[MoviePy] >>>> Building video movie/DQN-6200.webm
[MoviePy] Writing video movie/DQN-6200.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 50.65it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-6200.webm 

[6400] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09978876700002236, loss: 0.0002818656212184578
[MoviePy] >>>> Building video movie/DQN-6400.webm
[MoviePy] Writing video movie/DQN-6400.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 50.34it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-6400.webm 

[6600] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09978216700002306, loss: 0.00027000438421964645
[MoviePy] >>>> Building video movie/DQN-6600.webm
[MoviePy] Writing video movie/DQN-6600.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 53.57it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-6600.webm 

[6800] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09977556700002375, loss: 0.0002987095504067838
[MoviePy] >>>> Building video movie/DQN-6800.webm
[MoviePy] Writing video movie/DQN-6800.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 49.46it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-6800.webm 

[7000] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09976896700002445, loss: 0.00027598970336839557
[MoviePy] >>>> Building video movie/DQN-7000.webm
[MoviePy] Writing video movie/DQN-7000.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 49.35it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-7000.webm 

[7200] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09976236700002515, loss: 0.00030219441396184266
[MoviePy] >>>> Building video movie/DQN-7200.webm
[MoviePy] Writing video movie/DQN-7200.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 50.63it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-7200.webm 

[7400] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09975576700002585, loss: 0.00027556990971788764
[MoviePy] >>>> Building video movie/DQN-7400.webm
[MoviePy] Writing video movie/DQN-7400.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 49.82it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-7400.webm 

[7600] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09974916700002655, loss: 0.0608372688293457
[MoviePy] >>>> Building video movie/DQN-7600.webm
[MoviePy] Writing video movie/DQN-7600.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 44.67it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-7600.webm 

[7800] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09974256700002725, loss: 0.00027756288181990385
[MoviePy] >>>> Building video movie/DQN-7800.webm
[MoviePy] Writing video movie/DQN-7800.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 54.10it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-7800.webm 

[8000] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09973596700002794, loss: 0.0001822013728087768
[MoviePy] >>>> Building video movie/DQN-8000.webm
[MoviePy] Writing video movie/DQN-8000.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 49.76it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-8000.webm 

[8200] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09972936700002864, loss: 0.030563460662961006
[MoviePy] >>>> Building video movie/DQN-8200.webm
[MoviePy] Writing video movie/DQN-8200.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 48.07it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-8200.webm 

[8400] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09972276700002934, loss: 0.0002555629180278629
[MoviePy] >>>> Building video movie/DQN-8400.webm
[MoviePy] Writing video movie/DQN-8400.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 49.73it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-8400.webm 

[8600] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09971616700003004, loss: 0.060824085026979446
[MoviePy] >>>> Building video movie/DQN-8600.webm
[MoviePy] Writing video movie/DQN-8600.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 53.86it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-8600.webm 

[8800] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09970956700003074, loss: 0.030489016324281693
[MoviePy] >>>> Building video movie/DQN-8800.webm
[MoviePy] Writing video movie/DQN-8800.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 50.69it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-8800.webm 

[9000] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09970296700003144, loss: 0.00022469925170298666
[MoviePy] >>>> Building video movie/DQN-9000.webm
[MoviePy] Writing video movie/DQN-9000.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 49.40it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-9000.webm 

[9200] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09969636700003214, loss: 0.00023739745665807277
[MoviePy] >>>> Building video movie/DQN-9200.webm
[MoviePy] Writing video movie/DQN-9200.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 47.89it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-9200.webm 

[9400] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09968976700003283, loss: 0.03044702485203743
[MoviePy] >>>> Building video movie/DQN-9400.webm
[MoviePy] Writing video movie/DQN-9400.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 51.93it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-9400.webm 

[9600] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09968316700003353, loss: 0.00031525338999927044
[MoviePy] >>>> Building video movie/DQN-9600.webm
[MoviePy] Writing video movie/DQN-9600.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 49.51it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-9600.webm 

[9800] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09967656700003423, loss: 0.06065772846341133
[MoviePy] >>>> Building video movie/DQN-9800.webm
[MoviePy] Writing video movie/DQN-9800.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 47.10it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-9800.webm 

[10000] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09966996700003493, loss: 0.030546972528100014
[MoviePy] >>>> Building video movie/DQN-10000.webm
[MoviePy] Writing video movie/DQN-10000.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 46.62it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-10000.webm 

[10200] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09966336700003563, loss: 0.030470939353108406
[MoviePy] >>>> Building video movie/DQN-10200.webm
[MoviePy] Writing video movie/DQN-10200.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 53.99it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-10200.webm 

[10400] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09965676700003633, loss: 0.03051428496837616
[MoviePy] >>>> Building video movie/DQN-10400.webm
[MoviePy] Writing video movie/DQN-10400.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 46.02it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-10400.webm 

[10600] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09965016700003702, loss: 0.00025589141296222806
[MoviePy] >>>> Building video movie/DQN-10600.webm
[MoviePy] Writing video movie/DQN-10600.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 50.70it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-10600.webm 

[10800] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09964356700003772, loss: 0.00018863091827370226
[MoviePy] >>>> Building video movie/DQN-10800.webm
[MoviePy] Writing video movie/DQN-10800.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 48.78it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-10800.webm 

[11000] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09963696700003842, loss: 0.060748498886823654
[MoviePy] >>>> Building video movie/DQN-11000.webm
[MoviePy] Writing video movie/DQN-11000.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 53.93it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-11000.webm 

[11200] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09963036700003912, loss: 0.030449241399765015
[MoviePy] >>>> Building video movie/DQN-11200.webm
[MoviePy] Writing video movie/DQN-11200.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 43.94it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-11200.webm 

[11400] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09962376700003982, loss: 0.00031883371411822736
[MoviePy] >>>> Building video movie/DQN-11400.webm
[MoviePy] Writing video movie/DQN-11400.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 48.28it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-11400.webm 

[11600] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09961716700004052, loss: 0.00033403339330106974
[MoviePy] >>>> Building video movie/DQN-11600.webm
[MoviePy] Writing video movie/DQN-11600.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 48.70it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-11600.webm 

[11800] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09961056700004121, loss: 0.00027219249750487506
[MoviePy] >>>> Building video movie/DQN-11800.webm
[MoviePy] Writing video movie/DQN-11800.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 53.15it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-11800.webm 

[12000] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09960396700004191, loss: 0.00031301548006013036
[MoviePy] >>>> Building video movie/DQN-12000.webm
[MoviePy] Writing video movie/DQN-12000.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 47.83it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-12000.webm 

[12200] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09959736700004261, loss: 0.0003650028957054019
[MoviePy] >>>> Building video movie/DQN-12200.webm
[MoviePy] Writing video movie/DQN-12200.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 47.21it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-12200.webm 

[12400] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09959076700004331, loss: 0.00022749684285372496
[MoviePy] >>>> Building video movie/DQN-12400.webm
[MoviePy] Writing video movie/DQN-12400.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 47.82it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-12400.webm 

[12600] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09958416700004401, loss: 0.00026558974059298635
[MoviePy] >>>> Building video movie/DQN-12600.webm
[MoviePy] Writing video movie/DQN-12600.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 49.14it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-12600.webm 

[12800] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0995775670000447, loss: 0.00029398989863693714
[MoviePy] >>>> Building video movie/DQN-12800.webm
[MoviePy] Writing video movie/DQN-12800.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 47.54it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-12800.webm 

[13000] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0995709670000454, loss: 0.03047366440296173
[MoviePy] >>>> Building video movie/DQN-13000.webm
[MoviePy] Writing video movie/DQN-13000.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 49.48it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-13000.webm 

[13200] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0995643670000461, loss: 0.030370257794857025
[MoviePy] >>>> Building video movie/DQN-13200.webm
[MoviePy] Writing video movie/DQN-13200.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 48.39it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-13200.webm 

[13400] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0995577670000468, loss: 0.030429089441895485
[MoviePy] >>>> Building video movie/DQN-13400.webm
[MoviePy] Writing video movie/DQN-13400.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 54.65it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-13400.webm 

[13600] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0995511670000475, loss: 0.00036557228304445744
[MoviePy] >>>> Building video movie/DQN-13600.webm
[MoviePy] Writing video movie/DQN-13600.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 50.33it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-13600.webm 

[13800] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0995445670000482, loss: 0.0002911413903348148
[MoviePy] >>>> Building video movie/DQN-13800.webm
[MoviePy] Writing video movie/DQN-13800.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 48.29it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-13800.webm 

[14000] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0995379670000489, loss: 0.000192973660887219
[MoviePy] >>>> Building video movie/DQN-14000.webm
[MoviePy] Writing video movie/DQN-14000.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 50.75it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-14000.webm 

[14200] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0995313670000496, loss: 0.030584031715989113
[MoviePy] >>>> Building video movie/DQN-14200.webm
[MoviePy] Writing video movie/DQN-14200.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 50.80it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-14200.webm 

[14400] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.0995247670000503, loss: 0.00026266981149092317
[MoviePy] >>>> Building video movie/DQN-14400.webm
[MoviePy] Writing video movie/DQN-14400.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 48.86it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-14400.webm 

[14600] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09951816700005099, loss: 0.0003111872938461602
[MoviePy] >>>> Building video movie/DQN-14600.webm
[MoviePy] Writing video movie/DQN-14600.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 48.20it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-14600.webm 

[14800] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09951156700005169, loss: 0.0002852014731615782
[MoviePy] >>>> Building video movie/DQN-14800.webm
[MoviePy] Writing video movie/DQN-14800.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 46.30it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-14800.webm 

[15000] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09950496700005239, loss: 0.060972895473241806
[MoviePy] >>>> Building video movie/DQN-15000.webm
[MoviePy] Writing video movie/DQN-15000.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 48.59it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-15000.webm 

[15200] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09949836700005309, loss: 0.00027131973183713853
[MoviePy] >>>> Building video movie/DQN-15200.webm
[MoviePy] Writing video movie/DQN-15200.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 51.39it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-15200.webm 

[15400] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09949176700005379, loss: 0.030565578490495682
[MoviePy] >>>> Building video movie/DQN-15400.webm
[MoviePy] Writing video movie/DQN-15400.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 50.11it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-15400.webm 

[15600] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09948516700005448, loss: 0.00026916558272205293
[MoviePy] >>>> Building video movie/DQN-15600.webm
[MoviePy] Writing video movie/DQN-15600.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 49.52it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-15600.webm 

[15800] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09947856700005518, loss: 0.060259975492954254
[MoviePy] >>>> Building video movie/DQN-15800.webm
[MoviePy] Writing video movie/DQN-15800.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 48.14it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-15800.webm 

[16000] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09947196700005588, loss: 0.030452024191617966
[MoviePy] >>>> Building video movie/DQN-16000.webm
[MoviePy] Writing video movie/DQN-16000.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 46.93it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-16000.webm 

[16200] time live:61, cumulated reward: 5.099999999999994, exploring rate: 0.09946536700005658, loss: 0.060992531478405
[MoviePy] >>>> Building video movie/DQN-16200.webm
[MoviePy] Writing video movie/DQN-16200.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 48.03it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/DQN-16200.webm 



In [12]:
from moviepy.editor import *
clip = VideoFileClip("movie/DQN-200.webm")
display(clip.ipython_display(fps=60, autoplay=1, loop=1))

 98%|███████████████████████████████████████████████████████████████████████████████▋ | 63/64 [00:00<00:00, 467.73it/s]


# Policy Gradient

In [39]:
import math
import copy
from collections import defaultdict
MIN_EXPLORING_RATE = 0.01
MIN_LEARNING_RATE = 0.1


class Policy_Gradiebt_Agent:

  def __init__(self, name, num_action, t=0, discount_factor=0.99):
    self.discount_factor = discount_factor
    self.num_action = num_action
    self.name = name
    with tf.variable_scope(name):
      self.build_model()

  def build_model(self):

    # input: current screen, selected action and reward
    #self.input_screen = tf.placeholder(tf.float32, shape=[None, screen_width, screen_height, num_stack])
    self.input_state = tf.placeholder(tf.float32, [None,8])
    self.action = tf.placeholder(tf.int32, [None])
    self.reward = tf.placeholder(tf.float32, [None])
    self.is_training = tf.placeholder(tf.bool, shape=[])

    def net(state, reuse=False):
      with tf.variable_scope("layers", reuse=reuse):
        '''
        conv1 = tf.layers.conv2d(
            inputs=screen,
            filters=32,
            kernel_size=[8, 8],
            strides=[4, 4],
            padding='SAME',
            activation=tf.nn.relu)
        pool1 = tf.layers.max_pooling2d(
            conv1, pool_size=[2, 2], strides=[2, 2], padding='SAME')

        conv2 = tf.layers.conv2d(
            inputs=pool1,
            filters=64,
            kernel_size=[4, 4],
            strides=[2, 2],
            padding='SAME',
            activation=tf.nn.relu)
        conv3 = tf.layers.conv2d(
            inputs=conv2,
            filters=64,
            kernel_size=[3, 3],
            strides=[1, 1],
            padding='SAME',
            activation=tf.nn.relu)
        self.flat = tf.contrib.layers.flatten(conv3)
        '''
        dense = tf.layers.dense(inputs=state, units=128, activation=tf.nn.relu)
        dense = tf.layers.dense(inputs=dense, units=128, activation=tf.nn.relu)
        dense = tf.layers.dense(inputs=dense, units=128, activation=tf.nn.relu)
        self.dense1 = tf.layers.dense(
            inputs=dense, units=512, activation=tf.nn.relu)
        self.dense2 = tf.layers.dense(
            inputs=self.dense1, units=self.num_action, activation=None)
        return self.dense2

    # optimize
    self.output_logit = net(
        self.input_state
    )  # logit of probility(P(s,a,theta)) for all a, shape (batch_size, num_action)
    index = tf.stack([tf.range(tf.shape(self.action)[0]), self.action], axis=1)
    self.prob = tf.gather_nd(
        tf.nn.softmax(self.output_logit),
        index)  # P(s,a,theta) for selected action, shape (batch_size, 1)

    # loss = E[log(p(s,a))*r]
    # because we want to maximize objective, add negative sign before loss
    self.loss = -tf.reduce_mean(tf.log(self.prob + 0.00000001) * self.reward)
    optimizer = tf.train.AdamOptimizer(learning_rate=1e-6)
    g_gvs = optimizer.compute_gradients(
        self.loss,
        var_list=[v for v in tf.global_variables() if self.name in v.name])
    self.train_op = optimizer.apply_gradients(g_gvs)

    self.pred = tf.multinomial(self.output_logit,
                               1)  # sample action from distribution

  def select_action(self, input_state, sess):
    #input_screen = np.array(input_screen).transpose([1, 2, 0])
    feed_dict = {
        self.input_state: input_state,
        self.is_training: False,
    }
    action = sess.run(
        self.pred,
        feed_dict=feed_dict)[0][0]  # sameple action from distribution
    return action

  def update_policy(self, input_state, actions, rewards, input_state_plum):
    feed_dict = {
        self.input_state: input_state,
        self.action: actions,
        self.reward: rewards,
        self.is_training: True,
    }
    loss, _ = sess.run([self.loss, self.train_op], feed_dict=feed_dict)
    return loss


In [40]:
# init agent
tf.reset_default_graph()
# agent for frequently updating
pg_agent = Policy_Gradiebt_Agent('PG_Agent', num_action)
# init all
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.InteractiveSession(config=config)
sess.run(tf.global_variables_initializer())

In [41]:
from IPython.display import Image, display

update_every_episode = 1
print_every_episode = 100
save_video_every_episode = 100
NUM_EPISODE = 10000
NUM_EXPLORE = 10
NUM_PASS = 20
reward_values = {
    "positive": 1,
    "tick": 0.1,  # reward per timestamp
    "loss": -1,
}
for episode in range(0, NUM_EPISODE + 1):

  # Reset the environment
  game = FlappyBird()
  env = PLE(
      game,
      fps=30,
      display_screen=False,
      reward_values=reward_values,
      rng=np.random.RandomState(1))
  env.reset_game()
  env.act(0)  # dummy input to make sure input screen is correct

  # record frame
  if episode % save_video_every_episode == 0:
    frames = [env.getScreenRGB()]

  # grayscale input screen for this episode
  #input_screens = [preprocess(env.getScreenGrayscale())] * 4
    
  input_states = preprocess(game.getGameState())

  # cumulate reward for this episode
  cum_reward = 0

  experiences = []
  t = 0
  while not env.game_over():
    # feed four previous screen, select an action
    action = pg_agent.select_action(input_states, sess)

    # execute the action and get reward
    reward = env.act(env.getActionSet()[action])

    # record frame
    if episode % save_video_every_episode == 0:
      frames.append(env.getScreenRGB())

    # cumulate reward
    cum_reward += reward

    # append grayscale screen for this episode
    #input_screens.append(preprocess(env.getScreenGrayscale()))

    input_states_plum = preprocess(game.getGameState())
    
    # append experience for this episode
    experiences.append(
        [input_states, action, reward, input_states_plum])

    t += 1

  def discount_reward(x, discount_rate):
    discounted_r = np.zeros(len(x))
    num_r = len(x)
    for i in range(num_r):
      discounted_r[i] = x[i] * math.pow(discount_rate, i)
    discounted_r = np.cumsum(discounted_r[::-1])
    return discounted_r[::-1]

  rewards = [e[2] for e in experiences]
  discounted_reward = discount_reward(rewards, pg_agent.discount_factor)

  # normalize
  discounted_reward -= np.mean(discounted_reward)
  discounted_reward /= np.std(discounted_reward)
  train_states = []
  train_actions = []
  train_rewards = []
  train_input_states_plum = []
  for i in range(len(experiences)):
    experiences[i][2] = discounted_reward[i]
    train_states.append(experiences[i][0])
    train_actions.append(experiences[i][1])
    train_rewards.append(experiences[i][2])
    train_input_states_plum.append(experiences[i][3])
  loss = pg_agent.update_policy(np.squeeze(train_states), train_actions, train_rewards,
                                np.squeeze(train_input_states_plum))

  if episode % print_every_episode == 0 and episode > NUM_EXPLORE:
    print("[{}] time live:{}, cumulated reward: {}, loss: {}".format(
        episode, t, cum_reward, loss))

  if episode % save_video_every_episode == 0 and episode > NUM_EXPLORE:  # for every 5000 episode, record an animation
    clip = make_anim(frames, fps=60, true_image=True).rotate(-90)
    clip.write_videofile("movie/pg_{}.webm".format(episode), fps=60)
    #display(clip.ipython_display(fps=60, autoplay=1, loop=1))

[100] time live:44, cumulated reward: 3.4000000000000004, loss: 0.052745211869478226
[MoviePy] >>>> Building video movie/pg_100.webm
[MoviePy] Writing video movie/pg_100.webm


 98%|████████████████████████████████████████████████████████████████████████████████▏ | 45/46 [00:01<00:00, 38.38it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_100.webm 

[200] time live:47, cumulated reward: 3.6999999999999993, loss: 0.011050487868487835
[MoviePy] >>>> Building video movie/pg_200.webm
[MoviePy] Writing video movie/pg_200.webm


 98%|████████████████████████████████████████████████████████████████████████████████▎ | 48/49 [00:01<00:00, 48.54it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_200.webm 

[300] time live:48, cumulated reward: 3.799999999999999, loss: 0.029944023117423058
[MoviePy] >>>> Building video movie/pg_300.webm
[MoviePy] Writing video movie/pg_300.webm


 98%|████████████████████████████████████████████████████████████████████████████████▎ | 49/50 [00:01<00:00, 42.48it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_300.webm 

[400] time live:54, cumulated reward: 4.399999999999997, loss: -0.016886623576283455
[MoviePy] >>>> Building video movie/pg_400.webm
[MoviePy] Writing video movie/pg_400.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 55/56 [00:01<00:00, 44.67it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_400.webm 

[500] time live:48, cumulated reward: 3.799999999999999, loss: 0.012325246818363667
[MoviePy] >>>> Building video movie/pg_500.webm
[MoviePy] Writing video movie/pg_500.webm


 98%|████████████████████████████████████████████████████████████████████████████████▎ | 49/50 [00:01<00:00, 37.43it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_500.webm 

[600] time live:61, cumulated reward: 5.099999999999994, loss: 0.014545378275215626
[MoviePy] >>>> Building video movie/pg_600.webm
[MoviePy] Writing video movie/pg_600.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 45.39it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_600.webm 

[700] time live:53, cumulated reward: 4.299999999999997, loss: -0.015559142455458641
[MoviePy] >>>> Building video movie/pg_700.webm
[MoviePy] Writing video movie/pg_700.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 54/55 [00:01<00:00, 47.70it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_700.webm 

[800] time live:49, cumulated reward: 3.8999999999999986, loss: 0.029776904731988907
[MoviePy] >>>> Building video movie/pg_800.webm
[MoviePy] Writing video movie/pg_800.webm


 98%|████████████████████████████████████████████████████████████████████████████████▍ | 50/51 [00:01<00:00, 45.45it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_800.webm 

[900] time live:53, cumulated reward: 4.299999999999997, loss: 0.028577426448464394
[MoviePy] >>>> Building video movie/pg_900.webm
[MoviePy] Writing video movie/pg_900.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 54/55 [00:01<00:00, 46.45it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_900.webm 

[1000] time live:51, cumulated reward: 4.099999999999998, loss: 0.003425317583605647
[MoviePy] >>>> Building video movie/pg_1000.webm
[MoviePy] Writing video movie/pg_1000.webm


 98%|████████████████████████████████████████████████████████████████████████████████▍ | 52/53 [00:01<00:00, 40.95it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_1000.webm 

[1100] time live:50, cumulated reward: 3.9999999999999982, loss: 0.02545434981584549
[MoviePy] >>>> Building video movie/pg_1100.webm
[MoviePy] Writing video movie/pg_1100.webm


 98%|████████████████████████████████████████████████████████████████████████████████▍ | 51/52 [00:00<00:00, 51.18it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_1100.webm 

[1200] time live:53, cumulated reward: 4.299999999999997, loss: -0.00319671630859375
[MoviePy] >>>> Building video movie/pg_1200.webm
[MoviePy] Writing video movie/pg_1200.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 54/55 [00:01<00:00, 42.50it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_1200.webm 

[1300] time live:60, cumulated reward: 4.999999999999995, loss: -0.030508454889059067
[MoviePy] >>>> Building video movie/pg_1300.webm
[MoviePy] Writing video movie/pg_1300.webm


 98%|████████████████████████████████████████████████████████████████████████████████▋ | 61/62 [00:01<00:00, 47.83it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_1300.webm 

[1400] time live:49, cumulated reward: 3.8999999999999986, loss: -0.02029559016227722
[MoviePy] >>>> Building video movie/pg_1400.webm
[MoviePy] Writing video movie/pg_1400.webm


 98%|████████████████████████████████████████████████████████████████████████████████▍ | 50/51 [00:01<00:00, 43.39it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_1400.webm 

[1500] time live:61, cumulated reward: 5.099999999999994, loss: -0.04183003306388855
[MoviePy] >>>> Building video movie/pg_1500.webm
[MoviePy] Writing video movie/pg_1500.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 43.72it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_1500.webm 

[1600] time live:52, cumulated reward: 4.1999999999999975, loss: -0.03382433205842972
[MoviePy] >>>> Building video movie/pg_1600.webm
[MoviePy] Writing video movie/pg_1600.webm


 98%|████████████████████████████████████████████████████████████████████████████████▍ | 53/54 [00:01<00:00, 39.56it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_1600.webm 

[1700] time live:57, cumulated reward: 4.699999999999996, loss: -0.03775755688548088
[MoviePy] >>>> Building video movie/pg_1700.webm
[MoviePy] Writing video movie/pg_1700.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 58/59 [00:01<00:00, 41.77it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_1700.webm 

[1800] time live:56, cumulated reward: 4.599999999999996, loss: -0.05260828509926796
[MoviePy] >>>> Building video movie/pg_1800.webm
[MoviePy] Writing video movie/pg_1800.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 57/58 [00:01<00:00, 46.57it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_1800.webm 

[1900] time live:47, cumulated reward: 3.6999999999999993, loss: 0.03157731145620346
[MoviePy] >>>> Building video movie/pg_1900.webm
[MoviePy] Writing video movie/pg_1900.webm


 98%|████████████████████████████████████████████████████████████████████████████████▎ | 48/49 [00:01<00:00, 37.85it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_1900.webm 

[2000] time live:59, cumulated reward: 4.899999999999995, loss: 0.017174171283841133
[MoviePy] >>>> Building video movie/pg_2000.webm
[MoviePy] Writing video movie/pg_2000.webm


 98%|████████████████████████████████████████████████████████████████████████████████▋ | 60/61 [00:01<00:00, 37.08it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_2000.webm 

[2100] time live:59, cumulated reward: 4.899999999999995, loss: -0.07170457392930984
[MoviePy] >>>> Building video movie/pg_2100.webm
[MoviePy] Writing video movie/pg_2100.webm


 98%|████████████████████████████████████████████████████████████████████████████████▋ | 60/61 [00:01<00:00, 40.36it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_2100.webm 

[2200] time live:47, cumulated reward: 3.6999999999999993, loss: 0.019302591681480408
[MoviePy] >>>> Building video movie/pg_2200.webm
[MoviePy] Writing video movie/pg_2200.webm


 98%|████████████████████████████████████████████████████████████████████████████████▎ | 48/49 [00:01<00:00, 38.34it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_2200.webm 

[2300] time live:57, cumulated reward: 4.699999999999996, loss: -0.050547800958156586
[MoviePy] >>>> Building video movie/pg_2300.webm
[MoviePy] Writing video movie/pg_2300.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 58/59 [00:01<00:00, 42.68it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_2300.webm 

[2400] time live:61, cumulated reward: 5.099999999999994, loss: 0.061673883348703384
[MoviePy] >>>> Building video movie/pg_2400.webm
[MoviePy] Writing video movie/pg_2400.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 37.59it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_2400.webm 

[2500] time live:59, cumulated reward: 4.899999999999995, loss: 0.012951446697115898
[MoviePy] >>>> Building video movie/pg_2500.webm
[MoviePy] Writing video movie/pg_2500.webm


 98%|████████████████████████████████████████████████████████████████████████████████▋ | 60/61 [00:01<00:00, 41.33it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_2500.webm 

[2600] time live:52, cumulated reward: 4.1999999999999975, loss: -0.025958610698580742
[MoviePy] >>>> Building video movie/pg_2600.webm
[MoviePy] Writing video movie/pg_2600.webm


 98%|████████████████████████████████████████████████████████████████████████████████▍ | 53/54 [00:01<00:00, 41.58it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_2600.webm 

[2700] time live:61, cumulated reward: 5.099999999999994, loss: 0.022882962599396706
[MoviePy] >>>> Building video movie/pg_2700.webm
[MoviePy] Writing video movie/pg_2700.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 37.74it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_2700.webm 

[2800] time live:61, cumulated reward: 5.099999999999994, loss: -0.0003796874370891601
[MoviePy] >>>> Building video movie/pg_2800.webm
[MoviePy] Writing video movie/pg_2800.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 37.58it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_2800.webm 

[2900] time live:53, cumulated reward: 4.299999999999997, loss: -0.011232124641537666
[MoviePy] >>>> Building video movie/pg_2900.webm
[MoviePy] Writing video movie/pg_2900.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 54/55 [00:01<00:00, 41.31it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_2900.webm 

[3000] time live:55, cumulated reward: 4.4999999999999964, loss: -0.07766690850257874
[MoviePy] >>>> Building video movie/pg_3000.webm
[MoviePy] Writing video movie/pg_3000.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 56/57 [00:01<00:00, 35.80it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_3000.webm 

[3100] time live:61, cumulated reward: 5.099999999999994, loss: -0.030250253155827522
[MoviePy] >>>> Building video movie/pg_3100.webm
[MoviePy] Writing video movie/pg_3100.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 34.57it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_3100.webm 

[3200] time live:51, cumulated reward: 4.099999999999998, loss: -0.050385549664497375
[MoviePy] >>>> Building video movie/pg_3200.webm
[MoviePy] Writing video movie/pg_3200.webm


 98%|████████████████████████████████████████████████████████████████████████████████▍ | 52/53 [00:02<00:00, 20.95it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_3200.webm 

[3300] time live:51, cumulated reward: 4.099999999999998, loss: 0.003101236652582884
[MoviePy] >>>> Building video movie/pg_3300.webm
[MoviePy] Writing video movie/pg_3300.webm


 98%|████████████████████████████████████████████████████████████████████████████████▍ | 52/53 [00:01<00:00, 40.59it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_3300.webm 

[3400] time live:60, cumulated reward: 4.999999999999995, loss: -0.0006911277887411416
[MoviePy] >>>> Building video movie/pg_3400.webm
[MoviePy] Writing video movie/pg_3400.webm


 98%|████████████████████████████████████████████████████████████████████████████████▋ | 61/62 [00:01<00:00, 41.86it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_3400.webm 

[3500] time live:55, cumulated reward: 4.4999999999999964, loss: -0.053135596215724945
[MoviePy] >>>> Building video movie/pg_3500.webm
[MoviePy] Writing video movie/pg_3500.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 56/57 [00:01<00:00, 38.12it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_3500.webm 

[3600] time live:61, cumulated reward: 5.099999999999994, loss: -0.026998644694685936
[MoviePy] >>>> Building video movie/pg_3600.webm
[MoviePy] Writing video movie/pg_3600.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 32.11it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_3600.webm 

[3700] time live:61, cumulated reward: 5.099999999999994, loss: 0.031718768179416656
[MoviePy] >>>> Building video movie/pg_3700.webm
[MoviePy] Writing video movie/pg_3700.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 34.24it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_3700.webm 

[3800] time live:47, cumulated reward: 3.6999999999999993, loss: -0.013847919180989265
[MoviePy] >>>> Building video movie/pg_3800.webm
[MoviePy] Writing video movie/pg_3800.webm


 98%|████████████████████████████████████████████████████████████████████████████████▎ | 48/49 [00:01<00:00, 41.83it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_3800.webm 

[3900] time live:54, cumulated reward: 4.399999999999997, loss: -0.09137040376663208
[MoviePy] >>>> Building video movie/pg_3900.webm
[MoviePy] Writing video movie/pg_3900.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 55/56 [00:01<00:00, 43.08it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_3900.webm 

[4000] time live:56, cumulated reward: 4.599999999999996, loss: 0.026154842227697372
[MoviePy] >>>> Building video movie/pg_4000.webm
[MoviePy] Writing video movie/pg_4000.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 57/58 [00:01<00:00, 41.66it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_4000.webm 

[4100] time live:54, cumulated reward: 4.399999999999997, loss: 0.009112958796322346
[MoviePy] >>>> Building video movie/pg_4100.webm
[MoviePy] Writing video movie/pg_4100.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 55/56 [00:01<00:00, 36.03it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_4100.webm 

[4200] time live:59, cumulated reward: 4.899999999999995, loss: 0.010548979975283146
[MoviePy] >>>> Building video movie/pg_4200.webm
[MoviePy] Writing video movie/pg_4200.webm


 98%|████████████████████████████████████████████████████████████████████████████████▋ | 60/61 [00:01<00:00, 44.63it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_4200.webm 

[4300] time live:45, cumulated reward: 3.5, loss: 0.011054865084588528
[MoviePy] >>>> Building video movie/pg_4300.webm
[MoviePy] Writing video movie/pg_4300.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 47/47 [00:01<00:00, 36.58it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_4300.webm 

[4400] time live:57, cumulated reward: 4.699999999999996, loss: -0.12576349079608917
[MoviePy] >>>> Building video movie/pg_4400.webm
[MoviePy] Writing video movie/pg_4400.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 58/59 [00:01<00:00, 40.26it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_4400.webm 

[4500] time live:54, cumulated reward: 4.399999999999997, loss: -0.05070142447948456
[MoviePy] >>>> Building video movie/pg_4500.webm
[MoviePy] Writing video movie/pg_4500.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 55/56 [00:01<00:00, 30.47it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_4500.webm 

[4600] time live:59, cumulated reward: 4.899999999999995, loss: -0.03121030144393444
[MoviePy] >>>> Building video movie/pg_4600.webm
[MoviePy] Writing video movie/pg_4600.webm


 98%|████████████████████████████████████████████████████████████████████████████████▋ | 60/61 [00:01<00:00, 38.23it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_4600.webm 

[4700] time live:55, cumulated reward: 4.4999999999999964, loss: 0.020847182720899582
[MoviePy] >>>> Building video movie/pg_4700.webm
[MoviePy] Writing video movie/pg_4700.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 56/57 [00:01<00:00, 33.70it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_4700.webm 

[4800] time live:52, cumulated reward: 4.1999999999999975, loss: 0.03994132950901985
[MoviePy] >>>> Building video movie/pg_4800.webm
[MoviePy] Writing video movie/pg_4800.webm


 98%|████████████████████████████████████████████████████████████████████████████████▍ | 53/54 [00:01<00:00, 40.50it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_4800.webm 

[4900] time live:61, cumulated reward: 5.099999999999994, loss: -0.0006888655479997396
[MoviePy] >>>> Building video movie/pg_4900.webm
[MoviePy] Writing video movie/pg_4900.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 34.92it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_4900.webm 

[5000] time live:61, cumulated reward: 5.099999999999994, loss: -0.02096685767173767
[MoviePy] >>>> Building video movie/pg_5000.webm
[MoviePy] Writing video movie/pg_5000.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 39.52it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_5000.webm 

[5100] time live:61, cumulated reward: 5.099999999999994, loss: 0.027252698317170143
[MoviePy] >>>> Building video movie/pg_5100.webm
[MoviePy] Writing video movie/pg_5100.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 36.10it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_5100.webm 

[5200] time live:57, cumulated reward: 4.699999999999996, loss: -0.07837419211864471
[MoviePy] >>>> Building video movie/pg_5200.webm
[MoviePy] Writing video movie/pg_5200.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 58/59 [00:01<00:00, 46.13it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_5200.webm 

[5300] time live:56, cumulated reward: 4.599999999999996, loss: 0.03219657391309738
[MoviePy] >>>> Building video movie/pg_5300.webm
[MoviePy] Writing video movie/pg_5300.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 57/58 [00:01<00:00, 35.11it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_5300.webm 

[5400] time live:51, cumulated reward: 4.099999999999998, loss: -0.038497719913721085
[MoviePy] >>>> Building video movie/pg_5400.webm
[MoviePy] Writing video movie/pg_5400.webm


 98%|████████████████████████████████████████████████████████████████████████████████▍ | 52/53 [00:01<00:00, 38.34it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_5400.webm 

[5500] time live:61, cumulated reward: 5.099999999999994, loss: 0.027692247182130814
[MoviePy] >>>> Building video movie/pg_5500.webm
[MoviePy] Writing video movie/pg_5500.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 33.46it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_5500.webm 

[5600] time live:57, cumulated reward: 4.699999999999996, loss: -0.091570183634758
[MoviePy] >>>> Building video movie/pg_5600.webm
[MoviePy] Writing video movie/pg_5600.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 58/59 [00:01<00:00, 38.03it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_5600.webm 

[5700] time live:53, cumulated reward: 4.299999999999997, loss: -0.09315476566553116
[MoviePy] >>>> Building video movie/pg_5700.webm
[MoviePy] Writing video movie/pg_5700.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 54/55 [00:01<00:00, 36.05it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_5700.webm 

[5800] time live:57, cumulated reward: 4.699999999999996, loss: -0.11108257621526718
[MoviePy] >>>> Building video movie/pg_5800.webm
[MoviePy] Writing video movie/pg_5800.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 58/59 [00:01<00:00, 42.44it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_5800.webm 

[5900] time live:57, cumulated reward: 4.699999999999996, loss: -0.025478463619947433
[MoviePy] >>>> Building video movie/pg_5900.webm
[MoviePy] Writing video movie/pg_5900.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 58/59 [00:01<00:00, 42.71it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_5900.webm 

[6000] time live:60, cumulated reward: 4.999999999999995, loss: -0.03439108654856682
[MoviePy] >>>> Building video movie/pg_6000.webm
[MoviePy] Writing video movie/pg_6000.webm


 98%|████████████████████████████████████████████████████████████████████████████████▋ | 61/62 [00:01<00:00, 41.23it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_6000.webm 

[6100] time live:61, cumulated reward: 5.099999999999994, loss: 0.004295489750802517
[MoviePy] >>>> Building video movie/pg_6100.webm
[MoviePy] Writing video movie/pg_6100.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 33.98it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_6100.webm 

[6200] time live:54, cumulated reward: 4.399999999999997, loss: -0.03251584246754646
[MoviePy] >>>> Building video movie/pg_6200.webm
[MoviePy] Writing video movie/pg_6200.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 55/56 [00:01<00:00, 41.27it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_6200.webm 

[6300] time live:61, cumulated reward: 5.099999999999994, loss: -0.038311224430799484
[MoviePy] >>>> Building video movie/pg_6300.webm
[MoviePy] Writing video movie/pg_6300.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 35.41it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_6300.webm 

[6400] time live:54, cumulated reward: 4.399999999999997, loss: 0.005631729494780302
[MoviePy] >>>> Building video movie/pg_6400.webm
[MoviePy] Writing video movie/pg_6400.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 55/56 [00:01<00:00, 45.83it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_6400.webm 

[6500] time live:61, cumulated reward: 5.099999999999994, loss: 0.015112095512449741
[MoviePy] >>>> Building video movie/pg_6500.webm
[MoviePy] Writing video movie/pg_6500.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 31.68it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_6500.webm 

[6600] time live:53, cumulated reward: 4.299999999999997, loss: -0.008131656795740128
[MoviePy] >>>> Building video movie/pg_6600.webm
[MoviePy] Writing video movie/pg_6600.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 54/55 [00:01<00:00, 34.67it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_6600.webm 

[6700] time live:57, cumulated reward: 4.699999999999996, loss: 0.05248964950442314
[MoviePy] >>>> Building video movie/pg_6700.webm
[MoviePy] Writing video movie/pg_6700.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 58/59 [00:01<00:00, 42.50it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_6700.webm 

[6800] time live:61, cumulated reward: 5.099999999999994, loss: -0.04030599817633629
[MoviePy] >>>> Building video movie/pg_6800.webm
[MoviePy] Writing video movie/pg_6800.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 34.44it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_6800.webm 

[6900] time live:53, cumulated reward: 4.299999999999997, loss: -0.07600204646587372
[MoviePy] >>>> Building video movie/pg_6900.webm
[MoviePy] Writing video movie/pg_6900.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 54/55 [00:01<00:00, 34.64it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_6900.webm 

[7000] time live:61, cumulated reward: 5.099999999999994, loss: 0.029980143532156944
[MoviePy] >>>> Building video movie/pg_7000.webm
[MoviePy] Writing video movie/pg_7000.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 34.07it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_7000.webm 

[7100] time live:58, cumulated reward: 4.799999999999995, loss: 0.02233077771961689
[MoviePy] >>>> Building video movie/pg_7100.webm
[MoviePy] Writing video movie/pg_7100.webm


 98%|████████████████████████████████████████████████████████████████████████████████▋ | 59/60 [00:01<00:00, 37.62it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_7100.webm 

[7200] time live:61, cumulated reward: 5.099999999999994, loss: -0.11690396070480347
[MoviePy] >>>> Building video movie/pg_7200.webm
[MoviePy] Writing video movie/pg_7200.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 36.42it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_7200.webm 

[7300] time live:57, cumulated reward: 4.699999999999996, loss: 0.12686598300933838
[MoviePy] >>>> Building video movie/pg_7300.webm
[MoviePy] Writing video movie/pg_7300.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 58/59 [00:01<00:00, 38.45it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_7300.webm 

[7400] time live:55, cumulated reward: 4.4999999999999964, loss: 0.033414267003536224
[MoviePy] >>>> Building video movie/pg_7400.webm
[MoviePy] Writing video movie/pg_7400.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 56/57 [00:01<00:00, 41.87it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_7400.webm 

[7500] time live:56, cumulated reward: 4.599999999999996, loss: -0.049816932529211044
[MoviePy] >>>> Building video movie/pg_7500.webm
[MoviePy] Writing video movie/pg_7500.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 57/58 [00:01<00:00, 41.92it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_7500.webm 

[7600] time live:54, cumulated reward: 4.399999999999997, loss: 0.04694018512964249
[MoviePy] >>>> Building video movie/pg_7600.webm
[MoviePy] Writing video movie/pg_7600.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 55/56 [00:01<00:00, 34.32it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_7600.webm 

[7700] time live:55, cumulated reward: 4.4999999999999964, loss: -0.0009215961908921599
[MoviePy] >>>> Building video movie/pg_7700.webm
[MoviePy] Writing video movie/pg_7700.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 56/57 [00:01<00:00, 40.67it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_7700.webm 

[7800] time live:61, cumulated reward: 5.099999999999994, loss: -0.10740136355161667
[MoviePy] >>>> Building video movie/pg_7800.webm
[MoviePy] Writing video movie/pg_7800.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 34.66it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_7800.webm 

[7900] time live:52, cumulated reward: 4.1999999999999975, loss: 0.07479233294725418
[MoviePy] >>>> Building video movie/pg_7900.webm
[MoviePy] Writing video movie/pg_7900.webm


 98%|████████████████████████████████████████████████████████████████████████████████▍ | 53/54 [00:01<00:00, 36.56it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_7900.webm 

[8000] time live:58, cumulated reward: 4.799999999999995, loss: -0.010325135663151741
[MoviePy] >>>> Building video movie/pg_8000.webm
[MoviePy] Writing video movie/pg_8000.webm


 98%|████████████████████████████████████████████████████████████████████████████████▋ | 59/60 [00:01<00:00, 43.13it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_8000.webm 

[8100] time live:56, cumulated reward: 4.599999999999996, loss: 0.009682689793407917
[MoviePy] >>>> Building video movie/pg_8100.webm
[MoviePy] Writing video movie/pg_8100.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 57/58 [00:01<00:00, 42.45it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_8100.webm 

[8200] time live:56, cumulated reward: 4.599999999999996, loss: 0.12319759279489517
[MoviePy] >>>> Building video movie/pg_8200.webm
[MoviePy] Writing video movie/pg_8200.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 57/58 [00:01<00:00, 37.08it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_8200.webm 

[8300] time live:53, cumulated reward: 4.299999999999997, loss: -0.14125706255435944
[MoviePy] >>>> Building video movie/pg_8300.webm
[MoviePy] Writing video movie/pg_8300.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 54/55 [00:01<00:00, 38.73it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_8300.webm 

[8400] time live:61, cumulated reward: 5.099999999999994, loss: 0.0072333537973463535
[MoviePy] >>>> Building video movie/pg_8400.webm
[MoviePy] Writing video movie/pg_8400.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 39.33it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_8400.webm 

[8500] time live:54, cumulated reward: 4.399999999999997, loss: 0.06858299672603607
[MoviePy] >>>> Building video movie/pg_8500.webm
[MoviePy] Writing video movie/pg_8500.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 55/56 [00:01<00:00, 35.55it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_8500.webm 

[8600] time live:53, cumulated reward: 4.299999999999997, loss: 0.07972976565361023
[MoviePy] >>>> Building video movie/pg_8600.webm
[MoviePy] Writing video movie/pg_8600.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 54/55 [00:01<00:00, 37.80it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_8600.webm 

[8700] time live:56, cumulated reward: 4.599999999999996, loss: 0.11240022629499435
[MoviePy] >>>> Building video movie/pg_8700.webm
[MoviePy] Writing video movie/pg_8700.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 57/58 [00:01<00:00, 41.88it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_8700.webm 

[8800] time live:58, cumulated reward: 4.799999999999995, loss: 0.06152830272912979
[MoviePy] >>>> Building video movie/pg_8800.webm
[MoviePy] Writing video movie/pg_8800.webm


 98%|████████████████████████████████████████████████████████████████████████████████▋ | 59/60 [00:01<00:00, 33.38it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_8800.webm 

[8900] time live:59, cumulated reward: 4.899999999999995, loss: 0.027671555057168007
[MoviePy] >>>> Building video movie/pg_8900.webm
[MoviePy] Writing video movie/pg_8900.webm


 98%|████████████████████████████████████████████████████████████████████████████████▋ | 60/61 [00:01<00:00, 36.52it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_8900.webm 

[9000] time live:58, cumulated reward: 4.799999999999995, loss: 0.03331838920712471
[MoviePy] >>>> Building video movie/pg_9000.webm
[MoviePy] Writing video movie/pg_9000.webm


 98%|████████████████████████████████████████████████████████████████████████████████▋ | 59/60 [00:01<00:00, 35.10it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_9000.webm 

[9100] time live:59, cumulated reward: 4.899999999999995, loss: 0.0035450176801532507
[MoviePy] >>>> Building video movie/pg_9100.webm
[MoviePy] Writing video movie/pg_9100.webm


 98%|████████████████████████████████████████████████████████████████████████████████▋ | 60/61 [00:01<00:00, 34.57it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_9100.webm 

[9200] time live:61, cumulated reward: 5.099999999999994, loss: 0.13460522890090942
[MoviePy] >>>> Building video movie/pg_9200.webm
[MoviePy] Writing video movie/pg_9200.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 33.85it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_9200.webm 

[9300] time live:61, cumulated reward: 5.099999999999994, loss: -0.0302289891988039
[MoviePy] >>>> Building video movie/pg_9300.webm
[MoviePy] Writing video movie/pg_9300.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 32.15it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_9300.webm 

[9400] time live:61, cumulated reward: 5.099999999999994, loss: 0.06470123678445816
[MoviePy] >>>> Building video movie/pg_9400.webm
[MoviePy] Writing video movie/pg_9400.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 45.39it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_9400.webm 

[9500] time live:61, cumulated reward: 5.099999999999994, loss: -0.048279933631420135
[MoviePy] >>>> Building video movie/pg_9500.webm
[MoviePy] Writing video movie/pg_9500.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 33.06it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_9500.webm 

[9600] time live:61, cumulated reward: 5.099999999999994, loss: 0.021065305918455124
[MoviePy] >>>> Building video movie/pg_9600.webm
[MoviePy] Writing video movie/pg_9600.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:02<00:00, 28.65it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_9600.webm 

[9700] time live:55, cumulated reward: 4.4999999999999964, loss: -0.10208221524953842
[MoviePy] >>>> Building video movie/pg_9700.webm
[MoviePy] Writing video movie/pg_9700.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 56/57 [00:01<00:00, 35.72it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_9700.webm 

[9800] time live:53, cumulated reward: 4.299999999999997, loss: -0.006243939511477947
[MoviePy] >>>> Building video movie/pg_9800.webm
[MoviePy] Writing video movie/pg_9800.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 54/55 [00:01<00:00, 34.05it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_9800.webm 

[9900] time live:56, cumulated reward: 4.599999999999996, loss: 0.03826720267534256
[MoviePy] >>>> Building video movie/pg_9900.webm
[MoviePy] Writing video movie/pg_9900.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 57/58 [00:02<00:00, 29.92it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_9900.webm 

[10000] time live:61, cumulated reward: 5.099999999999994, loss: -0.032292820513248444
[MoviePy] >>>> Building video movie/pg_10000.webm
[MoviePy] Writing video movie/pg_10000.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:02<00:00, 30.93it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/pg_10000.webm 



In [24]:
from moviepy.editor import *
clip = VideoFileClip("movie/pg_200.webm")
display(clip.ipython_display(fps=60, autoplay=1, loop=1))

 98%|███████████████████████████████████████████████████████████████████████████████▋ | 63/64 [00:00<00:00, 423.33it/s]


# Actor-Critic

In [36]:
class Actor_critic:

  def __init__(self, name, num_action, discount_factor=0.99):
    self.exploring_rate = 0.1
    self.discount_factor = discount_factor
    self.num_action = num_action
    self.name = name
    with tf.variable_scope(name):
      self.build_model()

  def build_model(self):
    # input: current screen, selected action and reward
    #self.input_screen = tf.placeholder(tf.float32, shape=[None, screen_width, screen_height, num_stack])
    self.input_state = tf.placeholder(tf.float32, [None,8])
    self.action = tf.placeholder(tf.int32, [None])
    self.reward = tf.placeholder(tf.float32, [None])
    self.is_training = tf.placeholder(tf.bool, shape=[])

    def value_net(state, reuse=False):
      with tf.variable_scope(
          "value_net",
          reuse=reuse,
          initializer=tf.truncated_normal_initializer(stddev=1e-2)):
        '''
        conv1 = tf.layers.conv2d(
            inputs=screen,
            filters=32,
            kernel_size=[8, 8],
            strides=[4, 4],
            padding='SAME',
            activation=tf.nn.relu)
        pool1 = tf.layers.max_pooling2d(
            conv1, pool_size=[2, 2], strides=[2, 2], padding='SAME')

        conv2 = tf.layers.conv2d(
            inputs=pool1,
            filters=64,
            kernel_size=[4, 4],
            strides=[2, 2],
            padding='SAME',
            activation=tf.nn.relu)
        conv3 = tf.layers.conv2d(
            inputs=conv2,
            filters=64,
            kernel_size=[3, 3],
            strides=[1, 1],
            padding='SAME',
            activation=tf.nn.relu)
        flat = tf.contrib.layers.flatten(conv3)
        '''
        dense = tf.layers.dense(inputs=state, units=128, activation=tf.nn.relu)
        dense = tf.layers.dense(inputs=dense, units=128, activation=tf.nn.relu)
        dense = tf.layers.dense(inputs=dense, units=128, activation=tf.nn.relu)
        dense = tf.layers.dense(inputs=dense, units=512, activation=tf.nn.relu)
        V = tf.layers.dense(inputs=dense, units=1, activation=None)
        return V

    def policy_net(state, reuse=False):
      with tf.variable_scope("policy_net", reuse=reuse):
        '''
        conv1 = tf.layers.conv2d(
            inputs=screen,
            filters=32,
            kernel_size=[8, 8],
            strides=[4, 4],
            padding='SAME',
            activation=tf.nn.relu)
        pool1 = tf.layers.max_pooling2d(
            conv1, pool_size=[2, 2], strides=[2, 2], padding='SAME')

        conv2 = tf.layers.conv2d(
            inputs=pool1,
            filters=64,
            kernel_size=[4, 4],
            strides=[2, 2],
            padding='SAME',
            activation=tf.nn.relu)
        conv3 = tf.layers.conv2d(
            inputs=conv2,
            filters=64,
            kernel_size=[3, 3],
            strides=[1, 1],
            padding='SAME',
            activation=tf.nn.relu)
        self.flat = tf.contrib.layers.flatten(conv3)
        '''
        dense = tf.layers.dense(inputs=state, units=128, activation=tf.nn.relu)
        dense = tf.layers.dense(inputs=dense, units=128, activation=tf.nn.relu)
        dense = tf.layers.dense(inputs=dense, units=128, activation=tf.nn.relu)
        self.dense1 = tf.layers.dense(
            inputs=dense, units=512, activation=tf.nn.relu)
        self.dense2 = tf.layers.dense(
            inputs=self.dense1, units=self.num_action, activation=None)
        return self.dense2

    # value
    self.v_output = value_net(
        self.input_state
    )  # Q(s,a,theta) for all a, shape (batch_size, num_action)
    self.tar_V = tf.placeholder(tf.float32, [None])
    self.V_loss = tf.reduce_mean(
        tf.square(self.reward + self.discount_factor * self.tar_V -
                  self.v_output))
    optimizer = tf.train.AdamOptimizer(learning_rate=1e-6)
    g_gvs = optimizer.compute_gradients(
        self.V_loss,
        var_list=[v for v in tf.global_variables() if 'value_net' in v.name])
    self.V_train_op = optimizer.apply_gradients(g_gvs)

    # policy
    self.policy_logit = policy_net(
        self.input_state
    )  # logit of probility(P(s,a,theta)) for all a, shape (batch_size, num_action)
    index = tf.stack([tf.range(tf.shape(self.action)[0]), self.action], axis=1)
    self.prob = tf.gather_nd(
        tf.nn.softmax(self.policy_logit),
        index)  # P(s,a,theta) for selected action, shape (batch_size, 1)

    # loss = E[log(p(s,a))*r]
    self.policy_loss = -tf.reduce_mean(
        tf.log(self.prob + 0.00000001) * self.reward)
    optimizer = tf.train.AdamOptimizer(learning_rate=1e-6)
    g_gvs = optimizer.compute_gradients(
        self.policy_loss,
        var_list=[v for v in tf.global_variables() if 'policy_net' in v.name])
    self.train_op = optimizer.apply_gradients(g_gvs)
    self.pred = tf.multinomial(self.policy_logit,
                               1)  # sample action from distribution

  def select_action(self, input_state, sess):
    #input_screen = np.array(input_screen).transpose([1, 2, 0])
    feed_dict = {
        self.input_state: input_state,
    }
    action = sess.run(
        self.pred,
        feed_dict=feed_dict)[0][0]  # sameple action from distribution
    return action

  def update_policy(self, input_state, actions, rewards, input_state_plum):
    feed_dict = {
        self.input_state: input_state_plum,
    }
    esti_V = sess.run(self.v_output, feed_dict=feed_dict).flatten()
    td_target = rewards + self.discount_factor * esti_V

    feed_dict = {
        self.input_state: input_state,
    }
    esti_V = sess.run(self.v_output, feed_dict=feed_dict).flatten()
    td_error = td_target - esti_V
    feed_dict = {
        self.input_state: input_state_plum,
    }
    feed_dict = {
        self.input_state: input_state,
        self.tar_V: td_target,
        self.reward: rewards,
    }

    V_loss, _ = sess.run([self.V_loss, self.V_train_op], feed_dict=feed_dict)

    feed_dict = {
        self.input_state: input_state,
        self.action: actions,
        self.reward: td_error,
    }
    policy_loss, _ = sess.run(
        [self.policy_loss, self.train_op], feed_dict=feed_dict)
    return V_loss, policy_loss

  def update_parameters(self, episode):
    if self.exploring_rate > MIN_EXPLORING_RATE:
      self.exploring_rate -= (0.1 - MIN_EXPLORING_RATE) / 3000000

  def shutdown_explore(self):
    # make action selection greedy
    self.exploring_rate = 0

In [37]:
# init agent
tf.reset_default_graph()
# agent for frequently updating
ac_agent = Actor_critic('PG_Agent', num_action)
# init all
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.InteractiveSession(config=config)
sess.run(tf.global_variables_initializer())

In [38]:
from IPython.display import Image, display

update_every_episode = 1
print_every_episode = 100
save_video_every_episode = 100
NUM_EPISODE = 10000
NUM_EXPLORE = 0
reward_values = {
    "positive": 1,
    "tick": 0.1,  # reward per timestamp
    "loss": -1,
}
for episode in range(0, NUM_EPISODE + 1):

  # Reset the environment
  game = FlappyBird()
  env = PLE(
      game,
      fps=30,
      display_screen=False,
      reward_values=reward_values,
      rng=np.random.RandomState(1))
  env.reset_game()
  env.act(0)  # dummy input to make sure input screen is correct

  # record frame
  if episode % save_video_every_episode == 0:
    frames = [env.getScreenRGB()]

  # grayscale input screen for this episode
  #input_screens = [preprocess(env.getScreenGrayscale())] * 4

  input_states = preprocess(game.getGameState())
    
  # cumulate reward for this episode
  cum_reward = 0

  experiences = []
  t = 0
  while not env.game_over():
    # feed four previous screen, select an action
    action = ac_agent.select_action(input_states, sess)

    # execute the action and get reward
    reward = env.act(env.getActionSet()[action])

    # record frame
    if episode % save_video_every_episode == 0:
      frames.append(env.getScreenRGB())

    # cumulate reward
    cum_reward += reward

    # append grayscale screen for this episode
    #input_screens.append(preprocess(env.getScreenGrayscale()))

    input_states_plum = preprocess(game.getGameState())
    
    # append experience for this episode
    experiences.append(
        [input_states, action, reward, input_states_plum])

    t += 1

  def discount_reward(x, discount_rate):
    discounted_r = np.zeros(len(x))
    num_r = len(x)
    for i in range(num_r):
      discounted_r[i] = x[i] * math.pow(discount_rate, i)
    discounted_r = np.cumsum(discounted_r[::-1])
    return discounted_r[::-1]

  rewards = [e[2] for e in experiences]
  discounted_reward = discount_reward(rewards, ac_agent.discount_factor)

  # normalize
  discounted_reward -= np.mean(discounted_reward)
  discounted_reward /= np.std(discounted_reward)
  train_states = []
  train_actions = []
  train_rewards = []
  train_input_states_plum = []
  for i in range(len(experiences)):
    experiences[i][2] = discounted_reward[i]
    train_states.append(experiences[i][0])
    train_actions.append(experiences[i][1])
    train_rewards.append(experiences[i][2])
    train_input_states_plum.append(experiences[i][3])
  loss = ac_agent.update_policy(np.squeeze(train_states), train_actions, train_rewards,
                                np.squeeze(train_input_states_plum))

  if episode % print_every_episode == 0 and episode > NUM_EXPLORE:
    print("[{}] time live:{}, cumulated reward: {}, loss: {}".format(
        episode, t, cum_reward, loss))

  if episode % save_video_every_episode == 0 and episode > NUM_EXPLORE:  # for every 5000 episode, record an animation
    clip = make_anim(frames, fps=60, true_image=True).rotate(-90)
    clip.write_videofile("movie/ac_{}.webm".format(episode), fps=60)
    #display(clip.ipython_display(fps=60, autoplay=1, loop=1))

[100] time live:75, cumulated reward: 7.499999999999989, loss: (3.9601018, 0.05025584)
[MoviePy] >>>> Building video movie/ac_100.webm
[MoviePy] Writing video movie/ac_100.webm


 99%|████████████████████████████████████████████████████████████████████████████████▉ | 76/77 [00:01<00:00, 41.39it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_100.webm 

[200] time live:54, cumulated reward: 4.399999999999997, loss: (3.9601, -0.079638325)
[MoviePy] >>>> Building video movie/ac_200.webm
[MoviePy] Writing video movie/ac_200.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 55/56 [00:01<00:00, 42.42it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_200.webm 

[300] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600995, -0.059361473)
[MoviePy] >>>> Building video movie/ac_300.webm
[MoviePy] Writing video movie/ac_300.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 33.13it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_300.webm 

[400] time live:50, cumulated reward: 3.9999999999999982, loss: (3.9601, 0.06469143)
[MoviePy] >>>> Building video movie/ac_400.webm
[MoviePy] Writing video movie/ac_400.webm


 98%|████████████████████████████████████████████████████████████████████████████████▍ | 51/52 [00:01<00:00, 41.24it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_400.webm 

[500] time live:53, cumulated reward: 4.299999999999997, loss: (3.9601004, -0.09443105)
[MoviePy] >>>> Building video movie/ac_500.webm
[MoviePy] Writing video movie/ac_500.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 54/55 [00:01<00:00, 37.81it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_500.webm 

[600] time live:51, cumulated reward: 4.099999999999998, loss: (3.9600995, 0.04022288)
[MoviePy] >>>> Building video movie/ac_600.webm
[MoviePy] Writing video movie/ac_600.webm


 98%|████████████████████████████████████████████████████████████████████████████████▍ | 52/53 [00:01<00:00, 31.19it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_600.webm 

[700] time live:59, cumulated reward: 4.899999999999995, loss: (3.9601002, -0.008182816)
[MoviePy] >>>> Building video movie/ac_700.webm
[MoviePy] Writing video movie/ac_700.webm


 98%|████████████████████████████████████████████████████████████████████████████████▋ | 60/61 [00:01<00:00, 41.73it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_700.webm 

[800] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600995, -0.011961999)
[MoviePy] >>>> Building video movie/ac_800.webm
[MoviePy] Writing video movie/ac_800.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 37.15it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_800.webm 

[900] time live:58, cumulated reward: 4.799999999999995, loss: (3.9600997, -0.061957344)
[MoviePy] >>>> Building video movie/ac_900.webm
[MoviePy] Writing video movie/ac_900.webm


 98%|████████████████████████████████████████████████████████████████████████████████▋ | 59/60 [00:01<00:00, 40.63it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_900.webm 

[1000] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, 0.056905527)
[MoviePy] >>>> Building video movie/ac_1000.webm
[MoviePy] Writing video movie/ac_1000.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 38.56it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_1000.webm 

[1100] time live:51, cumulated reward: 4.099999999999998, loss: (3.9600995, -0.07691748)
[MoviePy] >>>> Building video movie/ac_1100.webm
[MoviePy] Writing video movie/ac_1100.webm


 98%|████████████████████████████████████████████████████████████████████████████████▍ | 52/53 [00:01<00:00, 34.72it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_1100.webm 

[1200] time live:65, cumulated reward: 6.499999999999993, loss: (3.9601011, 0.08232641)
[MoviePy] >>>> Building video movie/ac_1200.webm
[MoviePy] Writing video movie/ac_1200.webm


 99%|████████████████████████████████████████████████████████████████████████████████▊ | 66/67 [00:02<00:00, 36.49it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_1200.webm 

[1300] time live:49, cumulated reward: 3.8999999999999986, loss: (3.9601, 0.05400313)
[MoviePy] >>>> Building video movie/ac_1300.webm
[MoviePy] Writing video movie/ac_1300.webm


 98%|████████████████████████████████████████████████████████████████████████████████▍ | 50/51 [00:01<00:00, 42.34it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_1300.webm 

[1400] time live:60, cumulated reward: 4.999999999999995, loss: (3.9601002, -0.010662715)
[MoviePy] >>>> Building video movie/ac_1400.webm
[MoviePy] Writing video movie/ac_1400.webm


 98%|████████████████████████████████████████████████████████████████████████████████▋ | 61/62 [00:01<00:00, 37.28it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_1400.webm 

[1500] time live:52, cumulated reward: 4.1999999999999975, loss: (3.9601, -0.01904706)
[MoviePy] >>>> Building video movie/ac_1500.webm
[MoviePy] Writing video movie/ac_1500.webm


 98%|████████████████████████████████████████████████████████████████████████████████▍ | 53/54 [00:01<00:00, 32.67it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_1500.webm 

[1600] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, -0.018570993)
[MoviePy] >>>> Building video movie/ac_1600.webm
[MoviePy] Writing video movie/ac_1600.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 38.12it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_1600.webm 

[1700] time live:56, cumulated reward: 4.599999999999996, loss: (3.9601002, -0.07440065)
[MoviePy] >>>> Building video movie/ac_1700.webm
[MoviePy] Writing video movie/ac_1700.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 57/58 [00:01<00:00, 42.64it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_1700.webm 

[1800] time live:50, cumulated reward: 3.9999999999999982, loss: (3.9601004, -0.06502968)
[MoviePy] >>>> Building video movie/ac_1800.webm
[MoviePy] Writing video movie/ac_1800.webm


 98%|████████████████████████████████████████████████████████████████████████████████▍ | 51/52 [00:01<00:00, 32.05it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_1800.webm 

[1900] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600995, 0.027211392)
[MoviePy] >>>> Building video movie/ac_1900.webm
[MoviePy] Writing video movie/ac_1900.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 32.08it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_1900.webm 

[2000] time live:60, cumulated reward: 4.999999999999995, loss: (3.9601, 0.02369631)
[MoviePy] >>>> Building video movie/ac_2000.webm
[MoviePy] Writing video movie/ac_2000.webm


 98%|████████████████████████████████████████████████████████████████████████████████▋ | 61/62 [00:02<00:00, 25.64it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_2000.webm 

[2100] time live:55, cumulated reward: 4.4999999999999964, loss: (3.9601002, 0.016381402)
[MoviePy] >>>> Building video movie/ac_2100.webm
[MoviePy] Writing video movie/ac_2100.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 56/57 [00:01<00:00, 42.41it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_2100.webm 

[2200] time live:58, cumulated reward: 4.799999999999995, loss: (3.9600992, 0.041054923)
[MoviePy] >>>> Building video movie/ac_2200.webm
[MoviePy] Writing video movie/ac_2200.webm


 98%|████████████████████████████████████████████████████████████████████████████████▋ | 59/60 [00:01<00:00, 37.74it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_2200.webm 

[2300] time live:55, cumulated reward: 4.4999999999999964, loss: (3.9601002, -0.0039834804)
[MoviePy] >>>> Building video movie/ac_2300.webm
[MoviePy] Writing video movie/ac_2300.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 56/57 [00:01<00:00, 47.37it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_2300.webm 

[2400] time live:54, cumulated reward: 4.399999999999997, loss: (3.9601, 0.07396108)
[MoviePy] >>>> Building video movie/ac_2400.webm
[MoviePy] Writing video movie/ac_2400.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 55/56 [00:01<00:00, 50.64it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_2400.webm 

[2500] time live:51, cumulated reward: 4.099999999999998, loss: (3.9601, 0.00072473637)
[MoviePy] >>>> Building video movie/ac_2500.webm
[MoviePy] Writing video movie/ac_2500.webm


 98%|████████████████████████████████████████████████████████████████████████████████▍ | 52/53 [00:01<00:00, 43.60it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_2500.webm 

[2600] time live:58, cumulated reward: 4.799999999999995, loss: (3.9600992, 0.034312643)
[MoviePy] >>>> Building video movie/ac_2600.webm
[MoviePy] Writing video movie/ac_2600.webm


 98%|████████████████████████████████████████████████████████████████████████████████▋ | 59/60 [00:01<00:00, 50.29it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_2600.webm 

[2700] time live:59, cumulated reward: 4.899999999999995, loss: (3.9601007, 0.06939233)
[MoviePy] >>>> Building video movie/ac_2700.webm
[MoviePy] Writing video movie/ac_2700.webm


 98%|████████████████████████████████████████████████████████████████████████████████▋ | 60/61 [00:01<00:00, 42.45it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_2700.webm 

[2800] time live:55, cumulated reward: 4.4999999999999964, loss: (3.9601002, 0.116535254)
[MoviePy] >>>> Building video movie/ac_2800.webm
[MoviePy] Writing video movie/ac_2800.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 56/57 [00:01<00:00, 47.54it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_2800.webm 

[2900] time live:51, cumulated reward: 4.099999999999998, loss: (3.9600995, 0.049206227)
[MoviePy] >>>> Building video movie/ac_2900.webm
[MoviePy] Writing video movie/ac_2900.webm


 98%|████████████████████████████████████████████████████████████████████████████████▍ | 52/53 [00:01<00:00, 48.02it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_2900.webm 

[3000] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600995, 0.039549876)
[MoviePy] >>>> Building video movie/ac_3000.webm
[MoviePy] Writing video movie/ac_3000.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 45.49it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_3000.webm 

[3100] time live:54, cumulated reward: 4.399999999999997, loss: (3.9601, -0.05694769)
[MoviePy] >>>> Building video movie/ac_3100.webm
[MoviePy] Writing video movie/ac_3100.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 55/56 [00:01<00:00, 48.27it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_3100.webm 

[3200] time live:56, cumulated reward: 4.599999999999996, loss: (3.9601, -0.019714883)
[MoviePy] >>>> Building video movie/ac_3200.webm
[MoviePy] Writing video movie/ac_3200.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 57/58 [00:01<00:00, 47.27it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_3200.webm 

[3300] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, 0.03342014)
[MoviePy] >>>> Building video movie/ac_3300.webm
[MoviePy] Writing video movie/ac_3300.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 44.24it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_3300.webm 

[3400] time live:52, cumulated reward: 4.1999999999999975, loss: (3.9601, -0.0036128117)
[MoviePy] >>>> Building video movie/ac_3400.webm
[MoviePy] Writing video movie/ac_3400.webm


 98%|████████████████████████████████████████████████████████████████████████████████▍ | 53/54 [00:01<00:00, 44.80it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_3400.webm 

[3500] time live:57, cumulated reward: 4.699999999999996, loss: (3.9601014, 0.044854347)
[MoviePy] >>>> Building video movie/ac_3500.webm
[MoviePy] Writing video movie/ac_3500.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 58/59 [00:01<00:00, 36.61it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_3500.webm 

[3600] time live:58, cumulated reward: 4.799999999999995, loss: (3.9600992, 0.015048849)
[MoviePy] >>>> Building video movie/ac_3600.webm
[MoviePy] Writing video movie/ac_3600.webm


 98%|████████████████████████████████████████████████████████████████████████████████▋ | 59/60 [00:01<00:00, 43.23it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_3600.webm 

[3700] time live:55, cumulated reward: 4.4999999999999964, loss: (3.9601002, 0.014422972)
[MoviePy] >>>> Building video movie/ac_3700.webm
[MoviePy] Writing video movie/ac_3700.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 56/57 [00:01<00:00, 29.69it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_3700.webm 

[3800] time live:59, cumulated reward: 4.899999999999995, loss: (3.9600992, 0.028466208)
[MoviePy] >>>> Building video movie/ac_3800.webm
[MoviePy] Writing video movie/ac_3800.webm


 98%|████████████████████████████████████████████████████████████████████████████████▋ | 60/61 [00:01<00:00, 44.41it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_3800.webm 

[3900] time live:49, cumulated reward: 3.8999999999999986, loss: (3.9600997, 0.0823677)
[MoviePy] >>>> Building video movie/ac_3900.webm
[MoviePy] Writing video movie/ac_3900.webm


 98%|████████████████████████████████████████████████████████████████████████████████▍ | 50/51 [00:01<00:00, 46.14it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_3900.webm 

[4000] time live:59, cumulated reward: 4.899999999999995, loss: (3.9600997, 0.039813932)
[MoviePy] >>>> Building video movie/ac_4000.webm
[MoviePy] Writing video movie/ac_4000.webm


 98%|████████████████████████████████████████████████████████████████████████████████▋ | 60/61 [00:01<00:00, 47.86it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_4000.webm 

[4100] time live:59, cumulated reward: 4.899999999999995, loss: (3.9600997, -0.0040951096)
[MoviePy] >>>> Building video movie/ac_4100.webm
[MoviePy] Writing video movie/ac_4100.webm


 98%|████████████████████████████████████████████████████████████████████████████████▋ | 60/61 [00:01<00:00, 45.50it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_4100.webm 

[4200] time live:59, cumulated reward: 4.899999999999995, loss: (3.9600992, 0.005969969)
[MoviePy] >>>> Building video movie/ac_4200.webm
[MoviePy] Writing video movie/ac_4200.webm


 98%|████████████████████████████████████████████████████████████████████████████████▋ | 60/61 [00:01<00:00, 44.96it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_4200.webm 

[4300] time live:56, cumulated reward: 4.599999999999996, loss: (3.9601, -0.042234797)
[MoviePy] >>>> Building video movie/ac_4300.webm
[MoviePy] Writing video movie/ac_4300.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 57/58 [00:01<00:00, 42.33it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_4300.webm 

[4400] time live:54, cumulated reward: 4.399999999999997, loss: (3.9601007, -0.023249839)
[MoviePy] >>>> Building video movie/ac_4400.webm
[MoviePy] Writing video movie/ac_4400.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 55/56 [00:01<00:00, 48.75it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_4400.webm 

[4500] time live:56, cumulated reward: 4.599999999999996, loss: (3.9600995, -0.034238834)
[MoviePy] >>>> Building video movie/ac_4500.webm
[MoviePy] Writing video movie/ac_4500.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 57/58 [00:01<00:00, 42.03it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_4500.webm 

[4600] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, -0.07183691)
[MoviePy] >>>> Building video movie/ac_4600.webm
[MoviePy] Writing video movie/ac_4600.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 44.46it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_4600.webm 

[4700] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600995, -0.08167722)
[MoviePy] >>>> Building video movie/ac_4700.webm
[MoviePy] Writing video movie/ac_4700.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 38.18it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_4700.webm 

[4800] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600995, 0.026858345)
[MoviePy] >>>> Building video movie/ac_4800.webm
[MoviePy] Writing video movie/ac_4800.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 53.28it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_4800.webm 

[4900] time live:50, cumulated reward: 3.9999999999999982, loss: (3.9600997, 0.010509891)
[MoviePy] >>>> Building video movie/ac_4900.webm
[MoviePy] Writing video movie/ac_4900.webm


 98%|████████████████████████████████████████████████████████████████████████████████▍ | 51/52 [00:01<00:00, 48.05it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_4900.webm 

[5000] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, -0.012891816)
[MoviePy] >>>> Building video movie/ac_5000.webm
[MoviePy] Writing video movie/ac_5000.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 44.10it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_5000.webm 

[5100] time live:56, cumulated reward: 4.599999999999996, loss: (3.9601, -0.017076049)
[MoviePy] >>>> Building video movie/ac_5100.webm
[MoviePy] Writing video movie/ac_5100.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 57/58 [00:01<00:00, 53.83it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_5100.webm 

[5200] time live:57, cumulated reward: 4.699999999999996, loss: (3.9601004, -0.03659322)
[MoviePy] >>>> Building video movie/ac_5200.webm
[MoviePy] Writing video movie/ac_5200.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 58/59 [00:01<00:00, 41.64it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_5200.webm 

[5300] time live:57, cumulated reward: 4.699999999999996, loss: (3.9600995, -0.036367215)
[MoviePy] >>>> Building video movie/ac_5300.webm
[MoviePy] Writing video movie/ac_5300.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 58/59 [00:01<00:00, 45.62it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_5300.webm 

[5400] time live:60, cumulated reward: 4.999999999999995, loss: (3.9601, 0.0287014)
[MoviePy] >>>> Building video movie/ac_5400.webm
[MoviePy] Writing video movie/ac_5400.webm


 98%|████████████████████████████████████████████████████████████████████████████████▋ | 61/62 [00:01<00:00, 50.20it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_5400.webm 

[5500] time live:52, cumulated reward: 4.1999999999999975, loss: (3.9601004, 0.045235597)
[MoviePy] >>>> Building video movie/ac_5500.webm
[MoviePy] Writing video movie/ac_5500.webm


 98%|████████████████████████████████████████████████████████████████████████████████▍ | 53/54 [00:01<00:00, 42.96it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_5500.webm 

[5600] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, 0.03254459)
[MoviePy] >>>> Building video movie/ac_5600.webm
[MoviePy] Writing video movie/ac_5600.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 42.65it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_5600.webm 

[5700] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600995, 0.04701247)
[MoviePy] >>>> Building video movie/ac_5700.webm
[MoviePy] Writing video movie/ac_5700.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 42.74it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_5700.webm 

[5800] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, -0.025239475)
[MoviePy] >>>> Building video movie/ac_5800.webm
[MoviePy] Writing video movie/ac_5800.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 44.53it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_5800.webm 

[5900] time live:56, cumulated reward: 4.599999999999996, loss: (3.9601, -0.0019523756)
[MoviePy] >>>> Building video movie/ac_5900.webm
[MoviePy] Writing video movie/ac_5900.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 57/58 [00:01<00:00, 53.33it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_5900.webm 

[6000] time live:62, cumulated reward: 5.199999999999994, loss: (3.9600992, -0.043695386)
[MoviePy] >>>> Building video movie/ac_6000.webm
[MoviePy] Writing video movie/ac_6000.webm


 98%|████████████████████████████████████████████████████████████████████████████████▋ | 63/64 [00:01<00:00, 34.01it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_6000.webm 

[6100] time live:58, cumulated reward: 4.799999999999995, loss: (3.9601002, -0.050523546)
[MoviePy] >>>> Building video movie/ac_6100.webm
[MoviePy] Writing video movie/ac_6100.webm


 98%|████████████████████████████████████████████████████████████████████████████████▋ | 59/60 [00:01<00:00, 30.65it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_6100.webm 

[6200] time live:59, cumulated reward: 4.899999999999995, loss: (3.9600995, 0.00810245)
[MoviePy] >>>> Building video movie/ac_6200.webm
[MoviePy] Writing video movie/ac_6200.webm


 98%|████████████████████████████████████████████████████████████████████████████████▋ | 60/61 [00:01<00:00, 31.40it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_6200.webm 

[6300] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601014, 0.09551877)
[MoviePy] >>>> Building video movie/ac_6300.webm
[MoviePy] Writing video movie/ac_6300.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:02<00:00, 30.24it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_6300.webm 

[6400] time live:54, cumulated reward: 4.399999999999997, loss: (3.9601, -0.14806582)
[MoviePy] >>>> Building video movie/ac_6400.webm
[MoviePy] Writing video movie/ac_6400.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 55/56 [00:01<00:00, 28.77it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_6400.webm 

[6500] time live:58, cumulated reward: 4.799999999999995, loss: (3.9600992, 0.0811318)
[MoviePy] >>>> Building video movie/ac_6500.webm
[MoviePy] Writing video movie/ac_6500.webm


 98%|████████████████████████████████████████████████████████████████████████████████▋ | 59/60 [00:01<00:00, 41.17it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_6500.webm 

[6600] time live:57, cumulated reward: 4.699999999999996, loss: (3.9601002, -0.10759775)
[MoviePy] >>>> Building video movie/ac_6600.webm
[MoviePy] Writing video movie/ac_6600.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 58/59 [00:01<00:00, 29.73it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_6600.webm 

[6700] time live:52, cumulated reward: 4.1999999999999975, loss: (3.9601, -0.11367031)
[MoviePy] >>>> Building video movie/ac_6700.webm
[MoviePy] Writing video movie/ac_6700.webm


 98%|████████████████████████████████████████████████████████████████████████████████▍ | 53/54 [00:01<00:00, 30.25it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_6700.webm 

[6800] time live:53, cumulated reward: 4.299999999999997, loss: (3.9600997, 0.09951802)
[MoviePy] >>>> Building video movie/ac_6800.webm
[MoviePy] Writing video movie/ac_6800.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 54/55 [00:01<00:00, 32.30it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_6800.webm 

[6900] time live:64, cumulated reward: 5.399999999999993, loss: (3.9601002, 0.0020834506)
[MoviePy] >>>> Building video movie/ac_6900.webm
[MoviePy] Writing video movie/ac_6900.webm


 98%|████████████████████████████████████████████████████████████████████████████████▊ | 65/66 [00:01<00:00, 32.71it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_6900.webm 

[7000] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600995, 0.028992027)
[MoviePy] >>>> Building video movie/ac_7000.webm
[MoviePy] Writing video movie/ac_7000.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:02<00:00, 29.05it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_7000.webm 

[7100] time live:56, cumulated reward: 4.599999999999996, loss: (3.9601, 0.065260954)
[MoviePy] >>>> Building video movie/ac_7100.webm
[MoviePy] Writing video movie/ac_7100.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 57/58 [00:02<00:00, 23.34it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_7100.webm 

[7200] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601, -0.028747246)
[MoviePy] >>>> Building video movie/ac_7200.webm
[MoviePy] Writing video movie/ac_7200.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:02<00:00, 30.67it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_7200.webm 

[7300] time live:53, cumulated reward: 4.299999999999997, loss: (3.9600997, -0.012089135)
[MoviePy] >>>> Building video movie/ac_7300.webm
[MoviePy] Writing video movie/ac_7300.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 54/55 [00:02<00:00, 26.50it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_7300.webm 

[7400] time live:47, cumulated reward: 3.6999999999999993, loss: (3.9601002, -0.11768263)
[MoviePy] >>>> Building video movie/ac_7400.webm
[MoviePy] Writing video movie/ac_7400.webm


 98%|████████████████████████████████████████████████████████████████████████████████▎ | 48/49 [00:01<00:00, 28.75it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_7400.webm 

[7500] time live:51, cumulated reward: 4.099999999999998, loss: (3.9600995, 0.030954828)
[MoviePy] >>>> Building video movie/ac_7500.webm
[MoviePy] Writing video movie/ac_7500.webm


 98%|████████████████████████████████████████████████████████████████████████████████▍ | 52/53 [00:01<00:00, 35.51it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_7500.webm 

[7600] time live:55, cumulated reward: 4.4999999999999964, loss: (3.9601002, 0.030740825)
[MoviePy] >>>> Building video movie/ac_7600.webm
[MoviePy] Writing video movie/ac_7600.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 56/57 [00:01<00:00, 37.80it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_7600.webm 

[7700] time live:54, cumulated reward: 4.399999999999997, loss: (3.9601, 0.036570515)
[MoviePy] >>>> Building video movie/ac_7700.webm
[MoviePy] Writing video movie/ac_7700.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 55/56 [00:01<00:00, 41.80it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_7700.webm 

[7800] time live:60, cumulated reward: 4.999999999999995, loss: (3.9600995, 0.03670041)
[MoviePy] >>>> Building video movie/ac_7800.webm
[MoviePy] Writing video movie/ac_7800.webm


 98%|████████████████████████████████████████████████████████████████████████████████▋ | 61/62 [00:01<00:00, 33.08it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_7800.webm 

[7900] time live:58, cumulated reward: 4.799999999999995, loss: (3.9600997, -0.025304072)
[MoviePy] >>>> Building video movie/ac_7900.webm
[MoviePy] Writing video movie/ac_7900.webm


 98%|████████████████████████████████████████████████████████████████████████████████▋ | 59/60 [00:01<00:00, 34.61it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_7900.webm 

[8000] time live:56, cumulated reward: 4.599999999999996, loss: (3.9601, 0.04325257)
[MoviePy] >>>> Building video movie/ac_8000.webm
[MoviePy] Writing video movie/ac_8000.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 57/58 [00:01<00:00, 30.37it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_8000.webm 

[8100] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600992, -0.005302429)
[MoviePy] >>>> Building video movie/ac_8100.webm
[MoviePy] Writing video movie/ac_8100.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:02<00:00, 30.31it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_8100.webm 

[8200] time live:56, cumulated reward: 4.599999999999996, loss: (3.9601, -0.069633365)
[MoviePy] >>>> Building video movie/ac_8200.webm
[MoviePy] Writing video movie/ac_8200.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 57/58 [00:01<00:00, 36.97it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_8200.webm 

[8300] time live:54, cumulated reward: 4.399999999999997, loss: (3.9601007, 0.11838306)
[MoviePy] >>>> Building video movie/ac_8300.webm
[MoviePy] Writing video movie/ac_8300.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 55/56 [00:01<00:00, 37.49it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_8300.webm 

[8400] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, 0.095711194)
[MoviePy] >>>> Building video movie/ac_8400.webm
[MoviePy] Writing video movie/ac_8400.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:02<00:00, 32.31it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_8400.webm 

[8500] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600995, 0.04404678)
[MoviePy] >>>> Building video movie/ac_8500.webm
[MoviePy] Writing video movie/ac_8500.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:02<00:00, 30.27it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_8500.webm 

[8600] time live:55, cumulated reward: 4.4999999999999964, loss: (3.9600992, 0.023376621)
[MoviePy] >>>> Building video movie/ac_8600.webm
[MoviePy] Writing video movie/ac_8600.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 56/57 [00:01<00:00, 35.44it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_8600.webm 

[8700] time live:56, cumulated reward: 4.599999999999996, loss: (3.9601002, -0.008003763)
[MoviePy] >>>> Building video movie/ac_8700.webm
[MoviePy] Writing video movie/ac_8700.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 57/58 [00:02<00:00, 27.54it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_8700.webm 

[8800] time live:61, cumulated reward: 5.099999999999994, loss: (3.9601, -0.0042459145)
[MoviePy] >>>> Building video movie/ac_8800.webm
[MoviePy] Writing video movie/ac_8800.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 31.54it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_8800.webm 

[8900] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, -0.13929667)
[MoviePy] >>>> Building video movie/ac_8900.webm
[MoviePy] Writing video movie/ac_8900.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 39.58it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_8900.webm 

[9000] time live:56, cumulated reward: 4.599999999999996, loss: (3.9601, 0.026407275)
[MoviePy] >>>> Building video movie/ac_9000.webm
[MoviePy] Writing video movie/ac_9000.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 57/58 [00:01<00:00, 35.06it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_9000.webm 

[9100] time live:58, cumulated reward: 4.799999999999995, loss: (3.9600997, 0.13561775)
[MoviePy] >>>> Building video movie/ac_9100.webm
[MoviePy] Writing video movie/ac_9100.webm


 98%|████████████████████████████████████████████████████████████████████████████████▋ | 59/60 [00:01<00:00, 30.28it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_9100.webm 

[9200] time live:59, cumulated reward: 4.899999999999995, loss: (3.9601002, -0.031911496)
[MoviePy] >>>> Building video movie/ac_9200.webm
[MoviePy] Writing video movie/ac_9200.webm


 98%|████████████████████████████████████████████████████████████████████████████████▋ | 60/61 [00:02<00:00, 32.14it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_9200.webm 

[9300] time live:51, cumulated reward: 4.099999999999998, loss: (3.9600995, 0.074875966)
[MoviePy] >>>> Building video movie/ac_9300.webm
[MoviePy] Writing video movie/ac_9300.webm


 98%|████████████████████████████████████████████████████████████████████████████████▍ | 52/53 [00:01<00:00, 33.30it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_9300.webm 

[9400] time live:54, cumulated reward: 4.399999999999997, loss: (3.9600997, 0.071274295)
[MoviePy] >>>> Building video movie/ac_9400.webm
[MoviePy] Writing video movie/ac_9400.webm


 98%|████████████████████████████████████████████████████████████████████████████████▌ | 55/56 [00:01<00:00, 39.30it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_9400.webm 

[9500] time live:58, cumulated reward: 4.799999999999995, loss: (3.9601007, -0.08753324)
[MoviePy] >>>> Building video movie/ac_9500.webm
[MoviePy] Writing video movie/ac_9500.webm


 98%|████████████████████████████████████████████████████████████████████████████████▋ | 59/60 [00:01<00:00, 30.16it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_9500.webm 

[9600] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, -0.0021178918)
[MoviePy] >>>> Building video movie/ac_9600.webm
[MoviePy] Writing video movie/ac_9600.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 40.72it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_9600.webm 

[9700] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, -0.040542994)
[MoviePy] >>>> Building video movie/ac_9700.webm
[MoviePy] Writing video movie/ac_9700.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 42.15it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_9700.webm 

[9800] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, -0.07230496)
[MoviePy] >>>> Building video movie/ac_9800.webm
[MoviePy] Writing video movie/ac_9800.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 45.99it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_9800.webm 

[9900] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, -0.02733559)
[MoviePy] >>>> Building video movie/ac_9900.webm
[MoviePy] Writing video movie/ac_9900.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 42.88it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_9900.webm 

[10000] time live:61, cumulated reward: 5.099999999999994, loss: (3.9600997, -0.06814709)
[MoviePy] >>>> Building video movie/ac_10000.webm
[MoviePy] Writing video movie/ac_10000.webm


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 34.79it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: movie/ac_10000.webm 



In [28]:
from moviepy.editor import *
clip = VideoFileClip("movie/ac_100.webm")
display(clip.ipython_display(fps=60, autoplay=1, loop=1))

100%|█████████████████████████████████████████████████████████████████████████████████| 59/59 [00:00<00:00, 497.13it/s]


# Report

將input從screen轉成上次lab preprocess好的game state，因為game state只有一個維度，所以將convolution layer全部改成fully connected layer，除此之外的部分都按照notebook的作法。

此次作業我將DQN、Policy Gradient、Actor-Critic都跑過10000個episode，實驗結果發現DQN的結果遠比Policy Gradient和Actor-Critic穩定許多，誠如老師在影片中所說，Policy Gradient使用MC estimation，因此variance很高，所以需要透過baseline以及Actor-Critic、A2C等方式來降低variance。