In [1]:
# https://deeplearningcourses.com/c/deep-reinforcement-learning-in-python
# https://www.udemy.com/deep-reinforcement-learning-in-python
from __future__ import print_function, division
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future

import copy
import gym
import os
import sys
import random
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from gym import wrappers
from datetime import datetime
from scipy.misc import imresize





##### testing only
# MAX_EXPERIENCES = 10000
# MIN_EXPERIENCES = 1000


MAX_EXPERIENCES = 500000
MIN_EXPERIENCES = 50000
TARGET_UPDATE_PERIOD = 10000
IM_SIZE = 80
K = 4 #env.action_space.n




def downsample_image(A):
  B = A[31:195] # select the important parts of the image
  B = B.mean(axis=2) # convert to grayscale

  # downsample image
  # changing aspect ratio doesn't significantly distort the image
  # nearest neighbor interpolation produces a much sharper image
  # than default bilinear
  B = imresize(B, size=(IM_SIZE, IM_SIZE), interp='nearest')
  return B


def update_state(state, obs):
  obs_small = downsample_image(obs)
  return np.append(state[1:], np.expand_dims(obs_small, 0), axis=0)


class DQN:
  def __init__(self, K, conv_layer_sizes, hidden_layer_sizes, gamma, scope):

    self.K = K
    self.scope = scope

    with tf.variable_scope(scope):

      # inputs and targets
      self.X = tf.placeholder(tf.float32, shape=(None, 4, IM_SIZE, IM_SIZE), name='X')

      # tensorflow convolution needs the order to be:
      # (num_samples, height, width, "color")
      # so we need to tranpose later
      self.G = tf.placeholder(tf.float32, shape=(None,), name='G')
      self.actions = tf.placeholder(tf.int32, shape=(None,), name='actions')

      # calculate output and cost
      # convolutional layers
      # these built-in layers are faster and don't require us to
      # calculate the size of the output of the final conv layer!
      Z = self.X / 255.0
      Z = tf.transpose(Z, [0, 2, 3, 1])
      for num_output_filters, filtersz, poolsz in conv_layer_sizes:
        Z = tf.contrib.layers.conv2d(
          Z,
          num_output_filters,
          filtersz,
          poolsz,
          activation_fn=tf.nn.relu
        )

      # fully connected layers
      Z = tf.contrib.layers.flatten(Z)
      for M in hidden_layer_sizes:
        Z = tf.contrib.layers.fully_connected(Z, M)

      # final output layer
      self.predict_op = tf.contrib.layers.fully_connected(Z, K)

      selected_action_values = tf.reduce_sum(
        self.predict_op * tf.one_hot(self.actions, K),
        reduction_indices=[1]
      )

      cost = tf.reduce_mean(tf.square(self.G - selected_action_values))
      # self.train_op = tf.train.AdamOptimizer(1e-2).minimize(cost)
      # self.train_op = tf.train.AdagradOptimizer(1e-2).minimize(cost)
      # self.train_op = tf.train.RMSPropOptimizer(2.5e-4, decay=0.99, epsilon=10e-3).minimize(cost)
      self.train_op = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6).minimize(cost)
      # self.train_op = tf.train.MomentumOptimizer(1e-3, momentum=0.9).minimize(cost)
      # self.train_op = tf.train.GradientDescentOptimizer(1e-4).minimize(cost)

      self.cost = cost

  def copy_from(self, other):
    mine = [t for t in tf.trainable_variables() if t.name.startswith(self.scope)]
    mine = sorted(mine, key=lambda v: v.name)
    theirs = [t for t in tf.trainable_variables() if t.name.startswith(other.scope)]
    theirs = sorted(theirs, key=lambda v: v.name)

    ops = []
    for p, q in zip(mine, theirs):
      actual = self.session.run(q)
      op = p.assign(actual)
      ops.append(op)

    self.session.run(ops)

  def set_session(self, session):
    self.session = session

  def predict(self, states):
    return self.session.run(self.predict_op, feed_dict={self.X: states})

  def update(self, states, actions, targets):
    c, _ = self.session.run(
      [self.cost, self.train_op],
      feed_dict={
        self.X: states,
        self.G: targets,
        self.actions: actions
      }
    )
    return c

  def sample_action(self, x, eps):
    if np.random.random() < eps:
      return np.random.choice(self.K)
    else:
      return np.argmax(self.predict([x])[0])


def learn(model, target_model, experience_replay_buffer, gamma, batch_size):
  # Sample experiences
  samples = random.sample(experience_replay_buffer, batch_size)
  states, actions, rewards, next_states, dones = map(np.array, zip(*samples))

  # Calculate targets
  next_Qs = target_model.predict(next_states)
  next_Q = np.amax(next_Qs, axis=1)
  targets = rewards + np.invert(dones).astype(np.float32) * gamma * next_Q

  # Update model
  loss = model.update(states, actions, targets)
  return loss


def play_one(
  env,
  total_t,
  experience_replay_buffer,
  model,
  target_model,
  gamma,
  batch_size,
  epsilon,
  epsilon_change,
  epsilon_min):

  t0 = datetime.now()

  # Reset the environment
  obs = env.reset()
  obs_small = downsample_image(obs)
  state = np.stack([obs_small] * 4, axis=0)
  assert(state.shape == (4, 80, 80))
  loss = None


  total_time_training = 0
  num_steps_in_episode = 0
  episode_reward = 0

  done = False
  while not done:

    # Update target network
    if total_t % TARGET_UPDATE_PERIOD == 0:
      target_model.copy_from(model)
      print("Copied model parameters to target network. total_t = %s, period = %s" % (total_t, TARGET_UPDATE_PERIOD))


    # Take action
    action = model.sample_action(state, epsilon)
    obs, reward, done, _ = env.step(action)
    obs_small = downsample_image(obs)
    next_state = np.append(state[1:], np.expand_dims(obs_small, 0), axis=0)
    # assert(state.shape == (4, 80, 80))



    episode_reward += reward

    # Remove oldest experience if replay buffer is full
    if len(experience_replay_buffer) == MAX_EXPERIENCES:
      experience_replay_buffer.pop(0)

    # Save the latest experience
    experience_replay_buffer.append((state, action, reward, next_state, done))

    # Train the model, keep track of time
    t0_2 = datetime.now()
    loss = learn(model, target_model, experience_replay_buffer, gamma, batch_size)
    dt = datetime.now() - t0_2

    total_time_training += dt.total_seconds()
    num_steps_in_episode += 1


    state = next_state
    total_t += 1

    epsilon = max(epsilon - epsilon_change, epsilon_min)

  return total_t, episode_reward, (datetime.now() - t0), num_steps_in_episode, total_time_training/num_steps_in_episode, epsilon



def main():

  # hyperparams and initialize stuff
  conv_layer_sizes = [(32, 8, 4), (64, 4, 2), (64, 3, 1)]
  hidden_layer_sizes = [512]
  gamma = 0.99
  batch_sz = 32
  num_episodes = 10000
  total_t = 0
  experience_replay_buffer = []
  episode_rewards = np.zeros(num_episodes)



  # epsilon
  # decays linearly until 0.1
  epsilon = 1.0
  epsilon_min = 0.1
  epsilon_change = (epsilon - epsilon_min) / 500000



  # Create environment
  env = gym.envs.make("Breakout-v0")
 


  # Create models
  model = DQN(
    K=K,
    conv_layer_sizes=conv_layer_sizes,
    hidden_layer_sizes=hidden_layer_sizes,
    gamma=gamma,
    scope="model")
  target_model = DQN(
    K=K,
    conv_layer_sizes=conv_layer_sizes,
    hidden_layer_sizes=hidden_layer_sizes,
    gamma=gamma,
    scope="target_model"
  )



  with tf.Session() as sess:
    model.set_session(sess)
    target_model.set_session(sess)
    sess.run(tf.global_variables_initializer())


    print("Populating experience replay buffer...")
    obs = env.reset()
    obs_small = downsample_image(obs)
    state = np.stack([obs_small] * 4, axis=0)
    # assert(state.shape == (4, 80, 80))
    for i in range(MIN_EXPERIENCES):

        action = np.random.choice(K)
        obs, reward, done, _ = env.step(action)
        next_state = update_state(state, obs)
        # assert(state.shape == (4, 80, 80))
        experience_replay_buffer.append((state, action, reward, next_state, done))

        if done:
            obs = env.reset()
            obs_small = downsample_image(obs)
            state = np.stack([obs_small] * 4, axis=0)
            # assert(state.shape == (4, 80, 80))
        else:
            state = next_state


    # Play a number of episodes and learn!
    for i in range(num_episodes):

      total_t, episode_reward, duration, num_steps_in_episode, time_per_step, epsilon = play_one(
        env,
        total_t,
        experience_replay_buffer,
        model,
        target_model,
        gamma,
        batch_sz,
        epsilon,
        epsilon_change,
        epsilon_min,
      )
      episode_rewards[i] = episode_reward

      last_100_avg = episode_rewards[max(0, i - 100):i + 1].mean()
      print("Episode:", i,
        "Duration:", duration,
        "Num steps:", num_steps_in_episode,
        "Reward:", episode_reward,
        "Training time per step:", "%.3f" % time_per_step,
        "Avg Reward (Last 100):", "%.3f" % last_100_avg,
        "Epsilon:", "%.3f" % epsilon
      )
      sys.stdout.flush()

In [None]:
main()

Populating experience replay buffer...
Copied model parameters to target network. total_t = 0, period = 10000
Episode: 0 Duration: 0:00:03.988829 Num steps: 199 Reward: 1.0 Training time per step: 0.017 Avg Reward (Last 100): 1.000 Epsilon: 1.000
Episode: 1 Duration: 0:00:02.731743 Num steps: 241 Reward: 1.0 Training time per step: 0.009 Avg Reward (Last 100): 1.000 Epsilon: 0.999
Episode: 2 Duration: 0:00:02.923278 Num steps: 256 Reward: 1.0 Training time per step: 0.010 Avg Reward (Last 100): 1.000 Epsilon: 0.999
Episode: 3 Duration: 0:00:02.612389 Num steps: 229 Reward: 1.0 Training time per step: 0.009 Avg Reward (Last 100): 1.000 Epsilon: 0.998
Episode: 4 Duration: 0:00:01.889264 Num steps: 165 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 0.800 Epsilon: 0.998
Episode: 5 Duration: 0:00:05.518013 Num steps: 489 Reward: 9.0 Training time per step: 0.009 Avg Reward (Last 100): 2.167 Epsilon: 0.997
Episode: 6 Duration: 0:00:02.779699 Num steps: 246 Reward: 2.0 Train

Episode: 59 Duration: 0:00:01.870559 Num steps: 164 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.333 Epsilon: 0.973
Episode: 60 Duration: 0:00:02.781739 Num steps: 244 Reward: 1.0 Training time per step: 0.009 Avg Reward (Last 100): 1.328 Epsilon: 0.973
Episode: 61 Duration: 0:00:02.640798 Num steps: 233 Reward: 1.0 Training time per step: 0.009 Avg Reward (Last 100): 1.323 Epsilon: 0.973
Episode: 62 Duration: 0:00:03.163880 Num steps: 280 Reward: 2.0 Training time per step: 0.009 Avg Reward (Last 100): 1.333 Epsilon: 0.972
Episode: 63 Duration: 0:00:01.945819 Num steps: 171 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.312 Epsilon: 0.972
Episode: 64 Duration: 0:00:02.952333 Num steps: 259 Reward: 2.0 Training time per step: 0.009 Avg Reward (Last 100): 1.323 Epsilon: 0.971
Episode: 65 Duration: 0:00:02.008837 Num steps: 176 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.303 Epsilon: 0.971
Episode: 66 Duration: 0:00:01.9331

Episode: 118 Duration: 0:00:02.641456 Num steps: 231 Reward: 1.0 Training time per step: 0.009 Avg Reward (Last 100): 1.287 Epsilon: 0.948
Episode: 119 Duration: 0:00:03.021705 Num steps: 265 Reward: 2.0 Training time per step: 0.009 Avg Reward (Last 100): 1.297 Epsilon: 0.947
Episode: 120 Duration: 0:00:02.654519 Num steps: 233 Reward: 1.0 Training time per step: 0.009 Avg Reward (Last 100): 1.297 Epsilon: 0.947
Episode: 121 Duration: 0:00:03.214489 Num steps: 281 Reward: 2.0 Training time per step: 0.009 Avg Reward (Last 100): 1.317 Epsilon: 0.946
Episode: 122 Duration: 0:00:02.159770 Num steps: 188 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.317 Epsilon: 0.946
Copied model parameters to target network. total_t = 30000, period = 10000
Episode: 123 Duration: 0:00:03.319191 Num steps: 275 Reward: 2.0 Training time per step: 0.010 Avg Reward (Last 100): 1.317 Epsilon: 0.946
Episode: 124 Duration: 0:00:05.855161 Num steps: 513 Reward: 6.0 Training time per step: 0.

Episode: 176 Duration: 0:00:04.226740 Num steps: 367 Reward: 3.0 Training time per step: 0.010 Avg Reward (Last 100): 1.198 Epsilon: 0.922
Episode: 177 Duration: 0:00:02.667795 Num steps: 232 Reward: 1.0 Training time per step: 0.009 Avg Reward (Last 100): 1.198 Epsilon: 0.922
Episode: 178 Duration: 0:00:02.464492 Num steps: 216 Reward: 1.0 Training time per step: 0.009 Avg Reward (Last 100): 1.208 Epsilon: 0.922
Episode: 179 Duration: 0:00:03.253854 Num steps: 283 Reward: 2.0 Training time per step: 0.010 Avg Reward (Last 100): 1.228 Epsilon: 0.921
Episode: 180 Duration: 0:00:03.878109 Num steps: 340 Reward: 3.0 Training time per step: 0.009 Avg Reward (Last 100): 1.248 Epsilon: 0.920
Episode: 181 Duration: 0:00:02.464377 Num steps: 214 Reward: 1.0 Training time per step: 0.009 Avg Reward (Last 100): 1.228 Epsilon: 0.920
Episode: 182 Duration: 0:00:02.055880 Num steps: 179 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.208 Epsilon: 0.920
Episode: 183 Duration: 0:00

Episode: 235 Duration: 0:00:02.006951 Num steps: 174 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.277 Epsilon: 0.896
Episode: 236 Duration: 0:00:01.818981 Num steps: 159 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.277 Epsilon: 0.896
Episode: 237 Duration: 0:00:02.424067 Num steps: 211 Reward: 1.0 Training time per step: 0.009 Avg Reward (Last 100): 1.257 Epsilon: 0.895
Episode: 238 Duration: 0:00:03.367845 Num steps: 295 Reward: 2.0 Training time per step: 0.009 Avg Reward (Last 100): 1.267 Epsilon: 0.895
Episode: 239 Duration: 0:00:03.834287 Num steps: 333 Reward: 3.0 Training time per step: 0.009 Avg Reward (Last 100): 1.277 Epsilon: 0.894
Episode: 240 Duration: 0:00:02.880398 Num steps: 249 Reward: 2.0 Training time per step: 0.009 Avg Reward (Last 100): 1.267 Epsilon: 0.894
Episode: 241 Duration: 0:00:02.068585 Num steps: 179 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.257 Epsilon: 0.894
Episode: 242 Duration: 0:00

Episode: 293 Duration: 0:00:03.163306 Num steps: 275 Reward: 2.0 Training time per step: 0.009 Avg Reward (Last 100): 1.248 Epsilon: 0.872
Episode: 294 Duration: 0:00:02.009854 Num steps: 174 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.238 Epsilon: 0.872
Episode: 295 Duration: 0:00:01.929301 Num steps: 167 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.228 Epsilon: 0.871
Episode: 296 Duration: 0:00:02.046548 Num steps: 177 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.208 Epsilon: 0.871
Episode: 297 Duration: 0:00:03.358813 Num steps: 290 Reward: 2.0 Training time per step: 0.009 Avg Reward (Last 100): 1.218 Epsilon: 0.870
Episode: 298 Duration: 0:00:02.759218 Num steps: 241 Reward: 1.0 Training time per step: 0.009 Avg Reward (Last 100): 1.208 Epsilon: 0.870
Episode: 299 Duration: 0:00:02.069036 Num steps: 179 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.198 Epsilon: 0.870
Episode: 300 Duration: 0:00

Episode: 352 Duration: 0:00:02.393027 Num steps: 206 Reward: 1.0 Training time per step: 0.009 Avg Reward (Last 100): 1.109 Epsilon: 0.847
Episode: 353 Duration: 0:00:03.279164 Num steps: 286 Reward: 2.0 Training time per step: 0.009 Avg Reward (Last 100): 1.119 Epsilon: 0.846
Episode: 354 Duration: 0:00:02.064697 Num steps: 180 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.089 Epsilon: 0.846
Episode: 355 Duration: 0:00:02.038619 Num steps: 178 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.089 Epsilon: 0.845
Episode: 356 Duration: 0:00:03.602750 Num steps: 313 Reward: 3.0 Training time per step: 0.009 Avg Reward (Last 100): 1.119 Epsilon: 0.845
Episode: 357 Duration: 0:00:01.891949 Num steps: 165 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.109 Epsilon: 0.845
Episode: 358 Duration: 0:00:04.662237 Num steps: 405 Reward: 4.0 Training time per step: 0.009 Avg Reward (Last 100): 1.139 Epsilon: 0.844
Episode: 359 Duration: 0:00

Episode: 411 Duration: 0:00:01.970421 Num steps: 169 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.198 Epsilon: 0.821
Episode: 412 Duration: 0:00:03.881245 Num steps: 338 Reward: 3.0 Training time per step: 0.009 Avg Reward (Last 100): 1.228 Epsilon: 0.820
Episode: 413 Duration: 0:00:02.114903 Num steps: 183 Reward: 0.0 Training time per step: 0.010 Avg Reward (Last 100): 1.188 Epsilon: 0.820
Copied model parameters to target network. total_t = 100000, period = 10000
Episode: 414 Duration: 0:00:03.084864 Num steps: 252 Reward: 1.0 Training time per step: 0.009 Avg Reward (Last 100): 1.178 Epsilon: 0.820
Episode: 415 Duration: 0:00:03.358572 Num steps: 288 Reward: 2.0 Training time per step: 0.010 Avg Reward (Last 100): 1.178 Epsilon: 0.819
Episode: 416 Duration: 0:00:02.160033 Num steps: 184 Reward: 0.0 Training time per step: 0.010 Avg Reward (Last 100): 1.158 Epsilon: 0.819
Episode: 417 Duration: 0:00:02.356242 Num steps: 201 Reward: 0.0 Training time per step: 0

Episode: 469 Duration: 0:00:02.392681 Num steps: 206 Reward: 1.0 Training time per step: 0.009 Avg Reward (Last 100): 1.139 Epsilon: 0.796
Episode: 470 Duration: 0:00:02.944844 Num steps: 255 Reward: 2.0 Training time per step: 0.009 Avg Reward (Last 100): 1.158 Epsilon: 0.796
Episode: 471 Duration: 0:00:02.152472 Num steps: 186 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.149 Epsilon: 0.795
Episode: 472 Duration: 0:00:02.757310 Num steps: 234 Reward: 1.0 Training time per step: 0.010 Avg Reward (Last 100): 1.149 Epsilon: 0.795
Episode: 473 Duration: 0:00:04.370852 Num steps: 377 Reward: 3.0 Training time per step: 0.010 Avg Reward (Last 100): 1.168 Epsilon: 0.794
Episode: 474 Duration: 0:00:03.668222 Num steps: 314 Reward: 2.0 Training time per step: 0.010 Avg Reward (Last 100): 1.178 Epsilon: 0.794
Episode: 475 Duration: 0:00:03.986692 Num steps: 343 Reward: 3.0 Training time per step: 0.009 Avg Reward (Last 100): 1.208 Epsilon: 0.793
Episode: 476 Duration: 0:00

Episode: 528 Duration: 0:00:02.560312 Num steps: 217 Reward: 1.0 Training time per step: 0.010 Avg Reward (Last 100): 1.396 Epsilon: 0.769
Episode: 529 Duration: 0:00:02.243020 Num steps: 193 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.386 Epsilon: 0.768
Episode: 530 Duration: 0:00:02.635149 Num steps: 226 Reward: 1.0 Training time per step: 0.009 Avg Reward (Last 100): 1.376 Epsilon: 0.768
Episode: 531 Duration: 0:00:03.113268 Num steps: 270 Reward: 2.0 Training time per step: 0.009 Avg Reward (Last 100): 1.366 Epsilon: 0.768
Episode: 532 Duration: 0:00:01.954393 Num steps: 169 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.366 Epsilon: 0.767
Episode: 533 Duration: 0:00:02.465119 Num steps: 212 Reward: 1.0 Training time per step: 0.009 Avg Reward (Last 100): 1.356 Epsilon: 0.767
Episode: 534 Duration: 0:00:02.448431 Num steps: 211 Reward: 1.0 Training time per step: 0.009 Avg Reward (Last 100): 1.366 Epsilon: 0.767
Copied model parameters to 

Episode: 586 Duration: 0:00:03.264321 Num steps: 280 Reward: 2.0 Training time per step: 0.009 Avg Reward (Last 100): 1.317 Epsilon: 0.744
Episode: 587 Duration: 0:00:02.683631 Num steps: 228 Reward: 1.0 Training time per step: 0.010 Avg Reward (Last 100): 1.307 Epsilon: 0.744
Episode: 588 Duration: 0:00:01.970390 Num steps: 170 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.297 Epsilon: 0.744
Episode: 589 Duration: 0:00:03.125206 Num steps: 268 Reward: 2.0 Training time per step: 0.009 Avg Reward (Last 100): 1.317 Epsilon: 0.743
Episode: 590 Duration: 0:00:02.859033 Num steps: 245 Reward: 1.0 Training time per step: 0.010 Avg Reward (Last 100): 1.327 Epsilon: 0.743
Episode: 591 Duration: 0:00:02.236217 Num steps: 187 Reward: 0.0 Training time per step: 0.010 Avg Reward (Last 100): 1.327 Epsilon: 0.742
Episode: 592 Duration: 0:00:02.063839 Num steps: 176 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.307 Epsilon: 0.742
Episode: 593 Duration: 0:00

Episode: 645 Duration: 0:00:03.136386 Num steps: 268 Reward: 2.0 Training time per step: 0.009 Avg Reward (Last 100): 1.139 Epsilon: 0.720
Episode: 646 Duration: 0:00:02.870861 Num steps: 245 Reward: 1.0 Training time per step: 0.010 Avg Reward (Last 100): 1.149 Epsilon: 0.719
Episode: 647 Duration: 0:00:03.221683 Num steps: 277 Reward: 2.0 Training time per step: 0.009 Avg Reward (Last 100): 1.149 Epsilon: 0.719
Episode: 648 Duration: 0:00:02.626294 Num steps: 226 Reward: 1.0 Training time per step: 0.009 Avg Reward (Last 100): 1.158 Epsilon: 0.718
Episode: 649 Duration: 0:00:02.106121 Num steps: 182 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.139 Epsilon: 0.718
Episode: 650 Duration: 0:00:03.178384 Num steps: 272 Reward: 2.0 Training time per step: 0.009 Avg Reward (Last 100): 1.158 Epsilon: 0.718
Episode: 651 Duration: 0:00:02.368019 Num steps: 203 Reward: 1.0 Training time per step: 0.009 Avg Reward (Last 100): 1.149 Epsilon: 0.717
Episode: 652 Duration: 0:00

Episode: 704 Duration: 0:00:04.165396 Num steps: 356 Reward: 3.0 Training time per step: 0.009 Avg Reward (Last 100): 1.089 Epsilon: 0.695
Episode: 705 Duration: 0:00:02.061297 Num steps: 177 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.079 Epsilon: 0.695
Episode: 706 Duration: 0:00:01.972874 Num steps: 168 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.069 Epsilon: 0.695
Episode: 707 Duration: 0:00:01.920282 Num steps: 163 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.069 Epsilon: 0.694
Episode: 708 Duration: 0:00:02.041083 Num steps: 176 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.050 Epsilon: 0.694
Copied model parameters to target network. total_t = 170000, period = 10000
Episode: 709 Duration: 0:00:02.237312 Num steps: 168 Reward: 0.0 Training time per step: 0.010 Avg Reward (Last 100): 1.050 Epsilon: 0.694
Episode: 710 Duration: 0:00:02.061821 Num steps: 175 Reward: 0.0 Training time per step: 0

Episode: 762 Duration: 0:00:02.044381 Num steps: 174 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.030 Epsilon: 0.672
Episode: 763 Duration: 0:00:05.300301 Num steps: 454 Reward: 6.0 Training time per step: 0.009 Avg Reward (Last 100): 1.089 Epsilon: 0.671
Episode: 764 Duration: 0:00:01.972877 Num steps: 168 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.079 Epsilon: 0.670
Episode: 765 Duration: 0:00:01.962370 Num steps: 168 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.079 Epsilon: 0.670
Episode: 766 Duration: 0:00:02.672794 Num steps: 229 Reward: 1.0 Training time per step: 0.009 Avg Reward (Last 100): 1.069 Epsilon: 0.670
Episode: 767 Duration: 0:00:01.941729 Num steps: 164 Reward: 0.0 Training time per step: 0.010 Avg Reward (Last 100): 1.069 Epsilon: 0.669
Episode: 768 Duration: 0:00:02.498756 Num steps: 212 Reward: 1.0 Training time per step: 0.010 Avg Reward (Last 100): 1.079 Epsilon: 0.669
Episode: 769 Duration: 0:00

Episode: 821 Duration: 0:00:02.481483 Num steps: 210 Reward: 0.0 Training time per step: 0.010 Avg Reward (Last 100): 1.287 Epsilon: 0.645
Episode: 822 Duration: 0:00:02.098672 Num steps: 177 Reward: 0.0 Training time per step: 0.010 Avg Reward (Last 100): 1.287 Epsilon: 0.645
Episode: 823 Duration: 0:00:01.996410 Num steps: 167 Reward: 0.0 Training time per step: 0.010 Avg Reward (Last 100): 1.277 Epsilon: 0.645
Episode: 824 Duration: 0:00:04.437411 Num steps: 379 Reward: 4.0 Training time per step: 0.009 Avg Reward (Last 100): 1.297 Epsilon: 0.644
Episode: 825 Duration: 0:00:02.990922 Num steps: 254 Reward: 1.0 Training time per step: 0.009 Avg Reward (Last 100): 1.297 Epsilon: 0.643
Episode: 826 Duration: 0:00:05.100430 Num steps: 435 Reward: 4.0 Training time per step: 0.009 Avg Reward (Last 100): 1.317 Epsilon: 0.643
Episode: 827 Duration: 0:00:02.096097 Num steps: 180 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.317 Epsilon: 0.642
Episode: 828 Duration: 0:00

Episode: 879 Duration: 0:00:04.122689 Num steps: 351 Reward: 3.0 Training time per step: 0.009 Avg Reward (Last 100): 1.188 Epsilon: 0.620
Episode: 880 Duration: 0:00:02.716621 Num steps: 232 Reward: 1.0 Training time per step: 0.009 Avg Reward (Last 100): 1.188 Epsilon: 0.620
Episode: 881 Duration: 0:00:01.938401 Num steps: 167 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.168 Epsilon: 0.620
Episode: 882 Duration: 0:00:04.393328 Num steps: 372 Reward: 4.0 Training time per step: 0.009 Avg Reward (Last 100): 1.208 Epsilon: 0.619
Episode: 883 Duration: 0:00:03.936421 Num steps: 336 Reward: 3.0 Training time per step: 0.009 Avg Reward (Last 100): 1.218 Epsilon: 0.619
Episode: 884 Duration: 0:00:02.156239 Num steps: 183 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.198 Epsilon: 0.618
Episode: 885 Duration: 0:00:03.744985 Num steps: 317 Reward: 3.0 Training time per step: 0.009 Avg Reward (Last 100): 1.228 Epsilon: 0.618
Episode: 886 Duration: 0:00

Episode: 938 Duration: 0:00:02.848650 Num steps: 241 Reward: 2.0 Training time per step: 0.009 Avg Reward (Last 100): 1.069 Epsilon: 0.596
Episode: 939 Duration: 0:00:04.517664 Num steps: 385 Reward: 3.0 Training time per step: 0.009 Avg Reward (Last 100): 1.079 Epsilon: 0.595
Episode: 940 Duration: 0:00:02.864812 Num steps: 245 Reward: 1.0 Training time per step: 0.009 Avg Reward (Last 100): 1.089 Epsilon: 0.595
Episode: 941 Duration: 0:00:03.261289 Num steps: 276 Reward: 2.0 Training time per step: 0.009 Avg Reward (Last 100): 1.099 Epsilon: 0.594
Episode: 942 Duration: 0:00:02.077034 Num steps: 178 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.099 Epsilon: 0.594
Episode: 943 Duration: 0:00:02.623553 Num steps: 224 Reward: 1.0 Training time per step: 0.009 Avg Reward (Last 100): 1.099 Epsilon: 0.593
Episode: 944 Duration: 0:00:02.948835 Num steps: 251 Reward: 1.0 Training time per step: 0.009 Avg Reward (Last 100): 1.089 Epsilon: 0.593
Episode: 945 Duration: 0:00

Episode: 997 Duration: 0:00:02.113831 Num steps: 179 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.168 Epsilon: 0.569
Episode: 998 Duration: 0:00:02.783709 Num steps: 236 Reward: 1.0 Training time per step: 0.009 Avg Reward (Last 100): 1.158 Epsilon: 0.569
Episode: 999 Duration: 0:00:03.998268 Num steps: 340 Reward: 3.0 Training time per step: 0.009 Avg Reward (Last 100): 1.188 Epsilon: 0.568
Copied model parameters to target network. total_t = 240000, period = 10000
Episode: 1000 Duration: 0:00:03.727678 Num steps: 298 Reward: 2.0 Training time per step: 0.009 Avg Reward (Last 100): 1.208 Epsilon: 0.568
Episode: 1001 Duration: 0:00:02.057627 Num steps: 173 Reward: 0.0 Training time per step: 0.010 Avg Reward (Last 100): 1.188 Epsilon: 0.568
Episode: 1002 Duration: 0:00:02.140413 Num steps: 180 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.168 Epsilon: 0.567
Episode: 1003 Duration: 0:00:04.025168 Num steps: 338 Reward: 4.0 Training time per ste

Episode: 1055 Duration: 0:00:02.179897 Num steps: 184 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.208 Epsilon: 0.544
Episode: 1056 Duration: 0:00:02.824708 Num steps: 237 Reward: 1.0 Training time per step: 0.009 Avg Reward (Last 100): 1.218 Epsilon: 0.544
Episode: 1057 Duration: 0:00:02.053583 Num steps: 173 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.218 Epsilon: 0.543
Episode: 1058 Duration: 0:00:02.112221 Num steps: 179 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.208 Epsilon: 0.543
Episode: 1059 Duration: 0:00:04.806214 Num steps: 408 Reward: 4.0 Training time per step: 0.009 Avg Reward (Last 100): 1.248 Epsilon: 0.542
Episode: 1060 Duration: 0:00:03.391484 Num steps: 288 Reward: 2.0 Training time per step: 0.009 Avg Reward (Last 100): 1.248 Epsilon: 0.542
Episode: 1061 Duration: 0:00:01.945577 Num steps: 165 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.218 Epsilon: 0.541
Episode: 1062 Durati

Episode: 1113 Duration: 0:00:02.206259 Num steps: 186 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.119 Epsilon: 0.519
Episode: 1114 Duration: 0:00:02.465732 Num steps: 209 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.089 Epsilon: 0.519
Episode: 1115 Duration: 0:00:02.157773 Num steps: 180 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.079 Epsilon: 0.518
Episode: 1116 Duration: 0:00:01.958522 Num steps: 164 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.069 Epsilon: 0.518
Episode: 1117 Duration: 0:00:02.735680 Num steps: 231 Reward: 1.0 Training time per step: 0.009 Avg Reward (Last 100): 1.079 Epsilon: 0.518
Episode: 1118 Duration: 0:00:02.147765 Num steps: 181 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.069 Epsilon: 0.517
Episode: 1119 Duration: 0:00:03.812755 Num steps: 323 Reward: 2.0 Training time per step: 0.009 Avg Reward (Last 100): 1.069 Epsilon: 0.517
Episode: 1120 Durati

Episode: 1171 Duration: 0:00:02.700528 Num steps: 228 Reward: 1.0 Training time per step: 0.009 Avg Reward (Last 100): 1.248 Epsilon: 0.493
Episode: 1172 Duration: 0:00:03.537892 Num steps: 301 Reward: 2.0 Training time per step: 0.009 Avg Reward (Last 100): 1.257 Epsilon: 0.493
Episode: 1173 Duration: 0:00:04.431333 Num steps: 375 Reward: 3.0 Training time per step: 0.009 Avg Reward (Last 100): 1.287 Epsilon: 0.492
Episode: 1174 Duration: 0:00:04.345451 Num steps: 367 Reward: 3.0 Training time per step: 0.009 Avg Reward (Last 100): 1.317 Epsilon: 0.491
Episode: 1175 Duration: 0:00:02.192604 Num steps: 182 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.307 Epsilon: 0.491
Episode: 1176 Duration: 0:00:03.928819 Num steps: 334 Reward: 3.0 Training time per step: 0.009 Avg Reward (Last 100): 1.327 Epsilon: 0.491
Episode: 1177 Duration: 0:00:04.154521 Num steps: 351 Reward: 3.0 Training time per step: 0.009 Avg Reward (Last 100): 1.356 Epsilon: 0.490
Episode: 1178 Durati

Episode: 1229 Duration: 0:00:03.141794 Num steps: 264 Reward: 1.0 Training time per step: 0.009 Avg Reward (Last 100): 1.535 Epsilon: 0.465
Episode: 1230 Duration: 0:00:02.683370 Num steps: 228 Reward: 1.0 Training time per step: 0.009 Avg Reward (Last 100): 1.545 Epsilon: 0.465
Episode: 1231 Duration: 0:00:03.723914 Num steps: 314 Reward: 2.0 Training time per step: 0.009 Avg Reward (Last 100): 1.564 Epsilon: 0.464
Episode: 1232 Duration: 0:00:02.165160 Num steps: 183 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.554 Epsilon: 0.464
Episode: 1233 Duration: 0:00:02.681764 Num steps: 228 Reward: 1.0 Training time per step: 0.009 Avg Reward (Last 100): 1.564 Epsilon: 0.463
Episode: 1234 Duration: 0:00:02.989374 Num steps: 252 Reward: 2.0 Training time per step: 0.009 Avg Reward (Last 100): 1.564 Epsilon: 0.463
Episode: 1235 Duration: 0:00:03.968873 Num steps: 335 Reward: 3.0 Training time per step: 0.009 Avg Reward (Last 100): 1.564 Epsilon: 0.462
Episode: 1236 Durati

Episode: 1287 Duration: 0:00:03.338918 Num steps: 279 Reward: 2.0 Training time per step: 0.009 Avg Reward (Last 100): 1.802 Epsilon: 0.435
Episode: 1288 Duration: 0:00:03.765606 Num steps: 313 Reward: 3.0 Training time per step: 0.010 Avg Reward (Last 100): 1.802 Epsilon: 0.435
Episode: 1289 Duration: 0:00:03.593448 Num steps: 303 Reward: 3.0 Training time per step: 0.009 Avg Reward (Last 100): 1.802 Epsilon: 0.434
Episode: 1290 Duration: 0:00:02.116921 Num steps: 176 Reward: 0.0 Training time per step: 0.009 Avg Reward (Last 100): 1.772 Epsilon: 0.434
Episode: 1291 Duration: 0:00:02.837538 Num steps: 236 Reward: 1.0 Training time per step: 0.009 Avg Reward (Last 100): 1.772 Epsilon: 0.433
Episode: 1292 Duration: 0:00:04.494637 Num steps: 376 Reward: 4.0 Training time per step: 0.009 Avg Reward (Last 100): 1.802 Epsilon: 0.433
Episode: 1293 Duration: 0:00:02.342877 Num steps: 194 Reward: 0.0 Training time per step: 0.010 Avg Reward (Last 100): 1.782 Epsilon: 0.432
Episode: 1294 Durati

Copied model parameters to target network. total_t = 330000, period = 10000
Episode: 1345 Duration: 0:00:04.593217 Num steps: 361 Reward: 4.0 Training time per step: 0.009 Avg Reward (Last 100): 2.030 Epsilon: 0.406
Episode: 1346 Duration: 0:00:03.071338 Num steps: 254 Reward: 2.0 Training time per step: 0.010 Avg Reward (Last 100): 2.040 Epsilon: 0.406
Episode: 1347 Duration: 0:00:03.231109 Num steps: 269 Reward: 2.0 Training time per step: 0.009 Avg Reward (Last 100): 2.040 Epsilon: 0.405
Episode: 1348 Duration: 0:00:03.710577 Num steps: 308 Reward: 2.0 Training time per step: 0.010 Avg Reward (Last 100): 2.040 Epsilon: 0.404
Episode: 1349 Duration: 0:00:02.125647 Num steps: 175 Reward: 0.0 Training time per step: 0.010 Avg Reward (Last 100): 2.030 Epsilon: 0.404
Episode: 1350 Duration: 0:00:02.544227 Num steps: 212 Reward: 1.0 Training time per step: 0.009 Avg Reward (Last 100): 2.020 Epsilon: 0.404
Episode: 1351 Duration: 0:00:03.355941 Num steps: 276 Reward: 2.0 Training time per 

Episode: 1403 Duration: 0:00:02.860829 Num steps: 238 Reward: 1.0 Training time per step: 0.009 Avg Reward (Last 100): 2.168 Epsilon: 0.374
Episode: 1404 Duration: 0:00:04.239456 Num steps: 352 Reward: 4.0 Training time per step: 0.009 Avg Reward (Last 100): 2.198 Epsilon: 0.374
Episode: 1405 Duration: 0:00:04.527679 Num steps: 378 Reward: 4.0 Training time per step: 0.009 Avg Reward (Last 100): 2.188 Epsilon: 0.373
Episode: 1406 Duration: 0:00:02.606462 Num steps: 216 Reward: 1.0 Training time per step: 0.009 Avg Reward (Last 100): 2.178 Epsilon: 0.373
Episode: 1407 Duration: 0:00:03.293416 Num steps: 273 Reward: 2.0 Training time per step: 0.009 Avg Reward (Last 100): 2.188 Epsilon: 0.372
Episode: 1408 Duration: 0:00:03.489371 Num steps: 291 Reward: 2.0 Training time per step: 0.009 Avg Reward (Last 100): 2.188 Epsilon: 0.372
Episode: 1409 Duration: 0:00:03.860146 Num steps: 322 Reward: 3.0 Training time per step: 0.009 Avg Reward (Last 100): 2.188 Epsilon: 0.371
Episode: 1410 Durati

Episode: 1461 Duration: 0:00:05.701211 Num steps: 476 Reward: 5.0 Training time per step: 0.009 Avg Reward (Last 100): 3.228 Epsilon: 0.336
Episode: 1462 Duration: 0:00:03.415074 Num steps: 283 Reward: 2.0 Training time per step: 0.009 Avg Reward (Last 100): 3.228 Epsilon: 0.335
Episode: 1463 Duration: 0:00:02.988034 Num steps: 248 Reward: 1.0 Training time per step: 0.009 Avg Reward (Last 100): 3.208 Epsilon: 0.335
Episode: 1464 Duration: 0:00:03.539639 Num steps: 295 Reward: 3.0 Training time per step: 0.009 Avg Reward (Last 100): 3.198 Epsilon: 0.334
Copied model parameters to target network. total_t = 370000, period = 10000
Episode: 1465 Duration: 0:00:07.078279 Num steps: 564 Reward: 6.0 Training time per step: 0.009 Avg Reward (Last 100): 3.228 Epsilon: 0.333
Episode: 1466 Duration: 0:00:04.627002 Num steps: 383 Reward: 4.0 Training time per step: 0.009 Avg Reward (Last 100): 3.238 Epsilon: 0.333
Episode: 1467 Duration: 0:00:03.879365 Num steps: 323 Reward: 3.0 Training time per 

Episode: 1518 Duration: 0:00:09.955875 Num steps: 825 Reward: 14.0 Training time per step: 0.009 Avg Reward (Last 100): 5.307 Epsilon: 0.283
Episode: 1519 Duration: 0:00:04.595154 Num steps: 380 Reward: 4.0 Training time per step: 0.009 Avg Reward (Last 100): 5.317 Epsilon: 0.282
Episode: 1520 Duration: 0:00:08.087282 Num steps: 671 Reward: 8.0 Training time per step: 0.009 Avg Reward (Last 100): 5.366 Epsilon: 0.281
Episode: 1521 Duration: 0:00:04.560122 Num steps: 380 Reward: 4.0 Training time per step: 0.009 Avg Reward (Last 100): 5.386 Epsilon: 0.280
Copied model parameters to target network. total_t = 400000, period = 10000
Episode: 1522 Duration: 0:00:08.721176 Num steps: 699 Reward: 10.0 Training time per step: 0.009 Avg Reward (Last 100): 5.406 Epsilon: 0.279
Episode: 1523 Duration: 0:00:09.745440 Num steps: 804 Reward: 15.0 Training time per step: 0.009 Avg Reward (Last 100): 5.515 Epsilon: 0.278
Episode: 1524 Duration: 0:00:09.540498 Num steps: 788 Reward: 15.0 Training time 

Episode: 1575 Duration: 0:00:11.036787 Num steps: 910 Reward: 18.0 Training time per step: 0.009 Avg Reward (Last 100): 9.277 Epsilon: 0.211
Episode: 1576 Duration: 0:00:05.611275 Num steps: 465 Reward: 8.0 Training time per step: 0.009 Avg Reward (Last 100): 9.307 Epsilon: 0.210
Episode: 1577 Duration: 0:00:07.420247 Num steps: 610 Reward: 8.0 Training time per step: 0.009 Avg Reward (Last 100): 9.356 Epsilon: 0.209
Copied model parameters to target network. total_t = 440000, period = 10000
Episode: 1578 Duration: 0:00:10.694501 Num steps: 855 Reward: 16.0 Training time per step: 0.009 Avg Reward (Last 100): 9.416 Epsilon: 0.207
Episode: 1579 Duration: 0:00:10.310938 Num steps: 846 Reward: 12.0 Training time per step: 0.009 Avg Reward (Last 100): 9.505 Epsilon: 0.206
Episode: 1580 Duration: 0:00:08.830168 Num steps: 727 Reward: 10.0 Training time per step: 0.009 Avg Reward (Last 100): 9.545 Epsilon: 0.204
Episode: 1581 Duration: 0:00:07.119698 Num steps: 585 Reward: 9.0 Training time 

Episode: 1631 Duration: 0:00:12.520629 Num steps: 975 Reward: 15.0 Training time per step: 0.010 Avg Reward (Last 100): 13.139 Epsilon: 0.130
Episode: 1632 Duration: 0:00:11.241743 Num steps: 879 Reward: 13.0 Training time per step: 0.010 Avg Reward (Last 100): 13.089 Epsilon: 0.128
Episode: 1633 Duration: 0:00:14.555755 Num steps: 1134 Reward: 25.0 Training time per step: 0.010 Avg Reward (Last 100): 13.257 Epsilon: 0.126
Episode: 1634 Duration: 0:00:09.860064 Num steps: 767 Reward: 12.0 Training time per step: 0.010 Avg Reward (Last 100): 13.327 Epsilon: 0.125
Episode: 1635 Duration: 0:00:13.533554 Num steps: 1051 Reward: 25.0 Training time per step: 0.010 Avg Reward (Last 100): 13.465 Epsilon: 0.123
Episode: 1636 Duration: 0:00:11.879553 Num steps: 927 Reward: 16.0 Training time per step: 0.010 Avg Reward (Last 100): 13.584 Epsilon: 0.121
Episode: 1637 Duration: 0:00:09.482582 Num steps: 739 Reward: 12.0 Training time per step: 0.010 Avg Reward (Last 100): 13.624 Epsilon: 0.120
Epis

Episode: 1686 Duration: 0:00:11.678905 Num steps: 908 Reward: 16.0 Training time per step: 0.010 Avg Reward (Last 100): 17.158 Epsilon: 0.100
Episode: 1687 Duration: 0:00:07.387404 Num steps: 573 Reward: 19.0 Training time per step: 0.010 Avg Reward (Last 100): 17.218 Epsilon: 0.100
Episode: 1688 Duration: 0:00:08.747535 Num steps: 680 Reward: 13.0 Training time per step: 0.010 Avg Reward (Last 100): 17.238 Epsilon: 0.100
Episode: 1689 Duration: 0:00:11.716698 Num steps: 908 Reward: 22.0 Training time per step: 0.010 Avg Reward (Last 100): 17.218 Epsilon: 0.100
Episode: 1690 Duration: 0:00:13.655273 Num steps: 1062 Reward: 26.0 Training time per step: 0.010 Avg Reward (Last 100): 17.376 Epsilon: 0.100
Episode: 1691 Duration: 0:00:09.022153 Num steps: 698 Reward: 12.0 Training time per step: 0.010 Avg Reward (Last 100): 17.317 Epsilon: 0.100
Episode: 1692 Duration: 0:00:12.976838 Num steps: 1011 Reward: 18.0 Training time per step: 0.010 Avg Reward (Last 100): 17.396 Epsilon: 0.100
Copi

Episode: 1741 Duration: 0:00:08.103181 Num steps: 633 Reward: 9.0 Training time per step: 0.010 Avg Reward (Last 100): 18.663 Epsilon: 0.100
Episode: 1742 Duration: 0:00:13.535260 Num steps: 1063 Reward: 19.0 Training time per step: 0.010 Avg Reward (Last 100): 18.644 Epsilon: 0.100
Episode: 1743 Duration: 0:00:14.280057 Num steps: 1121 Reward: 35.0 Training time per step: 0.010 Avg Reward (Last 100): 18.842 Epsilon: 0.100
Episode: 1744 Duration: 0:00:11.598044 Num steps: 909 Reward: 29.0 Training time per step: 0.010 Avg Reward (Last 100): 19.010 Epsilon: 0.100
Episode: 1745 Duration: 0:00:18.481978 Num steps: 1448 Reward: 43.0 Training time per step: 0.010 Avg Reward (Last 100): 19.327 Epsilon: 0.100
Episode: 1746 Duration: 0:00:17.156181 Num steps: 1346 Reward: 30.0 Training time per step: 0.010 Avg Reward (Last 100): 19.426 Epsilon: 0.100
Copied model parameters to target network. total_t = 590000, period = 10000
Episode: 1747 Duration: 0:00:14.003249 Num steps: 1063 Reward: 23.0 T

Episode: 1796 Duration: 0:00:16.392447 Num steps: 1284 Reward: 33.0 Training time per step: 0.010 Avg Reward (Last 100): 22.713 Epsilon: 0.100
Episode: 1797 Duration: 0:00:09.402128 Num steps: 737 Reward: 18.0 Training time per step: 0.010 Avg Reward (Last 100): 22.683 Epsilon: 0.100
Episode: 1798 Duration: 0:00:12.829391 Num steps: 1003 Reward: 18.0 Training time per step: 0.010 Avg Reward (Last 100): 22.663 Epsilon: 0.100
Episode: 1799 Duration: 0:00:10.416493 Num steps: 819 Reward: 14.0 Training time per step: 0.009 Avg Reward (Last 100): 22.634 Epsilon: 0.100
Episode: 1800 Duration: 0:00:13.347652 Num steps: 1042 Reward: 19.0 Training time per step: 0.010 Avg Reward (Last 100): 22.663 Epsilon: 0.100
Copied model parameters to target network. total_t = 650000, period = 10000
Episode: 1801 Duration: 0:00:13.966042 Num steps: 1064 Reward: 25.0 Training time per step: 0.010 Avg Reward (Last 100): 22.802 Epsilon: 0.100
Episode: 1802 Duration: 0:00:16.048248 Num steps: 1260 Reward: 25.0 

Episode: 1851 Duration: 0:00:17.220846 Num steps: 1351 Reward: 35.0 Training time per step: 0.010 Avg Reward (Last 100): 23.980 Epsilon: 0.100
Episode: 1852 Duration: 0:00:17.825846 Num steps: 1394 Reward: 42.0 Training time per step: 0.010 Avg Reward (Last 100): 24.109 Epsilon: 0.100
Episode: 1853 Duration: 0:00:15.972649 Num steps: 1249 Reward: 34.0 Training time per step: 0.010 Avg Reward (Last 100): 24.089 Epsilon: 0.100
Episode: 1854 Duration: 0:00:09.175234 Num steps: 719 Reward: 15.0 Training time per step: 0.010 Avg Reward (Last 100): 23.950 Epsilon: 0.100
Episode: 1855 Duration: 0:00:13.729367 Num steps: 1078 Reward: 26.0 Training time per step: 0.010 Avg Reward (Last 100): 23.970 Epsilon: 0.100
Episode: 1856 Duration: 0:00:12.731541 Num steps: 1000 Reward: 21.0 Training time per step: 0.010 Avg Reward (Last 100): 24.000 Epsilon: 0.100
Episode: 1857 Duration: 0:00:11.156630 Num steps: 878 Reward: 16.0 Training time per step: 0.009 Avg Reward (Last 100): 23.861 Epsilon: 0.100
E

Episode: 1906 Duration: 0:00:12.588806 Num steps: 984 Reward: 34.0 Training time per step: 0.010 Avg Reward (Last 100): 22.505 Epsilon: 0.100
Episode: 1907 Duration: 0:00:08.955004 Num steps: 703 Reward: 11.0 Training time per step: 0.010 Avg Reward (Last 100): 22.297 Epsilon: 0.100
Copied model parameters to target network. total_t = 760000, period = 10000
Episode: 1908 Duration: 0:00:12.219150 Num steps: 925 Reward: 19.0 Training time per step: 0.010 Avg Reward (Last 100): 22.248 Epsilon: 0.100
Episode: 1909 Duration: 0:00:15.979255 Num steps: 1246 Reward: 32.0 Training time per step: 0.010 Avg Reward (Last 100): 22.228 Epsilon: 0.100
Episode: 1910 Duration: 0:00:18.331834 Num steps: 1432 Reward: 37.0 Training time per step: 0.010 Avg Reward (Last 100): 22.327 Epsilon: 0.100
Episode: 1911 Duration: 0:00:10.551604 Num steps: 824 Reward: 13.0 Training time per step: 0.010 Avg Reward (Last 100): 22.218 Epsilon: 0.100
Episode: 1912 Duration: 0:00:10.209470 Num steps: 802 Reward: 16.0 Tra

Episode: 1961 Duration: 0:00:13.688556 Num steps: 1061 Reward: 27.0 Training time per step: 0.010 Avg Reward (Last 100): 23.277 Epsilon: 0.100
Episode: 1962 Duration: 0:00:09.722973 Num steps: 757 Reward: 11.0 Training time per step: 0.010 Avg Reward (Last 100): 23.119 Epsilon: 0.100
Copied model parameters to target network. total_t = 820000, period = 10000
Episode: 1963 Duration: 0:00:14.344877 Num steps: 1083 Reward: 21.0 Training time per step: 0.010 Avg Reward (Last 100): 23.158 Epsilon: 0.100
Episode: 1964 Duration: 0:00:18.074968 Num steps: 1404 Reward: 39.0 Training time per step: 0.010 Avg Reward (Last 100): 23.426 Epsilon: 0.100
Episode: 1965 Duration: 0:00:14.003575 Num steps: 1087 Reward: 26.0 Training time per step: 0.010 Avg Reward (Last 100): 23.436 Epsilon: 0.100
Episode: 1966 Duration: 0:00:15.149287 Num steps: 1177 Reward: 28.0 Training time per step: 0.010 Avg Reward (Last 100): 23.525 Epsilon: 0.100
Episode: 1967 Duration: 0:00:09.085554 Num steps: 705 Reward: 12.0 

Episode: 2015 Duration: 0:00:18.549825 Num steps: 1406 Reward: 34.0 Training time per step: 0.010 Avg Reward (Last 100): 26.337 Epsilon: 0.100
Episode: 2016 Duration: 0:00:19.625550 Num steps: 1526 Reward: 35.0 Training time per step: 0.010 Avg Reward (Last 100): 26.564 Epsilon: 0.100
Episode: 2017 Duration: 0:00:13.378748 Num steps: 1042 Reward: 17.0 Training time per step: 0.010 Avg Reward (Last 100): 26.347 Epsilon: 0.100
Episode: 2018 Duration: 0:00:16.488992 Num steps: 1280 Reward: 40.0 Training time per step: 0.010 Avg Reward (Last 100): 26.535 Epsilon: 0.100
Episode: 2019 Duration: 0:00:14.927117 Num steps: 1160 Reward: 29.0 Training time per step: 0.010 Avg Reward (Last 100): 26.693 Epsilon: 0.100
Episode: 2020 Duration: 0:00:09.041398 Num steps: 704 Reward: 14.0 Training time per step: 0.010 Avg Reward (Last 100): 26.475 Epsilon: 0.100
Episode: 2021 Duration: 0:00:14.166178 Num steps: 1101 Reward: 23.0 Training time per step: 0.010 Avg Reward (Last 100): 26.436 Epsilon: 0.100


Episode: 2070 Duration: 0:00:13.684662 Num steps: 1064 Reward: 21.0 Training time per step: 0.010 Avg Reward (Last 100): 26.634 Epsilon: 0.100
Episode: 2071 Duration: 0:00:16.947446 Num steps: 1313 Reward: 39.0 Training time per step: 0.010 Avg Reward (Last 100): 26.723 Epsilon: 0.100
Episode: 2072 Duration: 0:00:12.281502 Num steps: 954 Reward: 21.0 Training time per step: 0.010 Avg Reward (Last 100): 26.446 Epsilon: 0.100
Episode: 2073 Duration: 0:00:10.250245 Num steps: 798 Reward: 16.0 Training time per step: 0.010 Avg Reward (Last 100): 26.208 Epsilon: 0.100
Episode: 2074 Duration: 0:00:08.934188 Num steps: 696 Reward: 14.0 Training time per step: 0.010 Avg Reward (Last 100): 26.178 Epsilon: 0.100
Episode: 2075 Duration: 0:00:11.916310 Num steps: 925 Reward: 18.0 Training time per step: 0.010 Avg Reward (Last 100): 26.158 Epsilon: 0.100
Episode: 2076 Duration: 0:00:11.963178 Num steps: 928 Reward: 16.0 Training time per step: 0.010 Avg Reward (Last 100): 26.188 Epsilon: 0.100
Epis