# Deep Q-Learning 

For this assignment we will implement the Deep Q-Learning algorithm with Experience Replay, as described in the breakthrough paper __"Playing Atari with Deep Reinforcement Learning"__. We will train an agent to play the famous game of __Breakout__.

In [1]:
import os
import random
import sys
import time
from collections import deque
from copy import deepcopy
from datetime import datetime

import gym
import numpy as np
import pylab
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

# Star imports: relative order kept as-is, since a later module can shadow
# names exported by an earlier one.
from utils import *
from agent import *
from model import *
from config import *
%matplotlib inline
%load_ext autoreload
%autoreload 2

## Understanding the environment

In the following cell, we initialise our game of __Breakout__ so you can see what the environment looks like. For further documentation of the environment, refer to https://gym.openai.com/envs. 

In [2]:
# Build the Breakout environment that the rest of the notebook interacts with.
env = gym.make("Breakout-v0")
# env.render()  # left disabled; uncomment to render the environment here

In [3]:
# Quantities derived from the environment.
# `find_max_lifes` comes from utils; presumably it reports the number of lives
# available at the start of an episode -- confirm against utils.py.
number_lives = find_max_lifes(env)
action_size = env.action_space.n
state_size = env.observation_space.shape

# Parallel lists filled in by the training loop and used for the progress plot:
# `rewards[i]` is the mean evaluation reward recorded at episode `episodes[i]`.
rewards = []
episodes = []

## Creating a DQN Agent

Here we create a DQN Agent. This agent is defined in __agent.py__. The corresponding neural network is defined in __model.py__. 

__Evaluation Reward__ : The average reward received in the past 100 episodes/games.

__Frame__ : Number of frames processed in total.

__Memory Size__ : The current size of the replay memory.

In [4]:
# Global counters: `frame` is incremented once per environment step by the
# training loop; `memory_size` starts at zero alongside it.
frame, memory_size = 0, 0

# Bounded window of the most recent episode scores; its mean is the
# "evaluation reward" reported during training.
evaluation_reward = deque(maxlen=evaluation_reward_length)

# The agent (defined in agent.py) is built for this environment's action count.
agent = Agent(action_size)


### Main Training Loop

In [None]:
# Main training loop: play EPISODES games, storing one transition per step and
# training the agent every `train_frame` frames.
for e in range(EPISODES):
    done = False
    score = 0

    # Five 84x84 uint8 frames: slots 0-3 form the stack fed to the agent,
    # slot 4 receives the frame produced by the current step.
    history = np.zeros([5, 84, 84], dtype=np.uint8)
    step = 0
    state = env.reset()
    life = number_lives

    # Helper from utils; presumably seeds `history` from the first
    # observation -- confirm against utils.py.
    get_init_state(history, state)

    while not done:
        step += 1
        frame += 1
        if render_breakout:
            env.render()

        # Select and perform an action. `curr_state` is the newest frame of the
        # stack the agent acts on (a view, hence the deepcopy when storing it).
        curr_state = history[3, :, :]
        action, value = agent.get_action(np.float32(history[:4, :, :]) / 255.)

        next_state, reward, done, info = env.step(action)

        frame_next_state = get_frame(next_state)
        history[4, :, :] = frame_next_state
        # Helper from utils; presumably True when a life was lost on this
        # step -- confirm against utils.py.
        terminal_state = check_live(life, info['ale.lives'])

        life = info['ale.lives']
        # The raw reward is used unchanged. Variants left disabled:
        #   r = np.clip(reward, -1, 1)
        #   if terminal_state: r -= 20
        r = reward

        # Store the transition in memory. The two trailing zeros are passed
        # as-is; their meaning is defined by the memory class (TODO confirm).
        agent.memory.push(deepcopy(curr_state), action, r, terminal_state, value, 0, 0)

        # Every `train_frame` frames, evaluate the stack that follows the
        # current step (slots 1-4) and run a training pass with that value.
        if frame % train_frame == 0:
            _, frame_next_val = agent.get_action(np.float32(history[1:, :, :]) / 255.)
            agent.train_policy_net(frame, frame_next_val)
            # Update the target network
            agent.update_target_net()
        score += r
        # Slide the window: drop the oldest frame so slots 0-3 hold the stack
        # for the next step.
        history[:4, :, :] = history[1:, :, :]

        # Every 50,000 frames, record the mean evaluation reward and refresh
        # the progress plot on disk.
        if frame % 50000 == 0:
            print('now time : ', datetime.now())
            rewards.append(np.mean(evaluation_reward))
            episodes.append(e)
            pylab.plot(episodes, rewards, 'b')
            # savefig does not create missing directories; without this the
            # run would crash here, tens of thousands of frames into training.
            os.makedirs("./save_graph", exist_ok=True)
            pylab.savefig("./save_graph/breakout_dqn.png")

        if done:
            evaluation_reward.append(score)
            mean_reward = np.mean(evaluation_reward)
            # Log a one-line summary at the end of every episode.
            print("episode:", e, "  score:", score, "  memory length:",
                  len(agent.memory), "  epsilon:", agent.epsilon, "   steps:", step,
                  "    evaluation reward:", mean_reward)

            # Stop training once the mean score over the evaluation window
            # exceeds 700 and more than 40 episodes have been recorded.
            if mean_reward > 700 and len(evaluation_reward) > 40:
                # Same reason as above: make sure the target directory exists
                # so the final model is not lost to a missing folder.
                os.makedirs("./save_model", exist_ok=True)
                torch.save(agent.policy_net, "./save_model/breakout_dqn")
                # Raises SystemExit, which ends execution of this cell.
                sys.exit()

  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "
  probs = F.softmax(x[:,:self.action_size] - torch.max(x[:,:self.action_size],1)[0].unsqueeze(1))


episode: 0   score: 1.0   memory length: 236   epsilon: 1.0    steps: 236     evaluation reward: 1.0
episode: 1   score: 2.0   memory length: 532   epsilon: 1.0    steps: 296     evaluation reward: 1.5
episode: 2   score: 2.0   memory length: 806   epsilon: 1.0    steps: 274     evaluation reward: 1.6666666666666667
episode: 3   score: 0.0   memory length: 978   epsilon: 1.0    steps: 172     evaluation reward: 1.25
Training network. lr: 0.000250. clip: 0.100000


  pol_loss += pol_avg.detach().cpu()[0]
  vf_loss += value_loss.detach().cpu()[0]
  ent_total += ent.detach().cpu()[0]


Iteration 1: Policy loss: 0.015309. Value loss: 0.022334. Entropy: 1.365773.
Iteration 2: Policy loss: 0.007594. Value loss: 0.022622. Entropy: 1.375099.
Iteration 3: Policy loss: 0.001630. Value loss: 0.021370. Entropy: 1.376557.
episode: 4   score: 1.0   memory length: 1024   epsilon: 1.0    steps: 211     evaluation reward: 1.2
episode: 5   score: 2.0   memory length: 1024   epsilon: 1.0    steps: 250     evaluation reward: 1.3333333333333333
episode: 6   score: 2.0   memory length: 1024   epsilon: 1.0    steps: 303     evaluation reward: 1.4285714285714286
episode: 7   score: 0.0   memory length: 1024   epsilon: 1.0    steps: 210     evaluation reward: 1.25
Training network. lr: 0.000250. clip: 0.099991
Iteration 4: Policy loss: 0.001487. Value loss: 0.022242. Entropy: 1.366803.
Iteration 5: Policy loss: 0.000655. Value loss: 0.022145. Entropy: 1.367186.
Iteration 6: Policy loss: 0.000348. Value loss: 0.022892. Entropy: 1.365525.
episode: 8   score: 1.0   memory length: 1024   epsi

episode: 49   score: 1.0   memory length: 1024   epsilon: 1.0    steps: 236     evaluation reward: 1.32
episode: 50   score: 2.0   memory length: 1024   epsilon: 1.0    steps: 280     evaluation reward: 1.3333333333333333
Training network. lr: 0.000250. clip: 0.099901
Iteration 34: Policy loss: -0.001704. Value loss: 0.019973. Entropy: 1.368357.
Iteration 35: Policy loss: -0.002906. Value loss: 0.018659. Entropy: 1.364466.
Iteration 36: Policy loss: -0.004754. Value loss: 0.017865. Entropy: 1.363726.
episode: 51   score: 0.0   memory length: 1024   epsilon: 1.0    steps: 182     evaluation reward: 1.3076923076923077
episode: 52   score: 1.0   memory length: 1024   epsilon: 1.0    steps: 201     evaluation reward: 1.3018867924528301
episode: 53   score: 0.0   memory length: 1024   epsilon: 1.0    steps: 169     evaluation reward: 1.2777777777777777
episode: 54   score: 0.0   memory length: 1024   epsilon: 1.0    steps: 170     evaluation reward: 1.2545454545454546
episode: 55   score: 0

episode: 95   score: 1.0   memory length: 1024   epsilon: 1.0    steps: 237     evaluation reward: 1.1666666666666667
episode: 96   score: 0.0   memory length: 1024   epsilon: 1.0    steps: 167     evaluation reward: 1.1546391752577319
episode: 97   score: 1.0   memory length: 1024   epsilon: 1.0    steps: 211     evaluation reward: 1.153061224489796
Training network. lr: 0.000250. clip: 0.099811
Iteration 64: Policy loss: 0.000791. Value loss: 0.015426. Entropy: 1.355216.
Iteration 65: Policy loss: 0.001012. Value loss: 0.014380. Entropy: 1.348447.
Iteration 66: Policy loss: 0.001699. Value loss: 0.013505. Entropy: 1.358719.
episode: 98   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 367     evaluation reward: 1.1818181818181819
episode: 99   score: 2.0   memory length: 1024   epsilon: 1.0    steps: 280     evaluation reward: 1.19
episode: 100   score: 1.0   memory length: 1024   epsilon: 1.0    steps: 210     evaluation reward: 1.19
Training network. lr: 0.000250. clip: 0

episode: 143   score: 0.0   memory length: 1024   epsilon: 1.0    steps: 171     evaluation reward: 1.14
episode: 144   score: 0.0   memory length: 1024   epsilon: 1.0    steps: 183     evaluation reward: 1.12
Training network. lr: 0.000249. clip: 0.099712
Iteration 97: Policy loss: 0.001695. Value loss: 0.014277. Entropy: 1.353512.
Iteration 98: Policy loss: -0.003296. Value loss: 0.011550. Entropy: 1.358497.
Iteration 99: Policy loss: -0.005294. Value loss: 0.009463. Entropy: 1.354294.
episode: 145   score: 1.0   memory length: 1024   epsilon: 1.0    steps: 222     evaluation reward: 1.13
episode: 146   score: 2.0   memory length: 1024   epsilon: 1.0    steps: 276     evaluation reward: 1.14
episode: 147   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 328     evaluation reward: 1.15
episode: 148   score: 1.0   memory length: 1024   epsilon: 1.0    steps: 211     evaluation reward: 1.16
Training network. lr: 0.000249. clip: 0.099703
Iteration 100: Policy loss: 0.003609. Va

Iteration 130: Policy loss: 0.000856. Value loss: 0.027582. Entropy: 1.358510.
Iteration 131: Policy loss: -0.003005. Value loss: 0.018906. Entropy: 1.357869.
Iteration 132: Policy loss: -0.003899. Value loss: 0.015984. Entropy: 1.356809.
episode: 192   score: 0.0   memory length: 1024   epsilon: 1.0    steps: 177     evaluation reward: 1.3
episode: 193   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 449     evaluation reward: 1.32
episode: 194   score: 2.0   memory length: 1024   epsilon: 1.0    steps: 291     evaluation reward: 1.33
Training network. lr: 0.000249. clip: 0.099604
Iteration 133: Policy loss: 0.001044. Value loss: 0.025018. Entropy: 1.365763.
Iteration 134: Policy loss: -0.005035. Value loss: 0.019862. Entropy: 1.365883.
Iteration 135: Policy loss: -0.007619. Value loss: 0.017555. Entropy: 1.358311.
episode: 195   score: 1.0   memory length: 1024   epsilon: 1.0    steps: 214     evaluation reward: 1.33
episode: 196   score: 3.0   memory length: 1024   epsilo

episode: 239   score: 0.0   memory length: 1024   epsilon: 1.0    steps: 184     evaluation reward: 1.25
episode: 240   score: 1.0   memory length: 1024   epsilon: 1.0    steps: 220     evaluation reward: 1.24
episode: 241   score: 2.0   memory length: 1024   epsilon: 1.0    steps: 294     evaluation reward: 1.24
Training network. lr: 0.000249. clip: 0.099505
Iteration 166: Policy loss: 0.002892. Value loss: 0.016123. Entropy: 1.356560.
Iteration 167: Policy loss: -0.004986. Value loss: 0.013055. Entropy: 1.353376.
Iteration 168: Policy loss: -0.010098. Value loss: 0.011506. Entropy: 1.354755.
episode: 242   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 394     evaluation reward: 1.26
episode: 243   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 322     evaluation reward: 1.29
episode: 244   score: 2.0   memory length: 1024   epsilon: 1.0    steps: 306     evaluation reward: 1.31
episode: 245   score: 2.0   memory length: 1024   epsilon: 1.0    steps: 249     eva

Iteration 200: Policy loss: -0.006313. Value loss: 0.013304. Entropy: 1.335308.
Iteration 201: Policy loss: -0.012231. Value loss: 0.011435. Entropy: 1.332890.
episode: 286   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 343     evaluation reward: 1.44
episode: 287   score: 2.0   memory length: 1024   epsilon: 1.0    steps: 270     evaluation reward: 1.43
episode: 288   score: 2.0   memory length: 1024   epsilon: 1.0    steps: 256     evaluation reward: 1.44
episode: 289   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 312     evaluation reward: 1.47
Training network. lr: 0.000248. clip: 0.099397
Iteration 202: Policy loss: 0.005933. Value loss: 0.022496. Entropy: 1.330503.
Iteration 203: Policy loss: -0.002463. Value loss: 0.017232. Entropy: 1.329263.
Iteration 204: Policy loss: -0.008245. Value loss: 0.013967. Entropy: 1.326074.
episode: 290   score: 2.0   memory length: 1024   epsilon: 1.0    steps: 274     evaluation reward: 1.47
episode: 291   score: 3.0   m

episode: 330   score: 1.0   memory length: 1024   epsilon: 1.0    steps: 215     evaluation reward: 1.82
episode: 331   score: 1.0   memory length: 1024   epsilon: 1.0    steps: 205     evaluation reward: 1.8
episode: 332   score: 1.0   memory length: 1024   epsilon: 1.0    steps: 211     evaluation reward: 1.81
episode: 333   score: 1.0   memory length: 1024   epsilon: 1.0    steps: 231     evaluation reward: 1.82
Training network. lr: 0.000248. clip: 0.099289
Iteration 238: Policy loss: -0.000774. Value loss: 0.010376. Entropy: 1.314741.
Iteration 239: Policy loss: -0.009184. Value loss: 0.007226. Entropy: 1.306343.
Iteration 240: Policy loss: -0.017871. Value loss: 0.006412. Entropy: 1.303159.
episode: 334   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 314     evaluation reward: 1.84
episode: 335   score: 1.0   memory length: 1024   epsilon: 1.0    steps: 220     evaluation reward: 1.84
episode: 336   score: 1.0   memory length: 1024   epsilon: 1.0    steps: 213     eva

episode: 376   score: 2.0   memory length: 1024   epsilon: 1.0    steps: 279     evaluation reward: 2.06
Training network. lr: 0.000248. clip: 0.099181
Iteration 274: Policy loss: 0.002666. Value loss: 0.015027. Entropy: 1.295640.
Iteration 275: Policy loss: -0.013597. Value loss: 0.011367. Entropy: 1.299512.
Iteration 276: Policy loss: -0.020172. Value loss: 0.008674. Entropy: 1.292650.
episode: 377   score: 2.0   memory length: 1024   epsilon: 1.0    steps: 271     evaluation reward: 2.07
episode: 378   score: 2.0   memory length: 1024   epsilon: 1.0    steps: 252     evaluation reward: 2.08
episode: 379   score: 0.0   memory length: 1024   epsilon: 1.0    steps: 204     evaluation reward: 2.08
episode: 380   score: 1.0   memory length: 1024   epsilon: 1.0    steps: 226     evaluation reward: 2.08
Training network. lr: 0.000248. clip: 0.099172
Iteration 277: Policy loss: 0.005054. Value loss: 0.021686. Entropy: 1.291196.
Iteration 278: Policy loss: -0.007048. Value loss: 0.016108. En

episode: 421   score: 1.0   memory length: 1024   epsilon: 1.0    steps: 236     evaluation reward: 2.13
Training network. lr: 0.000248. clip: 0.099073
Iteration 310: Policy loss: 0.009015. Value loss: 0.014376. Entropy: 1.220987.
Iteration 311: Policy loss: -0.007546. Value loss: 0.010351. Entropy: 1.225841.
Iteration 312: Policy loss: -0.016346. Value loss: 0.007500. Entropy: 1.232211.
episode: 422   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 540     evaluation reward: 2.17
episode: 423   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 332     evaluation reward: 2.2
Training network. lr: 0.000248. clip: 0.099064
Iteration 313: Policy loss: 0.015961. Value loss: 0.022654. Entropy: 1.166992.
Iteration 314: Policy loss: -0.004754. Value loss: 0.015201. Entropy: 1.169362.
Iteration 315: Policy loss: -0.011294. Value loss: 0.013310. Entropy: 1.166469.
episode: 424   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 323     evaluation reward: 2.22
episode: 

episode: 464   score: 2.0   memory length: 1024   epsilon: 1.0    steps: 289     evaluation reward: 2.36
Training network. lr: 0.000247. clip: 0.098956
Iteration 349: Policy loss: 0.007561. Value loss: 0.019898. Entropy: 1.183044.
Iteration 350: Policy loss: -0.009371. Value loss: 0.013550. Entropy: 1.176934.
Iteration 351: Policy loss: -0.019551. Value loss: 0.011777. Entropy: 1.176751.
episode: 465   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 376     evaluation reward: 2.33
episode: 466   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 325     evaluation reward: 2.32
Training network. lr: 0.000247. clip: 0.098947
Iteration 352: Policy loss: 0.004882. Value loss: 0.024058. Entropy: 1.224743.
Iteration 353: Policy loss: -0.012262. Value loss: 0.016726. Entropy: 1.223568.
Iteration 354: Policy loss: -0.022677. Value loss: 0.013678. Entropy: 1.216209.
episode: 467   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 411     evaluation reward: 2.34
episode:

episode: 507   score: 2.0   memory length: 1024   epsilon: 1.0    steps: 276     evaluation reward: 2.55
Training network. lr: 0.000247. clip: 0.098839
Iteration 388: Policy loss: 0.004707. Value loss: 0.013374. Entropy: 1.235665.
Iteration 389: Policy loss: -0.014416. Value loss: 0.010097. Entropy: 1.224148.
Iteration 390: Policy loss: -0.022300. Value loss: 0.008411. Entropy: 1.226563.
episode: 508   score: 2.0   memory length: 1024   epsilon: 1.0    steps: 278     evaluation reward: 2.56
episode: 509   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 349     evaluation reward: 2.57
episode: 510   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 484     evaluation reward: 2.62
Training network. lr: 0.000247. clip: 0.098830
Iteration 391: Policy loss: 0.004065. Value loss: 0.021599. Entropy: 1.204677.
Iteration 392: Policy loss: -0.008531. Value loss: 0.015622. Entropy: 1.210770.
Iteration 393: Policy loss: -0.015618. Value loss: 0.013236. Entropy: 1.198266.
episode:

Iteration 429: Policy loss: -0.022920. Value loss: 0.011876. Entropy: 1.228173.
episode: 548   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 360     evaluation reward: 2.83
episode: 549   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 377     evaluation reward: 2.86
Training network. lr: 0.000247. clip: 0.098713
Iteration 430: Policy loss: 0.007654. Value loss: 0.025837. Entropy: 1.214751.
Iteration 431: Policy loss: -0.009846. Value loss: 0.016657. Entropy: 1.192993.
Iteration 432: Policy loss: -0.017719. Value loss: 0.013763. Entropy: 1.205499.
episode: 550   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 318     evaluation reward: 2.87
episode: 551   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 321     evaluation reward: 2.9
episode: 552   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 504     evaluation reward: 2.94
Training network. lr: 0.000247. clip: 0.098704
Iteration 433: Policy loss: 0.009740. Value loss: 0.024770. Ent

Iteration 471: Policy loss: -0.022176. Value loss: 0.013533. Entropy: 1.161919.
episode: 588   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 497     evaluation reward: 3.2
episode: 589   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 328     evaluation reward: 3.22
episode: 590   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 367     evaluation reward: 3.21
episode: 591   score: 1.0   memory length: 1024   epsilon: 1.0    steps: 214     evaluation reward: 3.19
Training network. lr: 0.000246. clip: 0.098587
Iteration 472: Policy loss: 0.006619. Value loss: 0.022121. Entropy: 1.196375.
Iteration 473: Policy loss: -0.011832. Value loss: 0.016160. Entropy: 1.191671.
Iteration 474: Policy loss: -0.025891. Value loss: 0.013092. Entropy: 1.187290.
episode: 592   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 437     evaluation reward: 3.23
episode: 593   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 440     evaluation reward: 3.25
Train

episode: 628   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 429     evaluation reward: 3.69
episode: 629   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 275     evaluation reward: 3.7
Training network. lr: 0.000246. clip: 0.098461
Iteration 514: Policy loss: 0.010526. Value loss: 0.016647. Entropy: 1.089678.
Iteration 515: Policy loss: -0.008598. Value loss: 0.011972. Entropy: 1.086168.
Iteration 516: Policy loss: -0.020152. Value loss: 0.009361. Entropy: 1.086967.
episode: 630   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 366     evaluation reward: 3.69
episode: 631   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 480     evaluation reward: 3.72
episode: 632   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 397     evaluation reward: 3.74
Training network. lr: 0.000246. clip: 0.098452
Iteration 517: Policy loss: 0.001887. Value loss: 0.019747. Entropy: 1.116406.
Iteration 518: Policy loss: -0.018419. Value loss: 0.013009. Ent

episode: 668   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 394     evaluation reward: 3.89
episode: 669   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 314     evaluation reward: 3.88
Training network. lr: 0.000246. clip: 0.098335
Iteration 556: Policy loss: 0.008145. Value loss: 0.023488. Entropy: 1.088807.
Iteration 557: Policy loss: -0.005740. Value loss: 0.017150. Entropy: 1.087662.
Iteration 558: Policy loss: -0.022760. Value loss: 0.013826. Entropy: 1.093592.
episode: 670   score: 2.0   memory length: 1024   epsilon: 1.0    steps: 252     evaluation reward: 3.87
episode: 671   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 360     evaluation reward: 3.89
episode: 672   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 376     evaluation reward: 3.9
Training network. lr: 0.000246. clip: 0.098326
Iteration 559: Policy loss: 0.002029. Value loss: 0.019374. Entropy: 1.097467.
Iteration 560: Policy loss: -0.020867. Value loss: 0.013632. Ent

Iteration 599: Policy loss: -0.014960. Value loss: 0.016919. Entropy: 0.976886.
Iteration 600: Policy loss: -0.029066. Value loss: 0.014233. Entropy: 0.973951.
episode: 707   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 585     evaluation reward: 4.06
episode: 708   score: 10.0   memory length: 1024   epsilon: 1.0    steps: 525     evaluation reward: 4.14
Training network. lr: 0.000246. clip: 0.098200
Iteration 601: Policy loss: 0.007396. Value loss: 0.062610. Entropy: 1.049932.
Iteration 602: Policy loss: -0.008206. Value loss: 0.047049. Entropy: 1.048678.
Iteration 603: Policy loss: -0.015444. Value loss: 0.034708. Entropy: 1.036131.
episode: 709   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 346     evaluation reward: 4.11
episode: 710   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 392     evaluation reward: 4.14
episode: 711   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 464     evaluation reward: 4.18
Training network. lr: 0.0002

Training network. lr: 0.000245. clip: 0.098074
Iteration 643: Policy loss: 0.008991. Value loss: 0.024199. Entropy: 0.952169.
Iteration 644: Policy loss: -0.011966. Value loss: 0.016880. Entropy: 0.936028.
Iteration 645: Policy loss: -0.022576. Value loss: 0.013873. Entropy: 0.941839.
episode: 746   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 429     evaluation reward: 4.29
episode: 747   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 493     evaluation reward: 4.33
Training network. lr: 0.000245. clip: 0.098065
Iteration 646: Policy loss: 0.013235. Value loss: 0.021108. Entropy: 1.039946.
Iteration 647: Policy loss: -0.008801. Value loss: 0.014363. Entropy: 1.032929.
Iteration 648: Policy loss: -0.025042. Value loss: 0.011452. Entropy: 1.029423.
episode: 748   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 442     evaluation reward: 4.36
episode: 749   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 314     evaluation reward: 4.36
episode:

Training network. lr: 0.000245. clip: 0.097948
Iteration 685: Policy loss: 0.004310. Value loss: 0.016884. Entropy: 0.778091.
Iteration 686: Policy loss: -0.010412. Value loss: 0.010778. Entropy: 0.796954.
Iteration 687: Policy loss: -0.018366. Value loss: 0.008159. Entropy: 0.807277.
episode: 786   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 348     evaluation reward: 4.39
episode: 787   score: 2.0   memory length: 1024   epsilon: 1.0    steps: 261     evaluation reward: 4.35
episode: 788   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 411     evaluation reward: 4.34
Training network. lr: 0.000245. clip: 0.097939
Iteration 688: Policy loss: 0.010480. Value loss: 0.016744. Entropy: 0.881528.
Iteration 689: Policy loss: -0.009053. Value loss: 0.012251. Entropy: 0.864681.
Iteration 690: Policy loss: -0.020992. Value loss: 0.009306. Entropy: 0.879114.
episode: 789   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 415     evaluation reward: 4.35
episode:

episode: 826   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 455     evaluation reward: 3.99
Training network. lr: 0.000245. clip: 0.097822
Iteration 727: Policy loss: 0.011500. Value loss: 0.014353. Entropy: 0.888078.
Iteration 728: Policy loss: -0.010930. Value loss: 0.010278. Entropy: 0.890605.
Iteration 729: Policy loss: -0.020773. Value loss: 0.008391. Entropy: 0.893369.
episode: 827   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 387     evaluation reward: 4.01
episode: 828   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 308     evaluation reward: 3.96
episode: 829   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 381     evaluation reward: 3.98
Training network. lr: 0.000245. clip: 0.097813
Iteration 730: Policy loss: 0.013815. Value loss: 0.014631. Entropy: 0.941853.
Iteration 731: Policy loss: -0.009023. Value loss: 0.009758. Entropy: 0.946997.
Iteration 732: Policy loss: -0.025687. Value loss: 0.008500. Entropy: 0.935238.
now time

Iteration 771: Policy loss: -0.016969. Value loss: 0.014608. Entropy: 0.889086.
episode: 864   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 343     evaluation reward: 3.98
episode: 865   score: 2.0   memory length: 1024   epsilon: 1.0    steps: 250     evaluation reward: 3.95
episode: 866   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 441     evaluation reward: 3.97
Training network. lr: 0.000244. clip: 0.097687
Iteration 772: Policy loss: 0.006539. Value loss: 0.014685. Entropy: 0.993245.
Iteration 773: Policy loss: -0.016453. Value loss: 0.010527. Entropy: 0.987869.
Iteration 774: Policy loss: -0.023817. Value loss: 0.008086. Entropy: 0.988609.
episode: 867   score: 2.0   memory length: 1024   epsilon: 1.0    steps: 250     evaluation reward: 3.95
episode: 868   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 340     evaluation reward: 3.95
episode: 869   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 424     evaluation reward: 3.93
Trai

Training network. lr: 0.000244. clip: 0.097561
Iteration 814: Policy loss: 0.012395. Value loss: 0.015267. Entropy: 0.876491.
Iteration 815: Policy loss: -0.006606. Value loss: 0.011315. Entropy: 0.871247.
Iteration 816: Policy loss: -0.019661. Value loss: 0.008950. Entropy: 0.868152.
episode: 904   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 409     evaluation reward: 4.05
episode: 905   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 408     evaluation reward: 4.06
episode: 906   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 395     evaluation reward: 4.06
Training network. lr: 0.000244. clip: 0.097552
Iteration 817: Policy loss: 0.003445. Value loss: 0.015451. Entropy: 0.912970.
Iteration 818: Policy loss: -0.012175. Value loss: 0.011975. Entropy: 0.911690.
Iteration 819: Policy loss: -0.020789. Value loss: 0.010706. Entropy: 0.900104.
episode: 907   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 369     evaluation reward: 4.08
episode:

Iteration 859: Policy loss: 0.010998. Value loss: 0.024943. Entropy: 0.885631.
Iteration 860: Policy loss: -0.011381. Value loss: 0.017732. Entropy: 0.884178.
Iteration 861: Policy loss: -0.021254. Value loss: 0.012719. Entropy: 0.883739.
episode: 941   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 489     evaluation reward: 4.53
episode: 942   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 302     evaluation reward: 4.52
Training network. lr: 0.000244. clip: 0.097417
Iteration 862: Policy loss: 0.012476. Value loss: 0.023320. Entropy: 0.892411.
Iteration 863: Policy loss: -0.008488. Value loss: 0.016640. Entropy: 0.893884.
Iteration 864: Policy loss: -0.016227. Value loss: 0.013902. Entropy: 0.898849.
episode: 943   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 453     evaluation reward: 4.52
episode: 944   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 420     evaluation reward: 4.51
episode: 945   score: 4.0   memory length: 1024   epsil

episode: 976   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 512     evaluation reward: 5.17
Training network. lr: 0.000243. clip: 0.097282
Iteration 907: Policy loss: 0.011576. Value loss: 0.021388. Entropy: 0.850032.
Iteration 908: Policy loss: -0.008367. Value loss: 0.015684. Entropy: 0.850560.
Iteration 909: Policy loss: -0.020079. Value loss: 0.012697. Entropy: 0.847494.
episode: 977   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 499     evaluation reward: 5.2
episode: 978   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 572     evaluation reward: 5.24
Training network. lr: 0.000243. clip: 0.097273
Iteration 910: Policy loss: 0.013904. Value loss: 0.022457. Entropy: 0.901448.
Iteration 911: Policy loss: -0.007821. Value loss: 0.016684. Entropy: 0.862777.
Iteration 912: Policy loss: -0.018443. Value loss: 0.014051. Entropy: 0.865607.
episode: 979   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 594     evaluation reward: 5.28
episode: 

episode: 1012   score: 2.0   memory length: 1024   epsilon: 1.0    steps: 285     evaluation reward: 5.48
episode: 1013   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 358     evaluation reward: 5.49
Training network. lr: 0.000243. clip: 0.097138
Iteration 955: Policy loss: 0.008760. Value loss: 0.020274. Entropy: 0.822965.
Iteration 956: Policy loss: -0.007781. Value loss: 0.015018. Entropy: 0.815059.
Iteration 957: Policy loss: -0.021386. Value loss: 0.012200. Entropy: 0.808836.
episode: 1014   score: 9.0   memory length: 1024   epsilon: 1.0    steps: 646     evaluation reward: 5.48
episode: 1015   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 480     evaluation reward: 5.45
Training network. lr: 0.000243. clip: 0.097129
Iteration 958: Policy loss: 0.016023. Value loss: 0.022177. Entropy: 0.848856.
Iteration 959: Policy loss: -0.010645. Value loss: 0.015507. Entropy: 0.852336.
Iteration 960: Policy loss: -0.020329. Value loss: 0.012657. Entropy: 0.853227.
epis

Iteration 999: Policy loss: -0.022106. Value loss: 0.012634. Entropy: 0.868810.
episode: 1050   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 504     evaluation reward: 5.14
episode: 1051   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 384     evaluation reward: 5.13
episode: 1052   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 484     evaluation reward: 5.1
Training network. lr: 0.000243. clip: 0.097003
Iteration 1000: Policy loss: 0.012537. Value loss: 0.019351. Entropy: 0.831502.
Iteration 1001: Policy loss: -0.016549. Value loss: 0.013697. Entropy: 0.830711.
Iteration 1002: Policy loss: -0.024015. Value loss: 0.010984. Entropy: 0.832556.
episode: 1053   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 623     evaluation reward: 5.1
episode: 1054   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 408     evaluation reward: 5.1
Training network. lr: 0.000242. clip: 0.096994
Iteration 1003: Policy loss: 0.013506. Value loss: 0.0213

Iteration 1043: Policy loss: -0.010744. Value loss: 0.018480. Entropy: 0.812152.
Iteration 1044: Policy loss: -0.023799. Value loss: 0.014360. Entropy: 0.809356.
episode: 1087   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 558     evaluation reward: 4.78
episode: 1088   score: 2.0   memory length: 1024   epsilon: 1.0    steps: 258     evaluation reward: 4.73
episode: 1089   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 421     evaluation reward: 4.74
Training network. lr: 0.000242. clip: 0.096868
Iteration 1045: Policy loss: 0.010563. Value loss: 0.022569. Entropy: 0.875363.
Iteration 1046: Policy loss: -0.010523. Value loss: 0.014894. Entropy: 0.871239.
Iteration 1047: Policy loss: -0.022097. Value loss: 0.011160. Entropy: 0.851476.
episode: 1090   score: 2.0   memory length: 1024   epsilon: 1.0    steps: 239     evaluation reward: 4.71
episode: 1091   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 410     evaluation reward: 4.7
episode: 1092   scor

episode: 1125   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 413     evaluation reward: 4.42
episode: 1126   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 381     evaluation reward: 4.4
Training network. lr: 0.000242. clip: 0.096742
Iteration 1087: Policy loss: 0.012565. Value loss: 0.020055. Entropy: 0.865530.
Iteration 1088: Policy loss: -0.013439. Value loss: 0.016422. Entropy: 0.863944.
Iteration 1089: Policy loss: -0.027970. Value loss: 0.011870. Entropy: 0.858428.
episode: 1127   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 371     evaluation reward: 4.42
episode: 1128   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 502     evaluation reward: 4.4
Training network. lr: 0.000242. clip: 0.096733
Iteration 1090: Policy loss: 0.009722. Value loss: 0.024910. Entropy: 0.975217.
Iteration 1091: Policy loss: -0.013767. Value loss: 0.016980. Entropy: 0.967840.
Iteration 1092: Policy loss: -0.029485. Value loss: 0.013794. Entropy: 0.960057.


episode: 1162   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 344     evaluation reward: 4.54
episode: 1163   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 350     evaluation reward: 4.53
Training network. lr: 0.000242. clip: 0.096607
Iteration 1132: Policy loss: 0.012514. Value loss: 0.014448. Entropy: 0.819892.
Iteration 1133: Policy loss: -0.010321. Value loss: 0.010322. Entropy: 0.799826.
Iteration 1134: Policy loss: -0.020964. Value loss: 0.008806. Entropy: 0.799889.
episode: 1164   score: 10.0   memory length: 1024   epsilon: 1.0    steps: 565     evaluation reward: 4.58
episode: 1165   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 448     evaluation reward: 4.59
Training network. lr: 0.000241. clip: 0.096598
Iteration 1135: Policy loss: 0.015421. Value loss: 0.046471. Entropy: 0.819187.
Iteration 1136: Policy loss: -0.003158. Value loss: 0.035085. Entropy: 0.826189.
Iteration 1137: Policy loss: -0.014274. Value loss: 0.030726. Entropy: 0.82885

episode: 1199   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 524     evaluation reward: 4.71
episode: 1200   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 342     evaluation reward: 4.7
Training network. lr: 0.000241. clip: 0.096472
Iteration 1177: Policy loss: 0.008438. Value loss: 0.018523. Entropy: 0.811035.
Iteration 1178: Policy loss: -0.006226. Value loss: 0.013263. Entropy: 0.807521.
Iteration 1179: Policy loss: -0.022941. Value loss: 0.011607. Entropy: 0.805975.
episode: 1201   score: 9.0   memory length: 1024   epsilon: 1.0    steps: 661     evaluation reward: 4.77
Training network. lr: 0.000241. clip: 0.096463
Iteration 1180: Policy loss: 0.009001. Value loss: 0.017899. Entropy: 0.804773.
Iteration 1181: Policy loss: -0.008245. Value loss: 0.012809. Entropy: 0.796705.
Iteration 1182: Policy loss: -0.022684. Value loss: 0.010263. Entropy: 0.791877.
episode: 1202   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 451     evaluation reward: 4.78

Training network. lr: 0.000241. clip: 0.096337
Iteration 1222: Policy loss: 0.008141. Value loss: 0.019961. Entropy: 0.774977.
Iteration 1223: Policy loss: -0.013692. Value loss: 0.015923. Entropy: 0.773141.
Iteration 1224: Policy loss: -0.019706. Value loss: 0.013590. Entropy: 0.762391.
episode: 1236   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 351     evaluation reward: 4.95
episode: 1237   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 421     evaluation reward: 4.96
episode: 1238   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 501     evaluation reward: 4.97
Training network. lr: 0.000241. clip: 0.096328
Iteration 1225: Policy loss: 0.007262. Value loss: 0.018128. Entropy: 0.810985.
Iteration 1226: Policy loss: -0.010516. Value loss: 0.011577. Entropy: 0.807455.
Iteration 1227: Policy loss: -0.027913. Value loss: 0.008950. Entropy: 0.810214.
episode: 1239   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 331     evaluation reward: 4.9

episode: 1273   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 398     evaluation reward: 4.91
episode: 1274   score: 2.0   memory length: 1024   epsilon: 1.0    steps: 246     evaluation reward: 4.85
Training network. lr: 0.000241. clip: 0.096202
Iteration 1267: Policy loss: 0.009608. Value loss: 0.017515. Entropy: 0.871477.
Iteration 1268: Policy loss: -0.009360. Value loss: 0.012639. Entropy: 0.889411.
Iteration 1269: Policy loss: -0.028124. Value loss: 0.010757. Entropy: 0.877365.
episode: 1275   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 477     evaluation reward: 4.84
episode: 1276   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 572     evaluation reward: 4.81
Training network. lr: 0.000240. clip: 0.096193
Iteration 1270: Policy loss: 0.033792. Value loss: 0.034226. Entropy: 0.900452.
Iteration 1271: Policy loss: 0.004066. Value loss: 0.011120. Entropy: 0.894218.
Iteration 1272: Policy loss: -0.014705. Value loss: 0.010510. Entropy: 0.888059.

Iteration 1309: Policy loss: 0.011047. Value loss: 0.020501. Entropy: 0.809393.
Iteration 1310: Policy loss: -0.012759. Value loss: 0.015155. Entropy: 0.810217.
Iteration 1311: Policy loss: -0.021661. Value loss: 0.012595. Entropy: 0.801782.
episode: 1312   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 568     evaluation reward: 4.6
episode: 1313   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 326     evaluation reward: 4.57
Training network. lr: 0.000240. clip: 0.096067
Iteration 1312: Policy loss: 0.014457. Value loss: 0.016546. Entropy: 0.706377.
Iteration 1313: Policy loss: -0.005671. Value loss: 0.014155. Entropy: 0.697767.
Iteration 1314: Policy loss: -0.019699. Value loss: 0.010892. Entropy: 0.696981.
episode: 1314   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 497     evaluation reward: 4.55
episode: 1315   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 471     evaluation reward: 4.54
Training network. lr: 0.000240. clip: 0.096058

Iteration 1354: Policy loss: 0.006944. Value loss: 0.017690. Entropy: 0.844730.
Iteration 1355: Policy loss: -0.011350. Value loss: 0.013028. Entropy: 0.838928.
Iteration 1356: Policy loss: -0.025742. Value loss: 0.010367. Entropy: 0.835970.
episode: 1349   score: 9.0   memory length: 1024   epsilon: 1.0    steps: 601     evaluation reward: 4.46
Training network. lr: 0.000240. clip: 0.095932
Iteration 1357: Policy loss: 0.010788. Value loss: 0.020870. Entropy: 0.825567.
Iteration 1358: Policy loss: -0.009295. Value loss: 0.014248. Entropy: 0.818267.
Iteration 1359: Policy loss: -0.026991. Value loss: 0.011467. Entropy: 0.816529.
episode: 1350   score: 13.0   memory length: 1024   epsilon: 1.0    steps: 687     evaluation reward: 4.55
episode: 1351   score: 2.0   memory length: 1024   epsilon: 1.0    steps: 275     evaluation reward: 4.51
episode: 1352   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 363     evaluation reward: 4.49
Training network. lr: 0.000240. clip: 0.0959

episode: 1384   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 448     evaluation reward: 4.76
Training network. lr: 0.000239. clip: 0.095797
Iteration 1402: Policy loss: 0.009656. Value loss: 0.021603. Entropy: 0.752489.
Iteration 1403: Policy loss: -0.010448. Value loss: 0.014920. Entropy: 0.755571.
Iteration 1404: Policy loss: -0.021494. Value loss: 0.012025. Entropy: 0.743659.
episode: 1385   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 587     evaluation reward: 4.81
episode: 1386   score: 2.0   memory length: 1024   epsilon: 1.0    steps: 258     evaluation reward: 4.79
episode: 1387   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 395     evaluation reward: 4.81
episode: 1388   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 308     evaluation reward: 4.82
Training network. lr: 0.000239. clip: 0.095788
Iteration 1405: Policy loss: 0.004711. Value loss: 0.018326. Entropy: 0.753437.
Iteration 1406: Policy loss: -0.015587. Value loss: 0.

episode: 1421   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 441     evaluation reward: 5.07
Training network. lr: 0.000239. clip: 0.095662
Iteration 1447: Policy loss: 0.008163. Value loss: 0.018839. Entropy: 0.791189.
Iteration 1448: Policy loss: -0.007210. Value loss: 0.014105. Entropy: 0.792789.
Iteration 1449: Policy loss: -0.023171. Value loss: 0.011814. Entropy: 0.796964.
episode: 1422   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 452     evaluation reward: 5.09
episode: 1423   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 353     evaluation reward: 5.1
episode: 1424   score: 1.0   memory length: 1024   epsilon: 1.0    steps: 205     evaluation reward: 5.04
Training network. lr: 0.000239. clip: 0.095653
Iteration 1450: Policy loss: 0.010006. Value loss: 0.018920. Entropy: 0.832936.
Iteration 1451: Policy loss: -0.009920. Value loss: 0.013616. Entropy: 0.836748.
Iteration 1452: Policy loss: -0.023197. Value loss: 0.010778. Entropy: 0.820325.

episode: 1455   score: 10.0   memory length: 1024   epsilon: 1.0    steps: 561     evaluation reward: 5.35
episode: 1456   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 411     evaluation reward: 5.34
Training network. lr: 0.000239. clip: 0.095518
Iteration 1495: Policy loss: 0.013040. Value loss: 0.048713. Entropy: 0.825261.
Iteration 1496: Policy loss: -0.006665. Value loss: 0.038332. Entropy: 0.834109.
Iteration 1497: Policy loss: -0.012826. Value loss: 0.033760. Entropy: 0.818998.
episode: 1457   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 458     evaluation reward: 5.34
episode: 1458   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 361     evaluation reward: 5.33
Training network. lr: 0.000239. clip: 0.095509
Iteration 1498: Policy loss: 0.010258. Value loss: 0.023192. Entropy: 0.848072.
Iteration 1499: Policy loss: -0.011159. Value loss: 0.015274. Entropy: 0.836074.
Iteration 1500: Policy loss: -0.025159. Value loss: 0.014323. Entropy: 0.82900

Iteration 1542: Policy loss: -0.027359. Value loss: 0.014137. Entropy: 0.908321.
episode: 1490   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 494     evaluation reward: 5.47
episode: 1491   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 450     evaluation reward: 5.45
episode: 1492   score: 1.0   memory length: 1024   epsilon: 1.0    steps: 233     evaluation reward: 5.41
Training network. lr: 0.000238. clip: 0.095374
Iteration 1543: Policy loss: 0.006372. Value loss: 0.020775. Entropy: 0.857714.
Iteration 1544: Policy loss: -0.011251. Value loss: 0.014828. Entropy: 0.858173.
Iteration 1545: Policy loss: -0.025936. Value loss: 0.012753. Entropy: 0.846334.
episode: 1493   score: 10.0   memory length: 1024   epsilon: 1.0    steps: 692     evaluation reward: 5.44
episode: 1494   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 454     evaluation reward: 5.48
Training network. lr: 0.000238. clip: 0.095365
Iteration 1546: Policy loss: 0.003366. Value loss: 0

Training network. lr: 0.000238. clip: 0.095239
Iteration 1588: Policy loss: 0.009802. Value loss: 0.024785. Entropy: 0.809793.
Iteration 1589: Policy loss: -0.010528. Value loss: 0.018981. Entropy: 0.798693.
Iteration 1590: Policy loss: -0.021847. Value loss: 0.015375. Entropy: 0.804750.
episode: 1526   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 437     evaluation reward: 5.71
episode: 1527   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 467     evaluation reward: 5.69
Training network. lr: 0.000238. clip: 0.095230
Iteration 1591: Policy loss: 0.007215. Value loss: 0.020751. Entropy: 0.817367.
Iteration 1592: Policy loss: -0.013453. Value loss: 0.016202. Entropy: 0.812631.
Iteration 1593: Policy loss: -0.028324. Value loss: 0.012939. Entropy: 0.807170.
episode: 1528   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 620     evaluation reward: 5.73
episode: 1529   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 558     evaluation reward: 5.6

Iteration 1638: Policy loss: -0.027292. Value loss: 0.013952. Entropy: 0.819785.
episode: 1558   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 555     evaluation reward: 5.85
episode: 1559   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 427     evaluation reward: 5.8
episode: 1560   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 398     evaluation reward: 5.8
Training network. lr: 0.000238. clip: 0.095086
Iteration 1639: Policy loss: 0.009146. Value loss: 0.021654. Entropy: 0.840994.
Iteration 1640: Policy loss: -0.012995. Value loss: 0.015960. Entropy: 0.839430.
Iteration 1641: Policy loss: -0.022670. Value loss: 0.012799. Entropy: 0.832768.
episode: 1561   score: 11.0   memory length: 1024   epsilon: 1.0    steps: 696     evaluation reward: 5.84
Training network. lr: 0.000238. clip: 0.095077
Iteration 1642: Policy loss: 0.004937. Value loss: 0.021391. Entropy: 0.765707.
Iteration 1643: Policy loss: -0.010767. Value loss: 0.014561. Entropy: 0.765151.

Training network. lr: 0.000237. clip: 0.094942
Iteration 1687: Policy loss: 0.009070. Value loss: 0.016440. Entropy: 0.637769.
Iteration 1688: Policy loss: -0.010778. Value loss: 0.012190. Entropy: 0.648292.
Iteration 1689: Policy loss: -0.019149. Value loss: 0.010758. Entropy: 0.639824.
episode: 1591   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 521     evaluation reward: 6.37
episode: 1592   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 558     evaluation reward: 6.43
Training network. lr: 0.000237. clip: 0.094933
Iteration 1690: Policy loss: 0.008745. Value loss: 0.019361. Entropy: 0.672722.
Iteration 1691: Policy loss: -0.008540. Value loss: 0.013357. Entropy: 0.675218.
Iteration 1692: Policy loss: -0.019865. Value loss: 0.010845. Entropy: 0.671289.
episode: 1593   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 541     evaluation reward: 6.39
episode: 1594   score: 9.0   memory length: 1024   epsilon: 1.0    steps: 644     evaluation reward: 6.4

Training network. lr: 0.000237. clip: 0.094789
Iteration 1738: Policy loss: 0.016547. Value loss: 0.034204. Entropy: 0.690071.
Iteration 1739: Policy loss: -0.008633. Value loss: 0.026619. Entropy: 0.700312.
Iteration 1740: Policy loss: -0.017667. Value loss: 0.022370. Entropy: 0.692164.
episode: 1622   score: 9.0   memory length: 1024   epsilon: 1.0    steps: 610     evaluation reward: 6.81
episode: 1623   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 508     evaluation reward: 6.82
Training network. lr: 0.000237. clip: 0.094780
Iteration 1741: Policy loss: 0.008237. Value loss: 0.019456. Entropy: 0.717944.
Iteration 1742: Policy loss: -0.010908. Value loss: 0.014147. Entropy: 0.714642.
Iteration 1743: Policy loss: -0.022420. Value loss: 0.011230. Entropy: 0.720054.
episode: 1624   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 509     evaluation reward: 6.84
episode: 1625   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 563     evaluation reward: 6.8

Iteration 1786: Policy loss: 0.010602. Value loss: 0.047249. Entropy: 0.799429.
Iteration 1787: Policy loss: -0.003121. Value loss: 0.039370. Entropy: 0.793058.
Iteration 1788: Policy loss: -0.016522. Value loss: 0.031707. Entropy: 0.788974.
episode: 1655   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 453     evaluation reward: 7.02
episode: 1656   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 594     evaluation reward: 7.02
Training network. lr: 0.000237. clip: 0.094636
Iteration 1789: Policy loss: 0.009022. Value loss: 0.024115. Entropy: 0.775667.
Iteration 1790: Policy loss: -0.011769. Value loss: 0.019193. Entropy: 0.764524.
Iteration 1791: Policy loss: -0.023262. Value loss: 0.016067. Entropy: 0.763424.
episode: 1657   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 468     evaluation reward: 7.03
Training network. lr: 0.000237. clip: 0.094627
Iteration 1792: Policy loss: 0.008167. Value loss: 0.024152. Entropy: 0.681536.
Iteration 1793: Policy l

Training network. lr: 0.000236. clip: 0.094492
Iteration 1837: Policy loss: 0.010777. Value loss: 0.028933. Entropy: 0.734801.
Iteration 1838: Policy loss: -0.010436. Value loss: 0.022825. Entropy: 0.754028.
Iteration 1839: Policy loss: -0.024045. Value loss: 0.019159. Entropy: 0.750539.
episode: 1687   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 456     evaluation reward: 7.21
episode: 1688   score: 9.0   memory length: 1024   epsilon: 1.0    steps: 656     evaluation reward: 7.22
Training network. lr: 0.000236. clip: 0.094483
Iteration 1840: Policy loss: 0.012167. Value loss: 0.019356. Entropy: 0.741945.
Iteration 1841: Policy loss: -0.007180. Value loss: 0.015503. Entropy: 0.738160.
Iteration 1842: Policy loss: -0.019476. Value loss: 0.012300. Entropy: 0.740586.
episode: 1689   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 619     evaluation reward: 7.22
Training network. lr: 0.000236. clip: 0.094474
Iteration 1843: Policy loss: 0.007534. Value loss: 0.0244

episode: 1718   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 579     evaluation reward: 7.26
Training network. lr: 0.000236. clip: 0.094339
Iteration 1888: Policy loss: 0.008875. Value loss: 0.033358. Entropy: 0.878953.
Iteration 1889: Policy loss: -0.008651. Value loss: 0.021222. Entropy: 0.864872.
Iteration 1890: Policy loss: -0.012495. Value loss: 0.016944. Entropy: 0.860854.
episode: 1719   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 455     evaluation reward: 7.27
episode: 1720   score: 10.0   memory length: 1024   epsilon: 1.0    steps: 566     evaluation reward: 7.25
Training network. lr: 0.000236. clip: 0.094330
Iteration 1891: Policy loss: 0.006976. Value loss: 0.048517. Entropy: 0.797968.
Iteration 1892: Policy loss: -0.008255. Value loss: 0.035645. Entropy: 0.782167.
Iteration 1893: Policy loss: -0.018857. Value loss: 0.029227. Entropy: 0.770229.
episode: 1721   score: 9.0   memory length: 1024   epsilon: 1.0    steps: 625     evaluation reward: 7.

episode: 1749   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 514     evaluation reward: 7.71
Training network. lr: 0.000235. clip: 0.094186
Iteration 1939: Policy loss: 0.009349. Value loss: 0.040854. Entropy: 0.888805.
Iteration 1940: Policy loss: -0.009078. Value loss: 0.027729. Entropy: 0.879776.
Iteration 1941: Policy loss: -0.025562. Value loss: 0.022684. Entropy: 0.871891.
episode: 1750   score: 9.0   memory length: 1024   epsilon: 1.0    steps: 584     evaluation reward: 7.71
episode: 1751   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 453     evaluation reward: 7.73
Training network. lr: 0.000235. clip: 0.094177
Iteration 1942: Policy loss: 0.009047. Value loss: 0.024982. Entropy: 0.815590.
Iteration 1943: Policy loss: -0.011508. Value loss: 0.018117. Entropy: 0.833230.
Iteration 1944: Policy loss: -0.024519. Value loss: 0.014095. Entropy: 0.824665.
episode: 1752   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 388     evaluation reward: 7.7

episode: 1780   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 475     evaluation reward: 7.64
Training network. lr: 0.000235. clip: 0.094033
Iteration 1990: Policy loss: 0.009850. Value loss: 0.018082. Entropy: 0.801488.
Iteration 1991: Policy loss: -0.012278. Value loss: 0.013577. Entropy: 0.798291.
Iteration 1992: Policy loss: -0.023906. Value loss: 0.010870. Entropy: 0.787063.
episode: 1781   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 587     evaluation reward: 7.65
episode: 1782   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 543     evaluation reward: 7.7
Training network. lr: 0.000235. clip: 0.094024
Iteration 1993: Policy loss: 0.010935. Value loss: 0.036784. Entropy: 0.817348.
Iteration 1994: Policy loss: -0.010976. Value loss: 0.024436. Entropy: 0.820655.
Iteration 1995: Policy loss: -0.025461. Value loss: 0.018604. Entropy: 0.807301.
episode: 1783   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 403     evaluation reward: 7.62

Iteration 2040: Policy loss: -0.022740. Value loss: 0.032369. Entropy: 0.774762.
episode: 1812   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 646     evaluation reward: 7.65
episode: 1813   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 406     evaluation reward: 7.62
Training network. lr: 0.000235. clip: 0.093880
Iteration 2041: Policy loss: 0.012261. Value loss: 0.020052. Entropy: 0.801726.
Iteration 2042: Policy loss: -0.009315. Value loss: 0.013668. Entropy: 0.784484.
Iteration 2043: Policy loss: -0.026737. Value loss: 0.011078. Entropy: 0.790429.
episode: 1814   score: 11.0   memory length: 1024   epsilon: 1.0    steps: 733     evaluation reward: 7.62
episode: 1815   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 590     evaluation reward: 7.61
Training network. lr: 0.000235. clip: 0.093871
Iteration 2044: Policy loss: 0.012081. Value loss: 0.021785. Entropy: 0.779667.
Iteration 2045: Policy loss: -0.008470. Value loss: 0.015447. Entropy: 0.77812

Iteration 2091: Policy loss: -0.026618. Value loss: 0.012769. Entropy: 0.756992.
episode: 1843   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 484     evaluation reward: 7.55
episode: 1844   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 610     evaluation reward: 7.54
Training network. lr: 0.000234. clip: 0.093727
Iteration 2092: Policy loss: 0.013287. Value loss: 0.020210. Entropy: 0.785651.
Iteration 2093: Policy loss: -0.008662. Value loss: 0.015152. Entropy: 0.783859.
Iteration 2094: Policy loss: -0.020992. Value loss: 0.012561. Entropy: 0.786359.
episode: 1845   score: 10.0   memory length: 1024   epsilon: 1.0    steps: 738     evaluation reward: 7.55
episode: 1846   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 302     evaluation reward: 7.5
Training network. lr: 0.000234. clip: 0.093718
Iteration 2095: Policy loss: 0.006648. Value loss: 0.028025. Entropy: 0.746876.
Iteration 2096: Policy loss: -0.015433. Value loss: 0.018635. Entropy: 0.749454

episode: 1876   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 548     evaluation reward: 7.26
Training network. lr: 0.000234. clip: 0.093583
Iteration 2140: Policy loss: 0.010937. Value loss: 0.019927. Entropy: 0.743115.
Iteration 2141: Policy loss: -0.003272. Value loss: 0.014034. Entropy: 0.733597.
Iteration 2142: Policy loss: -0.020068. Value loss: 0.011542. Entropy: 0.737342.
episode: 1877   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 553     evaluation reward: 7.28
episode: 1878   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 589     evaluation reward: 7.28
Training network. lr: 0.000234. clip: 0.093574
Iteration 2143: Policy loss: 0.011945. Value loss: 0.022159. Entropy: 0.806346.
Iteration 2144: Policy loss: -0.007336. Value loss: 0.016045. Entropy: 0.802415.
Iteration 2145: Policy loss: -0.020903. Value loss: 0.012852. Entropy: 0.797014.
episode: 1879   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 388     evaluation reward: 7.2

Training network. lr: 0.000234. clip: 0.093439
Iteration 2188: Policy loss: 0.010616. Value loss: 0.021865. Entropy: 0.703080.
Iteration 2189: Policy loss: -0.008344. Value loss: 0.014095. Entropy: 0.711887.
Iteration 2190: Policy loss: -0.021293. Value loss: 0.012404. Entropy: 0.711891.
episode: 1910   score: 13.0   memory length: 1024   epsilon: 1.0    steps: 783     evaluation reward: 7.16
Training network. lr: 0.000234. clip: 0.093430
Iteration 2191: Policy loss: 0.007198. Value loss: 0.045847. Entropy: 0.791842.
Iteration 2192: Policy loss: -0.003750. Value loss: 0.028997. Entropy: 0.792632.
Iteration 2193: Policy loss: -0.016864. Value loss: 0.025281. Entropy: 0.790632.
episode: 1911   score: 9.0   memory length: 1024   epsilon: 1.0    steps: 527     evaluation reward: 7.2
episode: 1912   score: 9.0   memory length: 1024   epsilon: 1.0    steps: 630     evaluation reward: 7.21
Training network. lr: 0.000234. clip: 0.093421
Iteration 2194: Policy loss: 0.013827. Value loss: 0.0569

Training network. lr: 0.000233. clip: 0.093286
Iteration 2239: Policy loss: 0.022851. Value loss: 0.019081. Entropy: 0.752867.
Iteration 2240: Policy loss: -0.007983. Value loss: 0.013917. Entropy: 0.754174.
Iteration 2241: Policy loss: -0.021536. Value loss: 0.011455. Entropy: 0.755832.
episode: 1941   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 502     evaluation reward: 7.09
episode: 1942   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 489     evaluation reward: 7.07
Training network. lr: 0.000233. clip: 0.093277
Iteration 2242: Policy loss: 0.008672. Value loss: 0.020313. Entropy: 0.855314.
Iteration 2243: Policy loss: -0.013016. Value loss: 0.014909. Entropy: 0.873946.
Iteration 2244: Policy loss: -0.023887. Value loss: 0.013292. Entropy: 0.856726.
episode: 1943   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 555     evaluation reward: 7.08
episode: 1944   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 501     evaluation reward: 7.0

episode: 1972   score: 10.0   memory length: 1024   epsilon: 1.0    steps: 469     evaluation reward: 7.32
Training network. lr: 0.000233. clip: 0.093133
Iteration 2290: Policy loss: 0.004736. Value loss: 0.036029. Entropy: 0.794461.
Iteration 2291: Policy loss: -0.007338. Value loss: 0.029426. Entropy: 0.792805.
Iteration 2292: Policy loss: -0.021085. Value loss: 0.025095. Entropy: 0.789236.
episode: 1973   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 574     evaluation reward: 7.33
episode: 1974   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 403     evaluation reward: 7.33
Training network. lr: 0.000233. clip: 0.093124
Iteration 2293: Policy loss: 0.009405. Value loss: 0.028188. Entropy: 0.830916.
Iteration 2294: Policy loss: -0.011981. Value loss: 0.019930. Entropy: 0.815290.
Iteration 2295: Policy loss: -0.021837. Value loss: 0.016995. Entropy: 0.809077.
episode: 1975   score: 13.0   memory length: 1024   epsilon: 1.0    steps: 867     evaluation reward: 7

Iteration 2340: Policy loss: -0.022619. Value loss: 0.026250. Entropy: 0.807844.
episode: 2004   score: 11.0   memory length: 1024   epsilon: 1.0    steps: 669     evaluation reward: 7.57
episode: 2005   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 482     evaluation reward: 7.56
Training network. lr: 0.000232. clip: 0.092980
Iteration 2341: Policy loss: 0.007606. Value loss: 0.023997. Entropy: 0.698180.
Iteration 2342: Policy loss: -0.010051. Value loss: 0.015059. Entropy: 0.709601.
Iteration 2343: Policy loss: -0.023314. Value loss: 0.012782. Entropy: 0.702789.
episode: 2006   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 371     evaluation reward: 7.51
now time :  2018-12-26 13:16:13.467821
episode: 2007   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 427     evaluation reward: 7.51
episode: 2008   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 356     evaluation reward: 7.5
Training network. lr: 0.000232. clip: 0.092971
Iteration 2344

Iteration 2388: Policy loss: -0.023322. Value loss: 0.010571. Entropy: 0.822809.
episode: 2038   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 577     evaluation reward: 7.14
Training network. lr: 0.000232. clip: 0.092836
Iteration 2389: Policy loss: 0.006061. Value loss: 0.016506. Entropy: 0.780780.
Iteration 2390: Policy loss: -0.014117. Value loss: 0.012295. Entropy: 0.771479.
Iteration 2391: Policy loss: -0.026441. Value loss: 0.009564. Entropy: 0.769389.
episode: 2039   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 618     evaluation reward: 7.16
episode: 2040   score: 10.0   memory length: 1024   epsilon: 1.0    steps: 705     evaluation reward: 7.18
Training network. lr: 0.000232. clip: 0.092827
Iteration 2392: Policy loss: 0.006073. Value loss: 0.018156. Entropy: 0.785334.
Iteration 2393: Policy loss: -0.012679. Value loss: 0.013687. Entropy: 0.779331.
Iteration 2394: Policy loss: -0.027968. Value loss: 0.010974. Entropy: 0.772200.
episode: 2041   score:

Iteration 2438: Policy loss: -0.009971. Value loss: 0.020371. Entropy: 0.712835.
Iteration 2439: Policy loss: -0.021655. Value loss: 0.016224. Entropy: 0.705213.
episode: 2070   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 562     evaluation reward: 7.08
episode: 2071   score: 10.0   memory length: 1024   epsilon: 1.0    steps: 482     evaluation reward: 7.11
episode: 2072   score: 1.0   memory length: 1024   epsilon: 1.0    steps: 206     evaluation reward: 7.02
Training network. lr: 0.000232. clip: 0.092683
Iteration 2440: Policy loss: 0.013096. Value loss: 0.062221. Entropy: 0.852261.
Iteration 2441: Policy loss: -0.005929. Value loss: 0.040432. Entropy: 0.841650.
Iteration 2442: Policy loss: -0.018766. Value loss: 0.034219. Entropy: 0.838703.
episode: 2073   score: 9.0   memory length: 1024   epsilon: 1.0    steps: 680     evaluation reward: 7.04
Training network. lr: 0.000232. clip: 0.092674
Iteration 2443: Policy loss: 0.010795. Value loss: 0.024600. Entropy: 0.85360

episode: 2103   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 452     evaluation reward: 7.07
Training network. lr: 0.000231. clip: 0.092539
Iteration 2488: Policy loss: 0.003252. Value loss: 0.023786. Entropy: 0.792792.
Iteration 2489: Policy loss: -0.010653. Value loss: 0.018366. Entropy: 0.792709.
Iteration 2490: Policy loss: -0.023688. Value loss: 0.015093. Entropy: 0.791246.
now time :  2018-12-26 13:20:36.800548
episode: 2104   score: 10.0   memory length: 1024   epsilon: 1.0    steps: 592     evaluation reward: 7.06
Training network. lr: 0.000231. clip: 0.092530
Iteration 2491: Policy loss: 0.018582. Value loss: 0.080537. Entropy: 0.714300.
Iteration 2492: Policy loss: -0.005067. Value loss: 0.045746. Entropy: 0.707660.
Iteration 2493: Policy loss: -0.010994. Value loss: 0.038759. Entropy: 0.700814.
episode: 2105   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 581     evaluation reward: 7.07
episode: 2106   score: 8.0   memory length: 1024   epsilon: 1.0 

Iteration 2538: Policy loss: -0.025219. Value loss: 0.013016. Entropy: 0.766449.
episode: 2135   score: 11.0   memory length: 1024   epsilon: 1.0    steps: 587     evaluation reward: 7.22
Training network. lr: 0.000231. clip: 0.092386
Iteration 2539: Policy loss: 0.003482. Value loss: 0.042139. Entropy: 0.704425.
Iteration 2540: Policy loss: -0.008599. Value loss: 0.034137. Entropy: 0.701008.
Iteration 2541: Policy loss: -0.019779. Value loss: 0.029128. Entropy: 0.701642.
episode: 2136   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 452     evaluation reward: 7.24
episode: 2137   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 461     evaluation reward: 7.23
episode: 2138   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 407     evaluation reward: 7.2
Training network. lr: 0.000231. clip: 0.092377
Iteration 2542: Policy loss: 0.001542. Value loss: 0.030031. Entropy: 0.771642.
Iteration 2543: Policy loss: -0.018564. Value loss: 0.023421. Entropy: 0.770251

Iteration 2586: Policy loss: -0.023718. Value loss: 0.012780. Entropy: 0.917789.
episode: 2169   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 477     evaluation reward: 6.85
episode: 2170   score: 10.0   memory length: 1024   epsilon: 1.0    steps: 726     evaluation reward: 6.87
Training network. lr: 0.000231. clip: 0.092242
Iteration 2587: Policy loss: 0.004451. Value loss: 0.016521. Entropy: 0.787754.
Iteration 2588: Policy loss: -0.010291. Value loss: 0.012346. Entropy: 0.783736.
Iteration 2589: Policy loss: -0.021353. Value loss: 0.009966. Entropy: 0.777177.
episode: 2171   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 356     evaluation reward: 6.81
episode: 2172   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 545     evaluation reward: 6.87
Training network. lr: 0.000231. clip: 0.092233
Iteration 2590: Policy loss: 0.004435. Value loss: 0.021384. Entropy: 0.747002.
Iteration 2591: Policy loss: -0.008491. Value loss: 0.013059. Entropy: 0.74339

now time :  2018-12-26 13:24:59.384128
Training network. lr: 0.000230. clip: 0.092098
Iteration 2635: Policy loss: 0.001163. Value loss: 0.020107. Entropy: 0.707561.
Iteration 2636: Policy loss: -0.015331. Value loss: 0.015740. Entropy: 0.702475.
Iteration 2637: Policy loss: -0.024758. Value loss: 0.012519. Entropy: 0.699726.
episode: 2202   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 555     evaluation reward: 6.81
episode: 2203   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 525     evaluation reward: 6.82
Training network. lr: 0.000230. clip: 0.092089
Iteration 2638: Policy loss: 0.005295. Value loss: 0.025208. Entropy: 0.683546.
Iteration 2639: Policy loss: -0.010494. Value loss: 0.017054. Entropy: 0.676397.
Iteration 2640: Policy loss: -0.020695. Value loss: 0.013283. Entropy: 0.674209.
episode: 2204   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 572     evaluation reward: 6.79
episode: 2205   score: 6.0   memory length: 1024   epsilon: 1.0  

Iteration 2687: Policy loss: -0.014155. Value loss: 0.015308. Entropy: 0.792536.
Iteration 2688: Policy loss: -0.023913. Value loss: 0.012687. Entropy: 0.792080.
episode: 2232   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 582     evaluation reward: 7.17
Training network. lr: 0.000230. clip: 0.091936
Iteration 2689: Policy loss: 0.011046. Value loss: 0.015878. Entropy: 0.738629.
Iteration 2690: Policy loss: -0.011525. Value loss: 0.013809. Entropy: 0.723001.
Iteration 2691: Policy loss: -0.021896. Value loss: 0.011887. Entropy: 0.714976.
episode: 2233   score: 9.0   memory length: 1024   epsilon: 1.0    steps: 669     evaluation reward: 7.19
episode: 2234   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 431     evaluation reward: 7.16
episode: 2235   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 444     evaluation reward: 7.11
Training network. lr: 0.000230. clip: 0.091927
Iteration 2692: Policy loss: 0.008286. Value loss: 0.028976. Entropy: 0.718262

episode: 2263   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 343     evaluation reward: 7.68
episode: 2264   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 519     evaluation reward: 7.69
Training network. lr: 0.000229. clip: 0.091783
Iteration 2740: Policy loss: 0.000218. Value loss: 0.024968. Entropy: 0.819675.
Iteration 2741: Policy loss: -0.014160. Value loss: 0.020160. Entropy: 0.817583.
Iteration 2742: Policy loss: -0.024237. Value loss: 0.016924. Entropy: 0.813703.
episode: 2265   score: 9.0   memory length: 1024   epsilon: 1.0    steps: 671     evaluation reward: 7.71
Training network. lr: 0.000229. clip: 0.091774
Iteration 2743: Policy loss: 0.005535. Value loss: 0.025876. Entropy: 0.806869.
Iteration 2744: Policy loss: -0.012500. Value loss: 0.019388. Entropy: 0.808125.
Iteration 2745: Policy loss: -0.022451. Value loss: 0.016711. Entropy: 0.800845.
episode: 2266   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 519     evaluation reward: 7.7

episode: 2294   score: 12.0   memory length: 1024   epsilon: 1.0    steps: 623     evaluation reward: 8.13
episode: 2295   score: 10.0   memory length: 1024   epsilon: 1.0    steps: 674     evaluation reward: 8.17
Training network. lr: 0.000229. clip: 0.091630
Iteration 2791: Policy loss: 0.008607. Value loss: 0.044044. Entropy: 0.663202.
Iteration 2792: Policy loss: -0.005137. Value loss: 0.034658. Entropy: 0.671978.
Iteration 2793: Policy loss: -0.015709. Value loss: 0.029697. Entropy: 0.675227.
episode: 2296   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 456     evaluation reward: 8.15
episode: 2297   score: 10.0   memory length: 1024   epsilon: 1.0    steps: 633     evaluation reward: 8.12
Training network. lr: 0.000229. clip: 0.091621
Iteration 2794: Policy loss: 0.011210. Value loss: 0.024278. Entropy: 0.729630.
Iteration 2795: Policy loss: -0.008967. Value loss: 0.016139. Entropy: 0.714013.
Iteration 2796: Policy loss: -0.023848. Value loss: 0.013891. Entropy: 0.712

episode: 2325   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 486     evaluation reward: 8.13
Training network. lr: 0.000229. clip: 0.091477
Iteration 2842: Policy loss: 0.009021. Value loss: 0.017590. Entropy: 0.665597.
Iteration 2843: Policy loss: -0.008074. Value loss: 0.013126. Entropy: 0.663866.
Iteration 2844: Policy loss: -0.019128. Value loss: 0.011284. Entropy: 0.666802.
episode: 2326   score: 10.0   memory length: 1024   epsilon: 1.0    steps: 672     evaluation reward: 8.15
episode: 2327   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 500     evaluation reward: 8.13
Training network. lr: 0.000229. clip: 0.091468
Iteration 2845: Policy loss: 0.008285. Value loss: 0.025468. Entropy: 0.759405.
Iteration 2846: Policy loss: -0.008903. Value loss: 0.018480. Entropy: 0.762244.
Iteration 2847: Policy loss: -0.019766. Value loss: 0.015573. Entropy: 0.755488.
episode: 2328   score: 9.0   memory length: 1024   epsilon: 1.0    steps: 646     evaluation reward: 8.

episode: 2356   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 488     evaluation reward: 7.98
Training network. lr: 0.000228. clip: 0.091324
Iteration 2893: Policy loss: 0.003115. Value loss: 0.021417. Entropy: 0.666996.
Iteration 2894: Policy loss: -0.012390. Value loss: 0.014229. Entropy: 0.661156.
Iteration 2895: Policy loss: -0.023252. Value loss: 0.011759. Entropy: 0.659603.
episode: 2357   score: 9.0   memory length: 1024   epsilon: 1.0    steps: 599     evaluation reward: 8.0
episode: 2358   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 291     evaluation reward: 7.94
Training network. lr: 0.000228. clip: 0.091315
Iteration 2896: Policy loss: 0.002067. Value loss: 0.032326. Entropy: 0.771159.
Iteration 2897: Policy loss: -0.013551. Value loss: 0.020877. Entropy: 0.779396.
Iteration 2898: Policy loss: -0.023190. Value loss: 0.015466. Entropy: 0.779367.
episode: 2359   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 461     evaluation reward: 7.93

episode: 2387   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 511     evaluation reward: 7.97
Training network. lr: 0.000228. clip: 0.091171
Iteration 2944: Policy loss: 0.006147. Value loss: 0.018356. Entropy: 0.780235.
Iteration 2945: Policy loss: -0.015011. Value loss: 0.013783. Entropy: 0.775316.
Iteration 2946: Policy loss: -0.022681. Value loss: 0.011337. Entropy: 0.771997.
episode: 2388   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 622     evaluation reward: 7.89
episode: 2389   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 496     evaluation reward: 7.84
episode: 2390   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 354     evaluation reward: 7.85
Training network. lr: 0.000228. clip: 0.091162
Iteration 2947: Policy loss: 0.005499. Value loss: 0.021153. Entropy: 0.841752.
Iteration 2948: Policy loss: -0.011191. Value loss: 0.016563. Entropy: 0.846882.
Iteration 2949: Policy loss: -0.024072. Value loss: 0.014424. Entropy: 0.847428

episode: 2418   score: 10.0   memory length: 1024   epsilon: 1.0    steps: 659     evaluation reward: 8.06
Training network. lr: 0.000228. clip: 0.091018
Iteration 2995: Policy loss: 0.010901. Value loss: 0.028582. Entropy: 0.710890.
Iteration 2996: Policy loss: -0.010294. Value loss: 0.021958. Entropy: 0.718640.
Iteration 2997: Policy loss: -0.024809. Value loss: 0.018116. Entropy: 0.705437.
episode: 2419   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 414     evaluation reward: 8.02
episode: 2420   score: 9.0   memory length: 1024   epsilon: 1.0    steps: 650     evaluation reward: 8.01
Training network. lr: 0.000228. clip: 0.091009
Iteration 2998: Policy loss: 0.005454. Value loss: 0.034700. Entropy: 0.777545.
Iteration 2999: Policy loss: -0.015752. Value loss: 0.026834. Entropy: 0.776944.
Iteration 3000: Policy loss: -0.021542. Value loss: 0.023731. Entropy: 0.771248.
episode: 2421   score: 10.0   memory length: 1024   epsilon: 1.0    steps: 654     evaluation reward: 8

Training network. lr: 0.000227. clip: 0.090865
Iteration 3046: Policy loss: 0.007085. Value loss: 0.040341. Entropy: 0.843810.
Iteration 3047: Policy loss: -0.009384. Value loss: 0.027865. Entropy: 0.832349.
Iteration 3048: Policy loss: -0.019914. Value loss: 0.024348. Entropy: 0.832296.
episode: 2449   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 564     evaluation reward: 8.15
episode: 2450   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 383     evaluation reward: 8.14
Training network. lr: 0.000227. clip: 0.090856
Iteration 3049: Policy loss: 0.011815. Value loss: 0.024853. Entropy: 0.819611.
Iteration 3050: Policy loss: -0.005111. Value loss: 0.017287. Entropy: 0.816526.
Iteration 3051: Policy loss: -0.018914. Value loss: 0.014891. Entropy: 0.814586.
episode: 2451   score: 10.0   memory length: 1024   epsilon: 1.0    steps: 573     evaluation reward: 8.14
episode: 2452   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 468     evaluation reward: 8.

Iteration 3098: Policy loss: -0.012811. Value loss: 0.017632. Entropy: 0.662578.
Iteration 3099: Policy loss: -0.022371. Value loss: 0.014927. Entropy: 0.662259.
episode: 2479   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 536     evaluation reward: 8.3
episode: 2480   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 634     evaluation reward: 8.28
Training network. lr: 0.000227. clip: 0.090703
Iteration 3100: Policy loss: 0.010680. Value loss: 0.017804. Entropy: 0.734036.
Iteration 3101: Policy loss: -0.011022. Value loss: 0.014475. Entropy: 0.740612.
Iteration 3102: Policy loss: -0.021459. Value loss: 0.013104. Entropy: 0.743877.
episode: 2481   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 474     evaluation reward: 8.2
episode: 2482   score: 10.0   memory length: 1024   epsilon: 1.0    steps: 626     evaluation reward: 8.24
Training network. lr: 0.000227. clip: 0.090694
Iteration 3103: Policy loss: 0.008316. Value loss: 0.015443. Entropy: 0.556825.

episode: 2509   score: 15.0   memory length: 1024   epsilon: 1.0    steps: 547     evaluation reward: 8.56
Training network. lr: 0.000226. clip: 0.090550
Iteration 3151: Policy loss: 0.009400. Value loss: 0.070543. Entropy: 0.759586.
Iteration 3152: Policy loss: -0.004021. Value loss: 0.057739. Entropy: 0.763372.
Iteration 3153: Policy loss: -0.013987. Value loss: 0.048590. Entropy: 0.774067.
episode: 2510   score: 12.0   memory length: 1024   epsilon: 1.0    steps: 812     evaluation reward: 8.61
episode: 2511   score: 12.0   memory length: 1024   epsilon: 1.0    steps: 658     evaluation reward: 8.68
Training network. lr: 0.000226. clip: 0.090541
Iteration 3154: Policy loss: 0.008841. Value loss: 0.046513. Entropy: 0.652330.
Iteration 3155: Policy loss: -0.007337. Value loss: 0.035146. Entropy: 0.652760.
Iteration 3156: Policy loss: -0.017030. Value loss: 0.029302. Entropy: 0.648448.
episode: 2512   score: 10.0   memory length: 1024   epsilon: 1.0    steps: 702     evaluation reward:

Iteration 3204: Policy loss: -0.021999. Value loss: 0.015004. Entropy: 0.772888.
episode: 2538   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 343     evaluation reward: 8.95
episode: 2539   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 394     evaluation reward: 8.89
Training network. lr: 0.000226. clip: 0.090388
Iteration 3205: Policy loss: 0.013046. Value loss: 0.034495. Entropy: 0.774739.
Iteration 3206: Policy loss: -0.006632. Value loss: 0.025181. Entropy: 0.790038.
Iteration 3207: Policy loss: -0.022835. Value loss: 0.020425. Entropy: 0.780043.
episode: 2540   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 545     evaluation reward: 8.9
episode: 2541   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 466     evaluation reward: 8.82
Training network. lr: 0.000226. clip: 0.090379
Iteration 3208: Policy loss: 0.004899. Value loss: 0.021334. Entropy: 0.805297.
Iteration 3209: Policy loss: -0.007610. Value loss: 0.016301. Entropy: 0.805906.

Iteration 3254: Policy loss: -0.008389. Value loss: 0.013113. Entropy: 0.682747.
Iteration 3255: Policy loss: -0.017940. Value loss: 0.011215. Entropy: 0.682351.
episode: 2570   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 470     evaluation reward: 8.62
episode: 2571   score: 10.0   memory length: 1024   epsilon: 1.0    steps: 655     evaluation reward: 8.64
Training network. lr: 0.000226. clip: 0.090235
Iteration 3256: Policy loss: 0.008667. Value loss: 0.015401. Entropy: 0.675977.
Iteration 3257: Policy loss: -0.011681. Value loss: 0.011564. Entropy: 0.669910.
Iteration 3258: Policy loss: -0.024355. Value loss: 0.009475. Entropy: 0.661044.
episode: 2572   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 586     evaluation reward: 8.63
episode: 2573   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 561     evaluation reward: 8.63
Training network. lr: 0.000226. clip: 0.090226
Iteration 3259: Policy loss: 0.007571. Value loss: 0.019868. Entropy: 0.77419

episode: 2600   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 435     evaluation reward: 8.65
episode: 2601   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 405     evaluation reward: 8.62
Training network. lr: 0.000225. clip: 0.090082
Iteration 3307: Policy loss: 0.006439. Value loss: 0.030938. Entropy: 0.660282.
Iteration 3308: Policy loss: -0.012364. Value loss: 0.023354. Entropy: 0.651728.
Iteration 3309: Policy loss: -0.014785. Value loss: 0.019096. Entropy: 0.657106.
episode: 2602   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 459     evaluation reward: 8.54
Training network. lr: 0.000225. clip: 0.090073
Iteration 3310: Policy loss: 0.002737. Value loss: 0.018275. Entropy: 0.692276.
Iteration 3311: Policy loss: -0.015831. Value loss: 0.012402. Entropy: 0.683721.
Iteration 3312: Policy loss: -0.025466. Value loss: 0.010213. Entropy: 0.685232.
episode: 2603   score: 9.0   memory length: 1024   epsilon: 1.0    steps: 605     evaluation reward: 8.5

episode: 2631   score: 12.0   memory length: 1024   epsilon: 1.0    steps: 626     evaluation reward: 8.2
Training network. lr: 0.000225. clip: 0.089929
Iteration 3358: Policy loss: 0.007925. Value loss: 0.038777. Entropy: 0.693948.
Iteration 3359: Policy loss: -0.007890. Value loss: 0.032065. Entropy: 0.679016.
Iteration 3360: Policy loss: -0.014230. Value loss: 0.028580. Entropy: 0.683634.
episode: 2632   score: 9.0   memory length: 1024   epsilon: 1.0    steps: 715     evaluation reward: 8.19
Training network. lr: 0.000225. clip: 0.089920
Iteration 3361: Policy loss: 0.011826. Value loss: 0.017379. Entropy: 0.749569.
Iteration 3362: Policy loss: -0.007333. Value loss: 0.015819. Entropy: 0.753205.
Iteration 3363: Policy loss: -0.019475. Value loss: 0.014346. Entropy: 0.753244.
episode: 2633   score: 9.0   memory length: 1024   epsilon: 1.0    steps: 622     evaluation reward: 8.17
episode: 2634   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 574     evaluation reward: 8.1

Training network. lr: 0.000224. clip: 0.089776
Iteration 3409: Policy loss: 0.007878. Value loss: 0.050798. Entropy: 0.703179.
Iteration 3410: Policy loss: -0.006436. Value loss: 0.039507. Entropy: 0.712100.
Iteration 3411: Policy loss: -0.015357. Value loss: 0.032828. Entropy: 0.696039.
episode: 2662   score: 12.0   memory length: 1024   epsilon: 1.0    steps: 695     evaluation reward: 8.36
episode: 2663   score: 12.0   memory length: 1024   epsilon: 1.0    steps: 805     evaluation reward: 8.39
Training network. lr: 0.000224. clip: 0.089767
Iteration 3412: Policy loss: 0.005877. Value loss: 0.021424. Entropy: 0.650977.
Iteration 3413: Policy loss: -0.008665. Value loss: 0.015686. Entropy: 0.634467.
Iteration 3414: Policy loss: -0.016869. Value loss: 0.013769. Entropy: 0.645113.
episode: 2664   score: 10.0   memory length: 1024   epsilon: 1.0    steps: 674     evaluation reward: 8.42
Training network. lr: 0.000224. clip: 0.089758
Iteration 3415: Policy loss: 0.004840. Value loss: 0.0

Training network. lr: 0.000224. clip: 0.089614
Iteration 3463: Policy loss: 0.007459. Value loss: 0.023507. Entropy: 0.705432.
Iteration 3464: Policy loss: -0.003856. Value loss: 0.014745. Entropy: 0.683161.
Iteration 3465: Policy loss: -0.014749. Value loss: 0.012281. Entropy: 0.685236.
episode: 2691   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 638     evaluation reward: 8.63
episode: 2692   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 412     evaluation reward: 8.6
episode: 2693   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 399     evaluation reward: 8.58
Training network. lr: 0.000224. clip: 0.089605
Iteration 3466: Policy loss: 0.009901. Value loss: 0.022478. Entropy: 0.699077.
Iteration 3467: Policy loss: -0.011146. Value loss: 0.017462. Entropy: 0.685416.
Iteration 3468: Policy loss: -0.020614. Value loss: 0.015802. Entropy: 0.678303.
episode: 2694   score: 11.0   memory length: 1024   epsilon: 1.0    steps: 559     evaluation reward: 8.6

now time :  2018-12-26 13:54:55.456226
Training network. lr: 0.000224. clip: 0.089461
Iteration 3514: Policy loss: 0.010064. Value loss: 0.017775. Entropy: 0.675509.
Iteration 3515: Policy loss: -0.007693. Value loss: 0.012485. Entropy: 0.677653.
Iteration 3516: Policy loss: -0.017092. Value loss: 0.011229. Entropy: 0.675183.
episode: 2722   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 597     evaluation reward: 8.54
episode: 2723   score: 10.0   memory length: 1024   epsilon: 1.0    steps: 670     evaluation reward: 8.56
Training network. lr: 0.000224. clip: 0.089452
Iteration 3517: Policy loss: 0.006378. Value loss: 0.015788. Entropy: 0.636058.
Iteration 3518: Policy loss: -0.006063. Value loss: 0.012861. Entropy: 0.626239.
Iteration 3519: Policy loss: -0.019242. Value loss: 0.010613. Entropy: 0.624026.
episode: 2724   score: 15.0   memory length: 1024   epsilon: 1.0    steps: 754     evaluation reward: 8.65
Training network. lr: 0.000224. clip: 0.089443
Iteration 3520: 

episode: 2750   score: 16.0   memory length: 1024   epsilon: 1.0    steps: 846     evaluation reward: 8.93
Training network. lr: 0.000223. clip: 0.089299
Iteration 3568: Policy loss: 0.008745. Value loss: 0.053600. Entropy: 0.632169.
Iteration 3569: Policy loss: -0.006385. Value loss: 0.044159. Entropy: 0.638593.
Iteration 3570: Policy loss: -0.015633. Value loss: 0.039847. Entropy: 0.629088.
episode: 2751   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 591     evaluation reward: 8.94
episode: 2752   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 486     evaluation reward: 8.93
Training network. lr: 0.000223. clip: 0.089290
Iteration 3571: Policy loss: 0.006016. Value loss: 0.027110. Entropy: 0.691088.
Iteration 3572: Policy loss: -0.009562. Value loss: 0.020282. Entropy: 0.682329.
Iteration 3573: Policy loss: -0.022236. Value loss: 0.017232. Entropy: 0.678372.
episode: 2753   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 619     evaluation reward: 8.

Iteration 3620: Policy loss: -0.000580. Value loss: 0.021774. Entropy: 0.674553.
Iteration 3621: Policy loss: -0.011666. Value loss: 0.018584. Entropy: 0.683427.
episode: 2780   score: 14.0   memory length: 1024   epsilon: 1.0    steps: 657     evaluation reward: 9.06
Training network. lr: 0.000223. clip: 0.089137
Iteration 3622: Policy loss: 0.008780. Value loss: 0.051735. Entropy: 0.749256.
Iteration 3623: Policy loss: -0.003205. Value loss: 0.033650. Entropy: 0.738799.
Iteration 3624: Policy loss: -0.011451. Value loss: 0.027986. Entropy: 0.739272.
episode: 2781   score: 15.0   memory length: 1024   epsilon: 1.0    steps: 728     evaluation reward: 9.17
episode: 2782   score: 12.0   memory length: 1024   epsilon: 1.0    steps: 738     evaluation reward: 9.22
Training network. lr: 0.000223. clip: 0.089128
Iteration 3625: Policy loss: 0.013496. Value loss: 0.051819. Entropy: 0.783741.
Iteration 3626: Policy loss: -0.002150. Value loss: 0.037764. Entropy: 0.784678.
Iteration 3627: Poli

episode: 2809   score: 11.0   memory length: 1024   epsilon: 1.0    steps: 762     evaluation reward: 9.16
episode: 2810   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 541     evaluation reward: 9.19
Training network. lr: 0.000222. clip: 0.088984
Iteration 3673: Policy loss: 0.009442. Value loss: 0.027833. Entropy: 0.606149.
Iteration 3674: Policy loss: -0.007881. Value loss: 0.019439. Entropy: 0.587244.
Iteration 3675: Policy loss: -0.015773. Value loss: 0.016062. Entropy: 0.588037.
episode: 2811   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 508     evaluation reward: 9.16
episode: 2812   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 402     evaluation reward: 9.1
Training network. lr: 0.000222. clip: 0.088975
Iteration 3676: Policy loss: 0.002870. Value loss: 0.038212. Entropy: 0.665588.
Iteration 3677: Policy loss: -0.007295. Value loss: 0.028851. Entropy: 0.660527.
Iteration 3678: Policy loss: -0.019188. Value loss: 0.025073. Entropy: 0.657941

episode: 2840   score: 11.0   memory length: 1024   epsilon: 1.0    steps: 764     evaluation reward: 8.9
Training network. lr: 0.000222. clip: 0.088831
Iteration 3724: Policy loss: 0.008967. Value loss: 0.021419. Entropy: 0.661391.
Iteration 3725: Policy loss: -0.007801. Value loss: 0.015616. Entropy: 0.655325.
Iteration 3726: Policy loss: -0.015066. Value loss: 0.013836. Entropy: 0.642371.
episode: 2841   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 533     evaluation reward: 8.94
episode: 2842   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 610     evaluation reward: 8.93
Training network. lr: 0.000222. clip: 0.088822
Iteration 3727: Policy loss: 0.006889. Value loss: 0.019383. Entropy: 0.647327.
Iteration 3728: Policy loss: -0.004904. Value loss: 0.015006. Entropy: 0.651797.
Iteration 3729: Policy loss: -0.012525. Value loss: 0.012751. Entropy: 0.645474.
episode: 2843   score: 9.0   memory length: 1024   epsilon: 1.0    steps: 587     evaluation reward: 8.9

episode: 2871   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 593     evaluation reward: 8.27
episode: 2872   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 482     evaluation reward: 8.27
Training network. lr: 0.000222. clip: 0.088678
Iteration 3775: Policy loss: 0.008923. Value loss: 0.028987. Entropy: 0.661699.
Iteration 3776: Policy loss: -0.007440. Value loss: 0.021152. Entropy: 0.655860.
Iteration 3777: Policy loss: -0.016841. Value loss: 0.017191. Entropy: 0.657527.
episode: 2873   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 441     evaluation reward: 8.22
episode: 2874   score: 13.0   memory length: 1024   epsilon: 1.0    steps: 727     evaluation reward: 8.29
Training network. lr: 0.000222. clip: 0.088669
Iteration 3778: Policy loss: 0.010421. Value loss: 0.038319. Entropy: 0.706007.
Iteration 3779: Policy loss: -0.007423. Value loss: 0.029358. Entropy: 0.702274.
Iteration 3780: Policy loss: -0.015285. Value loss: 0.024840. Entropy: 0.71921

episode: 2902   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 594     evaluation reward: 8.25
Training network. lr: 0.000221. clip: 0.088525
Iteration 3826: Policy loss: 0.010004. Value loss: 0.024902. Entropy: 0.639134.
Iteration 3827: Policy loss: -0.003379. Value loss: 0.018662. Entropy: 0.642522.
Iteration 3828: Policy loss: -0.016503. Value loss: 0.017256. Entropy: 0.635195.
episode: 2903   score: 9.0   memory length: 1024   epsilon: 1.0    steps: 658     evaluation reward: 8.26
Training network. lr: 0.000221. clip: 0.088516
Iteration 3829: Policy loss: 0.006688. Value loss: 0.027070. Entropy: 0.686582.
Iteration 3830: Policy loss: -0.004878. Value loss: 0.017242. Entropy: 0.666253.
Iteration 3831: Policy loss: -0.016296. Value loss: 0.014612. Entropy: 0.663842.
episode: 2904   score: 10.0   memory length: 1024   epsilon: 1.0    steps: 658     evaluation reward: 8.27
episode: 2905   score: 10.0   memory length: 1024   epsilon: 1.0    steps: 653     evaluation reward: 8

Iteration 3879: Policy loss: -0.018273. Value loss: 0.014904. Entropy: 0.612786.
episode: 2931   score: 15.0   memory length: 1024   epsilon: 1.0    steps: 816     evaluation reward: 8.52
Training network. lr: 0.000221. clip: 0.088363
Iteration 3880: Policy loss: 0.009566. Value loss: 0.041882. Entropy: 0.615328.
Iteration 3881: Policy loss: -0.006857. Value loss: 0.034046. Entropy: 0.613871.
Iteration 3882: Policy loss: -0.013980. Value loss: 0.029653. Entropy: 0.612184.
episode: 2932   score: 11.0   memory length: 1024   epsilon: 1.0    steps: 736     evaluation reward: 8.57
episode: 2933   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 537     evaluation reward: 8.6
Training network. lr: 0.000221. clip: 0.088354
Iteration 3883: Policy loss: 0.007846. Value loss: 0.030614. Entropy: 0.529897.
Iteration 3884: Policy loss: -0.005617. Value loss: 0.018386. Entropy: 0.521620.
Iteration 3885: Policy loss: -0.013782. Value loss: 0.014992. Entropy: 0.523066.
episode: 2934   score:

Iteration 3930: Policy loss: -0.018239. Value loss: 0.014218. Entropy: 0.622410.
episode: 2962   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 513     evaluation reward: 8.61
episode: 2963   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 476     evaluation reward: 8.58
Training network. lr: 0.000221. clip: 0.088210
Iteration 3931: Policy loss: 0.004357. Value loss: 0.019007. Entropy: 0.611947.
Iteration 3932: Policy loss: -0.007774. Value loss: 0.014956. Entropy: 0.593867.
Iteration 3933: Policy loss: -0.018460. Value loss: 0.011976. Entropy: 0.597978.
episode: 2964   score: 9.0   memory length: 1024   epsilon: 1.0    steps: 623     evaluation reward: 8.61
Training network. lr: 0.000221. clip: 0.088201
Iteration 3934: Policy loss: 0.010058. Value loss: 0.017685. Entropy: 0.667688.
Iteration 3935: Policy loss: -0.008793. Value loss: 0.013471. Entropy: 0.668567.
Iteration 3936: Policy loss: -0.016448. Value loss: 0.011184. Entropy: 0.663385.
episode: 2965   score: 

Training network. lr: 0.000220. clip: 0.088057
Iteration 3982: Policy loss: 0.009872. Value loss: 0.047922. Entropy: 0.687034.
Iteration 3983: Policy loss: -0.009690. Value loss: 0.036476. Entropy: 0.684753.
Iteration 3984: Policy loss: -0.012607. Value loss: 0.031174. Entropy: 0.673903.
episode: 2992   score: 10.0   memory length: 1024   epsilon: 1.0    steps: 722     evaluation reward: 8.75
episode: 2993   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 563     evaluation reward: 8.75
Training network. lr: 0.000220. clip: 0.088048
Iteration 3985: Policy loss: 0.008109. Value loss: 0.019667. Entropy: 0.586878.
Iteration 3986: Policy loss: -0.003090. Value loss: 0.014573. Entropy: 0.587025.
Iteration 3987: Policy loss: -0.016474. Value loss: 0.011393. Entropy: 0.581459.
episode: 2994   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 625     evaluation reward: 8.76
episode: 2995   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 539     evaluation reward: 8.

Iteration 4035: Policy loss: -0.013275. Value loss: 0.025008. Entropy: 0.616985.
episode: 3021   score: 11.0   memory length: 1024   epsilon: 1.0    steps: 571     evaluation reward: 8.61
Training network. lr: 0.000220. clip: 0.087895
Iteration 4036: Policy loss: 0.007223. Value loss: 0.046764. Entropy: 0.654047.
Iteration 4037: Policy loss: -0.007597. Value loss: 0.034285. Entropy: 0.652664.
Iteration 4038: Policy loss: -0.014524. Value loss: 0.028781. Entropy: 0.645650.
episode: 3022   score: 14.0   memory length: 1024   epsilon: 1.0    steps: 836     evaluation reward: 8.65
episode: 3023   score: 10.0   memory length: 1024   epsilon: 1.0    steps: 710     evaluation reward: 8.67
Training network. lr: 0.000220. clip: 0.087886
Iteration 4039: Policy loss: 0.010973. Value loss: 0.032962. Entropy: 0.566461.
Iteration 4040: Policy loss: -0.003787. Value loss: 0.024182. Entropy: 0.583028.
Iteration 4041: Policy loss: -0.016045. Value loss: 0.020327. Entropy: 0.571056.
episode: 3024   scor

Iteration 4087: Policy loss: 0.007455. Value loss: 0.021541. Entropy: 0.602093.
Iteration 4088: Policy loss: -0.006294. Value loss: 0.014857. Entropy: 0.592687.
Iteration 4089: Policy loss: -0.014446. Value loss: 0.012494. Entropy: 0.587416.
episode: 3051   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 446     evaluation reward: 8.78
episode: 3052   score: 15.0   memory length: 1024   epsilon: 1.0    steps: 606     evaluation reward: 8.89
Training network. lr: 0.000219. clip: 0.087733
Iteration 4090: Policy loss: 0.005803. Value loss: 0.062679. Entropy: 0.623096.
Iteration 4091: Policy loss: -0.005926. Value loss: 0.044547. Entropy: 0.615593.
Iteration 4092: Policy loss: -0.008614. Value loss: 0.038052. Entropy: 0.605284.
episode: 3053   score: 11.0   memory length: 1024   epsilon: 1.0    steps: 580     evaluation reward: 8.88
Training network. lr: 0.000219. clip: 0.087724
Iteration 4093: Policy loss: 0.002744. Value loss: 0.046686. Entropy: 0.597204.
Iteration 4094: Policy

Iteration 4140: Policy loss: -0.016271. Value loss: 0.011602. Entropy: 0.647391.
episode: 3080   score: 13.0   memory length: 1024   epsilon: 1.0    steps: 825     evaluation reward: 9.03
Training network. lr: 0.000219. clip: 0.087580
Iteration 4141: Policy loss: 0.010068. Value loss: 0.026306. Entropy: 0.635313.
Iteration 4142: Policy loss: -0.009810. Value loss: 0.020005. Entropy: 0.632057.
Iteration 4143: Policy loss: -0.017703. Value loss: 0.016865. Entropy: 0.638863.
episode: 3081   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 556     evaluation reward: 9.05
Training network. lr: 0.000219. clip: 0.087571
Iteration 4144: Policy loss: 0.003186. Value loss: 0.024830. Entropy: 0.614438.
Iteration 4145: Policy loss: -0.007008. Value loss: 0.018024. Entropy: 0.622127.
Iteration 4146: Policy loss: -0.013911. Value loss: 0.015401. Entropy: 0.618567.
episode: 3082   score: 12.0   memory length: 1024   epsilon: 1.0    steps: 719     evaluation reward: 9.1
episode: 3083   score:

Iteration 4190: Policy loss: -0.009034. Value loss: 0.017394. Entropy: 0.757720.
Iteration 4191: Policy loss: -0.014515. Value loss: 0.014561. Entropy: 0.754336.
episode: 3112   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 442     evaluation reward: 8.64
episode: 3113   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 444     evaluation reward: 8.6
Training network. lr: 0.000219. clip: 0.087427
Iteration 4192: Policy loss: 0.009141. Value loss: 0.022424. Entropy: 0.665890.
Iteration 4193: Policy loss: -0.009977. Value loss: 0.015892. Entropy: 0.662690.
Iteration 4194: Policy loss: -0.015493. Value loss: 0.014319. Entropy: 0.656221.
episode: 3114   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 402     evaluation reward: 8.58
episode: 3115   score: 11.0   memory length: 1024   epsilon: 1.0    steps: 726     evaluation reward: 8.6
Training network. lr: 0.000219. clip: 0.087418
Iteration 4195: Policy loss: 0.010938. Value loss: 0.025360. Entropy: 0.779564.

Training network. lr: 0.000218. clip: 0.087274
Iteration 4243: Policy loss: 0.002106. Value loss: 0.020268. Entropy: 0.622031.
Iteration 4244: Policy loss: -0.012260. Value loss: 0.015372. Entropy: 0.630067.
Iteration 4245: Policy loss: -0.022290. Value loss: 0.013023. Entropy: 0.616951.
episode: 3142   score: 10.0   memory length: 1024   epsilon: 1.0    steps: 707     evaluation reward: 8.53
episode: 3143   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 555     evaluation reward: 8.53
Training network. lr: 0.000218. clip: 0.087265
Iteration 4246: Policy loss: 0.003260. Value loss: 0.025080. Entropy: 0.686987.
Iteration 4247: Policy loss: -0.006636. Value loss: 0.017320. Entropy: 0.673879.
Iteration 4248: Policy loss: -0.013783. Value loss: 0.015174. Entropy: 0.684247.
now time :  2018-12-26 14:20:14.818490
episode: 3144   score: 9.0   memory length: 1024   epsilon: 1.0    steps: 687     evaluation reward: 8.52
Training network. lr: 0.000218. clip: 0.087256
Iteration 4249: P

Training network. lr: 0.000218. clip: 0.087112
Iteration 4297: Policy loss: 0.006500. Value loss: 0.029874. Entropy: 0.509004.
Iteration 4298: Policy loss: -0.004537. Value loss: 0.022983. Entropy: 0.504811.
Iteration 4299: Policy loss: -0.013322. Value loss: 0.019202. Entropy: 0.503925.
episode: 3170   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 598     evaluation reward: 8.93
episode: 3171   score: 10.0   memory length: 1024   epsilon: 1.0    steps: 663     evaluation reward: 8.96
Training network. lr: 0.000218. clip: 0.087103
Iteration 4300: Policy loss: 0.013148. Value loss: 0.026761. Entropy: 0.565582.
Iteration 4301: Policy loss: -0.007679. Value loss: 0.017998. Entropy: 0.560548.
Iteration 4302: Policy loss: -0.011803. Value loss: 0.015838. Entropy: 0.558411.
episode: 3172   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 585     evaluation reward: 8.9
Training network. lr: 0.000218. clip: 0.087094
Iteration 4303: Policy loss: 0.002620. Value loss: 0.0218

Iteration 4351: Policy loss: 0.005715. Value loss: 0.041098. Entropy: 0.659496.
Iteration 4352: Policy loss: -0.006866. Value loss: 0.030943. Entropy: 0.640086.
Iteration 4353: Policy loss: -0.010794. Value loss: 0.024740. Entropy: 0.636020.
episode: 3198   score: 12.0   memory length: 1024   epsilon: 1.0    steps: 807     evaluation reward: 9.28
Training network. lr: 0.000217. clip: 0.086941
Iteration 4354: Policy loss: 0.002607. Value loss: 0.028483. Entropy: 0.577292.
Iteration 4355: Policy loss: -0.010413. Value loss: 0.020355. Entropy: 0.573711.
Iteration 4356: Policy loss: -0.020788. Value loss: 0.017075. Entropy: 0.568245.
episode: 3199   score: 10.0   memory length: 1024   epsilon: 1.0    steps: 716     evaluation reward: 9.33
episode: 3200   score: 13.0   memory length: 1024   epsilon: 1.0    steps: 793     evaluation reward: 9.41
Training network. lr: 0.000217. clip: 0.086932
Iteration 4357: Policy loss: 0.006186. Value loss: 0.029342. Entropy: 0.542614.
Iteration 4358: Polic

episode: 3224   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 552     evaluation reward: 10.08
episode: 3225   score: 12.0   memory length: 1024   epsilon: 1.0    steps: 643     evaluation reward: 10.09
Training network. lr: 0.000217. clip: 0.086779
Iteration 4408: Policy loss: 0.003749. Value loss: 0.059928. Entropy: 0.516122.
Iteration 4409: Policy loss: -0.003722. Value loss: 0.046842. Entropy: 0.512026.
Iteration 4410: Policy loss: -0.012701. Value loss: 0.041674. Entropy: 0.507424.
episode: 3226   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 591     evaluation reward: 10.09
episode: 3227   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 401     evaluation reward: 10.05
Training network. lr: 0.000217. clip: 0.086770
Iteration 4411: Policy loss: 0.014444. Value loss: 0.044180. Entropy: 0.545634.
Iteration 4412: Policy loss: -0.000539. Value loss: 0.030319. Entropy: 0.554638.
Iteration 4413: Policy loss: -0.017362. Value loss: 0.025109. Entropy: 0.5

Iteration 4460: Policy loss: -0.001669. Value loss: 0.023230. Entropy: 0.567508.
Iteration 4461: Policy loss: -0.014362. Value loss: 0.019139. Entropy: 0.570536.
episode: 3254   score: 11.0   memory length: 1024   epsilon: 1.0    steps: 616     evaluation reward: 9.91
Training network. lr: 0.000217. clip: 0.086617
Iteration 4462: Policy loss: 0.006567. Value loss: 0.033667. Entropy: 0.635569.
Iteration 4463: Policy loss: -0.001918. Value loss: 0.025409. Entropy: 0.649728.
Iteration 4464: Policy loss: -0.010612. Value loss: 0.020533. Entropy: 0.640461.
episode: 3255   score: 11.0   memory length: 1024   epsilon: 1.0    steps: 772     evaluation reward: 9.96
episode: 3256   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 672     evaluation reward: 9.94
Training network. lr: 0.000217. clip: 0.086608
Iteration 4465: Policy loss: 0.004943. Value loss: 0.024844. Entropy: 0.655402.
Iteration 4466: Policy loss: -0.006822. Value loss: 0.015456. Entropy: 0.646637.
Iteration 4467: Polic

Iteration 4515: Policy loss: -0.016019. Value loss: 0.009684. Entropy: 0.537097.
episode: 3282   score: 9.0   memory length: 1024   epsilon: 1.0    steps: 651     evaluation reward: 9.9
Training network. lr: 0.000216. clip: 0.086455
Iteration 4516: Policy loss: 0.005793. Value loss: 0.026744. Entropy: 0.680283.
Iteration 4517: Policy loss: -0.010894. Value loss: 0.020484. Entropy: 0.683255.
Iteration 4518: Policy loss: -0.018956. Value loss: 0.016943. Entropy: 0.680037.
episode: 3283   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 611     evaluation reward: 9.87
episode: 3284   score: 9.0   memory length: 1024   epsilon: 1.0    steps: 710     evaluation reward: 9.85
Training network. lr: 0.000216. clip: 0.086446
Iteration 4519: Policy loss: 0.009382. Value loss: 0.023322. Entropy: 0.674959.
Iteration 4520: Policy loss: -0.003827. Value loss: 0.014731. Entropy: 0.669255.
Iteration 4521: Policy loss: -0.014236. Value loss: 0.013528. Entropy: 0.675674.
episode: 3285   score: 8

episode: 3312   score: 9.0   memory length: 1024   epsilon: 1.0    steps: 645     evaluation reward: 9.03
Training network. lr: 0.000216. clip: 0.086302
Iteration 4567: Policy loss: 0.010615. Value loss: 0.022953. Entropy: 0.644818.
Iteration 4568: Policy loss: 0.001384. Value loss: 0.016653. Entropy: 0.640916.
Iteration 4569: Policy loss: -0.007724. Value loss: 0.014424. Entropy: 0.644011.
episode: 3313   score: 10.0   memory length: 1024   epsilon: 1.0    steps: 719     evaluation reward: 9.02
episode: 3314   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 517     evaluation reward: 8.99
Training network. lr: 0.000216. clip: 0.086293
Iteration 4570: Policy loss: 0.004482. Value loss: 0.020291. Entropy: 0.600978.
Iteration 4571: Policy loss: -0.010349. Value loss: 0.016654. Entropy: 0.595308.
Iteration 4572: Policy loss: -0.014913. Value loss: 0.013985. Entropy: 0.586321.
episode: 3315   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 543     evaluation reward: 8.9

episode: 3341   score: 10.0   memory length: 1024   epsilon: 1.0    steps: 728     evaluation reward: 9.13
Training network. lr: 0.000215. clip: 0.086140
Iteration 4621: Policy loss: 0.015138. Value loss: 0.022409. Entropy: 0.504729.
Iteration 4622: Policy loss: -0.008161. Value loss: 0.017444. Entropy: 0.516327.
Iteration 4623: Policy loss: -0.014130. Value loss: 0.013761. Entropy: 0.514145.
episode: 3342   score: 9.0   memory length: 1024   epsilon: 1.0    steps: 638     evaluation reward: 9.12
Training network. lr: 0.000215. clip: 0.086131
Iteration 4624: Policy loss: 0.011875. Value loss: 0.030188. Entropy: 0.610531.
Iteration 4625: Policy loss: -0.000394. Value loss: 0.022402. Entropy: 0.610859.
Iteration 4626: Policy loss: -0.011330. Value loss: 0.020885. Entropy: 0.591667.
episode: 3343   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 597     evaluation reward: 9.14
episode: 3344   score: 10.0   memory length: 1024   epsilon: 1.0    steps: 693     evaluation reward: 9

Iteration 4673: Policy loss: -0.006938. Value loss: 0.018332. Entropy: 0.535646.
Iteration 4674: Policy loss: -0.014390. Value loss: 0.015121. Entropy: 0.541192.
episode: 3371   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 463     evaluation reward: 8.66
episode: 3372   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 341     evaluation reward: 8.55
episode: 3373   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 481     evaluation reward: 8.52
Training network. lr: 0.000215. clip: 0.085978
Iteration 4675: Policy loss: 0.002543. Value loss: 0.030181. Entropy: 0.535030.
Iteration 4676: Policy loss: -0.005170. Value loss: 0.021477. Entropy: 0.541090.
Iteration 4677: Policy loss: -0.018998. Value loss: 0.017214. Entropy: 0.556650.
episode: 3374   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 548     evaluation reward: 8.47
episode: 3375   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 611     evaluation reward: 8.49
Training network. l

episode: 3401   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 524     evaluation reward: 8.45
Training network. lr: 0.000215. clip: 0.085825
Iteration 4726: Policy loss: 0.002396. Value loss: 0.019165. Entropy: 0.370457.
Iteration 4727: Policy loss: -0.010008. Value loss: 0.014523. Entropy: 0.372180.
Iteration 4728: Policy loss: -0.011646. Value loss: 0.012815. Entropy: 0.369674.
episode: 3402   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 495     evaluation reward: 8.4
episode: 3403   score: 10.0   memory length: 1024   epsilon: 1.0    steps: 489     evaluation reward: 8.41
Training network. lr: 0.000215. clip: 0.085816
Iteration 4729: Policy loss: 0.006453. Value loss: 0.053779. Entropy: 0.452587.
Iteration 4730: Policy loss: -0.003888. Value loss: 0.038318. Entropy: 0.454285.
Iteration 4731: Policy loss: -0.007939. Value loss: 0.031038. Entropy: 0.438150.
episode: 3404   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 518     evaluation reward: 8.3

Training network. lr: 0.000214. clip: 0.085663
Iteration 4780: Policy loss: 0.005248. Value loss: 0.015379. Entropy: 0.488742.
Iteration 4781: Policy loss: -0.003174. Value loss: 0.011461. Entropy: 0.497208.
Iteration 4782: Policy loss: -0.008847. Value loss: 0.010649. Entropy: 0.498926.
episode: 3430   score: 2.0   memory length: 1024   epsilon: 1.0    steps: 294     evaluation reward: 7.78
episode: 3431   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 830     evaluation reward: 7.73
Training network. lr: 0.000214. clip: 0.085654
Iteration 4783: Policy loss: 0.008021. Value loss: 0.011453. Entropy: 0.299852.
Iteration 4784: Policy loss: 0.000723. Value loss: 0.009036. Entropy: 0.280825.
Iteration 4785: Policy loss: -0.006181. Value loss: 0.006689. Entropy: 0.271431.
episode: 3432   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 372     evaluation reward: 7.68
episode: 3433   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 738     evaluation reward: 7.53

Training network. lr: 0.000214. clip: 0.085492
Iteration 4837: Policy loss: 0.010376. Value loss: 0.012235. Entropy: 0.318890.
Iteration 4838: Policy loss: -0.002616. Value loss: 0.008774. Entropy: 0.342413.
Iteration 4839: Policy loss: -0.007662. Value loss: 0.007848. Entropy: 0.355952.
episode: 3456   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 587     evaluation reward: 6.75
Training network. lr: 0.000214. clip: 0.085483
Iteration 4840: Policy loss: 0.005458. Value loss: 0.012644. Entropy: 0.420800.
Iteration 4841: Policy loss: -0.003918. Value loss: 0.008345. Entropy: 0.421443.
Iteration 4842: Policy loss: -0.008358. Value loss: 0.007838. Entropy: 0.414732.
episode: 3457   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 693     evaluation reward: 6.71
episode: 3458   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 657     evaluation reward: 6.7
Training network. lr: 0.000214. clip: 0.085474
Iteration 4843: Policy loss: 0.007820. Value loss: 0.01379

episode: 3485   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 505     evaluation reward: 6.29
episode: 3486   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 436     evaluation reward: 6.23
episode: 3487   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 501     evaluation reward: 6.18
Training network. lr: 0.000213. clip: 0.085330
Iteration 4891: Policy loss: 0.007240. Value loss: 0.015608. Entropy: 0.280500.
Iteration 4892: Policy loss: 0.001150. Value loss: 0.012253. Entropy: 0.279809.
Iteration 4893: Policy loss: -0.006992. Value loss: 0.010400. Entropy: 0.303771.
episode: 3488   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 499     evaluation reward: 6.15
episode: 3489   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 560     evaluation reward: 6.12
Training network. lr: 0.000213. clip: 0.085321
Iteration 4894: Policy loss: 0.010711. Value loss: 0.012596. Entropy: 0.412544.
Iteration 4895: Policy loss: 0.002537. Value loss: 0.01

Training network. lr: 0.000213. clip: 0.085186
Iteration 4939: Policy loss: 0.012677. Value loss: 0.011782. Entropy: 0.387551.
Iteration 4940: Policy loss: -0.002382. Value loss: 0.009442. Entropy: 0.412488.
Iteration 4941: Policy loss: -0.014180. Value loss: 0.008848. Entropy: 0.415172.
episode: 3519   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 612     evaluation reward: 5.56
episode: 3520   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 446     evaluation reward: 5.52
Training network. lr: 0.000213. clip: 0.085177
Iteration 4942: Policy loss: 0.000439. Value loss: 0.012645. Entropy: 0.457190.
Iteration 4943: Policy loss: -0.010721. Value loss: 0.009398. Entropy: 0.439025.
Iteration 4944: Policy loss: -0.017057. Value loss: 0.007947. Entropy: 0.427618.
episode: 3521   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 365     evaluation reward: 5.51
episode: 3522   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 527     evaluation reward: 5.5

episode: 3550   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 503     evaluation reward: 5.75
episode: 3551   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 444     evaluation reward: 5.74
Training network. lr: 0.000213. clip: 0.085033
Iteration 4990: Policy loss: 0.006162. Value loss: 0.016240. Entropy: 0.294435.
Iteration 4991: Policy loss: -0.003451. Value loss: 0.013122. Entropy: 0.298079.
Iteration 4992: Policy loss: -0.006912. Value loss: 0.011304. Entropy: 0.278999.
episode: 3552   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 625     evaluation reward: 5.75
episode: 3553   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 581     evaluation reward: 5.78
Training network. lr: 0.000213. clip: 0.085024
Iteration 4993: Policy loss: 0.013600. Value loss: 0.017517. Entropy: 0.340597.
Iteration 4994: Policy loss: -0.001559. Value loss: 0.013824. Entropy: 0.329174.
Iteration 4995: Policy loss: -0.010718. Value loss: 0.010798. Entropy: 0.342901

Iteration 5039: Policy loss: 0.002069. Value loss: 0.027443. Entropy: 0.463299.
Iteration 5040: Policy loss: -0.006399. Value loss: 0.024057. Entropy: 0.497597.
episode: 3583   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 780     evaluation reward: 5.51
Training network. lr: 0.000212. clip: 0.084880
Iteration 5041: Policy loss: 0.000867. Value loss: 0.011436. Entropy: 0.520217.
Iteration 5042: Policy loss: -0.005059. Value loss: 0.008111. Entropy: 0.531909.
Iteration 5043: Policy loss: -0.010819. Value loss: 0.007090. Entropy: 0.531075.
episode: 3584   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 593     evaluation reward: 5.53
episode: 3585   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 368     evaluation reward: 5.51
Training network. lr: 0.000212. clip: 0.084871
Iteration 5044: Policy loss: 0.009450. Value loss: 0.018904. Entropy: 0.504448.
Iteration 5045: Policy loss: -0.004511. Value loss: 0.014702. Entropy: 0.485476.
Iteration 5046: Policy l

Iteration 5089: Policy loss: 0.016685. Value loss: 0.030988. Entropy: 0.460144.
Iteration 5090: Policy loss: -0.007732. Value loss: 0.024754. Entropy: 0.478520.
Iteration 5091: Policy loss: -0.009048. Value loss: 0.023397. Entropy: 0.478969.
episode: 3615   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 634     evaluation reward: 5.8
episode: 3616   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 558     evaluation reward: 5.82
Training network. lr: 0.000212. clip: 0.084727
Iteration 5092: Policy loss: 0.008061. Value loss: 0.013908. Entropy: 0.514627.
Iteration 5093: Policy loss: -0.006430. Value loss: 0.011271. Entropy: 0.510566.
Iteration 5094: Policy loss: -0.009557. Value loss: 0.009196. Entropy: 0.510587.
episode: 3617   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 559     evaluation reward: 5.81
episode: 3618   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 537     evaluation reward: 5.81
Training network. lr: 0.000212. clip: 0.084718

Iteration 5139: Policy loss: -0.012647. Value loss: 0.009163. Entropy: 0.517636.
episode: 3647   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 596     evaluation reward: 5.69
episode: 3648   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 403     evaluation reward: 5.63
Training network. lr: 0.000211. clip: 0.084583
Iteration 5140: Policy loss: 0.018083. Value loss: 0.015093. Entropy: 0.532991.
Iteration 5141: Policy loss: 0.015119. Value loss: 0.012573. Entropy: 0.540318.
Iteration 5142: Policy loss: -0.005593. Value loss: 0.011233. Entropy: 0.538552.
episode: 3649   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 499     evaluation reward: 5.62
episode: 3650   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 507     evaluation reward: 5.62
Training network. lr: 0.000211. clip: 0.084574
Iteration 5143: Policy loss: 0.005172. Value loss: 0.022980. Entropy: 0.534905.
Iteration 5144: Policy loss: -0.007320. Value loss: 0.017229. Entropy: 0.524934.

Iteration 5187: Policy loss: -0.015295. Value loss: 0.012703. Entropy: 0.594334.
episode: 3681   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 436     evaluation reward: 5.56
episode: 3682   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 438     evaluation reward: 5.52
Training network. lr: 0.000211. clip: 0.084439
Iteration 5188: Policy loss: 0.027887. Value loss: 0.012865. Entropy: 0.587331.
Iteration 5189: Policy loss: -0.005993. Value loss: 0.011193. Entropy: 0.586479.
Iteration 5190: Policy loss: -0.006801. Value loss: 0.010676. Entropy: 0.590893.
episode: 3683   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 503     evaluation reward: 5.5
episode: 3684   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 370     evaluation reward: 5.47
episode: 3685   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 358     evaluation reward: 5.47
Training network. lr: 0.000211. clip: 0.084430
Iteration 5191: Policy loss: 0.008186. Value loss: 0.0

Training network. lr: 0.000211. clip: 0.084304
Iteration 5233: Policy loss: 0.028590. Value loss: 0.013852. Entropy: 0.500646.
Iteration 5234: Policy loss: 0.003981. Value loss: 0.010963. Entropy: 0.493407.
Iteration 5235: Policy loss: -0.000547. Value loss: 0.009338. Entropy: 0.489859.
episode: 3717   score: 10.0   memory length: 1024   epsilon: 1.0    steps: 837     evaluation reward: 4.9
episode: 3718   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 583     evaluation reward: 4.91
Training network. lr: 0.000211. clip: 0.084295
Iteration 5236: Policy loss: 0.007920. Value loss: 0.021477. Entropy: 0.440983.
Iteration 5237: Policy loss: -0.013171. Value loss: 0.014685. Entropy: 0.447305.
Iteration 5238: Policy loss: -0.012619. Value loss: 0.011815. Entropy: 0.451590.
episode: 3719   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 372     evaluation reward: 4.88
episode: 3720   score: 2.0   memory length: 1024   epsilon: 1.0    steps: 293     evaluation reward: 4.83

Iteration 5279: Policy loss: -0.006303. Value loss: 0.011555. Entropy: 0.579077.
Iteration 5280: Policy loss: -0.017538. Value loss: 0.010245. Entropy: 0.568065.
episode: 3752   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 404     evaluation reward: 4.68
episode: 3753   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 418     evaluation reward: 4.69
Training network. lr: 0.000210. clip: 0.084160
Iteration 5281: Policy loss: 0.010741. Value loss: 0.012438. Entropy: 0.497550.
Iteration 5282: Policy loss: -0.009694. Value loss: 0.010268. Entropy: 0.494810.
Iteration 5283: Policy loss: -0.011873. Value loss: 0.008711. Entropy: 0.483065.
episode: 3754   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 464     evaluation reward: 4.69
episode: 3755   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 417     evaluation reward: 4.7
Training network. lr: 0.000210. clip: 0.084151
Iteration 5284: Policy loss: 0.010273. Value loss: 0.011546. Entropy: 0.481682.

Training network. lr: 0.000210. clip: 0.084025
Iteration 5326: Policy loss: 0.011153. Value loss: 0.019324. Entropy: 0.554111.
Iteration 5327: Policy loss: -0.008788. Value loss: 0.014340. Entropy: 0.544467.
Iteration 5328: Policy loss: -0.012194. Value loss: 0.012559. Entropy: 0.559929.
episode: 3787   score: 2.0   memory length: 1024   epsilon: 1.0    steps: 270     evaluation reward: 4.99
episode: 3788   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 529     evaluation reward: 5.0
Training network. lr: 0.000210. clip: 0.084016
Iteration 5329: Policy loss: 0.003212. Value loss: 0.015887. Entropy: 0.677283.
Iteration 5330: Policy loss: -0.009725. Value loss: 0.012568. Entropy: 0.665851.
Iteration 5331: Policy loss: -0.016071. Value loss: 0.010514. Entropy: 0.668169.
episode: 3789   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 381     evaluation reward: 5.02
episode: 3790   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 378     evaluation reward: 5.03

Training network. lr: 0.000210. clip: 0.083890
Iteration 5371: Policy loss: 0.008890. Value loss: 0.016104. Entropy: 0.471921.
Iteration 5372: Policy loss: 0.003780. Value loss: 0.012790. Entropy: 0.464872.
Iteration 5373: Policy loss: -0.008956. Value loss: 0.010587. Entropy: 0.452186.
episode: 3824   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 439     evaluation reward: 5.02
episode: 3825   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 404     evaluation reward: 5.03
Training network. lr: 0.000210. clip: 0.083881
Iteration 5374: Policy loss: 0.007058. Value loss: 0.015161. Entropy: 0.548321.
Iteration 5375: Policy loss: -0.004737. Value loss: 0.010628. Entropy: 0.546592.
Iteration 5376: Policy loss: -0.012067. Value loss: 0.009104. Entropy: 0.539995.
episode: 3826   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 474     evaluation reward: 5.04
episode: 3827   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 423     evaluation reward: 4.99

Training network. lr: 0.000209. clip: 0.083746
Iteration 5419: Policy loss: 0.000671. Value loss: 0.016351. Entropy: 0.389515.
Iteration 5420: Policy loss: -0.006875. Value loss: 0.011867. Entropy: 0.389262.
Iteration 5421: Policy loss: -0.014436. Value loss: 0.010335. Entropy: 0.383246.
episode: 3858   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 544     evaluation reward: 5.14
episode: 3859   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 467     evaluation reward: 5.12
Training network. lr: 0.000209. clip: 0.083737
Iteration 5422: Policy loss: 0.008329. Value loss: 0.012100. Entropy: 0.456683.
Iteration 5423: Policy loss: 0.003592. Value loss: 0.009628. Entropy: 0.454334.
Iteration 5424: Policy loss: -0.015682. Value loss: 0.007432. Entropy: 0.454604.
episode: 3860   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 424     evaluation reward: 5.11
episode: 3861   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 421     evaluation reward: 5.08

episode: 3892   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 483     evaluation reward: 4.68
Training network. lr: 0.000209. clip: 0.083602
Iteration 5467: Policy loss: 0.007769. Value loss: 0.015396. Entropy: 0.395205.
Iteration 5468: Policy loss: 0.001480. Value loss: 0.012055. Entropy: 0.384756.
Iteration 5469: Policy loss: -0.009663. Value loss: 0.009898. Entropy: 0.394727.
episode: 3893   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 398     evaluation reward: 4.68
episode: 3894   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 408     evaluation reward: 4.67
episode: 3895   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 399     evaluation reward: 4.68
Training network. lr: 0.000209. clip: 0.083593
Iteration 5470: Policy loss: 0.007885. Value loss: 0.014410. Entropy: 0.352340.
Iteration 5471: Policy loss: -0.005304. Value loss: 0.011119. Entropy: 0.350745.
Iteration 5472: Policy loss: -0.010920. Value loss: 0.010129. Entropy: 0.353543.

Iteration 5513: Policy loss: 0.003202. Value loss: 0.009310. Entropy: 0.340342.
Iteration 5514: Policy loss: 0.000172. Value loss: 0.007987. Entropy: 0.327823.
episode: 3928   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 640     evaluation reward: 4.58
episode: 3929   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 390     evaluation reward: 4.58
episode: 3930   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 554     evaluation reward: 4.6
Training network. lr: 0.000209. clip: 0.083458
Iteration 5515: Policy loss: 0.007111. Value loss: 0.011652. Entropy: 0.358320.
Iteration 5516: Policy loss: -0.007594. Value loss: 0.009673. Entropy: 0.364007.
Iteration 5517: Policy loss: -0.015405. Value loss: 0.008649. Entropy: 0.372743.
episode: 3931   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 389     evaluation reward: 4.59
episode: 3932   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 380     evaluation reward: 4.58
Training network. lr: 

now time :  2018-12-26 15:16:41.857225
episode: 3958   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 1771     evaluation reward: 4.52
episode: 3959   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 451     evaluation reward: 4.5
Training network. lr: 0.000208. clip: 0.083305
Iteration 5566: Policy loss: 0.013765. Value loss: 0.008908. Entropy: 0.480697.
Iteration 5567: Policy loss: 0.000172. Value loss: 0.006567. Entropy: 0.490342.
Iteration 5568: Policy loss: -0.004252. Value loss: 0.005715. Entropy: 0.486397.
Training network. lr: 0.000208. clip: 0.083296
Iteration 5569: Policy loss: 0.003898. Value loss: 0.006267. Entropy: 0.579369.
Iteration 5570: Policy loss: -0.004312. Value loss: 0.004766. Entropy: 0.647911.
Iteration 5571: Policy loss: -0.014692. Value loss: 0.004049. Entropy: 0.631929.
episode: 3960   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 1147     evaluation reward: 4.53
episode: 3961   score: 5.0   memory length: 1024   epsilon: 1.0  

Iteration 5614: Policy loss: 0.017251. Value loss: 0.011687. Entropy: 0.415843.
Iteration 5615: Policy loss: 0.000856. Value loss: 0.009365. Entropy: 0.408438.
Iteration 5616: Policy loss: -0.003634. Value loss: 0.008389. Entropy: 0.412461.
episode: 3991   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 587     evaluation reward: 4.7
episode: 3992   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 577     evaluation reward: 4.71
Training network. lr: 0.000208. clip: 0.083152
Iteration 5617: Policy loss: 0.008751. Value loss: 0.010506. Entropy: 0.418470.
Iteration 5618: Policy loss: 0.011746. Value loss: 0.009052. Entropy: 0.419489.
Iteration 5619: Policy loss: -0.002245. Value loss: 0.008033. Entropy: 0.419827.
episode: 3993   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 370     evaluation reward: 4.71
episode: 3994   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 386     evaluation reward: 4.72
Training network. lr: 0.000208. clip: 0.083143
I

Iteration 5662: Policy loss: 0.013572. Value loss: 0.037569. Entropy: 0.331766.
Iteration 5663: Policy loss: -0.003841. Value loss: 0.032016. Entropy: 0.336421.
Iteration 5664: Policy loss: -0.007925. Value loss: 0.028203. Entropy: 0.338477.
episode: 4025   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 456     evaluation reward: 5.06
episode: 4026   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 510     evaluation reward: 5.07
Training network. lr: 0.000208. clip: 0.083008
Iteration 5665: Policy loss: 0.004874. Value loss: 0.012460. Entropy: 0.297048.
Iteration 5666: Policy loss: -0.004646. Value loss: 0.009893. Entropy: 0.290692.
Iteration 5667: Policy loss: -0.008511. Value loss: 0.008506. Entropy: 0.287838.
episode: 4027   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 528     evaluation reward: 5.08
episode: 4028   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 353     evaluation reward: 5.06
Training network. lr: 0.000207. clip: 0.08299

episode: 4057   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 371     evaluation reward: 5.24
now time :  2018-12-26 15:22:09.257965
episode: 4058   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 444     evaluation reward: 5.26
Training network. lr: 0.000207. clip: 0.082864
Iteration 5713: Policy loss: 0.015119. Value loss: 0.014157. Entropy: 0.436013.
Iteration 5714: Policy loss: 0.003979. Value loss: 0.011651. Entropy: 0.422555.
Iteration 5715: Policy loss: -0.004416. Value loss: 0.009546. Entropy: 0.434142.
episode: 4059   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 477     evaluation reward: 5.28
episode: 4060   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 552     evaluation reward: 5.27
Training network. lr: 0.000207. clip: 0.082855
Iteration 5716: Policy loss: 0.008213. Value loss: 0.013208. Entropy: 0.332598.
Iteration 5717: Policy loss: 0.008593. Value loss: 0.010444. Entropy: 0.335527.
Iteration 5718: Policy loss: 0.008198. Val

episode: 4091   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 542     evaluation reward: 5.4
Training network. lr: 0.000207. clip: 0.082720
Iteration 5761: Policy loss: 0.007146. Value loss: 0.014819. Entropy: 0.341410.
Iteration 5762: Policy loss: -0.002510. Value loss: 0.011443. Entropy: 0.342122.
Iteration 5763: Policy loss: -0.006223. Value loss: 0.009819. Entropy: 0.341323.
episode: 4092   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 622     evaluation reward: 5.41
episode: 4093   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 433     evaluation reward: 5.42
episode: 4094   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 352     evaluation reward: 5.41
Training network. lr: 0.000207. clip: 0.082711
Iteration 5764: Policy loss: 0.013070. Value loss: 0.012969. Entropy: 0.383564.
Iteration 5765: Policy loss: 0.010529. Value loss: 0.010149. Entropy: 0.382438.
Iteration 5766: Policy loss: -0.005056. Value loss: 0.008626. Entropy: 0.387555.


Iteration 5807: Policy loss: 0.003171. Value loss: 0.007958. Entropy: 0.352814.
Iteration 5808: Policy loss: -0.005697. Value loss: 0.006751. Entropy: 0.352840.
episode: 4127   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 319     evaluation reward: 5.17
episode: 4128   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 549     evaluation reward: 5.21
Training network. lr: 0.000206. clip: 0.082576
Iteration 5809: Policy loss: 0.005791. Value loss: 0.012326. Entropy: 0.444594.
Iteration 5810: Policy loss: -0.003069. Value loss: 0.009038. Entropy: 0.450354.
Iteration 5811: Policy loss: -0.009258. Value loss: 0.007738. Entropy: 0.439137.
episode: 4129   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 453     evaluation reward: 5.18
episode: 4130   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 463     evaluation reward: 5.14
Training network. lr: 0.000206. clip: 0.082567
Iteration 5812: Policy loss: 0.006853. Value loss: 0.011467. Entropy: 0.298575.

Iteration 5856: Policy loss: -0.003015. Value loss: 0.009013. Entropy: 0.433396.
episode: 4160   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 491     evaluation reward: 4.95
episode: 4161   score: 7.0   memory length: 1024   epsilon: 1.0    steps: 505     evaluation reward: 4.96
Training network. lr: 0.000206. clip: 0.082432
Iteration 5857: Policy loss: 0.010561. Value loss: 0.011502. Entropy: 0.376866.
Iteration 5858: Policy loss: -0.001032. Value loss: 0.008254. Entropy: 0.373266.
Iteration 5859: Policy loss: -0.006451. Value loss: 0.007432. Entropy: 0.369136.
now time :  2018-12-26 15:27:30.399834
episode: 4162   score: 8.0   memory length: 1024   epsilon: 1.0    steps: 701     evaluation reward: 5.0
episode: 4163   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 579     evaluation reward: 5.01
Training network. lr: 0.000206. clip: 0.082423
Iteration 5860: Policy loss: 0.008121. Value loss: 0.012720. Entropy: 0.398137.
Iteration 5861: Policy loss: -0.001919. V

Iteration 5907: Policy loss: -0.009137. Value loss: 0.011348. Entropy: 0.305348.
episode: 4191   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 468     evaluation reward: 4.73
episode: 4192   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 418     evaluation reward: 4.69
Training network. lr: 0.000206. clip: 0.082279
Iteration 5908: Policy loss: 0.011578. Value loss: 0.009174. Entropy: 0.328884.
Iteration 5909: Policy loss: 0.009761. Value loss: 0.007347. Entropy: 0.309053.
Iteration 5910: Policy loss: -0.008202. Value loss: 0.006201. Entropy: 0.310377.
episode: 4193   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 365     evaluation reward: 4.68
episode: 4194   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 423     evaluation reward: 4.69
episode: 4195   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 429     evaluation reward: 4.68
Training network. lr: 0.000206. clip: 0.082270
Iteration 5911: Policy loss: 0.006992. Value loss: 0.0

Training network. lr: 0.000205. clip: 0.082144
Iteration 5953: Policy loss: 0.019885. Value loss: 0.007761. Entropy: 0.247051.
Iteration 5954: Policy loss: 0.007734. Value loss: 0.005963. Entropy: 0.281439.
Iteration 5955: Policy loss: -0.003515. Value loss: 0.005721. Entropy: 0.229911.
Training network. lr: 0.000205. clip: 0.082135
Iteration 5956: Policy loss: -0.000044. Value loss: 0.000311. Entropy: 0.064126.
Iteration 5957: Policy loss: 0.000002. Value loss: 0.000092. Entropy: 0.193768.
Iteration 5958: Policy loss: 0.000170. Value loss: 0.000208. Entropy: 0.298529.
episode: 4227   score: 2.0   memory length: 1024   epsilon: 1.0    steps: 1627     evaluation reward: 4.44
episode: 4228   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 449     evaluation reward: 4.41
Training network. lr: 0.000205. clip: 0.082126
Iteration 5959: Policy loss: 0.010996. Value loss: 0.010662. Entropy: 0.247103.
Iteration 5960: Policy loss: 0.001875. Value loss: 0.009381. Entropy: 0.243337.
Iter

episode: 4261   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 620     evaluation reward: 4.25
episode: 4262   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 425     evaluation reward: 4.21
Training network. lr: 0.000205. clip: 0.082000
Iteration 6001: Policy loss: 0.013660. Value loss: 0.012801. Entropy: 0.210301.
Iteration 6002: Policy loss: 0.018201. Value loss: 0.009566. Entropy: 0.209940.
Iteration 6003: Policy loss: -0.005082. Value loss: 0.008250. Entropy: 0.202152.
episode: 4263   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 519     evaluation reward: 4.18
episode: 4264   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 350     evaluation reward: 4.15
now time :  2018-12-26 15:32:48.503481
Training network. lr: 0.000205. clip: 0.081991
Iteration 6004: Policy loss: 0.012127. Value loss: 0.007932. Entropy: 0.266080.
Iteration 6005: Policy loss: 0.007829. Value loss: 0.006876. Entropy: 0.259098.
Iteration 6006: Policy loss: 0.001528. Val

episode: 4295   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 1796     evaluation reward: 4.07
Training network. lr: 0.000205. clip: 0.081856
Iteration 6049: Policy loss: 0.008402. Value loss: 0.006552. Entropy: 0.172645.
Iteration 6050: Policy loss: 0.001397. Value loss: 0.004978. Entropy: 0.185247.
Iteration 6051: Policy loss: -0.004934. Value loss: 0.005965. Entropy: 0.296154.
episode: 4296   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 514     evaluation reward: 4.05
episode: 4297   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 464     evaluation reward: 4.03
Training network. lr: 0.000205. clip: 0.081847
Iteration 6052: Policy loss: 0.019828. Value loss: 0.008785. Entropy: 0.411256.
Iteration 6053: Policy loss: 0.005222. Value loss: 0.006200. Entropy: 0.399453.
Iteration 6054: Policy loss: -0.005246. Value loss: 0.005341. Entropy: 0.401586.
episode: 4298   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 509     evaluation reward: 4.04

episode: 4329   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 342     evaluation reward: 3.84
Training network. lr: 0.000204. clip: 0.081712
Iteration 6097: Policy loss: 0.007323. Value loss: 0.005985. Entropy: 0.316913.
Iteration 6098: Policy loss: 0.000926. Value loss: 0.004785. Entropy: 0.318854.
Iteration 6099: Policy loss: -0.000774. Value loss: 0.004265. Entropy: 0.311476.
episode: 4330   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 528     evaluation reward: 3.85
episode: 4331   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 403     evaluation reward: 3.84
Training network. lr: 0.000204. clip: 0.081703
Iteration 6100: Policy loss: 0.014233. Value loss: 0.010442. Entropy: 0.298563.
Iteration 6101: Policy loss: 0.002415. Value loss: 0.008181. Entropy: 0.307831.
Iteration 6102: Policy loss: -0.002552. Value loss: 0.007100. Entropy: 0.305983.
episode: 4332   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 468     evaluation reward: 3.87


episode: 4366   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 355     evaluation reward: 3.87
Training network. lr: 0.000204. clip: 0.081577
Iteration 6142: Policy loss: 0.015903. Value loss: 0.007859. Entropy: 0.383651.
Iteration 6143: Policy loss: 0.008361. Value loss: 0.006722. Entropy: 0.382484.
Iteration 6144: Policy loss: 0.002929. Value loss: 0.006397. Entropy: 0.373023.
episode: 4367   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 497     evaluation reward: 3.9
episode: 4368   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 353     evaluation reward: 3.89
Training network. lr: 0.000204. clip: 0.081568
Iteration 6145: Policy loss: 0.005750. Value loss: 0.014065. Entropy: 0.369246.
Iteration 6146: Policy loss: -0.002063. Value loss: 0.010717. Entropy: 0.374226.
Iteration 6147: Policy loss: -0.008392. Value loss: 0.008733. Entropy: 0.382293.
episode: 4369   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 462     evaluation reward: 3.91
e

episode: 4403   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 513     evaluation reward: 3.94
Training network. lr: 0.000204. clip: 0.081442
Iteration 6187: Policy loss: 0.018224. Value loss: 0.008867. Entropy: 0.226253.
Iteration 6188: Policy loss: 0.001098. Value loss: 0.007034. Entropy: 0.239473.
Iteration 6189: Policy loss: -0.000211. Value loss: 0.006466. Entropy: 0.235732.
episode: 4404   score: 2.0   memory length: 1024   epsilon: 1.0    steps: 293     evaluation reward: 3.93
episode: 4405   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 450     evaluation reward: 3.95
episode: 4406   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 447     evaluation reward: 3.96
Training network. lr: 0.000204. clip: 0.081433
Iteration 6190: Policy loss: 0.016430. Value loss: 0.013498. Entropy: 0.351308.
Iteration 6191: Policy loss: -0.001178. Value loss: 0.010415. Entropy: 0.352530.
Iteration 6192: Policy loss: 0.000125. Value loss: 0.008578. Entropy: 0.361932.


episode: 4440   score: 2.0   memory length: 1024   epsilon: 1.0    steps: 285     evaluation reward: 3.9
episode: 4441   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 355     evaluation reward: 3.9
episode: 4442   score: 2.0   memory length: 1024   epsilon: 1.0    steps: 317     evaluation reward: 3.85
Training network. lr: 0.000203. clip: 0.081307
Iteration 6232: Policy loss: 0.007500. Value loss: 0.014886. Entropy: 0.349011.
Iteration 6233: Policy loss: -0.001191. Value loss: 0.011934. Entropy: 0.359718.
Iteration 6234: Policy loss: -0.003911. Value loss: 0.010680. Entropy: 0.356230.
episode: 4443   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 533     evaluation reward: 3.85
episode: 4444   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 389     evaluation reward: 3.84
episode: 4445   score: 3.0   memory length: 1024   epsilon: 1.0    steps: 350     evaluation reward: 3.83
Training network. lr: 0.000203. clip: 0.081298
Iteration 6235: Policy loss: 0

Iteration 6274: Policy loss: 0.002906. Value loss: 0.007019. Entropy: 0.282886.
Iteration 6275: Policy loss: 0.001060. Value loss: 0.004801. Entropy: 0.294622.
Iteration 6276: Policy loss: -0.004072. Value loss: 0.004206. Entropy: 0.280899.
episode: 4479   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 438     evaluation reward: 3.83
episode: 4480   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 501     evaluation reward: 3.83
Training network. lr: 0.000203. clip: 0.081172
Iteration 6277: Policy loss: 0.016113. Value loss: 0.007262. Entropy: 0.226870.
Iteration 6278: Policy loss: 0.001680. Value loss: 0.005102. Entropy: 0.215612.
Iteration 6279: Policy loss: -0.008918. Value loss: 0.004499. Entropy: 0.218053.
episode: 4481   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 428     evaluation reward: 3.82
episode: 4482   score: 5.0   memory length: 1024   epsilon: 1.0    steps: 450     evaluation reward: 3.84
episode: 4483   score: 3.0   memory length: 102

episode: 4516   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 544     evaluation reward: 3.57
episode: 4517   score: 6.0   memory length: 1024   epsilon: 1.0    steps: 509     evaluation reward: 3.59
episode: 4518   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 365     evaluation reward: 3.61
Training network. lr: 0.000203. clip: 0.081046
Iteration 6319: Policy loss: 0.009383. Value loss: 0.009285. Entropy: 0.294036.
Iteration 6320: Policy loss: 0.001861. Value loss: 0.008770. Entropy: 0.300908.
Iteration 6321: Policy loss: -0.005476. Value loss: 0.007693. Entropy: 0.293728.
episode: 4519   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 382     evaluation reward: 3.6
episode: 4520   score: 4.0   memory length: 1024   epsilon: 1.0    steps: 438     evaluation reward: 3.61
Training network. lr: 0.000203. clip: 0.081037
Iteration 6322: Policy loss: 0.006909. Value loss: 0.009491. Entropy: 0.300893.
Iteration 6323: Policy loss: -0.008531. Value loss: 0.00

In [None]:
# Persist the agent's policy network to disk (this cell follows the training loop).
# NOTE: the object held in agent.policy_net is pickled as a whole rather than
# as a state_dict, so loading it back with torch.load needs the defining model
# class to be importable at load time.
import os  # cell-local import: only used to guarantee the output directory exists

# torch.save does not create missing parent directories; create the folder up
# front (no-op if it already exists) so a finished training run is not lost to
# a failed save at the very end.
os.makedirs("./save_model", exist_ok=True)
torch.save(agent.policy_net, "./save_model/breakout_dqn")