# Deep Q-Learning 

For this assignment we will implement the Deep Q-Learning algorithm with Experience Replay as described in breakthrough paper __"Playing Atari with Deep Reinforcement Learning"__. We will train an agent to play the famous game of __Breakout__.

In [1]:
import sys
import gym
import torch
import pylab
import random
import numpy as np
import time
from collections import deque
from datetime import datetime
from copy import deepcopy
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from utils import *
from agent import *
from model import *
from config import *
from env import GameEnv
%matplotlib inline
%load_ext autoreload
%autoreload 2

## Understanding the environment

In the following cell, we initialise our game of __Breakout__ and you can see how the environment looks like. For further documentation of the of the environment refer to https://gym.openai.com/envs. 

In [2]:
envs = []
for i in range(num_envs):
    envs.append(GameEnv('SpaceInvadersDeterministic-v4'))
#env.render()

  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "


In [3]:
number_lives = envs[0].life
state_size = envs[0].observation_space.shape
action_size = envs[0].action_space.n
rewards, episodes = [], []

## Creating a DQN Agent

Here we create a DQN Agent. This agent is defined in the __agent.py__. The corresponding neural network is defined in the __model.py__. 

__Evaluation Reward__ : The average reward received in the past 100 episodes/games.

__Frame__ : Number of frames processed in total.

__Memory Size__ : The current size of the replay memory.

In [None]:
agent = Agent(action_size)
torch.save(agent.policy_net.state_dict(), "./save_model/spaceinvaders_ppo_best")
evaluation_reward = deque(maxlen=evaluation_reward_length)
frame = 0
memory_size = 0
reset_max = 3

### Main Training Loop

In [None]:
vis_env_idx = 0
vis_env = envs[vis_env_idx]
e = 0
frame = 0
max_eval = -np.inf
reset_count = 0

while (e < EPISODES):
    step = 0
    assert(num_envs * env_mem_size == train_frame)
    frame_next_vals = []
    for i in range(num_envs):
        env = envs[i]
        #history = env.history
        #life = env.life
        #state, reward, done, info = [env.state, env.reward, env.done, env.info]
        for j in range(env_mem_size):
            step += 1
            frame += 1
            
            curr_state = env.history[HISTORY_SIZE-1,:,:]
            action, value = agent.get_action(np.float32(env.history[:HISTORY_SIZE,:,:]) / 255.)
            
            next_state, env.reward, env.done, env.info = env.step(action)
            
            if (i == vis_env_idx):
                vis_env._env.render()
            
            frame_next_state = get_frame(next_state)
            env.history[HISTORY_SIZE,:,:] = frame_next_state
            terminal_state = check_live(env.life, env.info['ale.lives'])
            
            env.life = env.info['ale.lives']
            r = env.reward
            
            agent.memory.push(i, deepcopy(curr_state), action, r, terminal_state, value, 0, 0)
            if (j == env_mem_size-1):
                _, frame_next_val = agent.get_action(np.float32(env.history[1:,:,:]) / 255.)
                frame_next_vals.append(frame_next_val)
            env.score += r
            env.history[:HISTORY_SIZE, :, :] = env.history[1:,:,:]
            
            if (env.done):
                if (e % 50 == 0):
                    print('now time : ', datetime.now())
                    rewards.append(np.mean(evaluation_reward))
                    episodes.append(e)
                    pylab.plot(episodes, rewards, 'b')
                    pylab.savefig("./save_graph/spaceinvaders_ppo.png")
                    torch.save(agent.policy_net, "./save_model/spaceinvaders_ppo")
                    
                    if np.mean(evaluation_reward) > max_eval:
                        torch.save(agent.policy_net.state_dict(), "./save_model/spaceinvaders_ppo_best")
                        max_eval = float(np.mean(evaluation_reward))
                        reset_count = 0
                    elif e > 5000:
                        reset_count += 1
                        if (reset_count == reset_max):
                            print("Training went nowhere, starting again at best model")
                            agent.policy_net.load_state_dict(torch.load("./save_model/spaceinvaders_ppo_best"))
                            agent.update_target_net()
                            reset_count = 0
                    
                e += 1
                evaluation_reward.append(env.score)
                print("episode:", e, "  score:", env.score,  " epsilon:", agent.epsilon, "   steps:", step,
                  " evaluation reward:", np.mean(evaluation_reward))
                
                env.done = False
                env.score = 0
                env.history = np.zeros([HISTORY_SIZE+1,84,84], dtype=np.uint8)
                env.state = env.reset()
                env.life = number_lives
                get_init_state(env.history, env.state)
                
                
                
    agent.train_policy_net(frame, frame_next_vals)
    agent.update_target_net()

'''
for e in range(EPISODES):
    done = False
    score = 0

    history = np.zeros([5, 84, 84], dtype=np.uint8)
    step = 0
    d = False
    state = env.reset()
    life = number_lives

    get_init_state(history, state)

    while not done:
        step += 1
        frame += 1
        if render_breakout:
            env.render()

        # Select and perform an action
        curr_state = history[3,:,:]
        action, value = agent.get_action(np.float32(history[:4, :, :]) / 255.)

        
        next_state, reward, done, info = env.step(action)

        frame_next_state = get_frame(next_state)
        history[4, :, :] = frame_next_state
        terminal_state = check_live(life, info['ale.lives'])

        life = info['ale.lives']
        #r = np.clip(reward, -1, 1)
        r = reward
        """
        if terminal_state:
            r -= 20
        """
        # Store the transition in memory 
        
        agent.memory.push(deepcopy(curr_state), action, r, terminal_state, value, 0, 0)
        # Start training after random sample generation
        if(frame % train_frame == 0):
            _, frame_next_val = agent.get_action(np.float32(history[1:, :, :]) / 255.)
            agent.train_policy_net(frame, frame_next_val)
            # Update the target network
            agent.update_target_net()
        score += r
        history[:4, :, :] = history[1:, :, :]

        if frame % 50000 == 0:
            print('now time : ', datetime.now())
            rewards.append(np.mean(evaluation_reward))
            episodes.append(e)
            pylab.plot(episodes, rewards, 'b')
            pylab.savefig("./save_graph/spaceinvaders_ppo.png")
            torch.save(agent.policy_net, "./save_model/spaceinvaders_ppo")

        if done:
            evaluation_reward.append(score)
            # every episode, plot the play time
            print("episode:", e, "  score:", score, "  memory length:",
                  len(agent.memory), "  epsilon:", agent.epsilon, "   steps:", step,
                  "    evaluation reward:", np.mean(evaluation_reward))

            # if the mean of scores of last 10 episode is bigger than 400
            # stop training
            if np.mean(evaluation_reward) > 700 and len(evaluation_reward) > 40:
                torch.save(agent.policy_net, "./save_model/spaceinvaders_ppo")
                sys.exit()
'''

  probs = F.softmax(x[:,:self.action_size] - torch.max(x[:,:self.action_size],1)[0].unsqueeze(1))
  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "


Training network. lr: 0.000250. clip: 0.100000


  pol_loss += pol_avg.detach().cpu()[0]
  vf_loss += value_loss.detach().cpu()[0]
  ent_total += ent.detach().cpu()[0]


Iteration 1: Policy loss: -1.446995. Value loss: 7.394697. Entropy: 1.790212.
Iteration 2: Policy loss: -1.446225. Value loss: 6.261855. Entropy: 1.785242.
Iteration 3: Policy loss: -1.420722. Value loss: 5.935813. Entropy: 1.780717.
Training network. lr: 0.000250. clip: 0.100000
Iteration 4: Policy loss: -5.571462. Value loss: 66.334053. Entropy: 1.766625.
Iteration 5: Policy loss: -5.743255. Value loss: 58.227165. Entropy: 1.776056.
Iteration 6: Policy loss: -5.517460. Value loss: 54.872612. Entropy: 1.779818.
Training network. lr: 0.000250. clip: 0.100000
Iteration 7: Policy loss: -2.662891. Value loss: 43.241081. Entropy: 1.764398.
Iteration 8: Policy loss: -2.797832. Value loss: 45.214840. Entropy: 1.773946.
Iteration 9: Policy loss: -2.675509. Value loss: 39.119541. Entropy: 1.762149.
now time :  2019-02-25 18:39:54.087786
episode: 1   score: 160.0  epsilon: 1.0    steps: 596  evaluation reward: 160.0


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Training network. lr: 0.000250. clip: 0.100000
Iteration 10: Policy loss: -1.171639. Value loss: 41.231827. Entropy: 1.752817.
Iteration 11: Policy loss: -1.252181. Value loss: 39.284859. Entropy: 1.755325.
Iteration 12: Policy loss: -1.423982. Value loss: 40.109615. Entropy: 1.746622.
episode: 2   score: 130.0  epsilon: 1.0    steps: 239  evaluation reward: 145.0
episode: 3   score: 180.0  epsilon: 1.0    steps: 843  evaluation reward: 156.66666666666666
Training network. lr: 0.000250. clip: 0.100000
Iteration 13: Policy loss: -2.246829. Value loss: 60.771400. Entropy: 1.761159.
Iteration 14: Policy loss: -2.141927. Value loss: 54.244881. Entropy: 1.755545.
Iteration 15: Policy loss: -2.055633. Value loss: 45.924706. Entropy: 1.743846.
episode: 4   score: 180.0  epsilon: 1.0    steps: 462  evaluation reward: 162.5
episode: 5   score: 180.0  epsilon: 1.0    steps: 752  evaluation reward: 166.0
Training network. lr: 0.000250. clip: 0.100000
Iteration 16: Policy loss: 0.035408. Value los

Iteration 72: Policy loss: 3.240810. Value loss: 47.459927. Entropy: 1.634043.
episode: 28   score: 30.0  epsilon: 1.0    steps: 173  evaluation reward: 192.67857142857142
episode: 29   score: 150.0  epsilon: 1.0    steps: 598  evaluation reward: 191.20689655172413
episode: 30   score: 180.0  epsilon: 1.0    steps: 781  evaluation reward: 190.83333333333334
episode: 31   score: 275.0  epsilon: 1.0    steps: 1005  evaluation reward: 193.5483870967742
Training network. lr: 0.000250. clip: 0.099853
Iteration 73: Policy loss: 3.627331. Value loss: 48.241913. Entropy: 1.699124.
Iteration 74: Policy loss: 3.352917. Value loss: 32.753712. Entropy: 1.667349.
Iteration 75: Policy loss: 3.690335. Value loss: 27.435455. Entropy: 1.681661.
episode: 32   score: 55.0  epsilon: 1.0    steps: 311  evaluation reward: 189.21875
episode: 33   score: 305.0  epsilon: 1.0    steps: 494  evaluation reward: 192.72727272727272
Training network. lr: 0.000250. clip: 0.099853
Iteration 76: Policy loss: 0.006630. 

Training network. lr: 0.000249. clip: 0.099696
Iteration 130: Policy loss: 1.661342. Value loss: 33.874485. Entropy: 1.699284.
Iteration 131: Policy loss: 1.598543. Value loss: 21.786894. Entropy: 1.699288.
Iteration 132: Policy loss: 1.735163. Value loss: 17.794155. Entropy: 1.694766.
episode: 57   score: 335.0  epsilon: 1.0    steps: 371  evaluation reward: 184.82456140350877
episode: 58   score: 190.0  epsilon: 1.0    steps: 983  evaluation reward: 184.91379310344828
Training network. lr: 0.000249. clip: 0.099696
Iteration 133: Policy loss: 0.525161. Value loss: 51.568081. Entropy: 1.689142.
Iteration 134: Policy loss: 0.472924. Value loss: 34.846043. Entropy: 1.676441.
Iteration 135: Policy loss: 0.790344. Value loss: 27.416672. Entropy: 1.678120.
episode: 59   score: 135.0  epsilon: 1.0    steps: 184  evaluation reward: 184.0677966101695
episode: 60   score: 110.0  epsilon: 1.0    steps: 882  evaluation reward: 182.83333333333334
Training network. lr: 0.000249. clip: 0.099696
Iter

episode: 84   score: 465.0  epsilon: 1.0    steps: 875  evaluation reward: 187.20238095238096
Training network. lr: 0.000249. clip: 0.099548
Iteration 190: Policy loss: 0.602559. Value loss: 22.674053. Entropy: 1.473097.
Iteration 191: Policy loss: 0.406330. Value loss: 20.361053. Entropy: 1.476636.
Iteration 192: Policy loss: 0.556847. Value loss: 14.580667. Entropy: 1.463428.
Training network. lr: 0.000249. clip: 0.099548
Iteration 193: Policy loss: -1.455430. Value loss: 26.055922. Entropy: 1.348237.
Iteration 194: Policy loss: -1.363055. Value loss: 18.550213. Entropy: 1.343106.
Iteration 195: Policy loss: -1.398001. Value loss: 18.513401. Entropy: 1.327157.
episode: 85   score: 260.0  epsilon: 1.0    steps: 454  evaluation reward: 188.05882352941177
episode: 86   score: 105.0  epsilon: 1.0    steps: 745  evaluation reward: 187.09302325581396
Training network. lr: 0.000249. clip: 0.099548
Iteration 196: Policy loss: 3.339564. Value loss: 65.835533. Entropy: 1.469316.
Iteration 197:

Iteration 251: Policy loss: 0.268657. Value loss: 7.993870. Entropy: 1.179927.
Iteration 252: Policy loss: 0.416335. Value loss: 7.712707. Entropy: 1.190038.
episode: 110   score: 180.0  epsilon: 1.0    steps: 728  evaluation reward: 179.3
Training network. lr: 0.000248. clip: 0.099235
Iteration 253: Policy loss: 0.451152. Value loss: 7.807262. Entropy: 1.053278.
Iteration 254: Policy loss: 0.366645. Value loss: 6.230274. Entropy: 1.059868.
Iteration 255: Policy loss: 0.410222. Value loss: 5.345844. Entropy: 1.092538.
episode: 111   score: 210.0  epsilon: 1.0    steps: 326  evaluation reward: 179.15
episode: 112   score: 105.0  epsilon: 1.0    steps: 562  evaluation reward: 178.7
Training network. lr: 0.000248. clip: 0.099235
Iteration 256: Policy loss: -0.681511. Value loss: 20.260069. Entropy: 0.977629.
Iteration 257: Policy loss: -0.836573. Value loss: 14.787963. Entropy: 0.998167.
Iteration 258: Policy loss: -0.740807. Value loss: 14.938766. Entropy: 0.963496.
episode: 113   score:

Training network. lr: 0.000248. clip: 0.099088
Iteration 313: Policy loss: 2.044637. Value loss: 17.327808. Entropy: 0.829335.
Iteration 314: Policy loss: 2.029523. Value loss: 9.869965. Entropy: 0.814965.
Iteration 315: Policy loss: 1.908351. Value loss: 8.308845. Entropy: 0.853987.
Training network. lr: 0.000248. clip: 0.099088
Iteration 316: Policy loss: 1.124578. Value loss: 12.760306. Entropy: 0.633052.
Iteration 317: Policy loss: 1.095892. Value loss: 11.729929. Entropy: 0.626465.
Iteration 318: Policy loss: 1.009474. Value loss: 11.874995. Entropy: 0.644320.
episode: 138   score: 105.0  epsilon: 1.0    steps: 40  evaluation reward: 178.0
episode: 139   score: 180.0  epsilon: 1.0    steps: 867  evaluation reward: 178.75
Training network. lr: 0.000248. clip: 0.099088
Iteration 319: Policy loss: 0.116538. Value loss: 14.367938. Entropy: 0.993298.
Iteration 320: Policy loss: 0.235559. Value loss: 10.387097. Entropy: 0.999250.
Iteration 321: Policy loss: 0.161660. Value loss: 10.6730

Training network. lr: 0.000247. clip: 0.098931
Iteration 376: Policy loss: 0.755725. Value loss: 6.381102. Entropy: 1.122677.
Iteration 377: Policy loss: 0.755028. Value loss: 5.400436. Entropy: 1.106681.
Iteration 378: Policy loss: 0.881160. Value loss: 4.898935. Entropy: 1.134820.
episode: 164   score: 210.0  epsilon: 1.0    steps: 249  evaluation reward: 174.95
Training network. lr: 0.000247. clip: 0.098931
Iteration 379: Policy loss: -1.078150. Value loss: 14.331660. Entropy: 0.862760.
Iteration 380: Policy loss: -1.101031. Value loss: 11.723396. Entropy: 0.905521.
Iteration 381: Policy loss: -1.088434. Value loss: 11.871847. Entropy: 0.921247.
episode: 165   score: 165.0  epsilon: 1.0    steps: 311  evaluation reward: 173.65
Training network. lr: 0.000247. clip: 0.098931
Iteration 382: Policy loss: 1.336081. Value loss: 11.377154. Entropy: 1.050846.
Iteration 383: Policy loss: 1.331780. Value loss: 8.071862. Entropy: 1.036355.
Iteration 384: Policy loss: 1.368378. Value loss: 7.37

Iteration 440: Policy loss: -1.012044. Value loss: 299.712921. Entropy: 0.571251.
Iteration 441: Policy loss: -0.208278. Value loss: 195.712845. Entropy: 0.410842.
episode: 189   score: 180.0  epsilon: 1.0    steps: 166  evaluation reward: 181.0
episode: 190   score: 210.0  epsilon: 1.0    steps: 502  evaluation reward: 181.3
episode: 191   score: 210.0  epsilon: 1.0    steps: 735  evaluation reward: 182.3
episode: 192   score: 410.0  epsilon: 1.0    steps: 1005  evaluation reward: 183.8
Training network. lr: 0.000247. clip: 0.098774
Iteration 442: Policy loss: 0.734335. Value loss: 17.207094. Entropy: 0.877990.
Iteration 443: Policy loss: 0.736543. Value loss: 15.523857. Entropy: 0.892018.
Iteration 444: Policy loss: 0.718938. Value loss: 13.771951. Entropy: 0.916106.
Training network. lr: 0.000247. clip: 0.098774
Iteration 445: Policy loss: 0.381624. Value loss: 7.669224. Entropy: 0.633778.
Iteration 446: Policy loss: 0.349319. Value loss: 8.339935. Entropy: 0.616543.
Iteration 447: 

Iteration 502: Policy loss: 1.376376. Value loss: 18.670271. Entropy: 0.839032.
Iteration 503: Policy loss: 1.454679. Value loss: 15.891736. Entropy: 0.848653.
Iteration 504: Policy loss: 1.070795. Value loss: 11.736226. Entropy: 0.839092.
Training network. lr: 0.000246. clip: 0.098470
Iteration 505: Policy loss: 0.316117. Value loss: 19.500633. Entropy: 0.674985.
Iteration 506: Policy loss: 0.250573. Value loss: 18.376005. Entropy: 0.663267.
Iteration 507: Policy loss: 0.215282. Value loss: 17.263618. Entropy: 0.664497.
episode: 216   score: 95.0  epsilon: 1.0    steps: 174  evaluation reward: 192.95
Training network. lr: 0.000246. clip: 0.098470
Iteration 508: Policy loss: 1.637205. Value loss: 15.825288. Entropy: 0.874560.
Iteration 509: Policy loss: 1.582072. Value loss: 9.866695. Entropy: 0.847636.
Iteration 510: Policy loss: 1.567608. Value loss: 9.857630. Entropy: 0.855689.
episode: 217   score: 180.0  epsilon: 1.0    steps: 98  evaluation reward: 193.7
episode: 218   score: 185

episode: 240   score: 210.0  epsilon: 1.0    steps: 315  evaluation reward: 203.95
Training network. lr: 0.000246. clip: 0.098313
Iteration 568: Policy loss: -0.636937. Value loss: 44.097435. Entropy: 1.064749.
Iteration 569: Policy loss: -1.197904. Value loss: 29.090227. Entropy: 1.034243.
Iteration 570: Policy loss: -0.895331. Value loss: 19.351875. Entropy: 1.061089.
Training network. lr: 0.000246. clip: 0.098313
Iteration 571: Policy loss: 2.608484. Value loss: 288.826233. Entropy: 0.955756.
Iteration 572: Policy loss: 2.312109. Value loss: 257.155579. Entropy: 0.807778.
Iteration 573: Policy loss: 2.746093. Value loss: 132.830917. Entropy: 0.851549.
episode: 241   score: 735.0  epsilon: 1.0    steps: 767  evaluation reward: 210.25
Training network. lr: 0.000246. clip: 0.098313
Iteration 574: Policy loss: -0.671981. Value loss: 49.235245. Entropy: 1.085204.
Iteration 575: Policy loss: -0.223348. Value loss: 24.794386. Entropy: 1.055096.
Iteration 576: Policy loss: -0.391772. Value 

Iteration 632: Policy loss: 3.786635. Value loss: 5.819323. Entropy: 1.077302.
Iteration 633: Policy loss: 3.601938. Value loss: 5.146475. Entropy: 1.135568.
episode: 265   score: 105.0  epsilon: 1.0    steps: 287  evaluation reward: 208.05
episode: 266   score: 365.0  epsilon: 1.0    steps: 1013  evaluation reward: 210.15
Training network. lr: 0.000245. clip: 0.098166
Iteration 634: Policy loss: 4.376854. Value loss: 17.391495. Entropy: 1.072004.
Iteration 635: Policy loss: 4.326365. Value loss: 9.552444. Entropy: 1.029243.
Iteration 636: Policy loss: 3.933841. Value loss: 10.146520. Entropy: 1.099871.
episode: 267   score: 155.0  epsilon: 1.0    steps: 482  evaluation reward: 210.15
Training network. lr: 0.000245. clip: 0.098166
Iteration 637: Policy loss: 1.117403. Value loss: 21.288670. Entropy: 1.148572.
Iteration 638: Policy loss: 1.096349. Value loss: 14.156104. Entropy: 1.175792.
Iteration 639: Policy loss: 0.924472. Value loss: 14.846335. Entropy: 1.174303.
episode: 268   scor

Iteration 696: Policy loss: 0.259666. Value loss: 6.439619. Entropy: 0.894459.
Training network. lr: 0.000245. clip: 0.098009
Iteration 697: Policy loss: 1.233210. Value loss: 12.451632. Entropy: 0.584001.
Iteration 698: Policy loss: 1.357694. Value loss: 6.719467. Entropy: 0.603833.
Iteration 699: Policy loss: 1.335443. Value loss: 6.085748. Entropy: 0.590659.
episode: 291   score: 180.0  epsilon: 1.0    steps: 83  evaluation reward: 206.8
episode: 292   score: 165.0  epsilon: 1.0    steps: 707  evaluation reward: 204.35
Training network. lr: 0.000245. clip: 0.098009
Iteration 700: Policy loss: 0.073618. Value loss: 34.765648. Entropy: 0.677720.
Iteration 701: Policy loss: -0.302723. Value loss: 22.453690. Entropy: 0.725559.
Iteration 702: Policy loss: 0.001130. Value loss: 15.503341. Entropy: 0.715273.
episode: 293   score: 135.0  epsilon: 1.0    steps: 162  evaluation reward: 203.9
episode: 294   score: 180.0  epsilon: 1.0    steps: 996  evaluation reward: 203.9
Training network. lr

episode: 316   score: 170.0  epsilon: 1.0    steps: 244  evaluation reward: 207.0
episode: 317   score: 515.0  epsilon: 1.0    steps: 706  evaluation reward: 210.35
Training network. lr: 0.000244. clip: 0.097705
Iteration 760: Policy loss: -4.505770. Value loss: 362.924103. Entropy: 0.887806.
Iteration 761: Policy loss: -3.867243. Value loss: 246.897507. Entropy: 0.949970.
Iteration 762: Policy loss: -4.728679. Value loss: 159.173431. Entropy: 0.956035.
episode: 318   score: 380.0  epsilon: 1.0    steps: 72  evaluation reward: 212.3
episode: 319   score: 160.0  epsilon: 1.0    steps: 999  evaluation reward: 212.35
Training network. lr: 0.000244. clip: 0.097705
Iteration 763: Policy loss: -0.054388. Value loss: 18.754875. Entropy: 0.859732.
Iteration 764: Policy loss: 0.031836. Value loss: 10.088614. Entropy: 0.870201.
Iteration 765: Policy loss: -0.102389. Value loss: 9.090225. Entropy: 0.917261.
Training network. lr: 0.000244. clip: 0.097705
Iteration 766: Policy loss: 0.468463. Value

Iteration 825: Policy loss: 0.285850. Value loss: 121.468132. Entropy: 0.986363.
episode: 340   score: 130.0  epsilon: 1.0    steps: 482  evaluation reward: 205.75
episode: 341   score: 460.0  epsilon: 1.0    steps: 961  evaluation reward: 203.0
Training network. lr: 0.000244. clip: 0.097549
Iteration 826: Policy loss: 3.504089. Value loss: 48.476681. Entropy: 0.874652.
Iteration 827: Policy loss: 3.563432. Value loss: 23.815170. Entropy: 0.846059.
Iteration 828: Policy loss: 3.713620. Value loss: 20.184858. Entropy: 0.863239.
episode: 342   score: 250.0  epsilon: 1.0    steps: 658  evaluation reward: 204.4
Training network. lr: 0.000244. clip: 0.097549
Iteration 829: Policy loss: -0.861620. Value loss: 208.167862. Entropy: 1.014819.
Iteration 830: Policy loss: -1.469862. Value loss: 159.584930. Entropy: 1.012413.
Iteration 831: Policy loss: -0.970864. Value loss: 153.397675. Entropy: 1.004148.
episode: 343   score: 295.0  epsilon: 1.0    steps: 240  evaluation reward: 205.9
episode: 3

Iteration 887: Policy loss: 0.959360. Value loss: 18.417784. Entropy: 1.083453.
Iteration 888: Policy loss: 0.739437. Value loss: 15.917133. Entropy: 1.059605.
Training network. lr: 0.000243. clip: 0.097392
Iteration 889: Policy loss: -1.165611. Value loss: 248.782028. Entropy: 1.082668.
Iteration 890: Policy loss: -0.228072. Value loss: 79.446518. Entropy: 1.060001.
Iteration 891: Policy loss: -1.177929. Value loss: 86.419563. Entropy: 1.112710.
episode: 367   score: 240.0  epsilon: 1.0    steps: 649  evaluation reward: 217.55
Training network. lr: 0.000243. clip: 0.097392
Iteration 892: Policy loss: 1.527296. Value loss: 18.246973. Entropy: 0.806019.
Iteration 893: Policy loss: 1.687675. Value loss: 13.927464. Entropy: 0.857328.
Iteration 894: Policy loss: 1.586190. Value loss: 12.392501. Entropy: 0.810551.
Training network. lr: 0.000243. clip: 0.097392
Iteration 895: Policy loss: 0.204911. Value loss: 29.110867. Entropy: 0.678555.
Iteration 896: Policy loss: 0.355732. Value loss: 26

episode: 395   score: 400.0  epsilon: 1.0    steps: 897  evaluation reward: 219.7
Training network. lr: 0.000243. clip: 0.097244
Iteration 949: Policy loss: 1.540264. Value loss: 37.846848. Entropy: 1.076899.
Iteration 950: Policy loss: 1.575651. Value loss: 22.940815. Entropy: 1.088716.
Iteration 951: Policy loss: 1.522644. Value loss: 15.462969. Entropy: 1.071392.
Training network. lr: 0.000243. clip: 0.097088
Iteration 952: Policy loss: 3.073817. Value loss: 32.611641. Entropy: 1.099908.
Iteration 953: Policy loss: 3.034723. Value loss: 21.897478. Entropy: 1.100896.
Iteration 954: Policy loss: 3.342264. Value loss: 20.614176. Entropy: 1.116136.
Training network. lr: 0.000243. clip: 0.097088
Iteration 955: Policy loss: -0.699732. Value loss: 18.683393. Entropy: 1.096446.
Iteration 956: Policy loss: -0.518999. Value loss: 13.494078. Entropy: 1.080412.
Iteration 957: Policy loss: -0.846726. Value loss: 14.934360. Entropy: 1.071246.
Training network. lr: 0.000243. clip: 0.097088
Iterati

Iteration 1013: Policy loss: -0.660030. Value loss: 11.512567. Entropy: 1.013139.
Iteration 1014: Policy loss: -0.680587. Value loss: 10.246143. Entropy: 1.007048.
episode: 419   score: 460.0  epsilon: 1.0    steps: 385  evaluation reward: 214.95
Training network. lr: 0.000242. clip: 0.096931
Iteration 1015: Policy loss: 0.691698. Value loss: 18.292500. Entropy: 1.025226.
Iteration 1016: Policy loss: 0.636466. Value loss: 13.883596. Entropy: 1.054290.
Iteration 1017: Policy loss: 0.559956. Value loss: 12.658310. Entropy: 1.041582.
episode: 420   score: 210.0  epsilon: 1.0    steps: 627  evaluation reward: 215.5
Training network. lr: 0.000242. clip: 0.096931
Iteration 1018: Policy loss: 0.256221. Value loss: 14.251858. Entropy: 0.885123.
Iteration 1019: Policy loss: 0.284134. Value loss: 10.828312. Entropy: 0.888162.
Iteration 1020: Policy loss: 0.312022. Value loss: 9.566157. Entropy: 0.882923.
episode: 421   score: 240.0  epsilon: 1.0    steps: 90  evaluation reward: 216.7
Training ne

Iteration 1078: Policy loss: 2.597067. Value loss: 34.537670. Entropy: 0.505972.
Iteration 1079: Policy loss: 2.695229. Value loss: 18.129896. Entropy: 0.462568.
Iteration 1080: Policy loss: 2.451393. Value loss: 14.572277. Entropy: 0.516532.
episode: 442   score: 180.0  epsilon: 1.0    steps: 328  evaluation reward: 227.25
Training network. lr: 0.000242. clip: 0.096784
Iteration 1081: Policy loss: 3.176476. Value loss: 60.609287. Entropy: 0.522584.
Iteration 1082: Policy loss: 3.234039. Value loss: 34.799408. Entropy: 0.522631.
Iteration 1083: Policy loss: 2.936750. Value loss: 28.235989. Entropy: 0.540583.
episode: 443   score: 400.0  epsilon: 1.0    steps: 642  evaluation reward: 228.3
Training network. lr: 0.000242. clip: 0.096784
Iteration 1084: Policy loss: 1.833363. Value loss: 24.859383. Entropy: 0.620413.
Iteration 1085: Policy loss: 1.676916. Value loss: 18.056005. Entropy: 0.621156.
Iteration 1086: Policy loss: 1.766652. Value loss: 14.603519. Entropy: 0.627277.
episode: 444

Iteration 1141: Policy loss: 0.629284. Value loss: 16.453812. Entropy: 0.763552.
Iteration 1142: Policy loss: 0.532892. Value loss: 14.070022. Entropy: 0.730369.
Iteration 1143: Policy loss: 0.510721. Value loss: 10.818964. Entropy: 0.733134.
episode: 468   score: 260.0  epsilon: 1.0    steps: 29  evaluation reward: 225.7
episode: 469   score: 210.0  epsilon: 1.0    steps: 139  evaluation reward: 226.8
Training network. lr: 0.000242. clip: 0.096627
Iteration 1144: Policy loss: -0.656915. Value loss: 7.790002. Entropy: 0.713660.
Iteration 1145: Policy loss: -0.724861. Value loss: 6.161287. Entropy: 0.720631.
Iteration 1146: Policy loss: -0.705813. Value loss: 5.209171. Entropy: 0.720766.
episode: 470   score: 210.0  epsilon: 1.0    steps: 896  evaluation reward: 224.8
Training network. lr: 0.000242. clip: 0.096627
Iteration 1147: Policy loss: 0.889707. Value loss: 13.978276. Entropy: 0.610010.
Iteration 1148: Policy loss: 1.123841. Value loss: 15.464385. Entropy: 0.595360.
Iteration 114

Training network. lr: 0.000241. clip: 0.096323
Iteration 1204: Policy loss: 0.227674. Value loss: 14.158964. Entropy: 0.948468.
Iteration 1205: Policy loss: 0.183245. Value loss: 11.116944. Entropy: 0.956901.
Iteration 1206: Policy loss: 0.412201. Value loss: 10.800515. Entropy: 0.955293.
episode: 495   score: 185.0  epsilon: 1.0    steps: 396  evaluation reward: 220.7
Training network. lr: 0.000241. clip: 0.096323
Iteration 1207: Policy loss: -0.260593. Value loss: 5.282744. Entropy: 0.869424.
Iteration 1208: Policy loss: -0.257699. Value loss: 4.371811. Entropy: 0.868844.
Iteration 1209: Policy loss: -0.370678. Value loss: 4.608997. Entropy: 0.878084.
Training network. lr: 0.000241. clip: 0.096323
Iteration 1210: Policy loss: -0.463615. Value loss: 10.746820. Entropy: 0.751513.
Iteration 1211: Policy loss: -0.649870. Value loss: 7.924883. Entropy: 0.719921.
Iteration 1212: Policy loss: -0.394264. Value loss: 6.916552. Entropy: 0.741171.
episode: 496   score: 210.0  epsilon: 1.0    st

Iteration 1267: Policy loss: -1.775961. Value loss: 194.359756. Entropy: 0.968517.
Iteration 1268: Policy loss: -2.308502. Value loss: 159.412918. Entropy: 0.961327.
Iteration 1269: Policy loss: -1.901428. Value loss: 72.207649. Entropy: 0.971311.
Training network. lr: 0.000240. clip: 0.096166
Iteration 1270: Policy loss: 1.791493. Value loss: 37.060562. Entropy: 0.885870.
Iteration 1271: Policy loss: 0.937382. Value loss: 13.799290. Entropy: 0.952855.
Iteration 1272: Policy loss: 1.578555. Value loss: 13.241053. Entropy: 0.921158.
episode: 520   score: 180.0  epsilon: 1.0    steps: 236  evaluation reward: 217.0
episode: 521   score: 80.0  epsilon: 1.0    steps: 494  evaluation reward: 215.4
episode: 522   score: 110.0  epsilon: 1.0    steps: 732  evaluation reward: 214.4
Training network. lr: 0.000240. clip: 0.096166
Iteration 1273: Policy loss: 1.215452. Value loss: 17.205956. Entropy: 0.884510.
Iteration 1274: Policy loss: 1.281350. Value loss: 12.647974. Entropy: 0.862204.
Iteratio

Iteration 1330: Policy loss: 0.282090. Value loss: 14.938943. Entropy: 0.915774.
Iteration 1331: Policy loss: 0.121744. Value loss: 11.722298. Entropy: 0.933922.
Iteration 1332: Policy loss: 0.093708. Value loss: 9.660933. Entropy: 0.952747.
episode: 546   score: 210.0  epsilon: 1.0    steps: 346  evaluation reward: 193.9
Training network. lr: 0.000240. clip: 0.096009
Iteration 1333: Policy loss: -0.731429. Value loss: 25.707735. Entropy: 0.934123.
Iteration 1334: Policy loss: -0.767914. Value loss: 13.874340. Entropy: 0.957352.
Iteration 1335: Policy loss: -0.846737. Value loss: 11.732559. Entropy: 0.950907.
episode: 547   score: 275.0  epsilon: 1.0    steps: 178  evaluation reward: 192.4
episode: 548   score: 180.0  epsilon: 1.0    steps: 664  evaluation reward: 191.35
Training network. lr: 0.000240. clip: 0.096009
Iteration 1336: Policy loss: 0.026422. Value loss: 16.804605. Entropy: 1.040371.
Iteration 1337: Policy loss: -0.190538. Value loss: 12.350044. Entropy: 1.029408.
Iteratio

Training network. lr: 0.000240. clip: 0.095862
Iteration 1393: Policy loss: -0.838581. Value loss: 16.291447. Entropy: 1.047614.
Iteration 1394: Policy loss: -0.750995. Value loss: 10.786757. Entropy: 1.015644.
Iteration 1395: Policy loss: -0.832066. Value loss: 8.485952. Entropy: 1.009982.
episode: 572   score: 180.0  epsilon: 1.0    steps: 71  evaluation reward: 189.75
episode: 573   score: 410.0  epsilon: 1.0    steps: 528  evaluation reward: 192.05
episode: 574   score: 155.0  epsilon: 1.0    steps: 880  evaluation reward: 191.8
Training network. lr: 0.000240. clip: 0.095862
Iteration 1396: Policy loss: 0.095845. Value loss: 16.305729. Entropy: 0.994394.
Iteration 1397: Policy loss: 0.083018. Value loss: 12.122435. Entropy: 0.981064.
Iteration 1398: Policy loss: 0.108393. Value loss: 11.233398. Entropy: 1.004308.
episode: 575   score: 180.0  epsilon: 1.0    steps: 426  evaluation reward: 192.05
Training network. lr: 0.000240. clip: 0.095862
Iteration 1399: Policy loss: -0.725581. V

episode: 598   score: 155.0  epsilon: 1.0    steps: 806  evaluation reward: 197.25
episode: 599   score: 120.0  epsilon: 1.0    steps: 936  evaluation reward: 196.35
Training network. lr: 0.000239. clip: 0.095549
Iteration 1456: Policy loss: 0.042386. Value loss: 10.187260. Entropy: 1.109315.
Iteration 1457: Policy loss: -0.003726. Value loss: 6.250106. Entropy: 1.131262.
Iteration 1458: Policy loss: 0.013805. Value loss: 5.917012. Entropy: 1.150441.
Training network. lr: 0.000239. clip: 0.095549
Iteration 1459: Policy loss: 1.267827. Value loss: 11.745013. Entropy: 0.814507.
Iteration 1460: Policy loss: 0.824212. Value loss: 6.630285. Entropy: 0.850136.
Iteration 1461: Policy loss: 1.312910. Value loss: 4.966064. Entropy: 0.825470.
Training network. lr: 0.000239. clip: 0.095549
Iteration 1462: Policy loss: -1.562211. Value loss: 135.922897. Entropy: 0.912763.
Iteration 1463: Policy loss: -0.997242. Value loss: 24.769762. Entropy: 0.933532.
Iteration 1464: Policy loss: -1.502657. Value

Iteration 1519: Policy loss: 0.011023. Value loss: 10.754158. Entropy: 0.729171.
Iteration 1520: Policy loss: 0.060859. Value loss: 6.676558. Entropy: 0.716486.
Iteration 1521: Policy loss: 0.077636. Value loss: 6.186719. Entropy: 0.761490.
episode: 623   score: 265.0  epsilon: 1.0    steps: 557  evaluation reward: 198.65
Training network. lr: 0.000239. clip: 0.095401
Iteration 1522: Policy loss: 0.149054. Value loss: 3.968026. Entropy: 1.079093.
Iteration 1523: Policy loss: 0.250163. Value loss: 2.696465. Entropy: 1.015222.
Iteration 1524: Policy loss: 0.173003. Value loss: 2.786646. Entropy: 1.035712.
Training network. lr: 0.000239. clip: 0.095401
Iteration 1525: Policy loss: 0.137095. Value loss: 13.264050. Entropy: 0.866165.
Iteration 1526: Policy loss: 0.107875. Value loss: 8.314763. Entropy: 0.833132.
Iteration 1527: Policy loss: 0.292400. Value loss: 7.014889. Entropy: 0.898522.
episode: 624   score: 155.0  epsilon: 1.0    steps: 132  evaluation reward: 195.8
episode: 625   scor

Iteration 1583: Policy loss: -0.437589. Value loss: 3.928986. Entropy: 0.835755.
Iteration 1584: Policy loss: -0.523152. Value loss: 4.533579. Entropy: 0.832917.
episode: 649   score: 145.0  epsilon: 1.0    steps: 114  evaluation reward: 193.2
episode: 650   score: 180.0  epsilon: 1.0    steps: 310  evaluation reward: 193.2
now time :  2019-02-25 19:09:28.685877
episode: 651   score: 155.0  epsilon: 1.0    steps: 690  evaluation reward: 192.65
episode: 652   score: 105.0  epsilon: 1.0    steps: 956  evaluation reward: 192.65
Training network. lr: 0.000238. clip: 0.095245
Iteration 1585: Policy loss: 0.274248. Value loss: 9.717252. Entropy: 0.940720.
Iteration 1586: Policy loss: 0.136815. Value loss: 7.937088. Entropy: 0.951505.
Iteration 1587: Policy loss: 0.055931. Value loss: 7.129829. Entropy: 0.930873.
Training network. lr: 0.000238. clip: 0.095245
Iteration 1588: Policy loss: -0.906621. Value loss: 5.496384. Entropy: 0.662111.
Iteration 1589: Policy loss: -0.916894. Value loss: 4.

Training network. lr: 0.000238. clip: 0.095088
Iteration 1645: Policy loss: 0.193395. Value loss: 5.921730. Entropy: 0.591835.
Iteration 1646: Policy loss: 0.154041. Value loss: 3.841994. Entropy: 0.634391.
Iteration 1647: Policy loss: 0.224866. Value loss: 4.383388. Entropy: 0.609459.
episode: 676   score: 155.0  epsilon: 1.0    steps: 912  evaluation reward: 190.7
Training network. lr: 0.000238. clip: 0.095088
Iteration 1648: Policy loss: 0.648089. Value loss: 10.299220. Entropy: 0.908935.
Iteration 1649: Policy loss: 0.637681. Value loss: 7.506359. Entropy: 0.919133.
Iteration 1650: Policy loss: 0.627466. Value loss: 7.725567. Entropy: 0.933959.
episode: 677   score: 210.0  epsilon: 1.0    steps: 472  evaluation reward: 190.7
episode: 678   score: 210.0  epsilon: 1.0    steps: 817  evaluation reward: 191.2
Training network. lr: 0.000237. clip: 0.094940
Iteration 1651: Policy loss: 0.473869. Value loss: 10.432400. Entropy: 1.040957.
Iteration 1652: Policy loss: 0.376654. Value loss: 

Training network. lr: 0.000237. clip: 0.094784
Iteration 1708: Policy loss: -0.031790. Value loss: 8.206283. Entropy: 0.894950.
Iteration 1709: Policy loss: -0.144885. Value loss: 6.137877. Entropy: 0.887691.
Iteration 1710: Policy loss: -0.023842. Value loss: 5.103371. Entropy: 0.884032.
episode: 702   score: 460.0  epsilon: 1.0    steps: 896  evaluation reward: 191.2
Training network. lr: 0.000237. clip: 0.094784
Iteration 1711: Policy loss: 1.968519. Value loss: 30.163197. Entropy: 0.908613.
Iteration 1712: Policy loss: 1.885305. Value loss: 10.311728. Entropy: 0.906859.
Iteration 1713: Policy loss: 1.738776. Value loss: 9.123777. Entropy: 0.929765.
episode: 703   score: 180.0  epsilon: 1.0    steps: 374  evaluation reward: 191.8
Training network. lr: 0.000237. clip: 0.094784
Iteration 1714: Policy loss: -2.907147. Value loss: 291.408508. Entropy: 0.558947.
Iteration 1715: Policy loss: -2.716036. Value loss: 195.142914. Entropy: 0.530236.
Iteration 1716: Policy loss: -2.964897. Valu

episode: 725   score: 120.0  epsilon: 1.0    steps: 1012  evaluation reward: 207.4
Training network. lr: 0.000237. clip: 0.094627
Iteration 1774: Policy loss: -2.418637. Value loss: 241.569016. Entropy: 0.671922.
Iteration 1775: Policy loss: -1.629223. Value loss: 118.254181. Entropy: 0.671819.
Iteration 1776: Policy loss: -3.164479. Value loss: 87.922188. Entropy: 0.698680.
episode: 726   score: 210.0  epsilon: 1.0    steps: 166  evaluation reward: 207.7
episode: 727   score: 410.0  epsilon: 1.0    steps: 350  evaluation reward: 210.6
episode: 728   score: 285.0  epsilon: 1.0    steps: 614  evaluation reward: 211.65
episode: 729   score: 410.0  epsilon: 1.0    steps: 803  evaluation reward: 213.65
Training network. lr: 0.000237. clip: 0.094627
Iteration 1777: Policy loss: -0.384958. Value loss: 16.973204. Entropy: 0.988249.
Iteration 1778: Policy loss: -0.406169. Value loss: 14.874666. Entropy: 0.952480.
Iteration 1779: Policy loss: -0.339337. Value loss: 10.297225. Entropy: 0.941431.

now time :  2019-02-25 19:14:11.423108
episode: 751   score: 210.0  epsilon: 1.0    steps: 899  evaluation reward: 220.65
Training network. lr: 0.000236. clip: 0.094480
Iteration 1837: Policy loss: -0.380307. Value loss: 11.371275. Entropy: 0.722403.
Iteration 1838: Policy loss: -0.421842. Value loss: 8.958973. Entropy: 0.685571.
Iteration 1839: Policy loss: -0.446278. Value loss: 7.856825. Entropy: 0.732918.
episode: 752   score: 185.0  epsilon: 1.0    steps: 294  evaluation reward: 221.45
episode: 753   score: 180.0  epsilon: 1.0    steps: 532  evaluation reward: 221.7
Training network. lr: 0.000236. clip: 0.094480
Iteration 1840: Policy loss: -0.973042. Value loss: 9.494258. Entropy: 0.832595.
Iteration 1841: Policy loss: -0.938946. Value loss: 7.417164. Entropy: 0.843729.
Iteration 1842: Policy loss: -0.964166. Value loss: 6.309573. Entropy: 0.834751.
episode: 754   score: 180.0  epsilon: 1.0    steps: 671  evaluation reward: 221.4
Training network. lr: 0.000236. clip: 0.094480
Ite

episode: 777   score: 180.0  epsilon: 1.0    steps: 391  evaluation reward: 230.55
episode: 778   score: 180.0  epsilon: 1.0    steps: 702  evaluation reward: 230.25
Training network. lr: 0.000236. clip: 0.094323
Iteration 1900: Policy loss: 1.093903. Value loss: 7.788144. Entropy: 0.646493.
Iteration 1901: Policy loss: 0.954424. Value loss: 5.812388. Entropy: 0.662646.
Iteration 1902: Policy loss: 0.939326. Value loss: 4.994433. Entropy: 0.724749.
Training network. lr: 0.000235. clip: 0.094166
Iteration 1903: Policy loss: 1.422650. Value loss: 12.450649. Entropy: 0.651102.
Iteration 1904: Policy loss: 1.644214. Value loss: 10.919611. Entropy: 0.659101.
Iteration 1905: Policy loss: 1.610463. Value loss: 7.637060. Entropy: 0.664256.
Training network. lr: 0.000235. clip: 0.094166
Iteration 1906: Policy loss: 0.943855. Value loss: 6.878540. Entropy: 0.886176.
Iteration 1907: Policy loss: 0.897034. Value loss: 5.283123. Entropy: 0.919588.
Iteration 1908: Policy loss: 0.863737. Value loss: 

episode: 803   score: 180.0  epsilon: 1.0    steps: 492  evaluation reward: 225.2
Training network. lr: 0.000235. clip: 0.094019
Iteration 1963: Policy loss: -0.253485. Value loss: 7.181131. Entropy: 0.959949.
Iteration 1964: Policy loss: -0.349792. Value loss: 4.903742. Entropy: 0.943788.
Iteration 1965: Policy loss: -0.347830. Value loss: 4.016987. Entropy: 0.943767.
episode: 804   score: 260.0  epsilon: 1.0    steps: 796  evaluation reward: 223.7
Training network. lr: 0.000235. clip: 0.094019
Iteration 1966: Policy loss: -0.379604. Value loss: 8.860384. Entropy: 0.923637.
Iteration 1967: Policy loss: -0.341980. Value loss: 6.980501. Entropy: 0.921169.
Iteration 1968: Policy loss: -0.479643. Value loss: 4.198288. Entropy: 0.933924.
episode: 805   score: 210.0  epsilon: 1.0    steps: 28  evaluation reward: 221.7
episode: 806   score: 155.0  epsilon: 1.0    steps: 143  evaluation reward: 221.4
episode: 807   score: 155.0  epsilon: 1.0    steps: 380  evaluation reward: 220.85
Training n

episode: 829   score: 155.0  epsilon: 1.0    steps: 176  evaluation reward: 205.05
Training network. lr: 0.000235. clip: 0.093862
Iteration 2026: Policy loss: 0.716743. Value loss: 17.288038. Entropy: 0.688575.
Iteration 2027: Policy loss: 0.729206. Value loss: 12.823090. Entropy: 0.663490.
Iteration 2028: Policy loss: 0.608604. Value loss: 9.735865. Entropy: 0.684888.
episode: 830   score: 210.0  epsilon: 1.0    steps: 294  evaluation reward: 205.35
episode: 831   score: 490.0  epsilon: 1.0    steps: 701  evaluation reward: 208.45
episode: 832   score: 210.0  epsilon: 1.0    steps: 781  evaluation reward: 208.75
Training network. lr: 0.000235. clip: 0.093862
Iteration 2029: Policy loss: -0.426120. Value loss: 10.171779. Entropy: 0.816848.
Iteration 2030: Policy loss: -0.478565. Value loss: 7.578917. Entropy: 0.824629.
Iteration 2031: Policy loss: -0.486942. Value loss: 7.198960. Entropy: 0.817542.
Training network. lr: 0.000235. clip: 0.093862
Iteration 2032: Policy loss: 0.529324. Va

episode: 856   score: 245.0  epsilon: 1.0    steps: 648  evaluation reward: 204.85
Training network. lr: 0.000234. clip: 0.093705
Iteration 2089: Policy loss: -2.210908. Value loss: 176.249054. Entropy: 0.771775.
Iteration 2090: Policy loss: -0.994274. Value loss: 50.640610. Entropy: 0.804296.
Iteration 2091: Policy loss: -1.896159. Value loss: 33.721294. Entropy: 0.843877.
episode: 857   score: 465.0  epsilon: 1.0    steps: 280  evaluation reward: 207.7
episode: 858   score: 415.0  epsilon: 1.0    steps: 427  evaluation reward: 209.75
episode: 859   score: 125.0  epsilon: 1.0    steps: 774  evaluation reward: 206.9
Training network. lr: 0.000234. clip: 0.093705
Iteration 2092: Policy loss: 1.641539. Value loss: 26.507797. Entropy: 1.074663.
Iteration 2093: Policy loss: 1.721892. Value loss: 17.862719. Entropy: 1.044716.
Iteration 2094: Policy loss: 1.635293. Value loss: 17.530212. Entropy: 1.071164.
Training network. lr: 0.000234. clip: 0.093705
Iteration 2095: Policy loss: 1.647604. 

Iteration 2150: Policy loss: -2.576711. Value loss: 50.838917. Entropy: 0.804672.
Iteration 2151: Policy loss: -2.050696. Value loss: 41.731827. Entropy: 0.836205.
Training network. lr: 0.000234. clip: 0.093401
Iteration 2152: Policy loss: -0.578362. Value loss: 173.082443. Entropy: 0.904788.
Iteration 2153: Policy loss: -0.749674. Value loss: 55.912552. Entropy: 0.849973.
Iteration 2154: Policy loss: -1.176742. Value loss: 47.793255. Entropy: 0.849598.
episode: 884   score: 475.0  epsilon: 1.0    steps: 259  evaluation reward: 206.45
episode: 885   score: 230.0  epsilon: 1.0    steps: 618  evaluation reward: 207.2
Training network. lr: 0.000234. clip: 0.093401
Iteration 2155: Policy loss: 1.350469. Value loss: 24.033319. Entropy: 0.928273.
Iteration 2156: Policy loss: 1.477335. Value loss: 19.760885. Entropy: 0.939573.
Iteration 2157: Policy loss: 1.366175. Value loss: 16.029533. Entropy: 0.950647.
episode: 886   score: 570.0  epsilon: 1.0    steps: 711  evaluation reward: 210.3
Train

Iteration 2212: Policy loss: 0.452387. Value loss: 28.656990. Entropy: 0.801350.
Iteration 2213: Policy loss: 0.508657. Value loss: 16.590565. Entropy: 0.823994.
Iteration 2214: Policy loss: 0.402384. Value loss: 13.331876. Entropy: 0.812683.
episode: 910   score: 85.0  epsilon: 1.0    steps: 312  evaluation reward: 215.5
Training network. lr: 0.000233. clip: 0.093245
Iteration 2215: Policy loss: 2.552080. Value loss: 41.001514. Entropy: 0.727548.
Iteration 2216: Policy loss: 2.656335. Value loss: 21.315384. Entropy: 0.688725.
Iteration 2217: Policy loss: 2.754773. Value loss: 16.730436. Entropy: 0.761861.
episode: 911   score: 460.0  epsilon: 1.0    steps: 146  evaluation reward: 218.3
Training network. lr: 0.000233. clip: 0.093245
Iteration 2218: Policy loss: 1.468108. Value loss: 24.615143. Entropy: 0.836215.
Iteration 2219: Policy loss: 1.333639. Value loss: 18.083662. Entropy: 0.877080.
Iteration 2220: Policy loss: 1.311131. Value loss: 14.401433. Entropy: 0.858513.
episode: 912  

Iteration 2276: Policy loss: 1.009256. Value loss: 14.479310. Entropy: 1.050959.
Iteration 2277: Policy loss: 0.820468. Value loss: 10.712809. Entropy: 1.087166.
episode: 935   score: 225.0  epsilon: 1.0    steps: 4  evaluation reward: 220.45
episode: 936   score: 105.0  epsilon: 1.0    steps: 594  evaluation reward: 219.4
episode: 937   score: 240.0  epsilon: 1.0    steps: 954  evaluation reward: 220.0
Training network. lr: 0.000233. clip: 0.093097
Iteration 2278: Policy loss: -3.858988. Value loss: 458.168518. Entropy: 1.094912.
Iteration 2279: Policy loss: -2.944789. Value loss: 276.496521. Entropy: 1.009365.
Iteration 2280: Policy loss: -4.006985. Value loss: 300.274445. Entropy: 0.902378.
episode: 938   score: 75.0  epsilon: 1.0    steps: 358  evaluation reward: 219.7
Training network. lr: 0.000233. clip: 0.093097
Iteration 2281: Policy loss: 0.695332. Value loss: 76.599388. Entropy: 0.979156.
Iteration 2282: Policy loss: 0.807225. Value loss: 47.286701. Entropy: 0.950157.
Iterati

Iteration 2335: Policy loss: 0.010577. Value loss: 23.815159. Entropy: 0.933968.
Iteration 2336: Policy loss: 0.058831. Value loss: 19.610117. Entropy: 0.892359.
Iteration 2337: Policy loss: 0.084140. Value loss: 17.117313. Entropy: 0.902812.
episode: 965   score: 205.0  epsilon: 1.0    steps: 200  evaluation reward: 221.95
episode: 966   score: 175.0  epsilon: 1.0    steps: 917  evaluation reward: 221.85
Training network. lr: 0.000232. clip: 0.092941
Iteration 2338: Policy loss: 0.949565. Value loss: 22.100723. Entropy: 0.907023.
Iteration 2339: Policy loss: 1.015906. Value loss: 14.079199. Entropy: 0.880201.
Iteration 2340: Policy loss: 1.040607. Value loss: 12.531670. Entropy: 0.873869.
Training network. lr: 0.000232. clip: 0.092941
Iteration 2341: Policy loss: -0.733909. Value loss: 52.962578. Entropy: 0.969188.
Iteration 2342: Policy loss: -0.993907. Value loss: 37.562981. Entropy: 0.975548.
Iteration 2343: Policy loss: -1.202017. Value loss: 30.395012. Entropy: 0.956221.
episode:

Iteration 2397: Policy loss: 1.169144. Value loss: 14.268110. Entropy: 0.903669.
Training network. lr: 0.000232. clip: 0.092784
Iteration 2398: Policy loss: -0.416749. Value loss: 36.099316. Entropy: 0.772958.
Iteration 2399: Policy loss: -0.255421. Value loss: 26.599281. Entropy: 0.765316.
Iteration 2400: Policy loss: -0.261977. Value loss: 21.351925. Entropy: 0.774187.
episode: 992   score: 205.0  epsilon: 1.0    steps: 81  evaluation reward: 228.15
episode: 993   score: 370.0  epsilon: 1.0    steps: 307  evaluation reward: 230.5
episode: 994   score: 165.0  epsilon: 1.0    steps: 804  evaluation reward: 230.25
Training network. lr: 0.000232. clip: 0.092636
Iteration 2401: Policy loss: 1.594190. Value loss: 213.593643. Entropy: 1.033839.
Iteration 2402: Policy loss: 1.286244. Value loss: 199.741714. Entropy: 1.016686.
Iteration 2403: Policy loss: 1.141465. Value loss: 156.140488. Entropy: 1.001948.
episode: 995   score: 155.0  epsilon: 1.0    steps: 591  evaluation reward: 231.05
epi

episode: 1021   score: 75.0  epsilon: 1.0    steps: 247  evaluation reward: 214.75
Training network. lr: 0.000231. clip: 0.092480
Iteration 2458: Policy loss: 1.480857. Value loss: 32.177254. Entropy: 0.861093.
Iteration 2459: Policy loss: 1.493356. Value loss: 13.737444. Entropy: 0.842431.
Iteration 2460: Policy loss: 1.380728. Value loss: 10.709804. Entropy: 0.830824.
episode: 1022   score: 205.0  epsilon: 1.0    steps: 602  evaluation reward: 212.3
episode: 1023   score: 150.0  epsilon: 1.0    steps: 859  evaluation reward: 211.6
Training network. lr: 0.000231. clip: 0.092480
Iteration 2461: Policy loss: -0.647640. Value loss: 28.765938. Entropy: 0.771653.
Iteration 2462: Policy loss: -0.807406. Value loss: 18.571646. Entropy: 0.762622.
Iteration 2463: Policy loss: -0.883497. Value loss: 14.780798. Entropy: 0.757125.
episode: 1024   score: 185.0  epsilon: 1.0    steps: 407  evaluation reward: 210.3
Training network. lr: 0.000231. clip: 0.092480
Iteration 2464: Policy loss: 0.083628.

episode: 1047   score: 190.0  epsilon: 1.0    steps: 386  evaluation reward: 208.8
episode: 1048   score: 230.0  epsilon: 1.0    steps: 936  evaluation reward: 210.0
Training network. lr: 0.000231. clip: 0.092323
Iteration 2521: Policy loss: 0.725672. Value loss: 25.292866. Entropy: 1.143489.
Iteration 2522: Policy loss: 0.564760. Value loss: 11.624939. Entropy: 1.151441.
Iteration 2523: Policy loss: 0.679261. Value loss: 9.844065. Entropy: 1.164894.
episode: 1049   score: 70.0  epsilon: 1.0    steps: 58  evaluation reward: 208.15
episode: 1050   score: 270.0  epsilon: 1.0    steps: 279  evaluation reward: 209.2
now time :  2019-02-25 19:27:02.144937
episode: 1051   score: 235.0  epsilon: 1.0    steps: 533  evaluation reward: 208.7
Training network. lr: 0.000231. clip: 0.092323
Iteration 2524: Policy loss: 2.060709. Value loss: 24.253767. Entropy: 0.841724.
Iteration 2525: Policy loss: 1.883155. Value loss: 14.340004. Entropy: 0.842929.
Iteration 2526: Policy loss: 2.218159. Value loss

Iteration 2580: Policy loss: -3.998431. Value loss: 136.410858. Entropy: 0.910669.
episode: 1077   score: 120.0  epsilon: 1.0    steps: 347  evaluation reward: 195.65
episode: 1078   score: 115.0  epsilon: 1.0    steps: 860  evaluation reward: 194.3
episode: 1079   score: 300.0  epsilon: 1.0    steps: 907  evaluation reward: 194.15
Training network. lr: 0.000230. clip: 0.092176
Iteration 2581: Policy loss: 0.880338. Value loss: 19.211895. Entropy: 0.781353.
Iteration 2582: Policy loss: 0.868049. Value loss: 10.751359. Entropy: 0.814033.
Iteration 2583: Policy loss: 0.887824. Value loss: 8.339018. Entropy: 0.822752.
episode: 1080   score: 310.0  epsilon: 1.0    steps: 659  evaluation reward: 192.6
Training network. lr: 0.000230. clip: 0.092176
Iteration 2584: Policy loss: -0.189596. Value loss: 27.343336. Entropy: 0.776964.
Iteration 2585: Policy loss: -0.145922. Value loss: 14.545242. Entropy: 0.833082.
Iteration 2586: Policy loss: -0.259262. Value loss: 10.997845. Entropy: 0.811780.
e

Training network. lr: 0.000230. clip: 0.092019
Iteration 2641: Policy loss: 0.463513. Value loss: 42.761135. Entropy: 0.791910.
Iteration 2642: Policy loss: 0.342273. Value loss: 24.863064. Entropy: 0.790358.
Iteration 2643: Policy loss: 0.135857. Value loss: 20.978130. Entropy: 0.781976.
episode: 1104   score: 220.0  epsilon: 1.0    steps: 239  evaluation reward: 201.75
episode: 1105   score: 60.0  epsilon: 1.0    steps: 356  evaluation reward: 200.45
Training network. lr: 0.000230. clip: 0.092019
Iteration 2644: Policy loss: 1.328859. Value loss: 40.781490. Entropy: 0.846687.
Iteration 2645: Policy loss: 1.929872. Value loss: 21.421097. Entropy: 0.811046.
Iteration 2646: Policy loss: 1.724007. Value loss: 19.500700. Entropy: 0.833626.
episode: 1106   score: 170.0  epsilon: 1.0    steps: 519  evaluation reward: 200.85
Training network. lr: 0.000230. clip: 0.092019
Iteration 2647: Policy loss: 1.793515. Value loss: 34.793861. Entropy: 0.843584.
Iteration 2648: Policy loss: 1.310735. Va

Iteration 2701: Policy loss: 2.114470. Value loss: 29.891884. Entropy: 0.614609.
Iteration 2702: Policy loss: 1.890383. Value loss: 14.359991. Entropy: 0.631566.
Iteration 2703: Policy loss: 2.171020. Value loss: 11.216838. Entropy: 0.606051.
episode: 1132   score: 225.0  epsilon: 1.0    steps: 990  evaluation reward: 216.35
Training network. lr: 0.000229. clip: 0.091715
Iteration 2704: Policy loss: 0.307891. Value loss: 31.277531. Entropy: 0.541500.
Iteration 2705: Policy loss: 0.460410. Value loss: 21.174202. Entropy: 0.529276.
Iteration 2706: Policy loss: 0.533165. Value loss: 15.966892. Entropy: 0.536110.
Training network. lr: 0.000229. clip: 0.091715
Iteration 2707: Policy loss: -0.248437. Value loss: 38.403591. Entropy: 0.783012.
Iteration 2708: Policy loss: -0.680093. Value loss: 25.648161. Entropy: 0.816609.
Iteration 2709: Policy loss: -0.003983. Value loss: 24.752537. Entropy: 0.785753.
episode: 1133   score: 105.0  epsilon: 1.0    steps: 291  evaluation reward: 215.45
episod

Iteration 2763: Policy loss: 1.126503. Value loss: 20.062756. Entropy: 0.953265.
episode: 1158   score: 230.0  epsilon: 1.0    steps: 258  evaluation reward: 219.65
episode: 1159   score: 185.0  epsilon: 1.0    steps: 718  evaluation reward: 219.55
episode: 1160   score: 275.0  epsilon: 1.0    steps: 773  evaluation reward: 220.85
Training network. lr: 0.000229. clip: 0.091558
Iteration 2764: Policy loss: -0.191535. Value loss: 14.256827. Entropy: 0.671655.
Iteration 2765: Policy loss: 0.041501. Value loss: 8.403933. Entropy: 0.651636.
Iteration 2766: Policy loss: 0.002474. Value loss: 8.401904. Entropy: 0.650463.
Training network. lr: 0.000229. clip: 0.091558
Iteration 2767: Policy loss: 1.792115. Value loss: 21.700211. Entropy: 0.891616.
Iteration 2768: Policy loss: 2.205096. Value loss: 17.571115. Entropy: 0.920692.
Iteration 2769: Policy loss: 2.008843. Value loss: 16.087856. Entropy: 0.893884.
Training network. lr: 0.000229. clip: 0.091558
Iteration 2770: Policy loss: -1.253142. V

Training network. lr: 0.000229. clip: 0.091401
Iteration 2827: Policy loss: 1.753661. Value loss: 8.652190. Entropy: 0.810483.
Iteration 2828: Policy loss: 1.824307. Value loss: 6.486760. Entropy: 0.807893.
Iteration 2829: Policy loss: 1.941023. Value loss: 4.523484. Entropy: 0.816754.
episode: 1183   score: 105.0  epsilon: 1.0    steps: 770  evaluation reward: 215.7
Training network. lr: 0.000229. clip: 0.091401
Iteration 2830: Policy loss: -2.835052. Value loss: 225.153717. Entropy: 0.834421.
Iteration 2831: Policy loss: -2.921358. Value loss: 229.287338. Entropy: 0.809865.
Iteration 2832: Policy loss: -2.442691. Value loss: 104.330894. Entropy: 0.441035.
episode: 1184   score: 225.0  epsilon: 1.0    steps: 246  evaluation reward: 216.95
Training network. lr: 0.000229. clip: 0.091401
Iteration 2833: Policy loss: 0.006310. Value loss: 29.713276. Entropy: 0.711425.
Iteration 2834: Policy loss: -0.154356. Value loss: 16.631912. Entropy: 0.724316.
Iteration 2835: Policy loss: 0.157265. V

Iteration 2888: Policy loss: -0.848703. Value loss: 16.655890. Entropy: 0.875377.
Iteration 2889: Policy loss: -0.747822. Value loss: 12.307539. Entropy: 0.864075.
episode: 1210   score: 90.0  epsilon: 1.0    steps: 66  evaluation reward: 207.1
episode: 1211   score: 265.0  epsilon: 1.0    steps: 971  evaluation reward: 206.75
Training network. lr: 0.000228. clip: 0.091254
Iteration 2890: Policy loss: -0.098091. Value loss: 20.789972. Entropy: 0.920588.
Iteration 2891: Policy loss: 0.110379. Value loss: 8.354395. Entropy: 0.919643.
Iteration 2892: Policy loss: 0.060425. Value loss: 8.025658. Entropy: 0.914214.
Training network. lr: 0.000228. clip: 0.091254
Iteration 2893: Policy loss: -0.581834. Value loss: 25.228746. Entropy: 1.070351.
Iteration 2894: Policy loss: -0.530033. Value loss: 13.493002. Entropy: 1.080003.
Iteration 2895: Policy loss: -0.522137. Value loss: 10.619376. Entropy: 1.077194.
episode: 1212   score: 180.0  epsilon: 1.0    steps: 796  evaluation reward: 205.4
Traini

Iteration 2953: Policy loss: 0.378140. Value loss: 16.246920. Entropy: 0.832906.
Iteration 2954: Policy loss: 0.297964. Value loss: 9.642452. Entropy: 0.813841.
Iteration 2955: Policy loss: 0.378633. Value loss: 8.717864. Entropy: 0.823287.
episode: 1233   score: 50.0  epsilon: 1.0    steps: 562  evaluation reward: 208.65
Training network. lr: 0.000227. clip: 0.090941
Iteration 2956: Policy loss: 1.252321. Value loss: 21.291021. Entropy: 0.933554.
Iteration 2957: Policy loss: 1.345525. Value loss: 13.951058. Entropy: 0.947580.
Iteration 2958: Policy loss: 1.081517. Value loss: 12.324223. Entropy: 0.945593.
episode: 1234   score: 135.0  epsilon: 1.0    steps: 496  evaluation reward: 208.95
episode: 1235   score: 285.0  epsilon: 1.0    steps: 795  evaluation reward: 210.85
Training network. lr: 0.000227. clip: 0.090941
Iteration 2959: Policy loss: 1.247323. Value loss: 11.147813. Entropy: 0.828683.
Iteration 2960: Policy loss: 1.348488. Value loss: 4.607421. Entropy: 0.872080.
Iteration 

episode: 1259   score: 180.0  epsilon: 1.0    steps: 353  evaluation reward: 205.1
Training network. lr: 0.000227. clip: 0.090793
Iteration 3016: Policy loss: 1.425166. Value loss: 15.222254. Entropy: 0.644505.
Iteration 3017: Policy loss: 1.346359. Value loss: 10.893215. Entropy: 0.663329.
Iteration 3018: Policy loss: 1.401648. Value loss: 9.993553. Entropy: 0.659369.
episode: 1260   score: 210.0  epsilon: 1.0    steps: 4  evaluation reward: 204.45
Training network. lr: 0.000227. clip: 0.090793
Iteration 3019: Policy loss: -3.485860. Value loss: 26.051132. Entropy: 0.663595.
Iteration 3020: Policy loss: -3.471474. Value loss: 12.821436. Entropy: 0.653439.
Iteration 3021: Policy loss: -3.407848. Value loss: 9.472459. Entropy: 0.647523.
episode: 1261   score: 290.0  epsilon: 1.0    steps: 829  evaluation reward: 205.65
Training network. lr: 0.000227. clip: 0.090793
Iteration 3022: Policy loss: -4.408550. Value loss: 175.415436. Entropy: 0.614439.
Iteration 3023: Policy loss: -3.615410. 

Iteration 3078: Policy loss: 0.169944. Value loss: 5.868052. Entropy: 0.780933.
episode: 1286   score: 210.0  epsilon: 1.0    steps: 629  evaluation reward: 212.6
Training network. lr: 0.000227. clip: 0.090637
Iteration 3079: Policy loss: 1.008137. Value loss: 8.833015. Entropy: 0.787580.
Iteration 3080: Policy loss: 1.040024. Value loss: 5.390838. Entropy: 0.778786.
Iteration 3081: Policy loss: 0.906864. Value loss: 5.214595. Entropy: 0.798417.
episode: 1287   score: 80.0  epsilon: 1.0    steps: 103  evaluation reward: 210.9
episode: 1288   score: 50.0  epsilon: 1.0    steps: 135  evaluation reward: 206.95
Training network. lr: 0.000227. clip: 0.090637
Iteration 3082: Policy loss: 0.145028. Value loss: 10.896179. Entropy: 0.840097.
Iteration 3083: Policy loss: 0.244910. Value loss: 6.127170. Entropy: 0.862992.
Iteration 3084: Policy loss: 0.218970. Value loss: 3.954879. Entropy: 0.854525.
episode: 1289   score: 145.0  epsilon: 1.0    steps: 737  evaluation reward: 207.4
Training netwo

Iteration 3141: Policy loss: 0.364038. Value loss: 6.239238. Entropy: 0.646564.
episode: 1311   score: 210.0  epsilon: 1.0    steps: 29  evaluation reward: 211.95
Training network. lr: 0.000226. clip: 0.090480
Iteration 3142: Policy loss: -3.130781. Value loss: 122.970055. Entropy: 0.721439.
Iteration 3143: Policy loss: -2.949334. Value loss: 35.962257. Entropy: 0.759153.
Iteration 3144: Policy loss: -2.926271. Value loss: 23.995987. Entropy: 0.686494.
episode: 1312   score: 185.0  epsilon: 1.0    steps: 538  evaluation reward: 212.0
episode: 1313   score: 210.0  epsilon: 1.0    steps: 919  evaluation reward: 212.25
Training network. lr: 0.000226. clip: 0.090480
Iteration 3145: Policy loss: -0.172681. Value loss: 34.389004. Entropy: 0.732885.
Iteration 3146: Policy loss: 0.006932. Value loss: 17.535055. Entropy: 0.722324.
Iteration 3147: Policy loss: -0.037663. Value loss: 13.839320. Entropy: 0.768039.
episode: 1314   score: 380.0  epsilon: 1.0    steps: 135  evaluation reward: 214.0
T

Iteration 3203: Policy loss: 1.217135. Value loss: 9.262335. Entropy: 0.771197.
Iteration 3204: Policy loss: 1.145019. Value loss: 6.661269. Entropy: 0.817642.
episode: 1338   score: 105.0  epsilon: 1.0    steps: 168  evaluation reward: 196.1
episode: 1339   score: 185.0  epsilon: 1.0    steps: 789  evaluation reward: 196.85
Training network. lr: 0.000225. clip: 0.090176
Iteration 3205: Policy loss: 1.103035. Value loss: 22.094019. Entropy: 0.789104.
Iteration 3206: Policy loss: 1.219581. Value loss: 12.655103. Entropy: 0.796680.
Iteration 3207: Policy loss: 0.899889. Value loss: 9.895728. Entropy: 0.776319.
episode: 1340   score: 195.0  epsilon: 1.0    steps: 279  evaluation reward: 196.25
episode: 1341   score: 105.0  epsilon: 1.0    steps: 401  evaluation reward: 195.1
episode: 1342   score: 105.0  epsilon: 1.0    steps: 755  evaluation reward: 195.4
Training network. lr: 0.000225. clip: 0.090176
Iteration 3208: Policy loss: 2.106619. Value loss: 40.401222. Entropy: 0.670679.
Iterat

Iteration 3264: Policy loss: 0.562550. Value loss: 18.334076. Entropy: 0.931836.
episode: 1366   score: 110.0  epsilon: 1.0    steps: 16  evaluation reward: 182.75
episode: 1367   score: 135.0  epsilon: 1.0    steps: 205  evaluation reward: 180.85
Training network. lr: 0.000225. clip: 0.090019
Iteration 3265: Policy loss: 0.651147. Value loss: 25.254501. Entropy: 0.932665.
Iteration 3266: Policy loss: 0.549493. Value loss: 15.894929. Entropy: 0.919865.
Iteration 3267: Policy loss: 0.474678. Value loss: 12.498925. Entropy: 0.922812.
episode: 1368   score: 235.0  epsilon: 1.0    steps: 291  evaluation reward: 181.1
Training network. lr: 0.000225. clip: 0.090019
Iteration 3268: Policy loss: 0.199403. Value loss: 26.166117. Entropy: 0.938919.
Iteration 3269: Policy loss: 0.237971. Value loss: 13.622351. Entropy: 0.943973.
Iteration 3270: Policy loss: 0.185539. Value loss: 10.503524. Entropy: 0.953202.
episode: 1369   score: 305.0  epsilon: 1.0    steps: 597  evaluation reward: 183.05
Train

Iteration 3327: Policy loss: 0.575702. Value loss: 17.065285. Entropy: 0.686544.
episode: 1392   score: 390.0  epsilon: 1.0    steps: 523  evaluation reward: 189.2
Training network. lr: 0.000225. clip: 0.089872
Iteration 3328: Policy loss: -1.632879. Value loss: 55.998138. Entropy: 0.781295.
Iteration 3329: Policy loss: -1.564750. Value loss: 32.271397. Entropy: 0.808881.
Iteration 3330: Policy loss: -1.733234. Value loss: 22.843565. Entropy: 0.809656.
Training network. lr: 0.000225. clip: 0.089872
Iteration 3331: Policy loss: 1.119585. Value loss: 23.952715. Entropy: 0.738428.
Iteration 3332: Policy loss: 0.826601. Value loss: 13.290872. Entropy: 0.722092.
Iteration 3333: Policy loss: 1.122491. Value loss: 7.897899. Entropy: 0.756112.
episode: 1393   score: 225.0  epsilon: 1.0    steps: 865  evaluation reward: 190.4
Training network. lr: 0.000225. clip: 0.089872
Iteration 3334: Policy loss: 4.507875. Value loss: 28.743958. Entropy: 0.826611.
Iteration 3335: Policy loss: 3.950831. Valu

episode: 1420   score: 110.0  epsilon: 1.0    steps: 896  evaluation reward: 190.75
Training network. lr: 0.000224. clip: 0.089715
Iteration 3388: Policy loss: -2.242440. Value loss: 33.323666. Entropy: 0.962963.
Iteration 3389: Policy loss: -2.218866. Value loss: 20.005610. Entropy: 0.939614.
Iteration 3390: Policy loss: -2.187312. Value loss: 15.684772. Entropy: 0.956152.
episode: 1421   score: 175.0  epsilon: 1.0    steps: 22  evaluation reward: 190.9
Training network. lr: 0.000224. clip: 0.089715
Iteration 3391: Policy loss: 0.848501. Value loss: 23.482792. Entropy: 0.835108.
Iteration 3392: Policy loss: 0.674023. Value loss: 14.148493. Entropy: 0.832287.
Iteration 3393: Policy loss: 0.814502. Value loss: 12.008693. Entropy: 0.871826.
episode: 1422   score: 80.0  epsilon: 1.0    steps: 905  evaluation reward: 190.65
Training network. lr: 0.000224. clip: 0.089715
Iteration 3394: Policy loss: 0.603395. Value loss: 23.205849. Entropy: 0.896663.
Iteration 3395: Policy loss: 0.508575. V

Iteration 3449: Policy loss: -3.317024. Value loss: 138.909973. Entropy: 0.885149.
Iteration 3450: Policy loss: -4.418789. Value loss: 126.707222. Entropy: 0.848173.
episode: 1447   score: 185.0  epsilon: 1.0    steps: 197  evaluation reward: 201.75
episode: 1448   score: 230.0  epsilon: 1.0    steps: 686  evaluation reward: 202.75
Training network. lr: 0.000224. clip: 0.089411
Iteration 3451: Policy loss: -0.771842. Value loss: 55.904716. Entropy: 0.906309.
Iteration 3452: Policy loss: -0.730284. Value loss: 26.572807. Entropy: 0.905456.
Iteration 3453: Policy loss: -0.527592. Value loss: 20.127745. Entropy: 0.869143.
episode: 1449   score: 135.0  epsilon: 1.0    steps: 45  evaluation reward: 203.2
episode: 1450   score: 180.0  epsilon: 1.0    steps: 562  evaluation reward: 201.6
Training network. lr: 0.000224. clip: 0.089411
Iteration 3454: Policy loss: 2.092057. Value loss: 44.034809. Entropy: 0.773828.
Iteration 3455: Policy loss: 2.310785. Value loss: 23.084213. Entropy: 0.782870.

Training network. lr: 0.000223. clip: 0.089254
Iteration 3511: Policy loss: 2.660785. Value loss: 16.652563. Entropy: 0.916543.
Iteration 3512: Policy loss: 2.772571. Value loss: 10.195044. Entropy: 0.943692.
Iteration 3513: Policy loss: 2.649359. Value loss: 8.881975. Entropy: 0.919704.
episode: 1474   score: 45.0  epsilon: 1.0    steps: 141  evaluation reward: 208.15
Training network. lr: 0.000223. clip: 0.089254
Iteration 3514: Policy loss: 1.168110. Value loss: 14.466429. Entropy: 0.979306.
Iteration 3515: Policy loss: 1.220703. Value loss: 11.047109. Entropy: 0.977770.
Iteration 3516: Policy loss: 1.048719. Value loss: 9.907266. Entropy: 0.987161.
episode: 1475   score: 215.0  epsilon: 1.0    steps: 466  evaluation reward: 206.55
Training network. lr: 0.000223. clip: 0.089254
Iteration 3517: Policy loss: 1.953385. Value loss: 22.745163. Entropy: 0.952823.
Iteration 3518: Policy loss: 2.005641. Value loss: 12.805627. Entropy: 0.961023.
Iteration 3519: Policy loss: 1.968505. Value l

episode: 1499   score: 165.0  epsilon: 1.0    steps: 643  evaluation reward: 207.7
Training network. lr: 0.000223. clip: 0.089097
Iteration 3574: Policy loss: 0.764462. Value loss: 38.377995. Entropy: 0.977512.
Iteration 3575: Policy loss: 0.468942. Value loss: 23.501379. Entropy: 0.960963.
Iteration 3576: Policy loss: 0.563487. Value loss: 17.304289. Entropy: 0.970583.
Training network. lr: 0.000223. clip: 0.089097
Iteration 3577: Policy loss: -0.478043. Value loss: 28.697683. Entropy: 0.783407.
Iteration 3578: Policy loss: -0.411476. Value loss: 19.396179. Entropy: 0.788610.
Iteration 3579: Policy loss: -0.405480. Value loss: 14.669308. Entropy: 0.802209.
episode: 1500   score: 200.0  epsilon: 1.0    steps: 50  evaluation reward: 208.65
now time :  2019-02-25 19:46:47.936441
episode: 1501   score: 105.0  epsilon: 1.0    steps: 877  evaluation reward: 208.35
Training network. lr: 0.000223. clip: 0.089097
Iteration 3580: Policy loss: 0.065368. Value loss: 21.384752. Entropy: 0.765654.


Training network. lr: 0.000222. clip: 0.088950
Iteration 3637: Policy loss: -2.699434. Value loss: 32.799004. Entropy: 0.900461.
Iteration 3638: Policy loss: -2.910495. Value loss: 19.586990. Entropy: 0.927161.
Iteration 3639: Policy loss: -2.960784. Value loss: 12.630003. Entropy: 0.911010.
episode: 1524   score: 110.0  epsilon: 1.0    steps: 457  evaluation reward: 210.9
episode: 1525   score: 175.0  epsilon: 1.0    steps: 877  evaluation reward: 211.55
episode: 1526   score: 205.0  epsilon: 1.0    steps: 954  evaluation reward: 212.1
Training network. lr: 0.000222. clip: 0.088950
Iteration 3640: Policy loss: 0.984288. Value loss: 48.204117. Entropy: 1.022933.
Iteration 3641: Policy loss: 0.944600. Value loss: 26.478954. Entropy: 0.998242.
Iteration 3642: Policy loss: 0.979317. Value loss: 18.981312. Entropy: 1.007538.
episode: 1527   score: 515.0  epsilon: 1.0    steps: 144  evaluation reward: 212.7
episode: 1528   score: 115.0  epsilon: 1.0    steps: 315  evaluation reward: 210.1
e

Iteration 3696: Policy loss: 2.251923. Value loss: 4.433064. Entropy: 0.862905.
episode: 1554   score: 120.0  epsilon: 1.0    steps: 741  evaluation reward: 193.4
episode: 1555   score: 200.0  epsilon: 1.0    steps: 899  evaluation reward: 194.1
Training network. lr: 0.000222. clip: 0.088793
Iteration 3697: Policy loss: 1.662888. Value loss: 17.837080. Entropy: 0.818770.
Iteration 3698: Policy loss: 1.621379. Value loss: 14.856506. Entropy: 0.830068.
Iteration 3699: Policy loss: 1.611957. Value loss: 10.336171. Entropy: 0.826385.
Training network. lr: 0.000222. clip: 0.088793
Iteration 3700: Policy loss: 0.169303. Value loss: 27.434158. Entropy: 0.966918.
Iteration 3701: Policy loss: 0.269081. Value loss: 16.782471. Entropy: 0.956865.
Iteration 3702: Policy loss: -0.036895. Value loss: 17.066956. Entropy: 0.974923.
episode: 1556   score: 80.0  epsilon: 1.0    steps: 317  evaluation reward: 192.75
Training network. lr: 0.000222. clip: 0.088637
Iteration 3703: Policy loss: -3.491068. Val

Training network. lr: 0.000221. clip: 0.088489
Iteration 3757: Policy loss: -0.990824. Value loss: 27.975145. Entropy: 0.834651.
Iteration 3758: Policy loss: -0.807316. Value loss: 14.909947. Entropy: 0.871630.
Iteration 3759: Policy loss: -0.734032. Value loss: 13.361601. Entropy: 0.831049.
episode: 1582   score: 85.0  epsilon: 1.0    steps: 871  evaluation reward: 175.85
episode: 1583   score: 140.0  epsilon: 1.0    steps: 978  evaluation reward: 174.8
Training network. lr: 0.000221. clip: 0.088489
Iteration 3760: Policy loss: -1.833234. Value loss: 30.747025. Entropy: 0.757478.
Iteration 3761: Policy loss: -1.766570. Value loss: 19.388432. Entropy: 0.713523.
Iteration 3762: Policy loss: -1.862978. Value loss: 16.032007. Entropy: 0.730162.
episode: 1584   score: 235.0  epsilon: 1.0    steps: 134  evaluation reward: 176.5
episode: 1585   score: 170.0  epsilon: 1.0    steps: 521  evaluation reward: 175.1
Training network. lr: 0.000221. clip: 0.088489
Iteration 3763: Policy loss: -0.523

Iteration 3818: Policy loss: -0.725947. Value loss: 22.662310. Entropy: 0.800666.
Iteration 3819: Policy loss: -0.972820. Value loss: 17.273247. Entropy: 0.785527.
episode: 1609   score: 160.0  epsilon: 1.0    steps: 723  evaluation reward: 151.2
episode: 1610   score: 55.0  epsilon: 1.0    steps: 964  evaluation reward: 150.95
Training network. lr: 0.000221. clip: 0.088333
Iteration 3820: Policy loss: -1.082112. Value loss: 19.939062. Entropy: 0.896103.
Iteration 3821: Policy loss: -1.103699. Value loss: 11.931809. Entropy: 0.958271.
Iteration 3822: Policy loss: -1.321370. Value loss: 9.251760. Entropy: 0.955404.
episode: 1611   score: 140.0  epsilon: 1.0    steps: 113  evaluation reward: 151.4
Training network. lr: 0.000221. clip: 0.088333
Iteration 3823: Policy loss: -0.012587. Value loss: 26.853262. Entropy: 0.832468.
Iteration 3824: Policy loss: 0.010993. Value loss: 15.738640. Entropy: 0.799419.
Iteration 3825: Policy loss: 0.010936. Value loss: 12.942081. Entropy: 0.835199.
Trai

Training network. lr: 0.000220. clip: 0.088176
Iteration 3883: Policy loss: 1.768116. Value loss: 25.668589. Entropy: 0.744692.
Iteration 3884: Policy loss: 1.938274. Value loss: 17.211302. Entropy: 0.748281.
Iteration 3885: Policy loss: 1.805433. Value loss: 14.154957. Entropy: 0.737697.
episode: 1632   score: 380.0  epsilon: 1.0    steps: 728  evaluation reward: 172.15
Training network. lr: 0.000220. clip: 0.088176
Iteration 3886: Policy loss: 0.042832. Value loss: 30.538105. Entropy: 0.823056.
Iteration 3887: Policy loss: 0.187959. Value loss: 20.343138. Entropy: 0.827826.
Iteration 3888: Policy loss: 0.127548. Value loss: 17.184740. Entropy: 0.812576.
episode: 1633   score: 230.0  epsilon: 1.0    steps: 118  evaluation reward: 174.3
Training network. lr: 0.000220. clip: 0.088176
Iteration 3889: Policy loss: 3.164565. Value loss: 37.313778. Entropy: 0.843112.
Iteration 3890: Policy loss: 3.057849. Value loss: 16.425806. Entropy: 0.850409.
Iteration 3891: Policy loss: 2.946158. Value

Training network. lr: 0.000220. clip: 0.088028
Iteration 3943: Policy loss: -1.359923. Value loss: 30.184959. Entropy: 0.903534.
Iteration 3944: Policy loss: -1.461080. Value loss: 19.421486. Entropy: 0.887322.
Iteration 3945: Policy loss: -1.242838. Value loss: 18.694124. Entropy: 0.887856.
Training network. lr: 0.000220. clip: 0.088028
Iteration 3946: Policy loss: 0.088936. Value loss: 10.906717. Entropy: 0.875661.
Iteration 3947: Policy loss: 0.134971. Value loss: 7.300690. Entropy: 0.876021.
Iteration 3948: Policy loss: 0.019350. Value loss: 7.638121. Entropy: 0.886967.
episode: 1661   score: 75.0  epsilon: 1.0    steps: 952  evaluation reward: 191.8
Training network. lr: 0.000220. clip: 0.088028
Iteration 3949: Policy loss: 1.375321. Value loss: 20.641426. Entropy: 0.908200.
Iteration 3950: Policy loss: 1.359989. Value loss: 12.902821. Entropy: 0.896101.
Iteration 3951: Policy loss: 1.252145. Value loss: 11.367594. Entropy: 0.905376.
Training network. lr: 0.000220. clip: 0.087872


Training network. lr: 0.000219. clip: 0.087715
Iteration 4006: Policy loss: 2.524961. Value loss: 37.957176. Entropy: 0.592871.
Iteration 4007: Policy loss: 2.340931. Value loss: 26.980177. Entropy: 0.596573.
Iteration 4008: Policy loss: 2.350275. Value loss: 21.611694. Entropy: 0.590795.
episode: 1686   score: 330.0  epsilon: 1.0    steps: 292  evaluation reward: 210.0
Training network. lr: 0.000219. clip: 0.087715
Iteration 4009: Policy loss: 0.332290. Value loss: 54.177879. Entropy: 0.837640.
Iteration 4010: Policy loss: -0.403712. Value loss: 25.840693. Entropy: 0.811013.
Iteration 4011: Policy loss: 0.148383. Value loss: 18.494020. Entropy: 0.804038.
episode: 1687   score: 210.0  epsilon: 1.0    steps: 826  evaluation reward: 210.0
Training network. lr: 0.000219. clip: 0.087715
Iteration 4012: Policy loss: 2.125710. Value loss: 37.076645. Entropy: 0.685762.
Iteration 4013: Policy loss: 2.079665. Value loss: 21.796688. Entropy: 0.695843.
Iteration 4014: Policy loss: 1.709199. Value

Iteration 4066: Policy loss: 1.241672. Value loss: 34.241470. Entropy: 0.735828.
Iteration 4067: Policy loss: 1.243848. Value loss: 20.936234. Entropy: 0.718068.
Iteration 4068: Policy loss: 1.107583. Value loss: 17.640062. Entropy: 0.729643.
Training network. lr: 0.000219. clip: 0.087568
Iteration 4069: Policy loss: -1.294599. Value loss: 41.482677. Entropy: 0.811691.
Iteration 4070: Policy loss: -1.205549. Value loss: 23.536423. Entropy: 0.808578.
Iteration 4071: Policy loss: -1.576101. Value loss: 19.116674. Entropy: 0.808556.
episode: 1714   score: 65.0  epsilon: 1.0    steps: 319  evaluation reward: 220.85
episode: 1715   score: 525.0  epsilon: 1.0    steps: 604  evaluation reward: 224.6
episode: 1716   score: 185.0  epsilon: 1.0    steps: 780  evaluation reward: 223.55
Training network. lr: 0.000219. clip: 0.087568
Iteration 4072: Policy loss: 0.683704. Value loss: 49.206837. Entropy: 0.789389.
Iteration 4073: Policy loss: 0.890310. Value loss: 25.873915. Entropy: 0.761750.
Itera

episode: 1740   score: 180.0  epsilon: 1.0    steps: 577  evaluation reward: 211.65
Training network. lr: 0.000219. clip: 0.087411
Iteration 4129: Policy loss: 0.192784. Value loss: 14.072398. Entropy: 0.460462.
Iteration 4130: Policy loss: 0.493532. Value loss: 8.674212. Entropy: 0.499142.
Iteration 4131: Policy loss: 0.214345. Value loss: 9.702655. Entropy: 0.479446.
Training network. lr: 0.000219. clip: 0.087411
Iteration 4132: Policy loss: -1.889883. Value loss: 21.478243. Entropy: 0.514699.
Iteration 4133: Policy loss: -1.987000. Value loss: 12.877275. Entropy: 0.517450.
Iteration 4134: Policy loss: -1.954674. Value loss: 11.005254. Entropy: 0.511522.
episode: 1741   score: 160.0  epsilon: 1.0    steps: 674  evaluation reward: 209.25
episode: 1742   score: 120.0  epsilon: 1.0    steps: 903  evaluation reward: 209.05
Training network. lr: 0.000219. clip: 0.087411
Iteration 4135: Policy loss: -4.332361. Value loss: 281.030060. Entropy: 0.601810.
Iteration 4136: Policy loss: -4.37700

Training network. lr: 0.000218. clip: 0.087254
Iteration 4186: Policy loss: 0.090403. Value loss: 15.343248. Entropy: 0.824198.
Iteration 4187: Policy loss: 0.139201. Value loss: 9.498746. Entropy: 0.831689.
Iteration 4188: Policy loss: 0.006534. Value loss: 6.450415. Entropy: 0.831878.
episode: 1772   score: 105.0  epsilon: 1.0    steps: 82  evaluation reward: 200.45
episode: 1773   score: 125.0  epsilon: 1.0    steps: 307  evaluation reward: 198.0
Training network. lr: 0.000218. clip: 0.087254
Iteration 4189: Policy loss: -3.379395. Value loss: 31.851261. Entropy: 0.630134.
Iteration 4190: Policy loss: -3.196429. Value loss: 17.071424. Entropy: 0.609938.
Iteration 4191: Policy loss: -3.225611. Value loss: 16.053169. Entropy: 0.621853.
episode: 1774   score: 120.0  epsilon: 1.0    steps: 562  evaluation reward: 198.15
episode: 1775   score: 180.0  epsilon: 1.0    steps: 813  evaluation reward: 198.4
Training network. lr: 0.000218. clip: 0.087254
Iteration 4192: Policy loss: 0.473538. 

now time :  2019-02-25 19:59:14.185647
episode: 1801   score: 320.0  epsilon: 1.0    steps: 141  evaluation reward: 182.25
episode: 1802   score: 605.0  epsilon: 1.0    steps: 461  evaluation reward: 187.6
episode: 1803   score: 155.0  epsilon: 1.0    steps: 826  evaluation reward: 188.15
Training network. lr: 0.000218. clip: 0.087107
Iteration 4246: Policy loss: -0.158757. Value loss: 33.863724. Entropy: 0.682465.
Iteration 4247: Policy loss: -0.114445. Value loss: 21.045986. Entropy: 0.658000.
Iteration 4248: Policy loss: -0.302321. Value loss: 16.770164. Entropy: 0.668341.
Training network. lr: 0.000218. clip: 0.087107
Iteration 4249: Policy loss: 1.324045. Value loss: 29.975592. Entropy: 0.715228.
Iteration 4250: Policy loss: 1.504781. Value loss: 20.475922. Entropy: 0.702852.
Iteration 4251: Policy loss: 1.538741. Value loss: 14.975239. Entropy: 0.739799.
episode: 1804   score: 115.0  epsilon: 1.0    steps: 119  evaluation reward: 187.85
episode: 1805   score: 205.0  epsilon: 1.0 

episode: 1830   score: 80.0  epsilon: 1.0    steps: 211  evaluation reward: 175.9
episode: 1831   score: 280.0  epsilon: 1.0    steps: 569  evaluation reward: 177.75
Training network. lr: 0.000217. clip: 0.086793
Iteration 4306: Policy loss: -0.600603. Value loss: 30.909904. Entropy: 0.650346.
Iteration 4307: Policy loss: -0.410529. Value loss: 16.758585. Entropy: 0.675421.
Iteration 4308: Policy loss: -0.601856. Value loss: 13.671443. Entropy: 0.649706.
episode: 1832   score: 275.0  epsilon: 1.0    steps: 38  evaluation reward: 177.65
episode: 1833   score: 370.0  epsilon: 1.0    steps: 765  evaluation reward: 180.3
Training network. lr: 0.000217. clip: 0.086793
Iteration 4309: Policy loss: 1.475874. Value loss: 53.863350. Entropy: 0.516652.
Iteration 4310: Policy loss: 1.950478. Value loss: 23.515356. Entropy: 0.527542.
Iteration 4311: Policy loss: 1.136459. Value loss: 19.726957. Entropy: 0.532444.
Training network. lr: 0.000217. clip: 0.086793
Iteration 4312: Policy loss: -0.463841

Training network. lr: 0.000217. clip: 0.086646
Iteration 4363: Policy loss: -0.337633. Value loss: 15.115201. Entropy: 0.655489.
Iteration 4364: Policy loss: -0.413407. Value loss: 9.746082. Entropy: 0.667004.
Iteration 4365: Policy loss: -0.364224. Value loss: 8.135192. Entropy: 0.656717.
Training network. lr: 0.000217. clip: 0.086646
Iteration 4366: Policy loss: -0.858330. Value loss: 6.462837. Entropy: 0.824394.
Iteration 4367: Policy loss: -0.865341. Value loss: 5.072128. Entropy: 0.807055.
Iteration 4368: Policy loss: -0.800346. Value loss: 4.003242. Entropy: 0.800901.
episode: 1862   score: 75.0  epsilon: 1.0    steps: 885  evaluation reward: 181.55
Training network. lr: 0.000217. clip: 0.086646
Iteration 4369: Policy loss: 0.701454. Value loss: 9.133399. Entropy: 0.695183.
Iteration 4370: Policy loss: 0.666232. Value loss: 4.983834. Entropy: 0.707576.
Iteration 4371: Policy loss: 0.648337. Value loss: 4.849241. Entropy: 0.715540.
episode: 1863   score: 120.0  epsilon: 1.0    ste

episode: 1888   score: 105.0  epsilon: 1.0    steps: 864  evaluation reward: 179.35
Training network. lr: 0.000216. clip: 0.086489
Iteration 4426: Policy loss: 1.387551. Value loss: 22.443108. Entropy: 0.658085.
Iteration 4427: Policy loss: 1.513243. Value loss: 12.234916. Entropy: 0.662600.
Iteration 4428: Policy loss: 1.499044. Value loss: 9.934115. Entropy: 0.692081.
episode: 1889   score: 155.0  epsilon: 1.0    steps: 441  evaluation reward: 179.35
episode: 1890   score: 135.0  epsilon: 1.0    steps: 664  evaluation reward: 178.75
Training network. lr: 0.000216. clip: 0.086489
Iteration 4429: Policy loss: 0.825264. Value loss: 6.991617. Entropy: 0.769325.
Iteration 4430: Policy loss: 0.646334. Value loss: 3.471842. Entropy: 0.781555.
Iteration 4431: Policy loss: 0.644597. Value loss: 3.706109. Entropy: 0.800634.
episode: 1891   score: 120.0  epsilon: 1.0    steps: 65  evaluation reward: 178.6
episode: 1892   score: 260.0  epsilon: 1.0    steps: 622  evaluation reward: 179.65
Traini

Iteration 4487: Policy loss: 0.385731. Value loss: 12.324015. Entropy: 0.820554.
Iteration 4488: Policy loss: 0.310854. Value loss: 10.759770. Entropy: 0.802503.
Training network. lr: 0.000216. clip: 0.086333
Iteration 4489: Policy loss: -2.702842. Value loss: 192.478470. Entropy: 0.888263.
Iteration 4490: Policy loss: -3.698209. Value loss: 163.413605. Entropy: 0.823736.
Iteration 4491: Policy loss: -2.681087. Value loss: 91.191292. Entropy: 0.830898.
episode: 1915   score: 105.0  epsilon: 1.0    steps: 412  evaluation reward: 163.6
episode: 1916   score: 80.0  epsilon: 1.0    steps: 576  evaluation reward: 161.95
episode: 1917   score: 225.0  epsilon: 1.0    steps: 754  evaluation reward: 163.0
episode: 1918   score: 420.0  epsilon: 1.0    steps: 889  evaluation reward: 166.1
episode: 1919   score: 120.0  epsilon: 1.0    steps: 934  evaluation reward: 166.25
Training network. lr: 0.000216. clip: 0.086333
Iteration 4492: Policy loss: -0.315679. Value loss: 14.411426. Entropy: 0.632817

Training network. lr: 0.000215. clip: 0.086185
Iteration 4549: Policy loss: 0.127104. Value loss: 9.213293. Entropy: 0.874261.
Iteration 4550: Policy loss: 0.324320. Value loss: 6.210553. Entropy: 0.883306.
Iteration 4551: Policy loss: 0.529431. Value loss: 6.372404. Entropy: 0.924317.
episode: 1942   score: 105.0  epsilon: 1.0    steps: 130  evaluation reward: 159.2
episode: 1943   score: 180.0  epsilon: 1.0    steps: 825  evaluation reward: 159.75
Training network. lr: 0.000215. clip: 0.086029
Iteration 4552: Policy loss: 0.630204. Value loss: 12.247446. Entropy: 1.116038.
Iteration 4553: Policy loss: 0.341542. Value loss: 9.885705. Entropy: 1.137480.
Iteration 4554: Policy loss: 0.509899. Value loss: 6.016371. Entropy: 1.112331.
episode: 1944   score: 110.0  epsilon: 1.0    steps: 426  evaluation reward: 159.65
Training network. lr: 0.000215. clip: 0.086029
Iteration 4555: Policy loss: -0.849389. Value loss: 9.176172. Entropy: 0.972692.
Iteration 4556: Policy loss: -0.810290. Value 

episode: 1974   score: 165.0  epsilon: 1.0    steps: 102  evaluation reward: 149.1
episode: 1975   score: 185.0  epsilon: 1.0    steps: 335  evaluation reward: 149.15
episode: 1976   score: 75.0  epsilon: 1.0    steps: 891  evaluation reward: 148.35
Training network. lr: 0.000215. clip: 0.085872
Iteration 4606: Policy loss: -1.655109. Value loss: 29.264853. Entropy: 0.947880.
Iteration 4607: Policy loss: -1.456448. Value loss: 16.577827. Entropy: 0.923278.
Iteration 4608: Policy loss: -1.670082. Value loss: 12.439906. Entropy: 0.926560.
episode: 1977   score: 105.0  epsilon: 1.0    steps: 420  evaluation reward: 147.85
episode: 1978   score: 135.0  epsilon: 1.0    steps: 561  evaluation reward: 147.35
Training network. lr: 0.000215. clip: 0.085872
Iteration 4609: Policy loss: 0.713904. Value loss: 9.091210. Entropy: 0.829736.
Iteration 4610: Policy loss: 0.635353. Value loss: 6.013665. Entropy: 0.848043.
Iteration 4611: Policy loss: 0.525221. Value loss: 4.789754. Entropy: 0.826609.
Tr

Training network. lr: 0.000214. clip: 0.085724
Iteration 4666: Policy loss: 0.165287. Value loss: 18.014296. Entropy: 0.812871.
Iteration 4667: Policy loss: 0.182680. Value loss: 9.961085. Entropy: 0.824525.
Iteration 4668: Policy loss: 0.116493. Value loss: 7.100731. Entropy: 0.795711.
episode: 2003   score: 315.0  epsilon: 1.0    steps: 14  evaluation reward: 162.45
episode: 2004   score: 260.0  epsilon: 1.0    steps: 189  evaluation reward: 164.0
episode: 2005   score: 310.0  epsilon: 1.0    steps: 983  evaluation reward: 164.7
Training network. lr: 0.000214. clip: 0.085724
Iteration 4669: Policy loss: 1.779582. Value loss: 10.203805. Entropy: 0.754207.
Iteration 4670: Policy loss: 1.767440. Value loss: 5.855810. Entropy: 0.767413.
Iteration 4671: Policy loss: 1.529659. Value loss: 4.910637. Entropy: 0.787001.
Training network. lr: 0.000214. clip: 0.085724
Iteration 4672: Policy loss: -0.053701. Value loss: 13.978382. Entropy: 0.777168.
Iteration 4673: Policy loss: -0.111315. Value 

Iteration 4729: Policy loss: 0.204398. Value loss: 12.303585. Entropy: 0.822087.
Iteration 4730: Policy loss: 0.196295. Value loss: 8.600326. Entropy: 0.860184.
Iteration 4731: Policy loss: 0.059270. Value loss: 6.380945. Entropy: 0.855755.
episode: 2029   score: 330.0  epsilon: 1.0    steps: 502  evaluation reward: 165.25
episode: 2030   score: 290.0  epsilon: 1.0    steps: 529  evaluation reward: 166.35
Training network. lr: 0.000214. clip: 0.085568
Iteration 4732: Policy loss: 1.755353. Value loss: 9.251736. Entropy: 0.917924.
Iteration 4733: Policy loss: 1.631563. Value loss: 6.927368. Entropy: 0.942637.
Iteration 4734: Policy loss: 1.572654. Value loss: 5.476974. Entropy: 0.946900.
Training network. lr: 0.000214. clip: 0.085568
Iteration 4735: Policy loss: 0.526174. Value loss: 18.503078. Entropy: 0.900443.
Iteration 4736: Policy loss: 0.716696. Value loss: 12.460983. Entropy: 0.913183.
Iteration 4737: Policy loss: 0.829056. Value loss: 11.057864. Entropy: 0.922923.
Training netwo

Training network. lr: 0.000214. clip: 0.085411
Iteration 4792: Policy loss: -0.379388. Value loss: 23.531101. Entropy: 0.548043.
Iteration 4793: Policy loss: -0.161918. Value loss: 13.985315. Entropy: 0.530107.
Iteration 4794: Policy loss: -0.068874. Value loss: 11.600697. Entropy: 0.545696.
Training network. lr: 0.000214. clip: 0.085411
Iteration 4795: Policy loss: -2.267464. Value loss: 152.673096. Entropy: 0.608110.
Iteration 4796: Policy loss: -3.455958. Value loss: 170.279495. Entropy: 0.468130.
Iteration 4797: Policy loss: -2.991239. Value loss: 61.214588. Entropy: 0.483715.
Training network. lr: 0.000214. clip: 0.085411
Iteration 4798: Policy loss: -3.270819. Value loss: 41.839252. Entropy: 0.273092.
Iteration 4799: Policy loss: -3.212539. Value loss: 23.738533. Entropy: 0.256446.
Iteration 4800: Policy loss: -3.214823. Value loss: 21.145052. Entropy: 0.267439.
episode: 2055   score: 210.0  epsilon: 1.0    steps: 156  evaluation reward: 187.2
episode: 2056   score: 565.0  epsilo

episode: 2084   score: 210.0  epsilon: 1.0    steps: 826  evaluation reward: 210.55
Training network. lr: 0.000213. clip: 0.085107
Iteration 4852: Policy loss: 1.130760. Value loss: 21.289869. Entropy: 0.402858.
Iteration 4853: Policy loss: 0.940269. Value loss: 14.087255. Entropy: 0.410427.
Iteration 4854: Policy loss: 0.961047. Value loss: 9.513206. Entropy: 0.416219.
episode: 2085   score: 90.0  epsilon: 1.0    steps: 714  evaluation reward: 208.15
Training network. lr: 0.000213. clip: 0.085107
Iteration 4855: Policy loss: 0.530545. Value loss: 16.433496. Entropy: 0.426495.
Iteration 4856: Policy loss: 0.678122. Value loss: 9.582279. Entropy: 0.453454.
Iteration 4857: Policy loss: 0.546086. Value loss: 7.638901. Entropy: 0.476036.
episode: 2086   score: 210.0  epsilon: 1.0    steps: 34  evaluation reward: 206.5
Training network. lr: 0.000213. clip: 0.085107
Iteration 4858: Policy loss: -0.276422. Value loss: 46.584393. Entropy: 0.464549.
Iteration 4859: Policy loss: 0.183858. Value 

episode: 2113   score: 300.0  epsilon: 1.0    steps: 555  evaluation reward: 211.7
episode: 2114   score: 80.0  epsilon: 1.0    steps: 722  evaluation reward: 211.15
episode: 2115   score: 150.0  epsilon: 1.0    steps: 885  evaluation reward: 211.1
Training network. lr: 0.000212. clip: 0.084950
Iteration 4912: Policy loss: 2.357553. Value loss: 27.007843. Entropy: 0.475875.
Iteration 4913: Policy loss: 2.590549. Value loss: 12.292084. Entropy: 0.495084.
Iteration 4914: Policy loss: 2.466601. Value loss: 10.702726. Entropy: 0.457344.
episode: 2116   score: 240.0  epsilon: 1.0    steps: 7  evaluation reward: 211.1
episode: 2117   score: 155.0  epsilon: 1.0    steps: 332  evaluation reward: 210.85
Training network. lr: 0.000212. clip: 0.084950
Iteration 4915: Policy loss: -1.361840. Value loss: 27.489159. Entropy: 0.428254.
Iteration 4916: Policy loss: -1.526537. Value loss: 16.371834. Entropy: 0.405992.
Iteration 4917: Policy loss: -1.599912. Value loss: 13.674053. Entropy: 0.422802.
Tra

Iteration 4970: Policy loss: 0.371532. Value loss: 17.897856. Entropy: 0.549663.
Iteration 4971: Policy loss: 0.096173. Value loss: 15.612453. Entropy: 0.549198.
episode: 2144   score: 105.0  epsilon: 1.0    steps: 57  evaluation reward: 209.65
episode: 2145   score: 105.0  epsilon: 1.0    steps: 433  evaluation reward: 208.6
episode: 2146   score: 270.0  epsilon: 1.0    steps: 972  evaluation reward: 208.15
Training network. lr: 0.000212. clip: 0.084803
Iteration 4972: Policy loss: -0.527288. Value loss: 31.924313. Entropy: 0.609651.
Iteration 4973: Policy loss: -0.299621. Value loss: 19.484409. Entropy: 0.609545.
Iteration 4974: Policy loss: -0.356584. Value loss: 14.803093. Entropy: 0.615183.
Training network. lr: 0.000212. clip: 0.084803
Iteration 4975: Policy loss: 0.283953. Value loss: 29.796848. Entropy: 0.500898.
Iteration 4976: Policy loss: 0.212652. Value loss: 20.371794. Entropy: 0.528283.
Iteration 4977: Policy loss: 0.325755. Value loss: 15.417240. Entropy: 0.509622.
Train

Training network. lr: 0.000212. clip: 0.084646
Iteration 5032: Policy loss: -1.289111. Value loss: 31.092678. Entropy: 0.356435.
Iteration 5033: Policy loss: -1.252738. Value loss: 18.682577. Entropy: 0.380951.
Iteration 5034: Policy loss: -1.216848. Value loss: 13.275498. Entropy: 0.387837.
episode: 2170   score: 125.0  epsilon: 1.0    steps: 625  evaluation reward: 214.1
episode: 2171   score: 210.0  epsilon: 1.0    steps: 934  evaluation reward: 214.6
Training network. lr: 0.000212. clip: 0.084646
Iteration 5035: Policy loss: -0.109924. Value loss: 27.278263. Entropy: 0.311300.
Iteration 5036: Policy loss: -0.062144. Value loss: 15.176250. Entropy: 0.335692.
Iteration 5037: Policy loss: 0.109806. Value loss: 11.267164. Entropy: 0.322316.
episode: 2172   score: 215.0  epsilon: 1.0    steps: 130  evaluation reward: 214.65
episode: 2173   score: 195.0  epsilon: 1.0    steps: 268  evaluation reward: 214.8
episode: 2174   score: 265.0  epsilon: 1.0    steps: 475  evaluation reward: 216.2

Iteration 5092: Policy loss: 0.421511. Value loss: 22.733677. Entropy: 0.311413.
Iteration 5093: Policy loss: 0.260041. Value loss: 14.429324. Entropy: 0.320836.
Iteration 5094: Policy loss: 0.142906. Value loss: 11.845476. Entropy: 0.334183.
episode: 2198   score: 210.0  epsilon: 1.0    steps: 744  evaluation reward: 212.35
Training network. lr: 0.000211. clip: 0.084489
Iteration 5095: Policy loss: 0.418561. Value loss: 15.796069. Entropy: 0.434959.
Iteration 5096: Policy loss: 0.359690. Value loss: 10.087809. Entropy: 0.482820.
Iteration 5097: Policy loss: 0.541325. Value loss: 7.210877. Entropy: 0.476203.
episode: 2199   score: 350.0  epsilon: 1.0    steps: 198  evaluation reward: 214.3
episode: 2200   score: 150.0  epsilon: 1.0    steps: 807  evaluation reward: 215.05
Training network. lr: 0.000211. clip: 0.084489
Iteration 5098: Policy loss: 0.496561. Value loss: 9.375492. Entropy: 0.291731.
Iteration 5099: Policy loss: 0.530911. Value loss: 5.794184. Entropy: 0.291353.
Iteration 

episode: 2224   score: 240.0  epsilon: 1.0    steps: 918  evaluation reward: 222.05
Training network. lr: 0.000210. clip: 0.084185
Iteration 5155: Policy loss: 1.725693. Value loss: 17.891270. Entropy: 0.309168.
Iteration 5156: Policy loss: 1.716899. Value loss: 9.903576. Entropy: 0.348858.
Iteration 5157: Policy loss: 1.698728. Value loss: 8.628958. Entropy: 0.378503.
episode: 2225   score: 155.0  epsilon: 1.0    steps: 402  evaluation reward: 222.6
episode: 2226   score: 260.0  epsilon: 1.0    steps: 533  evaluation reward: 223.1
Training network. lr: 0.000210. clip: 0.084185
Iteration 5158: Policy loss: 0.497399. Value loss: 18.192749. Entropy: 0.368234.
Iteration 5159: Policy loss: 0.333469. Value loss: 12.273008. Entropy: 0.378085.
Iteration 5160: Policy loss: 0.292526. Value loss: 10.514172. Entropy: 0.384256.
episode: 2227   score: 440.0  epsilon: 1.0    steps: 236  evaluation reward: 226.3
Training network. lr: 0.000210. clip: 0.084185
Iteration 5161: Policy loss: -1.183523. Va

Iteration 5213: Policy loss: 1.780572. Value loss: 18.040421. Entropy: 0.401919.
Iteration 5214: Policy loss: 1.525514. Value loss: 15.470521. Entropy: 0.427157.
episode: 2254   score: 245.0  epsilon: 1.0    steps: 325  evaluation reward: 217.85
Training network. lr: 0.000210. clip: 0.084029
Iteration 5215: Policy loss: 0.677269. Value loss: 31.136395. Entropy: 0.469371.
Iteration 5216: Policy loss: 0.464937. Value loss: 21.976210. Entropy: 0.457583.
Iteration 5217: Policy loss: 0.447656. Value loss: 16.145994. Entropy: 0.449510.
episode: 2255   score: 150.0  epsilon: 1.0    steps: 521  evaluation reward: 214.5
episode: 2256   score: 125.0  epsilon: 1.0    steps: 698  evaluation reward: 213.95
episode: 2257   score: 685.0  epsilon: 1.0    steps: 795  evaluation reward: 217.65
Training network. lr: 0.000210. clip: 0.084029
Iteration 5218: Policy loss: 0.421110. Value loss: 24.172695. Entropy: 0.482280.
Iteration 5219: Policy loss: 0.209611. Value loss: 16.997116. Entropy: 0.521578.
Iter

Training network. lr: 0.000210. clip: 0.083881
Iteration 5269: Policy loss: -1.370865. Value loss: 21.620834. Entropy: 0.480304.
Iteration 5270: Policy loss: -1.342375. Value loss: 14.694747. Entropy: 0.470163.
Iteration 5271: Policy loss: -1.165387. Value loss: 12.240931. Entropy: 0.487862.
episode: 2288   score: 155.0  epsilon: 1.0    steps: 386  evaluation reward: 207.1
episode: 2289   score: 210.0  epsilon: 1.0    steps: 723  evaluation reward: 207.4
Training network. lr: 0.000210. clip: 0.083881
Iteration 5272: Policy loss: -2.329916. Value loss: 21.264582. Entropy: 0.519257.
Iteration 5273: Policy loss: -2.337321. Value loss: 15.294360. Entropy: 0.466812.
Iteration 5274: Policy loss: -2.301190. Value loss: 14.341470. Entropy: 0.482541.
Training network. lr: 0.000210. clip: 0.083881
Iteration 5275: Policy loss: -1.728249. Value loss: 37.267693. Entropy: 0.413254.
Iteration 5276: Policy loss: -1.229022. Value loss: 21.467798. Entropy: 0.425636.
Iteration 5277: Policy loss: -1.70662

episode: 2316   score: 245.0  epsilon: 1.0    steps: 356  evaluation reward: 225.4
episode: 2317   score: 180.0  epsilon: 1.0    steps: 786  evaluation reward: 225.4
Training network. lr: 0.000209. clip: 0.083725
Iteration 5329: Policy loss: 1.559082. Value loss: 23.992727. Entropy: 0.308920.
Iteration 5330: Policy loss: 1.574073. Value loss: 14.563641. Entropy: 0.324385.
Iteration 5331: Policy loss: 1.472268. Value loss: 12.883773. Entropy: 0.325522.
episode: 2318   score: 210.0  epsilon: 1.0    steps: 107  evaluation reward: 224.15
episode: 2319   score: 65.0  epsilon: 1.0    steps: 679  evaluation reward: 222.7
Training network. lr: 0.000209. clip: 0.083725
Iteration 5332: Policy loss: 1.164787. Value loss: 25.484415. Entropy: 0.296643.
Iteration 5333: Policy loss: 1.131589. Value loss: 10.915146. Entropy: 0.286991.
Iteration 5334: Policy loss: 1.088923. Value loss: 10.236285. Entropy: 0.285541.
episode: 2320   score: 230.0  epsilon: 1.0    steps: 183  evaluation reward: 220.4
episo

Iteration 5385: Policy loss: -1.096646. Value loss: 11.835034. Entropy: 0.493282.
Training network. lr: 0.000209. clip: 0.083568
Iteration 5386: Policy loss: 0.954525. Value loss: 28.644260. Entropy: 0.331586.
Iteration 5387: Policy loss: 0.869510. Value loss: 16.468729. Entropy: 0.347377.
Iteration 5388: Policy loss: 0.738991. Value loss: 14.150226. Entropy: 0.338816.
episode: 2349   score: 210.0  epsilon: 1.0    steps: 981  evaluation reward: 210.5
Training network. lr: 0.000209. clip: 0.083568
Iteration 5389: Policy loss: 3.644655. Value loss: 41.828293. Entropy: 0.175100.
Iteration 5390: Policy loss: 3.181190. Value loss: 18.913799. Entropy: 0.167635.
Iteration 5391: Policy loss: 3.400955. Value loss: 14.902040. Entropy: 0.182135.
episode: 2350   score: 305.0  epsilon: 1.0    steps: 201  evaluation reward: 211.7
now time :  2019-02-25 20:20:45.142893
episode: 2351   score: 510.0  epsilon: 1.0    steps: 304  evaluation reward: 213.85
episode: 2352   score: 55.0  epsilon: 1.0    step

Iteration 5446: Policy loss: -0.242686. Value loss: 29.921820. Entropy: 0.682448.
Iteration 5447: Policy loss: -0.196015. Value loss: 17.393335. Entropy: 0.659675.
Iteration 5448: Policy loss: -0.083030. Value loss: 13.175156. Entropy: 0.670458.
episode: 2376   score: 240.0  epsilon: 1.0    steps: 491  evaluation reward: 204.15
episode: 2377   score: 265.0  epsilon: 1.0    steps: 710  evaluation reward: 204.5
episode: 2378   score: 80.0  epsilon: 1.0    steps: 895  evaluation reward: 204.55
episode: 2379   score: 225.0  epsilon: 1.0    steps: 899  evaluation reward: 204.85
Training network. lr: 0.000209. clip: 0.083420
Iteration 5449: Policy loss: 0.175372. Value loss: 16.999170. Entropy: 0.806423.
Iteration 5450: Policy loss: 0.032607. Value loss: 9.711875. Entropy: 0.777073.
Iteration 5451: Policy loss: 0.106705. Value loss: 8.086752. Entropy: 0.769156.
Training network. lr: 0.000208. clip: 0.083264
Iteration 5452: Policy loss: 0.192491. Value loss: 14.410005. Entropy: 0.691122.
Iter

episode: 2405   score: 180.0  epsilon: 1.0    steps: 721  evaluation reward: 199.75
Training network. lr: 0.000208. clip: 0.083107
Iteration 5506: Policy loss: 0.179903. Value loss: 24.028978. Entropy: 0.498994.
Iteration 5507: Policy loss: 0.196385. Value loss: 13.168758. Entropy: 0.501572.
Iteration 5508: Policy loss: 0.084301. Value loss: 9.911963. Entropy: 0.496589.
episode: 2406   score: 215.0  epsilon: 1.0    steps: 436  evaluation reward: 199.8
episode: 2407   score: 315.0  epsilon: 1.0    steps: 783  evaluation reward: 200.3
episode: 2408   score: 210.0  epsilon: 1.0    steps: 931  evaluation reward: 200.6
Training network. lr: 0.000208. clip: 0.083107
Iteration 5509: Policy loss: 0.478029. Value loss: 8.067687. Entropy: 0.470395.
Iteration 5510: Policy loss: 0.465948. Value loss: 5.662351. Entropy: 0.442598.
Iteration 5511: Policy loss: 0.773340. Value loss: 4.618909. Entropy: 0.441243.
Training network. lr: 0.000208. clip: 0.083107
Iteration 5512: Policy loss: -0.877419. Valu

episode: 2434   score: 285.0  epsilon: 1.0    steps: 204  evaluation reward: 199.1
episode: 2435   score: 105.0  epsilon: 1.0    steps: 588  evaluation reward: 198.65
episode: 2436   score: 395.0  epsilon: 1.0    steps: 751  evaluation reward: 201.8
Training network. lr: 0.000207. clip: 0.082960
Iteration 5566: Policy loss: 1.215006. Value loss: 17.552172. Entropy: 0.326226.
Iteration 5567: Policy loss: 1.207279. Value loss: 11.640192. Entropy: 0.309669.
Iteration 5568: Policy loss: 1.298972. Value loss: 10.010635. Entropy: 0.315512.
Training network. lr: 0.000207. clip: 0.082960
Iteration 5569: Policy loss: 0.033236. Value loss: 12.350272. Entropy: 0.550036.
Iteration 5570: Policy loss: -0.108268. Value loss: 8.468417. Entropy: 0.579058.
Iteration 5571: Policy loss: 0.028675. Value loss: 7.388412. Entropy: 0.556710.
Training network. lr: 0.000207. clip: 0.082960
Iteration 5572: Policy loss: -0.438170. Value loss: 19.143467. Entropy: 0.324644.
Iteration 5573: Policy loss: -0.370774. Va

Training network. lr: 0.000207. clip: 0.082803
Iteration 5626: Policy loss: 1.354437. Value loss: 19.591219. Entropy: 0.473332.
Iteration 5627: Policy loss: 1.325829. Value loss: 9.316572. Entropy: 0.493891.
Iteration 5628: Policy loss: 1.441732. Value loss: 6.952620. Entropy: 0.491906.
episode: 2463   score: 180.0  epsilon: 1.0    steps: 157  evaluation reward: 199.9
Training network. lr: 0.000207. clip: 0.082803
Iteration 5629: Policy loss: 0.929460. Value loss: 11.777409. Entropy: 0.469686.
Iteration 5630: Policy loss: 0.875056. Value loss: 6.036533. Entropy: 0.482646.
Iteration 5631: Policy loss: 0.929494. Value loss: 4.131516. Entropy: 0.487149.
episode: 2464   score: 115.0  epsilon: 1.0    steps: 829  evaluation reward: 199.85
Training network. lr: 0.000207. clip: 0.082803
Iteration 5632: Policy loss: 0.578521. Value loss: 13.689707. Entropy: 0.470765.
Iteration 5633: Policy loss: 0.429023. Value loss: 7.797975. Entropy: 0.489150.
Iteration 5634: Policy loss: 0.301854. Value loss

Iteration 5690: Policy loss: -1.640856. Value loss: 8.338089. Entropy: 0.588648.
Iteration 5691: Policy loss: -1.295399. Value loss: 9.022756. Entropy: 0.602259.
episode: 2487   score: 180.0  epsilon: 1.0    steps: 674  evaluation reward: 203.15
Training network. lr: 0.000207. clip: 0.082646
Iteration 5692: Policy loss: -0.345383. Value loss: 14.203615. Entropy: 0.603910.
Iteration 5693: Policy loss: -0.069751. Value loss: 6.912871. Entropy: 0.591386.
Iteration 5694: Policy loss: -0.521674. Value loss: 4.714059. Entropy: 0.598091.
episode: 2488   score: 210.0  epsilon: 1.0    steps: 918  evaluation reward: 204.15
Training network. lr: 0.000207. clip: 0.082646
Iteration 5695: Policy loss: -0.844088. Value loss: 9.836941. Entropy: 0.400666.
Iteration 5696: Policy loss: -0.722080. Value loss: 5.878652. Entropy: 0.404271.
Iteration 5697: Policy loss: -0.800555. Value loss: 5.759713. Entropy: 0.391063.
episode: 2489   score: 155.0  epsilon: 1.0    steps: 412  evaluation reward: 202.25
Train

episode: 2514   score: 355.0  epsilon: 1.0    steps: 631  evaluation reward: 204.55
episode: 2515   score: 260.0  epsilon: 1.0    steps: 901  evaluation reward: 204.55
Training network. lr: 0.000206. clip: 0.082342
Iteration 5752: Policy loss: 0.834293. Value loss: 25.171749. Entropy: 0.458976.
Iteration 5753: Policy loss: 0.667077. Value loss: 13.674945. Entropy: 0.465010.
Iteration 5754: Policy loss: 0.848762. Value loss: 9.184001. Entropy: 0.472444.
episode: 2516   score: 50.0  epsilon: 1.0    steps: 790  evaluation reward: 203.85
Training network. lr: 0.000206. clip: 0.082342
Iteration 5755: Policy loss: 0.668602. Value loss: 10.850893. Entropy: 0.415151.
Iteration 5756: Policy loss: 0.597607. Value loss: 7.377614. Entropy: 0.410081.
Iteration 5757: Policy loss: 0.573162. Value loss: 5.237402. Entropy: 0.412264.
episode: 2517   score: 180.0  epsilon: 1.0    steps: 51  evaluation reward: 203.55
Training network. lr: 0.000206. clip: 0.082342
Iteration 5758: Policy loss: 0.338089. Val

Training network. lr: 0.000205. clip: 0.082185
Iteration 5815: Policy loss: -0.679827. Value loss: 22.918814. Entropy: 0.588192.
Iteration 5816: Policy loss: -0.928987. Value loss: 12.723268. Entropy: 0.569316.
Iteration 5817: Policy loss: -0.487754. Value loss: 8.740479. Entropy: 0.588134.
episode: 2540   score: 275.0  epsilon: 1.0    steps: 94  evaluation reward: 208.8
Training network. lr: 0.000205. clip: 0.082185
Iteration 5818: Policy loss: 0.331510. Value loss: 9.265335. Entropy: 0.475188.
Iteration 5819: Policy loss: 0.364489. Value loss: 5.332398. Entropy: 0.484081.
Iteration 5820: Policy loss: 0.315529. Value loss: 4.312104. Entropy: 0.484249.
episode: 2541   score: 210.0  epsilon: 1.0    steps: 227  evaluation reward: 209.1
episode: 2542   score: 240.0  epsilon: 1.0    steps: 274  evaluation reward: 210.45
episode: 2543   score: 180.0  epsilon: 1.0    steps: 455  evaluation reward: 209.65
episode: 2544   score: 185.0  epsilon: 1.0    steps: 629  evaluation reward: 207.6
episo

episode: 2572   score: 390.0  epsilon: 1.0    steps: 503  evaluation reward: 208.4
Training network. lr: 0.000205. clip: 0.082038
Iteration 5872: Policy loss: -2.800666. Value loss: 219.970169. Entropy: 0.364620.
Iteration 5873: Policy loss: -2.442656. Value loss: 132.258057. Entropy: 0.337674.
Iteration 5874: Policy loss: -2.873480. Value loss: 92.010544. Entropy: 0.326027.
Training network. lr: 0.000205. clip: 0.082038
Iteration 5875: Policy loss: 0.546630. Value loss: 25.243422. Entropy: 0.478294.
Iteration 5876: Policy loss: 0.486519. Value loss: 12.740969. Entropy: 0.455781.
Iteration 5877: Policy loss: 0.592621. Value loss: 9.643446. Entropy: 0.458677.
episode: 2573   score: 230.0  epsilon: 1.0    steps: 328  evaluation reward: 208.9
episode: 2574   score: 80.0  epsilon: 1.0    steps: 577  evaluation reward: 208.45
episode: 2575   score: 135.0  epsilon: 1.0    steps: 874  evaluation reward: 206.7
Training network. lr: 0.000205. clip: 0.082038
Iteration 5878: Policy loss: 2.664894

Iteration 5928: Policy loss: -2.426757. Value loss: 37.401104. Entropy: 0.293263.
episode: 2605   score: 185.0  epsilon: 1.0    steps: 418  evaluation reward: 192.0
episode: 2606   score: 210.0  epsilon: 1.0    steps: 545  evaluation reward: 188.95
Training network. lr: 0.000205. clip: 0.081881
Iteration 5929: Policy loss: 0.773762. Value loss: 53.324047. Entropy: 0.540682.
Iteration 5930: Policy loss: 0.807047. Value loss: 18.695154. Entropy: 0.487290.
Iteration 5931: Policy loss: 1.033228. Value loss: 14.472447. Entropy: 0.470039.
episode: 2607   score: 165.0  epsilon: 1.0    steps: 372  evaluation reward: 189.5
Training network. lr: 0.000205. clip: 0.081881
Iteration 5932: Policy loss: 4.085889. Value loss: 37.483574. Entropy: 0.311820.
Iteration 5933: Policy loss: 3.501954. Value loss: 20.700846. Entropy: 0.331255.
Iteration 5934: Policy loss: 3.735038. Value loss: 18.137947. Entropy: 0.334454.
episode: 2608   score: 220.0  epsilon: 1.0    steps: 199  evaluation reward: 190.95
Trai

Iteration 5990: Policy loss: -0.606037. Value loss: 14.583551. Entropy: 0.263223.
Iteration 5991: Policy loss: -0.769790. Value loss: 11.842278. Entropy: 0.268112.
episode: 2632   score: 255.0  epsilon: 1.0    steps: 183  evaluation reward: 194.55
episode: 2633   score: 240.0  epsilon: 1.0    steps: 285  evaluation reward: 193.65
episode: 2634   score: 290.0  epsilon: 1.0    steps: 641  evaluation reward: 194.75
Training network. lr: 0.000204. clip: 0.081725
Iteration 5992: Policy loss: 1.220258. Value loss: 9.095880. Entropy: 0.364113.
Iteration 5993: Policy loss: 0.973296. Value loss: 6.653890. Entropy: 0.371579.
Iteration 5994: Policy loss: 0.899800. Value loss: 4.168577. Entropy: 0.377039.
Training network. lr: 0.000204. clip: 0.081725
Iteration 5995: Policy loss: -0.431065. Value loss: 17.091276. Entropy: 0.417467.
Iteration 5996: Policy loss: -0.446587. Value loss: 12.430593. Entropy: 0.412826.
Iteration 5997: Policy loss: -0.391711. Value loss: 8.654578. Entropy: 0.421656.
episo

episode: 2662   score: 320.0  epsilon: 1.0    steps: 631  evaluation reward: 193.15
Training network. lr: 0.000204. clip: 0.081577
Iteration 6049: Policy loss: -2.304451. Value loss: 33.349949. Entropy: 0.511124.
Iteration 6050: Policy loss: -2.195524. Value loss: 17.911903. Entropy: 0.518488.
Iteration 6051: Policy loss: -2.718853. Value loss: 18.799520. Entropy: 0.503822.
episode: 2663   score: 285.0  epsilon: 1.0    steps: 859  evaluation reward: 194.65
Training network. lr: 0.000204. clip: 0.081421
Iteration 6052: Policy loss: 0.429782. Value loss: 30.653851. Entropy: 0.563625.
Iteration 6053: Policy loss: 0.517095. Value loss: 17.513180. Entropy: 0.587450.
Iteration 6054: Policy loss: 0.839190. Value loss: 11.033463. Entropy: 0.586105.
episode: 2664   score: 180.0  epsilon: 1.0    steps: 349  evaluation reward: 194.35
episode: 2665   score: 240.0  epsilon: 1.0    steps: 745  evaluation reward: 195.55
Training network. lr: 0.000204. clip: 0.081421
Iteration 6055: Policy loss: -0.11

Iteration 6107: Policy loss: -0.230442. Value loss: 10.890643. Entropy: 0.385091.
Iteration 6108: Policy loss: -0.192661. Value loss: 9.470179. Entropy: 0.396792.
Training network. lr: 0.000203. clip: 0.081264
Iteration 6109: Policy loss: 3.264207. Value loss: 19.181974. Entropy: 0.586547.
Iteration 6110: Policy loss: 2.914472. Value loss: 10.384481. Entropy: 0.581742.
Iteration 6111: Policy loss: 3.085068. Value loss: 8.945456. Entropy: 0.578465.
episode: 2693   score: 125.0  epsilon: 1.0    steps: 204  evaluation reward: 199.1
episode: 2694   score: 75.0  epsilon: 1.0    steps: 274  evaluation reward: 198.75
Training network. lr: 0.000203. clip: 0.081264
Iteration 6112: Policy loss: 0.376848. Value loss: 29.930138. Entropy: 0.558422.
Iteration 6113: Policy loss: 0.533303. Value loss: 14.247811. Entropy: 0.510568.
Iteration 6114: Policy loss: 0.355699. Value loss: 10.310543. Entropy: 0.487603.
episode: 2695   score: 260.0  epsilon: 1.0    steps: 509  evaluation reward: 200.9
episode: 

episode: 2720   score: 260.0  epsilon: 1.0    steps: 807  evaluation reward: 203.05
episode: 2721   score: 285.0  epsilon: 1.0    steps: 958  evaluation reward: 203.6
Training network. lr: 0.000203. clip: 0.081116
Iteration 6169: Policy loss: -0.322445. Value loss: 15.786114. Entropy: 0.629484.
Iteration 6170: Policy loss: -0.485089. Value loss: 9.063816. Entropy: 0.617546.
Iteration 6171: Policy loss: -0.506371. Value loss: 8.829552. Entropy: 0.625314.
Training network. lr: 0.000203. clip: 0.081116
Iteration 6172: Policy loss: -0.904330. Value loss: 24.971134. Entropy: 0.541578.
Iteration 6173: Policy loss: -0.669767. Value loss: 13.809347. Entropy: 0.513287.
Iteration 6174: Policy loss: -0.883942. Value loss: 10.102118. Entropy: 0.494322.
episode: 2722   score: 265.0  epsilon: 1.0    steps: 134  evaluation reward: 204.7
episode: 2723   score: 135.0  epsilon: 1.0    steps: 722  evaluation reward: 203.95
Training network. lr: 0.000203. clip: 0.081116
Iteration 6175: Policy loss: -0.193

episode: 2746   score: 195.0  epsilon: 1.0    steps: 647  evaluation reward: 203.65
episode: 2747   score: 210.0  epsilon: 1.0    steps: 929  evaluation reward: 203.6
Training network. lr: 0.000202. clip: 0.080960
Iteration 6232: Policy loss: -1.076785. Value loss: 10.264053. Entropy: 0.692033.
Iteration 6233: Policy loss: -0.931731. Value loss: 8.026813. Entropy: 0.680092.
Iteration 6234: Policy loss: -0.853935. Value loss: 4.969696. Entropy: 0.709407.
Training network. lr: 0.000202. clip: 0.080960
Iteration 6235: Policy loss: -0.132101. Value loss: 27.164579. Entropy: 0.428559.
Iteration 6236: Policy loss: -0.327942. Value loss: 16.879675. Entropy: 0.475739.
Iteration 6237: Policy loss: -0.064920. Value loss: 14.874904. Entropy: 0.439136.
Training network. lr: 0.000202. clip: 0.080960
Iteration 6238: Policy loss: 0.091908. Value loss: 27.845161. Entropy: 0.588247.
Iteration 6239: Policy loss: 0.162840. Value loss: 15.620988. Entropy: 0.599234.
Iteration 6240: Policy loss: 0.195421. V

Training network. lr: 0.000202. clip: 0.080803
Iteration 6295: Policy loss: 2.144563. Value loss: 18.404728. Entropy: 0.660342.
Iteration 6296: Policy loss: 2.273962. Value loss: 10.577043. Entropy: 0.634072.
Iteration 6297: Policy loss: 2.194955. Value loss: 11.572173. Entropy: 0.650888.
Training network. lr: 0.000202. clip: 0.080803
Iteration 6298: Policy loss: 0.462230. Value loss: 14.276697. Entropy: 0.336565.
Iteration 6299: Policy loss: 0.268602. Value loss: 13.685552. Entropy: 0.327097.
Iteration 6300: Policy loss: 0.316896. Value loss: 12.165421. Entropy: 0.353349.
episode: 2771   score: 180.0  epsilon: 1.0    steps: 356  evaluation reward: 218.05
episode: 2772   score: 120.0  epsilon: 1.0    steps: 984  evaluation reward: 217.75
Training network. lr: 0.000202. clip: 0.080656
Iteration 6301: Policy loss: 2.302907. Value loss: 39.566227. Entropy: 0.589135.
Iteration 6302: Policy loss: 2.157692. Value loss: 21.342579. Entropy: 0.593882.
Iteration 6303: Policy loss: 2.173975. Valu

Iteration 6356: Policy loss: -0.579541. Value loss: 11.688658. Entropy: 0.402339.
Iteration 6357: Policy loss: -0.476607. Value loss: 11.763608. Entropy: 0.387084.
episode: 2799   score: 240.0  epsilon: 1.0    steps: 523  evaluation reward: 224.8
Training network. lr: 0.000201. clip: 0.080499
Iteration 6358: Policy loss: 0.871492. Value loss: 15.079495. Entropy: 0.483442.
Iteration 6359: Policy loss: 0.894980. Value loss: 8.890687. Entropy: 0.476924.
Iteration 6360: Policy loss: 0.921607. Value loss: 7.421604. Entropy: 0.474628.
episode: 2800   score: 155.0  epsilon: 1.0    steps: 60  evaluation reward: 223.8
now time :  2019-02-25 20:38:54.606854
episode: 2801   score: 285.0  epsilon: 1.0    steps: 408  evaluation reward: 224.35
Training network. lr: 0.000201. clip: 0.080499
Iteration 6361: Policy loss: 1.021776. Value loss: 16.105803. Entropy: 0.543774.
Iteration 6362: Policy loss: 0.866455. Value loss: 10.374643. Entropy: 0.542875.
Iteration 6363: Policy loss: 0.984181. Value loss: 

episode: 2826   score: 225.0  epsilon: 1.0    steps: 752  evaluation reward: 233.85
Training network. lr: 0.000201. clip: 0.080342
Iteration 6418: Policy loss: 1.250571. Value loss: 16.723721. Entropy: 0.645951.
Iteration 6419: Policy loss: 1.144503. Value loss: 11.683695. Entropy: 0.664365.
Iteration 6420: Policy loss: 1.089853. Value loss: 9.092857. Entropy: 0.642370.
episode: 2827   score: 180.0  epsilon: 1.0    steps: 365  evaluation reward: 232.8
episode: 2828   score: 135.0  epsilon: 1.0    steps: 1009  evaluation reward: 231.75
Training network. lr: 0.000201. clip: 0.080342
Iteration 6421: Policy loss: -5.193030. Value loss: 445.643250. Entropy: 0.464994.
Iteration 6422: Policy loss: -4.606608. Value loss: 207.935242. Entropy: 0.415375.
Iteration 6423: Policy loss: -4.013222. Value loss: 126.743484. Entropy: 0.412936.
Training network. lr: 0.000201. clip: 0.080342
Iteration 6424: Policy loss: -1.145135. Value loss: 253.171082. Entropy: 0.309670.
Iteration 6425: Policy loss: -1.2

Training network. lr: 0.000200. clip: 0.080195
Iteration 6478: Policy loss: 0.328652. Value loss: 11.321465. Entropy: 0.728018.
Iteration 6479: Policy loss: 0.237843. Value loss: 7.062061. Entropy: 0.757642.
Iteration 6480: Policy loss: 0.225195. Value loss: 6.002151. Entropy: 0.765338.
episode: 2855   score: 185.0  epsilon: 1.0    steps: 500  evaluation reward: 228.75
episode: 2856   score: 135.0  epsilon: 1.0    steps: 914  evaluation reward: 225.75
Training network. lr: 0.000200. clip: 0.080195
Iteration 6481: Policy loss: 1.888680. Value loss: 14.057236. Entropy: 0.841385.
Iteration 6482: Policy loss: 1.939135. Value loss: 7.159658. Entropy: 0.815507.
Iteration 6483: Policy loss: 1.838467. Value loss: 5.863451. Entropy: 0.811205.
episode: 2857   score: 135.0  epsilon: 1.0    steps: 314  evaluation reward: 225.0
episode: 2858   score: 215.0  epsilon: 1.0    steps: 722  evaluation reward: 224.5
Training network. lr: 0.000200. clip: 0.080195
Iteration 6484: Policy loss: 1.669205. Valu

episode: 2884   score: 235.0  epsilon: 1.0    steps: 514  evaluation reward: 208.75
Training network. lr: 0.000200. clip: 0.080038
Iteration 6538: Policy loss: -5.343370. Value loss: 315.784973. Entropy: 0.679002.
Iteration 6539: Policy loss: -5.396975. Value loss: 129.080002. Entropy: 0.590292.
Iteration 6540: Policy loss: -5.512131. Value loss: 145.082779. Entropy: 0.583428.
episode: 2885   score: 225.0  epsilon: 1.0    steps: 769  evaluation reward: 208.4
Training network. lr: 0.000200. clip: 0.080038
Iteration 6541: Policy loss: -1.746175. Value loss: 39.554108. Entropy: 0.524962.
Iteration 6542: Policy loss: -1.671353. Value loss: 20.888775. Entropy: 0.519625.
Iteration 6543: Policy loss: -1.976876. Value loss: 15.369419. Entropy: 0.522449.
episode: 2886   score: 440.0  epsilon: 1.0    steps: 745  evaluation reward: 210.65
episode: 2887   score: 130.0  epsilon: 1.0    steps: 1004  evaluation reward: 210.15
Training network. lr: 0.000200. clip: 0.080038
Iteration 6544: Policy loss:

Iteration 6597: Policy loss: 1.358149. Value loss: 3.941272. Entropy: 0.703459.
episode: 2913   score: 105.0  epsilon: 1.0    steps: 463  evaluation reward: 215.7
episode: 2914   score: 75.0  epsilon: 1.0    steps: 919  evaluation reward: 213.2
Training network. lr: 0.000200. clip: 0.079881
Iteration 6598: Policy loss: 1.159881. Value loss: 9.145762. Entropy: 0.385774.
Iteration 6599: Policy loss: 1.156647. Value loss: 6.822482. Entropy: 0.414379.
Iteration 6600: Policy loss: 1.173955. Value loss: 6.367426. Entropy: 0.439182.
episode: 2915   score: 265.0  epsilon: 1.0    steps: 242  evaluation reward: 214.65
Training network. lr: 0.000199. clip: 0.079734
Iteration 6601: Policy loss: 1.086706. Value loss: 10.715366. Entropy: 0.912331.
Iteration 6602: Policy loss: 1.163842. Value loss: 5.722205. Entropy: 0.904486.
Iteration 6603: Policy loss: 1.048041. Value loss: 4.279373. Entropy: 0.927205.
episode: 2916   score: 75.0  epsilon: 1.0    steps: 313  evaluation reward: 212.8
episode: 2917 

episode: 2945   score: 300.0  epsilon: 1.0    steps: 349  evaluation reward: 182.9
Training network. lr: 0.000199. clip: 0.079577
Iteration 6655: Policy loss: -0.265650. Value loss: 18.497187. Entropy: 0.663269.
Iteration 6656: Policy loss: -0.211773. Value loss: 9.511113. Entropy: 0.670053.
Iteration 6657: Policy loss: -0.341519. Value loss: 8.560772. Entropy: 0.632695.
episode: 2946   score: 105.0  epsilon: 1.0    steps: 521  evaluation reward: 181.55
episode: 2947   score: 195.0  epsilon: 1.0    steps: 872  evaluation reward: 182.4
Training network. lr: 0.000199. clip: 0.079577
Iteration 6658: Policy loss: -0.057782. Value loss: 21.695015. Entropy: 0.608800.
Iteration 6659: Policy loss: 0.031747. Value loss: 10.630980. Entropy: 0.610400.
Iteration 6660: Policy loss: -0.119199. Value loss: 8.356744. Entropy: 0.619884.
episode: 2948   score: 270.0  epsilon: 1.0    steps: 507  evaluation reward: 184.8
episode: 2949   score: 135.0  epsilon: 1.0    steps: 741  evaluation reward: 185.1
Tr

Iteration 6712: Policy loss: -1.619627. Value loss: 33.682491. Entropy: 0.785547.
Iteration 6713: Policy loss: -1.708935. Value loss: 17.940725. Entropy: 0.770255.
Iteration 6714: Policy loss: -1.705692. Value loss: 13.752329. Entropy: 0.772272.
episode: 2977   score: 220.0  epsilon: 1.0    steps: 335  evaluation reward: 187.6
episode: 2978   score: 345.0  epsilon: 1.0    steps: 677  evaluation reward: 189.5
episode: 2979   score: 105.0  epsilon: 1.0    steps: 828  evaluation reward: 188.45
Training network. lr: 0.000199. clip: 0.079421
Iteration 6715: Policy loss: 0.319424. Value loss: 18.312464. Entropy: 0.906632.
Iteration 6716: Policy loss: 0.193300. Value loss: 8.750050. Entropy: 0.884528.
Iteration 6717: Policy loss: 0.076127. Value loss: 7.023168. Entropy: 0.918474.
Training network. lr: 0.000199. clip: 0.079421
Iteration 6718: Policy loss: -0.999981. Value loss: 19.348555. Entropy: 0.648786.
Iteration 6719: Policy loss: -0.864450. Value loss: 11.183462. Entropy: 0.660541.
Itera

Iteration 6772: Policy loss: 0.102297. Value loss: 19.080988. Entropy: 0.618566.
Iteration 6773: Policy loss: 0.023363. Value loss: 12.078233. Entropy: 0.637619.
Iteration 6774: Policy loss: -0.037581. Value loss: 8.775872. Entropy: 0.624885.
episode: 3005   score: 225.0  epsilon: 1.0    steps: 839  evaluation reward: 184.0
Training network. lr: 0.000198. clip: 0.079273
Iteration 6775: Policy loss: 1.081723. Value loss: 32.675533. Entropy: 0.572535.
Iteration 6776: Policy loss: 0.660296. Value loss: 19.124628. Entropy: 0.575306.
Iteration 6777: Policy loss: 0.799957. Value loss: 14.340330. Entropy: 0.569248.
Training network. lr: 0.000198. clip: 0.079273
Iteration 6778: Policy loss: -1.253003. Value loss: 32.973351. Entropy: 0.507162.
Iteration 6779: Policy loss: -1.168996. Value loss: 15.910953. Entropy: 0.511597.
Iteration 6780: Policy loss: -1.064971. Value loss: 12.388615. Entropy: 0.499340.
episode: 3006   score: 125.0  epsilon: 1.0    steps: 355  evaluation reward: 182.4
Training

episode: 3035   score: 240.0  epsilon: 1.0    steps: 586  evaluation reward: 211.0
episode: 3036   score: 260.0  epsilon: 1.0    steps: 991  evaluation reward: 212.8
Training network. lr: 0.000198. clip: 0.079117
Iteration 6832: Policy loss: -1.712093. Value loss: 28.053623. Entropy: 0.444875.
Iteration 6833: Policy loss: -1.518338. Value loss: 17.743361. Entropy: 0.467636.
Iteration 6834: Policy loss: -1.709108. Value loss: 14.728077. Entropy: 0.455742.
Training network. lr: 0.000198. clip: 0.079117
Iteration 6835: Policy loss: 0.010831. Value loss: 26.711437. Entropy: 0.530274.
Iteration 6836: Policy loss: 0.014025. Value loss: 15.248648. Entropy: 0.542772.
Iteration 6837: Policy loss: 0.015266. Value loss: 11.272017. Entropy: 0.565387.
episode: 3037   score: 275.0  epsilon: 1.0    steps: 262  evaluation reward: 214.5
Training network. lr: 0.000198. clip: 0.079117
Iteration 6838: Policy loss: 0.484220. Value loss: 31.281155. Entropy: 0.616234.
Iteration 6839: Policy loss: 0.528777. V

Iteration 6892: Policy loss: -2.425145. Value loss: 28.995779. Entropy: 0.622169.
Iteration 6893: Policy loss: -2.231010. Value loss: 16.142897. Entropy: 0.637219.
Iteration 6894: Policy loss: -2.222157. Value loss: 12.265715. Entropy: 0.625284.
episode: 3063   score: 345.0  epsilon: 1.0    steps: 769  evaluation reward: 235.1
episode: 3064   score: 150.0  epsilon: 1.0    steps: 990  evaluation reward: 232.8
Training network. lr: 0.000197. clip: 0.078960
Iteration 6895: Policy loss: -0.935072. Value loss: 27.089642. Entropy: 0.518793.
Iteration 6896: Policy loss: -0.930653. Value loss: 15.873392. Entropy: 0.483056.
Iteration 6897: Policy loss: -0.827516. Value loss: 14.229106. Entropy: 0.502205.
Training network. lr: 0.000197. clip: 0.078960
Iteration 6898: Policy loss: 1.362264. Value loss: 20.216970. Entropy: 0.720522.
Iteration 6899: Policy loss: 1.344995. Value loss: 11.612790. Entropy: 0.706785.
Iteration 6900: Policy loss: 1.411379. Value loss: 9.955474. Entropy: 0.721975.
episod

Iteration 6950: Policy loss: 0.447834. Value loss: 23.251740. Entropy: 0.547277.
Iteration 6951: Policy loss: 0.360525. Value loss: 16.231928. Entropy: 0.524687.
episode: 3094   score: 65.0  epsilon: 1.0    steps: 453  evaluation reward: 226.75
episode: 3095   score: 260.0  epsilon: 1.0    steps: 513  evaluation reward: 224.4
Training network. lr: 0.000197. clip: 0.078656
Iteration 6952: Policy loss: -0.563435. Value loss: 29.358601. Entropy: 0.633174.
Iteration 6953: Policy loss: -0.938138. Value loss: 17.671539. Entropy: 0.614109.
Iteration 6954: Policy loss: -1.006961. Value loss: 19.928648. Entropy: 0.647847.
Training network. lr: 0.000197. clip: 0.078656
Iteration 6955: Policy loss: -1.367084. Value loss: 24.032316. Entropy: 0.425646.
Iteration 6956: Policy loss: -1.383660. Value loss: 15.664126. Entropy: 0.417901.
Iteration 6957: Policy loss: -1.417666. Value loss: 12.959441. Entropy: 0.430791.
Training network. lr: 0.000197. clip: 0.078656
Iteration 6958: Policy loss: 0.409715. 

Training network. lr: 0.000196. clip: 0.078499
Iteration 7009: Policy loss: 1.053966. Value loss: 28.339737. Entropy: 0.505438.
Iteration 7010: Policy loss: 1.255273. Value loss: 19.736921. Entropy: 0.485190.
Iteration 7011: Policy loss: 1.075701. Value loss: 14.306350. Entropy: 0.501321.
episode: 3124   score: 80.0  epsilon: 1.0    steps: 192  evaluation reward: 220.2
episode: 3125   score: 215.0  epsilon: 1.0    steps: 319  evaluation reward: 220.25
Training network. lr: 0.000196. clip: 0.078499
Iteration 7012: Policy loss: -0.120111. Value loss: 25.602386. Entropy: 0.385186.
Iteration 7013: Policy loss: -0.246532. Value loss: 12.783136. Entropy: 0.389006.
Iteration 7014: Policy loss: -0.453015. Value loss: 10.086871. Entropy: 0.393587.
episode: 3126   score: 260.0  epsilon: 1.0    steps: 460  evaluation reward: 220.6
episode: 3127   score: 275.0  epsilon: 1.0    steps: 885  evaluation reward: 221.8
Training network. lr: 0.000196. clip: 0.078499
Iteration 7015: Policy loss: 1.448109.

Training network. lr: 0.000196. clip: 0.078352
Iteration 7069: Policy loss: 2.062272. Value loss: 35.264587. Entropy: 0.324681.
Iteration 7070: Policy loss: 1.838333. Value loss: 16.845415. Entropy: 0.374040.
Iteration 7071: Policy loss: 1.792724. Value loss: 15.647941. Entropy: 0.378385.
episode: 3152   score: 215.0  epsilon: 1.0    steps: 582  evaluation reward: 232.3
Training network. lr: 0.000196. clip: 0.078352
Iteration 7072: Policy loss: 0.085856. Value loss: 33.580696. Entropy: 0.336491.
Iteration 7073: Policy loss: 0.226360. Value loss: 14.731666. Entropy: 0.339967.
Iteration 7074: Policy loss: 0.129568. Value loss: 12.368567. Entropy: 0.338155.
episode: 3153   score: 240.0  epsilon: 1.0    steps: 365  evaluation reward: 233.95
Training network. lr: 0.000196. clip: 0.078352
Iteration 7075: Policy loss: -1.925846. Value loss: 37.213593. Entropy: 0.337988.
Iteration 7076: Policy loss: -1.625578. Value loss: 23.570738. Entropy: 0.315134.
Iteration 7077: Policy loss: -1.977047. Va

Iteration 7130: Policy loss: 0.758296. Value loss: 16.579914. Entropy: 0.274508.
Iteration 7131: Policy loss: 0.386564. Value loss: 13.758310. Entropy: 0.246743.
Training network. lr: 0.000195. clip: 0.078195
Iteration 7132: Policy loss: 0.819066. Value loss: 26.848883. Entropy: 0.309898.
Iteration 7133: Policy loss: 0.854260. Value loss: 18.268108. Entropy: 0.330561.
Iteration 7134: Policy loss: 0.681075. Value loss: 12.618000. Entropy: 0.302800.
episode: 3179   score: 260.0  epsilon: 1.0    steps: 799  evaluation reward: 249.0
Training network. lr: 0.000195. clip: 0.078195
Iteration 7135: Policy loss: 1.459106. Value loss: 32.530529. Entropy: 0.328775.
Iteration 7136: Policy loss: 1.663697. Value loss: 22.821297. Entropy: 0.340160.
Iteration 7137: Policy loss: 1.414118. Value loss: 17.350342. Entropy: 0.337689.
episode: 3180   score: 225.0  epsilon: 1.0    steps: 98  evaluation reward: 248.9
Training network. lr: 0.000195. clip: 0.078195
Iteration 7138: Policy loss: 0.040178. Value l

Training network. lr: 0.000195. clip: 0.078038
Iteration 7192: Policy loss: 2.419033. Value loss: 31.792269. Entropy: 0.501449.
Iteration 7193: Policy loss: 2.623880. Value loss: 18.068054. Entropy: 0.490874.
Iteration 7194: Policy loss: 2.469742. Value loss: 14.933040. Entropy: 0.496331.
episode: 3206   score: 240.0  epsilon: 1.0    steps: 858  evaluation reward: 263.15
Training network. lr: 0.000195. clip: 0.078038
Iteration 7195: Policy loss: 1.499198. Value loss: 28.078077. Entropy: 0.401900.
Iteration 7196: Policy loss: 1.561404. Value loss: 13.437133. Entropy: 0.423787.
Iteration 7197: Policy loss: 1.466037. Value loss: 12.797100. Entropy: 0.428548.
Training network. lr: 0.000195. clip: 0.078038
Iteration 7198: Policy loss: 3.098836. Value loss: 23.256384. Entropy: 0.315042.
Iteration 7199: Policy loss: 3.041380. Value loss: 14.829965. Entropy: 0.308659.
Iteration 7200: Policy loss: 3.059209. Value loss: 13.694926. Entropy: 0.380083.
episode: 3207   score: 180.0  epsilon: 1.0    

Iteration 7253: Policy loss: -0.665847. Value loss: 14.142336. Entropy: 0.754052.
Iteration 7254: Policy loss: -1.119935. Value loss: 14.371737. Entropy: 0.782355.
Training network. lr: 0.000194. clip: 0.077734
Iteration 7255: Policy loss: 2.335570. Value loss: 26.938019. Entropy: 0.753332.
Iteration 7256: Policy loss: 2.487636. Value loss: 18.761889. Entropy: 0.748217.
Iteration 7257: Policy loss: 2.287062. Value loss: 14.593274. Entropy: 0.745838.
episode: 3233   score: 105.0  epsilon: 1.0    steps: 122  evaluation reward: 266.35
Training network. lr: 0.000194. clip: 0.077734
Iteration 7258: Policy loss: 0.054890. Value loss: 16.722067. Entropy: 0.587322.
Iteration 7259: Policy loss: -0.015175. Value loss: 8.469484. Entropy: 0.590602.
Iteration 7260: Policy loss: 0.086465. Value loss: 7.543274. Entropy: 0.524240.
episode: 3234   score: 210.0  epsilon: 1.0    steps: 258  evaluation reward: 264.75
episode: 3235   score: 285.0  epsilon: 1.0    steps: 725  evaluation reward: 263.35
Train

Iteration 7314: Policy loss: 1.058828. Value loss: 14.831835. Entropy: 0.693349.
episode: 3261   score: 285.0  epsilon: 1.0    steps: 344  evaluation reward: 252.0
episode: 3262   score: 140.0  epsilon: 1.0    steps: 706  evaluation reward: 251.25
Training network. lr: 0.000194. clip: 0.077577
Iteration 7315: Policy loss: 0.561555. Value loss: 33.512131. Entropy: 0.678018.
Iteration 7316: Policy loss: 0.943487. Value loss: 15.746131. Entropy: 0.689581.
Iteration 7317: Policy loss: 0.490943. Value loss: 11.584265. Entropy: 0.680948.
episode: 3263   score: 210.0  epsilon: 1.0    steps: 56  evaluation reward: 251.4
episode: 3264   score: 210.0  epsilon: 1.0    steps: 256  evaluation reward: 248.6
episode: 3265   score: 210.0  epsilon: 1.0    steps: 540  evaluation reward: 248.1
episode: 3266   score: 155.0  epsilon: 1.0    steps: 777  evaluation reward: 247.55
Training network. lr: 0.000194. clip: 0.077577
Iteration 7318: Policy loss: 0.928706. Value loss: 24.686419. Entropy: 0.859104.
It

episode: 3289   score: 105.0  epsilon: 1.0    steps: 618  evaluation reward: 241.7
Training network. lr: 0.000194. clip: 0.077430
Iteration 7375: Policy loss: -0.582948. Value loss: 29.945196. Entropy: 0.822206.
Iteration 7376: Policy loss: -0.595992. Value loss: 18.382549. Entropy: 0.813117.
Iteration 7377: Policy loss: -0.388830. Value loss: 15.200196. Entropy: 0.822024.
episode: 3290   score: 160.0  epsilon: 1.0    steps: 89  evaluation reward: 242.25
Training network. lr: 0.000194. clip: 0.077430
Iteration 7378: Policy loss: -0.436624. Value loss: 28.286169. Entropy: 0.726455.
Iteration 7379: Policy loss: -0.313932. Value loss: 12.011748. Entropy: 0.738245.
Iteration 7380: Policy loss: -0.518565. Value loss: 9.410004. Entropy: 0.731285.
episode: 3291   score: 105.0  epsilon: 1.0    steps: 352  evaluation reward: 240.45
Training network. lr: 0.000194. clip: 0.077430
Iteration 7381: Policy loss: -0.719723. Value loss: 31.815842. Entropy: 0.608360.
Iteration 7382: Policy loss: -0.7693

episode: 3317   score: 265.0  epsilon: 1.0    steps: 362  evaluation reward: 231.75
episode: 3318   score: 235.0  epsilon: 1.0    steps: 693  evaluation reward: 232.3
Training network. lr: 0.000193. clip: 0.077273
Iteration 7435: Policy loss: -0.058241. Value loss: 20.512720. Entropy: 0.547845.
Iteration 7436: Policy loss: 0.011023. Value loss: 13.360107. Entropy: 0.560677.
Iteration 7437: Policy loss: 0.021101. Value loss: 10.119625. Entropy: 0.542074.
episode: 3319   score: 240.0  epsilon: 1.0    steps: 424  evaluation reward: 231.85
Training network. lr: 0.000193. clip: 0.077273
Iteration 7438: Policy loss: 0.563295. Value loss: 17.136786. Entropy: 0.582237.
Iteration 7439: Policy loss: 0.601855. Value loss: 12.503965. Entropy: 0.575265.
Iteration 7440: Policy loss: 0.652857. Value loss: 10.243402. Entropy: 0.567068.
Training network. lr: 0.000193. clip: 0.077273
Iteration 7441: Policy loss: -0.096946. Value loss: 23.367626. Entropy: 0.524913.
Iteration 7442: Policy loss: 0.050570. 

Iteration 7494: Policy loss: -2.134095. Value loss: 13.468282. Entropy: 0.497597.
episode: 3347   score: 210.0  epsilon: 1.0    steps: 34  evaluation reward: 223.75
episode: 3348   score: 210.0  epsilon: 1.0    steps: 365  evaluation reward: 223.75
episode: 3349   score: 270.0  epsilon: 1.0    steps: 577  evaluation reward: 224.35
Training network. lr: 0.000193. clip: 0.077117
Iteration 7495: Policy loss: 0.544065. Value loss: 15.679067. Entropy: 0.552961.
Iteration 7496: Policy loss: 0.354107. Value loss: 9.863743. Entropy: 0.552242.
Iteration 7497: Policy loss: 0.459617. Value loss: 8.179467. Entropy: 0.554013.
episode: 3350   score: 215.0  epsilon: 1.0    steps: 388  evaluation reward: 224.7
Training network. lr: 0.000193. clip: 0.077117
Iteration 7498: Policy loss: 0.975711. Value loss: 24.728661. Entropy: 0.635242.
Iteration 7499: Policy loss: 0.811499. Value loss: 17.997820. Entropy: 0.647776.
Iteration 7500: Policy loss: 0.906181. Value loss: 14.323285. Entropy: 0.625984.
now ti

Training network. lr: 0.000192. clip: 0.076813
Iteration 7555: Policy loss: -1.682774. Value loss: 23.005930. Entropy: 0.543969.
Iteration 7556: Policy loss: -1.573440. Value loss: 10.800488. Entropy: 0.564825.
Iteration 7557: Policy loss: -1.608432. Value loss: 9.159486. Entropy: 0.552501.
episode: 3374   score: 180.0  epsilon: 1.0    steps: 201  evaluation reward: 226.9
episode: 3375   score: 240.0  epsilon: 1.0    steps: 304  evaluation reward: 226.55
episode: 3376   score: 295.0  epsilon: 1.0    steps: 646  evaluation reward: 227.1
Training network. lr: 0.000192. clip: 0.076813
Iteration 7558: Policy loss: 0.387628. Value loss: 33.680740. Entropy: 0.578231.
Iteration 7559: Policy loss: 0.454079. Value loss: 16.258190. Entropy: 0.564505.
Iteration 7560: Policy loss: 0.433494. Value loss: 13.623474. Entropy: 0.584449.
episode: 3377   score: 375.0  epsilon: 1.0    steps: 122  evaluation reward: 228.15
episode: 3378   score: 210.0  epsilon: 1.0    steps: 798  evaluation reward: 228.15


Iteration 7616: Policy loss: 1.738201. Value loss: 20.752251. Entropy: 0.616647.
Iteration 7617: Policy loss: 1.632161. Value loss: 14.452641. Entropy: 0.593627.
now time :  2019-02-25 21:02:26.821956
episode: 3401   score: 485.0  epsilon: 1.0    steps: 68  evaluation reward: 229.4
episode: 3402   score: 280.0  epsilon: 1.0    steps: 478  evaluation reward: 230.05
Training network. lr: 0.000192. clip: 0.076656
Iteration 7618: Policy loss: 0.448295. Value loss: 29.964386. Entropy: 0.509013.
Iteration 7619: Policy loss: 0.364306. Value loss: 17.147083. Entropy: 0.517136.
Iteration 7620: Policy loss: 0.344801. Value loss: 12.710340. Entropy: 0.527677.
Training network. lr: 0.000192. clip: 0.076656
Iteration 7621: Policy loss: 0.953908. Value loss: 36.200886. Entropy: 0.708599.
Iteration 7622: Policy loss: 0.732986. Value loss: 17.490658. Entropy: 0.728476.
Iteration 7623: Policy loss: 0.944093. Value loss: 11.616475. Entropy: 0.716892.
episode: 3403   score: 225.0  epsilon: 1.0    steps: 

Iteration 7676: Policy loss: -1.840669. Value loss: 17.348156. Entropy: 0.632912.
Iteration 7677: Policy loss: -1.918804. Value loss: 12.986900. Entropy: 0.639218.
episode: 3430   score: 260.0  epsilon: 1.0    steps: 201  evaluation reward: 229.7
episode: 3431   score: 180.0  epsilon: 1.0    steps: 437  evaluation reward: 230.45
Training network. lr: 0.000191. clip: 0.076508
Iteration 7678: Policy loss: -2.155740. Value loss: 30.919617. Entropy: 0.586443.
Iteration 7679: Policy loss: -1.864506. Value loss: 17.989641. Entropy: 0.600915.
Iteration 7680: Policy loss: -1.943805. Value loss: 13.189551. Entropy: 0.582050.
episode: 3432   score: 260.0  epsilon: 1.0    steps: 67  evaluation reward: 231.25
episode: 3433   score: 210.0  epsilon: 1.0    steps: 552  evaluation reward: 231.8
episode: 3434   score: 365.0  epsilon: 1.0    steps: 888  evaluation reward: 233.35
Training network. lr: 0.000191. clip: 0.076508
Iteration 7681: Policy loss: 0.258305. Value loss: 29.944201. Entropy: 0.720180

Iteration 7737: Policy loss: -0.650453. Value loss: 14.469857. Entropy: 0.612769.
episode: 3457   score: 180.0  epsilon: 1.0    steps: 291  evaluation reward: 232.8
episode: 3458   score: 255.0  epsilon: 1.0    steps: 646  evaluation reward: 232.8
Training network. lr: 0.000191. clip: 0.076352
Iteration 7738: Policy loss: -0.361140. Value loss: 35.214649. Entropy: 0.666427.
Iteration 7739: Policy loss: -0.521871. Value loss: 23.233587. Entropy: 0.654345.
Iteration 7740: Policy loss: -0.517421. Value loss: 16.862852. Entropy: 0.672717.
Training network. lr: 0.000191. clip: 0.076352
Iteration 7741: Policy loss: 0.676019. Value loss: 30.849482. Entropy: 0.595653.
Iteration 7742: Policy loss: 0.645422. Value loss: 16.397230. Entropy: 0.603082.
Iteration 7743: Policy loss: 0.480783. Value loss: 13.013117. Entropy: 0.606335.
Training network. lr: 0.000191. clip: 0.076352
Iteration 7744: Policy loss: 1.868326. Value loss: 36.228668. Entropy: 0.636752.
Iteration 7745: Policy loss: 1.771827. Va

Iteration 7799: Policy loss: 0.374598. Value loss: 18.254538. Entropy: 0.612487.
Iteration 7800: Policy loss: 0.381249. Value loss: 15.233129. Entropy: 0.618693.
episode: 3484   score: 185.0  epsilon: 1.0    steps: 471  evaluation reward: 237.9
Training network. lr: 0.000190. clip: 0.076048
Iteration 7801: Policy loss: -0.512126. Value loss: 21.580370. Entropy: 0.603303.
Iteration 7802: Policy loss: -0.574197. Value loss: 14.012331. Entropy: 0.604647.
Iteration 7803: Policy loss: -0.455140. Value loss: 12.659803. Entropy: 0.636175.
episode: 3485   score: 210.0  epsilon: 1.0    steps: 163  evaluation reward: 237.9
episode: 3486   score: 260.0  epsilon: 1.0    steps: 558  evaluation reward: 238.2
episode: 3487   score: 210.0  epsilon: 1.0    steps: 898  evaluation reward: 238.4
Training network. lr: 0.000190. clip: 0.076048
Iteration 7804: Policy loss: 1.689678. Value loss: 29.177568. Entropy: 0.563674.
Iteration 7805: Policy loss: 1.984245. Value loss: 12.813570. Entropy: 0.570893.
Iter

Iteration 7860: Policy loss: -0.021478. Value loss: 9.218896. Entropy: 0.702461.
episode: 3512   score: 275.0  epsilon: 1.0    steps: 763  evaluation reward: 226.9
episode: 3513   score: 210.0  epsilon: 1.0    steps: 784  evaluation reward: 226.4
Training network. lr: 0.000190. clip: 0.075891
Iteration 7861: Policy loss: -2.166748. Value loss: 28.306669. Entropy: 0.655091.
Iteration 7862: Policy loss: -2.315167. Value loss: 12.836143. Entropy: 0.632906.
Iteration 7863: Policy loss: -2.348294. Value loss: 9.339906. Entropy: 0.637464.
Training network. lr: 0.000190. clip: 0.075891
Iteration 7864: Policy loss: 0.399484. Value loss: 17.960016. Entropy: 0.534363.
Iteration 7865: Policy loss: 0.280058. Value loss: 11.173434. Entropy: 0.511396.
Iteration 7866: Policy loss: 0.096582. Value loss: 8.225745. Entropy: 0.511342.
episode: 3514   score: 270.0  epsilon: 1.0    steps: 997  evaluation reward: 227.0
Training network. lr: 0.000190. clip: 0.075891
Iteration 7867: Policy loss: -0.934566. Va

Iteration 7921: Policy loss: 1.530359. Value loss: 36.182945. Entropy: 0.588737.
Iteration 7922: Policy loss: 1.487731. Value loss: 19.454988. Entropy: 0.593458.
Iteration 7923: Policy loss: 1.560381. Value loss: 14.338184. Entropy: 0.605101.
episode: 3539   score: 240.0  epsilon: 1.0    steps: 370  evaluation reward: 248.45
episode: 3540   score: 410.0  epsilon: 1.0    steps: 967  evaluation reward: 250.45
Training network. lr: 0.000189. clip: 0.075734
Iteration 7924: Policy loss: 1.528901. Value loss: 39.082909. Entropy: 0.550786.
Iteration 7925: Policy loss: 1.452376. Value loss: 20.054100. Entropy: 0.539016.
Iteration 7926: Policy loss: 1.555296. Value loss: 16.362099. Entropy: 0.552701.
Training network. lr: 0.000189. clip: 0.075734
Iteration 7927: Policy loss: -0.268591. Value loss: 46.467258. Entropy: 0.582283.
Iteration 7928: Policy loss: -0.343325. Value loss: 26.236799. Entropy: 0.566888.
Iteration 7929: Policy loss: -0.365662. Value loss: 20.126282. Entropy: 0.550672.
episod

Iteration 7984: Policy loss: 0.550768. Value loss: 26.699940. Entropy: 0.564290.
Iteration 7985: Policy loss: 0.738125. Value loss: 14.147029. Entropy: 0.581439.
Iteration 7986: Policy loss: 0.359977. Value loss: 11.035158. Entropy: 0.566740.
Training network. lr: 0.000189. clip: 0.075587
Iteration 7987: Policy loss: 2.294861. Value loss: 31.217573. Entropy: 0.635634.
Iteration 7988: Policy loss: 1.827954. Value loss: 16.562092. Entropy: 0.609194.
Iteration 7989: Policy loss: 2.093875. Value loss: 12.694000. Entropy: 0.623300.
episode: 3564   score: 360.0  epsilon: 1.0    steps: 84  evaluation reward: 255.95
episode: 3565   score: 210.0  epsilon: 1.0    steps: 205  evaluation reward: 255.65
episode: 3566   score: 430.0  epsilon: 1.0    steps: 550  evaluation reward: 259.5
episode: 3567   score: 280.0  epsilon: 1.0    steps: 776  evaluation reward: 259.55
Training network. lr: 0.000189. clip: 0.075587
Iteration 7990: Policy loss: -0.468745. Value loss: 20.094006. Entropy: 0.678986.
Iter

Iteration 8048: Policy loss: 0.963183. Value loss: 24.028994. Entropy: 0.591606.
Iteration 8049: Policy loss: 0.686377. Value loss: 20.464241. Entropy: 0.562295.
episode: 3588   score: 225.0  epsilon: 1.0    steps: 444  evaluation reward: 279.35
Training network. lr: 0.000189. clip: 0.075430
Iteration 8050: Policy loss: 1.405582. Value loss: 41.792137. Entropy: 0.672179.
Iteration 8051: Policy loss: 1.194467. Value loss: 19.721859. Entropy: 0.695465.
Iteration 8052: Policy loss: 1.208019. Value loss: 14.979302. Entropy: 0.686717.
episode: 3589   score: 325.0  epsilon: 1.0    steps: 53  evaluation reward: 280.75
Training network. lr: 0.000188. clip: 0.075273
Iteration 8053: Policy loss: -0.094556. Value loss: 45.803627. Entropy: 0.720939.
Iteration 8054: Policy loss: -0.482263. Value loss: 29.797934. Entropy: 0.731287.
Iteration 8055: Policy loss: -0.434755. Value loss: 24.158140. Entropy: 0.717279.
episode: 3590   score: 210.0  epsilon: 1.0    steps: 635  evaluation reward: 281.5
episo

Training network. lr: 0.000188. clip: 0.075126
Iteration 8110: Policy loss: 1.304860. Value loss: 27.330627. Entropy: 0.775612.
Iteration 8111: Policy loss: 1.249070. Value loss: 19.471500. Entropy: 0.743394.
Iteration 8112: Policy loss: 0.895895. Value loss: 13.632736. Entropy: 0.753075.
episode: 3615   score: 155.0  epsilon: 1.0    steps: 501  evaluation reward: 292.4
Training network. lr: 0.000188. clip: 0.075126
Iteration 8113: Policy loss: -1.870504. Value loss: 233.726105. Entropy: 0.734164.
Iteration 8114: Policy loss: -2.164112. Value loss: 133.257141. Entropy: 0.718787.
Iteration 8115: Policy loss: -2.044286. Value loss: 105.500839. Entropy: 0.712075.
Training network. lr: 0.000188. clip: 0.075126
Iteration 8116: Policy loss: 2.831376. Value loss: 32.502399. Entropy: 0.741576.
Iteration 8117: Policy loss: 2.987942. Value loss: 15.916286. Entropy: 0.755838.
Iteration 8118: Policy loss: 2.869224. Value loss: 12.536307. Entropy: 0.754489.
episode: 3616   score: 420.0  epsilon: 1.

episode: 3640   score: 180.0  epsilon: 1.0    steps: 677  evaluation reward: 291.95
Training network. lr: 0.000187. clip: 0.074969
Iteration 8173: Policy loss: 1.823640. Value loss: 19.669361. Entropy: 0.878253.
Iteration 8174: Policy loss: 1.701594. Value loss: 9.846487. Entropy: 0.900078.
Iteration 8175: Policy loss: 1.724483. Value loss: 8.785608. Entropy: 0.893516.
Training network. lr: 0.000187. clip: 0.074969
Iteration 8176: Policy loss: -0.951289. Value loss: 239.895905. Entropy: 1.020871.
Iteration 8177: Policy loss: 0.137325. Value loss: 129.832581. Entropy: 0.980994.
Iteration 8178: Policy loss: -0.601462. Value loss: 152.710983. Entropy: 0.963795.
Training network. lr: 0.000187. clip: 0.074969
Iteration 8179: Policy loss: 0.560190. Value loss: 29.400097. Entropy: 0.786294.
Iteration 8180: Policy loss: 0.756289. Value loss: 16.375431. Entropy: 0.777671.
Iteration 8181: Policy loss: 0.458854. Value loss: 11.683267. Entropy: 0.788940.
episode: 3641   score: 210.0  epsilon: 1.0 

Training network. lr: 0.000187. clip: 0.074813
Iteration 8233: Policy loss: -0.565410. Value loss: 25.862061. Entropy: 0.819463.
Iteration 8234: Policy loss: -0.532010. Value loss: 19.309992. Entropy: 0.820505.
Iteration 8235: Policy loss: -0.552505. Value loss: 15.867217. Entropy: 0.807700.
episode: 3668   score: 155.0  epsilon: 1.0    steps: 286  evaluation reward: 274.9
episode: 3669   score: 375.0  epsilon: 1.0    steps: 413  evaluation reward: 276.55
Training network. lr: 0.000187. clip: 0.074813
Iteration 8236: Policy loss: 0.330941. Value loss: 19.769186. Entropy: 0.912126.
Iteration 8237: Policy loss: 0.406905. Value loss: 12.852826. Entropy: 0.916192.
Iteration 8238: Policy loss: 0.382259. Value loss: 10.349863. Entropy: 0.921047.
episode: 3670   score: 200.0  epsilon: 1.0    steps: 528  evaluation reward: 274.3
episode: 3671   score: 225.0  epsilon: 1.0    steps: 663  evaluation reward: 271.4
Training network. lr: 0.000187. clip: 0.074813
Iteration 8239: Policy loss: -0.78087

Iteration 8293: Policy loss: 1.404238. Value loss: 28.224007. Entropy: 0.810076.
Iteration 8294: Policy loss: 1.409024. Value loss: 16.636559. Entropy: 0.823671.
Iteration 8295: Policy loss: 1.592064. Value loss: 12.006332. Entropy: 0.799807.
episode: 3696   score: 320.0  epsilon: 1.0    steps: 202  evaluation reward: 258.9
Training network. lr: 0.000187. clip: 0.074665
Iteration 8296: Policy loss: 3.215753. Value loss: 41.026943. Entropy: 0.744653.
Iteration 8297: Policy loss: 3.670957. Value loss: 19.535645. Entropy: 0.754733.
Iteration 8298: Policy loss: 3.169511. Value loss: 15.072751. Entropy: 0.760329.
episode: 3697   score: 185.0  epsilon: 1.0    steps: 96  evaluation reward: 257.65
episode: 3698   score: 150.0  epsilon: 1.0    steps: 593  evaluation reward: 256.65
Training network. lr: 0.000187. clip: 0.074665
Iteration 8299: Policy loss: 1.511134. Value loss: 20.411165. Entropy: 0.791360.
Iteration 8300: Policy loss: 1.596914. Value loss: 11.949772. Entropy: 0.778640.
Iteratio

Iteration 8355: Policy loss: -1.029521. Value loss: 14.206028. Entropy: 0.652856.
episode: 3722   score: 150.0  epsilon: 1.0    steps: 276  evaluation reward: 258.45
episode: 3723   score: 170.0  epsilon: 1.0    steps: 701  evaluation reward: 258.05
episode: 3724   score: 285.0  epsilon: 1.0    steps: 936  evaluation reward: 256.8
Training network. lr: 0.000186. clip: 0.074352
Iteration 8356: Policy loss: 2.155053. Value loss: 32.553715. Entropy: 0.809334.
Iteration 8357: Policy loss: 2.173289. Value loss: 17.831619. Entropy: 0.805320.
Iteration 8358: Policy loss: 2.045562. Value loss: 15.224674. Entropy: 0.836442.
episode: 3725   score: 185.0  epsilon: 1.0    steps: 187  evaluation reward: 255.35
Training network. lr: 0.000186. clip: 0.074352
Iteration 8359: Policy loss: -1.754115. Value loss: 32.148415. Entropy: 0.709439.
Iteration 8360: Policy loss: -1.667053. Value loss: 19.894617. Entropy: 0.714691.
Iteration 8361: Policy loss: -1.789657. Value loss: 15.788326. Entropy: 0.693784.


Iteration 8420: Policy loss: 0.345297. Value loss: 13.688594. Entropy: 0.722639.
Iteration 8421: Policy loss: 0.452234. Value loss: 10.877520. Entropy: 0.703229.
episode: 3745   score: 435.0  epsilon: 1.0    steps: 58  evaluation reward: 267.8
episode: 3746   score: 285.0  epsilon: 1.0    steps: 382  evaluation reward: 265.95
Training network. lr: 0.000186. clip: 0.074204
Iteration 8422: Policy loss: -0.291779. Value loss: 38.656403. Entropy: 0.546464.
Iteration 8423: Policy loss: -0.477453. Value loss: 18.296604. Entropy: 0.534585.
Iteration 8424: Policy loss: -0.411683. Value loss: 13.540089. Entropy: 0.531050.
episode: 3747   score: 340.0  epsilon: 1.0    steps: 143  evaluation reward: 267.25
episode: 3748   score: 180.0  epsilon: 1.0    steps: 472  evaluation reward: 267.5
Training network. lr: 0.000186. clip: 0.074204
Iteration 8425: Policy loss: 1.426024. Value loss: 31.506277. Entropy: 0.615977.
Iteration 8426: Policy loss: 1.836607. Value loss: 20.226072. Entropy: 0.585717.
Ite

episode: 3768   score: 240.0  epsilon: 1.0    steps: 711  evaluation reward: 286.7
Training network. lr: 0.000185. clip: 0.074048
Iteration 8485: Policy loss: 0.444534. Value loss: 27.191387. Entropy: 0.566938.
Iteration 8486: Policy loss: 0.378957. Value loss: 15.362684. Entropy: 0.575878.
Iteration 8487: Policy loss: 0.405681. Value loss: 11.613672. Entropy: 0.550197.
episode: 3769   score: 260.0  epsilon: 1.0    steps: 205  evaluation reward: 285.55
episode: 3770   score: 355.0  epsilon: 1.0    steps: 823  evaluation reward: 287.1
episode: 3771   score: 270.0  epsilon: 1.0    steps: 926  evaluation reward: 287.55
Training network. lr: 0.000185. clip: 0.074048
Iteration 8488: Policy loss: 1.646034. Value loss: 29.648720. Entropy: 0.720250.
Iteration 8489: Policy loss: 1.768682. Value loss: 19.012781. Entropy: 0.706126.
Iteration 8490: Policy loss: 1.594553. Value loss: 14.070521. Entropy: 0.715415.
Training network. lr: 0.000185. clip: 0.074048
Iteration 8491: Policy loss: 1.072656. 

episode: 3794   score: 230.0  epsilon: 1.0    steps: 555  evaluation reward: 285.6
episode: 3795   score: 235.0  epsilon: 1.0    steps: 934  evaluation reward: 286.9
Training network. lr: 0.000185. clip: 0.073891
Iteration 8548: Policy loss: 1.130177. Value loss: 12.268158. Entropy: 0.623906.
Iteration 8549: Policy loss: 1.137463. Value loss: 7.448082. Entropy: 0.654360.
Iteration 8550: Policy loss: 1.160329. Value loss: 7.479221. Entropy: 0.667136.
episode: 3796   score: 245.0  epsilon: 1.0    steps: 225  evaluation reward: 286.15
Training network. lr: 0.000184. clip: 0.073744
Iteration 8551: Policy loss: -0.232166. Value loss: 24.166203. Entropy: 0.758709.
Iteration 8552: Policy loss: -0.233009. Value loss: 17.057377. Entropy: 0.752294.
Iteration 8553: Policy loss: -0.032881. Value loss: 14.031604. Entropy: 0.739407.
episode: 3797   score: 220.0  epsilon: 1.0    steps: 49  evaluation reward: 286.5
Training network. lr: 0.000184. clip: 0.073744
Iteration 8554: Policy loss: -0.251610. 

Iteration 8610: Policy loss: 0.418580. Value loss: 7.441772. Entropy: 0.543411.
episode: 3820   score: 285.0  epsilon: 1.0    steps: 118  evaluation reward: 276.7
episode: 3821   score: 100.0  epsilon: 1.0    steps: 176  evaluation reward: 275.45
episode: 3822   score: 105.0  epsilon: 1.0    steps: 749  evaluation reward: 275.0
Training network. lr: 0.000184. clip: 0.073587
Iteration 8611: Policy loss: -0.334067. Value loss: 23.759796. Entropy: 0.547155.
Iteration 8612: Policy loss: -0.478961. Value loss: 13.598880. Entropy: 0.514568.
Iteration 8613: Policy loss: -0.267292. Value loss: 10.978965. Entropy: 0.531684.
episode: 3823   score: 240.0  epsilon: 1.0    steps: 263  evaluation reward: 275.7
Training network. lr: 0.000184. clip: 0.073587
Iteration 8614: Policy loss: -1.011752. Value loss: 13.128079. Entropy: 0.737655.
Iteration 8615: Policy loss: -1.093558. Value loss: 7.643767. Entropy: 0.733006.
Iteration 8616: Policy loss: -1.069473. Value loss: 6.057052. Entropy: 0.716868.
epi

Iteration 8673: Policy loss: 0.577786. Value loss: 9.458999. Entropy: 0.506646.
Training network. lr: 0.000184. clip: 0.073430
Iteration 8674: Policy loss: 2.080467. Value loss: 25.858215. Entropy: 0.500580.
Iteration 8675: Policy loss: 2.220991. Value loss: 15.769832. Entropy: 0.485534.
Iteration 8676: Policy loss: 2.140052. Value loss: 12.330439. Entropy: 0.471513.
episode: 3845   score: 180.0  epsilon: 1.0    steps: 322  evaluation reward: 272.4
Training network. lr: 0.000184. clip: 0.073430
Iteration 8677: Policy loss: -0.934700. Value loss: 226.269516. Entropy: 0.234767.
Iteration 8678: Policy loss: -0.751271. Value loss: 120.866821. Entropy: 0.233244.
Iteration 8679: Policy loss: -2.146240. Value loss: 125.473732. Entropy: 0.229941.
episode: 3846   score: 225.0  epsilon: 1.0    steps: 747  evaluation reward: 271.8
episode: 3847   score: 95.0  epsilon: 1.0    steps: 859  evaluation reward: 269.35
episode: 3848   score: 210.0  epsilon: 1.0    steps: 990  evaluation reward: 269.65
T

Iteration 8736: Policy loss: -0.779848. Value loss: 65.782356. Entropy: 0.785376.
episode: 3870   score: 315.0  epsilon: 1.0    steps: 137  evaluation reward: 256.35
episode: 3871   score: 355.0  epsilon: 1.0    steps: 429  evaluation reward: 257.2
Training network. lr: 0.000183. clip: 0.073283
Iteration 8737: Policy loss: 0.131615. Value loss: 34.590454. Entropy: 0.602328.
Iteration 8738: Policy loss: 0.346305. Value loss: 22.040794. Entropy: 0.624731.
Iteration 8739: Policy loss: -0.149909. Value loss: 16.310228. Entropy: 0.621516.
Training network. lr: 0.000183. clip: 0.073283
Iteration 8740: Policy loss: 1.528782. Value loss: 35.350132. Entropy: 0.688280.
Iteration 8741: Policy loss: 1.386966. Value loss: 21.191610. Entropy: 0.691281.
Iteration 8742: Policy loss: 1.774326. Value loss: 14.583620. Entropy: 0.667026.
episode: 3872   score: 550.0  epsilon: 1.0    steps: 115  evaluation reward: 257.85
episode: 3873   score: 80.0  epsilon: 1.0    steps: 531  evaluation reward: 255.8
Trai

Training network. lr: 0.000183. clip: 0.073126
Iteration 8800: Policy loss: 0.746754. Value loss: 28.667652. Entropy: 0.677263.
Iteration 8801: Policy loss: 0.954711. Value loss: 16.797670. Entropy: 0.674007.
Iteration 8802: Policy loss: 0.708120. Value loss: 12.232882. Entropy: 0.671864.
episode: 3895   score: 335.0  epsilon: 1.0    steps: 8  evaluation reward: 264.55
Training network. lr: 0.000182. clip: 0.072969
Iteration 8803: Policy loss: 0.885150. Value loss: 26.800968. Entropy: 0.718401.
Iteration 8804: Policy loss: 1.192332. Value loss: 12.796166. Entropy: 0.719557.
Iteration 8805: Policy loss: 0.984854. Value loss: 9.972797. Entropy: 0.706746.
episode: 3896   score: 190.0  epsilon: 1.0    steps: 617  evaluation reward: 264.0
Training network. lr: 0.000182. clip: 0.072969
Iteration 8806: Policy loss: -2.854349. Value loss: 352.083099. Entropy: 0.694854.
Iteration 8807: Policy loss: -1.664058. Value loss: 119.387756. Entropy: 0.679231.
Iteration 8808: Policy loss: -2.217090. Val

Training network. lr: 0.000182. clip: 0.072822
Iteration 8863: Policy loss: 0.998643. Value loss: 32.631325. Entropy: 0.688653.
Iteration 8864: Policy loss: 1.341176. Value loss: 15.060885. Entropy: 0.663825.
Iteration 8865: Policy loss: 1.006575. Value loss: 11.732555. Entropy: 0.693201.
Training network. lr: 0.000182. clip: 0.072822
Iteration 8866: Policy loss: -0.588970. Value loss: 32.587265. Entropy: 0.745340.
Iteration 8867: Policy loss: -0.518496. Value loss: 17.147516. Entropy: 0.761659.
Iteration 8868: Policy loss: -0.334998. Value loss: 12.820733. Entropy: 0.749003.
episode: 3920   score: 280.0  epsilon: 1.0    steps: 904  evaluation reward: 286.2
Training network. lr: 0.000182. clip: 0.072822
Iteration 8869: Policy loss: -2.750774. Value loss: 256.889008. Entropy: 0.642282.
Iteration 8870: Policy loss: -2.837662. Value loss: 172.323715. Entropy: 0.654903.
Iteration 8871: Policy loss: -3.305166. Value loss: 211.710556. Entropy: 0.626282.
episode: 3921   score: 440.0  epsilon:

Training network. lr: 0.000182. clip: 0.072665
Iteration 8929: Policy loss: 0.371774. Value loss: 36.830704. Entropy: 0.516407.
Iteration 8930: Policy loss: 0.586180. Value loss: 20.610235. Entropy: 0.502874.
Iteration 8931: Policy loss: 0.242646. Value loss: 14.798834. Entropy: 0.499160.
episode: 3942   score: 270.0  epsilon: 1.0    steps: 264  evaluation reward: 300.45
episode: 3943   score: 455.0  epsilon: 1.0    steps: 616  evaluation reward: 299.75
Training network. lr: 0.000182. clip: 0.072665
Iteration 8932: Policy loss: -0.401930. Value loss: 52.623158. Entropy: 0.570642.
Iteration 8933: Policy loss: -0.478637. Value loss: 26.137793. Entropy: 0.568770.
Iteration 8934: Policy loss: -0.080016. Value loss: 21.367651. Entropy: 0.584755.
episode: 3944   score: 200.0  epsilon: 1.0    steps: 660  evaluation reward: 298.4
Training network. lr: 0.000182. clip: 0.072665
Iteration 8935: Policy loss: -1.685388. Value loss: 212.108536. Entropy: 0.789417.
Iteration 8936: Policy loss: -2.5762

Iteration 8994: Policy loss: -1.700767. Value loss: 21.899963. Entropy: 0.523251.
episode: 3964   score: 550.0  epsilon: 1.0    steps: 998  evaluation reward: 334.65
Training network. lr: 0.000181. clip: 0.072509
Iteration 8995: Policy loss: 2.353090. Value loss: 50.598362. Entropy: 0.555237.
Iteration 8996: Policy loss: 2.101239. Value loss: 27.606026. Entropy: 0.582397.
Iteration 8997: Policy loss: 2.183805. Value loss: 19.885807. Entropy: 0.558221.
episode: 3965   score: 185.0  epsilon: 1.0    steps: 79  evaluation reward: 330.75
episode: 3966   score: 740.0  epsilon: 1.0    steps: 194  evaluation reward: 335.5
Training network. lr: 0.000181. clip: 0.072509
Iteration 8998: Policy loss: 3.845306. Value loss: 62.456875. Entropy: 0.635810.
Iteration 8999: Policy loss: 3.993078. Value loss: 28.504065. Entropy: 0.635572.
Iteration 9000: Policy loss: 3.712785. Value loss: 24.680571. Entropy: 0.627586.
episode: 3967   score: 260.0  epsilon: 1.0    steps: 539  evaluation reward: 336.25
epis

episode: 3988   score: 285.0  epsilon: 1.0    steps: 589  evaluation reward: 342.4
Training network. lr: 0.000181. clip: 0.072205
Iteration 9058: Policy loss: 1.988192. Value loss: 28.781448. Entropy: 0.583007.
Iteration 9059: Policy loss: 2.055382. Value loss: 15.401326. Entropy: 0.596556.
Iteration 9060: Policy loss: 2.035673. Value loss: 12.502118. Entropy: 0.609525.
episode: 3989   score: 270.0  epsilon: 1.0    steps: 167  evaluation reward: 343.4
episode: 3990   score: 420.0  epsilon: 1.0    steps: 686  evaluation reward: 345.0
episode: 3991   score: 240.0  epsilon: 1.0    steps: 992  evaluation reward: 344.5
Training network. lr: 0.000181. clip: 0.072205
Iteration 9061: Policy loss: 0.591462. Value loss: 211.009506. Entropy: 0.497091.
Iteration 9062: Policy loss: 0.325799. Value loss: 120.813499. Entropy: 0.482274.
Iteration 9063: Policy loss: 1.065930. Value loss: 83.439880. Entropy: 0.479694.
episode: 3992   score: 460.0  epsilon: 1.0    steps: 363  evaluation reward: 345.25
Tr

episode: 4014   score: 285.0  epsilon: 1.0    steps: 326  evaluation reward: 347.65
episode: 4015   score: 600.0  epsilon: 1.0    steps: 879  evaluation reward: 351.05
Training network. lr: 0.000180. clip: 0.072048
Iteration 9121: Policy loss: 1.897357. Value loss: 57.850929. Entropy: 0.582639.
Iteration 9122: Policy loss: 1.511711. Value loss: 24.092590. Entropy: 0.579345.
Iteration 9123: Policy loss: 1.947308. Value loss: 17.287880. Entropy: 0.583522.
episode: 4016   score: 265.0  epsilon: 1.0    steps: 702  evaluation reward: 349.15
Training network. lr: 0.000180. clip: 0.072048
Iteration 9124: Policy loss: 2.447845. Value loss: 40.874184. Entropy: 0.636332.
Iteration 9125: Policy loss: 2.298501. Value loss: 20.249191. Entropy: 0.636950.
Iteration 9126: Policy loss: 2.452949. Value loss: 15.655236. Entropy: 0.633365.
Training network. lr: 0.000180. clip: 0.072048
Iteration 9127: Policy loss: -0.080454. Value loss: 30.339668. Entropy: 0.449591.
Iteration 9128: Policy loss: 0.193421. 

Training network. lr: 0.000180. clip: 0.071900
Iteration 9184: Policy loss: -0.997695. Value loss: 25.489801. Entropy: 0.406959.
Iteration 9185: Policy loss: -0.823411. Value loss: 15.321301. Entropy: 0.420610.
Iteration 9186: Policy loss: -0.787293. Value loss: 12.076485. Entropy: 0.381303.
Training network. lr: 0.000180. clip: 0.071900
Iteration 9187: Policy loss: -1.815477. Value loss: 37.899094. Entropy: 0.433516.
Iteration 9188: Policy loss: -1.341140. Value loss: 16.400196. Entropy: 0.454988.
Iteration 9189: Policy loss: -1.740634. Value loss: 13.151350. Entropy: 0.462597.
episode: 4039   score: 435.0  epsilon: 1.0    steps: 400  evaluation reward: 343.55
Training network. lr: 0.000180. clip: 0.071900
Iteration 9190: Policy loss: -3.438821. Value loss: 198.333054. Entropy: 0.463947.
Iteration 9191: Policy loss: -3.728484. Value loss: 141.434418. Entropy: 0.439931.
Iteration 9192: Policy loss: -3.123013. Value loss: 85.450829. Entropy: 0.452480.
episode: 4040   score: 290.0  epsil

Training network. lr: 0.000179. clip: 0.071744
Iteration 9250: Policy loss: 0.836454. Value loss: 25.069584. Entropy: 0.495626.
Iteration 9251: Policy loss: 1.080726. Value loss: 15.993731. Entropy: 0.479423.
Iteration 9252: Policy loss: 0.765661. Value loss: 12.796049. Entropy: 0.489417.
episode: 4061   score: 210.0  epsilon: 1.0    steps: 815  evaluation reward: 341.0
episode: 4062   score: 285.0  epsilon: 1.0    steps: 953  evaluation reward: 338.2
Training network. lr: 0.000179. clip: 0.071587
Iteration 9253: Policy loss: -1.700523. Value loss: 21.230589. Entropy: 0.533942.
Iteration 9254: Policy loss: -1.760337. Value loss: 12.113299. Entropy: 0.528239.
Iteration 9255: Policy loss: -1.793937. Value loss: 10.679745. Entropy: 0.528941.
episode: 4063   score: 435.0  epsilon: 1.0    steps: 407  evaluation reward: 340.45
episode: 4064   score: 365.0  epsilon: 1.0    steps: 631  evaluation reward: 338.6
Training network. lr: 0.000179. clip: 0.071587
Iteration 9256: Policy loss: -0.44737

Training network. lr: 0.000179. clip: 0.071440
Iteration 9316: Policy loss: 1.654623. Value loss: 25.540951. Entropy: 0.540092.
Iteration 9317: Policy loss: 1.297182. Value loss: 16.354801. Entropy: 0.544646.
Iteration 9318: Policy loss: 1.498840. Value loss: 12.911515. Entropy: 0.522732.
episode: 4083   score: 225.0  epsilon: 1.0    steps: 1019  evaluation reward: 349.85
Training network. lr: 0.000179. clip: 0.071440
Iteration 9319: Policy loss: 0.357562. Value loss: 18.325966. Entropy: 0.461247.
Iteration 9320: Policy loss: 0.376131. Value loss: 14.686145. Entropy: 0.465408.
Iteration 9321: Policy loss: 0.419796. Value loss: 11.000345. Entropy: 0.465497.
Training network. lr: 0.000179. clip: 0.071440
Iteration 9322: Policy loss: 1.232798. Value loss: 20.931440. Entropy: 0.444378.
Iteration 9323: Policy loss: 1.229312. Value loss: 10.301147. Entropy: 0.434308.
Iteration 9324: Policy loss: 0.909032. Value loss: 9.957028. Entropy: 0.465986.
Training network. lr: 0.000179. clip: 0.071440

episode: 4104   score: 310.0  epsilon: 1.0    steps: 775  evaluation reward: 355.0
Training network. lr: 0.000178. clip: 0.071283
Iteration 9382: Policy loss: -4.395526. Value loss: 125.303772. Entropy: 0.361305.
Iteration 9383: Policy loss: -5.550783. Value loss: 166.643784. Entropy: 0.386559.
Iteration 9384: Policy loss: -5.174999. Value loss: 71.451416. Entropy: 0.373125.
Training network. lr: 0.000178. clip: 0.071283
Iteration 9385: Policy loss: 2.951899. Value loss: 36.073048. Entropy: 0.609111.
Iteration 9386: Policy loss: 3.093097. Value loss: 19.808092. Entropy: 0.643779.
Iteration 9387: Policy loss: 2.842817. Value loss: 19.487457. Entropy: 0.614954.
episode: 4105   score: 240.0  epsilon: 1.0    steps: 492  evaluation reward: 355.0
episode: 4106   score: 320.0  epsilon: 1.0    steps: 978  evaluation reward: 356.1
Training network. lr: 0.000178. clip: 0.071283
Iteration 9388: Policy loss: 1.100022. Value loss: 21.332989. Entropy: 0.509431.
Iteration 9389: Policy loss: 1.007501.

Iteration 9446: Policy loss: 1.655988. Value loss: 26.237259. Entropy: 0.479152.
Iteration 9447: Policy loss: 1.505047. Value loss: 18.613390. Entropy: 0.440004.
episode: 4128   score: 350.0  epsilon: 1.0    steps: 617  evaluation reward: 355.9
Training network. lr: 0.000178. clip: 0.071126
Iteration 9448: Policy loss: -0.747798. Value loss: 29.769133. Entropy: 0.606711.
Iteration 9449: Policy loss: -0.896105. Value loss: 21.620258. Entropy: 0.586727.
Iteration 9450: Policy loss: -0.815026. Value loss: 16.550741. Entropy: 0.594605.
Training network. lr: 0.000177. clip: 0.070979
Iteration 9451: Policy loss: -1.453630. Value loss: 30.477999. Entropy: 0.585737.
Iteration 9452: Policy loss: -1.462509. Value loss: 16.938412. Entropy: 0.579553.
Iteration 9453: Policy loss: -1.644855. Value loss: 12.550988. Entropy: 0.554419.
episode: 4129   score: 285.0  epsilon: 1.0    steps: 811  evaluation reward: 356.1
episode: 4130   score: 340.0  epsilon: 1.0    steps: 944  evaluation reward: 357.7
Tra

Iteration 9512: Policy loss: 1.511856. Value loss: 18.208586. Entropy: 0.662608.
Iteration 9513: Policy loss: 1.638042. Value loss: 17.036697. Entropy: 0.672754.
episode: 4150   score: 265.0  epsilon: 1.0    steps: 873  evaluation reward: 354.3
Training network. lr: 0.000177. clip: 0.070822
Iteration 9514: Policy loss: -0.124290. Value loss: 127.914978. Entropy: 0.574066.
Iteration 9515: Policy loss: 0.186041. Value loss: 82.634048. Entropy: 0.587282.
Iteration 9516: Policy loss: -0.543470. Value loss: 60.955166. Entropy: 0.578981.
now time :  2019-02-25 21:37:58.391612
episode: 4151   score: 185.0  epsilon: 1.0    steps: 13  evaluation reward: 352.55
Training network. lr: 0.000177. clip: 0.070822
Iteration 9517: Policy loss: 0.248072. Value loss: 20.245773. Entropy: 0.639210.
Iteration 9518: Policy loss: 0.090627. Value loss: 11.394217. Entropy: 0.639949.
Iteration 9519: Policy loss: 0.140106. Value loss: 9.231995. Entropy: 0.639226.
episode: 4152   score: 300.0  epsilon: 1.0    steps

Iteration 9578: Policy loss: 1.246622. Value loss: 19.633371. Entropy: 0.373055.
Iteration 9579: Policy loss: 1.012351. Value loss: 18.273321. Entropy: 0.369172.
episode: 4171   score: 955.0  epsilon: 1.0    steps: 326  evaluation reward: 360.1
Training network. lr: 0.000177. clip: 0.070665
Iteration 9580: Policy loss: 1.628498. Value loss: 39.575691. Entropy: 0.307424.
Iteration 9581: Policy loss: 1.880805. Value loss: 21.441658. Entropy: 0.308586.
Iteration 9582: Policy loss: 1.727700. Value loss: 19.935661. Entropy: 0.330972.
episode: 4172   score: 365.0  epsilon: 1.0    steps: 488  evaluation reward: 355.55
episode: 4173   score: 210.0  epsilon: 1.0    steps: 772  evaluation reward: 354.0
Training network. lr: 0.000177. clip: 0.070665
Iteration 9583: Policy loss: 1.133687. Value loss: 22.196987. Entropy: 0.441552.
Iteration 9584: Policy loss: 1.223886. Value loss: 13.379522. Entropy: 0.456564.
Iteration 9585: Policy loss: 1.175753. Value loss: 12.720636. Entropy: 0.452608.
episode:

Iteration 9645: Policy loss: 2.736868. Value loss: 18.181646. Entropy: 0.331225.
Training network. lr: 0.000176. clip: 0.070518
Iteration 9646: Policy loss: -1.165094. Value loss: 20.768850. Entropy: 0.273126.
Iteration 9647: Policy loss: -1.140035. Value loss: 10.812868. Entropy: 0.253602.
Iteration 9648: Policy loss: -1.180030. Value loss: 8.958117. Entropy: 0.272459.
Training network. lr: 0.000176. clip: 0.070518
Iteration 9649: Policy loss: 1.344256. Value loss: 47.973900. Entropy: 0.227818.
Iteration 9650: Policy loss: 1.346563. Value loss: 24.747730. Entropy: 0.217706.
Iteration 9651: Policy loss: 1.312200. Value loss: 16.902954. Entropy: 0.227995.
Training network. lr: 0.000176. clip: 0.070361
Iteration 9652: Policy loss: 0.765840. Value loss: 30.328762. Entropy: 0.355631.
Iteration 9653: Policy loss: 0.846057. Value loss: 17.273521. Entropy: 0.379390.
Iteration 9654: Policy loss: 0.803083. Value loss: 11.072812. Entropy: 0.373364.
episode: 4192   score: 445.0  epsilon: 1.0    s

Iteration 9713: Policy loss: -2.079514. Value loss: 14.435359. Entropy: 0.350660.
Iteration 9714: Policy loss: -2.102995. Value loss: 10.493537. Entropy: 0.361393.
episode: 4211   score: 235.0  epsilon: 1.0    steps: 258  evaluation reward: 359.5
episode: 4212   score: 210.0  epsilon: 1.0    steps: 599  evaluation reward: 357.35
Training network. lr: 0.000176. clip: 0.070205
Iteration 9715: Policy loss: 3.779915. Value loss: 25.769709. Entropy: 0.306934.
Iteration 9716: Policy loss: 3.518782. Value loss: 12.511119. Entropy: 0.304120.
Iteration 9717: Policy loss: 3.453809. Value loss: 9.896291. Entropy: 0.329572.
episode: 4213   score: 310.0  epsilon: 1.0    steps: 154  evaluation reward: 357.25
Training network. lr: 0.000176. clip: 0.070205
Iteration 9718: Policy loss: -0.865182. Value loss: 20.336941. Entropy: 0.344885.
Iteration 9719: Policy loss: -1.126791. Value loss: 11.901110. Entropy: 0.340182.
Iteration 9720: Policy loss: -1.080341. Value loss: 8.888572. Entropy: 0.339516.
Trai

Iteration 9780: Policy loss: -0.121420. Value loss: 11.995272. Entropy: 0.444689.
episode: 4232   score: 270.0  epsilon: 1.0    steps: 19  evaluation reward: 358.3
Training network. lr: 0.000175. clip: 0.070057
Iteration 9781: Policy loss: 0.176040. Value loss: 15.855047. Entropy: 0.497466.
Iteration 9782: Policy loss: 0.115550. Value loss: 10.597612. Entropy: 0.523043.
Iteration 9783: Policy loss: 0.247902. Value loss: 8.789125. Entropy: 0.549219.
episode: 4233   score: 240.0  epsilon: 1.0    steps: 220  evaluation reward: 357.85
episode: 4234   score: 600.0  epsilon: 1.0    steps: 862  evaluation reward: 360.9
Training network. lr: 0.000175. clip: 0.070057
Iteration 9784: Policy loss: 1.463509. Value loss: 32.052402. Entropy: 0.611877.
Iteration 9785: Policy loss: 1.395972. Value loss: 19.050501. Entropy: 0.573726.
Iteration 9786: Policy loss: 1.623012. Value loss: 15.378757. Entropy: 0.601224.
episode: 4235   score: 620.0  epsilon: 1.0    steps: 538  evaluation reward: 364.05
episod

Iteration 9845: Policy loss: -0.077838. Value loss: 13.753905. Entropy: 0.565430.
Iteration 9846: Policy loss: -0.051078. Value loss: 10.526990. Entropy: 0.557395.
episode: 4255   score: 395.0  epsilon: 1.0    steps: 35  evaluation reward: 366.25
episode: 4256   score: 365.0  epsilon: 1.0    steps: 683  evaluation reward: 367.8
Training network. lr: 0.000175. clip: 0.069901
Iteration 9847: Policy loss: -2.267825. Value loss: 138.021103. Entropy: 0.495000.
Iteration 9848: Policy loss: -1.777942. Value loss: 66.449150. Entropy: 0.484329.
Iteration 9849: Policy loss: -2.948675. Value loss: 42.026020. Entropy: 0.485067.
Training network. lr: 0.000175. clip: 0.069901
Iteration 9850: Policy loss: 1.106516. Value loss: 63.335812. Entropy: 0.598365.
Iteration 9851: Policy loss: 0.706650. Value loss: 30.227688. Entropy: 0.564124.
Iteration 9852: Policy loss: 0.634025. Value loss: 17.257153. Entropy: 0.578180.
Training network. lr: 0.000174. clip: 0.069744
Iteration 9853: Policy loss: -0.033230.

episode: 4278   score: 210.0  epsilon: 1.0    steps: 333  evaluation reward: 351.55
Training network. lr: 0.000174. clip: 0.069596
Iteration 9910: Policy loss: -1.219724. Value loss: 55.334145. Entropy: 0.565871.
Iteration 9911: Policy loss: -1.216798. Value loss: 27.185097. Entropy: 0.559117.
Iteration 9912: Policy loss: -0.969816. Value loss: 18.898607. Entropy: 0.545616.
episode: 4279   score: 405.0  epsilon: 1.0    steps: 190  evaluation reward: 353.2
Training network. lr: 0.000174. clip: 0.069596
Iteration 9913: Policy loss: 2.588674. Value loss: 57.752415. Entropy: 0.666212.
Iteration 9914: Policy loss: 2.702779. Value loss: 21.251747. Entropy: 0.642240.
Iteration 9915: Policy loss: 2.460442. Value loss: 15.169432. Entropy: 0.636572.
episode: 4280   score: 430.0  epsilon: 1.0    steps: 47  evaluation reward: 354.9
episode: 4281   score: 570.0  epsilon: 1.0    steps: 830  evaluation reward: 357.25
Training network. lr: 0.000174. clip: 0.069596
Iteration 9916: Policy loss: 2.548476

Iteration 9975: Policy loss: -0.470191. Value loss: 16.877459. Entropy: 0.597133.
now time :  2019-02-25 21:46:31.999478
episode: 4301   score: 270.0  epsilon: 1.0    steps: 549  evaluation reward: 341.95
Training network. lr: 0.000174. clip: 0.069440
Iteration 9976: Policy loss: 0.857109. Value loss: 23.034527. Entropy: 0.441790.
Iteration 9977: Policy loss: 0.822458. Value loss: 11.883074. Entropy: 0.439537.
Iteration 9978: Policy loss: 0.986623. Value loss: 10.541125. Entropy: 0.445387.
episode: 4302   score: 265.0  epsilon: 1.0    steps: 160  evaluation reward: 341.3
episode: 4303   score: 315.0  epsilon: 1.0    steps: 892  evaluation reward: 338.15
Training network. lr: 0.000174. clip: 0.069440
Iteration 9979: Policy loss: 1.353382. Value loss: 23.040882. Entropy: 0.467669.
Iteration 9980: Policy loss: 1.336816. Value loss: 15.128527. Entropy: 0.473438.
Iteration 9981: Policy loss: 1.429868. Value loss: 11.375375. Entropy: 0.443252.
Training network. lr: 0.000174. clip: 0.069440
I

Training network. lr: 0.000173. clip: 0.069283
Iteration 10039: Policy loss: -0.160039. Value loss: 30.609619. Entropy: 0.379107.
Iteration 10040: Policy loss: -0.045416. Value loss: 15.941801. Entropy: 0.367567.
Iteration 10041: Policy loss: -0.146791. Value loss: 11.832780. Entropy: 0.352539.
episode: 4324   score: 260.0  epsilon: 1.0    steps: 232  evaluation reward: 340.1
Training network. lr: 0.000173. clip: 0.069283
Iteration 10042: Policy loss: 4.659501. Value loss: 42.686008. Entropy: 0.334681.
Iteration 10043: Policy loss: 4.856713. Value loss: 17.966764. Entropy: 0.336907.
Iteration 10044: Policy loss: 4.334857. Value loss: 15.498576. Entropy: 0.340716.
episode: 4325   score: 435.0  epsilon: 1.0    steps: 352  evaluation reward: 342.05
episode: 4326   score: 320.0  epsilon: 1.0    steps: 689  evaluation reward: 343.0
Training network. lr: 0.000173. clip: 0.069283
Iteration 10045: Policy loss: 1.610913. Value loss: 20.358809. Entropy: 0.347674.
Iteration 10046: Policy loss: 1.

Training network. lr: 0.000172. clip: 0.068979
Iteration 10102: Policy loss: 1.577945. Value loss: 23.563925. Entropy: 0.216257.
Iteration 10103: Policy loss: 1.668675. Value loss: 13.307127. Entropy: 0.191285.
Iteration 10104: Policy loss: 1.302428. Value loss: 8.380317. Entropy: 0.202531.
episode: 4349   score: 210.0  epsilon: 1.0    steps: 447  evaluation reward: 321.1
Training network. lr: 0.000172. clip: 0.068979
Iteration 10105: Policy loss: 0.925027. Value loss: 30.254612. Entropy: 0.306203.
Iteration 10106: Policy loss: 0.942607. Value loss: 15.394009. Entropy: 0.293808.
Iteration 10107: Policy loss: 0.473757. Value loss: 13.489351. Entropy: 0.303215.
Training network. lr: 0.000172. clip: 0.068979
Iteration 10108: Policy loss: 3.104066. Value loss: 22.965027. Entropy: 0.429245.
Iteration 10109: Policy loss: 3.092083. Value loss: 12.168923. Entropy: 0.463691.
Iteration 10110: Policy loss: 3.009072. Value loss: 8.799736. Entropy: 0.494912.
episode: 4350   score: 265.0  epsilon: 1

Iteration 10167: Policy loss: 0.536857. Value loss: 15.274498. Entropy: 0.550150.
Training network. lr: 0.000172. clip: 0.068822
Iteration 10168: Policy loss: -0.626236. Value loss: 231.444550. Entropy: 0.413563.
Iteration 10169: Policy loss: -1.468752. Value loss: 169.397308. Entropy: 0.370909.
Iteration 10170: Policy loss: -1.032257. Value loss: 123.792351. Entropy: 0.377371.
episode: 4371   score: 210.0  epsilon: 1.0    steps: 498  evaluation reward: 303.6
episode: 4372   score: 265.0  epsilon: 1.0    steps: 545  evaluation reward: 303.65
Training network. lr: 0.000172. clip: 0.068822
Iteration 10171: Policy loss: -0.783303. Value loss: 248.660461. Entropy: 0.452950.
Iteration 10172: Policy loss: -0.204883. Value loss: 103.711723. Entropy: 0.444172.
Iteration 10173: Policy loss: -0.534150. Value loss: 55.169674. Entropy: 0.479928.
episode: 4373   score: 490.0  epsilon: 1.0    steps: 948  evaluation reward: 306.45
Training network. lr: 0.000172. clip: 0.068822
Iteration 10174: Policy

episode: 4398   score: 210.0  epsilon: 1.0    steps: 936  evaluation reward: 291.8
Training network. lr: 0.000172. clip: 0.068675
Iteration 10228: Policy loss: 0.239651. Value loss: 14.860242. Entropy: 0.224446.
Iteration 10229: Policy loss: 0.383039. Value loss: 10.361681. Entropy: 0.252349.
Iteration 10230: Policy loss: 0.307338. Value loss: 11.115982. Entropy: 0.240659.
Training network. lr: 0.000172. clip: 0.068675
Iteration 10231: Policy loss: 0.915608. Value loss: 26.766533. Entropy: 0.312081.
Iteration 10232: Policy loss: 0.670927. Value loss: 16.055954. Entropy: 0.311290.
Iteration 10233: Policy loss: 0.961060. Value loss: 13.277185. Entropy: 0.288729.
Training network. lr: 0.000172. clip: 0.068675
Iteration 10234: Policy loss: 0.784791. Value loss: 27.401163. Entropy: 0.416791.
Iteration 10235: Policy loss: 0.981069. Value loss: 17.497747. Entropy: 0.427945.
Iteration 10236: Policy loss: 0.987362. Value loss: 10.639919. Entropy: 0.425782.
episode: 4399   score: 360.0  epsilon:

episode: 4419   score: 210.0  epsilon: 1.0    steps: 1020  evaluation reward: 280.15
Training network. lr: 0.000171. clip: 0.068518
Iteration 10294: Policy loss: 1.641513. Value loss: 27.574221. Entropy: 0.331004.
Iteration 10295: Policy loss: 1.102782. Value loss: 20.674601. Entropy: 0.353672.
Iteration 10296: Policy loss: 1.494433. Value loss: 17.641613. Entropy: 0.357388.
episode: 4420   score: 210.0  epsilon: 1.0    steps: 5  evaluation reward: 279.85
episode: 4421   score: 365.0  epsilon: 1.0    steps: 248  evaluation reward: 280.6
episode: 4422   score: 420.0  epsilon: 1.0    steps: 682  evaluation reward: 282.7
episode: 4423   score: 260.0  epsilon: 1.0    steps: 807  evaluation reward: 281.7
Training network. lr: 0.000171. clip: 0.068518
Iteration 10297: Policy loss: 1.144674. Value loss: 22.251036. Entropy: 0.360990.
Iteration 10298: Policy loss: 0.830730. Value loss: 14.726766. Entropy: 0.361537.
Iteration 10299: Policy loss: 1.038965. Value loss: 12.624161. Entropy: 0.360302

Training network. lr: 0.000171. clip: 0.068214
Iteration 10357: Policy loss: 1.046426. Value loss: 11.661334. Entropy: 0.370559.
Iteration 10358: Policy loss: 1.115487. Value loss: 8.881270. Entropy: 0.372548.
Iteration 10359: Policy loss: 0.962223. Value loss: 6.794087. Entropy: 0.371183.
Training network. lr: 0.000171. clip: 0.068214
Iteration 10360: Policy loss: 0.127395. Value loss: 15.297722. Entropy: 0.276485.
Iteration 10361: Policy loss: 0.301486. Value loss: 10.064019. Entropy: 0.266583.
Iteration 10362: Policy loss: 0.293288. Value loss: 6.851783. Entropy: 0.274222.
episode: 4444   score: 210.0  epsilon: 1.0    steps: 366  evaluation reward: 280.7
Training network. lr: 0.000171. clip: 0.068214
Iteration 10363: Policy loss: 1.153941. Value loss: 15.819763. Entropy: 0.282669.
Iteration 10364: Policy loss: 1.327336. Value loss: 9.583534. Entropy: 0.269776.
Iteration 10365: Policy loss: 1.313145. Value loss: 7.875005. Entropy: 0.287415.
episode: 4445   score: 210.0  epsilon: 1.0 

Iteration 10421: Policy loss: 1.172426. Value loss: 19.228342. Entropy: 0.432983.
Iteration 10422: Policy loss: 0.973619. Value loss: 14.275011. Entropy: 0.421452.
Training network. lr: 0.000170. clip: 0.068057
Iteration 10423: Policy loss: 0.043808. Value loss: 11.790151. Entropy: 0.332136.
Iteration 10424: Policy loss: 0.019533. Value loss: 7.208956. Entropy: 0.333170.
Iteration 10425: Policy loss: 0.028418. Value loss: 4.424375. Entropy: 0.339922.
episode: 4467   score: 390.0  epsilon: 1.0    steps: 262  evaluation reward: 279.3
Training network. lr: 0.000170. clip: 0.068057
Iteration 10426: Policy loss: 0.546629. Value loss: 24.965372. Entropy: 0.449812.
Iteration 10427: Policy loss: 0.374990. Value loss: 12.207142. Entropy: 0.468752.
Iteration 10428: Policy loss: 0.774553. Value loss: 9.839233. Entropy: 0.452136.
Training network. lr: 0.000170. clip: 0.068057
Iteration 10429: Policy loss: 0.310986. Value loss: 21.252855. Entropy: 0.612163.
Iteration 10430: Policy loss: 0.524264. V

Training network. lr: 0.000170. clip: 0.067901
Iteration 10486: Policy loss: -2.020324. Value loss: 23.867691. Entropy: 0.451328.
Iteration 10487: Policy loss: -2.073666. Value loss: 12.853422. Entropy: 0.439353.
Iteration 10488: Policy loss: -2.008363. Value loss: 10.459555. Entropy: 0.441241.
episode: 4490   score: 295.0  epsilon: 1.0    steps: 150  evaluation reward: 271.4
Training network. lr: 0.000170. clip: 0.067901
Iteration 10489: Policy loss: -0.479025. Value loss: 18.585262. Entropy: 0.640286.
Iteration 10490: Policy loss: -0.565851. Value loss: 10.876766. Entropy: 0.632192.
Iteration 10491: Policy loss: -0.447335. Value loss: 9.586001. Entropy: 0.663236.
episode: 4491   score: 315.0  epsilon: 1.0    steps: 31  evaluation reward: 273.3
episode: 4492   score: 260.0  epsilon: 1.0    steps: 1024  evaluation reward: 273.35
Training network. lr: 0.000170. clip: 0.067901
Iteration 10492: Policy loss: -0.391492. Value loss: 19.764091. Entropy: 0.648525.
Iteration 10493: Policy loss:

Iteration 10551: Policy loss: 0.661860. Value loss: 9.572926. Entropy: 0.756042.
episode: 4512   score: 245.0  epsilon: 1.0    steps: 408  evaluation reward: 276.9
Training network. lr: 0.000169. clip: 0.067597
Iteration 10552: Policy loss: 0.148360. Value loss: 22.159664. Entropy: 0.651997.
Iteration 10553: Policy loss: 0.344181. Value loss: 12.122003. Entropy: 0.671541.
Iteration 10554: Policy loss: 0.406385. Value loss: 10.382878. Entropy: 0.635647.
Training network. lr: 0.000169. clip: 0.067597
Iteration 10555: Policy loss: -0.526001. Value loss: 22.486141. Entropy: 0.419609.
Iteration 10556: Policy loss: -0.653108. Value loss: 11.127410. Entropy: 0.410378.
Iteration 10557: Policy loss: -0.405769. Value loss: 9.405242. Entropy: 0.400306.
Training network. lr: 0.000169. clip: 0.067597
Iteration 10558: Policy loss: 1.485553. Value loss: 15.380131. Entropy: 0.593295.
Iteration 10559: Policy loss: 1.470163. Value loss: 9.057488. Entropy: 0.601338.
Iteration 10560: Policy loss: 1.589406

Training network. lr: 0.000169. clip: 0.067440
Iteration 10615: Policy loss: 0.567329. Value loss: 24.415401. Entropy: 0.326415.
Iteration 10616: Policy loss: 0.563087. Value loss: 11.740798. Entropy: 0.349077.
Iteration 10617: Policy loss: 0.620112. Value loss: 9.295943. Entropy: 0.346703.
Training network. lr: 0.000169. clip: 0.067440
Iteration 10618: Policy loss: -0.889736. Value loss: 20.167263. Entropy: 0.533675.
Iteration 10619: Policy loss: -1.020937. Value loss: 15.574713. Entropy: 0.523537.
Iteration 10620: Policy loss: -1.038688. Value loss: 11.021159. Entropy: 0.515951.
episode: 4536   score: 240.0  epsilon: 1.0    steps: 695  evaluation reward: 277.45
episode: 4537   score: 470.0  epsilon: 1.0    steps: 1011  evaluation reward: 279.2
Training network. lr: 0.000169. clip: 0.067440
Iteration 10621: Policy loss: 0.846027. Value loss: 18.234623. Entropy: 0.603577.
Iteration 10622: Policy loss: 0.703947. Value loss: 11.744413. Entropy: 0.621772.
Iteration 10623: Policy loss: 0.6

Iteration 10679: Policy loss: 0.131804. Value loss: 13.928133. Entropy: 0.475022.
Iteration 10680: Policy loss: 0.126269. Value loss: 13.475085. Entropy: 0.494089.
Training network. lr: 0.000168. clip: 0.067292
Iteration 10681: Policy loss: -0.943108. Value loss: 13.093358. Entropy: 0.423797.
Iteration 10682: Policy loss: -0.864875. Value loss: 8.472084. Entropy: 0.419332.
Iteration 10683: Policy loss: -0.894357. Value loss: 6.991579. Entropy: 0.428587.
Training network. lr: 0.000168. clip: 0.067292
Iteration 10684: Policy loss: -0.216108. Value loss: 8.629513. Entropy: 0.401901.
Iteration 10685: Policy loss: -0.231309. Value loss: 5.934552. Entropy: 0.407365.
Iteration 10686: Policy loss: -0.012966. Value loss: 4.980376. Entropy: 0.410351.
episode: 4559   score: 290.0  epsilon: 1.0    steps: 594  evaluation reward: 292.85
episode: 4560   score: 210.0  epsilon: 1.0    steps: 653  evaluation reward: 292.2
Training network. lr: 0.000168. clip: 0.067292
Iteration 10687: Policy loss: 1.126

Iteration 10743: Policy loss: 0.758735. Value loss: 11.523955. Entropy: 0.521762.
episode: 4583   score: 260.0  epsilon: 1.0    steps: 187  evaluation reward: 288.0
Training network. lr: 0.000168. clip: 0.067136
Iteration 10744: Policy loss: -4.183414. Value loss: 248.052475. Entropy: 0.389426.
Iteration 10745: Policy loss: -4.444812. Value loss: 101.465248. Entropy: 0.348982.
Iteration 10746: Policy loss: -3.891426. Value loss: 132.165222. Entropy: 0.300474.
episode: 4584   score: 265.0  epsilon: 1.0    steps: 484  evaluation reward: 288.55
Training network. lr: 0.000168. clip: 0.067136
Iteration 10747: Policy loss: -1.973238. Value loss: 22.595589. Entropy: 0.269497.
Iteration 10748: Policy loss: -1.783730. Value loss: 17.007816. Entropy: 0.267783.
Iteration 10749: Policy loss: -1.922092. Value loss: 10.817715. Entropy: 0.268898.
Training network. lr: 0.000168. clip: 0.067136
Iteration 10750: Policy loss: 2.338764. Value loss: 51.237774. Entropy: 0.414902.
Iteration 10751: Policy los

Training network. lr: 0.000167. clip: 0.066832
Iteration 10807: Policy loss: 0.885005. Value loss: 22.174000. Entropy: 0.491976.
Iteration 10808: Policy loss: 1.099859. Value loss: 14.537296. Entropy: 0.501188.
Iteration 10809: Policy loss: 0.912597. Value loss: 15.569583. Entropy: 0.514316.
episode: 4606   score: 260.0  epsilon: 1.0    steps: 488  evaluation reward: 292.55
episode: 4607   score: 345.0  epsilon: 1.0    steps: 563  evaluation reward: 292.35
Training network. lr: 0.000167. clip: 0.066832
Iteration 10810: Policy loss: 0.112947. Value loss: 23.400610. Entropy: 0.586304.
Iteration 10811: Policy loss: 0.350771. Value loss: 14.474855. Entropy: 0.573603.
Iteration 10812: Policy loss: 0.120743. Value loss: 12.295529. Entropy: 0.547379.
episode: 4608   score: 330.0  epsilon: 1.0    steps: 7  evaluation reward: 292.1
episode: 4609   score: 280.0  epsilon: 1.0    steps: 284  evaluation reward: 292.3
Training network. lr: 0.000167. clip: 0.066832
Iteration 10813: Policy loss: 0.932

Iteration 10871: Policy loss: 0.031851. Value loss: 8.768229. Entropy: 0.526073.
Iteration 10872: Policy loss: -0.075013. Value loss: 5.399871. Entropy: 0.507850.
episode: 4629   score: 210.0  epsilon: 1.0    steps: 882  evaluation reward: 296.3
episode: 4630   score: 245.0  epsilon: 1.0    steps: 992  evaluation reward: 296.25
Training network. lr: 0.000167. clip: 0.066675
Iteration 10873: Policy loss: 1.568273. Value loss: 23.696379. Entropy: 0.442419.
Iteration 10874: Policy loss: 1.409196. Value loss: 13.588818. Entropy: 0.445350.
Iteration 10875: Policy loss: 1.298276. Value loss: 10.958455. Entropy: 0.445776.
episode: 4631   score: 225.0  epsilon: 1.0    steps: 112  evaluation reward: 295.65
episode: 4632   score: 290.0  epsilon: 1.0    steps: 521  evaluation reward: 294.75
Training network. lr: 0.000167. clip: 0.066675
Iteration 10876: Policy loss: 0.174657. Value loss: 25.136478. Entropy: 0.409440.
Iteration 10877: Policy loss: 0.111968. Value loss: 13.776089. Entropy: 0.406429

Training network. lr: 0.000166. clip: 0.066518
Iteration 10933: Policy loss: 0.483083. Value loss: 33.911926. Entropy: 0.380694.
Iteration 10934: Policy loss: 0.177191. Value loss: 17.875486. Entropy: 0.349941.
Iteration 10935: Policy loss: 0.225036. Value loss: 14.713708. Entropy: 0.349879.
episode: 4655   score: 275.0  epsilon: 1.0    steps: 199  evaluation reward: 288.75
episode: 4656   score: 275.0  epsilon: 1.0    steps: 889  evaluation reward: 289.05
episode: 4657   score: 260.0  epsilon: 1.0    steps: 1011  evaluation reward: 289.25
Training network. lr: 0.000166. clip: 0.066518
Iteration 10936: Policy loss: 1.883168. Value loss: 29.291775. Entropy: 0.330494.
Iteration 10937: Policy loss: 1.820618. Value loss: 18.777880. Entropy: 0.323488.
Iteration 10938: Policy loss: 1.997171. Value loss: 17.550322. Entropy: 0.322491.
Training network. lr: 0.000166. clip: 0.066518
Iteration 10939: Policy loss: -0.346610. Value loss: 19.642344. Entropy: 0.306838.
Iteration 10940: Policy loss: -

Iteration 10995: Policy loss: -0.053301. Value loss: 14.343207. Entropy: 0.260404.
episode: 4681   score: 270.0  epsilon: 1.0    steps: 936  evaluation reward: 286.7
Training network. lr: 0.000166. clip: 0.066371
Iteration 10996: Policy loss: -0.098207. Value loss: 24.146261. Entropy: 0.291388.
Iteration 10997: Policy loss: 0.050246. Value loss: 16.604813. Entropy: 0.283788.
Iteration 10998: Policy loss: 0.002553. Value loss: 15.595454. Entropy: 0.277734.
episode: 4682   score: 155.0  epsilon: 1.0    steps: 24  evaluation reward: 285.8
episode: 4683   score: 240.0  epsilon: 1.0    steps: 448  evaluation reward: 285.6
Training network. lr: 0.000166. clip: 0.066371
Iteration 10999: Policy loss: -0.809148. Value loss: 216.849289. Entropy: 0.386261.
Iteration 11000: Policy loss: 0.310427. Value loss: 48.006962. Entropy: 0.400432.
Iteration 11001: Policy loss: -0.571550. Value loss: 51.802692. Entropy: 0.378771.
episode: 4684   score: 135.0  epsilon: 1.0    steps: 611  evaluation reward: 28

Iteration 11057: Policy loss: 0.519589. Value loss: 15.775380. Entropy: 0.276589.
Iteration 11058: Policy loss: 0.472875. Value loss: 13.901408. Entropy: 0.281960.
episode: 4706   score: 210.0  epsilon: 1.0    steps: 938  evaluation reward: 283.9
Training network. lr: 0.000165. clip: 0.066057
Iteration 11059: Policy loss: -0.859324. Value loss: 12.154820. Entropy: 0.325926.
Iteration 11060: Policy loss: -0.809614. Value loss: 8.892046. Entropy: 0.326749.
Iteration 11061: Policy loss: -0.810988. Value loss: 8.597590. Entropy: 0.318972.
episode: 4707   score: 290.0  epsilon: 1.0    steps: 728  evaluation reward: 283.35
Training network. lr: 0.000165. clip: 0.066057
Iteration 11062: Policy loss: -0.665444. Value loss: 16.142126. Entropy: 0.384485.
Iteration 11063: Policy loss: -0.935946. Value loss: 8.502662. Entropy: 0.429268.
Iteration 11064: Policy loss: -1.015569. Value loss: 7.723479. Entropy: 0.399852.
Training network. lr: 0.000165. clip: 0.066057
Iteration 11065: Policy loss: 2.10

episode: 4729   score: 350.0  epsilon: 1.0    steps: 742  evaluation reward: 278.65
episode: 4730   score: 670.0  epsilon: 1.0    steps: 943  evaluation reward: 282.9
Training network. lr: 0.000165. clip: 0.065910
Iteration 11122: Policy loss: 2.487108. Value loss: 46.170605. Entropy: 0.302480.
Iteration 11123: Policy loss: 5.518005. Value loss: 23.188782. Entropy: 0.279107.
Iteration 11124: Policy loss: 2.593189. Value loss: 12.849960. Entropy: 0.271771.
episode: 4731   score: 435.0  epsilon: 1.0    steps: 248  evaluation reward: 285.0
Training network. lr: 0.000165. clip: 0.065910
Iteration 11125: Policy loss: -1.517666. Value loss: 37.926521. Entropy: 0.175579.
Iteration 11126: Policy loss: -1.545413. Value loss: 21.217960. Entropy: 0.177829.
Iteration 11127: Policy loss: -1.568188. Value loss: 14.736059. Entropy: 0.158368.
episode: 4732   score: 300.0  epsilon: 1.0    steps: 493  evaluation reward: 285.1
Training network. lr: 0.000165. clip: 0.065910
Iteration 11128: Policy loss: 1

episode: 4753   score: 285.0  epsilon: 1.0    steps: 428  evaluation reward: 283.15
episode: 4754   score: 325.0  epsilon: 1.0    steps: 698  evaluation reward: 283.8
Training network. lr: 0.000164. clip: 0.065753
Iteration 11185: Policy loss: -5.118914. Value loss: 417.661194. Entropy: 0.491107.
Iteration 11186: Policy loss: -5.248228. Value loss: 249.324509. Entropy: 0.450277.
Iteration 11187: Policy loss: -4.738023. Value loss: 184.653641. Entropy: 0.435530.
Training network. lr: 0.000164. clip: 0.065753
Iteration 11188: Policy loss: 1.208844. Value loss: 35.605499. Entropy: 0.386283.
Iteration 11189: Policy loss: 1.219473. Value loss: 20.723276. Entropy: 0.392840.
Iteration 11190: Policy loss: 1.273936. Value loss: 14.346694. Entropy: 0.407501.
episode: 4755   score: 310.0  epsilon: 1.0    steps: 127  evaluation reward: 284.15
episode: 4756   score: 415.0  epsilon: 1.0    steps: 327  evaluation reward: 285.55
Training network. lr: 0.000164. clip: 0.065753
Iteration 11191: Policy lo

Iteration 11248: Policy loss: 1.459166. Value loss: 40.728252. Entropy: 0.404712.
Iteration 11249: Policy loss: 1.645642. Value loss: 22.563931. Entropy: 0.421309.
Iteration 11250: Policy loss: 1.601671. Value loss: 18.632971. Entropy: 0.416867.
episode: 4777   score: 225.0  epsilon: 1.0    steps: 812  evaluation reward: 310.6
episode: 4778   score: 285.0  epsilon: 1.0    steps: 1016  evaluation reward: 311.35
Training network. lr: 0.000164. clip: 0.065449
Iteration 11251: Policy loss: 0.724339. Value loss: 18.209871. Entropy: 0.331650.
Iteration 11252: Policy loss: 0.795269. Value loss: 12.981327. Entropy: 0.313839.
Iteration 11253: Policy loss: 0.715576. Value loss: 12.544946. Entropy: 0.319383.
Training network. lr: 0.000164. clip: 0.065449
Iteration 11254: Policy loss: -0.477783. Value loss: 13.340063. Entropy: 0.452925.
Iteration 11255: Policy loss: -0.459740. Value loss: 9.029248. Entropy: 0.473532.
Iteration 11256: Policy loss: -0.470379. Value loss: 7.505478. Entropy: 0.449695.

episode: 4800   score: 320.0  epsilon: 1.0    steps: 181  evaluation reward: 312.6
now time :  2019-02-25 22:11:28.390336
episode: 4801   score: 240.0  epsilon: 1.0    steps: 919  evaluation reward: 312.35
Training network. lr: 0.000163. clip: 0.065293
Iteration 11314: Policy loss: -1.967307. Value loss: 24.109856. Entropy: 0.619472.
Iteration 11315: Policy loss: -2.155925. Value loss: 15.693100. Entropy: 0.623289.
Iteration 11316: Policy loss: -2.024253. Value loss: 15.805676. Entropy: 0.599935.
episode: 4802   score: 300.0  epsilon: 1.0    steps: 699  evaluation reward: 313.1
Training network. lr: 0.000163. clip: 0.065293
Iteration 11317: Policy loss: -0.878541. Value loss: 179.647614. Entropy: 0.453328.
Iteration 11318: Policy loss: -0.814026. Value loss: 107.057465. Entropy: 0.434457.
Iteration 11319: Policy loss: -0.668838. Value loss: 68.888710. Entropy: 0.416575.
Training network. lr: 0.000163. clip: 0.065293
Iteration 11320: Policy loss: 0.249923. Value loss: 27.953888. Entropy

episode: 4824   score: 290.0  epsilon: 1.0    steps: 623  evaluation reward: 323.15
episode: 4825   score: 210.0  epsilon: 1.0    steps: 835  evaluation reward: 323.15
episode: 4826   score: 390.0  epsilon: 1.0    steps: 976  evaluation reward: 324.95
Training network. lr: 0.000163. clip: 0.065136
Iteration 11377: Policy loss: 1.908059. Value loss: 25.104593. Entropy: 0.680909.
Iteration 11378: Policy loss: 1.847762. Value loss: 16.348944. Entropy: 0.655436.
Iteration 11379: Policy loss: 1.514973. Value loss: 15.768240. Entropy: 0.661384.
episode: 4827   score: 295.0  epsilon: 1.0    steps: 266  evaluation reward: 325.5
episode: 4828   score: 210.0  epsilon: 1.0    steps: 475  evaluation reward: 323.4
episode: 4829   score: 210.0  epsilon: 1.0    steps: 762  evaluation reward: 322.0
Training network. lr: 0.000163. clip: 0.065136
Iteration 11380: Policy loss: -4.137631. Value loss: 205.536255. Entropy: 0.592057.
Iteration 11381: Policy loss: -3.332377. Value loss: 97.507156. Entropy: 0.

episode: 4849   score: 360.0  epsilon: 1.0    steps: 898  evaluation reward: 327.45
Training network. lr: 0.000162. clip: 0.064988
Iteration 11440: Policy loss: 0.547131. Value loss: 35.242260. Entropy: 0.345711.
Iteration 11441: Policy loss: 0.592401. Value loss: 21.431862. Entropy: 0.326444.
Iteration 11442: Policy loss: 0.581219. Value loss: 16.458286. Entropy: 0.324206.
episode: 4850   score: 305.0  epsilon: 1.0    steps: 618  evaluation reward: 327.6
Training network. lr: 0.000162. clip: 0.064988
Iteration 11443: Policy loss: 2.428023. Value loss: 24.197075. Entropy: 0.415382.
Iteration 11444: Policy loss: 2.153132. Value loss: 13.468915. Entropy: 0.436700.
Iteration 11445: Policy loss: 2.313034. Value loss: 9.001195. Entropy: 0.443422.
now time :  2019-02-25 22:13:55.159534
episode: 4851   score: 590.0  epsilon: 1.0    steps: 386  evaluation reward: 330.9
episode: 4852   score: 260.0  epsilon: 1.0    steps: 894  evaluation reward: 330.65
Training network. lr: 0.000162. clip: 0.06

Iteration 11504: Policy loss: 0.124261. Value loss: 17.153952. Entropy: 0.593945.
Iteration 11505: Policy loss: 0.102800. Value loss: 12.966431. Entropy: 0.598794.
episode: 4872   score: 375.0  epsilon: 1.0    steps: 321  evaluation reward: 322.4
episode: 4873   score: 475.0  epsilon: 1.0    steps: 753  evaluation reward: 320.9
episode: 4874   score: 285.0  epsilon: 1.0    steps: 819  evaluation reward: 320.5
Training network. lr: 0.000162. clip: 0.064675
Iteration 11506: Policy loss: -1.167540. Value loss: 134.552628. Entropy: 0.405826.
Iteration 11507: Policy loss: -1.174352. Value loss: 57.808075. Entropy: 0.429516.
Iteration 11508: Policy loss: -1.184607. Value loss: 45.644672. Entropy: 0.456966.
Training network. lr: 0.000162. clip: 0.064675
Iteration 11509: Policy loss: -0.647687. Value loss: 25.787439. Entropy: 0.474113.
Iteration 11510: Policy loss: -0.906025. Value loss: 13.974319. Entropy: 0.477939.
Iteration 11511: Policy loss: -0.660186. Value loss: 10.327871. Entropy: 0.47

Iteration 11565: Policy loss: -0.530497. Value loss: 11.526704. Entropy: 0.412873.
episode: 4899   score: 260.0  epsilon: 1.0    steps: 229  evaluation reward: 320.35
episode: 4900   score: 310.0  epsilon: 1.0    steps: 611  evaluation reward: 320.25
Training network. lr: 0.000161. clip: 0.064528
Iteration 11566: Policy loss: 2.305689. Value loss: 36.495384. Entropy: 0.513174.
Iteration 11567: Policy loss: 2.205919. Value loss: 23.400616. Entropy: 0.533156.
Iteration 11568: Policy loss: 2.336249. Value loss: 20.622622. Entropy: 0.522006.
Training network. lr: 0.000161. clip: 0.064528
Iteration 11569: Policy loss: 0.322347. Value loss: 25.520294. Entropy: 0.526861.
Iteration 11570: Policy loss: 0.415225. Value loss: 13.862970. Entropy: 0.535917.
Iteration 11571: Policy loss: 0.304129. Value loss: 11.319913. Entropy: 0.544071.
now time :  2019-02-25 22:16:15.727927
episode: 4901   score: 180.0  epsilon: 1.0    steps: 475  evaluation reward: 319.65
Training network. lr: 0.000161. clip: 0.

Iteration 11628: Policy loss: 0.611646. Value loss: 11.707225. Entropy: 0.665957.
episode: 4923   score: 265.0  epsilon: 1.0    steps: 996  evaluation reward: 312.75
Training network. lr: 0.000161. clip: 0.064371
Iteration 11629: Policy loss: 2.392077. Value loss: 26.460411. Entropy: 0.437822.
Iteration 11630: Policy loss: 2.729838. Value loss: 15.492867. Entropy: 0.444712.
Iteration 11631: Policy loss: 2.348928. Value loss: 12.714288. Entropy: 0.442533.
episode: 4924   score: 390.0  epsilon: 1.0    steps: 882  evaluation reward: 313.75
Training network. lr: 0.000161. clip: 0.064371
Iteration 11632: Policy loss: 0.396611. Value loss: 17.167509. Entropy: 0.476678.
Iteration 11633: Policy loss: 0.182447. Value loss: 12.060001. Entropy: 0.480327.
Iteration 11634: Policy loss: 0.081295. Value loss: 8.856897. Entropy: 0.488886.
episode: 4925   score: 335.0  epsilon: 1.0    steps: 424  evaluation reward: 315.0
Training network. lr: 0.000161. clip: 0.064371
Iteration 11635: Policy loss: 0.327

Iteration 11693: Policy loss: -3.617790. Value loss: 118.716171. Entropy: 0.486801.
Iteration 11694: Policy loss: -3.691706. Value loss: 114.399078. Entropy: 0.433699.
episode: 4945   score: 285.0  epsilon: 1.0    steps: 617  evaluation reward: 311.4
episode: 4946   score: 345.0  epsilon: 1.0    steps: 838  evaluation reward: 311.95
Training network. lr: 0.000161. clip: 0.064214
Iteration 11695: Policy loss: 1.757690. Value loss: 29.886713. Entropy: 0.436228.
Iteration 11696: Policy loss: 1.625820. Value loss: 17.855000. Entropy: 0.463394.
Iteration 11697: Policy loss: 1.588992. Value loss: 13.798007. Entropy: 0.460047.
episode: 4947   score: 270.0  epsilon: 1.0    steps: 484  evaluation reward: 311.1
Training network. lr: 0.000161. clip: 0.064214
Iteration 11698: Policy loss: -0.154374. Value loss: 26.906359. Entropy: 0.554893.
Iteration 11699: Policy loss: -0.561870. Value loss: 14.026075. Entropy: 0.539211.
Iteration 11700: Policy loss: -0.338924. Value loss: 10.932343. Entropy: 0.5

Iteration 11757: Policy loss: -2.634926. Value loss: 40.934277. Entropy: 0.674022.
episode: 4968   score: 315.0  epsilon: 1.0    steps: 771  evaluation reward: 319.5
Training network. lr: 0.000160. clip: 0.063910
Iteration 11758: Policy loss: 0.212780. Value loss: 21.958103. Entropy: 0.547789.
Iteration 11759: Policy loss: 0.255913. Value loss: 17.002281. Entropy: 0.540890.
Iteration 11760: Policy loss: 0.172665. Value loss: 14.561158. Entropy: 0.548614.
episode: 4969   score: 700.0  epsilon: 1.0    steps: 285  evaluation reward: 323.8
episode: 4970   score: 365.0  epsilon: 1.0    steps: 699  evaluation reward: 322.55
Training network. lr: 0.000160. clip: 0.063910
Iteration 11761: Policy loss: 0.432779. Value loss: 52.462860. Entropy: 0.535652.
Iteration 11762: Policy loss: 0.248962. Value loss: 27.109982. Entropy: 0.545666.
Iteration 11763: Policy loss: 0.440240. Value loss: 22.792032. Entropy: 0.542780.
episode: 4971   score: 410.0  epsilon: 1.0    steps: 418  evaluation reward: 321.

Training network. lr: 0.000159. clip: 0.063753
Iteration 11821: Policy loss: 3.567767. Value loss: 101.604866. Entropy: 0.561719.
Iteration 11822: Policy loss: 3.240744. Value loss: 52.639038. Entropy: 0.524355.
Iteration 11823: Policy loss: 3.436695. Value loss: 30.617304. Entropy: 0.534538.
Training network. lr: 0.000159. clip: 0.063753
Iteration 11824: Policy loss: 1.032296. Value loss: 48.746708. Entropy: 0.615006.
Iteration 11825: Policy loss: 1.558237. Value loss: 25.462288. Entropy: 0.591135.
Iteration 11826: Policy loss: 1.012731. Value loss: 18.831564. Entropy: 0.588963.
episode: 4992   score: 560.0  epsilon: 1.0    steps: 87  evaluation reward: 330.85
episode: 4993   score: 315.0  epsilon: 1.0    steps: 556  evaluation reward: 331.4
Training network. lr: 0.000159. clip: 0.063753
Iteration 11827: Policy loss: -0.640052. Value loss: 32.790905. Entropy: 0.512999.
Iteration 11828: Policy loss: -0.603587. Value loss: 20.061260. Entropy: 0.498737.
Iteration 11829: Policy loss: -0.5

Training network. lr: 0.000159. clip: 0.063606
Iteration 11884: Policy loss: 2.994719. Value loss: 54.719959. Entropy: 0.484280.
Iteration 11885: Policy loss: 2.612178. Value loss: 22.545206. Entropy: 0.482728.
Iteration 11886: Policy loss: 2.401855. Value loss: 19.170879. Entropy: 0.487190.
episode: 5016   score: 180.0  epsilon: 1.0    steps: 762  evaluation reward: 326.65
Training network. lr: 0.000159. clip: 0.063606
Iteration 11887: Policy loss: 2.949208. Value loss: 37.840645. Entropy: 0.473103.
Iteration 11888: Policy loss: 3.456694. Value loss: 20.833603. Entropy: 0.481145.
Iteration 11889: Policy loss: 2.958018. Value loss: 18.511248. Entropy: 0.476086.
episode: 5017   score: 210.0  epsilon: 1.0    steps: 786  evaluation reward: 324.2
Training network. lr: 0.000159. clip: 0.063606
Iteration 11890: Policy loss: -0.506092. Value loss: 32.800022. Entropy: 0.573717.
Iteration 11891: Policy loss: -0.277036. Value loss: 18.578672. Entropy: 0.584985.
Iteration 11892: Policy loss: -0.2

Iteration 11948: Policy loss: 0.398842. Value loss: 16.877108. Entropy: 0.577586.
Iteration 11949: Policy loss: 0.739010. Value loss: 12.348918. Entropy: 0.595264.
episode: 5039   score: 295.0  epsilon: 1.0    steps: 146  evaluation reward: 317.5
Training network. lr: 0.000159. clip: 0.063449
Iteration 11950: Policy loss: -0.484469. Value loss: 18.294228. Entropy: 0.582240.
Iteration 11951: Policy loss: -0.420911. Value loss: 11.982015. Entropy: 0.603314.
Iteration 11952: Policy loss: -0.315036. Value loss: 8.494916. Entropy: 0.585997.
episode: 5040   score: 500.0  epsilon: 1.0    steps: 96  evaluation reward: 317.95
Training network. lr: 0.000158. clip: 0.063293
Iteration 11953: Policy loss: -0.717817. Value loss: 24.026264. Entropy: 0.529185.
Iteration 11954: Policy loss: -0.574170. Value loss: 13.715859. Entropy: 0.544528.
Iteration 11955: Policy loss: -0.674207. Value loss: 11.331214. Entropy: 0.546434.
episode: 5041   score: 210.0  epsilon: 1.0    steps: 301  evaluation reward: 31

Training network. lr: 0.000158. clip: 0.063145
Iteration 12010: Policy loss: -0.433963. Value loss: 19.383909. Entropy: 0.448911.
Iteration 12011: Policy loss: -0.601723. Value loss: 13.715508. Entropy: 0.418112.
Iteration 12012: Policy loss: -0.593481. Value loss: 11.293067. Entropy: 0.454473.
episode: 5065   score: 270.0  epsilon: 1.0    steps: 857  evaluation reward: 295.6
Training network. lr: 0.000158. clip: 0.063145
Iteration 12013: Policy loss: 1.147232. Value loss: 24.261370. Entropy: 0.409410.
Iteration 12014: Policy loss: 1.007542. Value loss: 11.897675. Entropy: 0.390404.
Iteration 12015: Policy loss: 1.076442. Value loss: 8.357045. Entropy: 0.401628.
episode: 5066   score: 110.0  epsilon: 1.0    steps: 296  evaluation reward: 291.55
episode: 5067   score: 105.0  epsilon: 1.0    steps: 473  evaluation reward: 289.7
Training network. lr: 0.000158. clip: 0.063145
Iteration 12016: Policy loss: 1.882857. Value loss: 19.755444. Entropy: 0.512801.
Iteration 12017: Policy loss: 2.1

Iteration 12071: Policy loss: -1.438621. Value loss: 12.921186. Entropy: 0.572530.
Iteration 12072: Policy loss: -1.190870. Value loss: 10.528094. Entropy: 0.580252.
episode: 5092   score: 210.0  epsilon: 1.0    steps: 463  evaluation reward: 250.95
episode: 5093   score: 210.0  epsilon: 1.0    steps: 553  evaluation reward: 249.9
Training network. lr: 0.000157. clip: 0.062989
Iteration 12073: Policy loss: -3.333889. Value loss: 186.135757. Entropy: 0.609644.
Iteration 12074: Policy loss: -3.077235. Value loss: 94.902939. Entropy: 0.611733.
Iteration 12075: Policy loss: -3.291081. Value loss: 51.461380. Entropy: 0.582024.
episode: 5094   score: 225.0  epsilon: 1.0    steps: 321  evaluation reward: 248.55
episode: 5095   score: 230.0  epsilon: 1.0    steps: 699  evaluation reward: 246.95
episode: 5096   score: 210.0  epsilon: 1.0    steps: 796  evaluation reward: 246.3
Training network. lr: 0.000157. clip: 0.062989
Iteration 12076: Policy loss: 1.286437. Value loss: 11.603636. Entropy: 

episode: 5121   score: 210.0  epsilon: 1.0    steps: 725  evaluation reward: 232.5
Training network. lr: 0.000157. clip: 0.062832
Iteration 12130: Policy loss: 3.641014. Value loss: 30.589689. Entropy: 0.533345.
Iteration 12131: Policy loss: 3.785156. Value loss: 23.035078. Entropy: 0.539134.
Iteration 12132: Policy loss: 3.718445. Value loss: 18.630489. Entropy: 0.553280.
episode: 5122   score: 210.0  epsilon: 1.0    steps: 417  evaluation reward: 232.0
episode: 5123   score: 210.0  epsilon: 1.0    steps: 591  evaluation reward: 228.45
Training network. lr: 0.000157. clip: 0.062832
Iteration 12133: Policy loss: 0.556475. Value loss: 29.644060. Entropy: 0.515273.
Iteration 12134: Policy loss: 0.737363. Value loss: 16.690449. Entropy: 0.547583.
Iteration 12135: Policy loss: 0.503334. Value loss: 12.772456. Entropy: 0.543957.
episode: 5124   score: 290.0  epsilon: 1.0    steps: 92  evaluation reward: 229.25
episode: 5125   score: 155.0  epsilon: 1.0    steps: 1010  evaluation reward: 228

Iteration 12190: Policy loss: -2.531227. Value loss: 113.599380. Entropy: 0.545143.
Iteration 12191: Policy loss: -2.859607. Value loss: 57.388329. Entropy: 0.528612.
Iteration 12192: Policy loss: -2.678123. Value loss: 27.071236. Entropy: 0.516512.
episode: 5149   score: 210.0  epsilon: 1.0    steps: 721  evaluation reward: 221.35
Training network. lr: 0.000157. clip: 0.062684
Iteration 12193: Policy loss: 1.047387. Value loss: 132.192184. Entropy: 0.513782.
Iteration 12194: Policy loss: 1.382359. Value loss: 51.585152. Entropy: 0.508886.
Iteration 12195: Policy loss: 1.245163. Value loss: 41.319092. Entropy: 0.488714.
episode: 5150   score: 380.0  epsilon: 1.0    steps: 110  evaluation reward: 223.05
Training network. lr: 0.000157. clip: 0.062684
Iteration 12196: Policy loss: -0.435343. Value loss: 74.927048. Entropy: 0.370771.
Iteration 12197: Policy loss: -0.276946. Value loss: 30.562443. Entropy: 0.348532.
Iteration 12198: Policy loss: -0.112326. Value loss: 25.143345. Entropy: 0.

episode: 5169   score: 255.0  epsilon: 1.0    steps: 100  evaluation reward: 253.85
Training network. lr: 0.000156. clip: 0.062371
Iteration 12256: Policy loss: 0.285077. Value loss: 34.507450. Entropy: 0.354425.
Iteration 12257: Policy loss: 0.149624. Value loss: 19.845474. Entropy: 0.349778.
Iteration 12258: Policy loss: 0.189435. Value loss: 15.875266. Entropy: 0.340052.
episode: 5170   score: 750.0  epsilon: 1.0    steps: 566  evaluation reward: 258.3
Training network. lr: 0.000156. clip: 0.062371
Iteration 12259: Policy loss: 1.368242. Value loss: 44.075733. Entropy: 0.244691.
Iteration 12260: Policy loss: 1.143256. Value loss: 22.477737. Entropy: 0.212638.
Iteration 12261: Policy loss: 1.232849. Value loss: 18.107899. Entropy: 0.231349.
Training network. lr: 0.000156. clip: 0.062371
Iteration 12262: Policy loss: -3.666772. Value loss: 357.089722. Entropy: 0.130692.
Iteration 12263: Policy loss: -3.059283. Value loss: 136.688553. Entropy: 0.119731.
Iteration 12264: Policy loss: -2

Training network. lr: 0.000156. clip: 0.062224
Iteration 12319: Policy loss: 2.431141. Value loss: 28.246037. Entropy: 0.362582.
Iteration 12320: Policy loss: 2.391978. Value loss: 16.538363. Entropy: 0.306053.
Iteration 12321: Policy loss: 2.343306. Value loss: 14.478139. Entropy: 0.317914.
episode: 5195   score: 150.0  epsilon: 1.0    steps: 444  evaluation reward: 273.15
Training network. lr: 0.000156. clip: 0.062224
Iteration 12322: Policy loss: -1.543373. Value loss: 19.979721. Entropy: 0.231838.
Iteration 12323: Policy loss: -1.568164. Value loss: 15.058178. Entropy: 0.236024.
Iteration 12324: Policy loss: -1.679838. Value loss: 11.577452. Entropy: 0.252577.
episode: 5196   score: 260.0  epsilon: 1.0    steps: 280  evaluation reward: 273.65
episode: 5197   score: 260.0  epsilon: 1.0    steps: 525  evaluation reward: 270.35
Training network. lr: 0.000156. clip: 0.062224
Iteration 12325: Policy loss: 1.220845. Value loss: 15.456151. Entropy: 0.317466.
Iteration 12326: Policy loss: 

Iteration 12379: Policy loss: -0.326183. Value loss: 34.616211. Entropy: 0.352521.
Iteration 12380: Policy loss: -0.758070. Value loss: 24.551256. Entropy: 0.380045.
Iteration 12381: Policy loss: -0.344001. Value loss: 17.524426. Entropy: 0.356576.
episode: 5222   score: 180.0  epsilon: 1.0    steps: 273  evaluation reward: 281.95
episode: 5223   score: 260.0  epsilon: 1.0    steps: 417  evaluation reward: 282.45
episode: 5224   score: 265.0  epsilon: 1.0    steps: 537  evaluation reward: 282.2
episode: 5225   score: 210.0  epsilon: 1.0    steps: 805  evaluation reward: 282.75
Training network. lr: 0.000155. clip: 0.062067
Iteration 12382: Policy loss: 0.571219. Value loss: 11.637756. Entropy: 0.588950.
Iteration 12383: Policy loss: 0.562609. Value loss: 8.116124. Entropy: 0.595237.
Iteration 12384: Policy loss: 0.450152. Value loss: 8.818583. Entropy: 0.596225.
episode: 5226   score: 260.0  epsilon: 1.0    steps: 915  evaluation reward: 281.9
Training network. lr: 0.000155. clip: 0.06

Iteration 12440: Policy loss: 0.217096. Value loss: 12.103096. Entropy: 0.327916.
Iteration 12441: Policy loss: 0.199349. Value loss: 9.307557. Entropy: 0.344728.
Training network. lr: 0.000155. clip: 0.061910
Iteration 12442: Policy loss: -0.466863. Value loss: 28.548901. Entropy: 0.450395.
Iteration 12443: Policy loss: -0.515070. Value loss: 16.245567. Entropy: 0.453488.
Iteration 12444: Policy loss: -0.319368. Value loss: 10.347536. Entropy: 0.467341.
episode: 5249   score: 395.0  epsilon: 1.0    steps: 159  evaluation reward: 276.7
episode: 5250   score: 155.0  epsilon: 1.0    steps: 768  evaluation reward: 274.45
now time :  2019-02-25 22:32:31.583160
episode: 5251   score: 165.0  epsilon: 1.0    steps: 796  evaluation reward: 273.5
Training network. lr: 0.000155. clip: 0.061910
Iteration 12445: Policy loss: 2.292376. Value loss: 25.068861. Entropy: 0.749831.
Iteration 12446: Policy loss: 2.043333. Value loss: 13.075802. Entropy: 0.748554.
Iteration 12447: Policy loss: 2.323013. V

episode: 5278   score: 210.0  epsilon: 1.0    steps: 439  evaluation reward: 230.2
Training network. lr: 0.000154. clip: 0.061763
Iteration 12499: Policy loss: 0.853048. Value loss: 24.532686. Entropy: 0.539803.
Iteration 12500: Policy loss: 0.419300. Value loss: 16.963736. Entropy: 0.543989.
Iteration 12501: Policy loss: 0.811722. Value loss: 13.870691. Entropy: 0.516969.
episode: 5279   score: 365.0  epsilon: 1.0    steps: 83  evaluation reward: 226.4
episode: 5280   score: 155.0  epsilon: 1.0    steps: 604  evaluation reward: 225.85
Training network. lr: 0.000154. clip: 0.061606
Iteration 12502: Policy loss: -0.357645. Value loss: 27.325737. Entropy: 0.386816.
Iteration 12503: Policy loss: -0.457522. Value loss: 16.402498. Entropy: 0.412477.
Iteration 12504: Policy loss: -0.404423. Value loss: 13.529358. Entropy: 0.416986.
episode: 5281   score: 210.0  epsilon: 1.0    steps: 661  evaluation reward: 226.6
episode: 5282   score: 135.0  epsilon: 1.0    steps: 908  evaluation reward: 22

Iteration 12560: Policy loss: 0.473993. Value loss: 16.469248. Entropy: 0.189329.
Iteration 12561: Policy loss: 0.724469. Value loss: 10.477229. Entropy: 0.189395.
Training network. lr: 0.000154. clip: 0.061449
Iteration 12562: Policy loss: 0.426907. Value loss: 37.552860. Entropy: 0.295326.
Iteration 12563: Policy loss: 0.653308. Value loss: 18.364510. Entropy: 0.287730.
Iteration 12564: Policy loss: 0.996525. Value loss: 12.627386. Entropy: 0.274908.
Training network. lr: 0.000154. clip: 0.061449
Iteration 12565: Policy loss: -0.432568. Value loss: 46.275139. Entropy: 0.270651.
Iteration 12566: Policy loss: -0.133638. Value loss: 27.477537. Entropy: 0.269892.
Iteration 12567: Policy loss: -0.202734. Value loss: 20.848869. Entropy: 0.299361.
episode: 5304   score: 265.0  epsilon: 1.0    steps: 434  evaluation reward: 229.25
episode: 5305   score: 355.0  epsilon: 1.0    steps: 639  evaluation reward: 230.7
episode: 5306   score: 260.0  epsilon: 1.0    steps: 729  evaluation reward: 231

episode: 5327   score: 410.0  epsilon: 1.0    steps: 641  evaluation reward: 251.85
episode: 5328   score: 290.0  epsilon: 1.0    steps: 906  evaluation reward: 253.25
Training network. lr: 0.000153. clip: 0.061302
Iteration 12625: Policy loss: -0.955764. Value loss: 207.794662. Entropy: 0.491969.
Iteration 12626: Policy loss: -0.365323. Value loss: 104.520897. Entropy: 0.479095.
Iteration 12627: Policy loss: -1.489520. Value loss: 65.380692. Entropy: 0.468166.
Training network. lr: 0.000153. clip: 0.061302
Iteration 12628: Policy loss: -0.412801. Value loss: 44.035252. Entropy: 0.307402.
Iteration 12629: Policy loss: -0.404773. Value loss: 26.790457. Entropy: 0.304191.
Iteration 12630: Policy loss: -0.393268. Value loss: 23.001629. Entropy: 0.307705.
episode: 5329   score: 135.0  epsilon: 1.0    steps: 373  evaluation reward: 253.25
Training network. lr: 0.000153. clip: 0.061302
Iteration 12631: Policy loss: 0.892396. Value loss: 64.533714. Entropy: 0.288317.
Iteration 12632: Policy l

Iteration 12687: Policy loss: -2.240170. Value loss: 19.187962. Entropy: 0.355285.
episode: 5352   score: 350.0  epsilon: 1.0    steps: 308  evaluation reward: 276.25
Training network. lr: 0.000153. clip: 0.061145
Iteration 12688: Policy loss: -1.403138. Value loss: 200.102432. Entropy: 0.316905.
Iteration 12689: Policy loss: -1.130115. Value loss: 84.960831. Entropy: 0.293247.
Iteration 12690: Policy loss: -1.775915. Value loss: 62.295341. Entropy: 0.313714.
Training network. lr: 0.000153. clip: 0.061145
Iteration 12691: Policy loss: 3.091743. Value loss: 78.074120. Entropy: 0.184119.
Iteration 12692: Policy loss: 2.915453. Value loss: 33.848179. Entropy: 0.189312.
Iteration 12693: Policy loss: 2.909777. Value loss: 22.488058. Entropy: 0.203069.
episode: 5353   score: 210.0  epsilon: 1.0    steps: 707  evaluation reward: 276.4
episode: 5354   score: 375.0  epsilon: 1.0    steps: 988  evaluation reward: 279.05
Training network. lr: 0.000153. clip: 0.061145
Iteration 12694: Policy loss:

Iteration 12750: Policy loss: -1.990929. Value loss: 54.103897. Entropy: 0.565077.
episode: 5377   score: 155.0  epsilon: 1.0    steps: 288  evaluation reward: 297.0
Training network. lr: 0.000152. clip: 0.060841
Iteration 12751: Policy loss: 2.343469. Value loss: 61.351383. Entropy: 0.468370.
Iteration 12752: Policy loss: 2.386430. Value loss: 28.657682. Entropy: 0.467865.
Iteration 12753: Policy loss: 2.034532. Value loss: 24.842731. Entropy: 0.473827.
episode: 5378   score: 460.0  epsilon: 1.0    steps: 978  evaluation reward: 299.5
Training network. lr: 0.000152. clip: 0.060841
Iteration 12754: Policy loss: 4.132251. Value loss: 48.804001. Entropy: 0.456300.
Iteration 12755: Policy loss: 4.405505. Value loss: 24.894567. Entropy: 0.488692.
Iteration 12756: Policy loss: 4.325032. Value loss: 17.167913. Entropy: 0.498952.
episode: 5379   score: 310.0  epsilon: 1.0    steps: 164  evaluation reward: 298.95
episode: 5380   score: 180.0  epsilon: 1.0    steps: 622  evaluation reward: 299.

Iteration 12809: Policy loss: 0.781906. Value loss: 16.492525. Entropy: 0.303514.
Iteration 12810: Policy loss: 0.832739. Value loss: 13.460719. Entropy: 0.315132.
episode: 5406   score: 120.0  epsilon: 1.0    steps: 590  evaluation reward: 297.8
episode: 5407   score: 260.0  epsilon: 1.0    steps: 883  evaluation reward: 297.45
Training network. lr: 0.000152. clip: 0.060685
Iteration 12811: Policy loss: 1.815544. Value loss: 27.356464. Entropy: 0.337736.
Iteration 12812: Policy loss: 2.090455. Value loss: 16.924162. Entropy: 0.364629.
Iteration 12813: Policy loss: 1.869382. Value loss: 15.070192. Entropy: 0.361891.
Training network. lr: 0.000152. clip: 0.060685
Iteration 12814: Policy loss: -0.872468. Value loss: 24.745333. Entropy: 0.336918.
Iteration 12815: Policy loss: -0.991991. Value loss: 10.687421. Entropy: 0.358696.
Iteration 12816: Policy loss: -1.132465. Value loss: 9.589161. Entropy: 0.332300.
episode: 5408   score: 390.0  epsilon: 1.0    steps: 96  evaluation reward: 295.6

Iteration 12872: Policy loss: -0.087866. Value loss: 27.203066. Entropy: 0.295697.
Iteration 12873: Policy loss: 0.007945. Value loss: 26.567924. Entropy: 0.301149.
episode: 5430   score: 210.0  epsilon: 1.0    steps: 188  evaluation reward: 284.85
episode: 5431   score: 210.0  epsilon: 1.0    steps: 668  evaluation reward: 285.6
Training network. lr: 0.000151. clip: 0.060528
Iteration 12874: Policy loss: 0.849812. Value loss: 38.863415. Entropy: 0.402921.
Iteration 12875: Policy loss: 0.862830. Value loss: 19.977037. Entropy: 0.390136.
Iteration 12876: Policy loss: 0.829565. Value loss: 15.067241. Entropy: 0.404235.
Training network. lr: 0.000151. clip: 0.060528
Iteration 12877: Policy loss: -0.457308. Value loss: 120.506386. Entropy: 0.416586.
Iteration 12878: Policy loss: -0.028956. Value loss: 73.727448. Entropy: 0.401048.
Iteration 12879: Policy loss: -0.631352. Value loss: 41.261871. Entropy: 0.416503.
episode: 5432   score: 575.0  epsilon: 1.0    steps: 277  evaluation reward: 2

Iteration 12932: Policy loss: 0.960500. Value loss: 20.403833. Entropy: 0.305269.
Iteration 12933: Policy loss: 0.878708. Value loss: 15.906100. Entropy: 0.303272.
episode: 5457   score: 290.0  epsilon: 1.0    steps: 491  evaluation reward: 270.5
Training network. lr: 0.000151. clip: 0.060380
Iteration 12934: Policy loss: -0.301997. Value loss: 28.484047. Entropy: 0.253992.
Iteration 12935: Policy loss: -0.382465. Value loss: 14.992136. Entropy: 0.244872.
Iteration 12936: Policy loss: -0.191142. Value loss: 13.722949. Entropy: 0.255283.
episode: 5458   score: 180.0  epsilon: 1.0    steps: 17  evaluation reward: 267.75
Training network. lr: 0.000151. clip: 0.060380
Iteration 12937: Policy loss: 1.107911. Value loss: 30.136265. Entropy: 0.247939.
Iteration 12938: Policy loss: 0.708818. Value loss: 16.639650. Entropy: 0.214959.
Iteration 12939: Policy loss: 0.487849. Value loss: 11.444020. Entropy: 0.237204.
Training network. lr: 0.000151. clip: 0.060380
Iteration 12940: Policy loss: -0.4

Iteration 12998: Policy loss: 1.178206. Value loss: 23.999268. Entropy: 0.283296.
Iteration 12999: Policy loss: 1.026852. Value loss: 18.211830. Entropy: 0.287838.
episode: 5478   score: 260.0  epsilon: 1.0    steps: 455  evaluation reward: 285.3
episode: 5479   score: 310.0  epsilon: 1.0    steps: 611  evaluation reward: 285.3
Training network. lr: 0.000151. clip: 0.060224
Iteration 13000: Policy loss: 1.027356. Value loss: 23.497377. Entropy: 0.363210.
Iteration 13001: Policy loss: 1.180309. Value loss: 15.807240. Entropy: 0.355726.
Iteration 13002: Policy loss: 0.843050. Value loss: 11.786574. Entropy: 0.363393.
Training network. lr: 0.000150. clip: 0.060067
Iteration 13003: Policy loss: -0.297845. Value loss: 15.093898. Entropy: 0.190625.
Iteration 13004: Policy loss: -0.205323. Value loss: 8.384995. Entropy: 0.200810.
Iteration 13005: Policy loss: -0.330372. Value loss: 6.072470. Entropy: 0.190285.
Training network. lr: 0.000150. clip: 0.060067
Iteration 13006: Policy loss: -2.874

Training network. lr: 0.000150. clip: 0.059920
Iteration 13060: Policy loss: -1.134454. Value loss: 20.154362. Entropy: 0.359752.
Iteration 13061: Policy loss: -0.985317. Value loss: 10.073955. Entropy: 0.371108.
Iteration 13062: Policy loss: -1.026089. Value loss: 10.886744. Entropy: 0.358296.
episode: 5504   score: 300.0  epsilon: 1.0    steps: 225  evaluation reward: 297.4
Training network. lr: 0.000150. clip: 0.059920
Iteration 13063: Policy loss: -0.991894. Value loss: 46.437881. Entropy: 0.524723.
Iteration 13064: Policy loss: -1.031848. Value loss: 22.065359. Entropy: 0.515986.
Iteration 13065: Policy loss: -1.286238. Value loss: 16.407806. Entropy: 0.518580.
Training network. lr: 0.000150. clip: 0.059920
Iteration 13066: Policy loss: -0.328074. Value loss: 22.891806. Entropy: 0.477525.
Iteration 13067: Policy loss: -0.044386. Value loss: 13.697516. Entropy: 0.481299.
Iteration 13068: Policy loss: -0.489725. Value loss: 10.174489. Entropy: 0.491083.
episode: 5505   score: 180.0 

Iteration 13126: Policy loss: 1.793496. Value loss: 32.604145. Entropy: 0.581106.
Iteration 13127: Policy loss: 1.760361. Value loss: 17.912624. Entropy: 0.590912.
Iteration 13128: Policy loss: 1.844746. Value loss: 14.074750. Entropy: 0.584680.
episode: 5524   score: 530.0  epsilon: 1.0    steps: 606  evaluation reward: 307.6
episode: 5525   score: 365.0  epsilon: 1.0    steps: 987  evaluation reward: 309.45
Training network. lr: 0.000149. clip: 0.059763
Iteration 13129: Policy loss: 1.828465. Value loss: 49.864052. Entropy: 0.534540.
Iteration 13130: Policy loss: 2.047574. Value loss: 18.267302. Entropy: 0.557647.
Iteration 13131: Policy loss: 2.246702. Value loss: 11.336874. Entropy: 0.560151.
episode: 5526   score: 370.0  epsilon: 1.0    steps: 760  evaluation reward: 310.2
episode: 5527   score: 280.0  epsilon: 1.0    steps: 865  evaluation reward: 311.45
Training network. lr: 0.000149. clip: 0.059763
Iteration 13132: Policy loss: 1.627757. Value loss: 32.862206. Entropy: 0.540405

Training network. lr: 0.000149. clip: 0.059606
Iteration 13189: Policy loss: -1.179573. Value loss: 34.142868. Entropy: 0.589588.
Iteration 13190: Policy loss: -1.261462. Value loss: 16.960274. Entropy: 0.601792.
Iteration 13191: Policy loss: -1.294684. Value loss: 13.172438. Entropy: 0.611415.
Training network. lr: 0.000149. clip: 0.059606
Iteration 13192: Policy loss: -0.080673. Value loss: 22.873158. Entropy: 0.665626.
Iteration 13193: Policy loss: 0.192230. Value loss: 12.412015. Entropy: 0.667335.
Iteration 13194: Policy loss: -0.125889. Value loss: 9.535265. Entropy: 0.681196.
episode: 5550   score: 290.0  epsilon: 1.0    steps: 835  evaluation reward: 307.15
Training network. lr: 0.000149. clip: 0.059606
Iteration 13195: Policy loss: 2.600715. Value loss: 22.546856. Entropy: 0.701268.
Iteration 13196: Policy loss: 2.556466. Value loss: 9.969723. Entropy: 0.692602.
Iteration 13197: Policy loss: 2.392936. Value loss: 8.458195. Entropy: 0.688309.
now time :  2019-02-25 22:46:36.401

Iteration 13254: Policy loss: 0.109560. Value loss: 4.953964. Entropy: 0.639189.
episode: 5572   score: 110.0  epsilon: 1.0    steps: 199  evaluation reward: 294.35
Training network. lr: 0.000148. clip: 0.059302
Iteration 13255: Policy loss: 0.142587. Value loss: 19.870640. Entropy: 0.656437.
Iteration 13256: Policy loss: 0.014786. Value loss: 11.298643. Entropy: 0.627977.
Iteration 13257: Policy loss: 0.116237. Value loss: 9.317983. Entropy: 0.636489.
episode: 5573   score: 335.0  epsilon: 1.0    steps: 539  evaluation reward: 295.6
Training network. lr: 0.000148. clip: 0.059302
Iteration 13258: Policy loss: -0.450959. Value loss: 20.640503. Entropy: 0.458903.
Iteration 13259: Policy loss: -0.627366. Value loss: 10.489180. Entropy: 0.437303.
Iteration 13260: Policy loss: -0.536341. Value loss: 6.789019. Entropy: 0.456550.
Training network. lr: 0.000148. clip: 0.059302
Iteration 13261: Policy loss: -0.157082. Value loss: 18.499603. Entropy: 0.539184.
Iteration 13262: Policy loss: -0.05

episode: 5596   score: 110.0  epsilon: 1.0    steps: 447  evaluation reward: 269.55
Training network. lr: 0.000148. clip: 0.059145
Iteration 13318: Policy loss: 1.118154. Value loss: 11.496012. Entropy: 0.519577.
Iteration 13319: Policy loss: 1.083111. Value loss: 7.397561. Entropy: 0.477266.
Iteration 13320: Policy loss: 1.075691. Value loss: 5.026290. Entropy: 0.481375.
episode: 5597   score: 320.0  epsilon: 1.0    steps: 285  evaluation reward: 270.65
Training network. lr: 0.000148. clip: 0.059145
Iteration 13321: Policy loss: -0.468022. Value loss: 22.765053. Entropy: 0.492575.
Iteration 13322: Policy loss: -0.610000. Value loss: 12.723584. Entropy: 0.501247.
Iteration 13323: Policy loss: -0.558711. Value loss: 10.711246. Entropy: 0.499165.
episode: 5598   score: 210.0  epsilon: 1.0    steps: 996  evaluation reward: 270.15
Training network. lr: 0.000148. clip: 0.059145
Iteration 13324: Policy loss: 0.223133. Value loss: 17.592484. Entropy: 0.618173.
Iteration 13325: Policy loss: 0.

episode: 5616   score: 335.0  epsilon: 1.0    steps: 421  evaluation reward: 273.3
Training network. lr: 0.000147. clip: 0.058998
Iteration 13384: Policy loss: -0.112345. Value loss: 19.662487. Entropy: 0.158024.
Iteration 13385: Policy loss: -0.044075. Value loss: 13.702959. Entropy: 0.179344.
Iteration 13386: Policy loss: 0.163845. Value loss: 9.815686. Entropy: 0.179950.
episode: 5617   score: 210.0  epsilon: 1.0    steps: 213  evaluation reward: 269.8
Training network. lr: 0.000147. clip: 0.058998
Iteration 13387: Policy loss: 1.208562. Value loss: 25.910666. Entropy: 0.207912.
Iteration 13388: Policy loss: 1.160485. Value loss: 10.646902. Entropy: 0.221400.
Iteration 13389: Policy loss: 1.139950. Value loss: 8.652035. Entropy: 0.211278.
Training network. lr: 0.000147. clip: 0.058998
Iteration 13390: Policy loss: -0.920717. Value loss: 169.449631. Entropy: 0.174499.
Iteration 13391: Policy loss: -1.379078. Value loss: 81.320511. Entropy: 0.144254.
Iteration 13392: Policy loss: -0.1

Iteration 13448: Policy loss: -0.608391. Value loss: 26.236834. Entropy: 0.243119.
Iteration 13449: Policy loss: -0.391424. Value loss: 19.351107. Entropy: 0.262677.
episode: 5639   score: 915.0  epsilon: 1.0    steps: 85  evaluation reward: 287.85
Training network. lr: 0.000147. clip: 0.058841
Iteration 13450: Policy loss: 1.195371. Value loss: 39.955673. Entropy: 0.218287.
Iteration 13451: Policy loss: 1.360860. Value loss: 20.405180. Entropy: 0.253802.
Iteration 13452: Policy loss: 1.110610. Value loss: 15.373368. Entropy: 0.246159.
episode: 5640   score: 210.0  epsilon: 1.0    steps: 182  evaluation reward: 287.85
episode: 5641   score: 220.0  epsilon: 1.0    steps: 830  evaluation reward: 289.15
Training network. lr: 0.000147. clip: 0.058685
Iteration 13453: Policy loss: 0.209283. Value loss: 98.672394. Entropy: 0.200963.
Iteration 13454: Policy loss: 0.033471. Value loss: 74.786217. Entropy: 0.203687.
Iteration 13455: Policy loss: -0.102035. Value loss: 73.889923. Entropy: 0.2077

Training network. lr: 0.000146. clip: 0.058537
Iteration 13513: Policy loss: 0.181781. Value loss: 41.232681. Entropy: 0.495008.
Iteration 13514: Policy loss: 0.373945. Value loss: 19.480534. Entropy: 0.498439.
Iteration 13515: Policy loss: 0.054704. Value loss: 13.419709. Entropy: 0.509473.
episode: 5661   score: 395.0  epsilon: 1.0    steps: 456  evaluation reward: 319.15
episode: 5662   score: 430.0  epsilon: 1.0    steps: 843  evaluation reward: 320.55
Training network. lr: 0.000146. clip: 0.058537
Iteration 13516: Policy loss: 3.723570. Value loss: 34.411255. Entropy: 0.324963.
Iteration 13517: Policy loss: 3.614481. Value loss: 15.485767. Entropy: 0.340233.
Iteration 13518: Policy loss: 3.593913. Value loss: 14.447801. Entropy: 0.331091.
Training network. lr: 0.000146. clip: 0.058537
Iteration 13519: Policy loss: -1.719169. Value loss: 164.510620. Entropy: 0.238303.
Iteration 13520: Policy loss: -0.508310. Value loss: 48.524536. Entropy: 0.246537.
Iteration 13521: Policy loss: -1

episode: 5682   score: 560.0  epsilon: 1.0    steps: 694  evaluation reward: 343.6
episode: 5683   score: 240.0  epsilon: 1.0    steps: 845  evaluation reward: 343.6
Training network. lr: 0.000146. clip: 0.058381
Iteration 13579: Policy loss: 0.883617. Value loss: 54.355206. Entropy: 0.400237.
Iteration 13580: Policy loss: 0.986155. Value loss: 25.006203. Entropy: 0.380605.
Iteration 13581: Policy loss: 1.114056. Value loss: 20.108789. Entropy: 0.383387.
episode: 5684   score: 210.0  epsilon: 1.0    steps: 283  evaluation reward: 342.4
Training network. lr: 0.000146. clip: 0.058381
Iteration 13582: Policy loss: -0.994079. Value loss: 161.896179. Entropy: 0.357981.
Iteration 13583: Policy loss: -0.891014. Value loss: 62.953281. Entropy: 0.363104.
Iteration 13584: Policy loss: -0.878238. Value loss: 55.870033. Entropy: 0.352323.
episode: 5685   score: 255.0  epsilon: 1.0    steps: 635  evaluation reward: 341.65
Training network. lr: 0.000146. clip: 0.058381
Iteration 13585: Policy loss: 

episode: 5706   score: 155.0  epsilon: 1.0    steps: 981  evaluation reward: 351.55
Training network. lr: 0.000146. clip: 0.058224
Iteration 13642: Policy loss: 1.553153. Value loss: 34.861748. Entropy: 0.360145.
Iteration 13643: Policy loss: 1.345546. Value loss: 17.027058. Entropy: 0.356725.
Iteration 13644: Policy loss: 1.601834. Value loss: 13.911285. Entropy: 0.347091.
episode: 5707   score: 225.0  epsilon: 1.0    steps: 138  evaluation reward: 351.7
Training network. lr: 0.000146. clip: 0.058224
Iteration 13645: Policy loss: 2.902571. Value loss: 22.206673. Entropy: 0.408446.
Iteration 13646: Policy loss: 3.225436. Value loss: 12.074162. Entropy: 0.390537.
Iteration 13647: Policy loss: 3.225204. Value loss: 9.285871. Entropy: 0.402662.
episode: 5708   score: 120.0  epsilon: 1.0    steps: 347  evaluation reward: 350.25
episode: 5709   score: 225.0  epsilon: 1.0    steps: 410  evaluation reward: 349.05
episode: 5710   score: 290.0  epsilon: 1.0    steps: 702  evaluation reward: 347

Iteration 13704: Policy loss: 1.012813. Value loss: 10.520186. Entropy: 0.442881.
episode: 5732   score: 185.0  epsilon: 1.0    steps: 234  evaluation reward: 322.9
episode: 5733   score: 410.0  epsilon: 1.0    steps: 600  evaluation reward: 322.1
episode: 5734   score: 155.0  epsilon: 1.0    steps: 779  evaluation reward: 320.65
episode: 5735   score: 155.0  epsilon: 1.0    steps: 915  evaluation reward: 316.1
Training network. lr: 0.000145. clip: 0.057920
Iteration 13705: Policy loss: 0.774059. Value loss: 18.723835. Entropy: 0.607860.
Iteration 13706: Policy loss: 0.591167. Value loss: 13.406001. Entropy: 0.573146.
Iteration 13707: Policy loss: 0.441895. Value loss: 9.826462. Entropy: 0.609492.
episode: 5736   score: 365.0  epsilon: 1.0    steps: 50  evaluation reward: 317.45
Training network. lr: 0.000145. clip: 0.057920
Iteration 13708: Policy loss: 1.198171. Value loss: 18.181995. Entropy: 0.512530.
Iteration 13709: Policy loss: 1.293522. Value loss: 11.544807. Entropy: 0.527703.

Iteration 13767: Policy loss: 2.293307. Value loss: 17.677803. Entropy: 0.270832.
episode: 5756   score: 265.0  epsilon: 1.0    steps: 889  evaluation reward: 295.15
episode: 5757   score: 245.0  epsilon: 1.0    steps: 923  evaluation reward: 291.45
Training network. lr: 0.000144. clip: 0.057763
Iteration 13768: Policy loss: 1.158701. Value loss: 33.506966. Entropy: 0.327845.
Iteration 13769: Policy loss: 1.110704. Value loss: 16.901028. Entropy: 0.330422.
Iteration 13770: Policy loss: 1.134182. Value loss: 13.165373. Entropy: 0.312576.
episode: 5758   score: 215.0  epsilon: 1.0    steps: 291  evaluation reward: 290.05
episode: 5759   score: 260.0  epsilon: 1.0    steps: 591  evaluation reward: 290.55
Training network. lr: 0.000144. clip: 0.057763
Iteration 13771: Policy loss: -4.794556. Value loss: 255.182755. Entropy: 0.291515.
Iteration 13772: Policy loss: -4.671909. Value loss: 154.862305. Entropy: 0.265075.
Iteration 13773: Policy loss: -5.053753. Value loss: 176.281876. Entropy: 

Iteration 13831: Policy loss: 0.193660. Value loss: 34.048531. Entropy: 0.404351.
Iteration 13832: Policy loss: 0.025977. Value loss: 20.182835. Entropy: 0.399250.
Iteration 13833: Policy loss: 0.093103. Value loss: 17.361126. Entropy: 0.401768.
Training network. lr: 0.000144. clip: 0.057616
Iteration 13834: Policy loss: 1.209727. Value loss: 32.780777. Entropy: 0.218555.
Iteration 13835: Policy loss: 1.119864. Value loss: 20.402771. Entropy: 0.238442.
Iteration 13836: Policy loss: 1.028008. Value loss: 15.643209. Entropy: 0.246906.
episode: 5779   score: 375.0  epsilon: 1.0    steps: 921  evaluation reward: 297.05
Training network. lr: 0.000144. clip: 0.057616
Iteration 13837: Policy loss: 0.169570. Value loss: 34.920849. Entropy: 0.273763.
Iteration 13838: Policy loss: 0.551550. Value loss: 16.451241. Entropy: 0.262495.
Iteration 13839: Policy loss: 0.335712. Value loss: 12.640949. Entropy: 0.257747.
episode: 5780   score: 420.0  epsilon: 1.0    steps: 788  evaluation reward: 296.7
T

Iteration 13898: Policy loss: 3.421375. Value loss: 11.425086. Entropy: 0.198831.
Iteration 13899: Policy loss: 3.248226. Value loss: 10.028264. Entropy: 0.208876.
episode: 5799   score: 320.0  epsilon: 1.0    steps: 608  evaluation reward: 301.2
Training network. lr: 0.000144. clip: 0.057459
Iteration 13900: Policy loss: -1.077812. Value loss: 187.395645. Entropy: 0.093729.
Iteration 13901: Policy loss: -1.344424. Value loss: 96.917496. Entropy: 0.129937.
Iteration 13902: Policy loss: -1.227372. Value loss: 41.562641. Entropy: 0.150556.
episode: 5800   score: 260.0  epsilon: 1.0    steps: 415  evaluation reward: 301.65
Training network. lr: 0.000143. clip: 0.057302
Iteration 13903: Policy loss: -4.463035. Value loss: 353.465912. Entropy: 0.270908.
Iteration 13904: Policy loss: -3.934826. Value loss: 180.452942. Entropy: 0.260849.
Iteration 13905: Policy loss: -4.442397. Value loss: 144.970337. Entropy: 0.271840.
Training network. lr: 0.000143. clip: 0.057302
Iteration 13906: Policy lo

Training network. lr: 0.000143. clip: 0.057155
Iteration 13963: Policy loss: -0.845750. Value loss: 17.022091. Entropy: 0.169289.
Iteration 13964: Policy loss: -0.905762. Value loss: 14.460450. Entropy: 0.170466.
Iteration 13965: Policy loss: -0.843679. Value loss: 11.571091. Entropy: 0.169448.
episode: 5821   score: 295.0  epsilon: 1.0    steps: 64  evaluation reward: 324.65
episode: 5822   score: 275.0  epsilon: 1.0    steps: 500  evaluation reward: 325.55
episode: 5823   score: 260.0  epsilon: 1.0    steps: 815  evaluation reward: 326.6
Training network. lr: 0.000143. clip: 0.057155
Iteration 13966: Policy loss: 1.087360. Value loss: 28.207127. Entropy: 0.324585.
Iteration 13967: Policy loss: 1.126216. Value loss: 17.529114. Entropy: 0.318245.
Iteration 13968: Policy loss: 1.043654. Value loss: 15.021281. Entropy: 0.319386.
episode: 5824   score: 350.0  epsilon: 1.0    steps: 732  evaluation reward: 328.0
Training network. lr: 0.000143. clip: 0.057155
Iteration 13969: Policy loss: -

Training network. lr: 0.000142. clip: 0.056998
Iteration 14029: Policy loss: 0.436743. Value loss: 31.607908. Entropy: 0.185202.
Iteration 14030: Policy loss: 0.739126. Value loss: 21.557793. Entropy: 0.188229.
Iteration 14031: Policy loss: 0.719623. Value loss: 15.778063. Entropy: 0.183357.
episode: 5842   score: 375.0  epsilon: 1.0    steps: 731  evaluation reward: 350.85
Training network. lr: 0.000142. clip: 0.056998
Iteration 14032: Policy loss: 1.026710. Value loss: 38.282368. Entropy: 0.168983.
Iteration 14033: Policy loss: 0.955900. Value loss: 17.972439. Entropy: 0.185027.
Iteration 14034: Policy loss: 1.055892. Value loss: 14.563956. Entropy: 0.172739.
Training network. lr: 0.000142. clip: 0.056998
Iteration 14035: Policy loss: -0.470979. Value loss: 36.304893. Entropy: 0.084223.
Iteration 14036: Policy loss: -0.785769. Value loss: 22.386400. Entropy: 0.080251.
Iteration 14037: Policy loss: -0.727419. Value loss: 15.931402. Entropy: 0.083510.
Training network. lr: 0.000142. cl

episode: 5863   score: 460.0  epsilon: 1.0    steps: 114  evaluation reward: 371.35
episode: 5864   score: 210.0  epsilon: 1.0    steps: 337  evaluation reward: 368.45
Training network. lr: 0.000142. clip: 0.056841
Iteration 14095: Policy loss: 0.511058. Value loss: 27.348551. Entropy: 0.234136.
Iteration 14096: Policy loss: 0.515590. Value loss: 16.534611. Entropy: 0.240212.
Iteration 14097: Policy loss: 0.251885. Value loss: 14.934755. Entropy: 0.213290.
episode: 5865   score: 265.0  epsilon: 1.0    steps: 164  evaluation reward: 368.5
Training network. lr: 0.000142. clip: 0.056841
Iteration 14098: Policy loss: -0.613490. Value loss: 35.773010. Entropy: 0.138087.
Iteration 14099: Policy loss: -0.783291. Value loss: 19.909460. Entropy: 0.143144.
Iteration 14100: Policy loss: -0.748253. Value loss: 14.709541. Entropy: 0.146294.
episode: 5866   score: 240.0  epsilon: 1.0    steps: 796  evaluation reward: 367.35
Training network. lr: 0.000142. clip: 0.056694
Iteration 14101: Policy loss:

episode: 5888   score: 210.0  epsilon: 1.0    steps: 792  evaluation reward: 346.8
episode: 5889   score: 225.0  epsilon: 1.0    steps: 976  evaluation reward: 346.55
Training network. lr: 0.000141. clip: 0.056537
Iteration 14158: Policy loss: -0.513461. Value loss: 22.683619. Entropy: 0.243295.
Iteration 14159: Policy loss: -0.540229. Value loss: 16.767958. Entropy: 0.239930.
Iteration 14160: Policy loss: -0.545014. Value loss: 13.018690. Entropy: 0.256845.
episode: 5890   score: 630.0  epsilon: 1.0    steps: 121  evaluation reward: 349.7
Training network. lr: 0.000141. clip: 0.056537
Iteration 14161: Policy loss: -0.251478. Value loss: 19.053757. Entropy: 0.363841.
Iteration 14162: Policy loss: -0.370506. Value loss: 13.510262. Entropy: 0.365271.
Iteration 14163: Policy loss: -0.178977. Value loss: 10.518099. Entropy: 0.349931.
Training network. lr: 0.000141. clip: 0.056537
Iteration 14164: Policy loss: -1.474700. Value loss: 21.911882. Entropy: 0.211157.
Iteration 14165: Policy loss

episode: 5909   score: 510.0  epsilon: 1.0    steps: 675  evaluation reward: 336.45
Training network. lr: 0.000141. clip: 0.056381
Iteration 14224: Policy loss: -2.268697. Value loss: 255.839218. Entropy: 0.354924.
Iteration 14225: Policy loss: -2.419009. Value loss: 112.169464. Entropy: 0.300988.
Iteration 14226: Policy loss: -1.258945. Value loss: 54.322639. Entropy: 0.292817.
Training network. lr: 0.000141. clip: 0.056381
Iteration 14227: Policy loss: 1.694480. Value loss: 44.340130. Entropy: 0.244531.
Iteration 14228: Policy loss: 2.243868. Value loss: 23.836460. Entropy: 0.249487.
Iteration 14229: Policy loss: 2.234396. Value loss: 16.930494. Entropy: 0.248246.
episode: 5910   score: 265.0  epsilon: 1.0    steps: 232  evaluation reward: 336.0
episode: 5911   score: 275.0  epsilon: 1.0    steps: 612  evaluation reward: 334.35
Training network. lr: 0.000141. clip: 0.056381
Iteration 14230: Policy loss: 0.071416. Value loss: 35.785782. Entropy: 0.286840.
Iteration 14231: Policy loss:

Iteration 14289: Policy loss: -0.380399. Value loss: 15.472457. Entropy: 0.263423.
episode: 5931   score: 260.0  epsilon: 1.0    steps: 429  evaluation reward: 337.9
Training network. lr: 0.000141. clip: 0.056233
Iteration 14290: Policy loss: 0.273325. Value loss: 25.810801. Entropy: 0.101142.
Iteration 14291: Policy loss: 0.543668. Value loss: 15.234637. Entropy: 0.113151.
Iteration 14292: Policy loss: 0.436465. Value loss: 12.108715. Entropy: 0.102537.
episode: 5932   score: 390.0  epsilon: 1.0    steps: 50  evaluation reward: 339.2
Training network. lr: 0.000141. clip: 0.056233
Iteration 14293: Policy loss: -1.834109. Value loss: 203.555176. Entropy: 0.294284.
Iteration 14294: Policy loss: -2.170357. Value loss: 114.979996. Entropy: 0.298590.
Iteration 14295: Policy loss: -2.067190. Value loss: 86.845352. Entropy: 0.303744.
episode: 5933   score: 660.0  epsilon: 1.0    steps: 142  evaluation reward: 339.1
episode: 5934   score: 280.0  epsilon: 1.0    steps: 873  evaluation reward: 3

Training network. lr: 0.000140. clip: 0.055920
Iteration 14353: Policy loss: 2.378241. Value loss: 41.983036. Entropy: 0.228134.
Iteration 14354: Policy loss: 2.396376. Value loss: 19.233805. Entropy: 0.216522.
Iteration 14355: Policy loss: 2.549323. Value loss: 14.999738. Entropy: 0.225978.
Training network. lr: 0.000140. clip: 0.055920
Iteration 14356: Policy loss: 2.735402. Value loss: 36.216782. Entropy: 0.282347.
Iteration 14357: Policy loss: 2.674175. Value loss: 25.633646. Entropy: 0.291446.
Iteration 14358: Policy loss: 2.656276. Value loss: 19.922417. Entropy: 0.284639.
Training network. lr: 0.000140. clip: 0.055920
Iteration 14359: Policy loss: 1.011027. Value loss: 28.431585. Entropy: 0.260566.
Iteration 14360: Policy loss: 1.033900. Value loss: 16.448011. Entropy: 0.267174.
Iteration 14361: Policy loss: 0.945165. Value loss: 14.526196. Entropy: 0.258332.
episode: 5954   score: 210.0  epsilon: 1.0    steps: 914  evaluation reward: 346.75
Training network. lr: 0.000140. clip:

Training network. lr: 0.000139. clip: 0.055772
Iteration 14422: Policy loss: -0.440938. Value loss: 10.056741. Entropy: 0.291187.
Iteration 14423: Policy loss: -0.566495. Value loss: 7.966238. Entropy: 0.297021.
Iteration 14424: Policy loss: -0.573848. Value loss: 6.020120. Entropy: 0.296860.
Training network. lr: 0.000139. clip: 0.055772
Iteration 14425: Policy loss: 1.253473. Value loss: 19.230742. Entropy: 0.271853.
Iteration 14426: Policy loss: 0.778856. Value loss: 9.768477. Entropy: 0.284350.
Iteration 14427: Policy loss: 1.205890. Value loss: 6.777538. Entropy: 0.275501.
episode: 5973   score: 210.0  epsilon: 1.0    steps: 448  evaluation reward: 341.4
episode: 5974   score: 235.0  epsilon: 1.0    steps: 1018  evaluation reward: 341.15
Training network. lr: 0.000139. clip: 0.055772
Iteration 14428: Policy loss: -0.690050. Value loss: 109.487625. Entropy: 0.249711.
Iteration 14429: Policy loss: -0.765041. Value loss: 80.714035. Entropy: 0.248049.
Iteration 14430: Policy loss: -0.

Iteration 14489: Policy loss: 0.768151. Value loss: 8.271681. Entropy: 0.505083.
Iteration 14490: Policy loss: 0.793221. Value loss: 6.381874. Entropy: 0.513273.
episode: 5993   score: 260.0  epsilon: 1.0    steps: 755  evaluation reward: 342.4
Training network. lr: 0.000139. clip: 0.055616
Iteration 14491: Policy loss: 0.695440. Value loss: 24.861967. Entropy: 0.532916.
Iteration 14492: Policy loss: 0.710054. Value loss: 13.175575. Entropy: 0.528292.
Iteration 14493: Policy loss: 0.534054. Value loss: 10.254898. Entropy: 0.524033.
episode: 5994   score: 315.0  epsilon: 1.0    steps: 8  evaluation reward: 342.4
episode: 5995   score: 190.0  epsilon: 1.0    steps: 944  evaluation reward: 340.3
Training network. lr: 0.000139. clip: 0.055616
Iteration 14494: Policy loss: -0.139757. Value loss: 32.420620. Entropy: 0.444462.
Iteration 14495: Policy loss: 0.024642. Value loss: 16.594629. Entropy: 0.423936.
Iteration 14496: Policy loss: -0.189145. Value loss: 12.049762. Entropy: 0.429371.
Tra

episode: 6015   score: 410.0  epsilon: 1.0    steps: 866  evaluation reward: 355.55
Training network. lr: 0.000138. clip: 0.055312
Iteration 14554: Policy loss: 2.239010. Value loss: 40.872444. Entropy: 0.478517.
Iteration 14555: Policy loss: 2.067181. Value loss: 19.772409. Entropy: 0.505128.
Iteration 14556: Policy loss: 2.269552. Value loss: 15.151053. Entropy: 0.509628.
Training network. lr: 0.000138. clip: 0.055312
Iteration 14557: Policy loss: 1.233135. Value loss: 21.726089. Entropy: 0.374108.
Iteration 14558: Policy loss: 1.042361. Value loss: 13.741803. Entropy: 0.335761.
Iteration 14559: Policy loss: 1.145771. Value loss: 9.604845. Entropy: 0.349700.
episode: 6016   score: 350.0  epsilon: 1.0    steps: 25  evaluation reward: 349.7
episode: 6017   score: 260.0  epsilon: 1.0    steps: 234  evaluation reward: 349.35
episode: 6018   score: 530.0  epsilon: 1.0    steps: 727  evaluation reward: 352.05
Training network. lr: 0.000138. clip: 0.055312
Iteration 14560: Policy loss: 0.29

Iteration 14617: Policy loss: -1.229941. Value loss: 17.011293. Entropy: 0.621372.
Iteration 14618: Policy loss: -1.093515. Value loss: 9.820701. Entropy: 0.597662.
Iteration 14619: Policy loss: -1.050782. Value loss: 10.252944. Entropy: 0.631392.
episode: 6040   score: 270.0  epsilon: 1.0    steps: 119  evaluation reward: 334.35
episode: 6041   score: 180.0  epsilon: 1.0    steps: 316  evaluation reward: 333.4
Training network. lr: 0.000138. clip: 0.055155
Iteration 14620: Policy loss: 1.357802. Value loss: 31.169031. Entropy: 0.469401.
Iteration 14621: Policy loss: 1.508877. Value loss: 15.016833. Entropy: 0.457253.
Iteration 14622: Policy loss: 1.845128. Value loss: 12.100787. Entropy: 0.476692.
Training network. lr: 0.000138. clip: 0.055155
Iteration 14623: Policy loss: -0.571924. Value loss: 27.014927. Entropy: 0.400001.
Iteration 14624: Policy loss: -0.772574. Value loss: 13.471923. Entropy: 0.372471.
Iteration 14625: Policy loss: -0.625712. Value loss: 11.097292. Entropy: 0.3836

Iteration 14680: Policy loss: -0.700819. Value loss: 307.868988. Entropy: 0.398361.
Iteration 14681: Policy loss: -0.774871. Value loss: 186.943634. Entropy: 0.386894.
Iteration 14682: Policy loss: -1.225349. Value loss: 216.985321. Entropy: 0.392542.
Training network. lr: 0.000137. clip: 0.054998
Iteration 14683: Policy loss: -2.950744. Value loss: 198.861298. Entropy: 0.335366.
Iteration 14684: Policy loss: -2.918508. Value loss: 94.591606. Entropy: 0.326396.
Iteration 14685: Policy loss: -3.061625. Value loss: 79.262207. Entropy: 0.330240.
episode: 6063   score: 485.0  epsilon: 1.0    steps: 168  evaluation reward: 313.2
episode: 6064   score: 260.0  epsilon: 1.0    steps: 418  evaluation reward: 312.8
episode: 6065   score: 260.0  epsilon: 1.0    steps: 820  evaluation reward: 311.15
Training network. lr: 0.000137. clip: 0.054998
Iteration 14686: Policy loss: -0.286669. Value loss: 37.859467. Entropy: 0.411856.
Iteration 14687: Policy loss: -0.314150. Value loss: 22.968311. Entropy

Iteration 14743: Policy loss: 1.793031. Value loss: 33.695065. Entropy: 0.301057.
Iteration 14744: Policy loss: 1.837685. Value loss: 25.822229. Entropy: 0.325959.
Iteration 14745: Policy loss: 1.988296. Value loss: 18.814024. Entropy: 0.296347.
episode: 6087   score: 530.0  epsilon: 1.0    steps: 928  evaluation reward: 340.7
Training network. lr: 0.000137. clip: 0.054851
Iteration 14746: Policy loss: 0.806759. Value loss: 122.191475. Entropy: 0.250293.
Iteration 14747: Policy loss: 0.699934. Value loss: 35.925266. Entropy: 0.245431.
Iteration 14748: Policy loss: 1.210875. Value loss: 27.601877. Entropy: 0.249197.
Training network. lr: 0.000137. clip: 0.054851
Iteration 14749: Policy loss: 1.453062. Value loss: 38.167858. Entropy: 0.465938.
Iteration 14750: Policy loss: 1.584901. Value loss: 23.392206. Entropy: 0.451366.
Iteration 14751: Policy loss: 1.644988. Value loss: 17.251474. Entropy: 0.465463.
episode: 6088   score: 210.0  epsilon: 1.0    steps: 42  evaluation reward: 338.95
e

Iteration 14811: Policy loss: -0.039896. Value loss: 11.887508. Entropy: 0.486212.
episode: 6106   score: 390.0  epsilon: 1.0    steps: 906  evaluation reward: 338.2
Training network. lr: 0.000136. clip: 0.054537
Iteration 14812: Policy loss: 1.052391. Value loss: 33.891953. Entropy: 0.531203.
Iteration 14813: Policy loss: 0.874507. Value loss: 22.118767. Entropy: 0.508257.
Iteration 14814: Policy loss: 1.013022. Value loss: 18.318853. Entropy: 0.528210.
episode: 6107   score: 340.0  epsilon: 1.0    steps: 798  evaluation reward: 332.7
Training network. lr: 0.000136. clip: 0.054537
Iteration 14815: Policy loss: 0.559067. Value loss: 24.093740. Entropy: 0.517433.
Iteration 14816: Policy loss: 0.584302. Value loss: 11.923223. Entropy: 0.499693.
Iteration 14817: Policy loss: 0.638273. Value loss: 11.704606. Entropy: 0.499488.
episode: 6108   score: 250.0  epsilon: 1.0    steps: 268  evaluation reward: 331.05
episode: 6109   score: 260.0  epsilon: 1.0    steps: 627  evaluation reward: 328.

Iteration 14874: Policy loss: 3.194839. Value loss: 16.832150. Entropy: 0.410456.
episode: 6131   score: 155.0  epsilon: 1.0    steps: 351  evaluation reward: 335.7
Training network. lr: 0.000136. clip: 0.054390
Iteration 14875: Policy loss: 3.732513. Value loss: 40.798210. Entropy: 0.344825.
Iteration 14876: Policy loss: 3.626330. Value loss: 22.931454. Entropy: 0.377045.
Iteration 14877: Policy loss: 3.825872. Value loss: 18.130384. Entropy: 0.362835.
Training network. lr: 0.000136. clip: 0.054390
Iteration 14878: Policy loss: -1.331388. Value loss: 119.604500. Entropy: 0.369541.
Iteration 14879: Policy loss: -1.320592. Value loss: 72.093086. Entropy: 0.390536.
Iteration 14880: Policy loss: -1.454566. Value loss: 68.647919. Entropy: 0.368175.
episode: 6132   score: 180.0  epsilon: 1.0    steps: 397  evaluation reward: 333.1
episode: 6133   score: 260.0  epsilon: 1.0    steps: 1000  evaluation reward: 332.7
Training network. lr: 0.000136. clip: 0.054390
Iteration 14881: Policy loss: 2

Iteration 14936: Policy loss: 3.367829. Value loss: 16.832731. Entropy: 0.194340.
Iteration 14937: Policy loss: 3.494467. Value loss: 12.397567. Entropy: 0.229900.
Training network. lr: 0.000136. clip: 0.054233
Iteration 14938: Policy loss: 0.631321. Value loss: 37.304279. Entropy: 0.340723.
Iteration 14939: Policy loss: 0.650581. Value loss: 25.264151. Entropy: 0.337536.
Iteration 14940: Policy loss: 0.656619. Value loss: 19.638512. Entropy: 0.345058.
episode: 6156   score: 240.0  epsilon: 1.0    steps: 648  evaluation reward: 334.55
Training network. lr: 0.000136. clip: 0.054233
Iteration 14941: Policy loss: -3.043473. Value loss: 206.427551. Entropy: 0.480414.
Iteration 14942: Policy loss: -2.999434. Value loss: 107.659447. Entropy: 0.443811.
Iteration 14943: Policy loss: -3.171620. Value loss: 63.596661. Entropy: 0.430654.
episode: 6157   score: 260.0  epsilon: 1.0    steps: 347  evaluation reward: 332.1
episode: 6158   score: 410.0  epsilon: 1.0    steps: 618  evaluation reward: 3

Iteration 14999: Policy loss: 0.481873. Value loss: 13.198456. Entropy: 0.212045.
Iteration 15000: Policy loss: 0.601482. Value loss: 12.178910. Entropy: 0.223300.
episode: 6181   score: 210.0  epsilon: 1.0    steps: 416  evaluation reward: 312.55
Training network. lr: 0.000135. clip: 0.053929
Iteration 15001: Policy loss: 0.764728. Value loss: 33.129414. Entropy: 0.387307.
Iteration 15002: Policy loss: 0.554387. Value loss: 16.169422. Entropy: 0.393994.
Iteration 15003: Policy loss: 0.746967. Value loss: 14.900433. Entropy: 0.380377.
episode: 6182   score: 215.0  epsilon: 1.0    steps: 557  evaluation reward: 311.1
Training network. lr: 0.000135. clip: 0.053929
Iteration 15004: Policy loss: 0.818181. Value loss: 22.679392. Entropy: 0.350129.
Iteration 15005: Policy loss: 1.025180. Value loss: 14.395096. Entropy: 0.350536.
Iteration 15006: Policy loss: 0.452619. Value loss: 11.875287. Entropy: 0.339399.
episode: 6183   score: 285.0  epsilon: 1.0    steps: 67  evaluation reward: 306.3
T

Training network. lr: 0.000134. clip: 0.053773
Iteration 15061: Policy loss: 0.913162. Value loss: 40.959023. Entropy: 0.406971.
Iteration 15062: Policy loss: 0.630818. Value loss: 22.996719. Entropy: 0.432329.
Iteration 15063: Policy loss: 0.807782. Value loss: 19.405457. Entropy: 0.422063.
episode: 6206   score: 240.0  epsilon: 1.0    steps: 477  evaluation reward: 292.25
episode: 6207   score: 260.0  epsilon: 1.0    steps: 847  evaluation reward: 291.45
episode: 6208   score: 420.0  epsilon: 1.0    steps: 966  evaluation reward: 293.15
Training network. lr: 0.000134. clip: 0.053773
Iteration 15064: Policy loss: 1.020694. Value loss: 27.772072. Entropy: 0.409107.
Iteration 15065: Policy loss: 1.049446. Value loss: 16.633759. Entropy: 0.382153.
Iteration 15066: Policy loss: 0.912709. Value loss: 14.768144. Entropy: 0.393872.
episode: 6209   score: 585.0  epsilon: 1.0    steps: 354  evaluation reward: 296.4
Training network. lr: 0.000134. clip: 0.053773
Iteration 15067: Policy loss: -0

Iteration 15126: Policy loss: -1.359000. Value loss: 9.809820. Entropy: 0.451981.
episode: 6228   score: 250.0  epsilon: 1.0    steps: 256  evaluation reward: 298.4
episode: 6229   score: 260.0  epsilon: 1.0    steps: 464  evaluation reward: 298.9
episode: 6230   score: 260.0  epsilon: 1.0    steps: 613  evaluation reward: 296.15
episode: 6231   score: 210.0  epsilon: 1.0    steps: 684  evaluation reward: 296.7
Training network. lr: 0.000134. clip: 0.053616
Iteration 15127: Policy loss: 3.427373. Value loss: 27.316761. Entropy: 0.421318.
Iteration 15128: Policy loss: 3.004091. Value loss: 13.481393. Entropy: 0.397344.
Iteration 15129: Policy loss: 3.394981. Value loss: 11.996657. Entropy: 0.404032.
Training network. lr: 0.000134. clip: 0.053616
Iteration 15130: Policy loss: -0.401854. Value loss: 20.575052. Entropy: 0.477096.
Iteration 15131: Policy loss: -0.176020. Value loss: 11.365851. Entropy: 0.488136.
Iteration 15132: Policy loss: -0.329001. Value loss: 11.621424. Entropy: 0.4756

Iteration 15187: Policy loss: 0.530889. Value loss: 37.311756. Entropy: 0.535032.
Iteration 15188: Policy loss: 0.398243. Value loss: 22.982355. Entropy: 0.554684.
Iteration 15189: Policy loss: 0.425622. Value loss: 17.369209. Entropy: 0.522993.
episode: 6254   score: 210.0  epsilon: 1.0    steps: 388  evaluation reward: 300.85
Training network. lr: 0.000134. clip: 0.053468
Iteration 15190: Policy loss: -0.593709. Value loss: 23.762758. Entropy: 0.448690.
Iteration 15191: Policy loss: -0.946374. Value loss: 13.661889. Entropy: 0.467052.
Iteration 15192: Policy loss: -1.036475. Value loss: 13.373862. Entropy: 0.455904.
episode: 6255   score: 220.0  epsilon: 1.0    steps: 881  evaluation reward: 300.95
Training network. lr: 0.000134. clip: 0.053468
Iteration 15193: Policy loss: 1.384543. Value loss: 22.057804. Entropy: 0.387665.
Iteration 15194: Policy loss: 1.279642. Value loss: 12.941618. Entropy: 0.388184.
Iteration 15195: Policy loss: 1.357941. Value loss: 11.541144. Entropy: 0.39969

Iteration 15253: Policy loss: 1.229157. Value loss: 36.098679. Entropy: 0.550969.
Iteration 15254: Policy loss: 1.247334. Value loss: 22.304287. Entropy: 0.543376.
Iteration 15255: Policy loss: 1.207406. Value loss: 16.578377. Entropy: 0.534481.
episode: 6276   score: 475.0  epsilon: 1.0    steps: 48  evaluation reward: 310.55
episode: 6277   score: 180.0  epsilon: 1.0    steps: 282  evaluation reward: 309.75
episode: 6278   score: 240.0  epsilon: 1.0    steps: 425  evaluation reward: 310.05
Training network. lr: 0.000133. clip: 0.053155
Iteration 15256: Policy loss: -0.177517. Value loss: 21.327196. Entropy: 0.401317.
Iteration 15257: Policy loss: -0.199387. Value loss: 18.309290. Entropy: 0.385056.
Iteration 15258: Policy loss: -0.229306. Value loss: 13.208226. Entropy: 0.423549.
episode: 6279   score: 210.0  epsilon: 1.0    steps: 666  evaluation reward: 310.35
Training network. lr: 0.000133. clip: 0.053155
Iteration 15259: Policy loss: -0.379808. Value loss: 47.974743. Entropy: 0.3

Iteration 15316: Policy loss: 0.531619. Value loss: 36.574646. Entropy: 0.246437.
Iteration 15317: Policy loss: 0.601416. Value loss: 20.919786. Entropy: 0.250453.
Iteration 15318: Policy loss: 0.838251. Value loss: 19.872307. Entropy: 0.258465.
Training network. lr: 0.000133. clip: 0.053008
Iteration 15319: Policy loss: 1.472726. Value loss: 39.756203. Entropy: 0.468328.
Iteration 15320: Policy loss: 1.337312. Value loss: 22.448505. Entropy: 0.484312.
Iteration 15321: Policy loss: 1.337524. Value loss: 17.362906. Entropy: 0.483633.
episode: 6300   score: 270.0  epsilon: 1.0    steps: 429  evaluation reward: 322.5
Training network. lr: 0.000133. clip: 0.053008
Iteration 15322: Policy loss: -0.427304. Value loss: 40.990711. Entropy: 0.525369.
Iteration 15323: Policy loss: -0.490727. Value loss: 24.340193. Entropy: 0.577584.
Iteration 15324: Policy loss: -0.596509. Value loss: 21.369080. Entropy: 0.537916.
now time :  2019-02-25 23:26:14.860272
episode: 6301   score: 415.0  epsilon: 1.0 

In [None]:
torch.save(agent.policy_net, "./save_model/spaceinvaders_ppo")