# Deep Q-Learning 

For this assignment we will implement the Deep Q-Learning algorithm with Experience Replay as described in the breakthrough paper __"Playing Atari with Deep Reinforcement Learning"__. We will train an agent to play the famous game of __Breakout__.

In [1]:
import sys
import gym
import torch
import pylab
import random
import numpy as np
import time
from collections import deque
from datetime import datetime
from copy import deepcopy
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from utils import *
from agent import *
from model import *
from config import *
from env import GameEnv
%matplotlib inline
%load_ext autoreload
%autoreload 2

## Understanding the environment

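In the following cell, we initialise our game environments and you can see what the environment looks like. Note that the code below creates `num_envs` instances of __Space Invaders__ (`SpaceInvadersDeterministic-v4`) rather than __Breakout__. For further documentation of the environment refer to https://gym.openai.com/envs. 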

In [2]:
# Create `num_envs` GameEnv instances of the same game; the training cells
# below index into this list, one entry per environment.
envs = [GameEnv('SpaceInvadersDeterministic-v4') for _ in range(num_envs)]

  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "


In [3]:
# Learning-curve bookkeeping: the training cells append the mean evaluation
# reward to `rewards` and the matching episode counter to `episodes`.
rewards = []
episodes = []

# Basic properties of the game, read off the first environment.
action_size = envs[0].action_space.n
state_size = envs[0].observation_space.shape
number_lives = envs[0].life

## Creating a DQN Agent

Here we create a DQN Agent. This agent is defined in __agent.py__. The corresponding neural network is defined in __model.py__. 

__Evaluation Reward__ : The average reward received in the past 100 episodes/games.

__Frame__ : Number of frames processed in total.

__Memory Size__ : The current size of the replay memory.

In [4]:
# Counters and limits shared with the training cells.
reset_max = 10
memory_size = 0
frame = 0

# Bounded window of recent episode scores; its mean is the "evaluation reward".
evaluation_reward = deque(maxlen=evaluation_reward_length)

# Build the agent and write its untrained weights as the first "best"
# checkpoint, so that file exists before any improvement is recorded.
agent = Agent(action_size)
torch.save(agent.policy_net.state_dict(), "./save_model/spaceinvaders_ppo_best")

### Main Training Loop

In [2]:
# Training driver for the environments in `envs`.
#
# Each pass of the outer `while` loop:
#   1. steps every environment `env_mem_size` times, pushing one transition
#      per step into `agent.memory`;
#   2. records, per environment, the value `agent.get_action` returns for the
#      frame stack that follows the last collected step (`frame_next_vals`);
#   3. calls `agent.train_policy_net` and `agent.update_target_net`.
# Episode ends are handled inline: the score is logged, a learning-curve point
# and checkpoint are written every 50 episodes, and the environment is reset.
vis_env_idx = 0                 # index of the environment that gets rendered
vis_env = envs[vis_env_idx]
e = 0                           # number of finished episodes
frame = 0                       # total environment steps taken so far
max_eval = -np.inf              # best mean evaluation reward seen so far
reset_count = 0                 # checkpoints without improvement (counted once e > 5000)

# The rollout size comes from configuration and does not change while
# training, so validate it once instead of on every iteration.
assert(num_envs * env_mem_size == train_frame)

while (frame < 10000000):
    step = 0
    frame_next_vals = []
    for i in range(num_envs):
        env = envs[i]
        for j in range(env_mem_size):
            step += 1
            frame += 1

            # Newest frame of the current stack is what gets stored in memory;
            # the network sees the first HISTORY_SIZE frames scaled by 1/255.
            curr_state = env.history[HISTORY_SIZE-1,:,:]
            action, value = agent.get_action(np.float32(env.history[:HISTORY_SIZE,:,:]) / 255.)

            next_state, env.reward, env.done, env.info = env.step(action)

            if (i == vis_env_idx):
                vis_env._env.render()

            # Write the processed new frame into the spare last slot so that
            # history[1:] is the stack for the next step.
            frame_next_state = get_frame(next_state)
            env.history[HISTORY_SIZE,:,:] = frame_next_state
            terminal_state = check_live(env.life, env.info['ale.lives'])

            env.life = env.info['ale.lives']
            r = env.reward

            agent.memory.push(i, deepcopy(curr_state), action, r, terminal_state, value, 0, 0)
            if (j == env_mem_size-1):
                # Value of the state after the last collected step; must be
                # taken before the history is shifted below.
                _, frame_next_val = agent.get_action(np.float32(env.history[1:,:,:]) / 255.)
                frame_next_vals.append(frame_next_val)
            env.score += r
            env.history[:HISTORY_SIZE, :, :] = env.history[1:,:,:]

            if (env.done):
                # Record the score first: the periodic bookkeeping below then
                # never averages an empty window (np.mean of an empty deque is
                # nan with a RuntimeWarning) and always includes the episode
                # that just finished.
                evaluation_reward.append(env.score)

                if (e % 50 == 0):
                    print('now time : ', datetime.now())
                    rewards.append(np.mean(evaluation_reward))
                    episodes.append(e)
                    pylab.plot(episodes, rewards, 'b')
                    pylab.savefig("./save_graph/spaceinvaders_ppo.png")
                    torch.save(agent.policy_net, "./save_model/spaceinvaders_ppo")

                    if np.mean(evaluation_reward) > max_eval:
                        torch.save(agent.policy_net.state_dict(), "./save_model/spaceinvaders_ppo_best")
                        max_eval = float(np.mean(evaluation_reward))
                        reset_count = 0
                    elif e > 5000:
                        # Only counted for now: the rollback to the best
                        # checkpoint after `reset_max` stale checkpoints is
                        # currently disabled.
                        reset_count += 1
                e += 1
                print("episode:", e, "  score:", env.score,  " epsilon:", agent.epsilon, "   steps:", step,
                  " evaluation reward:", np.mean(evaluation_reward))

                # Start a fresh episode in this environment.
                env.done = False
                env.score = 0
                env.history = np.zeros([HISTORY_SIZE+1,84,84], dtype=np.uint8)
                env.state = env.reset()
                env.life = number_lives
                get_init_state(env.history, env.state)

    agent.train_policy_net(frame, frame_next_vals)
    agent.update_target_net()


NameError: name 'envs' is not defined

In [None]:
# Save the entire policy network object (not only its state_dict) to disk.
torch.save(agent.policy_net, "./save_model/spaceinvaders_ppo")

In [None]:
### Loop through all environments and run PPO on them
#
# For every game in `env_names` a fresh set of `num_envs` environments and a
# fresh agent are created. Training then alternates between
#   1. a rollout of `env_mem_size` batched steps (all environments are
#      evaluated by the network in one call and stepped one after another),
#   2. one bootstrap evaluation of the states that follow the rollout, and
#   3. `agent.train_policy_net` / `agent.update_target_net`.
# Rewards are rescaled by the game's maximum reward before being stored;
# the logged episode score stays unscaled.

env_names = ['SpaceInvaders-v0', 'MsPacman-v0', 'Asteroids-v0', 'Asterix-v0', 'Atlantis-v0', 'Alien-v0', 'Amidar-v0', 'Assault-v0', 'BankHeist-v0']

# The rollout size comes from configuration and is the same for every game,
# so validate it once instead of on every training iteration.
assert(num_envs * env_mem_size == train_frame)

for name in env_names:
    print("\n\n\n ------- STARTING TRAINING FOR %s ------- \n\n\n" % (name))

    envs = [GameEnv(name) for _ in range(num_envs)]

    number_lives = envs[0].life
    state_size = envs[0].observation_space.shape
    if (name == 'SpaceInvaders-v0'):
        # Space Invaders is trained with a hard-coded action count of 4.
        action_size = 4
    else:
        action_size = envs[0].action_space.n
    rewards, episodes = [], []

    vis_env_idx = 0             # index of the environment that gets rendered
    vis_env = envs[vis_env_idx]
    e = 0                       # number of finished episodes
    frame = 0                   # total environment steps taken so far
    max_eval = -np.inf          # best mean evaluation reward seen so far
    reset_count = 0             # checkpoints without improvement (counted once e > 5000)

    agent = Agent(action_size)
    # Seed the "best" checkpoint with the untrained weights. This uses the
    # same file name that later improvements are written to, so a single
    # per-game best file is maintained.
    torch.save(agent.policy_net.state_dict(), "./save_model/" + name + "_ppo_best")
    evaluation_reward = deque(maxlen=evaluation_reward_length)
    memory_size = 0
    reset_max = 10

    print("Determing min/max rewards of environment")
    [low, high] = score_range = get_score_range(name)
    print("Min: %d. Max: %d." % (low, high))

    while (frame < 10000000):
        step = 0
        frame_next_vals = []

        for j in range(env_mem_size):

            # Snapshot the newest frame of every environment (stored in
            # memory) and the HISTORY_SIZE-frame stacks fed to the network.
            curr_states = np.stack([envs[i].history[HISTORY_SIZE-1,:,:] for i in range(num_envs)])
            net_in = np.stack([envs[i].history[:HISTORY_SIZE,:,:] for i in range(num_envs)])
            step += num_envs
            frame += num_envs
            actions, values = agent.get_action(np.float32(net_in) / 255.)

            # Step every environment and write its processed new frame into
            # the spare last history slot, so history[1:] is the next stack.
            for i in range(num_envs):
                env = envs[i]
                next_state, env.reward, env.done, env.info = env.step(actions[i])
                if (i == vis_env_idx):
                    vis_env._env.render()
                env.history[HISTORY_SIZE,:,:] = get_frame(next_state)

            # Bootstrap values for the states that follow the rollout. This
            # has to run once, after every environment has its new frame and
            # before any history is shifted or reset below; evaluating it
            # inside the per-environment loop would read already-shifted
            # histories for all but one environment.
            if (j == env_mem_size-1):
                net_in = np.stack([envs[k].history[1:,:,:] for k in range(num_envs)])
                _, frame_next_vals = agent.get_action(np.float32(net_in) / 255.)

            for i in range(num_envs):
                env = envs[i]

                terminal_state = check_live(env.life, env.info['ale.lives'])
                env.life = env.info['ale.lives']
                # NOTE(review): divides by `high`; assumes get_score_range
                # never reports a maximum of 0 - confirm.
                r = (env.reward / high) * 20.0
                agent.memory.push(i, deepcopy(curr_states[i]), actions[i], r, terminal_state, values[i], 0, 0)

                env.score += env.reward
                env.history[:HISTORY_SIZE, :, :] = env.history[1:,:,:]

                if (env.done):
                    # Record the score first: the periodic bookkeeping below
                    # then never averages an empty window (np.mean of an
                    # empty deque is nan with a RuntimeWarning) and always
                    # includes the episode that just finished.
                    evaluation_reward.append(env.score)

                    if (e % 50 == 0):
                        print('now time : ', datetime.now())
                        rewards.append(np.mean(evaluation_reward))
                        episodes.append(e)
                        pylab.plot(episodes, rewards, 'b')
                        pylab.savefig("./save_graph/" + name + "_ppo.png")
                        torch.save(agent.policy_net, "./save_model/" + name + "_ppo")

                        if np.mean(evaluation_reward) > max_eval:
                            torch.save(agent.policy_net.state_dict(), "./save_model/"  + name + "_ppo_best")
                            max_eval = float(np.mean(evaluation_reward))
                            reset_count = 0
                        elif e > 5000:
                            # Only counted for now: the rollback to the best
                            # checkpoint after `reset_max` stale checkpoints
                            # is currently disabled.
                            reset_count += 1
                    e += 1
                    print("episode:", e, "  score:", env.score,  " epsilon:", agent.epsilon, "   steps:", step,
                      " evaluation reward:", np.mean(evaluation_reward))

                    # Start a fresh episode in this environment.
                    env.done = False
                    env.score = 0
                    env.history = np.zeros([HISTORY_SIZE+1,84,84], dtype=np.uint8)
                    env.state = env.reset()
                    env.life = number_lives
                    get_init_state(env.history, env.state)

        agent.train_policy_net(frame, frame_next_vals)
        agent.update_target_net()
    print("FINISHED TRAINING FOR %s" % (name))
    # Open a new figure so the next game's learning curve is drawn separately.
    pylab.figure()




 ------- STARTING TRAINING FOR SpaceInvaders-v0 ------- 





  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "


Determing min/max rewards of environment
Min: 0. Max: 200.


  probs = F.softmax(x[:,:self.action_size] - torch.max(x[:,:self.action_size],1)[0].unsqueeze(1))


Training network. lr: 0.000250. clip: 0.100000


  pol_loss += pol_avg.detach().cpu()[0]
  vf_loss += value_loss.detach().cpu()[0]
  ent_total += ent.detach().cpu()[0]


Iteration 1: Policy loss: 0.003325. Value loss: 0.055819. Entropy: 1.384684.
Iteration 2: Policy loss: 0.000642. Value loss: 0.055059. Entropy: 1.385803.
Iteration 3: Policy loss: 0.000076. Value loss: 0.052915. Entropy: 1.385432.
Training network. lr: 0.000250. clip: 0.100000
Iteration 4: Policy loss: -0.001717. Value loss: 0.169128. Entropy: 1.383981.
Iteration 5: Policy loss: -0.007052. Value loss: 0.151069. Entropy: 1.383703.
Iteration 6: Policy loss: -0.007076. Value loss: 0.143791. Entropy: 1.384128.
Training network. lr: 0.000250. clip: 0.100000
Iteration 7: Policy loss: -0.000305. Value loss: 0.040818. Entropy: 1.383972.
Iteration 8: Policy loss: -0.003539. Value loss: 0.035559. Entropy: 1.381132.
Iteration 9: Policy loss: -0.003339. Value loss: 0.032229. Entropy: 1.380908.
now time :  2019-03-05 20:37:50.662255
episode: 1   score: 15.0  epsilon: 1.0    steps: 392  evaluation reward: 15.0


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


episode: 2   score: 35.0  epsilon: 1.0    steps: 936  evaluation reward: 25.0
Training network. lr: 0.000250. clip: 0.100000
Iteration 10: Policy loss: 0.001400. Value loss: 0.097520. Entropy: 1.384594.
Iteration 11: Policy loss: 0.001532. Value loss: 0.078596. Entropy: 1.383772.
Iteration 12: Policy loss: 0.002312. Value loss: 0.069231. Entropy: 1.383800.
episode: 3   score: 90.0  epsilon: 1.0    steps: 704  evaluation reward: 46.666666666666664
Training network. lr: 0.000250. clip: 0.100000
Iteration 13: Policy loss: 0.001533. Value loss: 0.103985. Entropy: 1.376456.
Iteration 14: Policy loss: -0.002323. Value loss: 0.083894. Entropy: 1.381140.
Iteration 15: Policy loss: -0.003166. Value loss: 0.069195. Entropy: 1.379613.
episode: 4   score: 50.0  epsilon: 1.0    steps: 64  evaluation reward: 47.5
episode: 5   score: 110.0  epsilon: 1.0    steps: 200  evaluation reward: 60.0
Training network. lr: 0.000250. clip: 0.100000
Iteration 16: Policy loss: 0.000547. Value loss: 0.051954. Entr

Training network. lr: 0.000250. clip: 0.099853
Iteration 70: Policy loss: 0.001002. Value loss: 0.148032. Entropy: 1.361147.
Iteration 71: Policy loss: -0.003972. Value loss: 0.094081. Entropy: 1.368483.
Iteration 72: Policy loss: -0.006332. Value loss: 0.078850. Entropy: 1.368646.
Training network. lr: 0.000250. clip: 0.099853
Iteration 73: Policy loss: -0.000902. Value loss: 0.095616. Entropy: 1.370766.
Iteration 74: Policy loss: -0.005280. Value loss: 0.060505. Entropy: 1.369109.
Iteration 75: Policy loss: -0.006371. Value loss: 0.050706. Entropy: 1.370824.
episode: 31   score: 110.0  epsilon: 1.0    steps: 904  evaluation reward: 140.32258064516128
Training network. lr: 0.000250. clip: 0.099853
Iteration 76: Policy loss: -0.000645. Value loss: 0.077667. Entropy: 1.373059.
Iteration 77: Policy loss: -0.005975. Value loss: 0.050389. Entropy: 1.370381.
Iteration 78: Policy loss: -0.009066. Value loss: 0.042009. Entropy: 1.370367.
episode: 32   score: 365.0  epsilon: 1.0    steps: 160 

Iteration 131: Policy loss: -0.002750. Value loss: 0.077096. Entropy: 1.268812.
Iteration 132: Policy loss: -0.007130. Value loss: 0.061391. Entropy: 1.274400.
episode: 57   score: 135.0  epsilon: 1.0    steps: 824  evaluation reward: 162.98245614035088
Training network. lr: 0.000249. clip: 0.099696
Iteration 133: Policy loss: 0.001886. Value loss: 0.418334. Entropy: 1.263659.
Iteration 134: Policy loss: -0.000357. Value loss: 0.316190. Entropy: 1.256662.
Iteration 135: Policy loss: 0.001945. Value loss: 0.250664. Entropy: 1.247284.
episode: 58   score: 450.0  epsilon: 1.0    steps: 72  evaluation reward: 167.93103448275863
episode: 59   score: 80.0  epsilon: 1.0    steps: 184  evaluation reward: 166.4406779661017
episode: 60   score: 180.0  epsilon: 1.0    steps: 464  evaluation reward: 166.66666666666666
Training network. lr: 0.000249. clip: 0.099696
Iteration 136: Policy loss: 0.002627. Value loss: 0.056110. Entropy: 1.266888.
Iteration 137: Policy loss: -0.003255. Value loss: 0.046

episode: 86   score: 105.0  epsilon: 1.0    steps: 528  evaluation reward: 169.47674418604652
Training network. lr: 0.000249. clip: 0.099548
Iteration 190: Policy loss: -0.001025. Value loss: 0.146729. Entropy: 1.351293.
Iteration 191: Policy loss: -0.007082. Value loss: 0.097168. Entropy: 1.352753.
Iteration 192: Policy loss: -0.008029. Value loss: 0.076757. Entropy: 1.354181.
episode: 87   score: 80.0  epsilon: 1.0    steps: 976  evaluation reward: 168.44827586206895
Training network. lr: 0.000249. clip: 0.099548
Iteration 193: Policy loss: -0.001635. Value loss: 0.146331. Entropy: 1.352994.
Iteration 194: Policy loss: -0.004663. Value loss: 0.084312. Entropy: 1.350562.
Iteration 195: Policy loss: -0.004627. Value loss: 0.069085. Entropy: 1.353925.
Training network. lr: 0.000249. clip: 0.099548
Iteration 196: Policy loss: 0.000368. Value loss: 0.065170. Entropy: 1.335675.
Iteration 197: Policy loss: -0.002858. Value loss: 0.040822. Entropy: 1.334900.
Iteration 198: Policy loss: -0.00

Training network. lr: 0.000248. clip: 0.099392
Iteration 250: Policy loss: 0.000898. Value loss: 0.075230. Entropy: 1.314022.
Iteration 251: Policy loss: -0.005295. Value loss: 0.052270. Entropy: 1.298672.
Iteration 252: Policy loss: -0.004634. Value loss: 0.047832. Entropy: 1.300911.
episode: 114   score: 110.0  epsilon: 1.0    steps: 200  evaluation reward: 166.25
episode: 115   score: 155.0  epsilon: 1.0    steps: 304  evaluation reward: 167.0
episode: 116   score: 210.0  epsilon: 1.0    steps: 512  evaluation reward: 167.9
Training network. lr: 0.000248. clip: 0.099235
Iteration 253: Policy loss: 0.001132. Value loss: 0.186973. Entropy: 1.312203.
Iteration 254: Policy loss: -0.003245. Value loss: 0.159428. Entropy: 1.311417.
Iteration 255: Policy loss: -0.007871. Value loss: 0.144726. Entropy: 1.301140.
episode: 117   score: 380.0  epsilon: 1.0    steps: 288  evaluation reward: 170.45
Training network. lr: 0.000248. clip: 0.099235
Iteration 256: Policy loss: 0.002967. Value loss: 0

Iteration 312: Policy loss: -0.007241. Value loss: 0.032642. Entropy: 1.282560.
episode: 142   score: 135.0  epsilon: 1.0    steps: 904  evaluation reward: 167.5
episode: 143   score: 210.0  epsilon: 1.0    steps: 944  evaluation reward: 168.25
Training network. lr: 0.000248. clip: 0.099088
Iteration 313: Policy loss: 0.000387. Value loss: 0.114717. Entropy: 1.279324.
Iteration 314: Policy loss: -0.003697. Value loss: 0.066647. Entropy: 1.271281.
Iteration 315: Policy loss: -0.002124. Value loss: 0.060093. Entropy: 1.287341.
episode: 144   score: 155.0  epsilon: 1.0    steps: 256  evaluation reward: 168.75
Training network. lr: 0.000248. clip: 0.099088
Iteration 316: Policy loss: -0.000696. Value loss: 0.080340. Entropy: 1.266841.
Iteration 317: Policy loss: -0.004590. Value loss: 0.059151. Entropy: 1.283183.
Iteration 318: Policy loss: -0.006688. Value loss: 0.045180. Entropy: 1.283178.
episode: 145   score: 210.0  epsilon: 1.0    steps: 584  evaluation reward: 169.8
Training network.

episode: 168   score: 210.0  epsilon: 1.0    steps: 272  evaluation reward: 173.35
episode: 169   score: 440.0  epsilon: 1.0    steps: 712  evaluation reward: 176.7
episode: 170   score: 105.0  epsilon: 1.0    steps: 920  evaluation reward: 177.25
Training network. lr: 0.000247. clip: 0.098931
Iteration 376: Policy loss: -0.002569. Value loss: 0.090407. Entropy: 1.303210.
Iteration 377: Policy loss: -0.008632. Value loss: 0.060830. Entropy: 1.305787.
Iteration 378: Policy loss: -0.010821. Value loss: 0.050799. Entropy: 1.294593.
episode: 171   score: 225.0  epsilon: 1.0    steps: 432  evaluation reward: 175.4
episode: 172   score: 565.0  epsilon: 1.0    steps: 976  evaluation reward: 180.85
Training network. lr: 0.000247. clip: 0.098931
Iteration 379: Policy loss: -0.000376. Value loss: 0.347666. Entropy: 1.260325.
Iteration 380: Policy loss: -0.002567. Value loss: 0.234665. Entropy: 1.257815.
Iteration 381: Policy loss: -0.001534. Value loss: 0.165941. Entropy: 1.268073.
Training netw

episode: 196   score: 260.0  epsilon: 1.0    steps: 696  evaluation reward: 196.65
Training network. lr: 0.000247. clip: 0.098774
Iteration 439: Policy loss: 0.001556. Value loss: 0.108831. Entropy: 1.306086.
Iteration 440: Policy loss: -0.003110. Value loss: 0.066066. Entropy: 1.306202.
Iteration 441: Policy loss: -0.003617. Value loss: 0.052625. Entropy: 1.310836.
episode: 197   score: 105.0  epsilon: 1.0    steps: 840  evaluation reward: 196.95
Training network. lr: 0.000247. clip: 0.098774
Iteration 442: Policy loss: 0.003086. Value loss: 0.372858. Entropy: 1.311205.
Iteration 443: Policy loss: -0.001777. Value loss: 0.242437. Entropy: 1.309332.
Iteration 444: Policy loss: -0.003246. Value loss: 0.220987. Entropy: 1.308962.
episode: 198   score: 240.0  epsilon: 1.0    steps: 160  evaluation reward: 198.3
episode: 199   score: 135.0  epsilon: 1.0    steps: 440  evaluation reward: 198.1
Training network. lr: 0.000247. clip: 0.098774
Iteration 445: Policy loss: 0.001413. Value loss: 0

Training network. lr: 0.000246. clip: 0.098470
Iteration 502: Policy loss: -0.000652. Value loss: 0.135542. Entropy: 1.222241.
Iteration 503: Policy loss: -0.002986. Value loss: 0.077221. Entropy: 1.223629.
Iteration 504: Policy loss: -0.004082. Value loss: 0.059247. Entropy: 1.216679.
episode: 223   score: 425.0  epsilon: 1.0    steps: 264  evaluation reward: 215.75
episode: 224   score: 120.0  epsilon: 1.0    steps: 648  evaluation reward: 215.4
Training network. lr: 0.000246. clip: 0.098470
Iteration 505: Policy loss: -0.001238. Value loss: 0.136519. Entropy: 1.230127.
Iteration 506: Policy loss: -0.005739. Value loss: 0.091198. Entropy: 1.227875.
Iteration 507: Policy loss: -0.007170. Value loss: 0.068841. Entropy: 1.221119.
Training network. lr: 0.000246. clip: 0.098470
Iteration 508: Policy loss: 0.001093. Value loss: 0.113624. Entropy: 1.285091.
Iteration 509: Policy loss: -0.005762. Value loss: 0.053346. Entropy: 1.291852.
Iteration 510: Policy loss: -0.009476. Value loss: 0.03

Training network. lr: 0.000246. clip: 0.098313
Iteration 568: Policy loss: 0.001487. Value loss: 0.079385. Entropy: 1.225480.
Iteration 569: Policy loss: -0.003779. Value loss: 0.035220. Entropy: 1.224988.
Iteration 570: Policy loss: -0.008255. Value loss: 0.025994. Entropy: 1.228434.
episode: 247   score: 275.0  epsilon: 1.0    steps: 272  evaluation reward: 226.5
episode: 248   score: 215.0  epsilon: 1.0    steps: 744  evaluation reward: 224.85
Training network. lr: 0.000246. clip: 0.098313
Iteration 571: Policy loss: -0.000017. Value loss: 0.094273. Entropy: 1.217143.
Iteration 572: Policy loss: -0.004691. Value loss: 0.048362. Entropy: 1.221641.
Iteration 573: Policy loss: -0.009269. Value loss: 0.038030. Entropy: 1.201267.
episode: 249   score: 180.0  epsilon: 1.0    steps: 280  evaluation reward: 224.55
episode: 250   score: 190.0  epsilon: 1.0    steps: 368  evaluation reward: 224.85
now time :  2019-03-05 20:45:29.440083
episode: 251   score: 210.0  epsilon: 1.0    steps: 840  

Iteration 629: Policy loss: -0.001523. Value loss: 0.199879. Entropy: 1.257846.
Iteration 630: Policy loss: -0.004320. Value loss: 0.157732. Entropy: 1.252407.
episode: 275   score: 460.0  epsilon: 1.0    steps: 648  evaluation reward: 219.55
Training network. lr: 0.000245. clip: 0.098166
Iteration 631: Policy loss: 0.000861. Value loss: 0.164990. Entropy: 1.216887.
Iteration 632: Policy loss: -0.008434. Value loss: 0.069466. Entropy: 1.212931.
Iteration 633: Policy loss: -0.013341. Value loss: 0.043768. Entropy: 1.216226.
episode: 276   score: 30.0  epsilon: 1.0    steps: 144  evaluation reward: 217.75
episode: 277   score: 445.0  epsilon: 1.0    steps: 912  evaluation reward: 220.35
Training network. lr: 0.000245. clip: 0.098166
Iteration 634: Policy loss: 0.002732. Value loss: 0.164690. Entropy: 1.225545.
Iteration 635: Policy loss: -0.004140. Value loss: 0.071386. Entropy: 1.221117.
Iteration 636: Policy loss: -0.006958. Value loss: 0.052519. Entropy: 1.201172.
episode: 278   score

Iteration 692: Policy loss: -0.003062. Value loss: 0.037905. Entropy: 1.195698.
Iteration 693: Policy loss: -0.005592. Value loss: 0.030347. Entropy: 1.196777.
episode: 302   score: 120.0  epsilon: 1.0    steps: 168  evaluation reward: 216.15
Training network. lr: 0.000245. clip: 0.098009
Iteration 694: Policy loss: 0.000170. Value loss: 0.072833. Entropy: 1.221166.
Iteration 695: Policy loss: -0.007659. Value loss: 0.034926. Entropy: 1.208591.
Iteration 696: Policy loss: -0.012764. Value loss: 0.026734. Entropy: 1.213532.
episode: 303   score: 210.0  epsilon: 1.0    steps: 72  evaluation reward: 216.45
episode: 304   score: 210.0  epsilon: 1.0    steps: 776  evaluation reward: 217.4
Training network. lr: 0.000245. clip: 0.098009
Iteration 697: Policy loss: 0.001821. Value loss: 0.325926. Entropy: 1.217295.
Iteration 698: Policy loss: -0.004021. Value loss: 0.182940. Entropy: 1.219706.
Iteration 699: Policy loss: -0.002166. Value loss: 0.131360. Entropy: 1.232653.
Training network. lr:

Training network. lr: 0.000244. clip: 0.097705
Iteration 754: Policy loss: 0.000770. Value loss: 0.080075. Entropy: 1.222003.
Iteration 755: Policy loss: -0.004833. Value loss: 0.042433. Entropy: 1.227309.
Iteration 756: Policy loss: -0.006231. Value loss: 0.035854. Entropy: 1.223278.
episode: 331   score: 410.0  epsilon: 1.0    steps: 368  evaluation reward: 211.4
Training network. lr: 0.000244. clip: 0.097705
Iteration 757: Policy loss: 0.001093. Value loss: 0.079757. Entropy: 1.219032.
Iteration 758: Policy loss: -0.002940. Value loss: 0.038533. Entropy: 1.213712.
Iteration 759: Policy loss: -0.006674. Value loss: 0.027098. Entropy: 1.216930.
episode: 332   score: 110.0  epsilon: 1.0    steps: 312  evaluation reward: 209.15
Training network. lr: 0.000244. clip: 0.097705
Iteration 760: Policy loss: -0.001567. Value loss: 0.127015. Entropy: 1.209398.
Iteration 761: Policy loss: -0.006369. Value loss: 0.075595. Entropy: 1.218718.
Iteration 762: Policy loss: -0.009847. Value loss: 0.059

Iteration 817: Policy loss: 0.002811. Value loss: 0.082373. Entropy: 1.261095.
Iteration 818: Policy loss: -0.003981. Value loss: 0.032095. Entropy: 1.263041.
Iteration 819: Policy loss: -0.009026. Value loss: 0.024464. Entropy: 1.259554.
Training network. lr: 0.000244. clip: 0.097549
Iteration 820: Policy loss: 0.004579. Value loss: 0.061955. Entropy: 1.236639.
Iteration 821: Policy loss: -0.005278. Value loss: 0.027717. Entropy: 1.247339.
Iteration 822: Policy loss: -0.010294. Value loss: 0.020563. Entropy: 1.234986.
episode: 357   score: 210.0  epsilon: 1.0    steps: 40  evaluation reward: 202.6
episode: 358   score: 230.0  epsilon: 1.0    steps: 176  evaluation reward: 203.85
episode: 359   score: 210.0  epsilon: 1.0    steps: 216  evaluation reward: 205.45
episode: 360   score: 30.0  epsilon: 1.0    steps: 424  evaluation reward: 203.6
Training network. lr: 0.000244. clip: 0.097549
Iteration 823: Policy loss: -0.000537. Value loss: 0.081782. Entropy: 1.285682.
Iteration 824: Polic

Iteration 880: Policy loss: 0.001221. Value loss: 0.087630. Entropy: 1.195523.
Iteration 881: Policy loss: -0.008601. Value loss: 0.036024. Entropy: 1.190101.
Iteration 882: Policy loss: -0.013167. Value loss: 0.029096. Entropy: 1.193525.
episode: 385   score: 180.0  epsilon: 1.0    steps: 960  evaluation reward: 195.55
Training network. lr: 0.000243. clip: 0.097392
Iteration 883: Policy loss: 0.002325. Value loss: 0.060416. Entropy: 1.230403.
Iteration 884: Policy loss: -0.005124. Value loss: 0.030559. Entropy: 1.221910.
Iteration 885: Policy loss: -0.009329. Value loss: 0.023392. Entropy: 1.221823.
episode: 386   score: 80.0  epsilon: 1.0    steps: 632  evaluation reward: 195.15
Training network. lr: 0.000243. clip: 0.097392
Iteration 886: Policy loss: 0.001930. Value loss: 0.090619. Entropy: 1.218400.
Iteration 887: Policy loss: -0.006358. Value loss: 0.042938. Entropy: 1.222094.
Iteration 888: Policy loss: -0.008391. Value loss: 0.033095. Entropy: 1.221150.
episode: 387   score: 30

Training network. lr: 0.000243. clip: 0.097244
Iteration 946: Policy loss: 0.002188. Value loss: 0.087504. Entropy: 1.187403.
Iteration 947: Policy loss: -0.003565. Value loss: 0.059843. Entropy: 1.196118.
Iteration 948: Policy loss: -0.006108. Value loss: 0.048916. Entropy: 1.193494.
episode: 410   score: 180.0  epsilon: 1.0    steps: 760  evaluation reward: 196.15
Training network. lr: 0.000243. clip: 0.097244
Iteration 949: Policy loss: 0.003126. Value loss: 0.118596. Entropy: 1.227560.
Iteration 950: Policy loss: -0.004011. Value loss: 0.060034. Entropy: 1.238857.
Iteration 951: Policy loss: -0.008421. Value loss: 0.039207. Entropy: 1.244325.
Training network. lr: 0.000243. clip: 0.097088
Iteration 952: Policy loss: -0.000331. Value loss: 0.324616. Entropy: 1.211225.
Iteration 953: Policy loss: -0.003439. Value loss: 0.227715. Entropy: 1.215947.
Iteration 954: Policy loss: -0.006166. Value loss: 0.144409. Entropy: 1.209860.
episode: 411   score: 105.0  epsilon: 1.0    steps: 480  e

Iteration 1010: Policy loss: -0.005358. Value loss: 0.061415. Entropy: 1.230367.
Iteration 1011: Policy loss: -0.008876. Value loss: 0.046228. Entropy: 1.229340.
episode: 435   score: 50.0  epsilon: 1.0    steps: 664  evaluation reward: 198.15
episode: 436   score: 155.0  epsilon: 1.0    steps: 912  evaluation reward: 196.95
Training network. lr: 0.000242. clip: 0.096931
Iteration 1012: Policy loss: 0.000457. Value loss: 0.077830. Entropy: 1.210454.
Iteration 1013: Policy loss: -0.006785. Value loss: 0.033812. Entropy: 1.204565.
Iteration 1014: Policy loss: -0.012863. Value loss: 0.024026. Entropy: 1.211580.
episode: 437   score: 210.0  epsilon: 1.0    steps: 8  evaluation reward: 197.95
episode: 438   score: 365.0  epsilon: 1.0    steps: 216  evaluation reward: 199.5
episode: 439   score: 125.0  epsilon: 1.0    steps: 448  evaluation reward: 199.55
episode: 440   score: 275.0  epsilon: 1.0    steps: 968  evaluation reward: 201.55
Training network. lr: 0.000242. clip: 0.096931
Iteratio

Iteration 1073: Policy loss: -0.008998. Value loss: 0.051693. Entropy: 1.185041.
Iteration 1074: Policy loss: -0.008935. Value loss: 0.041316. Entropy: 1.181098.
Training network. lr: 0.000242. clip: 0.096784
Iteration 1075: Policy loss: -0.000050. Value loss: 0.075889. Entropy: 1.218275.
Iteration 1076: Policy loss: -0.002852. Value loss: 0.047769. Entropy: 1.205565.
Iteration 1077: Policy loss: -0.009045. Value loss: 0.036342. Entropy: 1.206209.
episode: 461   score: 395.0  epsilon: 1.0    steps: 512  evaluation reward: 212.55
Training network. lr: 0.000242. clip: 0.096784
Iteration 1078: Policy loss: 0.000185. Value loss: 0.061145. Entropy: 1.234867.
Iteration 1079: Policy loss: -0.008772. Value loss: 0.033561. Entropy: 1.230593.
Iteration 1080: Policy loss: -0.012422. Value loss: 0.027911. Entropy: 1.224855.
episode: 462   score: 265.0  epsilon: 1.0    steps: 120  evaluation reward: 213.1
episode: 463   score: 260.0  epsilon: 1.0    steps: 392  evaluation reward: 214.15
Training ne

Training network. lr: 0.000242. clip: 0.096627
Iteration 1138: Policy loss: 0.000389. Value loss: 0.081194. Entropy: 1.143789.
Iteration 1139: Policy loss: -0.009600. Value loss: 0.032320. Entropy: 1.158314.
Iteration 1140: Policy loss: -0.014738. Value loss: 0.024332. Entropy: 1.151614.
episode: 485   score: 155.0  epsilon: 1.0    steps: 168  evaluation reward: 233.7
episode: 486   score: 300.0  epsilon: 1.0    steps: 576  evaluation reward: 235.9
episode: 487   score: 225.0  epsilon: 1.0    steps: 872  evaluation reward: 235.15
Training network. lr: 0.000242. clip: 0.096627
Iteration 1141: Policy loss: 0.002890. Value loss: 0.106925. Entropy: 1.182673.
Iteration 1142: Policy loss: -0.001464. Value loss: 0.056798. Entropy: 1.186320.
Iteration 1143: Policy loss: -0.006994. Value loss: 0.041534. Entropy: 1.186839.
episode: 488   score: 210.0  epsilon: 1.0    steps: 896  evaluation reward: 236.15
Training network. lr: 0.000242. clip: 0.096627
Iteration 1144: Policy loss: -0.001248. Value

Iteration 1201: Policy loss: 0.000874. Value loss: 0.077741. Entropy: 1.176462.
Iteration 1202: Policy loss: -0.008652. Value loss: 0.039320. Entropy: 1.174070.
Iteration 1203: Policy loss: -0.010904. Value loss: 0.031349. Entropy: 1.168447.
Training network. lr: 0.000241. clip: 0.096323
Iteration 1204: Policy loss: -0.000938. Value loss: 0.070352. Entropy: 1.179269.
Iteration 1205: Policy loss: -0.006715. Value loss: 0.035628. Entropy: 1.176290.
Iteration 1206: Policy loss: -0.009115. Value loss: 0.026992. Entropy: 1.162948.
episode: 510   score: 225.0  epsilon: 1.0    steps: 800  evaluation reward: 245.2
episode: 511   score: 485.0  epsilon: 1.0    steps: 968  evaluation reward: 249.0
Training network. lr: 0.000241. clip: 0.096323
Iteration 1207: Policy loss: 0.002192. Value loss: 0.085078. Entropy: 1.196839.
Iteration 1208: Policy loss: -0.004944. Value loss: 0.043760. Entropy: 1.196207.
Iteration 1209: Policy loss: -0.009797. Value loss: 0.038156. Entropy: 1.196028.
Training networ

Iteration 1265: Policy loss: -0.006196. Value loss: 0.059094. Entropy: 1.201756.
Iteration 1266: Policy loss: -0.011521. Value loss: 0.047623. Entropy: 1.201863.
episode: 535   score: 210.0  epsilon: 1.0    steps: 408  evaluation reward: 263.25
Training network. lr: 0.000240. clip: 0.096166
Iteration 1267: Policy loss: 0.001779. Value loss: 0.089794. Entropy: 1.258960.
Iteration 1268: Policy loss: -0.002389. Value loss: 0.042673. Entropy: 1.254973.
Iteration 1269: Policy loss: -0.007717. Value loss: 0.035055. Entropy: 1.255955.
Training network. lr: 0.000240. clip: 0.096166
Iteration 1270: Policy loss: 0.001943. Value loss: 0.086110. Entropy: 1.279144.
Iteration 1271: Policy loss: -0.006654. Value loss: 0.049422. Entropy: 1.283914.
Iteration 1272: Policy loss: -0.011606. Value loss: 0.038513. Entropy: 1.283085.
episode: 536   score: 285.0  epsilon: 1.0    steps: 632  evaluation reward: 264.55
Training network. lr: 0.000240. clip: 0.096166
Iteration 1273: Policy loss: 0.004489. Value lo

Training network. lr: 0.000240. clip: 0.096009
Iteration 1330: Policy loss: 0.001726. Value loss: 0.079043. Entropy: 1.291073.
Iteration 1331: Policy loss: -0.005524. Value loss: 0.045277. Entropy: 1.295730.
Iteration 1332: Policy loss: -0.008861. Value loss: 0.035444. Entropy: 1.293808.
episode: 559   score: 215.0  epsilon: 1.0    steps: 16  evaluation reward: 267.7
episode: 560   score: 120.0  epsilon: 1.0    steps: 32  evaluation reward: 263.0
episode: 561   score: 300.0  epsilon: 1.0    steps: 616  evaluation reward: 262.05
Training network. lr: 0.000240. clip: 0.096009
Iteration 1333: Policy loss: -0.000142. Value loss: 0.095242. Entropy: 1.291256.
Iteration 1334: Policy loss: -0.006152. Value loss: 0.040516. Entropy: 1.288111.
Iteration 1335: Policy loss: -0.012114. Value loss: 0.030679. Entropy: 1.288376.
episode: 562   score: 50.0  epsilon: 1.0    steps: 272  evaluation reward: 259.9
Training network. lr: 0.000240. clip: 0.096009
Iteration 1336: Policy loss: 0.000010. Value los

Iteration 1392: Policy loss: -0.005180. Value loss: 0.034175. Entropy: 1.278865.
episode: 586   score: 65.0  epsilon: 1.0    steps: 576  evaluation reward: 245.45
Training network. lr: 0.000240. clip: 0.095862
Iteration 1393: Policy loss: -0.000999. Value loss: 0.113269. Entropy: 1.244337.
Iteration 1394: Policy loss: -0.005721. Value loss: 0.056928. Entropy: 1.237808.
Iteration 1395: Policy loss: -0.010572. Value loss: 0.047313. Entropy: 1.239825.
episode: 587   score: 275.0  epsilon: 1.0    steps: 208  evaluation reward: 245.95
episode: 588   score: 210.0  epsilon: 1.0    steps: 648  evaluation reward: 245.95
Training network. lr: 0.000240. clip: 0.095862
Iteration 1396: Policy loss: 0.000260. Value loss: 0.067745. Entropy: 1.284440.
Iteration 1397: Policy loss: -0.005610. Value loss: 0.040986. Entropy: 1.287548.
Iteration 1398: Policy loss: -0.006937. Value loss: 0.034492. Entropy: 1.292796.
Training network. lr: 0.000240. clip: 0.095862
Iteration 1399: Policy loss: -0.001770. Value

Iteration 1456: Policy loss: 0.001686. Value loss: 0.305894. Entropy: 1.252770.
Iteration 1457: Policy loss: -0.003419. Value loss: 0.196168. Entropy: 1.252683.
Iteration 1458: Policy loss: -0.007149. Value loss: 0.145785. Entropy: 1.256364.
episode: 610   score: 210.0  epsilon: 1.0    steps: 48  evaluation reward: 233.05
episode: 611   score: 210.0  epsilon: 1.0    steps: 1024  evaluation reward: 230.3
Training network. lr: 0.000239. clip: 0.095549
Iteration 1459: Policy loss: -0.000365. Value loss: 0.077611. Entropy: 1.292030.
Iteration 1460: Policy loss: -0.009931. Value loss: 0.032764. Entropy: 1.290802.
Iteration 1461: Policy loss: -0.013247. Value loss: 0.028301. Entropy: 1.293279.
Training network. lr: 0.000239. clip: 0.095549
Iteration 1462: Policy loss: 0.003247. Value loss: 0.078740. Entropy: 1.237199.
Iteration 1463: Policy loss: -0.005175. Value loss: 0.033886. Entropy: 1.248958.
Iteration 1464: Policy loss: -0.009334. Value loss: 0.026628. Entropy: 1.240126.
episode: 612  

Training network. lr: 0.000239. clip: 0.095401
Iteration 1519: Policy loss: -0.001161. Value loss: 0.315685. Entropy: 1.209200.
Iteration 1520: Policy loss: -0.003538. Value loss: 0.182418. Entropy: 1.197178.
Iteration 1521: Policy loss: -0.008843. Value loss: 0.129940. Entropy: 1.202681.
episode: 637   score: 210.0  epsilon: 1.0    steps: 256  evaluation reward: 220.5
Training network. lr: 0.000239. clip: 0.095401
Iteration 1522: Policy loss: -0.002194. Value loss: 0.081703. Entropy: 1.254375.
Iteration 1523: Policy loss: -0.008911. Value loss: 0.035386. Entropy: 1.243521.
Iteration 1524: Policy loss: -0.014040. Value loss: 0.029173. Entropy: 1.240020.
Training network. lr: 0.000239. clip: 0.095401
Iteration 1525: Policy loss: 0.005670. Value loss: 0.093196. Entropy: 1.277215.
Iteration 1526: Policy loss: -0.002067. Value loss: 0.054342. Entropy: 1.285712.
Iteration 1527: Policy loss: -0.006722. Value loss: 0.038523. Entropy: 1.283853.
episode: 638   score: 285.0  epsilon: 1.0    step

episode: 663   score: 140.0  epsilon: 1.0    steps: 872  evaluation reward: 223.0
Training network. lr: 0.000238. clip: 0.095245
Iteration 1582: Policy loss: 0.000764. Value loss: 0.081418. Entropy: 1.206583.
Iteration 1583: Policy loss: -0.005646. Value loss: 0.048264. Entropy: 1.207570.
Iteration 1584: Policy loss: -0.011424. Value loss: 0.040892. Entropy: 1.212589.
episode: 664   score: 50.0  epsilon: 1.0    steps: 864  evaluation reward: 220.2
Training network. lr: 0.000238. clip: 0.095245
Iteration 1585: Policy loss: 0.001212. Value loss: 0.105767. Entropy: 1.232891.
Iteration 1586: Policy loss: -0.005328. Value loss: 0.057544. Entropy: 1.228297.
Iteration 1587: Policy loss: -0.009427. Value loss: 0.046191. Entropy: 1.232150.
Training network. lr: 0.000238. clip: 0.095245
Iteration 1588: Policy loss: 0.000068. Value loss: 0.051064. Entropy: 1.237072.
Iteration 1589: Policy loss: -0.008689. Value loss: 0.022717. Entropy: 1.229182.
Iteration 1590: Policy loss: -0.010989. Value loss:

Iteration 1647: Policy loss: -0.011391. Value loss: 0.038444. Entropy: 1.269106.
Training network. lr: 0.000238. clip: 0.095088
Iteration 1648: Policy loss: -0.000153. Value loss: 0.179381. Entropy: 1.317592.
Iteration 1649: Policy loss: -0.003980. Value loss: 0.081451. Entropy: 1.314154.
Iteration 1650: Policy loss: -0.009648. Value loss: 0.052408. Entropy: 1.311662.
episode: 687   score: 120.0  epsilon: 1.0    steps: 952  evaluation reward: 231.3
episode: 688   score: 550.0  epsilon: 1.0    steps: 952  evaluation reward: 234.7
episode: 689   score: 240.0  epsilon: 1.0    steps: 1016  evaluation reward: 236.05
Training network. lr: 0.000237. clip: 0.094940
Iteration 1651: Policy loss: -0.000914. Value loss: 0.146777. Entropy: 1.311283.
Iteration 1652: Policy loss: -0.007652. Value loss: 0.064169. Entropy: 1.305328.
Iteration 1653: Policy loss: -0.011568. Value loss: 0.044426. Entropy: 1.302925.
Training network. lr: 0.000237. clip: 0.094940
Iteration 1654: Policy loss: 0.001800. Value

Training network. lr: 0.000237. clip: 0.094784
Iteration 1711: Policy loss: 0.001553. Value loss: 0.119193. Entropy: 1.271911.
Iteration 1712: Policy loss: -0.007064. Value loss: 0.062374. Entropy: 1.275224.
Iteration 1713: Policy loss: -0.010505. Value loss: 0.044487. Entropy: 1.276982.
episode: 712   score: 515.0  epsilon: 1.0    steps: 1016  evaluation reward: 242.75
Training network. lr: 0.000237. clip: 0.094784
Iteration 1714: Policy loss: 0.000249. Value loss: 0.302670. Entropy: 1.279839.
Iteration 1715: Policy loss: -0.008191. Value loss: 0.161952. Entropy: 1.282782.
Iteration 1716: Policy loss: -0.013177. Value loss: 0.128951. Entropy: 1.283727.
episode: 713   score: 100.0  epsilon: 1.0    steps: 792  evaluation reward: 241.9
Training network. lr: 0.000237. clip: 0.094784
Iteration 1717: Policy loss: 0.000587. Value loss: 0.052615. Entropy: 1.302519.
Iteration 1718: Policy loss: -0.008632. Value loss: 0.022606. Entropy: 1.303351.
Iteration 1719: Policy loss: -0.014179. Value lo

episode: 735   score: 305.0  epsilon: 1.0    steps: 592  evaluation reward: 254.95
Training network. lr: 0.000237. clip: 0.094627
Iteration 1777: Policy loss: 0.000362. Value loss: 0.086049. Entropy: 1.311440.
Iteration 1778: Policy loss: -0.005776. Value loss: 0.043069. Entropy: 1.308855.
Iteration 1779: Policy loss: -0.009595. Value loss: 0.037362. Entropy: 1.310601.
episode: 736   score: 260.0  epsilon: 1.0    steps: 176  evaluation reward: 256.5
episode: 737   score: 365.0  epsilon: 1.0    steps: 816  evaluation reward: 258.05
episode: 738   score: 140.0  epsilon: 1.0    steps: 944  evaluation reward: 256.6
Training network. lr: 0.000237. clip: 0.094627
Iteration 1780: Policy loss: 0.002918. Value loss: 0.145210. Entropy: 1.307460.
Iteration 1781: Policy loss: -0.004713. Value loss: 0.058517. Entropy: 1.300620.
Iteration 1782: Policy loss: -0.009534. Value loss: 0.044108. Entropy: 1.298602.
episode: 739   score: 295.0  epsilon: 1.0    steps: 144  evaluation reward: 257.1
Training n

episode: 761   score: 105.0  epsilon: 1.0    steps: 648  evaluation reward: 254.85
Training network. lr: 0.000236. clip: 0.094480
Iteration 1840: Policy loss: 0.004128. Value loss: 0.059424. Entropy: 1.264506.
Iteration 1841: Policy loss: -0.005054. Value loss: 0.032272. Entropy: 1.259396.
Iteration 1842: Policy loss: -0.010868. Value loss: 0.025072. Entropy: 1.263205.
episode: 762   score: 260.0  epsilon: 1.0    steps: 496  evaluation reward: 256.9
episode: 763   score: 260.0  epsilon: 1.0    steps: 944  evaluation reward: 258.1
Training network. lr: 0.000236. clip: 0.094480
Iteration 1843: Policy loss: 0.001500. Value loss: 0.128061. Entropy: 1.250226.
Iteration 1844: Policy loss: -0.002968. Value loss: 0.058665. Entropy: 1.258083.
Iteration 1845: Policy loss: -0.006637. Value loss: 0.041440. Entropy: 1.257208.
episode: 764   score: 475.0  epsilon: 1.0    steps: 88  evaluation reward: 262.35
Training network. lr: 0.000236. clip: 0.094480
Iteration 1846: Policy loss: 0.003040. Value l

Iteration 1904: Policy loss: 0.002229. Value loss: 0.094479. Entropy: 1.268262.
Iteration 1905: Policy loss: -0.006923. Value loss: 0.065941. Entropy: 1.265321.
episode: 786   score: 210.0  epsilon: 1.0    steps: 872  evaluation reward: 258.95
Training network. lr: 0.000235. clip: 0.094166
Iteration 1906: Policy loss: -0.000447. Value loss: 0.106127. Entropy: 1.246657.
Iteration 1907: Policy loss: -0.008096. Value loss: 0.044889. Entropy: 1.251962.
Iteration 1908: Policy loss: -0.014055. Value loss: 0.035505. Entropy: 1.250893.
episode: 787   score: 410.0  epsilon: 1.0    steps: 200  evaluation reward: 261.85
episode: 788   score: 320.0  epsilon: 1.0    steps: 664  evaluation reward: 259.55
Training network. lr: 0.000235. clip: 0.094166
Iteration 1909: Policy loss: -0.000718. Value loss: 0.071751. Entropy: 1.221457.
Iteration 1910: Policy loss: -0.010545. Value loss: 0.032850. Entropy: 1.234995.
Iteration 1911: Policy loss: -0.012998. Value loss: 0.023701. Entropy: 1.227541.
episode: 7

episode: 811   score: 410.0  epsilon: 1.0    steps: 184  evaluation reward: 257.45
episode: 812   score: 180.0  epsilon: 1.0    steps: 600  evaluation reward: 254.1
Training network. lr: 0.000235. clip: 0.094019
Iteration 1969: Policy loss: -0.000978. Value loss: 0.069555. Entropy: 1.138425.
Iteration 1970: Policy loss: -0.007550. Value loss: 0.029716. Entropy: 1.141009.
Iteration 1971: Policy loss: -0.010560. Value loss: 0.028290. Entropy: 1.140446.
Training network. lr: 0.000235. clip: 0.094019
Iteration 1972: Policy loss: 0.001324. Value loss: 0.204968. Entropy: 1.220572.
Iteration 1973: Policy loss: -0.006247. Value loss: 0.150446. Entropy: 1.220269.
Iteration 1974: Policy loss: -0.003542. Value loss: 0.129856. Entropy: 1.208886.
episode: 813   score: 355.0  epsilon: 1.0    steps: 544  evaluation reward: 256.65
episode: 814   score: 80.0  epsilon: 1.0    steps: 648  evaluation reward: 254.8
Training network. lr: 0.000235. clip: 0.094019
Iteration 1975: Policy loss: 0.001190. Value 

Iteration 2033: Policy loss: -0.005296. Value loss: 0.028161. Entropy: 1.264465.
Iteration 2034: Policy loss: -0.012484. Value loss: 0.020708. Entropy: 1.272846.
episode: 836   score: 105.0  epsilon: 1.0    steps: 872  evaluation reward: 253.65
Training network. lr: 0.000235. clip: 0.093862
Iteration 2035: Policy loss: 0.003488. Value loss: 0.079423. Entropy: 1.286372.
Iteration 2036: Policy loss: -0.003714. Value loss: 0.037238. Entropy: 1.284693.
Iteration 2037: Policy loss: -0.008708. Value loss: 0.029702. Entropy: 1.288761.
episode: 837   score: 110.0  epsilon: 1.0    steps: 184  evaluation reward: 251.1
Training network. lr: 0.000235. clip: 0.093862
Iteration 2038: Policy loss: 0.005150. Value loss: 0.093690. Entropy: 1.214639.
Iteration 2039: Policy loss: -0.003932. Value loss: 0.036491. Entropy: 1.205326.
Iteration 2040: Policy loss: -0.008023. Value loss: 0.027196. Entropy: 1.193739.
Training network. lr: 0.000235. clip: 0.093862
Iteration 2041: Policy loss: 0.000577. Value los

Iteration 2097: Policy loss: -0.012795. Value loss: 0.031092. Entropy: 1.157767.
episode: 861   score: 210.0  epsilon: 1.0    steps: 864  evaluation reward: 254.05
Training network. lr: 0.000234. clip: 0.093705
Iteration 2098: Policy loss: 0.000358. Value loss: 0.065872. Entropy: 1.238103.
Iteration 2099: Policy loss: -0.007958. Value loss: 0.031316. Entropy: 1.243639.
Iteration 2100: Policy loss: -0.008267. Value loss: 0.023634. Entropy: 1.244150.
episode: 862   score: 250.0  epsilon: 1.0    steps: 824  evaluation reward: 253.95
Training network. lr: 0.000234. clip: 0.093558
Iteration 2101: Policy loss: 0.000691. Value loss: 0.099843. Entropy: 1.245904.
Iteration 2102: Policy loss: -0.004250. Value loss: 0.053897. Entropy: 1.243427.
Iteration 2103: Policy loss: -0.011299. Value loss: 0.040317. Entropy: 1.240702.
Training network. lr: 0.000234. clip: 0.093558
Iteration 2104: Policy loss: 0.000877. Value loss: 0.070598. Entropy: 1.252755.
Iteration 2105: Policy loss: -0.008526. Value lo

Iteration 2163: Policy loss: -0.012685. Value loss: 0.054416. Entropy: 1.248812.
episode: 884   score: 405.0  epsilon: 1.0    steps: 344  evaluation reward: 261.7
episode: 885   score: 115.0  epsilon: 1.0    steps: 816  evaluation reward: 258.75
Training network. lr: 0.000234. clip: 0.093401
Iteration 2164: Policy loss: 0.001610. Value loss: 0.108024. Entropy: 1.178759.
Iteration 2165: Policy loss: -0.008071. Value loss: 0.052224. Entropy: 1.177783.
Iteration 2166: Policy loss: -0.010838. Value loss: 0.039149. Entropy: 1.176211.
Training network. lr: 0.000234. clip: 0.093401
Iteration 2167: Policy loss: -0.003194. Value loss: 0.309854. Entropy: 1.212267.
Iteration 2168: Policy loss: -0.008212. Value loss: 0.204427. Entropy: 1.218091.
Iteration 2169: Policy loss: -0.009715. Value loss: 0.127085. Entropy: 1.201889.
episode: 886   score: 275.0  epsilon: 1.0    steps: 640  evaluation reward: 259.4
Training network. lr: 0.000234. clip: 0.093401
Iteration 2170: Policy loss: 0.001425. Value l

Iteration 2226: Policy loss: -0.008490. Value loss: 0.188756. Entropy: 1.139158.
episode: 910   score: 440.0  epsilon: 1.0    steps: 376  evaluation reward: 257.55
episode: 911   score: 50.0  epsilon: 1.0    steps: 520  evaluation reward: 253.95
Training network. lr: 0.000233. clip: 0.093245
Iteration 2227: Policy loss: 0.001525. Value loss: 0.086690. Entropy: 1.180816.
Iteration 2228: Policy loss: -0.006693. Value loss: 0.038862. Entropy: 1.175014.
Iteration 2229: Policy loss: -0.011540. Value loss: 0.027862. Entropy: 1.178825.
episode: 912   score: 50.0  epsilon: 1.0    steps: 936  evaluation reward: 252.65
Training network. lr: 0.000233. clip: 0.093245
Iteration 2230: Policy loss: -0.006212. Value loss: 0.115056. Entropy: 1.217447.
Iteration 2231: Policy loss: -0.016539. Value loss: 0.054661. Entropy: 1.228631.
Iteration 2232: Policy loss: -0.020132. Value loss: 0.040419. Entropy: 1.217367.
episode: 913   score: 240.0  epsilon: 1.0    steps: 936  evaluation reward: 251.5
Training ne

Iteration 2287: Policy loss: 0.000154. Value loss: 0.098329. Entropy: 1.147029.
Iteration 2288: Policy loss: -0.007784. Value loss: 0.051794. Entropy: 1.138292.
Iteration 2289: Policy loss: -0.013286. Value loss: 0.040377. Entropy: 1.139642.
Training network. lr: 0.000233. clip: 0.093097
Iteration 2290: Policy loss: -0.005135. Value loss: 0.065274. Entropy: 1.269984.
Iteration 2291: Policy loss: -0.015529. Value loss: 0.037842. Entropy: 1.270905.
Iteration 2292: Policy loss: -0.019735. Value loss: 0.027761. Entropy: 1.262442.
Training network. lr: 0.000233. clip: 0.093097
Iteration 2293: Policy loss: 0.000757. Value loss: 0.046948. Entropy: 1.306182.
Iteration 2294: Policy loss: -0.010934. Value loss: 0.020700. Entropy: 1.301776.
Iteration 2295: Policy loss: -0.013173. Value loss: 0.015775. Entropy: 1.301207.
episode: 938   score: 210.0  epsilon: 1.0    steps: 216  evaluation reward: 243.3
episode: 939   score: 110.0  epsilon: 1.0    steps: 328  evaluation reward: 241.2
episode: 940   

Iteration 2350: Policy loss: 0.000090. Value loss: 0.069436. Entropy: 1.161287.
Iteration 2351: Policy loss: -0.008566. Value loss: 0.038542. Entropy: 1.163451.
Iteration 2352: Policy loss: -0.011673. Value loss: 0.031291. Entropy: 1.163702.
episode: 964   score: 440.0  epsilon: 1.0    steps: 824  evaluation reward: 237.75
episode: 965   score: 105.0  epsilon: 1.0    steps: 920  evaluation reward: 236.4
Training network. lr: 0.000232. clip: 0.092784
Iteration 2353: Policy loss: 0.003673. Value loss: 0.252868. Entropy: 1.142216.
Iteration 2354: Policy loss: -0.005435. Value loss: 0.180546. Entropy: 1.137452.
Iteration 2355: Policy loss: -0.008335. Value loss: 0.118992. Entropy: 1.144847.
episode: 966   score: 425.0  epsilon: 1.0    steps: 184  evaluation reward: 237.1
Training network. lr: 0.000232. clip: 0.092784
Iteration 2356: Policy loss: -0.000071. Value loss: 0.089165. Entropy: 1.096625.
Iteration 2357: Policy loss: -0.010322. Value loss: 0.042593. Entropy: 1.105828.
Iteration 235

episode: 987   score: 260.0  epsilon: 1.0    steps: 504  evaluation reward: 237.9
episode: 988   score: 240.0  epsilon: 1.0    steps: 680  evaluation reward: 237.7
Training network. lr: 0.000232. clip: 0.092636
Iteration 2416: Policy loss: 0.005523. Value loss: 0.082518. Entropy: 1.067628.
Iteration 2417: Policy loss: -0.003734. Value loss: 0.044601. Entropy: 1.067649.
Iteration 2418: Policy loss: -0.008743. Value loss: 0.034325. Entropy: 1.071037.
Training network. lr: 0.000232. clip: 0.092636
Iteration 2419: Policy loss: 0.002710. Value loss: 0.121372. Entropy: 1.212643.
Iteration 2420: Policy loss: -0.006312. Value loss: 0.063422. Entropy: 1.221081.
Iteration 2421: Policy loss: -0.010820. Value loss: 0.044177. Entropy: 1.215279.
episode: 989   score: 155.0  epsilon: 1.0    steps: 792  evaluation reward: 234.4
Training network. lr: 0.000232. clip: 0.092636
Iteration 2422: Policy loss: 0.004499. Value loss: 0.126551. Entropy: 1.203489.
Iteration 2423: Policy loss: -0.005128. Value los

episode: 1009   score: 575.0  epsilon: 1.0    steps: 928  evaluation reward: 259.75
Training network. lr: 0.000231. clip: 0.092480
Iteration 2482: Policy loss: 0.002974. Value loss: 0.328975. Entropy: 1.233911.
Iteration 2483: Policy loss: -0.002904. Value loss: 0.197523. Entropy: 1.227037.
Iteration 2484: Policy loss: -0.007394. Value loss: 0.087701. Entropy: 1.223629.
episode: 1010   score: 410.0  epsilon: 1.0    steps: 496  evaluation reward: 259.45
Training network. lr: 0.000231. clip: 0.092480
Iteration 2485: Policy loss: 0.000022. Value loss: 0.149831. Entropy: 1.112671.
Iteration 2486: Policy loss: -0.006700. Value loss: 0.058423. Entropy: 1.113747.
Iteration 2487: Policy loss: -0.007849. Value loss: 0.042173. Entropy: 1.119126.
episode: 1011   score: 275.0  epsilon: 1.0    steps: 1000  evaluation reward: 261.7
Training network. lr: 0.000231. clip: 0.092480
Iteration 2488: Policy loss: 0.002274. Value loss: 0.129358. Entropy: 1.236174.
Iteration 2489: Policy loss: -0.005303. Val

Iteration 2549: Policy loss: -0.005511. Value loss: 0.068934. Entropy: 1.251063.
Iteration 2550: Policy loss: -0.006911. Value loss: 0.051795. Entropy: 1.252707.
episode: 1030   score: 210.0  epsilon: 1.0    steps: 208  evaluation reward: 275.6
Training network. lr: 0.000230. clip: 0.092176
Iteration 2551: Policy loss: 0.001357. Value loss: 0.064279. Entropy: 1.215738.
Iteration 2552: Policy loss: -0.008533. Value loss: 0.024380. Entropy: 1.207079.
Iteration 2553: Policy loss: -0.013845. Value loss: 0.018433. Entropy: 1.206041.
episode: 1031   score: 365.0  epsilon: 1.0    steps: 760  evaluation reward: 277.45
episode: 1032   score: 260.0  epsilon: 1.0    steps: 960  evaluation reward: 276.75
episode: 1033   score: 210.0  epsilon: 1.0    steps: 976  evaluation reward: 276.25
Training network. lr: 0.000230. clip: 0.092176
Iteration 2554: Policy loss: 0.000504. Value loss: 0.094482. Entropy: 1.214996.
Iteration 2555: Policy loss: -0.007701. Value loss: 0.045492. Entropy: 1.219067.
Iterat

Iteration 2612: Policy loss: -0.004520. Value loss: 0.081907. Entropy: 1.186381.
Iteration 2613: Policy loss: -0.009686. Value loss: 0.060080. Entropy: 1.187367.
episode: 1055   score: 135.0  epsilon: 1.0    steps: 648  evaluation reward: 296.2
Training network. lr: 0.000230. clip: 0.092019
Iteration 2614: Policy loss: 0.001551. Value loss: 0.296342. Entropy: 1.130619.
Iteration 2615: Policy loss: -0.004720. Value loss: 0.111067. Entropy: 1.125172.
Iteration 2616: Policy loss: -0.006707. Value loss: 0.069085. Entropy: 1.133433.
Training network. lr: 0.000230. clip: 0.092019
Iteration 2617: Policy loss: 0.001555. Value loss: 0.268860. Entropy: 1.237609.
Iteration 2618: Policy loss: -0.008213. Value loss: 0.127838. Entropy: 1.235556.
Iteration 2619: Policy loss: -0.012349. Value loss: 0.086316. Entropy: 1.237368.
Training network. lr: 0.000230. clip: 0.092019
Iteration 2620: Policy loss: -0.002455. Value loss: 0.151948. Entropy: 1.292603.
Iteration 2621: Policy loss: -0.008934. Value los

Training network. lr: 0.000230. clip: 0.091862
Iteration 2680: Policy loss: 0.000161. Value loss: 0.125544. Entropy: 1.172474.
Iteration 2681: Policy loss: -0.007860. Value loss: 0.071709. Entropy: 1.178968.
Iteration 2682: Policy loss: -0.015584. Value loss: 0.046623. Entropy: 1.176984.
episode: 1076   score: 265.0  epsilon: 1.0    steps: 304  evaluation reward: 301.4
Training network. lr: 0.000230. clip: 0.091862
Iteration 2683: Policy loss: 0.000285. Value loss: 0.102611. Entropy: 1.218505.
Iteration 2684: Policy loss: -0.007043. Value loss: 0.049615. Entropy: 1.216604.
Iteration 2685: Policy loss: -0.013916. Value loss: 0.031711. Entropy: 1.218013.
episode: 1077   score: 650.0  epsilon: 1.0    steps: 296  evaluation reward: 306.35
Training network. lr: 0.000230. clip: 0.091862
Iteration 2686: Policy loss: 0.000268. Value loss: 0.300583. Entropy: 1.196296.
Iteration 2687: Policy loss: -0.005635. Value loss: 0.176037. Entropy: 1.197002.
Iteration 2688: Policy loss: -0.011128. Value l

Iteration 2747: Policy loss: -0.004844. Value loss: 0.042928. Entropy: 1.244184.
Iteration 2748: Policy loss: -0.012013. Value loss: 0.029966. Entropy: 1.233791.
Training network. lr: 0.000229. clip: 0.091715
Iteration 2749: Policy loss: 0.000907. Value loss: 0.080461. Entropy: 1.274560.
Iteration 2750: Policy loss: -0.007764. Value loss: 0.030097. Entropy: 1.274253.
Iteration 2751: Policy loss: -0.016544. Value loss: 0.020928. Entropy: 1.268111.
episode: 1097   score: 315.0  epsilon: 1.0    steps: 576  evaluation reward: 319.45
Training network. lr: 0.000229. clip: 0.091558
Iteration 2752: Policy loss: 0.002777. Value loss: 0.079253. Entropy: 1.220220.
Iteration 2753: Policy loss: -0.006959. Value loss: 0.032942. Entropy: 1.210666.
Iteration 2754: Policy loss: -0.011649. Value loss: 0.021978. Entropy: 1.219194.
episode: 1098   score: 390.0  epsilon: 1.0    steps: 160  evaluation reward: 320.4
Training network. lr: 0.000229. clip: 0.091558
Iteration 2755: Policy loss: 0.000291. Value l

Iteration 2814: Policy loss: -0.012566. Value loss: 0.027547. Entropy: 1.094677.
Training network. lr: 0.000229. clip: 0.091401
Iteration 2815: Policy loss: 0.003847. Value loss: 0.073102. Entropy: 1.249959.
Iteration 2816: Policy loss: -0.005863. Value loss: 0.035358. Entropy: 1.253420.
Iteration 2817: Policy loss: -0.010704. Value loss: 0.025519. Entropy: 1.251337.
episode: 1118   score: 300.0  epsilon: 1.0    steps: 736  evaluation reward: 309.6
Training network. lr: 0.000229. clip: 0.091401
Iteration 2818: Policy loss: 0.003543. Value loss: 0.080796. Entropy: 1.203296.
Iteration 2819: Policy loss: -0.008135. Value loss: 0.036388. Entropy: 1.209945.
Iteration 2820: Policy loss: -0.011150. Value loss: 0.026948. Entropy: 1.206238.
episode: 1119   score: 590.0  epsilon: 1.0    steps: 104  evaluation reward: 312.05
episode: 1120   score: 210.0  epsilon: 1.0    steps: 768  evaluation reward: 312.05
Training network. lr: 0.000229. clip: 0.091401
Iteration 2821: Policy loss: 0.000933. Valu

Training network. lr: 0.000228. clip: 0.091254
Iteration 2881: Policy loss: 0.000832. Value loss: 0.095330. Entropy: 1.188750.
Iteration 2882: Policy loss: -0.007160. Value loss: 0.048630. Entropy: 1.174136.
Iteration 2883: Policy loss: -0.015912. Value loss: 0.037830. Entropy: 1.168694.
episode: 1140   score: 295.0  epsilon: 1.0    steps: 256  evaluation reward: 331.25
Training network. lr: 0.000228. clip: 0.091254
Iteration 2884: Policy loss: 0.000768. Value loss: 0.071706. Entropy: 1.187050.
Iteration 2885: Policy loss: -0.006994. Value loss: 0.034624. Entropy: 1.189156.
Iteration 2886: Policy loss: -0.008823. Value loss: 0.029252. Entropy: 1.190728.
Training network. lr: 0.000228. clip: 0.091254
Iteration 2887: Policy loss: 0.002017. Value loss: 0.051317. Entropy: 1.264604.
Iteration 2888: Policy loss: -0.005249. Value loss: 0.025159. Entropy: 1.273422.
Iteration 2889: Policy loss: -0.008768. Value loss: 0.021900. Entropy: 1.273009.
Training network. lr: 0.000228. clip: 0.091254
It

Iteration 2951: Policy loss: -0.009016. Value loss: 0.074311. Entropy: 1.277579.
Iteration 2952: Policy loss: -0.012717. Value loss: 0.052463. Entropy: 1.275960.
episode: 1157   score: 410.0  epsilon: 1.0    steps: 752  evaluation reward: 349.65
episode: 1158   score: 135.0  epsilon: 1.0    steps: 1000  evaluation reward: 343.8
Training network. lr: 0.000227. clip: 0.090941
Iteration 2953: Policy loss: 0.006309. Value loss: 0.426854. Entropy: 1.239078.
Iteration 2954: Policy loss: 0.000430. Value loss: 0.202349. Entropy: 1.231052.
Iteration 2955: Policy loss: -0.003662. Value loss: 0.137459. Entropy: 1.231483.
episode: 1159   score: 500.0  epsilon: 1.0    steps: 880  evaluation reward: 348.0
Training network. lr: 0.000227. clip: 0.090941
Iteration 2956: Policy loss: 0.000540. Value loss: 0.180858. Entropy: 1.173316.
Iteration 2957: Policy loss: -0.007397. Value loss: 0.080653. Entropy: 1.176290.
Iteration 2958: Policy loss: -0.014898. Value loss: 0.048888. Entropy: 1.176916.
episode: 1

Training network. lr: 0.000227. clip: 0.090793
Iteration 3019: Policy loss: 0.003894. Value loss: 0.119744. Entropy: 1.074757.
Iteration 3020: Policy loss: -0.002193. Value loss: 0.061368. Entropy: 1.068799.
Iteration 3021: Policy loss: -0.007553. Value loss: 0.049877. Entropy: 1.066960.
episode: 1178   score: 350.0  epsilon: 1.0    steps: 376  evaluation reward: 365.5
episode: 1179   score: 135.0  epsilon: 1.0    steps: 880  evaluation reward: 363.05
Training network. lr: 0.000227. clip: 0.090793
Iteration 3022: Policy loss: -0.003839. Value loss: 0.174063. Entropy: 1.135991.
Iteration 3023: Policy loss: -0.006043. Value loss: 0.063501. Entropy: 1.134670.
Iteration 3024: Policy loss: -0.012497. Value loss: 0.045590. Entropy: 1.134963.
Training network. lr: 0.000227. clip: 0.090793
Iteration 3025: Policy loss: 0.000313. Value loss: 0.073463. Entropy: 1.237831.
Iteration 3026: Policy loss: -0.006383. Value loss: 0.041381. Entropy: 1.238605.
Iteration 3027: Policy loss: -0.011750. Value 

episode: 1197   score: 365.0  epsilon: 1.0    steps: 728  evaluation reward: 367.45
Training network. lr: 0.000227. clip: 0.090637
Iteration 3088: Policy loss: 0.001335. Value loss: 0.095009. Entropy: 1.150971.
Iteration 3089: Policy loss: -0.011644. Value loss: 0.044813. Entropy: 1.143298.
Iteration 3090: Policy loss: -0.015071. Value loss: 0.037685. Entropy: 1.140707.
episode: 1198   score: 345.0  epsilon: 1.0    steps: 16  evaluation reward: 367.0
episode: 1199   score: 485.0  epsilon: 1.0    steps: 432  evaluation reward: 367.9
episode: 1200   score: 270.0  epsilon: 1.0    steps: 768  evaluation reward: 367.6
Training network. lr: 0.000227. clip: 0.090637
Iteration 3091: Policy loss: 0.002436. Value loss: 0.095165. Entropy: 1.081298.
Iteration 3092: Policy loss: -0.011377. Value loss: 0.042637. Entropy: 1.069719.
Iteration 3093: Policy loss: -0.014240. Value loss: 0.034942. Entropy: 1.073282.
now time :  2019-03-05 21:19:04.383574
episode: 1201   score: 220.0  epsilon: 1.0    steps

Iteration 3152: Policy loss: -0.006966. Value loss: 0.246725. Entropy: 0.998908.
Iteration 3153: Policy loss: -0.008581. Value loss: 0.193597. Entropy: 0.996772.
episode: 1221   score: 140.0  epsilon: 1.0    steps: 560  evaluation reward: 369.9
Training network. lr: 0.000226. clip: 0.090332
Iteration 3154: Policy loss: 0.003609. Value loss: 0.122649. Entropy: 1.058470.
Iteration 3155: Policy loss: -0.004861. Value loss: 0.060796. Entropy: 1.055584.
Iteration 3156: Policy loss: -0.010425. Value loss: 0.045618. Entropy: 1.060327.
Training network. lr: 0.000226. clip: 0.090332
Iteration 3157: Policy loss: 0.000017. Value loss: 0.087915. Entropy: 1.239432.
Iteration 3158: Policy loss: -0.009728. Value loss: 0.042737. Entropy: 1.227846.
Iteration 3159: Policy loss: -0.011436. Value loss: 0.028990. Entropy: 1.226972.
Training network. lr: 0.000226. clip: 0.090332
Iteration 3160: Policy loss: 0.002174. Value loss: 0.078589. Entropy: 1.239249.
Iteration 3161: Policy loss: -0.004528. Value loss

Iteration 3219: Policy loss: -0.013777. Value loss: 0.034845. Entropy: 1.187058.
episode: 1242   score: 515.0  epsilon: 1.0    steps: 448  evaluation reward: 360.3
episode: 1243   score: 260.0  epsilon: 1.0    steps: 648  evaluation reward: 359.7
Training network. lr: 0.000225. clip: 0.090176
Iteration 3220: Policy loss: 0.002465. Value loss: 0.294501. Entropy: 1.113592.
Iteration 3221: Policy loss: -0.006151. Value loss: 0.172253. Entropy: 1.102418.
Iteration 3222: Policy loss: -0.011263. Value loss: 0.105092. Entropy: 1.099644.
Training network. lr: 0.000225. clip: 0.090176
Iteration 3223: Policy loss: 0.000586. Value loss: 0.115353. Entropy: 1.203927.
Iteration 3224: Policy loss: -0.008780. Value loss: 0.055181. Entropy: 1.225337.
Iteration 3225: Policy loss: -0.014287. Value loss: 0.044574. Entropy: 1.220503.
Training network. lr: 0.000225. clip: 0.090176
Iteration 3226: Policy loss: 0.000771. Value loss: 0.091613. Entropy: 1.242192.
Iteration 3227: Policy loss: -0.011414. Value lo

episode: 1260   score: 365.0  epsilon: 1.0    steps: 480  evaluation reward: 349.65
Training network. lr: 0.000225. clip: 0.090019
Iteration 3289: Policy loss: 0.001106. Value loss: 0.054161. Entropy: 1.153193.
Iteration 3290: Policy loss: -0.005489. Value loss: 0.023484. Entropy: 1.153540.
Iteration 3291: Policy loss: -0.012993. Value loss: 0.018195. Entropy: 1.159168.
episode: 1261   score: 460.0  epsilon: 1.0    steps: 656  evaluation reward: 351.25
episode: 1262   score: 485.0  epsilon: 1.0    steps: 664  evaluation reward: 352.8
episode: 1263   score: 270.0  epsilon: 1.0    steps: 832  evaluation reward: 349.55
Training network. lr: 0.000225. clip: 0.090019
Iteration 3292: Policy loss: 0.003658. Value loss: 0.099351. Entropy: 1.067445.
Iteration 3293: Policy loss: -0.009135. Value loss: 0.051214. Entropy: 1.071115.
Iteration 3294: Policy loss: -0.016670. Value loss: 0.037565. Entropy: 1.063694.
episode: 1264   score: 210.0  epsilon: 1.0    steps: 392  evaluation reward: 349.35
Tra

episode: 1282   score: 390.0  epsilon: 1.0    steps: 512  evaluation reward: 345.6
Training network. lr: 0.000224. clip: 0.089715
Iteration 3355: Policy loss: 0.003838. Value loss: 0.083199. Entropy: 1.112740.
Iteration 3356: Policy loss: -0.003628. Value loss: 0.043430. Entropy: 1.106229.
Iteration 3357: Policy loss: -0.009179. Value loss: 0.032250. Entropy: 1.095193.
episode: 1283   score: 475.0  epsilon: 1.0    steps: 616  evaluation reward: 344.4
Training network. lr: 0.000224. clip: 0.089715
Iteration 3358: Policy loss: 0.003378. Value loss: 0.160521. Entropy: 1.171153.
Iteration 3359: Policy loss: -0.004268. Value loss: 0.048807. Entropy: 1.167541.
Iteration 3360: Policy loss: -0.012900. Value loss: 0.035508. Entropy: 1.164993.
Training network. lr: 0.000224. clip: 0.089715
Iteration 3361: Policy loss: 0.000492. Value loss: 0.187647. Entropy: 1.147525.
Iteration 3362: Policy loss: -0.004950. Value loss: 0.055736. Entropy: 1.148206.
Iteration 3363: Policy loss: -0.010297. Value lo

Iteration 3421: Policy loss: 0.000587. Value loss: 0.086028. Entropy: 1.146768.
Iteration 3422: Policy loss: -0.008757. Value loss: 0.044107. Entropy: 1.152546.
Iteration 3423: Policy loss: -0.014174. Value loss: 0.032921. Entropy: 1.153542.
episode: 1303   score: 265.0  epsilon: 1.0    steps: 704  evaluation reward: 350.45
episode: 1304   score: 270.0  epsilon: 1.0    steps: 864  evaluation reward: 349.35
Training network. lr: 0.000224. clip: 0.089558
Iteration 3424: Policy loss: 0.003046. Value loss: 0.333266. Entropy: 1.143683.
Iteration 3425: Policy loss: -0.002588. Value loss: 0.209896. Entropy: 1.153327.
Iteration 3426: Policy loss: -0.004719. Value loss: 0.120296. Entropy: 1.151657.
Training network. lr: 0.000224. clip: 0.089558
Iteration 3427: Policy loss: 0.001822. Value loss: 0.416119. Entropy: 1.194434.
Iteration 3428: Policy loss: -0.000082. Value loss: 0.116656. Entropy: 1.194193.
Iteration 3429: Policy loss: -0.002819. Value loss: 0.065101. Entropy: 1.198926.
episode: 130

Iteration 3489: Policy loss: -0.014387. Value loss: 0.027398. Entropy: 1.102744.
episode: 1323   score: 240.0  epsilon: 1.0    steps: 488  evaluation reward: 366.6
Training network. lr: 0.000224. clip: 0.089411
Iteration 3490: Policy loss: 0.002452. Value loss: 0.135674. Entropy: 1.180274.
Iteration 3491: Policy loss: -0.003896. Value loss: 0.064470. Entropy: 1.183196.
Iteration 3492: Policy loss: -0.008172. Value loss: 0.048004. Entropy: 1.190419.
episode: 1324   score: 365.0  epsilon: 1.0    steps: 728  evaluation reward: 367.7
Training network. lr: 0.000224. clip: 0.089411
Iteration 3493: Policy loss: 0.002311. Value loss: 0.229217. Entropy: 1.199834.
Iteration 3494: Policy loss: -0.007768. Value loss: 0.075889. Entropy: 1.202731.
Iteration 3495: Policy loss: -0.014040. Value loss: 0.048947. Entropy: 1.200555.
episode: 1325   score: 325.0  epsilon: 1.0    steps: 184  evaluation reward: 366.85
Training network. lr: 0.000224. clip: 0.089411
Iteration 3496: Policy loss: 0.004271. Value

Iteration 3558: Policy loss: -0.006237. Value loss: 0.082360. Entropy: 1.135536.
Training network. lr: 0.000223. clip: 0.089097
Iteration 3559: Policy loss: 0.004332. Value loss: 0.310112. Entropy: 1.217467.
Iteration 3560: Policy loss: -0.001469. Value loss: 0.133775. Entropy: 1.217575.
Iteration 3561: Policy loss: -0.002139. Value loss: 0.100986. Entropy: 1.213331.
episode: 1342   score: 405.0  epsilon: 1.0    steps: 360  evaluation reward: 385.65
Training network. lr: 0.000223. clip: 0.089097
Iteration 3562: Policy loss: 0.002031. Value loss: 0.193207. Entropy: 1.162964.
Iteration 3563: Policy loss: -0.004633. Value loss: 0.075159. Entropy: 1.164080.
Iteration 3564: Policy loss: -0.010526. Value loss: 0.045052. Entropy: 1.170597.
episode: 1343   score: 765.0  epsilon: 1.0    steps: 592  evaluation reward: 390.7
Training network. lr: 0.000223. clip: 0.089097
Iteration 3565: Policy loss: 0.004395. Value loss: 0.420700. Entropy: 1.178030.
Iteration 3566: Policy loss: -0.000808. Value l

Training network. lr: 0.000222. clip: 0.088950
Iteration 3628: Policy loss: -0.002327. Value loss: 0.123893. Entropy: 1.263351.
Iteration 3629: Policy loss: -0.013556. Value loss: 0.044761. Entropy: 1.243716.
Iteration 3630: Policy loss: -0.018944. Value loss: 0.029924. Entropy: 1.250764.
episode: 1360   score: 365.0  epsilon: 1.0    steps: 624  evaluation reward: 406.1
Training network. lr: 0.000222. clip: 0.088950
Iteration 3631: Policy loss: 0.002184. Value loss: 0.091687. Entropy: 1.185620.
Iteration 3632: Policy loss: -0.007329. Value loss: 0.040357. Entropy: 1.175525.
Iteration 3633: Policy loss: -0.016729. Value loss: 0.029483. Entropy: 1.176553.
episode: 1361   score: 300.0  epsilon: 1.0    steps: 784  evaluation reward: 404.5
Training network. lr: 0.000222. clip: 0.088950
Iteration 3634: Policy loss: 0.003236. Value loss: 0.093313. Entropy: 1.182746.
Iteration 3635: Policy loss: -0.009433. Value loss: 0.036265. Entropy: 1.173990.
Iteration 3636: Policy loss: -0.016947. Value l

Training network. lr: 0.000222. clip: 0.088793
Iteration 3694: Policy loss: 0.002562. Value loss: 0.290308. Entropy: 1.120466.
Iteration 3695: Policy loss: -0.005785. Value loss: 0.166021. Entropy: 1.114562.
Iteration 3696: Policy loss: -0.005897. Value loss: 0.097162. Entropy: 1.132800.
episode: 1382   score: 225.0  epsilon: 1.0    steps: 208  evaluation reward: 404.65
episode: 1383   score: 265.0  epsilon: 1.0    steps: 216  evaluation reward: 402.55
Training network. lr: 0.000222. clip: 0.088793
Iteration 3697: Policy loss: 0.001849. Value loss: 0.093833. Entropy: 1.104942.
Iteration 3698: Policy loss: -0.006467. Value loss: 0.039634. Entropy: 1.109926.
Iteration 3699: Policy loss: -0.012363. Value loss: 0.031834. Entropy: 1.102956.
episode: 1384   score: 440.0  epsilon: 1.0    steps: 640  evaluation reward: 404.35
Training network. lr: 0.000222. clip: 0.088793
Iteration 3700: Policy loss: 0.000965. Value loss: 0.375660. Entropy: 1.091659.
Iteration 3701: Policy loss: -0.009364. Val

Training network. lr: 0.000221. clip: 0.088489
Iteration 3763: Policy loss: 0.001866. Value loss: 0.059491. Entropy: 1.042554.
Iteration 3764: Policy loss: -0.010396. Value loss: 0.029346. Entropy: 1.060608.
Iteration 3765: Policy loss: -0.013800. Value loss: 0.024369. Entropy: 1.058515.
Training network. lr: 0.000221. clip: 0.088489
Iteration 3766: Policy loss: 0.001979. Value loss: 0.152440. Entropy: 0.997468.
Iteration 3767: Policy loss: -0.007288. Value loss: 0.067956. Entropy: 0.978169.
Iteration 3768: Policy loss: -0.011157. Value loss: 0.044995. Entropy: 0.979654.
now time :  2019-03-05 21:28:07.384933
episode: 1401   score: 420.0  epsilon: 1.0    steps: 344  evaluation reward: 407.65
Training network. lr: 0.000221. clip: 0.088489
Iteration 3769: Policy loss: 0.002611. Value loss: 0.103989. Entropy: 1.124137.
Iteration 3770: Policy loss: -0.006416. Value loss: 0.047809. Entropy: 1.104035.
Iteration 3771: Policy loss: -0.013317. Value loss: 0.035480. Entropy: 1.106743.
episode: 1

Training network. lr: 0.000221. clip: 0.088333
Iteration 3832: Policy loss: 0.005213. Value loss: 0.135754. Entropy: 1.049892.
Iteration 3833: Policy loss: -0.003051. Value loss: 0.061681. Entropy: 1.053150.
Iteration 3834: Policy loss: -0.007214. Value loss: 0.052117. Entropy: 1.048305.
Training network. lr: 0.000221. clip: 0.088333
Iteration 3835: Policy loss: 0.006043. Value loss: 0.352849. Entropy: 1.130702.
Iteration 3836: Policy loss: -0.000790. Value loss: 0.236970. Entropy: 1.129593.
Iteration 3837: Policy loss: -0.004162. Value loss: 0.187156. Entropy: 1.136885.
Training network. lr: 0.000221. clip: 0.088333
Iteration 3838: Policy loss: 0.001235. Value loss: 0.071086. Entropy: 1.208006.
Iteration 3839: Policy loss: -0.011304. Value loss: 0.031433. Entropy: 1.211967.
Iteration 3840: Policy loss: -0.017160. Value loss: 0.022657. Entropy: 1.208803.
episode: 1419   score: 435.0  epsilon: 1.0    steps: 440  evaluation reward: 423.4
Training network. lr: 0.000221. clip: 0.088333
Ite

Iteration 3900: Policy loss: -0.007189. Value loss: 0.045375. Entropy: 1.073449.
episode: 1439   score: 345.0  epsilon: 1.0    steps: 960  evaluation reward: 414.7
Training network. lr: 0.000220. clip: 0.088028
Iteration 3901: Policy loss: 0.001816. Value loss: 0.075576. Entropy: 1.034536.
Iteration 3902: Policy loss: -0.005081. Value loss: 0.031535. Entropy: 1.030513.
Iteration 3903: Policy loss: -0.013082. Value loss: 0.024231. Entropy: 1.008580.
Training network. lr: 0.000220. clip: 0.088028
Iteration 3904: Policy loss: 0.004319. Value loss: 0.079669. Entropy: 1.093215.
Iteration 3905: Policy loss: -0.008753. Value loss: 0.031421. Entropy: 1.086516.
Iteration 3906: Policy loss: -0.014649. Value loss: 0.021051. Entropy: 1.075262.
episode: 1440   score: 365.0  epsilon: 1.0    steps: 168  evaluation reward: 411.45
Training network. lr: 0.000220. clip: 0.088028
Iteration 3907: Policy loss: 0.001336. Value loss: 0.082668. Entropy: 1.076766.
Iteration 3908: Policy loss: -0.009086. Value l

Iteration 3967: Policy loss: 0.006333. Value loss: 0.355614. Entropy: 1.118075.
Iteration 3968: Policy loss: -0.002287. Value loss: 0.256340. Entropy: 1.126834.
Iteration 3969: Policy loss: -0.006151. Value loss: 0.201494. Entropy: 1.121668.
Training network. lr: 0.000220. clip: 0.087872
Iteration 3970: Policy loss: 0.002107. Value loss: 0.081054. Entropy: 1.124349.
Iteration 3971: Policy loss: -0.005614. Value loss: 0.041244. Entropy: 1.127970.
Iteration 3972: Policy loss: -0.010717. Value loss: 0.029591. Entropy: 1.129524.
episode: 1459   score: 210.0  epsilon: 1.0    steps: 192  evaluation reward: 386.3
Training network. lr: 0.000220. clip: 0.087872
Iteration 3973: Policy loss: -0.000777. Value loss: 0.058955. Entropy: 1.040354.
Iteration 3974: Policy loss: -0.007552. Value loss: 0.027472. Entropy: 1.043118.
Iteration 3975: Policy loss: -0.010818. Value loss: 0.021566. Entropy: 1.035929.
Training network. lr: 0.000220. clip: 0.087872
Iteration 3976: Policy loss: 0.007936. Value loss

episode: 1479   score: 530.0  epsilon: 1.0    steps: 248  evaluation reward: 390.75
Training network. lr: 0.000219. clip: 0.087715
Iteration 4036: Policy loss: 0.002873. Value loss: 0.100204. Entropy: 1.005693.
Iteration 4037: Policy loss: -0.005729. Value loss: 0.046918. Entropy: 1.008870.
Iteration 4038: Policy loss: -0.012760. Value loss: 0.035046. Entropy: 0.998063.
episode: 1480   score: 420.0  epsilon: 1.0    steps: 640  evaluation reward: 392.55
Training network. lr: 0.000219. clip: 0.087715
Iteration 4039: Policy loss: 0.002689. Value loss: 0.100392. Entropy: 0.964306.
Iteration 4040: Policy loss: -0.008359. Value loss: 0.049201. Entropy: 0.967445.
Iteration 4041: Policy loss: -0.014415. Value loss: 0.036562. Entropy: 0.956407.
Training network. lr: 0.000219. clip: 0.087715
Iteration 4042: Policy loss: 0.003194. Value loss: 0.054784. Entropy: 1.078779.
Iteration 4043: Policy loss: -0.008319. Value loss: 0.031298. Entropy: 1.084814.
Iteration 4044: Policy loss: -0.014582. Value 

Training network. lr: 0.000219. clip: 0.087411
Iteration 4102: Policy loss: 0.002506. Value loss: 0.289620. Entropy: 0.962132.
Iteration 4103: Policy loss: -0.002138. Value loss: 0.164713. Entropy: 0.959824.
Iteration 4104: Policy loss: -0.002801. Value loss: 0.130640. Entropy: 0.959586.
Training network. lr: 0.000219. clip: 0.087411
Iteration 4105: Policy loss: 0.002218. Value loss: 0.123705. Entropy: 1.008992.
Iteration 4106: Policy loss: -0.003684. Value loss: 0.069564. Entropy: 0.987526.
Iteration 4107: Policy loss: -0.013343. Value loss: 0.052361. Entropy: 1.015737.
Training network. lr: 0.000219. clip: 0.087411
Iteration 4108: Policy loss: 0.000962. Value loss: 0.081853. Entropy: 1.097382.
Iteration 4109: Policy loss: -0.008191. Value loss: 0.038533. Entropy: 1.097154.
Iteration 4110: Policy loss: -0.016214. Value loss: 0.028964. Entropy: 1.092393.
Training network. lr: 0.000219. clip: 0.087411
Iteration 4111: Policy loss: 0.003936. Value loss: 0.041092. Entropy: 1.142979.
Iterat

Iteration 4171: Policy loss: 0.004330. Value loss: 0.318739. Entropy: 1.211747.
Iteration 4172: Policy loss: 0.000052. Value loss: 0.173787. Entropy: 1.216675.
Iteration 4173: Policy loss: -0.006330. Value loss: 0.113866. Entropy: 1.208807.
Training network. lr: 0.000218. clip: 0.087254
Iteration 4174: Policy loss: 0.002028. Value loss: 0.179847. Entropy: 1.141057.
Iteration 4175: Policy loss: -0.010260. Value loss: 0.050021. Entropy: 1.142713.
Iteration 4176: Policy loss: -0.016835. Value loss: 0.031738. Entropy: 1.131318.
episode: 1520   score: 515.0  epsilon: 1.0    steps: 416  evaluation reward: 374.35
Training network. lr: 0.000218. clip: 0.087254
Iteration 4177: Policy loss: 0.002385. Value loss: 0.071801. Entropy: 0.997265.
Iteration 4178: Policy loss: -0.003754. Value loss: 0.029374. Entropy: 0.995462.
Iteration 4179: Policy loss: -0.008745. Value loss: 0.022987. Entropy: 0.995988.
Training network. lr: 0.000218. clip: 0.087254
Iteration 4180: Policy loss: 0.001796. Value loss:

episode: 1540   score: 840.0  epsilon: 1.0    steps: 192  evaluation reward: 376.35
episode: 1541   score: 265.0  epsilon: 1.0    steps: 280  evaluation reward: 376.4
Training network. lr: 0.000218. clip: 0.087107
Iteration 4240: Policy loss: 0.003478. Value loss: 0.179420. Entropy: 1.003267.
Iteration 4241: Policy loss: 0.002804. Value loss: 0.074975. Entropy: 1.018708.
Iteration 4242: Policy loss: -0.008779. Value loss: 0.056540. Entropy: 1.005767.
episode: 1542   score: 440.0  epsilon: 1.0    steps: 976  evaluation reward: 376.7
Training network. lr: 0.000218. clip: 0.087107
Iteration 4243: Policy loss: 0.000498. Value loss: 0.142415. Entropy: 0.979970.
Iteration 4244: Policy loss: -0.006687. Value loss: 0.057474. Entropy: 0.989172.
Iteration 4245: Policy loss: -0.010770. Value loss: 0.041122. Entropy: 1.004070.
Training network. lr: 0.000218. clip: 0.087107
Iteration 4246: Policy loss: 0.005332. Value loss: 0.215535. Entropy: 1.090054.
Iteration 4247: Policy loss: -0.003650. Value 

episode: 1562   score: 700.0  epsilon: 1.0    steps: 1024  evaluation reward: 377.5
Training network. lr: 0.000217. clip: 0.086793
Iteration 4306: Policy loss: 0.003820. Value loss: 0.206057. Entropy: 1.097501.
Iteration 4307: Policy loss: -0.005939. Value loss: 0.097863. Entropy: 1.086973.
Iteration 4308: Policy loss: -0.012060. Value loss: 0.059871. Entropy: 1.087515.
episode: 1563   score: 300.0  epsilon: 1.0    steps: 144  evaluation reward: 377.85
episode: 1564   score: 310.0  epsilon: 1.0    steps: 752  evaluation reward: 375.95
episode: 1565   score: 270.0  epsilon: 1.0    steps: 832  evaluation reward: 376.0
Training network. lr: 0.000217. clip: 0.086793
Iteration 4309: Policy loss: 0.000530. Value loss: 0.100545. Entropy: 1.019235.
Iteration 4310: Policy loss: -0.008799. Value loss: 0.054973. Entropy: 1.032084.
Iteration 4311: Policy loss: -0.016021. Value loss: 0.047585. Entropy: 1.032538.
Training network. lr: 0.000217. clip: 0.086793
Iteration 4312: Policy loss: 0.000742. V

Iteration 4371: Policy loss: -0.015798. Value loss: 0.026267. Entropy: 1.117592.
Training network. lr: 0.000217. clip: 0.086646
Iteration 4372: Policy loss: 0.003020. Value loss: 0.284964. Entropy: 1.143489.
Iteration 4373: Policy loss: -0.002488. Value loss: 0.081901. Entropy: 1.155109.
Iteration 4374: Policy loss: -0.009625. Value loss: 0.057769. Entropy: 1.167711.
Training network. lr: 0.000217. clip: 0.086646
Iteration 4375: Policy loss: -0.000326. Value loss: 0.184604. Entropy: 1.150733.
Iteration 4376: Policy loss: -0.010055. Value loss: 0.089992. Entropy: 1.152528.
Iteration 4377: Policy loss: -0.015556. Value loss: 0.070662. Entropy: 1.152822.
Training network. lr: 0.000217. clip: 0.086646
Iteration 4378: Policy loss: 0.003612. Value loss: 0.057408. Entropy: 1.067236.
Iteration 4379: Policy loss: -0.004268. Value loss: 0.021466. Entropy: 1.062088.
Iteration 4380: Policy loss: -0.013445. Value loss: 0.014675. Entropy: 1.054801.
episode: 1585   score: 455.0  epsilon: 1.0    steps

Training network. lr: 0.000216. clip: 0.086489
Iteration 4438: Policy loss: 0.000832. Value loss: 0.099892. Entropy: 1.163485.
Iteration 4439: Policy loss: -0.006874. Value loss: 0.046673. Entropy: 1.168968.
Iteration 4440: Policy loss: -0.011944. Value loss: 0.036527. Entropy: 1.168225.
Training network. lr: 0.000216. clip: 0.086489
Iteration 4441: Policy loss: 0.003857. Value loss: 0.097642. Entropy: 1.146792.
Iteration 4442: Policy loss: -0.007101. Value loss: 0.041219. Entropy: 1.155820.
Iteration 4443: Policy loss: -0.013821. Value loss: 0.027406. Entropy: 1.158158.
episode: 1606   score: 530.0  epsilon: 1.0    steps: 240  evaluation reward: 367.3
Training network. lr: 0.000216. clip: 0.086489
Iteration 4444: Policy loss: -0.000257. Value loss: 0.075379. Entropy: 1.058385.
Iteration 4445: Policy loss: -0.010915. Value loss: 0.037476. Entropy: 1.049091.
Iteration 4446: Policy loss: -0.016483. Value loss: 0.027846. Entropy: 1.044332.
episode: 1607   score: 330.0  epsilon: 1.0    ste

episode: 1625   score: 470.0  epsilon: 1.0    steps: 472  evaluation reward: 354.0
Training network. lr: 0.000215. clip: 0.086185
Iteration 4507: Policy loss: 0.000879. Value loss: 0.119280. Entropy: 1.128753.
Iteration 4508: Policy loss: -0.008626. Value loss: 0.053587. Entropy: 1.130294.
Iteration 4509: Policy loss: -0.018031. Value loss: 0.038550. Entropy: 1.138563.
episode: 1626   score: 455.0  epsilon: 1.0    steps: 368  evaluation reward: 355.0
episode: 1627   score: 450.0  epsilon: 1.0    steps: 456  evaluation reward: 353.4
episode: 1628   score: 360.0  epsilon: 1.0    steps: 792  evaluation reward: 354.9
Training network. lr: 0.000215. clip: 0.086185
Iteration 4510: Policy loss: 0.000861. Value loss: 0.085378. Entropy: 0.992319.
Iteration 4511: Policy loss: -0.008696. Value loss: 0.043911. Entropy: 0.990778.
Iteration 4512: Policy loss: -0.011224. Value loss: 0.035664. Entropy: 0.990314.
episode: 1629   score: 285.0  epsilon: 1.0    steps: 728  evaluation reward: 351.5
Trainin

Training network. lr: 0.000215. clip: 0.086029
Iteration 4573: Policy loss: 0.004422. Value loss: 0.124348. Entropy: 1.170570.
Iteration 4574: Policy loss: -0.002743. Value loss: 0.044828. Entropy: 1.193307.
Iteration 4575: Policy loss: -0.009859. Value loss: 0.032811. Entropy: 1.187013.
episode: 1648   score: 210.0  epsilon: 1.0    steps: 800  evaluation reward: 348.35
Training network. lr: 0.000215. clip: 0.086029
Iteration 4576: Policy loss: 0.002845. Value loss: 0.057055. Entropy: 1.112690.
Iteration 4577: Policy loss: -0.011780. Value loss: 0.029896. Entropy: 1.106117.
Iteration 4578: Policy loss: -0.020252. Value loss: 0.022607. Entropy: 1.104551.
Training network. lr: 0.000215. clip: 0.086029
Iteration 4579: Policy loss: 0.001339. Value loss: 0.491832. Entropy: 1.110925.
Iteration 4580: Policy loss: 0.001652. Value loss: 0.275378. Entropy: 1.116828.
Iteration 4581: Policy loss: -0.004990. Value loss: 0.128732. Entropy: 1.103012.
episode: 1649   score: 620.0  epsilon: 1.0    step

Training network. lr: 0.000215. clip: 0.085872
Iteration 4642: Policy loss: 0.003318. Value loss: 0.079519. Entropy: 1.180299.
Iteration 4643: Policy loss: -0.009977. Value loss: 0.033052. Entropy: 1.162351.
Iteration 4644: Policy loss: -0.015978. Value loss: 0.019888. Entropy: 1.167737.
episode: 1667   score: 520.0  epsilon: 1.0    steps: 88  evaluation reward: 355.95
episode: 1668   score: 285.0  epsilon: 1.0    steps: 168  evaluation reward: 355.15
episode: 1669   score: 260.0  epsilon: 1.0    steps: 816  evaluation reward: 355.35
Training network. lr: 0.000215. clip: 0.085872
Iteration 4645: Policy loss: 0.001535. Value loss: 0.123685. Entropy: 1.046625.
Iteration 4646: Policy loss: -0.007876. Value loss: 0.054023. Entropy: 1.043624.
Iteration 4647: Policy loss: -0.012955. Value loss: 0.043434. Entropy: 1.037265.
episode: 1670   score: 535.0  epsilon: 1.0    steps: 64  evaluation reward: 357.25
episode: 1671   score: 375.0  epsilon: 1.0    steps: 816  evaluation reward: 355.4
Train

Iteration 4709: Policy loss: -0.007945. Value loss: 0.057409. Entropy: 1.109262.
Iteration 4710: Policy loss: -0.012682. Value loss: 0.040598. Entropy: 1.120284.
episode: 1688   score: 275.0  epsilon: 1.0    steps: 592  evaluation reward: 373.95
Training network. lr: 0.000214. clip: 0.085568
Iteration 4711: Policy loss: 0.004340. Value loss: 0.108732. Entropy: 1.112107.
Iteration 4712: Policy loss: -0.003119. Value loss: 0.056700. Entropy: 1.117045.
Iteration 4713: Policy loss: -0.006099. Value loss: 0.045861. Entropy: 1.110619.
Training network. lr: 0.000214. clip: 0.085568
Iteration 4714: Policy loss: 0.004438. Value loss: 0.158516. Entropy: 1.169712.
Iteration 4715: Policy loss: -0.007627. Value loss: 0.062767. Entropy: 1.163677.
Iteration 4716: Policy loss: -0.014027. Value loss: 0.046666. Entropy: 1.157278.
episode: 1689   score: 180.0  epsilon: 1.0    steps: 248  evaluation reward: 372.9
Training network. lr: 0.000214. clip: 0.085568
Iteration 4717: Policy loss: 0.002035. Value l

Iteration 4775: Policy loss: -0.005501. Value loss: 0.047032. Entropy: 1.059068.
Iteration 4776: Policy loss: -0.008504. Value loss: 0.038974. Entropy: 1.062024.
Training network. lr: 0.000214. clip: 0.085411
Iteration 4777: Policy loss: -0.000533. Value loss: 0.123622. Entropy: 1.115903.
Iteration 4778: Policy loss: -0.014446. Value loss: 0.048837. Entropy: 1.116857.
Iteration 4779: Policy loss: -0.018197. Value loss: 0.037644. Entropy: 1.110823.
episode: 1710   score: 420.0  epsilon: 1.0    steps: 568  evaluation reward: 365.95
Training network. lr: 0.000214. clip: 0.085411
Iteration 4780: Policy loss: 0.005007. Value loss: 0.690776. Entropy: 1.203325.
Iteration 4781: Policy loss: -0.003448. Value loss: 0.440634. Entropy: 1.198643.
Iteration 4782: Policy loss: -0.003707. Value loss: 0.271244. Entropy: 1.202197.
Training network. lr: 0.000214. clip: 0.085411
Iteration 4783: Policy loss: 0.003810. Value loss: 0.194374. Entropy: 1.102042.
Iteration 4784: Policy loss: -0.005476. Value lo

episode: 1731   score: 325.0  epsilon: 1.0    steps: 440  evaluation reward: 374.05
Training network. lr: 0.000213. clip: 0.085264
Iteration 4843: Policy loss: 0.003225. Value loss: 0.115192. Entropy: 0.947744.
Iteration 4844: Policy loss: -0.008075. Value loss: 0.065130. Entropy: 0.944193.
Iteration 4845: Policy loss: -0.010292. Value loss: 0.055276. Entropy: 0.937924.
episode: 1732   score: 330.0  epsilon: 1.0    steps: 32  evaluation reward: 373.9
Training network. lr: 0.000213. clip: 0.085264
Iteration 4846: Policy loss: -0.001290. Value loss: 0.121996. Entropy: 0.942093.
Iteration 4847: Policy loss: -0.010990. Value loss: 0.057022. Entropy: 0.929798.
Iteration 4848: Policy loss: -0.016967. Value loss: 0.040449. Entropy: 0.929450.
Training network. lr: 0.000213. clip: 0.085264
Iteration 4849: Policy loss: 0.007784. Value loss: 0.358101. Entropy: 1.060711.
Iteration 4850: Policy loss: 0.002604. Value loss: 0.120970. Entropy: 1.079092.
Iteration 4851: Policy loss: -0.006188. Value lo

Training network. lr: 0.000212. clip: 0.084950
Iteration 4909: Policy loss: 0.005297. Value loss: 0.200167. Entropy: 1.087336.
Iteration 4910: Policy loss: -0.009211. Value loss: 0.094409. Entropy: 1.082125.
Iteration 4911: Policy loss: -0.015627. Value loss: 0.062478. Entropy: 1.076894.
episode: 1753   score: 290.0  epsilon: 1.0    steps: 160  evaluation reward: 381.4
Training network. lr: 0.000212. clip: 0.084950
Iteration 4912: Policy loss: 0.000456. Value loss: 0.187997. Entropy: 1.007907.
Iteration 4913: Policy loss: -0.012956. Value loss: 0.059128. Entropy: 1.005189.
Iteration 4914: Policy loss: -0.015736. Value loss: 0.033697. Entropy: 0.997398.
episode: 1754   score: 425.0  epsilon: 1.0    steps: 976  evaluation reward: 378.0
Training network. lr: 0.000212. clip: 0.084950
Iteration 4915: Policy loss: 0.002254. Value loss: 0.089146. Entropy: 1.067720.
Iteration 4916: Policy loss: -0.007331. Value loss: 0.040022. Entropy: 1.063546.
Iteration 4917: Policy loss: -0.011841. Value lo

Iteration 4975: Policy loss: 0.002678. Value loss: 0.103877. Entropy: 0.879929.
Iteration 4976: Policy loss: -0.003475. Value loss: 0.053701. Entropy: 0.879327.
Iteration 4977: Policy loss: -0.009212. Value loss: 0.046987. Entropy: 0.881294.
episode: 1775   score: 550.0  epsilon: 1.0    steps: 864  evaluation reward: 388.0
Training network. lr: 0.000212. clip: 0.084803
Iteration 4978: Policy loss: 0.002343. Value loss: 0.093896. Entropy: 0.976029.
Iteration 4979: Policy loss: -0.005062. Value loss: 0.047032. Entropy: 0.979012.
Iteration 4980: Policy loss: -0.009107. Value loss: 0.036927. Entropy: 0.975022.
episode: 1776   score: 265.0  epsilon: 1.0    steps: 664  evaluation reward: 387.0
Training network. lr: 0.000212. clip: 0.084803
Iteration 4981: Policy loss: 0.007198. Value loss: 0.116437. Entropy: 1.088936.
Iteration 4982: Policy loss: -0.002519. Value loss: 0.064979. Entropy: 1.082026.
Iteration 4983: Policy loss: -0.005307. Value loss: 0.050600. Entropy: 1.083552.
Training netwo

Training network. lr: 0.000212. clip: 0.084646
Iteration 5044: Policy loss: 0.002355. Value loss: 0.140770. Entropy: 1.110274.
Iteration 5045: Policy loss: -0.005961. Value loss: 0.077049. Entropy: 1.106393.
Iteration 5046: Policy loss: -0.008821. Value loss: 0.062544. Entropy: 1.111381.
episode: 1795   score: 180.0  epsilon: 1.0    steps: 24  evaluation reward: 388.7
episode: 1796   score: 365.0  epsilon: 1.0    steps: 664  evaluation reward: 386.15
Training network. lr: 0.000212. clip: 0.084646
Iteration 5047: Policy loss: 0.003559. Value loss: 0.150516. Entropy: 0.956871.
Iteration 5048: Policy loss: -0.003008. Value loss: 0.070995. Entropy: 0.945846.
Iteration 5049: Policy loss: -0.006382. Value loss: 0.050673. Entropy: 0.957124.
episode: 1797   score: 490.0  epsilon: 1.0    steps: 104  evaluation reward: 386.05
Training network. lr: 0.000212. clip: 0.084646
Iteration 5050: Policy loss: 0.004927. Value loss: 0.130038. Entropy: 1.035737.
Iteration 5051: Policy loss: -0.003642. Value

Iteration 5111: Policy loss: -0.009211. Value loss: 0.043251. Entropy: 1.218856.
Iteration 5112: Policy loss: -0.015662. Value loss: 0.031758. Entropy: 1.214700.
episode: 1815   score: 345.0  epsilon: 1.0    steps: 1016  evaluation reward: 388.5
Training network. lr: 0.000211. clip: 0.084342
Iteration 5113: Policy loss: 0.001101. Value loss: 0.169272. Entropy: 1.180143.
Iteration 5114: Policy loss: -0.010196. Value loss: 0.076152. Entropy: 1.170694.
Iteration 5115: Policy loss: -0.014309. Value loss: 0.050985. Entropy: 1.173503.
episode: 1816   score: 275.0  epsilon: 1.0    steps: 616  evaluation reward: 385.75
Training network. lr: 0.000211. clip: 0.084342
Iteration 5116: Policy loss: 0.002432. Value loss: 0.142202. Entropy: 1.127153.
Iteration 5117: Policy loss: -0.011318. Value loss: 0.070041. Entropy: 1.123723.
Iteration 5118: Policy loss: -0.015279. Value loss: 0.052608. Entropy: 1.118033.
episode: 1817   score: 310.0  epsilon: 1.0    steps: 648  evaluation reward: 386.0
episode: 

Iteration 5180: Policy loss: -0.003851. Value loss: 0.069052. Entropy: 1.096175.
Iteration 5181: Policy loss: -0.006772. Value loss: 0.044343. Entropy: 1.109258.
Training network. lr: 0.000210. clip: 0.084185
Iteration 5182: Policy loss: 0.005005. Value loss: 0.159954. Entropy: 1.167873.
Iteration 5183: Policy loss: -0.008548. Value loss: 0.061463. Entropy: 1.165434.
Iteration 5184: Policy loss: -0.011847. Value loss: 0.038586. Entropy: 1.158527.
episode: 1834   score: 365.0  epsilon: 1.0    steps: 200  evaluation reward: 399.35
episode: 1835   score: 590.0  epsilon: 1.0    steps: 304  evaluation reward: 400.5
episode: 1836   score: 565.0  epsilon: 1.0    steps: 624  evaluation reward: 399.95
Training network. lr: 0.000210. clip: 0.084185
Iteration 5185: Policy loss: 0.005406. Value loss: 0.214542. Entropy: 0.976830.
Iteration 5186: Policy loss: -0.003767. Value loss: 0.067641. Entropy: 0.967304.
Iteration 5187: Policy loss: -0.010615. Value loss: 0.048225. Entropy: 0.968618.
Training 

Iteration 5245: Policy loss: 0.003301. Value loss: 0.115148. Entropy: 0.983106.
Iteration 5246: Policy loss: -0.006064. Value loss: 0.049169. Entropy: 0.983605.
Iteration 5247: Policy loss: -0.010842. Value loss: 0.036472. Entropy: 0.988851.
Training network. lr: 0.000210. clip: 0.084029
Iteration 5248: Policy loss: 0.004058. Value loss: 0.164043. Entropy: 1.094290.
Iteration 5249: Policy loss: -0.002860. Value loss: 0.076006. Entropy: 1.089867.
Iteration 5250: Policy loss: -0.010235. Value loss: 0.054501. Entropy: 1.089667.
episode: 1857   score: 270.0  epsilon: 1.0    steps: 640  evaluation reward: 395.6
Training network. lr: 0.000210. clip: 0.083881
Iteration 5251: Policy loss: 0.002922. Value loss: 0.237164. Entropy: 1.155445.
Iteration 5252: Policy loss: -0.003308. Value loss: 0.124929. Entropy: 1.160100.
Iteration 5253: Policy loss: -0.011088. Value loss: 0.089559. Entropy: 1.161446.
episode: 1858   score: 545.0  epsilon: 1.0    steps: 976  evaluation reward: 397.05
Training netw

Iteration 5314: Policy loss: 0.003237. Value loss: 0.086687. Entropy: 1.164715.
Iteration 5315: Policy loss: -0.012093. Value loss: 0.037094. Entropy: 1.161916.
Iteration 5316: Policy loss: -0.017922. Value loss: 0.025961. Entropy: 1.160769.
Training network. lr: 0.000209. clip: 0.083725
Iteration 5317: Policy loss: 0.004065. Value loss: 0.327071. Entropy: 1.071691.
Iteration 5318: Policy loss: -0.002442. Value loss: 0.231588. Entropy: 1.062386.
Iteration 5319: Policy loss: -0.007924. Value loss: 0.180209. Entropy: 1.066435.
Training network. lr: 0.000209. clip: 0.083725
Iteration 5320: Policy loss: 0.004037. Value loss: 0.100420. Entropy: 1.046412.
Iteration 5321: Policy loss: -0.007332. Value loss: 0.048991. Entropy: 1.068088.
Iteration 5322: Policy loss: -0.013407. Value loss: 0.034823. Entropy: 1.057292.
episode: 1876   score: 300.0  epsilon: 1.0    steps: 40  evaluation reward: 398.5
Training network. lr: 0.000209. clip: 0.083725
Iteration 5323: Policy loss: 0.004279. Value loss: 

Training network. lr: 0.000209. clip: 0.083568
Iteration 5383: Policy loss: 0.003234. Value loss: 0.319512. Entropy: 1.139358.
Iteration 5384: Policy loss: -0.005541. Value loss: 0.203036. Entropy: 1.144791.
Iteration 5385: Policy loss: -0.003458. Value loss: 0.145745. Entropy: 1.148322.
Training network. lr: 0.000209. clip: 0.083568
Iteration 5386: Policy loss: 0.002306. Value loss: 0.070463. Entropy: 1.175984.
Iteration 5387: Policy loss: -0.007823. Value loss: 0.031502. Entropy: 1.178894.
Iteration 5388: Policy loss: -0.015431. Value loss: 0.021217. Entropy: 1.182014.
episode: 1896   score: 625.0  epsilon: 1.0    steps: 368  evaluation reward: 399.05
Training network. lr: 0.000209. clip: 0.083568
Iteration 5389: Policy loss: 0.001592. Value loss: 0.094994. Entropy: 1.135388.
Iteration 5390: Policy loss: -0.008784. Value loss: 0.051241. Entropy: 1.127969.
Iteration 5391: Policy loss: -0.012512. Value loss: 0.038554. Entropy: 1.126532.
episode: 1897   score: 470.0  epsilon: 1.0    ste

Iteration 5449: Policy loss: 0.003041. Value loss: 0.080538. Entropy: 1.145612.
Iteration 5450: Policy loss: -0.004992. Value loss: 0.039100. Entropy: 1.135682.
Iteration 5451: Policy loss: -0.013996. Value loss: 0.024792. Entropy: 1.134518.
Training network. lr: 0.000208. clip: 0.083264
Iteration 5452: Policy loss: 0.000760. Value loss: 0.117209. Entropy: 1.113637.
Iteration 5453: Policy loss: -0.008968. Value loss: 0.063215. Entropy: 1.119101.
Iteration 5454: Policy loss: -0.014384. Value loss: 0.039449. Entropy: 1.114667.
episode: 1917   score: 350.0  epsilon: 1.0    steps: 896  evaluation reward: 397.55
episode: 1918   score: 215.0  epsilon: 1.0    steps: 1008  evaluation reward: 394.7
Training network. lr: 0.000208. clip: 0.083264
Iteration 5455: Policy loss: 0.001591. Value loss: 0.117292. Entropy: 1.172103.
Iteration 5456: Policy loss: -0.011536. Value loss: 0.053267. Entropy: 1.179854.
Iteration 5457: Policy loss: -0.018463. Value loss: 0.035791. Entropy: 1.181348.
episode: 191

episode: 1937   score: 345.0  epsilon: 1.0    steps: 760  evaluation reward: 398.45
Training network. lr: 0.000208. clip: 0.083107
Iteration 5518: Policy loss: 0.008630. Value loss: 0.155636. Entropy: 0.986195.
Iteration 5519: Policy loss: -0.004041. Value loss: 0.071397. Entropy: 0.991945.
Iteration 5520: Policy loss: -0.013643. Value loss: 0.053817. Entropy: 0.987068.
episode: 1938   score: 675.0  epsilon: 1.0    steps: 680  evaluation reward: 399.0
Training network. lr: 0.000208. clip: 0.083107
Iteration 5521: Policy loss: 0.001390. Value loss: 0.143869. Entropy: 1.094979.
Iteration 5522: Policy loss: -0.006575. Value loss: 0.068706. Entropy: 1.101453.
Iteration 5523: Policy loss: -0.010656. Value loss: 0.047955. Entropy: 1.094028.
Training network. lr: 0.000208. clip: 0.083107
Iteration 5524: Policy loss: 0.005926. Value loss: 0.095519. Entropy: 1.152158.
Iteration 5525: Policy loss: -0.005845. Value loss: 0.040824. Entropy: 1.140865.
Iteration 5526: Policy loss: -0.014998. Value l

Iteration 5585: Policy loss: -0.009544. Value loss: 0.073611. Entropy: 1.117650.
Iteration 5586: Policy loss: -0.011661. Value loss: 0.048194. Entropy: 1.118279.
episode: 1957   score: 555.0  epsilon: 1.0    steps: 392  evaluation reward: 421.9
episode: 1958   score: 260.0  epsilon: 1.0    steps: 472  evaluation reward: 419.05
episode: 1959   score: 345.0  epsilon: 1.0    steps: 768  evaluation reward: 416.25
Training network. lr: 0.000207. clip: 0.082960
Iteration 5587: Policy loss: -0.000613. Value loss: 0.117331. Entropy: 0.951895.
Iteration 5588: Policy loss: -0.006526. Value loss: 0.066454. Entropy: 0.944872.
Iteration 5589: Policy loss: -0.011840. Value loss: 0.053827. Entropy: 0.951481.
episode: 1960   score: 310.0  epsilon: 1.0    steps: 584  evaluation reward: 414.05
Training network. lr: 0.000207. clip: 0.082960
Iteration 5590: Policy loss: 0.003190. Value loss: 0.131924. Entropy: 0.941577.
Iteration 5591: Policy loss: -0.004325. Value loss: 0.078608. Entropy: 0.948581.
Itera

episode: 1978   score: 560.0  epsilon: 1.0    steps: 640  evaluation reward: 411.05
Training network. lr: 0.000207. clip: 0.082646
Iteration 5653: Policy loss: 0.000899. Value loss: 0.135663. Entropy: 1.016415.
Iteration 5654: Policy loss: -0.011906. Value loss: 0.073730. Entropy: 1.013390.
Iteration 5655: Policy loss: -0.016095. Value loss: 0.052648. Entropy: 1.019309.
episode: 1979   score: 435.0  epsilon: 1.0    steps: 304  evaluation reward: 410.25
episode: 1980   score: 310.0  epsilon: 1.0    steps: 840  evaluation reward: 410.65
Training network. lr: 0.000207. clip: 0.082646
Iteration 5656: Policy loss: 0.000660. Value loss: 0.127480. Entropy: 0.962692.
Iteration 5657: Policy loss: -0.007076. Value loss: 0.065581. Entropy: 0.958913.
Iteration 5658: Policy loss: -0.014929. Value loss: 0.047453. Entropy: 0.959274.
episode: 1981   score: 315.0  epsilon: 1.0    steps: 176  evaluation reward: 409.3
episode: 1982   score: 285.0  epsilon: 1.0    steps: 392  evaluation reward: 405.45
Tra

Iteration 5718: Policy loss: -0.014264. Value loss: 0.030131. Entropy: 1.058414.
Training network. lr: 0.000206. clip: 0.082499
Iteration 5719: Policy loss: 0.006067. Value loss: 0.196342. Entropy: 1.073806.
Iteration 5720: Policy loss: -0.003655. Value loss: 0.083033. Entropy: 1.077227.
Iteration 5721: Policy loss: -0.011565. Value loss: 0.046139. Entropy: 1.083784.
now time :  2019-03-05 21:53:57.989758
episode: 2001   score: 240.0  epsilon: 1.0    steps: 920  evaluation reward: 398.1
Training network. lr: 0.000206. clip: 0.082499
Iteration 5722: Policy loss: 0.001084. Value loss: 0.100762. Entropy: 1.126913.
Iteration 5723: Policy loss: -0.014156. Value loss: 0.043437. Entropy: 1.112532.
Iteration 5724: Policy loss: -0.020071. Value loss: 0.030980. Entropy: 1.111927.
episode: 2002   score: 590.0  epsilon: 1.0    steps: 416  evaluation reward: 400.8
Training network. lr: 0.000206. clip: 0.082499
Iteration 5725: Policy loss: 0.004217. Value loss: 0.111039. Entropy: 1.048699.
Iteration

Training network. lr: 0.000206. clip: 0.082342
Iteration 5785: Policy loss: 0.001827. Value loss: 0.056786. Entropy: 0.917907.
Iteration 5786: Policy loss: -0.008969. Value loss: 0.029746. Entropy: 0.903540.
Iteration 5787: Policy loss: -0.014492. Value loss: 0.022327. Entropy: 0.902008.
episode: 2022   score: 795.0  epsilon: 1.0    steps: 216  evaluation reward: 413.7
Training network. lr: 0.000206. clip: 0.082342
Iteration 5788: Policy loss: 0.004301. Value loss: 0.105093. Entropy: 1.073851.
Iteration 5789: Policy loss: -0.006789. Value loss: 0.050392. Entropy: 1.063488.
Iteration 5790: Policy loss: -0.011103. Value loss: 0.037192. Entropy: 1.062977.
episode: 2023   score: 565.0  epsilon: 1.0    steps: 16  evaluation reward: 414.8
Training network. lr: 0.000206. clip: 0.082342
Iteration 5791: Policy loss: -0.001627. Value loss: 0.115519. Entropy: 0.976502.
Iteration 5792: Policy loss: -0.011108. Value loss: 0.053326. Entropy: 0.977238.
Iteration 5793: Policy loss: -0.016340. Value lo

Training network. lr: 0.000205. clip: 0.082038
Iteration 5851: Policy loss: 0.000761. Value loss: 0.169469. Entropy: 0.992053.
Iteration 5852: Policy loss: -0.007548. Value loss: 0.079797. Entropy: 0.980435.
Iteration 5853: Policy loss: -0.011932. Value loss: 0.054314. Entropy: 0.985135.
episode: 2044   score: 465.0  epsilon: 1.0    steps: 992  evaluation reward: 389.6
Training network. lr: 0.000205. clip: 0.082038
Iteration 5854: Policy loss: 0.003233. Value loss: 0.146207. Entropy: 1.127404.
Iteration 5855: Policy loss: -0.005222. Value loss: 0.070293. Entropy: 1.118901.
Iteration 5856: Policy loss: -0.008934. Value loss: 0.053205. Entropy: 1.122326.
episode: 2045   score: 520.0  epsilon: 1.0    steps: 744  evaluation reward: 389.35
Training network. lr: 0.000205. clip: 0.082038
Iteration 5857: Policy loss: 0.002917. Value loss: 0.070462. Entropy: 0.985433.
Iteration 5858: Policy loss: -0.008304. Value loss: 0.040490. Entropy: 0.987148.
Iteration 5859: Policy loss: -0.012789. Value l

Iteration 5916: Policy loss: -0.006128. Value loss: 0.213220. Entropy: 1.073341.
episode: 2067   score: 230.0  epsilon: 1.0    steps: 752  evaluation reward: 362.0
Training network. lr: 0.000205. clip: 0.081881
Iteration 5917: Policy loss: 0.000744. Value loss: 0.072976. Entropy: 1.101181.
Iteration 5918: Policy loss: -0.007839. Value loss: 0.037704. Entropy: 1.099352.
Iteration 5919: Policy loss: -0.015958. Value loss: 0.030325. Entropy: 1.097904.
Training network. lr: 0.000205. clip: 0.081881
Iteration 5920: Policy loss: 0.005927. Value loss: 0.107551. Entropy: 1.022009.
Iteration 5921: Policy loss: -0.002137. Value loss: 0.043812. Entropy: 1.048342.
Iteration 5922: Policy loss: -0.009737. Value loss: 0.032684. Entropy: 1.044971.
episode: 2068   score: 210.0  epsilon: 1.0    steps: 128  evaluation reward: 359.55
Training network. lr: 0.000205. clip: 0.081881
Iteration 5923: Policy loss: 0.002986. Value loss: 0.091531. Entropy: 0.969170.
Iteration 5924: Policy loss: -0.006273. Value l

Iteration 5981: Policy loss: -0.004591. Value loss: 0.087739. Entropy: 1.035616.
Iteration 5982: Policy loss: -0.009438. Value loss: 0.064314. Entropy: 1.036208.
episode: 2090   score: 210.0  epsilon: 1.0    steps: 576  evaluation reward: 367.8
Training network. lr: 0.000204. clip: 0.081725
Iteration 5983: Policy loss: 0.002132. Value loss: 0.106532. Entropy: 1.040212.
Iteration 5984: Policy loss: -0.012315. Value loss: 0.043273. Entropy: 1.027982.
Iteration 5985: Policy loss: -0.017655. Value loss: 0.031859. Entropy: 1.030032.
episode: 2091   score: 460.0  epsilon: 1.0    steps: 728  evaluation reward: 369.0
Training network. lr: 0.000204. clip: 0.081725
Iteration 5986: Policy loss: 0.006199. Value loss: 0.136383. Entropy: 1.056152.
Iteration 5987: Policy loss: -0.001988. Value loss: 0.060953. Entropy: 1.065285.
Iteration 5988: Policy loss: -0.009359. Value loss: 0.043733. Entropy: 1.061347.
Training network. lr: 0.000204. clip: 0.081725
Iteration 5989: Policy loss: 0.000111. Value lo

Training network. lr: 0.000204. clip: 0.081577
Iteration 6049: Policy loss: 0.000383. Value loss: 0.227060. Entropy: 1.078679.
Iteration 6050: Policy loss: -0.003068. Value loss: 0.095337. Entropy: 1.088087.
Iteration 6051: Policy loss: -0.008437. Value loss: 0.064064. Entropy: 1.087984.
episode: 2110   score: 345.0  epsilon: 1.0    steps: 8  evaluation reward: 372.1
Training network. lr: 0.000204. clip: 0.081421
Iteration 6052: Policy loss: 0.009304. Value loss: 0.318243. Entropy: 0.977310.
Iteration 6053: Policy loss: -0.001291. Value loss: 0.093928. Entropy: 0.955588.
Iteration 6054: Policy loss: -0.002732. Value loss: 0.053372. Entropy: 0.957245.
episode: 2111   score: 325.0  epsilon: 1.0    steps: 904  evaluation reward: 372.95
Training network. lr: 0.000204. clip: 0.081421
Iteration 6055: Policy loss: 0.002044. Value loss: 0.413882. Entropy: 1.059105.
Iteration 6056: Policy loss: -0.005722. Value loss: 0.302241. Entropy: 1.061402.
Iteration 6057: Policy loss: -0.010236. Value los

Iteration 6117: Policy loss: -0.005079. Value loss: 0.175034. Entropy: 1.096097.
Training network. lr: 0.000203. clip: 0.081264
Iteration 6118: Policy loss: 0.001472. Value loss: 0.094642. Entropy: 1.016428.
Iteration 6119: Policy loss: -0.006315. Value loss: 0.048325. Entropy: 1.025970.
Iteration 6120: Policy loss: -0.013649. Value loss: 0.032571. Entropy: 1.006595.
episode: 2130   score: 345.0  epsilon: 1.0    steps: 816  evaluation reward: 374.7
Training network. lr: 0.000203. clip: 0.081264
Iteration 6121: Policy loss: 0.001210. Value loss: 0.121521. Entropy: 1.126104.
Iteration 6122: Policy loss: -0.009378. Value loss: 0.054268. Entropy: 1.131293.
Iteration 6123: Policy loss: -0.014812. Value loss: 0.036212. Entropy: 1.129467.
episode: 2131   score: 345.0  epsilon: 1.0    steps: 104  evaluation reward: 376.3
episode: 2132   score: 285.0  epsilon: 1.0    steps: 368  evaluation reward: 375.5
Training network. lr: 0.000203. clip: 0.081264
Iteration 6124: Policy loss: 0.001589. Value 

Iteration 6185: Policy loss: -0.009709. Value loss: 0.088013. Entropy: 1.157555.
Iteration 6186: Policy loss: -0.014171. Value loss: 0.070162. Entropy: 1.159819.
episode: 2150   score: 360.0  epsilon: 1.0    steps: 168  evaluation reward: 387.35
now time :  2019-03-05 22:00:06.012584
episode: 2151   score: 605.0  epsilon: 1.0    steps: 216  evaluation reward: 391.3
episode: 2152   score: 260.0  epsilon: 1.0    steps: 624  evaluation reward: 390.0
episode: 2153   score: 345.0  epsilon: 1.0    steps: 640  evaluation reward: 390.6
Training network. lr: 0.000203. clip: 0.081116
Iteration 6187: Policy loss: 0.000085. Value loss: 0.129784. Entropy: 0.966105.
Iteration 6188: Policy loss: -0.008285. Value loss: 0.076630. Entropy: 0.972808.
Iteration 6189: Policy loss: -0.013992. Value loss: 0.060706. Entropy: 0.971212.
Training network. lr: 0.000203. clip: 0.081116
Iteration 6190: Policy loss: 0.000062. Value loss: 0.099882. Entropy: 1.025008.
Iteration 6191: Policy loss: -0.008210. Value loss

Iteration 6250: Policy loss: 0.001047. Value loss: 0.166894. Entropy: 1.019298.
Iteration 6251: Policy loss: -0.007361. Value loss: 0.073634. Entropy: 1.001462.
Iteration 6252: Policy loss: -0.011286. Value loss: 0.048121. Entropy: 1.001988.
Training network. lr: 0.000202. clip: 0.080803
Iteration 6253: Policy loss: 0.000074. Value loss: 0.118750. Entropy: 1.150497.
Iteration 6254: Policy loss: -0.007823. Value loss: 0.048467. Entropy: 1.156345.
Iteration 6255: Policy loss: -0.013683. Value loss: 0.032506. Entropy: 1.151649.
episode: 2173   score: 485.0  epsilon: 1.0    steps: 832  evaluation reward: 415.55
Training network. lr: 0.000202. clip: 0.080803
Iteration 6256: Policy loss: 0.002720. Value loss: 0.282000. Entropy: 1.125560.
Iteration 6257: Policy loss: -0.005455. Value loss: 0.095610. Entropy: 1.122519.
Iteration 6258: Policy loss: -0.007916. Value loss: 0.057817. Entropy: 1.131292.
episode: 2174   score: 310.0  epsilon: 1.0    steps: 768  evaluation reward: 414.65
Training net

episode: 2196   score: 310.0  epsilon: 1.0    steps: 928  evaluation reward: 407.4
Training network. lr: 0.000202. clip: 0.080656
Iteration 6316: Policy loss: 0.001767. Value loss: 0.111495. Entropy: 0.778305.
Iteration 6317: Policy loss: -0.011668. Value loss: 0.064334. Entropy: 0.768751.
Iteration 6318: Policy loss: -0.012638. Value loss: 0.052794. Entropy: 0.777819.
Training network. lr: 0.000202. clip: 0.080656
Iteration 6319: Policy loss: 0.003582. Value loss: 0.147333. Entropy: 0.990662.
Iteration 6320: Policy loss: -0.006260. Value loss: 0.070930. Entropy: 0.974296.
Iteration 6321: Policy loss: -0.008396. Value loss: 0.056379. Entropy: 0.971818.
Training network. lr: 0.000202. clip: 0.080656
Iteration 6322: Policy loss: 0.002471. Value loss: 0.088119. Entropy: 1.066704.
Iteration 6323: Policy loss: -0.005914. Value loss: 0.050740. Entropy: 1.076313.
Iteration 6324: Policy loss: -0.011292. Value loss: 0.041398. Entropy: 1.061620.
Training network. lr: 0.000202. clip: 0.080656
Ite

Iteration 6385: Policy loss: 0.002225. Value loss: 0.123141. Entropy: 1.063873.
Iteration 6386: Policy loss: -0.003080. Value loss: 0.057857. Entropy: 1.049475.
Iteration 6387: Policy loss: -0.010413. Value loss: 0.048741. Entropy: 1.060557.
episode: 2215   score: 465.0  epsilon: 1.0    steps: 336  evaluation reward: 406.15
Training network. lr: 0.000201. clip: 0.080499
Iteration 6388: Policy loss: -0.000314. Value loss: 0.165710. Entropy: 1.065967.
Iteration 6389: Policy loss: -0.011066. Value loss: 0.083943. Entropy: 1.064088.
Iteration 6390: Policy loss: -0.017929. Value loss: 0.061838. Entropy: 1.057886.
episode: 2216   score: 580.0  epsilon: 1.0    steps: 56  evaluation reward: 409.85
Training network. lr: 0.000201. clip: 0.080499
Iteration 6391: Policy loss: 0.002536. Value loss: 0.127261. Entropy: 1.033984.
Iteration 6392: Policy loss: -0.009818. Value loss: 0.063005. Entropy: 1.028218.
Iteration 6393: Policy loss: -0.013942. Value loss: 0.046118. Entropy: 1.029824.
Training net

episode: 2235   score: 400.0  epsilon: 1.0    steps: 224  evaluation reward: 407.3
episode: 2236   score: 215.0  epsilon: 1.0    steps: 280  evaluation reward: 402.65
episode: 2237   score: 460.0  epsilon: 1.0    steps: 712  evaluation reward: 403.95
Training network. lr: 0.000200. clip: 0.080195
Iteration 6454: Policy loss: 0.000184. Value loss: 0.083105. Entropy: 0.965721.
Iteration 6455: Policy loss: -0.008895. Value loss: 0.042636. Entropy: 0.974138.
Iteration 6456: Policy loss: -0.013006. Value loss: 0.037177. Entropy: 0.965100.
Training network. lr: 0.000200. clip: 0.080195
Iteration 6457: Policy loss: 0.004961. Value loss: 0.123865. Entropy: 1.001901.
Iteration 6458: Policy loss: -0.004225. Value loss: 0.052564. Entropy: 1.005062.
Iteration 6459: Policy loss: -0.011808. Value loss: 0.041736. Entropy: 1.002206.
episode: 2238   score: 240.0  epsilon: 1.0    steps: 232  evaluation reward: 403.2
episode: 2239   score: 465.0  epsilon: 1.0    steps: 952  evaluation reward: 402.7
Train

Iteration 6519: Policy loss: -0.007605. Value loss: 0.052087. Entropy: 1.099505.
Training network. lr: 0.000200. clip: 0.080038
Iteration 6520: Policy loss: 0.001732. Value loss: 0.114243. Entropy: 1.140401.
Iteration 6521: Policy loss: -0.007958. Value loss: 0.055740. Entropy: 1.134405.
Iteration 6522: Policy loss: -0.012794. Value loss: 0.043445. Entropy: 1.128253.
episode: 2258   score: 950.0  epsilon: 1.0    steps: 696  evaluation reward: 399.15
Training network. lr: 0.000200. clip: 0.080038
Iteration 6523: Policy loss: 0.002528. Value loss: 0.233142. Entropy: 1.160307.
Iteration 6524: Policy loss: -0.005311. Value loss: 0.101458. Entropy: 1.163968.
Iteration 6525: Policy loss: -0.011136. Value loss: 0.068887. Entropy: 1.163327.
episode: 2259   score: 605.0  epsilon: 1.0    steps: 48  evaluation reward: 401.15
Training network. lr: 0.000200. clip: 0.080038
Iteration 6526: Policy loss: -0.001435. Value loss: 0.126295. Entropy: 1.059957.
Iteration 6527: Policy loss: -0.008633. Value 

Iteration 6587: Policy loss: -0.008974. Value loss: 0.076901. Entropy: 1.111643.
Iteration 6588: Policy loss: -0.012852. Value loss: 0.055521. Entropy: 1.114574.
episode: 2278   score: 605.0  epsilon: 1.0    steps: 240  evaluation reward: 390.7
episode: 2279   score: 335.0  epsilon: 1.0    steps: 904  evaluation reward: 390.15
Training network. lr: 0.000200. clip: 0.079881
Iteration 6589: Policy loss: 0.005648. Value loss: 0.304163. Entropy: 1.048810.
Iteration 6590: Policy loss: -0.003521. Value loss: 0.112981. Entropy: 1.048358.
Iteration 6591: Policy loss: -0.006670. Value loss: 0.079801. Entropy: 1.050548.
episode: 2280   score: 260.0  epsilon: 1.0    steps: 720  evaluation reward: 390.95
Training network. lr: 0.000200. clip: 0.079881
Iteration 6592: Policy loss: 0.002317. Value loss: 0.138953. Entropy: 0.950841.
Iteration 6593: Policy loss: -0.008443. Value loss: 0.073144. Entropy: 0.936680.
Iteration 6594: Policy loss: -0.010455. Value loss: 0.051806. Entropy: 0.937099.
episode: 

episode: 2299   score: 260.0  epsilon: 1.0    steps: 776  evaluation reward: 405.65
episode: 2300   score: 540.0  epsilon: 1.0    steps: 928  evaluation reward: 407.55
Training network. lr: 0.000199. clip: 0.079577
Iteration 6655: Policy loss: 0.001428. Value loss: 0.243325. Entropy: 1.020075.
Iteration 6656: Policy loss: -0.004730. Value loss: 0.092224. Entropy: 1.012277.
Iteration 6657: Policy loss: -0.009470. Value loss: 0.059743. Entropy: 1.011911.
Training network. lr: 0.000199. clip: 0.079577
Iteration 6658: Policy loss: 0.004093. Value loss: 0.124650. Entropy: 1.019170.
Iteration 6659: Policy loss: -0.007870. Value loss: 0.056151. Entropy: 1.028505.
Iteration 6660: Policy loss: -0.012151. Value loss: 0.045273. Entropy: 1.022259.
Training network. lr: 0.000199. clip: 0.079577
Iteration 6661: Policy loss: 0.002076. Value loss: 0.104123. Entropy: 1.052797.
Iteration 6662: Policy loss: -0.010152. Value loss: 0.061710. Entropy: 1.040492.
Iteration 6663: Policy loss: -0.013611. Value 

Training network. lr: 0.000199. clip: 0.079421
Iteration 6724: Policy loss: 0.002861. Value loss: 0.179465. Entropy: 1.133369.
Iteration 6725: Policy loss: -0.006172. Value loss: 0.078361. Entropy: 1.133945.
Iteration 6726: Policy loss: -0.013532. Value loss: 0.053257. Entropy: 1.136833.
episode: 2318   score: 285.0  epsilon: 1.0    steps: 448  evaluation reward: 422.8
episode: 2319   score: 590.0  epsilon: 1.0    steps: 968  evaluation reward: 426.55
Training network. lr: 0.000199. clip: 0.079421
Iteration 6727: Policy loss: -0.000367. Value loss: 0.264272. Entropy: 1.156646.
Iteration 6728: Policy loss: -0.004188. Value loss: 0.123839. Entropy: 1.147238.
Iteration 6729: Policy loss: -0.010474. Value loss: 0.085866. Entropy: 1.159913.
episode: 2320   score: 240.0  epsilon: 1.0    steps: 144  evaluation reward: 424.35
episode: 2321   score: 260.0  epsilon: 1.0    steps: 672  evaluation reward: 421.6
Training network. lr: 0.000199. clip: 0.079421
Iteration 6730: Policy loss: 0.005234. V

Iteration 6791: Policy loss: -0.004022. Value loss: 0.052329. Entropy: 1.203301.
Iteration 6792: Policy loss: -0.011438. Value loss: 0.044561. Entropy: 1.205294.
episode: 2339   score: 255.0  epsilon: 1.0    steps: 464  evaluation reward: 415.9
Training network. lr: 0.000198. clip: 0.079273
Iteration 6793: Policy loss: 0.003925. Value loss: 0.140786. Entropy: 1.062034.
Iteration 6794: Policy loss: -0.007188. Value loss: 0.050521. Entropy: 1.062830.
Iteration 6795: Policy loss: -0.011849. Value loss: 0.035285. Entropy: 1.063418.
episode: 2340   score: 820.0  epsilon: 1.0    steps: 576  evaluation reward: 421.5
Training network. lr: 0.000198. clip: 0.079273
Iteration 6796: Policy loss: 0.001529. Value loss: 0.126095. Entropy: 1.047149.
Iteration 6797: Policy loss: -0.007237. Value loss: 0.052464. Entropy: 1.050640.
Iteration 6798: Policy loss: -0.012830. Value loss: 0.037734. Entropy: 1.050314.
episode: 2341   score: 180.0  epsilon: 1.0    steps: 176  evaluation reward: 418.0
episode: 23

Iteration 6858: Policy loss: -0.016556. Value loss: 0.028240. Entropy: 0.993046.
episode: 2360   score: 515.0  epsilon: 1.0    steps: 640  evaluation reward: 406.65
episode: 2361   score: 345.0  epsilon: 1.0    steps: 688  evaluation reward: 406.95
Training network. lr: 0.000197. clip: 0.078960
Iteration 6859: Policy loss: -0.000870. Value loss: 0.181713. Entropy: 1.037528.
Iteration 6860: Policy loss: -0.008251. Value loss: 0.061294. Entropy: 1.032207.
Iteration 6861: Policy loss: -0.014292. Value loss: 0.041592. Entropy: 1.025121.
Training network. lr: 0.000197. clip: 0.078960
Iteration 6862: Policy loss: 0.001640. Value loss: 0.054623. Entropy: 1.084674.
Iteration 6863: Policy loss: -0.011039. Value loss: 0.026555. Entropy: 1.083216.
Iteration 6864: Policy loss: -0.016064. Value loss: 0.019694. Entropy: 1.085914.
episode: 2362   score: 390.0  epsilon: 1.0    steps: 584  evaluation reward: 407.95
Training network. lr: 0.000197. clip: 0.078960
Iteration 6865: Policy loss: 0.001363. Va

Training network. lr: 0.000197. clip: 0.078812
Iteration 6925: Policy loss: 0.002396. Value loss: 0.118745. Entropy: 1.157882.
Iteration 6926: Policy loss: -0.008729. Value loss: 0.057084. Entropy: 1.157570.
Iteration 6927: Policy loss: -0.014625. Value loss: 0.039808. Entropy: 1.158780.
episode: 2381   score: 250.0  epsilon: 1.0    steps: 264  evaluation reward: 421.8
episode: 2382   score: 365.0  epsilon: 1.0    steps: 368  evaluation reward: 420.0
Training network. lr: 0.000197. clip: 0.078812
Iteration 6928: Policy loss: 0.003025. Value loss: 0.097987. Entropy: 1.023152.
Iteration 6929: Policy loss: -0.008475. Value loss: 0.037411. Entropy: 1.021381.
Iteration 6930: Policy loss: -0.014042. Value loss: 0.027288. Entropy: 1.031506.
episode: 2383   score: 310.0  epsilon: 1.0    steps: 600  evaluation reward: 418.25
Training network. lr: 0.000197. clip: 0.078812
Iteration 6931: Policy loss: 0.000966. Value loss: 0.395437. Entropy: 1.032018.
Iteration 6932: Policy loss: -0.009627. Value

Iteration 6993: Policy loss: -0.011428. Value loss: 0.084502. Entropy: 1.176523.
now time :  2019-03-05 22:10:48.352760
episode: 2401   score: 375.0  epsilon: 1.0    steps: 816  evaluation reward: 413.75
Training network. lr: 0.000197. clip: 0.078656
Iteration 6994: Policy loss: 0.004509. Value loss: 0.098417. Entropy: 1.148328.
Iteration 6995: Policy loss: -0.005577. Value loss: 0.033960. Entropy: 1.146712.
Iteration 6996: Policy loss: -0.014585. Value loss: 0.022263. Entropy: 1.147403.
episode: 2402   score: 370.0  epsilon: 1.0    steps: 584  evaluation reward: 413.0
episode: 2403   score: 575.0  epsilon: 1.0    steps: 600  evaluation reward: 416.95
Training network. lr: 0.000197. clip: 0.078656
Iteration 6997: Policy loss: 0.002921. Value loss: 0.252244. Entropy: 1.059866.
Iteration 6998: Policy loss: -0.001048. Value loss: 0.087890. Entropy: 1.053283.
Iteration 6999: Policy loss: -0.006353. Value loss: 0.052380. Entropy: 1.056268.
Training network. lr: 0.000197. clip: 0.078656
Iter

Iteration 7062: Policy loss: -0.013170. Value loss: 0.031533. Entropy: 1.103320.
Training network. lr: 0.000196. clip: 0.078352
Iteration 7063: Policy loss: 0.002541. Value loss: 0.208000. Entropy: 1.195700.
Iteration 7064: Policy loss: -0.005906. Value loss: 0.087948. Entropy: 1.202371.
Iteration 7065: Policy loss: -0.008561. Value loss: 0.061665. Entropy: 1.206228.
episode: 2420   score: 285.0  epsilon: 1.0    steps: 56  evaluation reward: 412.8
episode: 2421   score: 360.0  epsilon: 1.0    steps: 368  evaluation reward: 413.8
episode: 2422   score: 380.0  epsilon: 1.0    steps: 552  evaluation reward: 413.4
episode: 2423   score: 535.0  epsilon: 1.0    steps: 760  evaluation reward: 414.15
Training network. lr: 0.000196. clip: 0.078352
Iteration 7066: Policy loss: 0.001539. Value loss: 0.125364. Entropy: 0.954326.
Iteration 7067: Policy loss: -0.007549. Value loss: 0.061200. Entropy: 0.947458.
Iteration 7068: Policy loss: -0.015471. Value loss: 0.044175. Entropy: 0.953509.
episode: 

Iteration 7126: Policy loss: -0.002557. Value loss: 0.108120. Entropy: 1.017522.
Iteration 7127: Policy loss: -0.008973. Value loss: 0.051607. Entropy: 1.009069.
Iteration 7128: Policy loss: -0.017986. Value loss: 0.034991. Entropy: 1.015744.
episode: 2444   score: 335.0  epsilon: 1.0    steps: 120  evaluation reward: 406.65
episode: 2445   score: 435.0  epsilon: 1.0    steps: 832  evaluation reward: 406.5
Training network. lr: 0.000195. clip: 0.078195
Iteration 7129: Policy loss: 0.003180. Value loss: 0.294521. Entropy: 0.997149.
Iteration 7130: Policy loss: -0.002824. Value loss: 0.095535. Entropy: 1.002876.
Iteration 7131: Policy loss: -0.006296. Value loss: 0.069345. Entropy: 1.008818.
Training network. lr: 0.000195. clip: 0.078195
Iteration 7132: Policy loss: 0.001235. Value loss: 0.184181. Entropy: 0.914250.
Iteration 7133: Policy loss: -0.008146. Value loss: 0.059003. Entropy: 0.914820.
Iteration 7134: Policy loss: -0.011865. Value loss: 0.040593. Entropy: 0.914540.
episode: 244

Iteration 7192: Policy loss: 0.004475. Value loss: 0.347024. Entropy: 1.048777.
Iteration 7193: Policy loss: -0.000812. Value loss: 0.202107. Entropy: 1.046830.
Iteration 7194: Policy loss: -0.005841. Value loss: 0.164779. Entropy: 1.036244.
Training network. lr: 0.000195. clip: 0.078038
Iteration 7195: Policy loss: 0.006495. Value loss: 0.384975. Entropy: 1.047639.
Iteration 7196: Policy loss: -0.003042. Value loss: 0.220534. Entropy: 1.055292.
Iteration 7197: Policy loss: -0.008871. Value loss: 0.178132. Entropy: 1.056575.
Training network. lr: 0.000195. clip: 0.078038
Iteration 7198: Policy loss: 0.002915. Value loss: 0.458309. Entropy: 1.112944.
Iteration 7199: Policy loss: -0.002510. Value loss: 0.203740. Entropy: 1.122980.
Iteration 7200: Policy loss: -0.009802. Value loss: 0.154089. Entropy: 1.125828.
Training network. lr: 0.000195. clip: 0.077891
Iteration 7201: Policy loss: -0.001860. Value loss: 0.322278. Entropy: 1.037514.
Iteration 7202: Policy loss: -0.002761. Value loss: 

Training network. lr: 0.000194. clip: 0.077734
Iteration 7261: Policy loss: 0.006814. Value loss: 0.199725. Entropy: 0.883479.
Iteration 7262: Policy loss: 0.001972. Value loss: 0.117901. Entropy: 0.893683.
Iteration 7263: Policy loss: -0.003838. Value loss: 0.084728. Entropy: 0.911148.
Training network. lr: 0.000194. clip: 0.077734
Iteration 7264: Policy loss: -0.000217. Value loss: 0.157378. Entropy: 1.090226.
Iteration 7265: Policy loss: -0.008618. Value loss: 0.071434. Entropy: 1.093829.
Iteration 7266: Policy loss: -0.015699. Value loss: 0.049847. Entropy: 1.090515.
episode: 2486   score: 520.0  epsilon: 1.0    steps: 440  evaluation reward: 441.85
Training network. lr: 0.000194. clip: 0.077734
Iteration 7267: Policy loss: 0.000858. Value loss: 0.186144. Entropy: 1.015110.
Iteration 7268: Policy loss: -0.008354. Value loss: 0.088167. Entropy: 1.021383.
Iteration 7269: Policy loss: -0.013119. Value loss: 0.057806. Entropy: 1.017925.
episode: 2487   score: 420.0  epsilon: 1.0    ste

Training network. lr: 0.000194. clip: 0.077577
Iteration 7330: Policy loss: 0.003850. Value loss: 0.193795. Entropy: 1.068709.
Iteration 7331: Policy loss: -0.004876. Value loss: 0.086510. Entropy: 1.051632.
Iteration 7332: Policy loss: -0.009435. Value loss: 0.061946. Entropy: 1.052654.
episode: 2505   score: 495.0  epsilon: 1.0    steps: 48  evaluation reward: 451.55
episode: 2506   score: 520.0  epsilon: 1.0    steps: 360  evaluation reward: 448.8
episode: 2507   score: 590.0  epsilon: 1.0    steps: 920  evaluation reward: 451.3
Training network. lr: 0.000194. clip: 0.077577
Iteration 7333: Policy loss: 0.000252. Value loss: 0.151133. Entropy: 0.892614.
Iteration 7334: Policy loss: -0.009770. Value loss: 0.079520. Entropy: 0.885505.
Iteration 7335: Policy loss: -0.014820. Value loss: 0.059947. Entropy: 0.891923.
Training network. lr: 0.000194. clip: 0.077577
Iteration 7336: Policy loss: 0.002666. Value loss: 0.141493. Entropy: 0.949280.
Iteration 7337: Policy loss: -0.005333. Value 

Iteration 7399: Policy loss: 0.001409. Value loss: 0.207340. Entropy: 1.065892.
Iteration 7400: Policy loss: -0.006632. Value loss: 0.096936. Entropy: 1.058159.
Iteration 7401: Policy loss: -0.014364. Value loss: 0.067525. Entropy: 1.052114.
episode: 2523   score: 395.0  epsilon: 1.0    steps: 416  evaluation reward: 462.95
episode: 2524   score: 390.0  epsilon: 1.0    steps: 912  evaluation reward: 458.65
Training network. lr: 0.000193. clip: 0.077273
Iteration 7402: Policy loss: 0.001524. Value loss: 0.183490. Entropy: 1.064482.
Iteration 7403: Policy loss: -0.006690. Value loss: 0.096835. Entropy: 1.049515.
Iteration 7404: Policy loss: -0.012308. Value loss: 0.074526. Entropy: 1.047923.
Training network. lr: 0.000193. clip: 0.077273
Iteration 7405: Policy loss: 0.003372. Value loss: 0.229170. Entropy: 0.963434.
Iteration 7406: Policy loss: -0.007044. Value loss: 0.098448. Entropy: 0.964690.
Iteration 7407: Policy loss: -0.013141. Value loss: 0.079238. Entropy: 0.956551.
episode: 252

episode: 2543   score: 940.0  epsilon: 1.0    steps: 1008  evaluation reward: 495.6
Training network. lr: 0.000193. clip: 0.077117
Iteration 7468: Policy loss: 0.002713. Value loss: 0.210449. Entropy: 1.072682.
Iteration 7469: Policy loss: -0.006157. Value loss: 0.110241. Entropy: 1.065710.
Iteration 7470: Policy loss: -0.008024. Value loss: 0.074279. Entropy: 1.066430.
episode: 2544   score: 870.0  epsilon: 1.0    steps: 96  evaluation reward: 500.95
Training network. lr: 0.000193. clip: 0.077117
Iteration 7471: Policy loss: 0.005456. Value loss: 0.207166. Entropy: 0.971173.
Iteration 7472: Policy loss: -0.005241. Value loss: 0.091049. Entropy: 0.970397.
Iteration 7473: Policy loss: -0.011459. Value loss: 0.055888. Entropy: 0.975162.
episode: 2545   score: 545.0  epsilon: 1.0    steps: 440  evaluation reward: 502.05
episode: 2546   score: 600.0  epsilon: 1.0    steps: 496  evaluation reward: 503.85
Training network. lr: 0.000193. clip: 0.077117
Iteration 7474: Policy loss: -0.000016. 

Training network. lr: 0.000192. clip: 0.076969
Iteration 7534: Policy loss: 0.002097. Value loss: 0.185300. Entropy: 1.053765.
Iteration 7535: Policy loss: -0.009190. Value loss: 0.102336. Entropy: 1.040036.
Iteration 7536: Policy loss: -0.014878. Value loss: 0.081777. Entropy: 1.037994.
Training network. lr: 0.000192. clip: 0.076969
Iteration 7537: Policy loss: 0.000563. Value loss: 0.408319. Entropy: 1.122330.
Iteration 7538: Policy loss: -0.004600. Value loss: 0.255436. Entropy: 1.130335.
Iteration 7539: Policy loss: -0.007260. Value loss: 0.190087. Entropy: 1.125767.
Training network. lr: 0.000192. clip: 0.076969
Iteration 7540: Policy loss: 0.004457. Value loss: 0.158494. Entropy: 1.104421.
Iteration 7541: Policy loss: -0.006436. Value loss: 0.057688. Entropy: 1.095952.
Iteration 7542: Policy loss: -0.015766. Value loss: 0.037907. Entropy: 1.102726.
episode: 2565   score: 575.0  epsilon: 1.0    steps: 440  evaluation reward: 501.65
episode: 2566   score: 315.0  epsilon: 1.0    ste

episode: 2584   score: 315.0  epsilon: 1.0    steps: 496  evaluation reward: 493.45
Training network. lr: 0.000192. clip: 0.076656
Iteration 7603: Policy loss: 0.005739. Value loss: 0.151851. Entropy: 1.151481.
Iteration 7604: Policy loss: -0.007026. Value loss: 0.077327. Entropy: 1.143424.
Iteration 7605: Policy loss: -0.012904. Value loss: 0.051764. Entropy: 1.141289.
episode: 2585   score: 495.0  epsilon: 1.0    steps: 320  evaluation reward: 493.1
Training network. lr: 0.000192. clip: 0.076656
Iteration 7606: Policy loss: 0.000108. Value loss: 0.166489. Entropy: 1.058789.
Iteration 7607: Policy loss: -0.009800. Value loss: 0.064137. Entropy: 1.063274.
Iteration 7608: Policy loss: -0.016459. Value loss: 0.044858. Entropy: 1.066512.
Training network. lr: 0.000192. clip: 0.076656
Iteration 7609: Policy loss: -0.001505. Value loss: 0.226683. Entropy: 1.160001.
Iteration 7610: Policy loss: -0.009368. Value loss: 0.115713. Entropy: 1.158571.
Iteration 7611: Policy loss: -0.013272. Value 

Iteration 7670: Policy loss: -0.003234. Value loss: 0.308141. Entropy: 1.160167.
Iteration 7671: Policy loss: -0.009934. Value loss: 0.253459. Entropy: 1.158978.
episode: 2604   score: 315.0  epsilon: 1.0    steps: 936  evaluation reward: 486.0
Training network. lr: 0.000191. clip: 0.076508
Iteration 7672: Policy loss: 0.002608. Value loss: 0.427544. Entropy: 1.040276.
Iteration 7673: Policy loss: -0.004324. Value loss: 0.268097. Entropy: 1.028833.
Iteration 7674: Policy loss: -0.006416. Value loss: 0.195877. Entropy: 1.037268.
Training network. lr: 0.000191. clip: 0.076508
Iteration 7675: Policy loss: 0.001141. Value loss: 0.207039. Entropy: 1.053607.
Iteration 7676: Policy loss: -0.006320. Value loss: 0.089992. Entropy: 1.052459.
Iteration 7677: Policy loss: -0.015217. Value loss: 0.057730. Entropy: 1.054444.
episode: 2605   score: 500.0  epsilon: 1.0    steps: 120  evaluation reward: 486.05
episode: 2606   score: 620.0  epsilon: 1.0    steps: 384  evaluation reward: 487.05
Training 

Iteration 7739: Policy loss: -0.006480. Value loss: 0.064381. Entropy: 0.985439.
Iteration 7740: Policy loss: -0.011203. Value loss: 0.042304. Entropy: 0.991986.
episode: 2623   score: 260.0  epsilon: 1.0    steps: 520  evaluation reward: 487.9
episode: 2624   score: 365.0  epsilon: 1.0    steps: 560  evaluation reward: 487.65
Training network. lr: 0.000191. clip: 0.076352
Iteration 7741: Policy loss: 0.000174. Value loss: 0.204110. Entropy: 1.099576.
Iteration 7742: Policy loss: -0.007534. Value loss: 0.096539. Entropy: 1.102872.
Iteration 7743: Policy loss: -0.014145. Value loss: 0.075452. Entropy: 1.102054.
episode: 2625   score: 345.0  epsilon: 1.0    steps: 288  evaluation reward: 485.65
episode: 2626   score: 315.0  epsilon: 1.0    steps: 656  evaluation reward: 485.9
Training network. lr: 0.000191. clip: 0.076352
Iteration 7744: Policy loss: 0.001374. Value loss: 0.142227. Entropy: 0.983331.
Iteration 7745: Policy loss: -0.006113. Value loss: 0.076543. Entropy: 0.988269.
Iterati

Training network. lr: 0.000190. clip: 0.076048
Iteration 7807: Policy loss: 0.005343. Value loss: 0.459487. Entropy: 1.088432.
Iteration 7808: Policy loss: -0.002957. Value loss: 0.202769. Entropy: 1.088439.
Iteration 7809: Policy loss: -0.009065. Value loss: 0.095191. Entropy: 1.101882.
Training network. lr: 0.000190. clip: 0.076048
Iteration 7810: Policy loss: -0.000927. Value loss: 0.094742. Entropy: 1.037196.
Iteration 7811: Policy loss: -0.009450. Value loss: 0.048369. Entropy: 1.042714.
Iteration 7812: Policy loss: -0.015546. Value loss: 0.034293. Entropy: 1.035866.
Training network. lr: 0.000190. clip: 0.076048
Iteration 7813: Policy loss: 0.002370. Value loss: 0.112575. Entropy: 1.072856.
Iteration 7814: Policy loss: -0.008617. Value loss: 0.058578. Entropy: 1.073134.
Iteration 7815: Policy loss: -0.013538. Value loss: 0.041836. Entropy: 1.061945.
episode: 2644   score: 410.0  epsilon: 1.0    steps: 472  evaluation reward: 466.25
Training network. lr: 0.000190. clip: 0.076048
I

Iteration 7875: Policy loss: -0.014473. Value loss: 0.219887. Entropy: 1.155698.
Training network. lr: 0.000190. clip: 0.075891
Iteration 7876: Policy loss: -0.001806. Value loss: 0.261059. Entropy: 1.015657.
Iteration 7877: Policy loss: -0.007704. Value loss: 0.083002. Entropy: 1.013070.
Iteration 7878: Policy loss: -0.012299. Value loss: 0.042456. Entropy: 1.012988.
episode: 2663   score: 515.0  epsilon: 1.0    steps: 80  evaluation reward: 475.85
Training network. lr: 0.000190. clip: 0.075891
Iteration 7879: Policy loss: 0.000330. Value loss: 0.127350. Entropy: 1.128739.
Iteration 7880: Policy loss: -0.009483. Value loss: 0.081064. Entropy: 1.122734.
Iteration 7881: Policy loss: -0.015097. Value loss: 0.061272. Entropy: 1.123581.
episode: 2664   score: 290.0  epsilon: 1.0    steps: 504  evaluation reward: 473.55
episode: 2665   score: 560.0  epsilon: 1.0    steps: 520  evaluation reward: 473.4
Training network. lr: 0.000190. clip: 0.075891
Iteration 7882: Policy loss: 0.004064. Valu

Iteration 7942: Policy loss: 0.005006. Value loss: 0.145919. Entropy: 1.116617.
Iteration 7943: Policy loss: -0.006575. Value loss: 0.074055. Entropy: 1.094903.
Iteration 7944: Policy loss: -0.015330. Value loss: 0.051521. Entropy: 1.093389.
episode: 2684   score: 590.0  epsilon: 1.0    steps: 72  evaluation reward: 479.2
episode: 2685   score: 480.0  epsilon: 1.0    steps: 136  evaluation reward: 479.05
Training network. lr: 0.000189. clip: 0.075734
Iteration 7945: Policy loss: 0.000082. Value loss: 0.132170. Entropy: 0.984752.
Iteration 7946: Policy loss: -0.006721. Value loss: 0.069065. Entropy: 0.977072.
Iteration 7947: Policy loss: -0.009962. Value loss: 0.058113. Entropy: 0.975272.
Training network. lr: 0.000189. clip: 0.075734
Iteration 7948: Policy loss: 0.000901. Value loss: 0.194265. Entropy: 0.990452.
Iteration 7949: Policy loss: -0.008601. Value loss: 0.089915. Entropy: 1.005643.
Iteration 7950: Policy loss: -0.011451. Value loss: 0.073707. Entropy: 1.001798.
Training netwo

Training network. lr: 0.000189. clip: 0.075430
Iteration 8011: Policy loss: 0.003929. Value loss: 0.121205. Entropy: 1.105660.
Iteration 8012: Policy loss: -0.007265. Value loss: 0.060319. Entropy: 1.119383.
Iteration 8013: Policy loss: -0.014311. Value loss: 0.044032. Entropy: 1.115218.
Training network. lr: 0.000189. clip: 0.075430
Iteration 8014: Policy loss: -0.000451. Value loss: 0.116486. Entropy: 1.083712.
Iteration 8015: Policy loss: -0.007472. Value loss: 0.054800. Entropy: 1.085974.
Iteration 8016: Policy loss: -0.011025. Value loss: 0.040011. Entropy: 1.074229.
episode: 2703   score: 450.0  epsilon: 1.0    steps: 64  evaluation reward: 477.15
episode: 2704   score: 225.0  epsilon: 1.0    steps: 144  evaluation reward: 476.25
episode: 2705   score: 620.0  epsilon: 1.0    steps: 624  evaluation reward: 477.45
Training network. lr: 0.000189. clip: 0.075430
Iteration 8017: Policy loss: 0.001554. Value loss: 0.219832. Entropy: 1.051993.
Iteration 8018: Policy loss: -0.008384. Val

Iteration 8078: Policy loss: -0.010363. Value loss: 0.053010. Entropy: 1.034790.
Iteration 8079: Policy loss: -0.016488. Value loss: 0.037267. Entropy: 1.044249.
Training network. lr: 0.000188. clip: 0.075273
Iteration 8080: Policy loss: 0.001428. Value loss: 0.103310. Entropy: 1.157761.
Iteration 8081: Policy loss: -0.006829. Value loss: 0.045686. Entropy: 1.162555.
Iteration 8082: Policy loss: -0.014093. Value loss: 0.029343. Entropy: 1.160841.
episode: 2724   score: 505.0  epsilon: 1.0    steps: 512  evaluation reward: 465.5
Training network. lr: 0.000188. clip: 0.075273
Iteration 8083: Policy loss: 0.005031. Value loss: 0.064937. Entropy: 1.141207.
Iteration 8084: Policy loss: -0.003813. Value loss: 0.033378. Entropy: 1.140023.
Iteration 8085: Policy loss: -0.013258. Value loss: 0.024624. Entropy: 1.140990.
episode: 2725   score: 395.0  epsilon: 1.0    steps: 600  evaluation reward: 466.0
Training network. lr: 0.000188. clip: 0.075273
Iteration 8086: Policy loss: 0.001780. Value lo

Iteration 8147: Policy loss: -0.003451. Value loss: 0.095458. Entropy: 1.102724.
Iteration 8148: Policy loss: -0.009303. Value loss: 0.065840. Entropy: 1.104548.
Training network. lr: 0.000188. clip: 0.075126
Iteration 8149: Policy loss: 0.002019. Value loss: 0.145259. Entropy: 1.119708.
Iteration 8150: Policy loss: -0.002250. Value loss: 0.055220. Entropy: 1.112960.
Iteration 8151: Policy loss: -0.010261. Value loss: 0.036953. Entropy: 1.119998.
episode: 2743   score: 630.0  epsilon: 1.0    steps: 792  evaluation reward: 470.7
Training network. lr: 0.000187. clip: 0.074969
Iteration 8152: Policy loss: 0.004007. Value loss: 0.240360. Entropy: 1.143221.
Iteration 8153: Policy loss: -0.004407. Value loss: 0.124992. Entropy: 1.141234.
Iteration 8154: Policy loss: -0.008408. Value loss: 0.086596. Entropy: 1.150905.
episode: 2744   score: 240.0  epsilon: 1.0    steps: 48  evaluation reward: 469.0
Training network. lr: 0.000187. clip: 0.074969
Iteration 8155: Policy loss: 0.001406. Value los

Iteration 8214: Policy loss: -0.013786. Value loss: 0.031473. Entropy: 1.142575.
episode: 2764   score: 575.0  epsilon: 1.0    steps: 888  evaluation reward: 478.3
Training network. lr: 0.000187. clip: 0.074813
Iteration 8215: Policy loss: 0.004010. Value loss: 0.221322. Entropy: 1.052587.
Iteration 8216: Policy loss: -0.001585. Value loss: 0.088900. Entropy: 1.059133.
Iteration 8217: Policy loss: -0.009399. Value loss: 0.059268. Entropy: 1.051425.
episode: 2765   score: 315.0  epsilon: 1.0    steps: 592  evaluation reward: 475.85
Training network. lr: 0.000187. clip: 0.074813
Iteration 8218: Policy loss: 0.004419. Value loss: 0.310117. Entropy: 1.097715.
Iteration 8219: Policy loss: -0.002042. Value loss: 0.124417. Entropy: 1.091335.
Iteration 8220: Policy loss: -0.010981. Value loss: 0.085130. Entropy: 1.084651.
Training network. lr: 0.000187. clip: 0.074813
Iteration 8221: Policy loss: 0.005438. Value loss: 0.315742. Entropy: 1.075716.
Iteration 8222: Policy loss: -0.002540. Value l

Iteration 8283: Policy loss: -0.006005. Value loss: 0.113893. Entropy: 1.124542.
Training network. lr: 0.000187. clip: 0.074665
Iteration 8284: Policy loss: 0.000791. Value loss: 0.163120. Entropy: 1.141229.
Iteration 8285: Policy loss: -0.007274. Value loss: 0.061359. Entropy: 1.137216.
Iteration 8286: Policy loss: -0.011701. Value loss: 0.039628. Entropy: 1.139009.
episode: 2783   score: 425.0  epsilon: 1.0    steps: 984  evaluation reward: 486.4
Training network. lr: 0.000187. clip: 0.074665
Iteration 8287: Policy loss: 0.003410. Value loss: 0.337612. Entropy: 1.251303.
Iteration 8288: Policy loss: -0.004554. Value loss: 0.178847. Entropy: 1.254225.
Iteration 8289: Policy loss: -0.010183. Value loss: 0.126707. Entropy: 1.246729.
episode: 2784   score: 670.0  epsilon: 1.0    steps: 336  evaluation reward: 487.2
episode: 2785   score: 195.0  epsilon: 1.0    steps: 912  evaluation reward: 484.35
Training network. lr: 0.000187. clip: 0.074665
Iteration 8290: Policy loss: 0.003235. Value

episode: 2804   score: 375.0  epsilon: 1.0    steps: 536  evaluation reward: 480.9
episode: 2805   score: 785.0  epsilon: 1.0    steps: 1024  evaluation reward: 482.55
Training network. lr: 0.000186. clip: 0.074509
Iteration 8350: Policy loss: 0.002509. Value loss: 0.235744. Entropy: 1.067166.
Iteration 8351: Policy loss: -0.006917. Value loss: 0.162392. Entropy: 1.076001.
Iteration 8352: Policy loss: -0.011714. Value loss: 0.115706. Entropy: 1.075570.
episode: 2806   score: 290.0  epsilon: 1.0    steps: 224  evaluation reward: 483.3
Training network. lr: 0.000186. clip: 0.074352
Iteration 8353: Policy loss: 0.006940. Value loss: 0.170071. Entropy: 0.988669.
Iteration 8354: Policy loss: -0.007709. Value loss: 0.100303. Entropy: 0.978423.
Iteration 8355: Policy loss: -0.011257. Value loss: 0.080111. Entropy: 0.979122.
Training network. lr: 0.000186. clip: 0.074352
Iteration 8356: Policy loss: 0.006922. Value loss: 0.160804. Entropy: 0.980355.
Iteration 8357: Policy loss: -0.005124. Valu

Iteration 8418: Policy loss: -0.011980. Value loss: 0.066959. Entropy: 1.155231.
Training network. lr: 0.000186. clip: 0.074204
Iteration 8419: Policy loss: 0.004868. Value loss: 0.201902. Entropy: 1.190580.
Iteration 8420: Policy loss: -0.003531. Value loss: 0.094343. Entropy: 1.192383.
Iteration 8421: Policy loss: -0.010617. Value loss: 0.063984. Entropy: 1.192771.
episode: 2824   score: 180.0  epsilon: 1.0    steps: 424  evaluation reward: 485.45
Training network. lr: 0.000186. clip: 0.074204
Iteration 8422: Policy loss: 0.002872. Value loss: 0.354127. Entropy: 1.167841.
Iteration 8423: Policy loss: 0.000733. Value loss: 0.147860. Entropy: 1.170640.
Iteration 8424: Policy loss: -0.005629. Value loss: 0.098223. Entropy: 1.178682.
episode: 2825   score: 710.0  epsilon: 1.0    steps: 24  evaluation reward: 488.6
episode: 2826   score: 315.0  epsilon: 1.0    steps: 152  evaluation reward: 488.85
episode: 2827   score: 615.0  epsilon: 1.0    steps: 616  evaluation reward: 492.1
Training 

episode: 2846   score: 485.0  epsilon: 1.0    steps: 584  evaluation reward: 503.15
episode: 2847   score: 210.0  epsilon: 1.0    steps: 624  evaluation reward: 500.65
Training network. lr: 0.000185. clip: 0.074048
Iteration 8485: Policy loss: 0.004482. Value loss: 0.120243. Entropy: 0.958770.
Iteration 8486: Policy loss: -0.004839. Value loss: 0.051338. Entropy: 0.965621.
Iteration 8487: Policy loss: -0.008448. Value loss: 0.035747. Entropy: 0.955285.
episode: 2848   score: 340.0  epsilon: 1.0    steps: 1016  evaluation reward: 493.25
Training network. lr: 0.000185. clip: 0.074048
Iteration 8488: Policy loss: 0.006823. Value loss: 0.176281. Entropy: 1.060752.
Iteration 8489: Policy loss: -0.004530. Value loss: 0.056966. Entropy: 1.059673.
Iteration 8490: Policy loss: -0.011260. Value loss: 0.037319. Entropy: 1.057564.
episode: 2849   score: 180.0  epsilon: 1.0    steps: 984  evaluation reward: 488.05
Training network. lr: 0.000185. clip: 0.074048
Iteration 8491: Policy loss: 0.002472.

Iteration 8550: Policy loss: -0.006576. Value loss: 0.038140. Entropy: 1.036040.
episode: 2869   score: 670.0  epsilon: 1.0    steps: 880  evaluation reward: 464.7
Training network. lr: 0.000184. clip: 0.073744
Iteration 8551: Policy loss: 0.003573. Value loss: 0.418215. Entropy: 1.083394.
Iteration 8552: Policy loss: -0.002792. Value loss: 0.230747. Entropy: 1.081102.
Iteration 8553: Policy loss: -0.008773. Value loss: 0.139819. Entropy: 1.077650.
episode: 2870   score: 710.0  epsilon: 1.0    steps: 288  evaluation reward: 469.95
episode: 2871   score: 250.0  epsilon: 1.0    steps: 904  evaluation reward: 466.5
Training network. lr: 0.000184. clip: 0.073744
Iteration 8554: Policy loss: 0.004075. Value loss: 0.142894. Entropy: 1.017613.
Iteration 8555: Policy loss: -0.008535. Value loss: 0.072550. Entropy: 1.005884.
Iteration 8556: Policy loss: -0.015314. Value loss: 0.048748. Entropy: 1.003083.
Training network. lr: 0.000184. clip: 0.073744
Iteration 8557: Policy loss: 0.003360. Value

episode: 2891   score: 430.0  epsilon: 1.0    steps: 568  evaluation reward: 444.5
Training network. lr: 0.000184. clip: 0.073587
Iteration 8617: Policy loss: -0.000467. Value loss: 0.186866. Entropy: 0.947701.
Iteration 8618: Policy loss: -0.005318. Value loss: 0.074295. Entropy: 0.965620.
Iteration 8619: Policy loss: -0.012721. Value loss: 0.055870. Entropy: 0.955623.
episode: 2892   score: 590.0  epsilon: 1.0    steps: 640  evaluation reward: 444.9
Training network. lr: 0.000184. clip: 0.073587
Iteration 8620: Policy loss: 0.006136. Value loss: 0.240745. Entropy: 1.026508.
Iteration 8621: Policy loss: 0.000024. Value loss: 0.147911. Entropy: 1.027577.
Iteration 8622: Policy loss: -0.007283. Value loss: 0.090722. Entropy: 1.033328.
Training network. lr: 0.000184. clip: 0.073587
Iteration 8623: Policy loss: 0.001561. Value loss: 0.252043. Entropy: 1.041072.
Iteration 8624: Policy loss: -0.006125. Value loss: 0.119604. Entropy: 1.045521.
Iteration 8625: Policy loss: -0.012084. Value lo

Training network. lr: 0.000184. clip: 0.073430
Iteration 8683: Policy loss: 0.008876. Value loss: 0.163524. Entropy: 0.963338.
Iteration 8684: Policy loss: -0.007075. Value loss: 0.073428. Entropy: 0.943996.
Iteration 8685: Policy loss: -0.011945. Value loss: 0.050040. Entropy: 0.949466.
episode: 2913   score: 420.0  epsilon: 1.0    steps: 904  evaluation reward: 447.9
Training network. lr: 0.000184. clip: 0.073430
Iteration 8686: Policy loss: 0.000967. Value loss: 0.298461. Entropy: 1.032716.
Iteration 8687: Policy loss: -0.006462. Value loss: 0.138616. Entropy: 1.033564.
Iteration 8688: Policy loss: -0.011262. Value loss: 0.087547. Entropy: 1.027956.
episode: 2914   score: 285.0  epsilon: 1.0    steps: 832  evaluation reward: 447.8
episode: 2915   score: 350.0  epsilon: 1.0    steps: 896  evaluation reward: 448.6
Training network. lr: 0.000184. clip: 0.073430
Iteration 8689: Policy loss: 0.003799. Value loss: 0.111665. Entropy: 1.030003.
Iteration 8690: Policy loss: -0.005354. Value 

Iteration 8750: Policy loss: -0.003879. Value loss: 0.058665. Entropy: 1.048827.
Iteration 8751: Policy loss: -0.011602. Value loss: 0.041645. Entropy: 1.052076.
episode: 2934   score: 240.0  epsilon: 1.0    steps: 248  evaluation reward: 421.6
Training network. lr: 0.000183. clip: 0.073126
Iteration 8752: Policy loss: 0.003997. Value loss: 0.130500. Entropy: 1.098280.
Iteration 8753: Policy loss: -0.006967. Value loss: 0.060437. Entropy: 1.094802.
Iteration 8754: Policy loss: -0.013230. Value loss: 0.043448. Entropy: 1.097669.
Training network. lr: 0.000183. clip: 0.073126
Iteration 8755: Policy loss: 0.004165. Value loss: 0.170514. Entropy: 1.079866.
Iteration 8756: Policy loss: -0.003281. Value loss: 0.059934. Entropy: 1.066013.
Iteration 8757: Policy loss: -0.008603. Value loss: 0.040624. Entropy: 1.068030.
episode: 2935   score: 535.0  epsilon: 1.0    steps: 48  evaluation reward: 422.05
episode: 2936   score: 695.0  epsilon: 1.0    steps: 272  evaluation reward: 425.85
episode: 2

Iteration 8817: Policy loss: -0.014762. Value loss: 0.032164. Entropy: 1.038350.
episode: 2955   score: 740.0  epsilon: 1.0    steps: 760  evaluation reward: 431.65
Training network. lr: 0.000182. clip: 0.072969
Iteration 8818: Policy loss: 0.003355. Value loss: 0.137260. Entropy: 1.087922.
Iteration 8819: Policy loss: -0.002100. Value loss: 0.068516. Entropy: 1.085795.
Iteration 8820: Policy loss: -0.009374. Value loss: 0.049376. Entropy: 1.085639.
episode: 2956   score: 240.0  epsilon: 1.0    steps: 144  evaluation reward: 431.1
Training network. lr: 0.000182. clip: 0.072969
Iteration 8821: Policy loss: 0.008077. Value loss: 0.205662. Entropy: 1.023994.
Iteration 8822: Policy loss: 0.010939. Value loss: 0.073737. Entropy: 1.022858.
Iteration 8823: Policy loss: 0.001092. Value loss: 0.052424. Entropy: 1.011860.
episode: 2957   score: 395.0  epsilon: 1.0    steps: 752  evaluation reward: 429.35
episode: 2958   score: 390.0  epsilon: 1.0    steps: 1024  evaluation reward: 431.15
Trainin

Iteration 8886: Policy loss: -0.012735. Value loss: 0.084562. Entropy: 1.113750.
Training network. lr: 0.000182. clip: 0.072822
Iteration 8887: Policy loss: 0.006374. Value loss: 0.226684. Entropy: 1.103101.
Iteration 8888: Policy loss: 0.002827. Value loss: 0.083649. Entropy: 1.112661.
Iteration 8889: Policy loss: -0.009122. Value loss: 0.052929. Entropy: 1.115071.
episode: 2974   score: 330.0  epsilon: 1.0    steps: 80  evaluation reward: 450.3
episode: 2975   score: 210.0  epsilon: 1.0    steps: 200  evaluation reward: 448.0
episode: 2976   score: 820.0  epsilon: 1.0    steps: 648  evaluation reward: 454.1
episode: 2977   score: 650.0  epsilon: 1.0    steps: 808  evaluation reward: 454.85
Training network. lr: 0.000182. clip: 0.072822
Iteration 8890: Policy loss: 0.001580. Value loss: 0.324256. Entropy: 0.950249.
Iteration 8891: Policy loss: -0.008067. Value loss: 0.148186. Entropy: 0.953198.
Iteration 8892: Policy loss: -0.009402. Value loss: 0.110471. Entropy: 0.961173.
Training n

episode: 2996   score: 215.0  epsilon: 1.0    steps: 736  evaluation reward: 467.6
Training network. lr: 0.000181. clip: 0.072509
Iteration 8953: Policy loss: 0.001219. Value loss: 0.216785. Entropy: 0.890505.
Iteration 8954: Policy loss: -0.000877. Value loss: 0.091616. Entropy: 0.893941.
Iteration 8955: Policy loss: -0.006301. Value loss: 0.067550. Entropy: 0.892957.
episode: 2997   score: 615.0  epsilon: 1.0    steps: 808  evaluation reward: 471.5
Training network. lr: 0.000181. clip: 0.072509
Iteration 8956: Policy loss: 0.003327. Value loss: 0.237639. Entropy: 0.964758.
Iteration 8957: Policy loss: -0.006393. Value loss: 0.152400. Entropy: 0.952424.
Iteration 8958: Policy loss: -0.007474. Value loss: 0.114488. Entropy: 0.949877.
Training network. lr: 0.000181. clip: 0.072509
Iteration 8959: Policy loss: 0.005721. Value loss: 0.353077. Entropy: 1.004540.
Iteration 8960: Policy loss: -0.004518. Value loss: 0.137440. Entropy: 1.006449.
Iteration 8961: Policy loss: -0.009164. Value lo

Iteration 9017: Policy loss: -0.006276. Value loss: 0.144900. Entropy: 0.973232.
Iteration 9018: Policy loss: -0.011291. Value loss: 0.112060. Entropy: 0.977690.
episode: 3020   score: 260.0  epsilon: 1.0    steps: 160  evaluation reward: 447.3
Training network. lr: 0.000181. clip: 0.072361
Iteration 9019: Policy loss: 0.000506. Value loss: 0.130159. Entropy: 0.877133.
Iteration 9020: Policy loss: -0.007500. Value loss: 0.057987. Entropy: 0.882519.
Iteration 9021: Policy loss: -0.014355. Value loss: 0.043849. Entropy: 0.892448.
episode: 3021   score: 940.0  epsilon: 1.0    steps: 40  evaluation reward: 454.55
episode: 3022   score: 335.0  epsilon: 1.0    steps: 800  evaluation reward: 455.05
Training network. lr: 0.000181. clip: 0.072361
Iteration 9022: Policy loss: 0.002838. Value loss: 0.192863. Entropy: 0.855747.
Iteration 9023: Policy loss: -0.005415. Value loss: 0.100093. Entropy: 0.863684.
Iteration 9024: Policy loss: -0.007044. Value loss: 0.071957. Entropy: 0.863247.
Training n

Training network. lr: 0.000181. clip: 0.072205
Iteration 9085: Policy loss: 0.002081. Value loss: 0.271103. Entropy: 1.017561.
Iteration 9086: Policy loss: -0.004294. Value loss: 0.132877. Entropy: 1.026661.
Iteration 9087: Policy loss: -0.010546. Value loss: 0.087495. Entropy: 1.024740.
episode: 3041   score: 925.0  epsilon: 1.0    steps: 744  evaluation reward: 454.25
Training network. lr: 0.000181. clip: 0.072205
Iteration 9088: Policy loss: 0.001640. Value loss: 0.226433. Entropy: 1.053923.
Iteration 9089: Policy loss: -0.004712. Value loss: 0.108976. Entropy: 1.050715.
Iteration 9090: Policy loss: -0.011993. Value loss: 0.087091. Entropy: 1.055881.
episode: 3042   score: 365.0  epsilon: 1.0    steps: 264  evaluation reward: 455.75
episode: 3043   score: 750.0  epsilon: 1.0    steps: 496  evaluation reward: 460.4
Training network. lr: 0.000181. clip: 0.072205
Iteration 9091: Policy loss: 0.000632. Value loss: 0.158829. Entropy: 1.027523.
Iteration 9092: Policy loss: -0.009523. Valu

Training network. lr: 0.000180. clip: 0.071900
Iteration 9151: Policy loss: 0.001884. Value loss: 0.200660. Entropy: 1.173032.
Iteration 9152: Policy loss: -0.007325. Value loss: 0.095674. Entropy: 1.172516.
Iteration 9153: Policy loss: -0.012994. Value loss: 0.062611. Entropy: 1.173686.
Training network. lr: 0.000180. clip: 0.071900
Iteration 9154: Policy loss: 0.000891. Value loss: 0.125783. Entropy: 1.208056.
Iteration 9155: Policy loss: -0.010241. Value loss: 0.058953. Entropy: 1.203115.
Iteration 9156: Policy loss: -0.016730. Value loss: 0.038897. Entropy: 1.203861.
Training network. lr: 0.000180. clip: 0.071900
Iteration 9157: Policy loss: 0.003243. Value loss: 0.123117. Entropy: 1.197445.
Iteration 9158: Policy loss: -0.006790. Value loss: 0.053563. Entropy: 1.203374.
Iteration 9159: Policy loss: -0.014615. Value loss: 0.035604. Entropy: 1.201781.
episode: 3063   score: 360.0  epsilon: 1.0    steps: 40  evaluation reward: 445.95
episode: 3064   score: 215.0  epsilon: 1.0    step

Iteration 9217: Policy loss: 0.005803. Value loss: 0.327005. Entropy: 0.987874.
Iteration 9218: Policy loss: -0.001603. Value loss: 0.152371. Entropy: 0.984655.
Iteration 9219: Policy loss: -0.005958. Value loss: 0.121910. Entropy: 0.990662.
Training network. lr: 0.000179. clip: 0.071744
Iteration 9220: Policy loss: 0.003452. Value loss: 0.198728. Entropy: 1.038509.
Iteration 9221: Policy loss: -0.006649. Value loss: 0.096551. Entropy: 1.033264.
Iteration 9222: Policy loss: -0.012612. Value loss: 0.063766. Entropy: 1.041295.
episode: 3085   score: 1110.0  epsilon: 1.0    steps: 488  evaluation reward: 438.65
Training network. lr: 0.000179. clip: 0.071744
Iteration 9223: Policy loss: 0.003078. Value loss: 0.168934. Entropy: 1.029962.
Iteration 9224: Policy loss: -0.003502. Value loss: 0.082222. Entropy: 1.033077.
Iteration 9225: Policy loss: -0.011389. Value loss: 0.058736. Entropy: 1.033431.
episode: 3086   score: 440.0  epsilon: 1.0    steps: 120  evaluation reward: 440.4
Training net

episode: 3108   score: 330.0  epsilon: 1.0    steps: 904  evaluation reward: 433.55
Training network. lr: 0.000179. clip: 0.071587
Iteration 9283: Policy loss: 0.001201. Value loss: 0.293725. Entropy: 1.014493.
Iteration 9284: Policy loss: -0.007608. Value loss: 0.250791. Entropy: 1.013930.
Iteration 9285: Policy loss: -0.009966. Value loss: 0.203621. Entropy: 1.020587.
episode: 3109   score: 210.0  epsilon: 1.0    steps: 872  evaluation reward: 430.15
Training network. lr: 0.000179. clip: 0.071587
Iteration 9286: Policy loss: 0.003506. Value loss: 0.461252. Entropy: 0.981767.
Iteration 9287: Policy loss: -0.004654. Value loss: 0.297052. Entropy: 0.975924.
Iteration 9288: Policy loss: -0.003277. Value loss: 0.244403. Entropy: 0.980875.
episode: 3110   score: 670.0  epsilon: 1.0    steps: 152  evaluation reward: 432.1
episode: 3111   score: 330.0  epsilon: 1.0    steps: 816  evaluation reward: 430.35
Training network. lr: 0.000179. clip: 0.071587
Iteration 9289: Policy loss: 0.005066. V

Iteration 9349: Policy loss: 0.003901. Value loss: 0.270315. Entropy: 1.015798.
Iteration 9350: Policy loss: -0.001870. Value loss: 0.168183. Entropy: 1.013585.
Iteration 9351: Policy loss: -0.009227. Value loss: 0.126597. Entropy: 1.008613.
episode: 3130   score: 555.0  epsilon: 1.0    steps: 152  evaluation reward: 438.7
episode: 3131   score: 715.0  epsilon: 1.0    steps: 960  evaluation reward: 443.45
Training network. lr: 0.000178. clip: 0.071283
Iteration 9352: Policy loss: 0.004323. Value loss: 0.479265. Entropy: 1.074737.
Iteration 9353: Policy loss: 0.003036. Value loss: 0.204127. Entropy: 1.062616.
Iteration 9354: Policy loss: -0.001948. Value loss: 0.119887. Entropy: 1.063441.
episode: 3132   score: 625.0  epsilon: 1.0    steps: 24  evaluation reward: 447.85
episode: 3133   score: 695.0  epsilon: 1.0    steps: 776  evaluation reward: 452.7
Training network. lr: 0.000178. clip: 0.071283
Iteration 9355: Policy loss: 0.004295. Value loss: 0.223172. Entropy: 0.996385.
Iteration 

Training network. lr: 0.000178. clip: 0.071126
Iteration 9418: Policy loss: 0.004484. Value loss: 0.153695. Entropy: 1.181581.
Iteration 9419: Policy loss: -0.008760. Value loss: 0.070402. Entropy: 1.178884.
Iteration 9420: Policy loss: -0.015828. Value loss: 0.047060. Entropy: 1.183612.
episode: 3150   score: 330.0  epsilon: 1.0    steps: 24  evaluation reward: 438.85
now time :  2019-03-05 22:42:52.834635
episode: 3151   score: 700.0  epsilon: 1.0    steps: 656  evaluation reward: 440.25
Training network. lr: 0.000178. clip: 0.071126
Iteration 9421: Policy loss: 0.006371. Value loss: 0.242379. Entropy: 1.025308.
Iteration 9422: Policy loss: -0.002535. Value loss: 0.113848. Entropy: 1.026252.
Iteration 9423: Policy loss: -0.011451. Value loss: 0.076182. Entropy: 1.035863.
episode: 3152   score: 645.0  epsilon: 1.0    steps: 392  evaluation reward: 440.3
Training network. lr: 0.000178. clip: 0.071126
Iteration 9424: Policy loss: 0.002400. Value loss: 0.166023. Entropy: 1.020409.
Iterat

Iteration 9485: Policy loss: -0.004598. Value loss: 0.083592. Entropy: 1.137130.
Iteration 9486: Policy loss: -0.007768. Value loss: 0.059475. Entropy: 1.140273.
episode: 3170   score: 210.0  epsilon: 1.0    steps: 16  evaluation reward: 461.8
episode: 3171   score: 715.0  epsilon: 1.0    steps: 640  evaluation reward: 464.8
episode: 3172   score: 275.0  epsilon: 1.0    steps: 928  evaluation reward: 466.2
Training network. lr: 0.000177. clip: 0.070979
Iteration 9487: Policy loss: 0.001270. Value loss: 0.123521. Entropy: 0.901997.
Iteration 9488: Policy loss: -0.006622. Value loss: 0.066386. Entropy: 0.891062.
Iteration 9489: Policy loss: -0.008992. Value loss: 0.054021. Entropy: 0.886465.
episode: 3173   score: 555.0  epsilon: 1.0    steps: 8  evaluation reward: 465.65
Training network. lr: 0.000177. clip: 0.070979
Iteration 9490: Policy loss: 0.002932. Value loss: 0.146193. Entropy: 0.861525.
Iteration 9491: Policy loss: -0.003791. Value loss: 0.078660. Entropy: 0.868837.
Iteration 9

Iteration 9553: Policy loss: 0.005361. Value loss: 0.416890. Entropy: 1.097202.
Iteration 9554: Policy loss: 0.002833. Value loss: 0.192633. Entropy: 1.096307.
Iteration 9555: Policy loss: -0.004121. Value loss: 0.135641. Entropy: 1.095424.
episode: 3190   score: 155.0  epsilon: 1.0    steps: 464  evaluation reward: 467.9
Training network. lr: 0.000177. clip: 0.070665
Iteration 9556: Policy loss: 0.004295. Value loss: 0.222946. Entropy: 1.142489.
Iteration 9557: Policy loss: -0.004700. Value loss: 0.116200. Entropy: 1.142150.
Iteration 9558: Policy loss: -0.010388. Value loss: 0.083252. Entropy: 1.142771.
episode: 3191   score: 445.0  epsilon: 1.0    steps: 448  evaluation reward: 469.35
Training network. lr: 0.000177. clip: 0.070665
Iteration 9559: Policy loss: 0.005601. Value loss: 0.263751. Entropy: 1.054426.
Iteration 9560: Policy loss: 0.000185. Value loss: 0.079825. Entropy: 1.050100.
Iteration 9561: Policy loss: -0.010996. Value loss: 0.046749. Entropy: 1.047638.
episode: 3192  

Training network. lr: 0.000176. clip: 0.070518
Iteration 9619: Policy loss: 0.005571. Value loss: 0.134451. Entropy: 1.063358.
Iteration 9620: Policy loss: -0.007127. Value loss: 0.069212. Entropy: 1.068676.
Iteration 9621: Policy loss: -0.014360. Value loss: 0.052010. Entropy: 1.064424.
Training network. lr: 0.000176. clip: 0.070518
Iteration 9622: Policy loss: 0.003041. Value loss: 0.100369. Entropy: 0.961123.
Iteration 9623: Policy loss: -0.007553. Value loss: 0.049280. Entropy: 0.968600.
Iteration 9624: Policy loss: -0.012048. Value loss: 0.037551. Entropy: 0.974035.
episode: 3213   score: 260.0  epsilon: 1.0    steps: 560  evaluation reward: 479.3
Training network. lr: 0.000176. clip: 0.070518
Iteration 9625: Policy loss: 0.002523. Value loss: 0.133551. Entropy: 1.062842.
Iteration 9626: Policy loss: -0.005996. Value loss: 0.085553. Entropy: 1.069991.
Iteration 9627: Policy loss: -0.011487. Value loss: 0.072382. Entropy: 1.071328.
Training network. lr: 0.000176. clip: 0.070518
Ite

Iteration 9688: Policy loss: 0.001385. Value loss: 0.553488. Entropy: 1.060071.
Iteration 9689: Policy loss: 0.000094. Value loss: 0.260512. Entropy: 1.053230.
Iteration 9690: Policy loss: -0.005507. Value loss: 0.188798. Entropy: 1.058951.
episode: 3231   score: 580.0  epsilon: 1.0    steps: 184  evaluation reward: 483.55
episode: 3232   score: 650.0  epsilon: 1.0    steps: 688  evaluation reward: 483.8
Training network. lr: 0.000176. clip: 0.070361
Iteration 9691: Policy loss: 0.002365. Value loss: 0.186154. Entropy: 1.069216.
Iteration 9692: Policy loss: -0.000953. Value loss: 0.073108. Entropy: 1.071922.
Iteration 9693: Policy loss: -0.005639. Value loss: 0.041444. Entropy: 1.065076.
episode: 3233   score: 655.0  epsilon: 1.0    steps: 576  evaluation reward: 483.4
episode: 3234   score: 560.0  epsilon: 1.0    steps: 832  evaluation reward: 486.0
Training network. lr: 0.000176. clip: 0.070361
Iteration 9694: Policy loss: 0.005671. Value loss: 0.139372. Entropy: 0.972972.
Iteration 

Training network. lr: 0.000175. clip: 0.070057
Iteration 9754: Policy loss: 0.000888. Value loss: 0.341464. Entropy: 1.053275.
Iteration 9755: Policy loss: -0.006783. Value loss: 0.230123. Entropy: 1.059909.
Iteration 9756: Policy loss: -0.007295. Value loss: 0.169407. Entropy: 1.059921.
Training network. lr: 0.000175. clip: 0.070057
Iteration 9757: Policy loss: 0.003511. Value loss: 0.129843. Entropy: 1.052767.
Iteration 9758: Policy loss: -0.009385. Value loss: 0.055462. Entropy: 1.055251.
Iteration 9759: Policy loss: -0.015028. Value loss: 0.036614. Entropy: 1.052057.
Training network. lr: 0.000175. clip: 0.070057
Iteration 9760: Policy loss: 0.002380. Value loss: 0.176388. Entropy: 1.073374.
Iteration 9761: Policy loss: -0.008629. Value loss: 0.071416. Entropy: 1.084443.
Iteration 9762: Policy loss: -0.014640. Value loss: 0.048256. Entropy: 1.081083.
episode: 3254   score: 425.0  epsilon: 1.0    steps: 960  evaluation reward: 481.4
Training network. lr: 0.000175. clip: 0.070057
Ite

Iteration 9823: Policy loss: 0.003082. Value loss: 0.289160. Entropy: 1.111722.
Iteration 9824: Policy loss: -0.002287. Value loss: 0.121507. Entropy: 1.122498.
Iteration 9825: Policy loss: -0.009349. Value loss: 0.079022. Entropy: 1.114138.
episode: 3272   score: 525.0  epsilon: 1.0    steps: 128  evaluation reward: 479.0
Training network. lr: 0.000175. clip: 0.069901
Iteration 9826: Policy loss: 0.001805. Value loss: 0.206777. Entropy: 1.082488.
Iteration 9827: Policy loss: -0.008823. Value loss: 0.098887. Entropy: 1.072210.
Iteration 9828: Policy loss: -0.015738. Value loss: 0.069094. Entropy: 1.071955.
episode: 3273   score: 970.0  epsilon: 1.0    steps: 488  evaluation reward: 483.15
episode: 3274   score: 360.0  epsilon: 1.0    steps: 536  evaluation reward: 480.3
episode: 3275   score: 615.0  epsilon: 1.0    steps: 936  evaluation reward: 481.4
Training network. lr: 0.000175. clip: 0.069901
Iteration 9829: Policy loss: -0.000732. Value loss: 0.170475. Entropy: 1.034583.
Iteratio

Iteration 9891: Policy loss: -0.014007. Value loss: 0.042500. Entropy: 1.065794.
Training network. lr: 0.000174. clip: 0.069744
Iteration 9892: Policy loss: 0.005234. Value loss: 0.289228. Entropy: 1.071929.
Iteration 9893: Policy loss: -0.000122. Value loss: 0.107006. Entropy: 1.058125.
Iteration 9894: Policy loss: -0.008939. Value loss: 0.056422. Entropy: 1.061151.
episode: 3293   score: 260.0  epsilon: 1.0    steps: 80  evaluation reward: 476.6
Training network. lr: 0.000174. clip: 0.069744
Iteration 9895: Policy loss: 0.001691. Value loss: 0.133398. Entropy: 1.064042.
Iteration 9896: Policy loss: -0.005775. Value loss: 0.070132. Entropy: 1.054546.
Iteration 9897: Policy loss: -0.011790. Value loss: 0.056072. Entropy: 1.050759.
episode: 3294   score: 535.0  epsilon: 1.0    steps: 704  evaluation reward: 476.65
episode: 3295   score: 435.0  epsilon: 1.0    steps: 768  evaluation reward: 475.15
episode: 3296   score: 390.0  epsilon: 1.0    steps: 856  evaluation reward: 473.15
Trainin

episode: 3311   score: 650.0  epsilon: 1.0    steps: 232  evaluation reward: 481.05
episode: 3312   score: 720.0  epsilon: 1.0    steps: 968  evaluation reward: 482.95
episode: 3313   score: 260.0  epsilon: 1.0    steps: 1000  evaluation reward: 482.95
Training network. lr: 0.000174. clip: 0.069440
Iteration 9961: Policy loss: 0.005120. Value loss: 0.135196. Entropy: 1.087917.
Iteration 9962: Policy loss: -0.004383. Value loss: 0.070203. Entropy: 1.100820.
Iteration 9963: Policy loss: -0.011236. Value loss: 0.051065. Entropy: 1.094169.
Training network. lr: 0.000174. clip: 0.069440
Iteration 9964: Policy loss: -0.001155. Value loss: 0.111702. Entropy: 0.858435.
Iteration 9965: Policy loss: -0.011015. Value loss: 0.051427. Entropy: 0.850281.
Iteration 9966: Policy loss: -0.016496. Value loss: 0.034984. Entropy: 0.852384.
episode: 3314   score: 300.0  epsilon: 1.0    steps: 168  evaluation reward: 479.0
Training network. lr: 0.000174. clip: 0.069440
Iteration 9967: Policy loss: 0.005970.

Iteration 10028: Policy loss: -0.002948. Value loss: 0.103492. Entropy: 1.144744.
Iteration 10029: Policy loss: -0.009601. Value loss: 0.069512. Entropy: 1.133761.
episode: 3332   score: 320.0  epsilon: 1.0    steps: 216  evaluation reward: 471.55
episode: 3333   score: 305.0  epsilon: 1.0    steps: 632  evaluation reward: 468.05
Training network. lr: 0.000173. clip: 0.069283
Iteration 10030: Policy loss: 0.002807. Value loss: 0.145000. Entropy: 1.065207.
Iteration 10031: Policy loss: -0.006395. Value loss: 0.062806. Entropy: 1.062060.
Iteration 10032: Policy loss: -0.012487. Value loss: 0.046322. Entropy: 1.056443.
episode: 3334   score: 210.0  epsilon: 1.0    steps: 192  evaluation reward: 464.55
Training network. lr: 0.000173. clip: 0.069283
Iteration 10033: Policy loss: 0.005630. Value loss: 0.142169. Entropy: 0.960791.
Iteration 10034: Policy loss: -0.005821. Value loss: 0.064463. Entropy: 0.958024.
Iteration 10035: Policy loss: -0.010305. Value loss: 0.045462. Entropy: 0.957610.


Iteration 10094: Policy loss: -0.005307. Value loss: 0.059151. Entropy: 1.106686.
Iteration 10095: Policy loss: -0.011085. Value loss: 0.046520. Entropy: 1.104405.
episode: 3353   score: 975.0  epsilon: 1.0    steps: 408  evaluation reward: 476.25
Training network. lr: 0.000173. clip: 0.069136
Iteration 10096: Policy loss: 0.002588. Value loss: 0.127499. Entropy: 1.071606.
Iteration 10097: Policy loss: -0.006583. Value loss: 0.070747. Entropy: 1.073534.
Iteration 10098: Policy loss: -0.012902. Value loss: 0.051993. Entropy: 1.073448.
Training network. lr: 0.000173. clip: 0.069136
Iteration 10099: Policy loss: 0.008743. Value loss: 0.284662. Entropy: 1.090343.
Iteration 10100: Policy loss: -0.000830. Value loss: 0.146819. Entropy: 1.097915.
Iteration 10101: Policy loss: -0.008219. Value loss: 0.104790. Entropy: 1.096884.
Training network. lr: 0.000172. clip: 0.068979
Iteration 10102: Policy loss: 0.002945. Value loss: 0.189864. Entropy: 1.126749.
Iteration 10103: Policy loss: -0.008470.

Iteration 10161: Policy loss: -0.008360. Value loss: 0.115702. Entropy: 0.889943.
Training network. lr: 0.000172. clip: 0.068822
Iteration 10162: Policy loss: 0.005024. Value loss: 0.125920. Entropy: 0.909481.
Iteration 10163: Policy loss: -0.005868. Value loss: 0.069353. Entropy: 0.919337.
Iteration 10164: Policy loss: -0.011126. Value loss: 0.057370. Entropy: 0.921311.
Training network. lr: 0.000172. clip: 0.068822
Iteration 10165: Policy loss: 0.003691. Value loss: 0.212868. Entropy: 1.079522.
Iteration 10166: Policy loss: -0.005277. Value loss: 0.104645. Entropy: 1.081474.
Iteration 10167: Policy loss: -0.013076. Value loss: 0.070062. Entropy: 1.073543.
episode: 3374   score: 520.0  epsilon: 1.0    steps: 432  evaluation reward: 482.4
episode: 3375   score: 390.0  epsilon: 1.0    steps: 768  evaluation reward: 480.15
Training network. lr: 0.000172. clip: 0.068822
Iteration 10168: Policy loss: 0.005442. Value loss: 0.206239. Entropy: 0.984955.
Iteration 10169: Policy loss: -0.003409

episode: 3395   score: 375.0  epsilon: 1.0    steps: 944  evaluation reward: 481.45
Training network. lr: 0.000172. clip: 0.068675
Iteration 10228: Policy loss: 0.003499. Value loss: 0.170919. Entropy: 0.968189.
Iteration 10229: Policy loss: -0.006503. Value loss: 0.095991. Entropy: 0.967929.
Iteration 10230: Policy loss: -0.012030. Value loss: 0.069515. Entropy: 0.963863.
Training network. lr: 0.000172. clip: 0.068675
Iteration 10231: Policy loss: 0.002360. Value loss: 0.173475. Entropy: 1.001822.
Iteration 10232: Policy loss: -0.001652. Value loss: 0.085961. Entropy: 1.012321.
Iteration 10233: Policy loss: -0.004528. Value loss: 0.048460. Entropy: 1.009147.
episode: 3396   score: 425.0  epsilon: 1.0    steps: 680  evaluation reward: 481.8
Training network. lr: 0.000172. clip: 0.068675
Iteration 10234: Policy loss: 0.005412. Value loss: 0.335604. Entropy: 1.084373.
Iteration 10235: Policy loss: -0.001669. Value loss: 0.204136. Entropy: 1.083054.
Iteration 10236: Policy loss: -0.011363

episode: 3416   score: 410.0  epsilon: 1.0    steps: 416  evaluation reward: 477.2
Training network. lr: 0.000171. clip: 0.068518
Iteration 10294: Policy loss: 0.004357. Value loss: 0.159209. Entropy: 1.097029.
Iteration 10295: Policy loss: -0.001181. Value loss: 0.068739. Entropy: 1.097187.
Iteration 10296: Policy loss: -0.005898. Value loss: 0.038951. Entropy: 1.098529.
episode: 3417   score: 180.0  epsilon: 1.0    steps: 792  evaluation reward: 477.5
episode: 3418   score: 265.0  epsilon: 1.0    steps: 896  evaluation reward: 474.15
Training network. lr: 0.000171. clip: 0.068518
Iteration 10297: Policy loss: 0.004095. Value loss: 0.159113. Entropy: 1.078267.
Iteration 10298: Policy loss: -0.010245. Value loss: 0.083853. Entropy: 1.082130.
Iteration 10299: Policy loss: -0.014267. Value loss: 0.056574. Entropy: 1.083228.
episode: 3419   score: 345.0  epsilon: 1.0    steps: 912  evaluation reward: 474.8
Training network. lr: 0.000171. clip: 0.068518
Iteration 10300: Policy loss: 0.0040

Training network. lr: 0.000171. clip: 0.068214
Iteration 10360: Policy loss: 0.002016. Value loss: 0.225596. Entropy: 1.093530.
Iteration 10361: Policy loss: -0.006141. Value loss: 0.112289. Entropy: 1.091523.
Iteration 10362: Policy loss: -0.009278. Value loss: 0.077255. Entropy: 1.091830.
Training network. lr: 0.000171. clip: 0.068214
Iteration 10363: Policy loss: 0.007080. Value loss: 0.275775. Entropy: 1.156989.
Iteration 10364: Policy loss: -0.004061. Value loss: 0.072656. Entropy: 1.141539.
Iteration 10365: Policy loss: -0.009333. Value loss: 0.045003. Entropy: 1.141383.
Training network. lr: 0.000171. clip: 0.068214
Iteration 10366: Policy loss: 0.002502. Value loss: 0.371147. Entropy: 1.153559.
Iteration 10367: Policy loss: -0.002849. Value loss: 0.175528. Entropy: 1.148723.
Iteration 10368: Policy loss: -0.009304. Value loss: 0.097846. Entropy: 1.151477.
episode: 3438   score: 605.0  epsilon: 1.0    steps: 808  evaluation reward: 475.25
episode: 3439   score: 210.0  epsilon: 1

Iteration 10425: Policy loss: -0.012194. Value loss: 0.036324. Entropy: 1.068524.
episode: 3460   score: 150.0  epsilon: 1.0    steps: 704  evaluation reward: 450.35
Training network. lr: 0.000170. clip: 0.068057
Iteration 10426: Policy loss: 0.000853. Value loss: 0.222937. Entropy: 1.144006.
Iteration 10427: Policy loss: -0.007115. Value loss: 0.129914. Entropy: 1.149771.
Iteration 10428: Policy loss: -0.012450. Value loss: 0.091633. Entropy: 1.148991.
episode: 3461   score: 425.0  epsilon: 1.0    steps: 584  evaluation reward: 450.8
Training network. lr: 0.000170. clip: 0.068057
Iteration 10429: Policy loss: 0.001605. Value loss: 0.279722. Entropy: 1.140868.
Iteration 10430: Policy loss: -0.003990. Value loss: 0.110922. Entropy: 1.141598.
Iteration 10431: Policy loss: -0.010709. Value loss: 0.073401. Entropy: 1.139333.
Training network. lr: 0.000170. clip: 0.068057
Iteration 10432: Policy loss: 0.006844. Value loss: 0.157519. Entropy: 1.221658.
Iteration 10433: Policy loss: -0.002286

Iteration 10488: Policy loss: -0.008907. Value loss: 0.041132. Entropy: 1.204676.
Training network. lr: 0.000170. clip: 0.067901
Iteration 10489: Policy loss: 0.003632. Value loss: 0.070166. Entropy: 1.112821.
Iteration 10490: Policy loss: -0.002654. Value loss: 0.032358. Entropy: 1.103235.
Iteration 10491: Policy loss: -0.007064. Value loss: 0.024754. Entropy: 1.099408.
episode: 3485   score: 260.0  epsilon: 1.0    steps: 632  evaluation reward: 409.85
Training network. lr: 0.000170. clip: 0.067901
Iteration 10492: Policy loss: 0.003639. Value loss: 0.271153. Entropy: 1.133346.
Iteration 10493: Policy loss: -0.002231. Value loss: 0.100762. Entropy: 1.138817.
Iteration 10494: Policy loss: -0.010800. Value loss: 0.054695. Entropy: 1.128401.
episode: 3486   score: 485.0  epsilon: 1.0    steps: 208  evaluation reward: 409.7
episode: 3487   score: 295.0  epsilon: 1.0    steps: 392  evaluation reward: 405.25
episode: 3488   score: 220.0  epsilon: 1.0    steps: 448  evaluation reward: 405.35

Iteration 10550: Policy loss: -0.001850. Value loss: 0.123777. Entropy: 1.089168.
Iteration 10551: Policy loss: -0.009322. Value loss: 0.085211. Entropy: 1.093931.
episode: 3511   score: 210.0  epsilon: 1.0    steps: 624  evaluation reward: 365.9
Training network. lr: 0.000169. clip: 0.067597
Iteration 10552: Policy loss: 0.003873. Value loss: 0.279345. Entropy: 1.070909.
Iteration 10553: Policy loss: -0.004358. Value loss: 0.163471. Entropy: 1.081493.
Iteration 10554: Policy loss: -0.006324. Value loss: 0.087613. Entropy: 1.075251.
episode: 3512   score: 340.0  epsilon: 1.0    steps: 64  evaluation reward: 367.5
Training network. lr: 0.000169. clip: 0.067597
Iteration 10555: Policy loss: 0.002709. Value loss: 0.090796. Entropy: 1.085885.
Iteration 10556: Policy loss: -0.007258. Value loss: 0.042722. Entropy: 1.083242.
Iteration 10557: Policy loss: -0.011722. Value loss: 0.031346. Entropy: 1.071664.
episode: 3513   score: 155.0  epsilon: 1.0    steps: 888  evaluation reward: 367.2
Trai

episode: 3534   score: 390.0  epsilon: 1.0    steps: 400  evaluation reward: 340.45
episode: 3535   score: 315.0  epsilon: 1.0    steps: 456  evaluation reward: 339.2
episode: 3536   score: 355.0  epsilon: 1.0    steps: 464  evaluation reward: 339.3
Training network. lr: 0.000169. clip: 0.067440
Iteration 10615: Policy loss: 0.003071. Value loss: 0.120340. Entropy: 0.941409.
Iteration 10616: Policy loss: -0.005585. Value loss: 0.064909. Entropy: 0.942386.
Iteration 10617: Policy loss: -0.011439. Value loss: 0.053363. Entropy: 0.941608.
episode: 3537   score: 615.0  epsilon: 1.0    steps: 480  evaluation reward: 342.55
Training network. lr: 0.000169. clip: 0.067440
Iteration 10618: Policy loss: 0.004209. Value loss: 0.138267. Entropy: 1.050658.
Iteration 10619: Policy loss: -0.004503. Value loss: 0.083346. Entropy: 1.048048.
Iteration 10620: Policy loss: -0.009074. Value loss: 0.069126. Entropy: 1.057737.
Training network. lr: 0.000169. clip: 0.067440
Iteration 10621: Policy loss: 0.003

Training network. lr: 0.000168. clip: 0.067292
Iteration 10678: Policy loss: 0.000214. Value loss: 0.111879. Entropy: 1.103873.
Iteration 10679: Policy loss: -0.006842. Value loss: 0.044007. Entropy: 1.099562.
Iteration 10680: Policy loss: -0.012403. Value loss: 0.034644. Entropy: 1.108421.
Training network. lr: 0.000168. clip: 0.067292
Iteration 10681: Policy loss: 0.007834. Value loss: 0.261170. Entropy: 1.149139.
Iteration 10682: Policy loss: 0.000435. Value loss: 0.105362. Entropy: 1.157818.
Iteration 10683: Policy loss: -0.009872. Value loss: 0.056955. Entropy: 1.148373.
episode: 3560   score: 265.0  epsilon: 1.0    steps: 648  evaluation reward: 314.1
Training network. lr: 0.000168. clip: 0.067292
Iteration 10684: Policy loss: 0.000936. Value loss: 0.128762. Entropy: 1.158581.
Iteration 10685: Policy loss: -0.009255. Value loss: 0.060324. Entropy: 1.160867.
Iteration 10686: Policy loss: -0.015637. Value loss: 0.042000. Entropy: 1.162932.
Training network. lr: 0.000168. clip: 0.06

Iteration 10741: Policy loss: 0.000470. Value loss: 0.157492. Entropy: 1.050146.
Iteration 10742: Policy loss: -0.004452. Value loss: 0.077575. Entropy: 1.058150.
Iteration 10743: Policy loss: -0.011852. Value loss: 0.057211. Entropy: 1.052998.
episode: 3585   score: 240.0  epsilon: 1.0    steps: 464  evaluation reward: 316.65
Training network. lr: 0.000168. clip: 0.067136
Iteration 10744: Policy loss: 0.004813. Value loss: 0.338726. Entropy: 1.015102.
Iteration 10745: Policy loss: 0.001269. Value loss: 0.226653. Entropy: 0.993958.
Iteration 10746: Policy loss: -0.001970. Value loss: 0.196061. Entropy: 1.010610.
Training network. lr: 0.000168. clip: 0.067136
Iteration 10747: Policy loss: 0.002130. Value loss: 0.150129. Entropy: 1.163327.
Iteration 10748: Policy loss: -0.004364. Value loss: 0.071917. Entropy: 1.168294.
Iteration 10749: Policy loss: -0.011414. Value loss: 0.050619. Entropy: 1.167731.
Training network. lr: 0.000168. clip: 0.067136
Iteration 10750: Policy loss: 0.001018. V

Iteration 10808: Policy loss: -0.008553. Value loss: 0.074991. Entropy: 1.080381.
Iteration 10809: Policy loss: -0.011964. Value loss: 0.056932. Entropy: 1.078280.
episode: 3605   score: 620.0  epsilon: 1.0    steps: 960  evaluation reward: 336.9
Training network. lr: 0.000167. clip: 0.066832
Iteration 10810: Policy loss: 0.003057. Value loss: 0.399937. Entropy: 1.196668.
Iteration 10811: Policy loss: -0.004257. Value loss: 0.187935. Entropy: 1.188878.
Iteration 10812: Policy loss: -0.007155. Value loss: 0.132428. Entropy: 1.197501.
episode: 3606   score: 270.0  epsilon: 1.0    steps: 504  evaluation reward: 337.8
Training network. lr: 0.000167. clip: 0.066832
Iteration 10813: Policy loss: 0.001834. Value loss: 0.143725. Entropy: 1.050872.
Iteration 10814: Policy loss: -0.005478. Value loss: 0.062912. Entropy: 1.053584.
Iteration 10815: Policy loss: -0.012663. Value loss: 0.041767. Entropy: 1.057240.
episode: 3607   score: 590.0  epsilon: 1.0    steps: 304  evaluation reward: 341.9
Tra

episode: 3628   score: 180.0  epsilon: 1.0    steps: 584  evaluation reward: 351.25
Training network. lr: 0.000167. clip: 0.066675
Iteration 10873: Policy loss: 0.003348. Value loss: 0.142267. Entropy: 1.007336.
Iteration 10874: Policy loss: -0.003274. Value loss: 0.075636. Entropy: 1.009377.
Iteration 10875: Policy loss: -0.007234. Value loss: 0.054237. Entropy: 1.002383.
Training network. lr: 0.000167. clip: 0.066675
Iteration 10876: Policy loss: 0.001680. Value loss: 0.105058. Entropy: 1.021909.
Iteration 10877: Policy loss: -0.007796. Value loss: 0.043628. Entropy: 1.028850.
Iteration 10878: Policy loss: -0.010882. Value loss: 0.029827. Entropy: 1.029503.
episode: 3629   score: 170.0  epsilon: 1.0    steps: 392  evaluation reward: 351.15
Training network. lr: 0.000167. clip: 0.066675
Iteration 10879: Policy loss: 0.006055. Value loss: 0.349836. Entropy: 1.085049.
Iteration 10880: Policy loss: -0.000445. Value loss: 0.109290. Entropy: 1.081046.
Iteration 10881: Policy loss: -0.00458

Iteration 10936: Policy loss: 0.003001. Value loss: 0.115858. Entropy: 1.010036.
Iteration 10937: Policy loss: -0.003803. Value loss: 0.061600. Entropy: 1.004491.
Iteration 10938: Policy loss: -0.008615. Value loss: 0.042971. Entropy: 1.006741.
episode: 3652   score: 375.0  epsilon: 1.0    steps: 208  evaluation reward: 367.1
episode: 3653   score: 240.0  epsilon: 1.0    steps: 936  evaluation reward: 367.2
Training network. lr: 0.000166. clip: 0.066518
Iteration 10939: Policy loss: -0.001420. Value loss: 0.223786. Entropy: 1.007426.
Iteration 10940: Policy loss: -0.009311. Value loss: 0.100392. Entropy: 1.001070.
Iteration 10941: Policy loss: -0.012780. Value loss: 0.075790. Entropy: 0.995238.
Training network. lr: 0.000166. clip: 0.066518
Iteration 10942: Policy loss: 0.002231. Value loss: 0.237389. Entropy: 1.045329.
Iteration 10943: Policy loss: -0.000901. Value loss: 0.113783. Entropy: 1.050636.
Iteration 10944: Policy loss: -0.008711. Value loss: 0.071173. Entropy: 1.046638.
Trai

episode: 3674   score: 405.0  epsilon: 1.0    steps: 736  evaluation reward: 376.65
Training network. lr: 0.000166. clip: 0.066214
Iteration 11002: Policy loss: -0.000845. Value loss: 0.130134. Entropy: 1.104768.
Iteration 11003: Policy loss: -0.010390. Value loss: 0.059942. Entropy: 1.108271.
Iteration 11004: Policy loss: -0.016338. Value loss: 0.048570. Entropy: 1.104818.
episode: 3675   score: 500.0  epsilon: 1.0    steps: 280  evaluation reward: 376.55
Training network. lr: 0.000166. clip: 0.066214
Iteration 11005: Policy loss: 0.005201. Value loss: 0.313051. Entropy: 1.061710.
Iteration 11006: Policy loss: 0.001575. Value loss: 0.116011. Entropy: 1.065393.
Iteration 11007: Policy loss: -0.005725. Value loss: 0.079959. Entropy: 1.060573.
episode: 3676   score: 180.0  epsilon: 1.0    steps: 80  evaluation reward: 372.65
Training network. lr: 0.000166. clip: 0.066214
Iteration 11008: Policy loss: 0.002716. Value loss: 0.159384. Entropy: 1.056404.
Iteration 11009: Policy loss: -0.0073

Iteration 11066: Policy loss: -0.004789. Value loss: 0.096992. Entropy: 1.040303.
Iteration 11067: Policy loss: -0.011886. Value loss: 0.071475. Entropy: 1.034328.
episode: 3698   score: 640.0  epsilon: 1.0    steps: 8  evaluation reward: 387.75
Training network. lr: 0.000165. clip: 0.066057
Iteration 11068: Policy loss: 0.003110. Value loss: 0.123144. Entropy: 0.996240.
Iteration 11069: Policy loss: -0.005722. Value loss: 0.066158. Entropy: 0.998367.
Iteration 11070: Policy loss: -0.009639. Value loss: 0.047419. Entropy: 0.998134.
episode: 3699   score: 245.0  epsilon: 1.0    steps: 16  evaluation reward: 388.1
Training network. lr: 0.000165. clip: 0.066057
Iteration 11071: Policy loss: 0.004187. Value loss: 0.141386. Entropy: 0.994810.
Iteration 11072: Policy loss: -0.004912. Value loss: 0.065339. Entropy: 0.995684.
Iteration 11073: Policy loss: -0.010286. Value loss: 0.051993. Entropy: 0.997315.
episode: 3700   score: 275.0  epsilon: 1.0    steps: 368  evaluation reward: 386.3
now t

episode: 3721   score: 590.0  epsilon: 1.0    steps: 672  evaluation reward: 374.15
Training network. lr: 0.000165. clip: 0.065910
Iteration 11131: Policy loss: 0.002458. Value loss: 0.228103. Entropy: 1.045434.
Iteration 11132: Policy loss: -0.001393. Value loss: 0.098650. Entropy: 1.041949.
Iteration 11133: Policy loss: -0.008515. Value loss: 0.071557. Entropy: 1.038427.
episode: 3722   score: 390.0  epsilon: 1.0    steps: 656  evaluation reward: 375.65
episode: 3723   score: 615.0  epsilon: 1.0    steps: 816  evaluation reward: 375.95
Training network. lr: 0.000165. clip: 0.065910
Iteration 11134: Policy loss: 0.001121. Value loss: 0.112359. Entropy: 1.015514.
Iteration 11135: Policy loss: -0.005799. Value loss: 0.055182. Entropy: 1.017887.
Iteration 11136: Policy loss: -0.010369. Value loss: 0.043958. Entropy: 1.019336.
episode: 3724   score: 465.0  epsilon: 1.0    steps: 784  evaluation reward: 378.05
Training network. lr: 0.000165. clip: 0.065910
Iteration 11137: Policy loss: -0.

Iteration 11195: Policy loss: -0.006500. Value loss: 0.086980. Entropy: 1.108373.
Iteration 11196: Policy loss: -0.011782. Value loss: 0.061134. Entropy: 1.108149.
episode: 3745   score: 365.0  epsilon: 1.0    steps: 1024  evaluation reward: 370.0
Training network. lr: 0.000164. clip: 0.065753
Iteration 11197: Policy loss: 0.002980. Value loss: 0.356836. Entropy: 1.127945.
Iteration 11198: Policy loss: -0.006042. Value loss: 0.287406. Entropy: 1.128406.
Iteration 11199: Policy loss: -0.008939. Value loss: 0.245554. Entropy: 1.126947.
episode: 3746   score: 650.0  epsilon: 1.0    steps: 480  evaluation reward: 372.3
Training network. lr: 0.000164. clip: 0.065753
Iteration 11200: Policy loss: 0.003806. Value loss: 0.125319. Entropy: 1.079243.
Iteration 11201: Policy loss: -0.006496. Value loss: 0.058937. Entropy: 1.074109.
Iteration 11202: Policy loss: -0.013378. Value loss: 0.044064. Entropy: 1.064310.
Training network. lr: 0.000164. clip: 0.065597
Iteration 11203: Policy loss: 0.002387

Training network. lr: 0.000164. clip: 0.065449
Iteration 11263: Policy loss: -0.002308. Value loss: 0.424101. Entropy: 1.093677.
Iteration 11264: Policy loss: -0.007701. Value loss: 0.312855. Entropy: 1.088656.
Iteration 11265: Policy loss: -0.009745. Value loss: 0.281577. Entropy: 1.091002.
episode: 3764   score: 495.0  epsilon: 1.0    steps: 136  evaluation reward: 408.3
episode: 3765   score: 285.0  epsilon: 1.0    steps: 664  evaluation reward: 408.75
Training network. lr: 0.000164. clip: 0.065449
Iteration 11266: Policy loss: 0.005370. Value loss: 0.136411. Entropy: 1.051751.
Iteration 11267: Policy loss: -0.002226. Value loss: 0.081125. Entropy: 1.049508.
Iteration 11268: Policy loss: -0.009516. Value loss: 0.068953. Entropy: 1.048165.
Training network. lr: 0.000164. clip: 0.065449
Iteration 11269: Policy loss: 0.001694. Value loss: 0.177817. Entropy: 1.086676.
Iteration 11270: Policy loss: -0.003164. Value loss: 0.078089. Entropy: 1.073633.
Iteration 11271: Policy loss: -0.00889

Training network. lr: 0.000163. clip: 0.065293
Iteration 11329: Policy loss: 0.002111. Value loss: 0.117353. Entropy: 1.094236.
Iteration 11330: Policy loss: -0.006117. Value loss: 0.066603. Entropy: 1.096441.
Iteration 11331: Policy loss: -0.011025. Value loss: 0.052886. Entropy: 1.095460.
Training network. lr: 0.000163. clip: 0.065293
Iteration 11332: Policy loss: 0.000382. Value loss: 0.134380. Entropy: 1.177578.
Iteration 11333: Policy loss: -0.006642. Value loss: 0.069489. Entropy: 1.176938.
Iteration 11334: Policy loss: -0.011543. Value loss: 0.046328. Entropy: 1.177380.
episode: 3786   score: 285.0  epsilon: 1.0    steps: 920  evaluation reward: 420.3
episode: 3787   score: 550.0  epsilon: 1.0    steps: 944  evaluation reward: 423.35
Training network. lr: 0.000163. clip: 0.065293
Iteration 11335: Policy loss: 0.005296. Value loss: 0.173596. Entropy: 1.183819.
Iteration 11336: Policy loss: -0.004644. Value loss: 0.078930. Entropy: 1.184734.
Iteration 11337: Policy loss: -0.007564

Iteration 11393: Policy loss: -0.004210. Value loss: 0.046117. Entropy: 1.186784.
Iteration 11394: Policy loss: -0.009329. Value loss: 0.032977. Entropy: 1.183206.
episode: 3809   score: 285.0  epsilon: 1.0    steps: 592  evaluation reward: 411.35
Training network. lr: 0.000163. clip: 0.065136
Iteration 11395: Policy loss: 0.006161. Value loss: 0.087666. Entropy: 1.139812.
Iteration 11396: Policy loss: 0.001458. Value loss: 0.047244. Entropy: 1.117049.
Iteration 11397: Policy loss: -0.004095. Value loss: 0.038826. Entropy: 1.129811.
episode: 3810   score: 285.0  epsilon: 1.0    steps: 496  evaluation reward: 410.3
episode: 3811   score: 105.0  epsilon: 1.0    steps: 688  evaluation reward: 408.95
Training network. lr: 0.000163. clip: 0.065136
Iteration 11398: Policy loss: 0.002304. Value loss: 0.346687. Entropy: 1.097359.
Iteration 11399: Policy loss: -0.005846. Value loss: 0.247614. Entropy: 1.094436.
Iteration 11400: Policy loss: -0.005504. Value loss: 0.228338. Entropy: 1.100857.
ep

episode: 3832   score: 820.0  epsilon: 1.0    steps: 352  evaluation reward: 409.6
Training network. lr: 0.000162. clip: 0.064832
Iteration 11458: Policy loss: 0.005712. Value loss: 0.143594. Entropy: 1.010615.
Iteration 11459: Policy loss: -0.004468. Value loss: 0.071294. Entropy: 1.015268.
Iteration 11460: Policy loss: -0.005619. Value loss: 0.058522. Entropy: 1.005742.
episode: 3833   score: 400.0  epsilon: 1.0    steps: 1016  evaluation reward: 411.5
Training network. lr: 0.000162. clip: 0.064832
Iteration 11461: Policy loss: 0.002397. Value loss: 0.113571. Entropy: 1.108036.
Iteration 11462: Policy loss: -0.004810. Value loss: 0.059013. Entropy: 1.110841.
Iteration 11463: Policy loss: -0.010398. Value loss: 0.047031. Entropy: 1.103833.
Training network. lr: 0.000162. clip: 0.064832
Iteration 11464: Policy loss: 0.002401. Value loss: 0.125647. Entropy: 1.112407.
Iteration 11465: Policy loss: -0.005688. Value loss: 0.063874. Entropy: 1.103693.
Iteration 11466: Policy loss: -0.013618

Iteration 11521: Policy loss: 0.001339. Value loss: 0.121917. Entropy: 1.085256.
Iteration 11522: Policy loss: -0.008816. Value loss: 0.055767. Entropy: 1.079836.
Iteration 11523: Policy loss: -0.012825. Value loss: 0.041850. Entropy: 1.081390.
Training network. lr: 0.000162. clip: 0.064675
Iteration 11524: Policy loss: 0.004131. Value loss: 0.227222. Entropy: 1.077767.
Iteration 11525: Policy loss: -0.002127. Value loss: 0.072348. Entropy: 1.077327.
Iteration 11526: Policy loss: -0.006807. Value loss: 0.047669. Entropy: 1.079107.
episode: 3856   score: 210.0  epsilon: 1.0    steps: 248  evaluation reward: 382.25
Training network. lr: 0.000162. clip: 0.064675
Iteration 11527: Policy loss: 0.001642. Value loss: 0.179482. Entropy: 1.118898.
Iteration 11528: Policy loss: -0.003292. Value loss: 0.077144. Entropy: 1.124784.
Iteration 11529: Policy loss: -0.008093. Value loss: 0.040510. Entropy: 1.119992.
episode: 3857   score: 455.0  epsilon: 1.0    steps: 144  evaluation reward: 383.5
Trai

Iteration 11586: Policy loss: -0.008612. Value loss: 0.086815. Entropy: 1.107955.
episode: 3879   score: 330.0  epsilon: 1.0    steps: 824  evaluation reward: 363.4
Training network. lr: 0.000161. clip: 0.064528
Iteration 11587: Policy loss: 0.001348. Value loss: 0.063364. Entropy: 1.003573.
Iteration 11588: Policy loss: -0.007950. Value loss: 0.034951. Entropy: 1.007375.
Iteration 11589: Policy loss: -0.011445. Value loss: 0.029223. Entropy: 1.010637.
Training network. lr: 0.000161. clip: 0.064528
Iteration 11590: Policy loss: 0.002202. Value loss: 0.220248. Entropy: 1.084932.
Iteration 11591: Policy loss: -0.007437. Value loss: 0.088989. Entropy: 1.091402.
Iteration 11592: Policy loss: -0.006762. Value loss: 0.059342. Entropy: 1.083966.
episode: 3880   score: 485.0  epsilon: 1.0    steps: 56  evaluation reward: 365.4
episode: 3881   score: 505.0  epsilon: 1.0    steps: 392  evaluation reward: 364.65
Training network. lr: 0.000161. clip: 0.064528
Iteration 11593: Policy loss: 0.000455

Iteration 11650: Policy loss: 0.008555. Value loss: 0.113908. Entropy: 1.082426.
Iteration 11651: Policy loss: -0.003349. Value loss: 0.061277. Entropy: 1.093146.
Iteration 11652: Policy loss: -0.006828. Value loss: 0.047785. Entropy: 1.104658.
Training network. lr: 0.000161. clip: 0.064214
Iteration 11653: Policy loss: 0.002310. Value loss: 0.253951. Entropy: 1.097623.
Iteration 11654: Policy loss: -0.002911. Value loss: 0.089678. Entropy: 1.094244.
Iteration 11655: Policy loss: -0.009903. Value loss: 0.054358. Entropy: 1.100448.
episode: 3902   score: 330.0  epsilon: 1.0    steps: 48  evaluation reward: 385.1
Training network. lr: 0.000161. clip: 0.064214
Iteration 11656: Policy loss: 0.003907. Value loss: 0.127164. Entropy: 1.053795.
Iteration 11657: Policy loss: -0.006168. Value loss: 0.057896. Entropy: 1.058694.
Iteration 11658: Policy loss: -0.011587. Value loss: 0.037210. Entropy: 1.054091.
episode: 3903   score: 285.0  epsilon: 1.0    steps: 544  evaluation reward: 384.5
Traini

Iteration 11718: Policy loss: -0.005757. Value loss: 0.071778. Entropy: 1.068845.
episode: 3922   score: 605.0  epsilon: 1.0    steps: 776  evaluation reward: 415.4
Training network. lr: 0.000160. clip: 0.064067
Iteration 11719: Policy loss: 0.001445. Value loss: 0.452530. Entropy: 1.128572.
Iteration 11720: Policy loss: -0.004025. Value loss: 0.296827. Entropy: 1.131627.
Iteration 11721: Policy loss: -0.009778. Value loss: 0.236688. Entropy: 1.123292.
episode: 3923   score: 820.0  epsilon: 1.0    steps: 944  evaluation reward: 419.65
Training network. lr: 0.000160. clip: 0.064067
Iteration 11722: Policy loss: 0.003929. Value loss: 0.222788. Entropy: 1.123888.
Iteration 11723: Policy loss: -0.003581. Value loss: 0.074862. Entropy: 1.128558.
Iteration 11724: Policy loss: -0.007032. Value loss: 0.040756. Entropy: 1.130670.
episode: 3924   score: 245.0  epsilon: 1.0    steps: 328  evaluation reward: 418.95
Training network. lr: 0.000160. clip: 0.064067
Iteration 11725: Policy loss: 0.0031

Training network. lr: 0.000160. clip: 0.063910
Iteration 11785: Policy loss: 0.001034. Value loss: 0.142964. Entropy: 1.141636.
Iteration 11786: Policy loss: -0.006581. Value loss: 0.072860. Entropy: 1.138993.
Iteration 11787: Policy loss: -0.014970. Value loss: 0.054784. Entropy: 1.133652.
Training network. lr: 0.000160. clip: 0.063910
Iteration 11788: Policy loss: 0.002346. Value loss: 0.358791. Entropy: 1.097549.
Iteration 11789: Policy loss: -0.000825. Value loss: 0.251960. Entropy: 1.083283.
Iteration 11790: Policy loss: -0.006419. Value loss: 0.183893. Entropy: 1.083633.
episode: 3943   score: 585.0  epsilon: 1.0    steps: 552  evaluation reward: 430.05
episode: 3944   score: 425.0  epsilon: 1.0    steps: 904  evaluation reward: 429.5
Training network. lr: 0.000160. clip: 0.063910
Iteration 11791: Policy loss: 0.002136. Value loss: 0.118139. Entropy: 1.095418.
Iteration 11792: Policy loss: -0.007627. Value loss: 0.057778. Entropy: 1.100521.
Iteration 11793: Policy loss: -0.013536

episode: 3964   score: 225.0  epsilon: 1.0    steps: 720  evaluation reward: 431.6
Training network. lr: 0.000159. clip: 0.063606
Iteration 11851: Policy loss: 0.001072. Value loss: 0.142632. Entropy: 1.142172.
Iteration 11852: Policy loss: -0.006723. Value loss: 0.068474. Entropy: 1.140080.
Iteration 11853: Policy loss: -0.011649. Value loss: 0.047207. Entropy: 1.136547.
episode: 3965   score: 240.0  epsilon: 1.0    steps: 80  evaluation reward: 430.25
episode: 3966   score: 345.0  epsilon: 1.0    steps: 880  evaluation reward: 430.5
Training network. lr: 0.000159. clip: 0.063606
Iteration 11854: Policy loss: 0.004107. Value loss: 0.210609. Entropy: 1.047498.
Iteration 11855: Policy loss: -0.002639. Value loss: 0.110607. Entropy: 1.045709.
Iteration 11856: Policy loss: -0.009146. Value loss: 0.076868. Entropy: 1.040620.
episode: 3967   score: 655.0  epsilon: 1.0    steps: 424  evaluation reward: 432.5
Training network. lr: 0.000159. clip: 0.063606
Iteration 11857: Policy loss: 0.00222

Iteration 11917: Policy loss: 0.000540. Value loss: 0.162052. Entropy: 1.046737.
Iteration 11918: Policy loss: -0.007728. Value loss: 0.085288. Entropy: 1.040899.
Iteration 11919: Policy loss: -0.010705. Value loss: 0.061824. Entropy: 1.047194.
episode: 3985   score: 450.0  epsilon: 1.0    steps: 880  evaluation reward: 453.9
episode: 3986   score: 575.0  epsilon: 1.0    steps: 944  evaluation reward: 456.9
Training network. lr: 0.000159. clip: 0.063449
Iteration 11920: Policy loss: 0.002701. Value loss: 0.098049. Entropy: 1.139554.
Iteration 11921: Policy loss: -0.004797. Value loss: 0.054156. Entropy: 1.134103.
Iteration 11922: Policy loss: -0.010144. Value loss: 0.043812. Entropy: 1.131826.
episode: 3987   score: 360.0  epsilon: 1.0    steps: 296  evaluation reward: 452.1
episode: 3988   score: 390.0  epsilon: 1.0    steps: 536  evaluation reward: 452.9
episode: 3989   score: 210.0  epsilon: 1.0    steps: 1024  evaluation reward: 452.9
Training network. lr: 0.000159. clip: 0.063449


episode: 4007   score: 290.0  epsilon: 1.0    steps: 544  evaluation reward: 441.4
Training network. lr: 0.000158. clip: 0.063293
Iteration 11983: Policy loss: 0.004739. Value loss: 0.207287. Entropy: 1.134557.
Iteration 11984: Policy loss: -0.001807. Value loss: 0.078822. Entropy: 1.132311.
Iteration 11985: Policy loss: -0.006481. Value loss: 0.050658. Entropy: 1.128678.
episode: 4008   score: 330.0  epsilon: 1.0    steps: 80  evaluation reward: 438.6
episode: 4009   score: 535.0  epsilon: 1.0    steps: 640  evaluation reward: 437.15
Training network. lr: 0.000158. clip: 0.063293
Iteration 11986: Policy loss: 0.003135. Value loss: 0.156650. Entropy: 0.948155.
Iteration 11987: Policy loss: -0.005451. Value loss: 0.068236. Entropy: 0.939755.
Iteration 11988: Policy loss: -0.011887. Value loss: 0.050578. Entropy: 0.944496.
Training network. lr: 0.000158. clip: 0.063293
Iteration 11989: Policy loss: 0.001504. Value loss: 0.372852. Entropy: 1.057394.
Iteration 11990: Policy loss: -0.000932

Iteration 12048: Policy loss: -0.009373. Value loss: 0.038305. Entropy: 1.009410.
Training network. lr: 0.000158. clip: 0.063145
Iteration 12049: Policy loss: 0.001901. Value loss: 0.158104. Entropy: 1.037243.
Iteration 12050: Policy loss: -0.006393. Value loss: 0.065251. Entropy: 1.046375.
Iteration 12051: Policy loss: -0.011598. Value loss: 0.043776. Entropy: 1.041956.
episode: 4030   score: 345.0  epsilon: 1.0    steps: 888  evaluation reward: 428.7
Training network. lr: 0.000157. clip: 0.062989
Iteration 12052: Policy loss: 0.004935. Value loss: 0.378011. Entropy: 1.144435.
Iteration 12053: Policy loss: 0.000289. Value loss: 0.209156. Entropy: 1.141804.
Iteration 12054: Policy loss: -0.005281. Value loss: 0.147244. Entropy: 1.141533.
Training network. lr: 0.000157. clip: 0.062989
Iteration 12055: Policy loss: 0.000688. Value loss: 0.105067. Entropy: 1.139452.
Iteration 12056: Policy loss: -0.006822. Value loss: 0.034679. Entropy: 1.139346.
Iteration 12057: Policy loss: -0.010790. V

now time :  2019-03-05 23:18:36.549732
episode: 4051   score: 615.0  epsilon: 1.0    steps: 920  evaluation reward: 437.05
Training network. lr: 0.000157. clip: 0.062832
Iteration 12115: Policy loss: 0.003217. Value loss: 0.109314. Entropy: 0.931693.
Iteration 12116: Policy loss: -0.003878. Value loss: 0.069535. Entropy: 0.934397.
Iteration 12117: Policy loss: -0.006301. Value loss: 0.055751. Entropy: 0.931154.
episode: 4052   score: 360.0  epsilon: 1.0    steps: 904  evaluation reward: 437.2
Training network. lr: 0.000157. clip: 0.062832
Iteration 12118: Policy loss: 0.005610. Value loss: 0.279917. Entropy: 1.031773.
Iteration 12119: Policy loss: -0.004150. Value loss: 0.143605. Entropy: 1.043401.
Iteration 12120: Policy loss: -0.007733. Value loss: 0.105831. Entropy: 1.042888.
Training network. lr: 0.000157. clip: 0.062832
Iteration 12121: Policy loss: 0.000211. Value loss: 0.351895. Entropy: 1.089977.
Iteration 12122: Policy loss: -0.007767. Value loss: 0.162562. Entropy: 1.100023.


Training network. lr: 0.000157. clip: 0.062684
Iteration 12181: Policy loss: 0.002052. Value loss: 0.133585. Entropy: 0.890739.
Iteration 12182: Policy loss: -0.006445. Value loss: 0.068266. Entropy: 0.892956.
Iteration 12183: Policy loss: -0.010983. Value loss: 0.053290. Entropy: 0.894396.
Training network. lr: 0.000157. clip: 0.062684
Iteration 12184: Policy loss: 0.002518. Value loss: 0.124325. Entropy: 1.134807.
Iteration 12185: Policy loss: -0.003144. Value loss: 0.067254. Entropy: 1.143161.
Iteration 12186: Policy loss: -0.009825. Value loss: 0.051503. Entropy: 1.135175.
episode: 4072   score: 435.0  epsilon: 1.0    steps: 384  evaluation reward: 450.35
Training network. lr: 0.000157. clip: 0.062684
Iteration 12187: Policy loss: 0.001104. Value loss: 0.140499. Entropy: 1.122593.
Iteration 12188: Policy loss: -0.006306. Value loss: 0.077710. Entropy: 1.122944.
Iteration 12189: Policy loss: -0.012644. Value loss: 0.058311. Entropy: 1.124955.
episode: 4073   score: 355.0  epsilon: 1

Training network. lr: 0.000156. clip: 0.062528
Iteration 12247: Policy loss: 0.004041. Value loss: 0.146650. Entropy: 1.094816.
Iteration 12248: Policy loss: -0.009616. Value loss: 0.060159. Entropy: 1.096385.
Iteration 12249: Policy loss: -0.014358. Value loss: 0.045242. Entropy: 1.103941.
episode: 4094   score: 755.0  epsilon: 1.0    steps: 344  evaluation reward: 437.2
episode: 4095   score: 330.0  epsilon: 1.0    steps: 968  evaluation reward: 437.65
Training network. lr: 0.000156. clip: 0.062528
Iteration 12250: Policy loss: 0.004335. Value loss: 0.121304. Entropy: 1.104549.
Iteration 12251: Policy loss: -0.004895. Value loss: 0.062717. Entropy: 1.105070.
Iteration 12252: Policy loss: -0.009680. Value loss: 0.045221. Entropy: 1.096660.
Training network. lr: 0.000156. clip: 0.062371
Iteration 12253: Policy loss: 0.005132. Value loss: 0.124198. Entropy: 1.101355.
Iteration 12254: Policy loss: -0.003418. Value loss: 0.058785. Entropy: 1.103867.
Iteration 12255: Policy loss: -0.008472

Iteration 12311: Policy loss: -0.004729. Value loss: 0.060906. Entropy: 1.152725.
Iteration 12312: Policy loss: -0.012637. Value loss: 0.041638. Entropy: 1.150388.
episode: 4117   score: 350.0  epsilon: 1.0    steps: 376  evaluation reward: 417.05
Training network. lr: 0.000156. clip: 0.062224
Iteration 12313: Policy loss: 0.001527. Value loss: 0.136472. Entropy: 1.048545.
Iteration 12314: Policy loss: -0.005111. Value loss: 0.062002. Entropy: 1.037533.
Iteration 12315: Policy loss: -0.007135. Value loss: 0.043149. Entropy: 1.029242.
episode: 4118   score: 235.0  epsilon: 1.0    steps: 448  evaluation reward: 416.1
episode: 4119   score: 345.0  epsilon: 1.0    steps: 480  evaluation reward: 416.25
Training network. lr: 0.000156. clip: 0.062224
Iteration 12316: Policy loss: 0.002802. Value loss: 0.225453. Entropy: 1.005644.
Iteration 12317: Policy loss: 0.003655. Value loss: 0.090178. Entropy: 1.003973.
Iteration 12318: Policy loss: -0.006916. Value loss: 0.053715. Entropy: 1.007543.
Tr

Iteration 12375: Policy loss: -0.012595. Value loss: 0.051376. Entropy: 1.002524.
Training network. lr: 0.000155. clip: 0.062067
Iteration 12376: Policy loss: 0.002984. Value loss: 0.098774. Entropy: 1.126938.
Iteration 12377: Policy loss: -0.004443. Value loss: 0.046979. Entropy: 1.134417.
Iteration 12378: Policy loss: -0.009926. Value loss: 0.034437. Entropy: 1.129723.
Training network. lr: 0.000155. clip: 0.062067
Iteration 12379: Policy loss: 0.001971. Value loss: 0.210852. Entropy: 1.182905.
Iteration 12380: Policy loss: -0.004934. Value loss: 0.078316. Entropy: 1.180679.
Iteration 12381: Policy loss: -0.012260. Value loss: 0.049233. Entropy: 1.183017.
episode: 4141   score: 390.0  epsilon: 1.0    steps: 64  evaluation reward: 391.3
episode: 4142   score: 245.0  epsilon: 1.0    steps: 392  evaluation reward: 389.6
Training network. lr: 0.000155. clip: 0.062067
Iteration 12382: Policy loss: 0.002146. Value loss: 0.099925. Entropy: 1.064791.
Iteration 12383: Policy loss: -0.004724. 

Iteration 12440: Policy loss: -0.004447. Value loss: 0.103505. Entropy: 1.117563.
Iteration 12441: Policy loss: -0.013956. Value loss: 0.064817. Entropy: 1.122368.
episode: 4163   score: 240.0  epsilon: 1.0    steps: 272  evaluation reward: 378.4
episode: 4164   score: 695.0  epsilon: 1.0    steps: 360  evaluation reward: 380.7
Training network. lr: 0.000155. clip: 0.061910
Iteration 12442: Policy loss: 0.002000. Value loss: 0.173940. Entropy: 1.039887.
Iteration 12443: Policy loss: -0.006346. Value loss: 0.107983. Entropy: 1.026926.
Iteration 12444: Policy loss: -0.009813. Value loss: 0.075832. Entropy: 1.026773.
episode: 4165   score: 335.0  epsilon: 1.0    steps: 744  evaluation reward: 381.95
episode: 4166   score: 210.0  epsilon: 1.0    steps: 984  evaluation reward: 379.8
Training network. lr: 0.000155. clip: 0.061910
Iteration 12445: Policy loss: 0.001513. Value loss: 0.210767. Entropy: 1.111156.
Iteration 12446: Policy loss: -0.002249. Value loss: 0.089299. Entropy: 1.111333.
I

Iteration 12503: Policy loss: -0.005002. Value loss: 0.060791. Entropy: 1.110827.
Iteration 12504: Policy loss: -0.008821. Value loss: 0.046258. Entropy: 1.110225.
Training network. lr: 0.000154. clip: 0.061606
Iteration 12505: Policy loss: 0.004332. Value loss: 0.186961. Entropy: 1.107237.
Iteration 12506: Policy loss: -0.004999. Value loss: 0.074423. Entropy: 1.110160.
Iteration 12507: Policy loss: -0.008323. Value loss: 0.048927. Entropy: 1.104640.
Training network. lr: 0.000154. clip: 0.061606
Iteration 12508: Policy loss: 0.000824. Value loss: 0.372455. Entropy: 1.148499.
Iteration 12509: Policy loss: -0.004961. Value loss: 0.247358. Entropy: 1.156190.
Iteration 12510: Policy loss: -0.011477. Value loss: 0.191889. Entropy: 1.151513.
episode: 4188   score: 620.0  epsilon: 1.0    steps: 104  evaluation reward: 372.3
episode: 4189   score: 495.0  epsilon: 1.0    steps: 344  evaluation reward: 373.6
episode: 4190   score: 315.0  epsilon: 1.0    steps: 416  evaluation reward: 375.2
epi

episode: 4211   score: 420.0  epsilon: 1.0    steps: 864  evaluation reward: 381.6
Training network. lr: 0.000154. clip: 0.061449
Iteration 12568: Policy loss: 0.003136. Value loss: 0.096570. Entropy: 1.061351.
Iteration 12569: Policy loss: -0.005964. Value loss: 0.054929. Entropy: 1.064335.
Iteration 12570: Policy loss: -0.011354. Value loss: 0.046780. Entropy: 1.067081.
episode: 4212   score: 565.0  epsilon: 1.0    steps: 352  evaluation reward: 384.65
Training network. lr: 0.000154. clip: 0.061449
Iteration 12571: Policy loss: 0.001143. Value loss: 0.114659. Entropy: 1.059924.
Iteration 12572: Policy loss: -0.009185. Value loss: 0.058050. Entropy: 1.059010.
Iteration 12573: Policy loss: -0.013757. Value loss: 0.042621. Entropy: 1.060779.
Training network. lr: 0.000154. clip: 0.061449
Iteration 12574: Policy loss: 0.003825. Value loss: 0.150277. Entropy: 1.142386.
Iteration 12575: Policy loss: -0.003436. Value loss: 0.070844. Entropy: 1.138566.
Iteration 12576: Policy loss: -0.008599

episode: 4233   score: 670.0  epsilon: 1.0    steps: 456  evaluation reward: 400.8
Training network. lr: 0.000153. clip: 0.061302
Iteration 12634: Policy loss: 0.002034. Value loss: 0.237793. Entropy: 1.090737.
Iteration 12635: Policy loss: -0.003932. Value loss: 0.113625. Entropy: 1.078821.
Iteration 12636: Policy loss: -0.005986. Value loss: 0.079797. Entropy: 1.087604.
Training network. lr: 0.000153. clip: 0.061302
Iteration 12637: Policy loss: 0.001668. Value loss: 0.130638. Entropy: 1.104015.
Iteration 12638: Policy loss: -0.004909. Value loss: 0.069414. Entropy: 1.107891.
Iteration 12639: Policy loss: -0.010960. Value loss: 0.050217. Entropy: 1.115191.
Training network. lr: 0.000153. clip: 0.061302
Iteration 12640: Policy loss: 0.002526. Value loss: 0.078069. Entropy: 1.137288.
Iteration 12641: Policy loss: -0.006581. Value loss: 0.033943. Entropy: 1.132285.
Iteration 12642: Policy loss: -0.012443. Value loss: 0.024465. Entropy: 1.131468.
episode: 4234   score: 575.0  epsilon: 1.

episode: 4254   score: 420.0  epsilon: 1.0    steps: 808  evaluation reward: 413.95
Training network. lr: 0.000153. clip: 0.061145
Iteration 12700: Policy loss: 0.003611. Value loss: 0.163820. Entropy: 1.023947.
Iteration 12701: Policy loss: -0.004327. Value loss: 0.083786. Entropy: 1.027374.
Iteration 12702: Policy loss: -0.009785. Value loss: 0.065181. Entropy: 1.028815.
Training network. lr: 0.000152. clip: 0.060989
Iteration 12703: Policy loss: 0.002107. Value loss: 0.204057. Entropy: 1.072063.
Iteration 12704: Policy loss: -0.005716. Value loss: 0.130126. Entropy: 1.076389.
Iteration 12705: Policy loss: -0.011136. Value loss: 0.096440. Entropy: 1.072749.
episode: 4255   score: 395.0  epsilon: 1.0    steps: 504  evaluation reward: 415.05
Training network. lr: 0.000152. clip: 0.060989
Iteration 12706: Policy loss: 0.001190. Value loss: 0.150710. Entropy: 1.110006.
Iteration 12707: Policy loss: -0.004597. Value loss: 0.072202. Entropy: 1.109899.
Iteration 12708: Policy loss: -0.01108

episode: 4276   score: 450.0  epsilon: 1.0    steps: 592  evaluation reward: 424.8
Training network. lr: 0.000152. clip: 0.060841
Iteration 12766: Policy loss: 0.003805. Value loss: 0.130282. Entropy: 0.987881.
Iteration 12767: Policy loss: 0.000580. Value loss: 0.075148. Entropy: 0.963674.
Iteration 12768: Policy loss: -0.006287. Value loss: 0.059139. Entropy: 0.970752.
episode: 4277   score: 465.0  epsilon: 1.0    steps: 104  evaluation reward: 424.05
Training network. lr: 0.000152. clip: 0.060841
Iteration 12769: Policy loss: 0.004521. Value loss: 0.225616. Entropy: 1.015029.
Iteration 12770: Policy loss: -0.003803. Value loss: 0.111529. Entropy: 1.016118.
Iteration 12771: Policy loss: -0.010246. Value loss: 0.074432. Entropy: 1.012615.
Training network. lr: 0.000152. clip: 0.060841
Iteration 12772: Policy loss: 0.003515. Value loss: 0.104246. Entropy: 1.126518.
Iteration 12773: Policy loss: -0.005562. Value loss: 0.057984. Entropy: 1.115499.
Iteration 12774: Policy loss: -0.009570.

episode: 4298   score: 285.0  epsilon: 1.0    steps: 160  evaluation reward: 420.8
Training network. lr: 0.000152. clip: 0.060685
Iteration 12832: Policy loss: 0.003206. Value loss: 0.115600. Entropy: 1.048607.
Iteration 12833: Policy loss: -0.005126. Value loss: 0.064537. Entropy: 1.038484.
Iteration 12834: Policy loss: -0.009278. Value loss: 0.047684. Entropy: 1.042697.
Training network. lr: 0.000152. clip: 0.060685
Iteration 12835: Policy loss: 0.005223. Value loss: 0.237075. Entropy: 1.117078.
Iteration 12836: Policy loss: -0.003571. Value loss: 0.124459. Entropy: 1.121463.
Iteration 12837: Policy loss: -0.010018. Value loss: 0.099456. Entropy: 1.119829.
episode: 4299   score: 270.0  epsilon: 1.0    steps: 304  evaluation reward: 419.7
Training network. lr: 0.000152. clip: 0.060685
Iteration 12838: Policy loss: 0.001718. Value loss: 0.132764. Entropy: 1.094491.
Iteration 12839: Policy loss: -0.004145. Value loss: 0.052254. Entropy: 1.094869.
Iteration 12840: Policy loss: -0.009045.

Iteration 12899: Policy loss: -0.006668. Value loss: 0.066606. Entropy: 1.191348.
Iteration 12900: Policy loss: -0.008497. Value loss: 0.050069. Entropy: 1.189003.
episode: 4318   score: 345.0  epsilon: 1.0    steps: 168  evaluation reward: 432.5
Training network. lr: 0.000151. clip: 0.060380
Iteration 12901: Policy loss: 0.005361. Value loss: 0.154914. Entropy: 1.023069.
Iteration 12902: Policy loss: -0.002360. Value loss: 0.071919. Entropy: 1.033656.
Iteration 12903: Policy loss: -0.005578. Value loss: 0.045832. Entropy: 1.033732.
episode: 4319   score: 210.0  epsilon: 1.0    steps: 288  evaluation reward: 431.75
Training network. lr: 0.000151. clip: 0.060380
Iteration 12904: Policy loss: 0.007244. Value loss: 0.246445. Entropy: 1.116087.
Iteration 12905: Policy loss: 0.000050. Value loss: 0.081412. Entropy: 1.111333.
Iteration 12906: Policy loss: -0.005399. Value loss: 0.054123. Entropy: 1.120136.
Training network. lr: 0.000151. clip: 0.060380
Iteration 12907: Policy loss: 0.004117.

Training network. lr: 0.000151. clip: 0.060224
Iteration 12964: Policy loss: 0.001945. Value loss: 0.123783. Entropy: 1.013700.
Iteration 12965: Policy loss: -0.007560. Value loss: 0.066866. Entropy: 1.020050.
Iteration 12966: Policy loss: -0.009500. Value loss: 0.051391. Entropy: 1.017596.
episode: 4341   score: 335.0  epsilon: 1.0    steps: 744  evaluation reward: 428.35
episode: 4342   score: 320.0  epsilon: 1.0    steps: 1008  evaluation reward: 428.95
Training network. lr: 0.000151. clip: 0.060224
Iteration 12967: Policy loss: 0.007141. Value loss: 0.144423. Entropy: 1.054813.
Iteration 12968: Policy loss: -0.004809. Value loss: 0.066675. Entropy: 1.046316.
Iteration 12969: Policy loss: -0.009020. Value loss: 0.053194. Entropy: 1.047322.
Training network. lr: 0.000151. clip: 0.060224
Iteration 12970: Policy loss: 0.005236. Value loss: 0.249288. Entropy: 1.039500.
Iteration 12971: Policy loss: 0.001818. Value loss: 0.115929. Entropy: 1.041262.
Iteration 12972: Policy loss: -0.00452

episode: 4362   score: 320.0  epsilon: 1.0    steps: 456  evaluation reward: 408.8
Training network. lr: 0.000150. clip: 0.060067
Iteration 13030: Policy loss: 0.003112. Value loss: 0.138765. Entropy: 1.115692.
Iteration 13031: Policy loss: -0.008024. Value loss: 0.080853. Entropy: 1.113965.
Iteration 13032: Policy loss: -0.014197. Value loss: 0.058992. Entropy: 1.120584.
episode: 4363   score: 665.0  epsilon: 1.0    steps: 24  evaluation reward: 411.5
episode: 4364   score: 225.0  epsilon: 1.0    steps: 784  evaluation reward: 408.6
Training network. lr: 0.000150. clip: 0.060067
Iteration 13033: Policy loss: 0.001147. Value loss: 0.151806. Entropy: 1.096203.
Iteration 13034: Policy loss: -0.008303. Value loss: 0.074750. Entropy: 1.100631.
Iteration 13035: Policy loss: -0.013036. Value loss: 0.057046. Entropy: 1.105136.
episode: 4365   score: 315.0  epsilon: 1.0    steps: 408  evaluation reward: 408.75
Training network. lr: 0.000150. clip: 0.060067
Iteration 13036: Policy loss: -0.0000

Iteration 13092: Policy loss: -0.009374. Value loss: 0.047492. Entropy: 1.007849.
episode: 4388   score: 315.0  epsilon: 1.0    steps: 152  evaluation reward: 399.8
episode: 4389   score: 240.0  epsilon: 1.0    steps: 248  evaluation reward: 396.6
Training network. lr: 0.000150. clip: 0.059920
Iteration 13093: Policy loss: 0.003370. Value loss: 0.164321. Entropy: 0.935784.
Iteration 13094: Policy loss: -0.001957. Value loss: 0.059858. Entropy: 0.932201.
Iteration 13095: Policy loss: -0.003334. Value loss: 0.041719. Entropy: 0.936507.
Training network. lr: 0.000150. clip: 0.059920
Iteration 13096: Policy loss: 0.000125. Value loss: 0.124655. Entropy: 1.146447.
Iteration 13097: Policy loss: -0.006067. Value loss: 0.068018. Entropy: 1.152997.
Iteration 13098: Policy loss: -0.008139. Value loss: 0.049302. Entropy: 1.156654.
episode: 4390   score: 955.0  epsilon: 1.0    steps: 344  evaluation reward: 403.05
Training network. lr: 0.000150. clip: 0.059920
Iteration 13099: Policy loss: 0.00214

Iteration 13157: Policy loss: -0.004170. Value loss: 0.118408. Entropy: 0.943115.
Iteration 13158: Policy loss: -0.008766. Value loss: 0.085793. Entropy: 0.959359.
Training network. lr: 0.000149. clip: 0.059606
Iteration 13159: Policy loss: 0.003523. Value loss: 0.129341. Entropy: 1.092584.
Iteration 13160: Policy loss: -0.005073. Value loss: 0.054681. Entropy: 1.094601.
Iteration 13161: Policy loss: -0.009286. Value loss: 0.033630. Entropy: 1.090642.
Training network. lr: 0.000149. clip: 0.059606
Iteration 13162: Policy loss: 0.003333. Value loss: 0.170036. Entropy: 1.135306.
Iteration 13163: Policy loss: -0.005169. Value loss: 0.080255. Entropy: 1.133067.
Iteration 13164: Policy loss: -0.011264. Value loss: 0.053471. Entropy: 1.138269.
Training network. lr: 0.000149. clip: 0.059606
Iteration 13165: Policy loss: 0.001957. Value loss: 0.171973. Entropy: 1.188160.
Iteration 13166: Policy loss: -0.002842. Value loss: 0.081030. Entropy: 1.195508.
Iteration 13167: Policy loss: -0.010847. V

Iteration 13223: Policy loss: -0.004476. Value loss: 0.064097. Entropy: 0.972137.
Iteration 13224: Policy loss: -0.012145. Value loss: 0.046919. Entropy: 0.970218.
episode: 4432   score: 380.0  epsilon: 1.0    steps: 208  evaluation reward: 377.45
Training network. lr: 0.000149. clip: 0.059459
Iteration 13225: Policy loss: 0.007031. Value loss: 0.140821. Entropy: 1.048039.
Iteration 13226: Policy loss: -0.003206. Value loss: 0.075562. Entropy: 1.046979.
Iteration 13227: Policy loss: -0.009801. Value loss: 0.063339. Entropy: 1.051844.
Training network. lr: 0.000149. clip: 0.059459
Iteration 13228: Policy loss: 0.004039. Value loss: 0.103214. Entropy: 1.188258.
Iteration 13229: Policy loss: -0.009603. Value loss: 0.050333. Entropy: 1.187032.
Iteration 13230: Policy loss: -0.013995. Value loss: 0.034799. Entropy: 1.191348.
episode: 4433   score: 330.0  epsilon: 1.0    steps: 808  evaluation reward: 377.6
episode: 4434   score: 600.0  epsilon: 1.0    steps: 984  evaluation reward: 378.2
Tr

Iteration 13288: Policy loss: 0.003028. Value loss: 0.142237. Entropy: 1.047474.
Iteration 13289: Policy loss: -0.005186. Value loss: 0.065122. Entropy: 1.051513.
Iteration 13290: Policy loss: -0.010730. Value loss: 0.049303. Entropy: 1.044549.
episode: 4454   score: 215.0  epsilon: 1.0    steps: 24  evaluation reward: 381.65
episode: 4455   score: 350.0  epsilon: 1.0    steps: 304  evaluation reward: 382.65
Training network. lr: 0.000148. clip: 0.059302
Iteration 13291: Policy loss: 0.002982. Value loss: 0.107136. Entropy: 0.927438.
Iteration 13292: Policy loss: -0.006941. Value loss: 0.068859. Entropy: 0.929334.
Iteration 13293: Policy loss: -0.011599. Value loss: 0.054787. Entropy: 0.924022.
episode: 4456   score: 485.0  epsilon: 1.0    steps: 992  evaluation reward: 383.1
Training network. lr: 0.000148. clip: 0.059302
Iteration 13294: Policy loss: 0.000531. Value loss: 0.132929. Entropy: 1.094301.
Iteration 13295: Policy loss: -0.005541. Value loss: 0.064381. Entropy: 1.096290.
Ite

episode: 4476   score: 490.0  epsilon: 1.0    steps: 688  evaluation reward: 391.05
Training network. lr: 0.000147. clip: 0.058998
Iteration 13354: Policy loss: 0.002131. Value loss: 0.126634. Entropy: 0.853137.
Iteration 13355: Policy loss: -0.004560. Value loss: 0.069755. Entropy: 0.849453.
Iteration 13356: Policy loss: -0.011066. Value loss: 0.056370. Entropy: 0.845558.
episode: 4477   score: 655.0  epsilon: 1.0    steps: 24  evaluation reward: 395.45
Training network. lr: 0.000147. clip: 0.058998
Iteration 13357: Policy loss: 0.005164. Value loss: 0.347644. Entropy: 0.995323.
Iteration 13358: Policy loss: -0.000639. Value loss: 0.156675. Entropy: 0.996487.
Iteration 13359: Policy loss: -0.005298. Value loss: 0.088739. Entropy: 0.995526.
episode: 4478   score: 550.0  epsilon: 1.0    steps: 552  evaluation reward: 397.75
Training network. lr: 0.000147. clip: 0.058998
Iteration 13360: Policy loss: 0.000127. Value loss: 0.469511. Entropy: 1.032081.
Iteration 13361: Policy loss: 0.00342

Training network. lr: 0.000147. clip: 0.058841
Iteration 13420: Policy loss: 0.002851. Value loss: 0.185808. Entropy: 1.116569.
Iteration 13421: Policy loss: -0.005050. Value loss: 0.089745. Entropy: 1.123142.
Iteration 13422: Policy loss: -0.010651. Value loss: 0.063223. Entropy: 1.118210.
episode: 4498   score: 425.0  epsilon: 1.0    steps: 928  evaluation reward: 397.85
Training network. lr: 0.000147. clip: 0.058841
Iteration 13423: Policy loss: 0.003015. Value loss: 0.161380. Entropy: 1.155103.
Iteration 13424: Policy loss: -0.004278. Value loss: 0.074501. Entropy: 1.159073.
Iteration 13425: Policy loss: -0.012556. Value loss: 0.053653. Entropy: 1.161784.
Training network. lr: 0.000147. clip: 0.058841
Iteration 13426: Policy loss: 0.003506. Value loss: 0.133262. Entropy: 1.123682.
Iteration 13427: Policy loss: -0.006495. Value loss: 0.058839. Entropy: 1.130228.
Iteration 13428: Policy loss: -0.011212. Value loss: 0.039299. Entropy: 1.128765.
episode: 4499   score: 255.0  epsilon: 1

Training network. lr: 0.000147. clip: 0.058685
Iteration 13486: Policy loss: 0.003103. Value loss: 0.156544. Entropy: 1.061008.
Iteration 13487: Policy loss: -0.002305. Value loss: 0.097176. Entropy: 1.059109.
Iteration 13488: Policy loss: -0.009003. Value loss: 0.077902. Entropy: 1.054682.
episode: 4519   score: 265.0  epsilon: 1.0    steps: 376  evaluation reward: 405.2
episode: 4520   score: 740.0  epsilon: 1.0    steps: 672  evaluation reward: 409.1
Training network. lr: 0.000147. clip: 0.058685
Iteration 13489: Policy loss: 0.006015. Value loss: 0.184928. Entropy: 1.037118.
Iteration 13490: Policy loss: -0.001151. Value loss: 0.094370. Entropy: 1.037254.
Iteration 13491: Policy loss: -0.004436. Value loss: 0.064243. Entropy: 1.028353.
episode: 4521   score: 125.0  epsilon: 1.0    steps: 96  evaluation reward: 407.5
episode: 4522   score: 765.0  epsilon: 1.0    steps: 440  evaluation reward: 411.55
Training network. lr: 0.000147. clip: 0.058685
Iteration 13492: Policy loss: 0.00419

Training network. lr: 0.000146. clip: 0.058381
Iteration 13552: Policy loss: 0.005280. Value loss: 0.114217. Entropy: 0.964001.
Iteration 13553: Policy loss: -0.003719. Value loss: 0.063395. Entropy: 0.958409.
Iteration 13554: Policy loss: -0.007371. Value loss: 0.048787. Entropy: 0.953495.
episode: 4541   score: 290.0  epsilon: 1.0    steps: 184  evaluation reward: 426.0
episode: 4542   score: 505.0  epsilon: 1.0    steps: 208  evaluation reward: 424.35
Training network. lr: 0.000146. clip: 0.058381
Iteration 13555: Policy loss: 0.003059. Value loss: 0.107763. Entropy: 0.917657.
Iteration 13556: Policy loss: -0.005872. Value loss: 0.063034. Entropy: 0.926974.
Iteration 13557: Policy loss: -0.012085. Value loss: 0.048672. Entropy: 0.911833.
episode: 4543   score: 345.0  epsilon: 1.0    steps: 272  evaluation reward: 424.95
Training network. lr: 0.000146. clip: 0.058381
Iteration 13558: Policy loss: 0.002362. Value loss: 0.150136. Entropy: 1.010389.
Iteration 13559: Policy loss: -0.0054

episode: 4562   score: 755.0  epsilon: 1.0    steps: 216  evaluation reward: 434.8
episode: 4563   score: 210.0  epsilon: 1.0    steps: 752  evaluation reward: 432.4
Training network. lr: 0.000146. clip: 0.058224
Iteration 13618: Policy loss: 0.004408. Value loss: 0.154850. Entropy: 1.015875.
Iteration 13619: Policy loss: -0.003894. Value loss: 0.077661. Entropy: 1.012252.
Iteration 13620: Policy loss: -0.006837. Value loss: 0.062431. Entropy: 1.007104.
episode: 4564   score: 275.0  epsilon: 1.0    steps: 272  evaluation reward: 431.8
Training network. lr: 0.000146. clip: 0.058224
Iteration 13621: Policy loss: 0.004954. Value loss: 0.522676. Entropy: 1.022584.
Iteration 13622: Policy loss: 0.001634. Value loss: 0.297491. Entropy: 1.014539.
Iteration 13623: Policy loss: -0.004940. Value loss: 0.245973. Entropy: 1.009784.
episode: 4565   score: 225.0  epsilon: 1.0    steps: 296  evaluation reward: 427.35
Training network. lr: 0.000146. clip: 0.058224
Iteration 13624: Policy loss: 0.00327

Iteration 13682: Policy loss: -0.004355. Value loss: 0.069944. Entropy: 1.036295.
Iteration 13683: Policy loss: -0.007600. Value loss: 0.047552. Entropy: 1.029253.
episode: 4586   score: 320.0  epsilon: 1.0    steps: 424  evaluation reward: 429.5
episode: 4587   score: 260.0  epsilon: 1.0    steps: 432  evaluation reward: 428.95
Training network. lr: 0.000145. clip: 0.058076
Iteration 13684: Policy loss: 0.004763. Value loss: 0.110380. Entropy: 1.053254.
Iteration 13685: Policy loss: -0.004296. Value loss: 0.056603. Entropy: 1.052237.
Iteration 13686: Policy loss: -0.008807. Value loss: 0.045541. Entropy: 1.047392.
Training network. lr: 0.000145. clip: 0.058076
Iteration 13687: Policy loss: 0.002605. Value loss: 0.096639. Entropy: 1.137036.
Iteration 13688: Policy loss: -0.004077. Value loss: 0.046421. Entropy: 1.133208.
Iteration 13689: Policy loss: -0.010762. Value loss: 0.033316. Entropy: 1.133089.
Training network. lr: 0.000145. clip: 0.058076
Iteration 13690: Policy loss: 0.002755

Iteration 13747: Policy loss: 0.003470. Value loss: 0.064796. Entropy: 1.122340.
Iteration 13748: Policy loss: -0.007516. Value loss: 0.030476. Entropy: 1.113990.
Iteration 13749: Policy loss: -0.011145. Value loss: 0.022987. Entropy: 1.118867.
episode: 4608   score: 290.0  epsilon: 1.0    steps: 848  evaluation reward: 429.65
episode: 4609   score: 470.0  epsilon: 1.0    steps: 992  evaluation reward: 429.65
Training network. lr: 0.000145. clip: 0.057920
Iteration 13750: Policy loss: 0.003277. Value loss: 0.147034. Entropy: 1.158981.
Iteration 13751: Policy loss: -0.007571. Value loss: 0.085088. Entropy: 1.154201.
Iteration 13752: Policy loss: -0.009080. Value loss: 0.065481. Entropy: 1.146576.
Training network. lr: 0.000144. clip: 0.057763
Iteration 13753: Policy loss: 0.002040. Value loss: 0.143265. Entropy: 1.061286.
Iteration 13754: Policy loss: -0.004210. Value loss: 0.055698. Entropy: 1.059610.
Iteration 13755: Policy loss: -0.009076. Value loss: 0.038957. Entropy: 1.045948.
Tra

episode: 4631   score: 250.0  epsilon: 1.0    steps: 1000  evaluation reward: 403.15
Training network. lr: 0.000144. clip: 0.057616
Iteration 13813: Policy loss: 0.002207. Value loss: 0.173043. Entropy: 1.027484.
Iteration 13814: Policy loss: -0.004076. Value loss: 0.095963. Entropy: 1.018679.
Iteration 13815: Policy loss: -0.008034. Value loss: 0.072960. Entropy: 1.027858.
Training network. lr: 0.000144. clip: 0.057616
Iteration 13816: Policy loss: 0.003189. Value loss: 0.228747. Entropy: 1.105704.
Iteration 13817: Policy loss: -0.001414. Value loss: 0.074895. Entropy: 1.099667.
Iteration 13818: Policy loss: -0.003390. Value loss: 0.053144. Entropy: 1.099148.
episode: 4632   score: 270.0  epsilon: 1.0    steps: 592  evaluation reward: 403.4
episode: 4633   score: 405.0  epsilon: 1.0    steps: 688  evaluation reward: 401.25
Training network. lr: 0.000144. clip: 0.057616
Iteration 13819: Policy loss: 0.002280. Value loss: 0.117201. Entropy: 1.032214.
Iteration 13820: Policy loss: -0.003

episode: 4652   score: 635.0  epsilon: 1.0    steps: 416  evaluation reward: 410.55
Training network. lr: 0.000144. clip: 0.057459
Iteration 13879: Policy loss: 0.005356. Value loss: 0.148961. Entropy: 0.865303.
Iteration 13880: Policy loss: -0.001961. Value loss: 0.076315. Entropy: 0.845082.
Iteration 13881: Policy loss: -0.007160. Value loss: 0.061422. Entropy: 0.857366.
episode: 4653   score: 315.0  epsilon: 1.0    steps: 784  evaluation reward: 411.6
Training network. lr: 0.000144. clip: 0.057459
Iteration 13882: Policy loss: 0.000680. Value loss: 0.172741. Entropy: 1.107002.
Iteration 13883: Policy loss: -0.005672. Value loss: 0.088660. Entropy: 1.103585.
Iteration 13884: Policy loss: -0.011105. Value loss: 0.066989. Entropy: 1.103725.
Training network. lr: 0.000144. clip: 0.057459
Iteration 13885: Policy loss: 0.002906. Value loss: 0.251758. Entropy: 1.080129.
Iteration 13886: Policy loss: -0.001061. Value loss: 0.137775. Entropy: 1.075721.
Iteration 13887: Policy loss: -0.005747

Iteration 13945: Policy loss: 0.000296. Value loss: 0.441910. Entropy: 1.164991.
Iteration 13946: Policy loss: -0.001213. Value loss: 0.306645. Entropy: 1.158505.
Iteration 13947: Policy loss: -0.002196. Value loss: 0.205317. Entropy: 1.154420.
Training network. lr: 0.000143. clip: 0.057302
Iteration 13948: Policy loss: 0.001446. Value loss: 0.160092. Entropy: 1.157261.
Iteration 13949: Policy loss: -0.006657. Value loss: 0.076497. Entropy: 1.152026.
Iteration 13950: Policy loss: -0.008369. Value loss: 0.057740. Entropy: 1.150439.
episode: 4673   score: 460.0  epsilon: 1.0    steps: 384  evaluation reward: 422.5
Training network. lr: 0.000143. clip: 0.057155
Iteration 13951: Policy loss: 0.000194. Value loss: 0.376678. Entropy: 1.095821.
Iteration 13952: Policy loss: -0.005464. Value loss: 0.245839. Entropy: 1.087544.
Iteration 13953: Policy loss: -0.005526. Value loss: 0.204951. Entropy: 1.080364.
episode: 4674   score: 440.0  epsilon: 1.0    steps: 456  evaluation reward: 423.2
Train

Iteration 14010: Policy loss: -0.007523. Value loss: 0.086728. Entropy: 0.890472.
Training network. lr: 0.000142. clip: 0.056998
Iteration 14011: Policy loss: 0.002124. Value loss: 0.194224. Entropy: 1.030013.
Iteration 14012: Policy loss: -0.002093. Value loss: 0.101642. Entropy: 1.028653.
Iteration 14013: Policy loss: -0.008623. Value loss: 0.065147. Entropy: 1.030690.
episode: 4696   score: 540.0  epsilon: 1.0    steps: 696  evaluation reward: 424.85
Training network. lr: 0.000142. clip: 0.056998
Iteration 14014: Policy loss: 0.001822. Value loss: 0.143551. Entropy: 1.085443.
Iteration 14015: Policy loss: -0.007696. Value loss: 0.080618. Entropy: 1.086651.
Iteration 14016: Policy loss: -0.012603. Value loss: 0.058498. Entropy: 1.083187.
Training network. lr: 0.000142. clip: 0.056998
Iteration 14017: Policy loss: 0.003074. Value loss: 0.176601. Entropy: 1.163024.
Iteration 14018: Policy loss: -0.003518. Value loss: 0.095302. Entropy: 1.163378.
Iteration 14019: Policy loss: -0.007755.

Iteration 14076: Policy loss: -0.010282. Value loss: 0.051479. Entropy: 1.134164.
episode: 4717   score: 505.0  epsilon: 1.0    steps: 368  evaluation reward: 422.9
Training network. lr: 0.000142. clip: 0.056841
Iteration 14077: Policy loss: 0.001097. Value loss: 0.164536. Entropy: 1.092996.
Iteration 14078: Policy loss: -0.006024. Value loss: 0.074008. Entropy: 1.098381.
Iteration 14079: Policy loss: -0.009063. Value loss: 0.053532. Entropy: 1.095231.
episode: 4718   score: 470.0  epsilon: 1.0    steps: 560  evaluation reward: 425.5
Training network. lr: 0.000142. clip: 0.056841
Iteration 14080: Policy loss: 0.000683. Value loss: 0.109516. Entropy: 1.127121.
Iteration 14081: Policy loss: -0.007651. Value loss: 0.052842. Entropy: 1.124805.
Iteration 14082: Policy loss: -0.012326. Value loss: 0.042903. Entropy: 1.120906.
episode: 4719   score: 225.0  epsilon: 1.0    steps: 376  evaluation reward: 424.25
episode: 4720   score: 275.0  epsilon: 1.0    steps: 992  evaluation reward: 422.35


Training network. lr: 0.000142. clip: 0.056694
Iteration 14143: Policy loss: 0.003848. Value loss: 0.119091. Entropy: 1.136702.
Iteration 14144: Policy loss: -0.003959. Value loss: 0.066718. Entropy: 1.133702.
Iteration 14145: Policy loss: -0.009340. Value loss: 0.050286. Entropy: 1.135120.
episode: 4738   score: 530.0  epsilon: 1.0    steps: 712  evaluation reward: 432.95
episode: 4739   score: 640.0  epsilon: 1.0    steps: 768  evaluation reward: 436.95
Training network. lr: 0.000142. clip: 0.056694
Iteration 14146: Policy loss: 0.001769. Value loss: 0.172763. Entropy: 1.132509.
Iteration 14147: Policy loss: -0.006649. Value loss: 0.088089. Entropy: 1.133089.
Iteration 14148: Policy loss: -0.011679. Value loss: 0.066859. Entropy: 1.132702.
episode: 4740   score: 360.0  epsilon: 1.0    steps: 64  evaluation reward: 431.8
episode: 4741   score: 450.0  epsilon: 1.0    steps: 840  evaluation reward: 428.5
Training network. lr: 0.000142. clip: 0.056694
Iteration 14149: Policy loss: 0.0023

Iteration 14207: Policy loss: -0.011367. Value loss: 0.070399. Entropy: 0.992763.
Iteration 14208: Policy loss: -0.015806. Value loss: 0.053216. Entropy: 0.997858.
episode: 4761   score: 300.0  epsilon: 1.0    steps: 736  evaluation reward: 404.4
Training network. lr: 0.000141. clip: 0.056381
Iteration 14209: Policy loss: 0.003376. Value loss: 0.347771. Entropy: 1.070549.
Iteration 14210: Policy loss: 0.002640. Value loss: 0.216621. Entropy: 1.066614.
Iteration 14211: Policy loss: -0.001256. Value loss: 0.152118. Entropy: 1.074742.
episode: 4762   score: 680.0  epsilon: 1.0    steps: 24  evaluation reward: 409.3
episode: 4763   score: 245.0  epsilon: 1.0    steps: 256  evaluation reward: 409.25
episode: 4764   score: 630.0  epsilon: 1.0    steps: 272  evaluation reward: 411.9
episode: 4765   score: 420.0  epsilon: 1.0    steps: 824  evaluation reward: 408.75
Training network. lr: 0.000141. clip: 0.056381
Iteration 14212: Policy loss: 0.005400. Value loss: 0.281117. Entropy: 0.810439.
I

Iteration 14274: Policy loss: -0.010162. Value loss: 0.057827. Entropy: 0.952298.
episode: 4782   score: 355.0  epsilon: 1.0    steps: 736  evaluation reward: 414.8
Training network. lr: 0.000141. clip: 0.056233
Iteration 14275: Policy loss: 0.002535. Value loss: 0.293281. Entropy: 1.024634.
Iteration 14276: Policy loss: -0.000330. Value loss: 0.146607. Entropy: 1.023526.
Iteration 14277: Policy loss: -0.005412. Value loss: 0.099500. Entropy: 1.020572.
episode: 4783   score: 305.0  epsilon: 1.0    steps: 904  evaluation reward: 415.6
Training network. lr: 0.000141. clip: 0.056233
Iteration 14278: Policy loss: 0.002661. Value loss: 0.174508. Entropy: 1.053350.
Iteration 14279: Policy loss: -0.007971. Value loss: 0.087276. Entropy: 1.043627.
Iteration 14280: Policy loss: -0.010100. Value loss: 0.061488. Entropy: 1.052436.
Training network. lr: 0.000141. clip: 0.056233
Iteration 14281: Policy loss: 0.002436. Value loss: 0.133241. Entropy: 1.053949.
Iteration 14282: Policy loss: -0.006661.

Iteration 14339: Policy loss: 0.005658. Value loss: 0.194271. Entropy: 1.102713.
Iteration 14340: Policy loss: 0.001486. Value loss: 0.118564. Entropy: 1.105041.
Training network. lr: 0.000140. clip: 0.056077
Iteration 14341: Policy loss: 0.003697. Value loss: 0.262256. Entropy: 1.134297.
Iteration 14342: Policy loss: -0.001016. Value loss: 0.120067. Entropy: 1.131969.
Iteration 14343: Policy loss: -0.006233. Value loss: 0.087840. Entropy: 1.137930.
episode: 4804   score: 315.0  epsilon: 1.0    steps: 16  evaluation reward: 424.45
episode: 4805   score: 630.0  epsilon: 1.0    steps: 568  evaluation reward: 425.75
Training network. lr: 0.000140. clip: 0.056077
Iteration 14344: Policy loss: 0.001698. Value loss: 0.092355. Entropy: 1.043623.
Iteration 14345: Policy loss: -0.007814. Value loss: 0.051107. Entropy: 1.037253.
Iteration 14346: Policy loss: -0.011103. Value loss: 0.039078. Entropy: 1.049089.
Training network. lr: 0.000140. clip: 0.056077
Iteration 14347: Policy loss: 0.000664. 

Iteration 14405: Policy loss: -0.003350. Value loss: 0.321889. Entropy: 1.020110.
Iteration 14406: Policy loss: -0.006777. Value loss: 0.252842. Entropy: 1.035613.
episode: 4826   score: 280.0  epsilon: 1.0    steps: 520  evaluation reward: 445.6
Training network. lr: 0.000139. clip: 0.055772
Iteration 14407: Policy loss: 0.006011. Value loss: 0.135206. Entropy: 1.061416.
Iteration 14408: Policy loss: -0.001007. Value loss: 0.071679. Entropy: 1.071742.
Iteration 14409: Policy loss: -0.001890. Value loss: 0.047239. Entropy: 1.073011.
episode: 4827   score: 615.0  epsilon: 1.0    steps: 736  evaluation reward: 448.4
Training network. lr: 0.000139. clip: 0.055772
Iteration 14410: Policy loss: 0.002471. Value loss: 0.170461. Entropy: 1.042418.
Iteration 14411: Policy loss: -0.004466. Value loss: 0.082340. Entropy: 1.046181.
Iteration 14412: Policy loss: -0.006557. Value loss: 0.060978. Entropy: 1.045886.
episode: 4828   score: 215.0  epsilon: 1.0    steps: 528  evaluation reward: 445.35
Tr

Iteration 14471: Policy loss: -0.004127. Value loss: 0.060702. Entropy: 0.964191.
Iteration 14472: Policy loss: -0.012263. Value loss: 0.040510. Entropy: 0.969388.
Training network. lr: 0.000139. clip: 0.055616
Iteration 14473: Policy loss: 0.002689. Value loss: 0.114199. Entropy: 0.964335.
Iteration 14474: Policy loss: -0.005375. Value loss: 0.057121. Entropy: 0.949929.
Iteration 14475: Policy loss: -0.011210. Value loss: 0.038734. Entropy: 0.957541.
Training network. lr: 0.000139. clip: 0.055616
Iteration 14476: Policy loss: 0.003078. Value loss: 0.164787. Entropy: 1.065375.
Iteration 14477: Policy loss: -0.004481. Value loss: 0.075970. Entropy: 1.067711.
Iteration 14478: Policy loss: -0.008841. Value loss: 0.053377. Entropy: 1.068356.
Training network. lr: 0.000139. clip: 0.055616
Iteration 14479: Policy loss: 0.003848. Value loss: 0.204917. Entropy: 1.150706.
Iteration 14480: Policy loss: -0.002750. Value loss: 0.092927. Entropy: 1.155752.
Iteration 14481: Policy loss: -0.009287. V

Iteration 14536: Policy loss: 0.003381. Value loss: 0.148298. Entropy: 0.928287.
Iteration 14537: Policy loss: -0.004426. Value loss: 0.081655. Entropy: 0.940234.
Iteration 14538: Policy loss: -0.008924. Value loss: 0.065855. Entropy: 0.936767.
Training network. lr: 0.000139. clip: 0.055459
Iteration 14539: Policy loss: 0.002863. Value loss: 0.081579. Entropy: 1.009762.
Iteration 14540: Policy loss: -0.004991. Value loss: 0.039815. Entropy: 0.994775.
Iteration 14541: Policy loss: -0.010476. Value loss: 0.030822. Entropy: 1.006324.
episode: 4870   score: 215.0  epsilon: 1.0    steps: 728  evaluation reward: 459.9
Training network. lr: 0.000139. clip: 0.055459
Iteration 14542: Policy loss: 0.004513. Value loss: 0.251123. Entropy: 1.074620.
Iteration 14543: Policy loss: -0.001312. Value loss: 0.095365. Entropy: 1.074801.
Iteration 14544: Policy loss: -0.006737. Value loss: 0.069292. Entropy: 1.074968.
Training network. lr: 0.000139. clip: 0.055459
Iteration 14545: Policy loss: 0.000677. V

Iteration 14604: Policy loss: -0.009500. Value loss: 0.035192. Entropy: 1.161946.
episode: 4890   score: 450.0  epsilon: 1.0    steps: 728  evaluation reward: 428.95
Training network. lr: 0.000138. clip: 0.055155
Iteration 14605: Policy loss: 0.002022. Value loss: 0.211839. Entropy: 1.164899.
Iteration 14606: Policy loss: -0.006059. Value loss: 0.106054. Entropy: 1.157129.
Iteration 14607: Policy loss: -0.011143. Value loss: 0.076598. Entropy: 1.156561.
Training network. lr: 0.000138. clip: 0.055155
Iteration 14608: Policy loss: 0.001756. Value loss: 0.273376. Entropy: 1.123296.
Iteration 14609: Policy loss: -0.004636. Value loss: 0.132315. Entropy: 1.113244.
Iteration 14610: Policy loss: -0.008656. Value loss: 0.094942. Entropy: 1.115678.
episode: 4891   score: 430.0  epsilon: 1.0    steps: 64  evaluation reward: 429.1
episode: 4892   score: 285.0  epsilon: 1.0    steps: 336  evaluation reward: 428.8
episode: 4893   score: 950.0  epsilon: 1.0    steps: 360  evaluation reward: 433.65
T

Iteration 14668: Policy loss: 0.000589. Value loss: 0.341531. Entropy: 1.121988.
Iteration 14669: Policy loss: -0.006857. Value loss: 0.256329. Entropy: 1.122425.
Iteration 14670: Policy loss: -0.008533. Value loss: 0.224572. Entropy: 1.127988.
Training network. lr: 0.000137. clip: 0.054998
Iteration 14671: Policy loss: 0.000020. Value loss: 0.279972. Entropy: 1.100329.
Iteration 14672: Policy loss: 0.000350. Value loss: 0.093206. Entropy: 1.107994.
Iteration 14673: Policy loss: -0.004329. Value loss: 0.054606. Entropy: 1.108921.
Training network. lr: 0.000137. clip: 0.054998
Iteration 14674: Policy loss: 0.002861. Value loss: 0.194160. Entropy: 1.236685.
Iteration 14675: Policy loss: -0.005302. Value loss: 0.089094. Entropy: 1.233386.
Iteration 14676: Policy loss: -0.010810. Value loss: 0.062911. Entropy: 1.229431.
episode: 4913   score: 515.0  epsilon: 1.0    steps: 280  evaluation reward: 436.2
episode: 4914   score: 315.0  epsilon: 1.0    steps: 776  evaluation reward: 434.0
Traini

Iteration 14735: Policy loss: -0.007103. Value loss: 0.038077. Entropy: 1.097350.
Iteration 14736: Policy loss: -0.011215. Value loss: 0.026879. Entropy: 1.092178.
episode: 4934   score: 210.0  epsilon: 1.0    steps: 512  evaluation reward: 435.75
Training network. lr: 0.000137. clip: 0.054851
Iteration 14737: Policy loss: 0.003168. Value loss: 0.085631. Entropy: 0.994800.
Iteration 14738: Policy loss: -0.006960. Value loss: 0.052660. Entropy: 0.992841.
Iteration 14739: Policy loss: -0.009069. Value loss: 0.044407. Entropy: 0.990806.
episode: 4935   score: 560.0  epsilon: 1.0    steps: 192  evaluation reward: 437.6
Training network. lr: 0.000137. clip: 0.054851
Iteration 14740: Policy loss: 0.003177. Value loss: 0.481539. Entropy: 1.058609.
Iteration 14741: Policy loss: 0.003143. Value loss: 0.334999. Entropy: 1.044307.
Iteration 14742: Policy loss: -0.000228. Value loss: 0.228033. Entropy: 1.049914.
episode: 4936   score: 345.0  epsilon: 1.0    steps: 8  evaluation reward: 437.2
Train

Iteration 14800: Policy loss: 0.002068. Value loss: 0.112571. Entropy: 1.012878.
Iteration 14801: Policy loss: -0.006948. Value loss: 0.065481. Entropy: 1.013548.
Iteration 14802: Policy loss: -0.010686. Value loss: 0.047667. Entropy: 1.014300.
Training network. lr: 0.000136. clip: 0.054537
Iteration 14803: Policy loss: 0.003370. Value loss: 0.108492. Entropy: 1.160790.
Iteration 14804: Policy loss: -0.007235. Value loss: 0.051072. Entropy: 1.167313.
Iteration 14805: Policy loss: -0.012145. Value loss: 0.036725. Entropy: 1.164758.
episode: 4956   score: 355.0  epsilon: 1.0    steps: 48  evaluation reward: 413.9
episode: 4957   score: 290.0  epsilon: 1.0    steps: 376  evaluation reward: 412.05
episode: 4958   score: 485.0  epsilon: 1.0    steps: 1024  evaluation reward: 413.0
Training network. lr: 0.000136. clip: 0.054537
Iteration 14806: Policy loss: 0.001290. Value loss: 0.090974. Entropy: 0.977157.
Iteration 14807: Policy loss: -0.007378. Value loss: 0.041606. Entropy: 0.973225.
Ite

episode: 4979   score: 330.0  epsilon: 1.0    steps: 696  evaluation reward: 407.75
episode: 4980   score: 260.0  epsilon: 1.0    steps: 928  evaluation reward: 407.35
Training network. lr: 0.000136. clip: 0.054390
Iteration 14866: Policy loss: 0.000828. Value loss: 0.101983. Entropy: 0.959600.
Iteration 14867: Policy loss: -0.009913. Value loss: 0.065163. Entropy: 0.959082.
Iteration 14868: Policy loss: -0.012440. Value loss: 0.053568. Entropy: 0.959988.
episode: 4981   score: 510.0  epsilon: 1.0    steps: 752  evaluation reward: 408.5
Training network. lr: 0.000136. clip: 0.054390
Iteration 14869: Policy loss: 0.004241. Value loss: 0.202941. Entropy: 0.983856.
Iteration 14870: Policy loss: 0.000538. Value loss: 0.111550. Entropy: 0.991396.
Iteration 14871: Policy loss: -0.004171. Value loss: 0.066273. Entropy: 0.987318.
Training network. lr: 0.000136. clip: 0.054390
Iteration 14872: Policy loss: 0.001163. Value loss: 0.201854. Entropy: 1.056990.
Iteration 14873: Policy loss: -0.00528

Iteration 14929: Policy loss: 0.000973. Value loss: 0.130359. Entropy: 1.174325.
Iteration 14930: Policy loss: -0.007492. Value loss: 0.066280. Entropy: 1.172919.
Iteration 14931: Policy loss: -0.010288. Value loss: 0.047921. Entropy: 1.174443.
episode: 5003   score: 485.0  epsilon: 1.0    steps: 432  evaluation reward: 391.8
episode: 5004   score: 305.0  epsilon: 1.0    steps: 536  evaluation reward: 389.55
episode: 5005   score: 210.0  epsilon: 1.0    steps: 880  evaluation reward: 387.15
Training network. lr: 0.000136. clip: 0.054233
Iteration 14932: Policy loss: 0.001902. Value loss: 0.124084. Entropy: 0.946915.
Iteration 14933: Policy loss: -0.004595. Value loss: 0.068960. Entropy: 0.943307.
Iteration 14934: Policy loss: -0.010500. Value loss: 0.054867. Entropy: 0.941802.
Training network. lr: 0.000136. clip: 0.054233
Iteration 14935: Policy loss: -0.000404. Value loss: 0.112298. Entropy: 1.089289.
Iteration 14936: Policy loss: -0.006312. Value loss: 0.068049. Entropy: 1.089703.
I

Iteration 14992: Policy loss: 0.002903. Value loss: 0.296782. Entropy: 0.823510.
Iteration 14993: Policy loss: 0.002438. Value loss: 0.190841. Entropy: 0.820316.
Iteration 14994: Policy loss: 0.002085. Value loss: 0.104547. Entropy: 0.816156.
episode: 5028   score: 635.0  epsilon: 1.0    steps: 472  evaluation reward: 367.4
Training network. lr: 0.000135. clip: 0.054077
Iteration 14995: Policy loss: 0.000242. Value loss: 0.098132. Entropy: 0.929989.
Iteration 14996: Policy loss: -0.006828. Value loss: 0.058649. Entropy: 0.920792.
Iteration 14997: Policy loss: -0.010684. Value loss: 0.050182. Entropy: 0.925023.
Training network. lr: 0.000135. clip: 0.054077
Iteration 14998: Policy loss: 0.003781. Value loss: 0.104055. Entropy: 1.025815.
Iteration 14999: Policy loss: -0.004861. Value loss: 0.064569. Entropy: 1.014922.
Iteration 15000: Policy loss: -0.004664. Value loss: 0.053937. Entropy: 1.017588.
Training network. lr: 0.000135. clip: 0.053929
Iteration 15001: Policy loss: 0.000508. Val

Iteration 15060: Policy loss: -0.006539. Value loss: 0.224161. Entropy: 0.987195.
episode: 5048   score: 390.0  epsilon: 1.0    steps: 704  evaluation reward: 373.65
episode: 5049   score: 210.0  epsilon: 1.0    steps: 920  evaluation reward: 370.55
Training network. lr: 0.000134. clip: 0.053773
Iteration 15061: Policy loss: 0.002008. Value loss: 0.102725. Entropy: 0.962519.
Iteration 15062: Policy loss: -0.003993. Value loss: 0.060022. Entropy: 0.966230.
Iteration 15063: Policy loss: -0.008796. Value loss: 0.047364. Entropy: 0.970645.
Training network. lr: 0.000134. clip: 0.053773
Iteration 15064: Policy loss: 0.001233. Value loss: 0.257565. Entropy: 0.937506.
Iteration 15065: Policy loss: -0.005082. Value loss: 0.130740. Entropy: 0.951170.
Iteration 15066: Policy loss: -0.011120. Value loss: 0.095575. Entropy: 0.954124.
Training network. lr: 0.000134. clip: 0.053773
Iteration 15067: Policy loss: 0.002173. Value loss: 0.348007. Entropy: 1.106697.
Iteration 15068: Policy loss: -0.00398

Training network. lr: 0.000134. clip: 0.053616
Iteration 15127: Policy loss: 0.001174. Value loss: 0.162807. Entropy: 0.966505.
Iteration 15128: Policy loss: -0.003185. Value loss: 0.095358. Entropy: 0.953022.
Iteration 15129: Policy loss: -0.008437. Value loss: 0.070622. Entropy: 0.955698.
episode: 5069   score: 395.0  epsilon: 1.0    steps: 568  evaluation reward: 395.65
Training network. lr: 0.000134. clip: 0.053616
Iteration 15130: Policy loss: 0.004399. Value loss: 0.206044. Entropy: 1.055315.
Iteration 15131: Policy loss: -0.001931. Value loss: 0.100828. Entropy: 1.053755.
Iteration 15132: Policy loss: -0.006136. Value loss: 0.067465. Entropy: 1.058320.
Training network. lr: 0.000134. clip: 0.053616
Iteration 15133: Policy loss: 0.004063. Value loss: 0.362838. Entropy: 1.093066.
Iteration 15134: Policy loss: 0.002553. Value loss: 0.146197. Entropy: 1.086976.
Iteration 15135: Policy loss: -0.003841. Value loss: 0.083706. Entropy: 1.091340.
episode: 5070   score: 455.0  epsilon: 1.

Iteration 15193: Policy loss: 0.002459. Value loss: 0.082813. Entropy: 1.063816.
Iteration 15194: Policy loss: -0.004996. Value loss: 0.045857. Entropy: 1.067367.
Iteration 15195: Policy loss: -0.010215. Value loss: 0.033932. Entropy: 1.061600.
Training network. lr: 0.000134. clip: 0.053468
Iteration 15196: Policy loss: 0.007279. Value loss: 0.767399. Entropy: 1.089639.
Iteration 15197: Policy loss: 0.006142. Value loss: 0.263930. Entropy: 1.082003.
Iteration 15198: Policy loss: 0.002449. Value loss: 0.136322. Entropy: 1.083076.
episode: 5090   score: 360.0  epsilon: 1.0    steps: 760  evaluation reward: 418.65
Training network. lr: 0.000134. clip: 0.053468
Iteration 15199: Policy loss: 0.002942. Value loss: 0.168081. Entropy: 1.198397.
Iteration 15200: Policy loss: -0.003612. Value loss: 0.082193. Entropy: 1.197679.
Iteration 15201: Policy loss: -0.009848. Value loss: 0.052914. Entropy: 1.196538.
episode: 5091   score: 895.0  epsilon: 1.0    steps: 448  evaluation reward: 424.45
Train

Training network. lr: 0.000133. clip: 0.053155
Iteration 15259: Policy loss: 0.003573. Value loss: 0.115552. Entropy: 1.076066.
Iteration 15260: Policy loss: -0.004100. Value loss: 0.066313. Entropy: 1.075294.
Iteration 15261: Policy loss: -0.007394. Value loss: 0.050975. Entropy: 1.068715.
Training network. lr: 0.000133. clip: 0.053155
Iteration 15262: Policy loss: 0.001246. Value loss: 0.096344. Entropy: 1.042368.
Iteration 15263: Policy loss: -0.007295. Value loss: 0.040707. Entropy: 1.037850.
Iteration 15264: Policy loss: -0.011794. Value loss: 0.028217. Entropy: 1.041425.
Training network. lr: 0.000133. clip: 0.053155
Iteration 15265: Policy loss: 0.001070. Value loss: 0.211406. Entropy: 1.139189.
Iteration 15266: Policy loss: -0.002902. Value loss: 0.122860. Entropy: 1.143513.
Iteration 15267: Policy loss: -0.008233. Value loss: 0.083111. Entropy: 1.141065.
episode: 5112   score: 390.0  epsilon: 1.0    steps: 112  evaluation reward: 433.5
episode: 5113   score: 330.0  epsilon: 1.

Training network. lr: 0.000133. clip: 0.053008
Iteration 15325: Policy loss: 0.004421. Value loss: 0.171464. Entropy: 1.016559.
Iteration 15326: Policy loss: -0.002810. Value loss: 0.088527. Entropy: 1.012689.
Iteration 15327: Policy loss: -0.007691. Value loss: 0.061745. Entropy: 1.021400.
episode: 5134   score: 290.0  epsilon: 1.0    steps: 968  evaluation reward: 449.25
Training network. lr: 0.000133. clip: 0.053008
Iteration 15328: Policy loss: 0.005147. Value loss: 0.143900. Entropy: 1.063974.
Iteration 15329: Policy loss: -0.002187. Value loss: 0.079350. Entropy: 1.061265.
Iteration 15330: Policy loss: -0.007930. Value loss: 0.060159. Entropy: 1.057455.
episode: 5135   score: 420.0  epsilon: 1.0    steps: 256  evaluation reward: 449.55
Training network. lr: 0.000133. clip: 0.053008
Iteration 15331: Policy loss: 0.002646. Value loss: 0.108825. Entropy: 1.034060.
Iteration 15332: Policy loss: -0.005346. Value loss: 0.062218. Entropy: 1.028867.
Iteration 15333: Policy loss: -0.00916

Training network. lr: 0.000132. clip: 0.052851
Iteration 15391: Policy loss: 0.000734. Value loss: 0.072389. Entropy: 1.067206.
Iteration 15392: Policy loss: -0.005365. Value loss: 0.035407. Entropy: 1.073853.
Iteration 15393: Policy loss: -0.010096. Value loss: 0.025760. Entropy: 1.068118.
episode: 5155   score: 425.0  epsilon: 1.0    steps: 448  evaluation reward: 433.85
episode: 5156   score: 215.0  epsilon: 1.0    steps: 488  evaluation reward: 431.9
episode: 5157   score: 410.0  epsilon: 1.0    steps: 840  evaluation reward: 430.25
Training network. lr: 0.000132. clip: 0.052851
Iteration 15394: Policy loss: 0.002128. Value loss: 0.106650. Entropy: 0.976393.
Iteration 15395: Policy loss: -0.003219. Value loss: 0.053571. Entropy: 0.973945.
Iteration 15396: Policy loss: -0.008478. Value loss: 0.041164. Entropy: 0.974498.
episode: 5158   score: 495.0  epsilon: 1.0    steps: 816  evaluation reward: 429.9
Training network. lr: 0.000132. clip: 0.052851
Iteration 15397: Policy loss: 0.001

episode: 5177   score: 315.0  epsilon: 1.0    steps: 344  evaluation reward: 411.1
episode: 5178   score: 470.0  epsilon: 1.0    steps: 664  evaluation reward: 411.6
Training network. lr: 0.000131. clip: 0.052547
Iteration 15457: Policy loss: 0.001920. Value loss: 0.229397. Entropy: 1.042657.
Iteration 15458: Policy loss: -0.001771. Value loss: 0.110763. Entropy: 1.047966.
Iteration 15459: Policy loss: -0.008812. Value loss: 0.083024. Entropy: 1.041854.
episode: 5179   score: 610.0  epsilon: 1.0    steps: 696  evaluation reward: 414.8
Training network. lr: 0.000131. clip: 0.052547
Iteration 15460: Policy loss: 0.004511. Value loss: 0.174717. Entropy: 1.011560.
Iteration 15461: Policy loss: -0.000964. Value loss: 0.082078. Entropy: 1.007849.
Iteration 15462: Policy loss: -0.008624. Value loss: 0.056009. Entropy: 1.008747.
Training network. lr: 0.000131. clip: 0.052547
Iteration 15463: Policy loss: 0.001788. Value loss: 0.218139. Entropy: 1.094304.
Iteration 15464: Policy loss: -0.002992

Training network. lr: 0.000131. clip: 0.052390
Iteration 15523: Policy loss: 0.001119. Value loss: 0.159545. Entropy: 1.011268.
Iteration 15524: Policy loss: -0.003265. Value loss: 0.091538. Entropy: 1.015860.
Iteration 15525: Policy loss: -0.006631. Value loss: 0.071137. Entropy: 1.015049.
episode: 5199   score: 545.0  epsilon: 1.0    steps: 216  evaluation reward: 419.15
Training network. lr: 0.000131. clip: 0.052390
Iteration 15526: Policy loss: 0.004854. Value loss: 0.131119. Entropy: 1.072141.
Iteration 15527: Policy loss: -0.005671. Value loss: 0.060126. Entropy: 1.072530.
Iteration 15528: Policy loss: -0.009583. Value loss: 0.046287. Entropy: 1.069926.
episode: 5200   score: 750.0  epsilon: 1.0    steps: 128  evaluation reward: 424.55
Training network. lr: 0.000131. clip: 0.052390
Iteration 15529: Policy loss: 0.000818. Value loss: 0.120399. Entropy: 0.968238.
Iteration 15530: Policy loss: -0.005571. Value loss: 0.059156. Entropy: 0.965091.
Iteration 15531: Policy loss: -0.00951

episode: 5220   score: 670.0  epsilon: 1.0    steps: 1000  evaluation reward: 420.95
Training network. lr: 0.000131. clip: 0.052233
Iteration 15589: Policy loss: 0.001591. Value loss: 0.151292. Entropy: 1.033177.
Iteration 15590: Policy loss: -0.006652. Value loss: 0.065025. Entropy: 1.023107.
Iteration 15591: Policy loss: -0.009205. Value loss: 0.046080. Entropy: 1.029038.
episode: 5221   score: 530.0  epsilon: 1.0    steps: 752  evaluation reward: 424.45
episode: 5222   score: 350.0  epsilon: 1.0    steps: 888  evaluation reward: 422.25
Training network. lr: 0.000131. clip: 0.052233
Iteration 15592: Policy loss: -0.001383. Value loss: 0.086325. Entropy: 0.978691.
Iteration 15593: Policy loss: -0.005606. Value loss: 0.043950. Entropy: 0.980425.
Iteration 15594: Policy loss: -0.011781. Value loss: 0.037803. Entropy: 0.984745.
Training network. lr: 0.000131. clip: 0.052233
Iteration 15595: Policy loss: 0.001881. Value loss: 0.146075. Entropy: 0.999286.
Iteration 15596: Policy loss: -0.0

Training network. lr: 0.000130. clip: 0.051929
Iteration 15655: Policy loss: 0.003155. Value loss: 0.125033. Entropy: 1.014004.
Iteration 15656: Policy loss: -0.003656. Value loss: 0.059962. Entropy: 1.023048.
Iteration 15657: Policy loss: -0.009810. Value loss: 0.041301. Entropy: 1.018674.
episode: 5242   score: 430.0  epsilon: 1.0    steps: 200  evaluation reward: 418.15
Training network. lr: 0.000130. clip: 0.051929
Iteration 15658: Policy loss: 0.011137. Value loss: 0.628401. Entropy: 1.067695.
Iteration 15659: Policy loss: 0.009794. Value loss: 0.336525. Entropy: 1.068819.
Iteration 15660: Policy loss: 0.002012. Value loss: 0.246849. Entropy: 1.070107.
episode: 5243   score: 595.0  epsilon: 1.0    steps: 56  evaluation reward: 418.45
episode: 5244   score: 680.0  epsilon: 1.0    steps: 72  evaluation reward: 421.95
episode: 5245   score: 270.0  epsilon: 1.0    steps: 816  evaluation reward: 422.05
Training network. lr: 0.000130. clip: 0.051929
Iteration 15661: Policy loss: 0.00204

Training network. lr: 0.000129. clip: 0.051773
Iteration 15721: Policy loss: 0.004725. Value loss: 0.109006. Entropy: 0.893917.
Iteration 15722: Policy loss: -0.002853. Value loss: 0.050989. Entropy: 0.903387.
Iteration 15723: Policy loss: -0.008177. Value loss: 0.037388. Entropy: 0.895424.
episode: 5263   score: 420.0  epsilon: 1.0    steps: 544  evaluation reward: 433.8
episode: 5264   score: 285.0  epsilon: 1.0    steps: 864  evaluation reward: 434.5
Training network. lr: 0.000129. clip: 0.051773
Iteration 15724: Policy loss: 0.004097. Value loss: 0.169951. Entropy: 1.022559.
Iteration 15725: Policy loss: -0.002456. Value loss: 0.078615. Entropy: 1.016654.
Iteration 15726: Policy loss: -0.007060. Value loss: 0.060625. Entropy: 1.020175.
Training network. lr: 0.000129. clip: 0.051773
Iteration 15727: Policy loss: 0.002529. Value loss: 0.103778. Entropy: 1.047364.
Iteration 15728: Policy loss: -0.003090. Value loss: 0.060355. Entropy: 1.057006.
Iteration 15729: Policy loss: -0.005830.

Iteration 15787: Policy loss: 0.002591. Value loss: 0.101778. Entropy: 0.891120.
Iteration 15788: Policy loss: -0.004857. Value loss: 0.049444. Entropy: 0.878554.
Iteration 15789: Policy loss: -0.008579. Value loss: 0.039518. Entropy: 0.884607.
Training network. lr: 0.000129. clip: 0.051625
Iteration 15790: Policy loss: 0.004108. Value loss: 0.256636. Entropy: 1.009743.
Iteration 15791: Policy loss: -0.002301. Value loss: 0.141466. Entropy: 1.010941.
Iteration 15792: Policy loss: -0.005632. Value loss: 0.090778. Entropy: 1.014939.
episode: 5284   score: 525.0  epsilon: 1.0    steps: 832  evaluation reward: 440.9
episode: 5285   score: 410.0  epsilon: 1.0    steps: 888  evaluation reward: 439.1
Training network. lr: 0.000129. clip: 0.051625
Iteration 15793: Policy loss: 0.001018. Value loss: 0.148091. Entropy: 1.040486.
Iteration 15794: Policy loss: -0.001714. Value loss: 0.056694. Entropy: 1.036882.
Iteration 15795: Policy loss: -0.004186. Value loss: 0.043925. Entropy: 1.030846.
Train

Iteration 15855: Policy loss: -0.011700. Value loss: 0.070947. Entropy: 0.940809.
episode: 5303   score: 385.0  epsilon: 1.0    steps: 552  evaluation reward: 458.05
Training network. lr: 0.000128. clip: 0.051312
Iteration 15856: Policy loss: 0.003425. Value loss: 0.106076. Entropy: 0.896038.
Iteration 15857: Policy loss: -0.005133. Value loss: 0.058587. Entropy: 0.887532.
Iteration 15858: Policy loss: -0.006953. Value loss: 0.044094. Entropy: 0.878819.
episode: 5304   score: 670.0  epsilon: 1.0    steps: 880  evaluation reward: 458.05
Training network. lr: 0.000128. clip: 0.051312
Iteration 15859: Policy loss: 0.005080. Value loss: 0.135265. Entropy: 0.826537.
Iteration 15860: Policy loss: 0.000008. Value loss: 0.058193. Entropy: 0.816865.
Iteration 15861: Policy loss: -0.003885. Value loss: 0.039121. Entropy: 0.804532.
Training network. lr: 0.000128. clip: 0.051312
Iteration 15862: Policy loss: 0.004015. Value loss: 0.722693. Entropy: 0.945413.
Iteration 15863: Policy loss: 0.003495.

Iteration 15922: Policy loss: 0.005100. Value loss: 0.102067. Entropy: 0.942609.
Iteration 15923: Policy loss: -0.002399. Value loss: 0.051230. Entropy: 0.944879.
Iteration 15924: Policy loss: -0.007672. Value loss: 0.037109. Entropy: 0.950155.
episode: 5323   score: 670.0  epsilon: 1.0    steps: 136  evaluation reward: 476.25
Training network. lr: 0.000128. clip: 0.051164
Iteration 15925: Policy loss: 0.004084. Value loss: 0.455873. Entropy: 0.897578.
Iteration 15926: Policy loss: 0.000410. Value loss: 0.207743. Entropy: 0.911235.
Iteration 15927: Policy loss: -0.000047. Value loss: 0.110312. Entropy: 0.916617.
episode: 5324   score: 360.0  epsilon: 1.0    steps: 88  evaluation reward: 477.4
Training network. lr: 0.000128. clip: 0.051164
Iteration 15928: Policy loss: 0.001945. Value loss: 0.150520. Entropy: 1.029268.
Iteration 15929: Policy loss: -0.004874. Value loss: 0.067177. Entropy: 1.031700.
Iteration 15930: Policy loss: -0.007395. Value loss: 0.048062. Entropy: 1.033647.
episod

Training network. lr: 0.000128. clip: 0.051008
Iteration 15988: Policy loss: 0.004049. Value loss: 0.610087. Entropy: 1.084678.
Iteration 15989: Policy loss: 0.001969. Value loss: 0.405603. Entropy: 1.082653.
Iteration 15990: Policy loss: 0.000136. Value loss: 0.281981. Entropy: 1.083964.
episode: 5345   score: 410.0  epsilon: 1.0    steps: 328  evaluation reward: 490.5
Training network. lr: 0.000128. clip: 0.051008
Iteration 15991: Policy loss: 0.003392. Value loss: 0.171016. Entropy: 1.067724.
Iteration 15992: Policy loss: -0.002541. Value loss: 0.095106. Entropy: 1.069424.
Iteration 15993: Policy loss: -0.005650. Value loss: 0.075189. Entropy: 1.059958.
Training network. lr: 0.000128. clip: 0.051008
Iteration 15994: Policy loss: 0.003792. Value loss: 0.202134. Entropy: 1.095362.
Iteration 15995: Policy loss: -0.001746. Value loss: 0.094365. Entropy: 1.093121.
Iteration 15996: Policy loss: -0.004638. Value loss: 0.057753. Entropy: 1.100099.
episode: 5346   score: 580.0  epsilon: 1.0 

Training network. lr: 0.000127. clip: 0.050704
Iteration 16054: Policy loss: 0.006680. Value loss: 0.219198. Entropy: 1.048284.
Iteration 16055: Policy loss: -0.000562. Value loss: 0.126438. Entropy: 1.055317.
Iteration 16056: Policy loss: -0.005130. Value loss: 0.098037. Entropy: 1.050491.
episode: 5366   score: 410.0  epsilon: 1.0    steps: 672  evaluation reward: 492.6
Training network. lr: 0.000127. clip: 0.050704
Iteration 16057: Policy loss: 0.000691. Value loss: 0.200203. Entropy: 1.135919.
Iteration 16058: Policy loss: -0.001223. Value loss: 0.086571. Entropy: 1.143937.
Iteration 16059: Policy loss: -0.008215. Value loss: 0.061390. Entropy: 1.144047.
Training network. lr: 0.000127. clip: 0.050704
Iteration 16060: Policy loss: 0.001831. Value loss: 0.085225. Entropy: 1.135844.
Iteration 16061: Policy loss: -0.001612. Value loss: 0.039014. Entropy: 1.133104.
Iteration 16062: Policy loss: -0.009492. Value loss: 0.028597. Entropy: 1.131216.
episode: 5367   score: 620.0  epsilon: 1.

Iteration 16122: Policy loss: -0.001042. Value loss: 0.075223. Entropy: 0.968773.
Training network. lr: 0.000126. clip: 0.050547
Iteration 16123: Policy loss: 0.001822. Value loss: 0.669830. Entropy: 1.099798.
Iteration 16124: Policy loss: 0.001581. Value loss: 0.405422. Entropy: 1.099472.
Iteration 16125: Policy loss: -0.004248. Value loss: 0.316172. Entropy: 1.092998.
episode: 5385   score: 390.0  epsilon: 1.0    steps: 392  evaluation reward: 499.25
episode: 5386   score: 765.0  epsilon: 1.0    steps: 872  evaluation reward: 502.95
episode: 5387   score: 640.0  epsilon: 1.0    steps: 904  evaluation reward: 502.2
Training network. lr: 0.000126. clip: 0.050547
Iteration 16126: Policy loss: 0.001980. Value loss: 0.481196. Entropy: 1.039432.
Iteration 16127: Policy loss: -0.004708. Value loss: 0.333415. Entropy: 1.035578.
Iteration 16128: Policy loss: -0.007526. Value loss: 0.259776. Entropy: 1.039517.
Training network. lr: 0.000126. clip: 0.050547
Iteration 16129: Policy loss: 0.00280

Iteration 16186: Policy loss: 0.000787. Value loss: 0.122821. Entropy: 0.967935.
Iteration 16187: Policy loss: -0.003408. Value loss: 0.075816. Entropy: 0.977635.
Iteration 16188: Policy loss: -0.006990. Value loss: 0.058433. Entropy: 0.972020.
episode: 5410   score: 375.0  epsilon: 1.0    steps: 928  evaluation reward: 501.65
Training network. lr: 0.000126. clip: 0.050390
Iteration 16189: Policy loss: 0.003587. Value loss: 0.114919. Entropy: 0.931882.
Iteration 16190: Policy loss: -0.001613. Value loss: 0.054046. Entropy: 0.936581.
Iteration 16191: Policy loss: -0.006288. Value loss: 0.038816. Entropy: 0.933556.
Training network. lr: 0.000126. clip: 0.050390
Iteration 16192: Policy loss: 0.006218. Value loss: 0.299488. Entropy: 0.916102.
Iteration 16193: Policy loss: 0.004013. Value loss: 0.087416. Entropy: 0.919983.
Iteration 16194: Policy loss: -0.006203. Value loss: 0.052317. Entropy: 0.917221.
Training network. lr: 0.000126. clip: 0.050390
Iteration 16195: Policy loss: 0.001489. V

Training network. lr: 0.000125. clip: 0.050086
Iteration 16255: Policy loss: 0.003581. Value loss: 0.236127. Entropy: 1.060181.
Iteration 16256: Policy loss: -0.003971. Value loss: 0.104695. Entropy: 1.065546.
Iteration 16257: Policy loss: -0.009189. Value loss: 0.071022. Entropy: 1.056185.
episode: 5429   score: 765.0  epsilon: 1.0    steps: 352  evaluation reward: 489.1
Training network. lr: 0.000125. clip: 0.050086
Iteration 16258: Policy loss: 0.002828. Value loss: 0.728522. Entropy: 1.124524.
Iteration 16259: Policy loss: 0.000385. Value loss: 0.498751. Entropy: 1.115059.
Iteration 16260: Policy loss: -0.002669. Value loss: 0.430331. Entropy: 1.118466.
Training network. lr: 0.000125. clip: 0.050086
Iteration 16261: Policy loss: 0.005036. Value loss: 0.203961. Entropy: 1.219411.
Iteration 16262: Policy loss: -0.004435. Value loss: 0.107549. Entropy: 1.220464.
Iteration 16263: Policy loss: -0.008389. Value loss: 0.079037. Entropy: 1.215554.
Training network. lr: 0.000125. clip: 0.05

Iteration 16322: Policy loss: -0.004120. Value loss: 0.109339. Entropy: 1.054279.
Iteration 16323: Policy loss: -0.007237. Value loss: 0.083541. Entropy: 1.057658.
episode: 5449   score: 495.0  epsilon: 1.0    steps: 88  evaluation reward: 496.15
Training network. lr: 0.000125. clip: 0.049929
Iteration 16324: Policy loss: 0.004032. Value loss: 0.212333. Entropy: 1.000476.
Iteration 16325: Policy loss: -0.002229. Value loss: 0.117520. Entropy: 0.992087.
Iteration 16326: Policy loss: -0.006403. Value loss: 0.089157. Entropy: 0.997901.
Training network. lr: 0.000125. clip: 0.049929
Iteration 16327: Policy loss: 0.000860. Value loss: 0.142344. Entropy: 1.181581.
Iteration 16328: Policy loss: -0.005919. Value loss: 0.065923. Entropy: 1.186980.
Iteration 16329: Policy loss: -0.011122. Value loss: 0.044899. Entropy: 1.183644.
episode: 5450   score: 185.0  epsilon: 1.0    steps: 400  evaluation reward: 493.0
now time :  2019-03-06 00:14:24.991667
episode: 5451   score: 440.0  epsilon: 1.0    s

Iteration 16387: Policy loss: 0.000638. Value loss: 0.154570. Entropy: 1.049378.
Iteration 16388: Policy loss: -0.005519. Value loss: 0.085776. Entropy: 1.047925.
Iteration 16389: Policy loss: -0.010758. Value loss: 0.068936. Entropy: 1.049529.
episode: 5471   score: 285.0  epsilon: 1.0    steps: 712  evaluation reward: 494.2
Training network. lr: 0.000124. clip: 0.049782
Iteration 16390: Policy loss: 0.003615. Value loss: 0.207220. Entropy: 0.943328.
Iteration 16391: Policy loss: -0.000134. Value loss: 0.102107. Entropy: 0.944498.
Iteration 16392: Policy loss: -0.003287. Value loss: 0.076886. Entropy: 0.945351.
episode: 5472   score: 670.0  epsilon: 1.0    steps: 544  evaluation reward: 493.45
Training network. lr: 0.000124. clip: 0.049782
Iteration 16393: Policy loss: 0.003020. Value loss: 0.136520. Entropy: 0.918267.
Iteration 16394: Policy loss: -0.003479. Value loss: 0.088313. Entropy: 0.919941.
Iteration 16395: Policy loss: -0.004915. Value loss: 0.079184. Entropy: 0.920359.
epis

Iteration 16453: Policy loss: 0.002475. Value loss: 0.283755. Entropy: 1.116745.
Iteration 16454: Policy loss: 0.002264. Value loss: 0.104439. Entropy: 1.118660.
Iteration 16455: Policy loss: -0.007453. Value loss: 0.072618. Entropy: 1.123165.
episode: 5493   score: 260.0  epsilon: 1.0    steps: 968  evaluation reward: 452.2
Training network. lr: 0.000124. clip: 0.049469
Iteration 16456: Policy loss: 0.003954. Value loss: 0.144894. Entropy: 1.056107.
Iteration 16457: Policy loss: -0.001346. Value loss: 0.077659. Entropy: 1.051137.
Iteration 16458: Policy loss: -0.007835. Value loss: 0.054143. Entropy: 1.044814.
episode: 5494   score: 670.0  epsilon: 1.0    steps: 384  evaluation reward: 455.0
Training network. lr: 0.000124. clip: 0.049469
Iteration 16459: Policy loss: 0.006054. Value loss: 0.328399. Entropy: 0.998489.
Iteration 16460: Policy loss: 0.000377. Value loss: 0.139463. Entropy: 1.014484.
Iteration 16461: Policy loss: -0.003774. Value loss: 0.099390. Entropy: 1.011678.
Trainin

Iteration 16523: Policy loss: -0.002751. Value loss: 0.145937. Entropy: 1.101933.
Iteration 16524: Policy loss: -0.006173. Value loss: 0.095113. Entropy: 1.096817.
episode: 5510   score: 515.0  epsilon: 1.0    steps: 432  evaluation reward: 470.3
episode: 5511   score: 605.0  epsilon: 1.0    steps: 720  evaluation reward: 472.9
episode: 5512   score: 390.0  epsilon: 1.0    steps: 928  evaluation reward: 471.55
Training network. lr: 0.000123. clip: 0.049321
Iteration 16525: Policy loss: 0.002546. Value loss: 0.222735. Entropy: 1.101851.
Iteration 16526: Policy loss: -0.004217. Value loss: 0.131786. Entropy: 1.095548.
Iteration 16527: Policy loss: -0.006954. Value loss: 0.093380. Entropy: 1.098025.
episode: 5513   score: 470.0  epsilon: 1.0    steps: 344  evaluation reward: 470.7
episode: 5514   score: 310.0  epsilon: 1.0    steps: 424  evaluation reward: 467.45
Training network. lr: 0.000123. clip: 0.049321
Iteration 16528: Policy loss: 0.001240. Value loss: 0.107696. Entropy: 0.842217.

Training network. lr: 0.000123. clip: 0.049165
Iteration 16588: Policy loss: 0.001630. Value loss: 0.100367. Entropy: 1.053909.
Iteration 16589: Policy loss: -0.006161. Value loss: 0.044062. Entropy: 1.051830.
Iteration 16590: Policy loss: -0.009934. Value loss: 0.033979. Entropy: 1.048404.
episode: 5533   score: 705.0  epsilon: 1.0    steps: 808  evaluation reward: 474.55
Training network. lr: 0.000123. clip: 0.049165
Iteration 16591: Policy loss: 0.002013. Value loss: 0.175872. Entropy: 1.049298.
Iteration 16592: Policy loss: -0.000643. Value loss: 0.066366. Entropy: 1.053207.
Iteration 16593: Policy loss: -0.005411. Value loss: 0.049582. Entropy: 1.047008.
episode: 5534   score: 210.0  epsilon: 1.0    steps: 24  evaluation reward: 471.0
Training network. lr: 0.000123. clip: 0.049165
Iteration 16594: Policy loss: 0.003210. Value loss: 0.221200. Entropy: 0.964470.
Iteration 16595: Policy loss: -0.001413. Value loss: 0.104599. Entropy: 0.971837.
Iteration 16596: Policy loss: -0.004937.

Iteration 16652: Policy loss: -0.001725. Value loss: 0.436009. Entropy: 1.033808.
Iteration 16653: Policy loss: -0.005029. Value loss: 0.387091. Entropy: 1.040028.
episode: 5556   score: 235.0  epsilon: 1.0    steps: 760  evaluation reward: 451.0
Training network. lr: 0.000122. clip: 0.048860
Iteration 16654: Policy loss: 0.002758. Value loss: 0.165011. Entropy: 1.044191.
Iteration 16655: Policy loss: -0.000459. Value loss: 0.093204. Entropy: 1.041174.
Iteration 16656: Policy loss: -0.009085. Value loss: 0.073857. Entropy: 1.041375.
Training network. lr: 0.000122. clip: 0.048860
Iteration 16657: Policy loss: 0.005509. Value loss: 0.125225. Entropy: 1.086240.
Iteration 16658: Policy loss: -0.003633. Value loss: 0.064891. Entropy: 1.076500.
Iteration 16659: Policy loss: -0.008976. Value loss: 0.049136. Entropy: 1.077048.
episode: 5557   score: 380.0  epsilon: 1.0    steps: 48  evaluation reward: 450.3
episode: 5558   score: 290.0  epsilon: 1.0    steps: 424  evaluation reward: 449.6
epis

Iteration 16719: Policy loss: -0.007202. Value loss: 0.060360. Entropy: 0.919224.
episode: 5577   score: 490.0  epsilon: 1.0    steps: 352  evaluation reward: 464.2
episode: 5578   score: 570.0  epsilon: 1.0    steps: 448  evaluation reward: 465.4
Training network. lr: 0.000122. clip: 0.048704
Iteration 16720: Policy loss: 0.005479. Value loss: 0.394774. Entropy: 0.859144.
Iteration 16721: Policy loss: 0.006961. Value loss: 0.208227. Entropy: 0.862104.
Iteration 16722: Policy loss: 0.002068. Value loss: 0.093387. Entropy: 0.846184.
Training network. lr: 0.000122. clip: 0.048704
Iteration 16723: Policy loss: 0.004829. Value loss: 0.143388. Entropy: 0.960940.
Iteration 16724: Policy loss: -0.002433. Value loss: 0.077283. Entropy: 0.956879.
Iteration 16725: Policy loss: -0.007451. Value loss: 0.060199. Entropy: 0.959020.
episode: 5579   score: 210.0  epsilon: 1.0    steps: 664  evaluation reward: 461.45
Training network. lr: 0.000122. clip: 0.048704
Iteration 16726: Policy loss: 0.001667.

Iteration 16785: Policy loss: -0.006171. Value loss: 0.060161. Entropy: 0.922047.
episode: 5599   score: 355.0  epsilon: 1.0    steps: 832  evaluation reward: 480.4
Training network. lr: 0.000121. clip: 0.048547
Iteration 16786: Policy loss: 0.002409. Value loss: 0.148399. Entropy: 0.946253.
Iteration 16787: Policy loss: -0.004115. Value loss: 0.075944. Entropy: 0.939805.
Iteration 16788: Policy loss: -0.004987. Value loss: 0.059454. Entropy: 0.945176.
Training network. lr: 0.000121. clip: 0.048547
Iteration 16789: Policy loss: 0.006326. Value loss: 0.357310. Entropy: 0.905970.
Iteration 16790: Policy loss: -0.003304. Value loss: 0.197078. Entropy: 0.897158.
Iteration 16791: Policy loss: -0.007860. Value loss: 0.146200. Entropy: 0.906684.
Training network. lr: 0.000121. clip: 0.048547
Iteration 16792: Policy loss: 0.003864. Value loss: 0.202914. Entropy: 1.107724.
Iteration 16793: Policy loss: -0.002672. Value loss: 0.079542. Entropy: 1.104396.
Iteration 16794: Policy loss: -0.008334. 

Iteration 16851: Policy loss: -0.005204. Value loss: 0.058352. Entropy: 0.957592.
Training network. lr: 0.000121. clip: 0.048243
Iteration 16852: Policy loss: 0.000896. Value loss: 0.170089. Entropy: 1.086663.
Iteration 16853: Policy loss: -0.003201. Value loss: 0.084824. Entropy: 1.087092.
Iteration 16854: Policy loss: -0.009523. Value loss: 0.063567. Entropy: 1.088052.
Training network. lr: 0.000121. clip: 0.048243
Iteration 16855: Policy loss: 0.001260. Value loss: 0.380359. Entropy: 1.135262.
Iteration 16856: Policy loss: -0.002039. Value loss: 0.296387. Entropy: 1.130839.
Iteration 16857: Policy loss: -0.005262. Value loss: 0.254767. Entropy: 1.129263.
episode: 5620   score: 320.0  epsilon: 1.0    steps: 344  evaluation reward: 482.85
episode: 5621   score: 290.0  epsilon: 1.0    steps: 368  evaluation reward: 479.3
episode: 5622   score: 465.0  epsilon: 1.0    steps: 896  evaluation reward: 475.0
Training network. lr: 0.000121. clip: 0.048243
Iteration 16858: Policy loss: 0.00384

Training network. lr: 0.000120. clip: 0.048086
Iteration 16918: Policy loss: 0.004214. Value loss: 0.130412. Entropy: 0.933850.
Iteration 16919: Policy loss: -0.004664. Value loss: 0.067520. Entropy: 0.931571.
Iteration 16920: Policy loss: -0.006807. Value loss: 0.053204. Entropy: 0.929553.
Training network. lr: 0.000120. clip: 0.048086
Iteration 16921: Policy loss: 0.002787. Value loss: 0.356568. Entropy: 1.023831.
Iteration 16922: Policy loss: -0.002886. Value loss: 0.256580. Entropy: 1.022468.
Iteration 16923: Policy loss: -0.005064. Value loss: 0.205926. Entropy: 1.018935.
episode: 5641   score: 725.0  epsilon: 1.0    steps: 536  evaluation reward: 482.4
Training network. lr: 0.000120. clip: 0.048086
Iteration 16924: Policy loss: 0.005667. Value loss: 0.328992. Entropy: 0.976201.
Iteration 16925: Policy loss: -0.000783. Value loss: 0.160700. Entropy: 0.976715.
Iteration 16926: Policy loss: -0.004034. Value loss: 0.106799. Entropy: 0.982060.
episode: 5642   score: 330.0  epsilon: 1.

episode: 5662   score: 955.0  epsilon: 1.0    steps: 1016  evaluation reward: 489.05
Training network. lr: 0.000120. clip: 0.047939
Iteration 16984: Policy loss: 0.000398. Value loss: 0.165530. Entropy: 1.069051.
Iteration 16985: Policy loss: -0.006955. Value loss: 0.092066. Entropy: 1.066090.
Iteration 16986: Policy loss: -0.010441. Value loss: 0.069633. Entropy: 1.067583.
episode: 5663   score: 535.0  epsilon: 1.0    steps: 1016  evaluation reward: 488.0
Training network. lr: 0.000120. clip: 0.047939
Iteration 16987: Policy loss: 0.002549. Value loss: 0.153149. Entropy: 0.845563.
Iteration 16988: Policy loss: -0.002645. Value loss: 0.064766. Entropy: 0.854307.
Iteration 16989: Policy loss: -0.005726. Value loss: 0.048304. Entropy: 0.843805.
episode: 5664   score: 390.0  epsilon: 1.0    steps: 8  evaluation reward: 487.75
Training network. lr: 0.000120. clip: 0.047939
Iteration 16990: Policy loss: 0.002221. Value loss: 0.107183. Entropy: 0.757515.
Iteration 16991: Policy loss: -0.0052

Iteration 17052: Policy loss: -0.010683. Value loss: 0.057489. Entropy: 0.996219.
episode: 5681   score: 315.0  epsilon: 1.0    steps: 320  evaluation reward: 506.05
Training network. lr: 0.000119. clip: 0.047625
Iteration 17053: Policy loss: 0.002652. Value loss: 0.179331. Entropy: 0.968293.
Iteration 17054: Policy loss: -0.004178. Value loss: 0.101885. Entropy: 0.974799.
Iteration 17055: Policy loss: -0.010192. Value loss: 0.076203. Entropy: 0.969200.
episode: 5682   score: 390.0  epsilon: 1.0    steps: 304  evaluation reward: 505.5
episode: 5683   score: 515.0  epsilon: 1.0    steps: 632  evaluation reward: 506.4
Training network. lr: 0.000119. clip: 0.047625
Iteration 17056: Policy loss: 0.001002. Value loss: 0.226346. Entropy: 0.897817.
Iteration 17057: Policy loss: -0.002978. Value loss: 0.082879. Entropy: 0.913619.
Iteration 17058: Policy loss: -0.006456. Value loss: 0.056545. Entropy: 0.913829.
Training network. lr: 0.000119. clip: 0.047625
Iteration 17059: Policy loss: 0.00379

Iteration 17118: Policy loss: -0.004428. Value loss: 0.037391. Entropy: 0.739811.
Training network. lr: 0.000119. clip: 0.047478
Iteration 17119: Policy loss: 0.003540. Value loss: 0.067977. Entropy: 0.933977.
Iteration 17120: Policy loss: -0.001025. Value loss: 0.043170. Entropy: 0.932297.
Iteration 17121: Policy loss: -0.007164. Value loss: 0.036288. Entropy: 0.931335.
Training network. lr: 0.000119. clip: 0.047478
Iteration 17122: Policy loss: 0.006220. Value loss: 0.530330. Entropy: 1.133554.
Iteration 17123: Policy loss: -0.000602. Value loss: 0.292157. Entropy: 1.135677.
Iteration 17124: Policy loss: -0.002423. Value loss: 0.213392. Entropy: 1.134893.
Training network. lr: 0.000119. clip: 0.047478
Iteration 17125: Policy loss: 0.002035. Value loss: 0.166455. Entropy: 1.126815.
Iteration 17126: Policy loss: -0.004521. Value loss: 0.080443. Entropy: 1.123619.
Iteration 17127: Policy loss: -0.008145. Value loss: 0.057528. Entropy: 1.120700.
episode: 5702   score: 390.0  epsilon: 1.0

episode: 5719   score: 365.0  epsilon: 1.0    steps: 704  evaluation reward: 511.5
Training network. lr: 0.000118. clip: 0.047321
Iteration 17188: Policy loss: 0.002031. Value loss: 0.165576. Entropy: 0.919826.
Iteration 17189: Policy loss: -0.004623. Value loss: 0.108248. Entropy: 0.917669.
Iteration 17190: Policy loss: -0.007691. Value loss: 0.082407. Entropy: 0.916839.
episode: 5720   score: 645.0  epsilon: 1.0    steps: 200  evaluation reward: 514.75
Training network. lr: 0.000118. clip: 0.047321
Iteration 17191: Policy loss: 0.003137. Value loss: 0.138017. Entropy: 0.858848.
Iteration 17192: Policy loss: -0.002038. Value loss: 0.048904. Entropy: 0.855878.
Iteration 17193: Policy loss: -0.006085. Value loss: 0.034284. Entropy: 0.856473.
episode: 5721   score: 870.0  epsilon: 1.0    steps: 936  evaluation reward: 520.55
Training network. lr: 0.000118. clip: 0.047321
Iteration 17194: Policy loss: 0.002430. Value loss: 0.128191. Entropy: 0.989875.
Iteration 17195: Policy loss: -0.0047

Training network. lr: 0.000118. clip: 0.047017
Iteration 17257: Policy loss: 0.001205. Value loss: 0.143802. Entropy: 1.096963.
Iteration 17258: Policy loss: -0.006636. Value loss: 0.065018. Entropy: 1.096987.
Iteration 17259: Policy loss: -0.008439. Value loss: 0.050027. Entropy: 1.093526.
episode: 5737   score: 615.0  epsilon: 1.0    steps: 48  evaluation reward: 524.95
episode: 5738   score: 375.0  epsilon: 1.0    steps: 208  evaluation reward: 524.75
Training network. lr: 0.000118. clip: 0.047017
Iteration 17260: Policy loss: 0.003465. Value loss: 0.240664. Entropy: 0.795160.
Iteration 17261: Policy loss: -0.000768. Value loss: 0.096685. Entropy: 0.796375.
Iteration 17262: Policy loss: -0.006215. Value loss: 0.067476. Entropy: 0.791058.
Training network. lr: 0.000118. clip: 0.047017
Iteration 17263: Policy loss: 0.002849. Value loss: 0.129607. Entropy: 0.975038.
Iteration 17264: Policy loss: -0.006592. Value loss: 0.056699. Entropy: 0.983298.
Iteration 17265: Policy loss: -0.009946

Iteration 17325: Policy loss: -0.004651. Value loss: 0.040069. Entropy: 0.878239.
episode: 5756   score: 550.0  epsilon: 1.0    steps: 8  evaluation reward: 527.65
episode: 5757   score: 610.0  epsilon: 1.0    steps: 384  evaluation reward: 530.9
Training network. lr: 0.000117. clip: 0.046861
Iteration 17326: Policy loss: 0.004205. Value loss: 0.470438. Entropy: 0.860903.
Iteration 17327: Policy loss: 0.001140. Value loss: 0.266619. Entropy: 0.860508.
Iteration 17328: Policy loss: -0.002174. Value loss: 0.180856. Entropy: 0.869055.
episode: 5758   score: 695.0  epsilon: 1.0    steps: 160  evaluation reward: 535.45
Training network. lr: 0.000117. clip: 0.046861
Iteration 17329: Policy loss: 0.004885. Value loss: 0.111656. Entropy: 0.946409.
Iteration 17330: Policy loss: 0.000059. Value loss: 0.041601. Entropy: 0.953888.
Iteration 17331: Policy loss: -0.005121. Value loss: 0.034112. Entropy: 0.953477.
Training network. lr: 0.000117. clip: 0.046861
Iteration 17332: Policy loss: 0.004297. 

Training network. lr: 0.000117. clip: 0.046704
Iteration 17392: Policy loss: -0.000174. Value loss: 0.154477. Entropy: 0.749352.
Iteration 17393: Policy loss: -0.002154. Value loss: 0.089916. Entropy: 0.744818.
Iteration 17394: Policy loss: -0.004898. Value loss: 0.063198. Entropy: 0.751214.
Training network. lr: 0.000117. clip: 0.046704
Iteration 17395: Policy loss: 0.002402. Value loss: 0.115333. Entropy: 0.870586.
Iteration 17396: Policy loss: -0.001116. Value loss: 0.072378. Entropy: 0.868271.
Iteration 17397: Policy loss: -0.004402. Value loss: 0.057578. Entropy: 0.867884.
episode: 5777   score: 345.0  epsilon: 1.0    steps: 384  evaluation reward: 518.85
episode: 5778   score: 650.0  epsilon: 1.0    steps: 832  evaluation reward: 522.25
Training network. lr: 0.000117. clip: 0.046704
Iteration 17398: Policy loss: 0.004213. Value loss: 0.210979. Entropy: 0.942767.
Iteration 17399: Policy loss: -0.003538. Value loss: 0.104018. Entropy: 0.936606.
Iteration 17400: Policy loss: -0.0067

episode: 5795   score: 450.0  epsilon: 1.0    steps: 752  evaluation reward: 517.5
Training network. lr: 0.000116. clip: 0.046400
Iteration 17461: Policy loss: 0.001491. Value loss: 0.172114. Entropy: 1.059993.
Iteration 17462: Policy loss: -0.006390. Value loss: 0.076862. Entropy: 1.053117.
Iteration 17463: Policy loss: -0.008210. Value loss: 0.055799. Entropy: 1.054637.
Training network. lr: 0.000116. clip: 0.046400
Iteration 17464: Policy loss: 0.003642. Value loss: 0.400436. Entropy: 1.053265.
Iteration 17465: Policy loss: -0.001137. Value loss: 0.273260. Entropy: 1.055397.
Iteration 17466: Policy loss: -0.003647. Value loss: 0.225260. Entropy: 1.056288.
episode: 5796   score: 925.0  epsilon: 1.0    steps: 640  evaluation reward: 524.65
Training network. lr: 0.000116. clip: 0.046400
Iteration 17467: Policy loss: 0.001902. Value loss: 0.266461. Entropy: 1.027122.
Iteration 17468: Policy loss: -0.003314. Value loss: 0.239955. Entropy: 1.019275.
Iteration 17469: Policy loss: -0.003846

Iteration 17529: Policy loss: -0.009511. Value loss: 0.057262. Entropy: 1.156635.
episode: 5814   score: 740.0  epsilon: 1.0    steps: 160  evaluation reward: 523.75
episode: 5815   score: 440.0  epsilon: 1.0    steps: 920  evaluation reward: 524.2
Training network. lr: 0.000116. clip: 0.046243
Iteration 17530: Policy loss: 0.000949. Value loss: 0.178006. Entropy: 1.007899.
Iteration 17531: Policy loss: -0.006260. Value loss: 0.098789. Entropy: 1.009513.
Iteration 17532: Policy loss: -0.009884. Value loss: 0.070810. Entropy: 1.008906.
Training network. lr: 0.000116. clip: 0.046243
Iteration 17533: Policy loss: 0.000857. Value loss: 0.203139. Entropy: 0.951118.
Iteration 17534: Policy loss: -0.004536. Value loss: 0.107570. Entropy: 0.949047.
Iteration 17535: Policy loss: -0.009723. Value loss: 0.081317. Entropy: 0.952569.
episode: 5816   score: 370.0  epsilon: 1.0    steps: 304  evaluation reward: 520.35
episode: 5817   score: 835.0  epsilon: 1.0    steps: 568  evaluation reward: 521.85

Iteration 17596: Policy loss: 0.002943. Value loss: 0.129636. Entropy: 0.924157.
Iteration 17597: Policy loss: -0.003804. Value loss: 0.068348. Entropy: 0.918892.
Iteration 17598: Policy loss: -0.008110. Value loss: 0.050098. Entropy: 0.912092.
episode: 5834   score: 600.0  epsilon: 1.0    steps: 872  evaluation reward: 522.8
Training network. lr: 0.000115. clip: 0.046096
Iteration 17599: Policy loss: 0.003054. Value loss: 0.101656. Entropy: 1.009004.
Iteration 17600: Policy loss: -0.003107. Value loss: 0.053719. Entropy: 1.008774.
Iteration 17601: Policy loss: -0.008901. Value loss: 0.038013. Entropy: 1.008157.
Training network. lr: 0.000115. clip: 0.045939
Iteration 17602: Policy loss: 0.001064. Value loss: 0.374737. Entropy: 0.969862.
Iteration 17603: Policy loss: 0.000007. Value loss: 0.281814. Entropy: 0.973366.
Iteration 17604: Policy loss: -0.002910. Value loss: 0.228697. Entropy: 0.975276.
Training network. lr: 0.000115. clip: 0.045939
Iteration 17605: Policy loss: 0.004675. Va

Iteration 17663: Policy loss: -0.004674. Value loss: 0.058844. Entropy: 0.889417.
Iteration 17664: Policy loss: -0.007138. Value loss: 0.047091. Entropy: 0.887238.
Training network. lr: 0.000114. clip: 0.045782
Iteration 17665: Policy loss: 0.004821. Value loss: 0.165204. Entropy: 1.004875.
Iteration 17666: Policy loss: -0.000603. Value loss: 0.091840. Entropy: 1.006518.
Iteration 17667: Policy loss: -0.003562. Value loss: 0.068763. Entropy: 1.005642.
episode: 5854   score: 345.0  epsilon: 1.0    steps: 816  evaluation reward: 522.4
Training network. lr: 0.000114. clip: 0.045782
Iteration 17668: Policy loss: 0.004054. Value loss: 0.401733. Entropy: 1.090820.
Iteration 17669: Policy loss: -0.002641. Value loss: 0.268938. Entropy: 1.081646.
Iteration 17670: Policy loss: -0.004022. Value loss: 0.217638. Entropy: 1.084064.
episode: 5855   score: 365.0  epsilon: 1.0    steps: 448  evaluation reward: 524.25
episode: 5856   score: 715.0  epsilon: 1.0    steps: 984  evaluation reward: 525.9
Tr

Iteration 17729: Policy loss: -0.002480. Value loss: 0.147825. Entropy: 0.809506.
Iteration 17730: Policy loss: -0.007632. Value loss: 0.097541. Entropy: 0.807194.
Training network. lr: 0.000114. clip: 0.045635
Iteration 17731: Policy loss: 0.001683. Value loss: 0.140846. Entropy: 1.128165.
Iteration 17732: Policy loss: -0.003226. Value loss: 0.058490. Entropy: 1.124064.
Iteration 17733: Policy loss: -0.006643. Value loss: 0.043360. Entropy: 1.123768.
Training network. lr: 0.000114. clip: 0.045635
Iteration 17734: Policy loss: 0.002405. Value loss: 0.182159. Entropy: 1.116003.
Iteration 17735: Policy loss: -0.000950. Value loss: 0.069821. Entropy: 1.116505.
Iteration 17736: Policy loss: -0.007508. Value loss: 0.042045. Entropy: 1.111566.
Training network. lr: 0.000114. clip: 0.045635
Iteration 17737: Policy loss: 0.002915. Value loss: 0.098818. Entropy: 1.141243.
Iteration 17738: Policy loss: -0.003449. Value loss: 0.046569. Entropy: 1.133113.
Iteration 17739: Policy loss: -0.009205. V

Iteration 17797: Policy loss: 0.002073. Value loss: 0.147652. Entropy: 0.971308.
Iteration 17798: Policy loss: -0.002840. Value loss: 0.077927. Entropy: 0.971451.
Iteration 17799: Policy loss: -0.006230. Value loss: 0.059460. Entropy: 0.964460.
Training network. lr: 0.000114. clip: 0.045478
Iteration 17800: Policy loss: 0.004828. Value loss: 0.202213. Entropy: 1.051245.
Iteration 17801: Policy loss: -0.002588. Value loss: 0.099645. Entropy: 1.059844.
Iteration 17802: Policy loss: -0.006926. Value loss: 0.075824. Entropy: 1.058486.
Training network. lr: 0.000113. clip: 0.045321
Iteration 17803: Policy loss: 0.001806. Value loss: 0.394594. Entropy: 1.084667.
Iteration 17804: Policy loss: 0.000225. Value loss: 0.226037. Entropy: 1.075316.
Iteration 17805: Policy loss: -0.002971. Value loss: 0.156774. Entropy: 1.075437.
episode: 5895   score: 1110.0  epsilon: 1.0    steps: 320  evaluation reward: 513.4
episode: 5896   score: 620.0  epsilon: 1.0    steps: 408  evaluation reward: 510.35
epis

Iteration 17864: Policy loss: -0.002205. Value loss: 0.132767. Entropy: 0.879996.
Iteration 17865: Policy loss: -0.006585. Value loss: 0.093005. Entropy: 0.886638.
episode: 5915   score: 495.0  epsilon: 1.0    steps: 128  evaluation reward: 516.25
Training network. lr: 0.000113. clip: 0.045174
Iteration 17866: Policy loss: 0.000383. Value loss: 0.093969. Entropy: 0.733377.
Iteration 17867: Policy loss: -0.003985. Value loss: 0.052016. Entropy: 0.728817.
Iteration 17868: Policy loss: -0.006766. Value loss: 0.043297. Entropy: 0.740136.
Training network. lr: 0.000113. clip: 0.045174
Iteration 17869: Policy loss: 0.000661. Value loss: 0.233697. Entropy: 0.963473.
Iteration 17870: Policy loss: -0.001743. Value loss: 0.112745. Entropy: 0.950614.
Iteration 17871: Policy loss: -0.003994. Value loss: 0.075564. Entropy: 0.951991.
episode: 5916   score: 260.0  epsilon: 1.0    steps: 304  evaluation reward: 515.15
Training network. lr: 0.000113. clip: 0.045174
Iteration 17872: Policy loss: 0.00308

Training network. lr: 0.000113. clip: 0.045017
Iteration 17932: Policy loss: 0.002541. Value loss: 0.398863. Entropy: 0.930877.
Iteration 17933: Policy loss: 0.005022. Value loss: 0.242937. Entropy: 0.941270.
Iteration 17934: Policy loss: -0.000028. Value loss: 0.143130. Entropy: 0.942961.
Training network. lr: 0.000113. clip: 0.045017
Iteration 17935: Policy loss: 0.003483. Value loss: 0.267802. Entropy: 1.023906.
Iteration 17936: Policy loss: 0.003212. Value loss: 0.154920. Entropy: 1.026808.
Iteration 17937: Policy loss: -0.005269. Value loss: 0.076730. Entropy: 1.022400.
Training network. lr: 0.000113. clip: 0.045017
Iteration 17938: Policy loss: 0.004509. Value loss: 0.166347. Entropy: 1.133151.
Iteration 17939: Policy loss: -0.003473. Value loss: 0.069787. Entropy: 1.132498.
Iteration 17940: Policy loss: -0.006169. Value loss: 0.047743. Entropy: 1.132525.
episode: 5935   score: 700.0  epsilon: 1.0    steps: 376  evaluation reward: 513.8
Training network. lr: 0.000113. clip: 0.045

Iteration 18000: Policy loss: -0.001723. Value loss: 0.096055. Entropy: 1.129605.
episode: 5954   score: 420.0  epsilon: 1.0    steps: 632  evaluation reward: 516.45
Training network. lr: 0.000112. clip: 0.044713
Iteration 18001: Policy loss: 0.003774. Value loss: 0.297348. Entropy: 1.026682.
Iteration 18002: Policy loss: -0.002503. Value loss: 0.176070. Entropy: 1.026116.
Iteration 18003: Policy loss: -0.006803. Value loss: 0.125277. Entropy: 1.021056.
episode: 5955   score: 670.0  epsilon: 1.0    steps: 312  evaluation reward: 519.5
episode: 5956   score: 260.0  epsilon: 1.0    steps: 352  evaluation reward: 514.95
Training network. lr: 0.000112. clip: 0.044713
Iteration 18004: Policy loss: 0.002613. Value loss: 0.322288. Entropy: 0.912597.
Iteration 18005: Policy loss: -0.000556. Value loss: 0.162955. Entropy: 0.906638.
Iteration 18006: Policy loss: -0.002693. Value loss: 0.125075. Entropy: 0.905018.
Training network. lr: 0.000112. clip: 0.044713
Iteration 18007: Policy loss: 0.0018

episode: 5975   score: 420.0  epsilon: 1.0    steps: 304  evaluation reward: 526.05
Training network. lr: 0.000111. clip: 0.044557
Iteration 18067: Policy loss: 0.001980. Value loss: 0.512192. Entropy: 1.096570.
Iteration 18068: Policy loss: -0.001142. Value loss: 0.338763. Entropy: 1.088481.
Iteration 18069: Policy loss: -0.001787. Value loss: 0.293953. Entropy: 1.089002.
episode: 5976   score: 445.0  epsilon: 1.0    steps: 128  evaluation reward: 523.5
episode: 5977   score: 370.0  epsilon: 1.0    steps: 592  evaluation reward: 523.0
Training network. lr: 0.000111. clip: 0.044557
Iteration 18070: Policy loss: 0.000155. Value loss: 0.150278. Entropy: 0.964676.
Iteration 18071: Policy loss: -0.006058. Value loss: 0.066325. Entropy: 0.959723.
Iteration 18072: Policy loss: -0.007668. Value loss: 0.046916. Entropy: 0.957531.
Training network. lr: 0.000111. clip: 0.044557
Iteration 18073: Policy loss: 0.002039. Value loss: 0.467084. Entropy: 1.061853.
Iteration 18074: Policy loss: -0.00013

Training network. lr: 0.000111. clip: 0.044400
Iteration 18133: Policy loss: 0.004173. Value loss: 0.145840. Entropy: 1.110010.
Iteration 18134: Policy loss: -0.001591. Value loss: 0.067643. Entropy: 1.109359.
Iteration 18135: Policy loss: -0.007510. Value loss: 0.051173. Entropy: 1.101585.
episode: 5997   score: 425.0  epsilon: 1.0    steps: 104  evaluation reward: 505.95
episode: 5998   score: 335.0  epsilon: 1.0    steps: 400  evaluation reward: 505.25
Training network. lr: 0.000111. clip: 0.044400
Iteration 18136: Policy loss: 0.004116. Value loss: 0.325146. Entropy: 1.000649.
Iteration 18137: Policy loss: -0.002477. Value loss: 0.174239. Entropy: 0.996159.
Iteration 18138: Policy loss: -0.006309. Value loss: 0.117485. Entropy: 0.996088.
episode: 5999   score: 180.0  epsilon: 1.0    steps: 856  evaluation reward: 502.35
Training network. lr: 0.000111. clip: 0.044400
Iteration 18139: Policy loss: 0.000820. Value loss: 0.259403. Entropy: 1.074305.
Iteration 18140: Policy loss: -0.000

episode: 6019   score: 695.0  epsilon: 1.0    steps: 720  evaluation reward: 480.4
Training network. lr: 0.000111. clip: 0.044252
Iteration 18199: Policy loss: 0.001525. Value loss: 0.188959. Entropy: 1.001151.
Iteration 18200: Policy loss: -0.000796. Value loss: 0.104414. Entropy: 1.000640.
Iteration 18201: Policy loss: -0.007603. Value loss: 0.077453. Entropy: 0.999662.
episode: 6020   score: 650.0  epsilon: 1.0    steps: 256  evaluation reward: 483.5
Training network. lr: 0.000110. clip: 0.044096
Iteration 18202: Policy loss: 0.002139. Value loss: 0.180923. Entropy: 0.996825.
Iteration 18203: Policy loss: -0.004256. Value loss: 0.095686. Entropy: 1.000641.
Iteration 18204: Policy loss: -0.007072. Value loss: 0.067213. Entropy: 0.994473.
Training network. lr: 0.000110. clip: 0.044096
Iteration 18205: Policy loss: 0.004125. Value loss: 0.147849. Entropy: 1.025488.
Iteration 18206: Policy loss: 0.000772. Value loss: 0.062882. Entropy: 1.029684.
Iteration 18207: Policy loss: -0.002196. 

Iteration 18266: Policy loss: -0.004944. Value loss: 0.167392. Entropy: 1.076218.
Iteration 18267: Policy loss: -0.010161. Value loss: 0.137851. Entropy: 1.082529.
episode: 6039   score: 420.0  epsilon: 1.0    steps: 944  evaluation reward: 463.5
Training network. lr: 0.000110. clip: 0.043939
Iteration 18268: Policy loss: 0.003482. Value loss: 0.493584. Entropy: 1.126772.
Iteration 18269: Policy loss: -0.002073. Value loss: 0.334190. Entropy: 1.133574.
Iteration 18270: Policy loss: -0.004404. Value loss: 0.293569. Entropy: 1.131325.
episode: 6040   score: 590.0  epsilon: 1.0    steps: 248  evaluation reward: 465.65
episode: 6041   score: 345.0  epsilon: 1.0    steps: 1024  evaluation reward: 463.35
Training network. lr: 0.000110. clip: 0.043939
Iteration 18271: Policy loss: 0.001273. Value loss: 0.181626. Entropy: 0.976110.
Iteration 18272: Policy loss: -0.004006. Value loss: 0.109513. Entropy: 0.970003.
Iteration 18273: Policy loss: -0.007548. Value loss: 0.087863. Entropy: 0.975133.


Iteration 18331: Policy loss: 0.003544. Value loss: 0.123532. Entropy: 0.868147.
Iteration 18332: Policy loss: -0.003437. Value loss: 0.069491. Entropy: 0.875506.
Iteration 18333: Policy loss: -0.003569. Value loss: 0.051727. Entropy: 0.870843.
Training network. lr: 0.000109. clip: 0.043792
Iteration 18334: Policy loss: 0.002795. Value loss: 0.359252. Entropy: 1.086723.
Iteration 18335: Policy loss: 0.006059. Value loss: 0.167889. Entropy: 1.102750.
Iteration 18336: Policy loss: 0.000488. Value loss: 0.093895. Entropy: 1.104757.
Training network. lr: 0.000109. clip: 0.043792
Iteration 18337: Policy loss: 0.001821. Value loss: 0.158097. Entropy: 1.144809.
Iteration 18338: Policy loss: -0.003927. Value loss: 0.079301. Entropy: 1.146557.
Iteration 18339: Policy loss: -0.007569. Value loss: 0.054565. Entropy: 1.146180.
Training network. lr: 0.000109. clip: 0.043792
Iteration 18340: Policy loss: 0.003110. Value loss: 0.227217. Entropy: 1.136632.
Iteration 18341: Policy loss: -0.001050. Valu

Iteration 18398: Policy loss: 0.000645. Value loss: 0.181130. Entropy: 1.017435.
Iteration 18399: Policy loss: -0.003359. Value loss: 0.126746. Entropy: 1.017281.
Training network. lr: 0.000109. clip: 0.043635
Iteration 18400: Policy loss: 0.004753. Value loss: 0.382563. Entropy: 0.984787.
Iteration 18401: Policy loss: -0.000837. Value loss: 0.299045. Entropy: 0.985230.
Iteration 18402: Policy loss: -0.001092. Value loss: 0.271150. Entropy: 0.986875.
episode: 6082   score: 695.0  epsilon: 1.0    steps: 976  evaluation reward: 476.7
Training network. lr: 0.000109. clip: 0.043478
Iteration 18403: Policy loss: 0.001402. Value loss: 0.130189. Entropy: 1.066274.
Iteration 18404: Policy loss: -0.005409. Value loss: 0.082130. Entropy: 1.061713.
Iteration 18405: Policy loss: -0.009283. Value loss: 0.063292. Entropy: 1.062051.
episode: 6083   score: 605.0  epsilon: 1.0    steps: 848  evaluation reward: 478.9
Training network. lr: 0.000109. clip: 0.043478
Iteration 18406: Policy loss: 0.003343. 

episode: 6102   score: 590.0  epsilon: 1.0    steps: 880  evaluation reward: 486.45
Training network. lr: 0.000108. clip: 0.043331
Iteration 18466: Policy loss: 0.000823. Value loss: 0.113293. Entropy: 1.025742.
Iteration 18467: Policy loss: -0.004363. Value loss: 0.058425. Entropy: 1.023875.
Iteration 18468: Policy loss: -0.008459. Value loss: 0.048781. Entropy: 1.021827.
episode: 6103   score: 180.0  epsilon: 1.0    steps: 64  evaluation reward: 482.7
Training network. lr: 0.000108. clip: 0.043331
Iteration 18469: Policy loss: 0.003396. Value loss: 0.115207. Entropy: 0.895306.
Iteration 18470: Policy loss: -0.004788. Value loss: 0.071844. Entropy: 0.897567.
Iteration 18471: Policy loss: -0.009760. Value loss: 0.052392. Entropy: 0.897301.
episode: 6104   score: 435.0  epsilon: 1.0    steps: 304  evaluation reward: 481.45
episode: 6105   score: 390.0  epsilon: 1.0    steps: 568  evaluation reward: 482.2
Training network. lr: 0.000108. clip: 0.043331
Iteration 18472: Policy loss: 0.0053

episode: 6124   score: 590.0  epsilon: 1.0    steps: 184  evaluation reward: 472.9
Training network. lr: 0.000108. clip: 0.043174
Iteration 18532: Policy loss: 0.001816. Value loss: 0.160886. Entropy: 1.030865.
Iteration 18533: Policy loss: -0.002775. Value loss: 0.092765. Entropy: 1.029221.
Iteration 18534: Policy loss: -0.007747. Value loss: 0.059963. Entropy: 1.024857.
episode: 6125   score: 470.0  epsilon: 1.0    steps: 104  evaluation reward: 474.15
episode: 6126   score: 660.0  epsilon: 1.0    steps: 456  evaluation reward: 477.1
Training network. lr: 0.000108. clip: 0.043174
Iteration 18535: Policy loss: 0.003014. Value loss: 0.155164. Entropy: 0.951713.
Iteration 18536: Policy loss: -0.001508. Value loss: 0.063017. Entropy: 0.941566.
Iteration 18537: Policy loss: -0.006174. Value loss: 0.044867. Entropy: 0.944827.
episode: 6127   score: 405.0  epsilon: 1.0    steps: 104  evaluation reward: 477.5
episode: 6128   score: 390.0  epsilon: 1.0    steps: 536  evaluation reward: 476.45

Iteration 18597: Policy loss: -0.002452. Value loss: 0.264385. Entropy: 0.982045.
Training network. lr: 0.000108. clip: 0.043017
Iteration 18598: Policy loss: 0.005518. Value loss: 0.148696. Entropy: 1.089702.
Iteration 18599: Policy loss: -0.001229. Value loss: 0.069207. Entropy: 1.082632.
Iteration 18600: Policy loss: -0.003045. Value loss: 0.053764. Entropy: 1.081816.
episode: 6147   score: 200.0  epsilon: 1.0    steps: 160  evaluation reward: 461.1
Training network. lr: 0.000107. clip: 0.042870
Iteration 18601: Policy loss: 0.002598. Value loss: 0.171269. Entropy: 0.977946.
Iteration 18602: Policy loss: -0.003603. Value loss: 0.089646. Entropy: 0.973567.
Iteration 18603: Policy loss: -0.007759. Value loss: 0.063818. Entropy: 0.969412.
episode: 6148   score: 365.0  epsilon: 1.0    steps: 816  evaluation reward: 458.5
Training network. lr: 0.000107. clip: 0.042870
Iteration 18604: Policy loss: 0.004244. Value loss: 0.221783. Entropy: 1.101734.
Iteration 18605: Policy loss: -0.000089.

Iteration 18663: Policy loss: -0.002747. Value loss: 0.221645. Entropy: 1.014433.
Training network. lr: 0.000107. clip: 0.042713
Iteration 18664: Policy loss: 0.000561. Value loss: 0.116723. Entropy: 1.032627.
Iteration 18665: Policy loss: -0.002083. Value loss: 0.051293. Entropy: 1.031420.
Iteration 18666: Policy loss: -0.006498. Value loss: 0.034064. Entropy: 1.035239.
Training network. lr: 0.000107. clip: 0.042713
Iteration 18667: Policy loss: 0.002908. Value loss: 0.161616. Entropy: 1.079773.
Iteration 18668: Policy loss: -0.002083. Value loss: 0.088939. Entropy: 1.073227.
Iteration 18669: Policy loss: -0.006935. Value loss: 0.062020. Entropy: 1.073260.
episode: 6168   score: 365.0  epsilon: 1.0    steps: 80  evaluation reward: 446.8
episode: 6169   score: 420.0  epsilon: 1.0    steps: 400  evaluation reward: 442.05
Training network. lr: 0.000107. clip: 0.042713
Iteration 18670: Policy loss: 0.002423. Value loss: 0.162385. Entropy: 0.991352.
Iteration 18671: Policy loss: -0.001555.

Iteration 18730: Policy loss: 0.000811. Value loss: 0.119799. Entropy: 1.050211.
Iteration 18731: Policy loss: -0.005235. Value loss: 0.061115. Entropy: 1.051519.
Iteration 18732: Policy loss: -0.008177. Value loss: 0.046697. Entropy: 1.051893.
episode: 6188   score: 345.0  epsilon: 1.0    steps: 416  evaluation reward: 442.25
Training network. lr: 0.000106. clip: 0.042557
Iteration 18733: Policy loss: 0.001554. Value loss: 0.320002. Entropy: 1.148470.
Iteration 18734: Policy loss: -0.000437. Value loss: 0.214224. Entropy: 1.149019.
Iteration 18735: Policy loss: -0.003172. Value loss: 0.185676. Entropy: 1.148562.
episode: 6189   score: 670.0  epsilon: 1.0    steps: 456  evaluation reward: 442.7
Training network. lr: 0.000106. clip: 0.042557
Iteration 18736: Policy loss: 0.000123. Value loss: 0.156976. Entropy: 1.099406.
Iteration 18737: Policy loss: -0.004755. Value loss: 0.082533. Entropy: 1.095278.
Iteration 18738: Policy loss: -0.007238. Value loss: 0.063563. Entropy: 1.091314.
epis

Iteration 18796: Policy loss: 0.007663. Value loss: 0.231785. Entropy: 1.115890.
Iteration 18797: Policy loss: 0.001721. Value loss: 0.103897. Entropy: 1.114872.
Iteration 18798: Policy loss: -0.001860. Value loss: 0.074775. Entropy: 1.111647.
episode: 6209   score: 775.0  epsilon: 1.0    steps: 416  evaluation reward: 457.2
episode: 6210   score: 440.0  epsilon: 1.0    steps: 456  evaluation reward: 457.55
episode: 6211   score: 245.0  epsilon: 1.0    steps: 920  evaluation reward: 457.05
Training network. lr: 0.000106. clip: 0.042409
Iteration 18799: Policy loss: 0.003528. Value loss: 0.141139. Entropy: 1.053302.
Iteration 18800: Policy loss: 0.002013. Value loss: 0.082373. Entropy: 1.046580.
Iteration 18801: Policy loss: -0.004134. Value loss: 0.061353. Entropy: 1.046247.
Training network. lr: 0.000106. clip: 0.042253
Iteration 18802: Policy loss: 0.001761. Value loss: 0.146459. Entropy: 0.981307.
Iteration 18803: Policy loss: -0.003905. Value loss: 0.088826. Entropy: 0.975675.
Iter

Iteration 18863: Policy loss: -0.002413. Value loss: 0.071242. Entropy: 0.966259.
Iteration 18864: Policy loss: -0.006156. Value loss: 0.053434. Entropy: 0.968331.
Training network. lr: 0.000105. clip: 0.042096
Iteration 18865: Policy loss: 0.002215. Value loss: 0.173209. Entropy: 1.030232.
Iteration 18866: Policy loss: -0.001725. Value loss: 0.098187. Entropy: 1.037073.
Iteration 18867: Policy loss: -0.005218. Value loss: 0.072456. Entropy: 1.030659.
Training network. lr: 0.000105. clip: 0.042096
Iteration 18868: Policy loss: 0.000432. Value loss: 0.226074. Entropy: 1.082507.
Iteration 18869: Policy loss: -0.000807. Value loss: 0.127786. Entropy: 1.086424.
Iteration 18870: Policy loss: -0.006118. Value loss: 0.089272. Entropy: 1.079114.
episode: 6230   score: 285.0  epsilon: 1.0    steps: 272  evaluation reward: 474.25
episode: 6231   score: 435.0  epsilon: 1.0    steps: 352  evaluation reward: 470.55
episode: 6232   score: 320.0  epsilon: 1.0    steps: 432  evaluation reward: 470.6
T

Training network. lr: 0.000105. clip: 0.041948
Iteration 18931: Policy loss: 0.001780. Value loss: 0.164193. Entropy: 0.823233.
Iteration 18932: Policy loss: 0.000013. Value loss: 0.098402. Entropy: 0.816840.
Iteration 18933: Policy loss: -0.002880. Value loss: 0.076493. Entropy: 0.816680.
now time :  2019-03-06 00:48:49.894511
episode: 6251   score: 450.0  epsilon: 1.0    steps: 896  evaluation reward: 499.7
Training network. lr: 0.000105. clip: 0.041948
Iteration 18934: Policy loss: 0.001653. Value loss: 0.393227. Entropy: 0.919279.
Iteration 18935: Policy loss: 0.000427. Value loss: 0.269838. Entropy: 0.912651.
Iteration 18936: Policy loss: 0.000781. Value loss: 0.220447. Entropy: 0.911035.
Training network. lr: 0.000105. clip: 0.041948
Iteration 18937: Policy loss: 0.005551. Value loss: 0.247971. Entropy: 0.942784.
Iteration 18938: Policy loss: 0.003400. Value loss: 0.100442. Entropy: 0.944316.
Iteration 18939: Policy loss: -0.001045. Value loss: 0.068669. Entropy: 0.951006.
episod

Iteration 18999: Policy loss: -0.007193. Value loss: 0.083334. Entropy: 1.133323.
Training network. lr: 0.000104. clip: 0.041792
Iteration 19000: Policy loss: 0.001776. Value loss: 0.260043. Entropy: 1.181616.
Iteration 19001: Policy loss: -0.002873. Value loss: 0.165799. Entropy: 1.183141.
Iteration 19002: Policy loss: -0.004880. Value loss: 0.114523. Entropy: 1.179435.
episode: 6270   score: 315.0  epsilon: 1.0    steps: 952  evaluation reward: 507.85
Training network. lr: 0.000104. clip: 0.041635
Iteration 19003: Policy loss: 0.002251. Value loss: 0.276296. Entropy: 1.181452.
Iteration 19004: Policy loss: -0.004239. Value loss: 0.152973. Entropy: 1.180726.
Iteration 19005: Policy loss: -0.008205. Value loss: 0.104073. Entropy: 1.179518.
episode: 6271   score: 900.0  epsilon: 1.0    steps: 136  evaluation reward: 513.7
episode: 6272   score: 650.0  epsilon: 1.0    steps: 232  evaluation reward: 514.0
Training network. lr: 0.000104. clip: 0.041635
Iteration 19006: Policy loss: 0.00068

Iteration 19064: Policy loss: -0.002721. Value loss: 0.119413. Entropy: 1.055198.
Iteration 19065: Policy loss: -0.005194. Value loss: 0.088770. Entropy: 1.056740.
episode: 6293   score: 470.0  epsilon: 1.0    steps: 224  evaluation reward: 517.1
Training network. lr: 0.000104. clip: 0.041488
Iteration 19066: Policy loss: 0.001290. Value loss: 0.075428. Entropy: 0.789702.
Iteration 19067: Policy loss: -0.003394. Value loss: 0.050807. Entropy: 0.796701.
Iteration 19068: Policy loss: -0.005838. Value loss: 0.042161. Entropy: 0.798858.
Training network. lr: 0.000104. clip: 0.041488
Iteration 19069: Policy loss: 0.000532. Value loss: 0.347459. Entropy: 0.981141.
Iteration 19070: Policy loss: 0.000129. Value loss: 0.226738. Entropy: 0.976197.
Iteration 19071: Policy loss: 0.000314. Value loss: 0.204984. Entropy: 0.975336.
episode: 6294   score: 180.0  epsilon: 1.0    steps: 720  evaluation reward: 514.95
Training network. lr: 0.000104. clip: 0.041488
Iteration 19072: Policy loss: 0.001681. 

Iteration 19131: Policy loss: -0.007544. Value loss: 0.051619. Entropy: 0.969497.
episode: 6313   score: 385.0  epsilon: 1.0    steps: 400  evaluation reward: 509.45
Training network. lr: 0.000103. clip: 0.041331
Iteration 19132: Policy loss: 0.001389. Value loss: 0.148875. Entropy: 0.986699.
Iteration 19133: Policy loss: -0.004911. Value loss: 0.091229. Entropy: 0.990564.
Iteration 19134: Policy loss: -0.007910. Value loss: 0.073878. Entropy: 0.999612.
episode: 6314   score: 870.0  epsilon: 1.0    steps: 312  evaluation reward: 515.25
Training network. lr: 0.000103. clip: 0.041331
Iteration 19135: Policy loss: 0.001473. Value loss: 0.252025. Entropy: 1.013405.
Iteration 19136: Policy loss: -0.002058. Value loss: 0.108012. Entropy: 1.011458.
Iteration 19137: Policy loss: -0.006290. Value loss: 0.064406. Entropy: 1.017216.
episode: 6315   score: 425.0  epsilon: 1.0    steps: 784  evaluation reward: 512.25
Training network. lr: 0.000103. clip: 0.041331
Iteration 19138: Policy loss: -0.00

Iteration 19197: Policy loss: -0.006636. Value loss: 0.043539. Entropy: 1.019468.
Training network. lr: 0.000103. clip: 0.041174
Iteration 19198: Policy loss: 0.003186. Value loss: 0.210149. Entropy: 1.024010.
Iteration 19199: Policy loss: -0.002123. Value loss: 0.133571. Entropy: 1.024878.
Iteration 19200: Policy loss: -0.005373. Value loss: 0.102107. Entropy: 1.031072.
Training network. lr: 0.000103. clip: 0.041027
Iteration 19201: Policy loss: 0.003249. Value loss: 0.208685. Entropy: 1.054516.
Iteration 19202: Policy loss: -0.002539. Value loss: 0.092015. Entropy: 1.051093.
Iteration 19203: Policy loss: -0.004988. Value loss: 0.061799. Entropy: 1.049519.
Training network. lr: 0.000103. clip: 0.041027
Iteration 19204: Policy loss: 0.001911. Value loss: 0.198437. Entropy: 1.165679.
Iteration 19205: Policy loss: -0.005241. Value loss: 0.113238. Entropy: 1.165826.
Iteration 19206: Policy loss: -0.008923. Value loss: 0.082078. Entropy: 1.163050.
episode: 6335   score: 305.0  epsilon: 1.0

Training network. lr: 0.000102. clip: 0.040870
Iteration 19264: Policy loss: 0.007107. Value loss: 0.353682. Entropy: 0.885765.
Iteration 19265: Policy loss: 0.000931. Value loss: 0.140725. Entropy: 0.889530.
Iteration 19266: Policy loss: -0.001789. Value loss: 0.089999. Entropy: 0.893977.
Training network. lr: 0.000102. clip: 0.040870
Iteration 19267: Policy loss: 0.001517. Value loss: 0.312134. Entropy: 0.976980.
Iteration 19268: Policy loss: -0.002717. Value loss: 0.191056. Entropy: 0.979512.
Iteration 19269: Policy loss: -0.004986. Value loss: 0.146776. Entropy: 0.977449.
Training network. lr: 0.000102. clip: 0.040870
Iteration 19270: Policy loss: 0.003474. Value loss: 0.405140. Entropy: 1.191107.
Iteration 19271: Policy loss: -0.001147. Value loss: 0.257907. Entropy: 1.182595.
Iteration 19272: Policy loss: -0.005068. Value loss: 0.207977. Entropy: 1.182578.
episode: 6355   score: 675.0  epsilon: 1.0    steps: 336  evaluation reward: 503.8
episode: 6356   score: 740.0  epsilon: 1.0

Training network. lr: 0.000102. clip: 0.040713
Iteration 19333: Policy loss: 0.000979. Value loss: 0.115770. Entropy: 1.239296.
Iteration 19334: Policy loss: -0.005895. Value loss: 0.059147. Entropy: 1.239530.
Iteration 19335: Policy loss: -0.009654. Value loss: 0.041129. Entropy: 1.235753.
Training network. lr: 0.000102. clip: 0.040713
Iteration 19336: Policy loss: 0.001626. Value loss: 0.212777. Entropy: 1.230253.
Iteration 19337: Policy loss: -0.001760. Value loss: 0.115131. Entropy: 1.229308.
Iteration 19338: Policy loss: -0.005052. Value loss: 0.080088. Entropy: 1.223984.
episode: 6373   score: 550.0  epsilon: 1.0    steps: 376  evaluation reward: 506.05
Training network. lr: 0.000102. clip: 0.040713
Iteration 19339: Policy loss: 0.003713. Value loss: 0.400145. Entropy: 1.121973.
Iteration 19340: Policy loss: 0.000689. Value loss: 0.207635. Entropy: 1.118257.
Iteration 19341: Policy loss: -0.002592. Value loss: 0.156623. Entropy: 1.112184.
episode: 6374   score: 710.0  epsilon: 1.

Iteration 19400: Policy loss: 0.000679. Value loss: 0.100727. Entropy: 1.020646.
Iteration 19401: Policy loss: -0.005715. Value loss: 0.072676. Entropy: 1.018077.
episode: 6393   score: 620.0  epsilon: 1.0    steps: 984  evaluation reward: 520.1
Training network. lr: 0.000101. clip: 0.040409
Iteration 19402: Policy loss: 0.003397. Value loss: 0.233166. Entropy: 1.195994.
Iteration 19403: Policy loss: -0.002519. Value loss: 0.109082. Entropy: 1.195884.
Iteration 19404: Policy loss: -0.004300. Value loss: 0.075150. Entropy: 1.193270.
episode: 6394   score: 360.0  epsilon: 1.0    steps: 952  evaluation reward: 521.9
Training network. lr: 0.000101. clip: 0.040409
Iteration 19405: Policy loss: 0.003111. Value loss: 0.360682. Entropy: 1.099638.
Iteration 19406: Policy loss: 0.003516. Value loss: 0.167046. Entropy: 1.094365.
Iteration 19407: Policy loss: -0.003405. Value loss: 0.128639. Entropy: 1.090490.
Training network. lr: 0.000101. clip: 0.040409
Iteration 19408: Policy loss: -0.001062. 

Iteration 19465: Policy loss: 0.004059. Value loss: 0.183899. Entropy: 0.712157.
Iteration 19466: Policy loss: -0.001634. Value loss: 0.067429. Entropy: 0.713906.
Iteration 19467: Policy loss: -0.005807. Value loss: 0.047262. Entropy: 0.712877.
Training network. lr: 0.000101. clip: 0.040253
Iteration 19468: Policy loss: 0.004193. Value loss: 0.182355. Entropy: 1.114764.
Iteration 19469: Policy loss: -0.000978. Value loss: 0.081662. Entropy: 1.121105.
Iteration 19470: Policy loss: -0.005425. Value loss: 0.064748. Entropy: 1.114889.
Training network. lr: 0.000101. clip: 0.040253
Iteration 19471: Policy loss: 0.000538. Value loss: 0.202259. Entropy: 1.105600.
Iteration 19472: Policy loss: -0.003964. Value loss: 0.099665. Entropy: 1.106690.
Iteration 19473: Policy loss: -0.006891. Value loss: 0.068264. Entropy: 1.109105.
Training network. lr: 0.000101. clip: 0.040253
Iteration 19474: Policy loss: 0.000392. Value loss: 0.157433. Entropy: 1.178104.
Iteration 19475: Policy loss: -0.006035. Va

Iteration 19534: Policy loss: 0.003557. Value loss: 0.345128. Entropy: 1.112839.
Iteration 19535: Policy loss: 0.000426. Value loss: 0.193669. Entropy: 1.109651.
Iteration 19536: Policy loss: -0.004572. Value loss: 0.140400. Entropy: 1.108181.
Training network. lr: 0.000100. clip: 0.040105
Iteration 19537: Policy loss: 0.000247. Value loss: 0.264356. Entropy: 1.177981.
Iteration 19538: Policy loss: -0.003532. Value loss: 0.115032. Entropy: 1.177503.
Iteration 19539: Policy loss: -0.007910. Value loss: 0.076009. Entropy: 1.177295.
episode: 6433   score: 740.0  epsilon: 1.0    steps: 208  evaluation reward: 533.3
episode: 6434   score: 480.0  epsilon: 1.0    steps: 576  evaluation reward: 534.95
episode: 6435   score: 450.0  epsilon: 1.0    steps: 864  evaluation reward: 536.4
Training network. lr: 0.000100. clip: 0.040105
Iteration 19540: Policy loss: 0.000271. Value loss: 0.274293. Entropy: 1.003289.
Iteration 19541: Policy loss: -0.004245. Value loss: 0.172959. Entropy: 1.005083.
Iter

Training network. lr: 0.000100. clip: 0.039949
Iteration 19600: Policy loss: 0.002583. Value loss: 0.116423. Entropy: 0.784364.
Iteration 19601: Policy loss: -0.003477. Value loss: 0.062413. Entropy: 0.778621.
Iteration 19602: Policy loss: -0.005870. Value loss: 0.047994. Entropy: 0.776671.
episode: 6455   score: 470.0  epsilon: 1.0    steps: 432  evaluation reward: 545.65
Training network. lr: 0.000099. clip: 0.039792
Iteration 19603: Policy loss: 0.003333. Value loss: 0.292452. Entropy: 0.876039.
Iteration 19604: Policy loss: 0.000848. Value loss: 0.173371. Entropy: 0.877073.
Iteration 19605: Policy loss: 0.000810. Value loss: 0.117277. Entropy: 0.881945.
Training network. lr: 0.000099. clip: 0.039792
Iteration 19606: Policy loss: 0.004244. Value loss: 0.241317. Entropy: 0.974377.
Iteration 19607: Policy loss: -0.002347. Value loss: 0.114997. Entropy: 0.963093.
Iteration 19608: Policy loss: -0.007200. Value loss: 0.079010. Entropy: 0.969452.
Training network. lr: 0.000099. clip: 0.03

episode: 6473   score: 640.0  epsilon: 1.0    steps: 480  evaluation reward: 538.9
episode: 6474   score: 265.0  epsilon: 1.0    steps: 744  evaluation reward: 534.45
Training network. lr: 0.000099. clip: 0.039644
Iteration 19669: Policy loss: -0.000097. Value loss: 0.153656. Entropy: 0.989204.
Iteration 19670: Policy loss: -0.006274. Value loss: 0.087702. Entropy: 0.986967.
Iteration 19671: Policy loss: -0.007697. Value loss: 0.062397. Entropy: 0.982644.
Training network. lr: 0.000099. clip: 0.039644
Iteration 19672: Policy loss: 0.000635. Value loss: 0.259223. Entropy: 0.994625.
Iteration 19673: Policy loss: -0.002109. Value loss: 0.117915. Entropy: 0.992587.
Iteration 19674: Policy loss: -0.005333. Value loss: 0.068979. Entropy: 0.998088.
Training network. lr: 0.000099. clip: 0.039644
Iteration 19675: Policy loss: 0.002255. Value loss: 0.166389. Entropy: 1.058619.
Iteration 19676: Policy loss: -0.004115. Value loss: 0.081808. Entropy: 1.055345.
Iteration 19677: Policy loss: -0.00701

Iteration 19735: Policy loss: 0.005125. Value loss: 0.217683. Entropy: 1.016741.
Iteration 19736: Policy loss: 0.003571. Value loss: 0.108992. Entropy: 1.014378.
Iteration 19737: Policy loss: -0.004824. Value loss: 0.081290. Entropy: 1.013001.
episode: 6494   score: 620.0  epsilon: 1.0    steps: 296  evaluation reward: 542.05
Training network. lr: 0.000099. clip: 0.039488
Iteration 19738: Policy loss: 0.003266. Value loss: 0.233051. Entropy: 1.007131.
Iteration 19739: Policy loss: 0.002329. Value loss: 0.097321. Entropy: 1.000765.
Iteration 19740: Policy loss: -0.003031. Value loss: 0.066641. Entropy: 1.004594.
Training network. lr: 0.000099. clip: 0.039488
Iteration 19741: Policy loss: 0.004333. Value loss: 0.235862. Entropy: 1.078947.
Iteration 19742: Policy loss: -0.002095. Value loss: 0.113563. Entropy: 1.071333.
Iteration 19743: Policy loss: -0.007167. Value loss: 0.075817. Entropy: 1.066333.
episode: 6495   score: 260.0  epsilon: 1.0    steps: 992  evaluation reward: 537.75
Train

episode: 6512   score: 790.0  epsilon: 1.0    steps: 256  evaluation reward: 545.7
episode: 6513   score: 895.0  epsilon: 1.0    steps: 720  evaluation reward: 549.4
Training network. lr: 0.000098. clip: 0.039184
Iteration 19804: Policy loss: 0.001938. Value loss: 0.371901. Entropy: 0.947904.
Iteration 19805: Policy loss: 0.001931. Value loss: 0.172686. Entropy: 0.942139.
Iteration 19806: Policy loss: -0.000592. Value loss: 0.102350. Entropy: 0.939416.
episode: 6514   score: 695.0  epsilon: 1.0    steps: 64  evaluation reward: 552.55
Training network. lr: 0.000098. clip: 0.039184
Iteration 19807: Policy loss: 0.002969. Value loss: 0.465885. Entropy: 0.950685.
Iteration 19808: Policy loss: 0.000770. Value loss: 0.293693. Entropy: 0.950570.
Iteration 19809: Policy loss: 0.000556. Value loss: 0.256730. Entropy: 0.949146.
Training network. lr: 0.000098. clip: 0.039184
Iteration 19810: Policy loss: 0.002029. Value loss: 0.488717. Entropy: 0.944751.
Iteration 19811: Policy loss: 0.000117. Va

Iteration 19870: Policy loss: 0.001591. Value loss: 0.118494. Entropy: 1.047517.
Iteration 19871: Policy loss: -0.002578. Value loss: 0.049135. Entropy: 1.042421.
Iteration 19872: Policy loss: -0.006647. Value loss: 0.033412. Entropy: 1.042022.
Training network. lr: 0.000098. clip: 0.039027
Iteration 19873: Policy loss: -0.000613. Value loss: 0.352372. Entropy: 1.101972.
Iteration 19874: Policy loss: -0.000643. Value loss: 0.137540. Entropy: 1.099790.
Iteration 19875: Policy loss: -0.003651. Value loss: 0.089338. Entropy: 1.108828.
episode: 6533   score: 755.0  epsilon: 1.0    steps: 920  evaluation reward: 553.0
Training network. lr: 0.000098. clip: 0.039027
Iteration 19876: Policy loss: 0.003287. Value loss: 0.482159. Entropy: 1.215068.
Iteration 19877: Policy loss: 0.000240. Value loss: 0.342229. Entropy: 1.210260.
Iteration 19878: Policy loss: -0.003594. Value loss: 0.262198. Entropy: 1.212244.
episode: 6534   score: 330.0  epsilon: 1.0    steps: 120  evaluation reward: 551.5
Train

Iteration 19939: Policy loss: 0.001254. Value loss: 0.150858. Entropy: 1.015727.
Iteration 19940: Policy loss: -0.001423. Value loss: 0.092412. Entropy: 1.016261.
Iteration 19941: Policy loss: -0.005099. Value loss: 0.067347. Entropy: 1.015851.
now time :  2019-03-06 01:02:10.459655
episode: 6551   score: 415.0  epsilon: 1.0    steps: 776  evaluation reward: 557.55
episode: 6552   score: 615.0  epsilon: 1.0    steps: 840  evaluation reward: 555.2
Training network. lr: 0.000097. clip: 0.038870
Iteration 19942: Policy loss: 0.001644. Value loss: 0.171103. Entropy: 1.023993.
Iteration 19943: Policy loss: -0.003817. Value loss: 0.099004. Entropy: 1.025537.
Iteration 19944: Policy loss: -0.006888. Value loss: 0.073989. Entropy: 1.028883.
episode: 6553   score: 470.0  epsilon: 1.0    steps: 480  evaluation reward: 552.65
episode: 6554   score: 835.0  epsilon: 1.0    steps: 544  evaluation reward: 553.85
Training network. lr: 0.000097. clip: 0.038870
Iteration 19945: Policy loss: 0.003640. Va

Iteration 20007: Policy loss: -0.004039. Value loss: 0.170205. Entropy: 1.088012.
episode: 6570   score: 530.0  epsilon: 1.0    steps: 616  evaluation reward: 553.75
Training network. lr: 0.000096. clip: 0.038566
Iteration 20008: Policy loss: 0.000435. Value loss: 0.164925. Entropy: 1.029466.
Iteration 20009: Policy loss: -0.003464. Value loss: 0.082679. Entropy: 1.027560.
Iteration 20010: Policy loss: -0.005169. Value loss: 0.059212. Entropy: 1.025995.
episode: 6571   score: 465.0  epsilon: 1.0    steps: 304  evaluation reward: 554.65
episode: 6572   score: 670.0  epsilon: 1.0    steps: 1016  evaluation reward: 554.85
Training network. lr: 0.000096. clip: 0.038566
Iteration 20011: Policy loss: 0.002349. Value loss: 0.241947. Entropy: 0.929998.
Iteration 20012: Policy loss: -0.001607. Value loss: 0.106544. Entropy: 0.932898.
Iteration 20013: Policy loss: -0.005060. Value loss: 0.073407. Entropy: 0.931213.
episode: 6573   score: 780.0  epsilon: 1.0    steps: 272  evaluation reward: 556.

Iteration 20074: Policy loss: 0.002305. Value loss: 0.179925. Entropy: 1.055009.
Iteration 20075: Policy loss: -0.001982. Value loss: 0.089380. Entropy: 1.049858.
Iteration 20076: Policy loss: -0.002858. Value loss: 0.066343. Entropy: 1.052333.
episode: 6590   score: 435.0  epsilon: 1.0    steps: 344  evaluation reward: 554.45
episode: 6591   score: 600.0  epsilon: 1.0    steps: 592  evaluation reward: 554.3
Training network. lr: 0.000096. clip: 0.038409
Iteration 20077: Policy loss: 0.003532. Value loss: 0.137069. Entropy: 0.879564.
Iteration 20078: Policy loss: -0.002650. Value loss: 0.073955. Entropy: 0.891273.
Iteration 20079: Policy loss: -0.004691. Value loss: 0.059893. Entropy: 0.891882.
episode: 6592   score: 425.0  epsilon: 1.0    steps: 528  evaluation reward: 552.3
Training network. lr: 0.000096. clip: 0.038409
Iteration 20080: Policy loss: 0.002976. Value loss: 0.259975. Entropy: 0.937095.
Iteration 20081: Policy loss: 0.001030. Value loss: 0.104716. Entropy: 0.938807.
Iter

episode: 6612   score: 300.0  epsilon: 1.0    steps: 824  evaluation reward: 534.1
Training network. lr: 0.000096. clip: 0.038262
Iteration 20140: Policy loss: 0.002710. Value loss: 0.157631. Entropy: 1.062324.
Iteration 20141: Policy loss: -0.000463. Value loss: 0.097877. Entropy: 1.069164.
Iteration 20142: Policy loss: -0.004510. Value loss: 0.072424. Entropy: 1.066135.
episode: 6613   score: 135.0  epsilon: 1.0    steps: 48  evaluation reward: 526.5
Training network. lr: 0.000096. clip: 0.038262
Iteration 20143: Policy loss: 0.001813. Value loss: 0.283283. Entropy: 0.997710.
Iteration 20144: Policy loss: -0.000323. Value loss: 0.146566. Entropy: 0.996556.
Iteration 20145: Policy loss: -0.003046. Value loss: 0.107979. Entropy: 0.990364.
Training network. lr: 0.000096. clip: 0.038262
Iteration 20146: Policy loss: 0.002872. Value loss: 0.321952. Entropy: 1.062768.
Iteration 20147: Policy loss: -0.000662. Value loss: 0.142615. Entropy: 1.071754.
Iteration 20148: Policy loss: -0.002235. 

Training network. lr: 0.000095. clip: 0.037949
Iteration 20206: Policy loss: 0.002563. Value loss: 0.128823. Entropy: 0.904635.
Iteration 20207: Policy loss: -0.003409. Value loss: 0.077009. Entropy: 0.905735.
Iteration 20208: Policy loss: -0.006644. Value loss: 0.059037. Entropy: 0.911609.
episode: 6634   score: 695.0  epsilon: 1.0    steps: 488  evaluation reward: 516.05
Training network. lr: 0.000095. clip: 0.037949
Iteration 20209: Policy loss: 0.004776. Value loss: 0.576922. Entropy: 1.021448.
Iteration 20210: Policy loss: 0.002201. Value loss: 0.441949. Entropy: 1.025033.
Iteration 20211: Policy loss: -0.002073. Value loss: 0.396314. Entropy: 1.022409.
episode: 6635   score: 575.0  epsilon: 1.0    steps: 72  evaluation reward: 514.05
episode: 6636   score: 605.0  epsilon: 1.0    steps: 656  evaluation reward: 515.6
Training network. lr: 0.000095. clip: 0.037949
Iteration 20212: Policy loss: 0.001857. Value loss: 0.288032. Entropy: 0.936836.
Iteration 20213: Policy loss: -0.002216

Training network. lr: 0.000095. clip: 0.037801
Iteration 20272: Policy loss: 0.002190. Value loss: 0.168492. Entropy: 1.144363.
Iteration 20273: Policy loss: -0.003345. Value loss: 0.088191. Entropy: 1.143621.
Iteration 20274: Policy loss: -0.006650. Value loss: 0.060386. Entropy: 1.147232.
Training network. lr: 0.000095. clip: 0.037801
Iteration 20275: Policy loss: 0.002202. Value loss: 0.193985. Entropy: 1.131685.
Iteration 20276: Policy loss: -0.003746. Value loss: 0.097914. Entropy: 1.127420.
Iteration 20277: Policy loss: -0.007446. Value loss: 0.068879. Entropy: 1.129107.
episode: 6655   score: 420.0  epsilon: 1.0    steps: 512  evaluation reward: 499.8
Training network. lr: 0.000095. clip: 0.037801
Iteration 20278: Policy loss: 0.002937. Value loss: 0.758344. Entropy: 1.110009.
Iteration 20279: Policy loss: 0.000382. Value loss: 0.489675. Entropy: 1.107425.
Iteration 20280: Policy loss: 0.000189. Value loss: 0.366848. Entropy: 1.106134.
episode: 6656   score: 590.0  epsilon: 1.0 

Iteration 20338: Policy loss: 0.002044. Value loss: 0.130606. Entropy: 1.017364.
Iteration 20339: Policy loss: -0.004493. Value loss: 0.072838. Entropy: 1.017651.
Iteration 20340: Policy loss: -0.005932. Value loss: 0.053854. Entropy: 1.008858.
episode: 6676   score: 555.0  epsilon: 1.0    steps: 240  evaluation reward: 485.55
Training network. lr: 0.000094. clip: 0.037645
Iteration 20341: Policy loss: 0.001708. Value loss: 0.105742. Entropy: 1.024467.
Iteration 20342: Policy loss: -0.002129. Value loss: 0.060892. Entropy: 1.028200.
Iteration 20343: Policy loss: -0.004871. Value loss: 0.049002. Entropy: 1.025139.
Training network. lr: 0.000094. clip: 0.037645
Iteration 20344: Policy loss: 0.003764. Value loss: 0.194660. Entropy: 1.101816.
Iteration 20345: Policy loss: -0.002166. Value loss: 0.096210. Entropy: 1.102216.
Iteration 20346: Policy loss: -0.005219. Value loss: 0.068718. Entropy: 1.104383.
episode: 6677   score: 760.0  epsilon: 1.0    steps: 336  evaluation reward: 488.6
epis

Training network. lr: 0.000093. clip: 0.037340
Iteration 20404: Policy loss: 0.001001. Value loss: 0.121285. Entropy: 1.096977.
Iteration 20405: Policy loss: -0.004358. Value loss: 0.071442. Entropy: 1.092265.
Iteration 20406: Policy loss: -0.007378. Value loss: 0.050158. Entropy: 1.093292.
Training network. lr: 0.000093. clip: 0.037340
Iteration 20407: Policy loss: 0.001746. Value loss: 0.457550. Entropy: 1.174704.
Iteration 20408: Policy loss: -0.001684. Value loss: 0.330255. Entropy: 1.177544.
Iteration 20409: Policy loss: -0.004347. Value loss: 0.271130. Entropy: 1.175510.
episode: 6698   score: 665.0  epsilon: 1.0    steps: 328  evaluation reward: 471.05
episode: 6699   score: 295.0  epsilon: 1.0    steps: 624  evaluation reward: 470.3
Training network. lr: 0.000093. clip: 0.037340
Iteration 20410: Policy loss: 0.003377. Value loss: 0.176087. Entropy: 1.055839.
Iteration 20411: Policy loss: 0.000629. Value loss: 0.108679. Entropy: 1.049384.
Iteration 20412: Policy loss: -0.004695.

Iteration 20467: Policy loss: 0.000898. Value loss: 0.336884. Entropy: 1.049500.
Iteration 20468: Policy loss: -0.001953. Value loss: 0.238492. Entropy: 1.049078.
Iteration 20469: Policy loss: -0.003196. Value loss: 0.183490. Entropy: 1.051708.
Training network. lr: 0.000093. clip: 0.037184
Iteration 20470: Policy loss: 0.002421. Value loss: 0.289740. Entropy: 0.999106.
Iteration 20471: Policy loss: -0.001337. Value loss: 0.179559. Entropy: 1.003782.
Iteration 20472: Policy loss: -0.003342. Value loss: 0.134627. Entropy: 1.008463.
Training network. lr: 0.000093. clip: 0.037184
Iteration 20473: Policy loss: 0.001862. Value loss: 0.367955. Entropy: 1.089754.
Iteration 20474: Policy loss: 0.002862. Value loss: 0.219463. Entropy: 1.098055.
Iteration 20475: Policy loss: -0.002443. Value loss: 0.164070. Entropy: 1.094262.
Training network. lr: 0.000093. clip: 0.037184
Iteration 20476: Policy loss: 0.003070. Value loss: 0.248232. Entropy: 1.155017.
Iteration 20477: Policy loss: -0.002028. Val

Training network. lr: 0.000093. clip: 0.037027
Iteration 20536: Policy loss: 0.005186. Value loss: 0.409961. Entropy: 0.963565.
Iteration 20537: Policy loss: 0.001072. Value loss: 0.183961. Entropy: 0.950665.
Iteration 20538: Policy loss: -0.002240. Value loss: 0.117575. Entropy: 0.950510.
episode: 6741   score: 540.0  epsilon: 1.0    steps: 176  evaluation reward: 489.6
episode: 6742   score: 910.0  epsilon: 1.0    steps: 408  evaluation reward: 495.4
Training network. lr: 0.000093. clip: 0.037027
Iteration 20539: Policy loss: -0.000169. Value loss: 0.129725. Entropy: 0.952792.
Iteration 20540: Policy loss: -0.002956. Value loss: 0.070126. Entropy: 0.946975.
Iteration 20541: Policy loss: -0.004829. Value loss: 0.053764. Entropy: 0.948092.
Training network. lr: 0.000093. clip: 0.037027
Iteration 20542: Policy loss: 0.004319. Value loss: 0.276193. Entropy: 0.967807.
Iteration 20543: Policy loss: 0.002157. Value loss: 0.135545. Entropy: 0.965037.
Iteration 20544: Policy loss: -0.001962. 

Training network. lr: 0.000092. clip: 0.036723
Iteration 20602: Policy loss: 0.001343. Value loss: 0.255874. Entropy: 1.151254.
Iteration 20603: Policy loss: -0.000613. Value loss: 0.122588. Entropy: 1.150969.
Iteration 20604: Policy loss: -0.002617. Value loss: 0.087853. Entropy: 1.154859.
Training network. lr: 0.000092. clip: 0.036723
Iteration 20605: Policy loss: 0.002445. Value loss: 0.311392. Entropy: 1.107330.
Iteration 20606: Policy loss: -0.001014. Value loss: 0.163968. Entropy: 1.107891.
Iteration 20607: Policy loss: -0.004675. Value loss: 0.112380. Entropy: 1.103670.
episode: 6762   score: 240.0  epsilon: 1.0    steps: 264  evaluation reward: 476.35
episode: 6763   score: 575.0  epsilon: 1.0    steps: 328  evaluation reward: 474.35
Training network. lr: 0.000092. clip: 0.036723
Iteration 20608: Policy loss: 0.001892. Value loss: 0.234142. Entropy: 0.975951.
Iteration 20609: Policy loss: -0.002704. Value loss: 0.134414. Entropy: 0.981148.
Iteration 20610: Policy loss: -0.00626

Iteration 20670: Policy loss: -0.003741. Value loss: 0.070651. Entropy: 0.953722.
Training network. lr: 0.000091. clip: 0.036566
Iteration 20671: Policy loss: 0.001477. Value loss: 0.248691. Entropy: 1.060728.
Iteration 20672: Policy loss: -0.001369. Value loss: 0.164938. Entropy: 1.062559.
Iteration 20673: Policy loss: -0.003445. Value loss: 0.122525. Entropy: 1.058832.
episode: 6781   score: 260.0  epsilon: 1.0    steps: 96  evaluation reward: 506.7
Training network. lr: 0.000091. clip: 0.036566
Iteration 20674: Policy loss: 0.001490. Value loss: 0.138679. Entropy: 1.029149.
Iteration 20675: Policy loss: -0.003696. Value loss: 0.064892. Entropy: 1.029034.
Iteration 20676: Policy loss: -0.007069. Value loss: 0.045006. Entropy: 1.030998.
episode: 6782   score: 940.0  epsilon: 1.0    steps: 448  evaluation reward: 509.7
Training network. lr: 0.000091. clip: 0.036566
Iteration 20677: Policy loss: 0.002756. Value loss: 0.152944. Entropy: 1.063105.
Iteration 20678: Policy loss: -0.001034. 

Iteration 20737: Policy loss: 0.003738. Value loss: 0.111796. Entropy: 0.912700.
Iteration 20738: Policy loss: -0.000712. Value loss: 0.078266. Entropy: 0.913376.
Iteration 20739: Policy loss: -0.002157. Value loss: 0.059635. Entropy: 0.914966.
Training network. lr: 0.000091. clip: 0.036419
Iteration 20740: Policy loss: 0.001750. Value loss: 0.460505. Entropy: 0.955693.
Iteration 20741: Policy loss: -0.000782. Value loss: 0.204303. Entropy: 0.955151.
Iteration 20742: Policy loss: -0.004191. Value loss: 0.116942. Entropy: 0.951934.
now time :  2019-03-06 01:12:46.176025
episode: 6801   score: 350.0  epsilon: 1.0    steps: 16  evaluation reward: 539.1
Training network. lr: 0.000091. clip: 0.036419
Iteration 20743: Policy loss: 0.002422. Value loss: 0.180552. Entropy: 0.971552.
Iteration 20744: Policy loss: -0.002632. Value loss: 0.096604. Entropy: 0.965212.
Iteration 20745: Policy loss: -0.006567. Value loss: 0.075638. Entropy: 0.960666.
Training network. lr: 0.000091. clip: 0.036419
Ite

episode: 6819   score: 470.0  epsilon: 1.0    steps: 736  evaluation reward: 545.4
Training network. lr: 0.000090. clip: 0.036105
Iteration 20806: Policy loss: 0.003371. Value loss: 0.168225. Entropy: 1.079403.
Iteration 20807: Policy loss: -0.004024. Value loss: 0.097357. Entropy: 1.077070.
Iteration 20808: Policy loss: -0.005225. Value loss: 0.071070. Entropy: 1.074517.
Training network. lr: 0.000090. clip: 0.036105
Iteration 20809: Policy loss: 0.000989. Value loss: 0.104136. Entropy: 1.055745.
Iteration 20810: Policy loss: -0.004848. Value loss: 0.056205. Entropy: 1.059723.
Iteration 20811: Policy loss: -0.007263. Value loss: 0.045488. Entropy: 1.058062.
episode: 6820   score: 580.0  epsilon: 1.0    steps: 392  evaluation reward: 545.25
episode: 6821   score: 615.0  epsilon: 1.0    steps: 568  evaluation reward: 548.5
Training network. lr: 0.000090. clip: 0.036105
Iteration 20812: Policy loss: 0.003938. Value loss: 0.124235. Entropy: 1.014556.
Iteration 20813: Policy loss: -0.00140

Iteration 20872: Policy loss: 0.000489. Value loss: 0.098333. Entropy: 0.892022.
Iteration 20873: Policy loss: -0.004939. Value loss: 0.071847. Entropy: 0.890658.
Iteration 20874: Policy loss: -0.008029. Value loss: 0.060822. Entropy: 0.888796.
episode: 6840   score: 785.0  epsilon: 1.0    steps: 104  evaluation reward: 534.15
episode: 6841   score: 420.0  epsilon: 1.0    steps: 216  evaluation reward: 532.95
Training network. lr: 0.000090. clip: 0.035958
Iteration 20875: Policy loss: 0.001446. Value loss: 0.156700. Entropy: 0.900066.
Iteration 20876: Policy loss: -0.003000. Value loss: 0.096528. Entropy: 0.899690.
Iteration 20877: Policy loss: -0.004452. Value loss: 0.075068. Entropy: 0.897429.
Training network. lr: 0.000090. clip: 0.035958
Iteration 20878: Policy loss: 0.002094. Value loss: 0.164721. Entropy: 1.031180.
Iteration 20879: Policy loss: -0.003247. Value loss: 0.084466. Entropy: 1.027129.
Iteration 20880: Policy loss: -0.006839. Value loss: 0.061741. Entropy: 1.028887.
epi

episode: 6862   score: 345.0  epsilon: 1.0    steps: 872  evaluation reward: 516.05
Training network. lr: 0.000090. clip: 0.035801
Iteration 20938: Policy loss: 0.003005. Value loss: 0.137606. Entropy: 0.912552.
Iteration 20939: Policy loss: -0.003326. Value loss: 0.095476. Entropy: 0.903594.
Iteration 20940: Policy loss: -0.005318. Value loss: 0.078141. Entropy: 0.905047.
Training network. lr: 0.000090. clip: 0.035801
Iteration 20941: Policy loss: 0.001761. Value loss: 0.251161. Entropy: 0.964144.
Iteration 20942: Policy loss: -0.000906. Value loss: 0.102761. Entropy: 0.962064.
Iteration 20943: Policy loss: -0.002576. Value loss: 0.074003. Entropy: 0.962232.
Training network. lr: 0.000090. clip: 0.035801
Iteration 20944: Policy loss: 0.002894. Value loss: 0.227966. Entropy: 0.997435.
Iteration 20945: Policy loss: -0.002374. Value loss: 0.137670. Entropy: 0.996932.
Iteration 20946: Policy loss: -0.003009. Value loss: 0.106325. Entropy: 0.991852.
episode: 6863   score: 695.0  epsilon: 1

Iteration 21005: Policy loss: 0.002590. Value loss: 0.244603. Entropy: 1.015027.
Iteration 21006: Policy loss: -0.000376. Value loss: 0.165022. Entropy: 1.013271.
episode: 6882   score: 670.0  epsilon: 1.0    steps: 376  evaluation reward: 500.7
episode: 6883   score: 230.0  epsilon: 1.0    steps: 696  evaluation reward: 498.2
Training network. lr: 0.000089. clip: 0.035497
Iteration 21007: Policy loss: 0.001878. Value loss: 0.204603. Entropy: 1.058276.
Iteration 21008: Policy loss: -0.002261. Value loss: 0.122130. Entropy: 1.054173.
Iteration 21009: Policy loss: -0.005065. Value loss: 0.087381. Entropy: 1.057595.
episode: 6884   score: 370.0  epsilon: 1.0    steps: 88  evaluation reward: 495.7
Training network. lr: 0.000089. clip: 0.035497
Iteration 21010: Policy loss: 0.001044. Value loss: 0.324605. Entropy: 0.976644.
Iteration 21011: Policy loss: -0.002236. Value loss: 0.272719. Entropy: 0.978999.
Iteration 21012: Policy loss: -0.003321. Value loss: 0.218132. Entropy: 0.978352.
episo

Iteration 21072: Policy loss: -0.005349. Value loss: 0.049762. Entropy: 0.978858.
Training network. lr: 0.000088. clip: 0.035341
Iteration 21073: Policy loss: 0.004572. Value loss: 0.493521. Entropy: 1.072465.
Iteration 21074: Policy loss: 0.005080. Value loss: 0.308032. Entropy: 1.069230.
Iteration 21075: Policy loss: -0.001218. Value loss: 0.252700. Entropy: 1.068805.
episode: 6902   score: 420.0  epsilon: 1.0    steps: 880  evaluation reward: 491.8
Training network. lr: 0.000088. clip: 0.035341
Iteration 21076: Policy loss: 0.003901. Value loss: 0.345786. Entropy: 1.050431.
Iteration 21077: Policy loss: 0.001167. Value loss: 0.161342. Entropy: 1.048720.
Iteration 21078: Policy loss: -0.004228. Value loss: 0.113092. Entropy: 1.039537.
episode: 6903   score: 495.0  epsilon: 1.0    steps: 120  evaluation reward: 493.2
episode: 6904   score: 380.0  epsilon: 1.0    steps: 152  evaluation reward: 493.5
Training network. lr: 0.000088. clip: 0.035341
Iteration 21079: Policy loss: 0.002211. 

Training network. lr: 0.000088. clip: 0.035184
Iteration 21139: Policy loss: 0.002432. Value loss: 0.108345. Entropy: 1.000665.
Iteration 21140: Policy loss: -0.003071. Value loss: 0.061686. Entropy: 1.000118.
Iteration 21141: Policy loss: -0.005202. Value loss: 0.047202. Entropy: 1.001349.
Training network. lr: 0.000088. clip: 0.035184
Iteration 21142: Policy loss: 0.001469. Value loss: 0.118528. Entropy: 0.959771.
Iteration 21143: Policy loss: -0.004793. Value loss: 0.068727. Entropy: 0.962722.
Iteration 21144: Policy loss: -0.007148. Value loss: 0.054021. Entropy: 0.957997.
episode: 6923   score: 565.0  epsilon: 1.0    steps: 744  evaluation reward: 492.9
Training network. lr: 0.000088. clip: 0.035184
Iteration 21145: Policy loss: 0.002296. Value loss: 0.491008. Entropy: 1.038003.
Iteration 21146: Policy loss: 0.004711. Value loss: 0.197408. Entropy: 1.039877.
Iteration 21147: Policy loss: 0.000857. Value loss: 0.141133. Entropy: 1.045770.
Training network. lr: 0.000088. clip: 0.035

Iteration 21207: Policy loss: -0.001964. Value loss: 0.106279. Entropy: 1.105569.
episode: 6942   score: 445.0  epsilon: 1.0    steps: 400  evaluation reward: 500.8
episode: 6943   score: 355.0  epsilon: 1.0    steps: 552  evaluation reward: 501.75
Training network. lr: 0.000087. clip: 0.034880
Iteration 21208: Policy loss: 0.003232. Value loss: 0.134789. Entropy: 0.990101.
Iteration 21209: Policy loss: -0.002701. Value loss: 0.070621. Entropy: 0.990426.
Iteration 21210: Policy loss: -0.005983. Value loss: 0.057457. Entropy: 0.992024.
Training network. lr: 0.000087. clip: 0.034880
Iteration 21211: Policy loss: 0.001168. Value loss: 0.162307. Entropy: 1.069038.
Iteration 21212: Policy loss: -0.001321. Value loss: 0.097839. Entropy: 1.070710.
Iteration 21213: Policy loss: -0.004391. Value loss: 0.075928. Entropy: 1.066463.
episode: 6944   score: 390.0  epsilon: 1.0    steps: 224  evaluation reward: 498.7
episode: 6945   score: 555.0  epsilon: 1.0    steps: 360  evaluation reward: 500.2
T

Training network. lr: 0.000087. clip: 0.034723
Iteration 21274: Policy loss: 0.005648. Value loss: 0.572465. Entropy: 0.966587.
Iteration 21275: Policy loss: 0.003804. Value loss: 0.278333. Entropy: 0.955336.
Iteration 21276: Policy loss: 0.001156. Value loss: 0.160145. Entropy: 0.957350.
episode: 6963   score: 285.0  epsilon: 1.0    steps: 136  evaluation reward: 536.4
Training network. lr: 0.000087. clip: 0.034723
Iteration 21277: Policy loss: 0.001018. Value loss: 0.178545. Entropy: 1.001714.
Iteration 21278: Policy loss: -0.002922. Value loss: 0.098887. Entropy: 1.004813.
Iteration 21279: Policy loss: -0.004196. Value loss: 0.069641. Entropy: 1.000667.
episode: 6964   score: 330.0  epsilon: 1.0    steps: 88  evaluation reward: 535.0
episode: 6965   score: 670.0  epsilon: 1.0    steps: 488  evaluation reward: 535.2
Training network. lr: 0.000087. clip: 0.034723
Iteration 21280: Policy loss: 0.001828. Value loss: 0.307127. Entropy: 0.874344.
Iteration 21281: Policy loss: 0.000976. Va

Iteration 21341: Policy loss: 0.005717. Value loss: 0.237737. Entropy: 0.950046.
Iteration 21342: Policy loss: 0.002433. Value loss: 0.180148. Entropy: 0.946609.
Training network. lr: 0.000086. clip: 0.034576
Iteration 21343: Policy loss: 0.002927. Value loss: 0.249295. Entropy: 1.049777.
Iteration 21344: Policy loss: 0.000385. Value loss: 0.110588. Entropy: 1.049221.
Iteration 21345: Policy loss: -0.002473. Value loss: 0.074457. Entropy: 1.054358.
Training network. lr: 0.000086. clip: 0.034576
Iteration 21346: Policy loss: 0.000327. Value loss: 0.208313. Entropy: 1.139329.
Iteration 21347: Policy loss: -0.001498. Value loss: 0.121254. Entropy: 1.140290.
Iteration 21348: Policy loss: -0.003536. Value loss: 0.090277. Entropy: 1.144244.
episode: 6983   score: 600.0  epsilon: 1.0    steps: 328  evaluation reward: 551.65
Training network. lr: 0.000086. clip: 0.034576
Iteration 21349: Policy loss: 0.004401. Value loss: 0.336778. Entropy: 1.051984.
Iteration 21350: Policy loss: 0.000303. Val

Iteration 21407: Policy loss: -0.002404. Value loss: 0.069420. Entropy: 1.019401.
Iteration 21408: Policy loss: -0.005043. Value loss: 0.052375. Entropy: 1.025664.
Training network. lr: 0.000086. clip: 0.034262
Iteration 21409: Policy loss: 0.000235. Value loss: 0.120363. Entropy: 1.062617.
Iteration 21410: Policy loss: -0.003392. Value loss: 0.074347. Entropy: 1.064153.
Iteration 21411: Policy loss: -0.005652. Value loss: 0.051404. Entropy: 1.065673.
episode: 7004   score: 515.0  epsilon: 1.0    steps: 232  evaluation reward: 535.2
episode: 7005   score: 315.0  epsilon: 1.0    steps: 384  evaluation reward: 531.1
episode: 7006   score: 365.0  epsilon: 1.0    steps: 856  evaluation reward: 528.7
Training network. lr: 0.000086. clip: 0.034262
Iteration 21412: Policy loss: 0.004655. Value loss: 0.150685. Entropy: 0.959843.
Iteration 21413: Policy loss: -0.001814. Value loss: 0.081562. Entropy: 0.957128.
Iteration 21414: Policy loss: -0.005172. Value loss: 0.058652. Entropy: 0.960692.
Tra

Iteration 21473: Policy loss: 0.000929. Value loss: 0.113467. Entropy: 1.020152.
Iteration 21474: Policy loss: -0.002656. Value loss: 0.090581. Entropy: 1.014985.
Training network. lr: 0.000085. clip: 0.034115
Iteration 21475: Policy loss: 0.004772. Value loss: 0.274545. Entropy: 0.912821.
Iteration 21476: Policy loss: 0.001573. Value loss: 0.174546. Entropy: 0.916560.
Iteration 21477: Policy loss: -0.000044. Value loss: 0.127421. Entropy: 0.917699.
Training network. lr: 0.000085. clip: 0.034115
Iteration 21478: Policy loss: 0.001579. Value loss: 0.356514. Entropy: 1.094562.
Iteration 21479: Policy loss: -0.000002. Value loss: 0.154687. Entropy: 1.089010.
Iteration 21480: Policy loss: -0.001695. Value loss: 0.093865. Entropy: 1.083847.
episode: 7026   score: 420.0  epsilon: 1.0    steps: 512  evaluation reward: 511.95
episode: 7027   score: 725.0  epsilon: 1.0    steps: 912  evaluation reward: 516.5
Training network. lr: 0.000085. clip: 0.034115
Iteration 21481: Policy loss: 0.001612. 

Iteration 21541: Policy loss: 0.005231. Value loss: 0.292230. Entropy: 1.026509.
Iteration 21542: Policy loss: 0.002756. Value loss: 0.131624. Entropy: 1.033336.
Iteration 21543: Policy loss: -0.001237. Value loss: 0.087760. Entropy: 1.025899.
episode: 7045   score: 515.0  epsilon: 1.0    steps: 904  evaluation reward: 520.5
Training network. lr: 0.000085. clip: 0.033958
Iteration 21544: Policy loss: 0.002480. Value loss: 0.204142. Entropy: 1.075391.
Iteration 21545: Policy loss: -0.000971. Value loss: 0.108468. Entropy: 1.074709.
Iteration 21546: Policy loss: -0.004489. Value loss: 0.078191. Entropy: 1.077039.
episode: 7046   score: 400.0  epsilon: 1.0    steps: 48  evaluation reward: 519.55
Training network. lr: 0.000085. clip: 0.033958
Iteration 21547: Policy loss: 0.001507. Value loss: 0.162009. Entropy: 0.958084.
Iteration 21548: Policy loss: -0.003406. Value loss: 0.100337. Entropy: 0.952012.
Iteration 21549: Policy loss: -0.007036. Value loss: 0.072774. Entropy: 0.951365.
episod

Iteration 21609: Policy loss: -0.000150. Value loss: 0.512329. Entropy: 1.210831.
episode: 7064   score: 315.0  epsilon: 1.0    steps: 8  evaluation reward: 508.75
episode: 7065   score: 385.0  epsilon: 1.0    steps: 840  evaluation reward: 505.9
Training network. lr: 0.000084. clip: 0.033654
Iteration 21610: Policy loss: 0.002939. Value loss: 0.123092. Entropy: 1.095448.
Iteration 21611: Policy loss: -0.001791. Value loss: 0.086843. Entropy: 1.090819.
Iteration 21612: Policy loss: -0.005745. Value loss: 0.070413. Entropy: 1.090228.
Training network. lr: 0.000084. clip: 0.033654
Iteration 21613: Policy loss: 0.002980. Value loss: 0.188966. Entropy: 0.967168.
Iteration 21614: Policy loss: -0.000054. Value loss: 0.105601. Entropy: 0.967443.
Iteration 21615: Policy loss: -0.003883. Value loss: 0.077121. Entropy: 0.961703.
Training network. lr: 0.000084. clip: 0.033654
Iteration 21616: Policy loss: 0.001859. Value loss: 0.200531. Entropy: 1.090819.
Iteration 21617: Policy loss: -0.002200. 

episode: 7085   score: 725.0  epsilon: 1.0    steps: 328  evaluation reward: 498.5
Training network. lr: 0.000084. clip: 0.033497
Iteration 21676: Policy loss: 0.002257. Value loss: 0.219125. Entropy: 1.031313.
Iteration 21677: Policy loss: -0.001214. Value loss: 0.102789. Entropy: 1.035977.
Iteration 21678: Policy loss: -0.004863. Value loss: 0.069117. Entropy: 1.027234.
episode: 7086   score: 695.0  epsilon: 1.0    steps: 136  evaluation reward: 501.55
episode: 7087   score: 330.0  epsilon: 1.0    steps: 720  evaluation reward: 496.45
Training network. lr: 0.000084. clip: 0.033497
Iteration 21679: Policy loss: 0.003039. Value loss: 0.411769. Entropy: 0.961744.
Iteration 21680: Policy loss: 0.002121. Value loss: 0.278798. Entropy: 0.967933.
Iteration 21681: Policy loss: -0.001602. Value loss: 0.227630. Entropy: 0.971931.
Training network. lr: 0.000084. clip: 0.033497
Iteration 21682: Policy loss: 0.002144. Value loss: 0.149692. Entropy: 0.975883.
Iteration 21683: Policy loss: -0.00293

Iteration 21741: Policy loss: -0.004525. Value loss: 0.051352. Entropy: 0.963537.
episode: 7107   score: 390.0  epsilon: 1.0    steps: 1024  evaluation reward: 509.35
Training network. lr: 0.000083. clip: 0.033341
Iteration 21742: Policy loss: 0.002635. Value loss: 0.247583. Entropy: 0.950494.
Iteration 21743: Policy loss: -0.001040. Value loss: 0.148897. Entropy: 0.944814.
Iteration 21744: Policy loss: -0.003503. Value loss: 0.116352. Entropy: 0.947254.
episode: 7108   score: 545.0  epsilon: 1.0    steps: 688  evaluation reward: 511.65
episode: 7109   score: 390.0  epsilon: 1.0    steps: 848  evaluation reward: 511.65
Training network. lr: 0.000083. clip: 0.033341
Iteration 21745: Policy loss: 0.002739. Value loss: 0.311060. Entropy: 0.982517.
Iteration 21746: Policy loss: 0.003941. Value loss: 0.165749. Entropy: 0.981354.
Iteration 21747: Policy loss: 0.001097. Value loss: 0.103258. Entropy: 0.984190.
Training network. lr: 0.000083. clip: 0.033341
Iteration 21748: Policy loss: 0.0023

Iteration 21808: Policy loss: 0.001901. Value loss: 0.139335. Entropy: 1.073278.
Iteration 21809: Policy loss: -0.002666. Value loss: 0.089636. Entropy: 1.070671.
Iteration 21810: Policy loss: -0.004073. Value loss: 0.070581. Entropy: 1.069971.
Training network. lr: 0.000083. clip: 0.033037
Iteration 21811: Policy loss: 0.002595. Value loss: 0.260653. Entropy: 1.041462.
Iteration 21812: Policy loss: 0.003230. Value loss: 0.093698. Entropy: 1.041828.
Iteration 21813: Policy loss: -0.002812. Value loss: 0.063690. Entropy: 1.040647.
episode: 7127   score: 725.0  epsilon: 1.0    steps: 376  evaluation reward: 532.5
episode: 7128   score: 770.0  epsilon: 1.0    steps: 488  evaluation reward: 531.25
Training network. lr: 0.000083. clip: 0.033037
Iteration 21814: Policy loss: 0.001818. Value loss: 0.126537. Entropy: 0.921169.
Iteration 21815: Policy loss: -0.002160. Value loss: 0.079600. Entropy: 0.924478.
Iteration 21816: Policy loss: -0.005119. Value loss: 0.064527. Entropy: 0.920241.
episo

Training network. lr: 0.000082. clip: 0.032880
Iteration 21874: Policy loss: 0.004141. Value loss: 0.310753. Entropy: 0.840415.
Iteration 21875: Policy loss: 0.000918. Value loss: 0.189436. Entropy: 0.836421.
Iteration 21876: Policy loss: -0.001467. Value loss: 0.151449. Entropy: 0.836915.
Training network. lr: 0.000082. clip: 0.032880
Iteration 21877: Policy loss: 0.000823. Value loss: 0.303575. Entropy: 0.984783.
Iteration 21878: Policy loss: 0.002551. Value loss: 0.188401. Entropy: 0.988290.
Iteration 21879: Policy loss: -0.002505. Value loss: 0.116054. Entropy: 0.983834.
episode: 7149   score: 670.0  epsilon: 1.0    steps: 48  evaluation reward: 519.95
Training network. lr: 0.000082. clip: 0.032880
Iteration 21880: Policy loss: 0.000771. Value loss: 0.364396. Entropy: 1.120245.
Iteration 21881: Policy loss: 0.002624. Value loss: 0.185107. Entropy: 1.126361.
Iteration 21882: Policy loss: -0.000324. Value loss: 0.110187. Entropy: 1.121615.
Training network. lr: 0.000082. clip: 0.0328

Iteration 21942: Policy loss: -0.004789. Value loss: 0.048766. Entropy: 0.955580.
Training network. lr: 0.000082. clip: 0.032732
Iteration 21943: Policy loss: 0.001837. Value loss: 0.130822. Entropy: 0.944971.
Iteration 21944: Policy loss: -0.002912. Value loss: 0.068296. Entropy: 0.945865.
Iteration 21945: Policy loss: -0.004266. Value loss: 0.050764. Entropy: 0.946491.
episode: 7168   score: 670.0  epsilon: 1.0    steps: 152  evaluation reward: 519.35
Training network. lr: 0.000082. clip: 0.032732
Iteration 21946: Policy loss: 0.000963. Value loss: 0.105374. Entropy: 0.935748.
Iteration 21947: Policy loss: -0.001674. Value loss: 0.063150. Entropy: 0.938131.
Iteration 21948: Policy loss: -0.004756. Value loss: 0.043968. Entropy: 0.935312.
Training network. lr: 0.000082. clip: 0.032732
Iteration 21949: Policy loss: 0.002285. Value loss: 0.179666. Entropy: 1.135135.
Iteration 21950: Policy loss: -0.001068. Value loss: 0.114794. Entropy: 1.138220.
Iteration 21951: Policy loss: -0.005167.

Iteration 22010: Policy loss: -0.000087. Value loss: 0.177200. Entropy: 1.031910.
Iteration 22011: Policy loss: -0.001059. Value loss: 0.123871. Entropy: 1.038906.
Training network. lr: 0.000081. clip: 0.032419
Iteration 22012: Policy loss: 0.002667. Value loss: 0.128772. Entropy: 1.062414.
Iteration 22013: Policy loss: -0.002117. Value loss: 0.080669. Entropy: 1.070721.
Iteration 22014: Policy loss: -0.003218. Value loss: 0.065970. Entropy: 1.068264.
Training network. lr: 0.000081. clip: 0.032419
Iteration 22015: Policy loss: 0.002450. Value loss: 0.181495. Entropy: 1.032717.
Iteration 22016: Policy loss: -0.000528. Value loss: 0.095312. Entropy: 1.033114.
Iteration 22017: Policy loss: -0.003394. Value loss: 0.054519. Entropy: 1.027468.
episode: 7187   score: 380.0  epsilon: 1.0    steps: 72  evaluation reward: 530.95
Training network. lr: 0.000081. clip: 0.032419
Iteration 22018: Policy loss: 0.002071. Value loss: 0.321136. Entropy: 1.061048.
Iteration 22019: Policy loss: 0.001170. V

Training network. lr: 0.000081. clip: 0.032272
Iteration 22078: Policy loss: 0.002251. Value loss: 0.292317. Entropy: 1.094413.
Iteration 22079: Policy loss: -0.001926. Value loss: 0.160103. Entropy: 1.095078.
Iteration 22080: Policy loss: -0.004678. Value loss: 0.124620. Entropy: 1.096443.
episode: 7206   score: 1160.0  epsilon: 1.0    steps: 216  evaluation reward: 551.45
episode: 7207   score: 495.0  epsilon: 1.0    steps: 472  evaluation reward: 552.5
Training network. lr: 0.000081. clip: 0.032272
Iteration 22081: Policy loss: 0.001224. Value loss: 0.268357. Entropy: 0.933961.
Iteration 22082: Policy loss: -0.000139. Value loss: 0.168728. Entropy: 0.929242.
Iteration 22083: Policy loss: -0.001940. Value loss: 0.134165. Entropy: 0.924880.
Training network. lr: 0.000081. clip: 0.032272
Iteration 22084: Policy loss: 0.001047. Value loss: 0.230135. Entropy: 0.998825.
Iteration 22085: Policy loss: -0.001003. Value loss: 0.147562. Entropy: 1.002563.
Iteration 22086: Policy loss: -0.00246

Iteration 22146: Policy loss: -0.003442. Value loss: 0.078349. Entropy: 1.046864.
episode: 7225   score: 615.0  epsilon: 1.0    steps: 104  evaluation reward: 559.95
episode: 7226   score: 640.0  epsilon: 1.0    steps: 344  evaluation reward: 559.65
Training network. lr: 0.000080. clip: 0.032115
Iteration 22147: Policy loss: 0.001169. Value loss: 0.308042. Entropy: 0.860082.
Iteration 22148: Policy loss: -0.001679. Value loss: 0.280350. Entropy: 0.866222.
Iteration 22149: Policy loss: -0.002886. Value loss: 0.248966. Entropy: 0.860773.
Training network. lr: 0.000080. clip: 0.032115
Iteration 22150: Policy loss: 0.001611. Value loss: 0.166738. Entropy: 0.920047.
Iteration 22151: Policy loss: -0.003224. Value loss: 0.110552. Entropy: 0.916927.
Iteration 22152: Policy loss: -0.005966. Value loss: 0.088277. Entropy: 0.920997.
episode: 7227   score: 730.0  epsilon: 1.0    steps: 104  evaluation reward: 559.7
episode: 7228   score: 765.0  epsilon: 1.0    steps: 128  evaluation reward: 559.65

Iteration 22213: Policy loss: 0.001316. Value loss: 0.120679. Entropy: 1.049797.
Iteration 22214: Policy loss: -0.003046. Value loss: 0.075847. Entropy: 1.058163.
Iteration 22215: Policy loss: -0.006798. Value loss: 0.060786. Entropy: 1.054593.
episode: 7245   score: 760.0  epsilon: 1.0    steps: 48  evaluation reward: 581.35
episode: 7246   score: 925.0  epsilon: 1.0    steps: 512  evaluation reward: 585.2
episode: 7247   score: 585.0  epsilon: 1.0    steps: 824  evaluation reward: 585.15
Training network. lr: 0.000080. clip: 0.031811
Iteration 22216: Policy loss: 0.001542. Value loss: 0.700220. Entropy: 0.868928.
Iteration 22217: Policy loss: 0.002341. Value loss: 0.539849. Entropy: 0.864336.
Iteration 22218: Policy loss: -0.000228. Value loss: 0.489587. Entropy: 0.868698.
episode: 7248   score: 575.0  epsilon: 1.0    steps: 304  evaluation reward: 585.95
episode: 7249   score: 530.0  epsilon: 1.0    steps: 560  evaluation reward: 584.55
Training network. lr: 0.000080. clip: 0.031811

Iteration 22279: Policy loss: 0.001673. Value loss: 0.281023. Entropy: 0.961565.
Iteration 22280: Policy loss: -0.001538. Value loss: 0.136554. Entropy: 0.964401.
Iteration 22281: Policy loss: -0.002280. Value loss: 0.079576. Entropy: 0.961404.
episode: 7266   score: 620.0  epsilon: 1.0    steps: 696  evaluation reward: 587.35
Training network. lr: 0.000079. clip: 0.031654
Iteration 22282: Policy loss: 0.004846. Value loss: 0.386235. Entropy: 1.024707.
Iteration 22283: Policy loss: 0.003069. Value loss: 0.146682. Entropy: 1.020332.
Iteration 22284: Policy loss: 0.000292. Value loss: 0.097390. Entropy: 1.012914.
episode: 7267   score: 590.0  epsilon: 1.0    steps: 864  evaluation reward: 586.55
Training network. lr: 0.000079. clip: 0.031654
Iteration 22285: Policy loss: 0.003130. Value loss: 0.138440. Entropy: 0.991372.
Iteration 22286: Policy loss: -0.000310. Value loss: 0.076410. Entropy: 0.996176.
Iteration 22287: Policy loss: -0.004173. Value loss: 0.055972. Entropy: 0.992220.
episo

Iteration 22347: Policy loss: -0.003152. Value loss: 0.118217. Entropy: 1.042720.
episode: 7286   score: 740.0  epsilon: 1.0    steps: 632  evaluation reward: 578.9
Training network. lr: 0.000079. clip: 0.031497
Iteration 22348: Policy loss: 0.000762. Value loss: 0.105464. Entropy: 0.957428.
Iteration 22349: Policy loss: -0.002477. Value loss: 0.061649. Entropy: 0.960625.
Iteration 22350: Policy loss: -0.001858. Value loss: 0.049628. Entropy: 0.963381.
episode: 7287   score: 435.0  epsilon: 1.0    steps: 784  evaluation reward: 579.45
Training network. lr: 0.000078. clip: 0.031350
Iteration 22351: Policy loss: 0.002562. Value loss: 0.172811. Entropy: 0.947868.
Iteration 22352: Policy loss: -0.000826. Value loss: 0.108154. Entropy: 0.949683.
Iteration 22353: Policy loss: -0.003723. Value loss: 0.081006. Entropy: 0.953769.
Training network. lr: 0.000078. clip: 0.031350
Iteration 22354: Policy loss: 0.001023. Value loss: 0.471529. Entropy: 1.012957.
Iteration 22355: Policy loss: 0.001774.

Iteration 22415: Policy loss: -0.000836. Value loss: 0.087383. Entropy: 1.109702.
Iteration 22416: Policy loss: -0.004738. Value loss: 0.063629. Entropy: 1.112621.
episode: 7305   score: 315.0  epsilon: 1.0    steps: 192  evaluation reward: 559.75
Training network. lr: 0.000078. clip: 0.031193
Iteration 22417: Policy loss: 0.004276. Value loss: 0.331946. Entropy: 1.006564.
Iteration 22418: Policy loss: 0.003094. Value loss: 0.207716. Entropy: 1.009422.
Iteration 22419: Policy loss: 0.002445. Value loss: 0.139782. Entropy: 1.005209.
episode: 7306   score: 590.0  epsilon: 1.0    steps: 392  evaluation reward: 554.05
Training network. lr: 0.000078. clip: 0.031193
Iteration 22420: Policy loss: 0.000632. Value loss: 0.085371. Entropy: 1.031063.
Iteration 22421: Policy loss: -0.002991. Value loss: 0.050701. Entropy: 1.026435.
Iteration 22422: Policy loss: -0.006132. Value loss: 0.039854. Entropy: 1.021867.
episode: 7307   score: 425.0  epsilon: 1.0    steps: 440  evaluation reward: 553.35
Tr

Training network. lr: 0.000078. clip: 0.031037
Iteration 22483: Policy loss: 0.001004. Value loss: 0.122054. Entropy: 1.030940.
Iteration 22484: Policy loss: -0.003515. Value loss: 0.066840. Entropy: 1.029580.
Iteration 22485: Policy loss: -0.007181. Value loss: 0.053037. Entropy: 1.024806.
Training network. lr: 0.000078. clip: 0.031037
Iteration 22486: Policy loss: 0.003666. Value loss: 0.371032. Entropy: 1.034171.
Iteration 22487: Policy loss: 0.001823. Value loss: 0.168079. Entropy: 1.044292.
Iteration 22488: Policy loss: -0.000355. Value loss: 0.118943. Entropy: 1.040767.
episode: 7325   score: 450.0  epsilon: 1.0    steps: 520  evaluation reward: 554.8
Training network. lr: 0.000078. clip: 0.031037
Iteration 22489: Policy loss: 0.003866. Value loss: 0.317715. Entropy: 1.096285.
Iteration 22490: Policy loss: 0.003141. Value loss: 0.166540. Entropy: 1.102534.
Iteration 22491: Policy loss: 0.000626. Value loss: 0.114272. Entropy: 1.105118.
Training network. lr: 0.000078. clip: 0.0310

Training network. lr: 0.000077. clip: 0.030733
Iteration 22552: Policy loss: 0.002229. Value loss: 0.191312. Entropy: 0.985885.
Iteration 22553: Policy loss: 0.000108. Value loss: 0.110072. Entropy: 0.986847.
Iteration 22554: Policy loss: -0.002996. Value loss: 0.081388. Entropy: 0.987304.
episode: 7344   score: 740.0  epsilon: 1.0    steps: 400  evaluation reward: 559.15
Training network. lr: 0.000077. clip: 0.030733
Iteration 22555: Policy loss: 0.001291. Value loss: 0.247811. Entropy: 0.964038.
Iteration 22556: Policy loss: -0.001778. Value loss: 0.136197. Entropy: 0.965302.
Iteration 22557: Policy loss: -0.003648. Value loss: 0.102403. Entropy: 0.958736.
Training network. lr: 0.000077. clip: 0.030733
Iteration 22558: Policy loss: 0.001616. Value loss: 0.206329. Entropy: 1.036162.
Iteration 22559: Policy loss: -0.002383. Value loss: 0.114474. Entropy: 1.034525.
Iteration 22560: Policy loss: -0.005839. Value loss: 0.092000. Entropy: 1.028988.
Training network. lr: 0.000077. clip: 0.0

Iteration 22620: Policy loss: -0.004072. Value loss: 0.071241. Entropy: 1.011989.
Training network. lr: 0.000076. clip: 0.030576
Iteration 22621: Policy loss: 0.001324. Value loss: 0.401200. Entropy: 0.977695.
Iteration 22622: Policy loss: 0.000847. Value loss: 0.297703. Entropy: 0.976692.
Iteration 22623: Policy loss: -0.003005. Value loss: 0.278140. Entropy: 0.979511.
episode: 7363   score: 390.0  epsilon: 1.0    steps: 152  evaluation reward: 551.6
episode: 7364   score: 755.0  epsilon: 1.0    steps: 816  evaluation reward: 553.95
Training network. lr: 0.000076. clip: 0.030576
Iteration 22624: Policy loss: 0.000928. Value loss: 0.340177. Entropy: 0.982752.
Iteration 22625: Policy loss: -0.000498. Value loss: 0.275769. Entropy: 0.982677.
Iteration 22626: Policy loss: -0.003113. Value loss: 0.238936. Entropy: 0.990343.
Training network. lr: 0.000076. clip: 0.030576
Iteration 22627: Policy loss: 0.003157. Value loss: 0.386571. Entropy: 1.045371.
Iteration 22628: Policy loss: 0.002026. 

Iteration 22687: Policy loss: 0.002338. Value loss: 0.221128. Entropy: 1.073389.
Iteration 22688: Policy loss: -0.001489. Value loss: 0.144019. Entropy: 1.069330.
Iteration 22689: Policy loss: -0.004330. Value loss: 0.109324. Entropy: 1.073326.
episode: 7383   score: 535.0  epsilon: 1.0    steps: 40  evaluation reward: 550.85
episode: 7384   score: 190.0  epsilon: 1.0    steps: 376  evaluation reward: 547.3
episode: 7385   score: 495.0  epsilon: 1.0    steps: 776  evaluation reward: 550.1
Training network. lr: 0.000076. clip: 0.030428
Iteration 22690: Policy loss: 0.002863. Value loss: 0.131861. Entropy: 0.895763.
Iteration 22691: Policy loss: -0.001257. Value loss: 0.077393. Entropy: 0.893230.
Iteration 22692: Policy loss: -0.001921. Value loss: 0.054126. Entropy: 0.897881.
episode: 7386   score: 630.0  epsilon: 1.0    steps: 912  evaluation reward: 549.0
Training network. lr: 0.000076. clip: 0.030428
Iteration 22693: Policy loss: 0.000055. Value loss: 0.147521. Entropy: 0.920505.
Ite

episode: 7405   score: 465.0  epsilon: 1.0    steps: 696  evaluation reward: 568.3
Training network. lr: 0.000075. clip: 0.030115
Iteration 22753: Policy loss: 0.001461. Value loss: 0.123920. Entropy: 0.957927.
Iteration 22754: Policy loss: -0.001758. Value loss: 0.083377. Entropy: 0.955793.
Iteration 22755: Policy loss: -0.004781. Value loss: 0.069355. Entropy: 0.957569.
Training network. lr: 0.000075. clip: 0.030115
Iteration 22756: Policy loss: 0.004299. Value loss: 0.261007. Entropy: 0.985447.
Iteration 22757: Policy loss: 0.000982. Value loss: 0.122661. Entropy: 0.986437.
Iteration 22758: Policy loss: -0.000983. Value loss: 0.074062. Entropy: 0.978493.
Training network. lr: 0.000075. clip: 0.030115
Iteration 22759: Policy loss: 0.004638. Value loss: 0.297280. Entropy: 1.032147.
Iteration 22760: Policy loss: 0.000296. Value loss: 0.158917. Entropy: 1.033701.
Iteration 22761: Policy loss: -0.003748. Value loss: 0.106626. Entropy: 1.035365.
Training network. lr: 0.000075. clip: 0.030

Training network. lr: 0.000075. clip: 0.029968
Iteration 22822: Policy loss: 0.000709. Value loss: 0.208493. Entropy: 1.131230.
Iteration 22823: Policy loss: -0.000243. Value loss: 0.122087. Entropy: 1.131833.
Iteration 22824: Policy loss: -0.002907. Value loss: 0.077156. Entropy: 1.131396.
episode: 7423   score: 695.0  epsilon: 1.0    steps: 824  evaluation reward: 564.9
Training network. lr: 0.000075. clip: 0.029968
Iteration 22825: Policy loss: 0.004125. Value loss: 0.297781. Entropy: 1.084651.
Iteration 22826: Policy loss: 0.002485. Value loss: 0.119046. Entropy: 1.081246.
Iteration 22827: Policy loss: -0.000499. Value loss: 0.091720. Entropy: 1.080546.
episode: 7424   score: 515.0  epsilon: 1.0    steps: 328  evaluation reward: 565.85
Training network. lr: 0.000075. clip: 0.029968
Iteration 22828: Policy loss: 0.002589. Value loss: 0.220603. Entropy: 0.977949.
Iteration 22829: Policy loss: -0.001528. Value loss: 0.154739. Entropy: 0.984019.
Iteration 22830: Policy loss: -0.004793.

Iteration 22888: Policy loss: 0.003953. Value loss: 0.334020. Entropy: 1.053022.
Iteration 22889: Policy loss: 0.003902. Value loss: 0.155348. Entropy: 1.052142.
Iteration 22890: Policy loss: 0.004638. Value loss: 0.105563. Entropy: 1.055825.
Training network. lr: 0.000075. clip: 0.029811
Iteration 22891: Policy loss: 0.007493. Value loss: 0.392163. Entropy: 1.057600.
Iteration 22892: Policy loss: 0.005166. Value loss: 0.178221. Entropy: 1.050277.
Iteration 22893: Policy loss: 0.002579. Value loss: 0.095143. Entropy: 1.051311.
episode: 7444   score: 695.0  epsilon: 1.0    steps: 32  evaluation reward: 560.65
episode: 7445   score: 600.0  epsilon: 1.0    steps: 720  evaluation reward: 562.4
Training network. lr: 0.000075. clip: 0.029811
Iteration 22894: Policy loss: 0.001986. Value loss: 0.233449. Entropy: 0.995597.
Iteration 22895: Policy loss: -0.001116. Value loss: 0.149696. Entropy: 0.983817.
Iteration 22896: Policy loss: -0.002933. Value loss: 0.120997. Entropy: 0.989809.
Training 

Iteration 22956: Policy loss: -0.004309. Value loss: 0.065389. Entropy: 0.969933.
Training network. lr: 0.000074. clip: 0.029507
Iteration 22957: Policy loss: 0.003305. Value loss: 0.484466. Entropy: 1.056364.
Iteration 22958: Policy loss: -0.000420. Value loss: 0.339123. Entropy: 1.053586.
Iteration 22959: Policy loss: 0.000827. Value loss: 0.233232. Entropy: 1.056661.
episode: 7463   score: 640.0  epsilon: 1.0    steps: 520  evaluation reward: 569.95
Training network. lr: 0.000074. clip: 0.029507
Iteration 22960: Policy loss: 0.003005. Value loss: 0.244351. Entropy: 1.116092.
Iteration 22961: Policy loss: 0.001864. Value loss: 0.131203. Entropy: 1.118688.
Iteration 22962: Policy loss: -0.001717. Value loss: 0.094672. Entropy: 1.114527.
episode: 7464   score: 760.0  epsilon: 1.0    steps: 680  evaluation reward: 570.0
Training network. lr: 0.000074. clip: 0.029507
Iteration 22963: Policy loss: 0.001257. Value loss: 0.141657. Entropy: 1.030935.
Iteration 22964: Policy loss: -0.003150. 

Iteration 23023: Policy loss: 0.004471. Value loss: 0.179328. Entropy: 0.935025.
Iteration 23024: Policy loss: -0.001747. Value loss: 0.108294. Entropy: 0.931686.
Iteration 23025: Policy loss: -0.002565. Value loss: 0.067982. Entropy: 0.932376.
episode: 7483   score: 365.0  epsilon: 1.0    steps: 144  evaluation reward: 585.45
episode: 7484   score: 530.0  epsilon: 1.0    steps: 888  evaluation reward: 588.85
Training network. lr: 0.000073. clip: 0.029350
Iteration 23026: Policy loss: 0.001929. Value loss: 0.108565. Entropy: 0.897328.
Iteration 23027: Policy loss: -0.001947. Value loss: 0.068557. Entropy: 0.893037.
Iteration 23028: Policy loss: -0.003305. Value loss: 0.053780. Entropy: 0.897599.
Training network. lr: 0.000073. clip: 0.029350
Iteration 23029: Policy loss: 0.000653. Value loss: 0.130315. Entropy: 1.041648.
Iteration 23030: Policy loss: -0.004229. Value loss: 0.085439. Entropy: 1.042580.
Iteration 23031: Policy loss: -0.005276. Value loss: 0.064153. Entropy: 1.039055.
Tra

Iteration 23089: Policy loss: 0.003446. Value loss: 0.228628. Entropy: 1.010189.
Iteration 23090: Policy loss: 0.001631. Value loss: 0.109915. Entropy: 1.019388.
Iteration 23091: Policy loss: 0.000258. Value loss: 0.075838. Entropy: 1.010789.
Training network. lr: 0.000073. clip: 0.029193
Iteration 23092: Policy loss: 0.003648. Value loss: 0.519477. Entropy: 0.969879.
Iteration 23093: Policy loss: 0.001876. Value loss: 0.324136. Entropy: 0.963031.
Iteration 23094: Policy loss: 0.001597. Value loss: 0.246512. Entropy: 0.963300.
Training network. lr: 0.000073. clip: 0.029193
Iteration 23095: Policy loss: 0.002362. Value loss: 0.130281. Entropy: 1.104299.
Iteration 23096: Policy loss: 0.000566. Value loss: 0.064368. Entropy: 1.105564.
Iteration 23097: Policy loss: -0.004288. Value loss: 0.044615. Entropy: 1.108027.
Training network. lr: 0.000073. clip: 0.029193
Iteration 23098: Policy loss: 0.000281. Value loss: 0.140393. Entropy: 1.137399.
Iteration 23099: Policy loss: -0.001584. Value l

Iteration 23158: Policy loss: 0.001864. Value loss: 0.650110. Entropy: 1.123354.
Iteration 23159: Policy loss: 0.000238. Value loss: 0.414730. Entropy: 1.119892.
Iteration 23160: Policy loss: 0.000595. Value loss: 0.297553. Entropy: 1.118560.
episode: 7522   score: 670.0  epsilon: 1.0    steps: 192  evaluation reward: 558.85
episode: 7523   score: 240.0  epsilon: 1.0    steps: 656  evaluation reward: 554.3
Training network. lr: 0.000072. clip: 0.028889
Iteration 23161: Policy loss: 0.001806. Value loss: 0.236439. Entropy: 1.009744.
Iteration 23162: Policy loss: -0.001625. Value loss: 0.155681. Entropy: 1.004359.
Iteration 23163: Policy loss: -0.005121. Value loss: 0.119928. Entropy: 1.007867.
episode: 7524   score: 650.0  epsilon: 1.0    steps: 832  evaluation reward: 555.65
Training network. lr: 0.000072. clip: 0.028889
Iteration 23164: Policy loss: 0.002229. Value loss: 0.145423. Entropy: 1.111071.
Iteration 23165: Policy loss: -0.000301. Value loss: 0.075510. Entropy: 1.113811.
Iter

Iteration 23226: Policy loss: -0.003733. Value loss: 0.118885. Entropy: 1.096151.
episode: 7542   score: 540.0  epsilon: 1.0    steps: 528  evaluation reward: 556.25
Training network. lr: 0.000072. clip: 0.028733
Iteration 23227: Policy loss: 0.001199. Value loss: 0.403897. Entropy: 1.098127.
Iteration 23228: Policy loss: -0.001085. Value loss: 0.301083. Entropy: 1.095036.
Iteration 23229: Policy loss: -0.000840. Value loss: 0.251980. Entropy: 1.096310.
episode: 7543   score: 415.0  epsilon: 1.0    steps: 464  evaluation reward: 554.2
episode: 7544   score: 605.0  epsilon: 1.0    steps: 680  evaluation reward: 553.3
episode: 7545   score: 895.0  epsilon: 1.0    steps: 776  evaluation reward: 556.25
Training network. lr: 0.000072. clip: 0.028733
Iteration 23230: Policy loss: 0.000995. Value loss: 0.128168. Entropy: 1.012843.
Iteration 23231: Policy loss: -0.003427. Value loss: 0.077789. Entropy: 1.013893.
Iteration 23232: Policy loss: -0.004522. Value loss: 0.060590. Entropy: 1.018998.


episode: 7562   score: 670.0  epsilon: 1.0    steps: 520  evaluation reward: 556.6
Training network. lr: 0.000071. clip: 0.028585
Iteration 23293: Policy loss: 0.001100. Value loss: 0.265389. Entropy: 1.049723.
Iteration 23294: Policy loss: -0.002275. Value loss: 0.167635. Entropy: 1.049096.
Iteration 23295: Policy loss: -0.003103. Value loss: 0.121159. Entropy: 1.048445.
episode: 7563   score: 725.0  epsilon: 1.0    steps: 40  evaluation reward: 557.45
Training network. lr: 0.000071. clip: 0.028585
Iteration 23296: Policy loss: 0.003509. Value loss: 0.331121. Entropy: 0.970243.
Iteration 23297: Policy loss: -0.000129. Value loss: 0.290343. Entropy: 0.979354.
Iteration 23298: Policy loss: -0.001661. Value loss: 0.259169. Entropy: 0.975290.
episode: 7564   score: 395.0  epsilon: 1.0    steps: 504  evaluation reward: 553.8
episode: 7565   score: 700.0  epsilon: 1.0    steps: 520  evaluation reward: 553.5
Training network. lr: 0.000071. clip: 0.028585
Iteration 23299: Policy loss: 0.00291

Training network. lr: 0.000071. clip: 0.028272
Iteration 23359: Policy loss: 0.002292. Value loss: 0.312094. Entropy: 0.940269.
Iteration 23360: Policy loss: 0.004419. Value loss: 0.125989. Entropy: 0.941218.
Iteration 23361: Policy loss: 0.001885. Value loss: 0.080330. Entropy: 0.936027.
episode: 7585   score: 665.0  epsilon: 1.0    steps: 744  evaluation reward: 544.1
Training network. lr: 0.000071. clip: 0.028272
Iteration 23362: Policy loss: 0.002222. Value loss: 0.264523. Entropy: 1.011611.
Iteration 23363: Policy loss: -0.000075. Value loss: 0.137060. Entropy: 1.003942.
Iteration 23364: Policy loss: -0.001776. Value loss: 0.098519. Entropy: 1.002419.
episode: 7586   score: 870.0  epsilon: 1.0    steps: 888  evaluation reward: 549.2
Training network. lr: 0.000071. clip: 0.028272
Iteration 23365: Policy loss: 0.002795. Value loss: 0.337715. Entropy: 0.996746.
Iteration 23366: Policy loss: 0.001147. Value loss: 0.151469. Entropy: 0.996255.
Iteration 23367: Policy loss: 0.000321. Val

episode: 7604   score: 755.0  epsilon: 1.0    steps: 960  evaluation reward: 572.95
Training network. lr: 0.000070. clip: 0.028124
Iteration 23428: Policy loss: 0.001385. Value loss: 0.117039. Entropy: 0.986928.
Iteration 23429: Policy loss: -0.001961. Value loss: 0.072886. Entropy: 0.987682.
Iteration 23430: Policy loss: -0.004591. Value loss: 0.058154. Entropy: 0.988711.
episode: 7605   score: 475.0  epsilon: 1.0    steps: 784  evaluation reward: 568.45
Training network. lr: 0.000070. clip: 0.028124
Iteration 23431: Policy loss: 0.003011. Value loss: 0.150165. Entropy: 0.977758.
Iteration 23432: Policy loss: 0.002025. Value loss: 0.110500. Entropy: 0.982752.
Iteration 23433: Policy loss: -0.002367. Value loss: 0.092930. Entropy: 0.977485.
Training network. lr: 0.000070. clip: 0.028124
Iteration 23434: Policy loss: 0.002170. Value loss: 0.311499. Entropy: 0.962216.
Iteration 23435: Policy loss: 0.000723. Value loss: 0.176369. Entropy: 0.963850.
Iteration 23436: Policy loss: 0.000479. 

Iteration 23495: Policy loss: -0.001839. Value loss: 0.112855. Entropy: 1.019502.
Iteration 23496: Policy loss: -0.003789. Value loss: 0.089313. Entropy: 1.020687.
episode: 7624   score: 450.0  epsilon: 1.0    steps: 952  evaluation reward: 563.7
Training network. lr: 0.000070. clip: 0.027968
Iteration 23497: Policy loss: 0.002237. Value loss: 0.343352. Entropy: 1.093354.
Iteration 23498: Policy loss: 0.000923. Value loss: 0.230862. Entropy: 1.094705.
Iteration 23499: Policy loss: -0.000322. Value loss: 0.183363. Entropy: 1.093046.
episode: 7625   score: 525.0  epsilon: 1.0    steps: 568  evaluation reward: 565.05
Training network. lr: 0.000070. clip: 0.027968
Iteration 23500: Policy loss: 0.001158. Value loss: 0.117863. Entropy: 0.985084.
Iteration 23501: Policy loss: -0.002261. Value loss: 0.088282. Entropy: 0.990271.
Iteration 23502: Policy loss: -0.004299. Value loss: 0.072748. Entropy: 0.990372.
Training network. lr: 0.000070. clip: 0.027811
Iteration 23503: Policy loss: 0.002212.

Iteration 23562: Policy loss: -0.003341. Value loss: 0.103313. Entropy: 0.966517.
episode: 7645   score: 380.0  epsilon: 1.0    steps: 136  evaluation reward: 561.95
episode: 7646   score: 560.0  epsilon: 1.0    steps: 616  evaluation reward: 563.0
Training network. lr: 0.000069. clip: 0.027664
Iteration 23563: Policy loss: 0.004604. Value loss: 0.420547. Entropy: 0.869963.
Iteration 23564: Policy loss: 0.001101. Value loss: 0.322948. Entropy: 0.869806.
Iteration 23565: Policy loss: 0.000151. Value loss: 0.269357. Entropy: 0.870646.
Training network. lr: 0.000069. clip: 0.027664
Iteration 23566: Policy loss: 0.003051. Value loss: 0.282821. Entropy: 1.042584.
Iteration 23567: Policy loss: 0.004293. Value loss: 0.142647. Entropy: 1.035332.
Iteration 23568: Policy loss: -0.002148. Value loss: 0.094205. Entropy: 1.042432.
episode: 7647   score: 345.0  epsilon: 1.0    steps: 672  evaluation reward: 558.85
Training network. lr: 0.000069. clip: 0.027664
Iteration 23569: Policy loss: 0.003335.

Training network. lr: 0.000069. clip: 0.027507
Iteration 23629: Policy loss: 0.003569. Value loss: 0.251991. Entropy: 1.011052.
Iteration 23630: Policy loss: 0.001513. Value loss: 0.149714. Entropy: 1.009096.
Iteration 23631: Policy loss: -0.000285. Value loss: 0.114788. Entropy: 1.004862.
episode: 7665   score: 785.0  epsilon: 1.0    steps: 88  evaluation reward: 550.85
episode: 7666   score: 470.0  epsilon: 1.0    steps: 888  evaluation reward: 550.6
Training network. lr: 0.000069. clip: 0.027507
Iteration 23632: Policy loss: 0.002781. Value loss: 0.168576. Entropy: 0.970073.
Iteration 23633: Policy loss: 0.000344. Value loss: 0.103795. Entropy: 0.968019.
Iteration 23634: Policy loss: -0.001789. Value loss: 0.078095. Entropy: 0.963013.
episode: 7667   score: 620.0  epsilon: 1.0    steps: 912  evaluation reward: 552.3
Training network. lr: 0.000069. clip: 0.027507
Iteration 23635: Policy loss: 0.002566. Value loss: 0.152726. Entropy: 0.983010.
Iteration 23636: Policy loss: -0.000969. 

Iteration 23695: Policy loss: 0.002175. Value loss: 0.130789. Entropy: 1.024836.
Iteration 23696: Policy loss: -0.002393. Value loss: 0.081140. Entropy: 1.029123.
Iteration 23697: Policy loss: -0.002308. Value loss: 0.063069. Entropy: 1.024312.
episode: 7686   score: 635.0  epsilon: 1.0    steps: 496  evaluation reward: 537.3
Training network. lr: 0.000068. clip: 0.027350
Iteration 23698: Policy loss: 0.001628. Value loss: 0.296907. Entropy: 1.011863.
Iteration 23699: Policy loss: -0.000575. Value loss: 0.181398. Entropy: 1.004938.
Iteration 23700: Policy loss: 0.001011. Value loss: 0.134887. Entropy: 1.009680.
episode: 7687   score: 425.0  epsilon: 1.0    steps: 240  evaluation reward: 534.95
Training network. lr: 0.000068. clip: 0.027203
Iteration 23701: Policy loss: 0.002234. Value loss: 0.165891. Entropy: 0.978626.
Iteration 23702: Policy loss: -0.001156. Value loss: 0.116957. Entropy: 0.981480.
Iteration 23703: Policy loss: -0.002706. Value loss: 0.087556. Entropy: 0.979243.
episo

Iteration 23763: Policy loss: -0.001960. Value loss: 0.189547. Entropy: 0.880293.
episode: 7705   score: 480.0  epsilon: 1.0    steps: 576  evaluation reward: 537.2
Training network. lr: 0.000068. clip: 0.027046
Iteration 23764: Policy loss: 0.000537. Value loss: 0.141102. Entropy: 0.962377.
Iteration 23765: Policy loss: -0.001346. Value loss: 0.105522. Entropy: 0.961927.
Iteration 23766: Policy loss: -0.004254. Value loss: 0.086790. Entropy: 0.967039.
episode: 7706   score: 650.0  epsilon: 1.0    steps: 752  evaluation reward: 539.35
Training network. lr: 0.000068. clip: 0.027046
Iteration 23767: Policy loss: 0.002484. Value loss: 0.278346. Entropy: 0.996225.
Iteration 23768: Policy loss: 0.000452. Value loss: 0.168368. Entropy: 1.004590.
Iteration 23769: Policy loss: -0.002843. Value loss: 0.126561. Entropy: 1.006915.
Training network. lr: 0.000068. clip: 0.027046
Iteration 23770: Policy loss: 0.002224. Value loss: 0.135223. Entropy: 1.043346.
Iteration 23771: Policy loss: -0.002332.

Iteration 23830: Policy loss: 0.002157. Value loss: 0.254888. Entropy: 1.048032.
Iteration 23831: Policy loss: 0.000241. Value loss: 0.166088. Entropy: 1.047297.
Iteration 23832: Policy loss: -0.001564. Value loss: 0.133392. Entropy: 1.047258.
episode: 7725   score: 380.0  epsilon: 1.0    steps: 592  evaluation reward: 535.1
Training network. lr: 0.000067. clip: 0.026889
Iteration 23833: Policy loss: 0.003808. Value loss: 0.452392. Entropy: 1.034726.
Iteration 23834: Policy loss: 0.000500. Value loss: 0.326421. Entropy: 1.034419.
Iteration 23835: Policy loss: 0.000187. Value loss: 0.274202. Entropy: 1.035907.
episode: 7726   score: 590.0  epsilon: 1.0    steps: 192  evaluation reward: 534.5
Training network. lr: 0.000067. clip: 0.026889
Iteration 23836: Policy loss: 0.000420. Value loss: 0.139480. Entropy: 0.974221.
Iteration 23837: Policy loss: -0.003010. Value loss: 0.090666. Entropy: 0.972613.
Iteration 23838: Policy loss: -0.004520. Value loss: 0.069027. Entropy: 0.975387.
episode:

episode: 7747   score: 465.0  epsilon: 1.0    steps: 568  evaluation reward: 538.6
Training network. lr: 0.000067. clip: 0.026742
Iteration 23896: Policy loss: 0.004769. Value loss: 0.377344. Entropy: 0.894718.
Iteration 23897: Policy loss: 0.002112. Value loss: 0.224762. Entropy: 0.901001.
Iteration 23898: Policy loss: 0.002237. Value loss: 0.157980. Entropy: 0.895428.
Training network. lr: 0.000067. clip: 0.026742
Iteration 23899: Policy loss: 0.001943. Value loss: 0.150808. Entropy: 1.016453.
Iteration 23900: Policy loss: -0.000919. Value loss: 0.104870. Entropy: 1.009353.
Iteration 23901: Policy loss: -0.003929. Value loss: 0.086186. Entropy: 1.009881.
episode: 7748   score: 210.0  epsilon: 1.0    steps: 528  evaluation reward: 536.3
Training network. lr: 0.000066. clip: 0.026585
Iteration 23902: Policy loss: 0.002531. Value loss: 0.758739. Entropy: 1.021049.
Iteration 23903: Policy loss: 0.002236. Value loss: 0.506723. Entropy: 1.021586.
Iteration 23904: Policy loss: 0.002410. Val

episode: 7768   score: 690.0  epsilon: 1.0    steps: 776  evaluation reward: 541.35
Training network. lr: 0.000066. clip: 0.026429
Iteration 23962: Policy loss: 0.001547. Value loss: 0.281075. Entropy: 0.874259.
Iteration 23963: Policy loss: -0.000172. Value loss: 0.189391. Entropy: 0.881631.
Iteration 23964: Policy loss: 0.000408. Value loss: 0.151761. Entropy: 0.885631.
Training network. lr: 0.000066. clip: 0.026429
Iteration 23965: Policy loss: 0.001544. Value loss: 0.166583. Entropy: 1.032315.
Iteration 23966: Policy loss: -0.000240. Value loss: 0.106193. Entropy: 1.026299.
Iteration 23967: Policy loss: -0.003163. Value loss: 0.079954. Entropy: 1.030183.
episode: 7769   score: 225.0  epsilon: 1.0    steps: 640  evaluation reward: 539.25
Training network. lr: 0.000066. clip: 0.026429
Iteration 23968: Policy loss: 0.002840. Value loss: 0.309366. Entropy: 1.047780.
Iteration 23969: Policy loss: 0.000545. Value loss: 0.172228. Entropy: 1.052978.
Iteration 23970: Policy loss: -0.003088.

Iteration 24029: Policy loss: 0.001302. Value loss: 0.185970. Entropy: 1.048256.
Iteration 24030: Policy loss: -0.001507. Value loss: 0.139018. Entropy: 1.045707.
Training network. lr: 0.000066. clip: 0.026281
Iteration 24031: Policy loss: 0.002828. Value loss: 0.445797. Entropy: 1.126897.
Iteration 24032: Policy loss: 0.000919. Value loss: 0.312461. Entropy: 1.123246.
Iteration 24033: Policy loss: -0.000950. Value loss: 0.247381. Entropy: 1.121260.
Training network. lr: 0.000066. clip: 0.026281
Iteration 24034: Policy loss: 0.002877. Value loss: 0.401737. Entropy: 1.151376.
Iteration 24035: Policy loss: 0.004572. Value loss: 0.171457. Entropy: 1.146330.
Iteration 24036: Policy loss: 0.000575. Value loss: 0.114362. Entropy: 1.146101.
episode: 7788   score: 700.0  epsilon: 1.0    steps: 736  evaluation reward: 544.95
Training network. lr: 0.000066. clip: 0.026281
Iteration 24037: Policy loss: 0.001895. Value loss: 0.389764. Entropy: 1.095657.
Iteration 24038: Policy loss: 0.000840. Valu

Iteration 24098: Policy loss: -0.000638. Value loss: 0.139517. Entropy: 1.096984.
Iteration 24099: Policy loss: -0.002014. Value loss: 0.110725. Entropy: 1.096592.
episode: 7806   score: 765.0  epsilon: 1.0    steps: 200  evaluation reward: 546.7
episode: 7807   score: 1050.0  epsilon: 1.0    steps: 352  evaluation reward: 549.9
Training network. lr: 0.000065. clip: 0.026125
Iteration 24100: Policy loss: 0.001920. Value loss: 0.171145. Entropy: 0.986894.
Iteration 24101: Policy loss: 0.001008. Value loss: 0.100897. Entropy: 0.986282.
Iteration 24102: Policy loss: -0.000778. Value loss: 0.072117. Entropy: 0.981700.
episode: 7808   score: 425.0  epsilon: 1.0    steps: 504  evaluation reward: 550.95
episode: 7809   score: 590.0  epsilon: 1.0    steps: 584  evaluation reward: 549.8
Training network. lr: 0.000065. clip: 0.025968
Iteration 24103: Policy loss: 0.002671. Value loss: 0.282428. Entropy: 0.976389.
Iteration 24104: Policy loss: -0.000445. Value loss: 0.196614. Entropy: 0.979090.
I

Training network. lr: 0.000065. clip: 0.025820
Iteration 24166: Policy loss: 0.001791. Value loss: 0.169467. Entropy: 1.175647.
Iteration 24167: Policy loss: 0.000865. Value loss: 0.096371. Entropy: 1.180302.
Iteration 24168: Policy loss: -0.002086. Value loss: 0.074330. Entropy: 1.180431.
episode: 7826   score: 740.0  epsilon: 1.0    steps: 8  evaluation reward: 578.45
episode: 7827   score: 335.0  epsilon: 1.0    steps: 664  evaluation reward: 577.1
Training network. lr: 0.000065. clip: 0.025820
Iteration 24169: Policy loss: 0.002006. Value loss: 0.284706. Entropy: 0.964581.
Iteration 24170: Policy loss: 0.000198. Value loss: 0.197194. Entropy: 0.968438.
Iteration 24171: Policy loss: -0.002680. Value loss: 0.172664. Entropy: 0.970488.
episode: 7828   score: 805.0  epsilon: 1.0    steps: 384  evaluation reward: 575.95
episode: 7829   score: 445.0  epsilon: 1.0    steps: 672  evaluation reward: 571.0
Training network. lr: 0.000065. clip: 0.025820
Iteration 24172: Policy loss: 0.000357.

Iteration 24232: Policy loss: 0.004904. Value loss: 0.506483. Entropy: 0.951276.
Iteration 24233: Policy loss: 0.010183. Value loss: 0.290734. Entropy: 0.952397.
Iteration 24234: Policy loss: 0.006675. Value loss: 0.211469. Entropy: 0.946968.
episode: 7847   score: 755.0  epsilon: 1.0    steps: 928  evaluation reward: 564.15
Training network. lr: 0.000064. clip: 0.025664
Iteration 24235: Policy loss: 0.001541. Value loss: 0.366620. Entropy: 1.056235.
Iteration 24236: Policy loss: 0.003428. Value loss: 0.189014. Entropy: 1.063793.
Iteration 24237: Policy loss: 0.002223. Value loss: 0.109190. Entropy: 1.056790.
episode: 7848   score: 330.0  epsilon: 1.0    steps: 448  evaluation reward: 565.35
Training network. lr: 0.000064. clip: 0.025664
Iteration 24238: Policy loss: 0.000935. Value loss: 0.203651. Entropy: 0.966857.
Iteration 24239: Policy loss: 0.001155. Value loss: 0.112911. Entropy: 0.958656.
Iteration 24240: Policy loss: -0.002069. Value loss: 0.094228. Entropy: 0.957973.
Training

episode: 7866   score: 790.0  epsilon: 1.0    steps: 648  evaluation reward: 571.4
episode: 7867   score: 480.0  epsilon: 1.0    steps: 736  evaluation reward: 573.05
episode: 7868   score: 230.0  epsilon: 1.0    steps: 896  evaluation reward: 568.45
Training network. lr: 0.000063. clip: 0.025360
Iteration 24301: Policy loss: 0.002839. Value loss: 0.302478. Entropy: 1.022238.
Iteration 24302: Policy loss: 0.001659. Value loss: 0.229340. Entropy: 1.025643.
Iteration 24303: Policy loss: -0.001827. Value loss: 0.190195. Entropy: 1.023922.
Training network. lr: 0.000063. clip: 0.025360
Iteration 24304: Policy loss: 0.003281. Value loss: 0.205334. Entropy: 0.922318.
Iteration 24305: Policy loss: 0.001408. Value loss: 0.097452. Entropy: 0.923504.
Iteration 24306: Policy loss: -0.001201. Value loss: 0.067737. Entropy: 0.926236.
Training network. lr: 0.000063. clip: 0.025360
Iteration 24307: Policy loss: 0.002348. Value loss: 0.224596. Entropy: 1.029120.
Iteration 24308: Policy loss: 0.001859.

Training network. lr: 0.000063. clip: 0.025203
Iteration 24367: Policy loss: 0.003391. Value loss: 0.457566. Entropy: 1.059565.
Iteration 24368: Policy loss: 0.000870. Value loss: 0.348612. Entropy: 1.060933.
Iteration 24369: Policy loss: 0.001895. Value loss: 0.295947. Entropy: 1.054993.
Training network. lr: 0.000063. clip: 0.025203
Iteration 24370: Policy loss: 0.000726. Value loss: 0.529391. Entropy: 1.092442.
Iteration 24371: Policy loss: -0.002092. Value loss: 0.398119. Entropy: 1.098566.
Iteration 24372: Policy loss: -0.001522. Value loss: 0.326448. Entropy: 1.098453.
episode: 7888   score: 470.0  epsilon: 1.0    steps: 872  evaluation reward: 575.75
Training network. lr: 0.000063. clip: 0.025203
Iteration 24373: Policy loss: 0.001764. Value loss: 0.150019. Entropy: 1.142566.
Iteration 24374: Policy loss: 0.000436. Value loss: 0.087887. Entropy: 1.139891.
Iteration 24375: Policy loss: -0.003299. Value loss: 0.071064. Entropy: 1.138990.
episode: 7889   score: 755.0  epsilon: 1.0 

episode: 7906   score: 495.0  epsilon: 1.0    steps: 920  evaluation reward: 568.1
Training network. lr: 0.000063. clip: 0.025046
Iteration 24436: Policy loss: 0.002022. Value loss: 0.180366. Entropy: 1.069685.
Iteration 24437: Policy loss: -0.000907. Value loss: 0.098041. Entropy: 1.069254.
Iteration 24438: Policy loss: -0.002660. Value loss: 0.074376. Entropy: 1.068601.
Training network. lr: 0.000063. clip: 0.025046
Iteration 24439: Policy loss: 0.006282. Value loss: 0.318305. Entropy: 1.036398.
Iteration 24440: Policy loss: 0.003152. Value loss: 0.213424. Entropy: 1.041440.
Iteration 24441: Policy loss: 0.000571. Value loss: 0.168972. Entropy: 1.044445.
episode: 7907   score: 805.0  epsilon: 1.0    steps: 48  evaluation reward: 565.65
episode: 7908   score: 540.0  epsilon: 1.0    steps: 280  evaluation reward: 566.8
episode: 7909   score: 740.0  epsilon: 1.0    steps: 752  evaluation reward: 568.3
episode: 7910   score: 390.0  epsilon: 1.0    steps: 824  evaluation reward: 566.25
Tr

Iteration 24502: Policy loss: 0.001781. Value loss: 0.331630. Entropy: 1.066003.
Iteration 24503: Policy loss: 0.000710. Value loss: 0.201640. Entropy: 1.064325.
Iteration 24504: Policy loss: 0.001695. Value loss: 0.128857. Entropy: 1.063988.
episode: 7927   score: 450.0  epsilon: 1.0    steps: 160  evaluation reward: 544.7
episode: 7928   score: 500.0  epsilon: 1.0    steps: 512  evaluation reward: 541.65
Training network. lr: 0.000062. clip: 0.024742
Iteration 24505: Policy loss: 0.001216. Value loss: 0.147630. Entropy: 0.915537.
Iteration 24506: Policy loss: -0.002157. Value loss: 0.111273. Entropy: 0.918128.
Iteration 24507: Policy loss: -0.002711. Value loss: 0.090499. Entropy: 0.910894.
episode: 7929   score: 710.0  epsilon: 1.0    steps: 672  evaluation reward: 544.3
episode: 7930   score: 315.0  epsilon: 1.0    steps: 792  evaluation reward: 543.85
Training network. lr: 0.000062. clip: 0.024742
Iteration 24508: Policy loss: 0.001767. Value loss: 0.143718. Entropy: 0.997651.
Ite

Training network. lr: 0.000061. clip: 0.024585
Iteration 24568: Policy loss: 0.002855. Value loss: 0.511773. Entropy: 0.919921.
Iteration 24569: Policy loss: 0.002623. Value loss: 0.302004. Entropy: 0.930592.
Iteration 24570: Policy loss: 0.002721. Value loss: 0.233582. Entropy: 0.928425.
Training network. lr: 0.000061. clip: 0.024585
Iteration 24571: Policy loss: 0.000316. Value loss: 0.220418. Entropy: 1.057615.
Iteration 24572: Policy loss: -0.000755. Value loss: 0.150092. Entropy: 1.055571.
Iteration 24573: Policy loss: -0.002326. Value loss: 0.125045. Entropy: 1.051215.
Training network. lr: 0.000061. clip: 0.024585
Iteration 24574: Policy loss: 0.003069. Value loss: 0.200022. Entropy: 1.072517.
Iteration 24575: Policy loss: 0.002685. Value loss: 0.129401. Entropy: 1.075066.
Iteration 24576: Policy loss: 0.000735. Value loss: 0.097016. Entropy: 1.070383.
episode: 7949   score: 255.0  epsilon: 1.0    steps: 520  evaluation reward: 552.85
Training network. lr: 0.000061. clip: 0.0245

Training network. lr: 0.000061. clip: 0.024438
Iteration 24637: Policy loss: 0.002810. Value loss: 0.351965. Entropy: 0.947350.
Iteration 24638: Policy loss: 0.002195. Value loss: 0.212451. Entropy: 0.958227.
Iteration 24639: Policy loss: 0.000670. Value loss: 0.160057. Entropy: 0.956735.
episode: 7967   score: 1045.0  epsilon: 1.0    steps: 792  evaluation reward: 553.1
Training network. lr: 0.000061. clip: 0.024438
Iteration 24640: Policy loss: 0.001121. Value loss: 0.339855. Entropy: 1.039393.
Iteration 24641: Policy loss: 0.000348. Value loss: 0.245691. Entropy: 1.038615.
Iteration 24642: Policy loss: -0.001631. Value loss: 0.200428. Entropy: 1.035714.
episode: 7968   score: 480.0  epsilon: 1.0    steps: 600  evaluation reward: 555.6
episode: 7969   score: 725.0  epsilon: 1.0    steps: 672  evaluation reward: 558.45
Training network. lr: 0.000061. clip: 0.024438
Iteration 24643: Policy loss: 0.001516. Value loss: 0.356685. Entropy: 0.981573.
Iteration 24644: Policy loss: 0.000318. 

Iteration 24704: Policy loss: 0.003349. Value loss: 0.187200. Entropy: 0.927410.
Iteration 24705: Policy loss: 0.000355. Value loss: 0.140219. Entropy: 0.939522.
episode: 7987   score: 670.0  epsilon: 1.0    steps: 904  evaluation reward: 580.35
Training network. lr: 0.000060. clip: 0.024125
Iteration 24706: Policy loss: 0.001701. Value loss: 0.150483. Entropy: 1.063636.
Iteration 24707: Policy loss: -0.002115. Value loss: 0.102734. Entropy: 1.064867.
Iteration 24708: Policy loss: -0.004000. Value loss: 0.085674. Entropy: 1.063107.
episode: 7988   score: 800.0  epsilon: 1.0    steps: 112  evaluation reward: 583.65
Training network. lr: 0.000060. clip: 0.024125
Iteration 24709: Policy loss: 0.003041. Value loss: 0.170916. Entropy: 0.906747.
Iteration 24710: Policy loss: 0.000407. Value loss: 0.091651. Entropy: 0.904364.
Iteration 24711: Policy loss: -0.002115. Value loss: 0.067905. Entropy: 0.896329.
episode: 7989   score: 665.0  epsilon: 1.0    steps: 864  evaluation reward: 582.75
Tra

Iteration 24770: Policy loss: -0.001579. Value loss: 0.063895. Entropy: 0.933699.
Iteration 24771: Policy loss: -0.002783. Value loss: 0.052907. Entropy: 0.931243.
Training network. lr: 0.000060. clip: 0.023977
Iteration 24772: Policy loss: 0.002007. Value loss: 0.132199. Entropy: 1.065627.
Iteration 24773: Policy loss: -0.001652. Value loss: 0.077669. Entropy: 1.067297.
Iteration 24774: Policy loss: -0.003600. Value loss: 0.057674. Entropy: 1.063208.
episode: 8008   score: 470.0  epsilon: 1.0    steps: 584  evaluation reward: 564.05
Training network. lr: 0.000060. clip: 0.023977
Iteration 24775: Policy loss: 0.002178. Value loss: 0.123599. Entropy: 1.013335.
Iteration 24776: Policy loss: -0.000943. Value loss: 0.071076. Entropy: 1.010776.
Iteration 24777: Policy loss: -0.002941. Value loss: 0.056960. Entropy: 1.011670.
Training network. lr: 0.000060. clip: 0.023977
Iteration 24778: Policy loss: 0.001542. Value loss: 0.159477. Entropy: 1.095152.
Iteration 24779: Policy loss: -0.000955.

Iteration 24837: Policy loss: -0.002388. Value loss: 0.097408. Entropy: 0.987557.
episode: 8029   score: 450.0  epsilon: 1.0    steps: 296  evaluation reward: 554.9
Training network. lr: 0.000060. clip: 0.023821
Iteration 24838: Policy loss: 0.002281. Value loss: 0.148740. Entropy: 0.934930.
Iteration 24839: Policy loss: -0.001609. Value loss: 0.091040. Entropy: 0.935675.
Iteration 24840: Policy loss: -0.002949. Value loss: 0.073027. Entropy: 0.935077.
Training network. lr: 0.000060. clip: 0.023821
Iteration 24841: Policy loss: 0.002642. Value loss: 0.163532. Entropy: 1.040310.
Iteration 24842: Policy loss: -0.001712. Value loss: 0.101216. Entropy: 1.042974.
Iteration 24843: Policy loss: -0.001466. Value loss: 0.081862. Entropy: 1.048664.
episode: 8030   score: 560.0  epsilon: 1.0    steps: 856  evaluation reward: 557.35
Training network. lr: 0.000060. clip: 0.023821
Iteration 24844: Policy loss: 0.003313. Value loss: 0.394240. Entropy: 1.099918.
Iteration 24845: Policy loss: 0.004875.

Training network. lr: 0.000059. clip: 0.023516
Iteration 24904: Policy loss: 0.000999. Value loss: 0.246665. Entropy: 0.978281.
Iteration 24905: Policy loss: -0.000198. Value loss: 0.172984. Entropy: 0.984365.
Iteration 24906: Policy loss: -0.001730. Value loss: 0.139445. Entropy: 0.988760.
episode: 8050   score: 525.0  epsilon: 1.0    steps: 752  evaluation reward: 546.6
Training network. lr: 0.000059. clip: 0.023516
Iteration 24907: Policy loss: 0.001205. Value loss: 0.213914. Entropy: 1.028158.
Iteration 24908: Policy loss: -0.002416. Value loss: 0.148886. Entropy: 1.025193.
Iteration 24909: Policy loss: -0.004190. Value loss: 0.117916. Entropy: 1.027333.
now time :  2019-03-06 02:08:05.091972
episode: 8051   score: 360.0  epsilon: 1.0    steps: 144  evaluation reward: 545.45
episode: 8052   score: 425.0  epsilon: 1.0    steps: 632  evaluation reward: 543.75
Training network. lr: 0.000059. clip: 0.023516
Iteration 24910: Policy loss: 0.000746. Value loss: 0.433890. Entropy: 0.916068

Iteration 24969: Policy loss: 0.003949. Value loss: 0.085145. Entropy: 0.884448.
Training network. lr: 0.000058. clip: 0.023360
Iteration 24970: Policy loss: 0.003509. Value loss: 0.236210. Entropy: 1.077447.
Iteration 24971: Policy loss: 0.002227. Value loss: 0.135326. Entropy: 1.075467.
Iteration 24972: Policy loss: 0.000761. Value loss: 0.099019. Entropy: 1.072341.
Training network. lr: 0.000058. clip: 0.023360
Iteration 24973: Policy loss: 0.001860. Value loss: 0.130240. Entropy: 1.078377.
Iteration 24974: Policy loss: -0.000376. Value loss: 0.087872. Entropy: 1.076192.
Iteration 24975: Policy loss: -0.002786. Value loss: 0.068851. Entropy: 1.078296.
Training network. lr: 0.000058. clip: 0.023360
Iteration 24976: Policy loss: 0.003554. Value loss: 0.287236. Entropy: 1.109288.
Iteration 24977: Policy loss: 0.001789. Value loss: 0.193826. Entropy: 1.108289.
Iteration 24978: Policy loss: -0.000400. Value loss: 0.144967. Entropy: 1.108931.
Training network. lr: 0.000058. clip: 0.023360

Iteration 25036: Policy loss: 0.002536. Value loss: 0.158037. Entropy: 0.913671.
Iteration 25037: Policy loss: -0.000015. Value loss: 0.099412. Entropy: 0.918150.
Iteration 25038: Policy loss: -0.001114. Value loss: 0.077592. Entropy: 0.919306.
episode: 8092   score: 760.0  epsilon: 1.0    steps: 96  evaluation reward: 488.9
episode: 8093   score: 600.0  epsilon: 1.0    steps: 488  evaluation reward: 492.05
Training network. lr: 0.000058. clip: 0.023203
Iteration 25039: Policy loss: 0.001706. Value loss: 0.150631. Entropy: 0.862406.
Iteration 25040: Policy loss: -0.000105. Value loss: 0.104726. Entropy: 0.859717.
Iteration 25041: Policy loss: -0.002790. Value loss: 0.081558. Entropy: 0.862055.
episode: 8094   score: 440.0  epsilon: 1.0    steps: 816  evaluation reward: 491.5
episode: 8095   score: 215.0  epsilon: 1.0    steps: 840  evaluation reward: 487.95
Training network. lr: 0.000058. clip: 0.023203
Iteration 25042: Policy loss: 0.002298. Value loss: 0.201402. Entropy: 0.921639.
It

Training network. lr: 0.000057. clip: 0.022899
Iteration 25102: Policy loss: 0.002331. Value loss: 0.152153. Entropy: 0.970416.
Iteration 25103: Policy loss: -0.001056. Value loss: 0.096616. Entropy: 0.971469.
Iteration 25104: Policy loss: -0.003248. Value loss: 0.077034. Entropy: 0.965291.
Training network. lr: 0.000057. clip: 0.022899
Iteration 25105: Policy loss: 0.003489. Value loss: 0.132112. Entropy: 1.045643.
Iteration 25106: Policy loss: 0.000020. Value loss: 0.091184. Entropy: 1.048675.
Iteration 25107: Policy loss: -0.002248. Value loss: 0.077138. Entropy: 1.042085.
Training network. lr: 0.000057. clip: 0.022899
Iteration 25108: Policy loss: 0.002384. Value loss: 0.246590. Entropy: 1.111368.
Iteration 25109: Policy loss: -0.000272. Value loss: 0.164263. Entropy: 1.113793.
Iteration 25110: Policy loss: -0.002406. Value loss: 0.122826. Entropy: 1.111489.
episode: 8114   score: 465.0  epsilon: 1.0    steps: 144  evaluation reward: 490.95
Training network. lr: 0.000057. clip: 0.0

Iteration 25168: Policy loss: 0.001238. Value loss: 0.089746. Entropy: 1.024389.
Iteration 25169: Policy loss: -0.000484. Value loss: 0.066712. Entropy: 1.027948.
Iteration 25170: Policy loss: -0.003637. Value loss: 0.056498. Entropy: 1.026097.
Training network. lr: 0.000057. clip: 0.022742
Iteration 25171: Policy loss: 0.002296. Value loss: 0.225527. Entropy: 1.059756.
Iteration 25172: Policy loss: -0.000612. Value loss: 0.162615. Entropy: 1.058550.
Iteration 25173: Policy loss: -0.003529. Value loss: 0.132286. Entropy: 1.061657.
Training network. lr: 0.000057. clip: 0.022742
Iteration 25174: Policy loss: 0.001786. Value loss: 0.213604. Entropy: 1.178268.
Iteration 25175: Policy loss: -0.000388. Value loss: 0.142518. Entropy: 1.182775.
Iteration 25176: Policy loss: -0.002194. Value loss: 0.108074. Entropy: 1.181438.
episode: 8135   score: 395.0  epsilon: 1.0    steps: 136  evaluation reward: 495.45
episode: 8136   score: 315.0  epsilon: 1.0    steps: 408  evaluation reward: 492.05
epi

Iteration 25235: Policy loss: -0.000361. Value loss: 0.150885. Entropy: 1.025419.
Iteration 25236: Policy loss: -0.001919. Value loss: 0.114349. Entropy: 1.020196.
episode: 8155   score: 230.0  epsilon: 1.0    steps: 200  evaluation reward: 477.5
episode: 8156   score: 420.0  epsilon: 1.0    steps: 368  evaluation reward: 477.5
Training network. lr: 0.000056. clip: 0.022595
Iteration 25237: Policy loss: 0.001675. Value loss: 0.393856. Entropy: 1.015843.
Iteration 25238: Policy loss: -0.001738. Value loss: 0.330978. Entropy: 1.015528.
Iteration 25239: Policy loss: -0.000585. Value loss: 0.268839. Entropy: 1.014543.
episode: 8157   score: 600.0  epsilon: 1.0    steps: 432  evaluation reward: 478.55
Training network. lr: 0.000056. clip: 0.022595
Iteration 25240: Policy loss: 0.003458. Value loss: 0.144809. Entropy: 1.032035.
Iteration 25241: Policy loss: 0.000546. Value loss: 0.095459. Entropy: 1.036124.
Iteration 25242: Policy loss: -0.000511. Value loss: 0.078894. Entropy: 1.027129.
epi

episode: 8178   score: 585.0  epsilon: 1.0    steps: 752  evaluation reward: 471.2
Training network. lr: 0.000056. clip: 0.022438
Iteration 25300: Policy loss: 0.002437. Value loss: 0.336832. Entropy: 0.966627.
Iteration 25301: Policy loss: 0.001158. Value loss: 0.318028. Entropy: 0.972239.
Iteration 25302: Policy loss: -0.000415. Value loss: 0.278864. Entropy: 0.968544.
episode: 8179   score: 605.0  epsilon: 1.0    steps: 504  evaluation reward: 471.7
Training network. lr: 0.000056. clip: 0.022281
Iteration 25303: Policy loss: 0.003562. Value loss: 0.173424. Entropy: 0.963265.
Iteration 25304: Policy loss: 0.001574. Value loss: 0.115327. Entropy: 0.962417.
Iteration 25305: Policy loss: 0.000937. Value loss: 0.086426. Entropy: 0.965601.
episode: 8180   score: 620.0  epsilon: 1.0    steps: 8  evaluation reward: 474.75
episode: 8181   score: 555.0  epsilon: 1.0    steps: 864  evaluation reward: 475.85
Training network. lr: 0.000056. clip: 0.022281
Iteration 25306: Policy loss: 0.001130. 

Iteration 25363: Policy loss: 0.001811. Value loss: 0.352056. Entropy: 0.903990.
Iteration 25364: Policy loss: 0.003692. Value loss: 0.200252. Entropy: 0.912227.
Iteration 25365: Policy loss: 0.003016. Value loss: 0.106549. Entropy: 0.913988.
Training network. lr: 0.000055. clip: 0.022134
Iteration 25366: Policy loss: 0.002111. Value loss: 0.201422. Entropy: 1.018904.
Iteration 25367: Policy loss: 0.000151. Value loss: 0.135979. Entropy: 1.015971.
Iteration 25368: Policy loss: -0.000910. Value loss: 0.113600. Entropy: 1.014963.
episode: 8202   score: 245.0  epsilon: 1.0    steps: 760  evaluation reward: 442.2
Training network. lr: 0.000055. clip: 0.022134
Iteration 25369: Policy loss: 0.002306. Value loss: 0.144108. Entropy: 1.034070.
Iteration 25370: Policy loss: -0.000587. Value loss: 0.093948. Entropy: 1.035985.
Iteration 25371: Policy loss: -0.001476. Value loss: 0.071780. Entropy: 1.033350.
Training network. lr: 0.000055. clip: 0.022134
Iteration 25372: Policy loss: 0.001749. Valu

Training network. lr: 0.000055. clip: 0.021977
Iteration 25432: Policy loss: 0.000522. Value loss: 0.190964. Entropy: 1.085000.
Iteration 25433: Policy loss: -0.001271. Value loss: 0.116868. Entropy: 1.082199.
Iteration 25434: Policy loss: -0.002415. Value loss: 0.084643. Entropy: 1.076973.
episode: 8221   score: 155.0  epsilon: 1.0    steps: 72  evaluation reward: 455.3
episode: 8222   score: 445.0  epsilon: 1.0    steps: 216  evaluation reward: 454.9
episode: 8223   score: 635.0  epsilon: 1.0    steps: 592  evaluation reward: 454.0
episode: 8224   score: 650.0  epsilon: 1.0    steps: 904  evaluation reward: 453.85
Training network. lr: 0.000055. clip: 0.021977
Iteration 25435: Policy loss: 0.002676. Value loss: 0.307119. Entropy: 0.863937.
Iteration 25436: Policy loss: 0.003699. Value loss: 0.161244. Entropy: 0.863913.
Iteration 25437: Policy loss: -0.000449. Value loss: 0.132480. Entropy: 0.858009.
episode: 8225   score: 515.0  epsilon: 1.0    steps: 608  evaluation reward: 452.55
T

Training network. lr: 0.000055. clip: 0.021821
Iteration 25498: Policy loss: 0.000737. Value loss: 0.155153. Entropy: 1.029588.
Iteration 25499: Policy loss: -0.001455. Value loss: 0.088865. Entropy: 1.027599.
Iteration 25500: Policy loss: -0.002946. Value loss: 0.064773. Entropy: 1.030839.
episode: 8243   score: 725.0  epsilon: 1.0    steps: 800  evaluation reward: 449.2
Training network. lr: 0.000054. clip: 0.021673
Iteration 25501: Policy loss: 0.001343. Value loss: 0.138368. Entropy: 1.031026.
Iteration 25502: Policy loss: -0.000484. Value loss: 0.094774. Entropy: 1.025242.
Iteration 25503: Policy loss: -0.001959. Value loss: 0.076160. Entropy: 1.027739.
episode: 8244   score: 285.0  epsilon: 1.0    steps: 848  evaluation reward: 448.1
Training network. lr: 0.000054. clip: 0.021673
Iteration 25504: Policy loss: 0.003938. Value loss: 0.262698. Entropy: 1.059831.
Iteration 25505: Policy loss: 0.004482. Value loss: 0.137658. Entropy: 1.056551.
Iteration 25506: Policy loss: 0.000783. V

Training network. lr: 0.000054. clip: 0.021517
Iteration 25564: Policy loss: 0.000116. Value loss: 0.172685. Entropy: 1.091718.
Iteration 25565: Policy loss: -0.000324. Value loss: 0.112199. Entropy: 1.087648.
Iteration 25566: Policy loss: -0.002001. Value loss: 0.087576. Entropy: 1.090240.
episode: 8264   score: 420.0  epsilon: 1.0    steps: 280  evaluation reward: 462.95
Training network. lr: 0.000054. clip: 0.021517
Iteration 25567: Policy loss: 0.001683. Value loss: 0.376348. Entropy: 1.002169.
Iteration 25568: Policy loss: 0.001186. Value loss: 0.286074. Entropy: 0.996721.
Iteration 25569: Policy loss: 0.001693. Value loss: 0.250980. Entropy: 0.996240.
Training network. lr: 0.000054. clip: 0.021517
Iteration 25570: Policy loss: 0.001245. Value loss: 0.118797. Entropy: 1.060337.
Iteration 25571: Policy loss: -0.001201. Value loss: 0.073915. Entropy: 1.058669.
Iteration 25572: Policy loss: -0.002737. Value loss: 0.057665. Entropy: 1.055749.
Training network. lr: 0.000054. clip: 0.02

Iteration 25631: Policy loss: -0.000713. Value loss: 0.118409. Entropy: 1.086500.
Iteration 25632: Policy loss: -0.002427. Value loss: 0.087525. Entropy: 1.086880.
episode: 8284   score: 315.0  epsilon: 1.0    steps: 896  evaluation reward: 482.5
episode: 8285   score: 355.0  epsilon: 1.0    steps: 976  evaluation reward: 483.2
Training network. lr: 0.000053. clip: 0.021360
Iteration 25633: Policy loss: 0.001146. Value loss: 0.149431. Entropy: 1.117827.
Iteration 25634: Policy loss: -0.002101. Value loss: 0.112526. Entropy: 1.113999.
Iteration 25635: Policy loss: -0.003585. Value loss: 0.088727. Entropy: 1.113380.
episode: 8286   score: 575.0  epsilon: 1.0    steps: 256  evaluation reward: 485.65
episode: 8287   score: 670.0  epsilon: 1.0    steps: 440  evaluation reward: 489.5
Training network. lr: 0.000053. clip: 0.021360
Iteration 25636: Policy loss: 0.001668. Value loss: 0.160386. Entropy: 0.910020.
Iteration 25637: Policy loss: 0.000418. Value loss: 0.110155. Entropy: 0.921668.
It

Iteration 25696: Policy loss: 0.000520. Value loss: 0.105601. Entropy: 0.823233.
Iteration 25697: Policy loss: -0.001151. Value loss: 0.072301. Entropy: 0.824345.
Iteration 25698: Policy loss: -0.002063. Value loss: 0.061933. Entropy: 0.824531.
Training network. lr: 0.000053. clip: 0.021212
Iteration 25699: Policy loss: 0.001577. Value loss: 0.145164. Entropy: 0.971368.
Iteration 25700: Policy loss: -0.001164. Value loss: 0.105193. Entropy: 0.973112.
Iteration 25701: Policy loss: -0.003744. Value loss: 0.086974. Entropy: 0.972931.
Training network. lr: 0.000053. clip: 0.021056
Iteration 25702: Policy loss: 0.000312. Value loss: 0.143084. Entropy: 1.075479.
Iteration 25703: Policy loss: -0.000593. Value loss: 0.084885. Entropy: 1.078427.
Iteration 25704: Policy loss: -0.003044. Value loss: 0.066919. Entropy: 1.071897.
episode: 8306   score: 725.0  epsilon: 1.0    steps: 408  evaluation reward: 503.2
Training network. lr: 0.000053. clip: 0.021056
Iteration 25705: Policy loss: 0.002543. V

Training network. lr: 0.000052. clip: 0.020899
Iteration 25765: Policy loss: 0.000683. Value loss: 0.168290. Entropy: 1.108907.
Iteration 25766: Policy loss: -0.001196. Value loss: 0.105712. Entropy: 1.106439.
Iteration 25767: Policy loss: -0.003252. Value loss: 0.085035. Entropy: 1.100570.
episode: 8325   score: 510.0  epsilon: 1.0    steps: 440  evaluation reward: 490.95
Training network. lr: 0.000052. clip: 0.020899
Iteration 25768: Policy loss: 0.002987. Value loss: 0.181122. Entropy: 0.968476.
Iteration 25769: Policy loss: 0.001949. Value loss: 0.087164. Entropy: 0.972139.
Iteration 25770: Policy loss: -0.000278. Value loss: 0.064882. Entropy: 0.966054.
episode: 8326   score: 695.0  epsilon: 1.0    steps: 992  evaluation reward: 493.7
Training network. lr: 0.000052. clip: 0.020899
Iteration 25771: Policy loss: 0.000942. Value loss: 0.107031. Entropy: 1.067705.
Iteration 25772: Policy loss: -0.000680. Value loss: 0.076293. Entropy: 1.070741.
Iteration 25773: Policy loss: -0.002548.

Iteration 25832: Policy loss: -0.001165. Value loss: 0.092639. Entropy: 1.005832.
Iteration 25833: Policy loss: -0.003067. Value loss: 0.077195. Entropy: 1.004609.
Training network. lr: 0.000052. clip: 0.020752
Iteration 25834: Policy loss: -0.000159. Value loss: 0.119125. Entropy: 1.002297.
Iteration 25835: Policy loss: -0.001772. Value loss: 0.092299. Entropy: 1.006813.
Iteration 25836: Policy loss: -0.003274. Value loss: 0.077702. Entropy: 1.005755.
episode: 8345   score: 360.0  epsilon: 1.0    steps: 792  evaluation reward: 505.65
Training network. lr: 0.000052. clip: 0.020752
Iteration 25837: Policy loss: 0.002884. Value loss: 0.228073. Entropy: 1.092079.
Iteration 25838: Policy loss: 0.003176. Value loss: 0.097326. Entropy: 1.094181.
Iteration 25839: Policy loss: 0.002176. Value loss: 0.068922. Entropy: 1.094051.
Training network. lr: 0.000052. clip: 0.020752
Iteration 25840: Policy loss: 0.001337. Value loss: 0.203277. Entropy: 1.122499.
Iteration 25841: Policy loss: 0.002919. V

Training network. lr: 0.000051. clip: 0.020595
Iteration 25900: Policy loss: 0.001825. Value loss: 0.468373. Entropy: 1.081607.
Iteration 25901: Policy loss: 0.001401. Value loss: 0.355791. Entropy: 1.085203.
Iteration 25902: Policy loss: -0.000344. Value loss: 0.310696. Entropy: 1.085162.
episode: 8364   score: 345.0  epsilon: 1.0    steps: 104  evaluation reward: 503.15
episode: 8365   score: 695.0  epsilon: 1.0    steps: 688  evaluation reward: 506.2
Training network. lr: 0.000051. clip: 0.020438
Iteration 25903: Policy loss: 0.000465. Value loss: 0.117722. Entropy: 1.011911.
Iteration 25904: Policy loss: -0.001489. Value loss: 0.085896. Entropy: 1.004714.
Iteration 25905: Policy loss: -0.002484. Value loss: 0.070889. Entropy: 1.009381.
episode: 8366   score: 265.0  epsilon: 1.0    steps: 352  evaluation reward: 503.9
Training network. lr: 0.000051. clip: 0.020438
Iteration 25906: Policy loss: 0.001531. Value loss: 0.342612. Entropy: 1.019765.
Iteration 25907: Policy loss: 0.000807.

episode: 8386   score: 480.0  epsilon: 1.0    steps: 776  evaluation reward: 506.65
Training network. lr: 0.000051. clip: 0.020291
Iteration 25966: Policy loss: 0.002510. Value loss: 0.236915. Entropy: 1.096089.
Iteration 25967: Policy loss: 0.002736. Value loss: 0.157995. Entropy: 1.099396.
Iteration 25968: Policy loss: 0.001783. Value loss: 0.126250. Entropy: 1.099097.
Training network. lr: 0.000051. clip: 0.020291
Iteration 25969: Policy loss: 0.002658. Value loss: 0.222296. Entropy: 1.123163.
Iteration 25970: Policy loss: 0.003417. Value loss: 0.122179. Entropy: 1.121439.
Iteration 25971: Policy loss: 0.001544. Value loss: 0.089092. Entropy: 1.118558.
episode: 8387   score: 315.0  epsilon: 1.0    steps: 296  evaluation reward: 503.1
Training network. lr: 0.000051. clip: 0.020291
Iteration 25972: Policy loss: 0.002686. Value loss: 0.441431. Entropy: 1.060664.
Iteration 25973: Policy loss: 0.003150. Value loss: 0.304546. Entropy: 1.058760.
Iteration 25974: Policy loss: 0.000324. Valu

Iteration 26032: Policy loss: 0.001555. Value loss: 0.262794. Entropy: 1.100306.
Iteration 26033: Policy loss: 0.000986. Value loss: 0.204854. Entropy: 1.105286.
Iteration 26034: Policy loss: 0.000050. Value loss: 0.163175. Entropy: 1.107912.
Training network. lr: 0.000050. clip: 0.020134
Iteration 26035: Policy loss: 0.002571. Value loss: 0.292117. Entropy: 1.089239.
Iteration 26036: Policy loss: 0.005023. Value loss: 0.142333. Entropy: 1.096934.
Iteration 26037: Policy loss: 0.001492. Value loss: 0.114804. Entropy: 1.093131.
episode: 8407   score: 795.0  epsilon: 1.0    steps: 1008  evaluation reward: 501.95
Training network. lr: 0.000050. clip: 0.020134
Iteration 26038: Policy loss: 0.001855. Value loss: 0.202996. Entropy: 1.207124.
Iteration 26039: Policy loss: 0.001821. Value loss: 0.113477. Entropy: 1.208162.
Iteration 26040: Policy loss: 0.000958. Value loss: 0.083562. Entropy: 1.209674.
episode: 8408   score: 365.0  epsilon: 1.0    steps: 512  evaluation reward: 498.2
Training 

Iteration 26100: Policy loss: 0.000488. Value loss: 0.106783. Entropy: 1.053885.
Training network. lr: 0.000050. clip: 0.019830
Iteration 26101: Policy loss: 0.003376. Value loss: 0.197902. Entropy: 1.011761.
Iteration 26102: Policy loss: 0.001335. Value loss: 0.148345. Entropy: 1.008735.
Iteration 26103: Policy loss: 0.001398. Value loss: 0.128569. Entropy: 1.011391.
episode: 8427   score: 400.0  epsilon: 1.0    steps: 488  evaluation reward: 520.25
Training network. lr: 0.000050. clip: 0.019830
Iteration 26104: Policy loss: 0.000554. Value loss: 0.125979. Entropy: 1.038890.
Iteration 26105: Policy loss: -0.001942. Value loss: 0.095884. Entropy: 1.035305.
Iteration 26106: Policy loss: -0.003344. Value loss: 0.073495. Entropy: 1.038194.
Training network. lr: 0.000050. clip: 0.019830
Iteration 26107: Policy loss: 0.001587. Value loss: 0.157033. Entropy: 1.054004.
Iteration 26108: Policy loss: -0.000705. Value loss: 0.102486. Entropy: 1.058172.
Iteration 26109: Policy loss: -0.001844. Va

Iteration 26169: Policy loss: -0.002455. Value loss: 0.117637. Entropy: 1.158298.
Training network. lr: 0.000049. clip: 0.019673
Iteration 26170: Policy loss: 0.001734. Value loss: 0.159246. Entropy: 1.175159.
Iteration 26171: Policy loss: -0.000400. Value loss: 0.110308. Entropy: 1.173941.
Iteration 26172: Policy loss: -0.002776. Value loss: 0.085623. Entropy: 1.176881.
episode: 8445   score: 835.0  epsilon: 1.0    steps: 392  evaluation reward: 513.3
Training network. lr: 0.000049. clip: 0.019673
Iteration 26173: Policy loss: 0.001286. Value loss: 0.122751. Entropy: 1.190693.
Iteration 26174: Policy loss: -0.000700. Value loss: 0.074871. Entropy: 1.190869.
Iteration 26175: Policy loss: -0.002041. Value loss: 0.061521. Entropy: 1.188909.
episode: 8446   score: 850.0  epsilon: 1.0    steps: 1024  evaluation reward: 513.95
Training network. lr: 0.000049. clip: 0.019673
Iteration 26176: Policy loss: 0.001578. Value loss: 0.121907. Entropy: 1.171309.
Iteration 26177: Policy loss: -0.00236

Training network. lr: 0.000049. clip: 0.019517
Iteration 26236: Policy loss: 0.001934. Value loss: 0.107930. Entropy: 1.098606.
Iteration 26237: Policy loss: -0.001031. Value loss: 0.065533. Entropy: 1.094822.
Iteration 26238: Policy loss: -0.002173. Value loss: 0.051948. Entropy: 1.094561.
episode: 8466   score: 635.0  epsilon: 1.0    steps: 168  evaluation reward: 529.05
episode: 8467   score: 485.0  epsilon: 1.0    steps: 888  evaluation reward: 528.35
Training network. lr: 0.000049. clip: 0.019517
Iteration 26239: Policy loss: 0.001161. Value loss: 0.303845. Entropy: 1.040803.
Iteration 26240: Policy loss: 0.001084. Value loss: 0.250215. Entropy: 1.042456.
Iteration 26241: Policy loss: 0.000311. Value loss: 0.245016. Entropy: 1.042903.
episode: 8468   score: 470.0  epsilon: 1.0    steps: 376  evaluation reward: 524.75
Training network. lr: 0.000049. clip: 0.019517
Iteration 26242: Policy loss: 0.002715. Value loss: 0.165582. Entropy: 0.980206.
Iteration 26243: Policy loss: -0.00006

Training network. lr: 0.000048. clip: 0.019213
Iteration 26305: Policy loss: 0.002571. Value loss: 0.209442. Entropy: 1.193411.
Iteration 26306: Policy loss: 0.000596. Value loss: 0.153385. Entropy: 1.194607.
Iteration 26307: Policy loss: -0.001798. Value loss: 0.126357. Entropy: 1.192274.
episode: 8484   score: 750.0  epsilon: 1.0    steps: 808  evaluation reward: 525.1
episode: 8485   score: 435.0  epsilon: 1.0    steps: 832  evaluation reward: 527.05
episode: 8486   score: 785.0  epsilon: 1.0    steps: 896  evaluation reward: 530.1
episode: 8487   score: 585.0  epsilon: 1.0    steps: 912  evaluation reward: 532.8
Training network. lr: 0.000048. clip: 0.019213
Iteration 26308: Policy loss: 0.002060. Value loss: 0.403032. Entropy: 1.142024.
Iteration 26309: Policy loss: 0.001422. Value loss: 0.346189. Entropy: 1.148706.
Iteration 26310: Policy loss: 0.000079. Value loss: 0.301054. Entropy: 1.146462.
episode: 8488   score: 605.0  epsilon: 1.0    steps: 336  evaluation reward: 532.9
Tra

Iteration 26370: Policy loss: -0.003930. Value loss: 0.078936. Entropy: 0.968660.
episode: 8506   score: 530.0  epsilon: 1.0    steps: 240  evaluation reward: 538.25
Training network. lr: 0.000048. clip: 0.019056
Iteration 26371: Policy loss: 0.001581. Value loss: 0.504562. Entropy: 1.001209.
Iteration 26372: Policy loss: 0.001358. Value loss: 0.379336. Entropy: 0.996212.
Iteration 26373: Policy loss: 0.001066. Value loss: 0.308708. Entropy: 0.993151.
Training network. lr: 0.000048. clip: 0.019056
Iteration 26374: Policy loss: 0.001992. Value loss: 0.414945. Entropy: 1.149644.
Iteration 26375: Policy loss: 0.002362. Value loss: 0.299656. Entropy: 1.152508.
Iteration 26376: Policy loss: 0.001987. Value loss: 0.241486. Entropy: 1.151502.
Training network. lr: 0.000048. clip: 0.019056
Iteration 26377: Policy loss: 0.001717. Value loss: 0.244268. Entropy: 1.163665.
Iteration 26378: Policy loss: -0.000415. Value loss: 0.162728. Entropy: 1.165157.
Iteration 26379: Policy loss: -0.001887. Val

Iteration 26439: Policy loss: 0.002829. Value loss: 0.164911. Entropy: 1.073979.
episode: 8524   score: 910.0  epsilon: 1.0    steps: 712  evaluation reward: 546.95
Training network. lr: 0.000047. clip: 0.018908
Iteration 26440: Policy loss: 0.001387. Value loss: 0.252316. Entropy: 1.072997.
Iteration 26441: Policy loss: 0.000219. Value loss: 0.170492. Entropy: 1.071229.
Iteration 26442: Policy loss: -0.001292. Value loss: 0.139138. Entropy: 1.071825.
Training network. lr: 0.000047. clip: 0.018908
Iteration 26443: Policy loss: 0.000647. Value loss: 0.169633. Entropy: 1.117035.
Iteration 26444: Policy loss: -0.000603. Value loss: 0.111045. Entropy: 1.116688.
Iteration 26445: Policy loss: -0.002941. Value loss: 0.091971. Entropy: 1.116188.
episode: 8525   score: 695.0  epsilon: 1.0    steps: 344  evaluation reward: 546.85
Training network. lr: 0.000047. clip: 0.018908
Iteration 26446: Policy loss: 0.001026. Value loss: 0.163428. Entropy: 1.068995.
Iteration 26447: Policy loss: -0.001695.

Iteration 26506: Policy loss: 0.001369. Value loss: 0.354126. Entropy: 1.150681.
Iteration 26507: Policy loss: 0.004146. Value loss: 0.196525. Entropy: 1.146743.
Iteration 26508: Policy loss: 0.003515. Value loss: 0.122112. Entropy: 1.144997.
episode: 8544   score: 665.0  epsilon: 1.0    steps: 80  evaluation reward: 556.25
episode: 8545   score: 470.0  epsilon: 1.0    steps: 592  evaluation reward: 552.6
episode: 8546   score: 390.0  epsilon: 1.0    steps: 984  evaluation reward: 548.0
Training network. lr: 0.000046. clip: 0.018595
Iteration 26509: Policy loss: 0.000861. Value loss: 0.159973. Entropy: 1.101427.
Iteration 26510: Policy loss: -0.001397. Value loss: 0.109569. Entropy: 1.094118.
Iteration 26511: Policy loss: -0.002587. Value loss: 0.088686. Entropy: 1.096377.
Training network. lr: 0.000046. clip: 0.018595
Iteration 26512: Policy loss: 0.001227. Value loss: 0.136721. Entropy: 1.058359.
Iteration 26513: Policy loss: -0.001239. Value loss: 0.097155. Entropy: 1.059645.
Iterat

Iteration 26573: Policy loss: -0.000895. Value loss: 0.111172. Entropy: 1.022898.
Iteration 26574: Policy loss: -0.002230. Value loss: 0.091643. Entropy: 1.024130.
Training network. lr: 0.000046. clip: 0.018448
Iteration 26575: Policy loss: 0.001858. Value loss: 0.104697. Entropy: 1.086716.
Iteration 26576: Policy loss: -0.002080. Value loss: 0.070414. Entropy: 1.086571.
Iteration 26577: Policy loss: -0.003033. Value loss: 0.055156. Entropy: 1.089379.
episode: 8565   score: 315.0  epsilon: 1.0    steps: 736  evaluation reward: 551.8
Training network. lr: 0.000046. clip: 0.018448
Iteration 26578: Policy loss: 0.000808. Value loss: 0.371966. Entropy: 1.111505.
Iteration 26579: Policy loss: -0.000088. Value loss: 0.258706. Entropy: 1.110189.
Iteration 26580: Policy loss: -0.001702. Value loss: 0.206665. Entropy: 1.109510.
Training network. lr: 0.000046. clip: 0.018448
Iteration 26581: Policy loss: 0.001566. Value loss: 0.215987. Entropy: 1.091970.
Iteration 26582: Policy loss: 0.001392. V

Training network. lr: 0.000046. clip: 0.018291
Iteration 26641: Policy loss: 0.000783. Value loss: 0.256918. Entropy: 1.130270.
Iteration 26642: Policy loss: 0.001151. Value loss: 0.177255. Entropy: 1.128469.
Iteration 26643: Policy loss: 0.000467. Value loss: 0.141425. Entropy: 1.127273.
episode: 8585   score: 455.0  epsilon: 1.0    steps: 216  evaluation reward: 540.65
Training network. lr: 0.000046. clip: 0.018291
Iteration 26644: Policy loss: 0.001204. Value loss: 0.164819. Entropy: 1.130593.
Iteration 26645: Policy loss: -0.000902. Value loss: 0.114051. Entropy: 1.125355.
Iteration 26646: Policy loss: -0.002214. Value loss: 0.093083. Entropy: 1.124802.
Training network. lr: 0.000046. clip: 0.018291
Iteration 26647: Policy loss: 0.000563. Value loss: 0.164235. Entropy: 1.132675.
Iteration 26648: Policy loss: -0.000354. Value loss: 0.120664. Entropy: 1.132597.
Iteration 26649: Policy loss: -0.002224. Value loss: 0.095947. Entropy: 1.134333.
episode: 8586   score: 545.0  epsilon: 1.0

Iteration 26709: Policy loss: -0.001647. Value loss: 0.099289. Entropy: 1.133251.
episode: 8604   score: 495.0  epsilon: 1.0    steps: 96  evaluation reward: 546.65
episode: 8605   score: 450.0  epsilon: 1.0    steps: 568  evaluation reward: 546.65
Training network. lr: 0.000045. clip: 0.017987
Iteration 26710: Policy loss: 0.000949. Value loss: 0.179177. Entropy: 1.084413.
Iteration 26711: Policy loss: -0.000169. Value loss: 0.130470. Entropy: 1.090823.
Iteration 26712: Policy loss: -0.001276. Value loss: 0.102023. Entropy: 1.085820.
episode: 8606   score: 775.0  epsilon: 1.0    steps: 80  evaluation reward: 549.1
Training network. lr: 0.000045. clip: 0.017987
Iteration 26713: Policy loss: 0.001253. Value loss: 0.184185. Entropy: 1.056854.
Iteration 26714: Policy loss: 0.000566. Value loss: 0.088262. Entropy: 1.047887.
Iteration 26715: Policy loss: -0.000996. Value loss: 0.067584. Entropy: 1.050616.
episode: 8607   score: 675.0  epsilon: 1.0    steps: 776  evaluation reward: 547.8
Tra

Iteration 26774: Policy loss: 0.000372. Value loss: 0.120694. Entropy: 1.077726.
Iteration 26775: Policy loss: -0.000311. Value loss: 0.095125. Entropy: 1.078098.
Training network. lr: 0.000045. clip: 0.017830
Iteration 26776: Policy loss: 0.002911. Value loss: 0.186205. Entropy: 1.101677.
Iteration 26777: Policy loss: 0.001064. Value loss: 0.111371. Entropy: 1.102505.
Iteration 26778: Policy loss: -0.001384. Value loss: 0.092551. Entropy: 1.100831.
episode: 8627   score: 390.0  epsilon: 1.0    steps: 624  evaluation reward: 524.15
Training network. lr: 0.000045. clip: 0.017830
Iteration 26779: Policy loss: 0.001550. Value loss: 0.589998. Entropy: 1.149889.
Iteration 26780: Policy loss: 0.001701. Value loss: 0.473346. Entropy: 1.151880.
Iteration 26781: Policy loss: -0.000018. Value loss: 0.413163. Entropy: 1.150835.
episode: 8628   score: 600.0  epsilon: 1.0    steps: 888  evaluation reward: 522.6
Training network. lr: 0.000045. clip: 0.017830
Iteration 26782: Policy loss: 0.002687. V

episode: 8647   score: 575.0  epsilon: 1.0    steps: 368  evaluation reward: 520.5
Training network. lr: 0.000044. clip: 0.017673
Iteration 26842: Policy loss: 0.002218. Value loss: 0.580276. Entropy: 1.059486.
Iteration 26843: Policy loss: 0.005805. Value loss: 0.382491. Entropy: 1.049619.
Iteration 26844: Policy loss: 0.004553. Value loss: 0.322755. Entropy: 1.049024.
episode: 8648   score: 725.0  epsilon: 1.0    steps: 1016  evaluation reward: 521.8
Training network. lr: 0.000044. clip: 0.017673
Iteration 26845: Policy loss: 0.001689. Value loss: 0.369234. Entropy: 1.105033.
Iteration 26846: Policy loss: 0.003089. Value loss: 0.213088. Entropy: 1.107520.
Iteration 26847: Policy loss: 0.005336. Value loss: 0.145438. Entropy: 1.102666.
Training network. lr: 0.000044. clip: 0.017673
Iteration 26848: Policy loss: 0.001027. Value loss: 0.196879. Entropy: 1.075143.
Iteration 26849: Policy loss: -0.000595. Value loss: 0.148924. Entropy: 1.079776.
Iteration 26850: Policy loss: -0.000692. Va

Iteration 26906: Policy loss: -0.000490. Value loss: 0.103410. Entropy: 0.988412.
Iteration 26907: Policy loss: -0.002276. Value loss: 0.091658. Entropy: 0.985631.
Training network. lr: 0.000043. clip: 0.017369
Iteration 26908: Policy loss: 0.002175. Value loss: 0.177759. Entropy: 1.093512.
Iteration 26909: Policy loss: -0.000925. Value loss: 0.141044. Entropy: 1.093597.
Iteration 26910: Policy loss: -0.001544. Value loss: 0.117650. Entropy: 1.095456.
episode: 8670   score: 315.0  epsilon: 1.0    steps: 192  evaluation reward: 511.5
Training network. lr: 0.000043. clip: 0.017369
Iteration 26911: Policy loss: 0.002689. Value loss: 0.233248. Entropy: 1.138749.
Iteration 26912: Policy loss: 0.002023. Value loss: 0.119826. Entropy: 1.134412.
Iteration 26913: Policy loss: -0.000828. Value loss: 0.085264. Entropy: 1.134189.
episode: 8671   score: 235.0  epsilon: 1.0    steps: 472  evaluation reward: 508.15
Training network. lr: 0.000043. clip: 0.017369
Iteration 26914: Policy loss: 0.001114.

Iteration 26974: Policy loss: 0.001143. Value loss: 0.128546. Entropy: 1.132763.
Iteration 26975: Policy loss: -0.001818. Value loss: 0.098249. Entropy: 1.129525.
Iteration 26976: Policy loss: -0.002693. Value loss: 0.084001. Entropy: 1.130818.
episode: 8689   score: 965.0  epsilon: 1.0    steps: 296  evaluation reward: 509.8
Training network. lr: 0.000043. clip: 0.017213
Iteration 26977: Policy loss: 0.000661. Value loss: 0.257529. Entropy: 1.085519.
Iteration 26978: Policy loss: 0.000076. Value loss: 0.191348. Entropy: 1.083705.
Iteration 26979: Policy loss: -0.001735. Value loss: 0.161761. Entropy: 1.082738.
episode: 8690   score: 800.0  epsilon: 1.0    steps: 112  evaluation reward: 510.35
Training network. lr: 0.000043. clip: 0.017213
Iteration 26980: Policy loss: 0.001574. Value loss: 0.221898. Entropy: 1.094633.
Iteration 26981: Policy loss: 0.003171. Value loss: 0.110556. Entropy: 1.093195.
Iteration 26982: Policy loss: 0.000998. Value loss: 0.078171. Entropy: 1.093431.
episode

Training network. lr: 0.000043. clip: 0.017065
Iteration 27040: Policy loss: 0.000781. Value loss: 0.098994. Entropy: 1.022153.
Iteration 27041: Policy loss: -0.001643. Value loss: 0.076988. Entropy: 1.018176.
Iteration 27042: Policy loss: -0.002648. Value loss: 0.065497. Entropy: 1.021089.
episode: 8711   score: 420.0  epsilon: 1.0    steps: 504  evaluation reward: 503.6
Training network. lr: 0.000043. clip: 0.017065
Iteration 27043: Policy loss: 0.000988. Value loss: 0.161529. Entropy: 1.015674.
Iteration 27044: Policy loss: -0.001107. Value loss: 0.113334. Entropy: 1.015558.
Iteration 27045: Policy loss: -0.002163. Value loss: 0.084576. Entropy: 1.017892.
episode: 8712   score: 495.0  epsilon: 1.0    steps: 264  evaluation reward: 506.15
Training network. lr: 0.000043. clip: 0.017065
Iteration 27046: Policy loss: 0.003489. Value loss: 0.249553. Entropy: 1.035685.
Iteration 27047: Policy loss: 0.006382. Value loss: 0.124783. Entropy: 1.040160.
Iteration 27048: Policy loss: 0.003715. 

Iteration 27107: Policy loss: 0.002844. Value loss: 0.406565. Entropy: 1.062144.
Iteration 27108: Policy loss: 0.003511. Value loss: 0.320031. Entropy: 1.061522.
episode: 8731   score: 425.0  epsilon: 1.0    steps: 832  evaluation reward: 484.75
Training network. lr: 0.000042. clip: 0.016752
Iteration 27109: Policy loss: 0.001245. Value loss: 0.828793. Entropy: 1.145838.
Iteration 27110: Policy loss: 0.000370. Value loss: 0.699048. Entropy: 1.146965.
Iteration 27111: Policy loss: 0.000648. Value loss: 0.613816. Entropy: 1.148610.
Training network. lr: 0.000042. clip: 0.016752
Iteration 27112: Policy loss: 0.001859. Value loss: 0.638792. Entropy: 1.096854.
Iteration 27113: Policy loss: 0.006064. Value loss: 0.446963. Entropy: 1.103226.
Iteration 27114: Policy loss: 0.006707. Value loss: 0.327553. Entropy: 1.097812.
Training network. lr: 0.000042. clip: 0.016752
Iteration 27115: Policy loss: 0.001519. Value loss: 0.304991. Entropy: 1.162247.
Iteration 27116: Policy loss: -0.000177. Value

Iteration 27175: Policy loss: 0.001571. Value loss: 0.958494. Entropy: 1.095882.
Iteration 27176: Policy loss: 0.006975. Value loss: 0.647973. Entropy: 1.095714.
Iteration 27177: Policy loss: 0.005021. Value loss: 0.509960. Entropy: 1.085339.
Training network. lr: 0.000042. clip: 0.016604
Iteration 27178: Policy loss: 0.001175. Value loss: 0.243259. Entropy: 1.121262.
Iteration 27179: Policy loss: -0.000824. Value loss: 0.177809. Entropy: 1.120451.
Iteration 27180: Policy loss: -0.002429. Value loss: 0.143444. Entropy: 1.119757.
Training network. lr: 0.000042. clip: 0.016604
Iteration 27181: Policy loss: 0.001019. Value loss: 0.247076. Entropy: 1.120744.
Iteration 27182: Policy loss: 0.000719. Value loss: 0.183079. Entropy: 1.120357.
Iteration 27183: Policy loss: -0.000711. Value loss: 0.151803. Entropy: 1.120037.
episode: 8750   score: 620.0  epsilon: 1.0    steps: 72  evaluation reward: 511.7
now time :  2019-03-06 02:38:15.217973
episode: 8751   score: 695.0  epsilon: 1.0    steps: 

Training network. lr: 0.000041. clip: 0.016448
Iteration 27241: Policy loss: 0.001920. Value loss: 0.350536. Entropy: 1.063225.
Iteration 27242: Policy loss: 0.002026. Value loss: 0.250895. Entropy: 1.069489.
Iteration 27243: Policy loss: 0.002237. Value loss: 0.183609. Entropy: 1.066771.
Training network. lr: 0.000041. clip: 0.016448
Iteration 27244: Policy loss: 0.003608. Value loss: 0.217651. Entropy: 1.065250.
Iteration 27245: Policy loss: 0.005603. Value loss: 0.105839. Entropy: 1.066735.
Iteration 27246: Policy loss: 0.003059. Value loss: 0.082934. Entropy: 1.058654.
episode: 8772   score: 365.0  epsilon: 1.0    steps: 576  evaluation reward: 531.25
Training network. lr: 0.000041. clip: 0.016448
Iteration 27247: Policy loss: 0.001526. Value loss: 0.183416. Entropy: 1.108561.
Iteration 27248: Policy loss: -0.000540. Value loss: 0.123166. Entropy: 1.109046.
Iteration 27249: Policy loss: -0.001789. Value loss: 0.097604. Entropy: 1.108403.
Training network. lr: 0.000041. clip: 0.0164

Iteration 27309: Policy loss: 0.002885. Value loss: 0.129316. Entropy: 1.111558.
Training network. lr: 0.000040. clip: 0.016144
Iteration 27310: Policy loss: 0.000748. Value loss: 0.174464. Entropy: 1.009188.
Iteration 27311: Policy loss: -0.000234. Value loss: 0.121716. Entropy: 1.012118.
Iteration 27312: Policy loss: -0.001716. Value loss: 0.095740. Entropy: 1.010645.
episode: 8791   score: 605.0  epsilon: 1.0    steps: 24  evaluation reward: 534.45
Training network. lr: 0.000040. clip: 0.016144
Iteration 27313: Policy loss: 0.000963. Value loss: 0.124270. Entropy: 1.021020.
Iteration 27314: Policy loss: -0.000819. Value loss: 0.080119. Entropy: 1.024807.
Iteration 27315: Policy loss: -0.002085. Value loss: 0.062901. Entropy: 1.021436.
episode: 8792   score: 500.0  epsilon: 1.0    steps: 488  evaluation reward: 533.55
Training network. lr: 0.000040. clip: 0.016144
Iteration 27316: Policy loss: 0.002603. Value loss: 0.396102. Entropy: 1.033454.
Iteration 27317: Policy loss: 0.002528. 

Training network. lr: 0.000040. clip: 0.015987
Iteration 27376: Policy loss: 0.001469. Value loss: 0.148294. Entropy: 1.053367.
Iteration 27377: Policy loss: 0.000340. Value loss: 0.085439. Entropy: 1.047947.
Iteration 27378: Policy loss: -0.000389. Value loss: 0.067267. Entropy: 1.049299.
Training network. lr: 0.000040. clip: 0.015987
Iteration 27379: Policy loss: 0.001201. Value loss: 0.233834. Entropy: 1.121078.
Iteration 27380: Policy loss: -0.000582. Value loss: 0.143967. Entropy: 1.120981.
Iteration 27381: Policy loss: -0.000179. Value loss: 0.115078. Entropy: 1.122411.
Training network. lr: 0.000040. clip: 0.015987
Iteration 27382: Policy loss: 0.000957. Value loss: 0.245772. Entropy: 1.058345.
Iteration 27383: Policy loss: -0.000424. Value loss: 0.172392. Entropy: 1.059312.
Iteration 27384: Policy loss: -0.001456. Value loss: 0.130562. Entropy: 1.055284.
Training network. lr: 0.000040. clip: 0.015987
Iteration 27385: Policy loss: 0.001122. Value loss: 0.255957. Entropy: 1.17534

Iteration 27444: Policy loss: 0.001773. Value loss: 0.234889. Entropy: 0.985472.
episode: 8830   score: 590.0  epsilon: 1.0    steps: 472  evaluation reward: 554.05
Training network. lr: 0.000040. clip: 0.015830
Iteration 27445: Policy loss: 0.000840. Value loss: 0.122346. Entropy: 1.045996.
Iteration 27446: Policy loss: -0.001286. Value loss: 0.089656. Entropy: 1.056061.
Iteration 27447: Policy loss: -0.002299. Value loss: 0.073468. Entropy: 1.047998.
episode: 8831   score: 805.0  epsilon: 1.0    steps: 552  evaluation reward: 557.85
Training network. lr: 0.000040. clip: 0.015830
Iteration 27448: Policy loss: -0.000027. Value loss: 0.102805. Entropy: 1.044319.
Iteration 27449: Policy loss: -0.002387. Value loss: 0.085254. Entropy: 1.044504.
Iteration 27450: Policy loss: -0.002453. Value loss: 0.074467. Entropy: 1.046635.
Training network. lr: 0.000039. clip: 0.015683
Iteration 27451: Policy loss: 0.000729. Value loss: 0.105071. Entropy: 1.074463.
Iteration 27452: Policy loss: -0.00177

Iteration 27512: Policy loss: 0.000998. Value loss: 0.130351. Entropy: 1.027248.
Iteration 27513: Policy loss: -0.000213. Value loss: 0.112520. Entropy: 1.025599.
episode: 8849   score: 455.0  epsilon: 1.0    steps: 128  evaluation reward: 545.55
Training network. lr: 0.000039. clip: 0.015526
Iteration 27514: Policy loss: 0.001033. Value loss: 0.288027. Entropy: 1.033996.
Iteration 27515: Policy loss: 0.003187. Value loss: 0.203594. Entropy: 1.034015.
Iteration 27516: Policy loss: 0.001757. Value loss: 0.151202. Entropy: 1.034800.
Training network. lr: 0.000039. clip: 0.015526
Iteration 27517: Policy loss: 0.001622. Value loss: 0.163240. Entropy: 1.063115.
Iteration 27518: Policy loss: 0.000379. Value loss: 0.111593. Entropy: 1.063859.
Iteration 27519: Policy loss: -0.001444. Value loss: 0.087914. Entropy: 1.062805.
episode: 8850   score: 585.0  epsilon: 1.0    steps: 48  evaluation reward: 545.2
Training network. lr: 0.000039. clip: 0.015526
Iteration 27520: Policy loss: 0.000895. Val

Training network. lr: 0.000038. clip: 0.015369
Iteration 27580: Policy loss: 0.000576. Value loss: 0.222791. Entropy: 1.007169.
Iteration 27581: Policy loss: -0.001115. Value loss: 0.156249. Entropy: 1.006467.
Iteration 27582: Policy loss: -0.002685. Value loss: 0.121749. Entropy: 1.008606.
episode: 8869   score: 455.0  epsilon: 1.0    steps: 296  evaluation reward: 541.95
Training network. lr: 0.000038. clip: 0.015369
Iteration 27583: Policy loss: 0.001013. Value loss: 0.268318. Entropy: 0.992688.
Iteration 27584: Policy loss: 0.000757. Value loss: 0.227510. Entropy: 0.992300.
Iteration 27585: Policy loss: 0.000124. Value loss: 0.187132. Entropy: 0.990814.
Training network. lr: 0.000038. clip: 0.015369
Iteration 27586: Policy loss: 0.001043. Value loss: 0.152308. Entropy: 1.043373.
Iteration 27587: Policy loss: 0.000862. Value loss: 0.111235. Entropy: 1.043804.
Iteration 27588: Policy loss: -0.001399. Value loss: 0.091641. Entropy: 1.042679.
episode: 8870   score: 645.0  epsilon: 1.0 

Training network. lr: 0.000038. clip: 0.015222
Iteration 27646: Policy loss: 0.000916. Value loss: 0.144254. Entropy: 0.949146.
Iteration 27647: Policy loss: 0.000262. Value loss: 0.095427. Entropy: 0.948067.
Iteration 27648: Policy loss: -0.000723. Value loss: 0.080330. Entropy: 0.947369.
episode: 8891   score: 515.0  epsilon: 1.0    steps: 832  evaluation reward: 551.45
Training network. lr: 0.000038. clip: 0.015222
Iteration 27649: Policy loss: 0.001800. Value loss: 0.470738. Entropy: 0.926624.
Iteration 27650: Policy loss: 0.001207. Value loss: 0.361436. Entropy: 0.926283.
Iteration 27651: Policy loss: 0.000715. Value loss: 0.315769. Entropy: 0.929166.
Training network. lr: 0.000038. clip: 0.015065
Iteration 27652: Policy loss: 0.004321. Value loss: 0.535085. Entropy: 1.048720.
Iteration 27653: Policy loss: 0.007443. Value loss: 0.294465. Entropy: 1.056321.
Iteration 27654: Policy loss: 0.004717. Value loss: 0.226498. Entropy: 1.050711.
episode: 8892   score: 310.0  epsilon: 1.0   

Training network. lr: 0.000037. clip: 0.014909
Iteration 27715: Policy loss: 0.000363. Value loss: 0.101750. Entropy: 1.045453.
Iteration 27716: Policy loss: -0.001142. Value loss: 0.080477. Entropy: 1.050272.
Iteration 27717: Policy loss: -0.001898. Value loss: 0.068788. Entropy: 1.048776.
episode: 8909   score: 705.0  epsilon: 1.0    steps: 296  evaluation reward: 549.55
episode: 8910   score: 670.0  epsilon: 1.0    steps: 424  evaluation reward: 552.35
episode: 8911   score: 620.0  epsilon: 1.0    steps: 512  evaluation reward: 554.35
Training network. lr: 0.000037. clip: 0.014909
Iteration 27718: Policy loss: 0.000318. Value loss: 0.339961. Entropy: 0.976966.
Iteration 27719: Policy loss: -0.000703. Value loss: 0.294439. Entropy: 0.968947.
Iteration 27720: Policy loss: -0.000118. Value loss: 0.260689. Entropy: 0.970024.
Training network. lr: 0.000037. clip: 0.014909
Iteration 27721: Policy loss: 0.001302. Value loss: 0.188617. Entropy: 1.046263.
Iteration 27722: Policy loss: 0.0008

Iteration 27782: Policy loss: 0.001230. Value loss: 0.476923. Entropy: 1.010618.
Iteration 27783: Policy loss: 0.001202. Value loss: 0.440952. Entropy: 1.008169.
Training network. lr: 0.000037. clip: 0.014761
Iteration 27784: Policy loss: 0.003401. Value loss: 0.612343. Entropy: 1.097096.
Iteration 27785: Policy loss: 0.006478. Value loss: 0.398648. Entropy: 1.096450.
Iteration 27786: Policy loss: 0.004793. Value loss: 0.307446. Entropy: 1.089931.
episode: 8929   score: 900.0  epsilon: 1.0    steps: 120  evaluation reward: 577.4
Training network. lr: 0.000037. clip: 0.014761
Iteration 27787: Policy loss: 0.001941. Value loss: 0.323514. Entropy: 1.015566.
Iteration 27788: Policy loss: 0.002987. Value loss: 0.175840. Entropy: 1.013413.
Iteration 27789: Policy loss: 0.001727. Value loss: 0.140632. Entropy: 1.014461.
episode: 8930   score: 605.0  epsilon: 1.0    steps: 40  evaluation reward: 577.55
Training network. lr: 0.000037. clip: 0.014761
Iteration 27790: Policy loss: 0.001356. Value

episode: 8949   score: 285.0  epsilon: 1.0    steps: 984  evaluation reward: 566.3
Training network. lr: 0.000037. clip: 0.014605
Iteration 27850: Policy loss: 0.001604. Value loss: 0.222888. Entropy: 1.087317.
Iteration 27851: Policy loss: -0.000333. Value loss: 0.175461. Entropy: 1.087560.
Iteration 27852: Policy loss: -0.000441. Value loss: 0.148140. Entropy: 1.088289.
episode: 8950   score: 635.0  epsilon: 1.0    steps: 976  evaluation reward: 566.8
Training network. lr: 0.000036. clip: 0.014448
Iteration 27853: Policy loss: 0.001739. Value loss: 0.376017. Entropy: 1.058120.
Iteration 27854: Policy loss: 0.005603. Value loss: 0.229986. Entropy: 1.064888.
Iteration 27855: Policy loss: 0.003948. Value loss: 0.181600. Entropy: 1.062766.
now time :  2019-03-06 02:47:08.473039
episode: 8951   score: 980.0  epsilon: 1.0    steps: 976  evaluation reward: 571.05
Training network. lr: 0.000036. clip: 0.014448
Iteration 27856: Policy loss: 0.001257. Value loss: 0.150505. Entropy: 1.040086.
I

Iteration 27918: Policy loss: 0.000504. Value loss: 0.083304. Entropy: 1.051933.
episode: 8968   score: 600.0  epsilon: 1.0    steps: 672  evaluation reward: 579.05
episode: 8969   score: 775.0  epsilon: 1.0    steps: 688  evaluation reward: 582.25
Training network. lr: 0.000036. clip: 0.014300
Iteration 27919: Policy loss: 0.002283. Value loss: 0.483096. Entropy: 1.081786.
Iteration 27920: Policy loss: 0.004138. Value loss: 0.405133. Entropy: 1.085729.
Iteration 27921: Policy loss: 0.002740. Value loss: 0.360837. Entropy: 1.084643.
Training network. lr: 0.000036. clip: 0.014300
Iteration 27922: Policy loss: 0.001106. Value loss: 0.127891. Entropy: 1.087425.
Iteration 27923: Policy loss: 0.000286. Value loss: 0.085987. Entropy: 1.083761.
Iteration 27924: Policy loss: -0.002094. Value loss: 0.070708. Entropy: 1.083451.
episode: 8970   score: 525.0  epsilon: 1.0    steps: 696  evaluation reward: 581.05
Training network. lr: 0.000036. clip: 0.014300
Iteration 27925: Policy loss: 0.002027.

Iteration 27984: Policy loss: 0.000217. Value loss: 0.283199. Entropy: 1.011839.
Training network. lr: 0.000035. clip: 0.014144
Iteration 27985: Policy loss: 0.001867. Value loss: 0.232464. Entropy: 1.153324.
Iteration 27986: Policy loss: 0.001957. Value loss: 0.178552. Entropy: 1.153759.
Iteration 27987: Policy loss: 0.000088. Value loss: 0.151965. Entropy: 1.155970.
Training network. lr: 0.000035. clip: 0.014144
Iteration 27988: Policy loss: 0.001256. Value loss: 0.150619. Entropy: 1.114806.
Iteration 27989: Policy loss: -0.000624. Value loss: 0.107471. Entropy: 1.115953.
Iteration 27990: Policy loss: -0.001624. Value loss: 0.090945. Entropy: 1.117656.
episode: 8990   score: 665.0  epsilon: 1.0    steps: 88  evaluation reward: 556.5
episode: 8991   score: 690.0  epsilon: 1.0    steps: 320  evaluation reward: 558.25
episode: 8992   score: 710.0  epsilon: 1.0    steps: 496  evaluation reward: 562.25
episode: 8993   score: 230.0  epsilon: 1.0    steps: 896  evaluation reward: 560.9
Trai

Iteration 28052: Policy loss: 0.001051. Value loss: 0.450920. Entropy: 1.073044.
Iteration 28053: Policy loss: 0.001498. Value loss: 0.430140. Entropy: 1.072565.
episode: 9009   score: 345.0  epsilon: 1.0    steps: 112  evaluation reward: 564.05
episode: 9010   score: 520.0  epsilon: 1.0    steps: 448  evaluation reward: 562.55
Training network. lr: 0.000035. clip: 0.013840
Iteration 28054: Policy loss: 0.000890. Value loss: 0.113706. Entropy: 1.016972.
Iteration 28055: Policy loss: -0.001640. Value loss: 0.097053. Entropy: 1.019387.
Iteration 28056: Policy loss: -0.002024. Value loss: 0.083597. Entropy: 1.020357.
Training network. lr: 0.000035. clip: 0.013840
Iteration 28057: Policy loss: 0.001453. Value loss: 0.389510. Entropy: 1.033031.
Iteration 28058: Policy loss: 0.002602. Value loss: 0.269501. Entropy: 1.032957.
Iteration 28059: Policy loss: 0.002237. Value loss: 0.233633. Entropy: 1.037330.
Training network. lr: 0.000035. clip: 0.013840
Iteration 28060: Policy loss: 0.002263. V

episode: 9029   score: 750.0  epsilon: 1.0    steps: 360  evaluation reward: 552.1
episode: 9030   score: 600.0  epsilon: 1.0    steps: 568  evaluation reward: 552.05
Training network. lr: 0.000034. clip: 0.013683
Iteration 28120: Policy loss: 0.001647. Value loss: 0.191100. Entropy: 0.952266.
Iteration 28121: Policy loss: 0.001024. Value loss: 0.130059. Entropy: 0.951322.
Iteration 28122: Policy loss: -0.000516. Value loss: 0.104713. Entropy: 0.952994.
Training network. lr: 0.000034. clip: 0.013683
Iteration 28123: Policy loss: 0.002012. Value loss: 0.333541. Entropy: 1.054492.
Iteration 28124: Policy loss: 0.002577. Value loss: 0.182946. Entropy: 1.049927.
Iteration 28125: Policy loss: 0.000388. Value loss: 0.135355. Entropy: 1.046255.
episode: 9031   score: 505.0  epsilon: 1.0    steps: 848  evaluation reward: 550.9
Training network. lr: 0.000034. clip: 0.013683
Iteration 28126: Policy loss: 0.001147. Value loss: 0.383296. Entropy: 1.053935.
Iteration 28127: Policy loss: 0.003527. V

Training network. lr: 0.000034. clip: 0.013526
Iteration 28189: Policy loss: 0.001123. Value loss: 0.164082. Entropy: 1.084422.
Iteration 28190: Policy loss: 0.000577. Value loss: 0.103814. Entropy: 1.085533.
Iteration 28191: Policy loss: -0.001063. Value loss: 0.081108. Entropy: 1.087116.
episode: 9048   score: 620.0  epsilon: 1.0    steps: 88  evaluation reward: 553.35
Training network. lr: 0.000034. clip: 0.013526
Iteration 28192: Policy loss: 0.001277. Value loss: 0.148766. Entropy: 1.067153.
Iteration 28193: Policy loss: -0.000345. Value loss: 0.107946. Entropy: 1.067251.
Iteration 28194: Policy loss: -0.000706. Value loss: 0.085557. Entropy: 1.069472.
Training network. lr: 0.000034. clip: 0.013526
Iteration 28195: Policy loss: 0.001294. Value loss: 0.466772. Entropy: 1.105357.
Iteration 28196: Policy loss: 0.000769. Value loss: 0.372920. Entropy: 1.101346.
Iteration 28197: Policy loss: 0.001238. Value loss: 0.326215. Entropy: 1.102764.
episode: 9049   score: 495.0  epsilon: 1.0  

Iteration 28256: Policy loss: 0.001830. Value loss: 0.472407. Entropy: 0.991229.
Iteration 28257: Policy loss: 0.001107. Value loss: 0.402306. Entropy: 0.987325.
Training network. lr: 0.000033. clip: 0.013222
Iteration 28258: Policy loss: 0.002059. Value loss: 0.403294. Entropy: 1.111864.
Iteration 28259: Policy loss: 0.003517. Value loss: 0.227934. Entropy: 1.109481.
Iteration 28260: Policy loss: 0.004036. Value loss: 0.137353. Entropy: 1.110413.
episode: 9068   score: 425.0  epsilon: 1.0    steps: 432  evaluation reward: 554.7
Training network. lr: 0.000033. clip: 0.013222
Iteration 28261: Policy loss: 0.000861. Value loss: 0.318095. Entropy: 1.034521.
Iteration 28262: Policy loss: 0.001703. Value loss: 0.225353. Entropy: 1.035864.
Iteration 28263: Policy loss: 0.000709. Value loss: 0.188561. Entropy: 1.038729.
episode: 9069   score: 635.0  epsilon: 1.0    steps: 440  evaluation reward: 553.3
Training network. lr: 0.000033. clip: 0.013222
Iteration 28264: Policy loss: 0.001918. Value

Iteration 28325: Policy loss: 0.001021. Value loss: 0.357649. Entropy: 1.081775.
Iteration 28326: Policy loss: 0.001333. Value loss: 0.302375. Entropy: 1.081814.
episode: 9087   score: 420.0  epsilon: 1.0    steps: 200  evaluation reward: 592.35
episode: 9088   score: 545.0  epsilon: 1.0    steps: 920  evaluation reward: 591.3
Training network. lr: 0.000033. clip: 0.013065
Iteration 28327: Policy loss: 0.000603. Value loss: 0.333040. Entropy: 1.015909.
Iteration 28328: Policy loss: 0.001838. Value loss: 0.246077. Entropy: 1.015842.
Iteration 28329: Policy loss: 0.000795. Value loss: 0.218877. Entropy: 1.016317.
Training network. lr: 0.000033. clip: 0.013065
Iteration 28330: Policy loss: 0.000400. Value loss: 0.110369. Entropy: 1.059407.
Iteration 28331: Policy loss: 0.000013. Value loss: 0.075497. Entropy: 1.059672.
Iteration 28332: Policy loss: -0.000485. Value loss: 0.057314. Entropy: 1.060988.
episode: 9089   score: 285.0  epsilon: 1.0    steps: 456  evaluation reward: 589.5
Trainin

Iteration 28393: Policy loss: 0.001117. Value loss: 0.339796. Entropy: 1.053890.
Iteration 28394: Policy loss: 0.001317. Value loss: 0.234244. Entropy: 1.057545.
Iteration 28395: Policy loss: 0.000777. Value loss: 0.184371. Entropy: 1.050699.
episode: 9106   score: 635.0  epsilon: 1.0    steps: 64  evaluation reward: 578.85
episode: 9107   score: 390.0  epsilon: 1.0    steps: 344  evaluation reward: 574.65
Training network. lr: 0.000032. clip: 0.012918
Iteration 28396: Policy loss: 0.000965. Value loss: 0.196640. Entropy: 1.016911.
Iteration 28397: Policy loss: -0.001000. Value loss: 0.174719. Entropy: 1.015286.
Iteration 28398: Policy loss: -0.001085. Value loss: 0.148274. Entropy: 1.018495.
episode: 9108   score: 215.0  epsilon: 1.0    steps: 272  evaluation reward: 570.45
episode: 9109   score: 555.0  epsilon: 1.0    steps: 1008  evaluation reward: 572.55
Training network. lr: 0.000032. clip: 0.012918
Iteration 28399: Policy loss: 0.001023. Value loss: 0.528508. Entropy: 1.100502.
I

Training network. lr: 0.000032. clip: 0.012605
Iteration 28459: Policy loss: 0.002038. Value loss: 0.349651. Entropy: 1.045590.
Iteration 28460: Policy loss: 0.000413. Value loss: 0.312406. Entropy: 1.042920.
Iteration 28461: Policy loss: 0.000354. Value loss: 0.290210. Entropy: 1.041980.
Training network. lr: 0.000032. clip: 0.012605
Iteration 28462: Policy loss: 0.003945. Value loss: 0.240769. Entropy: 1.065986.
Iteration 28463: Policy loss: 0.003635. Value loss: 0.134527. Entropy: 1.064829.
Iteration 28464: Policy loss: 0.001634. Value loss: 0.109151. Entropy: 1.064986.
Training network. lr: 0.000032. clip: 0.012605
Iteration 28465: Policy loss: 0.000866. Value loss: 0.167881. Entropy: 1.098340.
Iteration 28466: Policy loss: -0.000545. Value loss: 0.122982. Entropy: 1.094893.
Iteration 28467: Policy loss: -0.001070. Value loss: 0.093412. Entropy: 1.096056.
episode: 9129   score: 320.0  epsilon: 1.0    steps: 120  evaluation reward: 575.6
episode: 9130   score: 475.0  epsilon: 1.0   

Training network. lr: 0.000031. clip: 0.012457
Iteration 28528: Policy loss: 0.000773. Value loss: 0.169302. Entropy: 1.066506.
Iteration 28529: Policy loss: -0.001042. Value loss: 0.132863. Entropy: 1.069385.
Iteration 28530: Policy loss: -0.002307. Value loss: 0.115040. Entropy: 1.069516.
episode: 9148   score: 830.0  epsilon: 1.0    steps: 224  evaluation reward: 588.05
episode: 9149   score: 395.0  epsilon: 1.0    steps: 688  evaluation reward: 587.05
Training network. lr: 0.000031. clip: 0.012457
Iteration 28531: Policy loss: 0.000855. Value loss: 0.222668. Entropy: 1.029131.
Iteration 28532: Policy loss: 0.000244. Value loss: 0.164462. Entropy: 1.026485.
Iteration 28533: Policy loss: -0.000547. Value loss: 0.134904. Entropy: 1.027322.
Training network. lr: 0.000031. clip: 0.012457
Iteration 28534: Policy loss: 0.001474. Value loss: 0.173055. Entropy: 1.059652.
Iteration 28535: Policy loss: 0.000651. Value loss: 0.132251. Entropy: 1.058142.
Iteration 28536: Policy loss: -0.000288.

Iteration 28595: Policy loss: 0.000623. Value loss: 0.483190. Entropy: 1.076474.
Iteration 28596: Policy loss: -0.000300. Value loss: 0.447518. Entropy: 1.079188.
episode: 9168   score: 835.0  epsilon: 1.0    steps: 504  evaluation reward: 578.95
episode: 9169   score: 285.0  epsilon: 1.0    steps: 816  evaluation reward: 575.45
Training network. lr: 0.000031. clip: 0.012301
Iteration 28597: Policy loss: 0.002686. Value loss: 0.344899. Entropy: 1.021492.
Iteration 28598: Policy loss: 0.002927. Value loss: 0.255457. Entropy: 1.026042.
Iteration 28599: Policy loss: 0.001066. Value loss: 0.235959. Entropy: 1.027269.
episode: 9170   score: 635.0  epsilon: 1.0    steps: 80  evaluation reward: 571.75
Training network. lr: 0.000031. clip: 0.012301
Iteration 28600: Policy loss: 0.002139. Value loss: 0.173708. Entropy: 1.030996.
Iteration 28601: Policy loss: 0.000482. Value loss: 0.130532. Entropy: 1.032436.
Iteration 28602: Policy loss: -0.000336. Value loss: 0.103384. Entropy: 1.030425.
episo

Iteration 28663: Policy loss: 0.001058. Value loss: 0.158766. Entropy: 1.084206.
Iteration 28664: Policy loss: 0.000470. Value loss: 0.102512. Entropy: 1.084075.
Iteration 28665: Policy loss: -0.000204. Value loss: 0.082821. Entropy: 1.084249.
episode: 9187   score: 285.0  epsilon: 1.0    steps: 488  evaluation reward: 564.0
Training network. lr: 0.000030. clip: 0.011996
Iteration 28666: Policy loss: 0.001065. Value loss: 0.164651. Entropy: 1.033486.
Iteration 28667: Policy loss: 0.000288. Value loss: 0.124587. Entropy: 1.034883.
Iteration 28668: Policy loss: -0.000339. Value loss: 0.104629. Entropy: 1.033539.
episode: 9188   score: 435.0  epsilon: 1.0    steps: 568  evaluation reward: 562.9
Training network. lr: 0.000030. clip: 0.011996
Iteration 28669: Policy loss: 0.001118. Value loss: 0.217857. Entropy: 1.058744.
Iteration 28670: Policy loss: 0.000315. Value loss: 0.172745. Entropy: 1.055792.
Iteration 28671: Policy loss: -0.001195. Value loss: 0.144091. Entropy: 1.053531.
episode:

Iteration 28730: Policy loss: 0.002063. Value loss: 0.534059. Entropy: 1.075733.
Iteration 28731: Policy loss: 0.001662. Value loss: 0.468338. Entropy: 1.075601.
episode: 9208   score: 590.0  epsilon: 1.0    steps: 72  evaluation reward: 577.8
episode: 9209   score: 625.0  epsilon: 1.0    steps: 96  evaluation reward: 578.5
Training network. lr: 0.000030. clip: 0.011840
Iteration 28732: Policy loss: 0.003009. Value loss: 0.292065. Entropy: 0.948251.
Iteration 28733: Policy loss: 0.010364. Value loss: 0.176934. Entropy: 0.957703.
Iteration 28734: Policy loss: 0.004448. Value loss: 0.138749. Entropy: 0.946973.
Training network. lr: 0.000030. clip: 0.011840
Iteration 28735: Policy loss: 0.000803. Value loss: 0.117542. Entropy: 1.121599.
Iteration 28736: Policy loss: -0.000333. Value loss: 0.098428. Entropy: 1.120597.
Iteration 28737: Policy loss: -0.001407. Value loss: 0.082627. Entropy: 1.118913.
episode: 9210   score: 695.0  epsilon: 1.0    steps: 896  evaluation reward: 578.2
Training 

Training network. lr: 0.000029. clip: 0.011683
Iteration 28798: Policy loss: 0.001116. Value loss: 0.137386. Entropy: 1.020332.
Iteration 28799: Policy loss: 0.000305. Value loss: 0.117524. Entropy: 1.019503.
Iteration 28800: Policy loss: 0.000365. Value loss: 0.102826. Entropy: 1.018671.
episode: 9228   score: 750.0  epsilon: 1.0    steps: 752  evaluation reward: 572.9
episode: 9229   score: 1005.0  epsilon: 1.0    steps: 888  evaluation reward: 579.75
Training network. lr: 0.000029. clip: 0.011536
Iteration 28801: Policy loss: 0.001027. Value loss: 0.296372. Entropy: 1.061129.
Iteration 28802: Policy loss: 0.002388. Value loss: 0.213986. Entropy: 1.058323.
Iteration 28803: Policy loss: 0.002421. Value loss: 0.163460. Entropy: 1.059103.
Training network. lr: 0.000029. clip: 0.011536
Iteration 28804: Policy loss: 0.003291. Value loss: 0.555358. Entropy: 1.014891.
Iteration 28805: Policy loss: 0.009415. Value loss: 0.260826. Entropy: 1.022577.
Iteration 28806: Policy loss: 0.007258. Val

Training network. lr: 0.000028. clip: 0.011379
Iteration 28867: Policy loss: 0.003811. Value loss: 0.424874. Entropy: 1.143533.
Iteration 28868: Policy loss: 0.008628. Value loss: 0.285007. Entropy: 1.150202.
Iteration 28869: Policy loss: 0.009457. Value loss: 0.230909. Entropy: 1.147773.
episode: 9246   score: 985.0  epsilon: 1.0    steps: 488  evaluation reward: 591.8
episode: 9247   score: 465.0  epsilon: 1.0    steps: 568  evaluation reward: 591.75
episode: 9248   score: 350.0  epsilon: 1.0    steps: 832  evaluation reward: 586.95
Training network. lr: 0.000028. clip: 0.011379
Iteration 28870: Policy loss: 0.001244. Value loss: 0.396218. Entropy: 1.009303.
Iteration 28871: Policy loss: 0.001249. Value loss: 0.287710. Entropy: 1.008400.
Iteration 28872: Policy loss: 0.000794. Value loss: 0.219424. Entropy: 1.008431.
Training network. lr: 0.000028. clip: 0.011379
Iteration 28873: Policy loss: 0.000807. Value loss: 0.095331. Entropy: 1.016693.
Iteration 28874: Policy loss: -0.000888. 

Training network. lr: 0.000028. clip: 0.011222
Iteration 28933: Policy loss: 0.000907. Value loss: 0.301295. Entropy: 1.045721.
Iteration 28934: Policy loss: 0.001934. Value loss: 0.212766. Entropy: 1.051819.
Iteration 28935: Policy loss: 0.002215. Value loss: 0.187445. Entropy: 1.054494.
Training network. lr: 0.000028. clip: 0.011222
Iteration 28936: Policy loss: 0.000788. Value loss: 0.232331. Entropy: 1.031335.
Iteration 28937: Policy loss: 0.000113. Value loss: 0.182987. Entropy: 1.030551.
Iteration 28938: Policy loss: -0.000708. Value loss: 0.167310. Entropy: 1.027828.
Training network. lr: 0.000028. clip: 0.011222
Iteration 28939: Policy loss: 0.000589. Value loss: 0.469173. Entropy: 1.138088.
Iteration 28940: Policy loss: 0.000480. Value loss: 0.397342. Entropy: 1.142364.
Iteration 28941: Policy loss: 0.000657. Value loss: 0.325574. Entropy: 1.138779.
episode: 9268   score: 420.0  epsilon: 1.0    steps: 312  evaluation reward: 586.95
episode: 9269   score: 555.0  epsilon: 1.0   

Iteration 29001: Policy loss: -0.000636. Value loss: 0.095069. Entropy: 0.998636.
episode: 9287   score: 1000.0  epsilon: 1.0    steps: 416  evaluation reward: 597.15
Training network. lr: 0.000027. clip: 0.010918
Iteration 29002: Policy loss: 0.000964. Value loss: 0.131416. Entropy: 1.038288.
Iteration 29003: Policy loss: -0.000079. Value loss: 0.091042. Entropy: 1.035624.
Iteration 29004: Policy loss: -0.000017. Value loss: 0.070463. Entropy: 1.039479.
episode: 9288   score: 560.0  epsilon: 1.0    steps: 784  evaluation reward: 598.4
Training network. lr: 0.000027. clip: 0.010918
Iteration 29005: Policy loss: 0.000860. Value loss: 0.224233. Entropy: 1.079532.
Iteration 29006: Policy loss: 0.001079. Value loss: 0.167408. Entropy: 1.080798.
Iteration 29007: Policy loss: 0.000405. Value loss: 0.140257. Entropy: 1.082795.
episode: 9289   score: 605.0  epsilon: 1.0    steps: 640  evaluation reward: 599.9
Training network. lr: 0.000027. clip: 0.010918
Iteration 29008: Policy loss: 0.001794

Iteration 29068: Policy loss: 0.001397. Value loss: 0.212980. Entropy: 1.041947.
Iteration 29069: Policy loss: 0.000588. Value loss: 0.152793. Entropy: 1.040554.
Iteration 29070: Policy loss: 0.000104. Value loss: 0.125704. Entropy: 1.044393.
episode: 9307   score: 375.0  epsilon: 1.0    steps: 120  evaluation reward: 588.3
Training network. lr: 0.000027. clip: 0.010761
Iteration 29071: Policy loss: 0.000639. Value loss: 0.136538. Entropy: 0.984448.
Iteration 29072: Policy loss: 0.000073. Value loss: 0.108914. Entropy: 0.985056.
Iteration 29073: Policy loss: -0.000342. Value loss: 0.091065. Entropy: 0.978899.
episode: 9308   score: 420.0  epsilon: 1.0    steps: 408  evaluation reward: 586.6
episode: 9309   score: 600.0  epsilon: 1.0    steps: 568  evaluation reward: 586.35
Training network. lr: 0.000027. clip: 0.010761
Iteration 29074: Policy loss: 0.001311. Value loss: 0.328367. Entropy: 1.018590.
Iteration 29075: Policy loss: 0.003245. Value loss: 0.244694. Entropy: 1.015333.
Iterati

Iteration 29135: Policy loss: 0.001896. Value loss: 0.203321. Entropy: 1.067156.
Iteration 29136: Policy loss: 0.000926. Value loss: 0.169586. Entropy: 1.065468.
Training network. lr: 0.000027. clip: 0.010614
Iteration 29137: Policy loss: 0.000746. Value loss: 0.351776. Entropy: 1.085170.
Iteration 29138: Policy loss: 0.001282. Value loss: 0.295874. Entropy: 1.090824.
Iteration 29139: Policy loss: 0.000964. Value loss: 0.259204. Entropy: 1.089377.
episode: 9328   score: 485.0  epsilon: 1.0    steps: 432  evaluation reward: 589.1
Training network. lr: 0.000027. clip: 0.010614
Iteration 29140: Policy loss: 0.000986. Value loss: 0.350923. Entropy: 1.110758.
Iteration 29141: Policy loss: 0.000461. Value loss: 0.292188. Entropy: 1.112119.
Iteration 29142: Policy loss: 0.000733. Value loss: 0.255125. Entropy: 1.109954.
episode: 9329   score: 330.0  epsilon: 1.0    steps: 728  evaluation reward: 582.35
episode: 9330   score: 555.0  epsilon: 1.0    steps: 968  evaluation reward: 581.75
Trainin

Training network. lr: 0.000026. clip: 0.010301
Iteration 29203: Policy loss: 0.000339. Value loss: 0.124808. Entropy: 1.021091.
Iteration 29204: Policy loss: -0.000487. Value loss: 0.103545. Entropy: 1.020493.
Iteration 29205: Policy loss: -0.000624. Value loss: 0.091363. Entropy: 1.019935.
Training network. lr: 0.000026. clip: 0.010301
Iteration 29206: Policy loss: 0.006277. Value loss: 0.320948. Entropy: 1.113020.
Iteration 29207: Policy loss: 0.009434. Value loss: 0.194412. Entropy: 1.108428.
Iteration 29208: Policy loss: 0.006274. Value loss: 0.154842. Entropy: 1.115532.
episode: 9348   score: 500.0  epsilon: 1.0    steps: 72  evaluation reward: 563.4
Training network. lr: 0.000026. clip: 0.010301
Iteration 29209: Policy loss: 0.000958. Value loss: 0.209922. Entropy: 0.991082.
Iteration 29210: Policy loss: 0.001583. Value loss: 0.152505. Entropy: 0.983976.
Iteration 29211: Policy loss: 0.000939. Value loss: 0.116116. Entropy: 0.985400.
episode: 9349   score: 560.0  epsilon: 1.0    

Iteration 29269: Policy loss: 0.000847. Value loss: 0.285652. Entropy: 0.993142.
Iteration 29270: Policy loss: 0.001016. Value loss: 0.183473. Entropy: 0.991619.
Iteration 29271: Policy loss: 0.001662. Value loss: 0.147677. Entropy: 0.991543.
Training network. lr: 0.000025. clip: 0.010153
Iteration 29272: Policy loss: 0.000666. Value loss: 0.401010. Entropy: 1.155527.
Iteration 29273: Policy loss: 0.002259. Value loss: 0.367776. Entropy: 1.151413.
Iteration 29274: Policy loss: 0.002045. Value loss: 0.324528. Entropy: 1.152037.
Training network. lr: 0.000025. clip: 0.010153
Iteration 29275: Policy loss: 0.001111. Value loss: 0.337356. Entropy: 1.074941.
Iteration 29276: Policy loss: 0.001017. Value loss: 0.290036. Entropy: 1.073312.
Iteration 29277: Policy loss: 0.000615. Value loss: 0.255454. Entropy: 1.071035.
Training network. lr: 0.000025. clip: 0.010153
Iteration 29278: Policy loss: 0.000495. Value loss: 0.604539. Entropy: 1.107783.
Iteration 29279: Policy loss: 0.001161. Value los

Iteration 36: Policy loss: -0.001790. Value loss: 0.035861. Entropy: 2.158481.
episode: 16   score: 1000.0  epsilon: 1.0    steps: 80  evaluation reward: 235.625
Training network. lr: 0.000250. clip: 0.100000
Iteration 37: Policy loss: 0.001593. Value loss: 0.079841. Entropy: 2.161040.
Iteration 38: Policy loss: -0.003863. Value loss: 0.062138. Entropy: 2.155013.
Iteration 39: Policy loss: -0.004457. Value loss: 0.055246. Entropy: 2.154757.
episode: 17   score: 230.0  epsilon: 1.0    steps: 728  evaluation reward: 235.2941176470588
episode: 18   score: 170.0  epsilon: 1.0    steps: 984  evaluation reward: 231.66666666666666
Training network. lr: 0.000250. clip: 0.100000
Iteration 40: Policy loss: -0.000155. Value loss: 0.057674. Entropy: 2.160723.
Iteration 41: Policy loss: -0.002349. Value loss: 0.044151. Entropy: 2.153814.
Iteration 42: Policy loss: -0.005352. Value loss: 0.035911. Entropy: 2.151081.
episode: 19   score: 300.0  epsilon: 1.0    steps: 448  evaluation reward: 235.26315

episode: 45   score: 550.0  epsilon: 1.0    steps: 976  evaluation reward: 325.55555555555554
Training network. lr: 0.000250. clip: 0.099853
Iteration 97: Policy loss: -0.001530. Value loss: 0.112415. Entropy: 2.095732.
Iteration 98: Policy loss: -0.006948. Value loss: 0.096462. Entropy: 2.088636.
Iteration 99: Policy loss: -0.009767. Value loss: 0.083842. Entropy: 2.086128.
Training network. lr: 0.000250. clip: 0.099853
Iteration 100: Policy loss: 0.001788. Value loss: 0.211395. Entropy: 2.076149.
Iteration 101: Policy loss: -0.001464. Value loss: 0.199559. Entropy: 2.073008.
Iteration 102: Policy loss: -0.003051. Value loss: 0.181453. Entropy: 2.083639.
episode: 46   score: 220.0  epsilon: 1.0    steps: 48  evaluation reward: 323.2608695652174
Training network. lr: 0.000249. clip: 0.099696
Iteration 103: Policy loss: 0.002388. Value loss: 0.354055. Entropy: 2.080600.
Iteration 104: Policy loss: 0.000981. Value loss: 0.308183. Entropy: 2.081347.
Iteration 105: Policy loss: 0.000993. V

Iteration 158: Policy loss: 0.001039. Value loss: 0.431745. Entropy: 2.004827.
Iteration 159: Policy loss: -0.000162. Value loss: 0.418189. Entropy: 2.014267.
episode: 71   score: 330.0  epsilon: 1.0    steps: 40  evaluation reward: 376.3380281690141
episode: 72   score: 260.0  epsilon: 1.0    steps: 88  evaluation reward: 374.72222222222223
Training network. lr: 0.000249. clip: 0.099548
Iteration 160: Policy loss: 0.001330. Value loss: 0.129848. Entropy: 1.960819.
Iteration 161: Policy loss: -0.001380. Value loss: 0.118040. Entropy: 1.960136.
Iteration 162: Policy loss: -0.006467. Value loss: 0.103230. Entropy: 1.956474.
episode: 73   score: 130.0  epsilon: 1.0    steps: 960  evaluation reward: 371.36986301369865
Training network. lr: 0.000249. clip: 0.099548
Iteration 163: Policy loss: 0.001975. Value loss: 0.198358. Entropy: 1.948340.
Iteration 164: Policy loss: -0.003962. Value loss: 0.121552. Entropy: 1.927167.
Iteration 165: Policy loss: -0.004785. Value loss: 0.098705. Entropy: 

Training network. lr: 0.000248. clip: 0.099392
Iteration 220: Policy loss: -0.000217. Value loss: 0.083913. Entropy: 2.064208.
Iteration 221: Policy loss: -0.003144. Value loss: 0.070033. Entropy: 2.059538.
Iteration 222: Policy loss: -0.006732. Value loss: 0.065648. Entropy: 2.061138.
episode: 96   score: 410.0  epsilon: 1.0    steps: 880  evaluation reward: 391.25
Training network. lr: 0.000248. clip: 0.099392
Iteration 223: Policy loss: 0.002977. Value loss: 0.077218. Entropy: 2.056237.
Iteration 224: Policy loss: -0.002284. Value loss: 0.055075. Entropy: 2.056016.
Iteration 225: Policy loss: -0.004517. Value loss: 0.048793. Entropy: 2.066899.
episode: 97   score: 280.0  epsilon: 1.0    steps: 1000  evaluation reward: 390.10309278350513
Training network. lr: 0.000248. clip: 0.099392
Iteration 226: Policy loss: 0.000847. Value loss: 0.052791. Entropy: 2.072015.
Iteration 227: Policy loss: -0.004629. Value loss: 0.040800. Entropy: 2.058466.
Iteration 228: Policy loss: -0.007373. Value

Iteration 284: Policy loss: -0.005092. Value loss: 0.063185. Entropy: 2.145091.
Iteration 285: Policy loss: -0.008385. Value loss: 0.054168. Entropy: 2.147693.
episode: 121   score: 470.0  epsilon: 1.0    steps: 568  evaluation reward: 422.0
Training network. lr: 0.000248. clip: 0.099235
Iteration 286: Policy loss: 0.000838. Value loss: 0.049144. Entropy: 2.138599.
Iteration 287: Policy loss: -0.001439. Value loss: 0.035600. Entropy: 2.139856.
Iteration 288: Policy loss: -0.004784. Value loss: 0.028301. Entropy: 2.135936.
episode: 122   score: 420.0  epsilon: 1.0    steps: 448  evaluation reward: 423.1
Training network. lr: 0.000248. clip: 0.099235
Iteration 289: Policy loss: 0.002589. Value loss: 0.074689. Entropy: 2.123965.
Iteration 290: Policy loss: -0.001483. Value loss: 0.057413. Entropy: 2.128995.
Iteration 291: Policy loss: -0.004850. Value loss: 0.054258. Entropy: 2.132121.
episode: 123   score: 560.0  epsilon: 1.0    steps: 392  evaluation reward: 427.1
episode: 124   score: 

Iteration 347: Policy loss: -0.002688. Value loss: 0.169956. Entropy: 2.020020.
Iteration 348: Policy loss: -0.002882. Value loss: 0.163340. Entropy: 2.026002.
Training network. lr: 0.000248. clip: 0.099088
Iteration 349: Policy loss: 0.006063. Value loss: 0.117667. Entropy: 2.009557.
Iteration 350: Policy loss: 0.003385. Value loss: 0.097506. Entropy: 1.992100.
Iteration 351: Policy loss: -0.000388. Value loss: 0.088815. Entropy: 1.986668.
episode: 148   score: 310.0  epsilon: 1.0    steps: 440  evaluation reward: 427.0
Training network. lr: 0.000247. clip: 0.098931
Iteration 352: Policy loss: 0.002622. Value loss: 0.207778. Entropy: 2.093251.
Iteration 353: Policy loss: 0.001775. Value loss: 0.183818. Entropy: 2.075955.
Iteration 354: Policy loss: -0.000341. Value loss: 0.165760. Entropy: 2.078082.
Training network. lr: 0.000247. clip: 0.098931
Iteration 355: Policy loss: 0.000943. Value loss: 0.139126. Entropy: 2.099499.
Iteration 356: Policy loss: -0.001766. Value loss: 0.122357. E

Iteration 412: Policy loss: 0.002238. Value loss: 0.191170. Entropy: 2.014864.
Iteration 413: Policy loss: -0.006954. Value loss: 0.141840. Entropy: 2.006006.
Iteration 414: Policy loss: -0.009818. Value loss: 0.136598. Entropy: 2.004088.
episode: 172   score: 710.0  epsilon: 1.0    steps: 88  evaluation reward: 429.4
episode: 173   score: 420.0  epsilon: 1.0    steps: 976  evaluation reward: 432.3
Training network. lr: 0.000247. clip: 0.098774
Iteration 415: Policy loss: 0.003241. Value loss: 0.184293. Entropy: 2.028664.
Iteration 416: Policy loss: -0.001934. Value loss: 0.138858. Entropy: 2.019363.
Iteration 417: Policy loss: -0.002255. Value loss: 0.126140. Entropy: 2.027514.
episode: 174   score: 460.0  epsilon: 1.0    steps: 88  evaluation reward: 434.6
episode: 175   score: 480.0  epsilon: 1.0    steps: 248  evaluation reward: 429.6
Training network. lr: 0.000247. clip: 0.098774
Iteration 418: Policy loss: 0.000522. Value loss: 0.167838. Entropy: 1.977337.
Iteration 419: Policy l

Iteration 477: Policy loss: -0.006695. Value loss: 0.252046. Entropy: 1.952947.
episode: 197   score: 360.0  epsilon: 1.0    steps: 632  evaluation reward: 503.7
episode: 198   score: 410.0  epsilon: 1.0    steps: 664  evaluation reward: 504.5
Training network. lr: 0.000247. clip: 0.098627
Iteration 478: Policy loss: 0.001827. Value loss: 0.231264. Entropy: 1.930433.
Iteration 479: Policy loss: -0.004274. Value loss: 0.182931. Entropy: 1.938362.
Iteration 480: Policy loss: -0.008306. Value loss: 0.171077. Entropy: 1.928560.
episode: 199   score: 300.0  epsilon: 1.0    steps: 312  evaluation reward: 502.3
Training network. lr: 0.000247. clip: 0.098627
Iteration 481: Policy loss: 0.001640. Value loss: 0.590667. Entropy: 1.919922.
Iteration 482: Policy loss: -0.006069. Value loss: 0.526976. Entropy: 1.927538.
Iteration 483: Policy loss: -0.007814. Value loss: 0.484313. Entropy: 1.916144.
Training network. lr: 0.000247. clip: 0.098627
Iteration 484: Policy loss: 0.000835. Value loss: 0.148

Training network. lr: 0.000246. clip: 0.098470
Iteration 541: Policy loss: -0.002412. Value loss: 0.308548. Entropy: 1.938948.
Iteration 542: Policy loss: -0.005964. Value loss: 0.226660. Entropy: 1.923794.
Iteration 543: Policy loss: -0.006078. Value loss: 0.193259. Entropy: 1.923127.
Training network. lr: 0.000246. clip: 0.098470
Iteration 544: Policy loss: 0.001266. Value loss: 0.142870. Entropy: 1.929384.
Iteration 545: Policy loss: -0.002957. Value loss: 0.095948. Entropy: 1.939247.
Iteration 546: Policy loss: -0.004623. Value loss: 0.076109. Entropy: 1.935276.
Training network. lr: 0.000246. clip: 0.098470
Iteration 547: Policy loss: 0.004414. Value loss: 0.285024. Entropy: 1.984435.
Iteration 548: Policy loss: -0.002397. Value loss: 0.221418. Entropy: 1.991127.
Iteration 549: Policy loss: -0.005984. Value loss: 0.206017. Entropy: 1.987656.
episode: 223   score: 860.0  epsilon: 1.0    steps: 424  evaluation reward: 592.9
episode: 224   score: 350.0  epsilon: 1.0    steps: 704  ev

Training network. lr: 0.000245. clip: 0.098166
Iteration 607: Policy loss: 0.001806. Value loss: 0.221581. Entropy: 1.856467.
Iteration 608: Policy loss: -0.005029. Value loss: 0.166621. Entropy: 1.830381.
Iteration 609: Policy loss: -0.007713. Value loss: 0.143746. Entropy: 1.829214.
Training network. lr: 0.000245. clip: 0.098166
Iteration 610: Policy loss: 0.000352. Value loss: 0.105564. Entropy: 1.799973.
Iteration 611: Policy loss: -0.008898. Value loss: 0.061372. Entropy: 1.794701.
Iteration 612: Policy loss: -0.009619. Value loss: 0.053168. Entropy: 1.803021.
episode: 247   score: 960.0  epsilon: 1.0    steps: 176  evaluation reward: 650.3
Training network. lr: 0.000245. clip: 0.098166
Iteration 613: Policy loss: 0.002996. Value loss: 0.607917. Entropy: 1.903571.
Iteration 614: Policy loss: 0.002468. Value loss: 0.532053. Entropy: 1.896165.
Iteration 615: Policy loss: 0.001328. Value loss: 0.480216. Entropy: 1.890071.
episode: 248   score: 610.0  epsilon: 1.0    steps: 664  evalu

episode: 270   score: 790.0  epsilon: 1.0    steps: 880  evaluation reward: 726.1
Training network. lr: 0.000245. clip: 0.098009
Iteration 673: Policy loss: 0.003898. Value loss: 0.650418. Entropy: 1.874272.
Iteration 674: Policy loss: -0.003037. Value loss: 0.515787. Entropy: 1.888464.
Iteration 675: Policy loss: -0.001666. Value loss: 0.466679. Entropy: 1.887857.
episode: 271   score: 1030.0  epsilon: 1.0    steps: 64  evaluation reward: 728.7
Training network. lr: 0.000245. clip: 0.098009
Iteration 676: Policy loss: 0.006830. Value loss: 0.218750. Entropy: 1.894559.
Iteration 677: Policy loss: -0.003195. Value loss: 0.172752. Entropy: 1.896237.
Iteration 678: Policy loss: -0.011602. Value loss: 0.140480. Entropy: 1.879999.
episode: 272   score: 860.0  epsilon: 1.0    steps: 184  evaluation reward: 730.2
Training network. lr: 0.000245. clip: 0.098009
Iteration 679: Policy loss: 0.004675. Value loss: 0.304626. Entropy: 1.861129.
Iteration 680: Policy loss: -0.001249. Value loss: 0.230

Training network. lr: 0.000245. clip: 0.097853
Iteration 739: Policy loss: 0.001305. Value loss: 0.238219. Entropy: 1.747742.
Iteration 740: Policy loss: -0.003300. Value loss: 0.155060. Entropy: 1.719001.
Iteration 741: Policy loss: -0.008687. Value loss: 0.117672. Entropy: 1.724747.
episode: 294   score: 1140.0  epsilon: 1.0    steps: 944  evaluation reward: 774.3
Training network. lr: 0.000245. clip: 0.097853
Iteration 742: Policy loss: 0.001738. Value loss: 0.224910. Entropy: 1.787893.
Iteration 743: Policy loss: -0.002564. Value loss: 0.189379. Entropy: 1.769794.
Iteration 744: Policy loss: -0.004372. Value loss: 0.171791. Entropy: 1.765998.
episode: 295   score: 850.0  epsilon: 1.0    steps: 280  evaluation reward: 775.2
Training network. lr: 0.000245. clip: 0.097853
Iteration 745: Policy loss: -0.000251. Value loss: 0.266108. Entropy: 1.829385.
Iteration 746: Policy loss: -0.004951. Value loss: 0.214840. Entropy: 1.839942.
Iteration 747: Policy loss: -0.009150. Value loss: 0.182

Training network. lr: 0.000244. clip: 0.097549
Iteration 805: Policy loss: -0.001581. Value loss: 0.208187. Entropy: 1.678420.
Iteration 806: Policy loss: -0.003136. Value loss: 0.152026. Entropy: 1.680504.
Iteration 807: Policy loss: -0.004554. Value loss: 0.112356. Entropy: 1.670288.
episode: 317   score: 750.0  epsilon: 1.0    steps: 24  evaluation reward: 847.5
episode: 318   score: 480.0  epsilon: 1.0    steps: 120  evaluation reward: 849.0
Training network. lr: 0.000244. clip: 0.097549
Iteration 808: Policy loss: -0.001013. Value loss: 0.282067. Entropy: 1.721505.
Iteration 809: Policy loss: -0.002446. Value loss: 0.228860. Entropy: 1.709815.
Iteration 810: Policy loss: -0.003734. Value loss: 0.198070. Entropy: 1.730651.
Training network. lr: 0.000244. clip: 0.097549
Iteration 811: Policy loss: 0.001456. Value loss: 0.381030. Entropy: 1.751906.
Iteration 812: Policy loss: -0.007611. Value loss: 0.275029. Entropy: 1.777221.
Iteration 813: Policy loss: -0.009728. Value loss: 0.2341

Training network. lr: 0.000243. clip: 0.097392
Iteration 871: Policy loss: 0.003717. Value loss: 0.169563. Entropy: 1.656256.
Iteration 872: Policy loss: -0.001460. Value loss: 0.126101. Entropy: 1.677604.
Iteration 873: Policy loss: -0.004549. Value loss: 0.109379. Entropy: 1.659516.
episode: 341   score: 1040.0  epsilon: 1.0    steps: 296  evaluation reward: 898.1
episode: 342   score: 440.0  epsilon: 1.0    steps: 864  evaluation reward: 900.4
episode: 343   score: 890.0  epsilon: 1.0    steps: 904  evaluation reward: 899.6
Training network. lr: 0.000243. clip: 0.097392
Iteration 874: Policy loss: 0.004303. Value loss: 0.242150. Entropy: 1.687571.
Iteration 875: Policy loss: 0.002006. Value loss: 0.154497. Entropy: 1.650089.
Iteration 876: Policy loss: -0.000259. Value loss: 0.122647. Entropy: 1.632230.
episode: 344   score: 1590.0  epsilon: 1.0    steps: 880  evaluation reward: 901.2
Training network. lr: 0.000243. clip: 0.097392
Iteration 877: Policy loss: 0.001855. Value loss: 0.

Iteration 935: Policy loss: -0.007700. Value loss: 0.349042. Entropy: 1.790474.
Iteration 936: Policy loss: -0.006198. Value loss: 0.336083. Entropy: 1.802829.
episode: 366   score: 460.0  epsilon: 1.0    steps: 712  evaluation reward: 926.3
Training network. lr: 0.000243. clip: 0.097244
Iteration 937: Policy loss: 0.003973. Value loss: 0.231512. Entropy: 1.888724.
Iteration 938: Policy loss: -0.002574. Value loss: 0.184748. Entropy: 1.893235.
Iteration 939: Policy loss: -0.006914. Value loss: 0.171899. Entropy: 1.894042.
episode: 367   score: 1510.0  epsilon: 1.0    steps: 608  evaluation reward: 936.2
episode: 368   score: 570.0  epsilon: 1.0    steps: 944  evaluation reward: 938.8
Training network. lr: 0.000243. clip: 0.097244
Iteration 940: Policy loss: 0.000510. Value loss: 0.132351. Entropy: 1.848073.
Iteration 941: Policy loss: -0.005582. Value loss: 0.098878. Entropy: 1.834427.
Iteration 942: Policy loss: -0.008230. Value loss: 0.091154. Entropy: 1.844238.
episode: 369   score:

Iteration 999: Policy loss: -0.003898. Value loss: 0.093425. Entropy: 1.832053.
Training network. lr: 0.000243. clip: 0.097088
Iteration 1000: Policy loss: 0.001944. Value loss: 0.378772. Entropy: 1.836144.
Iteration 1001: Policy loss: -0.005544. Value loss: 0.297358. Entropy: 1.838866.
Iteration 1002: Policy loss: -0.005735. Value loss: 0.280481. Entropy: 1.846493.
episode: 392   score: 330.0  epsilon: 1.0    steps: 432  evaluation reward: 885.4
episode: 393   score: 710.0  epsilon: 1.0    steps: 784  evaluation reward: 888.1
Training network. lr: 0.000242. clip: 0.096931
Iteration 1003: Policy loss: 0.002963. Value loss: 0.230914. Entropy: 1.914579.
Iteration 1004: Policy loss: -0.001466. Value loss: 0.154445. Entropy: 1.902440.
Iteration 1005: Policy loss: -0.003912. Value loss: 0.135256. Entropy: 1.899032.
Training network. lr: 0.000242. clip: 0.096931
Iteration 1006: Policy loss: 0.004401. Value loss: 0.176199. Entropy: 1.799205.
Iteration 1007: Policy loss: -0.003201. Value loss:

episode: 417   score: 560.0  epsilon: 1.0    steps: 280  evaluation reward: 843.6
episode: 418   score: 220.0  epsilon: 1.0    steps: 648  evaluation reward: 841.0
Training network. lr: 0.000242. clip: 0.096784
Iteration 1063: Policy loss: 0.006152. Value loss: 0.489779. Entropy: 1.953249.
Iteration 1064: Policy loss: 0.000636. Value loss: 0.408866. Entropy: 1.932109.
Iteration 1065: Policy loss: 0.003104. Value loss: 0.380474. Entropy: 1.937871.
Training network. lr: 0.000242. clip: 0.096784
Iteration 1066: Policy loss: 0.003416. Value loss: 0.243308. Entropy: 1.862884.
Iteration 1067: Policy loss: -0.005793. Value loss: 0.195578. Entropy: 1.858686.
Iteration 1068: Policy loss: -0.005177. Value loss: 0.183446. Entropy: 1.850246.
Training network. lr: 0.000242. clip: 0.096784
Iteration 1069: Policy loss: 0.003428. Value loss: 0.578569. Entropy: 1.915659.
Iteration 1070: Policy loss: 0.000129. Value loss: 0.507678. Entropy: 1.915235.
Iteration 1071: Policy loss: -0.004296. Value loss: 0

Iteration 1127: Policy loss: -0.003289. Value loss: 0.087310. Entropy: 1.943379.
Iteration 1128: Policy loss: -0.007527. Value loss: 0.074889. Entropy: 1.938194.
episode: 442   score: 1170.0  epsilon: 1.0    steps: 24  evaluation reward: 818.0
episode: 443   score: 840.0  epsilon: 1.0    steps: 112  evaluation reward: 817.5
Training network. lr: 0.000242. clip: 0.096627
Iteration 1129: Policy loss: 0.001612. Value loss: 0.140550. Entropy: 1.862458.
Iteration 1130: Policy loss: -0.001109. Value loss: 0.118939. Entropy: 1.838219.
Iteration 1131: Policy loss: -0.005136. Value loss: 0.113503. Entropy: 1.852854.
episode: 444   score: 370.0  epsilon: 1.0    steps: 88  evaluation reward: 805.3
episode: 445   score: 590.0  epsilon: 1.0    steps: 680  evaluation reward: 800.6
episode: 446   score: 530.0  epsilon: 1.0    steps: 1008  evaluation reward: 802.5
Training network. lr: 0.000242. clip: 0.096627
Iteration 1132: Policy loss: 0.002868. Value loss: 0.132348. Entropy: 1.855875.
Iteration 11

Iteration 1189: Policy loss: 0.002496. Value loss: 0.259282. Entropy: 1.862460.
Iteration 1190: Policy loss: -0.002759. Value loss: 0.180965. Entropy: 1.871637.
Iteration 1191: Policy loss: -0.006174. Value loss: 0.161377. Entropy: 1.876012.
Training network. lr: 0.000241. clip: 0.096470
Iteration 1192: Policy loss: 0.003278. Value loss: 0.330611. Entropy: 1.916896.
Iteration 1193: Policy loss: -0.000171. Value loss: 0.244309. Entropy: 1.917970.
Iteration 1194: Policy loss: -0.004005. Value loss: 0.214047. Entropy: 1.923666.
episode: 469   score: 250.0  epsilon: 1.0    steps: 528  evaluation reward: 765.9
Training network. lr: 0.000241. clip: 0.096470
Iteration 1195: Policy loss: 0.006745. Value loss: 0.324162. Entropy: 1.987744.
Iteration 1196: Policy loss: 0.001051. Value loss: 0.251394. Entropy: 1.988304.
Iteration 1197: Policy loss: -0.005940. Value loss: 0.226886. Entropy: 1.993908.
episode: 470   score: 330.0  epsilon: 1.0    steps: 728  evaluation reward: 758.4
episode: 471   sc

episode: 494   score: 730.0  epsilon: 1.0    steps: 880  evaluation reward: 771.4
episode: 495   score: 1330.0  epsilon: 1.0    steps: 984  evaluation reward: 780.5
Training network. lr: 0.000240. clip: 0.096166
Iteration 1255: Policy loss: 0.004550. Value loss: 0.424598. Entropy: 1.707908.
Iteration 1256: Policy loss: -0.003908. Value loss: 0.354376. Entropy: 1.673047.
Iteration 1257: Policy loss: -0.001964. Value loss: 0.330891. Entropy: 1.684458.
episode: 496   score: 1240.0  epsilon: 1.0    steps: 192  evaluation reward: 756.9
Training network. lr: 0.000240. clip: 0.096166
Iteration 1258: Policy loss: 0.003426. Value loss: 0.175175. Entropy: 1.774818.
Iteration 1259: Policy loss: -0.002784. Value loss: 0.127881. Entropy: 1.742157.
Iteration 1260: Policy loss: -0.002839. Value loss: 0.111017. Entropy: 1.754007.
Training network. lr: 0.000240. clip: 0.096166
Iteration 1261: Policy loss: 0.001506. Value loss: 0.277163. Entropy: 1.824960.
Iteration 1262: Policy loss: -0.000950. Value l

Training network. lr: 0.000240. clip: 0.096009
Iteration 1321: Policy loss: -0.001069. Value loss: 0.343257. Entropy: 1.739771.
Iteration 1322: Policy loss: -0.005646. Value loss: 0.283029. Entropy: 1.744069.
Iteration 1323: Policy loss: -0.010110. Value loss: 0.257438. Entropy: 1.752889.
episode: 516   score: 800.0  epsilon: 1.0    steps: 8  evaluation reward: 777.3
episode: 517   score: 3720.0  epsilon: 1.0    steps: 32  evaluation reward: 808.9
episode: 518   score: 370.0  epsilon: 1.0    steps: 104  evaluation reward: 810.4
Training network. lr: 0.000240. clip: 0.096009
Iteration 1324: Policy loss: 0.004065. Value loss: 0.269165. Entropy: 1.715586.
Iteration 1325: Policy loss: -0.000224. Value loss: 0.208143. Entropy: 1.723607.
Iteration 1326: Policy loss: -0.002516. Value loss: 0.204671. Entropy: 1.733592.
Training network. lr: 0.000240. clip: 0.096009
Iteration 1327: Policy loss: 0.002961. Value loss: 0.273008. Entropy: 1.726550.
Iteration 1328: Policy loss: 0.002202. Value loss:

Iteration 1385: Policy loss: -0.004519. Value loss: 0.071297. Entropy: 1.667351.
Iteration 1386: Policy loss: -0.006777. Value loss: 0.063784. Entropy: 1.655429.
episode: 541   score: 520.0  epsilon: 1.0    steps: 24  evaluation reward: 782.1
episode: 542   score: 880.0  epsilon: 1.0    steps: 456  evaluation reward: 779.2
Training network. lr: 0.000240. clip: 0.095862
Iteration 1387: Policy loss: 0.005604. Value loss: 0.115149. Entropy: 1.667561.
Iteration 1388: Policy loss: -0.004652. Value loss: 0.067465. Entropy: 1.659216.
Iteration 1389: Policy loss: -0.006455. Value loss: 0.058443. Entropy: 1.656441.
episode: 543   score: 2440.0  epsilon: 1.0    steps: 16  evaluation reward: 795.2
episode: 544   score: 740.0  epsilon: 1.0    steps: 392  evaluation reward: 798.9
episode: 545   score: 860.0  epsilon: 1.0    steps: 936  evaluation reward: 801.6
Training network. lr: 0.000240. clip: 0.095862
Iteration 1390: Policy loss: 0.000768. Value loss: 0.170435. Entropy: 1.611051.
Iteration 139

Training network. lr: 0.000239. clip: 0.095705
Iteration 1450: Policy loss: 0.005297. Value loss: 0.182093. Entropy: 1.629186.
Iteration 1451: Policy loss: -0.000452. Value loss: 0.148134. Entropy: 1.637352.
Iteration 1452: Policy loss: -0.009427. Value loss: 0.137319. Entropy: 1.626819.
episode: 565   score: 550.0  epsilon: 1.0    steps: 408  evaluation reward: 809.8
episode: 566   score: 490.0  epsilon: 1.0    steps: 640  evaluation reward: 807.4
episode: 567   score: 1370.0  epsilon: 1.0    steps: 808  evaluation reward: 813.1
Training network. lr: 0.000239. clip: 0.095549
Iteration 1453: Policy loss: 0.000825. Value loss: 0.440455. Entropy: 1.584486.
Iteration 1454: Policy loss: -0.002683. Value loss: 0.326955. Entropy: 1.610084.
Iteration 1455: Policy loss: -0.006393. Value loss: 0.288464. Entropy: 1.612127.
Training network. lr: 0.000239. clip: 0.095549
Iteration 1456: Policy loss: 0.002707. Value loss: 0.393966. Entropy: 1.553534.
Iteration 1457: Policy loss: -0.000286. Value lo

episode: 589   score: 440.0  epsilon: 1.0    steps: 472  evaluation reward: 822.4
episode: 590   score: 840.0  epsilon: 1.0    steps: 696  evaluation reward: 826.6
Training network. lr: 0.000239. clip: 0.095401
Iteration 1516: Policy loss: -0.000083. Value loss: 0.175101. Entropy: 1.626994.
Iteration 1517: Policy loss: -0.004603. Value loss: 0.150775. Entropy: 1.637399.
Iteration 1518: Policy loss: -0.003841. Value loss: 0.143589. Entropy: 1.635891.
episode: 591   score: 600.0  epsilon: 1.0    steps: 56  evaluation reward: 828.3
Training network. lr: 0.000239. clip: 0.095401
Iteration 1519: Policy loss: 0.000733. Value loss: 0.132212. Entropy: 1.556271.
Iteration 1520: Policy loss: -0.004373. Value loss: 0.118621. Entropy: 1.550766.
Iteration 1521: Policy loss: 0.001009. Value loss: 0.115735. Entropy: 1.555883.
episode: 592   score: 720.0  epsilon: 1.0    steps: 80  evaluation reward: 817.5
Training network. lr: 0.000239. clip: 0.095401
Iteration 1522: Policy loss: 0.003700. Value loss

Training network. lr: 0.000238. clip: 0.095245
Iteration 1579: Policy loss: 0.000697. Value loss: 0.170717. Entropy: 1.726324.
Iteration 1580: Policy loss: -0.003591. Value loss: 0.154275. Entropy: 1.732539.
Iteration 1581: Policy loss: -0.005488. Value loss: 0.135677. Entropy: 1.728029.
episode: 615   score: 780.0  epsilon: 1.0    steps: 560  evaluation reward: 781.3
episode: 616   score: 810.0  epsilon: 1.0    steps: 968  evaluation reward: 781.4
Training network. lr: 0.000238. clip: 0.095245
Iteration 1582: Policy loss: 0.001011. Value loss: 0.375910. Entropy: 1.710537.
Iteration 1583: Policy loss: -0.001067. Value loss: 0.234543. Entropy: 1.699327.
Iteration 1584: Policy loss: -0.002741. Value loss: 0.224661. Entropy: 1.714097.
Training network. lr: 0.000238. clip: 0.095245
Iteration 1585: Policy loss: 0.004994. Value loss: 0.250091. Entropy: 1.693663.
Iteration 1586: Policy loss: 0.000474. Value loss: 0.147951. Entropy: 1.691167.
Iteration 1587: Policy loss: -0.005225. Value loss:

episode: 638   score: 940.0  epsilon: 1.0    steps: 280  evaluation reward: 809.4
Training network. lr: 0.000238. clip: 0.095088
Iteration 1645: Policy loss: 0.004350. Value loss: 0.243660. Entropy: 1.687586.
Iteration 1646: Policy loss: -0.002054. Value loss: 0.178065. Entropy: 1.680846.
Iteration 1647: Policy loss: -0.001911. Value loss: 0.167749. Entropy: 1.686210.
episode: 639   score: 1090.0  epsilon: 1.0    steps: 48  evaluation reward: 810.4
episode: 640   score: 740.0  epsilon: 1.0    steps: 600  evaluation reward: 809.1
Training network. lr: 0.000238. clip: 0.095088
Iteration 1648: Policy loss: 0.001991. Value loss: 0.274441. Entropy: 1.706892.
Iteration 1649: Policy loss: -0.002373. Value loss: 0.175626. Entropy: 1.684376.
Iteration 1650: Policy loss: -0.005363. Value loss: 0.152226. Entropy: 1.678091.
Training network. lr: 0.000237. clip: 0.094940
Iteration 1651: Policy loss: 0.002441. Value loss: 0.274284. Entropy: 1.725397.
Iteration 1652: Policy loss: -0.002741. Value los

Iteration 1710: Policy loss: -0.010177. Value loss: 0.141407. Entropy: 1.841805.
episode: 661   score: 850.0  epsilon: 1.0    steps: 776  evaluation reward: 754.5
Training network. lr: 0.000237. clip: 0.094784
Iteration 1711: Policy loss: -0.000890. Value loss: 0.306778. Entropy: 1.881733.
Iteration 1712: Policy loss: -0.004788. Value loss: 0.231075. Entropy: 1.869390.
Iteration 1713: Policy loss: -0.005560. Value loss: 0.195271. Entropy: 1.868985.
episode: 662   score: 860.0  epsilon: 1.0    steps: 64  evaluation reward: 752.2
Training network. lr: 0.000237. clip: 0.094784
Iteration 1714: Policy loss: 0.003348. Value loss: 0.518314. Entropy: 1.757042.
Iteration 1715: Policy loss: 0.000796. Value loss: 0.468168. Entropy: 1.754797.
Iteration 1716: Policy loss: -0.004830. Value loss: 0.417957. Entropy: 1.757213.
episode: 663   score: 840.0  epsilon: 1.0    steps: 440  evaluation reward: 755.1
episode: 664   score: 440.0  epsilon: 1.0    steps: 1016  evaluation reward: 752.8
Training netw

Training network. lr: 0.000237. clip: 0.094627
Iteration 1774: Policy loss: -0.000474. Value loss: 0.253366. Entropy: 1.865518.
Iteration 1775: Policy loss: -0.003312. Value loss: 0.213571. Entropy: 1.861735.
Iteration 1776: Policy loss: -0.007518. Value loss: 0.191775. Entropy: 1.848919.
Training network. lr: 0.000237. clip: 0.094627
Iteration 1777: Policy loss: -0.000386. Value loss: 0.160182. Entropy: 1.835129.
Iteration 1778: Policy loss: -0.005553. Value loss: 0.099406. Entropy: 1.835831.
Iteration 1779: Policy loss: -0.008896. Value loss: 0.072133. Entropy: 1.833700.
episode: 686   score: 640.0  epsilon: 1.0    steps: 680  evaluation reward: 762.9
episode: 687   score: 500.0  epsilon: 1.0    steps: 960  evaluation reward: 761.8
Training network. lr: 0.000237. clip: 0.094627
Iteration 1780: Policy loss: 0.004187. Value loss: 0.089867. Entropy: 1.918612.
Iteration 1781: Policy loss: -0.004157. Value loss: 0.064248. Entropy: 1.910000.
Iteration 1782: Policy loss: -0.008379. Value lo

Iteration 1839: Policy loss: -0.009087. Value loss: 0.062782. Entropy: 1.751590.
episode: 709   score: 430.0  epsilon: 1.0    steps: 888  evaluation reward: 785.9
Training network. lr: 0.000236. clip: 0.094480
Iteration 1840: Policy loss: -0.000556. Value loss: 0.134885. Entropy: 1.745522.
Iteration 1841: Policy loss: -0.007323. Value loss: 0.091546. Entropy: 1.749339.
Iteration 1842: Policy loss: -0.007578. Value loss: 0.081429. Entropy: 1.745091.
Training network. lr: 0.000236. clip: 0.094480
Iteration 1843: Policy loss: 0.001443. Value loss: 0.181296. Entropy: 1.723425.
Iteration 1844: Policy loss: -0.004769. Value loss: 0.131609. Entropy: 1.725908.
Iteration 1845: Policy loss: -0.005370. Value loss: 0.124085. Entropy: 1.697424.
episode: 710   score: 630.0  epsilon: 1.0    steps: 472  evaluation reward: 780.9
Training network. lr: 0.000236. clip: 0.094480
Iteration 1846: Policy loss: 0.005224. Value loss: 0.222062. Entropy: 1.731980.
Iteration 1847: Policy loss: 0.001663. Value loss

Iteration 1903: Policy loss: 0.003281. Value loss: 0.268999. Entropy: 1.718448.
Iteration 1904: Policy loss: -0.001332. Value loss: 0.176061. Entropy: 1.705064.
Iteration 1905: Policy loss: -0.003714. Value loss: 0.137832. Entropy: 1.696846.
episode: 734   score: 690.0  epsilon: 1.0    steps: 208  evaluation reward: 769.9
Training network. lr: 0.000235. clip: 0.094166
Iteration 1906: Policy loss: 0.005060. Value loss: 0.296037. Entropy: 1.626074.
Iteration 1907: Policy loss: -0.001001. Value loss: 0.209664. Entropy: 1.624323.
Iteration 1908: Policy loss: -0.002078. Value loss: 0.182105. Entropy: 1.616326.
Training network. lr: 0.000235. clip: 0.094166
Iteration 1909: Policy loss: 0.001813. Value loss: 0.321041. Entropy: 1.659187.
Iteration 1910: Policy loss: -0.003064. Value loss: 0.219871. Entropy: 1.661375.
Iteration 1911: Policy loss: -0.007285. Value loss: 0.182796. Entropy: 1.649847.
episode: 735   score: 940.0  epsilon: 1.0    steps: 128  evaluation reward: 775.7
Training network

Iteration 1969: Policy loss: 0.001963. Value loss: 0.264000. Entropy: 1.661632.
Iteration 1970: Policy loss: -0.000108. Value loss: 0.177044. Entropy: 1.668832.
Iteration 1971: Policy loss: -0.006240. Value loss: 0.144942. Entropy: 1.668475.
episode: 756   score: 410.0  epsilon: 1.0    steps: 1024  evaluation reward: 800.5
Training network. lr: 0.000235. clip: 0.094019
Iteration 1972: Policy loss: 0.001231. Value loss: 0.578438. Entropy: 1.733815.
Iteration 1973: Policy loss: 0.000325. Value loss: 0.419229. Entropy: 1.731290.
Iteration 1974: Policy loss: -0.007048. Value loss: 0.365322. Entropy: 1.739174.
episode: 757   score: 860.0  epsilon: 1.0    steps: 24  evaluation reward: 802.0
episode: 758   score: 300.0  epsilon: 1.0    steps: 392  evaluation reward: 797.0
Training network. lr: 0.000235. clip: 0.094019
Iteration 1975: Policy loss: -0.002397. Value loss: 0.273907. Entropy: 1.584912.
Iteration 1976: Policy loss: -0.004311. Value loss: 0.224809. Entropy: 1.589489.
Iteration 1977:

Iteration 2034: Policy loss: -0.006852. Value loss: 0.171593. Entropy: 1.658345.
Training network. lr: 0.000235. clip: 0.093862
Iteration 2035: Policy loss: 0.000230. Value loss: 0.467469. Entropy: 1.640239.
Iteration 2036: Policy loss: -0.005386. Value loss: 0.408792. Entropy: 1.649189.
Iteration 2037: Policy loss: -0.005368. Value loss: 0.352429. Entropy: 1.640291.
episode: 780   score: 770.0  epsilon: 1.0    steps: 720  evaluation reward: 798.8
episode: 781   score: 590.0  epsilon: 1.0    steps: 816  evaluation reward: 795.2
episode: 782   score: 850.0  epsilon: 1.0    steps: 872  evaluation reward: 800.0
episode: 783   score: 870.0  epsilon: 1.0    steps: 952  evaluation reward: 801.3
Training network. lr: 0.000235. clip: 0.093862
Iteration 2038: Policy loss: 0.001201. Value loss: 0.230184. Entropy: 1.695755.
Iteration 2039: Policy loss: -0.002639. Value loss: 0.146610. Entropy: 1.688143.
Iteration 2040: Policy loss: -0.004973. Value loss: 0.118846. Entropy: 1.675075.
episode: 784 

Iteration 2098: Policy loss: 0.003480. Value loss: 0.570372. Entropy: 1.722324.
Iteration 2099: Policy loss: -0.000826. Value loss: 0.469318. Entropy: 1.743985.
Iteration 2100: Policy loss: 0.002342. Value loss: 0.426806. Entropy: 1.745722.
episode: 804   score: 620.0  epsilon: 1.0    steps: 512  evaluation reward: 801.1
episode: 805   score: 590.0  epsilon: 1.0    steps: 704  evaluation reward: 800.3
episode: 806   score: 1480.0  epsilon: 1.0    steps: 800  evaluation reward: 809.2
Training network. lr: 0.000234. clip: 0.093558
Iteration 2101: Policy loss: 0.002509. Value loss: 0.094115. Entropy: 1.694209.
Iteration 2102: Policy loss: -0.001425. Value loss: 0.048698. Entropy: 1.682930.
Iteration 2103: Policy loss: -0.009122. Value loss: 0.040332. Entropy: 1.702990.
Training network. lr: 0.000234. clip: 0.093558
Iteration 2104: Policy loss: 0.002829. Value loss: 0.183998. Entropy: 1.593506.
Iteration 2105: Policy loss: -0.004711. Value loss: 0.131102. Entropy: 1.601770.
Iteration 2106:

Iteration 2163: Policy loss: -0.007458. Value loss: 0.195180. Entropy: 1.783303.
episode: 828   score: 490.0  epsilon: 1.0    steps: 80  evaluation reward: 795.4
episode: 829   score: 420.0  epsilon: 1.0    steps: 160  evaluation reward: 779.9
episode: 830   score: 220.0  epsilon: 1.0    steps: 608  evaluation reward: 777.8
episode: 831   score: 1370.0  epsilon: 1.0    steps: 752  evaluation reward: 786.6
episode: 832   score: 730.0  epsilon: 1.0    steps: 864  evaluation reward: 781.8
Training network. lr: 0.000234. clip: 0.093401
Iteration 2164: Policy loss: 0.003731. Value loss: 0.242692. Entropy: 1.620170.
Iteration 2165: Policy loss: -0.001486. Value loss: 0.182829. Entropy: 1.643940.
Iteration 2166: Policy loss: -0.009775. Value loss: 0.171978. Entropy: 1.623070.
Training network. lr: 0.000234. clip: 0.093401
Iteration 2167: Policy loss: 0.007927. Value loss: 0.228146. Entropy: 1.554394.
Iteration 2168: Policy loss: 0.004137. Value loss: 0.178540. Entropy: 1.582339.
Iteration 216

episode: 853   score: 670.0  epsilon: 1.0    steps: 544  evaluation reward: 779.6
Training network. lr: 0.000233. clip: 0.093245
Iteration 2227: Policy loss: -0.000986. Value loss: 0.136193. Entropy: 1.793186.
Iteration 2228: Policy loss: -0.007411. Value loss: 0.080585. Entropy: 1.783025.
Iteration 2229: Policy loss: -0.010875. Value loss: 0.065204. Entropy: 1.778237.
episode: 854   score: 500.0  epsilon: 1.0    steps: 960  evaluation reward: 775.4
Training network. lr: 0.000233. clip: 0.093245
Iteration 2230: Policy loss: -0.000955. Value loss: 0.530950. Entropy: 1.807559.
Iteration 2231: Policy loss: -0.003395. Value loss: 0.379734. Entropy: 1.819350.
Iteration 2232: Policy loss: -0.005582. Value loss: 0.347634. Entropy: 1.822242.
episode: 855   score: 890.0  epsilon: 1.0    steps: 144  evaluation reward: 779.0
Training network. lr: 0.000233. clip: 0.093245
Iteration 2233: Policy loss: 0.009966. Value loss: 0.276337. Entropy: 1.712799.
Iteration 2234: Policy loss: 0.002618. Value lo

Iteration 2292: Policy loss: -0.010882. Value loss: 0.096710. Entropy: 1.850199.
episode: 877   score: 660.0  epsilon: 1.0    steps: 200  evaluation reward: 836.2
Training network. lr: 0.000233. clip: 0.093097
Iteration 2293: Policy loss: 0.000185. Value loss: 0.159924. Entropy: 1.876449.
Iteration 2294: Policy loss: -0.004696. Value loss: 0.088149. Entropy: 1.860356.
Iteration 2295: Policy loss: -0.010705. Value loss: 0.074260. Entropy: 1.879018.
episode: 878   score: 500.0  epsilon: 1.0    steps: 512  evaluation reward: 834.8
Training network. lr: 0.000233. clip: 0.093097
Iteration 2296: Policy loss: 0.005196. Value loss: 0.263441. Entropy: 1.873771.
Iteration 2297: Policy loss: -0.001992. Value loss: 0.193550. Entropy: 1.892555.
Iteration 2298: Policy loss: -0.003826. Value loss: 0.175520. Entropy: 1.875106.
episode: 879   score: 650.0  epsilon: 1.0    steps: 616  evaluation reward: 835.7
Training network. lr: 0.000233. clip: 0.093097
Iteration 2299: Policy loss: -0.000774. Value lo

Iteration 2357: Policy loss: -0.001859. Value loss: 0.176024. Entropy: 1.795685.
Iteration 2358: Policy loss: -0.004674. Value loss: 0.118558. Entropy: 1.793040.
now time :  2019-03-06 03:38:06.185213
episode: 901   score: 870.0  epsilon: 1.0    steps: 672  evaluation reward: 857.0
episode: 902   score: 410.0  epsilon: 1.0    steps: 720  evaluation reward: 852.3
Training network. lr: 0.000232. clip: 0.092784
Iteration 2359: Policy loss: 0.005078. Value loss: 0.309765. Entropy: 1.884933.
Iteration 2360: Policy loss: 0.005760. Value loss: 0.193312. Entropy: 1.895008.
Iteration 2361: Policy loss: -0.000636. Value loss: 0.141093. Entropy: 1.902116.
episode: 903   score: 760.0  epsilon: 1.0    steps: 48  evaluation reward: 850.2
Training network. lr: 0.000232. clip: 0.092784
Iteration 2362: Policy loss: 0.003461. Value loss: 0.381326. Entropy: 1.870072.
Iteration 2363: Policy loss: 0.005960. Value loss: 0.266418. Entropy: 1.861313.
Iteration 2364: Policy loss: -0.003714. Value loss: 0.20577

Iteration 2420: Policy loss: -0.000695. Value loss: 0.192438. Entropy: 1.838659.
Iteration 2421: Policy loss: -0.008532. Value loss: 0.170706. Entropy: 1.841287.
Training network. lr: 0.000232. clip: 0.092636
Iteration 2422: Policy loss: 0.006966. Value loss: 0.444180. Entropy: 1.761766.
Iteration 2423: Policy loss: -0.000300. Value loss: 0.339213. Entropy: 1.763658.
Iteration 2424: Policy loss: -0.005827. Value loss: 0.308583. Entropy: 1.762090.
Training network. lr: 0.000232. clip: 0.092636
Iteration 2425: Policy loss: 0.006018. Value loss: 0.553572. Entropy: 1.830928.
Iteration 2426: Policy loss: -0.003377. Value loss: 0.380786. Entropy: 1.809188.
Iteration 2427: Policy loss: -0.005011. Value loss: 0.314017. Entropy: 1.801602.
Training network. lr: 0.000232. clip: 0.092636
Iteration 2428: Policy loss: 0.006966. Value loss: 0.287456. Entropy: 1.933337.
Iteration 2429: Policy loss: -0.001638. Value loss: 0.238563. Entropy: 1.918662.
Iteration 2430: Policy loss: -0.002252. Value loss: 

Iteration 2486: Policy loss: -0.005652. Value loss: 0.145744. Entropy: 1.854391.
Iteration 2487: Policy loss: -0.008095. Value loss: 0.133932. Entropy: 1.850815.
Training network. lr: 0.000231. clip: 0.092480
Iteration 2488: Policy loss: -0.002416. Value loss: 0.351477. Entropy: 1.819108.
Iteration 2489: Policy loss: -0.005311. Value loss: 0.229753. Entropy: 1.799266.
Iteration 2490: Policy loss: -0.007225. Value loss: 0.176028. Entropy: 1.789436.
Training network. lr: 0.000231. clip: 0.092480
Iteration 2491: Policy loss: 0.002200. Value loss: 0.522838. Entropy: 1.868909.
Iteration 2492: Policy loss: -0.002432. Value loss: 0.487069. Entropy: 1.882549.
Iteration 2493: Policy loss: -0.003273. Value loss: 0.464050. Entropy: 1.867885.
episode: 950   score: 590.0  epsilon: 1.0    steps: 584  evaluation reward: 836.8
now time :  2019-03-06 03:39:54.799824
episode: 951   score: 940.0  epsilon: 1.0    steps: 920  evaluation reward: 839.0
episode: 952   score: 880.0  epsilon: 1.0    steps: 936 

Iteration 2551: Policy loss: 0.003298. Value loss: 0.248290. Entropy: 1.754714.
Iteration 2552: Policy loss: -0.004468. Value loss: 0.194058. Entropy: 1.763072.
Iteration 2553: Policy loss: -0.005996. Value loss: 0.158100. Entropy: 1.757843.
episode: 973   score: 690.0  epsilon: 1.0    steps: 664  evaluation reward: 862.9
Training network. lr: 0.000230. clip: 0.092176
Iteration 2554: Policy loss: 0.001430. Value loss: 0.131777. Entropy: 1.766649.
Iteration 2555: Policy loss: -0.006968. Value loss: 0.087823. Entropy: 1.750157.
Iteration 2556: Policy loss: -0.009394. Value loss: 0.074783. Entropy: 1.759408.
Training network. lr: 0.000230. clip: 0.092176
Iteration 2557: Policy loss: 0.004906. Value loss: 0.141516. Entropy: 1.710205.
Iteration 2558: Policy loss: -0.004323. Value loss: 0.088256. Entropy: 1.725902.
Iteration 2559: Policy loss: -0.009537. Value loss: 0.074914. Entropy: 1.719557.
episode: 974   score: 1530.0  epsilon: 1.0    steps: 280  evaluation reward: 871.5
Training networ

Training network. lr: 0.000230. clip: 0.092019
Iteration 2617: Policy loss: 0.001624. Value loss: 0.183248. Entropy: 1.743588.
Iteration 2618: Policy loss: -0.004914. Value loss: 0.103409. Entropy: 1.733634.
Iteration 2619: Policy loss: -0.009952. Value loss: 0.075483. Entropy: 1.726962.
Training network. lr: 0.000230. clip: 0.092019
Iteration 2620: Policy loss: 0.005027. Value loss: 0.216385. Entropy: 1.761211.
Iteration 2621: Policy loss: -0.003884. Value loss: 0.160013. Entropy: 1.776105.
Iteration 2622: Policy loss: -0.010071. Value loss: 0.134195. Entropy: 1.772761.
Training network. lr: 0.000230. clip: 0.092019
Iteration 2623: Policy loss: -0.002920. Value loss: 0.139576. Entropy: 1.810680.
Iteration 2624: Policy loss: -0.007916. Value loss: 0.073240. Entropy: 1.785861.
Iteration 2625: Policy loss: -0.013575. Value loss: 0.057687. Entropy: 1.778895.
episode: 996   score: 610.0  epsilon: 1.0    steps: 544  evaluation reward: 856.6
Training network. lr: 0.000230. clip: 0.092019
Ite

episode: 1018   score: 930.0  epsilon: 1.0    steps: 360  evaluation reward: 878.7
episode: 1019   score: 760.0  epsilon: 1.0    steps: 456  evaluation reward: 879.5
Training network. lr: 0.000230. clip: 0.091862
Iteration 2683: Policy loss: 0.001660. Value loss: 0.447303. Entropy: 1.609740.
Iteration 2684: Policy loss: 0.002382. Value loss: 0.342521. Entropy: 1.591013.
Iteration 2685: Policy loss: 0.000496. Value loss: 0.304434. Entropy: 1.594317.
Training network. lr: 0.000230. clip: 0.091862
Iteration 2686: Policy loss: 0.005567. Value loss: 0.476777. Entropy: 1.543116.
Iteration 2687: Policy loss: 0.003598. Value loss: 0.265844. Entropy: 1.516845.
Iteration 2688: Policy loss: 0.000636. Value loss: 0.209230. Entropy: 1.489714.
episode: 1020   score: 550.0  epsilon: 1.0    steps: 872  evaluation reward: 875.9
Training network. lr: 0.000230. clip: 0.091862
Iteration 2689: Policy loss: 0.004721. Value loss: 0.162484. Entropy: 1.533757.
Iteration 2690: Policy loss: -0.002860. Value loss

Iteration 2747: Policy loss: 0.001540. Value loss: 0.204202. Entropy: 1.595637.
Iteration 2748: Policy loss: -0.004605. Value loss: 0.187950. Entropy: 1.589331.
episode: 1042   score: 930.0  epsilon: 1.0    steps: 952  evaluation reward: 890.2
Training network. lr: 0.000229. clip: 0.091715
Iteration 2749: Policy loss: -0.000003. Value loss: 0.231509. Entropy: 1.522828.
Iteration 2750: Policy loss: 0.000522. Value loss: 0.164580. Entropy: 1.533746.
Iteration 2751: Policy loss: -0.004601. Value loss: 0.142935. Entropy: 1.507972.
Training network. lr: 0.000229. clip: 0.091558
Iteration 2752: Policy loss: 0.005737. Value loss: 0.112529. Entropy: 1.613097.
Iteration 2753: Policy loss: -0.002637. Value loss: 0.090081. Entropy: 1.619816.
Iteration 2754: Policy loss: -0.004204. Value loss: 0.079017. Entropy: 1.607961.
Training network. lr: 0.000229. clip: 0.091558
Iteration 2755: Policy loss: -0.002028. Value loss: 0.510209. Entropy: 1.574546.
Iteration 2756: Policy loss: -0.007353. Value loss

Iteration 2812: Policy loss: 0.003007. Value loss: 0.684192. Entropy: 1.472757.
Iteration 2813: Policy loss: -0.002177. Value loss: 0.549028. Entropy: 1.493723.
Iteration 2814: Policy loss: -0.002331. Value loss: 0.464787. Entropy: 1.477711.
episode: 1065   score: 830.0  epsilon: 1.0    steps: 608  evaluation reward: 817.1
Training network. lr: 0.000229. clip: 0.091401
Iteration 2815: Policy loss: 0.003970. Value loss: 0.464274. Entropy: 1.581512.
Iteration 2816: Policy loss: -0.001094. Value loss: 0.358188. Entropy: 1.592316.
Iteration 2817: Policy loss: -0.004027. Value loss: 0.307825. Entropy: 1.581061.
episode: 1066   score: 1800.0  epsilon: 1.0    steps: 160  evaluation reward: 831.2
Training network. lr: 0.000229. clip: 0.091401
Iteration 2818: Policy loss: 0.000812. Value loss: 0.280110. Entropy: 1.689947.
Iteration 2819: Policy loss: -0.007879. Value loss: 0.185160. Entropy: 1.676849.
Iteration 2820: Policy loss: -0.011445. Value loss: 0.154904. Entropy: 1.676064.
Training netw

episode: 1088   score: 930.0  epsilon: 1.0    steps: 408  evaluation reward: 845.9
episode: 1089   score: 720.0  epsilon: 1.0    steps: 632  evaluation reward: 845.7
Training network. lr: 0.000228. clip: 0.091254
Iteration 2878: Policy loss: -0.001644. Value loss: 0.233709. Entropy: 1.667551.
Iteration 2879: Policy loss: -0.004588. Value loss: 0.162027. Entropy: 1.658080.
Iteration 2880: Policy loss: -0.009238. Value loss: 0.153207. Entropy: 1.678158.
Training network. lr: 0.000228. clip: 0.091254
Iteration 2881: Policy loss: 0.001261. Value loss: 0.278136. Entropy: 1.637695.
Iteration 2882: Policy loss: -0.000621. Value loss: 0.160326. Entropy: 1.632141.
Iteration 2883: Policy loss: -0.007843. Value loss: 0.126386. Entropy: 1.616951.
Training network. lr: 0.000228. clip: 0.091254
Iteration 2884: Policy loss: 0.001717. Value loss: 0.606865. Entropy: 1.593060.
Iteration 2885: Policy loss: -0.003477. Value loss: 0.426424. Entropy: 1.567403.
Iteration 2886: Policy loss: -0.006196. Value l

Iteration 2943: Policy loss: -0.011688. Value loss: 0.069040. Entropy: 1.469100.
episode: 1111   score: 2210.0  epsilon: 1.0    steps: 680  evaluation reward: 857.2
Training network. lr: 0.000228. clip: 0.091097
Iteration 2944: Policy loss: 0.001815. Value loss: 0.314471. Entropy: 1.528143.
Iteration 2945: Policy loss: -0.003009. Value loss: 0.197116. Entropy: 1.517875.
Iteration 2946: Policy loss: -0.009214. Value loss: 0.182799. Entropy: 1.530964.
episode: 1112   score: 680.0  epsilon: 1.0    steps: 824  evaluation reward: 856.7
Training network. lr: 0.000228. clip: 0.091097
Iteration 2947: Policy loss: 0.006792. Value loss: 1.326037. Entropy: 1.555887.
Iteration 2948: Policy loss: 0.006382. Value loss: 1.133452. Entropy: 1.551652.
Iteration 2949: Policy loss: 0.010130. Value loss: 0.999561. Entropy: 1.530097.
Training network. lr: 0.000228. clip: 0.091097
Iteration 2950: Policy loss: 0.003741. Value loss: 0.224252. Entropy: 1.515097.
Iteration 2951: Policy loss: -0.000683. Value los

Iteration 3008: Policy loss: -0.006688. Value loss: 0.098004. Entropy: 1.626533.
Iteration 3009: Policy loss: -0.007727. Value loss: 0.090591. Entropy: 1.619550.
Training network. lr: 0.000227. clip: 0.090793
Iteration 3010: Policy loss: 0.004539. Value loss: 0.666318. Entropy: 1.576712.
Iteration 3011: Policy loss: 0.001683. Value loss: 0.514844. Entropy: 1.572140.
Iteration 3012: Policy loss: 0.000809. Value loss: 0.446614. Entropy: 1.553523.
episode: 1135   score: 970.0  epsilon: 1.0    steps: 728  evaluation reward: 862.0
episode: 1136   score: 720.0  epsilon: 1.0    steps: 1016  evaluation reward: 862.8
Training network. lr: 0.000227. clip: 0.090793
Iteration 3013: Policy loss: 0.005644. Value loss: 0.126431. Entropy: 1.572792.
Iteration 3014: Policy loss: -0.004120. Value loss: 0.064420. Entropy: 1.566433.
Iteration 3015: Policy loss: -0.006702. Value loss: 0.049547. Entropy: 1.577615.
Training network. lr: 0.000227. clip: 0.090793
Iteration 3016: Policy loss: 0.003097. Value los

Iteration 3075: Policy loss: -0.004168. Value loss: 0.073153. Entropy: 1.617533.
Training network. lr: 0.000227. clip: 0.090637
Iteration 3076: Policy loss: 0.001737. Value loss: 0.114913. Entropy: 1.686270.
Iteration 3077: Policy loss: -0.005898. Value loss: 0.067757. Entropy: 1.695707.
Iteration 3078: Policy loss: -0.012506. Value loss: 0.058205. Entropy: 1.700258.
episode: 1156   score: 1010.0  epsilon: 1.0    steps: 248  evaluation reward: 909.2
episode: 1157   score: 430.0  epsilon: 1.0    steps: 656  evaluation reward: 907.7
Training network. lr: 0.000227. clip: 0.090637
Iteration 3079: Policy loss: 0.003336. Value loss: 0.179518. Entropy: 1.615358.
Iteration 3080: Policy loss: -0.001219. Value loss: 0.117154. Entropy: 1.614934.
Iteration 3081: Policy loss: -0.002942. Value loss: 0.101939. Entropy: 1.606225.
episode: 1158   score: 2200.0  epsilon: 1.0    steps: 488  evaluation reward: 921.7
episode: 1159   score: 960.0  epsilon: 1.0    steps: 504  evaluation reward: 924.4
Trainin

Iteration 3141: Policy loss: -0.001265. Value loss: 0.100990. Entropy: 1.610680.
episode: 1179   score: 1100.0  epsilon: 1.0    steps: 288  evaluation reward: 909.2
episode: 1180   score: 640.0  epsilon: 1.0    steps: 336  evaluation reward: 909.0
episode: 1181   score: 2110.0  epsilon: 1.0    steps: 672  evaluation reward: 924.4
Training network. lr: 0.000226. clip: 0.090480
Iteration 3142: Policy loss: 0.011162. Value loss: 0.330794. Entropy: 1.600792.
Iteration 3143: Policy loss: -0.008828. Value loss: 0.204944. Entropy: 1.623579.
Iteration 3144: Policy loss: -0.014557. Value loss: 0.156602. Entropy: 1.612951.
Training network. lr: 0.000226. clip: 0.090480
Iteration 3145: Policy loss: 0.013042. Value loss: 0.702176. Entropy: 1.646272.
Iteration 3146: Policy loss: -0.001309. Value loss: 0.418583. Entropy: 1.664419.
Iteration 3147: Policy loss: 0.000260. Value loss: 0.339525. Entropy: 1.654562.
Training network. lr: 0.000226. clip: 0.090480
Iteration 3148: Policy loss: 0.008418. Value

Iteration 3206: Policy loss: -0.002209. Value loss: 0.110889. Entropy: 1.853362.
Iteration 3207: Policy loss: -0.006143. Value loss: 0.088683. Entropy: 1.851820.
Training network. lr: 0.000225. clip: 0.090176
Iteration 3208: Policy loss: 0.002861. Value loss: 0.248233. Entropy: 1.817830.
Iteration 3209: Policy loss: -0.002718. Value loss: 0.140571. Entropy: 1.820706.
Iteration 3210: Policy loss: -0.004034. Value loss: 0.104586. Entropy: 1.809476.
episode: 1202   score: 620.0  epsilon: 1.0    steps: 720  evaluation reward: 915.2
episode: 1203   score: 1310.0  epsilon: 1.0    steps: 1024  evaluation reward: 914.1
Training network. lr: 0.000225. clip: 0.090176
Iteration 3211: Policy loss: 0.007450. Value loss: 0.217202. Entropy: 1.776380.
Iteration 3212: Policy loss: 0.003857. Value loss: 0.120604. Entropy: 1.774107.
Iteration 3213: Policy loss: -0.002414. Value loss: 0.081592. Entropy: 1.765137.
Training network. lr: 0.000225. clip: 0.090176
Iteration 3214: Policy loss: 0.002193. Value l

Iteration 3272: Policy loss: -0.002425. Value loss: 0.207835. Entropy: 1.749535.
Iteration 3273: Policy loss: -0.009813. Value loss: 0.182565. Entropy: 1.771857.
episode: 1225   score: 520.0  epsilon: 1.0    steps: 160  evaluation reward: 907.8
Training network. lr: 0.000225. clip: 0.090019
Iteration 3274: Policy loss: 0.003214. Value loss: 0.299828. Entropy: 1.741592.
Iteration 3275: Policy loss: -0.004637. Value loss: 0.233483. Entropy: 1.720243.
Iteration 3276: Policy loss: -0.003176. Value loss: 0.212407. Entropy: 1.723856.
Training network. lr: 0.000225. clip: 0.090019
Iteration 3277: Policy loss: 0.004525. Value loss: 0.301753. Entropy: 1.735662.
Iteration 3278: Policy loss: -0.000660. Value loss: 0.225689. Entropy: 1.743807.
Iteration 3279: Policy loss: -0.004087. Value loss: 0.201143. Entropy: 1.732579.
Training network. lr: 0.000225. clip: 0.090019
Iteration 3280: Policy loss: 0.005080. Value loss: 0.127602. Entropy: 1.727459.
Iteration 3281: Policy loss: -0.005182. Value loss

Training network. lr: 0.000225. clip: 0.089872
Iteration 3340: Policy loss: 0.005144. Value loss: 0.780974. Entropy: 1.678206.
Iteration 3341: Policy loss: 0.007188. Value loss: 0.620321. Entropy: 1.660549.
Iteration 3342: Policy loss: 0.000574. Value loss: 0.570294. Entropy: 1.677428.
episode: 1246   score: 730.0  epsilon: 1.0    steps: 320  evaluation reward: 899.9
episode: 1247   score: 1100.0  epsilon: 1.0    steps: 544  evaluation reward: 898.7
Training network. lr: 0.000225. clip: 0.089872
Iteration 3343: Policy loss: 0.002171. Value loss: 0.112273. Entropy: 1.713089.
Iteration 3344: Policy loss: -0.000922. Value loss: 0.056384. Entropy: 1.718853.
Iteration 3345: Policy loss: -0.007881. Value loss: 0.044625. Entropy: 1.719256.
episode: 1248   score: 1290.0  epsilon: 1.0    steps: 88  evaluation reward: 902.2
episode: 1249   score: 2460.0  epsilon: 1.0    steps: 672  evaluation reward: 919.9
episode: 1250   score: 2510.0  epsilon: 1.0    steps: 712  evaluation reward: 929.2
Traini

episode: 1268   score: 590.0  epsilon: 1.0    steps: 304  evaluation reward: 912.7
Training network. lr: 0.000224. clip: 0.089558
Iteration 3406: Policy loss: 0.005472. Value loss: 0.242814. Entropy: 1.615502.
Iteration 3407: Policy loss: 0.000003. Value loss: 0.144301. Entropy: 1.625380.
Iteration 3408: Policy loss: -0.001452. Value loss: 0.116028. Entropy: 1.621626.
episode: 1269   score: 1580.0  epsilon: 1.0    steps: 592  evaluation reward: 917.7
Training network. lr: 0.000224. clip: 0.089558
Iteration 3409: Policy loss: 0.005043. Value loss: 0.356199. Entropy: 1.764833.
Iteration 3410: Policy loss: -0.001056. Value loss: 0.208071. Entropy: 1.752658.
Iteration 3411: Policy loss: -0.001923. Value loss: 0.147779. Entropy: 1.743957.
episode: 1270   score: 1140.0  epsilon: 1.0    steps: 216  evaluation reward: 922.7
episode: 1271   score: 1670.0  epsilon: 1.0    steps: 416  evaluation reward: 931.7
Training network. lr: 0.000224. clip: 0.089558
Iteration 3412: Policy loss: 0.005452. Va

episode: 1291   score: 300.0  epsilon: 1.0    steps: 1016  evaluation reward: 928.4
Training network. lr: 0.000224. clip: 0.089411
Iteration 3472: Policy loss: 0.007009. Value loss: 0.350473. Entropy: 1.696222.
Iteration 3473: Policy loss: -0.004321. Value loss: 0.190784. Entropy: 1.697159.
Iteration 3474: Policy loss: -0.009551. Value loss: 0.140804. Entropy: 1.691627.
episode: 1292   score: 960.0  epsilon: 1.0    steps: 520  evaluation reward: 926.0
episode: 1293   score: 1360.0  epsilon: 1.0    steps: 768  evaluation reward: 934.2
Training network. lr: 0.000224. clip: 0.089411
Iteration 3475: Policy loss: 0.003627. Value loss: 0.754649. Entropy: 1.703555.
Iteration 3476: Policy loss: -0.002068. Value loss: 0.549976. Entropy: 1.692461.
Iteration 3477: Policy loss: -0.000104. Value loss: 0.506055. Entropy: 1.682486.
episode: 1294   score: 930.0  epsilon: 1.0    steps: 512  evaluation reward: 931.6
Training network. lr: 0.000224. clip: 0.089411
Iteration 3478: Policy loss: 0.001508. Va

Training network. lr: 0.000223. clip: 0.089254
Iteration 3538: Policy loss: 0.004668. Value loss: 0.287293. Entropy: 1.743502.
Iteration 3539: Policy loss: -0.003008. Value loss: 0.169120. Entropy: 1.741619.
Iteration 3540: Policy loss: -0.011925. Value loss: 0.132721. Entropy: 1.744344.
episode: 1313   score: 1020.0  epsilon: 1.0    steps: 248  evaluation reward: 966.4
Training network. lr: 0.000223. clip: 0.089254
Iteration 3541: Policy loss: 0.002567. Value loss: 0.279374. Entropy: 1.811495.
Iteration 3542: Policy loss: -0.006545. Value loss: 0.204896. Entropy: 1.814148.
Iteration 3543: Policy loss: -0.012329. Value loss: 0.175256. Entropy: 1.811720.
episode: 1314   score: 770.0  epsilon: 1.0    steps: 264  evaluation reward: 970.5
Training network. lr: 0.000223. clip: 0.089254
Iteration 3544: Policy loss: 0.005434. Value loss: 0.425736. Entropy: 1.774794.
Iteration 3545: Policy loss: 0.002922. Value loss: 0.332899. Entropy: 1.789195.
Iteration 3546: Policy loss: -0.004354. Value lo

Training network. lr: 0.000222. clip: 0.088950
Iteration 3604: Policy loss: 0.004194. Value loss: 0.541502. Entropy: 1.840047.
Iteration 3605: Policy loss: -0.008668. Value loss: 0.410970. Entropy: 1.841386.
Iteration 3606: Policy loss: -0.008883. Value loss: 0.307337. Entropy: 1.831518.
episode: 1336   score: 880.0  epsilon: 1.0    steps: 904  evaluation reward: 1012.0
Training network. lr: 0.000222. clip: 0.088950
Iteration 3607: Policy loss: 0.002905. Value loss: 0.277906. Entropy: 1.825906.
Iteration 3608: Policy loss: -0.000825. Value loss: 0.165432. Entropy: 1.824965.
Iteration 3609: Policy loss: -0.002570. Value loss: 0.104632. Entropy: 1.821859.
Training network. lr: 0.000222. clip: 0.088950
Iteration 3610: Policy loss: 0.005096. Value loss: 0.211324. Entropy: 1.841641.
Iteration 3611: Policy loss: -0.000027. Value loss: 0.128910. Entropy: 1.837978.
Iteration 3612: Policy loss: -0.002060. Value loss: 0.114552. Entropy: 1.828803.
episode: 1337   score: 530.0  epsilon: 1.0    ste

episode: 1358   score: 1240.0  epsilon: 1.0    steps: 216  evaluation reward: 977.5
episode: 1359   score: 1460.0  epsilon: 1.0    steps: 328  evaluation reward: 980.2
Training network. lr: 0.000222. clip: 0.088793
Iteration 3670: Policy loss: 0.003828. Value loss: 0.301429. Entropy: 1.497672.
Iteration 3671: Policy loss: 0.002164. Value loss: 0.206444. Entropy: 1.504446.
Iteration 3672: Policy loss: -0.002096. Value loss: 0.187830. Entropy: 1.507997.
Training network. lr: 0.000222. clip: 0.088793
Iteration 3673: Policy loss: 0.004940. Value loss: 0.255597. Entropy: 1.683691.
Iteration 3674: Policy loss: -0.007139. Value loss: 0.134825. Entropy: 1.685167.
Iteration 3675: Policy loss: -0.011388. Value loss: 0.116166. Entropy: 1.677866.
Training network. lr: 0.000222. clip: 0.088793
Iteration 3676: Policy loss: 0.002545. Value loss: 0.245254. Entropy: 1.825943.
Iteration 3677: Policy loss: -0.002022. Value loss: 0.138339. Entropy: 1.824953.
Iteration 3678: Policy loss: -0.003658. Value l

Training network. lr: 0.000222. clip: 0.088637
Iteration 3736: Policy loss: -0.001069. Value loss: 0.239446. Entropy: 1.701523.
Iteration 3737: Policy loss: -0.004611. Value loss: 0.168424. Entropy: 1.713657.
Iteration 3738: Policy loss: -0.011907. Value loss: 0.135714. Entropy: 1.704325.
episode: 1381   score: 810.0  epsilon: 1.0    steps: 296  evaluation reward: 979.4
Training network. lr: 0.000222. clip: 0.088637
Iteration 3739: Policy loss: 0.000855. Value loss: 0.235567. Entropy: 1.718670.
Iteration 3740: Policy loss: -0.003053. Value loss: 0.155813. Entropy: 1.716144.
Iteration 3741: Policy loss: -0.005208. Value loss: 0.122455. Entropy: 1.703983.
episode: 1382   score: 960.0  epsilon: 1.0    steps: 176  evaluation reward: 976.9
Training network. lr: 0.000222. clip: 0.088637
Iteration 3742: Policy loss: 0.003844. Value loss: 0.186789. Entropy: 1.704058.
Iteration 3743: Policy loss: -0.000561. Value loss: 0.119422. Entropy: 1.688323.
Iteration 3744: Policy loss: -0.006740. Value l

episode: 1403   score: 980.0  epsilon: 1.0    steps: 848  evaluation reward: 951.8
Training network. lr: 0.000221. clip: 0.088333
Iteration 3802: Policy loss: 0.002182. Value loss: 0.174547. Entropy: 1.155011.
Iteration 3803: Policy loss: -0.002555. Value loss: 0.118466. Entropy: 1.138261.
Iteration 3804: Policy loss: -0.005845. Value loss: 0.099310. Entropy: 1.116593.
episode: 1404   score: 1470.0  epsilon: 1.0    steps: 120  evaluation reward: 956.2
episode: 1405   score: 1520.0  epsilon: 1.0    steps: 352  evaluation reward: 961.5
episode: 1406   score: 910.0  epsilon: 1.0    steps: 744  evaluation reward: 961.9
Training network. lr: 0.000221. clip: 0.088333
Iteration 3805: Policy loss: 0.001835. Value loss: 0.422102. Entropy: 1.225376.
Iteration 3806: Policy loss: -0.001835. Value loss: 0.354894. Entropy: 1.198307.
Iteration 3807: Policy loss: -0.002025. Value loss: 0.334312. Entropy: 1.187475.
Training network. lr: 0.000221. clip: 0.088333
Iteration 3808: Policy loss: 0.003213. Va

Iteration 3867: Policy loss: -0.001868. Value loss: 0.165489. Entropy: 1.146308.
Training network. lr: 0.000220. clip: 0.088176
Iteration 3868: Policy loss: 0.001269. Value loss: 0.477430. Entropy: 1.378215.
Iteration 3869: Policy loss: 0.000998. Value loss: 0.396117. Entropy: 1.362935.
Iteration 3870: Policy loss: -0.000598. Value loss: 0.365239. Entropy: 1.352767.
Training network. lr: 0.000220. clip: 0.088176
Iteration 3871: Policy loss: 0.001156. Value loss: 0.335163. Entropy: 1.419155.
Iteration 3872: Policy loss: -0.004724. Value loss: 0.210653. Entropy: 1.431794.
Iteration 3873: Policy loss: -0.008378. Value loss: 0.166900. Entropy: 1.424095.
episode: 1427   score: 1520.0  epsilon: 1.0    steps: 216  evaluation reward: 967.0
episode: 1428   score: 770.0  epsilon: 1.0    steps: 448  evaluation reward: 966.5
episode: 1429   score: 1300.0  epsilon: 1.0    steps: 584  evaluation reward: 968.5
episode: 1430   score: 1150.0  epsilon: 1.0    steps: 608  evaluation reward: 971.8
episode

Iteration 3933: Policy loss: -0.008421. Value loss: 0.148747. Entropy: 1.701492.
episode: 1450   score: 1500.0  epsilon: 1.0    steps: 512  evaluation reward: 962.0
Training network. lr: 0.000220. clip: 0.088028
Iteration 3934: Policy loss: 0.004866. Value loss: 0.220255. Entropy: 1.693096.
Iteration 3935: Policy loss: -0.003801. Value loss: 0.156456. Entropy: 1.702971.
Iteration 3936: Policy loss: -0.004789. Value loss: 0.129256. Entropy: 1.688034.
now time :  2019-03-06 03:59:10.683894
episode: 1451   score: 540.0  epsilon: 1.0    steps: 672  evaluation reward: 963.2
Training network. lr: 0.000220. clip: 0.088028
Iteration 3937: Policy loss: 0.003475. Value loss: 0.332413. Entropy: 1.754195.
Iteration 3938: Policy loss: -0.001898. Value loss: 0.257616. Entropy: 1.744784.
Iteration 3939: Policy loss: -0.003840. Value loss: 0.225266. Entropy: 1.744524.
episode: 1452   score: 880.0  epsilon: 1.0    steps: 784  evaluation reward: 964.5
Training network. lr: 0.000220. clip: 0.088028
Itera

episode: 1475   score: 990.0  epsilon: 1.0    steps: 672  evaluation reward: 932.6
episode: 1476   score: 490.0  epsilon: 1.0    steps: 720  evaluation reward: 929.8
Training network. lr: 0.000220. clip: 0.087872
Iteration 3997: Policy loss: 0.001661. Value loss: 0.296714. Entropy: 1.414396.
Iteration 3998: Policy loss: 0.003820. Value loss: 0.203944. Entropy: 1.419585.
Iteration 3999: Policy loss: -0.003280. Value loss: 0.173269. Entropy: 1.421370.
Training network. lr: 0.000220. clip: 0.087872
Iteration 4000: Policy loss: 0.002609. Value loss: 0.151257. Entropy: 1.515735.
Iteration 4001: Policy loss: -0.001568. Value loss: 0.109306. Entropy: 1.513489.
Iteration 4002: Policy loss: -0.004557. Value loss: 0.088867. Entropy: 1.525572.
episode: 1477   score: 510.0  epsilon: 1.0    steps: 728  evaluation reward: 922.8
Training network. lr: 0.000219. clip: 0.087715
Iteration 4003: Policy loss: 0.005937. Value loss: 0.327041. Entropy: 1.661208.
Iteration 4004: Policy loss: 0.001260. Value lo

Iteration 4062: Policy loss: -0.008092. Value loss: 0.130306. Entropy: 1.272385.
Training network. lr: 0.000219. clip: 0.087568
Iteration 4063: Policy loss: 0.003185. Value loss: 0.249555. Entropy: 1.470628.
Iteration 4064: Policy loss: -0.002576. Value loss: 0.213555. Entropy: 1.456848.
Iteration 4065: Policy loss: -0.005993. Value loss: 0.191092. Entropy: 1.457942.
episode: 1499   score: 1770.0  epsilon: 1.0    steps: 360  evaluation reward: 928.3
episode: 1500   score: 1140.0  epsilon: 1.0    steps: 536  evaluation reward: 931.7
now time :  2019-03-06 04:00:54.539420
episode: 1501   score: 740.0  epsilon: 1.0    steps: 920  evaluation reward: 930.8
Training network. lr: 0.000219. clip: 0.087568
Iteration 4066: Policy loss: 0.002834. Value loss: 0.125612. Entropy: 1.556858.
Iteration 4067: Policy loss: -0.008852. Value loss: 0.081327. Entropy: 1.575222.
Iteration 4068: Policy loss: -0.012194. Value loss: 0.067657. Entropy: 1.561779.
Training network. lr: 0.000219. clip: 0.087568
Iter

Iteration 4128: Policy loss: -0.004352. Value loss: 0.204833. Entropy: 1.439877.
episode: 1521   score: 1930.0  epsilon: 1.0    steps: 592  evaluation reward: 932.0
episode: 1522   score: 720.0  epsilon: 1.0    steps: 920  evaluation reward: 926.6
Training network. lr: 0.000219. clip: 0.087411
Iteration 4129: Policy loss: 0.009241. Value loss: 0.259824. Entropy: 1.438820.
Iteration 4130: Policy loss: 0.001017. Value loss: 0.180342. Entropy: 1.438374.
Iteration 4131: Policy loss: -0.000686. Value loss: 0.160314. Entropy: 1.454251.
Training network. lr: 0.000219. clip: 0.087411
Iteration 4132: Policy loss: 0.004157. Value loss: 0.509119. Entropy: 1.547603.
Iteration 4133: Policy loss: -0.000046. Value loss: 0.409032. Entropy: 1.545130.
Iteration 4134: Policy loss: -0.004112. Value loss: 0.357387. Entropy: 1.557759.
Training network. lr: 0.000219. clip: 0.087411
Iteration 4135: Policy loss: 0.000341. Value loss: 0.302991. Entropy: 1.558441.
Iteration 4136: Policy loss: 0.003234. Value los

Iteration 4194: Policy loss: -0.000586. Value loss: 0.111208. Entropy: 1.714570.
episode: 1544   score: 1260.0  epsilon: 1.0    steps: 432  evaluation reward: 926.1
episode: 1545   score: 1060.0  epsilon: 1.0    steps: 848  evaluation reward: 931.3
Training network. lr: 0.000218. clip: 0.087254
Iteration 4195: Policy loss: 0.003233. Value loss: 0.193661. Entropy: 1.592942.
Iteration 4196: Policy loss: -0.000879. Value loss: 0.164714. Entropy: 1.582428.
Iteration 4197: Policy loss: -0.000179. Value loss: 0.149618. Entropy: 1.564798.
episode: 1546   score: 4490.0  epsilon: 1.0    steps: 72  evaluation reward: 969.8
Training network. lr: 0.000218. clip: 0.087254
Iteration 4198: Policy loss: 0.008504. Value loss: 0.306586. Entropy: 1.579205.
Iteration 4199: Policy loss: 0.003942. Value loss: 0.229472. Entropy: 1.575886.
Iteration 4200: Policy loss: 0.004135. Value loss: 0.213150. Entropy: 1.582430.
Training network. lr: 0.000218. clip: 0.087107
Iteration 4201: Policy loss: 0.003225. Value 

Iteration 4259: Policy loss: -0.003923. Value loss: 0.127724. Entropy: 1.438196.
Iteration 4260: Policy loss: -0.008191. Value loss: 0.111270. Entropy: 1.441078.
episode: 1567   score: 830.0  epsilon: 1.0    steps: 352  evaluation reward: 965.5
episode: 1568   score: 990.0  epsilon: 1.0    steps: 1008  evaluation reward: 973.2
Training network. lr: 0.000217. clip: 0.086950
Iteration 4261: Policy loss: 0.000888. Value loss: 0.139965. Entropy: 1.508297.
Iteration 4262: Policy loss: -0.005441. Value loss: 0.073737. Entropy: 1.490350.
Iteration 4263: Policy loss: -0.010342. Value loss: 0.056836. Entropy: 1.492109.
Training network. lr: 0.000217. clip: 0.086950
Iteration 4264: Policy loss: 0.002269. Value loss: 0.505690. Entropy: 1.415935.
Iteration 4265: Policy loss: 0.000801. Value loss: 0.439336. Entropy: 1.433465.
Iteration 4266: Policy loss: 0.001885. Value loss: 0.385920. Entropy: 1.439034.
episode: 1569   score: 740.0  epsilon: 1.0    steps: 160  evaluation reward: 975.5
episode: 157

Training network. lr: 0.000217. clip: 0.086793
Iteration 4327: Policy loss: 0.004678. Value loss: 0.180708. Entropy: 1.647194.
Iteration 4328: Policy loss: 0.002201. Value loss: 0.108820. Entropy: 1.630978.
Iteration 4329: Policy loss: -0.002200. Value loss: 0.094438. Entropy: 1.619859.
episode: 1589   score: 1540.0  epsilon: 1.0    steps: 416  evaluation reward: 1001.0
episode: 1590   score: 810.0  epsilon: 1.0    steps: 912  evaluation reward: 1000.8
Training network. lr: 0.000217. clip: 0.086793
Iteration 4330: Policy loss: -0.000386. Value loss: 0.082595. Entropy: 1.486530.
Iteration 4331: Policy loss: -0.004041. Value loss: 0.052898. Entropy: 1.514139.
Iteration 4332: Policy loss: -0.006988. Value loss: 0.046716. Entropy: 1.503834.
episode: 1591   score: 770.0  epsilon: 1.0    steps: 512  evaluation reward: 999.4
Training network. lr: 0.000217. clip: 0.086793
Iteration 4333: Policy loss: 0.001906. Value loss: 0.148452. Entropy: 1.476086.
Iteration 4334: Policy loss: -0.001259. Val

Iteration 4392: Policy loss: -0.003563. Value loss: 0.144145. Entropy: 1.452593.
Training network. lr: 0.000217. clip: 0.086646
Iteration 4393: Policy loss: 0.003556. Value loss: 0.322283. Entropy: 1.477546.
Iteration 4394: Policy loss: 0.001492. Value loss: 0.226567. Entropy: 1.470655.
Iteration 4395: Policy loss: -0.001363. Value loss: 0.185957. Entropy: 1.468949.
episode: 1612   score: 1930.0  epsilon: 1.0    steps: 520  evaluation reward: 1011.7
Training network. lr: 0.000217. clip: 0.086646
Iteration 4396: Policy loss: 0.002849. Value loss: 0.179260. Entropy: 1.429688.
Iteration 4397: Policy loss: -0.005795. Value loss: 0.101889. Entropy: 1.426126.
Iteration 4398: Policy loss: -0.006960. Value loss: 0.072514. Entropy: 1.441203.
episode: 1613   score: 850.0  epsilon: 1.0    steps: 976  evaluation reward: 1007.5
Training network. lr: 0.000217. clip: 0.086646
Iteration 4399: Policy loss: 0.001516. Value loss: 0.237273. Entropy: 1.513585.
Iteration 4400: Policy loss: -0.002177. Value 

Iteration 4457: Policy loss: 0.003283. Value loss: 0.131967. Entropy: 1.664973.
Iteration 4458: Policy loss: -0.000100. Value loss: 0.112427. Entropy: 1.662955.
episode: 1635   score: 1430.0  epsilon: 1.0    steps: 224  evaluation reward: 1030.9
episode: 1636   score: 1020.0  epsilon: 1.0    steps: 240  evaluation reward: 1029.4
Training network. lr: 0.000216. clip: 0.086333
Iteration 4459: Policy loss: -0.000459. Value loss: 0.394166. Entropy: 1.497589.
Iteration 4460: Policy loss: -0.007166. Value loss: 0.304301. Entropy: 1.519743.
Iteration 4461: Policy loss: -0.009119. Value loss: 0.252195. Entropy: 1.507061.
episode: 1637   score: 870.0  epsilon: 1.0    steps: 968  evaluation reward: 1024.1
Training network. lr: 0.000216. clip: 0.086333
Iteration 4462: Policy loss: 0.000596. Value loss: 0.315071. Entropy: 1.571550.
Iteration 4463: Policy loss: -0.001588. Value loss: 0.202329. Entropy: 1.591785.
Iteration 4464: Policy loss: -0.004269. Value loss: 0.158560. Entropy: 1.586432.
Traini

episode: 1658   score: 800.0  epsilon: 1.0    steps: 560  evaluation reward: 1028.2
Training network. lr: 0.000215. clip: 0.086185
Iteration 4522: Policy loss: 0.003030. Value loss: 0.187194. Entropy: 1.169055.
Iteration 4523: Policy loss: 0.000776. Value loss: 0.152806. Entropy: 1.155245.
Iteration 4524: Policy loss: -0.004959. Value loss: 0.140222. Entropy: 1.177838.
Training network. lr: 0.000215. clip: 0.086185
Iteration 4525: Policy loss: 0.009639. Value loss: 0.227729. Entropy: 1.473238.
Iteration 4526: Policy loss: 0.002409. Value loss: 0.140166. Entropy: 1.466807.
Iteration 4527: Policy loss: -0.002352. Value loss: 0.116649. Entropy: 1.478060.
episode: 1659   score: 1630.0  epsilon: 1.0    steps: 552  evaluation reward: 1030.3
Training network. lr: 0.000215. clip: 0.086185
Iteration 4528: Policy loss: -0.001452. Value loss: 0.546089. Entropy: 1.692854.
Iteration 4529: Policy loss: 0.000605. Value loss: 0.375270. Entropy: 1.679640.
Iteration 4530: Policy loss: -0.001763. Value l

episode: 1681   score: 860.0  epsilon: 1.0    steps: 680  evaluation reward: 1007.8
Training network. lr: 0.000215. clip: 0.086029
Iteration 4588: Policy loss: 0.001426. Value loss: 0.208450. Entropy: 1.534636.
Iteration 4589: Policy loss: -0.007689. Value loss: 0.133186. Entropy: 1.537577.
Iteration 4590: Policy loss: -0.004215. Value loss: 0.119843. Entropy: 1.535030.
episode: 1682   score: 830.0  epsilon: 1.0    steps: 8  evaluation reward: 1005.7
Training network. lr: 0.000215. clip: 0.086029
Iteration 4591: Policy loss: -0.000224. Value loss: 0.370249. Entropy: 1.314125.
Iteration 4592: Policy loss: -0.005846. Value loss: 0.287921. Entropy: 1.309086.
Iteration 4593: Policy loss: -0.010090. Value loss: 0.243081. Entropy: 1.313921.
Training network. lr: 0.000215. clip: 0.086029
Iteration 4594: Policy loss: 0.005290. Value loss: 0.452189. Entropy: 1.625966.
Iteration 4595: Policy loss: 0.001284. Value loss: 0.402213. Entropy: 1.630363.
Iteration 4596: Policy loss: -0.003895. Value lo

Training network. lr: 0.000214. clip: 0.085724
Iteration 4654: Policy loss: 0.002452. Value loss: 0.384261. Entropy: 1.551622.
Iteration 4655: Policy loss: -0.002109. Value loss: 0.278407. Entropy: 1.565530.
Iteration 4656: Policy loss: -0.002248. Value loss: 0.246648. Entropy: 1.563377.
Training network. lr: 0.000214. clip: 0.085724
Iteration 4657: Policy loss: -0.001525. Value loss: 0.349040. Entropy: 1.367089.
Iteration 4658: Policy loss: -0.008204. Value loss: 0.191885. Entropy: 1.368819.
Iteration 4659: Policy loss: -0.009058. Value loss: 0.138589. Entropy: 1.375078.
Training network. lr: 0.000214. clip: 0.085724
Iteration 4660: Policy loss: 0.001936. Value loss: 0.345898. Entropy: 1.425188.
Iteration 4661: Policy loss: 0.000454. Value loss: 0.252097. Entropy: 1.406544.
Iteration 4662: Policy loss: -0.004481. Value loss: 0.217893. Entropy: 1.409836.
episode: 1704   score: 660.0  epsilon: 1.0    steps: 112  evaluation reward: 1021.0
Training network. lr: 0.000214. clip: 0.085724
It

Training network. lr: 0.000214. clip: 0.085568
Iteration 4720: Policy loss: 0.002653. Value loss: 0.177501. Entropy: 1.509690.
Iteration 4721: Policy loss: -0.006195. Value loss: 0.127594. Entropy: 1.510187.
Iteration 4722: Policy loss: -0.008235. Value loss: 0.104199. Entropy: 1.488389.
episode: 1726   score: 1050.0  epsilon: 1.0    steps: 280  evaluation reward: 1009.3
episode: 1727   score: 1060.0  epsilon: 1.0    steps: 928  evaluation reward: 1011.9
Training network. lr: 0.000214. clip: 0.085568
Iteration 4723: Policy loss: 0.001063. Value loss: 0.236386. Entropy: 1.414851.
Iteration 4724: Policy loss: -0.001739. Value loss: 0.191911. Entropy: 1.422628.
Iteration 4725: Policy loss: -0.006506. Value loss: 0.172841. Entropy: 1.403705.
episode: 1728   score: 410.0  epsilon: 1.0    steps: 416  evaluation reward: 1004.7
Training network. lr: 0.000214. clip: 0.085568
Iteration 4726: Policy loss: 0.003300. Value loss: 0.505086. Entropy: 1.444338.
Iteration 4727: Policy loss: -0.000440. V

Iteration 4788: Policy loss: -0.005221. Value loss: 0.025029. Entropy: 1.515004.
episode: 1746   score: 1380.0  epsilon: 1.0    steps: 296  evaluation reward: 1024.3
episode: 1747   score: 1090.0  epsilon: 1.0    steps: 600  evaluation reward: 1027.7
episode: 1748   score: 1830.0  epsilon: 1.0    steps: 1008  evaluation reward: 1038.8
Training network. lr: 0.000214. clip: 0.085411
Iteration 4789: Policy loss: -0.001066. Value loss: 0.178292. Entropy: 1.354522.
Iteration 4790: Policy loss: -0.002768. Value loss: 0.143883. Entropy: 1.377906.
Iteration 4791: Policy loss: -0.006221. Value loss: 0.130161. Entropy: 1.376718.
episode: 1749   score: 1590.0  epsilon: 1.0    steps: 56  evaluation reward: 1043.8
Training network. lr: 0.000214. clip: 0.085411
Iteration 4792: Policy loss: 0.002965. Value loss: 0.269633. Entropy: 1.228569.
Iteration 4793: Policy loss: 0.000863. Value loss: 0.241936. Entropy: 1.213717.
Iteration 4794: Policy loss: -0.006879. Value loss: 0.231398. Entropy: 1.218128.
T

Iteration 4854: Policy loss: -0.000484. Value loss: 0.459476. Entropy: 1.819627.
episode: 1768   score: 910.0  epsilon: 1.0    steps: 40  evaluation reward: 1071.8
episode: 1769   score: 1270.0  epsilon: 1.0    steps: 104  evaluation reward: 1074.6
episode: 1770   score: 1050.0  epsilon: 1.0    steps: 168  evaluation reward: 1075.1
Training network. lr: 0.000213. clip: 0.085107
Iteration 4855: Policy loss: 0.003732. Value loss: 0.247622. Entropy: 1.443928.
Iteration 4856: Policy loss: 0.002184. Value loss: 0.191052. Entropy: 1.432250.
Iteration 4857: Policy loss: 0.000343. Value loss: 0.177262. Entropy: 1.451519.
Training network. lr: 0.000213. clip: 0.085107
Iteration 4858: Policy loss: 0.006783. Value loss: 0.142083. Entropy: 1.501547.
Iteration 4859: Policy loss: 0.001164. Value loss: 0.067589. Entropy: 1.547814.
Iteration 4860: Policy loss: -0.005466. Value loss: 0.053515. Entropy: 1.534805.
episode: 1771   score: 2080.0  epsilon: 1.0    steps: 968  evaluation reward: 1086.0
Traini

Iteration 4919: Policy loss: -0.005478. Value loss: 0.225463. Entropy: 1.736062.
Iteration 4920: Policy loss: -0.011893. Value loss: 0.171910. Entropy: 1.727680.
episode: 1791   score: 1130.0  epsilon: 1.0    steps: 520  evaluation reward: 1059.4
episode: 1792   score: 1240.0  epsilon: 1.0    steps: 688  evaluation reward: 1061.6
Training network. lr: 0.000212. clip: 0.084950
Iteration 4921: Policy loss: 0.000323. Value loss: 0.263124. Entropy: 1.602592.
Iteration 4922: Policy loss: -0.007077. Value loss: 0.197056. Entropy: 1.605910.
Iteration 4923: Policy loss: -0.013056. Value loss: 0.178040. Entropy: 1.589494.
episode: 1793   score: 1050.0  epsilon: 1.0    steps: 168  evaluation reward: 1062.4
episode: 1794   score: 1360.0  epsilon: 1.0    steps: 448  evaluation reward: 1067.2
Training network. lr: 0.000212. clip: 0.084950
Iteration 4924: Policy loss: 0.005419. Value loss: 0.228284. Entropy: 1.215248.
Iteration 4925: Policy loss: 0.001388. Value loss: 0.193735. Entropy: 1.234861.
It

Training network. lr: 0.000212. clip: 0.084803
Iteration 4984: Policy loss: 0.004009. Value loss: 0.482379. Entropy: 1.705723.
Iteration 4985: Policy loss: 0.004499. Value loss: 0.359195. Entropy: 1.711068.
Iteration 4986: Policy loss: 0.005250. Value loss: 0.287925. Entropy: 1.706369.
Training network. lr: 0.000212. clip: 0.084803
Iteration 4987: Policy loss: 0.005752. Value loss: 0.363026. Entropy: 1.829334.
Iteration 4988: Policy loss: 0.002354. Value loss: 0.260132. Entropy: 1.835040.
Iteration 4989: Policy loss: 0.001394. Value loss: 0.234891. Entropy: 1.826852.
Training network. lr: 0.000212. clip: 0.084803
Iteration 4990: Policy loss: 0.007123. Value loss: 0.110491. Entropy: 1.947530.
Iteration 4991: Policy loss: 0.003603. Value loss: 0.061704. Entropy: 1.936095.
Iteration 4992: Policy loss: -0.001977. Value loss: 0.049416. Entropy: 1.943683.
episode: 1814   score: 700.0  epsilon: 1.0    steps: 216  evaluation reward: 1068.8
Training network. lr: 0.000212. clip: 0.084803
Iterati

Iteration 5049: Policy loss: -0.007768. Value loss: 0.076880. Entropy: 1.667426.
Training network. lr: 0.000212. clip: 0.084646
Iteration 5050: Policy loss: 0.003862. Value loss: 0.197296. Entropy: 1.979200.
Iteration 5051: Policy loss: -0.000629. Value loss: 0.168696. Entropy: 1.989986.
Iteration 5052: Policy loss: -0.005331. Value loss: 0.137218. Entropy: 1.985229.
Training network. lr: 0.000211. clip: 0.084489
Iteration 5053: Policy loss: 0.003412. Value loss: 0.179333. Entropy: 1.990724.
Iteration 5054: Policy loss: 0.000280. Value loss: 0.113044. Entropy: 1.996297.
Iteration 5055: Policy loss: -0.002997. Value loss: 0.087836. Entropy: 2.000058.
episode: 1837   score: 890.0  epsilon: 1.0    steps: 464  evaluation reward: 1048.1
episode: 1838   score: 640.0  epsilon: 1.0    steps: 608  evaluation reward: 1043.7
Training network. lr: 0.000211. clip: 0.084489
Iteration 5056: Policy loss: 0.000501. Value loss: 0.099717. Entropy: 1.753780.
Iteration 5057: Policy loss: -0.005062. Value l

Training network. lr: 0.000211. clip: 0.084342
Iteration 5113: Policy loss: 0.006201. Value loss: 0.306457. Entropy: 1.479245.
Iteration 5114: Policy loss: 0.003108. Value loss: 0.184429. Entropy: 1.498807.
Iteration 5115: Policy loss: 0.001403. Value loss: 0.151157. Entropy: 1.503624.
episode: 1861   score: 560.0  epsilon: 1.0    steps: 880  evaluation reward: 992.6
Training network. lr: 0.000211. clip: 0.084342
Iteration 5116: Policy loss: 0.008670. Value loss: 0.501865. Entropy: 1.637906.
Iteration 5117: Policy loss: 0.001993. Value loss: 0.336178. Entropy: 1.676211.
Iteration 5118: Policy loss: -0.002308. Value loss: 0.291202. Entropy: 1.674486.
episode: 1862   score: 770.0  epsilon: 1.0    steps: 536  evaluation reward: 991.5
Training network. lr: 0.000211. clip: 0.084342
Iteration 5119: Policy loss: 0.005799. Value loss: 0.272719. Entropy: 1.430822.
Iteration 5120: Policy loss: 0.000582. Value loss: 0.155368. Entropy: 1.432903.
Iteration 5121: Policy loss: -0.004255. Value loss: 

Iteration 5178: Policy loss: -0.008922. Value loss: 0.100256. Entropy: 1.624572.
episode: 1884   score: 910.0  epsilon: 1.0    steps: 472  evaluation reward: 1010.2
Training network. lr: 0.000210. clip: 0.084185
Iteration 5179: Policy loss: 0.002825. Value loss: 0.832012. Entropy: 1.567914.
Iteration 5180: Policy loss: 0.006121. Value loss: 0.594717. Entropy: 1.561685.
Iteration 5181: Policy loss: 0.000738. Value loss: 0.546463. Entropy: 1.545416.
episode: 1885   score: 1290.0  epsilon: 1.0    steps: 808  evaluation reward: 1013.7
Training network. lr: 0.000210. clip: 0.084185
Iteration 5182: Policy loss: 0.003386. Value loss: 0.202927. Entropy: 1.573582.
Iteration 5183: Policy loss: -0.000700. Value loss: 0.121864. Entropy: 1.554856.
Iteration 5184: Policy loss: -0.004613. Value loss: 0.104056. Entropy: 1.569238.
episode: 1886   score: 770.0  epsilon: 1.0    steps: 112  evaluation reward: 1013.4
Training network. lr: 0.000210. clip: 0.084185
Iteration 5185: Policy loss: 0.002622. Valu

episode: 1905   score: 1000.0  epsilon: 1.0    steps: 216  evaluation reward: 1058.1
Training network. lr: 0.000210. clip: 0.084029
Iteration 5245: Policy loss: 0.002299. Value loss: 0.195030. Entropy: 1.235348.
Iteration 5246: Policy loss: -0.001487. Value loss: 0.166641. Entropy: 1.226438.
Iteration 5247: Policy loss: -0.006132. Value loss: 0.161625. Entropy: 1.228932.
episode: 1906   score: 980.0  epsilon: 1.0    steps: 344  evaluation reward: 1052.5
Training network. lr: 0.000210. clip: 0.084029
Iteration 5248: Policy loss: 0.002303. Value loss: 0.253159. Entropy: 1.469296.
Iteration 5249: Policy loss: -0.000339. Value loss: 0.184045. Entropy: 1.463464.
Iteration 5250: Policy loss: 0.000353. Value loss: 0.154639. Entropy: 1.495258.
episode: 1907   score: 1050.0  epsilon: 1.0    steps: 96  evaluation reward: 1049.9
episode: 1908   score: 1520.0  epsilon: 1.0    steps: 104  evaluation reward: 1056.1
Training network. lr: 0.000210. clip: 0.083881
Iteration 5251: Policy loss: 0.003203.

Iteration 5309: Policy loss: -0.003409. Value loss: 0.194068. Entropy: 1.418206.
Iteration 5310: Policy loss: -0.004245. Value loss: 0.190799. Entropy: 1.416998.
episode: 1929   score: 1150.0  epsilon: 1.0    steps: 640  evaluation reward: 1088.5
Training network. lr: 0.000209. clip: 0.083725
Iteration 5311: Policy loss: 0.001111. Value loss: 0.262793. Entropy: 1.603294.
Iteration 5312: Policy loss: -0.003341. Value loss: 0.192425. Entropy: 1.613888.
Iteration 5313: Policy loss: -0.008348. Value loss: 0.164677. Entropy: 1.611243.
Training network. lr: 0.000209. clip: 0.083725
Iteration 5314: Policy loss: 0.003490. Value loss: 0.443030. Entropy: 1.604689.
Iteration 5315: Policy loss: -0.002333. Value loss: 0.351502. Entropy: 1.603410.
Iteration 5316: Policy loss: -0.003209. Value loss: 0.340421. Entropy: 1.598576.
episode: 1930   score: 930.0  epsilon: 1.0    steps: 8  evaluation reward: 1090.9
episode: 1931   score: 1120.0  epsilon: 1.0    steps: 552  evaluation reward: 1093.4
Training

Training network. lr: 0.000209. clip: 0.083568
Iteration 5374: Policy loss: 0.002127. Value loss: 0.268578. Entropy: 1.032781.
Iteration 5375: Policy loss: 0.000974. Value loss: 0.204590. Entropy: 1.046087.
Iteration 5376: Policy loss: -0.004188. Value loss: 0.193337. Entropy: 1.031739.
Training network. lr: 0.000209. clip: 0.083568
Iteration 5377: Policy loss: 0.003628. Value loss: 0.339267. Entropy: 1.460491.
Iteration 5378: Policy loss: 0.001387. Value loss: 0.235520. Entropy: 1.477462.
Iteration 5379: Policy loss: -0.001038. Value loss: 0.181531. Entropy: 1.471099.
episode: 1952   score: 1690.0  epsilon: 1.0    steps: 104  evaluation reward: 1162.6
episode: 1953   score: 1250.0  epsilon: 1.0    steps: 1016  evaluation reward: 1153.7
Training network. lr: 0.000209. clip: 0.083568
Iteration 5380: Policy loss: 0.007488. Value loss: 1.772617. Entropy: 1.594942.
Iteration 5381: Policy loss: 0.013499. Value loss: 1.560207. Entropy: 1.577001.
Iteration 5382: Policy loss: 0.006558. Value l

Iteration 5441: Policy loss: -0.001863. Value loss: 0.056495. Entropy: 1.794483.
Iteration 5442: Policy loss: -0.004286. Value loss: 0.047122. Entropy: 1.783589.
episode: 1973   score: 1100.0  epsilon: 1.0    steps: 512  evaluation reward: 1200.8
episode: 1974   score: 750.0  epsilon: 1.0    steps: 640  evaluation reward: 1197.6
episode: 1975   score: 1900.0  epsilon: 1.0    steps: 784  evaluation reward: 1202.8
episode: 1976   score: 900.0  epsilon: 1.0    steps: 928  evaluation reward: 1196.2
Training network. lr: 0.000209. clip: 0.083420
Iteration 5443: Policy loss: 0.005478. Value loss: 0.099590. Entropy: 1.585374.
Iteration 5444: Policy loss: -0.003108. Value loss: 0.058041. Entropy: 1.599341.
Iteration 5445: Policy loss: -0.003545. Value loss: 0.051021. Entropy: 1.599247.
Training network. lr: 0.000209. clip: 0.083420
Iteration 5446: Policy loss: 0.002671. Value loss: 0.254020. Entropy: 1.224221.
Iteration 5447: Policy loss: 0.001599. Value loss: 0.222374. Entropy: 1.218642.
Iter

Training network. lr: 0.000208. clip: 0.083107
Iteration 5506: Policy loss: 0.005503. Value loss: 0.570603. Entropy: 1.243268.
Iteration 5507: Policy loss: 0.007562. Value loss: 0.404949. Entropy: 1.237783.
Iteration 5508: Policy loss: 0.004794. Value loss: 0.360422. Entropy: 1.226103.
episode: 1997   score: 760.0  epsilon: 1.0    steps: 144  evaluation reward: 1185.7
Training network. lr: 0.000208. clip: 0.083107
Iteration 5509: Policy loss: 0.006941. Value loss: 0.314893. Entropy: 1.175455.
Iteration 5510: Policy loss: 0.003069. Value loss: 0.210584. Entropy: 1.122348.
Iteration 5511: Policy loss: 0.001426. Value loss: 0.182540. Entropy: 1.130986.
episode: 1998   score: 850.0  epsilon: 1.0    steps: 344  evaluation reward: 1188.8
episode: 1999   score: 1670.0  epsilon: 1.0    steps: 688  evaluation reward: 1190.5
Training network. lr: 0.000208. clip: 0.083107
Iteration 5512: Policy loss: 0.027532. Value loss: 0.358617. Entropy: 1.126872.
Iteration 5513: Policy loss: 0.007902. Value l

Iteration 5570: Policy loss: -0.002356. Value loss: 0.108084. Entropy: 1.530144.
Iteration 5571: Policy loss: -0.005598. Value loss: 0.079336. Entropy: 1.531799.
episode: 2021   score: 340.0  epsilon: 1.0    steps: 160  evaluation reward: 1140.2
Training network. lr: 0.000207. clip: 0.082960
Iteration 5572: Policy loss: 0.004848. Value loss: 0.501295. Entropy: 1.334061.
Iteration 5573: Policy loss: -0.000816. Value loss: 0.353597. Entropy: 1.327534.
Iteration 5574: Policy loss: -0.002462. Value loss: 0.314191. Entropy: 1.332321.
Training network. lr: 0.000207. clip: 0.082960
Iteration 5575: Policy loss: 0.002496. Value loss: 0.463635. Entropy: 1.429031.
Iteration 5576: Policy loss: 0.003290. Value loss: 0.356490. Entropy: 1.434550.
Iteration 5577: Policy loss: 0.000267. Value loss: 0.328668. Entropy: 1.416946.
episode: 2022   score: 1860.0  epsilon: 1.0    steps: 448  evaluation reward: 1143.8
Training network. lr: 0.000207. clip: 0.082960
Iteration 5578: Policy loss: -0.001580. Value 

Training network. lr: 0.000207. clip: 0.082803
Iteration 5635: Policy loss: 0.005022. Value loss: 0.299052. Entropy: 1.758570.
Iteration 5636: Policy loss: 0.002481. Value loss: 0.145218. Entropy: 1.758203.
Iteration 5637: Policy loss: -0.003735. Value loss: 0.104778. Entropy: 1.758010.
episode: 2045   score: 390.0  epsilon: 1.0    steps: 160  evaluation reward: 1050.9
Training network. lr: 0.000207. clip: 0.082803
Iteration 5638: Policy loss: -0.000828. Value loss: 0.595587. Entropy: 1.632406.
Iteration 5639: Policy loss: -0.001563. Value loss: 0.447549. Entropy: 1.637952.
Iteration 5640: Policy loss: -0.004378. Value loss: 0.382810. Entropy: 1.630506.
episode: 2046   score: 1050.0  epsilon: 1.0    steps: 920  evaluation reward: 1047.3
Training network. lr: 0.000207. clip: 0.082803
Iteration 5641: Policy loss: 0.003964. Value loss: 0.180779. Entropy: 1.634245.
Iteration 5642: Policy loss: -0.000067. Value loss: 0.107313. Entropy: 1.639625.
Iteration 5643: Policy loss: -0.000968. Value

episode: 2072   score: 1100.0  epsilon: 1.0    steps: 936  evaluation reward: 909.5
Training network. lr: 0.000207. clip: 0.082646
Iteration 5698: Policy loss: 0.004175. Value loss: 0.237774. Entropy: 1.664625.
Iteration 5699: Policy loss: -0.002699. Value loss: 0.168726. Entropy: 1.660204.
Iteration 5700: Policy loss: -0.001100. Value loss: 0.149412. Entropy: 1.671643.
Training network. lr: 0.000206. clip: 0.082499
Iteration 5701: Policy loss: 0.001134. Value loss: 0.249019. Entropy: 1.208143.
Iteration 5702: Policy loss: -0.004516. Value loss: 0.195030. Entropy: 1.222808.
Iteration 5703: Policy loss: -0.007279. Value loss: 0.184690. Entropy: 1.215111.
episode: 2073   score: 910.0  epsilon: 1.0    steps: 304  evaluation reward: 907.6
episode: 2074   score: 1140.0  epsilon: 1.0    steps: 352  evaluation reward: 911.5
Training network. lr: 0.000206. clip: 0.082499
Iteration 5704: Policy loss: -0.000005. Value loss: 0.232520. Entropy: 1.459424.
Iteration 5705: Policy loss: -0.003002. Val

Iteration 5763: Policy loss: -0.000665. Value loss: 0.239117. Entropy: 1.117597.
Training network. lr: 0.000206. clip: 0.082342
Iteration 5764: Policy loss: 0.002670. Value loss: 0.137467. Entropy: 1.596281.
Iteration 5765: Policy loss: -0.002713. Value loss: 0.075393. Entropy: 1.587468.
Iteration 5766: Policy loss: -0.005904. Value loss: 0.057137. Entropy: 1.578209.
episode: 2096   score: 600.0  epsilon: 1.0    steps: 192  evaluation reward: 851.3
episode: 2097   score: 1670.0  epsilon: 1.0    steps: 328  evaluation reward: 860.4
Training network. lr: 0.000206. clip: 0.082342
Iteration 5767: Policy loss: -0.000871. Value loss: 0.179575. Entropy: 1.546289.
Iteration 5768: Policy loss: -0.009494. Value loss: 0.128585. Entropy: 1.546084.
Iteration 5769: Policy loss: -0.011828. Value loss: 0.116162. Entropy: 1.545790.
episode: 2098   score: 700.0  epsilon: 1.0    steps: 904  evaluation reward: 858.9
Training network. lr: 0.000206. clip: 0.082342
Iteration 5770: Policy loss: 0.000752. Valu

Iteration 5827: Policy loss: 0.002034. Value loss: 0.187167. Entropy: 0.996596.
Iteration 5828: Policy loss: -0.003066. Value loss: 0.123653. Entropy: 0.993718.
Iteration 5829: Policy loss: -0.007441. Value loss: 0.114515. Entropy: 0.987310.
episode: 2121   score: 1420.0  epsilon: 1.0    steps: 32  evaluation reward: 842.5
episode: 2122   score: 610.0  epsilon: 1.0    steps: 464  evaluation reward: 830.0
episode: 2123   score: 1210.0  epsilon: 1.0    steps: 576  evaluation reward: 828.5
Training network. lr: 0.000205. clip: 0.082185
Iteration 5830: Policy loss: 0.000620. Value loss: 0.581964. Entropy: 0.880808.
Iteration 5831: Policy loss: 0.005635. Value loss: 0.479674. Entropy: 0.889752.
Iteration 5832: Policy loss: 0.001875. Value loss: 0.414870. Entropy: 0.887101.
Training network. lr: 0.000205. clip: 0.082185
Iteration 5833: Policy loss: 0.002543. Value loss: 0.162043. Entropy: 1.261951.
Iteration 5834: Policy loss: -0.001283. Value loss: 0.095518. Entropy: 1.277211.
Iteration 583

episode: 2144   score: 710.0  epsilon: 1.0    steps: 784  evaluation reward: 869.7
Training network. lr: 0.000205. clip: 0.082038
Iteration 5893: Policy loss: 0.005882. Value loss: 0.260728. Entropy: 0.871762.
Iteration 5894: Policy loss: -0.000298. Value loss: 0.210304. Entropy: 0.887358.
Iteration 5895: Policy loss: -0.003077. Value loss: 0.188871. Entropy: 0.885235.
episode: 2145   score: 990.0  epsilon: 1.0    steps: 968  evaluation reward: 875.7
Training network. lr: 0.000205. clip: 0.082038
Iteration 5896: Policy loss: 0.003784. Value loss: 0.359240. Entropy: 1.348079.
Iteration 5897: Policy loss: 0.001041. Value loss: 0.267126. Entropy: 1.351147.
Iteration 5898: Policy loss: -0.006621. Value loss: 0.224904. Entropy: 1.352443.
episode: 2146   score: 1450.0  epsilon: 1.0    steps: 112  evaluation reward: 879.7
episode: 2147   score: 720.0  epsilon: 1.0    steps: 912  evaluation reward: 878.2
Training network. lr: 0.000205. clip: 0.082038
Iteration 5899: Policy loss: 0.013572. Valu

Training network. lr: 0.000204. clip: 0.081725
Iteration 5956: Policy loss: -0.000299. Value loss: 0.292965. Entropy: 1.025318.
Iteration 5957: Policy loss: -0.002007. Value loss: 0.267438. Entropy: 1.035949.
Iteration 5958: Policy loss: -0.004395. Value loss: 0.248733. Entropy: 1.040998.
episode: 2170   score: 1360.0  epsilon: 1.0    steps: 752  evaluation reward: 922.1
episode: 2171   score: 700.0  epsilon: 1.0    steps: 976  evaluation reward: 923.1
Training network. lr: 0.000204. clip: 0.081725
Iteration 5959: Policy loss: 0.011095. Value loss: 0.216315. Entropy: 1.374735.
Iteration 5960: Policy loss: 0.002019. Value loss: 0.149710. Entropy: 1.382703.
Iteration 5961: Policy loss: -0.002962. Value loss: 0.128556. Entropy: 1.373803.
episode: 2172   score: 770.0  epsilon: 1.0    steps: 72  evaluation reward: 919.8
Training network. lr: 0.000204. clip: 0.081725
Iteration 5962: Policy loss: 0.003780. Value loss: 0.417021. Entropy: 1.133350.
Iteration 5963: Policy loss: 0.001163. Value l

Iteration 6020: Policy loss: -0.002661. Value loss: 0.119520. Entropy: 1.447288.
Iteration 6021: Policy loss: -0.004531. Value loss: 0.105365. Entropy: 1.433741.
episode: 2195   score: 1330.0  epsilon: 1.0    steps: 736  evaluation reward: 888.7
Training network. lr: 0.000204. clip: 0.081577
Iteration 6022: Policy loss: 0.002296. Value loss: 0.354194. Entropy: 1.426431.
Iteration 6023: Policy loss: -0.005782. Value loss: 0.211774. Entropy: 1.444423.
Iteration 6024: Policy loss: -0.008302. Value loss: 0.167482. Entropy: 1.448995.
Training network. lr: 0.000204. clip: 0.081577
Iteration 6025: Policy loss: 0.003238. Value loss: 0.317710. Entropy: 1.230872.
Iteration 6026: Policy loss: 0.000569. Value loss: 0.234974. Entropy: 1.223726.
Iteration 6027: Policy loss: -0.001658. Value loss: 0.199102. Entropy: 1.216804.
episode: 2196   score: 1120.0  epsilon: 1.0    steps: 360  evaluation reward: 893.9
Training network. lr: 0.000204. clip: 0.081577
Iteration 6028: Policy loss: 0.005502. Value l

Iteration 6084: Policy loss: -0.007596. Value loss: 0.071381. Entropy: 1.398156.
episode: 2220   score: 750.0  epsilon: 1.0    steps: 376  evaluation reward: 887.8
episode: 2221   score: 1060.0  epsilon: 1.0    steps: 952  evaluation reward: 884.2
Training network. lr: 0.000204. clip: 0.081421
Iteration 6085: Policy loss: 0.004911. Value loss: 0.180143. Entropy: 1.257134.
Iteration 6086: Policy loss: -0.001015. Value loss: 0.098102. Entropy: 1.228719.
Iteration 6087: Policy loss: -0.005005. Value loss: 0.082015. Entropy: 1.246671.
Training network. lr: 0.000204. clip: 0.081421
Iteration 6088: Policy loss: 0.006192. Value loss: 0.193772. Entropy: 1.318293.
Iteration 6089: Policy loss: -0.003830. Value loss: 0.149802. Entropy: 1.318037.
Iteration 6090: Policy loss: -0.006961. Value loss: 0.140982. Entropy: 1.319016.
episode: 2222   score: 880.0  epsilon: 1.0    steps: 392  evaluation reward: 886.9
Training network. lr: 0.000204. clip: 0.081421
Iteration 6091: Policy loss: 0.004345. Value

episode: 2245   score: 800.0  epsilon: 1.0    steps: 1024  evaluation reward: 869.8
Training network. lr: 0.000203. clip: 0.081264
Iteration 6148: Policy loss: 0.004745. Value loss: 0.226259. Entropy: 1.488490.
Iteration 6149: Policy loss: 0.001342. Value loss: 0.160068. Entropy: 1.488618.
Iteration 6150: Policy loss: 0.000234. Value loss: 0.153956. Entropy: 1.487101.
episode: 2246   score: 620.0  epsilon: 1.0    steps: 168  evaluation reward: 861.5
episode: 2247   score: 740.0  epsilon: 1.0    steps: 984  evaluation reward: 861.7
Training network. lr: 0.000203. clip: 0.081116
Iteration 6151: Policy loss: 0.003740. Value loss: 0.463785. Entropy: 1.322678.
Iteration 6152: Policy loss: -0.003742. Value loss: 0.369431. Entropy: 1.316482.
Iteration 6153: Policy loss: -0.004129. Value loss: 0.339013. Entropy: 1.327549.
Training network. lr: 0.000203. clip: 0.081116
Iteration 6154: Policy loss: 0.002748. Value loss: 0.361199. Entropy: 1.227458.
Iteration 6155: Policy loss: -0.002406. Value l

episode: 2267   score: 490.0  epsilon: 1.0    steps: 1008  evaluation reward: 851.2
Training network. lr: 0.000202. clip: 0.080960
Iteration 6214: Policy loss: 0.002475. Value loss: 0.218394. Entropy: 1.542591.
Iteration 6215: Policy loss: 0.003548. Value loss: 0.119373. Entropy: 1.524977.
Iteration 6216: Policy loss: 0.003812. Value loss: 0.086741. Entropy: 1.516070.
Training network. lr: 0.000202. clip: 0.080960
Iteration 6217: Policy loss: 0.002583. Value loss: 0.179765. Entropy: 0.963132.
Iteration 6218: Policy loss: -0.002374. Value loss: 0.106369. Entropy: 0.959878.
Iteration 6219: Policy loss: -0.001761. Value loss: 0.087390. Entropy: 0.942555.
episode: 2268   score: 1210.0  epsilon: 1.0    steps: 88  evaluation reward: 854.9
episode: 2269   score: 1160.0  epsilon: 1.0    steps: 160  evaluation reward: 860.4
episode: 2270   score: 670.0  epsilon: 1.0    steps: 248  evaluation reward: 853.5
Training network. lr: 0.000202. clip: 0.080960
Iteration 6220: Policy loss: -0.003647. Val

Training network. lr: 0.000202. clip: 0.080803
Iteration 6277: Policy loss: 0.005259. Value loss: 0.716570. Entropy: 1.164689.
Iteration 6278: Policy loss: 0.001597. Value loss: 0.571711. Entropy: 1.148140.
Iteration 6279: Policy loss: -0.001974. Value loss: 0.493571. Entropy: 1.152962.
episode: 2293   score: 1690.0  epsilon: 1.0    steps: 480  evaluation reward: 891.2
Training network. lr: 0.000202. clip: 0.080803
Iteration 6280: Policy loss: 0.005199. Value loss: 0.208048. Entropy: 0.958830.
Iteration 6281: Policy loss: -0.002635. Value loss: 0.138431. Entropy: 0.948813.
Iteration 6282: Policy loss: -0.007019. Value loss: 0.123770. Entropy: 0.960473.
episode: 2294   score: 570.0  epsilon: 1.0    steps: 584  evaluation reward: 889.4
Training network. lr: 0.000202. clip: 0.080803
Iteration 6283: Policy loss: 0.004327. Value loss: 0.193607. Entropy: 1.181296.
Iteration 6284: Policy loss: -0.002166. Value loss: 0.120870. Entropy: 1.188691.
Iteration 6285: Policy loss: -0.005352. Value lo

Iteration 6342: Policy loss: -0.003806. Value loss: 0.061358. Entropy: 1.673561.
episode: 2316   score: 760.0  epsilon: 1.0    steps: 16  evaluation reward: 933.8
episode: 2317   score: 940.0  epsilon: 1.0    steps: 424  evaluation reward: 921.1
episode: 2318   score: 710.0  epsilon: 1.0    steps: 448  evaluation reward: 922.8
Training network. lr: 0.000202. clip: 0.080656
Iteration 6343: Policy loss: 0.000314. Value loss: 0.261447. Entropy: 1.099109.
Iteration 6344: Policy loss: -0.001809. Value loss: 0.200584. Entropy: 1.097890.
Iteration 6345: Policy loss: -0.003495. Value loss: 0.187682. Entropy: 1.091470.
Training network. lr: 0.000202. clip: 0.080656
Iteration 6346: Policy loss: 0.004775. Value loss: 0.261625. Entropy: 1.452846.
Iteration 6347: Policy loss: -0.000934. Value loss: 0.155134. Entropy: 1.444437.
Iteration 6348: Policy loss: -0.004911. Value loss: 0.111202. Entropy: 1.443510.
Training network. lr: 0.000202. clip: 0.080656
Iteration 6349: Policy loss: 0.004787. Value l

Iteration 6408: Policy loss: -0.005935. Value loss: 0.082343. Entropy: 1.154865.
episode: 2339   score: 1380.0  epsilon: 1.0    steps: 144  evaluation reward: 977.9
episode: 2340   score: 730.0  epsilon: 1.0    steps: 608  evaluation reward: 973.8
Training network. lr: 0.000201. clip: 0.080342
Iteration 6409: Policy loss: 0.001980. Value loss: 0.320893. Entropy: 0.833085.
Iteration 6410: Policy loss: -0.001392. Value loss: 0.234818. Entropy: 0.819050.
Iteration 6411: Policy loss: -0.002546. Value loss: 0.202878. Entropy: 0.825422.
Training network. lr: 0.000201. clip: 0.080342
Iteration 6412: Policy loss: 0.006540. Value loss: 0.501505. Entropy: 1.078190.
Iteration 6413: Policy loss: 0.005957. Value loss: 0.356872. Entropy: 1.063982.
Iteration 6414: Policy loss: 0.004141. Value loss: 0.299002. Entropy: 1.073291.
episode: 2341   score: 730.0  epsilon: 1.0    steps: 568  evaluation reward: 967.1
episode: 2342   score: 960.0  epsilon: 1.0    steps: 952  evaluation reward: 967.8
Training n

Iteration 6471: Policy loss: 0.004742. Value loss: 0.277906. Entropy: 1.315090.
episode: 2365   score: 690.0  epsilon: 1.0    steps: 168  evaluation reward: 938.4
Training network. lr: 0.000200. clip: 0.080195
Iteration 6472: Policy loss: 0.004060. Value loss: 0.253923. Entropy: 0.978422.
Iteration 6473: Policy loss: -0.003318. Value loss: 0.207966. Entropy: 0.977389.
Iteration 6474: Policy loss: -0.000245. Value loss: 0.166094. Entropy: 0.963286.
episode: 2366   score: 750.0  epsilon: 1.0    steps: 296  evaluation reward: 942.0
episode: 2367   score: 580.0  epsilon: 1.0    steps: 392  evaluation reward: 942.9
Training network. lr: 0.000200. clip: 0.080195
Iteration 6475: Policy loss: -0.001064. Value loss: 0.172150. Entropy: 0.832091.
Iteration 6476: Policy loss: -0.002643. Value loss: 0.125943. Entropy: 0.838029.
Iteration 6477: Policy loss: -0.006283. Value loss: 0.106319. Entropy: 0.837608.
episode: 2368   score: 1030.0  epsilon: 1.0    steps: 112  evaluation reward: 941.1
episode:

episode: 2390   score: 1020.0  epsilon: 1.0    steps: 400  evaluation reward: 958.5
Training network. lr: 0.000200. clip: 0.080038
Iteration 6535: Policy loss: 0.001239. Value loss: 0.097957. Entropy: 0.710073.
Iteration 6536: Policy loss: -0.002610. Value loss: 0.060796. Entropy: 0.714247.
Iteration 6537: Policy loss: -0.005408. Value loss: 0.046263. Entropy: 0.707306.
Training network. lr: 0.000200. clip: 0.080038
Iteration 6538: Policy loss: 0.001870. Value loss: 0.133517. Entropy: 0.669978.
Iteration 6539: Policy loss: -0.002535. Value loss: 0.091692. Entropy: 0.677141.
Iteration 6540: Policy loss: -0.003574. Value loss: 0.069982. Entropy: 0.660957.
episode: 2391   score: 910.0  epsilon: 1.0    steps: 112  evaluation reward: 959.6
episode: 2392   score: 970.0  epsilon: 1.0    steps: 520  evaluation reward: 962.1
Training network. lr: 0.000200. clip: 0.080038
Iteration 6541: Policy loss: 0.005724. Value loss: 0.202334. Entropy: 0.817853.
Iteration 6542: Policy loss: -0.003866. Value

episode: 2416   score: 880.0  epsilon: 1.0    steps: 944  evaluation reward: 922.4
Training network. lr: 0.000200. clip: 0.079881
Iteration 6598: Policy loss: 0.003759. Value loss: 0.209384. Entropy: 1.362738.
Iteration 6599: Policy loss: 0.004078. Value loss: 0.145749. Entropy: 1.365762.
Iteration 6600: Policy loss: -0.002206. Value loss: 0.121693. Entropy: 1.379400.
Training network. lr: 0.000199. clip: 0.079734
Iteration 6601: Policy loss: 0.002666. Value loss: 0.558211. Entropy: 1.486086.
Iteration 6602: Policy loss: 0.003368. Value loss: 0.426660. Entropy: 1.483017.
Iteration 6603: Policy loss: 0.005025. Value loss: 0.366262. Entropy: 1.482481.
episode: 2417   score: 770.0  epsilon: 1.0    steps: 224  evaluation reward: 920.7
Training network. lr: 0.000199. clip: 0.079734
Iteration 6604: Policy loss: 0.001024. Value loss: 0.154141. Entropy: 1.503215.
Iteration 6605: Policy loss: -0.004064. Value loss: 0.101056. Entropy: 1.512527.
Iteration 6606: Policy loss: -0.004584. Value loss:

Iteration 6662: Policy loss: 0.000451. Value loss: 0.159613. Entropy: 1.134128.
Iteration 6663: Policy loss: -0.005698. Value loss: 0.140607. Entropy: 1.129317.
episode: 2441   score: 660.0  epsilon: 1.0    steps: 232  evaluation reward: 894.4
Training network. lr: 0.000199. clip: 0.079577
Iteration 6664: Policy loss: 0.004937. Value loss: 0.485041. Entropy: 1.200202.
Iteration 6665: Policy loss: -0.001398. Value loss: 0.375926. Entropy: 1.238311.
Iteration 6666: Policy loss: -0.005475. Value loss: 0.338751. Entropy: 1.218993.
episode: 2442   score: 730.0  epsilon: 1.0    steps: 592  evaluation reward: 892.1
episode: 2443   score: 1160.0  epsilon: 1.0    steps: 616  evaluation reward: 894.7
Training network. lr: 0.000199. clip: 0.079577
Iteration 6667: Policy loss: 0.007837. Value loss: 0.990133. Entropy: 1.111544.
Iteration 6668: Policy loss: 0.002772. Value loss: 0.848241. Entropy: 1.082051.
Iteration 6669: Policy loss: -0.000342. Value loss: 0.750162. Entropy: 1.087270.
Training net

Iteration 6726: Policy loss: -0.000513. Value loss: 0.243374. Entropy: 0.689703.
episode: 2466   score: 1390.0  epsilon: 1.0    steps: 976  evaluation reward: 949.3
Training network. lr: 0.000199. clip: 0.079421
Iteration 6727: Policy loss: 0.007559. Value loss: 0.348055. Entropy: 1.006170.
Iteration 6728: Policy loss: 0.000187. Value loss: 0.145645. Entropy: 0.981633.
Iteration 6729: Policy loss: -0.000784. Value loss: 0.102999. Entropy: 0.983714.
episode: 2467   score: 520.0  epsilon: 1.0    steps: 32  evaluation reward: 948.7
Training network. lr: 0.000199. clip: 0.079421
Iteration 6730: Policy loss: 0.000921. Value loss: 0.670070. Entropy: 0.655279.
Iteration 6731: Policy loss: 0.000857. Value loss: 0.583437. Entropy: 0.647532.
Iteration 6732: Policy loss: -0.002632. Value loss: 0.512198. Entropy: 0.676440.
episode: 2468   score: 1670.0  epsilon: 1.0    steps: 976  evaluation reward: 955.1
episode: 2469   score: 1150.0  epsilon: 1.0    steps: 1000  evaluation reward: 961.3
Training

Iteration 6791: Policy loss: -0.003186. Value loss: 0.135853. Entropy: 0.909183.
Iteration 6792: Policy loss: -0.006019. Value loss: 0.106453. Entropy: 0.904947.
Training network. lr: 0.000198. clip: 0.079273
Iteration 6793: Policy loss: 0.005344. Value loss: 0.380254. Entropy: 1.079152.
Iteration 6794: Policy loss: 0.002118. Value loss: 0.285645. Entropy: 1.077781.
Iteration 6795: Policy loss: 0.002295. Value loss: 0.252293. Entropy: 1.085785.
Training network. lr: 0.000198. clip: 0.079273
Iteration 6796: Policy loss: 0.003855. Value loss: 0.254812. Entropy: 1.421331.
Iteration 6797: Policy loss: 0.000297. Value loss: 0.169818. Entropy: 1.413391.
Iteration 6798: Policy loss: -0.003159. Value loss: 0.140626. Entropy: 1.424435.
episode: 2490   score: 670.0  epsilon: 1.0    steps: 672  evaluation reward: 982.4
episode: 2491   score: 640.0  epsilon: 1.0    steps: 1008  evaluation reward: 979.7
Training network. lr: 0.000198. clip: 0.079273
Iteration 6799: Policy loss: -0.000091. Value los

Training network. lr: 0.000197. clip: 0.078960
Iteration 6856: Policy loss: -0.002283. Value loss: 0.176456. Entropy: 1.270999.
Iteration 6857: Policy loss: -0.006289. Value loss: 0.136641. Entropy: 1.269623.
Iteration 6858: Policy loss: -0.006804. Value loss: 0.121639. Entropy: 1.277834.
Training network. lr: 0.000197. clip: 0.078960
Iteration 6859: Policy loss: 0.007148. Value loss: 0.317429. Entropy: 1.483256.
Iteration 6860: Policy loss: 0.004867. Value loss: 0.270444. Entropy: 1.493050.
Iteration 6861: Policy loss: 0.003707. Value loss: 0.242101. Entropy: 1.492068.
episode: 2514   score: 650.0  epsilon: 1.0    steps: 472  evaluation reward: 970.9
episode: 2515   score: 470.0  epsilon: 1.0    steps: 704  evaluation reward: 970.0
episode: 2516   score: 760.0  epsilon: 1.0    steps: 968  evaluation reward: 968.8
Training network. lr: 0.000197. clip: 0.078960
Iteration 6862: Policy loss: 0.002907. Value loss: 0.307236. Entropy: 1.439604.
Iteration 6863: Policy loss: -0.004986. Value l

Training network. lr: 0.000197. clip: 0.078812
Iteration 6922: Policy loss: 0.006481. Value loss: 0.205667. Entropy: 1.482837.
Iteration 6923: Policy loss: 0.005424. Value loss: 0.104384. Entropy: 1.475147.
Iteration 6924: Policy loss: 0.001806. Value loss: 0.075463. Entropy: 1.481649.
episode: 2537   score: 1730.0  epsilon: 1.0    steps: 16  evaluation reward: 974.4
episode: 2538   score: 710.0  epsilon: 1.0    steps: 736  evaluation reward: 976.4
Training network. lr: 0.000197. clip: 0.078812
Iteration 6925: Policy loss: -0.003124. Value loss: 0.247533. Entropy: 1.623370.
Iteration 6926: Policy loss: 0.000149. Value loss: 0.191844. Entropy: 1.631361.
Iteration 6927: Policy loss: -0.004891. Value loss: 0.162367. Entropy: 1.618878.
episode: 2539   score: 640.0  epsilon: 1.0    steps: 416  evaluation reward: 969.5
episode: 2540   score: 280.0  epsilon: 1.0    steps: 792  evaluation reward: 963.2
Training network. lr: 0.000197. clip: 0.078812
Iteration 6928: Policy loss: 0.000690. Value 

Training network. lr: 0.000197. clip: 0.078656
Iteration 6985: Policy loss: 0.000575. Value loss: 0.163281. Entropy: 1.262346.
Iteration 6986: Policy loss: -0.002503. Value loss: 0.115095. Entropy: 1.265250.
Iteration 6987: Policy loss: -0.004018. Value loss: 0.105454. Entropy: 1.314473.
episode: 2564   score: 1300.0  epsilon: 1.0    steps: 552  evaluation reward: 908.7
Training network. lr: 0.000197. clip: 0.078656
Iteration 6988: Policy loss: 0.007421. Value loss: 0.170460. Entropy: 1.361436.
Iteration 6989: Policy loss: 0.002993. Value loss: 0.142630. Entropy: 1.376841.
Iteration 6990: Policy loss: 0.001764. Value loss: 0.132660. Entropy: 1.369071.
episode: 2565   score: 570.0  epsilon: 1.0    steps: 656  evaluation reward: 904.4
Training network. lr: 0.000197. clip: 0.078656
Iteration 6991: Policy loss: 0.003523. Value loss: 0.320847. Entropy: 1.676328.
Iteration 6992: Policy loss: -0.001464. Value loss: 0.234875. Entropy: 1.665099.
Iteration 6993: Policy loss: -0.003420. Value los

Training network. lr: 0.000196. clip: 0.078352
Iteration 7051: Policy loss: 0.007489. Value loss: 0.118675. Entropy: 1.537496.
Iteration 7052: Policy loss: -0.000494. Value loss: 0.064894. Entropy: 1.518801.
Iteration 7053: Policy loss: -0.005294. Value loss: 0.054603. Entropy: 1.526019.
episode: 2587   score: 1310.0  epsilon: 1.0    steps: 32  evaluation reward: 843.9
episode: 2588   score: 500.0  epsilon: 1.0    steps: 832  evaluation reward: 832.2
Training network. lr: 0.000196. clip: 0.078352
Iteration 7054: Policy loss: 0.005496. Value loss: 0.221166. Entropy: 1.727977.
Iteration 7055: Policy loss: 0.007203. Value loss: 0.160231. Entropy: 1.730782.
Iteration 7056: Policy loss: -0.002999. Value loss: 0.143685. Entropy: 1.748948.
episode: 2589   score: 540.0  epsilon: 1.0    steps: 128  evaluation reward: 825.1
episode: 2590   score: 1150.0  epsilon: 1.0    steps: 296  evaluation reward: 829.9
Training network. lr: 0.000196. clip: 0.078352
Iteration 7057: Policy loss: 0.004639. Valu

Iteration 7114: Policy loss: 0.003997. Value loss: 0.088821. Entropy: 1.891708.
Iteration 7115: Policy loss: -0.000413. Value loss: 0.054490. Entropy: 1.880959.
Iteration 7116: Policy loss: -0.001274. Value loss: 0.040372. Entropy: 1.886411.
episode: 2612   score: 560.0  epsilon: 1.0    steps: 944  evaluation reward: 778.4
Training network. lr: 0.000195. clip: 0.078195
Iteration 7117: Policy loss: 0.003628. Value loss: 0.105252. Entropy: 1.820293.
Iteration 7118: Policy loss: -0.001250. Value loss: 0.060053. Entropy: 1.800275.
Iteration 7119: Policy loss: -0.004405. Value loss: 0.043286. Entropy: 1.798111.
episode: 2613   score: 590.0  epsilon: 1.0    steps: 704  evaluation reward: 778.8
episode: 2614   score: 590.0  epsilon: 1.0    steps: 768  evaluation reward: 778.2
Training network. lr: 0.000195. clip: 0.078195
Iteration 7120: Policy loss: 0.004197. Value loss: 0.112984. Entropy: 1.659219.
Iteration 7121: Policy loss: 0.000020. Value loss: 0.084370. Entropy: 1.698320.
Iteration 712

Iteration 7178: Policy loss: -0.002463. Value loss: 0.145436. Entropy: 1.888919.
Iteration 7179: Policy loss: -0.000969. Value loss: 0.123987. Entropy: 1.890753.
episode: 2637   score: 610.0  epsilon: 1.0    steps: 320  evaluation reward: 707.3
Training network. lr: 0.000195. clip: 0.078038
Iteration 7180: Policy loss: 0.004685. Value loss: 0.080007. Entropy: 1.715870.
Iteration 7181: Policy loss: -0.003755. Value loss: 0.064806. Entropy: 1.728726.
Iteration 7182: Policy loss: -0.005271. Value loss: 0.062317. Entropy: 1.705789.
episode: 2638   score: 490.0  epsilon: 1.0    steps: 888  evaluation reward: 705.1
Training network. lr: 0.000195. clip: 0.078038
Iteration 7183: Policy loss: 0.002145. Value loss: 0.101018. Entropy: 1.930536.
Iteration 7184: Policy loss: -0.004506. Value loss: 0.066723. Entropy: 1.927119.
Iteration 7185: Policy loss: -0.006090. Value loss: 0.054023. Entropy: 1.931560.
episode: 2639   score: 470.0  epsilon: 1.0    steps: 16  evaluation reward: 703.4
episode: 264

Training network. lr: 0.000195. clip: 0.077891
Iteration 7240: Policy loss: 0.002484. Value loss: 0.150178. Entropy: 1.694251.
Iteration 7241: Policy loss: 0.000108. Value loss: 0.124261. Entropy: 1.700079.
Iteration 7242: Policy loss: -0.003221. Value loss: 0.122842. Entropy: 1.704043.
episode: 2664   score: 280.0  epsilon: 1.0    steps: 672  evaluation reward: 675.8
Training network. lr: 0.000195. clip: 0.077891
Iteration 7243: Policy loss: 0.004885. Value loss: 0.195960. Entropy: 1.547792.
Iteration 7244: Policy loss: -0.001578. Value loss: 0.133932. Entropy: 1.536983.
Iteration 7245: Policy loss: -0.004188. Value loss: 0.122353. Entropy: 1.534509.
Training network. lr: 0.000195. clip: 0.077891
Iteration 7246: Policy loss: 0.001187. Value loss: 0.279398. Entropy: 1.757347.
Iteration 7247: Policy loss: -0.003835. Value loss: 0.222316. Entropy: 1.757518.
Iteration 7248: Policy loss: -0.008015. Value loss: 0.189076. Entropy: 1.742820.
episode: 2665   score: 670.0  epsilon: 1.0    steps

Iteration 7304: Policy loss: 0.002365. Value loss: 0.153178. Entropy: 1.218442.
Iteration 7305: Policy loss: -0.002793. Value loss: 0.134674. Entropy: 1.200433.
Training network. lr: 0.000194. clip: 0.077577
Iteration 7306: Policy loss: 0.007188. Value loss: 0.638277. Entropy: 1.442447.
Iteration 7307: Policy loss: 0.014197. Value loss: 0.457826. Entropy: 1.430284.
Iteration 7308: Policy loss: 0.009078. Value loss: 0.359732. Entropy: 1.436421.
episode: 2689   score: 750.0  epsilon: 1.0    steps: 680  evaluation reward: 652.1
Training network. lr: 0.000194. clip: 0.077577
Iteration 7309: Policy loss: 0.008545. Value loss: 0.058475. Entropy: 1.582384.
Iteration 7310: Policy loss: -0.004295. Value loss: 0.033059. Entropy: 1.572528.
Iteration 7311: Policy loss: -0.007110. Value loss: 0.026975. Entropy: 1.574970.
episode: 2690   score: 730.0  epsilon: 1.0    steps: 560  evaluation reward: 647.9
Training network. lr: 0.000194. clip: 0.077577
Iteration 7312: Policy loss: 0.002365. Value loss:

Iteration 7369: Policy loss: 0.006653. Value loss: 1.092616. Entropy: 1.445078.
Iteration 7370: Policy loss: 0.015052. Value loss: 0.808112. Entropy: 1.436315.
Iteration 7371: Policy loss: 0.018443. Value loss: 0.733959. Entropy: 1.438106.
Training network. lr: 0.000194. clip: 0.077430
Iteration 7372: Policy loss: 0.006643. Value loss: 0.121240. Entropy: 1.232285.
Iteration 7373: Policy loss: -0.002084. Value loss: 0.070394. Entropy: 1.233895.
Iteration 7374: Policy loss: -0.002765. Value loss: 0.060591. Entropy: 1.240690.
episode: 2712   score: 790.0  epsilon: 1.0    steps: 896  evaluation reward: 678.1
episode: 2713   score: 2350.0  epsilon: 1.0    steps: 936  evaluation reward: 695.7
Training network. lr: 0.000194. clip: 0.077430
Iteration 7375: Policy loss: 0.007247. Value loss: 0.314471. Entropy: 1.312160.
Iteration 7376: Policy loss: 0.004868. Value loss: 0.255190. Entropy: 1.300472.
Iteration 7377: Policy loss: 0.001280. Value loss: 0.229126. Entropy: 1.313329.
episode: 2714   s

Iteration 7433: Policy loss: 0.003440. Value loss: 0.133238. Entropy: 1.021407.
Iteration 7434: Policy loss: -0.001266. Value loss: 0.125881. Entropy: 1.019878.
episode: 2737   score: 790.0  epsilon: 1.0    steps: 392  evaluation reward: 756.8
Training network. lr: 0.000193. clip: 0.077273
Iteration 7435: Policy loss: 0.007019. Value loss: 0.267908. Entropy: 1.422487.
Iteration 7436: Policy loss: -0.004025. Value loss: 0.164483. Entropy: 1.454172.
Iteration 7437: Policy loss: -0.014952. Value loss: 0.137353. Entropy: 1.462129.
episode: 2738   score: 720.0  epsilon: 1.0    steps: 496  evaluation reward: 759.1
episode: 2739   score: 780.0  epsilon: 1.0    steps: 624  evaluation reward: 762.2
Training network. lr: 0.000193. clip: 0.077273
Iteration 7438: Policy loss: 0.005481. Value loss: 0.313159. Entropy: 1.352883.
Iteration 7439: Policy loss: 0.000837. Value loss: 0.213856. Entropy: 1.374174.
Iteration 7440: Policy loss: -0.003716. Value loss: 0.176024. Entropy: 1.353467.
episode: 2740

Iteration 7498: Policy loss: 0.007387. Value loss: 0.062614. Entropy: 1.579356.
Iteration 7499: Policy loss: -0.003803. Value loss: 0.040934. Entropy: 1.587785.
Iteration 7500: Policy loss: 0.001024. Value loss: 0.040322. Entropy: 1.587220.
episode: 2760   score: 1060.0  epsilon: 1.0    steps: 216  evaluation reward: 826.7
episode: 2761   score: 650.0  epsilon: 1.0    steps: 376  evaluation reward: 827.1
episode: 2762   score: 840.0  epsilon: 1.0    steps: 536  evaluation reward: 828.5
episode: 2763   score: 1230.0  epsilon: 1.0    steps: 968  evaluation reward: 835.0
Training network. lr: 0.000192. clip: 0.076969
Iteration 7501: Policy loss: 0.007052. Value loss: 0.276184. Entropy: 1.489372.
Iteration 7502: Policy loss: -0.000405. Value loss: 0.206627. Entropy: 1.480799.
Iteration 7503: Policy loss: -0.001702. Value loss: 0.185727. Entropy: 1.495514.
Training network. lr: 0.000192. clip: 0.076969
Iteration 7504: Policy loss: 0.001998. Value loss: 0.229691. Entropy: 1.294660.
Iteration

Iteration 7562: Policy loss: 0.003292. Value loss: 0.140340. Entropy: 1.315263.
Iteration 7563: Policy loss: -0.002433. Value loss: 0.127977. Entropy: 1.311451.
Training network. lr: 0.000192. clip: 0.076813
Iteration 7564: Policy loss: 0.009507. Value loss: 0.090466. Entropy: 1.694978.
Iteration 7565: Policy loss: 0.002818. Value loss: 0.039773. Entropy: 1.666288.
Iteration 7566: Policy loss: -0.006645. Value loss: 0.030329. Entropy: 1.661968.
episode: 2785   score: 800.0  epsilon: 1.0    steps: 384  evaluation reward: 881.7
Training network. lr: 0.000192. clip: 0.076813
Iteration 7567: Policy loss: 0.011083. Value loss: 0.286603. Entropy: 1.840784.
Iteration 7568: Policy loss: 0.008650. Value loss: 0.188375. Entropy: 1.824774.
Iteration 7569: Policy loss: 0.003817. Value loss: 0.154360. Entropy: 1.841174.
episode: 2786   score: 1090.0  epsilon: 1.0    steps: 704  evaluation reward: 885.3
Training network. lr: 0.000192. clip: 0.076813
Iteration 7570: Policy loss: 0.007737. Value loss:

Iteration 7629: Policy loss: -0.000468. Value loss: 0.070590. Entropy: 1.055889.
episode: 2806   score: 1310.0  epsilon: 1.0    steps: 56  evaluation reward: 895.3
episode: 2807   score: 1070.0  epsilon: 1.0    steps: 64  evaluation reward: 899.2
episode: 2808   score: 2140.0  epsilon: 1.0    steps: 928  evaluation reward: 912.9
Training network. lr: 0.000192. clip: 0.076656
Iteration 7630: Policy loss: 0.007133. Value loss: 0.188582. Entropy: 1.258875.
Iteration 7631: Policy loss: 0.002283. Value loss: 0.158758. Entropy: 1.259607.
Iteration 7632: Policy loss: 0.000025. Value loss: 0.152977. Entropy: 1.267341.
episode: 2809   score: 670.0  epsilon: 1.0    steps: 528  evaluation reward: 911.2
episode: 2810   score: 820.0  epsilon: 1.0    steps: 976  evaluation reward: 911.7
Training network. lr: 0.000192. clip: 0.076656
Iteration 7633: Policy loss: 0.001352. Value loss: 0.169959. Entropy: 1.282104.
Iteration 7634: Policy loss: -0.000370. Value loss: 0.116076. Entropy: 1.276432.
Iteratio

Training network. lr: 0.000191. clip: 0.076508
Iteration 7693: Policy loss: -0.000426. Value loss: 0.205054. Entropy: 1.295461.
Iteration 7694: Policy loss: -0.010900. Value loss: 0.146501. Entropy: 1.303261.
Iteration 7695: Policy loss: -0.012921. Value loss: 0.139679. Entropy: 1.309415.
Training network. lr: 0.000191. clip: 0.076508
Iteration 7696: Policy loss: 0.007609. Value loss: 0.093117. Entropy: 1.439083.
Iteration 7697: Policy loss: -0.003980. Value loss: 0.055324. Entropy: 1.407750.
Iteration 7698: Policy loss: -0.003904. Value loss: 0.041788. Entropy: 1.425352.
episode: 2831   score: 1030.0  epsilon: 1.0    steps: 104  evaluation reward: 888.2
episode: 2832   score: 430.0  epsilon: 1.0    steps: 360  evaluation reward: 885.5
episode: 2833   score: 910.0  epsilon: 1.0    steps: 448  evaluation reward: 886.9
episode: 2834   score: 690.0  epsilon: 1.0    steps: 616  evaluation reward: 887.3
Training network. lr: 0.000191. clip: 0.076508
Iteration 7699: Policy loss: 0.004894. Va

Iteration 7757: Policy loss: -0.003623. Value loss: 0.135640. Entropy: 1.013955.
Iteration 7758: Policy loss: -0.006457. Value loss: 0.112495. Entropy: 1.026114.
Training network. lr: 0.000190. clip: 0.076195
Iteration 7759: Policy loss: 0.005938. Value loss: 0.220381. Entropy: 0.904043.
Iteration 7760: Policy loss: 0.001749. Value loss: 0.159087. Entropy: 0.894035.
Iteration 7761: Policy loss: -0.000596. Value loss: 0.134521. Entropy: 0.906108.
episode: 2855   score: 850.0  epsilon: 1.0    steps: 48  evaluation reward: 887.6
episode: 2856   score: 820.0  epsilon: 1.0    steps: 392  evaluation reward: 887.4
Training network. lr: 0.000190. clip: 0.076195
Iteration 7762: Policy loss: 0.001272. Value loss: 0.323493. Entropy: 1.148079.
Iteration 7763: Policy loss: -0.001358. Value loss: 0.221487. Entropy: 1.155763.
Iteration 7764: Policy loss: -0.001696. Value loss: 0.205414. Entropy: 1.156712.
Training network. lr: 0.000190. clip: 0.076195
Iteration 7765: Policy loss: 0.000201. Value loss

Iteration 7824: Policy loss: 0.000079. Value loss: 0.076269. Entropy: 1.155985.
episode: 2877   score: 780.0  epsilon: 1.0    steps: 936  evaluation reward: 909.6
Training network. lr: 0.000190. clip: 0.076048
Iteration 7825: Policy loss: 0.005897. Value loss: 0.166739. Entropy: 1.239596.
Iteration 7826: Policy loss: 0.001278. Value loss: 0.105619. Entropy: 1.230692.
Iteration 7827: Policy loss: -0.000825. Value loss: 0.084539. Entropy: 1.230038.
Training network. lr: 0.000190. clip: 0.076048
Iteration 7828: Policy loss: 0.006419. Value loss: 0.228796. Entropy: 1.350053.
Iteration 7829: Policy loss: 0.009948. Value loss: 0.161202. Entropy: 1.354924.
Iteration 7830: Policy loss: 0.003137. Value loss: 0.134765. Entropy: 1.338447.
episode: 2878   score: 680.0  epsilon: 1.0    steps: 864  evaluation reward: 909.7
episode: 2879   score: 1230.0  epsilon: 1.0    steps: 992  evaluation reward: 910.1
Training network. lr: 0.000190. clip: 0.076048
Iteration 7831: Policy loss: 0.004901. Value los

episode: 2899   score: 780.0  epsilon: 1.0    steps: 944  evaluation reward: 954.3
Training network. lr: 0.000190. clip: 0.075891
Iteration 7891: Policy loss: 0.002753. Value loss: 0.113180. Entropy: 1.566999.
Iteration 7892: Policy loss: -0.006224. Value loss: 0.069404. Entropy: 1.572964.
Iteration 7893: Policy loss: -0.008950. Value loss: 0.050011. Entropy: 1.582254.
episode: 2900   score: 1260.0  epsilon: 1.0    steps: 656  evaluation reward: 958.8
now time :  2019-03-06 04:52:09.405919
episode: 2901   score: 990.0  epsilon: 1.0    steps: 888  evaluation reward: 961.1
Training network. lr: 0.000190. clip: 0.075891
Iteration 7894: Policy loss: 0.006173. Value loss: 0.475443. Entropy: 1.230672.
Iteration 7895: Policy loss: 0.001331. Value loss: 0.356342. Entropy: 1.211852.
Iteration 7896: Policy loss: 0.001175. Value loss: 0.323777. Entropy: 1.233529.
Training network. lr: 0.000190. clip: 0.075891
Iteration 7897: Policy loss: 0.002328. Value loss: 0.213508. Entropy: 1.486928.
Iteratio

Iteration 7956: Policy loss: 0.000316. Value loss: 0.139009. Entropy: 1.162437.
episode: 2922   score: 1030.0  epsilon: 1.0    steps: 152  evaluation reward: 944.1
episode: 2923   score: 780.0  epsilon: 1.0    steps: 752  evaluation reward: 944.0
Training network. lr: 0.000189. clip: 0.075587
Iteration 7957: Policy loss: 0.000046. Value loss: 0.153525. Entropy: 1.218220.
Iteration 7958: Policy loss: -0.004838. Value loss: 0.130658. Entropy: 1.232296.
Iteration 7959: Policy loss: -0.008915. Value loss: 0.124733. Entropy: 1.201245.
Training network. lr: 0.000189. clip: 0.075587
Iteration 7960: Policy loss: 0.005191. Value loss: 0.490124. Entropy: 1.316970.
Iteration 7961: Policy loss: -0.002130. Value loss: 0.400779. Entropy: 1.302426.
Iteration 7962: Policy loss: -0.007002. Value loss: 0.347902. Entropy: 1.291798.
Training network. lr: 0.000189. clip: 0.075587
Iteration 7963: Policy loss: 0.003611. Value loss: 0.161548. Entropy: 1.450103.
Iteration 7964: Policy loss: 0.001645. Value los

Training network. lr: 0.000189. clip: 0.075430
Iteration 8020: Policy loss: -0.001597. Value loss: 0.352631. Entropy: 1.515587.
Iteration 8021: Policy loss: -0.002208. Value loss: 0.283231. Entropy: 1.530394.
Iteration 8022: Policy loss: -0.001924. Value loss: 0.245208. Entropy: 1.536106.
Training network. lr: 0.000189. clip: 0.075430
Iteration 8023: Policy loss: 0.003331. Value loss: 0.211288. Entropy: 1.311984.
Iteration 8024: Policy loss: -0.001196. Value loss: 0.181763. Entropy: 1.299736.
Iteration 8025: Policy loss: -0.002027. Value loss: 0.170821. Entropy: 1.300098.
Training network. lr: 0.000189. clip: 0.075430
Iteration 8026: Policy loss: 0.003207. Value loss: 0.225008. Entropy: 1.531077.
Iteration 8027: Policy loss: 0.003889. Value loss: 0.170218. Entropy: 1.527873.
Iteration 8028: Policy loss: -0.001054. Value loss: 0.145399. Entropy: 1.530066.
Training network. lr: 0.000189. clip: 0.075430
Iteration 8029: Policy loss: 0.004632. Value loss: 0.242657. Entropy: 1.535518.
Iterat

Training network. lr: 0.000188. clip: 0.075273
Iteration 8086: Policy loss: 0.004980. Value loss: 0.071982. Entropy: 1.570489.
Iteration 8087: Policy loss: 0.004598. Value loss: 0.048023. Entropy: 1.569773.
Iteration 8088: Policy loss: -0.004590. Value loss: 0.042602. Entropy: 1.564636.
episode: 2970   score: 880.0  epsilon: 1.0    steps: 352  evaluation reward: 923.6
Training network. lr: 0.000188. clip: 0.075273
Iteration 8089: Policy loss: 0.003305. Value loss: 0.272452. Entropy: 1.352519.
Iteration 8090: Policy loss: 0.001476. Value loss: 0.233833. Entropy: 1.340792.
Iteration 8091: Policy loss: -0.003585. Value loss: 0.223643. Entropy: 1.337548.
episode: 2971   score: 870.0  epsilon: 1.0    steps: 24  evaluation reward: 920.2
Training network. lr: 0.000188. clip: 0.075273
Iteration 8092: Policy loss: 0.006597. Value loss: 0.168300. Entropy: 1.451039.
Iteration 8093: Policy loss: -0.001496. Value loss: 0.126260. Entropy: 1.452221.
Iteration 8094: Policy loss: -0.005006. Value loss:

Iteration 8153: Policy loss: 0.000173. Value loss: 0.102846. Entropy: 1.320262.
Iteration 8154: Policy loss: -0.000464. Value loss: 0.093227. Entropy: 1.316490.
episode: 2991   score: 980.0  epsilon: 1.0    steps: 240  evaluation reward: 920.8
Training network. lr: 0.000187. clip: 0.074969
Iteration 8155: Policy loss: 0.001921. Value loss: 0.136104. Entropy: 1.349476.
Iteration 8156: Policy loss: -0.000834. Value loss: 0.110732. Entropy: 1.345970.
Iteration 8157: Policy loss: -0.003532. Value loss: 0.103174. Entropy: 1.344889.
Training network. lr: 0.000187. clip: 0.074969
Iteration 8158: Policy loss: 0.004246. Value loss: 0.101841. Entropy: 1.685585.
Iteration 8159: Policy loss: 0.000710. Value loss: 0.051327. Entropy: 1.701289.
Iteration 8160: Policy loss: -0.001523. Value loss: 0.040230. Entropy: 1.687562.
episode: 2992   score: 850.0  epsilon: 1.0    steps: 576  evaluation reward: 910.0
episode: 2993   score: 900.0  epsilon: 1.0    steps: 584  evaluation reward: 910.3
episode: 2994

Iteration 8219: Policy loss: -0.000694. Value loss: 0.088610. Entropy: 1.410440.
Iteration 8220: Policy loss: -0.005098. Value loss: 0.076225. Entropy: 1.414626.
Training network. lr: 0.000187. clip: 0.074813
Iteration 8221: Policy loss: 0.002741. Value loss: 0.092283. Entropy: 1.681904.
Iteration 8222: Policy loss: -0.001152. Value loss: 0.057016. Entropy: 1.669324.
Iteration 8223: Policy loss: -0.006223. Value loss: 0.048824. Entropy: 1.672935.
episode: 3013   score: 940.0  epsilon: 1.0    steps: 560  evaluation reward: 922.8
episode: 3014   score: 860.0  epsilon: 1.0    steps: 792  evaluation reward: 925.1
Training network. lr: 0.000187. clip: 0.074813
Iteration 8224: Policy loss: 0.007160. Value loss: 0.088561. Entropy: 1.681334.
Iteration 8225: Policy loss: -0.000893. Value loss: 0.053601. Entropy: 1.679580.
Iteration 8226: Policy loss: -0.005903. Value loss: 0.043717. Entropy: 1.673501.
episode: 3015   score: 760.0  epsilon: 1.0    steps: 408  evaluation reward: 926.2
episode: 30

Iteration 8286: Policy loss: -0.004410. Value loss: 0.127359. Entropy: 1.742645.
Training network. lr: 0.000187. clip: 0.074665
Iteration 8287: Policy loss: 0.003255. Value loss: 0.160167. Entropy: 1.944914.
Iteration 8288: Policy loss: 0.000473. Value loss: 0.107546. Entropy: 1.943464.
Iteration 8289: Policy loss: -0.005400. Value loss: 0.097329. Entropy: 1.942122.
Training network. lr: 0.000187. clip: 0.074665
Iteration 8290: Policy loss: 0.001800. Value loss: 0.101061. Entropy: 1.877453.
Iteration 8291: Policy loss: 0.001600. Value loss: 0.058807. Entropy: 1.869458.
Iteration 8292: Policy loss: -0.006921. Value loss: 0.045221. Entropy: 1.866030.
episode: 3035   score: 1080.0  epsilon: 1.0    steps: 128  evaluation reward: 947.0
episode: 3036   score: 970.0  epsilon: 1.0    steps: 752  evaluation reward: 949.7
episode: 3037   score: 980.0  epsilon: 1.0    steps: 840  evaluation reward: 947.5
episode: 3038   score: 630.0  epsilon: 1.0    steps: 856  evaluation reward: 947.5
Training n

episode: 3056   score: 790.0  epsilon: 1.0    steps: 784  evaluation reward: 990.0
Training network. lr: 0.000186. clip: 0.074352
Iteration 8353: Policy loss: 0.007198. Value loss: 0.279125. Entropy: 1.605567.
Iteration 8354: Policy loss: 0.000371. Value loss: 0.221766. Entropy: 1.610341.
Iteration 8355: Policy loss: -0.001048. Value loss: 0.205414. Entropy: 1.617754.
episode: 3057   score: 670.0  epsilon: 1.0    steps: 1000  evaluation reward: 986.7
Training network. lr: 0.000186. clip: 0.074352
Iteration 8356: Policy loss: 0.001968. Value loss: 0.146026. Entropy: 1.705180.
Iteration 8357: Policy loss: -0.001207. Value loss: 0.090637. Entropy: 1.695754.
Iteration 8358: Policy loss: -0.002408. Value loss: 0.075704. Entropy: 1.703139.
episode: 3058   score: 1070.0  epsilon: 1.0    steps: 48  evaluation reward: 991.6
episode: 3059   score: 750.0  epsilon: 1.0    steps: 280  evaluation reward: 988.5
episode: 3060   score: 540.0  epsilon: 1.0    steps: 512  evaluation reward: 984.8
Trainin

Iteration 8418: Policy loss: -0.008268. Value loss: 0.315407. Entropy: 1.138997.
episode: 3080   score: 1120.0  epsilon: 1.0    steps: 432  evaluation reward: 1001.7
Training network. lr: 0.000186. clip: 0.074204
Iteration 8419: Policy loss: 0.004856. Value loss: 0.218183. Entropy: 1.167708.
Iteration 8420: Policy loss: 0.000763. Value loss: 0.180001. Entropy: 1.175046.
Iteration 8421: Policy loss: -0.002202. Value loss: 0.161506. Entropy: 1.165761.
Training network. lr: 0.000186. clip: 0.074204
Iteration 8422: Policy loss: 0.011692. Value loss: 0.352803. Entropy: 1.519384.
Iteration 8423: Policy loss: 0.007169. Value loss: 0.253544. Entropy: 1.506298.
Iteration 8424: Policy loss: 0.003996. Value loss: 0.195662. Entropy: 1.518214.
episode: 3081   score: 1030.0  epsilon: 1.0    steps: 224  evaluation reward: 1007.3
Training network. lr: 0.000186. clip: 0.074204
Iteration 8425: Policy loss: 0.006835. Value loss: 0.392462. Entropy: 1.227410.
Iteration 8426: Policy loss: 0.001975. Value lo

Iteration 8484: Policy loss: -0.003720. Value loss: 0.233268. Entropy: 1.198836.
episode: 3102   score: 550.0  epsilon: 1.0    steps: 336  evaluation reward: 989.3
Training network. lr: 0.000185. clip: 0.074048
Iteration 8485: Policy loss: 0.004247. Value loss: 0.216063. Entropy: 1.465276.
Iteration 8486: Policy loss: -0.001863. Value loss: 0.151197. Entropy: 1.463124.
Iteration 8487: Policy loss: -0.004506. Value loss: 0.131734. Entropy: 1.460864.
episode: 3103   score: 820.0  epsilon: 1.0    steps: 600  evaluation reward: 988.4
Training network. lr: 0.000185. clip: 0.074048
Iteration 8488: Policy loss: 0.001014. Value loss: 0.494002. Entropy: 1.525009.
Iteration 8489: Policy loss: 0.008810. Value loss: 0.302671. Entropy: 1.526563.
Iteration 8490: Policy loss: 0.007422. Value loss: 0.206457. Entropy: 1.508353.
episode: 3104   score: 930.0  epsilon: 1.0    steps: 496  evaluation reward: 988.9
Training network. lr: 0.000185. clip: 0.074048
Iteration 8491: Policy loss: 0.002848. Value lo

Iteration 8549: Policy loss: 0.000762. Value loss: 0.938889. Entropy: 1.312409.
Iteration 8550: Policy loss: -0.003736. Value loss: 0.906589. Entropy: 1.297413.
Training network. lr: 0.000184. clip: 0.073744
Iteration 8551: Policy loss: 0.000575. Value loss: 0.188086. Entropy: 1.288460.
Iteration 8552: Policy loss: -0.003661. Value loss: 0.138840. Entropy: 1.305131.
Iteration 8553: Policy loss: -0.003709. Value loss: 0.104312. Entropy: 1.304557.
episode: 3126   score: 1050.0  epsilon: 1.0    steps: 240  evaluation reward: 971.1
episode: 3127   score: 890.0  epsilon: 1.0    steps: 920  evaluation reward: 965.4
Training network. lr: 0.000184. clip: 0.073744
Iteration 8554: Policy loss: 0.002405. Value loss: 0.300395. Entropy: 1.203241.
Iteration 8555: Policy loss: 0.001543. Value loss: 0.213864. Entropy: 1.208082.
Iteration 8556: Policy loss: -0.002432. Value loss: 0.187653. Entropy: 1.200243.
episode: 3128   score: 870.0  epsilon: 1.0    steps: 600  evaluation reward: 965.4
Training net

Training network. lr: 0.000184. clip: 0.073587
Iteration 8617: Policy loss: 0.000297. Value loss: 0.227849. Entropy: 1.172029.
Iteration 8618: Policy loss: -0.001776. Value loss: 0.190973. Entropy: 1.157014.
Iteration 8619: Policy loss: -0.004609. Value loss: 0.185116. Entropy: 1.158777.
episode: 3148   score: 570.0  epsilon: 1.0    steps: 488  evaluation reward: 949.2
episode: 3149   score: 990.0  epsilon: 1.0    steps: 672  evaluation reward: 950.5
Training network. lr: 0.000184. clip: 0.073587
Iteration 8620: Policy loss: 0.009086. Value loss: 0.237844. Entropy: 1.159606.
Iteration 8621: Policy loss: 0.001714. Value loss: 0.153593. Entropy: 1.179808.
Iteration 8622: Policy loss: -0.003616. Value loss: 0.133718. Entropy: 1.159453.
episode: 3150   score: 1440.0  epsilon: 1.0    steps: 592  evaluation reward: 955.4
Training network. lr: 0.000184. clip: 0.073587
Iteration 8623: Policy loss: 0.001141. Value loss: 0.287802. Entropy: 1.295998.
Iteration 8624: Policy loss: -0.002328. Value 

Training network. lr: 0.000184. clip: 0.073430
Iteration 8683: Policy loss: 0.003429. Value loss: 0.239169. Entropy: 1.514536.
Iteration 8684: Policy loss: 0.000230. Value loss: 0.187931. Entropy: 1.538708.
Iteration 8685: Policy loss: -0.001446. Value loss: 0.173250. Entropy: 1.553170.
episode: 3170   score: 960.0  epsilon: 1.0    steps: 24  evaluation reward: 952.3
episode: 3171   score: 800.0  epsilon: 1.0    steps: 160  evaluation reward: 951.9
Training network. lr: 0.000184. clip: 0.073430
Iteration 8686: Policy loss: -0.000969. Value loss: 0.199070. Entropy: 0.957968.
Iteration 8687: Policy loss: -0.002472. Value loss: 0.192577. Entropy: 0.964653.
Iteration 8688: Policy loss: -0.001859. Value loss: 0.184584. Entropy: 0.971951.
Training network. lr: 0.000184. clip: 0.073430
Iteration 8689: Policy loss: 0.003454. Value loss: 0.087175. Entropy: 1.353556.
Iteration 8690: Policy loss: -0.001496. Value loss: 0.042360. Entropy: 1.365577.
Iteration 8691: Policy loss: -0.007496. Value los

Iteration 8749: Policy loss: 0.006031. Value loss: 0.476661. Entropy: 1.570120.
Iteration 8750: Policy loss: 0.004471. Value loss: 0.404352. Entropy: 1.553035.
Iteration 8751: Policy loss: -0.001196. Value loss: 0.370979. Entropy: 1.554018.
episode: 3192   score: 750.0  epsilon: 1.0    steps: 736  evaluation reward: 939.9
episode: 3193   score: 860.0  epsilon: 1.0    steps: 808  evaluation reward: 939.8
Training network. lr: 0.000183. clip: 0.073126
Iteration 8752: Policy loss: 0.002206. Value loss: 0.165582. Entropy: 1.527148.
Iteration 8753: Policy loss: -0.003488. Value loss: 0.122428. Entropy: 1.507323.
Iteration 8754: Policy loss: -0.003970. Value loss: 0.114929. Entropy: 1.516940.
Training network. lr: 0.000183. clip: 0.073126
Iteration 8755: Policy loss: 0.003303. Value loss: 0.164219. Entropy: 1.453851.
Iteration 8756: Policy loss: -0.001010. Value loss: 0.145069. Entropy: 1.464063.
Iteration 8757: Policy loss: -0.001608. Value loss: 0.145584. Entropy: 1.444878.
episode: 3194  

Iteration 8816: Policy loss: -0.002132. Value loss: 0.028241. Entropy: 1.535926.
Iteration 8817: Policy loss: -0.002661. Value loss: 0.023170. Entropy: 1.559976.
episode: 3213   score: 920.0  epsilon: 1.0    steps: 320  evaluation reward: 944.4
episode: 3214   score: 1270.0  epsilon: 1.0    steps: 640  evaluation reward: 949.4
episode: 3215   score: 1110.0  epsilon: 1.0    steps: 712  evaluation reward: 952.1
episode: 3216   score: 890.0  epsilon: 1.0    steps: 944  evaluation reward: 956.3
Training network. lr: 0.000182. clip: 0.072969
Iteration 8818: Policy loss: 0.000187. Value loss: 0.245440. Entropy: 1.511391.
Iteration 8819: Policy loss: 0.000375. Value loss: 0.195719. Entropy: 1.514905.
Iteration 8820: Policy loss: -0.002518. Value loss: 0.169870. Entropy: 1.528635.
episode: 3217   score: 850.0  epsilon: 1.0    steps: 136  evaluation reward: 956.5
Training network. lr: 0.000182. clip: 0.072969
Iteration 8821: Policy loss: 0.001555. Value loss: 0.263339. Entropy: 0.915222.
Iterat

Iteration 8882: Policy loss: 0.000816. Value loss: 0.416648. Entropy: 1.151694.
Iteration 8883: Policy loss: -0.004146. Value loss: 0.355185. Entropy: 1.168939.
episode: 3236   score: 840.0  epsilon: 1.0    steps: 720  evaluation reward: 942.6
Training network. lr: 0.000182. clip: 0.072822
Iteration 8884: Policy loss: 0.002525. Value loss: 0.074497. Entropy: 1.022947.
Iteration 8885: Policy loss: -0.003764. Value loss: 0.043559. Entropy: 1.016795.
Iteration 8886: Policy loss: -0.005731. Value loss: 0.038568. Entropy: 1.011303.
episode: 3237   score: 910.0  epsilon: 1.0    steps: 328  evaluation reward: 943.1
Training network. lr: 0.000182. clip: 0.072822
Iteration 8887: Policy loss: 0.000451. Value loss: 0.150612. Entropy: 0.880476.
Iteration 8888: Policy loss: -0.000953. Value loss: 0.113892. Entropy: 0.882454.
Iteration 8889: Policy loss: -0.004516. Value loss: 0.105976. Entropy: 0.874638.
episode: 3238   score: 1460.0  epsilon: 1.0    steps: 56  evaluation reward: 949.4
episode: 323

Training network. lr: 0.000182. clip: 0.072665
Iteration 8947: Policy loss: 0.004007. Value loss: 0.291253. Entropy: 1.313903.
Iteration 8948: Policy loss: -0.002365. Value loss: 0.198685. Entropy: 1.304760.
Iteration 8949: Policy loss: -0.003975. Value loss: 0.146924. Entropy: 1.321546.
episode: 3260   score: 700.0  epsilon: 1.0    steps: 784  evaluation reward: 950.3
Training network. lr: 0.000182. clip: 0.072665
Iteration 8950: Policy loss: 0.003741. Value loss: 0.095816. Entropy: 1.018466.
Iteration 8951: Policy loss: -0.003477. Value loss: 0.054215. Entropy: 1.002322.
Iteration 8952: Policy loss: -0.007354. Value loss: 0.041707. Entropy: 0.986359.
Training network. lr: 0.000181. clip: 0.072509
Iteration 8953: Policy loss: 0.005398. Value loss: 0.317105. Entropy: 0.931564.
Iteration 8954: Policy loss: 0.004914. Value loss: 0.218255. Entropy: 0.928392.
Iteration 8955: Policy loss: 0.000292. Value loss: 0.203495. Entropy: 0.924857.
episode: 3261   score: 1030.0  epsilon: 1.0    steps

Iteration 9013: Policy loss: 0.003149. Value loss: 0.484341. Entropy: 1.385279.
Iteration 9014: Policy loss: 0.001102. Value loss: 0.395881. Entropy: 1.382106.
Iteration 9015: Policy loss: -0.002835. Value loss: 0.380925. Entropy: 1.390466.
episode: 3282   score: 1190.0  epsilon: 1.0    steps: 136  evaluation reward: 1002.5
episode: 3283   score: 1180.0  epsilon: 1.0    steps: 648  evaluation reward: 1003.4
Training network. lr: 0.000181. clip: 0.072361
Iteration 9016: Policy loss: 0.000972. Value loss: 0.238787. Entropy: 0.928471.
Iteration 9017: Policy loss: -0.000130. Value loss: 0.217446. Entropy: 0.914119.
Iteration 9018: Policy loss: -0.000770. Value loss: 0.212250. Entropy: 0.919286.
Training network. lr: 0.000181. clip: 0.072361
Iteration 9019: Policy loss: 0.006665. Value loss: 0.168173. Entropy: 1.121563.
Iteration 9020: Policy loss: 0.003204. Value loss: 0.117619. Entropy: 1.106814.
Iteration 9021: Policy loss: -0.003787. Value loss: 0.097792. Entropy: 1.109983.
episode: 328

Training network. lr: 0.000181. clip: 0.072205
Iteration 9079: Policy loss: 0.004040. Value loss: 0.105216. Entropy: 1.253417.
Iteration 9080: Policy loss: -0.005717. Value loss: 0.055618. Entropy: 1.238198.
Iteration 9081: Policy loss: -0.010707. Value loss: 0.050486. Entropy: 1.236789.
episode: 3305   score: 1250.0  epsilon: 1.0    steps: 144  evaluation reward: 1027.1
Training network. lr: 0.000181. clip: 0.072205
Iteration 9082: Policy loss: 0.005075. Value loss: 0.348259. Entropy: 0.667513.
Iteration 9083: Policy loss: 0.005585. Value loss: 0.299831. Entropy: 0.734757.
Iteration 9084: Policy loss: -0.003108. Value loss: 0.281889. Entropy: 0.717881.
Training network. lr: 0.000181. clip: 0.072205
Iteration 9085: Policy loss: 0.001852. Value loss: 0.182334. Entropy: 1.097251.
Iteration 9086: Policy loss: -0.005009. Value loss: 0.111392. Entropy: 1.096280.
Iteration 9087: Policy loss: -0.006680. Value loss: 0.084215. Entropy: 1.076297.
Training network. lr: 0.000181. clip: 0.072205
It

episode: 3327   score: 780.0  epsilon: 1.0    steps: 632  evaluation reward: 1035.5
Training network. lr: 0.000180. clip: 0.072048
Iteration 9145: Policy loss: 0.002046. Value loss: 0.384199. Entropy: 0.938472.
Iteration 9146: Policy loss: -0.005189. Value loss: 0.280396. Entropy: 0.960500.
Iteration 9147: Policy loss: -0.007217. Value loss: 0.253594. Entropy: 0.953372.
episode: 3328   score: 1910.0  epsilon: 1.0    steps: 632  evaluation reward: 1043.4
Training network. lr: 0.000180. clip: 0.072048
Iteration 9148: Policy loss: 0.001195. Value loss: 0.256571. Entropy: 1.345872.
Iteration 9149: Policy loss: -0.002809. Value loss: 0.167727. Entropy: 1.334904.
Iteration 9150: Policy loss: -0.009567. Value loss: 0.121592. Entropy: 1.353555.
episode: 3329   score: 950.0  epsilon: 1.0    steps: 368  evaluation reward: 1042.3
Training network. lr: 0.000180. clip: 0.071900
Iteration 9151: Policy loss: 0.008518. Value loss: 0.331602. Entropy: 1.549242.
Iteration 9152: Policy loss: -0.004801. Va

Iteration 9209: Policy loss: -0.002028. Value loss: 0.328285. Entropy: 1.395286.
Iteration 9210: Policy loss: -0.003784. Value loss: 0.302125. Entropy: 1.387902.
Training network. lr: 0.000179. clip: 0.071744
Iteration 9211: Policy loss: 0.004535. Value loss: 0.172044. Entropy: 1.743439.
Iteration 9212: Policy loss: -0.002221. Value loss: 0.098416. Entropy: 1.732647.
Iteration 9213: Policy loss: -0.002075. Value loss: 0.075646. Entropy: 1.735898.
Training network. lr: 0.000179. clip: 0.071744
Iteration 9214: Policy loss: -0.000170. Value loss: 0.084003. Entropy: 1.602425.
Iteration 9215: Policy loss: -0.003373. Value loss: 0.049402. Entropy: 1.592327.
Iteration 9216: Policy loss: -0.006527. Value loss: 0.040739. Entropy: 1.594979.
Training network. lr: 0.000179. clip: 0.071744
Iteration 9217: Policy loss: 0.004186. Value loss: 0.063221. Entropy: 1.645745.
Iteration 9218: Policy loss: -0.000070. Value loss: 0.036659. Entropy: 1.661140.
Iteration 9219: Policy loss: -0.002578. Value loss:

Training network. lr: 0.000179. clip: 0.071587
Iteration 9274: Policy loss: 0.003418. Value loss: 0.197206. Entropy: 1.912304.
Iteration 9275: Policy loss: 0.001475. Value loss: 0.138196. Entropy: 1.914386.
Iteration 9276: Policy loss: -0.006333. Value loss: 0.109608. Entropy: 1.919551.
episode: 3374   score: 900.0  epsilon: 1.0    steps: 8  evaluation reward: 1014.6
Training network. lr: 0.000179. clip: 0.071587
Iteration 9277: Policy loss: 0.005770. Value loss: 0.246329. Entropy: 1.378302.
Iteration 9278: Policy loss: 0.000025. Value loss: 0.175938. Entropy: 1.389695.
Iteration 9279: Policy loss: -0.006150. Value loss: 0.151137. Entropy: 1.392507.
episode: 3375   score: 970.0  epsilon: 1.0    steps: 728  evaluation reward: 1015.4
Training network. lr: 0.000179. clip: 0.071587
Iteration 9280: Policy loss: 0.001048. Value loss: 0.145289. Entropy: 1.469715.
Iteration 9281: Policy loss: -0.002686. Value loss: 0.083115. Entropy: 1.461713.
Iteration 9282: Policy loss: -0.005598. Value loss

Iteration 9339: Policy loss: -0.003869. Value loss: 0.168056. Entropy: 1.134533.
episode: 3397   score: 1090.0  epsilon: 1.0    steps: 176  evaluation reward: 1021.3
Training network. lr: 0.000179. clip: 0.071440
Iteration 9340: Policy loss: 0.003813. Value loss: 0.368876. Entropy: 1.002751.
Iteration 9341: Policy loss: 0.002976. Value loss: 0.322374. Entropy: 0.982051.
Iteration 9342: Policy loss: -0.000343. Value loss: 0.303186. Entropy: 0.996716.
episode: 3398   score: 1050.0  epsilon: 1.0    steps: 736  evaluation reward: 1023.0
Training network. lr: 0.000179. clip: 0.071440
Iteration 9343: Policy loss: 0.005462. Value loss: 0.139698. Entropy: 1.431855.
Iteration 9344: Policy loss: 0.001923. Value loss: 0.076432. Entropy: 1.437981.
Iteration 9345: Policy loss: -0.003190. Value loss: 0.060019. Entropy: 1.418065.
Training network. lr: 0.000179. clip: 0.071440
Iteration 9346: Policy loss: 0.002232. Value loss: 0.165912. Entropy: 1.622675.
Iteration 9347: Policy loss: 0.002349. Value l

episode: 3419   score: 1430.0  epsilon: 1.0    steps: 232  evaluation reward: 1039.3
episode: 3420   score: 860.0  epsilon: 1.0    steps: 368  evaluation reward: 1039.0
episode: 3421   score: 910.0  epsilon: 1.0    steps: 1024  evaluation reward: 1038.5
Training network. lr: 0.000178. clip: 0.071126
Iteration 9406: Policy loss: 0.005838. Value loss: 0.392913. Entropy: 0.674865.
Iteration 9407: Policy loss: -0.000275. Value loss: 0.343195. Entropy: 0.690654.
Iteration 9408: Policy loss: -0.002896. Value loss: 0.335390. Entropy: 0.681652.
Training network. lr: 0.000178. clip: 0.071126
Iteration 9409: Policy loss: 0.001192. Value loss: 0.206956. Entropy: 0.965959.
Iteration 9410: Policy loss: -0.003541. Value loss: 0.170043. Entropy: 0.945680.
Iteration 9411: Policy loss: -0.005977. Value loss: 0.151705. Entropy: 0.937705.
Training network. lr: 0.000178. clip: 0.071126
Iteration 9412: Policy loss: 0.005059. Value loss: 0.196086. Entropy: 1.550701.
Iteration 9413: Policy loss: -0.001408. V

Iteration 9473: Policy loss: -0.004535. Value loss: 0.042515. Entropy: 1.041103.
Iteration 9474: Policy loss: -0.005139. Value loss: 0.035221. Entropy: 1.057237.
episode: 3440   score: 1640.0  epsilon: 1.0    steps: 40  evaluation reward: 1042.9
episode: 3441   score: 1010.0  epsilon: 1.0    steps: 568  evaluation reward: 1046.7
episode: 3442   score: 810.0  epsilon: 1.0    steps: 840  evaluation reward: 1043.2
Training network. lr: 0.000177. clip: 0.070979
Iteration 9475: Policy loss: 0.000981. Value loss: 0.217381. Entropy: 0.709656.
Iteration 9476: Policy loss: -0.004360. Value loss: 0.185952. Entropy: 0.704862.
Iteration 9477: Policy loss: -0.005973. Value loss: 0.174936. Entropy: 0.674325.
Training network. lr: 0.000177. clip: 0.070979
Iteration 9478: Policy loss: 0.000972. Value loss: 0.141728. Entropy: 0.847827.
Iteration 9479: Policy loss: -0.003116. Value loss: 0.114625. Entropy: 0.848464.
Iteration 9480: Policy loss: -0.004252. Value loss: 0.115943. Entropy: 0.866192.
episode

Iteration 9539: Policy loss: 0.003556. Value loss: 0.168476. Entropy: 1.416783.
Iteration 9540: Policy loss: 0.002170. Value loss: 0.116028. Entropy: 1.418642.
episode: 3462   score: 980.0  epsilon: 1.0    steps: 672  evaluation reward: 1068.7
Training network. lr: 0.000177. clip: 0.070822
Iteration 9541: Policy loss: 0.006506. Value loss: 0.204536. Entropy: 1.532303.
Iteration 9542: Policy loss: -0.002537. Value loss: 0.133205. Entropy: 1.550236.
Iteration 9543: Policy loss: -0.004876. Value loss: 0.114956. Entropy: 1.543410.
episode: 3463   score: 1030.0  epsilon: 1.0    steps: 504  evaluation reward: 1072.0
Training network. lr: 0.000177. clip: 0.070822
Iteration 9544: Policy loss: 0.012439. Value loss: 0.239486. Entropy: 1.516179.
Iteration 9545: Policy loss: 0.002154. Value loss: 0.145124. Entropy: 1.495620.
Iteration 9546: Policy loss: 0.001573. Value loss: 0.127013. Entropy: 1.518899.
Training network. lr: 0.000177. clip: 0.070822
Iteration 9547: Policy loss: 0.003857. Value los

Iteration 9604: Policy loss: -0.000249. Value loss: 0.250980. Entropy: 1.453160.
Iteration 9605: Policy loss: -0.000605. Value loss: 0.241554. Entropy: 1.445750.
Iteration 9606: Policy loss: -0.004488. Value loss: 0.233791. Entropy: 1.445170.
episode: 3485   score: 1060.0  epsilon: 1.0    steps: 576  evaluation reward: 1065.3
episode: 3486   score: 1150.0  epsilon: 1.0    steps: 672  evaluation reward: 1061.4
Training network. lr: 0.000176. clip: 0.070518
Iteration 9607: Policy loss: 0.003215. Value loss: 0.123998. Entropy: 1.122358.
Iteration 9608: Policy loss: -0.003236. Value loss: 0.095029. Entropy: 1.125154.
Iteration 9609: Policy loss: -0.007931. Value loss: 0.088781. Entropy: 1.129265.
episode: 3487   score: 980.0  epsilon: 1.0    steps: 80  evaluation reward: 1049.1
Training network. lr: 0.000176. clip: 0.070518
Iteration 9610: Policy loss: 0.002432. Value loss: 0.318088. Entropy: 0.992177.
Iteration 9611: Policy loss: 0.001287. Value loss: 0.279680. Entropy: 1.004458.
Iteratio

Training network. lr: 0.000176. clip: 0.070361
Iteration 9670: Policy loss: -0.001670. Value loss: 0.169845. Entropy: 0.688428.
Iteration 9671: Policy loss: -0.004007. Value loss: 0.161198. Entropy: 0.685823.
Iteration 9672: Policy loss: -0.004477. Value loss: 0.157581. Entropy: 0.710526.
Training network. lr: 0.000176. clip: 0.070361
Iteration 9673: Policy loss: 0.000554. Value loss: 0.125667. Entropy: 1.032445.
Iteration 9674: Policy loss: -0.000037. Value loss: 0.093734. Entropy: 1.055272.
Iteration 9675: Policy loss: -0.003603. Value loss: 0.084386. Entropy: 1.046245.
episode: 3507   score: 1080.0  epsilon: 1.0    steps: 112  evaluation reward: 1068.9
Training network. lr: 0.000176. clip: 0.070361
Iteration 9676: Policy loss: -0.000052. Value loss: 0.140616. Entropy: 1.208048.
Iteration 9677: Policy loss: -0.001834. Value loss: 0.106489. Entropy: 1.225243.
Iteration 9678: Policy loss: -0.003969. Value loss: 0.091782. Entropy: 1.212599.
episode: 3508   score: 1080.0  epsilon: 1.0   

Iteration 9737: Policy loss: -0.000086. Value loss: 0.079746. Entropy: 1.351557.
Iteration 9738: Policy loss: -0.004309. Value loss: 0.061540. Entropy: 1.355056.
episode: 3528   score: 1090.0  epsilon: 1.0    steps: 408  evaluation reward: 1085.1
Training network. lr: 0.000176. clip: 0.070205
Iteration 9739: Policy loss: 0.002614. Value loss: 0.252078. Entropy: 1.344164.
Iteration 9740: Policy loss: -0.001341. Value loss: 0.188403. Entropy: 1.344203.
Iteration 9741: Policy loss: -0.003497. Value loss: 0.174672. Entropy: 1.348013.
episode: 3529   score: 880.0  epsilon: 1.0    steps: 800  evaluation reward: 1084.3
Training network. lr: 0.000176. clip: 0.070205
Iteration 9742: Policy loss: 0.001721. Value loss: 0.141614. Entropy: 1.438208.
Iteration 9743: Policy loss: -0.000066. Value loss: 0.079075. Entropy: 1.447320.
Iteration 9744: Policy loss: 0.000199. Value loss: 0.063508. Entropy: 1.449832.
episode: 3530   score: 1100.0  epsilon: 1.0    steps: 640  evaluation reward: 1085.4
episode

Training network. lr: 0.000175. clip: 0.069901
Iteration 9802: Policy loss: 0.005523. Value loss: 0.207424. Entropy: 1.567704.
Iteration 9803: Policy loss: -0.000312. Value loss: 0.132699. Entropy: 1.577553.
Iteration 9804: Policy loss: -0.001720. Value loss: 0.115839. Entropy: 1.585575.
Training network. lr: 0.000175. clip: 0.069901
Iteration 9805: Policy loss: 0.002149. Value loss: 0.734438. Entropy: 0.812595.
Iteration 9806: Policy loss: 0.000228. Value loss: 0.637853. Entropy: 0.822750.
Iteration 9807: Policy loss: -0.005133. Value loss: 0.627659. Entropy: 0.830327.
Training network. lr: 0.000175. clip: 0.069901
Iteration 9808: Policy loss: 0.000971. Value loss: 0.130476. Entropy: 1.062247.
Iteration 9809: Policy loss: -0.002986. Value loss: 0.092811. Entropy: 1.091616.
Iteration 9810: Policy loss: -0.008369. Value loss: 0.079268. Entropy: 1.104374.
episode: 3552   score: 2250.0  epsilon: 1.0    steps: 80  evaluation reward: 1117.2
episode: 3553   score: 1330.0  epsilon: 1.0    ste

Iteration 9870: Policy loss: -0.004423. Value loss: 0.126406. Entropy: 1.310599.
episode: 3572   score: 1050.0  epsilon: 1.0    steps: 216  evaluation reward: 1151.6
Training network. lr: 0.000174. clip: 0.069744
Iteration 9871: Policy loss: 0.002446. Value loss: 0.215646. Entropy: 1.382431.
Iteration 9872: Policy loss: -0.000034. Value loss: 0.136051. Entropy: 1.365658.
Iteration 9873: Policy loss: -0.002367. Value loss: 0.109295. Entropy: 1.347641.
episode: 3573   score: 1510.0  epsilon: 1.0    steps: 952  evaluation reward: 1156.3
Training network. lr: 0.000174. clip: 0.069744
Iteration 9874: Policy loss: 0.006834. Value loss: 0.121315. Entropy: 1.548603.
Iteration 9875: Policy loss: -0.000779. Value loss: 0.052191. Entropy: 1.534926.
Iteration 9876: Policy loss: 0.000468. Value loss: 0.039674. Entropy: 1.538399.
episode: 3574   score: 820.0  epsilon: 1.0    steps: 208  evaluation reward: 1155.1
episode: 3575   score: 1060.0  epsilon: 1.0    steps: 264  evaluation reward: 1156.2
Tra

episode: 3593   score: 970.0  epsilon: 1.0    steps: 320  evaluation reward: 1144.6
episode: 3594   score: 960.0  epsilon: 1.0    steps: 632  evaluation reward: 1143.8
episode: 3595   score: 1440.0  epsilon: 1.0    steps: 960  evaluation reward: 1147.8
Training network. lr: 0.000174. clip: 0.069596
Iteration 9937: Policy loss: -0.001664. Value loss: 0.174827. Entropy: 1.461874.
Iteration 9938: Policy loss: -0.003236. Value loss: 0.147675. Entropy: 1.447180.
Iteration 9939: Policy loss: -0.004005. Value loss: 0.133868. Entropy: 1.457003.
episode: 3596   score: 1370.0  epsilon: 1.0    steps: 688  evaluation reward: 1150.6
Training network. lr: 0.000174. clip: 0.069596
Iteration 9940: Policy loss: 0.000852. Value loss: 0.206884. Entropy: 1.178346.
Iteration 9941: Policy loss: 0.000190. Value loss: 0.170618. Entropy: 1.185507.
Iteration 9942: Policy loss: 0.000208. Value loss: 0.153704. Entropy: 1.178602.
episode: 3597   score: 960.0  epsilon: 1.0    steps: 784  evaluation reward: 1150.0
T

episode: 3616   score: 740.0  epsilon: 1.0    steps: 480  evaluation reward: 1125.0
Training network. lr: 0.000173. clip: 0.069283
Iteration 10003: Policy loss: 0.002915. Value loss: 0.112788. Entropy: 1.336680.
Iteration 10004: Policy loss: -0.003250. Value loss: 0.069641. Entropy: 1.336010.
Iteration 10005: Policy loss: -0.005996. Value loss: 0.065178. Entropy: 1.329698.
Training network. lr: 0.000173. clip: 0.069283
Iteration 10006: Policy loss: 0.006178. Value loss: 0.093297. Entropy: 1.498309.
Iteration 10007: Policy loss: -0.003223. Value loss: 0.051085. Entropy: 1.478302.
Iteration 10008: Policy loss: -0.005621. Value loss: 0.039322. Entropy: 1.495405.
episode: 3617   score: 1000.0  epsilon: 1.0    steps: 152  evaluation reward: 1123.9
episode: 3618   score: 1770.0  epsilon: 1.0    steps: 616  evaluation reward: 1130.8
Training network. lr: 0.000173. clip: 0.069283
Iteration 10009: Policy loss: -0.000781. Value loss: 0.322163. Entropy: 1.454767.
Iteration 10010: Policy loss: -0.

Iteration 10068: Policy loss: -0.000995. Value loss: 0.367929. Entropy: 1.545424.
Training network. lr: 0.000173. clip: 0.069136
Iteration 10069: Policy loss: 0.015368. Value loss: 0.770314. Entropy: 1.352552.
Iteration 10070: Policy loss: 0.020149. Value loss: 0.556788. Entropy: 1.356613.
Iteration 10071: Policy loss: 0.025666. Value loss: 0.476268. Entropy: 1.379639.
episode: 3638   score: 960.0  epsilon: 1.0    steps: 608  evaluation reward: 1116.9
Training network. lr: 0.000173. clip: 0.069136
Iteration 10072: Policy loss: 0.001967. Value loss: 0.164974. Entropy: 1.575998.
Iteration 10073: Policy loss: -0.000035. Value loss: 0.093642. Entropy: 1.568859.
Iteration 10074: Policy loss: -0.002278. Value loss: 0.067619. Entropy: 1.560836.
Training network. lr: 0.000173. clip: 0.069136
Iteration 10075: Policy loss: 0.000514. Value loss: 0.346902. Entropy: 1.631982.
Iteration 10076: Policy loss: -0.001548. Value loss: 0.216883. Entropy: 1.618480.
Iteration 10077: Policy loss: -0.002074. V

Iteration 10132: Policy loss: -0.000248. Value loss: 0.234767. Entropy: 0.748116.
Iteration 10133: Policy loss: 0.001006. Value loss: 0.229905. Entropy: 0.759750.
Iteration 10134: Policy loss: -0.002579. Value loss: 0.220345. Entropy: 0.759767.
episode: 3661   score: 720.0  epsilon: 1.0    steps: 312  evaluation reward: 1058.8
Training network. lr: 0.000172. clip: 0.068979
Iteration 10135: Policy loss: 0.004279. Value loss: 0.178558. Entropy: 1.154365.
Iteration 10136: Policy loss: -0.001550. Value loss: 0.096966. Entropy: 1.156304.
Iteration 10137: Policy loss: -0.003309. Value loss: 0.081010. Entropy: 1.177242.
episode: 3662   score: 1480.0  epsilon: 1.0    steps: 712  evaluation reward: 1059.5
Training network. lr: 0.000172. clip: 0.068979
Iteration 10138: Policy loss: 0.001062. Value loss: 0.268690. Entropy: 1.430300.
Iteration 10139: Policy loss: -0.000214. Value loss: 0.158997. Entropy: 1.440450.
Iteration 10140: Policy loss: -0.003279. Value loss: 0.117266. Entropy: 1.426107.
ep

Training network. lr: 0.000172. clip: 0.068822
Iteration 10198: Policy loss: 0.004855. Value loss: 0.248301. Entropy: 1.366736.
Iteration 10199: Policy loss: 0.002797. Value loss: 0.195941. Entropy: 1.359851.
Iteration 10200: Policy loss: -0.000476. Value loss: 0.184172. Entropy: 1.352277.
episode: 3683   score: 1350.0  epsilon: 1.0    steps: 320  evaluation reward: 1043.0
episode: 3684   score: 510.0  epsilon: 1.0    steps: 688  evaluation reward: 1038.2
episode: 3685   score: 680.0  epsilon: 1.0    steps: 960  evaluation reward: 1041.1
Training network. lr: 0.000172. clip: 0.068675
Iteration 10201: Policy loss: 0.010896. Value loss: 0.237992. Entropy: 1.526686.
Iteration 10202: Policy loss: 0.001331. Value loss: 0.157323. Entropy: 1.506450.
Iteration 10203: Policy loss: -0.002351. Value loss: 0.122193. Entropy: 1.502217.
episode: 3686   score: 450.0  epsilon: 1.0    steps: 440  evaluation reward: 1035.8
Training network. lr: 0.000172. clip: 0.068675
Iteration 10204: Policy loss: 0.00

episode: 3704   score: 420.0  epsilon: 1.0    steps: 712  evaluation reward: 1021.6
episode: 3705   score: 1320.0  epsilon: 1.0    steps: 768  evaluation reward: 1025.0
episode: 3706   score: 1230.0  epsilon: 1.0    steps: 800  evaluation reward: 1021.9
episode: 3707   score: 2080.0  epsilon: 1.0    steps: 1000  evaluation reward: 1034.8
Training network. lr: 0.000171. clip: 0.068518
Iteration 10264: Policy loss: 0.008534. Value loss: 0.194420. Entropy: 1.391571.
Iteration 10265: Policy loss: 0.000809. Value loss: 0.115291. Entropy: 1.381082.
Iteration 10266: Policy loss: 0.001367. Value loss: 0.101410. Entropy: 1.376999.
episode: 3708   score: 990.0  epsilon: 1.0    steps: 120  evaluation reward: 1034.9
episode: 3709   score: 1040.0  epsilon: 1.0    steps: 184  evaluation reward: 1037.7
Training network. lr: 0.000171. clip: 0.068518
Iteration 10267: Policy loss: 0.000468. Value loss: 0.364782. Entropy: 0.724476.
Iteration 10268: Policy loss: -0.000794. Value loss: 0.313010. Entropy: 0

Training network. lr: 0.000171. clip: 0.068361
Iteration 10327: Policy loss: 0.004538. Value loss: 0.176938. Entropy: 1.538720.
Iteration 10328: Policy loss: -0.000010. Value loss: 0.107215. Entropy: 1.534922.
Iteration 10329: Policy loss: -0.006093. Value loss: 0.092684. Entropy: 1.523960.
episode: 3729   score: 1150.0  epsilon: 1.0    steps: 56  evaluation reward: 1001.5
episode: 3730   score: 1340.0  epsilon: 1.0    steps: 144  evaluation reward: 1005.3
episode: 3731   score: 560.0  epsilon: 1.0    steps: 488  evaluation reward: 998.4
Training network. lr: 0.000171. clip: 0.068361
Iteration 10330: Policy loss: 0.005046. Value loss: 0.344912. Entropy: 0.880686.
Iteration 10331: Policy loss: 0.005291. Value loss: 0.252253. Entropy: 0.868029.
Iteration 10332: Policy loss: 0.000950. Value loss: 0.230479. Entropy: 0.848870.
Training network. lr: 0.000171. clip: 0.068361
Iteration 10333: Policy loss: 0.004921. Value loss: 0.257140. Entropy: 1.439154.
Iteration 10334: Policy loss: 0.004856

now time :  2019-03-06 05:25:37.762461
episode: 3751   score: 1780.0  epsilon: 1.0    steps: 688  evaluation reward: 1003.5
Training network. lr: 0.000171. clip: 0.068214
Iteration 10393: Policy loss: -0.001161. Value loss: 0.294417. Entropy: 1.181745.
Iteration 10394: Policy loss: 0.000817. Value loss: 0.219781. Entropy: 1.188523.
Iteration 10395: Policy loss: 0.000158. Value loss: 0.181100. Entropy: 1.179411.
Training network. lr: 0.000171. clip: 0.068214
Iteration 10396: Policy loss: 0.006702. Value loss: 0.360390. Entropy: 1.397462.
Iteration 10397: Policy loss: 0.000127. Value loss: 0.310692. Entropy: 1.388514.
Iteration 10398: Policy loss: 0.000945. Value loss: 0.233000. Entropy: 1.392802.
episode: 3752   score: 1360.0  epsilon: 1.0    steps: 304  evaluation reward: 1009.6
episode: 3753   score: 1040.0  epsilon: 1.0    steps: 736  evaluation reward: 1005.5
Training network. lr: 0.000171. clip: 0.068214
Iteration 10399: Policy loss: 0.004339. Value loss: 0.172992. Entropy: 1.44071

episode: 3773   score: 1000.0  epsilon: 1.0    steps: 768  evaluation reward: 1044.9
episode: 3774   score: 560.0  epsilon: 1.0    steps: 864  evaluation reward: 1041.1
Training network. lr: 0.000170. clip: 0.067901
Iteration 10459: Policy loss: 0.006384. Value loss: 0.254148. Entropy: 1.034452.
Iteration 10460: Policy loss: 0.002576. Value loss: 0.156453. Entropy: 1.028473.
Iteration 10461: Policy loss: -0.000565. Value loss: 0.137310. Entropy: 1.028654.
Training network. lr: 0.000170. clip: 0.067901
Iteration 10462: Policy loss: 0.004473. Value loss: 0.329737. Entropy: 0.716389.
Iteration 10463: Policy loss: 0.002481. Value loss: 0.254897. Entropy: 0.712423.
Iteration 10464: Policy loss: -0.000601. Value loss: 0.227301. Entropy: 0.715482.
episode: 3775   score: 1040.0  epsilon: 1.0    steps: 480  evaluation reward: 1039.9
episode: 3776   score: 1540.0  epsilon: 1.0    steps: 848  evaluation reward: 1044.5
Training network. lr: 0.000170. clip: 0.067901
Iteration 10465: Policy loss: 0.

Iteration 10524: Policy loss: -0.004627. Value loss: 0.174606. Entropy: 0.951386.
episode: 3795   score: 1520.0  epsilon: 1.0    steps: 1000  evaluation reward: 1068.1
Training network. lr: 0.000169. clip: 0.067753
Iteration 10525: Policy loss: 0.000688. Value loss: 0.304323. Entropy: 1.318602.
Iteration 10526: Policy loss: -0.001857. Value loss: 0.206338. Entropy: 1.327498.
Iteration 10527: Policy loss: -0.007506. Value loss: 0.163015. Entropy: 1.325513.
episode: 3796   score: 750.0  epsilon: 1.0    steps: 712  evaluation reward: 1065.7
episode: 3797   score: 1080.0  epsilon: 1.0    steps: 808  evaluation reward: 1060.5
Training network. lr: 0.000169. clip: 0.067753
Iteration 10528: Policy loss: 0.002199. Value loss: 0.167889. Entropy: 1.381711.
Iteration 10529: Policy loss: -0.000115. Value loss: 0.126268. Entropy: 1.390936.
Iteration 10530: Policy loss: 0.000333. Value loss: 0.116072. Entropy: 1.368936.
Training network. lr: 0.000169. clip: 0.067753
Iteration 10531: Policy loss: 0.0

Iteration 10588: Policy loss: 0.006998. Value loss: 0.214490. Entropy: 1.261168.
Iteration 10589: Policy loss: 0.001190. Value loss: 0.138221. Entropy: 1.286237.
Iteration 10590: Policy loss: -0.000878. Value loss: 0.120116. Entropy: 1.288158.
episode: 3818   score: 820.0  epsilon: 1.0    steps: 888  evaluation reward: 1074.5
Training network. lr: 0.000169. clip: 0.067597
Iteration 10591: Policy loss: 0.005021. Value loss: 0.361959. Entropy: 1.403594.
Iteration 10592: Policy loss: 0.003186. Value loss: 0.242218. Entropy: 1.388447.
Iteration 10593: Policy loss: 0.000280. Value loss: 0.219617. Entropy: 1.390857.
episode: 3819   score: 730.0  epsilon: 1.0    steps: 232  evaluation reward: 1070.7
episode: 3820   score: 1170.0  epsilon: 1.0    steps: 832  evaluation reward: 1076.0
Training network. lr: 0.000169. clip: 0.067597
Iteration 10594: Policy loss: -0.000307. Value loss: 0.283209. Entropy: 1.140793.
Iteration 10595: Policy loss: -0.002742. Value loss: 0.226712. Entropy: 1.139798.
It

Iteration 10652: Policy loss: -0.002914. Value loss: 0.204751. Entropy: 1.149722.
Iteration 10653: Policy loss: -0.004451. Value loss: 0.189199. Entropy: 1.143354.
Training network. lr: 0.000168. clip: 0.067292
Iteration 10654: Policy loss: 0.007003. Value loss: 0.499187. Entropy: 1.024226.
Iteration 10655: Policy loss: 0.012042. Value loss: 0.328471. Entropy: 1.010912.
Iteration 10656: Policy loss: 0.004185. Value loss: 0.273156. Entropy: 0.995767.
episode: 3842   score: 1080.0  epsilon: 1.0    steps: 480  evaluation reward: 1087.7
Training network. lr: 0.000168. clip: 0.067292
Iteration 10657: Policy loss: 0.003420. Value loss: 0.118732. Entropy: 1.083109.
Iteration 10658: Policy loss: -0.001062. Value loss: 0.064943. Entropy: 1.075783.
Iteration 10659: Policy loss: -0.006334. Value loss: 0.053725. Entropy: 1.088833.
episode: 3843   score: 1160.0  epsilon: 1.0    steps: 416  evaluation reward: 1084.6
Training network. lr: 0.000168. clip: 0.067292
Iteration 10660: Policy loss: 0.00324

Iteration 10715: Policy loss: -0.000204. Value loss: 0.432414. Entropy: 0.987264.
Iteration 10716: Policy loss: 0.000385. Value loss: 0.394893. Entropy: 0.989972.
Training network. lr: 0.000168. clip: 0.067136
Iteration 10717: Policy loss: 0.005010. Value loss: 0.671631. Entropy: 1.248700.
Iteration 10718: Policy loss: 0.003595. Value loss: 0.528016. Entropy: 1.205094.
Iteration 10719: Policy loss: 0.007593. Value loss: 0.438917. Entropy: 1.231713.
Training network. lr: 0.000168. clip: 0.067136
Iteration 10720: Policy loss: 0.001016. Value loss: 0.416378. Entropy: 1.491291.
Iteration 10721: Policy loss: -0.000827. Value loss: 0.276286. Entropy: 1.490525.
Iteration 10722: Policy loss: -0.003052. Value loss: 0.234990. Entropy: 1.502354.
Training network. lr: 0.000168. clip: 0.067136
Iteration 10723: Policy loss: 0.003766. Value loss: 0.265451. Entropy: 1.439053.
Iteration 10724: Policy loss: -0.000971. Value loss: 0.185812. Entropy: 1.442929.
Iteration 10725: Policy loss: -0.005346. Valu

Iteration 10781: Policy loss: -0.000395. Value loss: 0.342974. Entropy: 0.637534.
Iteration 10782: Policy loss: -0.002420. Value loss: 0.327281. Entropy: 0.641058.
Training network. lr: 0.000167. clip: 0.066979
Iteration 10783: Policy loss: 0.004488. Value loss: 0.180411. Entropy: 1.177868.
Iteration 10784: Policy loss: -0.000422. Value loss: 0.071440. Entropy: 1.198562.
Iteration 10785: Policy loss: 0.000410. Value loss: 0.049270. Entropy: 1.196943.
episode: 3888   score: 1080.0  epsilon: 1.0    steps: 296  evaluation reward: 1061.2
Training network. lr: 0.000167. clip: 0.066979
Iteration 10786: Policy loss: 0.005096. Value loss: 0.514411. Entropy: 1.471573.
Iteration 10787: Policy loss: 0.002756. Value loss: 0.392701. Entropy: 1.475579.
Iteration 10788: Policy loss: 0.002501. Value loss: 0.336949. Entropy: 1.457382.
episode: 3889   score: 1110.0  epsilon: 1.0    steps: 856  evaluation reward: 1062.1
Training network. lr: 0.000167. clip: 0.066979
Iteration 10789: Policy loss: 0.009661

Iteration 10846: Policy loss: 0.006644. Value loss: 0.243784. Entropy: 1.045230.
Iteration 10847: Policy loss: 0.003220. Value loss: 0.180065. Entropy: 1.035828.
Iteration 10848: Policy loss: -0.002562. Value loss: 0.158426. Entropy: 1.040745.
episode: 3910   score: 870.0  epsilon: 1.0    steps: 344  evaluation reward: 1098.7
episode: 3911   score: 1210.0  epsilon: 1.0    steps: 496  evaluation reward: 1098.9
Training network. lr: 0.000167. clip: 0.066832
Iteration 10849: Policy loss: 0.003040. Value loss: 0.287358. Entropy: 0.779097.
Iteration 10850: Policy loss: 0.006929. Value loss: 0.236402. Entropy: 0.802270.
Iteration 10851: Policy loss: -0.001110. Value loss: 0.202466. Entropy: 0.809933.
Training network. lr: 0.000167. clip: 0.066675
Iteration 10852: Policy loss: 0.010257. Value loss: 0.293536. Entropy: 1.370289.
Iteration 10853: Policy loss: 0.006261. Value loss: 0.211680. Entropy: 1.374503.
Iteration 10854: Policy loss: 0.004843. Value loss: 0.160720. Entropy: 1.359816.
episod

Iteration 10913: Policy loss: -0.000068. Value loss: 0.145705. Entropy: 1.626972.
Iteration 10914: Policy loss: -0.002900. Value loss: 0.120524. Entropy: 1.610734.
episode: 3931   score: 570.0  epsilon: 1.0    steps: 896  evaluation reward: 1079.8
episode: 3932   score: 980.0  epsilon: 1.0    steps: 1016  evaluation reward: 1080.8
Training network. lr: 0.000166. clip: 0.066518
Iteration 10915: Policy loss: 0.001877. Value loss: 0.096880. Entropy: 1.198484.
Iteration 10916: Policy loss: -0.003107. Value loss: 0.069970. Entropy: 1.175718.
Iteration 10917: Policy loss: -0.004639. Value loss: 0.064321. Entropy: 1.174780.
episode: 3933   score: 1110.0  epsilon: 1.0    steps: 736  evaluation reward: 1084.8
Training network. lr: 0.000166. clip: 0.066518
Iteration 10918: Policy loss: -0.000279. Value loss: 0.184966. Entropy: 0.999502.
Iteration 10919: Policy loss: -0.001689. Value loss: 0.124336. Entropy: 1.006941.
Iteration 10920: Policy loss: -0.003013. Value loss: 0.116833. Entropy: 1.01644

Training network. lr: 0.000166. clip: 0.066371
Iteration 10978: Policy loss: 0.003376. Value loss: 0.187537. Entropy: 0.831999.
Iteration 10979: Policy loss: -0.001428. Value loss: 0.134599. Entropy: 0.827457.
Iteration 10980: Policy loss: -0.005435. Value loss: 0.130771. Entropy: 0.827663.
Training network. lr: 0.000166. clip: 0.066371
Iteration 10981: Policy loss: 0.003689. Value loss: 0.616443. Entropy: 1.084312.
Iteration 10982: Policy loss: 0.010176. Value loss: 0.373134. Entropy: 1.105742.
Iteration 10983: Policy loss: -0.001998. Value loss: 0.295653. Entropy: 1.123361.
episode: 3954   score: 790.0  epsilon: 1.0    steps: 600  evaluation reward: 1064.9
Training network. lr: 0.000166. clip: 0.066371
Iteration 10984: Policy loss: 0.004718. Value loss: 0.095541. Entropy: 1.151338.
Iteration 10985: Policy loss: -0.004227. Value loss: 0.050440. Entropy: 1.133349.
Iteration 10986: Policy loss: -0.006078. Value loss: 0.045821. Entropy: 1.153376.
episode: 3955   score: 1350.0  epsilon: 1

Training network. lr: 0.000166. clip: 0.066214
Iteration 11044: Policy loss: 0.001364. Value loss: 0.262420. Entropy: 1.173876.
Iteration 11045: Policy loss: -0.003379. Value loss: 0.206982. Entropy: 1.169885.
Iteration 11046: Policy loss: -0.006692. Value loss: 0.195855. Entropy: 1.190864.
Training network. lr: 0.000166. clip: 0.066214
Iteration 11047: Policy loss: 0.003028. Value loss: 0.172504. Entropy: 1.104383.
Iteration 11048: Policy loss: -0.001853. Value loss: 0.105159. Entropy: 1.104503.
Iteration 11049: Policy loss: -0.003522. Value loss: 0.087834. Entropy: 1.099369.
Training network. lr: 0.000166. clip: 0.066214
Iteration 11050: Policy loss: 0.001842. Value loss: 0.078706. Entropy: 1.575059.
Iteration 11051: Policy loss: -0.004104. Value loss: 0.039217. Entropy: 1.575386.
Iteration 11052: Policy loss: -0.008663. Value loss: 0.031504. Entropy: 1.566728.
episode: 3976   score: 950.0  epsilon: 1.0    steps: 808  evaluation reward: 1103.6
Training network. lr: 0.000165. clip: 0.

Training network. lr: 0.000165. clip: 0.065910
Iteration 11110: Policy loss: -0.000545. Value loss: 0.405557. Entropy: 0.636810.
Iteration 11111: Policy loss: -0.001953. Value loss: 0.388778. Entropy: 0.638548.
Iteration 11112: Policy loss: -0.002458. Value loss: 0.377753. Entropy: 0.631999.
episode: 3998   score: 980.0  epsilon: 1.0    steps: 1016  evaluation reward: 1101.1
Training network. lr: 0.000165. clip: 0.065910
Iteration 11113: Policy loss: 0.004388. Value loss: 0.056154. Entropy: 1.100542.
Iteration 11114: Policy loss: -0.001827. Value loss: 0.045818. Entropy: 1.091355.
Iteration 11115: Policy loss: -0.001529. Value loss: 0.040015. Entropy: 1.099717.
episode: 3999   score: 1090.0  epsilon: 1.0    steps: 624  evaluation reward: 1100.2
Training network. lr: 0.000165. clip: 0.065910
Iteration 11116: Policy loss: 0.004433. Value loss: 0.386777. Entropy: 1.338230.
Iteration 11117: Policy loss: 0.001958. Value loss: 0.315884. Entropy: 1.325897.
Iteration 11118: Policy loss: 0.0003

Iteration 11176: Policy loss: 0.001757. Value loss: 0.141543. Entropy: 1.163096.
Iteration 11177: Policy loss: -0.003075. Value loss: 0.103245. Entropy: 1.166194.
Iteration 11178: Policy loss: -0.004402. Value loss: 0.090629. Entropy: 1.159102.
episode: 4018   score: 1420.0  epsilon: 1.0    steps: 576  evaluation reward: 1083.7
Training network. lr: 0.000164. clip: 0.065753
Iteration 11179: Policy loss: 0.001911. Value loss: 1.116612. Entropy: 0.995051.
Iteration 11180: Policy loss: 0.005490. Value loss: 0.936864. Entropy: 0.994503.
Iteration 11181: Policy loss: 0.003691. Value loss: 0.845659. Entropy: 1.001989.
episode: 4019   score: 930.0  epsilon: 1.0    steps: 112  evaluation reward: 1085.0
Training network. lr: 0.000164. clip: 0.065753
Iteration 11182: Policy loss: 0.003488. Value loss: 0.327902. Entropy: 1.040093.
Iteration 11183: Policy loss: 0.002708. Value loss: 0.268708. Entropy: 1.029365.
Iteration 11184: Policy loss: -0.000963. Value loss: 0.260404. Entropy: 1.034373.
episo

Training network. lr: 0.000164. clip: 0.065597
Iteration 11242: Policy loss: 0.005179. Value loss: 0.512433. Entropy: 1.281397.
Iteration 11243: Policy loss: 0.000639. Value loss: 0.423800. Entropy: 1.267747.
Iteration 11244: Policy loss: 0.000294. Value loss: 0.390175. Entropy: 1.273915.
Training network. lr: 0.000164. clip: 0.065597
Iteration 11245: Policy loss: 0.002620. Value loss: 0.371494. Entropy: 1.347496.
Iteration 11246: Policy loss: -0.000338. Value loss: 0.240041. Entropy: 1.341399.
Iteration 11247: Policy loss: -0.002000. Value loss: 0.214227. Entropy: 1.354798.
episode: 4040   score: 820.0  epsilon: 1.0    steps: 352  evaluation reward: 1100.9
episode: 4041   score: 540.0  epsilon: 1.0    steps: 464  evaluation reward: 1097.4
episode: 4042   score: 820.0  epsilon: 1.0    steps: 904  evaluation reward: 1097.1
Training network. lr: 0.000164. clip: 0.065597
Iteration 11248: Policy loss: 0.003809. Value loss: 0.356893. Entropy: 1.223986.
Iteration 11249: Policy loss: -0.00115

Training network. lr: 0.000163. clip: 0.065293
Iteration 11308: Policy loss: 0.008524. Value loss: 0.102462. Entropy: 1.586822.
Iteration 11309: Policy loss: 0.000058. Value loss: 0.062599. Entropy: 1.568840.
Iteration 11310: Policy loss: -0.000437. Value loss: 0.048293. Entropy: 1.569710.
episode: 4061   score: 1190.0  epsilon: 1.0    steps: 432  evaluation reward: 1071.0
episode: 4062   score: 970.0  epsilon: 1.0    steps: 672  evaluation reward: 1068.6
episode: 4063   score: 1010.0  epsilon: 1.0    steps: 944  evaluation reward: 1068.3
Training network. lr: 0.000163. clip: 0.065293
Iteration 11311: Policy loss: 0.005224. Value loss: 0.102129. Entropy: 1.363456.
Iteration 11312: Policy loss: 0.004416. Value loss: 0.059038. Entropy: 1.389103.
Iteration 11313: Policy loss: -0.003002. Value loss: 0.057280. Entropy: 1.378379.
Training network. lr: 0.000163. clip: 0.065293
Iteration 11314: Policy loss: 0.001371. Value loss: 0.185954. Entropy: 1.192750.
Iteration 11315: Policy loss: 0.0000

Iteration 11372: Policy loss: 0.001977. Value loss: 0.100157. Entropy: 1.184170.
Iteration 11373: Policy loss: -0.002900. Value loss: 0.092336. Entropy: 1.172322.
episode: 4085   score: 1250.0  epsilon: 1.0    steps: 392  evaluation reward: 1027.9
episode: 4086   score: 1190.0  epsilon: 1.0    steps: 1008  evaluation reward: 1029.4
Training network. lr: 0.000163. clip: 0.065136
Iteration 11374: Policy loss: 0.005101. Value loss: 0.536703. Entropy: 1.270722.
Iteration 11375: Policy loss: 0.002216. Value loss: 0.429205. Entropy: 1.287714.
Iteration 11376: Policy loss: -0.002224. Value loss: 0.367981. Entropy: 1.299328.
episode: 4087   score: 980.0  epsilon: 1.0    steps: 128  evaluation reward: 1028.4
Training network. lr: 0.000163. clip: 0.065136
Iteration 11377: Policy loss: 0.003521. Value loss: 0.586062. Entropy: 1.123492.
Iteration 11378: Policy loss: 0.001554. Value loss: 0.472326. Entropy: 1.156490.
Iteration 11379: Policy loss: -0.004675. Value loss: 0.423088. Entropy: 1.151010.


Iteration 11438: Policy loss: 0.003861. Value loss: 0.160957. Entropy: 1.111352.
Iteration 11439: Policy loss: -0.001249. Value loss: 0.146886. Entropy: 1.119256.
episode: 4106   score: 1230.0  epsilon: 1.0    steps: 776  evaluation reward: 1024.4
episode: 4107   score: 980.0  epsilon: 1.0    steps: 1008  evaluation reward: 1023.0
Training network. lr: 0.000162. clip: 0.064988
Iteration 11440: Policy loss: 0.009555. Value loss: 0.138628. Entropy: 1.439202.
Iteration 11441: Policy loss: 0.001622. Value loss: 0.075473. Entropy: 1.435319.
Iteration 11442: Policy loss: -0.003275. Value loss: 0.065552. Entropy: 1.417975.
episode: 4108   score: 1050.0  epsilon: 1.0    steps: 368  evaluation reward: 1022.6
episode: 4109   score: 1070.0  epsilon: 1.0    steps: 504  evaluation reward: 1022.6
Training network. lr: 0.000162. clip: 0.064988
Iteration 11443: Policy loss: 0.001018. Value loss: 0.189865. Entropy: 1.049659.
Iteration 11444: Policy loss: -0.001950. Value loss: 0.164673. Entropy: 1.0689

Iteration 11502: Policy loss: -0.000860. Value loss: 0.165084. Entropy: 1.390715.
Training network. lr: 0.000162. clip: 0.064675
Iteration 11503: Policy loss: 0.005087. Value loss: 0.170969. Entropy: 1.110606.
Iteration 11504: Policy loss: 0.002491. Value loss: 0.122900. Entropy: 1.091322.
Iteration 11505: Policy loss: 0.001685. Value loss: 0.098218. Entropy: 1.100463.
episode: 4130   score: 880.0  epsilon: 1.0    steps: 224  evaluation reward: 1013.5
Training network. lr: 0.000162. clip: 0.064675
Iteration 11506: Policy loss: 0.002798. Value loss: 0.138615. Entropy: 1.189653.
Iteration 11507: Policy loss: -0.001617. Value loss: 0.109194. Entropy: 1.163670.
Iteration 11508: Policy loss: -0.005322. Value loss: 0.101186. Entropy: 1.161970.
episode: 4131   score: 1420.0  epsilon: 1.0    steps: 48  evaluation reward: 1021.5
episode: 4132   score: 870.0  epsilon: 1.0    steps: 344  evaluation reward: 1015.2
Training network. lr: 0.000162. clip: 0.064675
Iteration 11509: Policy loss: 0.00256

now time :  2019-03-06 05:41:23.459142
episode: 4151   score: 980.0  epsilon: 1.0    steps: 560  evaluation reward: 1043.7
episode: 4152   score: 1210.0  epsilon: 1.0    steps: 768  evaluation reward: 1047.9
Training network. lr: 0.000161. clip: 0.064528
Iteration 11569: Policy loss: 0.003642. Value loss: 0.128704. Entropy: 1.087634.
Iteration 11570: Policy loss: -0.003784. Value loss: 0.111449. Entropy: 1.069873.
Iteration 11571: Policy loss: -0.006950. Value loss: 0.104120. Entropy: 1.088904.
episode: 4153   score: 1120.0  epsilon: 1.0    steps: 32  evaluation reward: 1052.2
episode: 4154   score: 890.0  epsilon: 1.0    steps: 368  evaluation reward: 1052.0
episode: 4155   score: 1220.0  epsilon: 1.0    steps: 976  evaluation reward: 1056.7
episode: 4156   score: 1890.0  epsilon: 1.0    steps: 992  evaluation reward: 1065.1
Training network. lr: 0.000161. clip: 0.064528
Iteration 11572: Policy loss: -0.001840. Value loss: 0.281619. Entropy: 0.628152.
Iteration 11573: Policy loss: -0.

episode: 4175   score: 740.0  epsilon: 1.0    steps: 552  evaluation reward: 1096.9
Training network. lr: 0.000161. clip: 0.064371
Iteration 11632: Policy loss: 0.002357. Value loss: 0.298907. Entropy: 0.976593.
Iteration 11633: Policy loss: 0.006139. Value loss: 0.203188. Entropy: 0.977885.
Iteration 11634: Policy loss: -0.000216. Value loss: 0.174469. Entropy: 0.980155.
Training network. lr: 0.000161. clip: 0.064371
Iteration 11635: Policy loss: 0.004735. Value loss: 0.405325. Entropy: 1.145368.
Iteration 11636: Policy loss: -0.000506. Value loss: 0.221870. Entropy: 1.149161.
Iteration 11637: Policy loss: -0.002611. Value loss: 0.161393. Entropy: 1.124876.
Training network. lr: 0.000161. clip: 0.064371
Iteration 11638: Policy loss: 0.001564. Value loss: 0.102681. Entropy: 1.489977.
Iteration 11639: Policy loss: -0.003588. Value loss: 0.049508. Entropy: 1.478729.
Iteration 11640: Policy loss: -0.005367. Value loss: 0.038634. Entropy: 1.479046.
Training network. lr: 0.000161. clip: 0.0

Training network. lr: 0.000161. clip: 0.064214
Iteration 11698: Policy loss: 0.010875. Value loss: 0.464820. Entropy: 1.437987.
Iteration 11699: Policy loss: 0.014787. Value loss: 0.348056. Entropy: 1.424060.
Iteration 11700: Policy loss: 0.009795. Value loss: 0.334534. Entropy: 1.417316.
episode: 4197   score: 1040.0  epsilon: 1.0    steps: 920  evaluation reward: 1127.1
Training network. lr: 0.000160. clip: 0.064067
Iteration 11701: Policy loss: 0.001815. Value loss: 0.196268. Entropy: 1.054140.
Iteration 11702: Policy loss: -0.000698. Value loss: 0.145443. Entropy: 1.061357.
Iteration 11703: Policy loss: -0.004176. Value loss: 0.133017. Entropy: 1.034773.
Training network. lr: 0.000160. clip: 0.064067
Iteration 11704: Policy loss: 0.001826. Value loss: 0.118240. Entropy: 1.253499.
Iteration 11705: Policy loss: -0.003989. Value loss: 0.091519. Entropy: 1.231531.
Iteration 11706: Policy loss: -0.004477. Value loss: 0.085221. Entropy: 1.242532.
episode: 4198   score: 1250.0  epsilon: 1

episode: 4218   score: 830.0  epsilon: 1.0    steps: 504  evaluation reward: 1160.2
episode: 4219   score: 2060.0  epsilon: 1.0    steps: 960  evaluation reward: 1166.0
Training network. lr: 0.000160. clip: 0.063910
Iteration 11764: Policy loss: 0.010164. Value loss: 0.446453. Entropy: 1.240863.
Iteration 11765: Policy loss: 0.007577. Value loss: 0.328299. Entropy: 1.271710.
Iteration 11766: Policy loss: 0.006600. Value loss: 0.311000. Entropy: 1.274298.
episode: 4220   score: 990.0  epsilon: 1.0    steps: 344  evaluation reward: 1166.0
Training network. lr: 0.000160. clip: 0.063910
Iteration 11767: Policy loss: 0.002198. Value loss: 0.237542. Entropy: 1.025071.
Iteration 11768: Policy loss: 0.001154. Value loss: 0.215654. Entropy: 1.027879.
Iteration 11769: Policy loss: -0.002819. Value loss: 0.188891. Entropy: 1.029953.
episode: 4221   score: 1230.0  epsilon: 1.0    steps: 456  evaluation reward: 1166.5
Training network. lr: 0.000160. clip: 0.063910
Iteration 11770: Policy loss: 0.00

Iteration 11828: Policy loss: -0.002587. Value loss: 0.193413. Entropy: 0.975038.
Iteration 11829: Policy loss: -0.004590. Value loss: 0.195994. Entropy: 1.001149.
Training network. lr: 0.000159. clip: 0.063753
Iteration 11830: Policy loss: 0.003250. Value loss: 0.191667. Entropy: 0.654285.
Iteration 11831: Policy loss: 0.000895. Value loss: 0.155453. Entropy: 0.628811.
Iteration 11832: Policy loss: -0.000008. Value loss: 0.162585. Entropy: 0.649600.
episode: 4242   score: 1150.0  epsilon: 1.0    steps: 200  evaluation reward: 1162.9
Training network. lr: 0.000159. clip: 0.063753
Iteration 11833: Policy loss: 0.006378. Value loss: 0.722235. Entropy: 1.016913.
Iteration 11834: Policy loss: 0.006531. Value loss: 0.591454. Entropy: 1.022502.
Iteration 11835: Policy loss: 0.003679. Value loss: 0.506794. Entropy: 1.030249.
episode: 4243   score: 1180.0  epsilon: 1.0    steps: 736  evaluation reward: 1164.0
Training network. lr: 0.000159. clip: 0.063753
Iteration 11836: Policy loss: 0.001613

Iteration 11895: Policy loss: -0.006159. Value loss: 0.153718. Entropy: 1.115748.
Training network. lr: 0.000159. clip: 0.063606
Iteration 11896: Policy loss: 0.008931. Value loss: 0.266905. Entropy: 1.098065.
Iteration 11897: Policy loss: 0.006708. Value loss: 0.156398. Entropy: 1.097548.
Iteration 11898: Policy loss: 0.001079. Value loss: 0.132078. Entropy: 1.095375.
episode: 4262   score: 1090.0  epsilon: 1.0    steps: 264  evaluation reward: 1154.5
Training network. lr: 0.000159. clip: 0.063606
Iteration 11899: Policy loss: 0.003342. Value loss: 0.173487. Entropy: 1.507332.
Iteration 11900: Policy loss: -0.003041. Value loss: 0.107169. Entropy: 1.505575.
Iteration 11901: Policy loss: -0.005934. Value loss: 0.096943. Entropy: 1.507820.
episode: 4263   score: 1100.0  epsilon: 1.0    steps: 688  evaluation reward: 1155.0
Training network. lr: 0.000159. clip: 0.063449
Iteration 11902: Policy loss: 0.004457. Value loss: 0.086780. Entropy: 1.503656.
Iteration 11903: Policy loss: 0.003537

Iteration 11961: Policy loss: -0.005284. Value loss: 0.171612. Entropy: 1.127563.
episode: 4283   score: 1130.0  epsilon: 1.0    steps: 592  evaluation reward: 1143.4
episode: 4284   score: 990.0  epsilon: 1.0    steps: 744  evaluation reward: 1141.1
Training network. lr: 0.000158. clip: 0.063293
Iteration 11962: Policy loss: 0.005142. Value loss: 0.246128. Entropy: 1.083777.
Iteration 11963: Policy loss: -0.001007. Value loss: 0.171323. Entropy: 1.094650.
Iteration 11964: Policy loss: -0.001745. Value loss: 0.152709. Entropy: 1.081130.
Training network. lr: 0.000158. clip: 0.063293
Iteration 11965: Policy loss: 0.002081. Value loss: 0.284454. Entropy: 1.047583.
Iteration 11966: Policy loss: 0.000117. Value loss: 0.190668. Entropy: 1.054743.
Iteration 11967: Policy loss: -0.001818. Value loss: 0.172180. Entropy: 1.048950.
episode: 4285   score: 1230.0  epsilon: 1.0    steps: 408  evaluation reward: 1145.2
episode: 4286   score: 770.0  epsilon: 1.0    steps: 456  evaluation reward: 1140

Training network. lr: 0.000158. clip: 0.063145
Iteration 12025: Policy loss: 0.000701. Value loss: 0.120112. Entropy: 1.042300.
Iteration 12026: Policy loss: -0.001768. Value loss: 0.104532. Entropy: 1.025277.
Iteration 12027: Policy loss: 0.000151. Value loss: 0.099443. Entropy: 1.040548.
Training network. lr: 0.000158. clip: 0.063145
Iteration 12028: Policy loss: 0.006453. Value loss: 0.220825. Entropy: 1.186906.
Iteration 12029: Policy loss: 0.002122. Value loss: 0.200877. Entropy: 1.179016.
Iteration 12030: Policy loss: -0.000938. Value loss: 0.201819. Entropy: 1.178077.
episode: 4307   score: 920.0  epsilon: 1.0    steps: 704  evaluation reward: 1098.0
episode: 4308   score: 880.0  epsilon: 1.0    steps: 816  evaluation reward: 1095.3
Training network. lr: 0.000158. clip: 0.063145
Iteration 12031: Policy loss: 0.005531. Value loss: 0.366794. Entropy: 1.231111.
Iteration 12032: Policy loss: 0.004357. Value loss: 0.193946. Entropy: 1.199553.
Iteration 12033: Policy loss: -0.001526. 

Iteration 12092: Policy loss: -0.000782. Value loss: 0.276184. Entropy: 1.577987.
Iteration 12093: Policy loss: -0.004055. Value loss: 0.253613. Entropy: 1.580287.
episode: 4327   score: 700.0  epsilon: 1.0    steps: 672  evaluation reward: 1084.6
episode: 4328   score: 430.0  epsilon: 1.0    steps: 776  evaluation reward: 1076.9
Training network. lr: 0.000157. clip: 0.062989
Iteration 12094: Policy loss: 0.007254. Value loss: 0.466202. Entropy: 1.538193.
Iteration 12095: Policy loss: -0.002395. Value loss: 0.268498. Entropy: 1.522073.
Iteration 12096: Policy loss: -0.008804. Value loss: 0.211773. Entropy: 1.529315.
episode: 4329   score: 830.0  epsilon: 1.0    steps: 872  evaluation reward: 1074.0
Training network. lr: 0.000157. clip: 0.062989
Iteration 12097: Policy loss: 0.001577. Value loss: 0.187227. Entropy: 1.205686.
Iteration 12098: Policy loss: 0.002204. Value loss: 0.155464. Entropy: 1.164968.
Iteration 12099: Policy loss: -0.005030. Value loss: 0.134828. Entropy: 1.189369.
T

Training network. lr: 0.000157. clip: 0.062684
Iteration 12157: Policy loss: 0.009177. Value loss: 0.395576. Entropy: 1.490376.
Iteration 12158: Policy loss: 0.014551. Value loss: 0.242366. Entropy: 1.479331.
Iteration 12159: Policy loss: 0.005617. Value loss: 0.216849. Entropy: 1.479269.
episode: 4350   score: 1090.0  epsilon: 1.0    steps: 792  evaluation reward: 1078.1
now time :  2019-03-06 05:49:20.258726
episode: 4351   score: 1240.0  epsilon: 1.0    steps: 864  evaluation reward: 1076.3
Training network. lr: 0.000157. clip: 0.062684
Iteration 12160: Policy loss: 0.001777. Value loss: 0.061015. Entropy: 1.496431.
Iteration 12161: Policy loss: -0.001184. Value loss: 0.041010. Entropy: 1.495960.
Iteration 12162: Policy loss: -0.005081. Value loss: 0.035401. Entropy: 1.489841.
episode: 4352   score: 1030.0  epsilon: 1.0    steps: 984  evaluation reward: 1075.8
Training network. lr: 0.000157. clip: 0.062684
Iteration 12163: Policy loss: 0.004020. Value loss: 0.201162. Entropy: 1.1487

Training network. lr: 0.000156. clip: 0.062528
Iteration 12220: Policy loss: 0.001704. Value loss: 0.250476. Entropy: 0.911044.
Iteration 12221: Policy loss: -0.000485. Value loss: 0.220089. Entropy: 0.887808.
Iteration 12222: Policy loss: -0.001091. Value loss: 0.219037. Entropy: 0.891325.
episode: 4374   score: 1060.0  epsilon: 1.0    steps: 752  evaluation reward: 1052.2
Training network. lr: 0.000156. clip: 0.062528
Iteration 12223: Policy loss: 0.001853. Value loss: 0.210604. Entropy: 1.144009.
Iteration 12224: Policy loss: -0.002130. Value loss: 0.165832. Entropy: 1.141457.
Iteration 12225: Policy loss: -0.001134. Value loss: 0.150281. Entropy: 1.150628.
Training network. lr: 0.000156. clip: 0.062528
Iteration 12226: Policy loss: 0.003869. Value loss: 0.219546. Entropy: 1.214488.
Iteration 12227: Policy loss: 0.000265. Value loss: 0.171169. Entropy: 1.213197.
Iteration 12228: Policy loss: -0.001400. Value loss: 0.160649. Entropy: 1.229585.
episode: 4375   score: 1290.0  epsilon: 

Training network. lr: 0.000156. clip: 0.062371
Iteration 12286: Policy loss: 0.003834. Value loss: 0.112580. Entropy: 0.963498.
Iteration 12287: Policy loss: 0.000615. Value loss: 0.083134. Entropy: 0.981128.
Iteration 12288: Policy loss: -0.004209. Value loss: 0.082098. Entropy: 0.975892.
episode: 4395   score: 1260.0  epsilon: 1.0    steps: 280  evaluation reward: 1083.5
Training network. lr: 0.000156. clip: 0.062371
Iteration 12289: Policy loss: 0.001861. Value loss: 0.325998. Entropy: 1.179058.
Iteration 12290: Policy loss: 0.000229. Value loss: 0.246741. Entropy: 1.180118.
Iteration 12291: Policy loss: -0.001605. Value loss: 0.229836. Entropy: 1.176516.
Training network. lr: 0.000156. clip: 0.062371
Iteration 12292: Policy loss: 0.006311. Value loss: 0.080442. Entropy: 1.503853.
Iteration 12293: Policy loss: 0.001508. Value loss: 0.039678. Entropy: 1.502313.
Iteration 12294: Policy loss: -0.003943. Value loss: 0.030984. Entropy: 1.509574.
episode: 4396   score: 1010.0  epsilon: 1.

episode: 4417   score: 1060.0  epsilon: 1.0    steps: 496  evaluation reward: 1113.4
Training network. lr: 0.000155. clip: 0.062067
Iteration 12352: Policy loss: 0.004317. Value loss: 0.378499. Entropy: 0.797440.
Iteration 12353: Policy loss: -0.001457. Value loss: 0.362169. Entropy: 0.793192.
Iteration 12354: Policy loss: -0.003355. Value loss: 0.353568. Entropy: 0.787329.
episode: 4418   score: 890.0  epsilon: 1.0    steps: 632  evaluation reward: 1107.2
Training network. lr: 0.000155. clip: 0.062067
Iteration 12355: Policy loss: 0.007062. Value loss: 0.144330. Entropy: 1.090914.
Iteration 12356: Policy loss: 0.000506. Value loss: 0.111202. Entropy: 1.110543.
Iteration 12357: Policy loss: -0.001977. Value loss: 0.105187. Entropy: 1.103479.
Training network. lr: 0.000155. clip: 0.062067
Iteration 12358: Policy loss: 0.004313. Value loss: 0.306971. Entropy: 1.398967.
Iteration 12359: Policy loss: 0.002686. Value loss: 0.223778. Entropy: 1.409507.
Iteration 12360: Policy loss: 0.000711.

Iteration 12417: Policy loss: -0.006517. Value loss: 0.083656. Entropy: 1.402429.
episode: 4439   score: 1100.0  epsilon: 1.0    steps: 64  evaluation reward: 1123.7
Training network. lr: 0.000155. clip: 0.061910
Iteration 12418: Policy loss: 0.001059. Value loss: 0.212450. Entropy: 1.349809.
Iteration 12419: Policy loss: -0.001814. Value loss: 0.181537. Entropy: 1.344051.
Iteration 12420: Policy loss: -0.001543. Value loss: 0.183963. Entropy: 1.347425.
Training network. lr: 0.000155. clip: 0.061910
Iteration 12421: Policy loss: 0.002597. Value loss: 0.063309. Entropy: 1.495320.
Iteration 12422: Policy loss: -0.003260. Value loss: 0.037028. Entropy: 1.496840.
Iteration 12423: Policy loss: -0.002478. Value loss: 0.026982. Entropy: 1.490816.
episode: 4440   score: 1070.0  epsilon: 1.0    steps: 720  evaluation reward: 1125.0
episode: 4441   score: 1320.0  epsilon: 1.0    steps: 904  evaluation reward: 1128.1
Training network. lr: 0.000155. clip: 0.061910
Iteration 12424: Policy loss: 0.0

Iteration 12482: Policy loss: -0.002839. Value loss: 0.118573. Entropy: 1.173446.
Iteration 12483: Policy loss: -0.005765. Value loss: 0.096991. Entropy: 1.177653.
episode: 4461   score: 1210.0  epsilon: 1.0    steps: 336  evaluation reward: 1149.1
episode: 4462   score: 1090.0  epsilon: 1.0    steps: 1000  evaluation reward: 1151.9
Training network. lr: 0.000154. clip: 0.061763
Iteration 12484: Policy loss: 0.006974. Value loss: 0.292983. Entropy: 1.294414.
Iteration 12485: Policy loss: 0.006383. Value loss: 0.217355. Entropy: 1.287110.
Iteration 12486: Policy loss: -0.000370. Value loss: 0.196217. Entropy: 1.279756.
episode: 4463   score: 1310.0  epsilon: 1.0    steps: 376  evaluation reward: 1152.6
Training network. lr: 0.000154. clip: 0.061763
Iteration 12487: Policy loss: 0.011086. Value loss: 0.449143. Entropy: 1.187222.
Iteration 12488: Policy loss: 0.002227. Value loss: 0.321021. Entropy: 1.196081.
Iteration 12489: Policy loss: 0.004306. Value loss: 0.257925. Entropy: 1.191485.

Iteration 12549: Policy loss: -0.006220. Value loss: 0.110925. Entropy: 1.484604.
episode: 4482   score: 930.0  epsilon: 1.0    steps: 232  evaluation reward: 1158.5
episode: 4483   score: 1100.0  epsilon: 1.0    steps: 328  evaluation reward: 1157.6
Training network. lr: 0.000154. clip: 0.061606
Iteration 12550: Policy loss: 0.002097. Value loss: 0.179603. Entropy: 1.029143.
Iteration 12551: Policy loss: -0.000916. Value loss: 0.171814. Entropy: 1.019209.
Iteration 12552: Policy loss: -0.003094. Value loss: 0.167360. Entropy: 1.035691.
episode: 4484   score: 1010.0  epsilon: 1.0    steps: 560  evaluation reward: 1157.4
episode: 4485   score: 1480.0  epsilon: 1.0    steps: 720  evaluation reward: 1160.0
episode: 4486   score: 1220.0  epsilon: 1.0    steps: 864  evaluation reward: 1160.5
Training network. lr: 0.000154. clip: 0.061449
Iteration 12553: Policy loss: 0.010855. Value loss: 0.161008. Entropy: 1.080735.
Iteration 12554: Policy loss: 0.000703. Value loss: 0.099168. Entropy: 1.0

episode: 4502   score: 1130.0  epsilon: 1.0    steps: 360  evaluation reward: 1162.6
episode: 4503   score: 850.0  epsilon: 1.0    steps: 808  evaluation reward: 1160.5
episode: 4504   score: 1010.0  epsilon: 1.0    steps: 888  evaluation reward: 1159.1
Training network. lr: 0.000153. clip: 0.061302
Iteration 12616: Policy loss: 0.008277. Value loss: 0.200503. Entropy: 1.246442.
Iteration 12617: Policy loss: 0.001167. Value loss: 0.116122. Entropy: 1.248317.
Iteration 12618: Policy loss: -0.008836. Value loss: 0.095203. Entropy: 1.245537.
episode: 4505   score: 1240.0  epsilon: 1.0    steps: 336  evaluation reward: 1155.4
episode: 4506   score: 1020.0  epsilon: 1.0    steps: 600  evaluation reward: 1153.8
Training network. lr: 0.000153. clip: 0.061302
Iteration 12619: Policy loss: -0.002407. Value loss: 0.214791. Entropy: 0.736942.
Iteration 12620: Policy loss: -0.003336. Value loss: 0.168077. Entropy: 0.733971.
Iteration 12621: Policy loss: -0.004223. Value loss: 0.171733. Entropy: 0.

Iteration 12681: Policy loss: 0.001887. Value loss: 0.147521. Entropy: 1.474603.
Training network. lr: 0.000153. clip: 0.061145
Iteration 12682: Policy loss: 0.002901. Value loss: 0.345601. Entropy: 1.686373.
Iteration 12683: Policy loss: -0.004173. Value loss: 0.304498. Entropy: 1.685487.
Iteration 12684: Policy loss: -0.002441. Value loss: 0.282280. Entropy: 1.684880.
episode: 4525   score: 1460.0  epsilon: 1.0    steps: 688  evaluation reward: 1154.4
Training network. lr: 0.000153. clip: 0.061145
Iteration 12685: Policy loss: 0.000020. Value loss: 0.184639. Entropy: 1.736637.
Iteration 12686: Policy loss: -0.002763. Value loss: 0.140122. Entropy: 1.741469.
Iteration 12687: Policy loss: -0.007113. Value loss: 0.116934. Entropy: 1.730633.
episode: 4526   score: 1060.0  epsilon: 1.0    steps: 488  evaluation reward: 1151.6
Training network. lr: 0.000153. clip: 0.061145
Iteration 12688: Policy loss: 0.002812. Value loss: 0.199924. Entropy: 1.346928.
Iteration 12689: Policy loss: -0.0000

Iteration 12747: Policy loss: -0.005795. Value loss: 0.044332. Entropy: 1.400048.
episode: 4546   score: 1080.0  epsilon: 1.0    steps: 200  evaluation reward: 1164.3
episode: 4547   score: 1270.0  epsilon: 1.0    steps: 536  evaluation reward: 1162.2
Training network. lr: 0.000152. clip: 0.060989
Iteration 12748: Policy loss: 0.010626. Value loss: 0.521375. Entropy: 1.310460.
Iteration 12749: Policy loss: 0.008151. Value loss: 0.360251. Entropy: 1.309872.
Iteration 12750: Policy loss: 0.007967. Value loss: 0.291427. Entropy: 1.303289.
Training network. lr: 0.000152. clip: 0.060841
Iteration 12751: Policy loss: 0.004313. Value loss: 0.822987. Entropy: 1.276377.
Iteration 12752: Policy loss: 0.004755. Value loss: 0.673738. Entropy: 1.262958.
Iteration 12753: Policy loss: 0.004280. Value loss: 0.622990. Entropy: 1.262692.
Training network. lr: 0.000152. clip: 0.060841
Iteration 12754: Policy loss: 0.005641. Value loss: 0.215325. Entropy: 1.492006.
Iteration 12755: Policy loss: -0.003610.

Iteration 12811: Policy loss: 0.006353. Value loss: 0.412111. Entropy: 1.205310.
Iteration 12812: Policy loss: -0.002116. Value loss: 0.249548. Entropy: 1.207548.
Iteration 12813: Policy loss: -0.003839. Value loss: 0.211029. Entropy: 1.215243.
Training network. lr: 0.000152. clip: 0.060685
Iteration 12814: Policy loss: 0.000637. Value loss: 0.361091. Entropy: 1.337006.
Iteration 12815: Policy loss: -0.001064. Value loss: 0.218396. Entropy: 1.342298.
Iteration 12816: Policy loss: -0.001559. Value loss: 0.185637. Entropy: 1.337890.
episode: 4569   score: 1210.0  epsilon: 1.0    steps: 832  evaluation reward: 1158.1
episode: 4570   score: 1600.0  epsilon: 1.0    steps: 1016  evaluation reward: 1164.9
Training network. lr: 0.000152. clip: 0.060685
Iteration 12817: Policy loss: 0.000563. Value loss: 0.202401. Entropy: 1.523175.
Iteration 12818: Policy loss: -0.000617. Value loss: 0.167087. Entropy: 1.517933.
Iteration 12819: Policy loss: -0.003697. Value loss: 0.149568. Entropy: 1.517057.


Training network. lr: 0.000151. clip: 0.060528
Iteration 12877: Policy loss: 0.003665. Value loss: 0.327064. Entropy: 1.458942.
Iteration 12878: Policy loss: -0.000091. Value loss: 0.199152. Entropy: 1.472131.
Iteration 12879: Policy loss: -0.004106. Value loss: 0.153945. Entropy: 1.478051.
episode: 4591   score: 850.0  epsilon: 1.0    steps: 608  evaluation reward: 1158.6
episode: 4592   score: 680.0  epsilon: 1.0    steps: 648  evaluation reward: 1154.5
Training network. lr: 0.000151. clip: 0.060528
Iteration 12880: Policy loss: 0.002477. Value loss: 0.501109. Entropy: 1.529207.
Iteration 12881: Policy loss: 0.002353. Value loss: 0.337493. Entropy: 1.531344.
Iteration 12882: Policy loss: 0.001058. Value loss: 0.281185. Entropy: 1.527582.
episode: 4593   score: 1060.0  epsilon: 1.0    steps: 344  evaluation reward: 1155.8
Training network. lr: 0.000151. clip: 0.060528
Iteration 12883: Policy loss: 0.000557. Value loss: 0.260737. Entropy: 1.178399.
Iteration 12884: Policy loss: -0.0028

Iteration 12942: Policy loss: -0.002478. Value loss: 0.173422. Entropy: 0.947688.
episode: 4613   score: 930.0  epsilon: 1.0    steps: 256  evaluation reward: 1146.7
Training network. lr: 0.000151. clip: 0.060380
Iteration 12943: Policy loss: 0.002521. Value loss: 0.321102. Entropy: 1.183738.
Iteration 12944: Policy loss: 0.000084. Value loss: 0.251207. Entropy: 1.186447.
Iteration 12945: Policy loss: -0.004103. Value loss: 0.237750. Entropy: 1.192880.
Training network. lr: 0.000151. clip: 0.060380
Iteration 12946: Policy loss: 0.003462. Value loss: 0.173800. Entropy: 1.412924.
Iteration 12947: Policy loss: 0.001590. Value loss: 0.113640. Entropy: 1.402846.
Iteration 12948: Policy loss: -0.004296. Value loss: 0.088690. Entropy: 1.401139.
Training network. lr: 0.000151. clip: 0.060380
Iteration 12949: Policy loss: 0.003045. Value loss: 0.234905. Entropy: 1.469283.
Iteration 12950: Policy loss: 0.003785. Value loss: 0.155722. Entropy: 1.473174.
Iteration 12951: Policy loss: -0.002219. Va

episode: 4634   score: 1150.0  epsilon: 1.0    steps: 720  evaluation reward: 1115.6
Training network. lr: 0.000150. clip: 0.060067
Iteration 13009: Policy loss: 0.004576. Value loss: 0.407109. Entropy: 1.167732.
Iteration 13010: Policy loss: 0.007010. Value loss: 0.312350. Entropy: 1.189549.
Iteration 13011: Policy loss: 0.002817. Value loss: 0.285808. Entropy: 1.200578.
episode: 4635   score: 1460.0  epsilon: 1.0    steps: 144  evaluation reward: 1116.6
Training network. lr: 0.000150. clip: 0.060067
Iteration 13012: Policy loss: 0.002337. Value loss: 0.190836. Entropy: 1.191244.
Iteration 13013: Policy loss: -0.000091. Value loss: 0.154132. Entropy: 1.204028.
Iteration 13014: Policy loss: -0.003549. Value loss: 0.144744. Entropy: 1.170932.
episode: 4636   score: 1300.0  epsilon: 1.0    steps: 512  evaluation reward: 1115.4
Training network. lr: 0.000150. clip: 0.060067
Iteration 13015: Policy loss: 0.003182. Value loss: 0.103201. Entropy: 1.189619.
Iteration 13016: Policy loss: 0.000

episode: 4655   score: 1160.0  epsilon: 1.0    steps: 184  evaluation reward: 1115.5
episode: 4656   score: 1410.0  epsilon: 1.0    steps: 744  evaluation reward: 1117.2
Training network. lr: 0.000150. clip: 0.059920
Iteration 13075: Policy loss: 0.003898. Value loss: 0.298099. Entropy: 0.992226.
Iteration 13076: Policy loss: 0.004114. Value loss: 0.241107. Entropy: 0.990755.
Iteration 13077: Policy loss: -0.000295. Value loss: 0.229626. Entropy: 0.981043.
Training network. lr: 0.000150. clip: 0.059920
Iteration 13078: Policy loss: 0.011106. Value loss: 0.181501. Entropy: 1.075791.
Iteration 13079: Policy loss: 0.000912. Value loss: 0.108219. Entropy: 1.058847.
Iteration 13080: Policy loss: -0.003032. Value loss: 0.100997. Entropy: 1.048725.
episode: 4657   score: 1340.0  epsilon: 1.0    steps: 88  evaluation reward: 1123.5
episode: 4658   score: 500.0  epsilon: 1.0    steps: 160  evaluation reward: 1113.7
episode: 4659   score: 1130.0  epsilon: 1.0    steps: 352  evaluation reward: 11

episode: 4677   score: 930.0  epsilon: 1.0    steps: 608  evaluation reward: 1101.0
Training network. lr: 0.000149. clip: 0.059763
Iteration 13141: Policy loss: 0.003136. Value loss: 0.170329. Entropy: 1.279841.
Iteration 13142: Policy loss: -0.005486. Value loss: 0.113968. Entropy: 1.300444.
Iteration 13143: Policy loss: -0.007780. Value loss: 0.106110. Entropy: 1.297624.
Training network. lr: 0.000149. clip: 0.059763
Iteration 13144: Policy loss: 0.004072. Value loss: 0.174083. Entropy: 0.994948.
Iteration 13145: Policy loss: -0.002262. Value loss: 0.109431. Entropy: 0.984696.
Iteration 13146: Policy loss: -0.004562. Value loss: 0.094921. Entropy: 0.984426.
episode: 4678   score: 1060.0  epsilon: 1.0    steps: 608  evaluation reward: 1096.7
episode: 4679   score: 540.0  epsilon: 1.0    steps: 896  evaluation reward: 1091.8
Training network. lr: 0.000149. clip: 0.059763
Iteration 13147: Policy loss: -0.000390. Value loss: 0.104764. Entropy: 1.217065.
Iteration 13148: Policy loss: -0.0

Iteration 13205: Policy loss: -0.002465. Value loss: 0.122426. Entropy: 1.434760.
Iteration 13206: Policy loss: -0.005698. Value loss: 0.104889. Entropy: 1.436630.
episode: 4700   score: 1020.0  epsilon: 1.0    steps: 584  evaluation reward: 1086.9
Training network. lr: 0.000149. clip: 0.059459
Iteration 13207: Policy loss: 0.007332. Value loss: 0.407738. Entropy: 1.182629.
Iteration 13208: Policy loss: 0.003780. Value loss: 0.281591. Entropy: 1.193175.
Iteration 13209: Policy loss: 0.000049. Value loss: 0.247084. Entropy: 1.187836.
Training network. lr: 0.000149. clip: 0.059459
Iteration 13210: Policy loss: 0.003792. Value loss: 0.399634. Entropy: 1.278759.
Iteration 13211: Policy loss: 0.010006. Value loss: 0.279790. Entropy: 1.264213.
Iteration 13212: Policy loss: -0.000255. Value loss: 0.218196. Entropy: 1.258522.
Training network. lr: 0.000149. clip: 0.059459
Iteration 13213: Policy loss: 0.003755. Value loss: 0.183910. Entropy: 1.279629.
Iteration 13214: Policy loss: -0.000766. V

Iteration 13270: Policy loss: 0.001865. Value loss: 0.144948. Entropy: 0.809195.
Iteration 13271: Policy loss: 0.000637. Value loss: 0.122268. Entropy: 0.807036.
Iteration 13272: Policy loss: -0.003546. Value loss: 0.115520. Entropy: 0.807242.
episode: 4722   score: 1250.0  epsilon: 1.0    steps: 824  evaluation reward: 1097.1
episode: 4723   score: 1240.0  epsilon: 1.0    steps: 992  evaluation reward: 1100.8
Training network. lr: 0.000148. clip: 0.059302
Iteration 13273: Policy loss: 0.003351. Value loss: 0.180479. Entropy: 0.960898.
Iteration 13274: Policy loss: 0.001392. Value loss: 0.123070. Entropy: 0.977620.
Iteration 13275: Policy loss: -0.005417. Value loss: 0.109128. Entropy: 0.955499.
Training network. lr: 0.000148. clip: 0.059302
Iteration 13276: Policy loss: 0.005600. Value loss: 0.254947. Entropy: 1.093798.
Iteration 13277: Policy loss: 0.005526. Value loss: 0.191760. Entropy: 1.084270.
Iteration 13278: Policy loss: 0.004201. Value loss: 0.160844. Entropy: 1.070302.
episo

episode: 4741   score: 910.0  epsilon: 1.0    steps: 856  evaluation reward: 1104.4
Training network. lr: 0.000148. clip: 0.059145
Iteration 13339: Policy loss: 0.001485. Value loss: 0.040048. Entropy: 1.209617.
Iteration 13340: Policy loss: -0.004300. Value loss: 0.023765. Entropy: 1.181719.
Iteration 13341: Policy loss: -0.006247. Value loss: 0.020590. Entropy: 1.194012.
episode: 4742   score: 1280.0  epsilon: 1.0    steps: 592  evaluation reward: 1105.9
episode: 4743   score: 1110.0  epsilon: 1.0    steps: 912  evaluation reward: 1101.2
episode: 4744   score: 1040.0  epsilon: 1.0    steps: 984  evaluation reward: 1098.1
Training network. lr: 0.000148. clip: 0.059145
Iteration 13342: Policy loss: 0.002959. Value loss: 0.118296. Entropy: 1.115565.
Iteration 13343: Policy loss: 0.000294. Value loss: 0.106039. Entropy: 1.121882.
Iteration 13344: Policy loss: -0.000717. Value loss: 0.100945. Entropy: 1.134848.
Training network. lr: 0.000148. clip: 0.059145
Iteration 13345: Policy loss: 0

Iteration 13402: Policy loss: 0.005883. Value loss: 0.191271. Entropy: 1.272848.
Iteration 13403: Policy loss: 0.001099. Value loss: 0.108881. Entropy: 1.262261.
Iteration 13404: Policy loss: -0.001293. Value loss: 0.092160. Entropy: 1.263524.
Training network. lr: 0.000147. clip: 0.058841
Iteration 13405: Policy loss: 0.005196. Value loss: 0.209673. Entropy: 1.336946.
Iteration 13406: Policy loss: 0.002677. Value loss: 0.155063. Entropy: 1.326758.
Iteration 13407: Policy loss: -0.000545. Value loss: 0.115057. Entropy: 1.331036.
Training network. lr: 0.000147. clip: 0.058841
Iteration 13408: Policy loss: 0.005504. Value loss: 0.092318. Entropy: 1.417946.
Iteration 13409: Policy loss: -0.001151. Value loss: 0.045598. Entropy: 1.418488.
Iteration 13410: Policy loss: -0.005146. Value loss: 0.038654. Entropy: 1.421421.
episode: 4765   score: 970.0  epsilon: 1.0    steps: 992  evaluation reward: 1112.0
Training network. lr: 0.000147. clip: 0.058841
Iteration 13411: Policy loss: 0.002576. Va

Training network. lr: 0.000147. clip: 0.058685
Iteration 13468: Policy loss: 0.000926. Value loss: 0.094086. Entropy: 1.151548.
Iteration 13469: Policy loss: -0.003944. Value loss: 0.051088. Entropy: 1.148914.
Iteration 13470: Policy loss: -0.006014. Value loss: 0.040583. Entropy: 1.156575.
episode: 4787   score: 1260.0  epsilon: 1.0    steps: 384  evaluation reward: 1098.4
episode: 4788   score: 1250.0  epsilon: 1.0    steps: 760  evaluation reward: 1096.7
Training network. lr: 0.000147. clip: 0.058685
Iteration 13471: Policy loss: 0.003474. Value loss: 0.404881. Entropy: 1.173846.
Iteration 13472: Policy loss: 0.000897. Value loss: 0.290888. Entropy: 1.170740.
Iteration 13473: Policy loss: -0.000054. Value loss: 0.239921. Entropy: 1.159912.
Training network. lr: 0.000147. clip: 0.058685
Iteration 13474: Policy loss: 0.005886. Value loss: 0.212578. Entropy: 1.180128.
Iteration 13475: Policy loss: 0.001649. Value loss: 0.131440. Entropy: 1.189129.
Iteration 13476: Policy loss: -0.00461

Iteration 13533: Policy loss: -0.000840. Value loss: 0.122881. Entropy: 1.153742.
Training network. lr: 0.000146. clip: 0.058537
Iteration 13534: Policy loss: 0.003459. Value loss: 0.255220. Entropy: 1.358580.
Iteration 13535: Policy loss: 0.000815. Value loss: 0.160121. Entropy: 1.357337.
Iteration 13536: Policy loss: -0.000751. Value loss: 0.118048. Entropy: 1.364962.
Training network. lr: 0.000146. clip: 0.058537
Iteration 13537: Policy loss: 0.004979. Value loss: 0.191036. Entropy: 1.356123.
Iteration 13538: Policy loss: -0.001283. Value loss: 0.124516. Entropy: 1.343118.
Iteration 13539: Policy loss: -0.003778. Value loss: 0.074997. Entropy: 1.354369.
episode: 4809   score: 1300.0  epsilon: 1.0    steps: 128  evaluation reward: 1136.5
Training network. lr: 0.000146. clip: 0.058537
Iteration 13540: Policy loss: 0.008524. Value loss: 0.151764. Entropy: 1.452029.
Iteration 13541: Policy loss: 0.002187. Value loss: 0.111199. Entropy: 1.440421.
Iteration 13542: Policy loss: -0.000069. 

Iteration 13600: Policy loss: 0.002124. Value loss: 0.100503. Entropy: 1.327180.
Iteration 13601: Policy loss: -0.002440. Value loss: 0.066994. Entropy: 1.335121.
Iteration 13602: Policy loss: -0.007021. Value loss: 0.055674. Entropy: 1.325882.
episode: 4831   score: 1200.0  epsilon: 1.0    steps: 376  evaluation reward: 1116.4
Training network. lr: 0.000146. clip: 0.058224
Iteration 13603: Policy loss: 0.008355. Value loss: 0.365892. Entropy: 1.270470.
Iteration 13604: Policy loss: 0.007717. Value loss: 0.317447. Entropy: 1.288697.
Iteration 13605: Policy loss: 0.002321. Value loss: 0.287362. Entropy: 1.273686.
Training network. lr: 0.000146. clip: 0.058224
Iteration 13606: Policy loss: 0.003660. Value loss: 0.092568. Entropy: 1.099388.
Iteration 13607: Policy loss: -0.001512. Value loss: 0.057559. Entropy: 1.095921.
Iteration 13608: Policy loss: -0.004880. Value loss: 0.048666. Entropy: 1.089924.
episode: 4832   score: 770.0  epsilon: 1.0    steps: 896  evaluation reward: 1116.1
Trai

episode: 4850   score: 990.0  epsilon: 1.0    steps: 248  evaluation reward: 1081.9
now time :  2019-03-06 06:09:39.521981
episode: 4851   score: 940.0  epsilon: 1.0    steps: 256  evaluation reward: 1083.5
Training network. lr: 0.000145. clip: 0.058076
Iteration 13669: Policy loss: 0.007335. Value loss: 0.225698. Entropy: 1.459505.
Iteration 13670: Policy loss: -0.000404. Value loss: 0.180205. Entropy: 1.463612.
Iteration 13671: Policy loss: -0.000477. Value loss: 0.164411. Entropy: 1.459574.
episode: 4852   score: 790.0  epsilon: 1.0    steps: 56  evaluation reward: 1083.0
episode: 4853   score: 1560.0  epsilon: 1.0    steps: 624  evaluation reward: 1087.7
Training network. lr: 0.000145. clip: 0.058076
Iteration 13672: Policy loss: 0.005633. Value loss: 0.136139. Entropy: 1.261588.
Iteration 13673: Policy loss: 0.003229. Value loss: 0.108296. Entropy: 1.272848.
Iteration 13674: Policy loss: -0.002824. Value loss: 0.102130. Entropy: 1.265753.
episode: 4854   score: 890.0  epsilon: 1.0

Training network. lr: 0.000145. clip: 0.057920
Iteration 13732: Policy loss: 0.007727. Value loss: 0.373036. Entropy: 1.595695.
Iteration 13733: Policy loss: 0.005744. Value loss: 0.244676. Entropy: 1.586764.
Iteration 13734: Policy loss: -0.000914. Value loss: 0.239069. Entropy: 1.575046.
Training network. lr: 0.000145. clip: 0.057920
Iteration 13735: Policy loss: 0.007803. Value loss: 0.306907. Entropy: 1.575218.
Iteration 13736: Policy loss: 0.004065. Value loss: 0.237238. Entropy: 1.570779.
Iteration 13737: Policy loss: -0.003541. Value loss: 0.209293. Entropy: 1.574642.
Training network. lr: 0.000145. clip: 0.057920
Iteration 13738: Policy loss: 0.004516. Value loss: 1.133838. Entropy: 1.570718.
Iteration 13739: Policy loss: 0.005145. Value loss: 1.033265. Entropy: 1.572709.
Iteration 13740: Policy loss: 0.003436. Value loss: 0.904943. Entropy: 1.564302.
episode: 4875   score: 700.0  epsilon: 1.0    steps: 288  evaluation reward: 1069.4
episode: 4876   score: 1470.0  epsilon: 1.0 

Training network. lr: 0.000144. clip: 0.057763
Iteration 13798: Policy loss: 0.007205. Value loss: 1.069483. Entropy: 1.438049.
Iteration 13799: Policy loss: 0.000645. Value loss: 0.992469. Entropy: 1.436984.
Iteration 13800: Policy loss: 0.004622. Value loss: 0.934560. Entropy: 1.429776.
Training network. lr: 0.000144. clip: 0.057616
Iteration 13801: Policy loss: 0.002859. Value loss: 0.299845. Entropy: 1.373856.
Iteration 13802: Policy loss: -0.000776. Value loss: 0.200756. Entropy: 1.372938.
Iteration 13803: Policy loss: 0.000192. Value loss: 0.147636. Entropy: 1.379004.
Training network. lr: 0.000144. clip: 0.057616
Iteration 13804: Policy loss: 0.003325. Value loss: 0.252728. Entropy: 1.374478.
Iteration 13805: Policy loss: -0.002084. Value loss: 0.153272. Entropy: 1.386387.
Iteration 13806: Policy loss: -0.005631. Value loss: 0.116604. Entropy: 1.383677.
episode: 4898   score: 740.0  epsilon: 1.0    steps: 88  evaluation reward: 1073.2
episode: 4899   score: 780.0  epsilon: 1.0  

episode: 4919   score: 1640.0  epsilon: 1.0    steps: 520  evaluation reward: 1049.0
Training network. lr: 0.000144. clip: 0.057459
Iteration 13864: Policy loss: 0.001250. Value loss: 0.305778. Entropy: 1.479983.
Iteration 13865: Policy loss: -0.000311. Value loss: 0.188967. Entropy: 1.473442.
Iteration 13866: Policy loss: -0.002998. Value loss: 0.150342. Entropy: 1.465995.
episode: 4920   score: 1140.0  epsilon: 1.0    steps: 392  evaluation reward: 1044.5
episode: 4921   score: 840.0  epsilon: 1.0    steps: 656  evaluation reward: 1040.4
Training network. lr: 0.000144. clip: 0.057459
Iteration 13867: Policy loss: 0.002060. Value loss: 0.179982. Entropy: 1.486765.
Iteration 13868: Policy loss: -0.000791. Value loss: 0.132010. Entropy: 1.474928.
Iteration 13869: Policy loss: -0.003725. Value loss: 0.118757. Entropy: 1.472605.
episode: 4922   score: 830.0  epsilon: 1.0    steps: 264  evaluation reward: 1037.0
Training network. lr: 0.000144. clip: 0.057459
Iteration 13870: Policy loss: 0

Iteration 13929: Policy loss: -0.003255. Value loss: 0.240066. Entropy: 1.171577.
episode: 4942   score: 830.0  epsilon: 1.0    steps: 616  evaluation reward: 1016.8
episode: 4943   score: 950.0  epsilon: 1.0    steps: 664  evaluation reward: 1012.1
episode: 4944   score: 920.0  epsilon: 1.0    steps: 800  evaluation reward: 1015.3
Training network. lr: 0.000143. clip: 0.057302
Iteration 13930: Policy loss: 0.001116. Value loss: 0.164537. Entropy: 1.261900.
Iteration 13931: Policy loss: 0.002909. Value loss: 0.117118. Entropy: 1.272514.
Iteration 13932: Policy loss: -0.004396. Value loss: 0.104734. Entropy: 1.277529.
Training network. lr: 0.000143. clip: 0.057302
Iteration 13933: Policy loss: 0.003507. Value loss: 0.304108. Entropy: 0.994613.
Iteration 13934: Policy loss: 0.005147. Value loss: 0.215576. Entropy: 1.000734.
Iteration 13935: Policy loss: 0.000710. Value loss: 0.197185. Entropy: 0.998473.
episode: 4945   score: 1530.0  epsilon: 1.0    steps: 840  evaluation reward: 1022.1


Iteration 13993: Policy loss: 0.006108. Value loss: 0.395960. Entropy: 1.438344.
Iteration 13994: Policy loss: 0.003470. Value loss: 0.298401. Entropy: 1.445309.
Iteration 13995: Policy loss: 0.000897. Value loss: 0.275044. Entropy: 1.436857.
episode: 4965   score: 880.0  epsilon: 1.0    steps: 24  evaluation reward: 1059.8
Training network. lr: 0.000143. clip: 0.057155
Iteration 13996: Policy loss: 0.008955. Value loss: 0.353934. Entropy: 1.038249.
Iteration 13997: Policy loss: 0.005243. Value loss: 0.235747. Entropy: 1.042604.
Iteration 13998: Policy loss: 0.002023. Value loss: 0.209626. Entropy: 1.034455.
Training network. lr: 0.000143. clip: 0.057155
Iteration 13999: Policy loss: 0.013406. Value loss: 0.300329. Entropy: 1.121373.
Iteration 14000: Policy loss: 0.009459. Value loss: 0.162259. Entropy: 1.127678.
Iteration 14001: Policy loss: 0.000929. Value loss: 0.126381. Entropy: 1.126885.
episode: 4966   score: 700.0  epsilon: 1.0    steps: 728  evaluation reward: 1060.9
Training n

episode: 4987   score: 900.0  epsilon: 1.0    steps: 856  evaluation reward: 1042.2
Training network. lr: 0.000142. clip: 0.056841
Iteration 14059: Policy loss: 0.010766. Value loss: 0.490966. Entropy: 1.638794.
Iteration 14060: Policy loss: 0.008608. Value loss: 0.327146. Entropy: 1.632251.
Iteration 14061: Policy loss: 0.007986. Value loss: 0.244758. Entropy: 1.643270.
episode: 4988   score: 270.0  epsilon: 1.0    steps: 192  evaluation reward: 1037.3
episode: 4989   score: 630.0  epsilon: 1.0    steps: 736  evaluation reward: 1037.5
Training network. lr: 0.000142. clip: 0.056841
Iteration 14062: Policy loss: 0.008060. Value loss: 0.411500. Entropy: 1.407918.
Iteration 14063: Policy loss: 0.004208. Value loss: 0.290785. Entropy: 1.411667.
Iteration 14064: Policy loss: 0.005038. Value loss: 0.239219. Entropy: 1.422789.
episode: 4990   score: 1210.0  epsilon: 1.0    steps: 728  evaluation reward: 1033.7
Training network. lr: 0.000142. clip: 0.056841
Iteration 14065: Policy loss: 0.0065

Iteration 14124: Policy loss: -0.001209. Value loss: 0.122105. Entropy: 1.455297.
Training network. lr: 0.000142. clip: 0.056694
Iteration 14125: Policy loss: 0.001804. Value loss: 0.408070. Entropy: 1.190933.
Iteration 14126: Policy loss: -0.000577. Value loss: 0.304569. Entropy: 1.186065.
Iteration 14127: Policy loss: -0.005479. Value loss: 0.268918. Entropy: 1.180303.
episode: 5009   score: 920.0  epsilon: 1.0    steps: 288  evaluation reward: 1034.4
Training network. lr: 0.000142. clip: 0.056694
Iteration 14128: Policy loss: 0.005686. Value loss: 0.615202. Entropy: 1.200196.
Iteration 14129: Policy loss: 0.002578. Value loss: 0.522351. Entropy: 1.201720.
Iteration 14130: Policy loss: 0.014354. Value loss: 0.488123. Entropy: 1.218780.
episode: 5010   score: 1410.0  epsilon: 1.0    steps: 224  evaluation reward: 1039.4
episode: 5011   score: 1360.0  epsilon: 1.0    steps: 552  evaluation reward: 1043.2
episode: 5012   score: 670.0  epsilon: 1.0    steps: 928  evaluation reward: 1040.

Iteration 14188: Policy loss: 0.010498. Value loss: 0.523052. Entropy: 1.340046.
Iteration 14189: Policy loss: 0.005529. Value loss: 0.364708. Entropy: 1.303723.
Iteration 14190: Policy loss: 0.002120. Value loss: 0.307200. Entropy: 1.304884.
episode: 5033   score: 1010.0  epsilon: 1.0    steps: 360  evaluation reward: 1032.1
Training network. lr: 0.000141. clip: 0.056537
Iteration 14191: Policy loss: 0.003990. Value loss: 0.214048. Entropy: 1.331915.
Iteration 14192: Policy loss: 0.002918. Value loss: 0.166313. Entropy: 1.342738.
Iteration 14193: Policy loss: -0.004857. Value loss: 0.149810. Entropy: 1.340472.
Training network. lr: 0.000141. clip: 0.056537
Iteration 14194: Policy loss: 0.001540. Value loss: 0.587481. Entropy: 1.465539.
Iteration 14195: Policy loss: 0.005495. Value loss: 0.361616. Entropy: 1.460668.
Iteration 14196: Policy loss: 0.000757. Value loss: 0.248216. Entropy: 1.457868.
Training network. lr: 0.000141. clip: 0.056537
Iteration 14197: Policy loss: 0.011526. Valu

Iteration 14254: Policy loss: 0.005359. Value loss: 0.152507. Entropy: 1.505290.
Iteration 14255: Policy loss: 0.000834. Value loss: 0.121541. Entropy: 1.514291.
Iteration 14256: Policy loss: -0.002161. Value loss: 0.101744. Entropy: 1.508066.
episode: 5054   score: 840.0  epsilon: 1.0    steps: 112  evaluation reward: 1041.5
episode: 5055   score: 800.0  epsilon: 1.0    steps: 216  evaluation reward: 1036.8
Training network. lr: 0.000141. clip: 0.056233
Iteration 14257: Policy loss: 0.004032. Value loss: 0.296673. Entropy: 1.324456.
Iteration 14258: Policy loss: 0.000964. Value loss: 0.245962. Entropy: 1.345182.
Iteration 14259: Policy loss: 0.000767. Value loss: 0.237174. Entropy: 1.344358.
episode: 5056   score: 960.0  epsilon: 1.0    steps: 520  evaluation reward: 1025.7
Training network. lr: 0.000141. clip: 0.056233
Iteration 14260: Policy loss: 0.002030. Value loss: 0.171756. Entropy: 1.374208.
Iteration 14261: Policy loss: -0.001856. Value loss: 0.128042. Entropy: 1.388031.
Iter

Training network. lr: 0.000140. clip: 0.056077
Iteration 14320: Policy loss: 0.002986. Value loss: 0.088815. Entropy: 1.315465.
Iteration 14321: Policy loss: 0.003149. Value loss: 0.044647. Entropy: 1.322555.
Iteration 14322: Policy loss: -0.000681. Value loss: 0.027951. Entropy: 1.321022.
episode: 5077   score: 1170.0  epsilon: 1.0    steps: 224  evaluation reward: 1015.9
episode: 5078   score: 1180.0  epsilon: 1.0    steps: 440  evaluation reward: 1014.9
episode: 5079   score: 850.0  epsilon: 1.0    steps: 608  evaluation reward: 1015.0
Training network. lr: 0.000140. clip: 0.056077
Iteration 14323: Policy loss: 0.003792. Value loss: 0.345767. Entropy: 1.051580.
Iteration 14324: Policy loss: -0.000504. Value loss: 0.289244. Entropy: 1.058845.
Iteration 14325: Policy loss: 0.000582. Value loss: 0.263665. Entropy: 1.049483.
Training network. lr: 0.000140. clip: 0.056077
Iteration 14326: Policy loss: 0.006404. Value loss: 0.366530. Entropy: 0.917835.
Iteration 14327: Policy loss: 0.0022

Iteration 14384: Policy loss: -0.002505. Value loss: 0.108627. Entropy: 1.207645.
Iteration 14385: Policy loss: -0.003462. Value loss: 0.098565. Entropy: 1.208467.
Training network. lr: 0.000140. clip: 0.055920
Iteration 14386: Policy loss: 0.008755. Value loss: 0.202813. Entropy: 1.365740.
Iteration 14387: Policy loss: 0.001530. Value loss: 0.119969. Entropy: 1.391590.
Iteration 14388: Policy loss: -0.000432. Value loss: 0.082925. Entropy: 1.385318.
Training network. lr: 0.000140. clip: 0.055920
Iteration 14389: Policy loss: 0.011618. Value loss: 0.164525. Entropy: 1.261464.
Iteration 14390: Policy loss: 0.000928. Value loss: 0.085151. Entropy: 1.272340.
Iteration 14391: Policy loss: -0.003138. Value loss: 0.061756. Entropy: 1.265215.
now time :  2019-03-06 06:19:26.379496
episode: 5101   score: 820.0  epsilon: 1.0    steps: 96  evaluation reward: 1050.6
episode: 5102   score: 1190.0  epsilon: 1.0    steps: 480  evaluation reward: 1045.1
episode: 5103   score: 1190.0  epsilon: 1.0    

episode: 5124   score: 860.0  epsilon: 1.0    steps: 808  evaluation reward: 1034.3
Training network. lr: 0.000139. clip: 0.055772
Iteration 14449: Policy loss: 0.002650. Value loss: 0.320418. Entropy: 1.121930.
Iteration 14450: Policy loss: 0.004976. Value loss: 0.188715. Entropy: 1.109995.
Iteration 14451: Policy loss: -0.001105. Value loss: 0.152649. Entropy: 1.113456.
Training network. lr: 0.000139. clip: 0.055616
Iteration 14452: Policy loss: 0.006852. Value loss: 0.477000. Entropy: 1.212904.
Iteration 14453: Policy loss: 0.004756. Value loss: 0.341080. Entropy: 1.225067.
Iteration 14454: Policy loss: 0.001928. Value loss: 0.302401. Entropy: 1.219419.
Training network. lr: 0.000139. clip: 0.055616
Iteration 14455: Policy loss: 0.001003. Value loss: 0.227812. Entropy: 1.410732.
Iteration 14456: Policy loss: 0.001643. Value loss: 0.162124. Entropy: 1.412871.
Iteration 14457: Policy loss: -0.002333. Value loss: 0.141383. Entropy: 1.409288.
episode: 5125   score: 1640.0  epsilon: 1.0 

Training network. lr: 0.000139. clip: 0.055459
Iteration 14515: Policy loss: 0.001349. Value loss: 0.215511. Entropy: 1.037650.
Iteration 14516: Policy loss: 0.001057. Value loss: 0.149516. Entropy: 1.033849.
Iteration 14517: Policy loss: -0.002673. Value loss: 0.132253. Entropy: 1.061729.
Training network. lr: 0.000139. clip: 0.055459
Iteration 14518: Policy loss: 0.004627. Value loss: 0.171638. Entropy: 1.030805.
Iteration 14519: Policy loss: 0.000816. Value loss: 0.102519. Entropy: 1.041291.
Iteration 14520: Policy loss: 0.000101. Value loss: 0.081145. Entropy: 1.030789.
episode: 5146   score: 1250.0  epsilon: 1.0    steps: 8  evaluation reward: 1047.6
episode: 5147   score: 1420.0  epsilon: 1.0    steps: 888  evaluation reward: 1049.8
episode: 5148   score: 970.0  epsilon: 1.0    steps: 968  evaluation reward: 1039.2
Training network. lr: 0.000139. clip: 0.055459
Iteration 14521: Policy loss: 0.004582. Value loss: 0.216487. Entropy: 1.329286.
Iteration 14522: Policy loss: -0.001689

episode: 5167   score: 840.0  epsilon: 1.0    steps: 704  evaluation reward: 1064.8
Training network. lr: 0.000138. clip: 0.055312
Iteration 14581: Policy loss: 0.006255. Value loss: 0.353210. Entropy: 1.240329.
Iteration 14582: Policy loss: 0.003845. Value loss: 0.241651. Entropy: 1.247657.
Iteration 14583: Policy loss: -0.000783. Value loss: 0.224369. Entropy: 1.220863.
episode: 5168   score: 1840.0  epsilon: 1.0    steps: 448  evaluation reward: 1074.0
Training network. lr: 0.000138. clip: 0.055312
Iteration 14584: Policy loss: 0.008670. Value loss: 0.238744. Entropy: 1.129078.
Iteration 14585: Policy loss: 0.002529. Value loss: 0.164892. Entropy: 1.143654.
Iteration 14586: Policy loss: 0.001878. Value loss: 0.154371. Entropy: 1.126840.
episode: 5169   score: 1500.0  epsilon: 1.0    steps: 448  evaluation reward: 1079.7
episode: 5170   score: 880.0  epsilon: 1.0    steps: 552  evaluation reward: 1080.5
Training network. lr: 0.000138. clip: 0.055312
Iteration 14587: Policy loss: 0.00

episode: 5189   score: 460.0  epsilon: 1.0    steps: 960  evaluation reward: 1046.8
Training network. lr: 0.000138. clip: 0.055155
Iteration 14647: Policy loss: 0.007524. Value loss: 0.255889. Entropy: 1.597272.
Iteration 14648: Policy loss: 0.007685. Value loss: 0.157391. Entropy: 1.591717.
Iteration 14649: Policy loss: 0.003766. Value loss: 0.121948. Entropy: 1.592279.
episode: 5190   score: 900.0  epsilon: 1.0    steps: 432  evaluation reward: 1053.6
episode: 5191   score: 1190.0  epsilon: 1.0    steps: 744  evaluation reward: 1051.2
Training network. lr: 0.000138. clip: 0.055155
Iteration 14650: Policy loss: 0.005083. Value loss: 0.282082. Entropy: 1.264056.
Iteration 14651: Policy loss: -0.001923. Value loss: 0.256098. Entropy: 1.249728.
Iteration 14652: Policy loss: -0.002337. Value loss: 0.243033. Entropy: 1.269822.
episode: 5192   score: 820.0  epsilon: 1.0    steps: 184  evaluation reward: 1041.9
episode: 5193   score: 870.0  epsilon: 1.0    steps: 656  evaluation reward: 1044

Iteration 14711: Policy loss: 0.004891. Value loss: 0.434043. Entropy: 1.602595.
Iteration 14712: Policy loss: 0.005093. Value loss: 0.358008. Entropy: 1.611105.
Training network. lr: 0.000137. clip: 0.054851
Iteration 14713: Policy loss: 0.007050. Value loss: 0.561666. Entropy: 1.446728.
Iteration 14714: Policy loss: 0.004395. Value loss: 0.447897. Entropy: 1.413170.
Iteration 14715: Policy loss: 0.002399. Value loss: 0.415659. Entropy: 1.418675.
episode: 5212   score: 1190.0  epsilon: 1.0    steps: 512  evaluation reward: 1057.2
Training network. lr: 0.000137. clip: 0.054851
Iteration 14716: Policy loss: 0.002139. Value loss: 0.057352. Entropy: 1.288842.
Iteration 14717: Policy loss: -0.000093. Value loss: 0.037929. Entropy: 1.286034.
Iteration 14718: Policy loss: -0.004910. Value loss: 0.033568. Entropy: 1.283644.
episode: 5213   score: 900.0  epsilon: 1.0    steps: 200  evaluation reward: 1057.4
episode: 5214   score: 1050.0  epsilon: 1.0    steps: 240  evaluation reward: 1057.1
ep

Training network. lr: 0.000137. clip: 0.054694
Iteration 14776: Policy loss: 0.005410. Value loss: 0.297940. Entropy: 1.288594.
Iteration 14777: Policy loss: 0.009187. Value loss: 0.177772. Entropy: 1.291891.
Iteration 14778: Policy loss: 0.003864. Value loss: 0.137968. Entropy: 1.301061.
episode: 5235   score: 780.0  epsilon: 1.0    steps: 320  evaluation reward: 1081.8
Training network. lr: 0.000137. clip: 0.054694
Iteration 14779: Policy loss: 0.005680. Value loss: 0.207282. Entropy: 1.273693.
Iteration 14780: Policy loss: -0.000274. Value loss: 0.158808. Entropy: 1.272129.
Iteration 14781: Policy loss: -0.002397. Value loss: 0.146841. Entropy: 1.268148.
episode: 5236   score: 940.0  epsilon: 1.0    steps: 336  evaluation reward: 1079.3
episode: 5237   score: 1340.0  epsilon: 1.0    steps: 1016  evaluation reward: 1081.9
Training network. lr: 0.000137. clip: 0.054694
Iteration 14782: Policy loss: 0.002391. Value loss: 0.101911. Entropy: 1.140242.
Iteration 14783: Policy loss: -0.003

Training network. lr: 0.000136. clip: 0.054537
Iteration 14842: Policy loss: -0.001354. Value loss: 0.187232. Entropy: 1.139356.
Iteration 14843: Policy loss: 0.001564. Value loss: 0.133339. Entropy: 1.141906.
Iteration 14844: Policy loss: -0.002972. Value loss: 0.119012. Entropy: 1.146978.
episode: 5256   score: 630.0  epsilon: 1.0    steps: 520  evaluation reward: 1069.5
Training network. lr: 0.000136. clip: 0.054537
Iteration 14845: Policy loss: 0.006892. Value loss: 0.214680. Entropy: 1.443764.
Iteration 14846: Policy loss: 0.000861. Value loss: 0.107337. Entropy: 1.440835.
Iteration 14847: Policy loss: -0.002960. Value loss: 0.078017. Entropy: 1.437927.
episode: 5257   score: 1410.0  epsilon: 1.0    steps: 8  evaluation reward: 1072.2
episode: 5258   score: 690.0  epsilon: 1.0    steps: 976  evaluation reward: 1066.7
Training network. lr: 0.000136. clip: 0.054537
Iteration 14848: Policy loss: 0.002115. Value loss: 0.248253. Entropy: 1.459256.
Iteration 14849: Policy loss: -0.00186

Iteration 14908: Policy loss: -0.000019. Value loss: 0.165649. Entropy: 1.534314.
Iteration 14909: Policy loss: -0.003459. Value loss: 0.102627. Entropy: 1.539472.
Iteration 14910: Policy loss: -0.002216. Value loss: 0.078532. Entropy: 1.538242.
episode: 5277   score: 1090.0  epsilon: 1.0    steps: 184  evaluation reward: 1051.1
episode: 5278   score: 1150.0  epsilon: 1.0    steps: 776  evaluation reward: 1053.4
Training network. lr: 0.000136. clip: 0.054233
Iteration 14911: Policy loss: 0.004610. Value loss: 0.184785. Entropy: 1.512750.
Iteration 14912: Policy loss: 0.001760. Value loss: 0.163040. Entropy: 1.493970.
Iteration 14913: Policy loss: -0.001317. Value loss: 0.164915. Entropy: 1.500815.
episode: 5279   score: 950.0  epsilon: 1.0    steps: 16  evaluation reward: 1047.0
episode: 5280   score: 980.0  epsilon: 1.0    steps: 840  evaluation reward: 1045.3
episode: 5281   score: 860.0  epsilon: 1.0    steps: 904  evaluation reward: 1043.4
Training network. lr: 0.000136. clip: 0.05

Training network. lr: 0.000135. clip: 0.054077
Iteration 14974: Policy loss: 0.001996. Value loss: 0.194045. Entropy: 1.020359.
Iteration 14975: Policy loss: -0.002546. Value loss: 0.140053. Entropy: 1.017997.
Iteration 14976: Policy loss: -0.005526. Value loss: 0.131357. Entropy: 1.007335.
episode: 5299   score: 1020.0  epsilon: 1.0    steps: 120  evaluation reward: 1049.3
episode: 5300   score: 1230.0  epsilon: 1.0    steps: 568  evaluation reward: 1053.1
Training network. lr: 0.000135. clip: 0.054077
Iteration 14977: Policy loss: 0.000986. Value loss: 0.153616. Entropy: 0.873249.
Iteration 14978: Policy loss: -0.000389. Value loss: 0.116191. Entropy: 0.878314.
Iteration 14979: Policy loss: -0.001762. Value loss: 0.109292. Entropy: 0.875396.
now time :  2019-03-06 06:27:24.189655
episode: 5301   score: 1250.0  epsilon: 1.0    steps: 304  evaluation reward: 1054.5
Training network. lr: 0.000135. clip: 0.054077
Iteration 14980: Policy loss: 0.001942. Value loss: 0.265332. Entropy: 0.79

Iteration 15039: Policy loss: 0.000717. Value loss: 0.152468. Entropy: 1.471324.
episode: 5321   score: 930.0  epsilon: 1.0    steps: 408  evaluation reward: 1062.5
Training network. lr: 0.000135. clip: 0.053929
Iteration 15040: Policy loss: 0.004372. Value loss: 0.180211. Entropy: 1.675213.
Iteration 15041: Policy loss: -0.000493. Value loss: 0.123296. Entropy: 1.676113.
Iteration 15042: Policy loss: -0.001090. Value loss: 0.101824. Entropy: 1.676518.
Training network. lr: 0.000135. clip: 0.053929
Iteration 15043: Policy loss: 0.011795. Value loss: 0.215067. Entropy: 1.522094.
Iteration 15044: Policy loss: 0.003962. Value loss: 0.142744. Entropy: 1.515856.
Iteration 15045: Policy loss: 0.001057. Value loss: 0.122496. Entropy: 1.507754.
episode: 5322   score: 960.0  epsilon: 1.0    steps: 16  evaluation reward: 1059.3
episode: 5323   score: 780.0  epsilon: 1.0    steps: 528  evaluation reward: 1056.2
episode: 5324   score: 1080.0  epsilon: 1.0    steps: 800  evaluation reward: 1058.8
T

Iteration 15103: Policy loss: 0.013303. Value loss: 0.905336. Entropy: 1.171974.
Iteration 15104: Policy loss: 0.012164. Value loss: 0.705994. Entropy: 1.188323.
Iteration 15105: Policy loss: 0.004031. Value loss: 0.569811. Entropy: 1.161260.
Training network. lr: 0.000134. clip: 0.053616
Iteration 15106: Policy loss: 0.003650. Value loss: 0.331885. Entropy: 0.888196.
Iteration 15107: Policy loss: 0.002340. Value loss: 0.224982. Entropy: 0.902197.
Iteration 15108: Policy loss: -0.000442. Value loss: 0.204324. Entropy: 0.907789.
episode: 5345   score: 730.0  epsilon: 1.0    steps: 40  evaluation reward: 1053.4
episode: 5346   score: 1240.0  epsilon: 1.0    steps: 672  evaluation reward: 1054.4
Training network. lr: 0.000134. clip: 0.053616
Iteration 15109: Policy loss: 0.009778. Value loss: 0.318731. Entropy: 1.011680.
Iteration 15110: Policy loss: 0.002141. Value loss: 0.212066. Entropy: 1.026471.
Iteration 15111: Policy loss: 0.001397. Value loss: 0.200559. Entropy: 1.010581.
episode:

episode: 5367   score: 760.0  epsilon: 1.0    steps: 496  evaluation reward: 1085.1
Training network. lr: 0.000134. clip: 0.053468
Iteration 15169: Policy loss: 0.012344. Value loss: 0.458577. Entropy: 1.515381.
Iteration 15170: Policy loss: 0.005079. Value loss: 0.312526. Entropy: 1.538940.
Iteration 15171: Policy loss: 0.004527. Value loss: 0.260086. Entropy: 1.537328.
Training network. lr: 0.000134. clip: 0.053468
Iteration 15172: Policy loss: 0.004660. Value loss: 0.115058. Entropy: 1.429832.
Iteration 15173: Policy loss: 0.006482. Value loss: 0.077969. Entropy: 1.416513.
Iteration 15174: Policy loss: 0.009049. Value loss: 0.058332. Entropy: 1.420679.
episode: 5368   score: 930.0  epsilon: 1.0    steps: 752  evaluation reward: 1086.1
Training network. lr: 0.000134. clip: 0.053468
Iteration 15175: Policy loss: 0.007594. Value loss: 0.097953. Entropy: 1.394619.
Iteration 15176: Policy loss: 0.003562. Value loss: 0.058583. Entropy: 1.391825.
Iteration 15177: Policy loss: -0.001121. Va

episode: 5390   score: 890.0  epsilon: 1.0    steps: 960  evaluation reward: 1122.8
Training network. lr: 0.000133. clip: 0.053312
Iteration 15235: Policy loss: 0.007897. Value loss: 0.563300. Entropy: 1.176800.
Iteration 15236: Policy loss: 0.004981. Value loss: 0.399486. Entropy: 1.151672.
Iteration 15237: Policy loss: 0.002280. Value loss: 0.346379. Entropy: 1.165579.
Training network. lr: 0.000133. clip: 0.053312
Iteration 15238: Policy loss: 0.007273. Value loss: 0.464165. Entropy: 1.051814.
Iteration 15239: Policy loss: 0.001491. Value loss: 0.336862. Entropy: 1.056761.
Iteration 15240: Policy loss: 0.006774. Value loss: 0.275528. Entropy: 1.050562.
Training network. lr: 0.000133. clip: 0.053312
Iteration 15241: Policy loss: 0.005363. Value loss: 0.193519. Entropy: 1.159467.
Iteration 15242: Policy loss: 0.000777. Value loss: 0.115871. Entropy: 1.140750.
Iteration 15243: Policy loss: -0.004082. Value loss: 0.084186. Entropy: 1.136617.
episode: 5391   score: 1000.0  epsilon: 1.0  

Iteration 15300: Policy loss: 0.004259. Value loss: 0.100627. Entropy: 1.265304.
Training network. lr: 0.000133. clip: 0.053008
Iteration 15301: Policy loss: 0.008939. Value loss: 0.593451. Entropy: 1.087171.
Iteration 15302: Policy loss: 0.004533. Value loss: 0.464504. Entropy: 1.064320.
Iteration 15303: Policy loss: -0.001906. Value loss: 0.432926. Entropy: 1.057667.
episode: 5412   score: 1310.0  epsilon: 1.0    steps: 656  evaluation reward: 1126.0
Training network. lr: 0.000133. clip: 0.053008
Iteration 15304: Policy loss: 0.003943. Value loss: 0.186304. Entropy: 1.275617.
Iteration 15305: Policy loss: 0.001030. Value loss: 0.107859. Entropy: 1.272024.
Iteration 15306: Policy loss: 0.000757. Value loss: 0.092214. Entropy: 1.272288.
episode: 5413   score: 1040.0  epsilon: 1.0    steps: 440  evaluation reward: 1126.7
episode: 5414   score: 1120.0  epsilon: 1.0    steps: 520  evaluation reward: 1124.7
Training network. lr: 0.000133. clip: 0.053008
Iteration 15307: Policy loss: 0.0059

Iteration 15365: Policy loss: 0.001769. Value loss: 0.175054. Entropy: 1.253565.
Iteration 15366: Policy loss: -0.002681. Value loss: 0.134575. Entropy: 1.254729.
episode: 5435   score: 230.0  epsilon: 1.0    steps: 8  evaluation reward: 1116.9
episode: 5436   score: 1480.0  epsilon: 1.0    steps: 200  evaluation reward: 1120.1
Training network. lr: 0.000132. clip: 0.052851
Iteration 15367: Policy loss: 0.001846. Value loss: 0.349083. Entropy: 1.116846.
Iteration 15368: Policy loss: 0.000202. Value loss: 0.248962. Entropy: 1.109829.
Iteration 15369: Policy loss: -0.000760. Value loss: 0.204750. Entropy: 1.096749.
episode: 5437   score: 560.0  epsilon: 1.0    steps: 816  evaluation reward: 1115.5
Training network. lr: 0.000132. clip: 0.052851
Iteration 15370: Policy loss: 0.005780. Value loss: 0.214841. Entropy: 1.481699.
Iteration 15371: Policy loss: 0.002029. Value loss: 0.128944. Entropy: 1.486981.
Iteration 15372: Policy loss: -0.003527. Value loss: 0.105134. Entropy: 1.475552.
epis

Training network. lr: 0.000132. clip: 0.052694
Iteration 15430: Policy loss: 0.002763. Value loss: 0.122497. Entropy: 1.259297.
Iteration 15431: Policy loss: -0.000386. Value loss: 0.079948. Entropy: 1.265306.
Iteration 15432: Policy loss: -0.001872. Value loss: 0.064471. Entropy: 1.259724.
episode: 5459   score: 760.0  epsilon: 1.0    steps: 344  evaluation reward: 1095.4
Training network. lr: 0.000132. clip: 0.052694
Iteration 15433: Policy loss: 0.003845. Value loss: 0.241478. Entropy: 1.144073.
Iteration 15434: Policy loss: -0.000426. Value loss: 0.194594. Entropy: 1.137740.
Iteration 15435: Policy loss: -0.001463. Value loss: 0.178815. Entropy: 1.153170.
episode: 5460   score: 1030.0  epsilon: 1.0    steps: 320  evaluation reward: 1096.5
Training network. lr: 0.000132. clip: 0.052694
Iteration 15436: Policy loss: 0.002405. Value loss: 0.124496. Entropy: 1.213118.
Iteration 15437: Policy loss: -0.004583. Value loss: 0.090243. Entropy: 1.218937.
Iteration 15438: Policy loss: -0.0074

Iteration 15495: Policy loss: -0.003480. Value loss: 0.223078. Entropy: 1.077076.
episode: 5482   score: 820.0  epsilon: 1.0    steps: 456  evaluation reward: 1092.1
Training network. lr: 0.000131. clip: 0.052547
Iteration 15496: Policy loss: 0.005470. Value loss: 0.310987. Entropy: 1.216355.
Iteration 15497: Policy loss: -0.001061. Value loss: 0.243409. Entropy: 1.212266.
Iteration 15498: Policy loss: -0.003796. Value loss: 0.229729. Entropy: 1.220244.
Training network. lr: 0.000131. clip: 0.052547
Iteration 15499: Policy loss: 0.006286. Value loss: 0.314686. Entropy: 1.175215.
Iteration 15500: Policy loss: 0.006381. Value loss: 0.222349. Entropy: 1.177649.
Iteration 15501: Policy loss: 0.000555. Value loss: 0.196663. Entropy: 1.170272.
episode: 5483   score: 990.0  epsilon: 1.0    steps: 400  evaluation reward: 1093.6
episode: 5484   score: 310.0  epsilon: 1.0    steps: 480  evaluation reward: 1087.8
Training network. lr: 0.000131. clip: 0.052390
Iteration 15502: Policy loss: 0.00365

Iteration 15559: Policy loss: 0.003430. Value loss: 0.359284. Entropy: 1.006983.
Iteration 15560: Policy loss: -0.000092. Value loss: 0.296204. Entropy: 1.042999.
Iteration 15561: Policy loss: -0.004174. Value loss: 0.276913. Entropy: 1.034592.
Training network. lr: 0.000131. clip: 0.052233
Iteration 15562: Policy loss: 0.003090. Value loss: 0.235851. Entropy: 1.081544.
Iteration 15563: Policy loss: 0.000281. Value loss: 0.169107. Entropy: 1.095272.
Iteration 15564: Policy loss: -0.000152. Value loss: 0.147554. Entropy: 1.097086.
episode: 5506   score: 940.0  epsilon: 1.0    steps: 56  evaluation reward: 1054.8
episode: 5507   score: 1070.0  epsilon: 1.0    steps: 600  evaluation reward: 1054.9
Training network. lr: 0.000131. clip: 0.052233
Iteration 15565: Policy loss: 0.004326. Value loss: 0.343866. Entropy: 1.265032.
Iteration 15566: Policy loss: 0.003905. Value loss: 0.208710. Entropy: 1.266654.
Iteration 15567: Policy loss: 0.001397. Value loss: 0.157992. Entropy: 1.260205.
Traini

Iteration 15627: Policy loss: -0.001388. Value loss: 0.086388. Entropy: 1.147615.
Training network. lr: 0.000130. clip: 0.052086
Iteration 15628: Policy loss: 0.001401. Value loss: 0.262189. Entropy: 1.388048.
Iteration 15629: Policy loss: 0.004455. Value loss: 0.157433. Entropy: 1.384299.
Iteration 15630: Policy loss: 0.000869. Value loss: 0.133790. Entropy: 1.391601.
episode: 5526   score: 1370.0  epsilon: 1.0    steps: 440  evaluation reward: 1037.9
episode: 5527   score: 880.0  epsilon: 1.0    steps: 480  evaluation reward: 1037.5
episode: 5528   score: 1210.0  epsilon: 1.0    steps: 704  evaluation reward: 1042.4
Training network. lr: 0.000130. clip: 0.052086
Iteration 15631: Policy loss: 0.007726. Value loss: 0.331760. Entropy: 1.115394.
Iteration 15632: Policy loss: -0.001256. Value loss: 0.225672. Entropy: 1.125595.
Iteration 15633: Policy loss: 0.002329. Value loss: 0.196778. Entropy: 1.126885.
episode: 5529   score: 2340.0  epsilon: 1.0    steps: 856  evaluation reward: 1060.

Iteration 15691: Policy loss: -0.000622. Value loss: 0.341054. Entropy: 1.139266.
Iteration 15692: Policy loss: -0.003640. Value loss: 0.265832. Entropy: 1.150917.
Iteration 15693: Policy loss: -0.006550. Value loss: 0.242504. Entropy: 1.152203.
Training network. lr: 0.000130. clip: 0.051929
Iteration 15694: Policy loss: 0.002787. Value loss: 0.088097. Entropy: 1.411435.
Iteration 15695: Policy loss: -0.004267. Value loss: 0.054580. Entropy: 1.409429.
Iteration 15696: Policy loss: -0.005868. Value loss: 0.043439. Entropy: 1.406906.
episode: 5549   score: 820.0  epsilon: 1.0    steps: 360  evaluation reward: 1069.8
Training network. lr: 0.000130. clip: 0.051929
Iteration 15697: Policy loss: 0.003381. Value loss: 0.159299. Entropy: 1.352714.
Iteration 15698: Policy loss: 0.000079. Value loss: 0.084205. Entropy: 1.355368.
Iteration 15699: Policy loss: -0.002868. Value loss: 0.058489. Entropy: 1.343805.
episode: 5550   score: 1830.0  epsilon: 1.0    steps: 32  evaluation reward: 1076.7
Tra

Iteration 15758: Policy loss: -0.001056. Value loss: 0.040165. Entropy: 1.343805.
Iteration 15759: Policy loss: -0.003875. Value loss: 0.036308. Entropy: 1.335619.
Training network. lr: 0.000129. clip: 0.051625
Iteration 15760: Policy loss: 0.003737. Value loss: 0.139509. Entropy: 1.422288.
Iteration 15761: Policy loss: 0.000250. Value loss: 0.071037. Entropy: 1.411860.
Iteration 15762: Policy loss: -0.001626. Value loss: 0.053923. Entropy: 1.409353.
episode: 5569   score: 970.0  epsilon: 1.0    steps: 600  evaluation reward: 1092.0
episode: 5570   score: 1270.0  epsilon: 1.0    steps: 752  evaluation reward: 1094.2
Training network. lr: 0.000129. clip: 0.051625
Iteration 15763: Policy loss: 0.005299. Value loss: 0.124191. Entropy: 1.458736.
Iteration 15764: Policy loss: -0.003534. Value loss: 0.063091. Entropy: 1.448618.
Iteration 15765: Policy loss: -0.004969. Value loss: 0.052742. Entropy: 1.450152.
episode: 5571   score: 990.0  epsilon: 1.0    steps: 352  evaluation reward: 1097.2


Training network. lr: 0.000129. clip: 0.051469
Iteration 15823: Policy loss: 0.007287. Value loss: 0.403611. Entropy: 1.147483.
Iteration 15824: Policy loss: 0.007801. Value loss: 0.256308. Entropy: 1.157376.
Iteration 15825: Policy loss: -0.000970. Value loss: 0.214964. Entropy: 1.151468.
episode: 5592   score: 1270.0  epsilon: 1.0    steps: 440  evaluation reward: 1116.4
Training network. lr: 0.000129. clip: 0.051469
Iteration 15826: Policy loss: 0.007321. Value loss: 0.144818. Entropy: 1.416180.
Iteration 15827: Policy loss: 0.002263. Value loss: 0.075781. Entropy: 1.404353.
Iteration 15828: Policy loss: -0.002253. Value loss: 0.053550. Entropy: 1.411652.
Training network. lr: 0.000129. clip: 0.051469
Iteration 15829: Policy loss: 0.003116. Value loss: 0.217284. Entropy: 1.488911.
Iteration 15830: Policy loss: 0.003522. Value loss: 0.150815. Entropy: 1.488211.
Iteration 15831: Policy loss: -0.001099. Value loss: 0.129243. Entropy: 1.488522.
Training network. lr: 0.000129. clip: 0.05

Iteration 15888: Policy loss: -0.005253. Value loss: 0.107176. Entropy: 1.413403.
episode: 5614   score: 1070.0  epsilon: 1.0    steps: 352  evaluation reward: 1110.8
Training network. lr: 0.000128. clip: 0.051312
Iteration 15889: Policy loss: 0.001434. Value loss: 0.328625. Entropy: 1.152222.
Iteration 15890: Policy loss: -0.000395. Value loss: 0.258837. Entropy: 1.163190.
Iteration 15891: Policy loss: -0.002690. Value loss: 0.221278. Entropy: 1.170168.
Training network. lr: 0.000128. clip: 0.051312
Iteration 15892: Policy loss: 0.006878. Value loss: 0.287307. Entropy: 1.533645.
Iteration 15893: Policy loss: 0.005557. Value loss: 0.150914. Entropy: 1.522794.
Iteration 15894: Policy loss: 0.000283. Value loss: 0.120021. Entropy: 1.515679.
Training network. lr: 0.000128. clip: 0.051312
Iteration 15895: Policy loss: 0.005257. Value loss: 0.141206. Entropy: 1.839772.
Iteration 15896: Policy loss: 0.004109. Value loss: 0.073625. Entropy: 1.839229.
Iteration 15897: Policy loss: -0.001214. V

Training network. lr: 0.000128. clip: 0.051008
Iteration 15955: Policy loss: 0.002055. Value loss: 0.139646. Entropy: 0.993080.
Iteration 15956: Policy loss: -0.001782. Value loss: 0.123623. Entropy: 0.969824.
Iteration 15957: Policy loss: -0.003491. Value loss: 0.119319. Entropy: 0.982587.
episode: 5635   score: 1250.0  epsilon: 1.0    steps: 40  evaluation reward: 1088.0
episode: 5636   score: 1260.0  epsilon: 1.0    steps: 904  evaluation reward: 1088.7
Training network. lr: 0.000128. clip: 0.051008
Iteration 15958: Policy loss: 0.000810. Value loss: 0.221757. Entropy: 0.889956.
Iteration 15959: Policy loss: -0.002376. Value loss: 0.193532. Entropy: 0.874180.
Iteration 15960: Policy loss: -0.003427. Value loss: 0.190167. Entropy: 0.872815.
episode: 5637   score: 980.0  epsilon: 1.0    steps: 520  evaluation reward: 1088.1
Training network. lr: 0.000128. clip: 0.051008
Iteration 15961: Policy loss: 0.004181. Value loss: 0.200540. Entropy: 1.073617.
Iteration 15962: Policy loss: 0.002

episode: 5656   score: 960.0  epsilon: 1.0    steps: 992  evaluation reward: 1100.3
Training network. lr: 0.000127. clip: 0.050851
Iteration 16021: Policy loss: 0.001602. Value loss: 0.193652. Entropy: 1.166926.
Iteration 16022: Policy loss: -0.000893. Value loss: 0.163322. Entropy: 1.164383.
Iteration 16023: Policy loss: -0.003248. Value loss: 0.141239. Entropy: 1.162239.
episode: 5657   score: 390.0  epsilon: 1.0    steps: 360  evaluation reward: 1094.2
Training network. lr: 0.000127. clip: 0.050851
Iteration 16024: Policy loss: 0.006017. Value loss: 0.517999. Entropy: 0.869455.
Iteration 16025: Policy loss: 0.002490. Value loss: 0.395728. Entropy: 0.872241.
Iteration 16026: Policy loss: 0.003673. Value loss: 0.321875. Entropy: 0.871050.
Training network. lr: 0.000127. clip: 0.050851
Iteration 16027: Policy loss: 0.003917. Value loss: 0.095023. Entropy: 1.204245.
Iteration 16028: Policy loss: -0.000590. Value loss: 0.062019. Entropy: 1.195407.
Iteration 16029: Policy loss: -0.006304.

Training network. lr: 0.000127. clip: 0.050704
Iteration 16087: Policy loss: 0.000873. Value loss: 0.058391. Entropy: 1.168977.
Iteration 16088: Policy loss: -0.002813. Value loss: 0.033246. Entropy: 1.175481.
Iteration 16089: Policy loss: -0.003941. Value loss: 0.028980. Entropy: 1.165426.
episode: 5678   score: 1390.0  epsilon: 1.0    steps: 504  evaluation reward: 1127.8
Training network. lr: 0.000127. clip: 0.050704
Iteration 16090: Policy loss: -0.000928. Value loss: 0.192317. Entropy: 1.094426.
Iteration 16091: Policy loss: -0.001279. Value loss: 0.147880. Entropy: 1.102129.
Iteration 16092: Policy loss: -0.004058. Value loss: 0.121209. Entropy: 1.096895.
episode: 5679   score: 2550.0  epsilon: 1.0    steps: 48  evaluation reward: 1140.1
episode: 5680   score: 960.0  epsilon: 1.0    steps: 216  evaluation reward: 1143.9
episode: 5681   score: 1090.0  epsilon: 1.0    steps: 264  evaluation reward: 1144.2
episode: 5682   score: 1210.0  epsilon: 1.0    steps: 872  evaluation reward:

Iteration 16152: Policy loss: -0.000480. Value loss: 0.098217. Entropy: 1.327610.
episode: 5700   score: 1350.0  epsilon: 1.0    steps: 248  evaluation reward: 1151.5
Training network. lr: 0.000126. clip: 0.050390
Iteration 16153: Policy loss: 0.001964. Value loss: 0.295715. Entropy: 1.117296.
Iteration 16154: Policy loss: -0.001312. Value loss: 0.248018. Entropy: 1.124439.
Iteration 16155: Policy loss: -0.001416. Value loss: 0.214201. Entropy: 1.115588.
Training network. lr: 0.000126. clip: 0.050390
Iteration 16156: Policy loss: 0.011599. Value loss: 0.214461. Entropy: 1.349691.
Iteration 16157: Policy loss: 0.004309. Value loss: 0.108811. Entropy: 1.342699.
Iteration 16158: Policy loss: 0.002424. Value loss: 0.096886. Entropy: 1.351993.
now time :  2019-03-06 06:43:19.385295
episode: 5701   score: 950.0  epsilon: 1.0    steps: 768  evaluation reward: 1152.5
Training network. lr: 0.000126. clip: 0.050390
Iteration 16159: Policy loss: 0.005433. Value loss: 0.190173. Entropy: 1.439528.


Iteration 16216: Policy loss: -0.000409. Value loss: 0.158139. Entropy: 1.356649.
Iteration 16217: Policy loss: -0.002758. Value loss: 0.124162. Entropy: 1.347227.
Iteration 16218: Policy loss: -0.004067. Value loss: 0.112841. Entropy: 1.351239.
episode: 5723   score: 1030.0  epsilon: 1.0    steps: 864  evaluation reward: 1177.7
episode: 5724   score: 940.0  epsilon: 1.0    steps: 944  evaluation reward: 1176.4
Training network. lr: 0.000126. clip: 0.050243
Iteration 16219: Policy loss: 0.002749. Value loss: 0.258496. Entropy: 1.300827.
Iteration 16220: Policy loss: 0.000652. Value loss: 0.194638. Entropy: 1.313136.
Iteration 16221: Policy loss: -0.002102. Value loss: 0.165171. Entropy: 1.314305.
Training network. lr: 0.000126. clip: 0.050243
Iteration 16222: Policy loss: 0.000912. Value loss: 0.208298. Entropy: 1.121721.
Iteration 16223: Policy loss: 0.001048. Value loss: 0.177165. Entropy: 1.142051.
Iteration 16224: Policy loss: -0.000660. Value loss: 0.162875. Entropy: 1.145842.
Tra

Training network. lr: 0.000125. clip: 0.050086
Iteration 16282: Policy loss: 0.001016. Value loss: 0.257820. Entropy: 1.005886.
Iteration 16283: Policy loss: -0.000917. Value loss: 0.208026. Entropy: 1.017437.
Iteration 16284: Policy loss: -0.000987. Value loss: 0.200332. Entropy: 1.020340.
Training network. lr: 0.000125. clip: 0.050086
Iteration 16285: Policy loss: 0.001551. Value loss: 0.137974. Entropy: 1.190399.
Iteration 16286: Policy loss: -0.002395. Value loss: 0.102952. Entropy: 1.188233.
Iteration 16287: Policy loss: -0.002546. Value loss: 0.090950. Entropy: 1.197741.
Training network. lr: 0.000125. clip: 0.050086
Iteration 16288: Policy loss: 0.005735. Value loss: 0.099531. Entropy: 1.423021.
Iteration 16289: Policy loss: 0.000277. Value loss: 0.036717. Entropy: 1.415569.
Iteration 16290: Policy loss: -0.003557. Value loss: 0.022949. Entropy: 1.420363.
episode: 5745   score: 960.0  epsilon: 1.0    steps: 464  evaluation reward: 1199.1
Training network. lr: 0.000125. clip: 0.0

episode: 5763   score: 1040.0  epsilon: 1.0    steps: 392  evaluation reward: 1192.3
Training network. lr: 0.000124. clip: 0.049782
Iteration 16351: Policy loss: 0.001054. Value loss: 0.040086. Entropy: 1.102135.
Iteration 16352: Policy loss: -0.003824. Value loss: 0.020222. Entropy: 1.086440.
Iteration 16353: Policy loss: -0.005691. Value loss: 0.017316. Entropy: 1.093319.
episode: 5764   score: 1080.0  epsilon: 1.0    steps: 680  evaluation reward: 1195.2
Training network. lr: 0.000124. clip: 0.049782
Iteration 16354: Policy loss: 0.002917. Value loss: 0.179045. Entropy: 1.117884.
Iteration 16355: Policy loss: 0.003098. Value loss: 0.128289. Entropy: 1.143754.
Iteration 16356: Policy loss: 0.001333. Value loss: 0.115569. Entropy: 1.132126.
episode: 5765   score: 1220.0  epsilon: 1.0    steps: 552  evaluation reward: 1198.7
episode: 5766   score: 1320.0  epsilon: 1.0    steps: 744  evaluation reward: 1199.8
Training network. lr: 0.000124. clip: 0.049782
Iteration 16357: Policy loss: 0

Iteration 16416: Policy loss: 0.002060. Value loss: 0.431597. Entropy: 1.208722.
episode: 5785   score: 1030.0  epsilon: 1.0    steps: 728  evaluation reward: 1194.4
Training network. lr: 0.000124. clip: 0.049625
Iteration 16417: Policy loss: 0.001454. Value loss: 0.129054. Entropy: 0.846054.
Iteration 16418: Policy loss: -0.000253. Value loss: 0.103284. Entropy: 0.846520.
Iteration 16419: Policy loss: -0.000483. Value loss: 0.107083. Entropy: 0.859602.
Training network. lr: 0.000124. clip: 0.049625
Iteration 16420: Policy loss: 0.001191. Value loss: 0.149501. Entropy: 1.167314.
Iteration 16421: Policy loss: -0.001178. Value loss: 0.102005. Entropy: 1.163475.
Iteration 16422: Policy loss: -0.004467. Value loss: 0.090369. Entropy: 1.156593.
episode: 5786   score: 1240.0  epsilon: 1.0    steps: 88  evaluation reward: 1195.9
episode: 5787   score: 1060.0  epsilon: 1.0    steps: 1016  evaluation reward: 1194.7
Training network. lr: 0.000124. clip: 0.049625
Iteration 16423: Policy loss: -0.

Iteration 16481: Policy loss: 0.001159. Value loss: 0.315987. Entropy: 0.973521.
Iteration 16482: Policy loss: -0.000404. Value loss: 0.310747. Entropy: 0.971481.
Training network. lr: 0.000124. clip: 0.049469
Iteration 16483: Policy loss: 0.001513. Value loss: 0.051701. Entropy: 1.444851.
Iteration 16484: Policy loss: -0.002528. Value loss: 0.031693. Entropy: 1.439638.
Iteration 16485: Policy loss: -0.006231. Value loss: 0.025611. Entropy: 1.441706.
episode: 5807   score: 1200.0  epsilon: 1.0    steps: 608  evaluation reward: 1199.0
episode: 5808   score: 940.0  epsilon: 1.0    steps: 784  evaluation reward: 1201.5
Training network. lr: 0.000124. clip: 0.049469
Iteration 16486: Policy loss: 0.005350. Value loss: 0.267117. Entropy: 1.628811.
Iteration 16487: Policy loss: 0.003688. Value loss: 0.193576. Entropy: 1.639501.
Iteration 16488: Policy loss: -0.001014. Value loss: 0.166536. Entropy: 1.627593.
Training network. lr: 0.000124. clip: 0.049469
Iteration 16489: Policy loss: 0.000716

episode: 5827   score: 820.0  epsilon: 1.0    steps: 288  evaluation reward: 1218.3
episode: 5828   score: 780.0  epsilon: 1.0    steps: 888  evaluation reward: 1215.3
Training network. lr: 0.000123. clip: 0.049321
Iteration 16549: Policy loss: 0.004622. Value loss: 0.278499. Entropy: 1.125136.
Iteration 16550: Policy loss: 0.000716. Value loss: 0.244239. Entropy: 1.112079.
Iteration 16551: Policy loss: -0.002082. Value loss: 0.235642. Entropy: 1.119444.
episode: 5829   score: 1070.0  epsilon: 1.0    steps: 664  evaluation reward: 1215.0
Training network. lr: 0.000123. clip: 0.049165
Iteration 16552: Policy loss: 0.004634. Value loss: 0.215810. Entropy: 0.951034.
Iteration 16553: Policy loss: -0.001164. Value loss: 0.154096. Entropy: 0.952762.
Iteration 16554: Policy loss: -0.004526. Value loss: 0.136947. Entropy: 0.958672.
episode: 5830   score: 1140.0  epsilon: 1.0    steps: 432  evaluation reward: 1214.9
episode: 5831   score: 1180.0  epsilon: 1.0    steps: 512  evaluation reward: 1

Training network. lr: 0.000123. clip: 0.049008
Iteration 16615: Policy loss: 0.000456. Value loss: 0.153235. Entropy: 1.376138.
Iteration 16616: Policy loss: -0.003545. Value loss: 0.131312. Entropy: 1.401839.
Iteration 16617: Policy loss: -0.005473. Value loss: 0.122220. Entropy: 1.387871.
episode: 5850   score: 1160.0  epsilon: 1.0    steps: 424  evaluation reward: 1193.8
Training network. lr: 0.000123. clip: 0.049008
Iteration 16618: Policy loss: -0.000355. Value loss: 0.051296. Entropy: 1.431760.
Iteration 16619: Policy loss: -0.004134. Value loss: 0.026633. Entropy: 1.430748.
Iteration 16620: Policy loss: -0.008065. Value loss: 0.022832. Entropy: 1.440798.
Training network. lr: 0.000123. clip: 0.049008
Iteration 16621: Policy loss: 0.001981. Value loss: 0.128582. Entropy: 1.551601.
Iteration 16622: Policy loss: -0.003243. Value loss: 0.114744. Entropy: 1.543581.
Iteration 16623: Policy loss: -0.003146. Value loss: 0.104156. Entropy: 1.536169.
now time :  2019-03-06 06:49:35.036387

Iteration 16680: Policy loss: -0.004171. Value loss: 0.179851. Entropy: 1.195133.
episode: 5872   score: 800.0  epsilon: 1.0    steps: 944  evaluation reward: 1159.7
Training network. lr: 0.000122. clip: 0.048860
Iteration 16681: Policy loss: 0.002474. Value loss: 0.248769. Entropy: 1.199198.
Iteration 16682: Policy loss: -0.000998. Value loss: 0.168984. Entropy: 1.212108.
Iteration 16683: Policy loss: -0.002961. Value loss: 0.136765. Entropy: 1.226758.
Training network. lr: 0.000122. clip: 0.048860
Iteration 16684: Policy loss: 0.006956. Value loss: 0.426003. Entropy: 1.348322.
Iteration 16685: Policy loss: 0.003101. Value loss: 0.312947. Entropy: 1.344304.
Iteration 16686: Policy loss: 0.001966. Value loss: 0.255215. Entropy: 1.344598.
episode: 5873   score: 1450.0  epsilon: 1.0    steps: 264  evaluation reward: 1162.6
episode: 5874   score: 1220.0  epsilon: 1.0    steps: 1016  evaluation reward: 1161.7
Training network. lr: 0.000122. clip: 0.048860
Iteration 16687: Policy loss: 0.00

Iteration 16747: Policy loss: 0.005860. Value loss: 0.193787. Entropy: 1.575984.
Iteration 16748: Policy loss: 0.001030. Value loss: 0.126290. Entropy: 1.577889.
Iteration 16749: Policy loss: -0.004199. Value loss: 0.105373. Entropy: 1.587504.
episode: 5894   score: 460.0  epsilon: 1.0    steps: 248  evaluation reward: 1121.7
episode: 5895   score: 760.0  epsilon: 1.0    steps: 632  evaluation reward: 1118.3
episode: 5896   score: 1350.0  epsilon: 1.0    steps: 960  evaluation reward: 1120.9
Training network. lr: 0.000122. clip: 0.048704
Iteration 16750: Policy loss: 0.000904. Value loss: 0.289040. Entropy: 0.946082.
Iteration 16751: Policy loss: 0.001022. Value loss: 0.255141. Entropy: 0.952043.
Iteration 16752: Policy loss: -0.002754. Value loss: 0.244065. Entropy: 0.945426.
Training network. lr: 0.000121. clip: 0.048547
Iteration 16753: Policy loss: 0.002997. Value loss: 0.245546. Entropy: 0.919441.
Iteration 16754: Policy loss: -0.000735. Value loss: 0.162806. Entropy: 0.934765.
It

Training network. lr: 0.000121. clip: 0.048400
Iteration 16813: Policy loss: 0.012740. Value loss: 0.443324. Entropy: 1.444933.
Iteration 16814: Policy loss: 0.006127. Value loss: 0.242498. Entropy: 1.431893.
Iteration 16815: Policy loss: 0.004298. Value loss: 0.192934. Entropy: 1.439619.
Training network. lr: 0.000121. clip: 0.048400
Iteration 16816: Policy loss: 0.007719. Value loss: 0.230666. Entropy: 1.532739.
Iteration 16817: Policy loss: 0.002799. Value loss: 0.154092. Entropy: 1.523788.
Iteration 16818: Policy loss: 0.000847. Value loss: 0.123784. Entropy: 1.519503.
episode: 5916   score: 1640.0  epsilon: 1.0    steps: 528  evaluation reward: 1118.4
episode: 5917   score: 1450.0  epsilon: 1.0    steps: 904  evaluation reward: 1121.1
Training network. lr: 0.000121. clip: 0.048400
Iteration 16819: Policy loss: 0.002591. Value loss: 0.230926. Entropy: 1.582864.
Iteration 16820: Policy loss: -0.000330. Value loss: 0.157307. Entropy: 1.579927.
Iteration 16821: Policy loss: -0.003847.

Training network. lr: 0.000121. clip: 0.048243
Iteration 16879: Policy loss: 0.000586. Value loss: 0.182216. Entropy: 1.650518.
Iteration 16880: Policy loss: -0.001322. Value loss: 0.109289. Entropy: 1.662316.
Iteration 16881: Policy loss: -0.003895. Value loss: 0.086237. Entropy: 1.671010.
episode: 5938   score: 830.0  epsilon: 1.0    steps: 784  evaluation reward: 1094.6
episode: 5939   score: 1050.0  epsilon: 1.0    steps: 848  evaluation reward: 1093.2
Training network. lr: 0.000121. clip: 0.048243
Iteration 16882: Policy loss: 0.000862. Value loss: 0.135547. Entropy: 1.543635.
Iteration 16883: Policy loss: -0.004002. Value loss: 0.108960. Entropy: 1.523555.
Iteration 16884: Policy loss: -0.004436. Value loss: 0.104182. Entropy: 1.536658.
Training network. lr: 0.000121. clip: 0.048243
Iteration 16885: Policy loss: 0.001233. Value loss: 0.245795. Entropy: 1.240822.
Iteration 16886: Policy loss: -0.002314. Value loss: 0.200369. Entropy: 1.254517.
Iteration 16887: Policy loss: -0.0027

episode: 5960   score: 1530.0  epsilon: 1.0    steps: 904  evaluation reward: 1114.8
Training network. lr: 0.000120. clip: 0.048086
Iteration 16945: Policy loss: 0.004671. Value loss: 0.473269. Entropy: 1.090652.
Iteration 16946: Policy loss: 0.004880. Value loss: 0.329864. Entropy: 1.071471.
Iteration 16947: Policy loss: 0.000200. Value loss: 0.285506. Entropy: 1.083974.
episode: 5961   score: 780.0  epsilon: 1.0    steps: 568  evaluation reward: 1112.0
Training network. lr: 0.000120. clip: 0.048086
Iteration 16948: Policy loss: 0.000688. Value loss: 0.278545. Entropy: 1.295160.
Iteration 16949: Policy loss: 0.000099. Value loss: 0.187711. Entropy: 1.292942.
Iteration 16950: Policy loss: -0.002927. Value loss: 0.166440. Entropy: 1.289967.
Training network. lr: 0.000120. clip: 0.047939
Iteration 16951: Policy loss: 0.003333. Value loss: 0.545593. Entropy: 1.497812.
Iteration 16952: Policy loss: 0.004327. Value loss: 0.394942. Entropy: 1.508492.
Iteration 16953: Policy loss: 0.001569. V

Training network. lr: 0.000119. clip: 0.047782
Iteration 17011: Policy loss: -0.001864. Value loss: 0.099174. Entropy: 1.382498.
Iteration 17012: Policy loss: -0.003441. Value loss: 0.076173. Entropy: 1.384973.
Iteration 17013: Policy loss: -0.005624. Value loss: 0.071004. Entropy: 1.381533.
Training network. lr: 0.000119. clip: 0.047782
Iteration 17014: Policy loss: 0.002611. Value loss: 0.269098. Entropy: 1.362671.
Iteration 17015: Policy loss: 0.002017. Value loss: 0.181570. Entropy: 1.381170.
Iteration 17016: Policy loss: 0.000431. Value loss: 0.168290. Entropy: 1.369962.
episode: 5982   score: 960.0  epsilon: 1.0    steps: 264  evaluation reward: 1137.4
Training network. lr: 0.000119. clip: 0.047782
Iteration 17017: Policy loss: 0.002339. Value loss: 0.301563. Entropy: 1.353050.
Iteration 17018: Policy loss: 0.001435. Value loss: 0.257888. Entropy: 1.345502.
Iteration 17019: Policy loss: -0.001054. Value loss: 0.234650. Entropy: 1.346476.
Training network. lr: 0.000119. clip: 0.04

Training network. lr: 0.000119. clip: 0.047625
Iteration 17077: Policy loss: 0.005913. Value loss: 0.233963. Entropy: 1.594496.
Iteration 17078: Policy loss: 0.002165. Value loss: 0.132127. Entropy: 1.582449.
Iteration 17079: Policy loss: 0.002948. Value loss: 0.111826. Entropy: 1.574738.
episode: 6003   score: 1240.0  epsilon: 1.0    steps: 8  evaluation reward: 1142.6
Training network. lr: 0.000119. clip: 0.047625
Iteration 17080: Policy loss: 0.002075. Value loss: 0.180927. Entropy: 1.414982.
Iteration 17081: Policy loss: 0.001394. Value loss: 0.154043. Entropy: 1.414469.
Iteration 17082: Policy loss: 0.000070. Value loss: 0.137168. Entropy: 1.414764.
episode: 6004   score: 1140.0  epsilon: 1.0    steps: 480  evaluation reward: 1142.7
Training network. lr: 0.000119. clip: 0.047625
Iteration 17083: Policy loss: 0.006236. Value loss: 0.402420. Entropy: 1.431197.
Iteration 17084: Policy loss: 0.004639. Value loss: 0.303338. Entropy: 1.429741.
Iteration 17085: Policy loss: 0.005003. Val

Iteration 17142: Policy loss: 0.001366. Value loss: 0.203997. Entropy: 1.219290.
episode: 6026   score: 1170.0  epsilon: 1.0    steps: 1008  evaluation reward: 1122.6
Training network. lr: 0.000119. clip: 0.047478
Iteration 17143: Policy loss: 0.004594. Value loss: 0.129264. Entropy: 1.609734.
Iteration 17144: Policy loss: 0.000757. Value loss: 0.079769. Entropy: 1.615439.
Iteration 17145: Policy loss: -0.002102. Value loss: 0.062770. Entropy: 1.615888.
Training network. lr: 0.000119. clip: 0.047478
Iteration 17146: Policy loss: 0.002092. Value loss: 0.177296. Entropy: 1.496117.
Iteration 17147: Policy loss: 0.000618. Value loss: 0.137094. Entropy: 1.482346.
Iteration 17148: Policy loss: -0.000118. Value loss: 0.123557. Entropy: 1.491665.
Training network. lr: 0.000119. clip: 0.047478
Iteration 17149: Policy loss: 0.005305. Value loss: 0.170735. Entropy: 1.516666.
Iteration 17150: Policy loss: 0.000249. Value loss: 0.100891. Entropy: 1.509930.
Iteration 17151: Policy loss: -0.000687. V

Iteration 17208: Policy loss: -0.000321. Value loss: 0.209148. Entropy: 1.045354.
episode: 6048   score: 770.0  epsilon: 1.0    steps: 240  evaluation reward: 1116.2
Training network. lr: 0.000118. clip: 0.047165
Iteration 17209: Policy loss: 0.006683. Value loss: 0.506477. Entropy: 1.312116.
Iteration 17210: Policy loss: 0.003579. Value loss: 0.342329. Entropy: 1.322952.
Iteration 17211: Policy loss: -0.000359. Value loss: 0.268993. Entropy: 1.305241.
Training network. lr: 0.000118. clip: 0.047165
Iteration 17212: Policy loss: 0.002246. Value loss: 0.165141. Entropy: 1.495614.
Iteration 17213: Policy loss: -0.002733. Value loss: 0.128195. Entropy: 1.489275.
Iteration 17214: Policy loss: -0.001740. Value loss: 0.116642. Entropy: 1.505714.
episode: 6049   score: 850.0  epsilon: 1.0    steps: 56  evaluation reward: 1113.6
episode: 6050   score: 1450.0  epsilon: 1.0    steps: 776  evaluation reward: 1116.6
Training network. lr: 0.000118. clip: 0.047165
Iteration 17215: Policy loss: 0.0047

Iteration 17272: Policy loss: 0.004146. Value loss: 0.208034. Entropy: 1.182543.
Iteration 17273: Policy loss: 0.002555. Value loss: 0.110410. Entropy: 1.188116.
Iteration 17274: Policy loss: -0.002744. Value loss: 0.097314. Entropy: 1.172775.
episode: 6071   score: 1160.0  epsilon: 1.0    steps: 248  evaluation reward: 1123.9
Training network. lr: 0.000118. clip: 0.047017
Iteration 17275: Policy loss: 0.001273. Value loss: 0.716164. Entropy: 1.410180.
Iteration 17276: Policy loss: 0.002086. Value loss: 0.534505. Entropy: 1.401953.
Iteration 17277: Policy loss: 0.001974. Value loss: 0.447103. Entropy: 1.403695.
episode: 6072   score: 750.0  epsilon: 1.0    steps: 856  evaluation reward: 1120.7
Training network. lr: 0.000118. clip: 0.047017
Iteration 17278: Policy loss: 0.005113. Value loss: 0.323981. Entropy: 1.534215.
Iteration 17279: Policy loss: 0.003506. Value loss: 0.210064. Entropy: 1.536490.
Iteration 17280: Policy loss: -0.000635. Value loss: 0.166731. Entropy: 1.519093.
Traini

episode: 6093   score: 500.0  epsilon: 1.0    steps: 704  evaluation reward: 1085.0
Training network. lr: 0.000117. clip: 0.046861
Iteration 17338: Policy loss: 0.004150. Value loss: 0.449017. Entropy: 0.891863.
Iteration 17339: Policy loss: -0.001297. Value loss: 0.309499. Entropy: 0.888885.
Iteration 17340: Policy loss: -0.001796. Value loss: 0.279195. Entropy: 0.909184.
episode: 6094   score: 1620.0  epsilon: 1.0    steps: 952  evaluation reward: 1093.2
Training network. lr: 0.000117. clip: 0.046861
Iteration 17341: Policy loss: 0.006519. Value loss: 0.354984. Entropy: 1.088956.
Iteration 17342: Policy loss: 0.009318. Value loss: 0.205561. Entropy: 1.098428.
Iteration 17343: Policy loss: 0.005171. Value loss: 0.162197. Entropy: 1.112291.
Training network. lr: 0.000117. clip: 0.046861
Iteration 17344: Policy loss: 0.002293. Value loss: 0.629056. Entropy: 1.201381.
Iteration 17345: Policy loss: 0.003479. Value loss: 0.485596. Entropy: 1.213178.
Iteration 17346: Policy loss: 0.003658. 

Training network. lr: 0.000116. clip: 0.046556
Iteration 17404: Policy loss: 0.002121. Value loss: 0.085545. Entropy: 0.983712.
Iteration 17405: Policy loss: -0.000756. Value loss: 0.049002. Entropy: 0.960394.
Iteration 17406: Policy loss: -0.001961. Value loss: 0.038971. Entropy: 0.970649.
episode: 6114   score: 940.0  epsilon: 1.0    steps: 992  evaluation reward: 1127.6
Training network. lr: 0.000116. clip: 0.046556
Iteration 17407: Policy loss: 0.003302. Value loss: 0.205424. Entropy: 1.214980.
Iteration 17408: Policy loss: -0.001913. Value loss: 0.129131. Entropy: 1.210133.
Iteration 17409: Policy loss: -0.000421. Value loss: 0.108441. Entropy: 1.211063.
episode: 6115   score: 1250.0  epsilon: 1.0    steps: 480  evaluation reward: 1129.3
episode: 6116   score: 1320.0  epsilon: 1.0    steps: 576  evaluation reward: 1129.8
Training network. lr: 0.000116. clip: 0.046556
Iteration 17410: Policy loss: 0.001147. Value loss: 0.222634. Entropy: 1.110111.
Iteration 17411: Policy loss: -0.0

Iteration 17469: Policy loss: -0.001733. Value loss: 0.131540. Entropy: 1.036175.
episode: 6137   score: 1430.0  epsilon: 1.0    steps: 272  evaluation reward: 1144.7
Training network. lr: 0.000116. clip: 0.046400
Iteration 17470: Policy loss: 0.000227. Value loss: 0.343374. Entropy: 1.174103.
Iteration 17471: Policy loss: -0.002319. Value loss: 0.210440. Entropy: 1.184150.
Iteration 17472: Policy loss: -0.004221. Value loss: 0.185402. Entropy: 1.185131.
episode: 6138   score: 1450.0  epsilon: 1.0    steps: 336  evaluation reward: 1146.8
Training network. lr: 0.000116. clip: 0.046400
Iteration 17473: Policy loss: 0.005580. Value loss: 0.179578. Entropy: 1.448502.
Iteration 17474: Policy loss: 0.002505. Value loss: 0.097099. Entropy: 1.443494.
Iteration 17475: Policy loss: -0.000605. Value loss: 0.081812. Entropy: 1.448604.
episode: 6139   score: 1250.0  epsilon: 1.0    steps: 328  evaluation reward: 1153.9
episode: 6140   score: 890.0  epsilon: 1.0    steps: 696  evaluation reward: 115

Iteration 17534: Policy loss: 0.002542. Value loss: 0.296543. Entropy: 1.071403.
Iteration 17535: Policy loss: -0.000045. Value loss: 0.275292. Entropy: 1.075760.
episode: 6159   score: 820.0  epsilon: 1.0    steps: 896  evaluation reward: 1174.1
Training network. lr: 0.000116. clip: 0.046243
Iteration 17536: Policy loss: 0.008218. Value loss: 0.200011. Entropy: 1.280652.
Iteration 17537: Policy loss: 0.005704. Value loss: 0.096155. Entropy: 1.271912.
Iteration 17538: Policy loss: 0.001048. Value loss: 0.068376. Entropy: 1.278497.
episode: 6160   score: 1220.0  epsilon: 1.0    steps: 400  evaluation reward: 1174.2
Training network. lr: 0.000116. clip: 0.046243
Iteration 17539: Policy loss: 0.006086. Value loss: 0.357179. Entropy: 1.022690.
Iteration 17540: Policy loss: 0.006900. Value loss: 0.265659. Entropy: 1.025137.
Iteration 17541: Policy loss: 0.003612. Value loss: 0.223939. Entropy: 1.026802.
episode: 6161   score: 1490.0  epsilon: 1.0    steps: 152  evaluation reward: 1178.8
epi

Iteration 17601: Policy loss: -0.001870. Value loss: 0.124789. Entropy: 1.060545.
episode: 6180   score: 1230.0  epsilon: 1.0    steps: 96  evaluation reward: 1192.9
episode: 6181   score: 1150.0  epsilon: 1.0    steps: 704  evaluation reward: 1191.3
episode: 6182   score: 870.0  epsilon: 1.0    steps: 944  evaluation reward: 1197.4
Training network. lr: 0.000115. clip: 0.045939
Iteration 17602: Policy loss: 0.013808. Value loss: 0.491932. Entropy: 1.351953.
Iteration 17603: Policy loss: 0.003648. Value loss: 0.432842. Entropy: 1.332277.
Iteration 17604: Policy loss: -0.001688. Value loss: 0.397217. Entropy: 1.346713.
episode: 6183   score: 760.0  epsilon: 1.0    steps: 744  evaluation reward: 1193.5
Training network. lr: 0.000115. clip: 0.045939
Iteration 17605: Policy loss: 0.001792. Value loss: 0.272328. Entropy: 1.019157.
Iteration 17606: Policy loss: 0.001420. Value loss: 0.248573. Entropy: 1.047420.
Iteration 17607: Policy loss: -0.000869. Value loss: 0.232021. Entropy: 1.034489.

Iteration 17666: Policy loss: 0.000567. Value loss: 0.100427. Entropy: 1.479636.
Iteration 17667: Policy loss: -0.002356. Value loss: 0.081440. Entropy: 1.490727.
episode: 6202   score: 780.0  epsilon: 1.0    steps: 88  evaluation reward: 1207.5
episode: 6203   score: 1880.0  epsilon: 1.0    steps: 672  evaluation reward: 1212.7
episode: 6204   score: 580.0  epsilon: 1.0    steps: 696  evaluation reward: 1211.2
Training network. lr: 0.000114. clip: 0.045782
Iteration 17668: Policy loss: 0.006597. Value loss: 0.246798. Entropy: 1.326409.
Iteration 17669: Policy loss: 0.003758. Value loss: 0.168205. Entropy: 1.330778.
Iteration 17670: Policy loss: 0.002291. Value loss: 0.154297. Entropy: 1.304357.
episode: 6205   score: 1250.0  epsilon: 1.0    steps: 392  evaluation reward: 1213.7
Training network. lr: 0.000114. clip: 0.045782
Iteration 17671: Policy loss: 0.004502. Value loss: 0.243479. Entropy: 1.089453.
Iteration 17672: Policy loss: 0.001579. Value loss: 0.196021. Entropy: 1.093287.
I

episode: 6225   score: 740.0  epsilon: 1.0    steps: 176  evaluation reward: 1204.0
Training network. lr: 0.000114. clip: 0.045635
Iteration 17731: Policy loss: 0.003880. Value loss: 0.345844. Entropy: 1.124072.
Iteration 17732: Policy loss: 0.000818. Value loss: 0.265028. Entropy: 1.133173.
Iteration 17733: Policy loss: 0.000042. Value loss: 0.238428. Entropy: 1.126510.
Training network. lr: 0.000114. clip: 0.045635
Iteration 17734: Policy loss: 0.002387. Value loss: 0.349024. Entropy: 1.504317.
Iteration 17735: Policy loss: 0.005905. Value loss: 0.222856. Entropy: 1.510221.
Iteration 17736: Policy loss: 0.002016. Value loss: 0.175438. Entropy: 1.507012.
episode: 6226   score: 1260.0  epsilon: 1.0    steps: 192  evaluation reward: 1191.9
Training network. lr: 0.000114. clip: 0.045635
Iteration 17737: Policy loss: 0.002411. Value loss: 0.237335. Entropy: 1.583680.
Iteration 17738: Policy loss: 0.001848. Value loss: 0.172818. Entropy: 1.575885.
Iteration 17739: Policy loss: -0.001535. V

Training network. lr: 0.000114. clip: 0.045478
Iteration 17797: Policy loss: 0.002583. Value loss: 0.187849. Entropy: 1.333855.
Iteration 17798: Policy loss: -0.000046. Value loss: 0.134348. Entropy: 1.333092.
Iteration 17799: Policy loss: -0.002379. Value loss: 0.114768. Entropy: 1.340435.
episode: 6247   score: 950.0  epsilon: 1.0    steps: 1008  evaluation reward: 1179.9
Training network. lr: 0.000114. clip: 0.045478
Iteration 17800: Policy loss: 0.007630. Value loss: 0.465678. Entropy: 1.564408.
Iteration 17801: Policy loss: 0.003172. Value loss: 0.387666. Entropy: 1.556214.
Iteration 17802: Policy loss: -0.001445. Value loss: 0.358064. Entropy: 1.539875.
episode: 6248   score: 1220.0  epsilon: 1.0    steps: 120  evaluation reward: 1179.1
episode: 6249   score: 1270.0  epsilon: 1.0    steps: 728  evaluation reward: 1187.0
Training network. lr: 0.000113. clip: 0.045321
Iteration 17803: Policy loss: 0.001814. Value loss: 0.263942. Entropy: 1.137037.
Iteration 17804: Policy loss: -0.0

episode: 6269   score: 1060.0  epsilon: 1.0    steps: 384  evaluation reward: 1212.4
Training network. lr: 0.000113. clip: 0.045174
Iteration 17863: Policy loss: 0.008493. Value loss: 0.563206. Entropy: 1.028348.
Iteration 17864: Policy loss: 0.002700. Value loss: 0.426688. Entropy: 1.023207.
Iteration 17865: Policy loss: -0.002229. Value loss: 0.386146. Entropy: 1.027219.
Training network. lr: 0.000113. clip: 0.045174
Iteration 17866: Policy loss: 0.008726. Value loss: 0.311674. Entropy: 1.106160.
Iteration 17867: Policy loss: 0.001124. Value loss: 0.239075. Entropy: 1.097941.
Iteration 17868: Policy loss: -0.002237. Value loss: 0.215049. Entropy: 1.097335.
episode: 6270   score: 1440.0  epsilon: 1.0    steps: 72  evaluation reward: 1216.0
episode: 6271   score: 810.0  epsilon: 1.0    steps: 968  evaluation reward: 1212.7
Training network. lr: 0.000113. clip: 0.045174
Iteration 17869: Policy loss: 0.004752. Value loss: 0.617779. Entropy: 1.243475.
Iteration 17870: Policy loss: 0.00265

Training network. lr: 0.000113. clip: 0.045017
Iteration 17929: Policy loss: 0.002652. Value loss: 0.273103. Entropy: 1.593979.
Iteration 17930: Policy loss: 0.001147. Value loss: 0.196979. Entropy: 1.591358.
Iteration 17931: Policy loss: 0.000117. Value loss: 0.159220. Entropy: 1.578051.
episode: 6291   score: 530.0  epsilon: 1.0    steps: 488  evaluation reward: 1198.0
Training network. lr: 0.000113. clip: 0.045017
Iteration 17932: Policy loss: 0.006312. Value loss: 0.261015. Entropy: 1.483067.
Iteration 17933: Policy loss: 0.003388. Value loss: 0.146695. Entropy: 1.478391.
Iteration 17934: Policy loss: -0.000136. Value loss: 0.119327. Entropy: 1.471879.
episode: 6292   score: 1000.0  epsilon: 1.0    steps: 8  evaluation reward: 1192.3
Training network. lr: 0.000113. clip: 0.045017
Iteration 17935: Policy loss: 0.004747. Value loss: 0.502337. Entropy: 1.146056.
Iteration 17936: Policy loss: 0.005485. Value loss: 0.438641. Entropy: 1.136536.
Iteration 17937: Policy loss: 0.003356. Val

Iteration 17994: Policy loss: 0.001440. Value loss: 0.450675. Entropy: 1.324053.
episode: 6313   score: 1740.0  epsilon: 1.0    steps: 336  evaluation reward: 1184.5
Training network. lr: 0.000112. clip: 0.044861
Iteration 17995: Policy loss: 0.005092. Value loss: 0.397154. Entropy: 1.299733.
Iteration 17996: Policy loss: 0.003457. Value loss: 0.285577. Entropy: 1.310713.
Iteration 17997: Policy loss: 0.005604. Value loss: 0.233053. Entropy: 1.302695.
Training network. lr: 0.000112. clip: 0.044861
Iteration 17998: Policy loss: 0.004339. Value loss: 0.255911. Entropy: 1.390183.
Iteration 17999: Policy loss: 0.000211. Value loss: 0.197096. Entropy: 1.392880.
Iteration 18000: Policy loss: 0.001175. Value loss: 0.168769. Entropy: 1.407521.
episode: 6314   score: 1030.0  epsilon: 1.0    steps: 856  evaluation reward: 1184.4
Training network. lr: 0.000112. clip: 0.044713
Iteration 18001: Policy loss: 0.000591. Value loss: 0.473446. Entropy: 1.387295.
Iteration 18002: Policy loss: 0.001787. V

episode: 6334   score: 2190.0  epsilon: 1.0    steps: 968  evaluation reward: 1200.7
Training network. lr: 0.000111. clip: 0.044557
Iteration 18061: Policy loss: 0.000821. Value loss: 0.270083. Entropy: 0.750912.
Iteration 18062: Policy loss: 0.001939. Value loss: 0.206834. Entropy: 0.749723.
Iteration 18063: Policy loss: -0.002937. Value loss: 0.201028. Entropy: 0.756483.
episode: 6335   score: 1270.0  epsilon: 1.0    steps: 64  evaluation reward: 1204.3
episode: 6336   score: 1380.0  epsilon: 1.0    steps: 512  evaluation reward: 1209.9
episode: 6337   score: 1440.0  epsilon: 1.0    steps: 560  evaluation reward: 1214.9
Training network. lr: 0.000111. clip: 0.044557
Iteration 18064: Policy loss: 0.005407. Value loss: 0.289729. Entropy: 0.789569.
Iteration 18065: Policy loss: -0.000777. Value loss: 0.231409. Entropy: 0.794071.
Iteration 18066: Policy loss: -0.003244. Value loss: 0.226009. Entropy: 0.785917.
Training network. lr: 0.000111. clip: 0.044557
Iteration 18067: Policy loss: 0

Iteration 18126: Policy loss: 0.000862. Value loss: 0.418172. Entropy: 1.250219.
episode: 6356   score: 1450.0  epsilon: 1.0    steps: 96  evaluation reward: 1198.5
episode: 6357   score: 790.0  epsilon: 1.0    steps: 328  evaluation reward: 1193.7
Training network. lr: 0.000111. clip: 0.044400
Iteration 18127: Policy loss: 0.002934. Value loss: 0.231197. Entropy: 1.192659.
Iteration 18128: Policy loss: 0.002591. Value loss: 0.181734. Entropy: 1.175595.
Iteration 18129: Policy loss: -0.001406. Value loss: 0.168398. Entropy: 1.180347.
Training network. lr: 0.000111. clip: 0.044400
Iteration 18130: Policy loss: 0.007776. Value loss: 0.289509. Entropy: 1.409785.
Iteration 18131: Policy loss: 0.006622. Value loss: 0.134828. Entropy: 1.400209.
Iteration 18132: Policy loss: 0.002382. Value loss: 0.094549. Entropy: 1.378572.
episode: 6358   score: 1090.0  epsilon: 1.0    steps: 192  evaluation reward: 1200.0
episode: 6359   score: 890.0  epsilon: 1.0    steps: 720  evaluation reward: 1192.7
e

Iteration 18191: Policy loss: 0.001728. Value loss: 0.206322. Entropy: 1.273743.
Iteration 18192: Policy loss: -0.000714. Value loss: 0.185490. Entropy: 1.283004.
Training network. lr: 0.000111. clip: 0.044252
Iteration 18193: Policy loss: 0.007075. Value loss: 0.373003. Entropy: 1.114948.
Iteration 18194: Policy loss: 0.007039. Value loss: 0.229153. Entropy: 1.149089.
Iteration 18195: Policy loss: 0.001766. Value loss: 0.208760. Entropy: 1.149871.
Training network. lr: 0.000111. clip: 0.044252
Iteration 18196: Policy loss: 0.002591. Value loss: 0.371178. Entropy: 1.334174.
Iteration 18197: Policy loss: 0.003693. Value loss: 0.309310. Entropy: 1.321757.
Iteration 18198: Policy loss: 0.000093. Value loss: 0.281391. Entropy: 1.320041.
Training network. lr: 0.000111. clip: 0.044252
Iteration 18199: Policy loss: 0.001457. Value loss: 0.160316. Entropy: 1.297732.
Iteration 18200: Policy loss: -0.001088. Value loss: 0.121146. Entropy: 1.301897.
Iteration 18201: Policy loss: -0.003897. Value 

Iteration 18258: Policy loss: -0.002366. Value loss: 0.118765. Entropy: 1.021445.
episode: 6400   score: 930.0  epsilon: 1.0    steps: 616  evaluation reward: 1171.8
Training network. lr: 0.000110. clip: 0.043939
Iteration 18259: Policy loss: 0.003096. Value loss: 0.381803. Entropy: 1.110878.
Iteration 18260: Policy loss: 0.004541. Value loss: 0.275529. Entropy: 1.126436.
Iteration 18261: Policy loss: -0.000070. Value loss: 0.254015. Entropy: 1.131662.
now time :  2019-03-06 07:11:40.223927
episode: 6401   score: 1420.0  epsilon: 1.0    steps: 336  evaluation reward: 1177.0
episode: 6402   score: 900.0  epsilon: 1.0    steps: 816  evaluation reward: 1173.0
Training network. lr: 0.000110. clip: 0.043939
Iteration 18262: Policy loss: 0.002382. Value loss: 0.194347. Entropy: 1.367689.
Iteration 18263: Policy loss: -0.000924. Value loss: 0.144765. Entropy: 1.363239.
Iteration 18264: Policy loss: -0.002042. Value loss: 0.132698. Entropy: 1.352427.
Training network. lr: 0.000110. clip: 0.043

Training network. lr: 0.000109. clip: 0.043792
Iteration 18325: Policy loss: 0.003435. Value loss: 0.093209. Entropy: 1.043814.
Iteration 18326: Policy loss: 0.001714. Value loss: 0.058213. Entropy: 1.051413.
Iteration 18327: Policy loss: 0.002028. Value loss: 0.047616. Entropy: 1.043438.
episode: 6420   score: 1000.0  epsilon: 1.0    steps: 144  evaluation reward: 1141.2
episode: 6421   score: 1330.0  epsilon: 1.0    steps: 544  evaluation reward: 1141.8
Training network. lr: 0.000109. clip: 0.043792
Iteration 18328: Policy loss: 0.002154. Value loss: 0.248318. Entropy: 0.829743.
Iteration 18329: Policy loss: 0.000595. Value loss: 0.203026. Entropy: 0.818338.
Iteration 18330: Policy loss: -0.000586. Value loss: 0.198678. Entropy: 0.814800.
episode: 6422   score: 1300.0  epsilon: 1.0    steps: 992  evaluation reward: 1143.7
Training network. lr: 0.000109. clip: 0.043792
Iteration 18331: Policy loss: -0.000128. Value loss: 0.112613. Entropy: 0.969331.
Iteration 18332: Policy loss: -0.00

episode: 6442   score: 1150.0  epsilon: 1.0    steps: 200  evaluation reward: 1110.4
episode: 6443   score: 1360.0  epsilon: 1.0    steps: 776  evaluation reward: 1111.4
Training network. lr: 0.000109. clip: 0.043635
Iteration 18391: Policy loss: 0.010015. Value loss: 0.428494. Entropy: 1.306014.
Iteration 18392: Policy loss: 0.005525. Value loss: 0.285957. Entropy: 1.317803.
Iteration 18393: Policy loss: 0.004168. Value loss: 0.227180. Entropy: 1.323312.
episode: 6444   score: 610.0  epsilon: 1.0    steps: 448  evaluation reward: 1107.5
Training network. lr: 0.000109. clip: 0.043635
Iteration 18394: Policy loss: 0.010310. Value loss: 0.473898. Entropy: 1.278648.
Iteration 18395: Policy loss: 0.013025. Value loss: 0.299781. Entropy: 1.272084.
Iteration 18396: Policy loss: 0.004237. Value loss: 0.242665. Entropy: 1.268603.
Training network. lr: 0.000109. clip: 0.043635
Iteration 18397: Policy loss: 0.004416. Value loss: 0.256349. Entropy: 1.290754.
Iteration 18398: Policy loss: -0.00217

Iteration 18456: Policy loss: 0.003408. Value loss: 0.218665. Entropy: 1.071462.
episode: 6464   score: 840.0  epsilon: 1.0    steps: 408  evaluation reward: 1098.1
Training network. lr: 0.000108. clip: 0.043331
Iteration 18457: Policy loss: 0.004048. Value loss: 0.171572. Entropy: 0.917805.
Iteration 18458: Policy loss: 0.001036. Value loss: 0.137624. Entropy: 0.889680.
Iteration 18459: Policy loss: -0.001374. Value loss: 0.124141. Entropy: 0.917634.
episode: 6465   score: 1200.0  epsilon: 1.0    steps: 304  evaluation reward: 1096.5
Training network. lr: 0.000108. clip: 0.043331
Iteration 18460: Policy loss: 0.003687. Value loss: 0.152390. Entropy: 0.853805.
Iteration 18461: Policy loss: 0.000148. Value loss: 0.093354. Entropy: 0.831490.
Iteration 18462: Policy loss: -0.000344. Value loss: 0.080376. Entropy: 0.827663.
episode: 6466   score: 1220.0  epsilon: 1.0    steps: 784  evaluation reward: 1099.2
Training network. lr: 0.000108. clip: 0.043331
Iteration 18463: Policy loss: 0.0018

episode: 6485   score: 1060.0  epsilon: 1.0    steps: 312  evaluation reward: 1106.5
Training network. lr: 0.000108. clip: 0.043174
Iteration 18523: Policy loss: 0.002083. Value loss: 0.123467. Entropy: 1.059225.
Iteration 18524: Policy loss: -0.000353. Value loss: 0.102075. Entropy: 1.066990.
Iteration 18525: Policy loss: -0.001200. Value loss: 0.097148. Entropy: 1.068915.
Training network. lr: 0.000108. clip: 0.043174
Iteration 18526: Policy loss: 0.001882. Value loss: 0.326586. Entropy: 1.097432.
Iteration 18527: Policy loss: 0.003640. Value loss: 0.240717. Entropy: 1.110325.
Iteration 18528: Policy loss: 0.003321. Value loss: 0.217247. Entropy: 1.111402.
episode: 6486   score: 1060.0  epsilon: 1.0    steps: 72  evaluation reward: 1109.1
episode: 6487   score: 1420.0  epsilon: 1.0    steps: 192  evaluation reward: 1112.8
episode: 6488   score: 1320.0  epsilon: 1.0    steps: 792  evaluation reward: 1116.8
Training network. lr: 0.000108. clip: 0.043174
Iteration 18529: Policy loss: 0.

Iteration 18588: Policy loss: -0.005968. Value loss: 0.157761. Entropy: 1.559504.
Training network. lr: 0.000108. clip: 0.043017
Iteration 18589: Policy loss: 0.003993. Value loss: 0.506278. Entropy: 1.763155.
Iteration 18590: Policy loss: 0.003045. Value loss: 0.374130. Entropy: 1.749949.
Iteration 18591: Policy loss: 0.003954. Value loss: 0.322879. Entropy: 1.756238.
episode: 6507   score: 880.0  epsilon: 1.0    steps: 240  evaluation reward: 1140.6
episode: 6508   score: 990.0  epsilon: 1.0    steps: 264  evaluation reward: 1138.8
Training network. lr: 0.000108. clip: 0.043017
Iteration 18592: Policy loss: 0.004498. Value loss: 0.354067. Entropy: 1.338003.
Iteration 18593: Policy loss: 0.000041. Value loss: 0.266129. Entropy: 1.326797.
Iteration 18594: Policy loss: -0.001492. Value loss: 0.235487. Entropy: 1.324764.
episode: 6509   score: 1040.0  epsilon: 1.0    steps: 600  evaluation reward: 1140.4
Training network. lr: 0.000108. clip: 0.043017
Iteration 18595: Policy loss: 0.00332

Iteration 18654: Policy loss: -0.000258. Value loss: 0.260711. Entropy: 1.244959.
episode: 6529   score: 1270.0  epsilon: 1.0    steps: 384  evaluation reward: 1170.6
Training network. lr: 0.000107. clip: 0.042713
Iteration 18655: Policy loss: 0.003797. Value loss: 0.725686. Entropy: 1.584397.
Iteration 18656: Policy loss: 0.005878. Value loss: 0.555995. Entropy: 1.579706.
Iteration 18657: Policy loss: 0.001649. Value loss: 0.510746. Entropy: 1.580558.
Training network. lr: 0.000107. clip: 0.042713
Iteration 18658: Policy loss: 0.005209. Value loss: 0.171999. Entropy: 1.364654.
Iteration 18659: Policy loss: 0.001672. Value loss: 0.117190. Entropy: 1.363495.
Iteration 18660: Policy loss: -0.000598. Value loss: 0.095506. Entropy: 1.348640.
episode: 6530   score: 980.0  epsilon: 1.0    steps: 400  evaluation reward: 1166.9
Training network. lr: 0.000107. clip: 0.042713
Iteration 18661: Policy loss: 0.005668. Value loss: 0.262807. Entropy: 1.285640.
Iteration 18662: Policy loss: -0.000548.

Training network. lr: 0.000106. clip: 0.042557
Iteration 18721: Policy loss: 0.002542. Value loss: 0.219579. Entropy: 1.066539.
Iteration 18722: Policy loss: -0.000610. Value loss: 0.196056. Entropy: 1.081080.
Iteration 18723: Policy loss: -0.001791. Value loss: 0.188764. Entropy: 1.069490.
episode: 6550   score: 1060.0  epsilon: 1.0    steps: 224  evaluation reward: 1205.9
Training network. lr: 0.000106. clip: 0.042557
Iteration 18724: Policy loss: 0.000688. Value loss: 0.143878. Entropy: 1.087066.
Iteration 18725: Policy loss: -0.000351. Value loss: 0.130250. Entropy: 1.082023.
Iteration 18726: Policy loss: -0.002252. Value loss: 0.123418. Entropy: 1.088147.
now time :  2019-03-06 07:17:55.825501
episode: 6551   score: 1270.0  epsilon: 1.0    steps: 224  evaluation reward: 1203.9
episode: 6552   score: 730.0  epsilon: 1.0    steps: 240  evaluation reward: 1203.2
episode: 6553   score: 1060.0  epsilon: 1.0    steps: 648  evaluation reward: 1201.3
Training network. lr: 0.000106. clip: 

Iteration 18786: Policy loss: 0.000203. Value loss: 0.206258. Entropy: 0.993247.
episode: 6572   score: 1070.0  epsilon: 1.0    steps: 120  evaluation reward: 1182.0
Training network. lr: 0.000106. clip: 0.042409
Iteration 18787: Policy loss: 0.001844. Value loss: 0.292525. Entropy: 1.337259.
Iteration 18788: Policy loss: 0.001754. Value loss: 0.189946. Entropy: 1.330811.
Iteration 18789: Policy loss: 0.000176. Value loss: 0.168709. Entropy: 1.344247.
Training network. lr: 0.000106. clip: 0.042409
Iteration 18790: Policy loss: 0.001877. Value loss: 0.208736. Entropy: 1.502000.
Iteration 18791: Policy loss: 0.003083. Value loss: 0.129742. Entropy: 1.506049.
Iteration 18792: Policy loss: 0.002046. Value loss: 0.105987. Entropy: 1.504330.
episode: 6573   score: 1080.0  epsilon: 1.0    steps: 672  evaluation reward: 1181.9
episode: 6574   score: 1230.0  epsilon: 1.0    steps: 720  evaluation reward: 1181.3
Training network. lr: 0.000106. clip: 0.042409
Iteration 18793: Policy loss: 0.00039

Iteration 18850: Policy loss: 0.004581. Value loss: 0.168809. Entropy: 0.918993.
Iteration 18851: Policy loss: 0.004000. Value loss: 0.141727. Entropy: 0.937906.
Iteration 18852: Policy loss: -0.003068. Value loss: 0.132478. Entropy: 0.936700.
Training network. lr: 0.000105. clip: 0.042096
Iteration 18853: Policy loss: 0.002932. Value loss: 0.291285. Entropy: 0.949433.
Iteration 18854: Policy loss: -0.000132. Value loss: 0.165179. Entropy: 0.933620.
Iteration 18855: Policy loss: -0.001115. Value loss: 0.135247. Entropy: 0.940208.
episode: 6595   score: 1400.0  epsilon: 1.0    steps: 416  evaluation reward: 1171.4
Training network. lr: 0.000105. clip: 0.042096
Iteration 18856: Policy loss: 0.002595. Value loss: 0.071820. Entropy: 1.294208.
Iteration 18857: Policy loss: -0.001581. Value loss: 0.038743. Entropy: 1.282571.
Iteration 18858: Policy loss: -0.002842. Value loss: 0.034261. Entropy: 1.277762.
episode: 6596   score: 1040.0  epsilon: 1.0    steps: 200  evaluation reward: 1172.6
ep

Iteration 18917: Policy loss: -0.000239. Value loss: 0.074970. Entropy: 1.140179.
Iteration 18918: Policy loss: -0.002782. Value loss: 0.054450. Entropy: 1.132287.
episode: 6615   score: 2540.0  epsilon: 1.0    steps: 96  evaluation reward: 1179.2
episode: 6616   score: 1110.0  epsilon: 1.0    steps: 136  evaluation reward: 1180.3
episode: 6617   score: 1260.0  epsilon: 1.0    steps: 376  evaluation reward: 1184.5
episode: 6618   score: 1510.0  epsilon: 1.0    steps: 656  evaluation reward: 1189.1
Training network. lr: 0.000105. clip: 0.041948
Iteration 18919: Policy loss: 0.001954. Value loss: 0.489267. Entropy: 0.883715.
Iteration 18920: Policy loss: 0.006753. Value loss: 0.362234. Entropy: 0.884056.
Iteration 18921: Policy loss: -0.002883. Value loss: 0.318438. Entropy: 0.877946.
Training network. lr: 0.000105. clip: 0.041948
Iteration 18922: Policy loss: 0.013063. Value loss: 0.338470. Entropy: 1.010539.
Iteration 18923: Policy loss: 0.003979. Value loss: 0.199833. Entropy: 1.00284

Training network. lr: 0.000104. clip: 0.041792
Iteration 18982: Policy loss: 0.000729. Value loss: 0.343546. Entropy: 0.602524.
Iteration 18983: Policy loss: -0.001823. Value loss: 0.299805. Entropy: 0.596093.
Iteration 18984: Policy loss: -0.001891. Value loss: 0.273714. Entropy: 0.601624.
episode: 6639   score: 1270.0  epsilon: 1.0    steps: 792  evaluation reward: 1174.6
Training network. lr: 0.000104. clip: 0.041792
Iteration 18985: Policy loss: 0.006318. Value loss: 0.330277. Entropy: 1.130490.
Iteration 18986: Policy loss: 0.003154. Value loss: 0.184508. Entropy: 1.118107.
Iteration 18987: Policy loss: 0.001944. Value loss: 0.152538. Entropy: 1.122588.
Training network. lr: 0.000104. clip: 0.041792
Iteration 18988: Policy loss: 0.007595. Value loss: 0.730233. Entropy: 1.257697.
Iteration 18989: Policy loss: 0.006633. Value loss: 0.567763. Entropy: 1.271189.
Iteration 18990: Policy loss: 0.003205. Value loss: 0.499796. Entropy: 1.266510.
episode: 6640   score: 1430.0  epsilon: 1.0

episode: 6657   score: 1270.0  epsilon: 1.0    steps: 968  evaluation reward: 1220.6
Training network. lr: 0.000104. clip: 0.041488
Iteration 19051: Policy loss: 0.003114. Value loss: 0.135948. Entropy: 1.309710.
Iteration 19052: Policy loss: 0.001824. Value loss: 0.086741. Entropy: 1.302355.
Iteration 19053: Policy loss: 0.000823. Value loss: 0.063058. Entropy: 1.327649.
episode: 6658   score: 940.0  epsilon: 1.0    steps: 40  evaluation reward: 1219.2
episode: 6659   score: 1270.0  epsilon: 1.0    steps: 528  evaluation reward: 1219.9
Training network. lr: 0.000104. clip: 0.041488
Iteration 19054: Policy loss: 0.001389. Value loss: 0.287518. Entropy: 1.110233.
Iteration 19055: Policy loss: -0.002436. Value loss: 0.239736. Entropy: 1.111119.
Iteration 19056: Policy loss: -0.002528. Value loss: 0.218886. Entropy: 1.098822.
episode: 6660   score: 1850.0  epsilon: 1.0    steps: 256  evaluation reward: 1226.7
Training network. lr: 0.000104. clip: 0.041488
Iteration 19057: Policy loss: 0.0

episode: 6679   score: 1960.0  epsilon: 1.0    steps: 544  evaluation reward: 1229.3
Training network. lr: 0.000103. clip: 0.041331
Iteration 19117: Policy loss: 0.003393. Value loss: 0.346958. Entropy: 1.272748.
Iteration 19118: Policy loss: 0.003185. Value loss: 0.204330. Entropy: 1.284179.
Iteration 19119: Policy loss: -0.000490. Value loss: 0.170560. Entropy: 1.294895.
episode: 6680   score: 900.0  epsilon: 1.0    steps: 848  evaluation reward: 1230.2
Training network. lr: 0.000103. clip: 0.041331
Iteration 19120: Policy loss: 0.006562. Value loss: 0.271719. Entropy: 1.415057.
Iteration 19121: Policy loss: 0.004284. Value loss: 0.209653. Entropy: 1.404060.
Iteration 19122: Policy loss: 0.000015. Value loss: 0.188532. Entropy: 1.388844.
episode: 6681   score: 600.0  epsilon: 1.0    steps: 632  evaluation reward: 1219.3
Training network. lr: 0.000103. clip: 0.041331
Iteration 19123: Policy loss: 0.006368. Value loss: 0.226800. Entropy: 1.331537.
Iteration 19124: Policy loss: 0.005040

Training network. lr: 0.000103. clip: 0.041174
Iteration 19183: Policy loss: 0.000882. Value loss: 0.394376. Entropy: 0.939061.
Iteration 19184: Policy loss: 0.005027. Value loss: 0.285073. Entropy: 0.951397.
Iteration 19185: Policy loss: 0.000862. Value loss: 0.201313. Entropy: 0.964086.
now time :  2019-03-06 07:24:07.048619
episode: 6701   score: 1790.0  epsilon: 1.0    steps: 40  evaluation reward: 1218.0
episode: 6702   score: 690.0  epsilon: 1.0    steps: 896  evaluation reward: 1216.5
Training network. lr: 0.000103. clip: 0.041174
Iteration 19186: Policy loss: 0.000733. Value loss: 0.375842. Entropy: 1.311727.
Iteration 19187: Policy loss: -0.000564. Value loss: 0.278732. Entropy: 1.307476.
Iteration 19188: Policy loss: 0.001780. Value loss: 0.251128. Entropy: 1.312927.
episode: 6703   score: 1360.0  epsilon: 1.0    steps: 488  evaluation reward: 1217.0
Training network. lr: 0.000103. clip: 0.041174
Iteration 19189: Policy loss: 0.005343. Value loss: 0.190986. Entropy: 1.101766.

Iteration 19248: Policy loss: 0.000794. Value loss: 0.322472. Entropy: 1.201324.
episode: 6723   score: 1100.0  epsilon: 1.0    steps: 896  evaluation reward: 1192.1
Training network. lr: 0.000103. clip: 0.041027
Iteration 19249: Policy loss: 0.000212. Value loss: 0.164119. Entropy: 0.998387.
Iteration 19250: Policy loss: -0.002852. Value loss: 0.096359. Entropy: 1.006234.
Iteration 19251: Policy loss: -0.007050. Value loss: 0.069639. Entropy: 1.014243.
episode: 6724   score: 950.0  epsilon: 1.0    steps: 360  evaluation reward: 1187.8
episode: 6725   score: 1270.0  epsilon: 1.0    steps: 504  evaluation reward: 1191.3
Training network. lr: 0.000102. clip: 0.040870
Iteration 19252: Policy loss: 0.002853. Value loss: 0.339142. Entropy: 0.870213.
Iteration 19253: Policy loss: 0.003125. Value loss: 0.260160. Entropy: 0.861578.
Iteration 19254: Policy loss: -0.001481. Value loss: 0.222747. Entropy: 0.861446.
episode: 6726   score: 1180.0  epsilon: 1.0    steps: 952  evaluation reward: 1194

episode: 6747   score: 1890.0  epsilon: 1.0    steps: 536  evaluation reward: 1168.7
Training network. lr: 0.000102. clip: 0.040713
Iteration 19312: Policy loss: 0.007022. Value loss: 0.409183. Entropy: 0.963606.
Iteration 19313: Policy loss: 0.006651. Value loss: 0.296536. Entropy: 0.956130.
Iteration 19314: Policy loss: 0.003473. Value loss: 0.260174. Entropy: 0.945785.
episode: 6748   score: 450.0  epsilon: 1.0    steps: 512  evaluation reward: 1165.7
Training network. lr: 0.000102. clip: 0.040713
Iteration 19315: Policy loss: 0.005508. Value loss: 0.896547. Entropy: 1.140138.
Iteration 19316: Policy loss: 0.004476. Value loss: 0.768652. Entropy: 1.137684.
Iteration 19317: Policy loss: 0.004299. Value loss: 0.715340. Entropy: 1.143885.
episode: 6749   score: 320.0  epsilon: 1.0    steps: 520  evaluation reward: 1152.3
Training network. lr: 0.000102. clip: 0.040713
Iteration 19318: Policy loss: 0.010665. Value loss: 0.589473. Entropy: 1.259508.
Iteration 19319: Policy loss: 0.005638.

Iteration 19377: Policy loss: 0.000395. Value loss: 0.161652. Entropy: 1.164805.
Training network. lr: 0.000101. clip: 0.040566
Iteration 19378: Policy loss: 0.003534. Value loss: 0.667541. Entropy: 1.233658.
Iteration 19379: Policy loss: 0.006087. Value loss: 0.546491. Entropy: 1.248314.
Iteration 19380: Policy loss: 0.002047. Value loss: 0.512024. Entropy: 1.242075.
episode: 6769   score: 1260.0  epsilon: 1.0    steps: 224  evaluation reward: 1160.0
episode: 6770   score: 1180.0  epsilon: 1.0    steps: 872  evaluation reward: 1158.8
Training network. lr: 0.000101. clip: 0.040566
Iteration 19381: Policy loss: 0.003773. Value loss: 0.262905. Entropy: 1.073875.
Iteration 19382: Policy loss: 0.001868. Value loss: 0.210482. Entropy: 1.082244.
Iteration 19383: Policy loss: -0.002707. Value loss: 0.196639. Entropy: 1.065506.
episode: 6771   score: 1640.0  epsilon: 1.0    steps: 712  evaluation reward: 1163.8
Training network. lr: 0.000101. clip: 0.040566
Iteration 19384: Policy loss: 0.0034

Training network. lr: 0.000101. clip: 0.040409
Iteration 19441: Policy loss: 0.004937. Value loss: 0.356968. Entropy: 0.935351.
Iteration 19442: Policy loss: 0.006173. Value loss: 0.303747. Entropy: 0.945272.
Iteration 19443: Policy loss: 0.004498. Value loss: 0.267443. Entropy: 0.927725.
episode: 6793   score: 1000.0  epsilon: 1.0    steps: 8  evaluation reward: 1204.4
Training network. lr: 0.000101. clip: 0.040409
Iteration 19444: Policy loss: 0.007390. Value loss: 0.987801. Entropy: 1.528080.
Iteration 19445: Policy loss: 0.012461. Value loss: 0.716205. Entropy: 1.530118.
Iteration 19446: Policy loss: 0.008672. Value loss: 0.604737. Entropy: 1.527232.
episode: 6794   score: 1140.0  epsilon: 1.0    steps: 80  evaluation reward: 1206.9
Training network. lr: 0.000101. clip: 0.040409
Iteration 19447: Policy loss: 0.003714. Value loss: 0.928409. Entropy: 1.447170.
Iteration 19448: Policy loss: 0.003515. Value loss: 0.760676. Entropy: 1.451006.
Iteration 19449: Policy loss: 0.001758. Valu

Iteration 19506: Policy loss: 0.012016. Value loss: 0.411554. Entropy: 0.492303.
episode: 6815   score: 1850.0  epsilon: 1.0    steps: 216  evaluation reward: 1243.0
Training network. lr: 0.000100. clip: 0.040105
Iteration 19507: Policy loss: 0.000976. Value loss: 0.369950. Entropy: 0.501242.
Iteration 19508: Policy loss: 0.000138. Value loss: 0.292315. Entropy: 0.495174.
Iteration 19509: Policy loss: -0.000150. Value loss: 0.276557. Entropy: 0.487891.
episode: 6816   score: 1430.0  epsilon: 1.0    steps: 576  evaluation reward: 1247.5
Training network. lr: 0.000100. clip: 0.040105
Iteration 19510: Policy loss: 0.002370. Value loss: 0.486904. Entropy: 1.041371.
Iteration 19511: Policy loss: 0.004108. Value loss: 0.361763. Entropy: 1.045345.
Iteration 19512: Policy loss: -0.000260. Value loss: 0.309645. Entropy: 1.045122.
episode: 6817   score: 1100.0  epsilon: 1.0    steps: 440  evaluation reward: 1245.9
Training network. lr: 0.000100. clip: 0.040105
Iteration 19513: Policy loss: 0.003

Training network. lr: 0.000100. clip: 0.039949
Iteration 19573: Policy loss: 0.005913. Value loss: 0.925223. Entropy: 1.058464.
Iteration 19574: Policy loss: 0.010414. Value loss: 0.737733. Entropy: 1.066521.
Iteration 19575: Policy loss: 0.010049. Value loss: 0.680409. Entropy: 1.062646.
episode: 6837   score: 1170.0  epsilon: 1.0    steps: 128  evaluation reward: 1242.0
episode: 6838   score: 940.0  epsilon: 1.0    steps: 376  evaluation reward: 1245.5
Training network. lr: 0.000100. clip: 0.039949
Iteration 19576: Policy loss: 0.006639. Value loss: 0.412074. Entropy: 0.633779.
Iteration 19577: Policy loss: 0.002708. Value loss: 0.240420. Entropy: 0.605322.
Iteration 19578: Policy loss: -0.000104. Value loss: 0.208700. Entropy: 0.590457.
Training network. lr: 0.000100. clip: 0.039949
Iteration 19579: Policy loss: 0.007306. Value loss: 0.326171. Entropy: 0.755853.
Iteration 19580: Policy loss: 0.002006. Value loss: 0.184031. Entropy: 0.737400.
Iteration 19581: Policy loss: -0.000387. 

Iteration 19638: Policy loss: -0.001506. Value loss: 0.192913. Entropy: 1.122476.
Training network. lr: 0.000099. clip: 0.039792
Iteration 19639: Policy loss: 0.004156. Value loss: 0.831136. Entropy: 1.202484.
Iteration 19640: Policy loss: 0.003625. Value loss: 0.743532. Entropy: 1.205132.
Iteration 19641: Policy loss: 0.001367. Value loss: 0.702024. Entropy: 1.215718.
episode: 6859   score: 570.0  epsilon: 1.0    steps: 464  evaluation reward: 1258.8
Training network. lr: 0.000099. clip: 0.039792
Iteration 19642: Policy loss: 0.006332. Value loss: 0.232148. Entropy: 0.963197.
Iteration 19643: Policy loss: 0.002395. Value loss: 0.146691. Entropy: 0.977217.
Iteration 19644: Policy loss: -0.002110. Value loss: 0.127130. Entropy: 0.989197.
episode: 6860   score: 1460.0  epsilon: 1.0    steps: 520  evaluation reward: 1261.5
Training network. lr: 0.000099. clip: 0.039792
Iteration 19645: Policy loss: 0.002820. Value loss: 0.177283. Entropy: 0.844574.
Iteration 19646: Policy loss: 0.001996. 

Iteration 19703: Policy loss: -0.001293. Value loss: 0.069942. Entropy: 1.023054.
Iteration 19704: Policy loss: -0.002147. Value loss: 0.051627. Entropy: 1.021167.
Training network. lr: 0.000099. clip: 0.039488
Iteration 19705: Policy loss: 0.001959. Value loss: 0.255845. Entropy: 0.973866.
Iteration 19706: Policy loss: 0.000474. Value loss: 0.224538. Entropy: 0.984185.
Iteration 19707: Policy loss: -0.002600. Value loss: 0.205622. Entropy: 0.977811.
Training network. lr: 0.000099. clip: 0.039488
Iteration 19708: Policy loss: 0.001335. Value loss: 0.188332. Entropy: 1.157729.
Iteration 19709: Policy loss: -0.003070. Value loss: 0.122541. Entropy: 1.168501.
Iteration 19710: Policy loss: -0.004865. Value loss: 0.100334. Entropy: 1.170286.
Training network. lr: 0.000099. clip: 0.039488
Iteration 19711: Policy loss: 0.003569. Value loss: 0.122893. Entropy: 1.285655.
Iteration 19712: Policy loss: 0.003748. Value loss: 0.066098. Entropy: 1.268103.
Iteration 19713: Policy loss: 0.001357. Valu

Iteration 19769: Policy loss: -0.000945. Value loss: 0.701836. Entropy: 0.910380.
Iteration 19770: Policy loss: -0.002997. Value loss: 0.638022. Entropy: 0.910233.
Training network. lr: 0.000098. clip: 0.039331
Iteration 19771: Policy loss: 0.002062. Value loss: 0.274267. Entropy: 1.102620.
Iteration 19772: Policy loss: 0.000488. Value loss: 0.215959. Entropy: 1.096949.
Iteration 19773: Policy loss: -0.001072. Value loss: 0.203825. Entropy: 1.102926.
episode: 6903   score: 890.0  epsilon: 1.0    steps: 216  evaluation reward: 1242.7
episode: 6904   score: 1410.0  epsilon: 1.0    steps: 776  evaluation reward: 1249.8
episode: 6905   score: 700.0  epsilon: 1.0    steps: 1024  evaluation reward: 1222.6
Training network. lr: 0.000098. clip: 0.039331
Iteration 19774: Policy loss: 0.005618. Value loss: 0.398819. Entropy: 1.167144.
Iteration 19775: Policy loss: 0.000317. Value loss: 0.341080. Entropy: 1.183939.
Iteration 19776: Policy loss: -0.000962. Value loss: 0.325414. Entropy: 1.189658.


Iteration 19836: Policy loss: 0.000781. Value loss: 0.083448. Entropy: 1.361363.
Training network. lr: 0.000098. clip: 0.039184
Iteration 19837: Policy loss: 0.003726. Value loss: 0.178297. Entropy: 1.277481.
Iteration 19838: Policy loss: 0.002336. Value loss: 0.142878. Entropy: 1.284348.
Iteration 19839: Policy loss: 0.000207. Value loss: 0.125955. Entropy: 1.269796.
episode: 6924   score: 880.0  epsilon: 1.0    steps: 344  evaluation reward: 1182.8
episode: 6925   score: 1260.0  epsilon: 1.0    steps: 368  evaluation reward: 1181.7
episode: 6926   score: 1070.0  epsilon: 1.0    steps: 856  evaluation reward: 1184.7
episode: 6927   score: 910.0  epsilon: 1.0    steps: 888  evaluation reward: 1181.3
episode: 6928   score: 1090.0  epsilon: 1.0    steps: 976  evaluation reward: 1177.1
Training network. lr: 0.000098. clip: 0.039184
Iteration 19840: Policy loss: 0.002795. Value loss: 0.088599. Entropy: 1.220096.
Iteration 19841: Policy loss: -0.000589. Value loss: 0.058079. Entropy: 1.2100

episode: 6948   score: 810.0  epsilon: 1.0    steps: 944  evaluation reward: 1105.9
Training network. lr: 0.000098. clip: 0.039027
Iteration 19900: Policy loss: 0.004952. Value loss: 0.220943. Entropy: 0.981104.
Iteration 19901: Policy loss: 0.002625. Value loss: 0.178541. Entropy: 0.976117.
Iteration 19902: Policy loss: -0.000208. Value loss: 0.159806. Entropy: 0.988253.
Training network. lr: 0.000097. clip: 0.038870
Iteration 19903: Policy loss: 0.003211. Value loss: 0.646226. Entropy: 1.036466.
Iteration 19904: Policy loss: 0.006156. Value loss: 0.525987. Entropy: 1.034843.
Iteration 19905: Policy loss: 0.007064. Value loss: 0.470876. Entropy: 1.035958.
episode: 6949   score: 1000.0  epsilon: 1.0    steps: 432  evaluation reward: 1103.4
Training network. lr: 0.000097. clip: 0.038870
Iteration 19906: Policy loss: 0.001494. Value loss: 0.187892. Entropy: 1.374453.
Iteration 19907: Policy loss: 0.002293. Value loss: 0.106558. Entropy: 1.372669.
Iteration 19908: Policy loss: -0.001246. 

Iteration 19964: Policy loss: 0.004537. Value loss: 0.358934. Entropy: 1.070369.
Iteration 19965: Policy loss: 0.004344. Value loss: 0.314402. Entropy: 1.072327.
Training network. lr: 0.000097. clip: 0.038723
Iteration 19966: Policy loss: 0.005757. Value loss: 0.581563. Entropy: 1.625464.
Iteration 19967: Policy loss: 0.003494. Value loss: 0.499696. Entropy: 1.623584.
Iteration 19968: Policy loss: 0.000991. Value loss: 0.464247. Entropy: 1.629194.
episode: 6971   score: 1090.0  epsilon: 1.0    steps: 784  evaluation reward: 1091.6
Training network. lr: 0.000097. clip: 0.038723
Iteration 19969: Policy loss: 0.000810. Value loss: 0.119291. Entropy: 1.689051.
Iteration 19970: Policy loss: -0.001965. Value loss: 0.082892. Entropy: 1.691328.
Iteration 19971: Policy loss: -0.004302. Value loss: 0.066969. Entropy: 1.696451.
Training network. lr: 0.000097. clip: 0.038723
Iteration 19972: Policy loss: 0.002298. Value loss: 0.136071. Entropy: 1.539967.
Iteration 19973: Policy loss: 0.000724. Val

Training network. lr: 0.000096. clip: 0.038566
Iteration 20029: Policy loss: 0.003025. Value loss: 0.500688. Entropy: 1.141475.
Iteration 20030: Policy loss: 0.000556. Value loss: 0.469066. Entropy: 1.134925.
Iteration 20031: Policy loss: 0.001856. Value loss: 0.431717. Entropy: 1.145432.
Training network. lr: 0.000096. clip: 0.038566
Iteration 20032: Policy loss: 0.003835. Value loss: 0.195298. Entropy: 1.271373.
Iteration 20033: Policy loss: 0.001048. Value loss: 0.146026. Entropy: 1.265346.
Iteration 20034: Policy loss: 0.000210. Value loss: 0.121319. Entropy: 1.261707.
Training network. lr: 0.000096. clip: 0.038566
Iteration 20035: Policy loss: 0.002834. Value loss: 0.190233. Entropy: 1.470099.
Iteration 20036: Policy loss: 0.004355. Value loss: 0.122928. Entropy: 1.470607.
Iteration 20037: Policy loss: 0.001839. Value loss: 0.093646. Entropy: 1.459490.
episode: 6994   score: 890.0  epsilon: 1.0    steps: 256  evaluation reward: 1034.2
Training network. lr: 0.000096. clip: 0.038566

Training network. lr: 0.000096. clip: 0.038409
Iteration 20095: Policy loss: 0.003546. Value loss: 0.256057. Entropy: 1.104823.
Iteration 20096: Policy loss: 0.000111. Value loss: 0.176012. Entropy: 1.109858.
Iteration 20097: Policy loss: -0.001788. Value loss: 0.162227. Entropy: 1.112403.
episode: 7015   score: 850.0  epsilon: 1.0    steps: 592  evaluation reward: 1014.2
episode: 7016   score: 790.0  epsilon: 1.0    steps: 1008  evaluation reward: 1012.5
Training network. lr: 0.000096. clip: 0.038409
Iteration 20098: Policy loss: 0.008171. Value loss: 0.168213. Entropy: 1.253684.
Iteration 20099: Policy loss: 0.001623. Value loss: 0.094705. Entropy: 1.267250.
Iteration 20100: Policy loss: 0.001875. Value loss: 0.078714. Entropy: 1.261427.
episode: 7017   score: 1130.0  epsilon: 1.0    steps: 440  evaluation reward: 1016.9
episode: 7018   score: 1400.0  epsilon: 1.0    steps: 736  evaluation reward: 1016.3
Training network. lr: 0.000096. clip: 0.038262
Iteration 20101: Policy loss: 0.0

episode: 7037   score: 740.0  epsilon: 1.0    steps: 816  evaluation reward: 1015.3
Training network. lr: 0.000095. clip: 0.038105
Iteration 20161: Policy loss: 0.002306. Value loss: 0.210326. Entropy: 0.856317.
Iteration 20162: Policy loss: -0.000829. Value loss: 0.155913. Entropy: 0.849958.
Iteration 20163: Policy loss: -0.002052. Value loss: 0.139037. Entropy: 0.860821.
episode: 7038   score: 1880.0  epsilon: 1.0    steps: 568  evaluation reward: 1027.9
Training network. lr: 0.000095. clip: 0.038105
Iteration 20164: Policy loss: 0.002132. Value loss: 0.640188. Entropy: 0.915610.
Iteration 20165: Policy loss: -0.000766. Value loss: 0.540989. Entropy: 0.902995.
Iteration 20166: Policy loss: -0.001897. Value loss: 0.471857. Entropy: 0.913310.
episode: 7039   score: 1470.0  epsilon: 1.0    steps: 200  evaluation reward: 1040.6
episode: 7040   score: 980.0  epsilon: 1.0    steps: 784  evaluation reward: 1048.5
episode: 7041   score: 890.0  epsilon: 1.0    steps: 904  evaluation reward: 1

Iteration 20226: Policy loss: 0.001224. Value loss: 0.345826. Entropy: 1.200783.
episode: 7059   score: 810.0  epsilon: 1.0    steps: 648  evaluation reward: 1018.4
episode: 7060   score: 1130.0  epsilon: 1.0    steps: 824  evaluation reward: 1020.4
Training network. lr: 0.000095. clip: 0.037949
Iteration 20227: Policy loss: 0.002590. Value loss: 0.113845. Entropy: 1.017612.
Iteration 20228: Policy loss: -0.001819. Value loss: 0.072022. Entropy: 1.018227.
Iteration 20229: Policy loss: -0.003017. Value loss: 0.055813. Entropy: 1.007401.
Training network. lr: 0.000095. clip: 0.037949
Iteration 20230: Policy loss: 0.003031. Value loss: 0.217438. Entropy: 0.948928.
Iteration 20231: Policy loss: 0.000025. Value loss: 0.161902. Entropy: 0.946121.
Iteration 20232: Policy loss: -0.002624. Value loss: 0.152639. Entropy: 0.950463.
Training network. lr: 0.000095. clip: 0.037949
Iteration 20233: Policy loss: 0.002625. Value loss: 0.244928. Entropy: 1.197527.
Iteration 20234: Policy loss: -0.000490

Iteration 20292: Policy loss: 0.003463. Value loss: 0.427052. Entropy: 1.165533.
episode: 7081   score: 1450.0  epsilon: 1.0    steps: 352  evaluation reward: 1004.8
Training network. lr: 0.000095. clip: 0.037801
Iteration 20293: Policy loss: 0.002570. Value loss: 0.475286. Entropy: 0.997157.
Iteration 20294: Policy loss: 0.000382. Value loss: 0.351893. Entropy: 0.994634.
Iteration 20295: Policy loss: 0.001240. Value loss: 0.283215. Entropy: 0.995869.
episode: 7082   score: 930.0  epsilon: 1.0    steps: 176  evaluation reward: 1006.5
Training network. lr: 0.000095. clip: 0.037801
Iteration 20296: Policy loss: -0.000562. Value loss: 0.260740. Entropy: 0.900932.
Iteration 20297: Policy loss: -0.000309. Value loss: 0.189053. Entropy: 0.900112.
Iteration 20298: Policy loss: -0.002331. Value loss: 0.160055. Entropy: 0.902748.
Training network. lr: 0.000095. clip: 0.037801
Iteration 20299: Policy loss: 0.007548. Value loss: 0.214046. Entropy: 1.044177.
Iteration 20300: Policy loss: 0.004391.

Iteration 20359: Policy loss: 0.003069. Value loss: 0.186417. Entropy: 1.303643.
Iteration 20360: Policy loss: 0.004078. Value loss: 0.106728. Entropy: 1.317794.
Iteration 20361: Policy loss: -0.000512. Value loss: 0.090585. Entropy: 1.306525.
now time :  2019-03-06 07:39:57.493020
episode: 7101   score: 990.0  epsilon: 1.0    steps: 64  evaluation reward: 1047.2
Training network. lr: 0.000094. clip: 0.037488
Iteration 20362: Policy loss: 0.001084. Value loss: 0.389186. Entropy: 1.195019.
Iteration 20363: Policy loss: 0.001608. Value loss: 0.293393. Entropy: 1.188649.
Iteration 20364: Policy loss: 0.001273. Value loss: 0.268828. Entropy: 1.186044.
episode: 7102   score: 1110.0  epsilon: 1.0    steps: 512  evaluation reward: 1044.2
episode: 7103   score: 510.0  epsilon: 1.0    steps: 656  evaluation reward: 1040.6
episode: 7104   score: 1140.0  epsilon: 1.0    steps: 752  evaluation reward: 1042.6
Training network. lr: 0.000094. clip: 0.037488
Iteration 20365: Policy loss: 0.002949. Val

Iteration 20424: Policy loss: -0.000785. Value loss: 0.448000. Entropy: 1.185322.
episode: 7124   score: 1040.0  epsilon: 1.0    steps: 888  evaluation reward: 1045.1
Training network. lr: 0.000093. clip: 0.037340
Iteration 20425: Policy loss: 0.001939. Value loss: 0.242911. Entropy: 1.201074.
Iteration 20426: Policy loss: 0.000748. Value loss: 0.157400. Entropy: 1.201854.
Iteration 20427: Policy loss: -0.001157. Value loss: 0.127061. Entropy: 1.210704.
episode: 7125   score: 1020.0  epsilon: 1.0    steps: 480  evaluation reward: 1045.3
Training network. lr: 0.000093. clip: 0.037340
Iteration 20428: Policy loss: 0.011258. Value loss: 0.287788. Entropy: 1.155792.
Iteration 20429: Policy loss: 0.005975. Value loss: 0.173147. Entropy: 1.146528.
Iteration 20430: Policy loss: 0.002374. Value loss: 0.145296. Entropy: 1.139414.
Training network. lr: 0.000093. clip: 0.037340
Iteration 20431: Policy loss: 0.001252. Value loss: 0.168642. Entropy: 1.328823.
Iteration 20432: Policy loss: -0.000173

episode: 7145   score: 950.0  epsilon: 1.0    steps: 400  evaluation reward: 1083.6
episode: 7146   score: 1040.0  epsilon: 1.0    steps: 912  evaluation reward: 1083.1
Training network. lr: 0.000093. clip: 0.037184
Iteration 20491: Policy loss: 0.003787. Value loss: 0.159150. Entropy: 1.181927.
Iteration 20492: Policy loss: 0.003469. Value loss: 0.084451. Entropy: 1.158235.
Iteration 20493: Policy loss: -0.000336. Value loss: 0.071966. Entropy: 1.154722.
Training network. lr: 0.000093. clip: 0.037184
Iteration 20494: Policy loss: 0.003490. Value loss: 0.283572. Entropy: 1.028676.
Iteration 20495: Policy loss: 0.002259. Value loss: 0.210990. Entropy: 1.039118.
Iteration 20496: Policy loss: 0.002614. Value loss: 0.179146. Entropy: 1.026447.
episode: 7147   score: 1170.0  epsilon: 1.0    steps: 168  evaluation reward: 1085.1
episode: 7148   score: 860.0  epsilon: 1.0    steps: 560  evaluation reward: 1080.8
Training network. lr: 0.000093. clip: 0.037184
Iteration 20497: Policy loss: 0.00

Iteration 20555: Policy loss: 0.002993. Value loss: 1.610332. Entropy: 1.003122.
Iteration 20556: Policy loss: 0.003329. Value loss: 1.467083. Entropy: 0.988452.
episode: 7168   score: 860.0  epsilon: 1.0    steps: 928  evaluation reward: 1105.7
Training network. lr: 0.000092. clip: 0.036880
Iteration 20557: Policy loss: 0.004300. Value loss: 0.253624. Entropy: 0.954622.
Iteration 20558: Policy loss: 0.001816. Value loss: 0.169270. Entropy: 0.949123.
Iteration 20559: Policy loss: -0.001642. Value loss: 0.142567. Entropy: 0.949313.
Training network. lr: 0.000092. clip: 0.036880
Iteration 20560: Policy loss: 0.001664. Value loss: 0.327787. Entropy: 1.228022.
Iteration 20561: Policy loss: -0.000947. Value loss: 0.218174. Entropy: 1.218933.
Iteration 20562: Policy loss: -0.001164. Value loss: 0.170630. Entropy: 1.213884.
episode: 7169   score: 1480.0  epsilon: 1.0    steps: 304  evaluation reward: 1115.4
episode: 7170   score: 700.0  epsilon: 1.0    steps: 544  evaluation reward: 1113.5
Tr

Iteration 20621: Policy loss: -0.001482. Value loss: 0.097486. Entropy: 1.204171.
Iteration 20622: Policy loss: -0.003457. Value loss: 0.085943. Entropy: 1.202111.
Training network. lr: 0.000092. clip: 0.036723
Iteration 20623: Policy loss: 0.003524. Value loss: 0.277622. Entropy: 1.038211.
Iteration 20624: Policy loss: 0.000599. Value loss: 0.224364. Entropy: 1.038039.
Iteration 20625: Policy loss: -0.000686. Value loss: 0.198745. Entropy: 1.046986.
episode: 7190   score: 1500.0  epsilon: 1.0    steps: 936  evaluation reward: 1110.6
Training network. lr: 0.000092. clip: 0.036723
Iteration 20626: Policy loss: -0.000696. Value loss: 0.155156. Entropy: 1.060430.
Iteration 20627: Policy loss: 0.000471. Value loss: 0.093534. Entropy: 1.062535.
Iteration 20628: Policy loss: 0.000194. Value loss: 0.072452. Entropy: 1.059586.
episode: 7191   score: 1880.0  epsilon: 1.0    steps: 56  evaluation reward: 1125.1
episode: 7192   score: 1090.0  epsilon: 1.0    steps: 840  evaluation reward: 1125.1


Iteration 20687: Policy loss: -0.002213. Value loss: 0.135628. Entropy: 1.207365.
Iteration 20688: Policy loss: 0.001077. Value loss: 0.117802. Entropy: 1.208772.
episode: 7211   score: 1450.0  epsilon: 1.0    steps: 440  evaluation reward: 1158.3
episode: 7212   score: 1030.0  epsilon: 1.0    steps: 472  evaluation reward: 1157.4
episode: 7213   score: 1110.0  epsilon: 1.0    steps: 592  evaluation reward: 1159.9
Training network. lr: 0.000091. clip: 0.036566
Iteration 20689: Policy loss: 0.005835. Value loss: 0.380686. Entropy: 1.200258.
Iteration 20690: Policy loss: 0.002486. Value loss: 0.275495. Entropy: 1.191512.
Iteration 20691: Policy loss: 0.002107. Value loss: 0.226325. Entropy: 1.190185.
Training network. lr: 0.000091. clip: 0.036566
Iteration 20692: Policy loss: 0.006945. Value loss: 0.369523. Entropy: 1.237883.
Iteration 20693: Policy loss: 0.004075. Value loss: 0.231496. Entropy: 1.240875.
Iteration 20694: Policy loss: 0.001435. Value loss: 0.200123. Entropy: 1.232915.
ep

Training network. lr: 0.000091. clip: 0.036262
Iteration 20752: Policy loss: 0.004036. Value loss: 0.416507. Entropy: 1.481045.
Iteration 20753: Policy loss: 0.006474. Value loss: 0.266938. Entropy: 1.487206.
Iteration 20754: Policy loss: 0.004929. Value loss: 0.227050. Entropy: 1.484784.
episode: 7234   score: 780.0  epsilon: 1.0    steps: 216  evaluation reward: 1166.6
Training network. lr: 0.000091. clip: 0.036262
Iteration 20755: Policy loss: 0.002560. Value loss: 0.243335. Entropy: 1.450197.
Iteration 20756: Policy loss: 0.003951. Value loss: 0.163485. Entropy: 1.460032.
Iteration 20757: Policy loss: 0.000840. Value loss: 0.147828. Entropy: 1.464577.
Training network. lr: 0.000091. clip: 0.036262
Iteration 20758: Policy loss: 0.005897. Value loss: 0.317654. Entropy: 1.339520.
Iteration 20759: Policy loss: 0.004363. Value loss: 0.229899. Entropy: 1.347183.
Iteration 20760: Policy loss: 0.001104. Value loss: 0.175834. Entropy: 1.341835.
Training network. lr: 0.000091. clip: 0.036262

Iteration 20819: Policy loss: 0.000808. Value loss: 0.162680. Entropy: 1.141008.
Iteration 20820: Policy loss: 0.001785. Value loss: 0.136508. Entropy: 1.140240.
Training network. lr: 0.000090. clip: 0.036105
Iteration 20821: Policy loss: 0.001921. Value loss: 0.320735. Entropy: 1.053437.
Iteration 20822: Policy loss: 0.001138. Value loss: 0.250554. Entropy: 1.056610.
Iteration 20823: Policy loss: 0.000978. Value loss: 0.207246. Entropy: 1.052911.
episode: 7254   score: 970.0  epsilon: 1.0    steps: 8  evaluation reward: 1151.7
episode: 7255   score: 1120.0  epsilon: 1.0    steps: 656  evaluation reward: 1153.1
episode: 7256   score: 1120.0  epsilon: 1.0    steps: 680  evaluation reward: 1152.8
episode: 7257   score: 1230.0  epsilon: 1.0    steps: 784  evaluation reward: 1158.4
Training network. lr: 0.000090. clip: 0.036105
Iteration 20824: Policy loss: 0.004919. Value loss: 0.361792. Entropy: 0.874253.
Iteration 20825: Policy loss: 0.001011. Value loss: 0.230327. Entropy: 0.885779.
It

Iteration 20886: Policy loss: 0.001072. Value loss: 0.234891. Entropy: 0.588881.
episode: 7275   score: 830.0  epsilon: 1.0    steps: 784  evaluation reward: 1121.0
Training network. lr: 0.000090. clip: 0.035958
Iteration 20887: Policy loss: 0.004262. Value loss: 0.164622. Entropy: 0.451381.
Iteration 20888: Policy loss: 0.004103. Value loss: 0.080493. Entropy: 0.440744.
Iteration 20889: Policy loss: 0.001915. Value loss: 0.073180. Entropy: 0.440390.
episode: 7276   score: 1220.0  epsilon: 1.0    steps: 704  evaluation reward: 1122.0
Training network. lr: 0.000090. clip: 0.035958
Iteration 20890: Policy loss: 0.000268. Value loss: 0.352164. Entropy: 0.334872.
Iteration 20891: Policy loss: 0.001380. Value loss: 0.280545. Entropy: 0.365786.
Iteration 20892: Policy loss: 0.000644. Value loss: 0.237217. Entropy: 0.360865.
episode: 7277   score: 1540.0  epsilon: 1.0    steps: 432  evaluation reward: 1125.0
Training network. lr: 0.000090. clip: 0.035958
Iteration 20893: Policy loss: 0.000800

Iteration 20951: Policy loss: 0.004519. Value loss: 0.325323. Entropy: 0.991517.
Iteration 20952: Policy loss: 0.002056. Value loss: 0.314843. Entropy: 0.999476.
episode: 7298   score: 1140.0  epsilon: 1.0    steps: 200  evaluation reward: 1165.6
episode: 7299   score: 1270.0  epsilon: 1.0    steps: 400  evaluation reward: 1163.7
Training network. lr: 0.000089. clip: 0.035645
Iteration 20953: Policy loss: 0.007749. Value loss: 0.502703. Entropy: 0.799606.
Iteration 20954: Policy loss: 0.005597. Value loss: 0.287703. Entropy: 0.813921.
Iteration 20955: Policy loss: 0.001443. Value loss: 0.245149. Entropy: 0.813140.
Training network. lr: 0.000089. clip: 0.035645
Iteration 20956: Policy loss: 0.005967. Value loss: 0.436321. Entropy: 0.863527.
Iteration 20957: Policy loss: 0.008340. Value loss: 0.288273. Entropy: 0.877740.
Iteration 20958: Policy loss: 0.006367. Value loss: 0.247975. Entropy: 0.869689.
Training network. lr: 0.000089. clip: 0.035645
Iteration 20959: Policy loss: 0.005551. V

Iteration 21016: Policy loss: 0.001100. Value loss: 0.075091. Entropy: 1.054488.
Iteration 21017: Policy loss: -0.001783. Value loss: 0.042803. Entropy: 1.052127.
Iteration 21018: Policy loss: -0.001820. Value loss: 0.035137. Entropy: 1.043213.
episode: 7320   score: 1160.0  epsilon: 1.0    steps: 408  evaluation reward: 1155.9
episode: 7321   score: 680.0  epsilon: 1.0    steps: 432  evaluation reward: 1154.3
Training network. lr: 0.000089. clip: 0.035497
Iteration 21019: Policy loss: 0.002069. Value loss: 0.500120. Entropy: 0.862787.
Iteration 21020: Policy loss: 0.008446. Value loss: 0.329854. Entropy: 0.873894.
Iteration 21021: Policy loss: 0.002248. Value loss: 0.275288. Entropy: 0.869774.
Training network. lr: 0.000089. clip: 0.035497
Iteration 21022: Policy loss: 0.007588. Value loss: 0.940772. Entropy: 0.684974.
Iteration 21023: Policy loss: 0.002754. Value loss: 0.707104. Entropy: 0.688865.
Iteration 21024: Policy loss: 0.002561. Value loss: 0.595362. Entropy: 0.672088.
episod

Iteration 21084: Policy loss: 0.000959. Value loss: 0.110630. Entropy: 0.454841.
episode: 7340   score: 1160.0  epsilon: 1.0    steps: 824  evaluation reward: 1164.6
Training network. lr: 0.000088. clip: 0.035341
Iteration 21085: Policy loss: 0.001484. Value loss: 0.071879. Entropy: 0.581135.
Iteration 21086: Policy loss: -0.000543. Value loss: 0.044554. Entropy: 0.594606.
Iteration 21087: Policy loss: -0.000529. Value loss: 0.039827. Entropy: 0.590999.
episode: 7341   score: 1290.0  epsilon: 1.0    steps: 552  evaluation reward: 1165.3
Training network. lr: 0.000088. clip: 0.035341
Iteration 21088: Policy loss: 0.001241. Value loss: 0.538596. Entropy: 0.465882.
Iteration 21089: Policy loss: -0.000058. Value loss: 0.460045. Entropy: 0.496842.
Iteration 21090: Policy loss: -0.000872. Value loss: 0.440698. Entropy: 0.487002.
episode: 7342   score: 1160.0  epsilon: 1.0    steps: 576  evaluation reward: 1164.6
Training network. lr: 0.000088. clip: 0.035341
Iteration 21091: Policy loss: -0.

Iteration 21149: Policy loss: 0.001865. Value loss: 0.153700. Entropy: 0.847034.
Iteration 21150: Policy loss: 0.003392. Value loss: 0.132449. Entropy: 0.843967.
episode: 7362   score: 3390.0  epsilon: 1.0    steps: 960  evaluation reward: 1211.5
Training network. lr: 0.000088. clip: 0.035036
Iteration 21151: Policy loss: 0.001957. Value loss: 0.244176. Entropy: 0.920942.
Iteration 21152: Policy loss: 0.000750. Value loss: 0.194685. Entropy: 0.935201.
Iteration 21153: Policy loss: -0.002538. Value loss: 0.167769. Entropy: 0.946945.
episode: 7363   score: 1170.0  epsilon: 1.0    steps: 296  evaluation reward: 1212.5
episode: 7364   score: 1010.0  epsilon: 1.0    steps: 808  evaluation reward: 1204.3
Training network. lr: 0.000088. clip: 0.035036
Iteration 21154: Policy loss: 0.006062. Value loss: 0.590425. Entropy: 1.002876.
Iteration 21155: Policy loss: 0.003441. Value loss: 0.463058. Entropy: 1.009426.
Iteration 21156: Policy loss: 0.003694. Value loss: 0.435790. Entropy: 0.996914.
ep

Training network. lr: 0.000087. clip: 0.034880
Iteration 21214: Policy loss: 0.002481. Value loss: 0.247068. Entropy: 1.148059.
Iteration 21215: Policy loss: -0.000220. Value loss: 0.205233. Entropy: 1.140075.
Iteration 21216: Policy loss: -0.000555. Value loss: 0.181118. Entropy: 1.139236.
episode: 7385   score: 520.0  epsilon: 1.0    steps: 232  evaluation reward: 1202.5
Training network. lr: 0.000087. clip: 0.034880
Iteration 21217: Policy loss: 0.009990. Value loss: 0.864226. Entropy: 1.020525.
Iteration 21218: Policy loss: 0.007362. Value loss: 0.674229. Entropy: 1.023078.
Iteration 21219: Policy loss: 0.002625. Value loss: 0.620518. Entropy: 1.048724.
Training network. lr: 0.000087. clip: 0.034880
Iteration 21220: Policy loss: 0.000296. Value loss: 0.183967. Entropy: 0.912892.
Iteration 21221: Policy loss: 0.000325. Value loss: 0.116610. Entropy: 0.901198.
Iteration 21222: Policy loss: -0.001176. Value loss: 0.098084. Entropy: 0.921627.
episode: 7386   score: 1080.0  epsilon: 1.0

Training network. lr: 0.000087. clip: 0.034723
Iteration 21280: Policy loss: 0.003873. Value loss: 0.250528. Entropy: 1.082126.
Iteration 21281: Policy loss: 0.002413. Value loss: 0.168581. Entropy: 1.085753.
Iteration 21282: Policy loss: 0.000845. Value loss: 0.139887. Entropy: 1.095832.
episode: 7406   score: 1810.0  epsilon: 1.0    steps: 400  evaluation reward: 1195.5
Training network. lr: 0.000087. clip: 0.034723
Iteration 21283: Policy loss: 0.001729. Value loss: 1.078727. Entropy: 1.025303.
Iteration 21284: Policy loss: 0.001373. Value loss: 0.839221. Entropy: 1.012308.
Iteration 21285: Policy loss: 0.000724. Value loss: 0.763151. Entropy: 1.014351.
episode: 7407   score: 1280.0  epsilon: 1.0    steps: 768  evaluation reward: 1191.5
Training network. lr: 0.000087. clip: 0.034723
Iteration 21286: Policy loss: 0.008399. Value loss: 0.183845. Entropy: 0.990693.
Iteration 21287: Policy loss: 0.004060. Value loss: 0.091270. Entropy: 1.008869.
Iteration 21288: Policy loss: 0.000969. V

Iteration 21346: Policy loss: 0.000125. Value loss: 0.184555. Entropy: 1.050200.
Iteration 21347: Policy loss: -0.001302. Value loss: 0.133847. Entropy: 1.036546.
Iteration 21348: Policy loss: -0.002587. Value loss: 0.119354. Entropy: 1.032110.
episode: 7427   score: 880.0  epsilon: 1.0    steps: 296  evaluation reward: 1210.6
episode: 7428   score: 1080.0  epsilon: 1.0    steps: 440  evaluation reward: 1210.6
episode: 7429   score: 580.0  epsilon: 1.0    steps: 640  evaluation reward: 1206.3
Training network. lr: 0.000086. clip: 0.034576
Iteration 21349: Policy loss: -0.000316. Value loss: 0.168694. Entropy: 0.692492.
Iteration 21350: Policy loss: -0.001202. Value loss: 0.138196. Entropy: 0.689584.
Iteration 21351: Policy loss: -0.003003. Value loss: 0.121753. Entropy: 0.689283.
episode: 7430   score: 1260.0  epsilon: 1.0    steps: 552  evaluation reward: 1204.5
episode: 7431   score: 1280.0  epsilon: 1.0    steps: 928  evaluation reward: 1206.6
Training network. lr: 0.000086. clip: 0

episode: 7449   score: 770.0  epsilon: 1.0    steps: 624  evaluation reward: 1185.0
Training network. lr: 0.000086. clip: 0.034262
Iteration 21412: Policy loss: 0.004769. Value loss: 0.302003. Entropy: 1.176902.
Iteration 21413: Policy loss: 0.001813. Value loss: 0.214582. Entropy: 1.177218.
Iteration 21414: Policy loss: 0.001139. Value loss: 0.191360. Entropy: 1.175020.
Training network. lr: 0.000086. clip: 0.034262
Iteration 21415: Policy loss: 0.001852. Value loss: 0.212346. Entropy: 0.857822.
Iteration 21416: Policy loss: 0.000373. Value loss: 0.139174. Entropy: 0.860984.
Iteration 21417: Policy loss: -0.003662. Value loss: 0.120664. Entropy: 0.858274.
episode: 7450   score: 1090.0  epsilon: 1.0    steps: 1008  evaluation reward: 1185.1
Training network. lr: 0.000086. clip: 0.034262
Iteration 21418: Policy loss: 0.004047. Value loss: 0.087591. Entropy: 1.043757.
Iteration 21419: Policy loss: 0.000872. Value loss: 0.054167. Entropy: 1.033428.
Iteration 21420: Policy loss: 0.002210. 

episode: 7470   score: 920.0  epsilon: 1.0    steps: 760  evaluation reward: 1179.6
episode: 7471   score: 1210.0  epsilon: 1.0    steps: 800  evaluation reward: 1177.3
Training network. lr: 0.000085. clip: 0.034115
Iteration 21478: Policy loss: 0.008843. Value loss: 1.332474. Entropy: 1.035660.
Iteration 21479: Policy loss: 0.016073. Value loss: 0.974952. Entropy: 1.013646.
Iteration 21480: Policy loss: 0.012386. Value loss: 0.849406. Entropy: 1.022763.
Training network. lr: 0.000085. clip: 0.034115
Iteration 21481: Policy loss: 0.003181. Value loss: 0.297057. Entropy: 0.678440.
Iteration 21482: Policy loss: -0.001784. Value loss: 0.190843. Entropy: 0.672409.
Iteration 21483: Policy loss: -0.001056. Value loss: 0.174640. Entropy: 0.670047.
episode: 7472   score: 990.0  epsilon: 1.0    steps: 136  evaluation reward: 1173.5
episode: 7473   score: 1210.0  epsilon: 1.0    steps: 240  evaluation reward: 1174.9
Training network. lr: 0.000085. clip: 0.034115
Iteration 21484: Policy loss: 0.0

Iteration 21543: Policy loss: 0.000115. Value loss: 0.092865. Entropy: 1.160067.
Training network. lr: 0.000085. clip: 0.033958
Iteration 21544: Policy loss: 0.003592. Value loss: 0.333196. Entropy: 1.424475.
Iteration 21545: Policy loss: 0.003339. Value loss: 0.229698. Entropy: 1.426144.
Iteration 21546: Policy loss: 0.003603. Value loss: 0.174255. Entropy: 1.418555.
Training network. lr: 0.000085. clip: 0.033958
Iteration 21547: Policy loss: 0.008934. Value loss: 0.459335. Entropy: 1.444880.
Iteration 21548: Policy loss: 0.009082. Value loss: 0.306448. Entropy: 1.465792.
Iteration 21549: Policy loss: 0.009961. Value loss: 0.247007. Entropy: 1.458187.
episode: 7493   score: 1130.0  epsilon: 1.0    steps: 64  evaluation reward: 1170.7
episode: 7494   score: 920.0  epsilon: 1.0    steps: 304  evaluation reward: 1167.9
episode: 7495   score: 570.0  epsilon: 1.0    steps: 544  evaluation reward: 1163.6
episode: 7496   score: 1420.0  epsilon: 1.0    steps: 1024  evaluation reward: 1162.9
T

Iteration 21607: Policy loss: 0.002343. Value loss: 0.250956. Entropy: 1.434432.
Iteration 21608: Policy loss: 0.002218. Value loss: 0.185099. Entropy: 1.452515.
Iteration 21609: Policy loss: 0.001045. Value loss: 0.142122. Entropy: 1.449333.
Training network. lr: 0.000084. clip: 0.033654
Iteration 21610: Policy loss: 0.003500. Value loss: 0.204224. Entropy: 1.307868.
Iteration 21611: Policy loss: 0.002960. Value loss: 0.136097. Entropy: 1.287315.
Iteration 21612: Policy loss: 0.000812. Value loss: 0.110950. Entropy: 1.293267.
episode: 7516   score: 720.0  epsilon: 1.0    steps: 80  evaluation reward: 1113.2
episode: 7517   score: 1070.0  epsilon: 1.0    steps: 720  evaluation reward: 1108.6
Training network. lr: 0.000084. clip: 0.033654
Iteration 21613: Policy loss: 0.000885. Value loss: 0.122711. Entropy: 1.219026.
Iteration 21614: Policy loss: -0.001840. Value loss: 0.109288. Entropy: 1.208443.
Iteration 21615: Policy loss: -0.002059. Value loss: 0.097153. Entropy: 1.223145.
episode

Training network. lr: 0.000084. clip: 0.033497
Iteration 21673: Policy loss: 0.002769. Value loss: 0.254360. Entropy: 1.587737.
Iteration 21674: Policy loss: 0.002898. Value loss: 0.170230. Entropy: 1.586561.
Iteration 21675: Policy loss: 0.001548. Value loss: 0.145167. Entropy: 1.586198.
Training network. lr: 0.000084. clip: 0.033497
Iteration 21676: Policy loss: 0.004073. Value loss: 1.198138. Entropy: 1.330217.
Iteration 21677: Policy loss: 0.006541. Value loss: 1.090668. Entropy: 1.312390.
Iteration 21678: Policy loss: 0.005653. Value loss: 1.032678. Entropy: 1.324139.
episode: 7538   score: 1060.0  epsilon: 1.0    steps: 840  evaluation reward: 1126.6
Training network. lr: 0.000084. clip: 0.033497
Iteration 21679: Policy loss: 0.009164. Value loss: 0.327826. Entropy: 1.028624.
Iteration 21680: Policy loss: 0.007480. Value loss: 0.227279. Entropy: 1.017243.
Iteration 21681: Policy loss: 0.002519. Value loss: 0.203848. Entropy: 1.010178.
episode: 7539   score: 1270.0  epsilon: 1.0  

Training network. lr: 0.000083. clip: 0.033341
Iteration 21739: Policy loss: 0.001921. Value loss: 0.170575. Entropy: 1.035920.
Iteration 21740: Policy loss: 0.001331. Value loss: 0.127249. Entropy: 1.020760.
Iteration 21741: Policy loss: -0.001065. Value loss: 0.110947. Entropy: 1.028961.
episode: 7560   score: 920.0  epsilon: 1.0    steps: 200  evaluation reward: 1114.9
Training network. lr: 0.000083. clip: 0.033341
Iteration 21742: Policy loss: 0.001951. Value loss: 0.173912. Entropy: 1.060228.
Iteration 21743: Policy loss: 0.000723. Value loss: 0.127818. Entropy: 1.062547.
Iteration 21744: Policy loss: -0.001658. Value loss: 0.116073. Entropy: 1.066103.
Training network. lr: 0.000083. clip: 0.033341
Iteration 21745: Policy loss: 0.009528. Value loss: 0.262939. Entropy: 1.165892.
Iteration 21746: Policy loss: 0.006598. Value loss: 0.184480. Entropy: 1.164178.
Iteration 21747: Policy loss: 0.005252. Value loss: 0.152407. Entropy: 1.179176.
episode: 7561   score: 1080.0  epsilon: 1.0 

Iteration 21805: Policy loss: 0.004645. Value loss: 0.246494. Entropy: 1.028640.
Iteration 21806: Policy loss: 0.000211. Value loss: 0.198012. Entropy: 1.018666.
Iteration 21807: Policy loss: -0.000628. Value loss: 0.184738. Entropy: 1.035489.
episode: 7581   score: 890.0  epsilon: 1.0    steps: 128  evaluation reward: 1071.4
episode: 7582   score: 1330.0  epsilon: 1.0    steps: 480  evaluation reward: 1072.3
episode: 7583   score: 1140.0  epsilon: 1.0    steps: 928  evaluation reward: 1072.7
Training network. lr: 0.000083. clip: 0.033037
Iteration 21808: Policy loss: 0.001565. Value loss: 0.218647. Entropy: 1.068052.
Iteration 21809: Policy loss: 0.000100. Value loss: 0.155702. Entropy: 1.072486.
Iteration 21810: Policy loss: -0.001986. Value loss: 0.148769. Entropy: 1.060436.
Training network. lr: 0.000083. clip: 0.033037
Iteration 21811: Policy loss: 0.002480. Value loss: 0.185753. Entropy: 1.333933.
Iteration 21812: Policy loss: 0.001767. Value loss: 0.161605. Entropy: 1.343526.
It

Training network. lr: 0.000082. clip: 0.032880
Iteration 21871: Policy loss: 0.005536. Value loss: 0.371405. Entropy: 1.219890.
Iteration 21872: Policy loss: 0.004249. Value loss: 0.282647. Entropy: 1.222477.
Iteration 21873: Policy loss: 0.006297. Value loss: 0.244409. Entropy: 1.214051.
episode: 7603   score: 570.0  epsilon: 1.0    steps: 832  evaluation reward: 1095.1
Training network. lr: 0.000082. clip: 0.032880
Iteration 21874: Policy loss: 0.002236. Value loss: 0.184373. Entropy: 1.053884.
Iteration 21875: Policy loss: -0.000649. Value loss: 0.151088. Entropy: 1.044170.
Iteration 21876: Policy loss: -0.002550. Value loss: 0.131269. Entropy: 1.044910.
episode: 7604   score: 1290.0  epsilon: 1.0    steps: 152  evaluation reward: 1097.8
episode: 7605   score: 1110.0  epsilon: 1.0    steps: 624  evaluation reward: 1099.3
episode: 7606   score: 1400.0  epsilon: 1.0    steps: 872  evaluation reward: 1110.6
episode: 7607   score: 1020.0  epsilon: 1.0    steps: 904  evaluation reward: 1

Iteration 21936: Policy loss: 0.000349. Value loss: 0.093469. Entropy: 1.429309.
episode: 7626   score: 740.0  epsilon: 1.0    steps: 72  evaluation reward: 1106.5
episode: 7627   score: 780.0  epsilon: 1.0    steps: 656  evaluation reward: 1099.7
Training network. lr: 0.000082. clip: 0.032732
Iteration 21937: Policy loss: 0.005108. Value loss: 0.534977. Entropy: 1.361506.
Iteration 21938: Policy loss: 0.008648. Value loss: 0.327572. Entropy: 1.374151.
Iteration 21939: Policy loss: 0.006972. Value loss: 0.263012. Entropy: 1.385677.
Training network. lr: 0.000082. clip: 0.032732
Iteration 21940: Policy loss: 0.005914. Value loss: 0.315858. Entropy: 1.329666.
Iteration 21941: Policy loss: 0.006342. Value loss: 0.153044. Entropy: 1.321325.
Iteration 21942: Policy loss: 0.001060. Value loss: 0.119961. Entropy: 1.318732.
Training network. lr: 0.000082. clip: 0.032732
Iteration 21943: Policy loss: 0.007723. Value loss: 0.327393. Entropy: 1.429343.
Iteration 21944: Policy loss: 0.007258. Valu

Training network. lr: 0.000081. clip: 0.032419
Iteration 22003: Policy loss: 0.002060. Value loss: 0.205668. Entropy: 0.871665.
Iteration 22004: Policy loss: 0.001118. Value loss: 0.140223. Entropy: 0.877605.
Iteration 22005: Policy loss: -0.001426. Value loss: 0.126744. Entropy: 0.863543.
Training network. lr: 0.000081. clip: 0.032419
Iteration 22006: Policy loss: 0.002411. Value loss: 0.250373. Entropy: 0.934087.
Iteration 22007: Policy loss: 0.001204. Value loss: 0.148048. Entropy: 0.924476.
Iteration 22008: Policy loss: -0.000005. Value loss: 0.122901. Entropy: 0.911892.
episode: 7647   score: 1200.0  epsilon: 1.0    steps: 856  evaluation reward: 1072.6
Training network. lr: 0.000081. clip: 0.032419
Iteration 22009: Policy loss: 0.002536. Value loss: 0.228603. Entropy: 0.899283.
Iteration 22010: Policy loss: 0.002908. Value loss: 0.159041. Entropy: 0.904665.
Iteration 22011: Policy loss: 0.001189. Value loss: 0.139239. Entropy: 0.899494.
episode: 7648   score: 1000.0  epsilon: 1.0

episode: 7668   score: 1300.0  epsilon: 1.0    steps: 880  evaluation reward: 1087.6
Training network. lr: 0.000081. clip: 0.032272
Iteration 22069: Policy loss: 0.003637. Value loss: 0.548971. Entropy: 1.239595.
Iteration 22070: Policy loss: 0.005205. Value loss: 0.458419. Entropy: 1.227114.
Iteration 22071: Policy loss: 0.002395. Value loss: 0.398858. Entropy: 1.249911.
Training network. lr: 0.000081. clip: 0.032272
Iteration 22072: Policy loss: 0.005722. Value loss: 0.423378. Entropy: 0.869621.
Iteration 22073: Policy loss: 0.006213. Value loss: 0.314833. Entropy: 0.867462.
Iteration 22074: Policy loss: 0.003719. Value loss: 0.279421. Entropy: 0.863220.
Training network. lr: 0.000081. clip: 0.032272
Iteration 22075: Policy loss: 0.001864. Value loss: 0.267560. Entropy: 0.944398.
Iteration 22076: Policy loss: -0.000672. Value loss: 0.169192. Entropy: 0.927959.
Iteration 22077: Policy loss: -0.002706. Value loss: 0.136458. Entropy: 0.924259.
episode: 7669   score: 1420.0  epsilon: 1.0

Iteration 22136: Policy loss: 0.003211. Value loss: 0.233995. Entropy: 1.180665.
Iteration 22137: Policy loss: 0.003097. Value loss: 0.201215. Entropy: 1.185728.
episode: 7688   score: 1020.0  epsilon: 1.0    steps: 560  evaluation reward: 1164.5
Training network. lr: 0.000080. clip: 0.032115
Iteration 22138: Policy loss: 0.002806. Value loss: 0.242025. Entropy: 1.036253.
Iteration 22139: Policy loss: 0.000726. Value loss: 0.152607. Entropy: 1.035262.
Iteration 22140: Policy loss: -0.001399. Value loss: 0.144430. Entropy: 1.041309.
episode: 7689   score: 1720.0  epsilon: 1.0    steps: 488  evaluation reward: 1165.7
Training network. lr: 0.000080. clip: 0.032115
Iteration 22141: Policy loss: 0.000043. Value loss: 0.128590. Entropy: 0.756160.
Iteration 22142: Policy loss: -0.000014. Value loss: 0.113021. Entropy: 0.772869.
Iteration 22143: Policy loss: -0.001615. Value loss: 0.102341. Entropy: 0.772947.
episode: 7690   score: 1020.0  epsilon: 1.0    steps: 56  evaluation reward: 1160.0
e

episode: 7712   score: 1170.0  epsilon: 1.0    steps: 936  evaluation reward: 1183.7
Training network. lr: 0.000080. clip: 0.031811
Iteration 22201: Policy loss: 0.001380. Value loss: 0.271657. Entropy: 0.839553.
Iteration 22202: Policy loss: 0.000904. Value loss: 0.242061. Entropy: 0.853207.
Iteration 22203: Policy loss: 0.000745. Value loss: 0.219544. Entropy: 0.850308.
Training network. lr: 0.000080. clip: 0.031811
Iteration 22204: Policy loss: 0.004061. Value loss: 0.316468. Entropy: 1.062962.
Iteration 22205: Policy loss: 0.002456. Value loss: 0.263402. Entropy: 1.063594.
Iteration 22206: Policy loss: 0.000917. Value loss: 0.242923. Entropy: 1.064323.
episode: 7713   score: 1270.0  epsilon: 1.0    steps: 976  evaluation reward: 1185.3
Training network. lr: 0.000080. clip: 0.031811
Iteration 22207: Policy loss: 0.002240. Value loss: 0.322595. Entropy: 1.378935.
Iteration 22208: Policy loss: 0.006672. Value loss: 0.249808. Entropy: 1.390740.
Iteration 22209: Policy loss: 0.003127. V

Iteration 22268: Policy loss: 0.000290. Value loss: 0.224875. Entropy: 0.782873.
Iteration 22269: Policy loss: -0.002212. Value loss: 0.220264. Entropy: 0.793053.
episode: 7732   score: 1830.0  epsilon: 1.0    steps: 232  evaluation reward: 1208.4
Training network. lr: 0.000079. clip: 0.031654
Iteration 22270: Policy loss: 0.000460. Value loss: 0.277503. Entropy: 0.785232.
Iteration 22271: Policy loss: -0.000036. Value loss: 0.219600. Entropy: 0.773351.
Iteration 22272: Policy loss: -0.001123. Value loss: 0.204087. Entropy: 0.772128.
episode: 7733   score: 1430.0  epsilon: 1.0    steps: 672  evaluation reward: 1209.3
episode: 7734   score: 1460.0  epsilon: 1.0    steps: 952  evaluation reward: 1209.0
episode: 7735   score: 690.0  epsilon: 1.0    steps: 1024  evaluation reward: 1205.6
Training network. lr: 0.000079. clip: 0.031654
Iteration 22273: Policy loss: 0.004876. Value loss: 0.192912. Entropy: 1.124671.
Iteration 22274: Policy loss: 0.000315. Value loss: 0.122796. Entropy: 1.1274

Iteration 22333: Policy loss: 0.004038. Value loss: 0.252371. Entropy: 0.857719.
Iteration 22334: Policy loss: 0.006913. Value loss: 0.180077. Entropy: 0.870959.
Iteration 22335: Policy loss: 0.005830. Value loss: 0.147062. Entropy: 0.867554.
episode: 7754   score: 1380.0  epsilon: 1.0    steps: 520  evaluation reward: 1227.3
episode: 7755   score: 1240.0  epsilon: 1.0    steps: 608  evaluation reward: 1228.6
Training network. lr: 0.000079. clip: 0.031497
Iteration 22336: Policy loss: 0.001807. Value loss: 0.226677. Entropy: 0.859039.
Iteration 22337: Policy loss: 0.000178. Value loss: 0.169405. Entropy: 0.858586.
Iteration 22338: Policy loss: -0.000681. Value loss: 0.155909. Entropy: 0.854973.
episode: 7756   score: 920.0  epsilon: 1.0    steps: 368  evaluation reward: 1222.0
Training network. lr: 0.000079. clip: 0.031497
Iteration 22339: Policy loss: 0.002052. Value loss: 0.486346. Entropy: 0.681627.
Iteration 22340: Policy loss: 0.001817. Value loss: 0.400927. Entropy: 0.663898.
Ite

Iteration 22400: Policy loss: 0.004506. Value loss: 0.135861. Entropy: 0.820292.
Iteration 22401: Policy loss: -0.000196. Value loss: 0.113708. Entropy: 0.836217.
episode: 7775   score: 1460.0  epsilon: 1.0    steps: 376  evaluation reward: 1250.5
Training network. lr: 0.000078. clip: 0.031193
Iteration 22402: Policy loss: 0.007478. Value loss: 1.063246. Entropy: 1.024114.
Iteration 22403: Policy loss: 0.009647. Value loss: 0.889201. Entropy: 1.035119.
Iteration 22404: Policy loss: 0.005186. Value loss: 0.869793. Entropy: 1.026651.
episode: 7776   score: 1570.0  epsilon: 1.0    steps: 728  evaluation reward: 1238.8
episode: 7777   score: 1360.0  epsilon: 1.0    steps: 992  evaluation reward: 1242.2
Training network. lr: 0.000078. clip: 0.031193
Iteration 22405: Policy loss: 0.002076. Value loss: 0.135733. Entropy: 0.968687.
Iteration 22406: Policy loss: 0.000029. Value loss: 0.097095. Entropy: 0.963415.
Iteration 22407: Policy loss: 0.001056. Value loss: 0.070761. Entropy: 0.962338.
ep

episode: 7798   score: 1260.0  epsilon: 1.0    steps: 320  evaluation reward: 1215.4
episode: 7799   score: 1480.0  epsilon: 1.0    steps: 544  evaluation reward: 1214.6
Training network. lr: 0.000078. clip: 0.031037
Iteration 22465: Policy loss: 0.008653. Value loss: 1.048736. Entropy: 0.978868.
Iteration 22466: Policy loss: 0.006291. Value loss: 0.914642. Entropy: 0.972884.
Iteration 22467: Policy loss: 0.004890. Value loss: 0.825542. Entropy: 0.963445.
Training network. lr: 0.000078. clip: 0.031037
Iteration 22468: Policy loss: 0.002337. Value loss: 0.142893. Entropy: 1.067683.
Iteration 22469: Policy loss: 0.003554. Value loss: 0.093471. Entropy: 1.063543.
Iteration 22470: Policy loss: 0.000427. Value loss: 0.076760. Entropy: 1.072481.
Training network. lr: 0.000078. clip: 0.031037
Iteration 22471: Policy loss: 0.001881. Value loss: 0.128011. Entropy: 1.405380.
Iteration 22472: Policy loss: 0.000032. Value loss: 0.093397. Entropy: 1.413021.
Iteration 22473: Policy loss: 0.000298. V

Iteration 22529: Policy loss: 0.003182. Value loss: 0.388584. Entropy: 1.187771.
Iteration 22530: Policy loss: 0.002307. Value loss: 0.354224. Entropy: 1.190670.
Training network. lr: 0.000077. clip: 0.030889
Iteration 22531: Policy loss: 0.003877. Value loss: 0.627333. Entropy: 0.972850.
Iteration 22532: Policy loss: 0.002060. Value loss: 0.519318. Entropy: 0.948112.
Iteration 22533: Policy loss: 0.000686. Value loss: 0.486433. Entropy: 0.952427.
episode: 7821   score: 820.0  epsilon: 1.0    steps: 408  evaluation reward: 1209.5
episode: 7822   score: 1420.0  epsilon: 1.0    steps: 432  evaluation reward: 1209.6
Training network. lr: 0.000077. clip: 0.030889
Iteration 22534: Policy loss: 0.002777. Value loss: 0.253142. Entropy: 0.918634.
Iteration 22535: Policy loss: 0.009129. Value loss: 0.144103. Entropy: 0.929480.
Iteration 22536: Policy loss: 0.002869. Value loss: 0.114522. Entropy: 0.907635.
Training network. lr: 0.000077. clip: 0.030889
Iteration 22537: Policy loss: 0.001331. Va

episode: 7841   score: 1150.0  epsilon: 1.0    steps: 456  evaluation reward: 1256.0
Training network. lr: 0.000077. clip: 0.030733
Iteration 22597: Policy loss: 0.001120. Value loss: 0.307521. Entropy: 0.803834.
Iteration 22598: Policy loss: -0.000746. Value loss: 0.266470. Entropy: 0.805448.
Iteration 22599: Policy loss: 0.000441. Value loss: 0.245222. Entropy: 0.815408.
episode: 7842   score: 1420.0  epsilon: 1.0    steps: 80  evaluation reward: 1257.8
Training network. lr: 0.000077. clip: 0.030733
Iteration 22600: Policy loss: 0.002385. Value loss: 0.334909. Entropy: 0.520684.
Iteration 22601: Policy loss: 0.002281. Value loss: 0.280215. Entropy: 0.503501.
Iteration 22602: Policy loss: 0.001127. Value loss: 0.260718. Entropy: 0.499676.
episode: 7843   score: 1240.0  epsilon: 1.0    steps: 688  evaluation reward: 1257.2
Training network. lr: 0.000076. clip: 0.030576
Iteration 22603: Policy loss: 0.005231. Value loss: 0.306971. Entropy: 0.727651.
Iteration 22604: Policy loss: 0.00500

episode: 7862   score: 1420.0  epsilon: 1.0    steps: 448  evaluation reward: 1263.5
Training network. lr: 0.000076. clip: 0.030428
Iteration 22663: Policy loss: 0.001787. Value loss: 0.174813. Entropy: 0.796447.
Iteration 22664: Policy loss: 0.002127. Value loss: 0.096009. Entropy: 0.798009.
Iteration 22665: Policy loss: -0.000796. Value loss: 0.074352. Entropy: 0.802808.
episode: 7863   score: 1920.0  epsilon: 1.0    steps: 688  evaluation reward: 1267.8
episode: 7864   score: 1190.0  epsilon: 1.0    steps: 896  evaluation reward: 1267.1
Training network. lr: 0.000076. clip: 0.030428
Iteration 22666: Policy loss: 0.003953. Value loss: 0.301768. Entropy: 0.746913.
Iteration 22667: Policy loss: 0.002282. Value loss: 0.205517. Entropy: 0.750757.
Iteration 22668: Policy loss: -0.000391. Value loss: 0.179912. Entropy: 0.757633.
episode: 7865   score: 1350.0  epsilon: 1.0    steps: 48  evaluation reward: 1266.2
Training network. lr: 0.000076. clip: 0.030428
Iteration 22669: Policy loss: 0.

Iteration 22728: Policy loss: 0.002004. Value loss: 0.168893. Entropy: 1.323559.
Training network. lr: 0.000076. clip: 0.030272
Iteration 22729: Policy loss: 0.007285. Value loss: 0.457607. Entropy: 1.323874.
Iteration 22730: Policy loss: 0.004925. Value loss: 0.352438. Entropy: 1.326535.
Iteration 22731: Policy loss: 0.000981. Value loss: 0.297589. Entropy: 1.323535.
episode: 7885   score: 1150.0  epsilon: 1.0    steps: 480  evaluation reward: 1242.4
Training network. lr: 0.000076. clip: 0.030272
Iteration 22732: Policy loss: 0.002734. Value loss: 0.091394. Entropy: 0.967213.
Iteration 22733: Policy loss: 0.000953. Value loss: 0.053851. Entropy: 0.977050.
Iteration 22734: Policy loss: -0.002956. Value loss: 0.048218. Entropy: 0.970844.
episode: 7886   score: 1020.0  epsilon: 1.0    steps: 432  evaluation reward: 1239.8
episode: 7887   score: 1310.0  epsilon: 1.0    steps: 656  evaluation reward: 1238.0
episode: 7888   score: 1020.0  epsilon: 1.0    steps: 744  evaluation reward: 1231.

Iteration 22792: Policy loss: 0.002775. Value loss: 0.331168. Entropy: 1.404882.
Iteration 22793: Policy loss: 0.005205. Value loss: 0.230667. Entropy: 1.409927.
Iteration 22794: Policy loss: 0.001819. Value loss: 0.192736. Entropy: 1.411697.
episode: 7908   score: 1090.0  epsilon: 1.0    steps: 736  evaluation reward: 1268.1
Training network. lr: 0.000075. clip: 0.030115
Iteration 22795: Policy loss: -0.000506. Value loss: 0.162497. Entropy: 1.470958.
Iteration 22796: Policy loss: 0.000774. Value loss: 0.114854. Entropy: 1.479632.
Iteration 22797: Policy loss: -0.001304. Value loss: 0.102827. Entropy: 1.473688.
Training network. lr: 0.000075. clip: 0.030115
Iteration 22798: Policy loss: 0.002779. Value loss: 0.149672. Entropy: 1.291277.
Iteration 22799: Policy loss: 0.002388. Value loss: 0.112184. Entropy: 1.301121.
Iteration 22800: Policy loss: 0.002428. Value loss: 0.105895. Entropy: 1.293493.
episode: 7909   score: 900.0  epsilon: 1.0    steps: 744  evaluation reward: 1272.4
Traini

Iteration 22859: Policy loss: 0.000569. Value loss: 0.142410. Entropy: 1.164976.
Iteration 22860: Policy loss: -0.000812. Value loss: 0.126408. Entropy: 1.165785.
episode: 7929   score: 1010.0  epsilon: 1.0    steps: 384  evaluation reward: 1278.6
Training network. lr: 0.000075. clip: 0.029811
Iteration 22861: Policy loss: 0.006651. Value loss: 0.226817. Entropy: 1.085582.
Iteration 22862: Policy loss: 0.004397. Value loss: 0.162848. Entropy: 1.075655.
Iteration 22863: Policy loss: 0.000260. Value loss: 0.135205. Entropy: 1.088068.
Training network. lr: 0.000075. clip: 0.029811
Iteration 22864: Policy loss: 0.002470. Value loss: 0.257896. Entropy: 1.021533.
Iteration 22865: Policy loss: 0.001229. Value loss: 0.180283. Entropy: 1.018978.
Iteration 22866: Policy loss: -0.000695. Value loss: 0.166903. Entropy: 1.024406.
episode: 7930   score: 1520.0  epsilon: 1.0    steps: 272  evaluation reward: 1279.4
episode: 7931   score: 1320.0  epsilon: 1.0    steps: 456  evaluation reward: 1275.7
e

episode: 7952   score: 420.0  epsilon: 1.0    steps: 864  evaluation reward: 1243.0
Training network. lr: 0.000074. clip: 0.029654
Iteration 22924: Policy loss: 0.004288. Value loss: 0.191833. Entropy: 1.207548.
Iteration 22925: Policy loss: 0.000490. Value loss: 0.135148. Entropy: 1.202051.
Iteration 22926: Policy loss: -0.001866. Value loss: 0.125990. Entropy: 1.201619.
Training network. lr: 0.000074. clip: 0.029654
Iteration 22927: Policy loss: 0.000643. Value loss: 0.222933. Entropy: 1.145230.
Iteration 22928: Policy loss: -0.001493. Value loss: 0.177999. Entropy: 1.141289.
Iteration 22929: Policy loss: -0.001349. Value loss: 0.164378. Entropy: 1.144373.
episode: 7953   score: 1230.0  epsilon: 1.0    steps: 888  evaluation reward: 1238.9
Training network. lr: 0.000074. clip: 0.029654
Iteration 22930: Policy loss: 0.001676. Value loss: 0.104551. Entropy: 1.496747.
Iteration 22931: Policy loss: -0.000875. Value loss: 0.075146. Entropy: 1.490301.
Iteration 22932: Policy loss: 0.000999

Iteration 22990: Policy loss: 0.004184. Value loss: 0.232280. Entropy: 1.445454.
Iteration 22991: Policy loss: 0.004044. Value loss: 0.150707. Entropy: 1.451365.
Iteration 22992: Policy loss: 0.002214. Value loss: 0.125807. Entropy: 1.452462.
episode: 7973   score: 550.0  epsilon: 1.0    steps: 776  evaluation reward: 1226.9
episode: 7974   score: 1060.0  epsilon: 1.0    steps: 784  evaluation reward: 1221.5
Training network. lr: 0.000074. clip: 0.029507
Iteration 22993: Policy loss: 0.004071. Value loss: 0.352173. Entropy: 1.066310.
Iteration 22994: Policy loss: 0.004718. Value loss: 0.258416. Entropy: 1.062536.
Iteration 22995: Policy loss: 0.001262. Value loss: 0.232220. Entropy: 1.071847.
episode: 7975   score: 1410.0  epsilon: 1.0    steps: 104  evaluation reward: 1218.3
Training network. lr: 0.000074. clip: 0.029507
Iteration 22996: Policy loss: 0.001140. Value loss: 0.341730. Entropy: 0.689149.
Iteration 22997: Policy loss: 0.001135. Value loss: 0.297939. Entropy: 0.697272.
Iter

episode: 7995   score: 960.0  epsilon: 1.0    steps: 960  evaluation reward: 1202.2
Training network. lr: 0.000073. clip: 0.029193
Iteration 23056: Policy loss: 0.002482. Value loss: 0.659836. Entropy: 1.347967.
Iteration 23057: Policy loss: 0.002483. Value loss: 0.565054. Entropy: 1.343019.
Iteration 23058: Policy loss: 0.002099. Value loss: 0.529290. Entropy: 1.352478.
episode: 7996   score: 930.0  epsilon: 1.0    steps: 368  evaluation reward: 1192.9
episode: 7997   score: 1380.0  epsilon: 1.0    steps: 472  evaluation reward: 1193.8
episode: 7998   score: 1100.0  epsilon: 1.0    steps: 936  evaluation reward: 1185.7
Training network. lr: 0.000073. clip: 0.029193
Iteration 23059: Policy loss: 0.002055. Value loss: 0.507357. Entropy: 0.818893.
Iteration 23060: Policy loss: 0.004083. Value loss: 0.398318. Entropy: 0.816331.
Iteration 23061: Policy loss: 0.001836. Value loss: 0.356447. Entropy: 0.813459.
Training network. lr: 0.000073. clip: 0.029193
Iteration 23062: Policy loss: 0.000

Iteration 23121: Policy loss: 0.002249. Value loss: 0.246963. Entropy: 1.010201.
episode: 8017   score: 1630.0  epsilon: 1.0    steps: 296  evaluation reward: 1212.5
episode: 8018   score: 1440.0  epsilon: 1.0    steps: 344  evaluation reward: 1217.9
Training network. lr: 0.000073. clip: 0.029046
Iteration 23122: Policy loss: 0.002476. Value loss: 0.266747. Entropy: 0.856348.
Iteration 23123: Policy loss: 0.003434. Value loss: 0.178360. Entropy: 0.845245.
Iteration 23124: Policy loss: -0.000410. Value loss: 0.142680. Entropy: 0.859837.
episode: 8019   score: 1030.0  epsilon: 1.0    steps: 776  evaluation reward: 1215.3
Training network. lr: 0.000073. clip: 0.029046
Iteration 23125: Policy loss: 0.004270. Value loss: 0.213190. Entropy: 1.034081.
Iteration 23126: Policy loss: 0.003258. Value loss: 0.154394. Entropy: 1.045907.
Iteration 23127: Policy loss: 0.003236. Value loss: 0.126140. Entropy: 1.042427.
episode: 8020   score: 1380.0  epsilon: 1.0    steps: 912  evaluation reward: 1217.

episode: 8038   score: 1720.0  epsilon: 1.0    steps: 696  evaluation reward: 1224.8
Training network. lr: 0.000072. clip: 0.028889
Iteration 23188: Policy loss: 0.002848. Value loss: 0.113046. Entropy: 0.986124.
Iteration 23189: Policy loss: -0.000593. Value loss: 0.070584. Entropy: 0.986475.
Iteration 23190: Policy loss: -0.001585. Value loss: 0.058057. Entropy: 0.982984.
Training network. lr: 0.000072. clip: 0.028889
Iteration 23191: Policy loss: 0.001223. Value loss: 0.578401. Entropy: 1.229853.
Iteration 23192: Policy loss: 0.005421. Value loss: 0.464965. Entropy: 1.226095.
Iteration 23193: Policy loss: 0.000252. Value loss: 0.397042. Entropy: 1.217663.
episode: 8039   score: 1230.0  epsilon: 1.0    steps: 616  evaluation reward: 1223.1
episode: 8040   score: 1410.0  epsilon: 1.0    steps: 720  evaluation reward: 1227.6
Training network. lr: 0.000072. clip: 0.028889
Iteration 23194: Policy loss: 0.006838. Value loss: 0.165738. Entropy: 1.239185.
Iteration 23195: Policy loss: 0.003

Iteration 23255: Policy loss: 0.001055. Value loss: 0.188067. Entropy: 1.165527.
Iteration 23256: Policy loss: 0.000004. Value loss: 0.168863. Entropy: 1.161723.
Training network. lr: 0.000071. clip: 0.028585
Iteration 23257: Policy loss: 0.001892. Value loss: 0.175952. Entropy: 1.061919.
Iteration 23258: Policy loss: -0.000511. Value loss: 0.127162. Entropy: 1.075733.
Iteration 23259: Policy loss: -0.001507. Value loss: 0.114039. Entropy: 1.066082.
episode: 8058   score: 1060.0  epsilon: 1.0    steps: 56  evaluation reward: 1248.1
episode: 8059   score: 870.0  epsilon: 1.0    steps: 88  evaluation reward: 1241.6
episode: 8060   score: 410.0  epsilon: 1.0    steps: 264  evaluation reward: 1240.4
episode: 8061   score: 1190.0  epsilon: 1.0    steps: 728  evaluation reward: 1238.2
Training network. lr: 0.000071. clip: 0.028585
Iteration 23260: Policy loss: 0.000614. Value loss: 0.389019. Entropy: 0.604050.
Iteration 23261: Policy loss: 0.000361. Value loss: 0.317757. Entropy: 0.596373.
I

Iteration 23321: Policy loss: 0.002107. Value loss: 0.197486. Entropy: 0.523950.
Iteration 23322: Policy loss: 0.001934. Value loss: 0.177467. Entropy: 0.533506.
episode: 8080   score: 1570.0  epsilon: 1.0    steps: 440  evaluation reward: 1273.9
Training network. lr: 0.000071. clip: 0.028429
Iteration 23323: Policy loss: 0.001880. Value loss: 0.366901. Entropy: 0.585828.
Iteration 23324: Policy loss: 0.000900. Value loss: 0.299167. Entropy: 0.583660.
Iteration 23325: Policy loss: -0.000054. Value loss: 0.286262. Entropy: 0.585082.
episode: 8081   score: 1360.0  epsilon: 1.0    steps: 96  evaluation reward: 1276.3
Training network. lr: 0.000071. clip: 0.028429
Iteration 23326: Policy loss: 0.000668. Value loss: 0.189696. Entropy: 0.875677.
Iteration 23327: Policy loss: -0.000048. Value loss: 0.169197. Entropy: 0.861254.
Iteration 23328: Policy loss: -0.001028. Value loss: 0.162856. Entropy: 0.866269.
episode: 8082   score: 1050.0  epsilon: 1.0    steps: 168  evaluation reward: 1274.7
e

Iteration 23386: Policy loss: 0.004039. Value loss: 0.361004. Entropy: 1.062514.
Iteration 23387: Policy loss: 0.002854. Value loss: 0.290880. Entropy: 1.073868.
Iteration 23388: Policy loss: 0.003176. Value loss: 0.260892. Entropy: 1.069715.
episode: 8102   score: 1410.0  epsilon: 1.0    steps: 824  evaluation reward: 1283.8
Training network. lr: 0.000071. clip: 0.028272
Iteration 23389: Policy loss: 0.004739. Value loss: 0.229700. Entropy: 0.790568.
Iteration 23390: Policy loss: 0.001947. Value loss: 0.159530. Entropy: 0.769852.
Iteration 23391: Policy loss: 0.000114. Value loss: 0.141378. Entropy: 0.770420.
Training network. lr: 0.000071. clip: 0.028272
Iteration 23392: Policy loss: 0.003728. Value loss: 0.252846. Entropy: 1.063377.
Iteration 23393: Policy loss: 0.001938. Value loss: 0.188103. Entropy: 1.064878.
Iteration 23394: Policy loss: 0.001429. Value loss: 0.170379. Entropy: 1.058921.
episode: 8103   score: 1490.0  epsilon: 1.0    steps: 760  evaluation reward: 1284.2
Trainin

Training network. lr: 0.000070. clip: 0.027968
Iteration 23452: Policy loss: 0.004754. Value loss: 0.700105. Entropy: 1.238475.
Iteration 23453: Policy loss: 0.002790. Value loss: 0.617449. Entropy: 1.240138.
Iteration 23454: Policy loss: 0.003126. Value loss: 0.560024. Entropy: 1.235002.
Training network. lr: 0.000070. clip: 0.027968
Iteration 23455: Policy loss: 0.004906. Value loss: 0.379718. Entropy: 1.356317.
Iteration 23456: Policy loss: 0.004773. Value loss: 0.266837. Entropy: 1.351193.
Iteration 23457: Policy loss: 0.003036. Value loss: 0.218047. Entropy: 1.353906.
episode: 8124   score: 950.0  epsilon: 1.0    steps: 24  evaluation reward: 1276.9
Training network. lr: 0.000070. clip: 0.027968
Iteration 23458: Policy loss: 0.001731. Value loss: 0.168731. Entropy: 1.389851.
Iteration 23459: Policy loss: 0.001259. Value loss: 0.128652. Entropy: 1.389627.
Iteration 23460: Policy loss: 0.000532. Value loss: 0.116396. Entropy: 1.386961.
episode: 8125   score: 870.0  epsilon: 1.0    s

Iteration 23519: Policy loss: 0.004866. Value loss: 0.284468. Entropy: 0.784979.
Iteration 23520: Policy loss: 0.003632. Value loss: 0.217602. Entropy: 0.760617.
episode: 8144   score: 1480.0  epsilon: 1.0    steps: 944  evaluation reward: 1258.4
Training network. lr: 0.000070. clip: 0.027811
Iteration 23521: Policy loss: 0.003254. Value loss: 0.155750. Entropy: 0.868141.
Iteration 23522: Policy loss: -0.000097. Value loss: 0.101335. Entropy: 0.876268.
Iteration 23523: Policy loss: -0.000836. Value loss: 0.075889. Entropy: 0.876804.
episode: 8145   score: 1680.0  epsilon: 1.0    steps: 224  evaluation reward: 1257.0
episode: 8146   score: 1360.0  epsilon: 1.0    steps: 776  evaluation reward: 1255.5
episode: 8147   score: 1520.0  epsilon: 1.0    steps: 912  evaluation reward: 1256.3
Training network. lr: 0.000070. clip: 0.027811
Iteration 23524: Policy loss: 0.001127. Value loss: 0.229959. Entropy: 0.845862.
Iteration 23525: Policy loss: 0.001032. Value loss: 0.191588. Entropy: 0.84334

Iteration 23584: Policy loss: 0.003145. Value loss: 0.210040. Entropy: 0.928277.
Iteration 23585: Policy loss: 0.001899. Value loss: 0.167173. Entropy: 0.918284.
Iteration 23586: Policy loss: 0.000785. Value loss: 0.152874. Entropy: 0.925844.
Training network. lr: 0.000069. clip: 0.027664
Iteration 23587: Policy loss: 0.005030. Value loss: 0.234320. Entropy: 0.923442.
Iteration 23588: Policy loss: 0.002965. Value loss: 0.142446. Entropy: 0.921216.
Iteration 23589: Policy loss: -0.000866. Value loss: 0.117640. Entropy: 0.925312.
episode: 8166   score: 1960.0  epsilon: 1.0    steps: 584  evaluation reward: 1281.3
Training network. lr: 0.000069. clip: 0.027664
Iteration 23590: Policy loss: 0.004022. Value loss: 0.342087. Entropy: 1.031671.
Iteration 23591: Policy loss: 0.003654. Value loss: 0.228144. Entropy: 1.029421.
Iteration 23592: Policy loss: 0.000082. Value loss: 0.179471. Entropy: 1.033621.
episode: 8167   score: 1060.0  epsilon: 1.0    steps: 144  evaluation reward: 1276.4
episod

Iteration 23652: Policy loss: 0.001772. Value loss: 0.245587. Entropy: 1.245887.
Training network. lr: 0.000068. clip: 0.027350
Iteration 23653: Policy loss: 0.004225. Value loss: 0.127663. Entropy: 1.222668.
Iteration 23654: Policy loss: 0.000765. Value loss: 0.071497. Entropy: 1.224122.
Iteration 23655: Policy loss: -0.000514. Value loss: 0.061832. Entropy: 1.233354.
episode: 8186   score: 600.0  epsilon: 1.0    steps: 32  evaluation reward: 1285.7
episode: 8187   score: 1040.0  epsilon: 1.0    steps: 336  evaluation reward: 1284.3
episode: 8188   score: 870.0  epsilon: 1.0    steps: 944  evaluation reward: 1278.4
episode: 8189   score: 1250.0  epsilon: 1.0    steps: 992  evaluation reward: 1276.6
Training network. lr: 0.000068. clip: 0.027350
Iteration 23656: Policy loss: 0.000968. Value loss: 0.297690. Entropy: 0.766043.
Iteration 23657: Policy loss: -0.000653. Value loss: 0.260840. Entropy: 0.775734.
Iteration 23658: Policy loss: -0.000905. Value loss: 0.245506. Entropy: 0.778299.

Iteration 23715: Policy loss: -0.000020. Value loss: 0.092700. Entropy: 1.240162.
Training network. lr: 0.000068. clip: 0.027203
Iteration 23716: Policy loss: 0.004435. Value loss: 0.308392. Entropy: 1.414481.
Iteration 23717: Policy loss: 0.003349. Value loss: 0.216152. Entropy: 1.421633.
Iteration 23718: Policy loss: 0.001301. Value loss: 0.188011. Entropy: 1.417948.
episode: 8211   score: 1290.0  epsilon: 1.0    steps: 720  evaluation reward: 1301.4
Training network. lr: 0.000068. clip: 0.027203
Iteration 23719: Policy loss: 0.006858. Value loss: 0.229208. Entropy: 1.457787.
Iteration 23720: Policy loss: 0.002959. Value loss: 0.128639. Entropy: 1.463693.
Iteration 23721: Policy loss: 0.002030. Value loss: 0.101288. Entropy: 1.455860.
episode: 8212   score: 1010.0  epsilon: 1.0    steps: 40  evaluation reward: 1298.3
episode: 8213   score: 1050.0  epsilon: 1.0    steps: 920  evaluation reward: 1296.0
Training network. lr: 0.000068. clip: 0.027203
Iteration 23722: Policy loss: 0.00158

Iteration 23780: Policy loss: 0.001124. Value loss: 0.186433. Entropy: 1.248584.
Iteration 23781: Policy loss: -0.000657. Value loss: 0.156008. Entropy: 1.254624.
Training network. lr: 0.000068. clip: 0.027046
Iteration 23782: Policy loss: 0.005555. Value loss: 0.221097. Entropy: 1.250929.
Iteration 23783: Policy loss: 0.003440. Value loss: 0.137958. Entropy: 1.254618.
Iteration 23784: Policy loss: 0.001188. Value loss: 0.108306. Entropy: 1.259885.
episode: 8234   score: 920.0  epsilon: 1.0    steps: 880  evaluation reward: 1302.0
Training network. lr: 0.000068. clip: 0.027046
Iteration 23785: Policy loss: 0.004858. Value loss: 0.218709. Entropy: 1.590474.
Iteration 23786: Policy loss: 0.004417. Value loss: 0.122591. Entropy: 1.592849.
Iteration 23787: Policy loss: 0.003295. Value loss: 0.096005. Entropy: 1.589763.
episode: 8235   score: 770.0  epsilon: 1.0    steps: 432  evaluation reward: 1297.2
Training network. lr: 0.000068. clip: 0.027046
Iteration 23788: Policy loss: 0.002028. Va

Iteration 23845: Policy loss: 0.003330. Value loss: 0.229842. Entropy: 1.241577.
Iteration 23846: Policy loss: 0.002253. Value loss: 0.166700. Entropy: 1.236854.
Iteration 23847: Policy loss: 0.000350. Value loss: 0.154542. Entropy: 1.241111.
episode: 8256   score: 1260.0  epsilon: 1.0    steps: 640  evaluation reward: 1271.4
Training network. lr: 0.000067. clip: 0.026889
Iteration 23848: Policy loss: 0.000613. Value loss: 0.171565. Entropy: 1.097997.
Iteration 23849: Policy loss: 0.000129. Value loss: 0.146020. Entropy: 1.083658.
Iteration 23850: Policy loss: -0.002051. Value loss: 0.131345. Entropy: 1.096486.
episode: 8257   score: 710.0  epsilon: 1.0    steps: 984  evaluation reward: 1265.5
Training network. lr: 0.000067. clip: 0.026742
Iteration 23851: Policy loss: 0.002420. Value loss: 0.353103. Entropy: 1.231665.
Iteration 23852: Policy loss: 0.001024. Value loss: 0.251279. Entropy: 1.226324.
Iteration 23853: Policy loss: 0.000424. Value loss: 0.212236. Entropy: 1.232507.
Trainin

episode: 8278   score: 1450.0  epsilon: 1.0    steps: 736  evaluation reward: 1215.8
Training network. lr: 0.000066. clip: 0.026585
Iteration 23911: Policy loss: 0.004211. Value loss: 0.486399. Entropy: 1.240617.
Iteration 23912: Policy loss: 0.004154. Value loss: 0.351205. Entropy: 1.240356.
Iteration 23913: Policy loss: 0.003463. Value loss: 0.297212. Entropy: 1.240201.
episode: 8279   score: 720.0  epsilon: 1.0    steps: 272  evaluation reward: 1208.6
Training network. lr: 0.000066. clip: 0.026585
Iteration 23914: Policy loss: 0.005135. Value loss: 0.407865. Entropy: 1.105152.
Iteration 23915: Policy loss: 0.005490. Value loss: 0.327840. Entropy: 1.109702.
Iteration 23916: Policy loss: 0.002965. Value loss: 0.279242. Entropy: 1.111270.
Training network. lr: 0.000066. clip: 0.026585
Iteration 23917: Policy loss: 0.002742. Value loss: 0.139633. Entropy: 1.228682.
Iteration 23918: Policy loss: 0.000604. Value loss: 0.085981. Entropy: 1.214243.
Iteration 23919: Policy loss: -0.001510. V

episode: 8300   score: 1460.0  epsilon: 1.0    steps: 912  evaluation reward: 1189.9
Training network. lr: 0.000066. clip: 0.026429
Iteration 23977: Policy loss: 0.001484. Value loss: 0.200076. Entropy: 0.867021.
Iteration 23978: Policy loss: 0.000911. Value loss: 0.154096. Entropy: 0.865718.
Iteration 23979: Policy loss: -0.000649. Value loss: 0.142476. Entropy: 0.862037.
now time :  2019-03-06 08:28:39.995005
episode: 8301   score: 530.0  epsilon: 1.0    steps: 288  evaluation reward: 1177.0
Training network. lr: 0.000066. clip: 0.026429
Iteration 23980: Policy loss: 0.002675. Value loss: 0.495336. Entropy: 0.758197.
Iteration 23981: Policy loss: 0.001956. Value loss: 0.428921. Entropy: 0.759896.
Iteration 23982: Policy loss: -0.000095. Value loss: 0.361360. Entropy: 0.755857.
episode: 8302   score: 1510.0  epsilon: 1.0    steps: 1000  evaluation reward: 1179.6
Training network. lr: 0.000066. clip: 0.026429
Iteration 23983: Policy loss: 0.001643. Value loss: 0.250680. Entropy: 0.9783

Iteration 24042: Policy loss: -0.000318. Value loss: 0.121872. Entropy: 1.015676.
episode: 8322   score: 1620.0  epsilon: 1.0    steps: 952  evaluation reward: 1215.0
Training network. lr: 0.000066. clip: 0.026281
Iteration 24043: Policy loss: 0.003194. Value loss: 0.364644. Entropy: 1.174907.
Iteration 24044: Policy loss: 0.004834. Value loss: 0.272899. Entropy: 1.174242.
Iteration 24045: Policy loss: 0.006735. Value loss: 0.253736. Entropy: 1.164910.
episode: 8323   score: 1870.0  epsilon: 1.0    steps: 376  evaluation reward: 1219.4
episode: 8324   score: 1010.0  epsilon: 1.0    steps: 544  evaluation reward: 1215.2
Training network. lr: 0.000066. clip: 0.026281
Iteration 24046: Policy loss: 0.001046. Value loss: 0.189259. Entropy: 1.169926.
Iteration 24047: Policy loss: 0.001446. Value loss: 0.152425. Entropy: 1.175312.
Iteration 24048: Policy loss: -0.000905. Value loss: 0.145010. Entropy: 1.163012.
episode: 8325   score: 880.0  epsilon: 1.0    steps: 672  evaluation reward: 1206.

Iteration 24108: Policy loss: -0.000203. Value loss: 0.208075. Entropy: 1.077780.
episode: 8344   score: 1700.0  epsilon: 1.0    steps: 728  evaluation reward: 1207.5
Training network. lr: 0.000065. clip: 0.025968
Iteration 24109: Policy loss: 0.002400. Value loss: 0.189162. Entropy: 1.158628.
Iteration 24110: Policy loss: 0.002366. Value loss: 0.151217. Entropy: 1.156065.
Iteration 24111: Policy loss: 0.001417. Value loss: 0.133480. Entropy: 1.171252.
episode: 8345   score: 1030.0  epsilon: 1.0    steps: 616  evaluation reward: 1205.7
Training network. lr: 0.000065. clip: 0.025968
Iteration 24112: Policy loss: 0.006577. Value loss: 0.833484. Entropy: 1.215394.
Iteration 24113: Policy loss: 0.006263. Value loss: 0.648982. Entropy: 1.229741.
Iteration 24114: Policy loss: 0.003758. Value loss: 0.581095. Entropy: 1.226967.
Training network. lr: 0.000065. clip: 0.025968
Iteration 24115: Policy loss: 0.002740. Value loss: 0.159844. Entropy: 1.217032.
Iteration 24116: Policy loss: 0.001578. 

Training network. lr: 0.000065. clip: 0.025820
Iteration 24172: Policy loss: 0.001944. Value loss: 0.323182. Entropy: 0.940460.
Iteration 24173: Policy loss: 0.003219. Value loss: 0.259915. Entropy: 0.947230.
Iteration 24174: Policy loss: 0.001968. Value loss: 0.248141. Entropy: 0.932504.
episode: 8368   score: 1270.0  epsilon: 1.0    steps: 688  evaluation reward: 1230.7
Training network. lr: 0.000065. clip: 0.025820
Iteration 24175: Policy loss: 0.004282. Value loss: 0.247102. Entropy: 1.343333.
Iteration 24176: Policy loss: 0.002398. Value loss: 0.125647. Entropy: 1.344389.
Iteration 24177: Policy loss: 0.000275. Value loss: 0.102334. Entropy: 1.335616.
Training network. lr: 0.000065. clip: 0.025820
Iteration 24178: Policy loss: 0.012037. Value loss: 0.503599. Entropy: 1.336720.
Iteration 24179: Policy loss: 0.015932. Value loss: 0.253275. Entropy: 1.336416.
Iteration 24180: Policy loss: 0.009546. Value loss: 0.208737. Entropy: 1.332547.
Training network. lr: 0.000065. clip: 0.02582

Iteration 24240: Policy loss: 0.001508. Value loss: 0.118438. Entropy: 1.271987.
episode: 8387   score: 1640.0  epsilon: 1.0    steps: 832  evaluation reward: 1261.0
episode: 8388   score: 1260.0  epsilon: 1.0    steps: 904  evaluation reward: 1260.5
Training network. lr: 0.000064. clip: 0.025664
Iteration 24241: Policy loss: 0.001633. Value loss: 0.211886. Entropy: 1.389394.
Iteration 24242: Policy loss: 0.002221. Value loss: 0.164979. Entropy: 1.383669.
Iteration 24243: Policy loss: -0.001397. Value loss: 0.142048. Entropy: 1.387370.
episode: 8389   score: 1340.0  epsilon: 1.0    steps: 168  evaluation reward: 1263.0
Training network. lr: 0.000064. clip: 0.025664
Iteration 24244: Policy loss: 0.003133. Value loss: 0.445236. Entropy: 1.056327.
Iteration 24245: Policy loss: 0.002935. Value loss: 0.382210. Entropy: 1.048471.
Iteration 24246: Policy loss: 0.001905. Value loss: 0.337044. Entropy: 1.050446.
episode: 8390   score: 940.0  epsilon: 1.0    steps: 48  evaluation reward: 1261.3


Training network. lr: 0.000063. clip: 0.025360
Iteration 24304: Policy loss: 0.003416. Value loss: 0.283040. Entropy: 0.900383.
Iteration 24305: Policy loss: -0.000388. Value loss: 0.245546. Entropy: 0.902799.
Iteration 24306: Policy loss: -0.000118. Value loss: 0.223777. Entropy: 0.899409.
Training network. lr: 0.000063. clip: 0.025360
Iteration 24307: Policy loss: 0.001963. Value loss: 0.185692. Entropy: 1.047570.
Iteration 24308: Policy loss: 0.000219. Value loss: 0.131140. Entropy: 1.049284.
Iteration 24309: Policy loss: -0.000178. Value loss: 0.107582. Entropy: 1.040107.
Training network. lr: 0.000063. clip: 0.025360
Iteration 24310: Policy loss: 0.003006. Value loss: 0.433341. Entropy: 1.345441.
Iteration 24311: Policy loss: 0.006050. Value loss: 0.334102. Entropy: 1.335963.
Iteration 24312: Policy loss: 0.005970. Value loss: 0.290114. Entropy: 1.334392.
episode: 8411   score: 1300.0  epsilon: 1.0    steps: 512  evaluation reward: 1255.6
episode: 8412   score: 1400.0  epsilon: 1.

Iteration 24371: Policy loss: 0.002260. Value loss: 0.147545. Entropy: 1.604810.
Iteration 24372: Policy loss: 0.002578. Value loss: 0.118826. Entropy: 1.600808.
Training network. lr: 0.000063. clip: 0.025203
Iteration 24373: Policy loss: 0.002577. Value loss: 0.273267. Entropy: 1.413820.
Iteration 24374: Policy loss: 0.002125. Value loss: 0.183381. Entropy: 1.409021.
Iteration 24375: Policy loss: 0.001351. Value loss: 0.149357. Entropy: 1.413348.
episode: 8431   score: 1410.0  epsilon: 1.0    steps: 248  evaluation reward: 1306.2
Training network. lr: 0.000063. clip: 0.025203
Iteration 24376: Policy loss: 0.000835. Value loss: 0.157181. Entropy: 1.444578.
Iteration 24377: Policy loss: -0.000127. Value loss: 0.137857. Entropy: 1.442813.
Iteration 24378: Policy loss: -0.000787. Value loss: 0.127768. Entropy: 1.440350.
episode: 8432   score: 1000.0  epsilon: 1.0    steps: 56  evaluation reward: 1300.8
episode: 8433   score: 1000.0  epsilon: 1.0    steps: 288  evaluation reward: 1294.9
ep

Iteration 24436: Policy loss: 0.003245. Value loss: 0.378916. Entropy: 1.251829.
Iteration 24437: Policy loss: 0.002517. Value loss: 0.258768. Entropy: 1.255054.
Iteration 24438: Policy loss: 0.000582. Value loss: 0.233774. Entropy: 1.258982.
episode: 8453   score: 1560.0  epsilon: 1.0    steps: 976  evaluation reward: 1269.9
Training network. lr: 0.000063. clip: 0.025046
Iteration 24439: Policy loss: 0.002077. Value loss: 0.330732. Entropy: 1.455407.
Iteration 24440: Policy loss: 0.003981. Value loss: 0.246327. Entropy: 1.452393.
Iteration 24441: Policy loss: 0.003086. Value loss: 0.212997. Entropy: 1.455250.
Training network. lr: 0.000063. clip: 0.025046
Iteration 24442: Policy loss: 0.000388. Value loss: 0.990292. Entropy: 1.115418.
Iteration 24443: Policy loss: 0.000540. Value loss: 0.893329. Entropy: 1.120788.
Iteration 24444: Policy loss: -0.000194. Value loss: 0.842864. Entropy: 1.110158.
episode: 8454   score: 2420.0  epsilon: 1.0    steps: 440  evaluation reward: 1281.8
episod

Iteration 24503: Policy loss: 0.002890. Value loss: 0.160041. Entropy: 1.057535.
Iteration 24504: Policy loss: 0.001145. Value loss: 0.149956. Entropy: 1.037129.
episode: 8474   score: 1040.0  epsilon: 1.0    steps: 736  evaluation reward: 1282.0
episode: 8475   score: 1850.0  epsilon: 1.0    steps: 760  evaluation reward: 1278.8
Training network. lr: 0.000062. clip: 0.024742
Iteration 24505: Policy loss: 0.003153. Value loss: 0.232177. Entropy: 1.082394.
Iteration 24506: Policy loss: 0.002374. Value loss: 0.177451. Entropy: 1.094469.
Iteration 24507: Policy loss: 0.000687. Value loss: 0.151631. Entropy: 1.084947.
episode: 8476   score: 1270.0  epsilon: 1.0    steps: 136  evaluation reward: 1278.2
episode: 8477   score: 1380.0  epsilon: 1.0    steps: 480  evaluation reward: 1280.5
Training network. lr: 0.000062. clip: 0.024742
Iteration 24508: Policy loss: 0.000840. Value loss: 0.360395. Entropy: 0.886810.
Iteration 24509: Policy loss: 0.000209. Value loss: 0.319652. Entropy: 0.893153.

Iteration 24567: Policy loss: 0.002487. Value loss: 0.295125. Entropy: 0.896490.
Training network. lr: 0.000061. clip: 0.024585
Iteration 24568: Policy loss: 0.003156. Value loss: 0.468731. Entropy: 1.034017.
Iteration 24569: Policy loss: 0.003007. Value loss: 0.366431. Entropy: 1.046745.
Iteration 24570: Policy loss: 0.002588. Value loss: 0.325914. Entropy: 1.042419.
Training network. lr: 0.000061. clip: 0.024585
Iteration 24571: Policy loss: 0.002794. Value loss: 0.449344. Entropy: 1.178204.
Iteration 24572: Policy loss: 0.002018. Value loss: 0.342376. Entropy: 1.184995.
Iteration 24573: Policy loss: 0.001793. Value loss: 0.299819. Entropy: 1.184214.
episode: 8498   score: 1440.0  epsilon: 1.0    steps: 984  evaluation reward: 1263.9
Training network. lr: 0.000061. clip: 0.024585
Iteration 24574: Policy loss: 0.003676. Value loss: 0.173748. Entropy: 1.313940.
Iteration 24575: Policy loss: 0.002241. Value loss: 0.113256. Entropy: 1.315342.
Iteration 24576: Policy loss: 0.001366. Value

Iteration 24633: Policy loss: 0.002361. Value loss: 0.107227. Entropy: 1.544966.
episode: 8519   score: 970.0  epsilon: 1.0    steps: 960  evaluation reward: 1238.3
Training network. lr: 0.000061. clip: 0.024438
Iteration 24634: Policy loss: 0.003627. Value loss: 0.367973. Entropy: 1.350047.
Iteration 24635: Policy loss: 0.005958. Value loss: 0.231818. Entropy: 1.353244.
Iteration 24636: Policy loss: 0.002397. Value loss: 0.191029. Entropy: 1.353031.
episode: 8520   score: 1500.0  epsilon: 1.0    steps: 944  evaluation reward: 1240.6
Training network. lr: 0.000061. clip: 0.024438
Iteration 24637: Policy loss: 0.004307. Value loss: 0.568048. Entropy: 1.281033.
Iteration 24638: Policy loss: 0.004333. Value loss: 0.455748. Entropy: 1.283240.
Iteration 24639: Policy loss: 0.003706. Value loss: 0.416858. Entropy: 1.276873.
episode: 8521   score: 920.0  epsilon: 1.0    steps: 568  evaluation reward: 1232.6
episode: 8522   score: 610.0  epsilon: 1.0    steps: 976  evaluation reward: 1222.0
Tr

Iteration 24698: Policy loss: 0.004521. Value loss: 0.096451. Entropy: 1.377592.
Iteration 24699: Policy loss: 0.002529. Value loss: 0.072719. Entropy: 1.372947.
Training network. lr: 0.000061. clip: 0.024281
Iteration 24700: Policy loss: 0.002939. Value loss: 0.342419. Entropy: 1.466787.
Iteration 24701: Policy loss: 0.001258. Value loss: 0.283842. Entropy: 1.472415.
Iteration 24702: Policy loss: 0.000918. Value loss: 0.253889. Entropy: 1.476504.
Training network. lr: 0.000060. clip: 0.024125
Iteration 24703: Policy loss: 0.007485. Value loss: 0.189786. Entropy: 1.614158.
Iteration 24704: Policy loss: 0.004851. Value loss: 0.120248. Entropy: 1.610608.
Iteration 24705: Policy loss: 0.005928. Value loss: 0.095079. Entropy: 1.610098.
Training network. lr: 0.000060. clip: 0.024125
Iteration 24706: Policy loss: 0.003444. Value loss: 0.114983. Entropy: 1.638072.
Iteration 24707: Policy loss: 0.003210. Value loss: 0.064959. Entropy: 1.636960.
Iteration 24708: Policy loss: 0.001594. Value los

Training network. lr: 0.000060. clip: 0.023977
Iteration 24763: Policy loss: 0.003575. Value loss: 0.339442. Entropy: 1.228724.
Iteration 24764: Policy loss: 0.004916. Value loss: 0.267083. Entropy: 1.224649.
Iteration 24765: Policy loss: 0.003380. Value loss: 0.226363. Entropy: 1.215149.
episode: 8565   score: 1210.0  epsilon: 1.0    steps: 640  evaluation reward: 1167.8
Training network. lr: 0.000060. clip: 0.023977
Iteration 24766: Policy loss: 0.009712. Value loss: 0.498881. Entropy: 1.176333.
Iteration 24767: Policy loss: 0.005458. Value loss: 0.381952. Entropy: 1.190289.
Iteration 24768: Policy loss: 0.004740. Value loss: 0.357224. Entropy: 1.190702.
Training network. lr: 0.000060. clip: 0.023977
Iteration 24769: Policy loss: 0.002408. Value loss: 0.277731. Entropy: 1.288860.
Iteration 24770: Policy loss: 0.002480. Value loss: 0.224943. Entropy: 1.290776.
Iteration 24771: Policy loss: 0.001784. Value loss: 0.199147. Entropy: 1.298180.
Training network. lr: 0.000060. clip: 0.02397

episode: 8587   score: 1230.0  epsilon: 1.0    steps: 344  evaluation reward: 1166.3
Training network. lr: 0.000060. clip: 0.023821
Iteration 24829: Policy loss: 0.000908. Value loss: 0.536534. Entropy: 0.739047.
Iteration 24830: Policy loss: 0.000165. Value loss: 0.439128. Entropy: 0.734898.
Iteration 24831: Policy loss: -0.000912. Value loss: 0.415801. Entropy: 0.738433.
Training network. lr: 0.000060. clip: 0.023821
Iteration 24832: Policy loss: 0.004021. Value loss: 0.203474. Entropy: 1.246994.
Iteration 24833: Policy loss: 0.002837. Value loss: 0.133062. Entropy: 1.242411.
Iteration 24834: Policy loss: 0.000614. Value loss: 0.109034. Entropy: 1.249236.
Training network. lr: 0.000060. clip: 0.023821
Iteration 24835: Policy loss: 0.006767. Value loss: 0.625001. Entropy: 1.379452.
Iteration 24836: Policy loss: 0.014491. Value loss: 0.432957. Entropy: 1.382318.
Iteration 24837: Policy loss: 0.009161. Value loss: 0.361678. Entropy: 1.388092.
episode: 8588   score: 930.0  epsilon: 1.0  

Iteration 24896: Policy loss: 0.003663. Value loss: 0.180153. Entropy: 1.256366.
Iteration 24897: Policy loss: 0.004235. Value loss: 0.156502. Entropy: 1.263018.
Training network. lr: 0.000059. clip: 0.023664
Iteration 24898: Policy loss: 0.003623. Value loss: 0.194799. Entropy: 1.414312.
Iteration 24899: Policy loss: 0.000444. Value loss: 0.152136. Entropy: 1.409933.
Iteration 24900: Policy loss: 0.002930. Value loss: 0.128585. Entropy: 1.409572.
episode: 8607   score: 500.0  epsilon: 1.0    steps: 160  evaluation reward: 1168.7
episode: 8608   score: 940.0  epsilon: 1.0    steps: 576  evaluation reward: 1163.7
episode: 8609   score: 1370.0  epsilon: 1.0    steps: 672  evaluation reward: 1167.8
episode: 8610   score: 1920.0  epsilon: 1.0    steps: 824  evaluation reward: 1174.7
Training network. lr: 0.000059. clip: 0.023516
Iteration 24901: Policy loss: 0.000900. Value loss: 0.269807. Entropy: 1.094301.
Iteration 24902: Policy loss: -0.000360. Value loss: 0.216987. Entropy: 1.080404.


Iteration 24962: Policy loss: 0.002486. Value loss: 0.201897. Entropy: 1.075943.
Iteration 24963: Policy loss: 0.000881. Value loss: 0.169577. Entropy: 1.058429.
Training network. lr: 0.000058. clip: 0.023360
Iteration 24964: Policy loss: 0.002366. Value loss: 0.213662. Entropy: 1.352023.
Iteration 24965: Policy loss: 0.001158. Value loss: 0.148648. Entropy: 1.343444.
Iteration 24966: Policy loss: -0.001334. Value loss: 0.126483. Entropy: 1.348376.
episode: 8629   score: 730.0  epsilon: 1.0    steps: 736  evaluation reward: 1184.2
Training network. lr: 0.000058. clip: 0.023360
Iteration 24967: Policy loss: 0.005695. Value loss: 0.301773. Entropy: 1.375619.
Iteration 24968: Policy loss: 0.001693. Value loss: 0.221564. Entropy: 1.378220.
Iteration 24969: Policy loss: 0.001143. Value loss: 0.197401. Entropy: 1.382993.
episode: 8630   score: 1600.0  epsilon: 1.0    steps: 280  evaluation reward: 1186.8
Training network. lr: 0.000058. clip: 0.023360
Iteration 24970: Policy loss: 0.003515. V

Training network. lr: 0.000058. clip: 0.023203
Iteration 25027: Policy loss: 0.000577. Value loss: 0.279185. Entropy: 1.226865.
Iteration 25028: Policy loss: 0.003068. Value loss: 0.175223. Entropy: 1.217483.
Iteration 25029: Policy loss: 0.001564. Value loss: 0.150289. Entropy: 1.216467.
Training network. lr: 0.000058. clip: 0.023203
Iteration 25030: Policy loss: 0.004451. Value loss: 0.369333. Entropy: 1.385114.
Iteration 25031: Policy loss: 0.003541. Value loss: 0.243875. Entropy: 1.385298.
Iteration 25032: Policy loss: 0.001996. Value loss: 0.219188. Entropy: 1.388387.
episode: 8652   score: 1410.0  epsilon: 1.0    steps: 336  evaluation reward: 1220.2
episode: 8653   score: 1270.0  epsilon: 1.0    steps: 768  evaluation reward: 1217.6
Training network. lr: 0.000058. clip: 0.023203
Iteration 25033: Policy loss: 0.001398. Value loss: 0.151359. Entropy: 1.239707.
Iteration 25034: Policy loss: 0.001551. Value loss: 0.083311. Entropy: 1.237375.
Iteration 25035: Policy loss: -0.000640. 

episode: 8670   score: 1130.0  epsilon: 1.0    steps: 728  evaluation reward: 1262.7
episode: 8671   score: 1080.0  epsilon: 1.0    steps: 928  evaluation reward: 1260.9
Training network. lr: 0.000058. clip: 0.023056
Iteration 25096: Policy loss: 0.006888. Value loss: 0.219829. Entropy: 1.325624.
Iteration 25097: Policy loss: 0.003101. Value loss: 0.140861. Entropy: 1.327009.
Iteration 25098: Policy loss: 0.002166. Value loss: 0.117070. Entropy: 1.319224.
episode: 8672   score: 990.0  epsilon: 1.0    steps: 472  evaluation reward: 1257.1
episode: 8673   score: 1590.0  epsilon: 1.0    steps: 792  evaluation reward: 1262.0
Training network. lr: 0.000058. clip: 0.023056
Iteration 25099: Policy loss: 0.001399. Value loss: 0.409530. Entropy: 0.953069.
Iteration 25100: Policy loss: 0.001079. Value loss: 0.330406. Entropy: 0.964305.
Iteration 25101: Policy loss: 0.000298. Value loss: 0.298735. Entropy: 0.963223.
episode: 8674   score: 1350.0  epsilon: 1.0    steps: 120  evaluation reward: 125

Iteration 25161: Policy loss: 0.004480. Value loss: 0.069612. Entropy: 1.367886.
episode: 8693   score: 1600.0  epsilon: 1.0    steps: 248  evaluation reward: 1256.2
Training network. lr: 0.000057. clip: 0.022742
Iteration 25162: Policy loss: 0.001723. Value loss: 0.249422. Entropy: 1.261970.
Iteration 25163: Policy loss: 0.000654. Value loss: 0.198925. Entropy: 1.267491.
Iteration 25164: Policy loss: -0.000702. Value loss: 0.192394. Entropy: 1.258642.
episode: 8694   score: 1350.0  epsilon: 1.0    steps: 280  evaluation reward: 1254.8
Training network. lr: 0.000057. clip: 0.022742
Iteration 25165: Policy loss: 0.004112. Value loss: 0.310841. Entropy: 1.319923.
Iteration 25166: Policy loss: 0.003842. Value loss: 0.255331. Entropy: 1.320854.
Iteration 25167: Policy loss: 0.000704. Value loss: 0.236584. Entropy: 1.325929.
episode: 8695   score: 1000.0  epsilon: 1.0    steps: 392  evaluation reward: 1251.4
Training network. lr: 0.000057. clip: 0.022742
Iteration 25168: Policy loss: 0.0031

Iteration 25227: Policy loss: 0.000425. Value loss: 0.147549. Entropy: 0.764898.
episode: 8714   score: 1530.0  epsilon: 1.0    steps: 776  evaluation reward: 1266.7
Training network. lr: 0.000056. clip: 0.022595
Iteration 25228: Policy loss: 0.004622. Value loss: 0.371342. Entropy: 0.876934.
Iteration 25229: Policy loss: 0.003191. Value loss: 0.277657. Entropy: 0.876908.
Iteration 25230: Policy loss: 0.003082. Value loss: 0.246126. Entropy: 0.879357.
episode: 8715   score: 1630.0  epsilon: 1.0    steps: 208  evaluation reward: 1273.0
episode: 8716   score: 1300.0  epsilon: 1.0    steps: 440  evaluation reward: 1271.7
Training network. lr: 0.000056. clip: 0.022595
Iteration 25231: Policy loss: 0.001686. Value loss: 0.427952. Entropy: 0.763417.
Iteration 25232: Policy loss: 0.001628. Value loss: 0.366982. Entropy: 0.760788.
Iteration 25233: Policy loss: 0.000629. Value loss: 0.304366. Entropy: 0.768135.
episode: 8717   score: 1610.0  epsilon: 1.0    steps: 832  evaluation reward: 1273.8

Training network. lr: 0.000056. clip: 0.022438
Iteration 25294: Policy loss: 0.002747. Value loss: 0.227219. Entropy: 0.984856.
Iteration 25295: Policy loss: 0.001680. Value loss: 0.179746. Entropy: 0.994412.
Iteration 25296: Policy loss: 0.002274. Value loss: 0.162640. Entropy: 0.988141.
episode: 8735   score: 1360.0  epsilon: 1.0    steps: 880  evaluation reward: 1315.5
Training network. lr: 0.000056. clip: 0.022438
Iteration 25297: Policy loss: 0.001588. Value loss: 0.081231. Entropy: 1.040518.
Iteration 25298: Policy loss: -0.001244. Value loss: 0.065403. Entropy: 1.045639.
Iteration 25299: Policy loss: -0.001299. Value loss: 0.058335. Entropy: 1.039944.
episode: 8736   score: 1310.0  epsilon: 1.0    steps: 824  evaluation reward: 1314.1
Training network. lr: 0.000056. clip: 0.022438
Iteration 25300: Policy loss: 0.002625. Value loss: 0.249401. Entropy: 0.994801.
Iteration 25301: Policy loss: 0.002486. Value loss: 0.200455. Entropy: 0.996212.
Iteration 25302: Policy loss: 0.000849.

Training network. lr: 0.000055. clip: 0.022134
Iteration 25360: Policy loss: 0.003191. Value loss: 0.336771. Entropy: 1.177015.
Iteration 25361: Policy loss: 0.002721. Value loss: 0.232826. Entropy: 1.163240.
Iteration 25362: Policy loss: 0.001196. Value loss: 0.222400. Entropy: 1.168909.
episode: 8757   score: 1480.0  epsilon: 1.0    steps: 336  evaluation reward: 1358.5
episode: 8758   score: 1280.0  epsilon: 1.0    steps: 536  evaluation reward: 1354.7
episode: 8759   score: 1350.0  epsilon: 1.0    steps: 1024  evaluation reward: 1353.5
Training network. lr: 0.000055. clip: 0.022134
Iteration 25363: Policy loss: 0.002685. Value loss: 0.182752. Entropy: 1.206512.
Iteration 25364: Policy loss: 0.002011. Value loss: 0.116742. Entropy: 1.212067.
Iteration 25365: Policy loss: 0.001080. Value loss: 0.099724. Entropy: 1.202074.
Training network. lr: 0.000055. clip: 0.022134
Iteration 25366: Policy loss: 0.003153. Value loss: 0.491846. Entropy: 1.178180.
Iteration 25367: Policy loss: 0.0022

Training network. lr: 0.000055. clip: 0.021977
Iteration 25426: Policy loss: 0.004488. Value loss: 0.732466. Entropy: 1.097706.
Iteration 25427: Policy loss: 0.002777. Value loss: 0.557672. Entropy: 1.092653.
Iteration 25428: Policy loss: 0.004583. Value loss: 0.507446. Entropy: 1.097131.
Training network. lr: 0.000055. clip: 0.021977
Iteration 25429: Policy loss: 0.003123. Value loss: 0.240994. Entropy: 1.175266.
Iteration 25430: Policy loss: 0.002162. Value loss: 0.171486. Entropy: 1.173152.
Iteration 25431: Policy loss: 0.001027. Value loss: 0.156146. Entropy: 1.174147.
Training network. lr: 0.000055. clip: 0.021977
Iteration 25432: Policy loss: 0.005621. Value loss: 0.295036. Entropy: 1.334360.
Iteration 25433: Policy loss: 0.006064. Value loss: 0.194254. Entropy: 1.338352.
Iteration 25434: Policy loss: 0.003590. Value loss: 0.161667. Entropy: 1.338191.
episode: 8779   score: 1050.0  epsilon: 1.0    steps: 872  evaluation reward: 1385.0
Training network. lr: 0.000055. clip: 0.02197

Iteration 25494: Policy loss: 0.000522. Value loss: 0.209648. Entropy: 0.966059.
Training network. lr: 0.000055. clip: 0.021821
Iteration 25495: Policy loss: 0.001210. Value loss: 0.352299. Entropy: 1.020953.
Iteration 25496: Policy loss: 0.000370. Value loss: 0.339795. Entropy: 1.017944.
Iteration 25497: Policy loss: 0.000304. Value loss: 0.308363. Entropy: 1.020315.
episode: 8798   score: 1340.0  epsilon: 1.0    steps: 216  evaluation reward: 1389.6
Training network. lr: 0.000055. clip: 0.021821
Iteration 25498: Policy loss: 0.002998. Value loss: 0.230177. Entropy: 0.913238.
Iteration 25499: Policy loss: 0.001780. Value loss: 0.144579. Entropy: 0.917318.
Iteration 25500: Policy loss: 0.000570. Value loss: 0.127245. Entropy: 0.922169.
episode: 8799   score: 1240.0  epsilon: 1.0    steps: 144  evaluation reward: 1389.1
episode: 8800   score: 1750.0  epsilon: 1.0    steps: 312  evaluation reward: 1398.4
Training network. lr: 0.000054. clip: 0.021673
Iteration 25501: Policy loss: 0.00091

Iteration 25558: Policy loss: 0.003489. Value loss: 0.234032. Entropy: 1.038461.
Iteration 25559: Policy loss: 0.003627. Value loss: 0.181988. Entropy: 1.045620.
Iteration 25560: Policy loss: 0.000412. Value loss: 0.172438. Entropy: 1.048407.
Training network. lr: 0.000054. clip: 0.021517
Iteration 25561: Policy loss: 0.001698. Value loss: 0.243250. Entropy: 1.030712.
Iteration 25562: Policy loss: 0.002647. Value loss: 0.192238. Entropy: 1.043013.
Iteration 25563: Policy loss: 0.001677. Value loss: 0.178211. Entropy: 1.057840.
episode: 8821   score: 1340.0  epsilon: 1.0    steps: 168  evaluation reward: 1386.0
Training network. lr: 0.000054. clip: 0.021517
Iteration 25564: Policy loss: 0.002644. Value loss: 0.226868. Entropy: 1.298507.
Iteration 25565: Policy loss: 0.002731. Value loss: 0.171289. Entropy: 1.302730.
Iteration 25566: Policy loss: 0.000683. Value loss: 0.142821. Entropy: 1.298653.
Training network. lr: 0.000054. clip: 0.021517
Iteration 25567: Policy loss: 0.003732. Value

episode: 8840   score: 1190.0  epsilon: 1.0    steps: 240  evaluation reward: 1358.7
Training network. lr: 0.000053. clip: 0.021360
Iteration 25627: Policy loss: 0.000980. Value loss: 1.291368. Entropy: 0.994317.
Iteration 25628: Policy loss: 0.001609. Value loss: 1.100581. Entropy: 0.995575.
Iteration 25629: Policy loss: 0.001954. Value loss: 1.065474. Entropy: 0.987551.
episode: 8841   score: 1010.0  epsilon: 1.0    steps: 280  evaluation reward: 1356.7
Training network. lr: 0.000053. clip: 0.021360
Iteration 25630: Policy loss: 0.002872. Value loss: 0.377919. Entropy: 0.993013.
Iteration 25631: Policy loss: 0.003866. Value loss: 0.256368. Entropy: 0.990555.
Iteration 25632: Policy loss: 0.002089. Value loss: 0.223334. Entropy: 0.991660.
episode: 8842   score: 1020.0  epsilon: 1.0    steps: 456  evaluation reward: 1351.6
episode: 8843   score: 2600.0  epsilon: 1.0    steps: 672  evaluation reward: 1363.2
Training network. lr: 0.000053. clip: 0.021360
Iteration 25633: Policy loss: 0.0

episode: 8861   score: 860.0  epsilon: 1.0    steps: 600  evaluation reward: 1366.4
episode: 8862   score: 760.0  epsilon: 1.0    steps: 640  evaluation reward: 1350.2
Training network. lr: 0.000053. clip: 0.021212
Iteration 25693: Policy loss: 0.004032. Value loss: 0.293774. Entropy: 1.101071.
Iteration 25694: Policy loss: 0.003638. Value loss: 0.191234. Entropy: 1.088722.
Iteration 25695: Policy loss: 0.003354. Value loss: 0.172657. Entropy: 1.096376.
episode: 8863   score: 1010.0  epsilon: 1.0    steps: 360  evaluation reward: 1343.9
episode: 8864   score: 1160.0  epsilon: 1.0    steps: 936  evaluation reward: 1341.0
Training network. lr: 0.000053. clip: 0.021212
Iteration 25696: Policy loss: 0.000196. Value loss: 0.266466. Entropy: 0.862183.
Iteration 25697: Policy loss: 0.000425. Value loss: 0.240356. Entropy: 0.855890.
Iteration 25698: Policy loss: 0.000510. Value loss: 0.236232. Entropy: 0.854349.
Training network. lr: 0.000053. clip: 0.021212
Iteration 25699: Policy loss: 0.005

episode: 8883   score: 1250.0  epsilon: 1.0    steps: 696  evaluation reward: 1362.8
Training network. lr: 0.000052. clip: 0.020899
Iteration 25759: Policy loss: 0.002936. Value loss: 0.342366. Entropy: 1.369064.
Iteration 25760: Policy loss: 0.002781. Value loss: 0.214858. Entropy: 1.371473.
Iteration 25761: Policy loss: 0.002042. Value loss: 0.189332. Entropy: 1.375909.
episode: 8884   score: 1240.0  epsilon: 1.0    steps: 216  evaluation reward: 1356.4
episode: 8885   score: 1240.0  epsilon: 1.0    steps: 888  evaluation reward: 1355.6
Training network. lr: 0.000052. clip: 0.020899
Iteration 25762: Policy loss: 0.002557. Value loss: 0.660770. Entropy: 1.085391.
Iteration 25763: Policy loss: 0.003644. Value loss: 0.537311. Entropy: 1.089257.
Iteration 25764: Policy loss: 0.003098. Value loss: 0.480584. Entropy: 1.075279.
Training network. lr: 0.000052. clip: 0.020899
Iteration 25765: Policy loss: 0.002220. Value loss: 0.371166. Entropy: 0.979150.
Iteration 25766: Policy loss: 0.00095

Iteration 25824: Policy loss: 0.000628. Value loss: 0.230377. Entropy: 0.917663.
episode: 8905   score: 1300.0  epsilon: 1.0    steps: 904  evaluation reward: 1359.2
Training network. lr: 0.000052. clip: 0.020752
Iteration 25825: Policy loss: 0.001633. Value loss: 0.244136. Entropy: 0.754481.
Iteration 25826: Policy loss: 0.000048. Value loss: 0.207070. Entropy: 0.747440.
Iteration 25827: Policy loss: -0.000714. Value loss: 0.196303. Entropy: 0.747958.
episode: 8906   score: 1450.0  epsilon: 1.0    steps: 960  evaluation reward: 1360.0
Training network. lr: 0.000052. clip: 0.020752
Iteration 25828: Policy loss: 0.002891. Value loss: 0.175604. Entropy: 1.037412.
Iteration 25829: Policy loss: 0.002900. Value loss: 0.144292. Entropy: 1.034316.
Iteration 25830: Policy loss: 0.002985. Value loss: 0.131512. Entropy: 1.031148.
episode: 8907   score: 1850.0  epsilon: 1.0    steps: 816  evaluation reward: 1366.7
Training network. lr: 0.000052. clip: 0.020752
Iteration 25831: Policy loss: 0.0031

Training network. lr: 0.000051. clip: 0.020595
Iteration 25891: Policy loss: 0.003169. Value loss: 0.510826. Entropy: 0.945580.
Iteration 25892: Policy loss: 0.002933. Value loss: 0.384813. Entropy: 0.948019.
Iteration 25893: Policy loss: 0.001099. Value loss: 0.354634. Entropy: 0.950284.
episode: 8927   score: 1020.0  epsilon: 1.0    steps: 160  evaluation reward: 1377.4
Training network. lr: 0.000051. clip: 0.020595
Iteration 25894: Policy loss: 0.002189. Value loss: 0.830876. Entropy: 1.034425.
Iteration 25895: Policy loss: 0.002135. Value loss: 0.705183. Entropy: 1.034448.
Iteration 25896: Policy loss: 0.001812. Value loss: 0.687478. Entropy: 1.030069.
episode: 8928   score: 1350.0  epsilon: 1.0    steps: 96  evaluation reward: 1376.0
episode: 8929   score: 2020.0  epsilon: 1.0    steps: 368  evaluation reward: 1379.8
Training network. lr: 0.000051. clip: 0.020595
Iteration 25897: Policy loss: 0.001093. Value loss: 0.186237. Entropy: 1.096886.
Iteration 25898: Policy loss: 0.000736

Iteration 25957: Policy loss: 0.005256. Value loss: 0.822505. Entropy: 1.218435.
Iteration 25958: Policy loss: 0.007954. Value loss: 0.688579. Entropy: 1.214917.
Iteration 25959: Policy loss: 0.009498. Value loss: 0.612883. Entropy: 1.214751.
episode: 8948   score: 1230.0  epsilon: 1.0    steps: 600  evaluation reward: 1351.1
Training network. lr: 0.000051. clip: 0.020291
Iteration 25960: Policy loss: 0.001389. Value loss: 0.167942. Entropy: 1.044699.
Iteration 25961: Policy loss: 0.002289. Value loss: 0.077740. Entropy: 1.039423.
Iteration 25962: Policy loss: 0.002627. Value loss: 0.060818. Entropy: 1.041369.
episode: 8949   score: 1190.0  epsilon: 1.0    steps: 224  evaluation reward: 1352.3
Training network. lr: 0.000051. clip: 0.020291
Iteration 25963: Policy loss: 0.004836. Value loss: 0.325543. Entropy: 0.834390.
Iteration 25964: Policy loss: 0.003355. Value loss: 0.230115. Entropy: 0.838841.
Iteration 25965: Policy loss: 0.001964. Value loss: 0.207609. Entropy: 0.820394.
episode

episode: 8970   score: 1430.0  epsilon: 1.0    steps: 880  evaluation reward: 1347.8
Training network. lr: 0.000050. clip: 0.020134
Iteration 26023: Policy loss: 0.002224. Value loss: 0.262967. Entropy: 1.102457.
Iteration 26024: Policy loss: 0.001023. Value loss: 0.216900. Entropy: 1.098791.
Iteration 26025: Policy loss: 0.000975. Value loss: 0.196972. Entropy: 1.107913.
episode: 8971   score: 1030.0  epsilon: 1.0    steps: 672  evaluation reward: 1344.1
Training network. lr: 0.000050. clip: 0.020134
Iteration 26026: Policy loss: 0.000017. Value loss: 0.232608. Entropy: 0.985615.
Iteration 26027: Policy loss: 0.000049. Value loss: 0.213538. Entropy: 0.983572.
Iteration 26028: Policy loss: -0.000607. Value loss: 0.203173. Entropy: 0.979677.
episode: 8972   score: 1570.0  epsilon: 1.0    steps: 632  evaluation reward: 1315.1
Training network. lr: 0.000050. clip: 0.020134
Iteration 26029: Policy loss: 0.001853. Value loss: 0.303076. Entropy: 1.107972.
Iteration 26030: Policy loss: 0.0006

episode: 8992   score: 1460.0  epsilon: 1.0    steps: 800  evaluation reward: 1306.4
Training network. lr: 0.000050. clip: 0.019977
Iteration 26089: Policy loss: 0.000560. Value loss: 0.190930. Entropy: 0.946848.
Iteration 26090: Policy loss: 0.000093. Value loss: 0.165802. Entropy: 0.947118.
Iteration 26091: Policy loss: -0.000309. Value loss: 0.150179. Entropy: 0.950707.
episode: 8993   score: 1270.0  epsilon: 1.0    steps: 240  evaluation reward: 1304.4
Training network. lr: 0.000050. clip: 0.019977
Iteration 26092: Policy loss: 0.000300. Value loss: 0.278973. Entropy: 0.902129.
Iteration 26093: Policy loss: -0.000159. Value loss: 0.258199. Entropy: 0.898834.
Iteration 26094: Policy loss: -0.000211. Value loss: 0.240528. Entropy: 0.910551.
Training network. lr: 0.000050. clip: 0.019977
Iteration 26095: Policy loss: 0.003382. Value loss: 0.321450. Entropy: 1.391575.
Iteration 26096: Policy loss: 0.004432. Value loss: 0.249293. Entropy: 1.397751.
Iteration 26097: Policy loss: 0.002163

Training network. lr: 0.000049. clip: 0.019673
Iteration 26155: Policy loss: 0.007547. Value loss: 0.702593. Entropy: 1.372451.
Iteration 26156: Policy loss: 0.009789. Value loss: 0.651225. Entropy: 1.370320.
Iteration 26157: Policy loss: 0.006418. Value loss: 0.599044. Entropy: 1.369572.
episode: 9013   score: 1410.0  epsilon: 1.0    steps: 200  evaluation reward: 1339.7
Training network. lr: 0.000049. clip: 0.019673
Iteration 26158: Policy loss: 0.005377. Value loss: 0.364408. Entropy: 1.293462.
Iteration 26159: Policy loss: 0.005637. Value loss: 0.231785. Entropy: 1.286469.
Iteration 26160: Policy loss: 0.004865. Value loss: 0.206213. Entropy: 1.298818.
episode: 9014   score: 1320.0  epsilon: 1.0    steps: 832  evaluation reward: 1342.7
Training network. lr: 0.000049. clip: 0.019673
Iteration 26161: Policy loss: 0.003115. Value loss: 0.238516. Entropy: 1.588707.
Iteration 26162: Policy loss: 0.002694. Value loss: 0.182397. Entropy: 1.583269.
Iteration 26163: Policy loss: 0.002670. V

episode: 9035   score: 2070.0  epsilon: 1.0    steps: 592  evaluation reward: 1302.9
episode: 9036   score: 1400.0  epsilon: 1.0    steps: 672  evaluation reward: 1305.5
episode: 9037   score: 940.0  epsilon: 1.0    steps: 704  evaluation reward: 1299.7
Training network. lr: 0.000049. clip: 0.019517
Iteration 26221: Policy loss: 0.002718. Value loss: 0.233520. Entropy: 1.140107.
Iteration 26222: Policy loss: 0.001955. Value loss: 0.176730. Entropy: 1.141420.
Iteration 26223: Policy loss: 0.000489. Value loss: 0.166444. Entropy: 1.130368.
Training network. lr: 0.000049. clip: 0.019517
Iteration 26224: Policy loss: 0.002795. Value loss: 0.476398. Entropy: 1.047341.
Iteration 26225: Policy loss: 0.004355. Value loss: 0.333256. Entropy: 1.049068.
Iteration 26226: Policy loss: 0.004278. Value loss: 0.306650. Entropy: 1.038491.
Training network. lr: 0.000049. clip: 0.019517
Iteration 26227: Policy loss: 0.012719. Value loss: 0.665989. Entropy: 1.599874.
Iteration 26228: Policy loss: 0.009600

Iteration 26286: Policy loss: 0.001497. Value loss: 0.119102. Entropy: 0.938715.
episode: 9057   score: 2320.0  epsilon: 1.0    steps: 648  evaluation reward: 1272.8
Training network. lr: 0.000048. clip: 0.019369
Iteration 26287: Policy loss: 0.003927. Value loss: 0.406948. Entropy: 0.974904.
Iteration 26288: Policy loss: 0.007060. Value loss: 0.260131. Entropy: 0.957532.
Iteration 26289: Policy loss: 0.003474. Value loss: 0.209871. Entropy: 0.969523.
episode: 9058   score: 400.0  epsilon: 1.0    steps: 424  evaluation reward: 1265.3
Training network. lr: 0.000048. clip: 0.019369
Iteration 26290: Policy loss: 0.004251. Value loss: 0.203715. Entropy: 1.029026.
Iteration 26291: Policy loss: 0.001947. Value loss: 0.134496. Entropy: 1.041539.
Iteration 26292: Policy loss: 0.000209. Value loss: 0.118609. Entropy: 1.035397.
episode: 9059   score: 310.0  epsilon: 1.0    steps: 520  evaluation reward: 1258.6
Training network. lr: 0.000048. clip: 0.019369
Iteration 26293: Policy loss: 0.001042.

Iteration 26352: Policy loss: 0.000434. Value loss: 0.233410. Entropy: 1.366797.
episode: 9079   score: 1090.0  epsilon: 1.0    steps: 1016  evaluation reward: 1240.9
Training network. lr: 0.000048. clip: 0.019056
Iteration 26353: Policy loss: 0.005292. Value loss: 0.829729. Entropy: 1.436962.
Iteration 26354: Policy loss: 0.004905. Value loss: 0.633447. Entropy: 1.427214.
Iteration 26355: Policy loss: 0.006073. Value loss: 0.568013. Entropy: 1.431068.
Training network. lr: 0.000048. clip: 0.019056
Iteration 26356: Policy loss: 0.005263. Value loss: 0.427889. Entropy: 1.042317.
Iteration 26357: Policy loss: 0.003350. Value loss: 0.343783. Entropy: 1.061600.
Iteration 26358: Policy loss: 0.001830. Value loss: 0.302631. Entropy: 1.045007.
Training network. lr: 0.000048. clip: 0.019056
Iteration 26359: Policy loss: 0.003972. Value loss: 0.141636. Entropy: 1.144132.
Iteration 26360: Policy loss: 0.003984. Value loss: 0.079880. Entropy: 1.125324.
Iteration 26361: Policy loss: 0.001470. Valu

Iteration 26418: Policy loss: 0.000410. Value loss: 0.165140. Entropy: 1.328510.
Training network. lr: 0.000047. clip: 0.018908
Iteration 26419: Policy loss: 0.004679. Value loss: 0.440991. Entropy: 1.544035.
Iteration 26420: Policy loss: 0.004686. Value loss: 0.365381. Entropy: 1.545550.
Iteration 26421: Policy loss: 0.001603. Value loss: 0.315487. Entropy: 1.546675.
now time :  2019-03-06 09:01:37.985883
episode: 9101   score: 1280.0  epsilon: 1.0    steps: 160  evaluation reward: 1221.5
Training network. lr: 0.000047. clip: 0.018908
Iteration 26422: Policy loss: 0.002454. Value loss: 0.402137. Entropy: 1.332670.
Iteration 26423: Policy loss: 0.001728. Value loss: 0.349204. Entropy: 1.338830.
Iteration 26424: Policy loss: 0.000937. Value loss: 0.334341. Entropy: 1.338876.
episode: 9102   score: 1040.0  epsilon: 1.0    steps: 296  evaluation reward: 1228.5
Training network. lr: 0.000047. clip: 0.018908
Iteration 26425: Policy loss: 0.003264. Value loss: 0.226249. Entropy: 1.112959.
It

Iteration 26483: Policy loss: 0.000128. Value loss: 0.131576. Entropy: 1.150888.
Iteration 26484: Policy loss: -0.000079. Value loss: 0.120738. Entropy: 1.155679.
Training network. lr: 0.000047. clip: 0.018752
Iteration 26485: Policy loss: 0.003912. Value loss: 0.385641. Entropy: 1.146127.
Iteration 26486: Policy loss: 0.004015. Value loss: 0.270384. Entropy: 1.146650.
Iteration 26487: Policy loss: 0.005130. Value loss: 0.215415. Entropy: 1.160746.
episode: 9123   score: 1240.0  epsilon: 1.0    steps: 120  evaluation reward: 1182.0
episode: 9124   score: 1380.0  epsilon: 1.0    steps: 896  evaluation reward: 1186.8
Training network. lr: 0.000047. clip: 0.018752
Iteration 26488: Policy loss: 0.005512. Value loss: 0.376388. Entropy: 1.228062.
Iteration 26489: Policy loss: 0.007580. Value loss: 0.267626. Entropy: 1.246319.
Iteration 26490: Policy loss: 0.007465. Value loss: 0.236639. Entropy: 1.231857.
Training network. lr: 0.000047. clip: 0.018752
Iteration 26491: Policy loss: 0.002428. 

episode: 9143   score: 910.0  epsilon: 1.0    steps: 248  evaluation reward: 1197.4
episode: 9144   score: 1690.0  epsilon: 1.0    steps: 720  evaluation reward: 1200.1
Training network. lr: 0.000046. clip: 0.018448
Iteration 26551: Policy loss: 0.002814. Value loss: 0.297915. Entropy: 1.341401.
Iteration 26552: Policy loss: 0.002463. Value loss: 0.241076. Entropy: 1.346236.
Iteration 26553: Policy loss: 0.001668. Value loss: 0.211144. Entropy: 1.343748.
episode: 9145   score: 1460.0  epsilon: 1.0    steps: 992  evaluation reward: 1198.6
Training network. lr: 0.000046. clip: 0.018448
Iteration 26554: Policy loss: 0.001081. Value loss: 0.176764. Entropy: 1.356486.
Iteration 26555: Policy loss: 0.001862. Value loss: 0.121932. Entropy: 1.359116.
Iteration 26556: Policy loss: 0.001488. Value loss: 0.105563. Entropy: 1.361873.
episode: 9146   score: 1230.0  epsilon: 1.0    steps: 296  evaluation reward: 1203.7
Training network. lr: 0.000046. clip: 0.018448
Iteration 26557: Policy loss: 0.00

Iteration 26616: Policy loss: 0.000373. Value loss: 0.132819. Entropy: 1.514894.
Training network. lr: 0.000046. clip: 0.018291
Iteration 26617: Policy loss: 0.001630. Value loss: 0.511632. Entropy: 1.241656.
Iteration 26618: Policy loss: 0.001686. Value loss: 0.473867. Entropy: 1.233909.
Iteration 26619: Policy loss: 0.002962. Value loss: 0.429373. Entropy: 1.237718.
episode: 9165   score: 1440.0  epsilon: 1.0    steps: 40  evaluation reward: 1200.8
Training network. lr: 0.000046. clip: 0.018291
Iteration 26620: Policy loss: 0.000631. Value loss: 0.201511. Entropy: 1.263777.
Iteration 26621: Policy loss: 0.000323. Value loss: 0.181446. Entropy: 1.260826.
Iteration 26622: Policy loss: 0.000179. Value loss: 0.183374. Entropy: 1.263465.
episode: 9166   score: 1440.0  epsilon: 1.0    steps: 56  evaluation reward: 1205.1
episode: 9167   score: 1290.0  epsilon: 1.0    steps: 200  evaluation reward: 1206.7
Training network. lr: 0.000046. clip: 0.018291
Iteration 26623: Policy loss: 0.000840.

Iteration 26682: Policy loss: -0.000200. Value loss: 0.271465. Entropy: 0.778903.
episode: 9187   score: 1280.0  epsilon: 1.0    steps: 976  evaluation reward: 1210.5
Training network. lr: 0.000045. clip: 0.018134
Iteration 26683: Policy loss: 0.002878. Value loss: 0.219094. Entropy: 1.304504.
Iteration 26684: Policy loss: 0.001207. Value loss: 0.176386. Entropy: 1.305113.
Iteration 26685: Policy loss: -0.000288. Value loss: 0.160246. Entropy: 1.302525.
Training network. lr: 0.000045. clip: 0.018134
Iteration 26686: Policy loss: 0.002658. Value loss: 0.222406. Entropy: 1.483756.
Iteration 26687: Policy loss: 0.002809. Value loss: 0.167409. Entropy: 1.483785.
Iteration 26688: Policy loss: 0.001349. Value loss: 0.150083. Entropy: 1.483725.
episode: 9188   score: 770.0  epsilon: 1.0    steps: 80  evaluation reward: 1209.1
Training network. lr: 0.000045. clip: 0.018134
Iteration 26689: Policy loss: 0.006240. Value loss: 0.734330. Entropy: 1.270511.
Iteration 26690: Policy loss: 0.004820. V

episode: 9207   score: 1330.0  epsilon: 1.0    steps: 928  evaluation reward: 1257.7
Training network. lr: 0.000045. clip: 0.017987
Iteration 26749: Policy loss: 0.002217. Value loss: 0.079050. Entropy: 1.671739.
Iteration 26750: Policy loss: 0.001254. Value loss: 0.057234. Entropy: 1.671713.
Iteration 26751: Policy loss: -0.000652. Value loss: 0.049587. Entropy: 1.669791.
episode: 9208   score: 1280.0  epsilon: 1.0    steps: 392  evaluation reward: 1261.3
episode: 9209   score: 970.0  epsilon: 1.0    steps: 536  evaluation reward: 1257.6
Training network. lr: 0.000045. clip: 0.017830
Iteration 26752: Policy loss: 0.001717. Value loss: 0.268207. Entropy: 1.046952.
Iteration 26753: Policy loss: 0.000794. Value loss: 0.235522. Entropy: 1.046209.
Iteration 26754: Policy loss: -0.000233. Value loss: 0.229451. Entropy: 1.050412.
Training network. lr: 0.000045. clip: 0.017830
Iteration 26755: Policy loss: 0.001470. Value loss: 0.196128. Entropy: 1.373302.
Iteration 26756: Policy loss: 0.0007

episode: 9229   score: 1150.0  epsilon: 1.0    steps: 936  evaluation reward: 1273.0
Training network. lr: 0.000044. clip: 0.017673
Iteration 26815: Policy loss: 0.003784. Value loss: 0.253074. Entropy: 1.175481.
Iteration 26816: Policy loss: 0.003003. Value loss: 0.219539. Entropy: 1.191652.
Iteration 26817: Policy loss: 0.001449. Value loss: 0.204646. Entropy: 1.184715.
episode: 9230   score: 740.0  epsilon: 1.0    steps: 120  evaluation reward: 1268.8
episode: 9231   score: 1400.0  epsilon: 1.0    steps: 848  evaluation reward: 1269.5
episode: 9232   score: 910.0  epsilon: 1.0    steps: 968  evaluation reward: 1267.0
Training network. lr: 0.000044. clip: 0.017673
Iteration 26818: Policy loss: 0.000827. Value loss: 0.301464. Entropy: 0.864664.
Iteration 26819: Policy loss: 0.000362. Value loss: 0.278587. Entropy: 0.864466.
Iteration 26820: Policy loss: -0.000739. Value loss: 0.264958. Entropy: 0.873536.
episode: 9233   score: 2040.0  epsilon: 1.0    steps: 336  evaluation reward: 127

episode: 9251   score: 1100.0  epsilon: 1.0    steps: 128  evaluation reward: 1308.5
episode: 9252   score: 1130.0  epsilon: 1.0    steps: 344  evaluation reward: 1311.4
episode: 9253   score: 1360.0  epsilon: 1.0    steps: 816  evaluation reward: 1306.3
Training network. lr: 0.000044. clip: 0.017526
Iteration 26881: Policy loss: 0.000514. Value loss: 0.248293. Entropy: 1.110076.
Iteration 26882: Policy loss: -0.000125. Value loss: 0.240917. Entropy: 1.097680.
Iteration 26883: Policy loss: -0.000002. Value loss: 0.217736. Entropy: 1.105823.
episode: 9254   score: 1100.0  epsilon: 1.0    steps: 80  evaluation reward: 1306.3
Training network. lr: 0.000044. clip: 0.017526
Iteration 26884: Policy loss: 0.001791. Value loss: 0.239857. Entropy: 1.042897.
Iteration 26885: Policy loss: 0.000022. Value loss: 0.221238. Entropy: 1.046817.
Iteration 26886: Policy loss: -0.000473. Value loss: 0.211388. Entropy: 1.038292.
episode: 9255   score: 1130.0  epsilon: 1.0    steps: 976  evaluation reward: 

Iteration 26944: Policy loss: 0.000259. Value loss: 0.417959. Entropy: 0.987121.
Iteration 26945: Policy loss: 0.000248. Value loss: 0.385454. Entropy: 0.992407.
Iteration 26946: Policy loss: -0.000151. Value loss: 0.371613. Entropy: 0.993125.
episode: 9276   score: 850.0  epsilon: 1.0    steps: 552  evaluation reward: 1335.4
Training network. lr: 0.000043. clip: 0.017369
Iteration 26947: Policy loss: 0.001760. Value loss: 0.445941. Entropy: 1.113421.
Iteration 26948: Policy loss: 0.005188. Value loss: 0.333493. Entropy: 1.110579.
Iteration 26949: Policy loss: 0.002682. Value loss: 0.286705. Entropy: 1.109698.
episode: 9277   score: 1050.0  epsilon: 1.0    steps: 288  evaluation reward: 1334.7
Training network. lr: 0.000043. clip: 0.017369
Iteration 26950: Policy loss: 0.003244. Value loss: 0.411090. Entropy: 1.365832.
Iteration 26951: Policy loss: 0.003780. Value loss: 0.351847. Entropy: 1.351731.
Iteration 26952: Policy loss: 0.004066. Value loss: 0.334066. Entropy: 1.356519.
Trainin

Iteration 27012: Policy loss: 0.002108. Value loss: 0.334506. Entropy: 1.517753.
episode: 9296   score: 1940.0  epsilon: 1.0    steps: 808  evaluation reward: 1314.4
episode: 9297   score: 980.0  epsilon: 1.0    steps: 928  evaluation reward: 1309.5
Training network. lr: 0.000043. clip: 0.017065
Iteration 27013: Policy loss: 0.000868. Value loss: 0.258232. Entropy: 1.220403.
Iteration 27014: Policy loss: 0.001624. Value loss: 0.207238. Entropy: 1.221116.
Iteration 27015: Policy loss: 0.001759. Value loss: 0.173558. Entropy: 1.213142.
Training network. lr: 0.000043. clip: 0.017065
Iteration 27016: Policy loss: 0.005888. Value loss: 0.405759. Entropy: 1.259822.
Iteration 27017: Policy loss: 0.003070. Value loss: 0.313778. Entropy: 1.266205.
Iteration 27018: Policy loss: 0.001462. Value loss: 0.304304. Entropy: 1.274123.
episode: 9298   score: 1040.0  epsilon: 1.0    steps: 928  evaluation reward: 1305.7
Training network. lr: 0.000043. clip: 0.017065
Iteration 27019: Policy loss: 0.002221

Iteration 27076: Policy loss: 0.005273. Value loss: 0.599748. Entropy: 1.244131.
Iteration 27077: Policy loss: 0.004642. Value loss: 0.497072. Entropy: 1.240280.
Iteration 27078: Policy loss: 0.003720. Value loss: 0.451045. Entropy: 1.238023.
episode: 9319   score: 730.0  epsilon: 1.0    steps: 504  evaluation reward: 1290.5
Training network. lr: 0.000042. clip: 0.016909
Iteration 27079: Policy loss: 0.004167. Value loss: 0.375315. Entropy: 1.530925.
Iteration 27080: Policy loss: 0.005506. Value loss: 0.261550. Entropy: 1.537232.
Iteration 27081: Policy loss: 0.004504. Value loss: 0.212187. Entropy: 1.535699.
episode: 9320   score: 790.0  epsilon: 1.0    steps: 984  evaluation reward: 1283.0
Training network. lr: 0.000042. clip: 0.016909
Iteration 27082: Policy loss: 0.001221. Value loss: 0.261298. Entropy: 1.556406.
Iteration 27083: Policy loss: 0.000968. Value loss: 0.221211. Entropy: 1.555665.
Iteration 27084: Policy loss: 0.000037. Value loss: 0.205985. Entropy: 1.559249.
Training 

Iteration 27143: Policy loss: 0.002233. Value loss: 0.109385. Entropy: 1.621836.
Iteration 27144: Policy loss: 0.001410. Value loss: 0.087149. Entropy: 1.623239.
Training network. lr: 0.000042. clip: 0.016752
Iteration 27145: Policy loss: 0.001195. Value loss: 0.230838. Entropy: 1.224580.
Iteration 27146: Policy loss: 0.001457. Value loss: 0.203538. Entropy: 1.224793.
Iteration 27147: Policy loss: 0.000042. Value loss: 0.196198. Entropy: 1.226916.
episode: 9340   score: 1050.0  epsilon: 1.0    steps: 56  evaluation reward: 1295.6
Training network. lr: 0.000042. clip: 0.016752
Iteration 27148: Policy loss: 0.001868. Value loss: 0.510114. Entropy: 1.303935.
Iteration 27149: Policy loss: 0.003125. Value loss: 0.475877. Entropy: 1.302423.
Iteration 27150: Policy loss: 0.001584. Value loss: 0.461754. Entropy: 1.309496.
Training network. lr: 0.000042. clip: 0.016604
Iteration 27151: Policy loss: 0.003108. Value loss: 0.411854. Entropy: 1.504784.
Iteration 27152: Policy loss: 0.005756. Value 

Iteration 27210: Policy loss: 0.000742. Value loss: 0.137831. Entropy: 1.324277.
episode: 9360   score: 1370.0  epsilon: 1.0    steps: 448  evaluation reward: 1302.9
episode: 9361   score: 960.0  epsilon: 1.0    steps: 768  evaluation reward: 1298.6
Training network. lr: 0.000041. clip: 0.016448
Iteration 27211: Policy loss: 0.000745. Value loss: 0.290212. Entropy: 1.329950.
Iteration 27212: Policy loss: 0.000788. Value loss: 0.232047. Entropy: 1.330000.
Iteration 27213: Policy loss: 0.000266. Value loss: 0.206030. Entropy: 1.331427.
Training network. lr: 0.000041. clip: 0.016448
Iteration 27214: Policy loss: 0.001904. Value loss: 0.352535. Entropy: 1.442680.
Iteration 27215: Policy loss: 0.002967. Value loss: 0.277142. Entropy: 1.447131.
Iteration 27216: Policy loss: 0.002993. Value loss: 0.246568. Entropy: 1.449649.
episode: 9362   score: 1130.0  epsilon: 1.0    steps: 736  evaluation reward: 1299.4
episode: 9363   score: 1260.0  epsilon: 1.0    steps: 936  evaluation reward: 1298.2


episode: 9381   score: 1910.0  epsilon: 1.0    steps: 936  evaluation reward: 1285.5
Training network. lr: 0.000041. clip: 0.016291
Iteration 27277: Policy loss: 0.001143. Value loss: 0.118277. Entropy: 1.346789.
Iteration 27278: Policy loss: 0.000780. Value loss: 0.094884. Entropy: 1.354600.
Iteration 27279: Policy loss: 0.000856. Value loss: 0.084794. Entropy: 1.353737.
episode: 9382   score: 920.0  epsilon: 1.0    steps: 632  evaluation reward: 1282.4
episode: 9383   score: 1280.0  epsilon: 1.0    steps: 808  evaluation reward: 1278.5
episode: 9384   score: 1400.0  epsilon: 1.0    steps: 904  evaluation reward: 1283.0
Training network. lr: 0.000041. clip: 0.016291
Iteration 27280: Policy loss: 0.003747. Value loss: 0.158778. Entropy: 1.401544.
Iteration 27281: Policy loss: 0.001204. Value loss: 0.128979. Entropy: 1.406781.
Iteration 27282: Policy loss: 0.000052. Value loss: 0.121309. Entropy: 1.406233.
Training network. lr: 0.000041. clip: 0.016291
Iteration 27283: Policy loss: 0.00

Iteration 27342: Policy loss: 0.000535. Value loss: 0.040451. Entropy: 1.305699.
episode: 9403   score: 2710.0  epsilon: 1.0    steps: 848  evaluation reward: 1314.5
Training network. lr: 0.000040. clip: 0.016144
Iteration 27343: Policy loss: 0.003128. Value loss: 0.384477. Entropy: 1.322789.
Iteration 27344: Policy loss: 0.004643. Value loss: 0.315084. Entropy: 1.319880.
Iteration 27345: Policy loss: 0.004066. Value loss: 0.275245. Entropy: 1.325719.
episode: 9404   score: 940.0  epsilon: 1.0    steps: 664  evaluation reward: 1313.7
episode: 9405   score: 1400.0  epsilon: 1.0    steps: 856  evaluation reward: 1314.8
Training network. lr: 0.000040. clip: 0.016144
Iteration 27346: Policy loss: 0.005655. Value loss: 0.348770. Entropy: 1.139103.
Iteration 27347: Policy loss: 0.003164. Value loss: 0.232155. Entropy: 1.140285.
Iteration 27348: Policy loss: 0.002419. Value loss: 0.205669. Entropy: 1.141853.
Training network. lr: 0.000040. clip: 0.016144
Iteration 27349: Policy loss: 0.000981

Iteration 27407: Policy loss: 0.000508. Value loss: 0.370408. Entropy: 0.783362.
Iteration 27408: Policy loss: -0.000215. Value loss: 0.353992. Entropy: 0.792915.
Training network. lr: 0.000040. clip: 0.015830
Iteration 27409: Policy loss: 0.005811. Value loss: 0.699738. Entropy: 1.392054.
Iteration 27410: Policy loss: 0.008457. Value loss: 0.530597. Entropy: 1.384379.
Iteration 27411: Policy loss: 0.007655. Value loss: 0.484498. Entropy: 1.378853.
Training network. lr: 0.000040. clip: 0.015830
Iteration 27412: Policy loss: 0.001708. Value loss: 0.344156. Entropy: 1.379683.
Iteration 27413: Policy loss: 0.000326. Value loss: 0.290280. Entropy: 1.367113.
Iteration 27414: Policy loss: 0.000500. Value loss: 0.261176. Entropy: 1.372450.
episode: 9426   score: 1610.0  epsilon: 1.0    steps: 584  evaluation reward: 1337.8
Training network. lr: 0.000040. clip: 0.015830
Iteration 27415: Policy loss: 0.004787. Value loss: 0.460296. Entropy: 1.583838.
Iteration 27416: Policy loss: 0.006126. Valu

episode: 9446   score: 1490.0  epsilon: 1.0    steps: 1016  evaluation reward: 1334.8
Training network. lr: 0.000039. clip: 0.015683
Iteration 27475: Policy loss: 0.005144. Value loss: 0.211163. Entropy: 1.414214.
Iteration 27476: Policy loss: 0.004793. Value loss: 0.150417. Entropy: 1.414721.
Iteration 27477: Policy loss: 0.001880. Value loss: 0.136004. Entropy: 1.417352.
episode: 9447   score: 1280.0  epsilon: 1.0    steps: 640  evaluation reward: 1321.3
episode: 9448   score: 1320.0  epsilon: 1.0    steps: 920  evaluation reward: 1318.9
Training network. lr: 0.000039. clip: 0.015683
Iteration 27478: Policy loss: 0.000158. Value loss: 0.196691. Entropy: 1.351909.
Iteration 27479: Policy loss: -0.000063. Value loss: 0.160794. Entropy: 1.357075.
Iteration 27480: Policy loss: -0.000304. Value loss: 0.147718. Entropy: 1.360700.
episode: 9449   score: 830.0  epsilon: 1.0    steps: 536  evaluation reward: 1318.3
Training network. lr: 0.000039. clip: 0.015683
Iteration 27481: Policy loss: 0

Iteration 27540: Policy loss: 0.002900. Value loss: 0.056671. Entropy: 1.075945.
Training network. lr: 0.000039. clip: 0.015526
Iteration 27541: Policy loss: 0.001943. Value loss: 0.649510. Entropy: 1.218224.
Iteration 27542: Policy loss: 0.001602. Value loss: 0.536577. Entropy: 1.216607.
Iteration 27543: Policy loss: 0.003227. Value loss: 0.478232. Entropy: 1.217115.
episode: 9468   score: 1040.0  epsilon: 1.0    steps: 784  evaluation reward: 1331.7
Training network. lr: 0.000039. clip: 0.015526
Iteration 27544: Policy loss: 0.002951. Value loss: 0.531032. Entropy: 1.154984.
Iteration 27545: Policy loss: 0.006199. Value loss: 0.429123. Entropy: 1.159841.
Iteration 27546: Policy loss: 0.004629. Value loss: 0.395161. Entropy: 1.168646.
episode: 9469   score: 1530.0  epsilon: 1.0    steps: 552  evaluation reward: 1336.7
episode: 9470   score: 700.0  epsilon: 1.0    steps: 928  evaluation reward: 1333.1
Training network. lr: 0.000039. clip: 0.015526
Iteration 27547: Policy loss: 0.001507

Training network. lr: 0.000038. clip: 0.015222
Iteration 27607: Policy loss: 0.003071. Value loss: 0.329775. Entropy: 1.210770.
Iteration 27608: Policy loss: 0.003661. Value loss: 0.245938. Entropy: 1.208043.
Iteration 27609: Policy loss: 0.003540. Value loss: 0.228347. Entropy: 1.208525.
episode: 9489   score: 1070.0  epsilon: 1.0    steps: 1008  evaluation reward: 1333.5
Training network. lr: 0.000038. clip: 0.015222
Iteration 27610: Policy loss: 0.001011. Value loss: 0.530973. Entropy: 1.162465.
Iteration 27611: Policy loss: 0.002274. Value loss: 0.464275. Entropy: 1.170345.
Iteration 27612: Policy loss: 0.001447. Value loss: 0.411195. Entropy: 1.155262.
episode: 9490   score: 1300.0  epsilon: 1.0    steps: 240  evaluation reward: 1332.5
episode: 9491   score: 2450.0  epsilon: 1.0    steps: 472  evaluation reward: 1344.9
episode: 9492   score: 970.0  epsilon: 1.0    steps: 864  evaluation reward: 1340.0
episode: 9493   score: 1540.0  epsilon: 1.0    steps: 888  evaluation reward: 13

Iteration 27671: Policy loss: 0.003250. Value loss: 0.294783. Entropy: 1.542552.
Iteration 27672: Policy loss: 0.004856. Value loss: 0.243582. Entropy: 1.548591.
Training network. lr: 0.000038. clip: 0.015065
Iteration 27673: Policy loss: 0.002861. Value loss: 0.354388. Entropy: 1.453016.
Iteration 27674: Policy loss: 0.003055. Value loss: 0.296173. Entropy: 1.461881.
Iteration 27675: Policy loss: 0.003207. Value loss: 0.275106. Entropy: 1.465029.
episode: 9512   score: 1020.0  epsilon: 1.0    steps: 392  evaluation reward: 1319.3
Training network. lr: 0.000038. clip: 0.015065
Iteration 27676: Policy loss: 0.004465. Value loss: 0.373093. Entropy: 1.382221.
Iteration 27677: Policy loss: 0.005469. Value loss: 0.250587. Entropy: 1.383324.
Iteration 27678: Policy loss: 0.002129. Value loss: 0.210602. Entropy: 1.366885.
episode: 9513   score: 1030.0  epsilon: 1.0    steps: 488  evaluation reward: 1316.0
Training network. lr: 0.000038. clip: 0.015065
Iteration 27679: Policy loss: 0.002064. V

Training network. lr: 0.000037. clip: 0.014909
Iteration 27739: Policy loss: 0.002980. Value loss: 0.363887. Entropy: 1.439593.
Iteration 27740: Policy loss: 0.003873. Value loss: 0.284312. Entropy: 1.440917.
Iteration 27741: Policy loss: 0.003076. Value loss: 0.256603. Entropy: 1.437992.
episode: 9532   score: 1090.0  epsilon: 1.0    steps: 24  evaluation reward: 1315.3
episode: 9533   score: 1510.0  epsilon: 1.0    steps: 800  evaluation reward: 1313.7
Training network. lr: 0.000037. clip: 0.014909
Iteration 27742: Policy loss: 0.000884. Value loss: 0.803018. Entropy: 1.282369.
Iteration 27743: Policy loss: 0.005563. Value loss: 0.576349. Entropy: 1.278324.
Iteration 27744: Policy loss: 0.004192. Value loss: 0.554263. Entropy: 1.276643.
Training network. lr: 0.000037. clip: 0.014909
Iteration 27745: Policy loss: 0.001321. Value loss: 0.163449. Entropy: 1.281177.
Iteration 27746: Policy loss: 0.000301. Value loss: 0.118046. Entropy: 1.285930.
Iteration 27747: Policy loss: -0.000407. V

episode: 9554   score: 1410.0  epsilon: 1.0    steps: 368  evaluation reward: 1299.9
Training network. lr: 0.000037. clip: 0.014605
Iteration 27805: Policy loss: 0.003166. Value loss: 0.368650. Entropy: 0.981668.
Iteration 27806: Policy loss: 0.000971. Value loss: 0.265107. Entropy: 0.985524.
Iteration 27807: Policy loss: 0.000630. Value loss: 0.243252. Entropy: 0.988308.
episode: 9555   score: 1460.0  epsilon: 1.0    steps: 456  evaluation reward: 1302.7
Training network. lr: 0.000037. clip: 0.014605
Iteration 27808: Policy loss: 0.002250. Value loss: 0.352870. Entropy: 1.301821.
Iteration 27809: Policy loss: 0.002983. Value loss: 0.258682. Entropy: 1.294150.
Iteration 27810: Policy loss: 0.003811. Value loss: 0.232715. Entropy: 1.300977.
episode: 9556   score: 910.0  epsilon: 1.0    steps: 880  evaluation reward: 1294.6
Training network. lr: 0.000037. clip: 0.014605
Iteration 27811: Policy loss: 0.003523. Value loss: 0.295535. Entropy: 1.327244.
Iteration 27812: Policy loss: 0.005199

Iteration 27871: Policy loss: 0.001859. Value loss: 0.322434. Entropy: 1.087581.
Iteration 27872: Policy loss: 0.002609. Value loss: 0.235386. Entropy: 1.098527.
Iteration 27873: Policy loss: 0.001550. Value loss: 0.205970. Entropy: 1.100777.
episode: 9575   score: 1340.0  epsilon: 1.0    steps: 728  evaluation reward: 1297.4
episode: 9576   score: 1190.0  epsilon: 1.0    steps: 944  evaluation reward: 1292.4
Training network. lr: 0.000036. clip: 0.014448
Iteration 27874: Policy loss: 0.001336. Value loss: 0.260329. Entropy: 1.297763.
Iteration 27875: Policy loss: 0.002532. Value loss: 0.180931. Entropy: 1.300593.
Iteration 27876: Policy loss: 0.002122. Value loss: 0.150569. Entropy: 1.301211.
episode: 9577   score: 1190.0  epsilon: 1.0    steps: 328  evaluation reward: 1292.4
Training network. lr: 0.000036. clip: 0.014448
Iteration 27877: Policy loss: 0.001517. Value loss: 0.356418. Entropy: 1.039149.
Iteration 27878: Policy loss: 0.004252. Value loss: 0.294615. Entropy: 1.037763.
Ite

Iteration 27939: Policy loss: 0.007101. Value loss: 0.252012. Entropy: 1.184678.
episode: 9595   score: 1820.0  epsilon: 1.0    steps: 728  evaluation reward: 1255.8
Training network. lr: 0.000036. clip: 0.014300
Iteration 27940: Policy loss: 0.004444. Value loss: 0.434720. Entropy: 1.115015.
Iteration 27941: Policy loss: 0.004041. Value loss: 0.364994. Entropy: 1.112669.
Iteration 27942: Policy loss: 0.002854. Value loss: 0.344978. Entropy: 1.112815.
episode: 9596   score: 1640.0  epsilon: 1.0    steps: 208  evaluation reward: 1261.0
episode: 9597   score: 1300.0  epsilon: 1.0    steps: 912  evaluation reward: 1263.2
Training network. lr: 0.000036. clip: 0.014300
Iteration 27943: Policy loss: 0.002532. Value loss: 0.282475. Entropy: 1.044221.
Iteration 27944: Policy loss: 0.001683. Value loss: 0.247431. Entropy: 1.039128.
Iteration 27945: Policy loss: 0.000963. Value loss: 0.234005. Entropy: 1.034025.
episode: 9598   score: 1010.0  epsilon: 1.0    steps: 448  evaluation reward: 1260.3

Iteration 28003: Policy loss: 0.003377. Value loss: 0.184019. Entropy: 1.055064.
Iteration 28004: Policy loss: 0.005499. Value loss: 0.137461. Entropy: 1.061355.
Iteration 28005: Policy loss: 0.002832. Value loss: 0.111627. Entropy: 1.062054.
Training network. lr: 0.000035. clip: 0.013987
Iteration 28006: Policy loss: 0.002296. Value loss: 0.245208. Entropy: 0.711638.
Iteration 28007: Policy loss: 0.001407. Value loss: 0.210623. Entropy: 0.708919.
Iteration 28008: Policy loss: 0.001339. Value loss: 0.199907. Entropy: 0.716482.
Training network. lr: 0.000035. clip: 0.013987
Iteration 28009: Policy loss: 0.006001. Value loss: 0.342852. Entropy: 0.981352.
Iteration 28010: Policy loss: 0.007188. Value loss: 0.188914. Entropy: 0.974820.
Iteration 28011: Policy loss: 0.005656. Value loss: 0.169844. Entropy: 0.986379.
episode: 9618   score: 740.0  epsilon: 1.0    steps: 312  evaluation reward: 1267.1
episode: 9619   score: 1570.0  epsilon: 1.0    steps: 384  evaluation reward: 1269.4
episode:

Training network. lr: 0.000035. clip: 0.013840
Iteration 28069: Policy loss: 0.002447. Value loss: 0.207724. Entropy: 0.810812.
Iteration 28070: Policy loss: 0.002895. Value loss: 0.154438. Entropy: 0.811582.
Iteration 28071: Policy loss: 0.000744. Value loss: 0.148615. Entropy: 0.815426.
Training network. lr: 0.000035. clip: 0.013840
Iteration 28072: Policy loss: 0.002395. Value loss: 0.423876. Entropy: 1.078493.
Iteration 28073: Policy loss: 0.004145. Value loss: 0.288391. Entropy: 1.071151.
Iteration 28074: Policy loss: 0.004269. Value loss: 0.246774. Entropy: 1.087330.
Training network. lr: 0.000035. clip: 0.013840
Iteration 28075: Policy loss: 0.001433. Value loss: 0.347841. Entropy: 1.289956.
Iteration 28076: Policy loss: 0.000577. Value loss: 0.295800. Entropy: 1.275782.
Iteration 28077: Policy loss: 0.000342. Value loss: 0.281357. Entropy: 1.280735.
episode: 9640   score: 1190.0  epsilon: 1.0    steps: 304  evaluation reward: 1272.4
episode: 9641   score: 1360.0  epsilon: 1.0  

Iteration 28136: Policy loss: 0.007146. Value loss: 0.222757. Entropy: 1.427020.
Iteration 28137: Policy loss: 0.005011. Value loss: 0.190157. Entropy: 1.421642.
episode: 9660   score: 1360.0  epsilon: 1.0    steps: 104  evaluation reward: 1281.0
Training network. lr: 0.000034. clip: 0.013683
Iteration 28138: Policy loss: 0.004194. Value loss: 0.171567. Entropy: 1.286784.
Iteration 28139: Policy loss: 0.002414. Value loss: 0.147441. Entropy: 1.276720.
Iteration 28140: Policy loss: 0.001482. Value loss: 0.143978. Entropy: 1.274196.
episode: 9661   score: 980.0  epsilon: 1.0    steps: 184  evaluation reward: 1275.4
episode: 9662   score: 1210.0  epsilon: 1.0    steps: 432  evaluation reward: 1273.2
Training network. lr: 0.000034. clip: 0.013683
Iteration 28141: Policy loss: 0.000345. Value loss: 0.249123. Entropy: 1.132204.
Iteration 28142: Policy loss: 0.000212. Value loss: 0.205671. Entropy: 1.134540.
Iteration 28143: Policy loss: 0.001226. Value loss: 0.179573. Entropy: 1.135507.
epis

episode: 9680   score: 1270.0  epsilon: 1.0    steps: 160  evaluation reward: 1284.9
episode: 9681   score: 1750.0  epsilon: 1.0    steps: 360  evaluation reward: 1290.0
episode: 9682   score: 1810.0  epsilon: 1.0    steps: 696  evaluation reward: 1299.2
Training network. lr: 0.000033. clip: 0.013379
Iteration 28204: Policy loss: 0.001969. Value loss: 0.157997. Entropy: 0.817521.
Iteration 28205: Policy loss: 0.001336. Value loss: 0.122698. Entropy: 0.823705.
Iteration 28206: Policy loss: 0.000559. Value loss: 0.120900. Entropy: 0.815463.
episode: 9683   score: 1250.0  epsilon: 1.0    steps: 336  evaluation reward: 1299.1
episode: 9684   score: 1270.0  epsilon: 1.0    steps: 680  evaluation reward: 1300.2
Training network. lr: 0.000033. clip: 0.013379
Iteration 28207: Policy loss: 0.000814. Value loss: 0.324058. Entropy: 0.799602.
Iteration 28208: Policy loss: 0.000297. Value loss: 0.258581. Entropy: 0.802589.
Iteration 28209: Policy loss: -0.000206. Value loss: 0.233204. Entropy: 0.80

Iteration 28268: Policy loss: 0.004520. Value loss: 0.630665. Entropy: 1.069426.
Iteration 28269: Policy loss: 0.005571. Value loss: 0.577682. Entropy: 1.074324.
episode: 9703   score: 1490.0  epsilon: 1.0    steps: 800  evaluation reward: 1330.8
Training network. lr: 0.000033. clip: 0.013222
Iteration 28270: Policy loss: 0.005209. Value loss: 0.358796. Entropy: 1.230571.
Iteration 28271: Policy loss: 0.007608. Value loss: 0.212550. Entropy: 1.260048.
Iteration 28272: Policy loss: 0.007444. Value loss: 0.179511. Entropy: 1.255484.
Training network. lr: 0.000033. clip: 0.013222
Iteration 28273: Policy loss: 0.002831. Value loss: 0.350739. Entropy: 1.115778.
Iteration 28274: Policy loss: 0.004924. Value loss: 0.255498. Entropy: 1.126447.
Iteration 28275: Policy loss: 0.003162. Value loss: 0.236080. Entropy: 1.120087.
Training network. lr: 0.000033. clip: 0.013222
Iteration 28276: Policy loss: 0.005040. Value loss: 0.476439. Entropy: 1.274498.
Iteration 28277: Policy loss: 0.004359. Value

Iteration 28334: Policy loss: 0.003575. Value loss: 0.246845. Entropy: 0.902857.
Iteration 28335: Policy loss: 0.003532. Value loss: 0.220185. Entropy: 0.907088.
episode: 9725   score: 2620.0  epsilon: 1.0    steps: 248  evaluation reward: 1287.8
Training network. lr: 0.000033. clip: 0.013065
Iteration 28336: Policy loss: 0.001625. Value loss: 0.350163. Entropy: 0.811041.
Iteration 28337: Policy loss: 0.000948. Value loss: 0.295265. Entropy: 0.804505.
Iteration 28338: Policy loss: 0.000714. Value loss: 0.279853. Entropy: 0.809265.
Training network. lr: 0.000033. clip: 0.013065
Iteration 28339: Policy loss: 0.000990. Value loss: 0.117577. Entropy: 1.120519.
Iteration 28340: Policy loss: 0.001052. Value loss: 0.086008. Entropy: 1.122468.
Iteration 28341: Policy loss: -0.000229. Value loss: 0.075283. Entropy: 1.119256.
episode: 9726   score: 1690.0  epsilon: 1.0    steps: 64  evaluation reward: 1293.0
Training network. lr: 0.000033. clip: 0.013065
Iteration 28342: Policy loss: 0.001442. V

Iteration 28400: Policy loss: 0.005719. Value loss: 0.360472. Entropy: 1.003012.
Iteration 28401: Policy loss: 0.006720. Value loss: 0.271993. Entropy: 1.013279.
Training network. lr: 0.000032. clip: 0.012761
Iteration 28402: Policy loss: 0.002668. Value loss: 0.420951. Entropy: 1.373528.
Iteration 28403: Policy loss: 0.003324. Value loss: 0.251354. Entropy: 1.380831.
Iteration 28404: Policy loss: 0.004200. Value loss: 0.199469. Entropy: 1.384134.
episode: 9747   score: 3800.0  epsilon: 1.0    steps: 648  evaluation reward: 1309.9
episode: 9748   score: 1240.0  epsilon: 1.0    steps: 672  evaluation reward: 1312.2
Training network. lr: 0.000032. clip: 0.012761
Iteration 28405: Policy loss: 0.003951. Value loss: 0.422161. Entropy: 1.404248.
Iteration 28406: Policy loss: 0.004574. Value loss: 0.266075. Entropy: 1.405367.
Iteration 28407: Policy loss: 0.006346. Value loss: 0.215284. Entropy: 1.406403.
Training network. lr: 0.000032. clip: 0.012761
Iteration 28408: Policy loss: 0.000581. V

Iteration 28465: Policy loss: 0.001252. Value loss: 0.324946. Entropy: 1.275986.
Iteration 28466: Policy loss: 0.001176. Value loss: 0.295201. Entropy: 1.279089.
Iteration 28467: Policy loss: 0.001327. Value loss: 0.277316. Entropy: 1.272803.
episode: 9769   score: 1050.0  epsilon: 1.0    steps: 408  evaluation reward: 1325.1
Training network. lr: 0.000032. clip: 0.012605
Iteration 28468: Policy loss: 0.004059. Value loss: 0.550473. Entropy: 1.144508.
Iteration 28469: Policy loss: 0.002596. Value loss: 0.451922. Entropy: 1.148556.
Iteration 28470: Policy loss: 0.004507. Value loss: 0.406951. Entropy: 1.135464.
Training network. lr: 0.000032. clip: 0.012605
Iteration 28471: Policy loss: 0.003377. Value loss: 0.561108. Entropy: 1.180431.
Iteration 28472: Policy loss: 0.006397. Value loss: 0.412230. Entropy: 1.170637.
Iteration 28473: Policy loss: 0.006907. Value loss: 0.353539. Entropy: 1.175076.
Training network. lr: 0.000032. clip: 0.012605
Iteration 28474: Policy loss: 0.001785. Value

Iteration 28533: Policy loss: 0.000215. Value loss: 0.254843. Entropy: 1.075893.
episode: 9789   score: 1390.0  epsilon: 1.0    steps: 832  evaluation reward: 1328.8
Training network. lr: 0.000031. clip: 0.012457
Iteration 28534: Policy loss: 0.002544. Value loss: 0.414058. Entropy: 1.098344.
Iteration 28535: Policy loss: 0.004081. Value loss: 0.304882. Entropy: 1.098981.
Iteration 28536: Policy loss: 0.005909. Value loss: 0.270346. Entropy: 1.096669.
Training network. lr: 0.000031. clip: 0.012457
Iteration 28537: Policy loss: 0.001709. Value loss: 0.788081. Entropy: 0.948476.
Iteration 28538: Policy loss: 0.001869. Value loss: 0.640913. Entropy: 0.947479.
Iteration 28539: Policy loss: 0.003315. Value loss: 0.574950. Entropy: 0.959569.
episode: 9790   score: 1000.0  epsilon: 1.0    steps: 160  evaluation reward: 1325.2
episode: 9791   score: 1500.0  epsilon: 1.0    steps: 336  evaluation reward: 1325.8
episode: 9792   score: 1410.0  epsilon: 1.0    steps: 424  evaluation reward: 1321.4

Iteration 28598: Policy loss: 0.002644. Value loss: 0.531228. Entropy: 1.454694.
Iteration 28599: Policy loss: 0.004589. Value loss: 0.510366. Entropy: 1.452723.
episode: 9811   score: 1800.0  epsilon: 1.0    steps: 40  evaluation reward: 1308.5
Training network. lr: 0.000031. clip: 0.012301
Iteration 28600: Policy loss: 0.003376. Value loss: 0.287763. Entropy: 1.252623.
Iteration 28601: Policy loss: 0.003323. Value loss: 0.252156. Entropy: 1.245176.
Iteration 28602: Policy loss: 0.001675. Value loss: 0.232173. Entropy: 1.247228.
Training network. lr: 0.000030. clip: 0.012144
Iteration 28603: Policy loss: 0.001832. Value loss: 0.276314. Entropy: 1.256411.
Iteration 28604: Policy loss: 0.002411. Value loss: 0.227591. Entropy: 1.252762.
Iteration 28605: Policy loss: 0.001481. Value loss: 0.204397. Entropy: 1.257258.
episode: 9812   score: 980.0  epsilon: 1.0    steps: 544  evaluation reward: 1311.7
Training network. lr: 0.000030. clip: 0.012144
Iteration 28606: Policy loss: 0.002487. Val

Training network. lr: 0.000030. clip: 0.011996
Iteration 28663: Policy loss: 0.002156. Value loss: 0.333116. Entropy: 1.044097.
Iteration 28664: Policy loss: 0.001376. Value loss: 0.261400. Entropy: 1.041617.
Iteration 28665: Policy loss: 0.001788. Value loss: 0.231724. Entropy: 1.036270.
episode: 9834   score: 2410.0  epsilon: 1.0    steps: 920  evaluation reward: 1326.2
Training network. lr: 0.000030. clip: 0.011996
Iteration 28666: Policy loss: 0.004332. Value loss: 0.899280. Entropy: 0.855541.
Iteration 28667: Policy loss: 0.007926. Value loss: 0.737234. Entropy: 0.848780.
Iteration 28668: Policy loss: 0.008995. Value loss: 0.666066. Entropy: 0.843664.
Training network. lr: 0.000030. clip: 0.011996
Iteration 28669: Policy loss: 0.004446. Value loss: 0.445897. Entropy: 1.239020.
Iteration 28670: Policy loss: 0.005473. Value loss: 0.339191. Entropy: 1.252591.
Iteration 28671: Policy loss: 0.004368. Value loss: 0.305042. Entropy: 1.251930.
episode: 9835   score: 1240.0  epsilon: 1.0  

Training network. lr: 0.000030. clip: 0.011840
Iteration 28732: Policy loss: 0.001782. Value loss: 0.556803. Entropy: 1.175680.
Iteration 28733: Policy loss: 0.002624. Value loss: 0.463993. Entropy: 1.170300.
Iteration 28734: Policy loss: 0.003287. Value loss: 0.413197. Entropy: 1.169821.
episode: 9854   score: 1380.0  epsilon: 1.0    steps: 424  evaluation reward: 1326.1
Training network. lr: 0.000030. clip: 0.011840
Iteration 28735: Policy loss: 0.003585. Value loss: 0.121468. Entropy: 1.348856.
Iteration 28736: Policy loss: 0.003943. Value loss: 0.077330. Entropy: 1.343367.
Iteration 28737: Policy loss: 0.002900. Value loss: 0.065849. Entropy: 1.344969.
episode: 9855   score: 1250.0  epsilon: 1.0    steps: 192  evaluation reward: 1327.3
Training network. lr: 0.000030. clip: 0.011840
Iteration 28738: Policy loss: 0.000969. Value loss: 1.307848. Entropy: 1.159587.
Iteration 28739: Policy loss: 0.003868. Value loss: 1.090458. Entropy: 1.166387.
Iteration 28740: Policy loss: 0.004536. V

Iteration 28799: Policy loss: 0.007194. Value loss: 0.306696. Entropy: 1.177918.
Iteration 28800: Policy loss: 0.005683. Value loss: 0.286047. Entropy: 1.184658.
episode: 9874   score: 1930.0  epsilon: 1.0    steps: 536  evaluation reward: 1322.6
episode: 9875   score: 1070.0  epsilon: 1.0    steps: 808  evaluation reward: 1323.0
Training network. lr: 0.000029. clip: 0.011536
Iteration 28801: Policy loss: 0.000962. Value loss: 0.548561. Entropy: 0.933816.
Iteration 28802: Policy loss: 0.002379. Value loss: 0.469259. Entropy: 0.926923.
Iteration 28803: Policy loss: 0.001975. Value loss: 0.440952. Entropy: 0.921933.
Training network. lr: 0.000029. clip: 0.011536
Iteration 28804: Policy loss: 0.001354. Value loss: 0.293068. Entropy: 1.065477.
Iteration 28805: Policy loss: 0.001925. Value loss: 0.210546. Entropy: 1.060219.
Iteration 28806: Policy loss: 0.000930. Value loss: 0.183182. Entropy: 1.074069.
episode: 9876   score: 610.0  epsilon: 1.0    steps: 960  evaluation reward: 1316.1
Trai

Training network. lr: 0.000028. clip: 0.011379
Iteration 28864: Policy loss: 0.000919. Value loss: 0.570155. Entropy: 1.029669.
Iteration 28865: Policy loss: 0.002747. Value loss: 0.539736. Entropy: 1.028017.
Iteration 28866: Policy loss: 0.001817. Value loss: 0.511041. Entropy: 1.034773.
Training network. lr: 0.000028. clip: 0.011379
Iteration 28867: Policy loss: 0.001806. Value loss: 0.237673. Entropy: 0.957589.
Iteration 28868: Policy loss: 0.001496. Value loss: 0.189877. Entropy: 0.955131.
Iteration 28869: Policy loss: 0.001969. Value loss: 0.168416. Entropy: 0.957656.
Training network. lr: 0.000028. clip: 0.011379
Iteration 28870: Policy loss: 0.003603. Value loss: 0.215612. Entropy: 1.088982.
Iteration 28871: Policy loss: 0.004875. Value loss: 0.169752. Entropy: 1.097206.
Iteration 28872: Policy loss: 0.005508. Value loss: 0.149707. Entropy: 1.091104.
episode: 9897   score: 2190.0  epsilon: 1.0    steps: 504  evaluation reward: 1282.3
Training network. lr: 0.000028. clip: 0.01137

episode: 9915   score: 1590.0  epsilon: 1.0    steps: 360  evaluation reward: 1319.7
Training network. lr: 0.000028. clip: 0.011222
Iteration 28933: Policy loss: 0.000238. Value loss: 0.148234. Entropy: 0.976021.
Iteration 28934: Policy loss: 0.000251. Value loss: 0.134724. Entropy: 0.976874.
Iteration 28935: Policy loss: 0.000038. Value loss: 0.120696. Entropy: 0.978295.
episode: 9916   score: 1600.0  epsilon: 1.0    steps: 344  evaluation reward: 1323.9
episode: 9917   score: 1250.0  epsilon: 1.0    steps: 400  evaluation reward: 1322.6
episode: 9918   score: 1040.0  epsilon: 1.0    steps: 920  evaluation reward: 1319.0
Training network. lr: 0.000028. clip: 0.011222
Iteration 28936: Policy loss: 0.003151. Value loss: 0.236694. Entropy: 1.067611.
Iteration 28937: Policy loss: 0.005352. Value loss: 0.150588. Entropy: 1.072702.
Iteration 28938: Policy loss: 0.004216. Value loss: 0.129330. Entropy: 1.073254.
episode: 9919   score: 1750.0  epsilon: 1.0    steps: 24  evaluation reward: 132

episode: 9937   score: 1070.0  epsilon: 1.0    steps: 896  evaluation reward: 1333.6
Training network. lr: 0.000028. clip: 0.011075
Iteration 28999: Policy loss: 0.002229. Value loss: 0.097781. Entropy: 1.324381.
Iteration 29000: Policy loss: 0.000926. Value loss: 0.073747. Entropy: 1.323600.
Iteration 29001: Policy loss: 0.000171. Value loss: 0.064011. Entropy: 1.316148.
episode: 9938   score: 1390.0  epsilon: 1.0    steps: 80  evaluation reward: 1337.7
Training network. lr: 0.000027. clip: 0.010918
Iteration 29002: Policy loss: 0.000447. Value loss: 0.263850. Entropy: 1.100609.
Iteration 29003: Policy loss: 0.000859. Value loss: 0.250816. Entropy: 1.094615.
Iteration 29004: Policy loss: -0.000042. Value loss: 0.240533. Entropy: 1.088058.
episode: 9939   score: 1320.0  epsilon: 1.0    steps: 1008  evaluation reward: 1333.9
Training network. lr: 0.000027. clip: 0.010918
Iteration 29005: Policy loss: 0.002693. Value loss: 0.224466. Entropy: 1.150908.
Iteration 29006: Policy loss: 0.0035

Iteration 29064: Policy loss: 0.002469. Value loss: 0.120126. Entropy: 1.502207.
Training network. lr: 0.000027. clip: 0.010761
Iteration 29065: Policy loss: 0.003130. Value loss: 0.874802. Entropy: 1.329430.
Iteration 29066: Policy loss: 0.007120. Value loss: 0.753907. Entropy: 1.337942.
Iteration 29067: Policy loss: 0.004511. Value loss: 0.701989. Entropy: 1.338047.
episode: 9959   score: 1480.0  epsilon: 1.0    steps: 56  evaluation reward: 1327.2
Training network. lr: 0.000027. clip: 0.010761
Iteration 29068: Policy loss: 0.000732. Value loss: 0.424194. Entropy: 1.283704.
Iteration 29069: Policy loss: 0.001005. Value loss: 0.421429. Entropy: 1.289801.
Iteration 29070: Policy loss: 0.001346. Value loss: 0.410669. Entropy: 1.292961.
episode: 9960   score: 1780.0  epsilon: 1.0    steps: 176  evaluation reward: 1334.2
Training network. lr: 0.000027. clip: 0.010761
Iteration 29071: Policy loss: 0.001159. Value loss: 0.203914. Entropy: 1.196434.
Iteration 29072: Policy loss: 0.002673. Va

Iteration 29130: Policy loss: 0.003065. Value loss: 1.001526. Entropy: 1.318716.
episode: 9981   score: 1120.0  epsilon: 1.0    steps: 216  evaluation reward: 1319.5
episode: 9982   score: 1090.0  epsilon: 1.0    steps: 272  evaluation reward: 1315.5
Training network. lr: 0.000027. clip: 0.010614
Iteration 29131: Policy loss: 0.001354. Value loss: 0.385449. Entropy: 0.743679.
Iteration 29132: Policy loss: 0.001856. Value loss: 0.306932. Entropy: 0.750233.
Iteration 29133: Policy loss: 0.001942. Value loss: 0.305231. Entropy: 0.753174.
Training network. lr: 0.000027. clip: 0.010614
Iteration 29134: Policy loss: 0.003551. Value loss: 0.236488. Entropy: 1.105847.
Iteration 29135: Policy loss: 0.006166. Value loss: 0.172739. Entropy: 1.098663.
Iteration 29136: Policy loss: 0.004687. Value loss: 0.167225. Entropy: 1.100963.
episode: 9983   score: 2410.0  epsilon: 1.0    steps: 320  evaluation reward: 1329.2
Training network. lr: 0.000027. clip: 0.010614
Iteration 29137: Policy loss: 0.00313

Iteration 29197: Policy loss: 0.002642. Value loss: 0.341132. Entropy: 1.072228.
Iteration 29198: Policy loss: 0.004059. Value loss: 0.261667. Entropy: 1.066516.
Iteration 29199: Policy loss: 0.005271. Value loss: 0.235327. Entropy: 1.057461.
now time :  2019-03-06 09:39:13.412487
episode: 10001   score: 1730.0  epsilon: 1.0    steps: 56  evaluation reward: 1349.8
Training network. lr: 0.000026. clip: 0.010457
Iteration 29200: Policy loss: 0.001860. Value loss: 0.324633. Entropy: 1.008717.
Iteration 29201: Policy loss: 0.002421. Value loss: 0.285101. Entropy: 1.012806.
Iteration 29202: Policy loss: 0.002119. Value loss: 0.255138. Entropy: 1.012128.
episode: 10002   score: 1360.0  epsilon: 1.0    steps: 232  evaluation reward: 1351.3
episode: 10003   score: 830.0  epsilon: 1.0    steps: 768  evaluation reward: 1348.2
episode: 10004   score: 1470.0  epsilon: 1.0    steps: 792  evaluation reward: 1341.0
Training network. lr: 0.000026. clip: 0.010301
Iteration 29203: Policy loss: 0.002907.

Training network. lr: 0.000025. clip: 0.010153
Iteration 29260: Policy loss: 0.004340. Value loss: 0.782348. Entropy: 0.923182.
Iteration 29261: Policy loss: 0.003298. Value loss: 0.683401. Entropy: 0.915148.
Iteration 29262: Policy loss: 0.005708. Value loss: 0.630728. Entropy: 0.911763.
Training network. lr: 0.000025. clip: 0.010153
Iteration 29263: Policy loss: 0.002095. Value loss: 0.344078. Entropy: 0.929598.
Iteration 29264: Policy loss: 0.001653. Value loss: 0.278540. Entropy: 0.919958.
Iteration 29265: Policy loss: 0.001098. Value loss: 0.269320. Entropy: 0.920031.
Training network. lr: 0.000025. clip: 0.010153
Iteration 29266: Policy loss: 0.004626. Value loss: 0.358436. Entropy: 1.258291.
Iteration 29267: Policy loss: 0.011945. Value loss: 0.178943. Entropy: 1.281187.
Iteration 29268: Policy loss: 0.010794. Value loss: 0.129759. Entropy: 1.284638.
episode: 10026   score: 1000.0  epsilon: 1.0    steps: 368  evaluation reward: 1298.4
Training network. lr: 0.000025. clip: 0.0101

Iteration 34: Policy loss: 0.011026. Value loss: 0.665660. Entropy: 2.596179.
Iteration 35: Policy loss: 0.000407. Value loss: 0.518858. Entropy: 2.594499.
Iteration 36: Policy loss: -0.010662. Value loss: 0.489563. Entropy: 2.595016.
episode: 5   score: 880.0  epsilon: 1.0    steps: 400  evaluation reward: 542.0
episode: 6   score: 1300.0  epsilon: 1.0    steps: 944  evaluation reward: 668.3333333333334
Training network. lr: 0.000250. clip: 0.100000
Iteration 37: Policy loss: 0.022662. Value loss: 0.820333. Entropy: 2.576780.
Iteration 38: Policy loss: -0.007299. Value loss: 0.491685. Entropy: 2.586882.
Iteration 39: Policy loss: -0.016776. Value loss: 0.423339. Entropy: 2.582516.
episode: 7   score: 1280.0  epsilon: 1.0    steps: 312  evaluation reward: 755.7142857142857
episode: 8   score: 830.0  epsilon: 1.0    steps: 680  evaluation reward: 765.0
episode: 9   score: 1710.0  epsilon: 1.0    steps: 968  evaluation reward: 870.0
Training network. lr: 0.000250. clip: 0.100000
Iteratio

Training network. lr: 0.000250. clip: 0.099853
Iteration 100: Policy loss: 0.011243. Value loss: 1.130985. Entropy: 2.559996.
Iteration 101: Policy loss: -0.001396. Value loss: 0.946785. Entropy: 2.560591.
Iteration 102: Policy loss: -0.010751. Value loss: 0.819851. Entropy: 2.564840.
Training network. lr: 0.000249. clip: 0.099696
Iteration 103: Policy loss: 0.001208. Value loss: 0.772125. Entropy: 2.572895.
Iteration 104: Policy loss: -0.003149. Value loss: 0.621461. Entropy: 2.584393.
Iteration 105: Policy loss: -0.010921. Value loss: 0.545392. Entropy: 2.577695.
episode: 28   score: 1300.0  epsilon: 1.0    steps: 48  evaluation reward: 931.0714285714286
episode: 29   score: 780.0  epsilon: 1.0    steps: 256  evaluation reward: 925.8620689655172
Training network. lr: 0.000249. clip: 0.099696
Iteration 106: Policy loss: 0.003635. Value loss: 1.009362. Entropy: 2.555568.
Iteration 107: Policy loss: -0.011664. Value loss: 0.841085. Entropy: 2.548999.
Iteration 108: Policy loss: -0.01585

Iteration 167: Policy loss: -0.007443. Value loss: 0.429098. Entropy: 2.429536.
Iteration 168: Policy loss: -0.012840. Value loss: 0.273233. Entropy: 2.414375.
Training network. lr: 0.000249. clip: 0.099548
Iteration 169: Policy loss: 0.009887. Value loss: 0.906087. Entropy: 2.427313.
Iteration 170: Policy loss: -0.012767. Value loss: 0.619293. Entropy: 2.417408.
Iteration 171: Policy loss: -0.024152. Value loss: 0.532625. Entropy: 2.406967.
episode: 48   score: 1370.0  epsilon: 1.0    steps: 80  evaluation reward: 978.9583333333334
Training network. lr: 0.000249. clip: 0.099548
Iteration 172: Policy loss: 0.002928. Value loss: 0.914493. Entropy: 2.431994.
Iteration 173: Policy loss: -0.009437. Value loss: 0.637100. Entropy: 2.420759.
Iteration 174: Policy loss: -0.016491. Value loss: 0.513640. Entropy: 2.405357.
Training network. lr: 0.000249. clip: 0.099548
Iteration 175: Policy loss: 0.004458. Value loss: 0.485981. Entropy: 2.424182.
Iteration 176: Policy loss: -0.011405. Value loss

Training network. lr: 0.000248. clip: 0.099392
Iteration 232: Policy loss: 0.005729. Value loss: 0.894145. Entropy: 2.209065.
Iteration 233: Policy loss: -0.006041. Value loss: 0.613481. Entropy: 2.179980.
Iteration 234: Policy loss: -0.015091. Value loss: 0.473302. Entropy: 2.173483.
Training network. lr: 0.000248. clip: 0.099392
Iteration 235: Policy loss: 0.018445. Value loss: 0.792825. Entropy: 2.208738.
Iteration 236: Policy loss: -0.001550. Value loss: 0.502460. Entropy: 2.206210.
Iteration 237: Policy loss: -0.011838. Value loss: 0.350846. Entropy: 2.197402.
Training network. lr: 0.000248. clip: 0.099392
Iteration 238: Policy loss: 0.005438. Value loss: 0.838527. Entropy: 2.345443.
Iteration 239: Policy loss: -0.009799. Value loss: 0.599876. Entropy: 2.344759.
Iteration 240: Policy loss: -0.019725. Value loss: 0.479239. Entropy: 2.332583.
episode: 70   score: 330.0  epsilon: 1.0    steps: 32  evaluation reward: 1010.0
episode: 71   score: 730.0  epsilon: 1.0    steps: 224  evalu

Iteration 300: Policy loss: -0.008025. Value loss: 1.006330. Entropy: 2.255832.
Training network. lr: 0.000248. clip: 0.099088
Iteration 301: Policy loss: 0.013248. Value loss: 1.413328. Entropy: 2.252839.
Iteration 302: Policy loss: 0.004052. Value loss: 1.054974. Entropy: 2.243685.
Iteration 303: Policy loss: -0.006658. Value loss: 0.871249. Entropy: 2.223552.
episode: 89   score: 830.0  epsilon: 1.0    steps: 520  evaluation reward: 1008.314606741573
episode: 90   score: 1880.0  epsilon: 1.0    steps: 584  evaluation reward: 1018.0
Training network. lr: 0.000248. clip: 0.099088
Iteration 304: Policy loss: 0.010403. Value loss: 1.017008. Entropy: 2.296359.
Iteration 305: Policy loss: -0.002659. Value loss: 0.695946. Entropy: 2.279266.
Iteration 306: Policy loss: -0.015429. Value loss: 0.534382. Entropy: 2.268642.
episode: 91   score: 2010.0  epsilon: 1.0    steps: 272  evaluation reward: 1028.901098901099
Training network. lr: 0.000248. clip: 0.099088
Iteration 307: Policy loss: 0.00

episode: 113   score: 2150.0  epsilon: 1.0    steps: 968  evaluation reward: 1031.3
Training network. lr: 0.000247. clip: 0.098931
Iteration 364: Policy loss: 0.011972. Value loss: 0.840771. Entropy: 1.963537.
Iteration 365: Policy loss: -0.000340. Value loss: 0.472808. Entropy: 1.981390.
Iteration 366: Policy loss: -0.014041. Value loss: 0.343523. Entropy: 1.964451.
episode: 114   score: 1300.0  epsilon: 1.0    steps: 240  evaluation reward: 1040.5
Training network. lr: 0.000247. clip: 0.098931
Iteration 367: Policy loss: 0.006054. Value loss: 0.798516. Entropy: 2.039963.
Iteration 368: Policy loss: -0.011092. Value loss: 0.482922. Entropy: 2.034613.
Iteration 369: Policy loss: -0.018388. Value loss: 0.369867. Entropy: 2.015457.
episode: 115   score: 310.0  epsilon: 1.0    steps: 616  evaluation reward: 1026.8
Training network. lr: 0.000247. clip: 0.098931
Iteration 370: Policy loss: 0.005971. Value loss: 0.951906. Entropy: 2.102882.
Iteration 371: Policy loss: -0.006354. Value loss: 

Iteration 429: Policy loss: -0.014861. Value loss: 0.620599. Entropy: 2.047942.
episode: 137   score: 480.0  epsilon: 1.0    steps: 568  evaluation reward: 1005.6
episode: 138   score: 980.0  epsilon: 1.0    steps: 1016  evaluation reward: 1002.4
Training network. lr: 0.000247. clip: 0.098774
Iteration 430: Policy loss: 0.007299. Value loss: 0.822590. Entropy: 1.860503.
Iteration 431: Policy loss: -0.001769. Value loss: 0.516639. Entropy: 1.831660.
Iteration 432: Policy loss: -0.016956. Value loss: 0.369959. Entropy: 1.828955.
Training network. lr: 0.000247. clip: 0.098774
Iteration 433: Policy loss: 0.008423. Value loss: 0.791093. Entropy: 1.916450.
Iteration 434: Policy loss: 0.000048. Value loss: 0.537717. Entropy: 1.914111.
Iteration 435: Policy loss: -0.010738. Value loss: 0.472011. Entropy: 1.903172.
episode: 139   score: 560.0  epsilon: 1.0    steps: 352  evaluation reward: 996.2
Training network. lr: 0.000247. clip: 0.098774
Iteration 436: Policy loss: 0.016585. Value loss: 1.1

episode: 159   score: 1480.0  epsilon: 1.0    steps: 848  evaluation reward: 966.0
Training network. lr: 0.000247. clip: 0.098627
Iteration 496: Policy loss: 0.012782. Value loss: 1.082642. Entropy: 2.039748.
Iteration 497: Policy loss: -0.002375. Value loss: 0.656498. Entropy: 2.029098.
Iteration 498: Policy loss: -0.014594. Value loss: 0.429511. Entropy: 1.983915.
episode: 160   score: 930.0  epsilon: 1.0    steps: 496  evaluation reward: 961.9
Training network. lr: 0.000247. clip: 0.098627
Iteration 499: Policy loss: 0.011980. Value loss: 0.612803. Entropy: 2.057229.
Iteration 500: Policy loss: 0.003345. Value loss: 0.335143. Entropy: 2.048556.
Iteration 501: Policy loss: -0.010566. Value loss: 0.245911. Entropy: 2.028654.
episode: 161   score: 930.0  epsilon: 1.0    steps: 216  evaluation reward: 965.4
episode: 162   score: 460.0  epsilon: 1.0    steps: 592  evaluation reward: 958.2
Training network. lr: 0.000246. clip: 0.098470
Iteration 502: Policy loss: 0.010225. Value loss: 0.7

episode: 183   score: 1180.0  epsilon: 1.0    steps: 888  evaluation reward: 889.9
Training network. lr: 0.000246. clip: 0.098313
Iteration 562: Policy loss: 0.008995. Value loss: 0.956960. Entropy: 1.859862.
Iteration 563: Policy loss: -0.005171. Value loss: 0.666045. Entropy: 1.867691.
Iteration 564: Policy loss: -0.009704. Value loss: 0.530711. Entropy: 1.864572.
episode: 184   score: 1320.0  epsilon: 1.0    steps: 632  evaluation reward: 900.0
Training network. lr: 0.000246. clip: 0.098313
Iteration 565: Policy loss: 0.010050. Value loss: 1.106961. Entropy: 1.904223.
Iteration 566: Policy loss: 0.000711. Value loss: 0.737347. Entropy: 1.922366.
Iteration 567: Policy loss: -0.007443. Value loss: 0.549349. Entropy: 1.872573.
episode: 185   score: 880.0  epsilon: 1.0    steps: 952  evaluation reward: 895.8
Training network. lr: 0.000246. clip: 0.098313
Iteration 568: Policy loss: 0.017329. Value loss: 0.964709. Entropy: 1.903766.
Iteration 569: Policy loss: 0.000998. Value loss: 0.613

episode: 206   score: 630.0  epsilon: 1.0    steps: 224  evaluation reward: 881.9
episode: 207   score: 1080.0  epsilon: 1.0    steps: 480  evaluation reward: 882.9
Training network. lr: 0.000245. clip: 0.098166
Iteration 628: Policy loss: 0.012367. Value loss: 0.737024. Entropy: 1.760926.
Iteration 629: Policy loss: -0.004461. Value loss: 0.388840. Entropy: 1.742775.
Iteration 630: Policy loss: -0.016744. Value loss: 0.302165. Entropy: 1.733630.
episode: 208   score: 1080.0  epsilon: 1.0    steps: 792  evaluation reward: 879.8
Training network. lr: 0.000245. clip: 0.098166
Iteration 631: Policy loss: 0.005581. Value loss: 1.053126. Entropy: 1.953048.
Iteration 632: Policy loss: -0.008418. Value loss: 0.596358. Entropy: 1.947845.
Iteration 633: Policy loss: -0.019187. Value loss: 0.442242. Entropy: 1.921501.
Training network. lr: 0.000245. clip: 0.098166
Iteration 634: Policy loss: 0.013019. Value loss: 1.270392. Entropy: 2.023642.
Iteration 635: Policy loss: -0.000494. Value loss: 0.7

Iteration 696: Policy loss: -0.012856. Value loss: 0.403137. Entropy: 2.081949.
episode: 227   score: 360.0  epsilon: 1.0    steps: 520  evaluation reward: 868.0
Training network. lr: 0.000245. clip: 0.098009
Iteration 697: Policy loss: 0.004584. Value loss: 0.750106. Entropy: 2.045701.
Iteration 698: Policy loss: -0.001190. Value loss: 0.396607. Entropy: 2.048014.
Iteration 699: Policy loss: -0.016692. Value loss: 0.275623. Entropy: 2.060972.
episode: 228   score: 380.0  epsilon: 1.0    steps: 688  evaluation reward: 869.0
Training network. lr: 0.000245. clip: 0.098009
Iteration 700: Policy loss: 0.003297. Value loss: 1.141613. Entropy: 2.133219.
Iteration 701: Policy loss: -0.008865. Value loss: 0.840653. Entropy: 2.110705.
Iteration 702: Policy loss: -0.012824. Value loss: 0.687991. Entropy: 2.112882.
Training network. lr: 0.000245. clip: 0.097853
Iteration 703: Policy loss: 0.010523. Value loss: 0.987410. Entropy: 2.117092.
Iteration 704: Policy loss: -0.003900. Value loss: 0.66089

Iteration 762: Policy loss: -0.019609. Value loss: 0.610046. Entropy: 2.024385.
Training network. lr: 0.000244. clip: 0.097705
Iteration 763: Policy loss: 0.006214. Value loss: 0.958892. Entropy: 2.103537.
Iteration 764: Policy loss: -0.006851. Value loss: 0.525430. Entropy: 2.093300.
Iteration 765: Policy loss: -0.017617. Value loss: 0.341195. Entropy: 2.086866.
Training network. lr: 0.000244. clip: 0.097705
Iteration 766: Policy loss: 0.003438. Value loss: 0.903475. Entropy: 2.199818.
Iteration 767: Policy loss: -0.002910. Value loss: 0.526377. Entropy: 2.208105.
Iteration 768: Policy loss: -0.015788. Value loss: 0.367323. Entropy: 2.193444.
Training network. lr: 0.000244. clip: 0.097705
Iteration 769: Policy loss: 0.005580. Value loss: 1.399826. Entropy: 2.164321.
Iteration 770: Policy loss: -0.003450. Value loss: 0.896324. Entropy: 2.144200.
Iteration 771: Policy loss: -0.012809. Value loss: 0.635836. Entropy: 2.146450.
Training network. lr: 0.000244. clip: 0.097705
Iteration 772: 

Iteration 829: Policy loss: 0.009663. Value loss: 1.126724. Entropy: 2.235371.
Iteration 830: Policy loss: -0.008096. Value loss: 0.636182. Entropy: 2.216624.
Iteration 831: Policy loss: -0.018408. Value loss: 0.456364. Entropy: 2.199176.
Training network. lr: 0.000244. clip: 0.097549
Iteration 832: Policy loss: 0.006550. Value loss: 0.954451. Entropy: 2.207431.
Iteration 833: Policy loss: -0.012363. Value loss: 0.479730. Entropy: 2.173864.
Iteration 834: Policy loss: -0.019975. Value loss: 0.358148. Entropy: 2.165597.
Training network. lr: 0.000244. clip: 0.097549
Iteration 835: Policy loss: 0.008942. Value loss: 1.806534. Entropy: 2.285215.
Iteration 836: Policy loss: -0.006599. Value loss: 1.078977. Entropy: 2.266976.
Iteration 837: Policy loss: -0.014952. Value loss: 0.721755. Entropy: 2.267597.
episode: 273   score: 1280.0  epsilon: 1.0    steps: 448  evaluation reward: 964.6
Training network. lr: 0.000244. clip: 0.097549
Iteration 838: Policy loss: 0.010838. Value loss: 0.976574.

Training network. lr: 0.000243. clip: 0.097392
Iteration 895: Policy loss: 0.004501. Value loss: 0.852715. Entropy: 2.096275.
Iteration 896: Policy loss: -0.000535. Value loss: 0.504544. Entropy: 2.114952.
Iteration 897: Policy loss: -0.020406. Value loss: 0.358659. Entropy: 2.073730.
Training network. lr: 0.000243. clip: 0.097392
Iteration 898: Policy loss: 0.006080. Value loss: 0.869950. Entropy: 2.237394.
Iteration 899: Policy loss: -0.002865. Value loss: 0.467643. Entropy: 2.259945.
Iteration 900: Policy loss: -0.009180. Value loss: 0.348544. Entropy: 2.236467.
episode: 297   score: 880.0  epsilon: 1.0    steps: 664  evaluation reward: 954.0
episode: 298   score: 780.0  epsilon: 1.0    steps: 736  evaluation reward: 958.0
Training network. lr: 0.000243. clip: 0.097244
Iteration 901: Policy loss: 0.007689. Value loss: 1.534644. Entropy: 2.236821.
Iteration 902: Policy loss: -0.004411. Value loss: 1.004448. Entropy: 2.197628.
Iteration 903: Policy loss: -0.010301. Value loss: 0.69088

Iteration 963: Policy loss: -0.010765. Value loss: 0.759285. Entropy: 1.796867.
episode: 318   score: 1440.0  epsilon: 1.0    steps: 48  evaluation reward: 998.1
episode: 319   score: 930.0  epsilon: 1.0    steps: 744  evaluation reward: 1002.6
Training network. lr: 0.000243. clip: 0.097088
Iteration 964: Policy loss: 0.011142. Value loss: 1.187446. Entropy: 2.043563.
Iteration 965: Policy loss: 0.005226. Value loss: 0.615471. Entropy: 2.050294.
Iteration 966: Policy loss: -0.008605. Value loss: 0.405450. Entropy: 2.008450.
Training network. lr: 0.000243. clip: 0.097088
Iteration 967: Policy loss: 0.006817. Value loss: 1.328997. Entropy: 2.054429.
Iteration 968: Policy loss: -0.002382. Value loss: 0.830125. Entropy: 2.065900.
Iteration 969: Policy loss: -0.015155. Value loss: 0.599586. Entropy: 2.048698.
episode: 320   score: 360.0  epsilon: 1.0    steps: 16  evaluation reward: 997.6
Training network. lr: 0.000243. clip: 0.097088
Iteration 970: Policy loss: 0.006109. Value loss: 1.0658

Iteration 1031: Policy loss: -0.007078. Value loss: 0.452240. Entropy: 2.071388.
Iteration 1032: Policy loss: -0.013237. Value loss: 0.306695. Entropy: 2.055629.
episode: 339   score: 980.0  epsilon: 1.0    steps: 16  evaluation reward: 986.8
Training network. lr: 0.000242. clip: 0.096931
Iteration 1033: Policy loss: 0.002786. Value loss: 0.853923. Entropy: 2.084999.
Iteration 1034: Policy loss: -0.010013. Value loss: 0.577840. Entropy: 2.065653.
Iteration 1035: Policy loss: -0.014538. Value loss: 0.460817. Entropy: 2.061064.
Training network. lr: 0.000242. clip: 0.096931
Iteration 1036: Policy loss: 0.015075. Value loss: 0.351827. Entropy: 2.121059.
Iteration 1037: Policy loss: -0.002810. Value loss: 0.138458. Entropy: 2.148746.
Iteration 1038: Policy loss: -0.016161. Value loss: 0.098401. Entropy: 2.139488.
episode: 340   score: 1030.0  epsilon: 1.0    steps: 272  evaluation reward: 983.9
episode: 341   score: 260.0  epsilon: 1.0    steps: 664  evaluation reward: 975.7
Training netwo

Training network. lr: 0.000242. clip: 0.096784
Iteration 1099: Policy loss: 0.010101. Value loss: 1.179235. Entropy: 1.997866.
Iteration 1100: Policy loss: -0.002048. Value loss: 0.771794. Entropy: 1.990937.
Iteration 1101: Policy loss: -0.014855. Value loss: 0.558075. Entropy: 1.949484.
episode: 359   score: 1410.0  epsilon: 1.0    steps: 968  evaluation reward: 972.3
Training network. lr: 0.000242. clip: 0.096627
Iteration 1102: Policy loss: 0.010206. Value loss: 1.113826. Entropy: 2.005069.
Iteration 1103: Policy loss: -0.004086. Value loss: 0.701312. Entropy: 2.008263.
Iteration 1104: Policy loss: -0.016405. Value loss: 0.518520. Entropy: 2.004808.
episode: 360   score: 1300.0  epsilon: 1.0    steps: 552  evaluation reward: 981.0
episode: 361   score: 780.0  epsilon: 1.0    steps: 1024  evaluation reward: 974.1
Training network. lr: 0.000242. clip: 0.096627
Iteration 1105: Policy loss: 0.009913. Value loss: 1.269921. Entropy: 2.108320.
Iteration 1106: Policy loss: -0.003742. Value 

Iteration 1169: Policy loss: -0.005951. Value loss: 0.689336. Entropy: 1.991445.
Iteration 1170: Policy loss: -0.014263. Value loss: 0.476694. Entropy: 1.974808.
episode: 377   score: 1300.0  epsilon: 1.0    steps: 448  evaluation reward: 994.7
Training network. lr: 0.000241. clip: 0.096470
Iteration 1171: Policy loss: 0.005317. Value loss: 1.174083. Entropy: 1.962255.
Iteration 1172: Policy loss: -0.011148. Value loss: 0.740510. Entropy: 1.963988.
Iteration 1173: Policy loss: -0.015521. Value loss: 0.604611. Entropy: 1.968146.
episode: 378   score: 1300.0  epsilon: 1.0    steps: 72  evaluation reward: 994.9
episode: 379   score: 1300.0  epsilon: 1.0    steps: 112  evaluation reward: 1001.6
episode: 380   score: 1710.0  epsilon: 1.0    steps: 200  evaluation reward: 1005.3
Training network. lr: 0.000241. clip: 0.096470
Iteration 1174: Policy loss: 0.005257. Value loss: 1.282754. Entropy: 2.051533.
Iteration 1175: Policy loss: -0.009059. Value loss: 0.852125. Entropy: 2.043375.
Iteratio

Iteration 1231: Policy loss: 0.004750. Value loss: 1.165569. Entropy: 2.014084.
Iteration 1232: Policy loss: -0.014317. Value loss: 0.641489. Entropy: 1.991408.
Iteration 1233: Policy loss: -0.020210. Value loss: 0.421481. Entropy: 1.982430.
Training network. lr: 0.000241. clip: 0.096323
Iteration 1234: Policy loss: 0.015904. Value loss: 2.322085. Entropy: 2.044786.
Iteration 1235: Policy loss: 0.002204. Value loss: 1.519007. Entropy: 2.019469.
Iteration 1236: Policy loss: 0.002523. Value loss: 1.107573. Entropy: 1.999697.
episode: 403   score: 1030.0  epsilon: 1.0    steps: 8  evaluation reward: 1042.9
episode: 404   score: 2060.0  epsilon: 1.0    steps: 112  evaluation reward: 1058.7
Training network. lr: 0.000241. clip: 0.096323
Iteration 1237: Policy loss: 0.008602. Value loss: 1.130473. Entropy: 1.943526.
Iteration 1238: Policy loss: -0.001228. Value loss: 0.617516. Entropy: 1.948721.
Iteration 1239: Policy loss: -0.015138. Value loss: 0.423511. Entropy: 1.941382.
episode: 405   s

Iteration 1298: Policy loss: -0.006996. Value loss: 0.383296. Entropy: 2.062720.
Iteration 1299: Policy loss: -0.011927. Value loss: 0.276431. Entropy: 2.048395.
Training network. lr: 0.000240. clip: 0.096166
Iteration 1300: Policy loss: 0.006479. Value loss: 1.055302. Entropy: 1.928999.
Iteration 1301: Policy loss: -0.012139. Value loss: 0.689085. Entropy: 1.928136.
Iteration 1302: Policy loss: -0.016096. Value loss: 0.492934. Entropy: 1.900928.
episode: 425   score: 1300.0  epsilon: 1.0    steps: 880  evaluation reward: 1064.3
Training network. lr: 0.000240. clip: 0.096009
Iteration 1303: Policy loss: 0.005158. Value loss: 0.767690. Entropy: 1.934460.
Iteration 1304: Policy loss: -0.005839. Value loss: 0.437528. Entropy: 1.917412.
Iteration 1305: Policy loss: -0.018289. Value loss: 0.333177. Entropy: 1.909177.
episode: 426   score: 1300.0  epsilon: 1.0    steps: 136  evaluation reward: 1070.0
Training network. lr: 0.000240. clip: 0.096009
Iteration 1306: Policy loss: 0.004899. Value 

Iteration 1363: Policy loss: 0.011063. Value loss: 1.368801. Entropy: 2.107311.
Iteration 1364: Policy loss: -0.004449. Value loss: 0.895365. Entropy: 2.088603.
Iteration 1365: Policy loss: -0.015180. Value loss: 0.683635. Entropy: 2.076907.
episode: 448   score: 530.0  epsilon: 1.0    steps: 344  evaluation reward: 1087.7
Training network. lr: 0.000240. clip: 0.095862
Iteration 1366: Policy loss: 0.004597. Value loss: 0.762785. Entropy: 2.031893.
Iteration 1367: Policy loss: -0.010589. Value loss: 0.478376. Entropy: 2.054329.
Iteration 1368: Policy loss: -0.015354. Value loss: 0.353051. Entropy: 2.044396.
episode: 449   score: 1180.0  epsilon: 1.0    steps: 576  evaluation reward: 1091.2
Training network. lr: 0.000240. clip: 0.095862
Iteration 1369: Policy loss: 0.003062. Value loss: 0.987054. Entropy: 1.913175.
Iteration 1370: Policy loss: -0.010810. Value loss: 0.670695. Entropy: 1.915915.
Iteration 1371: Policy loss: -0.015155. Value loss: 0.577770. Entropy: 1.909466.
Training netw

Iteration 1428: Policy loss: -0.018326. Value loss: 0.212649. Entropy: 1.957048.
Training network. lr: 0.000239. clip: 0.095705
Iteration 1429: Policy loss: 0.008125. Value loss: 1.000508. Entropy: 2.002627.
Iteration 1430: Policy loss: -0.006959. Value loss: 0.588643. Entropy: 1.990986.
Iteration 1431: Policy loss: -0.018875. Value loss: 0.408295. Entropy: 1.975804.
Training network. lr: 0.000239. clip: 0.095705
Iteration 1432: Policy loss: 0.008670. Value loss: 1.069885. Entropy: 2.083526.
Iteration 1433: Policy loss: -0.005297. Value loss: 0.577948. Entropy: 2.066157.
Iteration 1434: Policy loss: -0.016270. Value loss: 0.468342. Entropy: 2.060836.
Training network. lr: 0.000239. clip: 0.095705
Iteration 1435: Policy loss: 0.005823. Value loss: 0.819674. Entropy: 2.081071.
Iteration 1436: Policy loss: -0.004458. Value loss: 0.435740. Entropy: 2.076365.
Iteration 1437: Policy loss: -0.014973. Value loss: 0.262322. Entropy: 2.057703.
episode: 472   score: 410.0  epsilon: 1.0    steps: 

Iteration 1493: Policy loss: -0.000684. Value loss: 0.860172. Entropy: 1.995349.
Iteration 1494: Policy loss: -0.009530. Value loss: 0.645703. Entropy: 1.969360.
episode: 495   score: 1830.0  epsilon: 1.0    steps: 912  evaluation reward: 1099.6
Training network. lr: 0.000239. clip: 0.095549
Iteration 1495: Policy loss: 0.007575. Value loss: 1.032552. Entropy: 1.866472.
Iteration 1496: Policy loss: -0.009561. Value loss: 0.662551. Entropy: 1.865477.
Iteration 1497: Policy loss: -0.016124. Value loss: 0.534043. Entropy: 1.842553.
episode: 496   score: 1280.0  epsilon: 1.0    steps: 856  evaluation reward: 1102.6
Training network. lr: 0.000239. clip: 0.095549
Iteration 1498: Policy loss: 0.007941. Value loss: 1.624512. Entropy: 1.834620.
Iteration 1499: Policy loss: -0.001002. Value loss: 1.004058. Entropy: 1.813451.
Iteration 1500: Policy loss: -0.004615. Value loss: 0.688758. Entropy: 1.768156.
episode: 497   score: 1560.0  epsilon: 1.0    steps: 200  evaluation reward: 1108.9
Training

Iteration 1561: Policy loss: 0.007469. Value loss: 0.855971. Entropy: 1.761316.
Iteration 1562: Policy loss: -0.005576. Value loss: 0.543709. Entropy: 1.757543.
Iteration 1563: Policy loss: -0.011919. Value loss: 0.417186. Entropy: 1.767284.
episode: 514   score: 1710.0  epsilon: 1.0    steps: 240  evaluation reward: 1100.9
Training network. lr: 0.000238. clip: 0.095245
Iteration 1564: Policy loss: 0.015722. Value loss: 0.855055. Entropy: 1.737658.
Iteration 1565: Policy loss: -0.004084. Value loss: 0.514830. Entropy: 1.731019.
Iteration 1566: Policy loss: -0.015709. Value loss: 0.380860. Entropy: 1.735353.
episode: 515   score: 1180.0  epsilon: 1.0    steps: 640  evaluation reward: 1095.9
Training network. lr: 0.000238. clip: 0.095245
Iteration 1567: Policy loss: 0.008347. Value loss: 0.916906. Entropy: 1.758046.
Iteration 1568: Policy loss: -0.003296. Value loss: 0.461270. Entropy: 1.760056.
Iteration 1569: Policy loss: -0.012475. Value loss: 0.303075. Entropy: 1.767895.
episode: 516

episode: 537   score: 580.0  epsilon: 1.0    steps: 776  evaluation reward: 1085.7
Training network. lr: 0.000238. clip: 0.095088
Iteration 1627: Policy loss: 0.003732. Value loss: 1.801772. Entropy: 1.871022.
Iteration 1628: Policy loss: -0.005554. Value loss: 1.191269. Entropy: 1.865256.
Iteration 1629: Policy loss: -0.013158. Value loss: 0.821950. Entropy: 1.824904.
Training network. lr: 0.000238. clip: 0.095088
Iteration 1630: Policy loss: 0.003790. Value loss: 1.493994. Entropy: 2.027802.
Iteration 1631: Policy loss: -0.014687. Value loss: 0.964589. Entropy: 2.052792.
Iteration 1632: Policy loss: -0.019840. Value loss: 0.710403. Entropy: 2.012336.
episode: 538   score: 630.0  epsilon: 1.0    steps: 8  evaluation reward: 1079.0
Training network. lr: 0.000238. clip: 0.095088
Iteration 1633: Policy loss: 0.003794. Value loss: 1.481630. Entropy: 1.934600.
Iteration 1634: Policy loss: -0.005985. Value loss: 1.020928. Entropy: 1.929313.
Iteration 1635: Policy loss: -0.009841. Value loss

Training network. lr: 0.000237. clip: 0.094940
Iteration 1690: Policy loss: 0.007214. Value loss: 1.015043. Entropy: 1.978325.
Iteration 1691: Policy loss: -0.006189. Value loss: 0.546751. Entropy: 1.978806.
Iteration 1692: Policy loss: -0.020665. Value loss: 0.379100. Entropy: 1.955590.
Training network. lr: 0.000237. clip: 0.094940
Iteration 1693: Policy loss: 0.007412. Value loss: 0.731834. Entropy: 1.987608.
Iteration 1694: Policy loss: -0.005640. Value loss: 0.350352. Entropy: 1.968284.
Iteration 1695: Policy loss: -0.021603. Value loss: 0.236110. Entropy: 1.963313.
episode: 563   score: 630.0  epsilon: 1.0    steps: 152  evaluation reward: 1031.5
episode: 564   score: 1180.0  epsilon: 1.0    steps: 456  evaluation reward: 1030.3
Training network. lr: 0.000237. clip: 0.094940
Iteration 1696: Policy loss: 0.005965. Value loss: 0.852860. Entropy: 1.807530.
Iteration 1697: Policy loss: -0.009606. Value loss: 0.509869. Entropy: 1.777644.
Iteration 1698: Policy loss: -0.014968. Value l

episode: 582   score: 1370.0  epsilon: 1.0    steps: 312  evaluation reward: 1063.2
Training network. lr: 0.000237. clip: 0.094627
Iteration 1759: Policy loss: 0.004200. Value loss: 0.979015. Entropy: 1.861092.
Iteration 1760: Policy loss: -0.006368. Value loss: 0.550432. Entropy: 1.854185.
Iteration 1761: Policy loss: -0.012748. Value loss: 0.403566. Entropy: 1.854095.
episode: 583   score: 1510.0  epsilon: 1.0    steps: 472  evaluation reward: 1073.0
episode: 584   score: 730.0  epsilon: 1.0    steps: 856  evaluation reward: 1072.0
Training network. lr: 0.000237. clip: 0.094627
Iteration 1762: Policy loss: 0.002455. Value loss: 1.458267. Entropy: 1.918173.
Iteration 1763: Policy loss: -0.007544. Value loss: 0.797176. Entropy: 1.894634.
Iteration 1764: Policy loss: -0.016732. Value loss: 0.513578. Entropy: 1.866492.
episode: 585   score: 1030.0  epsilon: 1.0    steps: 1008  evaluation reward: 1073.5
Training network. lr: 0.000237. clip: 0.094627
Iteration 1765: Policy loss: 0.002123. 

episode: 604   score: 1830.0  epsilon: 1.0    steps: 272  evaluation reward: 1081.6
episode: 605   score: 1280.0  epsilon: 1.0    steps: 928  evaluation reward: 1083.6
Training network. lr: 0.000236. clip: 0.094480
Iteration 1825: Policy loss: 0.005067. Value loss: 1.275407. Entropy: 1.757139.
Iteration 1826: Policy loss: -0.004943. Value loss: 0.806270. Entropy: 1.757829.
Iteration 1827: Policy loss: -0.017339. Value loss: 0.606996. Entropy: 1.744381.
Training network. lr: 0.000236. clip: 0.094480
Iteration 1828: Policy loss: 0.010005. Value loss: 1.226771. Entropy: 1.787564.
Iteration 1829: Policy loss: -0.001685. Value loss: 0.663282. Entropy: 1.803539.
Iteration 1830: Policy loss: -0.019749. Value loss: 0.499677. Entropy: 1.791903.
Training network. lr: 0.000236. clip: 0.094480
Iteration 1831: Policy loss: 0.005484. Value loss: 1.297163. Entropy: 1.820880.
Iteration 1832: Policy loss: -0.000936. Value loss: 0.733471. Entropy: 1.810580.
Iteration 1833: Policy loss: -0.015214. Value 

Iteration 1893: Policy loss: -0.016299. Value loss: 0.400098. Entropy: 1.735914.
Training network. lr: 0.000236. clip: 0.094323
Iteration 1894: Policy loss: 0.002057. Value loss: 1.218314. Entropy: 1.651641.
Iteration 1895: Policy loss: -0.005088. Value loss: 0.684062. Entropy: 1.666384.
Iteration 1896: Policy loss: -0.016215. Value loss: 0.464867. Entropy: 1.655304.
episode: 624   score: 1280.0  epsilon: 1.0    steps: 432  evaluation reward: 1174.1
Training network. lr: 0.000236. clip: 0.094323
Iteration 1897: Policy loss: 0.009972. Value loss: 0.897714. Entropy: 1.600377.
Iteration 1898: Policy loss: -0.006012. Value loss: 0.501067. Entropy: 1.610323.
Iteration 1899: Policy loss: -0.016584. Value loss: 0.330753. Entropy: 1.595435.
Training network. lr: 0.000236. clip: 0.094323
Iteration 1900: Policy loss: 0.008307. Value loss: 0.835208. Entropy: 1.745369.
Iteration 1901: Policy loss: -0.006692. Value loss: 0.516933. Entropy: 1.725597.
Iteration 1902: Policy loss: -0.016907. Value los

Training network. lr: 0.000235. clip: 0.094019
Iteration 1960: Policy loss: 0.007875. Value loss: 1.271901. Entropy: 1.861226.
Iteration 1961: Policy loss: -0.009709. Value loss: 0.714231. Entropy: 1.885225.
Iteration 1962: Policy loss: -0.019704. Value loss: 0.521746. Entropy: 1.861070.
Training network. lr: 0.000235. clip: 0.094019
Iteration 1963: Policy loss: 0.008086. Value loss: 1.925723. Entropy: 1.779185.
Iteration 1964: Policy loss: -0.010693. Value loss: 1.266909. Entropy: 1.773887.
Iteration 1965: Policy loss: -0.017781. Value loss: 0.871892. Entropy: 1.751404.
episode: 646   score: 1030.0  epsilon: 1.0    steps: 256  evaluation reward: 1216.3
episode: 647   score: 330.0  epsilon: 1.0    steps: 544  evaluation reward: 1214.8
Training network. lr: 0.000235. clip: 0.094019
Iteration 1966: Policy loss: 0.007624. Value loss: 1.587320. Entropy: 1.771015.
Iteration 1967: Policy loss: -0.011377. Value loss: 1.114941. Entropy: 1.783900.
Iteration 1968: Policy loss: -0.019020. Value l

episode: 668   score: 830.0  epsilon: 1.0    steps: 248  evaluation reward: 1292.2
episode: 669   score: 2250.0  epsilon: 1.0    steps: 336  evaluation reward: 1302.9
Training network. lr: 0.000235. clip: 0.093862
Iteration 2026: Policy loss: 0.006706. Value loss: 1.373258. Entropy: 1.759557.
Iteration 2027: Policy loss: -0.011225. Value loss: 0.827523. Entropy: 1.738273.
Iteration 2028: Policy loss: -0.021142. Value loss: 0.655857. Entropy: 1.704743.
Training network. lr: 0.000235. clip: 0.093862
Iteration 2029: Policy loss: 0.011743. Value loss: 1.596878. Entropy: 1.738602.
Iteration 2030: Policy loss: -0.010290. Value loss: 0.846229. Entropy: 1.732877.
Iteration 2031: Policy loss: -0.020413. Value loss: 0.572102. Entropy: 1.740755.
episode: 670   score: 1480.0  epsilon: 1.0    steps: 112  evaluation reward: 1312.1
Training network. lr: 0.000235. clip: 0.093862
Iteration 2032: Policy loss: 0.008626. Value loss: 1.733146. Entropy: 1.879892.
Iteration 2033: Policy loss: -0.012943. Valu

Iteration 2090: Policy loss: -0.003235. Value loss: 0.690570. Entropy: 1.951916.
Iteration 2091: Policy loss: -0.015978. Value loss: 0.492861. Entropy: 1.935418.
episode: 692   score: 980.0  epsilon: 1.0    steps: 584  evaluation reward: 1308.6
Training network. lr: 0.000234. clip: 0.093705
Iteration 2092: Policy loss: 0.003345. Value loss: 1.331236. Entropy: 1.902830.
Iteration 2093: Policy loss: -0.003238. Value loss: 0.867051. Entropy: 1.894391.
Iteration 2094: Policy loss: -0.016004. Value loss: 0.601462. Entropy: 1.899673.
Training network. lr: 0.000234. clip: 0.093705
Iteration 2095: Policy loss: 0.003205. Value loss: 1.087306. Entropy: 1.808604.
Iteration 2096: Policy loss: -0.004821. Value loss: 0.725876. Entropy: 1.817196.
Iteration 2097: Policy loss: -0.014865. Value loss: 0.603233. Entropy: 1.828180.
Training network. lr: 0.000234. clip: 0.093705
Iteration 2098: Policy loss: 0.004768. Value loss: 0.675459. Entropy: 1.630267.
Iteration 2099: Policy loss: -0.009566. Value loss

Iteration 2156: Policy loss: -0.014438. Value loss: 0.703230. Entropy: 1.871996.
Iteration 2157: Policy loss: -0.024194. Value loss: 0.465288. Entropy: 1.857462.
episode: 714   score: 980.0  epsilon: 1.0    steps: 488  evaluation reward: 1300.0
Training network. lr: 0.000234. clip: 0.093401
Iteration 2158: Policy loss: 0.010016. Value loss: 1.584328. Entropy: 1.825607.
Iteration 2159: Policy loss: -0.002330. Value loss: 0.939352. Entropy: 1.785406.
Iteration 2160: Policy loss: -0.004532. Value loss: 0.635723. Entropy: 1.784940.
Training network. lr: 0.000234. clip: 0.093401
Iteration 2161: Policy loss: 0.008866. Value loss: 1.197360. Entropy: 1.738959.
Iteration 2162: Policy loss: -0.004929. Value loss: 0.658983. Entropy: 1.744292.
Iteration 2163: Policy loss: -0.015252. Value loss: 0.484131. Entropy: 1.728911.
Training network. lr: 0.000234. clip: 0.093401
Iteration 2164: Policy loss: 0.010199. Value loss: 1.296578. Entropy: 1.805900.
Iteration 2165: Policy loss: -0.008866. Value loss

episode: 734   score: 2090.0  epsilon: 1.0    steps: 472  evaluation reward: 1331.4
Training network. lr: 0.000233. clip: 0.093245
Iteration 2224: Policy loss: 0.008318. Value loss: 1.533051. Entropy: 1.788364.
Iteration 2225: Policy loss: -0.002559. Value loss: 0.921252. Entropy: 1.802448.
Iteration 2226: Policy loss: -0.011022. Value loss: 0.632154. Entropy: 1.789369.
Training network. lr: 0.000233. clip: 0.093245
Iteration 2227: Policy loss: 0.009202. Value loss: 1.140455. Entropy: 1.795181.
Iteration 2228: Policy loss: -0.002627. Value loss: 0.782772. Entropy: 1.790598.
Iteration 2229: Policy loss: -0.014763. Value loss: 0.601296. Entropy: 1.781431.
episode: 735   score: 980.0  epsilon: 1.0    steps: 752  evaluation reward: 1334.9
Training network. lr: 0.000233. clip: 0.093245
Iteration 2230: Policy loss: 0.008166. Value loss: 1.570788. Entropy: 1.786585.
Iteration 2231: Policy loss: -0.003638. Value loss: 0.984048. Entropy: 1.800693.
Iteration 2232: Policy loss: -0.012299. Value l

Training network. lr: 0.000233. clip: 0.093097
Iteration 2290: Policy loss: 0.003073. Value loss: 0.614361. Entropy: 1.551979.
Iteration 2291: Policy loss: -0.006097. Value loss: 0.393943. Entropy: 1.594355.
Iteration 2292: Policy loss: -0.012281. Value loss: 0.295496. Entropy: 1.604219.
Training network. lr: 0.000233. clip: 0.093097
Iteration 2293: Policy loss: 0.004547. Value loss: 1.060548. Entropy: 1.666067.
Iteration 2294: Policy loss: -0.001360. Value loss: 0.818209. Entropy: 1.647937.
Iteration 2295: Policy loss: -0.006104. Value loss: 0.677879. Entropy: 1.629937.
Training network. lr: 0.000233. clip: 0.093097
Iteration 2296: Policy loss: 0.005936. Value loss: 0.988599. Entropy: 1.659256.
Iteration 2297: Policy loss: -0.004938. Value loss: 0.650745. Entropy: 1.643673.
Iteration 2298: Policy loss: -0.014538. Value loss: 0.516229. Entropy: 1.652718.
episode: 756   score: 1510.0  epsilon: 1.0    steps: 560  evaluation reward: 1297.2
Training network. lr: 0.000233. clip: 0.093097
It

Training network. lr: 0.000232. clip: 0.092784
Iteration 2359: Policy loss: 0.004771. Value loss: 1.163172. Entropy: 2.028862.
Iteration 2360: Policy loss: -0.001376. Value loss: 0.714121. Entropy: 2.015215.
Iteration 2361: Policy loss: -0.007909. Value loss: 0.540956. Entropy: 2.022022.
Training network. lr: 0.000232. clip: 0.092784
Iteration 2362: Policy loss: 0.007316. Value loss: 0.963350. Entropy: 2.037022.
Iteration 2363: Policy loss: -0.006063. Value loss: 0.598837. Entropy: 2.052998.
Iteration 2364: Policy loss: -0.016006. Value loss: 0.447999. Entropy: 2.036863.
episode: 775   score: 880.0  epsilon: 1.0    steps: 888  evaluation reward: 1303.4
episode: 776   score: 1180.0  epsilon: 1.0    steps: 968  evaluation reward: 1297.9
Training network. lr: 0.000232. clip: 0.092784
Iteration 2365: Policy loss: 0.007567. Value loss: 0.922762. Entropy: 2.007625.
Iteration 2366: Policy loss: -0.009344. Value loss: 0.600434. Entropy: 1.986312.
Iteration 2367: Policy loss: -0.014021. Value l

Iteration 2422: Policy loss: 0.006573. Value loss: 0.637413. Entropy: 1.955491.
Iteration 2423: Policy loss: -0.010236. Value loss: 0.417050. Entropy: 1.950418.
Iteration 2424: Policy loss: -0.015250. Value loss: 0.316944. Entropy: 1.939563.
episode: 800   score: 330.0  epsilon: 1.0    steps: 952  evaluation reward: 1240.8
Training network. lr: 0.000232. clip: 0.092636
Iteration 2425: Policy loss: 0.006057. Value loss: 1.223965. Entropy: 1.841746.
Iteration 2426: Policy loss: -0.005304. Value loss: 0.714875. Entropy: 1.865250.
Iteration 2427: Policy loss: -0.011387. Value loss: 0.500133. Entropy: 1.846506.
Training network. lr: 0.000232. clip: 0.092636
Iteration 2428: Policy loss: 0.012275. Value loss: 0.948705. Entropy: 1.920004.
Iteration 2429: Policy loss: -0.004129. Value loss: 0.496557. Entropy: 1.920271.
Iteration 2430: Policy loss: -0.009198. Value loss: 0.337852. Entropy: 1.896051.
now time :  2019-03-06 10:13:34.491256
episode: 801   score: 880.0  epsilon: 1.0    steps: 872  e

Iteration 2488: Policy loss: 0.004810. Value loss: 1.264276. Entropy: 1.842662.
Iteration 2489: Policy loss: -0.008529. Value loss: 0.671532. Entropy: 1.833784.
Iteration 2490: Policy loss: -0.014261. Value loss: 0.382276. Entropy: 1.808577.
episode: 822   score: 330.0  epsilon: 1.0    steps: 176  evaluation reward: 1153.9
Training network. lr: 0.000231. clip: 0.092480
Iteration 2491: Policy loss: 0.004463. Value loss: 1.285311. Entropy: 1.919474.
Iteration 2492: Policy loss: -0.007121. Value loss: 0.786633. Entropy: 1.944017.
Iteration 2493: Policy loss: -0.016621. Value loss: 0.595257. Entropy: 1.937463.
episode: 823   score: 980.0  epsilon: 1.0    steps: 192  evaluation reward: 1150.0
Training network. lr: 0.000231. clip: 0.092480
Iteration 2494: Policy loss: 0.009298. Value loss: 0.766132. Entropy: 1.951601.
Iteration 2495: Policy loss: -0.006935. Value loss: 0.393692. Entropy: 1.955637.
Iteration 2496: Policy loss: -0.010084. Value loss: 0.276322. Entropy: 1.940038.
episode: 824  

Iteration 2551: Policy loss: 0.004882. Value loss: 1.232316. Entropy: 1.733489.
Iteration 2552: Policy loss: -0.009849. Value loss: 0.806930. Entropy: 1.722911.
Iteration 2553: Policy loss: -0.018274. Value loss: 0.593322. Entropy: 1.685673.
Training network. lr: 0.000230. clip: 0.092176
Iteration 2554: Policy loss: 0.003205. Value loss: 0.981129. Entropy: 1.902485.
Iteration 2555: Policy loss: -0.010125. Value loss: 0.624384. Entropy: 1.915142.
Iteration 2556: Policy loss: -0.024321. Value loss: 0.473047. Entropy: 1.914829.
episode: 850   score: 830.0  epsilon: 1.0    steps: 112  evaluation reward: 1096.7
Training network. lr: 0.000230. clip: 0.092176
Iteration 2557: Policy loss: 0.004850. Value loss: 1.500342. Entropy: 1.849334.
Iteration 2558: Policy loss: -0.008743. Value loss: 0.993625. Entropy: 1.809986.
Iteration 2559: Policy loss: -0.015071. Value loss: 0.725501. Entropy: 1.796115.
Training network. lr: 0.000230. clip: 0.092176
Iteration 2560: Policy loss: 0.010255. Value loss:

Iteration 2616: Policy loss: -0.015920. Value loss: 0.395301. Entropy: 1.832796.
episode: 874   score: 1410.0  epsilon: 1.0    steps: 240  evaluation reward: 1015.3
episode: 875   score: 930.0  epsilon: 1.0    steps: 376  evaluation reward: 1015.8
Training network. lr: 0.000230. clip: 0.092019
Iteration 2617: Policy loss: 0.003003. Value loss: 1.001283. Entropy: 1.857466.
Iteration 2618: Policy loss: -0.004693. Value loss: 0.618062. Entropy: 1.858148.
Iteration 2619: Policy loss: -0.015295. Value loss: 0.454646. Entropy: 1.867056.
Training network. lr: 0.000230. clip: 0.092019
Iteration 2620: Policy loss: 0.004940. Value loss: 0.948731. Entropy: 1.913137.
Iteration 2621: Policy loss: -0.008619. Value loss: 0.599385. Entropy: 1.907413.
Iteration 2622: Policy loss: -0.017137. Value loss: 0.424141. Entropy: 1.890123.
episode: 876   score: 480.0  epsilon: 1.0    steps: 208  evaluation reward: 1008.8
Training network. lr: 0.000230. clip: 0.092019
Iteration 2623: Policy loss: 0.005677. Value

Iteration 2684: Policy loss: -0.004326. Value loss: 0.751724. Entropy: 1.831602.
Iteration 2685: Policy loss: -0.017594. Value loss: 0.474565. Entropy: 1.826077.
episode: 894   score: 630.0  epsilon: 1.0    steps: 616  evaluation reward: 1030.5
Training network. lr: 0.000230. clip: 0.091862
Iteration 2686: Policy loss: 0.002892. Value loss: 1.176176. Entropy: 1.883049.
Iteration 2687: Policy loss: -0.011114. Value loss: 0.606791. Entropy: 1.897581.
Iteration 2688: Policy loss: -0.016718. Value loss: 0.370229. Entropy: 1.867187.
episode: 895   score: 630.0  epsilon: 1.0    steps: 568  evaluation reward: 1029.0
Training network. lr: 0.000230. clip: 0.091862
Iteration 2689: Policy loss: 0.003065. Value loss: 1.160423. Entropy: 1.771877.
Iteration 2690: Policy loss: -0.005431. Value loss: 0.714559. Entropy: 1.792570.
Iteration 2691: Policy loss: -0.013795. Value loss: 0.492275. Entropy: 1.745696.
episode: 896   score: 1030.0  epsilon: 1.0    steps: 448  evaluation reward: 1030.0
episode: 8

Training network. lr: 0.000229. clip: 0.091715
Iteration 2749: Policy loss: 0.005714. Value loss: 0.533551. Entropy: 1.710737.
Iteration 2750: Policy loss: -0.008385. Value loss: 0.271969. Entropy: 1.714170.
Iteration 2751: Policy loss: -0.024750. Value loss: 0.179245. Entropy: 1.703893.
Training network. lr: 0.000229. clip: 0.091558
Iteration 2752: Policy loss: 0.008230. Value loss: 0.822046. Entropy: 1.697299.
Iteration 2753: Policy loss: -0.007542. Value loss: 0.464911. Entropy: 1.689687.
Iteration 2754: Policy loss: -0.020707. Value loss: 0.350067. Entropy: 1.675009.
episode: 917   score: 780.0  epsilon: 1.0    steps: 616  evaluation reward: 1027.8
Training network. lr: 0.000229. clip: 0.091558
Iteration 2755: Policy loss: 0.008399. Value loss: 0.781297. Entropy: 1.774669.
Iteration 2756: Policy loss: -0.014079. Value loss: 0.382688. Entropy: 1.749581.
Iteration 2757: Policy loss: -0.021816. Value loss: 0.264867. Entropy: 1.761640.
Training network. lr: 0.000229. clip: 0.091558
Ite

episode: 936   score: 380.0  epsilon: 1.0    steps: 464  evaluation reward: 997.0
Training network. lr: 0.000229. clip: 0.091401
Iteration 2818: Policy loss: 0.015626. Value loss: 0.617438. Entropy: 1.551409.
Iteration 2819: Policy loss: -0.000741. Value loss: 0.363437. Entropy: 1.514593.
Iteration 2820: Policy loss: -0.014036. Value loss: 0.275384. Entropy: 1.515723.
episode: 937   score: 1300.0  epsilon: 1.0    steps: 216  evaluation reward: 1002.2
Training network. lr: 0.000229. clip: 0.091401
Iteration 2821: Policy loss: 0.004557. Value loss: 1.062343. Entropy: 1.486826.
Iteration 2822: Policy loss: -0.011193. Value loss: 0.697460. Entropy: 1.484880.
Iteration 2823: Policy loss: -0.020243. Value loss: 0.548670. Entropy: 1.494642.
episode: 938   score: 1930.0  epsilon: 1.0    steps: 8  evaluation reward: 1011.7
episode: 939   score: 1300.0  epsilon: 1.0    steps: 576  evaluation reward: 1014.9
Training network. lr: 0.000229. clip: 0.091401
Iteration 2824: Policy loss: 0.005907. Valu

episode: 958   score: 930.0  epsilon: 1.0    steps: 984  evaluation reward: 1043.1
Training network. lr: 0.000228. clip: 0.091254
Iteration 2884: Policy loss: 0.008295. Value loss: 0.514897. Entropy: 1.686859.
Iteration 2885: Policy loss: -0.006911. Value loss: 0.275629. Entropy: 1.677344.
Iteration 2886: Policy loss: -0.016377. Value loss: 0.214910. Entropy: 1.675425.
Training network. lr: 0.000228. clip: 0.091254
Iteration 2887: Policy loss: 0.006053. Value loss: 0.929181. Entropy: 1.701761.
Iteration 2888: Policy loss: -0.014787. Value loss: 0.595941. Entropy: 1.696517.
Iteration 2889: Policy loss: -0.020640. Value loss: 0.431860. Entropy: 1.683156.
episode: 959   score: 1300.0  epsilon: 1.0    steps: 472  evaluation reward: 1043.3
episode: 960   score: 280.0  epsilon: 1.0    steps: 608  evaluation reward: 1035.8
episode: 961   score: 1410.0  epsilon: 1.0    steps: 904  evaluation reward: 1040.1
Training network. lr: 0.000228. clip: 0.091254
Iteration 2890: Policy loss: 0.005821. Va

Iteration 2949: Policy loss: -0.016589. Value loss: 0.363516. Entropy: 1.779320.
Training network. lr: 0.000228. clip: 0.091097
Iteration 2950: Policy loss: 0.008545. Value loss: 0.640598. Entropy: 1.803750.
Iteration 2951: Policy loss: -0.010129. Value loss: 0.432708. Entropy: 1.797583.
Iteration 2952: Policy loss: -0.017301. Value loss: 0.321963. Entropy: 1.784160.
Training network. lr: 0.000227. clip: 0.090941
Iteration 2953: Policy loss: 0.002981. Value loss: 0.793484. Entropy: 1.838595.
Iteration 2954: Policy loss: -0.006389. Value loss: 0.496540. Entropy: 1.826981.
Iteration 2955: Policy loss: -0.015970. Value loss: 0.357818. Entropy: 1.811800.
episode: 981   score: 1300.0  epsilon: 1.0    steps: 880  evaluation reward: 1032.6
Training network. lr: 0.000227. clip: 0.090941
Iteration 2956: Policy loss: 0.005538. Value loss: 0.965123. Entropy: 1.748435.
Iteration 2957: Policy loss: -0.011021. Value loss: 0.528103. Entropy: 1.734436.
Iteration 2958: Policy loss: -0.019985. Value los

Training network. lr: 0.000227. clip: 0.090793
Iteration 3016: Policy loss: 0.004355. Value loss: 1.243534. Entropy: 1.591321.
Iteration 3017: Policy loss: -0.014646. Value loss: 0.784245. Entropy: 1.564294.
Iteration 3018: Policy loss: -0.021481. Value loss: 0.592892. Entropy: 1.558101.
Training network. lr: 0.000227. clip: 0.090793
Iteration 3019: Policy loss: 0.007293. Value loss: 1.046602. Entropy: 1.705493.
Iteration 3020: Policy loss: -0.004441. Value loss: 0.611971. Entropy: 1.718139.
Iteration 3021: Policy loss: -0.014175. Value loss: 0.413959. Entropy: 1.720810.
episode: 1002   score: 1300.0  epsilon: 1.0    steps: 8  evaluation reward: 1013.7
Training network. lr: 0.000227. clip: 0.090793
Iteration 3022: Policy loss: 0.010125. Value loss: 1.786621. Entropy: 1.791705.
Iteration 3023: Policy loss: -0.007996. Value loss: 1.223775. Entropy: 1.766009.
Iteration 3024: Policy loss: -0.013024. Value loss: 0.959345. Entropy: 1.744656.
episode: 1003   score: 1180.0  epsilon: 1.0    ste

Iteration 3083: Policy loss: -0.009605. Value loss: 0.523016. Entropy: 1.936320.
Iteration 3084: Policy loss: -0.018986. Value loss: 0.369387. Entropy: 1.923361.
episode: 1023   score: 930.0  epsilon: 1.0    steps: 16  evaluation reward: 1013.9
Training network. lr: 0.000227. clip: 0.090637
Iteration 3085: Policy loss: 0.011910. Value loss: 0.840316. Entropy: 1.817059.
Iteration 3086: Policy loss: -0.003386. Value loss: 0.447360. Entropy: 1.797743.
Iteration 3087: Policy loss: -0.013723. Value loss: 0.287722. Entropy: 1.785053.
episode: 1024   score: 930.0  epsilon: 1.0    steps: 328  evaluation reward: 1010.2
Training network. lr: 0.000227. clip: 0.090637
Iteration 3088: Policy loss: 0.003024. Value loss: 0.678391. Entropy: 1.777746.
Iteration 3089: Policy loss: -0.007175. Value loss: 0.361563. Entropy: 1.779606.
Iteration 3090: Policy loss: -0.014608. Value loss: 0.236483. Entropy: 1.770788.
episode: 1025   score: 230.0  epsilon: 1.0    steps: 320  evaluation reward: 1002.7
Training 

Iteration 3151: Policy loss: 0.006928. Value loss: 0.752923. Entropy: 1.703918.
Iteration 3152: Policy loss: -0.008288. Value loss: 0.391243. Entropy: 1.689329.
Iteration 3153: Policy loss: -0.018271. Value loss: 0.282223. Entropy: 1.691026.
Training network. lr: 0.000226. clip: 0.090332
Iteration 3154: Policy loss: 0.007048. Value loss: 0.616417. Entropy: 1.517112.
Iteration 3155: Policy loss: -0.008331. Value loss: 0.363651. Entropy: 1.507187.
Iteration 3156: Policy loss: -0.017092. Value loss: 0.286162. Entropy: 1.504969.
Training network. lr: 0.000226. clip: 0.090332
Iteration 3157: Policy loss: 0.009951. Value loss: 1.092207. Entropy: 1.629149.
Iteration 3158: Policy loss: -0.006338. Value loss: 0.762425. Entropy: 1.624165.
Iteration 3159: Policy loss: -0.014538. Value loss: 0.587779. Entropy: 1.608564.
episode: 1044   score: 1350.0  epsilon: 1.0    steps: 104  evaluation reward: 1024.5
episode: 1045   score: 1180.0  epsilon: 1.0    steps: 1016  evaluation reward: 1027.0
Training 

Iteration 3218: Policy loss: -0.012579. Value loss: 0.346973. Entropy: 1.937361.
Iteration 3219: Policy loss: -0.023657. Value loss: 0.248739. Entropy: 1.920328.
episode: 1065   score: 830.0  epsilon: 1.0    steps: 576  evaluation reward: 1017.7
Training network. lr: 0.000225. clip: 0.090176
Iteration 3220: Policy loss: 0.002284. Value loss: 1.226765. Entropy: 1.893436.
Iteration 3221: Policy loss: -0.007522. Value loss: 0.860834. Entropy: 1.885699.
Iteration 3222: Policy loss: -0.015878. Value loss: 0.671891. Entropy: 1.892282.
episode: 1066   score: 880.0  epsilon: 1.0    steps: 8  evaluation reward: 1012.3
Training network. lr: 0.000225. clip: 0.090176
Iteration 3223: Policy loss: 0.007236. Value loss: 0.794591. Entropy: 1.761542.
Iteration 3224: Policy loss: -0.007193. Value loss: 0.502087. Entropy: 1.763014.
Iteration 3225: Policy loss: -0.014977. Value loss: 0.353475. Entropy: 1.749854.
episode: 1067   score: 680.0  epsilon: 1.0    steps: 896  evaluation reward: 1005.9
Training n

episode: 1085   score: 1350.0  epsilon: 1.0    steps: 1016  evaluation reward: 1049.4
Training network. lr: 0.000225. clip: 0.090019
Iteration 3286: Policy loss: 0.008557. Value loss: 0.877500. Entropy: 1.785775.
Iteration 3287: Policy loss: -0.009624. Value loss: 0.502892. Entropy: 1.819782.
Iteration 3288: Policy loss: -0.020476. Value loss: 0.363863. Entropy: 1.804592.
Training network. lr: 0.000225. clip: 0.090019
Iteration 3289: Policy loss: 0.007090. Value loss: 0.991687. Entropy: 1.925220.
Iteration 3290: Policy loss: -0.010271. Value loss: 0.611758. Entropy: 1.940689.
Iteration 3291: Policy loss: -0.022539. Value loss: 0.438524. Entropy: 1.916296.
episode: 1086   score: 980.0  epsilon: 1.0    steps: 464  evaluation reward: 1048.9
Training network. lr: 0.000225. clip: 0.090019
Iteration 3292: Policy loss: 0.001761. Value loss: 0.984336. Entropy: 2.016214.
Iteration 3293: Policy loss: -0.010853. Value loss: 0.529007. Entropy: 2.008651.
Iteration 3294: Policy loss: -0.020430. Valu

Iteration 3354: Policy loss: -0.020656. Value loss: 0.419380. Entropy: 1.925250.
Training network. lr: 0.000224. clip: 0.089715
Iteration 3355: Policy loss: 0.006257. Value loss: 0.658104. Entropy: 1.901516.
Iteration 3356: Policy loss: -0.006817. Value loss: 0.348216. Entropy: 1.911151.
Iteration 3357: Policy loss: -0.022355. Value loss: 0.239734. Entropy: 1.899117.
episode: 1104   score: 1440.0  epsilon: 1.0    steps: 888  evaluation reward: 1063.0
episode: 1105   score: 980.0  epsilon: 1.0    steps: 1008  evaluation reward: 1048.3
Training network. lr: 0.000224. clip: 0.089715
Iteration 3358: Policy loss: 0.005384. Value loss: 0.751769. Entropy: 1.978898.
Iteration 3359: Policy loss: -0.008848. Value loss: 0.384932. Entropy: 1.971395.
Iteration 3360: Policy loss: -0.020970. Value loss: 0.247613. Entropy: 1.961830.
Training network. lr: 0.000224. clip: 0.089715
Iteration 3361: Policy loss: 0.005632. Value loss: 0.614231. Entropy: 1.788510.
Iteration 3362: Policy loss: -0.012009. Valu

Training network. lr: 0.000224. clip: 0.089558
Iteration 3421: Policy loss: 0.007583. Value loss: 1.352715. Entropy: 1.840626.
Iteration 3422: Policy loss: -0.001829. Value loss: 0.934664. Entropy: 1.848739.
Iteration 3423: Policy loss: -0.012294. Value loss: 0.663131. Entropy: 1.827671.
Training network. lr: 0.000224. clip: 0.089558
Iteration 3424: Policy loss: 0.008465. Value loss: 0.521512. Entropy: 1.860794.
Iteration 3425: Policy loss: -0.005596. Value loss: 0.288282. Entropy: 1.867465.
Iteration 3426: Policy loss: -0.018558. Value loss: 0.231658. Entropy: 1.846358.
episode: 1125   score: 1750.0  epsilon: 1.0    steps: 168  evaluation reward: 1082.7
Training network. lr: 0.000224. clip: 0.089558
Iteration 3427: Policy loss: 0.009494. Value loss: 0.723151. Entropy: 1.935400.
Iteration 3428: Policy loss: -0.003570. Value loss: 0.449576. Entropy: 1.926922.
Iteration 3429: Policy loss: -0.016313. Value loss: 0.327407. Entropy: 1.909006.
Training network. lr: 0.000224. clip: 0.089558
I

episode: 1144   score: 330.0  epsilon: 1.0    steps: 448  evaluation reward: 1059.8
Training network. lr: 0.000224. clip: 0.089411
Iteration 3490: Policy loss: 0.014939. Value loss: 1.172163. Entropy: 1.736164.
Iteration 3491: Policy loss: -0.003046. Value loss: 0.645324. Entropy: 1.715444.
Iteration 3492: Policy loss: -0.015128. Value loss: 0.430931. Entropy: 1.709199.
Training network. lr: 0.000224. clip: 0.089411
Iteration 3493: Policy loss: 0.007757. Value loss: 1.164855. Entropy: 1.817560.
Iteration 3494: Policy loss: -0.005428. Value loss: 0.704618. Entropy: 1.817700.
Iteration 3495: Policy loss: -0.017141. Value loss: 0.478520. Entropy: 1.805968.
Training network. lr: 0.000224. clip: 0.089411
Iteration 3496: Policy loss: 0.009972. Value loss: 1.311292. Entropy: 1.893290.
Iteration 3497: Policy loss: -0.006423. Value loss: 0.743846. Entropy: 1.861639.
Iteration 3498: Policy loss: -0.016008. Value loss: 0.455100. Entropy: 1.854235.
episode: 1145   score: 780.0  epsilon: 1.0    ste

Iteration 3557: Policy loss: -0.003713. Value loss: 0.441588. Entropy: 1.546989.
Iteration 3558: Policy loss: -0.013359. Value loss: 0.294907. Entropy: 1.538999.
episode: 1164   score: 1490.0  epsilon: 1.0    steps: 432  evaluation reward: 1080.0
Training network. lr: 0.000223. clip: 0.089097
Iteration 3559: Policy loss: 0.011862. Value loss: 1.176932. Entropy: 1.676333.
Iteration 3560: Policy loss: -0.003811. Value loss: 0.572654. Entropy: 1.649327.
Iteration 3561: Policy loss: -0.013016. Value loss: 0.354804. Entropy: 1.614113.
Training network. lr: 0.000223. clip: 0.089097
Iteration 3562: Policy loss: 0.009144. Value loss: 1.163977. Entropy: 1.857028.
Iteration 3563: Policy loss: -0.006131. Value loss: 0.600089. Entropy: 1.841904.
Iteration 3564: Policy loss: -0.016523. Value loss: 0.406267. Entropy: 1.830601.
Training network. lr: 0.000223. clip: 0.089097
Iteration 3565: Policy loss: 0.007182. Value loss: 1.562748. Entropy: 1.763397.
Iteration 3566: Policy loss: -0.002235. Value lo

Iteration 3627: Policy loss: -0.017863. Value loss: 0.488045. Entropy: 1.797019.
episode: 1182   score: 1360.0  epsilon: 1.0    steps: 888  evaluation reward: 1104.9
Training network. lr: 0.000222. clip: 0.088950
Iteration 3628: Policy loss: 0.010814. Value loss: 1.070831. Entropy: 1.746738.
Iteration 3629: Policy loss: 0.000609. Value loss: 0.714611. Entropy: 1.749640.
Iteration 3630: Policy loss: -0.013704. Value loss: 0.483945. Entropy: 1.746520.
episode: 1183   score: 1360.0  epsilon: 1.0    steps: 680  evaluation reward: 1106.7
episode: 1184   score: 680.0  epsilon: 1.0    steps: 880  evaluation reward: 1107.2
Training network. lr: 0.000222. clip: 0.088950
Iteration 3631: Policy loss: 0.006643. Value loss: 0.624464. Entropy: 1.834286.
Iteration 3632: Policy loss: -0.008047. Value loss: 0.355680. Entropy: 1.847765.
Iteration 3633: Policy loss: -0.020117. Value loss: 0.258232. Entropy: 1.835912.
episode: 1185   score: 830.0  epsilon: 1.0    steps: 928  evaluation reward: 1102.0
Trai

episode: 1203   score: 780.0  epsilon: 1.0    steps: 200  evaluation reward: 1074.3
episode: 1204   score: 480.0  epsilon: 1.0    steps: 632  evaluation reward: 1064.7
Training network. lr: 0.000222. clip: 0.088793
Iteration 3694: Policy loss: 0.006885. Value loss: 0.867943. Entropy: 1.938750.
Iteration 3695: Policy loss: -0.013652. Value loss: 0.429347. Entropy: 1.903751.
Iteration 3696: Policy loss: -0.023490. Value loss: 0.292472. Entropy: 1.892762.
episode: 1205   score: 930.0  epsilon: 1.0    steps: 248  evaluation reward: 1064.2
episode: 1206   score: 1030.0  epsilon: 1.0    steps: 328  evaluation reward: 1063.7
Training network. lr: 0.000222. clip: 0.088793
Iteration 3697: Policy loss: 0.009431. Value loss: 0.887252. Entropy: 1.739385.
Iteration 3698: Policy loss: -0.006989. Value loss: 0.456654. Entropy: 1.698393.
Iteration 3699: Policy loss: -0.020656. Value loss: 0.309906. Entropy: 1.713482.
episode: 1207   score: 480.0  epsilon: 1.0    steps: 408  evaluation reward: 1059.7
T

Iteration 3759: Policy loss: -0.019869. Value loss: 0.565201. Entropy: 1.938200.
episode: 1226   score: 580.0  epsilon: 1.0    steps: 424  evaluation reward: 1034.6
Training network. lr: 0.000221. clip: 0.088489
Iteration 3760: Policy loss: 0.004696. Value loss: 0.764775. Entropy: 1.944607.
Iteration 3761: Policy loss: -0.013020. Value loss: 0.423395. Entropy: 1.948130.
Iteration 3762: Policy loss: -0.023718. Value loss: 0.291956. Entropy: 1.928919.
Training network. lr: 0.000221. clip: 0.088489
Iteration 3763: Policy loss: 0.006926. Value loss: 1.283474. Entropy: 1.936789.
Iteration 3764: Policy loss: -0.012210. Value loss: 0.773739. Entropy: 1.941530.
Iteration 3765: Policy loss: -0.022278. Value loss: 0.550818. Entropy: 1.907325.
episode: 1227   score: 480.0  epsilon: 1.0    steps: 56  evaluation reward: 1035.1
Training network. lr: 0.000221. clip: 0.088489
Iteration 3766: Policy loss: 0.003181. Value loss: 0.840583. Entropy: 1.907463.
Iteration 3767: Policy loss: -0.011669. Value l

Training network. lr: 0.000221. clip: 0.088333
Iteration 3826: Policy loss: 0.006705. Value loss: 0.738009. Entropy: 1.849879.
Iteration 3827: Policy loss: -0.010757. Value loss: 0.432670. Entropy: 1.833799.
Iteration 3828: Policy loss: -0.019429. Value loss: 0.302001. Entropy: 1.836009.
episode: 1247   score: 530.0  epsilon: 1.0    steps: 80  evaluation reward: 996.2
Training network. lr: 0.000221. clip: 0.088333
Iteration 3829: Policy loss: 0.012917. Value loss: 1.171220. Entropy: 1.805550.
Iteration 3830: Policy loss: -0.004130. Value loss: 0.717976. Entropy: 1.833011.
Iteration 3831: Policy loss: -0.015040. Value loss: 0.598408. Entropy: 1.820338.
episode: 1248   score: 2350.0  epsilon: 1.0    steps: 760  evaluation reward: 1005.8
Training network. lr: 0.000221. clip: 0.088333
Iteration 3832: Policy loss: 0.006297. Value loss: 1.085162. Entropy: 1.756340.
Iteration 3833: Policy loss: -0.009963. Value loss: 0.649414. Entropy: 1.771228.
Iteration 3834: Policy loss: -0.018836. Value l

Iteration 3892: Policy loss: 0.009617. Value loss: 0.477557. Entropy: 1.709287.
Iteration 3893: Policy loss: -0.008126. Value loss: 0.231546. Entropy: 1.681611.
Iteration 3894: Policy loss: -0.020318. Value loss: 0.182208. Entropy: 1.691233.
episode: 1268   score: 3270.0  epsilon: 1.0    steps: 488  evaluation reward: 1030.8
episode: 1269   score: 480.0  epsilon: 1.0    steps: 544  evaluation reward: 1023.8
Training network. lr: 0.000220. clip: 0.088176
Iteration 3895: Policy loss: 0.004005. Value loss: 0.834962. Entropy: 1.677081.
Iteration 3896: Policy loss: -0.014447. Value loss: 0.504495. Entropy: 1.685935.
Iteration 3897: Policy loss: -0.023510. Value loss: 0.355517. Entropy: 1.660742.
episode: 1270   score: 160.0  epsilon: 1.0    steps: 984  evaluation reward: 999.9
episode: 1271   score: 930.0  epsilon: 1.0    steps: 1024  evaluation reward: 996.2
Training network. lr: 0.000220. clip: 0.088176
Iteration 3898: Policy loss: 0.002234. Value loss: 1.219544. Entropy: 1.801280.
Iterat

Training network. lr: 0.000220. clip: 0.087872
Iteration 3958: Policy loss: 0.008081. Value loss: 1.194900. Entropy: 1.875525.
Iteration 3959: Policy loss: -0.006240. Value loss: 0.626051. Entropy: 1.857642.
Iteration 3960: Policy loss: -0.013696. Value loss: 0.430712. Entropy: 1.855763.
episode: 1291   score: 430.0  epsilon: 1.0    steps: 528  evaluation reward: 979.9
episode: 1292   score: 1510.0  epsilon: 1.0    steps: 920  evaluation reward: 980.8
Training network. lr: 0.000220. clip: 0.087872
Iteration 3961: Policy loss: 0.008829. Value loss: 1.182818. Entropy: 1.835329.
Iteration 3962: Policy loss: -0.009917. Value loss: 0.749581. Entropy: 1.826577.
Iteration 3963: Policy loss: -0.016679. Value loss: 0.509767. Entropy: 1.814618.
episode: 1293   score: 680.0  epsilon: 1.0    steps: 488  evaluation reward: 978.3
Training network. lr: 0.000220. clip: 0.087872
Iteration 3964: Policy loss: 0.004174. Value loss: 1.072062. Entropy: 1.763059.
Iteration 3965: Policy loss: -0.007870. Value

Training network. lr: 0.000219. clip: 0.087715
Iteration 4024: Policy loss: 0.007657. Value loss: 0.844364. Entropy: 1.730626.
Iteration 4025: Policy loss: -0.005769. Value loss: 0.482500. Entropy: 1.772807.
Iteration 4026: Policy loss: -0.013538. Value loss: 0.321464. Entropy: 1.740822.
Training network. lr: 0.000219. clip: 0.087715
Iteration 4027: Policy loss: 0.006710. Value loss: 1.048459. Entropy: 1.739528.
Iteration 4028: Policy loss: -0.009910. Value loss: 0.619040. Entropy: 1.729234.
Iteration 4029: Policy loss: -0.021836. Value loss: 0.433765. Entropy: 1.730270.
Training network. lr: 0.000219. clip: 0.087715
Iteration 4030: Policy loss: 0.006943. Value loss: 0.805086. Entropy: 1.911754.
Iteration 4031: Policy loss: -0.004637. Value loss: 0.408970. Entropy: 1.941407.
Iteration 4032: Policy loss: -0.021087. Value loss: 0.272449. Entropy: 1.920125.
episode: 1313   score: 1280.0  epsilon: 1.0    steps: 176  evaluation reward: 1063.1
episode: 1314   score: 980.0  epsilon: 1.0    st

Iteration 4092: Policy loss: -0.016172. Value loss: 0.535554. Entropy: 1.833844.
Training network. lr: 0.000219. clip: 0.087568
Iteration 4093: Policy loss: 0.011665. Value loss: 1.850546. Entropy: 1.931382.
Iteration 4094: Policy loss: 0.002294. Value loss: 1.121859. Entropy: 1.949648.
Iteration 4095: Policy loss: -0.011747. Value loss: 0.752070. Entropy: 1.923962.
episode: 1333   score: 1360.0  epsilon: 1.0    steps: 128  evaluation reward: 1092.2
episode: 1334   score: 1440.0  epsilon: 1.0    steps: 496  evaluation reward: 1099.8
Training network. lr: 0.000219. clip: 0.087568
Iteration 4096: Policy loss: 0.009688. Value loss: 0.968173. Entropy: 1.927739.
Iteration 4097: Policy loss: -0.006937. Value loss: 0.511652. Entropy: 1.928735.
Iteration 4098: Policy loss: -0.017879. Value loss: 0.327176. Entropy: 1.926349.
episode: 1335   score: 780.0  epsilon: 1.0    steps: 632  evaluation reward: 1097.8
Training network. lr: 0.000219. clip: 0.087568
Iteration 4099: Policy loss: 0.005686. Va

Iteration 4155: Policy loss: -0.015969. Value loss: 0.309105. Entropy: 1.769703.
episode: 1358   score: 1560.0  epsilon: 1.0    steps: 728  evaluation reward: 1071.5
episode: 1359   score: 880.0  epsilon: 1.0    steps: 992  evaluation reward: 1068.5
Training network. lr: 0.000218. clip: 0.087254
Iteration 4156: Policy loss: 0.003799. Value loss: 0.495636. Entropy: 1.789291.
Iteration 4157: Policy loss: -0.009103. Value loss: 0.308397. Entropy: 1.788303.
Iteration 4158: Policy loss: -0.018000. Value loss: 0.235337. Entropy: 1.794955.
Training network. lr: 0.000218. clip: 0.087254
Iteration 4159: Policy loss: 0.006521. Value loss: 1.074839. Entropy: 1.703758.
Iteration 4160: Policy loss: -0.006777. Value loss: 0.713576. Entropy: 1.731297.
Iteration 4161: Policy loss: -0.015495. Value loss: 0.584911. Entropy: 1.708277.
Training network. lr: 0.000218. clip: 0.087254
Iteration 4162: Policy loss: 0.007656. Value loss: 0.932590. Entropy: 1.943102.
Iteration 4163: Policy loss: -0.010464. Value

Iteration 4223: Policy loss: -0.010870. Value loss: 0.503121. Entropy: 1.916087.
Iteration 4224: Policy loss: -0.018797. Value loss: 0.319312. Entropy: 1.903767.
episode: 1378   score: 1390.0  epsilon: 1.0    steps: 488  evaluation reward: 1102.2
Training network. lr: 0.000218. clip: 0.087107
Iteration 4225: Policy loss: 0.003739. Value loss: 0.758849. Entropy: 1.805223.
Iteration 4226: Policy loss: -0.014834. Value loss: 0.374311. Entropy: 1.818907.
Iteration 4227: Policy loss: -0.022174. Value loss: 0.270848. Entropy: 1.824048.
episode: 1379   score: 330.0  epsilon: 1.0    steps: 280  evaluation reward: 1096.7
episode: 1380   score: 930.0  epsilon: 1.0    steps: 328  evaluation reward: 1093.2
episode: 1381   score: 1030.0  epsilon: 1.0    steps: 336  evaluation reward: 1100.7
Training network. lr: 0.000218. clip: 0.087107
Iteration 4228: Policy loss: 0.008747. Value loss: 0.858408. Entropy: 1.714459.
Iteration 4229: Policy loss: -0.000507. Value loss: 0.477155. Entropy: 1.708304.
Ite

Iteration 4289: Policy loss: -0.009239. Value loss: 0.392638. Entropy: 1.730032.
Iteration 4290: Policy loss: -0.015861. Value loss: 0.245761. Entropy: 1.721874.
episode: 1400   score: 830.0  epsilon: 1.0    steps: 696  evaluation reward: 1070.6
Training network. lr: 0.000217. clip: 0.086950
Iteration 4291: Policy loss: 0.007750. Value loss: 0.931203. Entropy: 1.821985.
Iteration 4292: Policy loss: -0.013116. Value loss: 0.617757. Entropy: 1.811081.
Iteration 4293: Policy loss: -0.019928. Value loss: 0.417926. Entropy: 1.816269.
Training network. lr: 0.000217. clip: 0.086950
Iteration 4294: Policy loss: 0.007738. Value loss: 1.095646. Entropy: 1.865707.
Iteration 4295: Policy loss: -0.001699. Value loss: 0.662830. Entropy: 1.875719.
Iteration 4296: Policy loss: -0.014557. Value loss: 0.477833. Entropy: 1.860919.
Training network. lr: 0.000217. clip: 0.086950
Iteration 4297: Policy loss: 0.008155. Value loss: 1.357733. Entropy: 1.891554.
Iteration 4298: Policy loss: -0.006518. Value los

Iteration 4358: Policy loss: -0.006735. Value loss: 0.469752. Entropy: 1.865797.
Iteration 4359: Policy loss: -0.016832. Value loss: 0.321323. Entropy: 1.855676.
Training network. lr: 0.000217. clip: 0.086646
Iteration 4360: Policy loss: 0.007039. Value loss: 0.722287. Entropy: 1.904849.
Iteration 4361: Policy loss: -0.010459. Value loss: 0.381455. Entropy: 1.920812.
Iteration 4362: Policy loss: -0.019719. Value loss: 0.245548. Entropy: 1.899156.
episode: 1418   score: 680.0  epsilon: 1.0    steps: 688  evaluation reward: 1082.8
Training network. lr: 0.000217. clip: 0.086646
Iteration 4363: Policy loss: 0.004192. Value loss: 0.673552. Entropy: 1.988891.
Iteration 4364: Policy loss: -0.008737. Value loss: 0.356845. Entropy: 1.986866.
Iteration 4365: Policy loss: -0.020072. Value loss: 0.271640. Entropy: 1.972932.
episode: 1419   score: 1370.0  epsilon: 1.0    steps: 440  evaluation reward: 1086.7
Training network. lr: 0.000217. clip: 0.086646
Iteration 4366: Policy loss: 0.007595. Value

Training network. lr: 0.000216. clip: 0.086489
Iteration 4426: Policy loss: 0.008961. Value loss: 1.129640. Entropy: 1.964734.
Iteration 4427: Policy loss: -0.012445. Value loss: 0.581982. Entropy: 1.946335.
Iteration 4428: Policy loss: -0.023551. Value loss: 0.395741. Entropy: 1.954720.
Training network. lr: 0.000216. clip: 0.086489
Iteration 4429: Policy loss: 0.007879. Value loss: 1.112629. Entropy: 1.967438.
Iteration 4430: Policy loss: -0.006277. Value loss: 0.666847. Entropy: 1.982301.
Iteration 4431: Policy loss: -0.014682. Value loss: 0.466454. Entropy: 1.954867.
episode: 1438   score: 880.0  epsilon: 1.0    steps: 280  evaluation reward: 1080.8
episode: 1439   score: 630.0  epsilon: 1.0    steps: 544  evaluation reward: 1073.2
Training network. lr: 0.000216. clip: 0.086489
Iteration 4432: Policy loss: 0.005698. Value loss: 0.431860. Entropy: 1.836444.
Iteration 4433: Policy loss: -0.008539. Value loss: 0.266354. Entropy: 1.835883.
Iteration 4434: Policy loss: -0.016843. Value 

Iteration 4494: Policy loss: -0.021334. Value loss: 0.535376. Entropy: 1.961010.
episode: 1457   score: 580.0  epsilon: 1.0    steps: 496  evaluation reward: 1148.5
Training network. lr: 0.000216. clip: 0.086333
Iteration 4495: Policy loss: 0.000925. Value loss: 1.620200. Entropy: 1.951484.
Iteration 4496: Policy loss: -0.007182. Value loss: 0.912681. Entropy: 1.933622.
Iteration 4497: Policy loss: -0.018356. Value loss: 0.598601. Entropy: 1.934534.
episode: 1458   score: 1180.0  epsilon: 1.0    steps: 296  evaluation reward: 1144.7
Training network. lr: 0.000216. clip: 0.086333
Iteration 4498: Policy loss: -0.000329. Value loss: 0.922804. Entropy: 1.869653.
Iteration 4499: Policy loss: -0.012358. Value loss: 0.540184. Entropy: 1.897355.
Iteration 4500: Policy loss: -0.013224. Value loss: 0.414976. Entropy: 1.888855.
episode: 1459   score: 930.0  epsilon: 1.0    steps: 88  evaluation reward: 1145.2
Training network. lr: 0.000215. clip: 0.086185
Iteration 4501: Policy loss: 0.004827. Va

episode: 1478   score: 780.0  epsilon: 1.0    steps: 976  evaluation reward: 1140.7
Training network. lr: 0.000215. clip: 0.086029
Iteration 4561: Policy loss: 0.004512. Value loss: 1.071431. Entropy: 1.876033.
Iteration 4562: Policy loss: -0.008708. Value loss: 0.689954. Entropy: 1.878524.
Iteration 4563: Policy loss: -0.016771. Value loss: 0.519695. Entropy: 1.870881.
Training network. lr: 0.000215. clip: 0.086029
Iteration 4564: Policy loss: 0.007712. Value loss: 1.046155. Entropy: 1.817444.
Iteration 4565: Policy loss: -0.008267. Value loss: 0.736799. Entropy: 1.814277.
Iteration 4566: Policy loss: -0.013785. Value loss: 0.580595. Entropy: 1.804973.
episode: 1479   score: 1430.0  epsilon: 1.0    steps: 1016  evaluation reward: 1151.7
Training network. lr: 0.000215. clip: 0.086029
Iteration 4567: Policy loss: 0.005916. Value loss: 1.172669. Entropy: 1.865564.
Iteration 4568: Policy loss: -0.010468. Value loss: 0.806250. Entropy: 1.863809.
Iteration 4569: Policy loss: -0.007363. Valu

Iteration 4626: Policy loss: -0.019465. Value loss: 0.427575. Entropy: 1.581170.
Training network. lr: 0.000215. clip: 0.085872
Iteration 4627: Policy loss: 0.007618. Value loss: 1.282744. Entropy: 1.732688.
Iteration 4628: Policy loss: -0.005433. Value loss: 0.726982. Entropy: 1.723019.
Iteration 4629: Policy loss: -0.014994. Value loss: 0.538908. Entropy: 1.710864.
Training network. lr: 0.000215. clip: 0.085872
Iteration 4630: Policy loss: 0.009461. Value loss: 1.279685. Entropy: 1.833186.
Iteration 4631: Policy loss: -0.007361. Value loss: 0.783998. Entropy: 1.808021.
Iteration 4632: Policy loss: -0.017150. Value loss: 0.531428. Entropy: 1.782323.
now time :  2019-03-06 10:42:44.496897
episode: 1501   score: 780.0  epsilon: 1.0    steps: 440  evaluation reward: 1142.4
Training network. lr: 0.000215. clip: 0.085872
Iteration 4633: Policy loss: 0.008234. Value loss: 1.020430. Entropy: 1.849937.
Iteration 4634: Policy loss: -0.005599. Value loss: 0.574880. Entropy: 1.855337.
Iteration 

Iteration 4691: Policy loss: -0.005038. Value loss: 1.071599. Entropy: 1.777309.
Iteration 4692: Policy loss: -0.009017. Value loss: 0.725628. Entropy: 1.760000.
Training network. lr: 0.000214. clip: 0.085724
Iteration 4693: Policy loss: 0.004503. Value loss: 1.137984. Entropy: 1.845110.
Iteration 4694: Policy loss: -0.009487. Value loss: 0.732469. Entropy: 1.831680.
Iteration 4695: Policy loss: -0.017003. Value loss: 0.596255. Entropy: 1.835114.
episode: 1524   score: 1440.0  epsilon: 1.0    steps: 1008  evaluation reward: 1115.9
Training network. lr: 0.000214. clip: 0.085724
Iteration 4696: Policy loss: 0.006926. Value loss: 1.306904. Entropy: 1.873492.
Iteration 4697: Policy loss: -0.002913. Value loss: 0.861608. Entropy: 1.879175.
Iteration 4698: Policy loss: -0.017058. Value loss: 0.576743. Entropy: 1.863410.
Training network. lr: 0.000214. clip: 0.085724
Iteration 4699: Policy loss: 0.007096. Value loss: 0.910722. Entropy: 1.827007.
Iteration 4700: Policy loss: -0.003975. Value l

Iteration 4753: Policy loss: 0.006468. Value loss: 0.876120. Entropy: 1.684461.
Iteration 4754: Policy loss: -0.008482. Value loss: 0.554401. Entropy: 1.681645.
Iteration 4755: Policy loss: -0.016566. Value loss: 0.402438. Entropy: 1.670925.
now time :  2019-03-06 10:44:22.236573
episode: 1551   score: 880.0  epsilon: 1.0    steps: 144  evaluation reward: 1096.0
episode: 1552   score: 830.0  epsilon: 1.0    steps: 928  evaluation reward: 1088.7
Training network. lr: 0.000214. clip: 0.085411
Iteration 4756: Policy loss: 0.003212. Value loss: 0.795459. Entropy: 1.604175.
Iteration 4757: Policy loss: -0.010192. Value loss: 0.547873. Entropy: 1.622731.
Iteration 4758: Policy loss: -0.015423. Value loss: 0.412515. Entropy: 1.602510.
Training network. lr: 0.000214. clip: 0.085411
Iteration 4759: Policy loss: 0.011475. Value loss: 1.253395. Entropy: 1.663179.
Iteration 4760: Policy loss: -0.001492. Value loss: 0.765764. Entropy: 1.659611.
Iteration 4761: Policy loss: -0.009347. Value loss: 0.

Iteration 4817: Policy loss: -0.007534. Value loss: 0.600718. Entropy: 1.767393.
Iteration 4818: Policy loss: -0.018239. Value loss: 0.459640. Entropy: 1.761740.
episode: 1575   score: 1180.0  epsilon: 1.0    steps: 136  evaluation reward: 1048.8
episode: 1576   score: 1130.0  epsilon: 1.0    steps: 624  evaluation reward: 1041.8
Training network. lr: 0.000213. clip: 0.085264
Iteration 4819: Policy loss: 0.006233. Value loss: 0.897025. Entropy: 1.735803.
Iteration 4820: Policy loss: -0.007819. Value loss: 0.644181. Entropy: 1.741685.
Iteration 4821: Policy loss: -0.019017. Value loss: 0.494983. Entropy: 1.699543.
Training network. lr: 0.000213. clip: 0.085264
Iteration 4822: Policy loss: 0.008776. Value loss: 1.416186. Entropy: 1.829862.
Iteration 4823: Policy loss: -0.000744. Value loss: 0.751160. Entropy: 1.799896.
Iteration 4824: Policy loss: -0.011376. Value loss: 0.510421. Entropy: 1.795530.
episode: 1577   score: 630.0  epsilon: 1.0    steps: 96  evaluation reward: 1043.3
episode

Training network. lr: 0.000213. clip: 0.085107
Iteration 4882: Policy loss: 0.010495. Value loss: 1.088123. Entropy: 1.784513.
Iteration 4883: Policy loss: -0.000960. Value loss: 0.662721. Entropy: 1.791057.
Iteration 4884: Policy loss: -0.010516. Value loss: 0.511152. Entropy: 1.789291.
episode: 1599   score: 1510.0  epsilon: 1.0    steps: 776  evaluation reward: 1011.2
episode: 1600   score: 1080.0  epsilon: 1.0    steps: 816  evaluation reward: 1016.2
Training network. lr: 0.000213. clip: 0.085107
Iteration 4885: Policy loss: 0.008692. Value loss: 1.240257. Entropy: 1.737983.
Iteration 4886: Policy loss: -0.008030. Value loss: 0.864803. Entropy: 1.733638.
Iteration 4887: Policy loss: -0.017493. Value loss: 0.640353. Entropy: 1.715444.
now time :  2019-03-06 10:46:07.413208
episode: 1601   score: 1510.0  epsilon: 1.0    steps: 24  evaluation reward: 1023.5
Training network. lr: 0.000213. clip: 0.085107
Iteration 4888: Policy loss: 0.013064. Value loss: 1.757960. Entropy: 1.637221.
It

Iteration 4947: Policy loss: -0.015864. Value loss: 0.628517. Entropy: 1.819485.
episode: 1622   score: 730.0  epsilon: 1.0    steps: 24  evaluation reward: 1081.6
Training network. lr: 0.000212. clip: 0.084950
Iteration 4948: Policy loss: 0.010302. Value loss: 1.062253. Entropy: 1.775501.
Iteration 4949: Policy loss: -0.003300. Value loss: 0.710982. Entropy: 1.770070.
Iteration 4950: Policy loss: -0.009691. Value loss: 0.579357. Entropy: 1.757884.
Training network. lr: 0.000212. clip: 0.084803
Iteration 4951: Policy loss: 0.007273. Value loss: 1.747447. Entropy: 1.747213.
Iteration 4952: Policy loss: -0.005161. Value loss: 1.270787. Entropy: 1.734653.
Iteration 4953: Policy loss: -0.011281. Value loss: 0.984132. Entropy: 1.719265.
episode: 1623   score: 580.0  epsilon: 1.0    steps: 144  evaluation reward: 1076.6
episode: 1624   score: 580.0  epsilon: 1.0    steps: 560  evaluation reward: 1068.0
Training network. lr: 0.000212. clip: 0.084803
Iteration 4954: Policy loss: 0.009781. Valu

Training network. lr: 0.000212. clip: 0.084646
Iteration 5011: Policy loss: 0.009007. Value loss: 1.338068. Entropy: 1.864798.
Iteration 5012: Policy loss: -0.008879. Value loss: 0.832997. Entropy: 1.869927.
Iteration 5013: Policy loss: -0.016029. Value loss: 0.563901. Entropy: 1.856789.
episode: 1647   score: 780.0  epsilon: 1.0    steps: 376  evaluation reward: 1055.0
episode: 1648   score: 1080.0  epsilon: 1.0    steps: 400  evaluation reward: 1056.0
Training network. lr: 0.000212. clip: 0.084646
Iteration 5014: Policy loss: 0.007965. Value loss: 0.844177. Entropy: 1.711251.
Iteration 5015: Policy loss: -0.010340. Value loss: 0.486958. Entropy: 1.705009.
Iteration 5016: Policy loss: -0.015706. Value loss: 0.325480. Entropy: 1.675774.
episode: 1649   score: 780.0  epsilon: 1.0    steps: 88  evaluation reward: 1050.8
Training network. lr: 0.000212. clip: 0.084646
Iteration 5017: Policy loss: 0.003023. Value loss: 0.950765. Entropy: 1.706934.
Iteration 5018: Policy loss: -0.003731. Val

episode: 1670   score: 1490.0  epsilon: 1.0    steps: 848  evaluation reward: 1071.2
Training network. lr: 0.000211. clip: 0.084489
Iteration 5077: Policy loss: 0.007825. Value loss: 1.123642. Entropy: 1.733373.
Iteration 5078: Policy loss: -0.009850. Value loss: 0.673940. Entropy: 1.701194.
Iteration 5079: Policy loss: -0.016327. Value loss: 0.466097. Entropy: 1.686171.
Training network. lr: 0.000211. clip: 0.084489
Iteration 5080: Policy loss: 0.011980. Value loss: 0.950164. Entropy: 1.789701.
Iteration 5081: Policy loss: -0.004298. Value loss: 0.530340. Entropy: 1.779159.
Iteration 5082: Policy loss: -0.019212. Value loss: 0.359649. Entropy: 1.781354.
episode: 1671   score: 1300.0  epsilon: 1.0    steps: 744  evaluation reward: 1077.4
Training network. lr: 0.000211. clip: 0.084489
Iteration 5083: Policy loss: 0.008265. Value loss: 1.488681. Entropy: 1.846181.
Iteration 5084: Policy loss: -0.001929. Value loss: 0.899409. Entropy: 1.836661.
Iteration 5085: Policy loss: -0.017714. Valu

Iteration 5141: Policy loss: -0.005365. Value loss: 0.600015. Entropy: 1.733319.
Iteration 5142: Policy loss: -0.008905. Value loss: 0.446511. Entropy: 1.739739.
Training network. lr: 0.000211. clip: 0.084342
Iteration 5143: Policy loss: 0.010574. Value loss: 0.953764. Entropy: 1.848624.
Iteration 5144: Policy loss: -0.000707. Value loss: 0.637600. Entropy: 1.892758.
Iteration 5145: Policy loss: -0.010308. Value loss: 0.450586. Entropy: 1.865056.
episode: 1694   score: 1300.0  epsilon: 1.0    steps: 480  evaluation reward: 1128.0
Training network. lr: 0.000211. clip: 0.084342
Iteration 5146: Policy loss: 0.009021. Value loss: 1.507558. Entropy: 1.839039.
Iteration 5147: Policy loss: -0.001069. Value loss: 0.883015. Entropy: 1.849715.
Iteration 5148: Policy loss: -0.013518. Value loss: 0.642159. Entropy: 1.846984.
Training network. lr: 0.000211. clip: 0.084342
Iteration 5149: Policy loss: 0.007351. Value loss: 1.158199. Entropy: 1.833960.
Iteration 5150: Policy loss: -0.001983. Value lo

episode: 1717   score: 1480.0  epsilon: 1.0    steps: 488  evaluation reward: 1094.5
episode: 1718   score: 1680.0  epsilon: 1.0    steps: 560  evaluation reward: 1095.7
Training network. lr: 0.000210. clip: 0.084029
Iteration 5206: Policy loss: 0.007321. Value loss: 1.533280. Entropy: 1.740584.
Iteration 5207: Policy loss: -0.007517. Value loss: 0.992977. Entropy: 1.741679.
Iteration 5208: Policy loss: -0.016016. Value loss: 0.774268. Entropy: 1.731957.
Training network. lr: 0.000210. clip: 0.084029
Iteration 5209: Policy loss: 0.010718. Value loss: 1.316740. Entropy: 1.821389.
Iteration 5210: Policy loss: -0.001645. Value loss: 0.901623. Entropy: 1.827107.
Iteration 5211: Policy loss: -0.010578. Value loss: 0.680544. Entropy: 1.815857.
episode: 1719   score: 1680.0  epsilon: 1.0    steps: 272  evaluation reward: 1099.5
Training network. lr: 0.000210. clip: 0.084029
Iteration 5212: Policy loss: 0.007082. Value loss: 1.077222. Entropy: 1.795018.
Iteration 5213: Policy loss: -0.009510. 

Iteration 5269: Policy loss: 0.006775. Value loss: 1.173726. Entropy: 1.793492.
Iteration 5270: Policy loss: -0.005088. Value loss: 0.830900. Entropy: 1.810879.
Iteration 5271: Policy loss: -0.013095. Value loss: 0.665647. Entropy: 1.791679.
episode: 1742   score: 980.0  epsilon: 1.0    steps: 48  evaluation reward: 1124.5
episode: 1743   score: 1180.0  epsilon: 1.0    steps: 296  evaluation reward: 1123.3
episode: 1744   score: 630.0  epsilon: 1.0    steps: 544  evaluation reward: 1116.8
Training network. lr: 0.000210. clip: 0.083881
Iteration 5272: Policy loss: 0.003063. Value loss: 0.491946. Entropy: 1.523368.
Iteration 5273: Policy loss: -0.005378. Value loss: 0.322735. Entropy: 1.529359.
Iteration 5274: Policy loss: -0.016015. Value loss: 0.258209. Entropy: 1.513858.
Training network. lr: 0.000210. clip: 0.083881
Iteration 5275: Policy loss: 0.005860. Value loss: 0.518900. Entropy: 1.713156.
Iteration 5276: Policy loss: -0.012513. Value loss: 0.299427. Entropy: 1.721528.
Iteration

Iteration 5332: Policy loss: 0.006383. Value loss: 1.354570. Entropy: 1.866134.
Iteration 5333: Policy loss: -0.008350. Value loss: 0.912739. Entropy: 1.870393.
Iteration 5334: Policy loss: -0.018420. Value loss: 0.707433. Entropy: 1.855374.
episode: 1767   score: 830.0  epsilon: 1.0    steps: 112  evaluation reward: 1059.6
episode: 1768   score: 1030.0  epsilon: 1.0    steps: 304  evaluation reward: 1056.2
Training network. lr: 0.000209. clip: 0.083725
Iteration 5335: Policy loss: 0.004664. Value loss: 0.647154. Entropy: 1.734849.
Iteration 5336: Policy loss: -0.008078. Value loss: 0.401789. Entropy: 1.729961.
Iteration 5337: Policy loss: -0.018707. Value loss: 0.322215. Entropy: 1.721228.
Training network. lr: 0.000209. clip: 0.083725
Iteration 5338: Policy loss: 0.005329. Value loss: 1.238831. Entropy: 1.873768.
Iteration 5339: Policy loss: -0.002098. Value loss: 0.863434. Entropy: 1.889185.
Iteration 5340: Policy loss: -0.008030. Value loss: 0.687464. Entropy: 1.875797.
episode: 17

Iteration 5397: Policy loss: -0.009506. Value loss: 0.708727. Entropy: 1.836403.
episode: 1791   score: 1750.0  epsilon: 1.0    steps: 520  evaluation reward: 1034.0
episode: 1792   score: 2900.0  epsilon: 1.0    steps: 672  evaluation reward: 1050.0
Training network. lr: 0.000209. clip: 0.083568
Iteration 5398: Policy loss: 0.011058. Value loss: 1.076125. Entropy: 1.861010.
Iteration 5399: Policy loss: 0.002518. Value loss: 0.694046. Entropy: 1.874314.
Iteration 5400: Policy loss: -0.008090. Value loss: 0.517777. Entropy: 1.864064.
episode: 1793   score: 930.0  epsilon: 1.0    steps: 40  evaluation reward: 1053.5
Training network. lr: 0.000209. clip: 0.083420
Iteration 5401: Policy loss: 0.009681. Value loss: 0.798723. Entropy: 1.838329.
Iteration 5402: Policy loss: -0.006190. Value loss: 0.564349. Entropy: 1.834799.
Iteration 5403: Policy loss: -0.011608. Value loss: 0.463661. Entropy: 1.833568.
episode: 1794   score: 410.0  epsilon: 1.0    steps: 392  evaluation reward: 1044.6
Train

Iteration 5462: Policy loss: -0.004472. Value loss: 0.564719. Entropy: 1.888122.
Iteration 5463: Policy loss: -0.015714. Value loss: 0.408674. Entropy: 1.884275.
episode: 1814   score: 1680.0  epsilon: 1.0    steps: 320  evaluation reward: 1072.9
episode: 1815   score: 980.0  epsilon: 1.0    steps: 496  evaluation reward: 1065.6
Training network. lr: 0.000208. clip: 0.083264
Iteration 5464: Policy loss: 0.006363. Value loss: 0.911669. Entropy: 1.884507.
Iteration 5465: Policy loss: -0.010357. Value loss: 0.572286. Entropy: 1.883201.
Iteration 5466: Policy loss: -0.021237. Value loss: 0.414554. Entropy: 1.876353.
episode: 1816   score: 640.0  epsilon: 1.0    steps: 64  evaluation reward: 1065.2
episode: 1817   score: 430.0  epsilon: 1.0    steps: 176  evaluation reward: 1054.7
Training network. lr: 0.000208. clip: 0.083264
Iteration 5467: Policy loss: 0.004987. Value loss: 0.645713. Entropy: 1.733723.
Iteration 5468: Policy loss: -0.011093. Value loss: 0.409797. Entropy: 1.733423.
Itera

Training network. lr: 0.000208. clip: 0.083107
Iteration 5530: Policy loss: 0.006063. Value loss: 1.081530. Entropy: 1.939629.
Iteration 5531: Policy loss: -0.006897. Value loss: 0.663521. Entropy: 1.946123.
Iteration 5532: Policy loss: -0.018257. Value loss: 0.436984. Entropy: 1.924121.
episode: 1835   score: 360.0  epsilon: 1.0    steps: 160  evaluation reward: 1014.4
episode: 1836   score: 630.0  epsilon: 1.0    steps: 656  evaluation reward: 1010.4
Training network. lr: 0.000208. clip: 0.083107
Iteration 5533: Policy loss: 0.010458. Value loss: 0.942376. Entropy: 1.909129.
Iteration 5534: Policy loss: -0.005862. Value loss: 0.602327. Entropy: 1.899472.
Iteration 5535: Policy loss: -0.015213. Value loss: 0.418391. Entropy: 1.881436.
episode: 1837   score: 830.0  epsilon: 1.0    steps: 576  evaluation reward: 1013.4
episode: 1838   score: 880.0  epsilon: 1.0    steps: 936  evaluation reward: 1010.4
Training network. lr: 0.000208. clip: 0.083107
Iteration 5536: Policy loss: 0.006520. 

Training network. lr: 0.000207. clip: 0.082960
Iteration 5596: Policy loss: 0.006216. Value loss: 1.245376. Entropy: 1.778357.
Iteration 5597: Policy loss: -0.007922. Value loss: 0.810243. Entropy: 1.780800.
Iteration 5598: Policy loss: -0.013577. Value loss: 0.577624. Entropy: 1.749175.
episode: 1857   score: 930.0  epsilon: 1.0    steps: 312  evaluation reward: 1060.5
Training network. lr: 0.000207. clip: 0.082960
Iteration 5599: Policy loss: 0.005375. Value loss: 0.586279. Entropy: 1.689841.
Iteration 5600: Policy loss: -0.001863. Value loss: 0.352868. Entropy: 1.706473.
Iteration 5601: Policy loss: -0.014365. Value loss: 0.259129. Entropy: 1.693806.
episode: 1858   score: 930.0  epsilon: 1.0    steps: 360  evaluation reward: 1067.0
Training network. lr: 0.000207. clip: 0.082803
Iteration 5602: Policy loss: 0.010376. Value loss: 0.505888. Entropy: 1.727028.
Iteration 5603: Policy loss: -0.010727. Value loss: 0.255046. Entropy: 1.735019.
Iteration 5604: Policy loss: -0.022877. Value 

Iteration 5664: Policy loss: -0.012473. Value loss: 0.569950. Entropy: 1.909755.
episode: 1877   score: 1320.0  epsilon: 1.0    steps: 760  evaluation reward: 1090.4
Training network. lr: 0.000207. clip: 0.082646
Iteration 5665: Policy loss: 0.012426. Value loss: 0.703746. Entropy: 1.770923.
Iteration 5666: Policy loss: -0.005851. Value loss: 0.423878. Entropy: 1.779110.
Iteration 5667: Policy loss: -0.016122. Value loss: 0.334794. Entropy: 1.768232.
episode: 1878   score: 280.0  epsilon: 1.0    steps: 680  evaluation reward: 1081.4
Training network. lr: 0.000207. clip: 0.082646
Iteration 5668: Policy loss: 0.005343. Value loss: 0.925847. Entropy: 1.799557.
Iteration 5669: Policy loss: -0.010187. Value loss: 0.545436. Entropy: 1.792670.
Iteration 5670: Policy loss: -0.016953. Value loss: 0.403108. Entropy: 1.799849.
Training network. lr: 0.000207. clip: 0.082646
Iteration 5671: Policy loss: 0.007769. Value loss: 0.613425. Entropy: 1.816358.
Iteration 5672: Policy loss: -0.006977. Value

episode: 1898   score: 880.0  epsilon: 1.0    steps: 416  evaluation reward: 1076.3
episode: 1899   score: 730.0  epsilon: 1.0    steps: 920  evaluation reward: 1070.8
Training network. lr: 0.000206. clip: 0.082499
Iteration 5731: Policy loss: 0.003874. Value loss: 1.327570. Entropy: 1.501160.
Iteration 5732: Policy loss: -0.008148. Value loss: 0.765633. Entropy: 1.508759.
Iteration 5733: Policy loss: -0.016318. Value loss: 0.538411. Entropy: 1.481318.
Training network. lr: 0.000206. clip: 0.082499
Iteration 5734: Policy loss: 0.005489. Value loss: 1.239322. Entropy: 1.619314.
Iteration 5735: Policy loss: -0.007643. Value loss: 0.815356. Entropy: 1.624026.
Iteration 5736: Policy loss: -0.013269. Value loss: 0.570687. Entropy: 1.566214.
episode: 1900   score: 630.0  epsilon: 1.0    steps: 288  evaluation reward: 1063.0
now time :  2019-03-06 10:57:23.666411
episode: 1901   score: 1180.0  epsilon: 1.0    steps: 608  evaluation reward: 1058.1
Training network. lr: 0.000206. clip: 0.082499

Training network. lr: 0.000206. clip: 0.082342
Iteration 5797: Policy loss: 0.003452. Value loss: 0.928463. Entropy: 1.812553.
Iteration 5798: Policy loss: -0.005312. Value loss: 0.556188. Entropy: 1.796509.
Iteration 5799: Policy loss: -0.012307. Value loss: 0.407335. Entropy: 1.772034.
episode: 1920   score: 1320.0  epsilon: 1.0    steps: 360  evaluation reward: 1070.0
episode: 1921   score: 1280.0  epsilon: 1.0    steps: 672  evaluation reward: 1065.7
Training network. lr: 0.000206. clip: 0.082342
Iteration 5800: Policy loss: 0.008084. Value loss: 0.787356. Entropy: 1.796439.
Iteration 5801: Policy loss: -0.008793. Value loss: 0.537391. Entropy: 1.803832.
Iteration 5802: Policy loss: -0.018548. Value loss: 0.406817. Entropy: 1.787241.
episode: 1922   score: 1180.0  epsilon: 1.0    steps: 336  evaluation reward: 1069.2
Training network. lr: 0.000205. clip: 0.082185
Iteration 5803: Policy loss: 0.005134. Value loss: 0.896773. Entropy: 1.659244.
Iteration 5804: Policy loss: -0.008229. 

episode: 1943   score: 1280.0  epsilon: 1.0    steps: 568  evaluation reward: 1086.2
Training network. lr: 0.000205. clip: 0.082038
Iteration 5863: Policy loss: 0.003224. Value loss: 0.639773. Entropy: 1.665159.
Iteration 5864: Policy loss: -0.010378. Value loss: 0.374207. Entropy: 1.669202.
Iteration 5865: Policy loss: -0.019805. Value loss: 0.251230. Entropy: 1.646696.
episode: 1944   score: 580.0  epsilon: 1.0    steps: 432  evaluation reward: 1081.7
Training network. lr: 0.000205. clip: 0.082038
Iteration 5866: Policy loss: 0.004494. Value loss: 0.665839. Entropy: 1.650354.
Iteration 5867: Policy loss: -0.009559. Value loss: 0.402801. Entropy: 1.654062.
Iteration 5868: Policy loss: -0.019759. Value loss: 0.285591. Entropy: 1.652980.
episode: 1945   score: 230.0  epsilon: 1.0    steps: 336  evaluation reward: 1074.2
Training network. lr: 0.000205. clip: 0.082038
Iteration 5869: Policy loss: 0.003253. Value loss: 1.453697. Entropy: 1.667085.
Iteration 5870: Policy loss: -0.005431. Va

Iteration 5926: Policy loss: 0.006556. Value loss: 1.759309. Entropy: 1.727708.
Iteration 5927: Policy loss: -0.003985. Value loss: 1.046239. Entropy: 1.686297.
Iteration 5928: Policy loss: -0.013715. Value loss: 0.720579. Entropy: 1.696760.
episode: 1968   score: 530.0  epsilon: 1.0    steps: 904  evaluation reward: 1029.5
Training network. lr: 0.000205. clip: 0.081881
Iteration 5929: Policy loss: 0.008046. Value loss: 0.535700. Entropy: 1.615276.
Iteration 5930: Policy loss: -0.011438. Value loss: 0.276922. Entropy: 1.629461.
Iteration 5931: Policy loss: -0.019383. Value loss: 0.196150. Entropy: 1.617036.
Training network. lr: 0.000205. clip: 0.081881
Iteration 5932: Policy loss: 0.005503. Value loss: 0.890390. Entropy: 1.652624.
Iteration 5933: Policy loss: -0.006136. Value loss: 0.514514. Entropy: 1.657263.
Iteration 5934: Policy loss: -0.013970. Value loss: 0.353077. Entropy: 1.647738.
episode: 1969   score: 1030.0  epsilon: 1.0    steps: 552  evaluation reward: 1024.2
episode: 19

Iteration 5993: Policy loss: -0.007926. Value loss: 0.702560. Entropy: 1.600758.
Iteration 5994: Policy loss: -0.017862. Value loss: 0.483124. Entropy: 1.615287.
episode: 1989   score: 1320.0  epsilon: 1.0    steps: 512  evaluation reward: 1011.4
Training network. lr: 0.000204. clip: 0.081725
Iteration 5995: Policy loss: 0.008443. Value loss: 1.097850. Entropy: 1.749445.
Iteration 5996: Policy loss: -0.001369. Value loss: 0.657610. Entropy: 1.765821.
Iteration 5997: Policy loss: -0.012266. Value loss: 0.454315. Entropy: 1.737790.
Training network. lr: 0.000204. clip: 0.081725
Iteration 5998: Policy loss: 0.007357. Value loss: 0.945486. Entropy: 1.869981.
Iteration 5999: Policy loss: -0.004417. Value loss: 0.597853. Entropy: 1.862562.
Iteration 6000: Policy loss: -0.012292. Value loss: 0.467973. Entropy: 1.859183.
episode: 1990   score: 530.0  epsilon: 1.0    steps: 536  evaluation reward: 1003.9
Training network. lr: 0.000204. clip: 0.081577
Iteration 6001: Policy loss: 0.003209. Value

Iteration 6061: Policy loss: 0.001735. Value loss: 0.758373. Entropy: 1.766816.
Iteration 6062: Policy loss: -0.011726. Value loss: 0.460805. Entropy: 1.773850.
Iteration 6063: Policy loss: -0.020873. Value loss: 0.363268. Entropy: 1.765718.
episode: 2008   score: 1320.0  epsilon: 1.0    steps: 8  evaluation reward: 1017.8
episode: 2009   score: 1580.0  epsilon: 1.0    steps: 552  evaluation reward: 1020.6
Training network. lr: 0.000204. clip: 0.081421
Iteration 6064: Policy loss: 0.006768. Value loss: 0.864415. Entropy: 1.622511.
Iteration 6065: Policy loss: -0.002074. Value loss: 0.579939. Entropy: 1.607914.
Iteration 6066: Policy loss: -0.013464. Value loss: 0.398496. Entropy: 1.606498.
Training network. lr: 0.000204. clip: 0.081421
Iteration 6067: Policy loss: 0.005456. Value loss: 1.357144. Entropy: 1.684198.
Iteration 6068: Policy loss: -0.009066. Value loss: 0.934826. Entropy: 1.649866.
Iteration 6069: Policy loss: -0.013962. Value loss: 0.657191. Entropy: 1.660857.
episode: 201

Iteration 6128: Policy loss: -0.002975. Value loss: 0.613465. Entropy: 1.830723.
Iteration 6129: Policy loss: -0.011508. Value loss: 0.442324. Entropy: 1.813512.
Training network. lr: 0.000203. clip: 0.081264
Iteration 6130: Policy loss: 0.005138. Value loss: 0.880058. Entropy: 1.892023.
Iteration 6131: Policy loss: -0.005522. Value loss: 0.577290. Entropy: 1.887462.
Iteration 6132: Policy loss: -0.019001. Value loss: 0.443206. Entropy: 1.880430.
episode: 2029   score: 1470.0  epsilon: 1.0    steps: 928  evaluation reward: 1015.5
Training network. lr: 0.000203. clip: 0.081264
Iteration 6133: Policy loss: 0.012552. Value loss: 0.730069. Entropy: 1.911034.
Iteration 6134: Policy loss: -0.003535. Value loss: 0.486259. Entropy: 1.926139.
Iteration 6135: Policy loss: -0.012488. Value loss: 0.360293. Entropy: 1.906727.
Training network. lr: 0.000203. clip: 0.081264
Iteration 6136: Policy loss: 0.006215. Value loss: 0.591383. Entropy: 1.874340.
Iteration 6137: Policy loss: -0.004863. Value lo

Iteration 6197: Policy loss: -0.010087. Value loss: 0.433472. Entropy: 1.737007.
Iteration 6198: Policy loss: -0.018969. Value loss: 0.334003. Entropy: 1.723124.
Training network. lr: 0.000203. clip: 0.081116
Iteration 6199: Policy loss: 0.004821. Value loss: 1.145976. Entropy: 1.768498.
Iteration 6200: Policy loss: -0.006129. Value loss: 0.712949. Entropy: 1.774579.
Iteration 6201: Policy loss: -0.017232. Value loss: 0.490609. Entropy: 1.771130.
episode: 2048   score: 1080.0  epsilon: 1.0    steps: 568  evaluation reward: 1065.7
Training network. lr: 0.000202. clip: 0.080960
Iteration 6202: Policy loss: 0.005640. Value loss: 1.409299. Entropy: 1.835797.
Iteration 6203: Policy loss: 0.000909. Value loss: 0.884401. Entropy: 1.828552.
Iteration 6204: Policy loss: -0.011337. Value loss: 0.714676. Entropy: 1.822834.
Training network. lr: 0.000202. clip: 0.080960
Iteration 6205: Policy loss: 0.001258. Value loss: 0.695844. Entropy: 1.881331.
Iteration 6206: Policy loss: -0.014449. Value los

Iteration 6267: Policy loss: -0.016723. Value loss: 0.369135. Entropy: 1.847799.
Training network. lr: 0.000202. clip: 0.080803
Iteration 6268: Policy loss: 0.014021. Value loss: 0.989830. Entropy: 1.820935.
Iteration 6269: Policy loss: -0.000108. Value loss: 0.566778. Entropy: 1.832405.
Iteration 6270: Policy loss: -0.009981. Value loss: 0.400012. Entropy: 1.812553.
episode: 2065   score: 2750.0  epsilon: 1.0    steps: 864  evaluation reward: 1167.2
Training network. lr: 0.000202. clip: 0.080803
Iteration 6271: Policy loss: 0.002029. Value loss: 0.991663. Entropy: 1.853255.
Iteration 6272: Policy loss: -0.010839. Value loss: 0.551340. Entropy: 1.854057.
Iteration 6273: Policy loss: -0.019397. Value loss: 0.351182. Entropy: 1.844019.
episode: 2066   score: 1470.0  epsilon: 1.0    steps: 648  evaluation reward: 1178.6
Training network. lr: 0.000202. clip: 0.080803
Iteration 6274: Policy loss: 0.002322. Value loss: 0.833054. Entropy: 1.822151.
Iteration 6275: Policy loss: -0.012476. Valu

Iteration 6336: Policy loss: -0.015918. Value loss: 0.625590. Entropy: 1.841380.
episode: 2084   score: 980.0  epsilon: 1.0    steps: 416  evaluation reward: 1187.0
Training network. lr: 0.000202. clip: 0.080656
Iteration 6337: Policy loss: 0.009463. Value loss: 0.974647. Entropy: 1.828393.
Iteration 6338: Policy loss: -0.007423. Value loss: 0.541423. Entropy: 1.816451.
Iteration 6339: Policy loss: -0.013586. Value loss: 0.430190. Entropy: 1.809787.
episode: 2085   score: 680.0  epsilon: 1.0    steps: 760  evaluation reward: 1188.0
Training network. lr: 0.000202. clip: 0.080656
Iteration 6340: Policy loss: 0.002579. Value loss: 0.969921. Entropy: 1.787132.
Iteration 6341: Policy loss: -0.006572. Value loss: 0.604062. Entropy: 1.774320.
Iteration 6342: Policy loss: -0.012900. Value loss: 0.418008. Entropy: 1.751662.
episode: 2086   score: 780.0  epsilon: 1.0    steps: 240  evaluation reward: 1182.8
Training network. lr: 0.000202. clip: 0.080656
Iteration 6343: Policy loss: 0.006976. Val

Iteration 6401: Policy loss: -0.007187. Value loss: 0.643648. Entropy: 1.644598.
Iteration 6402: Policy loss: -0.015674. Value loss: 0.501848. Entropy: 1.628024.
episode: 2107   score: 2450.0  epsilon: 1.0    steps: 328  evaluation reward: 1164.8
Training network. lr: 0.000201. clip: 0.080342
Iteration 6403: Policy loss: 0.010655. Value loss: 1.260436. Entropy: 1.577168.
Iteration 6404: Policy loss: -0.005535. Value loss: 0.900386. Entropy: 1.565760.
Iteration 6405: Policy loss: -0.009182. Value loss: 0.667519. Entropy: 1.536293.
Training network. lr: 0.000201. clip: 0.080342
Iteration 6406: Policy loss: 0.005563. Value loss: 0.585382. Entropy: 1.800496.
Iteration 6407: Policy loss: -0.012703. Value loss: 0.293811. Entropy: 1.803128.
Iteration 6408: Policy loss: -0.021915. Value loss: 0.230549. Entropy: 1.795016.
episode: 2108   score: 630.0  epsilon: 1.0    steps: 736  evaluation reward: 1157.9
episode: 2109   score: 380.0  epsilon: 1.0    steps: 760  evaluation reward: 1145.9
Trainin

Iteration 6471: Policy loss: -0.009995. Value loss: 0.327455. Entropy: 1.898922.
episode: 2125   score: 1350.0  epsilon: 1.0    steps: 40  evaluation reward: 1173.5
episode: 2126   score: 1490.0  epsilon: 1.0    steps: 200  evaluation reward: 1182.6
Training network. lr: 0.000200. clip: 0.080195
Iteration 6472: Policy loss: 0.014820. Value loss: 0.960338. Entropy: 1.790909.
Iteration 6473: Policy loss: -0.000559. Value loss: 0.555825. Entropy: 1.769279.
Iteration 6474: Policy loss: -0.010976. Value loss: 0.396096. Entropy: 1.761159.
episode: 2127   score: 230.0  epsilon: 1.0    steps: 232  evaluation reward: 1172.1
Training network. lr: 0.000200. clip: 0.080195
Iteration 6475: Policy loss: 0.005703. Value loss: 0.793969. Entropy: 1.814306.
Iteration 6476: Policy loss: -0.011271. Value loss: 0.532899. Entropy: 1.805880.
Iteration 6477: Policy loss: -0.016093. Value loss: 0.383844. Entropy: 1.796394.
Training network. lr: 0.000200. clip: 0.080195
Iteration 6478: Policy loss: 0.010058. Va

Iteration 6540: Policy loss: -0.013989. Value loss: 0.475215. Entropy: 1.826673.
episode: 2144   score: 1300.0  epsilon: 1.0    steps: 32  evaluation reward: 1176.3
Training network. lr: 0.000200. clip: 0.080038
Iteration 6541: Policy loss: 0.002792. Value loss: 1.319313. Entropy: 1.663229.
Iteration 6542: Policy loss: -0.005437. Value loss: 0.866128. Entropy: 1.641925.
Iteration 6543: Policy loss: -0.009898. Value loss: 0.630247. Entropy: 1.613974.
episode: 2145   score: 580.0  epsilon: 1.0    steps: 16  evaluation reward: 1169.1
Training network. lr: 0.000200. clip: 0.080038
Iteration 6544: Policy loss: 0.005671. Value loss: 0.853891. Entropy: 1.608122.
Iteration 6545: Policy loss: -0.006533. Value loss: 0.636213. Entropy: 1.612802.
Iteration 6546: Policy loss: -0.014691. Value loss: 0.546734. Entropy: 1.608574.
episode: 2146   score: 680.0  epsilon: 1.0    steps: 640  evaluation reward: 1168.1
Training network. lr: 0.000200. clip: 0.080038
Iteration 6547: Policy loss: 0.013929. Valu

Iteration 6605: Policy loss: -0.012631. Value loss: 0.958046. Entropy: 1.777357.
Iteration 6606: Policy loss: -0.018515. Value loss: 0.702074. Entropy: 1.749575.
Training network. lr: 0.000199. clip: 0.079734
Iteration 6607: Policy loss: 0.007524. Value loss: 0.878522. Entropy: 1.908027.
Iteration 6608: Policy loss: -0.005680. Value loss: 0.558301. Entropy: 1.925150.
Iteration 6609: Policy loss: -0.017585. Value loss: 0.418319. Entropy: 1.904623.
episode: 2167   score: 930.0  epsilon: 1.0    steps: 624  evaluation reward: 1064.1
Training network. lr: 0.000199. clip: 0.079734
Iteration 6610: Policy loss: 0.006978. Value loss: 0.860503. Entropy: 1.840352.
Iteration 6611: Policy loss: -0.007136. Value loss: 0.619036. Entropy: 1.841687.
Iteration 6612: Policy loss: -0.017521. Value loss: 0.483336. Entropy: 1.839580.
Training network. lr: 0.000199. clip: 0.079734
Iteration 6613: Policy loss: 0.008405. Value loss: 0.902853. Entropy: 1.737495.
Iteration 6614: Policy loss: -0.009598. Value los

Iteration 6671: Policy loss: -0.006554. Value loss: 0.764150. Entropy: 1.597100.
Iteration 6672: Policy loss: -0.016110. Value loss: 0.532257. Entropy: 1.580815.
episode: 2189   score: 380.0  epsilon: 1.0    steps: 368  evaluation reward: 1073.5
Training network. lr: 0.000199. clip: 0.079577
Iteration 6673: Policy loss: 0.007549. Value loss: 1.156978. Entropy: 1.606127.
Iteration 6674: Policy loss: -0.007583. Value loss: 0.659729. Entropy: 1.597325.
Iteration 6675: Policy loss: -0.014952. Value loss: 0.423755. Entropy: 1.591075.
Training network. lr: 0.000199. clip: 0.079577
Iteration 6676: Policy loss: 0.005497. Value loss: 0.908385. Entropy: 1.616710.
Iteration 6677: Policy loss: -0.010531. Value loss: 0.495701. Entropy: 1.646550.
Iteration 6678: Policy loss: -0.018060. Value loss: 0.350017. Entropy: 1.631733.
Training network. lr: 0.000199. clip: 0.079577
Iteration 6679: Policy loss: 0.004619. Value loss: 1.369672. Entropy: 1.688710.
Iteration 6680: Policy loss: -0.005687. Value los

episode: 2209   score: 780.0  epsilon: 1.0    steps: 344  evaluation reward: 1086.0
Training network. lr: 0.000199. clip: 0.079421
Iteration 6739: Policy loss: 0.009808. Value loss: 0.986274. Entropy: 1.586364.
Iteration 6740: Policy loss: 0.000517. Value loss: 0.688325. Entropy: 1.579577.
Iteration 6741: Policy loss: -0.006462. Value loss: 0.477914. Entropy: 1.561511.
episode: 2210   score: 1280.0  epsilon: 1.0    steps: 136  evaluation reward: 1094.9
Training network. lr: 0.000199. clip: 0.079421
Iteration 6742: Policy loss: 0.011542. Value loss: 0.695475. Entropy: 1.434982.
Iteration 6743: Policy loss: -0.004873. Value loss: 0.376133. Entropy: 1.421368.
Iteration 6744: Policy loss: -0.015898. Value loss: 0.271582. Entropy: 1.425292.
Training network. lr: 0.000199. clip: 0.079421
Iteration 6745: Policy loss: 0.009393. Value loss: 1.567023. Entropy: 1.708953.
Iteration 6746: Policy loss: -0.001992. Value loss: 1.130268. Entropy: 1.727636.
Iteration 6747: Policy loss: -0.012500. Value 

episode: 2231   score: 530.0  epsilon: 1.0    steps: 280  evaluation reward: 1037.1
Training network. lr: 0.000198. clip: 0.079117
Iteration 6805: Policy loss: 0.010171. Value loss: 1.658595. Entropy: 1.852508.
Iteration 6806: Policy loss: -0.004162. Value loss: 1.232839. Entropy: 1.831921.
Iteration 6807: Policy loss: -0.007531. Value loss: 0.994424. Entropy: 1.810591.
Training network. lr: 0.000198. clip: 0.079117
Iteration 6808: Policy loss: 0.000803. Value loss: 0.916649. Entropy: 1.829836.
Iteration 6809: Policy loss: -0.005745. Value loss: 0.678261. Entropy: 1.852668.
Iteration 6810: Policy loss: -0.014376. Value loss: 0.562293. Entropy: 1.831348.
episode: 2232   score: 680.0  epsilon: 1.0    steps: 320  evaluation reward: 1029.9
episode: 2233   score: 1080.0  epsilon: 1.0    steps: 584  evaluation reward: 1036.8
Training network. lr: 0.000198. clip: 0.079117
Iteration 6811: Policy loss: 0.007191. Value loss: 0.722109. Entropy: 1.687689.
Iteration 6812: Policy loss: -0.003752. Va

Training network. lr: 0.000197. clip: 0.078960
Iteration 6874: Policy loss: 0.010001. Value loss: 0.636243. Entropy: 1.828645.
Iteration 6875: Policy loss: -0.004295. Value loss: 0.376430. Entropy: 1.828899.
Iteration 6876: Policy loss: -0.008931. Value loss: 0.281386. Entropy: 1.824000.
Training network. lr: 0.000197. clip: 0.078960
Iteration 6877: Policy loss: 0.007448. Value loss: 1.047811. Entropy: 1.952200.
Iteration 6878: Policy loss: -0.007577. Value loss: 0.698802. Entropy: 1.954474.
Iteration 6879: Policy loss: -0.014666. Value loss: 0.527225. Entropy: 1.959084.
Training network. lr: 0.000197. clip: 0.078960
Iteration 6880: Policy loss: 0.002383. Value loss: 0.842471. Entropy: 1.918407.
Iteration 6881: Policy loss: -0.006069. Value loss: 0.518085. Entropy: 1.923140.
Iteration 6882: Policy loss: -0.014510. Value loss: 0.372356. Entropy: 1.903640.
episode: 2250   score: 830.0  epsilon: 1.0    steps: 832  evaluation reward: 1039.1
now time :  2019-03-06 11:13:38.303827
episode: 2

Iteration 6942: Policy loss: -0.013798. Value loss: 0.439070. Entropy: 1.777806.
Training network. lr: 0.000197. clip: 0.078812
Iteration 6943: Policy loss: 0.010377. Value loss: 0.641862. Entropy: 1.666797.
Iteration 6944: Policy loss: -0.006969. Value loss: 0.423632. Entropy: 1.686804.
Iteration 6945: Policy loss: -0.016180. Value loss: 0.340444. Entropy: 1.661907.
episode: 2269   score: 1180.0  epsilon: 1.0    steps: 992  evaluation reward: 1101.0
Training network. lr: 0.000197. clip: 0.078812
Iteration 6946: Policy loss: 0.011389. Value loss: 0.703044. Entropy: 1.630524.
Iteration 6947: Policy loss: -0.005700. Value loss: 0.433706. Entropy: 1.607209.
Iteration 6948: Policy loss: -0.011085. Value loss: 0.318321. Entropy: 1.605069.
Training network. lr: 0.000197. clip: 0.078812
Iteration 6949: Policy loss: 0.010329. Value loss: 0.823826. Entropy: 1.543516.
Iteration 6950: Policy loss: 0.000494. Value loss: 0.519084. Entropy: 1.541974.
Iteration 6951: Policy loss: -0.010700. Value los

Iteration 7010: Policy loss: -0.002509. Value loss: 0.444585. Entropy: 1.534002.
Iteration 7011: Policy loss: -0.015448. Value loss: 0.337692. Entropy: 1.522084.
Training network. lr: 0.000196. clip: 0.078499
Iteration 7012: Policy loss: 0.005226. Value loss: 0.730878. Entropy: 1.740042.
Iteration 7013: Policy loss: -0.007713. Value loss: 0.412880. Entropy: 1.745374.
Iteration 7014: Policy loss: -0.012654. Value loss: 0.269414. Entropy: 1.718753.
Training network. lr: 0.000196. clip: 0.078499
Iteration 7015: Policy loss: 0.008263. Value loss: 0.728185. Entropy: 1.807267.
Iteration 7016: Policy loss: -0.008171. Value loss: 0.440993. Entropy: 1.819609.
Iteration 7017: Policy loss: -0.016770. Value loss: 0.259039. Entropy: 1.794860.
Training network. lr: 0.000196. clip: 0.078499
Iteration 7018: Policy loss: 0.016617. Value loss: 1.110807. Entropy: 1.867421.
Iteration 7019: Policy loss: -0.002519. Value loss: 0.681738. Entropy: 1.850068.
Iteration 7020: Policy loss: -0.010188. Value loss: 

Iteration 7079: Policy loss: -0.004846. Value loss: 0.423706. Entropy: 1.650413.
Iteration 7080: Policy loss: -0.014121. Value loss: 0.327365. Entropy: 1.657966.
Training network. lr: 0.000196. clip: 0.078352
Iteration 7081: Policy loss: 0.008209. Value loss: 1.007570. Entropy: 1.668484.
Iteration 7082: Policy loss: -0.007463. Value loss: 0.624113. Entropy: 1.680873.
Iteration 7083: Policy loss: -0.010367. Value loss: 0.496794. Entropy: 1.691687.
Training network. lr: 0.000196. clip: 0.078352
Iteration 7084: Policy loss: 0.008873. Value loss: 1.075544. Entropy: 1.731397.
Iteration 7085: Policy loss: -0.007280. Value loss: 0.768021. Entropy: 1.754236.
Iteration 7086: Policy loss: -0.017394. Value loss: 0.629957. Entropy: 1.722867.
episode: 2307   score: 1180.0  epsilon: 1.0    steps: 520  evaluation reward: 1159.4
Training network. lr: 0.000196. clip: 0.078352
Iteration 7087: Policy loss: 0.002198. Value loss: 1.414263. Entropy: 1.750092.
Iteration 7088: Policy loss: 0.003898. Value los

Iteration 7149: Policy loss: -0.011190. Value loss: 0.231563. Entropy: 1.673953.
episode: 2325   score: 160.0  epsilon: 1.0    steps: 832  evaluation reward: 1211.8
Training network. lr: 0.000195. clip: 0.078195
Iteration 7150: Policy loss: 0.012630. Value loss: 0.745687. Entropy: 1.690140.
Iteration 7151: Policy loss: -0.001322. Value loss: 0.415186. Entropy: 1.707564.
Iteration 7152: Policy loss: -0.011297. Value loss: 0.310669. Entropy: 1.691629.
episode: 2326   score: 1300.0  epsilon: 1.0    steps: 544  evaluation reward: 1217.0
episode: 2327   score: 1080.0  epsilon: 1.0    steps: 824  evaluation reward: 1216.0
Training network. lr: 0.000195. clip: 0.078038
Iteration 7153: Policy loss: 0.003249. Value loss: 0.952412. Entropy: 1.786503.
Iteration 7154: Policy loss: -0.009981. Value loss: 0.638099. Entropy: 1.808880.
Iteration 7155: Policy loss: -0.019445. Value loss: 0.497665. Entropy: 1.793057.
episode: 2328   score: 780.0  epsilon: 1.0    steps: 672  evaluation reward: 1211.0
Tra