# Deep Q-Learning 

For this assignment we will implement the Deep Q-Learning algorithm with Experience Replay as described in breakthrough paper __"Playing Atari with Deep Reinforcement Learning"__. We will train an agent to play the famous game of __Breakout__.

In [1]:
import sys
import gym
import torch
import pylab
import random
import numpy as np
import time
from collections import deque
from datetime import datetime
from copy import deepcopy
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
torch.backends.cudnn.benchmarks = True
from torch.autograd import Variable
from utils import *
from agent import *
from model import *
from config import *
from env import GameEnv
%matplotlib inline
%load_ext autoreload
%autoreload 2

## Understanding the environment

In the following cell, we initialise our game of __Breakout__ and you can see how the environment looks like. For further documentation of the of the environment refer to https://gym.openai.com/envs. 

In [2]:
envs = []
for i in range(num_envs):
    envs.append(GameEnv('SpaceInvadersDeterministic-v4'))
#env.render()

  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "


In [3]:
number_lives = envs[0].life
state_size = envs[0].observation_space.shape
action_size = envs[0].action_space.n
rewards, episodes = [], []

## Creating a DQN Agent

Here we create a DQN Agent. This agent is defined in the __agent.py__. The corresponding neural network is defined in the __model.py__. 

__Evaluation Reward__ : The average reward received in the past 100 episodes/games.

__Frame__ : Number of frames processed in total.

__Memory Size__ : The current size of the replay memory.

### Main Training Loop

In [None]:
agent = Agent(action_size, mode='PPO_MHDPA')
torch.save(agent.policy_net.state_dict(), "./save_model/spaceinvaders_ppo_best")
evaluation_reward = deque(maxlen=evaluation_reward_length)
frame = 0
memory_size = 0
reset_max = 10


### Loop through all environments and run PPO on them

#env_names = ['Breakout-v0', 'Phoenix-v0', 'Asteroids-v0', 'SpaceInvaders-v0', 'MsPacman-v0', 'Asterix-v0', 'Atlantis-v0', 'Alien-v0', 'Amidar-v0', 'Assault-v0', 'BankHeist-v0']
env_names = ['SpaceInvaders-v4']
for a in range(len(env_names)):
    name = env_names[a]
    print("\n\n\n ------- STARTING TRAINING FOR %s ------- \n\n\n" % (name))
    
    envs = []
    for i in range(num_envs):
        envs.append(GameEnv(name))
    #env.render()
    

    number_lives = envs[0].life
    state_size = envs[0].observation_space.shape
    if (name == 'SpaceInvaders-v0' or name == 'Breakout-v0'):
        action_size = 4
    else:
        action_size = envs[0].action_space.n
    rewards, episodes = [], []

    vis_env_idx = 0
    vis_env = envs[vis_env_idx]
    e = 0
    frame = 0
    max_eval = -np.inf
    reset_count = 0


    agent = Agent(action_size, mode='PPO_MHDPA')
    torch.save(agent.policy_net.state_dict(), "./save_model/" + name + "_best")
    evaluation_reward = deque(maxlen=evaluation_reward_length)
    frame = 0
    memory_size = 0
    reset_max = 10
    
    print("Determing min/max rewards of environment")
    [low, high] = score_range = get_score_range(name)
    print("Min: %d. Max: %d." % (low, high))

    while (frame < 10000000):
        step = 0
        assert(num_envs * env_mem_size == train_frame)
        frame_next_vals = []
        
        for j in range(env_mem_size):
            
            curr_states = np.stack([envs[i].history[HISTORY_SIZE-1,:,:] for i in range(num_envs)])
            next_states = []
            net_in = np.stack([envs[i].history[:HISTORY_SIZE,:,:] for i in range(num_envs)])
            step += num_envs
            frame += num_envs
            actions, values, _ = agent.get_action(np.float32(net_in) / 255.)
            
            for i in range(num_envs):
                env = envs[i]
                next_state, env.reward, env.done, env.info = env.step(actions[i])
                next_states.append(next_state)
                if (i == vis_env_idx):
                    vis_env._env.render()
            
            for i in range(num_envs):
                env = envs[i]
                """
                next_state, env.reward, env.done, env.info = env.step(actions[i])
                if (i == vis_env_idx):
                    vis_env._env.render()
                """
                
                frame_next_state = get_frame(next_states[i])
                env.history[HISTORY_SIZE,:,:] = frame_next_state
                terminal_state = check_live(env.life, env.info['ale.lives'])
                env.life = env.info['ale.lives']
                r = (env.reward / high) * 20.0 #np.log(max(env.reward+1, 1))#((env.reward - low) / (high - low)) * 30
                agent.memory.push(i, deepcopy(curr_states[i]), actions[i], r, terminal_state, values[i], 0, 0)
                
                if (j == env_mem_size-1):
                    net_in = np.stack([envs[k].history[1:,:,:] for k in range(num_envs)])
                    _, frame_next_vals, _ = agent.get_action(np.float32(net_in) / 255.)
                
                env.score += env.reward
                env.history[:HISTORY_SIZE, :, :] = env.history[1:,:,:]
        
                if (env.done):
                    if (e % 50 == 0):
                        print('now time : ', datetime.now())
                        rewards.append(np.mean(evaluation_reward))
                        episodes.append(e)
                        pylab.plot(episodes, rewards, 'b')
                        pylab.savefig("./save_graph/" + name + "_ppo.png")
                        torch.save(agent.policy_net, "./save_model/" + name + "_ppo")

                        if np.mean(evaluation_reward) > max_eval:
                            torch.save(agent.policy_net.state_dict(), "./save_model/"  + name + "_ppo_best")
                            max_eval = float(np.mean(evaluation_reward))
                            reset_count = 0
                        elif e > 5000:
                            reset_count += 1
                            """
                            if (reset_count == reset_max):
                                print("Training went nowhere, starting again at best model")
                                agent.policy_net.load_state_dict(torch.load("./save_model/spaceinvaders_ppo_best"))
                                agent.update_target_net()
                                reset_count = 0
                            """
                    e += 1
                    evaluation_reward.append(env.score)
                    print("episode:", e, "  score:", env.score,  " epsilon:", agent.epsilon, "   steps:", step,
                      " evaluation reward:", np.mean(evaluation_reward))

                    env.done = False
                    env.score = 0
                    env.history = np.zeros([HISTORY_SIZE+1,84,84], dtype=np.uint8)
                    env.state = env.reset()
                    env.life = number_lives
                    get_init_state(env.history, env.state)
            
        agent.train_policy_net(frame, frame_next_vals)
        agent.update_target_net()
    print("FINISHED TRAINING FOR %s" % (name))
    pylab.figure()
    
    for i in range(len(envs)):
        envs[i]._env.close()
    del envs

In [None]:
def test_best(name):
    env = GameEnv(name)
    print("\n\n\n ------- TESTING BEST MODEL FOR %s ------- \n\n\n" % (name))
    number_lives = env.life
    
    if (name == 'SpaceInvaders-v0'):
        action_size = 4
    else:
        action_size = env.action_space.n
    rewards, episodes = [], []
    
    e = 0
    frame = 0

    agent = Agent(action_size)
    agent.policy_net.load_state_dict(torch.load("./save_model/" + name + "_ppo_best"))
    agent.update_target_net()
    agent.policy_net.eval()
    evaluation_reward = deque(maxlen=evaluation_reward_length)

    for i in range(100):
        env.done = False
        env.score = 0
        env.history = np.zeros([HISTORY_SIZE+1,84,84], dtype=np.uint8)
        env.state = env.reset()
        env.life = number_lives
        get_init_state(env.history, env.state)
        step = 0
        while not env.done:
            curr_state = env.history[HISTORY_SIZE-1,:,:]
            net_in = env.history[:HISTORY_SIZE,:,:]
            action, value, _ = agent.get_action(np.float32(net_in) / 255.)
            
            next_state, env.reward, env.done, env.info = env.step(action)
            env._env.render()
            
            frame_next_state = get_frame(next_state)
            
            env.history[HISTORY_SIZE,:,:] = frame_next_state
            terminal_state = check_live(env.life, env.info['ale.lives'])
            env.life = env.info['ale.lives']
            
            
            env.score += env.reward
            env.history[:HISTORY_SIZE, :, :] = env.history[1:,:,:]
            step += 1
        

        evaluation_reward.append(env.score)
        print("episode:", e, "  score:", env.score,  " epsilon:", agent.epsilon, "   steps:", step,
                      " evaluation reward:", np.mean(evaluation_reward))
            

In [None]:
test_best('MsPacman-v0')

### Convolutional LSTM agent

In [None]:
agent = Agent(action_size, mode='PPO_LSTM')
torch.save(agent.policy_net.state_dict(), "./save_model/spaceinvaders_ppo_best")
evaluation_reward = deque(maxlen=evaluation_reward_length)
frame = 0
memory_size = 0
reset_max = 10


### Loop through all environments and run PPO on them

#env_names = ['Breakout-v0', 'Phoenix-v0', 'Asteroids-v0', 'SpaceInvaders-v0', 'MsPacman-v0', 'Asterix-v0', 'Atlantis-v0', 'Alien-v0', 'Amidar-v0', 'Assault-v0', 'BankHeist-v0']
env_names = ['SpaceInvaders-v4']
for a in range(len(env_names)):
    name = env_names[a]
    print("\n\n\n ------- STARTING TRAINING FOR %s ------- \n\n\n" % (name))
    
    envs = []
    for i in range(num_envs):
        envs.append(GameEnv(name))
        envs[i].reset_memory(agent.init_hidden())
    #env.render()
    

    number_lives = envs[0].life
    state_size = envs[0].observation_space.shape
    if (name == 'SpaceInvaders-v0' or name == 'Breakout-v0'):
        action_size = 4
    else:
        action_size = envs[0].action_space.n
    rewards, episodes = [], []

    vis_env_idx = 0
    vis_env = envs[vis_env_idx]
    e = 0
    frame = 0
    max_eval = -np.inf
    reset_count = 0


    agent = Agent(action_size, mode='PPO_LSTM')
    torch.save(agent.policy_net.state_dict(), "./save_model/" + name + "_best")
    evaluation_reward = deque(maxlen=evaluation_reward_length)
    frame = 0
    memory_size = 0
    reset_max = 10
    
    print("Determing min/max rewards of environment")
    [low, high] = score_range = get_score_range(name)
    print("Min: %d. Max: %d." % (low, high))

    while (frame < 10000000):
        step = 0
        assert(num_envs * env_mem_size == train_frame)
        frame_next_vals = []
        
        for j in range(env_mem_size):
            
            curr_states = np.stack([envs[i].history[[HISTORY_SIZE-1],:,:] for i in range(num_envs)])
            hiddens = torch.cat([envs[i].memory for i in range(num_envs)])
            next_states = []
            step += num_envs
            frame += num_envs
            actions, values, hiddens = agent.get_action(np.float32(curr_states) / 255., hiddens)
            hiddens = hiddens.detach()
            
            for i in range(num_envs):
                env = envs[i]
                next_state, env.reward, env.done, env.info = env.step(actions[i])
                next_states.append(next_state)
                if (i == vis_env_idx):
                    vis_env._env.render()
            
            for i in range(num_envs):
                env = envs[i]
                """
                next_state, env.reward, env.done, env.info = env.step(actions[i])
                if (i == vis_env_idx):
                    vis_env._env.render()
                """
                
                frame_next_state = get_frame(next_states[i])
                env.history[HISTORY_SIZE,:,:] = frame_next_state
                env.memory = hiddens[[i]]
                terminal_state = check_live(env.life, env.info['ale.lives'])
                env.life = env.info['ale.lives']
                r = (env.reward / high) * 20.0 #np.log(max(env.reward+1, 1))#((env.reward - low) / (high - low)) * 30
                agent.memory.push(i, [deepcopy(curr_states[i]), hiddens[i].detach().cpu().data.numpy()], actions[i], r, terminal_state, values[i], 0, 0)
                
                if (j == env_mem_size-1):
                    #net_in = np.stack([envs[k].history[1:,:,:] for k in range(num_envs)])
                    net_in = np.stack([envs[k].history[[-1],:,:] for k in range(num_envs)])
                    _, frame_next_vals, _ = agent.get_action(np.float32(net_in) / 255., hiddens)
                
                env.score += env.reward
                env.history[:HISTORY_SIZE, :, :] = env.history[1:,:,:]
        
                if (env.done):
                    if (e % 50 == 0):
                        print('now time : ', datetime.now())
                        rewards.append(np.mean(evaluation_reward))
                        episodes.append(e)
                        pylab.plot(episodes, rewards, 'b')
                        pylab.savefig("./save_graph/" + name + "_ppo.png")
                        torch.save(agent.policy_net, "./save_model/" + name + "_ppo")

                        if np.mean(evaluation_reward) > max_eval:
                            torch.save(agent.policy_net.state_dict(), "./save_model/"  + name + "_ppo_best")
                            max_eval = float(np.mean(evaluation_reward))
                            reset_count = 0
                        elif e > 5000:
                            reset_count += 1
                            """
                            if (reset_count == reset_max):
                                print("Training went nowhere, starting again at best model")
                                agent.policy_net.load_state_dict(torch.load("./save_model/spaceinvaders_ppo_best"))
                                agent.update_target_net()
                                reset_count = 0
                            """
                    e += 1
                    evaluation_reward.append(env.score)
                    print("episode:", e, "  score:", env.score,  " epsilon:", agent.epsilon, "   steps:", step,
                      " evaluation reward:", np.mean(evaluation_reward))

                    env.done = False
                    env.score = 0
                    env.history = np.zeros([HISTORY_SIZE+1,84,84], dtype=np.uint8)
                    env.state = env.reset()
                    env.life = number_lives
                    get_init_state(env.history, env.state)
                    env.reset_memory(agent.init_hidden())
            
        agent.train_policy_net(frame, frame_next_vals)
        agent.update_target_net()
    print("FINISHED TRAINING FOR %s" % (name))
    pylab.figure()
    
    for i in range(len(envs)):
        envs[i]._env.close()
    del envs




 ------- STARTING TRAINING FOR SpaceInvaders-v4 ------- 



Determing min/max rewards of environment
Min: 0. Max: 200.


  probs = F.softmax(x[:,:self.action_size] - torch.max(x[:,:self.action_size],1)[0].unsqueeze(1))


Training network. lr: 0.000250. clip: 0.100000


  probs = F.softmax(x[:,:self.action_size] - torch.max(x[:,:self.action_size],1)[0].unsqueeze(1))
  pol_loss += pol_avg.detach().cpu()[0]
  vf_loss += value_loss.detach().cpu()[0]
  ent_total += ent.detach().cpu()[0]


Iteration 1: Policy loss: -0.003072. Value loss: 0.022652. Entropy: 0.299232.
Iteration 2: Policy loss: -0.003580. Value loss: 0.004498. Entropy: 0.298123.
Iteration 3: Policy loss: -0.001997. Value loss: 0.003115. Entropy: 0.299286.
Training network. lr: 0.000250. clip: 0.100000
Iteration 4: Policy loss: -0.260096. Value loss: 0.115091. Entropy: 0.296358.
Iteration 5: Policy loss: -0.265775. Value loss: 0.115090. Entropy: 0.299206.
Iteration 6: Policy loss: -0.264243. Value loss: 0.107152. Entropy: 0.296674.
now time :  2019-09-05 14:14:58.883665
episode: 1   score: 30.0  epsilon: 1.0    steps: 1016  evaluation reward: 30.0
Training network. lr: 0.000250. clip: 0.100000


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Iteration 7: Policy loss: -0.127577. Value loss: 0.097120. Entropy: 0.299122.
Iteration 8: Policy loss: -0.129910. Value loss: 0.098277. Entropy: 0.299415.
Iteration 9: Policy loss: -0.128063. Value loss: 0.101629. Entropy: 0.298915.
Training network. lr: 0.000250. clip: 0.100000
Iteration 10: Policy loss: -0.182123. Value loss: 0.134210. Entropy: 0.301229.
Iteration 11: Policy loss: -0.193441. Value loss: 0.141604. Entropy: 0.300440.
Iteration 12: Policy loss: -0.192024. Value loss: 0.141548. Entropy: 0.300822.
episode: 2   score: 105.0  epsilon: 1.0    steps: 936  evaluation reward: 67.5
episode: 3   score: 110.0  epsilon: 1.0    steps: 952  evaluation reward: 81.66666666666667
Training network. lr: 0.000250. clip: 0.100000
Iteration 13: Policy loss: -0.365810. Value loss: 0.377602. Entropy: 0.299323.
Iteration 14: Policy loss: -0.352187. Value loss: 0.360414. Entropy: 0.298534.
Iteration 15: Policy loss: -0.374952. Value loss: 0.368787. Entropy: 0.297435.
episode: 4   score: 110.0  

Iteration 70: Policy loss: -0.076122. Value loss: 0.137759. Entropy: 0.301880.
Iteration 71: Policy loss: -0.069569. Value loss: 0.101255. Entropy: 0.301401.
Iteration 72: Policy loss: -0.075448. Value loss: 0.082537. Entropy: 0.300758.
episode: 28   score: 110.0  epsilon: 1.0    steps: 864  evaluation reward: 183.39285714285714
Training network. lr: 0.000250. clip: 0.099853
Iteration 73: Policy loss: 0.059979. Value loss: 0.093275. Entropy: 0.303562.
Iteration 74: Policy loss: 0.065201. Value loss: 0.058164. Entropy: 0.303229.
Iteration 75: Policy loss: 0.062696. Value loss: 0.045372. Entropy: 0.302738.
episode: 29   score: 155.0  epsilon: 1.0    steps: 456  evaluation reward: 182.41379310344828
Training network. lr: 0.000250. clip: 0.099853
Iteration 76: Policy loss: 0.189335. Value loss: 0.147428. Entropy: 0.304712.
Iteration 77: Policy loss: 0.191607. Value loss: 0.091933. Entropy: 0.304422.
Iteration 78: Policy loss: 0.195197. Value loss: 0.079831. Entropy: 0.304911.
episode: 30  

Iteration 132: Policy loss: -0.118019. Value loss: 0.041037. Entropy: 0.301752.
episode: 54   score: 155.0  epsilon: 1.0    steps: 664  evaluation reward: 186.11111111111111
Training network. lr: 0.000249. clip: 0.099696
Iteration 133: Policy loss: -0.000096. Value loss: 0.085713. Entropy: 0.303117.
Iteration 134: Policy loss: 0.003984. Value loss: 0.044598. Entropy: 0.303275.
Iteration 135: Policy loss: 0.001568. Value loss: 0.031010. Entropy: 0.303298.
episode: 55   score: 180.0  epsilon: 1.0    steps: 200  evaluation reward: 186.0
episode: 56   score: 105.0  epsilon: 1.0    steps: 224  evaluation reward: 184.55357142857142
episode: 57   score: 235.0  epsilon: 1.0    steps: 800  evaluation reward: 185.43859649122808
Training network. lr: 0.000249. clip: 0.099696
Iteration 136: Policy loss: 0.086186. Value loss: 0.110051. Entropy: 0.301385.
Iteration 137: Policy loss: 0.075962. Value loss: 0.048434. Entropy: 0.301680.
Iteration 138: Policy loss: 0.085940. Value loss: 0.034758. Entropy

Iteration 193: Policy loss: 0.273945. Value loss: 0.177352. Entropy: 0.306840.
Iteration 194: Policy loss: 0.269715. Value loss: 0.107727. Entropy: 0.306228.
Iteration 195: Policy loss: 0.265975. Value loss: 0.071253. Entropy: 0.305879.
episode: 80   score: 155.0  epsilon: 1.0    steps: 264  evaluation reward: 187.625
episode: 81   score: 260.0  epsilon: 1.0    steps: 336  evaluation reward: 188.5185185185185
episode: 82   score: 210.0  epsilon: 1.0    steps: 736  evaluation reward: 188.78048780487805
Training network. lr: 0.000249. clip: 0.099548
Iteration 196: Policy loss: 0.292155. Value loss: 0.142949. Entropy: 0.302272.
Iteration 197: Policy loss: 0.299017. Value loss: 0.065318. Entropy: 0.302921.
Iteration 198: Policy loss: 0.296714. Value loss: 0.048246. Entropy: 0.303208.
Training network. lr: 0.000249. clip: 0.099548
Iteration 199: Policy loss: -0.285775. Value loss: 0.365460. Entropy: 0.305739.
Iteration 200: Policy loss: -0.304001. Value loss: 0.288381. Entropy: 0.306277.
It

Iteration 255: Policy loss: -0.151396. Value loss: 0.203915. Entropy: 0.303627.
episode: 106   score: 180.0  epsilon: 1.0    steps: 392  evaluation reward: 197.15
episode: 107   score: 110.0  epsilon: 1.0    steps: 928  evaluation reward: 197.35
Training network. lr: 0.000248. clip: 0.099235
Iteration 256: Policy loss: -0.080352. Value loss: 0.107484. Entropy: 0.301900.
Iteration 257: Policy loss: -0.079008. Value loss: 0.050604. Entropy: 0.302630.
Iteration 258: Policy loss: -0.093881. Value loss: 0.037557. Entropy: 0.302018.
episode: 108   score: 110.0  epsilon: 1.0    steps: 432  evaluation reward: 196.2
episode: 109   score: 455.0  epsilon: 1.0    steps: 856  evaluation reward: 196.9
Training network. lr: 0.000248. clip: 0.099235
Iteration 259: Policy loss: -0.147414. Value loss: 0.095688. Entropy: 0.300330.
Iteration 260: Policy loss: -0.148294. Value loss: 0.063489. Entropy: 0.300346.
Iteration 261: Policy loss: -0.150122. Value loss: 0.046794. Entropy: 0.300337.
Training network

Iteration 317: Policy loss: -0.086123. Value loss: 0.171853. Entropy: 0.307447.
Iteration 318: Policy loss: -0.090028. Value loss: 0.158286. Entropy: 0.308561.
episode: 134   score: 135.0  epsilon: 1.0    steps: 536  evaluation reward: 188.6
Training network. lr: 0.000248. clip: 0.099088
Iteration 319: Policy loss: 0.060408. Value loss: 0.025223. Entropy: 0.311062.
Iteration 320: Policy loss: 0.064124. Value loss: 0.011927. Entropy: 0.310658.
Iteration 321: Policy loss: 0.060911. Value loss: 0.010168. Entropy: 0.310868.
episode: 135   score: 155.0  epsilon: 1.0    steps: 744  evaluation reward: 187.6
Training network. lr: 0.000248. clip: 0.099088
Iteration 322: Policy loss: 0.176940. Value loss: 0.097887. Entropy: 0.310519.
Iteration 323: Policy loss: 0.175646. Value loss: 0.060738. Entropy: 0.310161.
Iteration 324: Policy loss: 0.168918. Value loss: 0.043239. Entropy: 0.310416.
Training network. lr: 0.000248. clip: 0.099088
Iteration 325: Policy loss: 0.092346. Value loss: 0.094472. E

episode: 159   score: 270.0  epsilon: 1.0    steps: 232  evaluation reward: 200.15
Training network. lr: 0.000247. clip: 0.098931
Iteration 382: Policy loss: -0.161313. Value loss: 0.368421. Entropy: 0.304058.
Iteration 383: Policy loss: -0.197960. Value loss: 0.234876. Entropy: 0.305501.
Iteration 384: Policy loss: -0.197468. Value loss: 0.185866. Entropy: 0.305080.
episode: 160   score: 380.0  epsilon: 1.0    steps: 328  evaluation reward: 201.85
episode: 161   score: 440.0  epsilon: 1.0    steps: 496  evaluation reward: 203.1
Training network. lr: 0.000247. clip: 0.098931
Iteration 385: Policy loss: 0.084137. Value loss: 0.114610. Entropy: 0.301088.
Iteration 386: Policy loss: 0.091531. Value loss: 0.052758. Entropy: 0.300881.
Iteration 387: Policy loss: 0.083467. Value loss: 0.035825. Entropy: 0.300521.
episode: 162   score: 210.0  epsilon: 1.0    steps: 480  evaluation reward: 203.85
Training network. lr: 0.000247. clip: 0.098931
Iteration 388: Policy loss: 0.448084. Value loss: 0

Iteration 446: Policy loss: 0.051993. Value loss: 0.038382. Entropy: 0.310445.
Iteration 447: Policy loss: 0.047904. Value loss: 0.035645. Entropy: 0.310614.
episode: 185   score: 160.0  epsilon: 1.0    steps: 16  evaluation reward: 202.3
episode: 186   score: 75.0  epsilon: 1.0    steps: 224  evaluation reward: 201.25
episode: 187   score: 210.0  epsilon: 1.0    steps: 568  evaluation reward: 201.75
Training network. lr: 0.000247. clip: 0.098774
Iteration 448: Policy loss: -0.410388. Value loss: 0.380311. Entropy: 0.312355.
Iteration 449: Policy loss: -0.406879. Value loss: 0.331196. Entropy: 0.311127.
Iteration 450: Policy loss: -0.418655. Value loss: 0.327167. Entropy: 0.311873.
episode: 188   score: 410.0  epsilon: 1.0    steps: 96  evaluation reward: 204.05
Training network. lr: 0.000247. clip: 0.098627
Iteration 451: Policy loss: -0.004777. Value loss: 0.060357. Entropy: 0.312691.
Iteration 452: Policy loss: -0.003765. Value loss: 0.040119. Entropy: 0.314112.
Iteration 453: Polic

Training network. lr: 0.000246. clip: 0.098470
Iteration 508: Policy loss: 0.143334. Value loss: 0.073930. Entropy: 0.310891.
Iteration 509: Policy loss: 0.135727. Value loss: 0.034330. Entropy: 0.309867.
Iteration 510: Policy loss: 0.139621. Value loss: 0.027317. Entropy: 0.310562.
episode: 213   score: 240.0  epsilon: 1.0    steps: 408  evaluation reward: 207.4
Training network. lr: 0.000246. clip: 0.098470
Iteration 511: Policy loss: 0.237362. Value loss: 0.179286. Entropy: 0.309028.
Iteration 512: Policy loss: 0.245715. Value loss: 0.102672. Entropy: 0.308951.
Iteration 513: Policy loss: 0.239417. Value loss: 0.080930. Entropy: 0.308315.
episode: 214   score: 50.0  epsilon: 1.0    steps: 704  evaluation reward: 206.85
Training network. lr: 0.000246. clip: 0.098470
Iteration 514: Policy loss: 0.043187. Value loss: 0.052896. Entropy: 0.303742.
Iteration 515: Policy loss: 0.039772. Value loss: 0.039807. Entropy: 0.303993.
Iteration 516: Policy loss: 0.039053. Value loss: 0.033090. Ent

Iteration 572: Policy loss: -0.085634. Value loss: 0.059892. Entropy: 0.315327.
Iteration 573: Policy loss: -0.088751. Value loss: 0.051303. Entropy: 0.314566.
episode: 239   score: 180.0  epsilon: 1.0    steps: 136  evaluation reward: 216.55
Training network. lr: 0.000246. clip: 0.098313
Iteration 574: Policy loss: 0.064143. Value loss: 0.093035. Entropy: 0.310301.
Iteration 575: Policy loss: 0.060098. Value loss: 0.053096. Entropy: 0.308524.
Iteration 576: Policy loss: 0.058230. Value loss: 0.039798. Entropy: 0.310218.
episode: 240   score: 180.0  epsilon: 1.0    steps: 968  evaluation reward: 215.95
Training network. lr: 0.000246. clip: 0.098313
Iteration 577: Policy loss: -0.069184. Value loss: 0.090246. Entropy: 0.307852.
Iteration 578: Policy loss: -0.069222. Value loss: 0.049696. Entropy: 0.309294.
Iteration 579: Policy loss: -0.074391. Value loss: 0.041001. Entropy: 0.307432.
Training network. lr: 0.000246. clip: 0.098313
Iteration 580: Policy loss: -0.006535. Value loss: 0.029

episode: 264   score: 460.0  epsilon: 1.0    steps: 1000  evaluation reward: 213.35
Training network. lr: 0.000245. clip: 0.098166
Iteration 637: Policy loss: 0.435789. Value loss: 0.158224. Entropy: 0.316739.
Iteration 638: Policy loss: 0.423871. Value loss: 0.075184. Entropy: 0.316123.
Iteration 639: Policy loss: 0.421722. Value loss: 0.057418. Entropy: 0.314886.
episode: 265   score: 140.0  epsilon: 1.0    steps: 856  evaluation reward: 212.95
episode: 266   score: 110.0  epsilon: 1.0    steps: 896  evaluation reward: 211.95
episode: 267   score: 210.0  epsilon: 1.0    steps: 904  evaluation reward: 211.95
Training network. lr: 0.000245. clip: 0.098166
Iteration 640: Policy loss: 0.518990. Value loss: 0.193227. Entropy: 0.312773.
Iteration 641: Policy loss: 0.496688. Value loss: 0.088088. Entropy: 0.311492.
Iteration 642: Policy loss: 0.501727. Value loss: 0.071791. Entropy: 0.312908.
episode: 268   score: 185.0  epsilon: 1.0    steps: 576  evaluation reward: 212.4
episode: 269   sc

Iteration 698: Policy loss: -0.334175. Value loss: 0.291601. Entropy: 0.305463.
Iteration 699: Policy loss: -0.306716. Value loss: 0.235482. Entropy: 0.306462.
episode: 293   score: 75.0  epsilon: 1.0    steps: 504  evaluation reward: 196.75
episode: 294   score: 210.0  epsilon: 1.0    steps: 824  evaluation reward: 197.05
Training network. lr: 0.000245. clip: 0.098009
Iteration 700: Policy loss: -0.019522. Value loss: 0.122763. Entropy: 0.310567.
Iteration 701: Policy loss: -0.018095. Value loss: 0.072068. Entropy: 0.312894.
Iteration 702: Policy loss: -0.021566. Value loss: 0.055665. Entropy: 0.311574.
episode: 295   score: 410.0  epsilon: 1.0    steps: 432  evaluation reward: 199.6
Training network. lr: 0.000245. clip: 0.097853
Iteration 703: Policy loss: -0.010226. Value loss: 0.068783. Entropy: 0.310512.
Iteration 704: Policy loss: -0.013442. Value loss: 0.046234. Entropy: 0.310803.
Iteration 705: Policy loss: -0.017030. Value loss: 0.032986. Entropy: 0.310235.
episode: 296   scor

Iteration 761: Policy loss: 0.099468. Value loss: 0.050773. Entropy: 0.307322.
Iteration 762: Policy loss: 0.092795. Value loss: 0.043434. Entropy: 0.307186.
episode: 320   score: 185.0  epsilon: 1.0    steps: 480  evaluation reward: 202.35
Training network. lr: 0.000244. clip: 0.097705
Iteration 763: Policy loss: -0.101951. Value loss: 0.038457. Entropy: 0.311380.
Iteration 764: Policy loss: -0.110501. Value loss: 0.020821. Entropy: 0.312606.
Iteration 765: Policy loss: -0.109399. Value loss: 0.015768. Entropy: 0.312464.
episode: 321   score: 210.0  epsilon: 1.0    steps: 880  evaluation reward: 201.85
Training network. lr: 0.000244. clip: 0.097705
Iteration 766: Policy loss: -0.167401. Value loss: 0.085403. Entropy: 0.307891.
Iteration 767: Policy loss: -0.174135. Value loss: 0.048966. Entropy: 0.306655.
Iteration 768: Policy loss: -0.173222. Value loss: 0.038282. Entropy: 0.305465.
episode: 322   score: 120.0  epsilon: 1.0    steps: 680  evaluation reward: 198.95
Training network. l

Training network. lr: 0.000244. clip: 0.097549
Iteration 826: Policy loss: 0.043008. Value loss: 0.020128. Entropy: 0.309589.
Iteration 827: Policy loss: 0.043385. Value loss: 0.007859. Entropy: 0.309781.
Iteration 828: Policy loss: 0.042307. Value loss: 0.007555. Entropy: 0.310032.
Training network. lr: 0.000244. clip: 0.097549
Iteration 829: Policy loss: -0.030747. Value loss: 0.199828. Entropy: 0.312241.
Iteration 830: Policy loss: -0.022786. Value loss: 0.147987. Entropy: 0.313047.
Iteration 831: Policy loss: -0.042524. Value loss: 0.157136. Entropy: 0.309871.
episode: 345   score: 335.0  epsilon: 1.0    steps: 176  evaluation reward: 200.35
episode: 346   score: 155.0  epsilon: 1.0    steps: 880  evaluation reward: 200.35
Training network. lr: 0.000244. clip: 0.097549
Iteration 832: Policy loss: -0.006028. Value loss: 0.251443. Entropy: 0.310197.
Iteration 833: Policy loss: -0.005790. Value loss: 0.190381. Entropy: 0.309764.
Iteration 834: Policy loss: -0.010735. Value loss: 0.165

Training network. lr: 0.000243. clip: 0.097392
Iteration 889: Policy loss: 0.064028. Value loss: 0.088296. Entropy: 0.312375.
Iteration 890: Policy loss: 0.062724. Value loss: 0.048590. Entropy: 0.312227.
Iteration 891: Policy loss: 0.060144. Value loss: 0.043817. Entropy: 0.311411.
episode: 372   score: 210.0  epsilon: 1.0    steps: 400  evaluation reward: 207.2
episode: 373   score: 180.0  epsilon: 1.0    steps: 760  evaluation reward: 207.95
Training network. lr: 0.000243. clip: 0.097392
Iteration 892: Policy loss: -0.049782. Value loss: 0.083552. Entropy: 0.312183.
Iteration 893: Policy loss: -0.042907. Value loss: 0.048511. Entropy: 0.310798.
Iteration 894: Policy loss: -0.045363. Value loss: 0.040396. Entropy: 0.312579.
Training network. lr: 0.000243. clip: 0.097392
Iteration 895: Policy loss: 0.009009. Value loss: 0.063010. Entropy: 0.308551.
Iteration 896: Policy loss: 0.009307. Value loss: 0.035756. Entropy: 0.309655.
Iteration 897: Policy loss: 0.007052. Value loss: 0.025500.

episode: 396   score: 180.0  epsilon: 1.0    steps: 112  evaluation reward: 215.8
Training network. lr: 0.000243. clip: 0.097088
Iteration 955: Policy loss: -0.032777. Value loss: 0.060411. Entropy: 0.323494.
Iteration 956: Policy loss: -0.030748. Value loss: 0.041253. Entropy: 0.324102.
Iteration 957: Policy loss: -0.035400. Value loss: 0.027471. Entropy: 0.324283.
episode: 397   score: 110.0  epsilon: 1.0    steps: 816  evaluation reward: 213.1
Training network. lr: 0.000243. clip: 0.097088
Iteration 958: Policy loss: 0.048075. Value loss: 0.047801. Entropy: 0.319483.
Iteration 959: Policy loss: 0.046277. Value loss: 0.023106. Entropy: 0.318479.
Iteration 960: Policy loss: 0.043414. Value loss: 0.018974. Entropy: 0.319972.
episode: 398   score: 210.0  epsilon: 1.0    steps: 128  evaluation reward: 213.1
Training network. lr: 0.000243. clip: 0.097088
Iteration 961: Policy loss: 0.162709. Value loss: 0.049892. Entropy: 0.321124.
Iteration 962: Policy loss: 0.158515. Value loss: 0.02277

Iteration 1019: Policy loss: 0.128685. Value loss: 0.028273. Entropy: 0.319968.
Iteration 1020: Policy loss: 0.129215. Value loss: 0.021116. Entropy: 0.320053.
episode: 421   score: 180.0  epsilon: 1.0    steps: 360  evaluation reward: 225.45
Training network. lr: 0.000242. clip: 0.096931
Iteration 1021: Policy loss: 0.135895. Value loss: 0.086223. Entropy: 0.320001.
Iteration 1022: Policy loss: 0.130853. Value loss: 0.039678. Entropy: 0.319431.
Iteration 1023: Policy loss: 0.129467. Value loss: 0.029529. Entropy: 0.319325.
episode: 422   score: 180.0  epsilon: 1.0    steps: 48  evaluation reward: 226.05
episode: 423   score: 285.0  epsilon: 1.0    steps: 824  evaluation reward: 224.8
Training network. lr: 0.000242. clip: 0.096931
Iteration 1024: Policy loss: -0.164338. Value loss: 0.341830. Entropy: 0.317998.
Iteration 1025: Policy loss: -0.175828. Value loss: 0.216803. Entropy: 0.318819.
Iteration 1026: Policy loss: -0.164410. Value loss: 0.140316. Entropy: 0.316659.
episode: 424   s

Iteration 1083: Policy loss: -0.086207. Value loss: 0.014305. Entropy: 0.311300.
episode: 446   score: 155.0  epsilon: 1.0    steps: 480  evaluation reward: 235.05
episode: 447   score: 270.0  epsilon: 1.0    steps: 528  evaluation reward: 235.65
Training network. lr: 0.000242. clip: 0.096784
Iteration 1084: Policy loss: 0.030910. Value loss: 0.037592. Entropy: 0.312698.
Iteration 1085: Policy loss: 0.035983. Value loss: 0.022686. Entropy: 0.313099.
Iteration 1086: Policy loss: 0.028398. Value loss: 0.021619. Entropy: 0.313733.
episode: 448   score: 180.0  epsilon: 1.0    steps: 400  evaluation reward: 235.55
Training network. lr: 0.000242. clip: 0.096784
Iteration 1087: Policy loss: -0.006672. Value loss: 0.033265. Entropy: 0.320068.
Iteration 1088: Policy loss: -0.011792. Value loss: 0.024389. Entropy: 0.319143.
Iteration 1089: Policy loss: -0.011044. Value loss: 0.022047. Entropy: 0.319373.
Training network. lr: 0.000242. clip: 0.096784
Iteration 1090: Policy loss: -0.363881. Value 

Training network. lr: 0.000242. clip: 0.096627
Iteration 1147: Policy loss: -0.202875. Value loss: 0.289967. Entropy: 0.305634.
Iteration 1148: Policy loss: -0.203725. Value loss: 0.207373. Entropy: 0.304608.
Iteration 1149: Policy loss: -0.232034. Value loss: 0.159725. Entropy: 0.306849.
episode: 471   score: 210.0  epsilon: 1.0    steps: 224  evaluation reward: 241.85
episode: 472   score: 180.0  epsilon: 1.0    steps: 984  evaluation reward: 241.55
Training network. lr: 0.000242. clip: 0.096627
Iteration 1150: Policy loss: -0.215885. Value loss: 0.199545. Entropy: 0.306260.
Iteration 1151: Policy loss: -0.222716. Value loss: 0.123808. Entropy: 0.308236.
Iteration 1152: Policy loss: -0.233832. Value loss: 0.092535. Entropy: 0.308163.
episode: 473   score: 180.0  epsilon: 1.0    steps: 8  evaluation reward: 241.55
Training network. lr: 0.000241. clip: 0.096470
Iteration 1153: Policy loss: 0.020605. Value loss: 0.069664. Entropy: 0.306789.
Iteration 1154: Policy loss: 0.014537. Value l

Iteration 1209: Policy loss: -0.178575. Value loss: 0.063094. Entropy: 0.312586.
Training network. lr: 0.000241. clip: 0.096323
Iteration 1210: Policy loss: 0.445260. Value loss: 0.227201. Entropy: 0.314056.
Iteration 1211: Policy loss: 0.417243. Value loss: 0.077166. Entropy: 0.312360.
Iteration 1212: Policy loss: 0.413206. Value loss: 0.057094. Entropy: 0.310822.
episode: 498   score: 410.0  epsilon: 1.0    steps: 352  evaluation reward: 244.75
episode: 499   score: 110.0  epsilon: 1.0    steps: 456  evaluation reward: 243.25
episode: 500   score: 210.0  epsilon: 1.0    steps: 960  evaluation reward: 243.25
Training network. lr: 0.000241. clip: 0.096323
Iteration 1213: Policy loss: -0.004709. Value loss: 0.066209. Entropy: 0.318927.
Iteration 1214: Policy loss: 0.000368. Value loss: 0.031836. Entropy: 0.316756.
Iteration 1215: Policy loss: -0.003027. Value loss: 0.029191. Entropy: 0.318033.
now time :  2019-09-05 15:30:44.973579
episode: 501   score: 180.0  epsilon: 1.0    steps: 24 

episode: 523   score: 135.0  epsilon: 1.0    steps: 576  evaluation reward: 232.0
Training network. lr: 0.000240. clip: 0.096166
Iteration 1273: Policy loss: 0.142035. Value loss: 0.102947. Entropy: 0.314545.
Iteration 1274: Policy loss: 0.140948. Value loss: 0.053422. Entropy: 0.314247.
Iteration 1275: Policy loss: 0.139409. Value loss: 0.038223. Entropy: 0.314830.
episode: 524   score: 230.0  epsilon: 1.0    steps: 672  evaluation reward: 232.2
episode: 525   score: 185.0  epsilon: 1.0    steps: 808  evaluation reward: 231.95
Training network. lr: 0.000240. clip: 0.096166
Iteration 1276: Policy loss: 0.189719. Value loss: 0.039574. Entropy: 0.308117.
Iteration 1277: Policy loss: 0.178981. Value loss: 0.017683. Entropy: 0.307149.
Iteration 1278: Policy loss: 0.180516. Value loss: 0.014216. Entropy: 0.307280.
episode: 526   score: 180.0  epsilon: 1.0    steps: 784  evaluation reward: 228.1
Training network. lr: 0.000240. clip: 0.096166
Iteration 1279: Policy loss: 0.017583. Value loss:

Iteration 1336: Policy loss: -0.188311. Value loss: 0.099911. Entropy: 0.312454.
Iteration 1337: Policy loss: -0.195644. Value loss: 0.047685. Entropy: 0.313423.
Iteration 1338: Policy loss: -0.197213. Value loss: 0.036661. Entropy: 0.313257.
episode: 549   score: 105.0  epsilon: 1.0    steps: 432  evaluation reward: 227.15
Training network. lr: 0.000240. clip: 0.096009
Iteration 1339: Policy loss: -0.018762. Value loss: 0.051266. Entropy: 0.311965.
Iteration 1340: Policy loss: -0.021670. Value loss: 0.024547. Entropy: 0.311910.
Iteration 1341: Policy loss: -0.022929. Value loss: 0.018020. Entropy: 0.312344.
Training network. lr: 0.000240. clip: 0.096009
Iteration 1342: Policy loss: -0.040171. Value loss: 0.050105. Entropy: 0.313103.
Iteration 1343: Policy loss: -0.041769. Value loss: 0.026972. Entropy: 0.312572.
Iteration 1344: Policy loss: -0.040616. Value loss: 0.022075. Entropy: 0.313074.
episode: 550   score: 180.0  epsilon: 1.0    steps: 352  evaluation reward: 227.15
Training ne

Iteration 1400: Policy loss: 0.168765. Value loss: 0.029598. Entropy: 0.310243.
Iteration 1401: Policy loss: 0.171736. Value loss: 0.021091. Entropy: 0.309909.
episode: 574   score: 180.0  epsilon: 1.0    steps: 728  evaluation reward: 216.75
Training network. lr: 0.000239. clip: 0.095705
Iteration 1402: Policy loss: 0.198986. Value loss: 0.061248. Entropy: 0.307980.
Iteration 1403: Policy loss: 0.193473. Value loss: 0.035644. Entropy: 0.307474.
Iteration 1404: Policy loss: 0.196900. Value loss: 0.031844. Entropy: 0.307023.
episode: 575   score: 135.0  epsilon: 1.0    steps: 928  evaluation reward: 216.3
Training network. lr: 0.000239. clip: 0.095705
Iteration 1405: Policy loss: 0.121817. Value loss: 0.041688. Entropy: 0.306735.
Iteration 1406: Policy loss: 0.122748. Value loss: 0.024122. Entropy: 0.304812.
Iteration 1407: Policy loss: 0.121967. Value loss: 0.018623. Entropy: 0.304565.
episode: 576   score: 210.0  epsilon: 1.0    steps: 440  evaluation reward: 213.6
episode: 577   scor

episode: 599   score: 180.0  epsilon: 1.0    steps: 832  evaluation reward: 213.5
Training network. lr: 0.000239. clip: 0.095549
Iteration 1465: Policy loss: 0.154402. Value loss: 0.120820. Entropy: 0.307644.
Iteration 1466: Policy loss: 0.145311. Value loss: 0.051990. Entropy: 0.307487.
Iteration 1467: Policy loss: 0.149044. Value loss: 0.036868. Entropy: 0.307712.
episode: 600   score: 180.0  epsilon: 1.0    steps: 224  evaluation reward: 213.2
now time :  2019-09-05 15:46:27.088156
episode: 601   score: 210.0  epsilon: 1.0    steps: 1016  evaluation reward: 213.5
Training network. lr: 0.000239. clip: 0.095549
Iteration 1468: Policy loss: -0.013089. Value loss: 0.063904. Entropy: 0.309140.
Iteration 1469: Policy loss: -0.018970. Value loss: 0.030006. Entropy: 0.308689.
Iteration 1470: Policy loss: -0.016967. Value loss: 0.025862. Entropy: 0.308779.
episode: 602   score: 180.0  epsilon: 1.0    steps: 240  evaluation reward: 213.2
Training network. lr: 0.000239. clip: 0.095549
Iteratio

Iteration 1528: Policy loss: 0.048697. Value loss: 0.146396. Entropy: 0.311537.
Iteration 1529: Policy loss: 0.041499. Value loss: 0.117827. Entropy: 0.311345.
Iteration 1530: Policy loss: 0.053499. Value loss: 0.095325. Entropy: 0.311309.
episode: 624   score: 210.0  epsilon: 1.0    steps: 304  evaluation reward: 216.05
Training network. lr: 0.000239. clip: 0.095401
Iteration 1531: Policy loss: -0.033787. Value loss: 0.077636. Entropy: 0.312716.
Iteration 1532: Policy loss: -0.034780. Value loss: 0.037772. Entropy: 0.312031.
Iteration 1533: Policy loss: -0.031790. Value loss: 0.029440. Entropy: 0.312737.
episode: 625   score: 890.0  epsilon: 1.0    steps: 160  evaluation reward: 223.1
Training network. lr: 0.000239. clip: 0.095401
Iteration 1534: Policy loss: 0.071247. Value loss: 0.055659. Entropy: 0.309230.
Iteration 1535: Policy loss: 0.068623. Value loss: 0.031527. Entropy: 0.308693.
Iteration 1536: Policy loss: 0.068956. Value loss: 0.025179. Entropy: 0.309060.
Training network. 

episode: 648   score: 180.0  epsilon: 1.0    steps: 696  evaluation reward: 222.15
Training network. lr: 0.000238. clip: 0.095245
Iteration 1594: Policy loss: 0.075551. Value loss: 0.112224. Entropy: 0.311709.
Iteration 1595: Policy loss: 0.066678. Value loss: 0.089131. Entropy: 0.309093.
Iteration 1596: Policy loss: 0.062639. Value loss: 0.081407. Entropy: 0.310534.
episode: 649   score: 105.0  epsilon: 1.0    steps: 656  evaluation reward: 222.15
Training network. lr: 0.000238. clip: 0.095245
Iteration 1597: Policy loss: -0.086287. Value loss: 0.052516. Entropy: 0.310729.
Iteration 1598: Policy loss: -0.089890. Value loss: 0.028958. Entropy: 0.310565.
Iteration 1599: Policy loss: -0.093391. Value loss: 0.020048. Entropy: 0.310236.
Training network. lr: 0.000238. clip: 0.095245
Iteration 1600: Policy loss: -0.166700. Value loss: 0.066960. Entropy: 0.308742.
Iteration 1601: Policy loss: -0.167217. Value loss: 0.039448. Entropy: 0.310391.
Iteration 1602: Policy loss: -0.162142. Value lo

Iteration 1657: Policy loss: 0.258656. Value loss: 0.142841. Entropy: 0.305459.
Iteration 1658: Policy loss: 0.256712. Value loss: 0.070056. Entropy: 0.306692.
Iteration 1659: Policy loss: 0.247652. Value loss: 0.046741. Entropy: 0.305143.
episode: 673   score: 355.0  epsilon: 1.0    steps: 184  evaluation reward: 229.75
episode: 674   score: 410.0  epsilon: 1.0    steps: 280  evaluation reward: 232.05
Training network. lr: 0.000237. clip: 0.094940
Iteration 1660: Policy loss: 0.337248. Value loss: 0.079010. Entropy: 0.300479.
Iteration 1661: Policy loss: 0.337340. Value loss: 0.037760. Entropy: 0.299633.
Iteration 1662: Policy loss: 0.322666. Value loss: 0.029281. Entropy: 0.299157.
episode: 675   score: 440.0  epsilon: 1.0    steps: 504  evaluation reward: 235.1
Training network. lr: 0.000237. clip: 0.094940
Iteration 1663: Policy loss: -0.040488. Value loss: 0.143035. Entropy: 0.300727.
Iteration 1664: Policy loss: -0.050415. Value loss: 0.070190. Entropy: 0.302345.
Iteration 1665: 

Iteration 1721: Policy loss: 0.027552. Value loss: 0.055792. Entropy: 0.302339.
Iteration 1722: Policy loss: 0.025511. Value loss: 0.034373. Entropy: 0.303455.
episode: 699   score: 95.0  epsilon: 1.0    steps: 840  evaluation reward: 234.55
Training network. lr: 0.000237. clip: 0.094784
Iteration 1723: Policy loss: -0.299452. Value loss: 0.150055. Entropy: 0.297371.
Iteration 1724: Policy loss: -0.300907. Value loss: 0.082464. Entropy: 0.298506.
Iteration 1725: Policy loss: -0.307422. Value loss: 0.065561. Entropy: 0.298976.
episode: 700   score: 180.0  epsilon: 1.0    steps: 32  evaluation reward: 234.55
now time :  2019-09-05 16:02:24.833556
episode: 701   score: 215.0  epsilon: 1.0    steps: 168  evaluation reward: 234.6
Training network. lr: 0.000237. clip: 0.094784
Iteration 1726: Policy loss: -0.116625. Value loss: 0.088813. Entropy: 0.306035.
Iteration 1727: Policy loss: -0.115225. Value loss: 0.045088. Entropy: 0.305847.
Iteration 1728: Policy loss: -0.114435. Value loss: 0.03

episode: 727   score: 345.0  epsilon: 1.0    steps: 304  evaluation reward: 219.65
episode: 728   score: 50.0  epsilon: 1.0    steps: 416  evaluation reward: 216.7
Training network. lr: 0.000237. clip: 0.094627
Iteration 1783: Policy loss: -0.158962. Value loss: 0.110381. Entropy: 0.300410.
Iteration 1784: Policy loss: -0.153231. Value loss: 0.056759. Entropy: 0.299886.
Iteration 1785: Policy loss: -0.163111. Value loss: 0.043410. Entropy: 0.300051.
episode: 729   score: 180.0  epsilon: 1.0    steps: 48  evaluation reward: 215.7
episode: 730   score: 320.0  epsilon: 1.0    steps: 256  evaluation reward: 216.8
Training network. lr: 0.000237. clip: 0.094627
Iteration 1786: Policy loss: 0.033022. Value loss: 0.134080. Entropy: 0.306399.
Iteration 1787: Policy loss: 0.028131. Value loss: 0.067449. Entropy: 0.305557.
Iteration 1788: Policy loss: 0.027726. Value loss: 0.043136. Entropy: 0.306091.
Training network. lr: 0.000237. clip: 0.094627
Iteration 1789: Policy loss: 0.116047. Value loss

Iteration 1844: Policy loss: 0.206900. Value loss: 0.033552. Entropy: 0.303949.
Iteration 1845: Policy loss: 0.204404. Value loss: 0.025224. Entropy: 0.304616.
episode: 755   score: 55.0  epsilon: 1.0    steps: 24  evaluation reward: 205.0
episode: 756   score: 180.0  epsilon: 1.0    steps: 512  evaluation reward: 202.7
Training network. lr: 0.000236. clip: 0.094480
Iteration 1846: Policy loss: 0.012469. Value loss: 0.080442. Entropy: 0.303073.
Iteration 1847: Policy loss: 0.009887. Value loss: 0.050038. Entropy: 0.303198.
Iteration 1848: Policy loss: 0.003613. Value loss: 0.035607. Entropy: 0.302713.
episode: 757   score: 105.0  epsilon: 1.0    steps: 336  evaluation reward: 201.45
episode: 758   score: 180.0  epsilon: 1.0    steps: 576  evaluation reward: 202.0
Training network. lr: 0.000236. clip: 0.094480
Iteration 1849: Policy loss: -0.004164. Value loss: 0.030583. Entropy: 0.305554.
Iteration 1850: Policy loss: -0.004695. Value loss: 0.018973. Entropy: 0.306029.
Iteration 1851: P

Iteration 1906: Policy loss: 0.114060. Value loss: 0.076014. Entropy: 0.309114.
Iteration 1907: Policy loss: 0.105534. Value loss: 0.035580. Entropy: 0.309122.
Iteration 1908: Policy loss: 0.100910. Value loss: 0.027011. Entropy: 0.307997.
episode: 782   score: 180.0  epsilon: 1.0    steps: 648  evaluation reward: 195.0
Training network. lr: 0.000235. clip: 0.094166
Iteration 1909: Policy loss: -0.065965. Value loss: 0.108078. Entropy: 0.309508.
Iteration 1910: Policy loss: -0.078105. Value loss: 0.045633. Entropy: 0.309063.
Iteration 1911: Policy loss: -0.075902. Value loss: 0.033280. Entropy: 0.309396.
episode: 783   score: 80.0  epsilon: 1.0    steps: 304  evaluation reward: 194.25
Training network. lr: 0.000235. clip: 0.094166
Iteration 1912: Policy loss: -0.560201. Value loss: 0.364594. Entropy: 0.306260.
Iteration 1913: Policy loss: -0.571373. Value loss: 0.187950. Entropy: 0.308046.
Iteration 1914: Policy loss: -0.573579. Value loss: 0.116739. Entropy: 0.308938.
episode: 784   s

Iteration 1971: Policy loss: -0.104906. Value loss: 0.023856. Entropy: 0.310852.
episode: 806   score: 670.0  epsilon: 1.0    steps: 104  evaluation reward: 206.9
episode: 807   score: 135.0  epsilon: 1.0    steps: 584  evaluation reward: 203.4
Training network. lr: 0.000235. clip: 0.094019
Iteration 1972: Policy loss: 0.103580. Value loss: 0.088231. Entropy: 0.314413.
Iteration 1973: Policy loss: 0.091848. Value loss: 0.040847. Entropy: 0.315570.
Iteration 1974: Policy loss: 0.088372. Value loss: 0.026012. Entropy: 0.314710.
episode: 808   score: 305.0  epsilon: 1.0    steps: 776  evaluation reward: 205.9
Training network. lr: 0.000235. clip: 0.094019
Iteration 1975: Policy loss: 0.044838. Value loss: 0.041550. Entropy: 0.317571.
Iteration 1976: Policy loss: 0.044132. Value loss: 0.021350. Entropy: 0.317610.
Iteration 1977: Policy loss: 0.044928. Value loss: 0.015028. Entropy: 0.317575.
episode: 809   score: 290.0  epsilon: 1.0    steps: 496  evaluation reward: 204.8
Training network.

Training network. lr: 0.000235. clip: 0.093862
Iteration 2035: Policy loss: -0.025122. Value loss: 0.037172. Entropy: 0.314454.
Iteration 2036: Policy loss: -0.026624. Value loss: 0.021219. Entropy: 0.313686.
Iteration 2037: Policy loss: -0.022871. Value loss: 0.015988. Entropy: 0.313149.
episode: 832   score: 105.0  epsilon: 1.0    steps: 272  evaluation reward: 222.35
Training network. lr: 0.000235. clip: 0.093862
Iteration 2038: Policy loss: 0.039074. Value loss: 0.022901. Entropy: 0.312180.
Iteration 2039: Policy loss: 0.036625. Value loss: 0.013035. Entropy: 0.311907.
Iteration 2040: Policy loss: 0.039957. Value loss: 0.009726. Entropy: 0.311517.
episode: 833   score: 165.0  epsilon: 1.0    steps: 704  evaluation reward: 222.75
Training network. lr: 0.000235. clip: 0.093862
Iteration 2041: Policy loss: -0.562535. Value loss: 0.396664. Entropy: 0.315814.
Iteration 2042: Policy loss: -0.612317. Value loss: 0.137991. Entropy: 0.316569.
Iteration 2043: Policy loss: -0.659417. Value lo

Iteration 2099: Policy loss: 0.187381. Value loss: 0.051266. Entropy: 0.313835.
Iteration 2100: Policy loss: 0.185924. Value loss: 0.037994. Entropy: 0.314115.
episode: 856   score: 260.0  epsilon: 1.0    steps: 592  evaluation reward: 232.9
Training network. lr: 0.000234. clip: 0.093558
Iteration 2101: Policy loss: 0.115766. Value loss: 0.068279. Entropy: 0.314547.
Iteration 2102: Policy loss: 0.111018. Value loss: 0.034848. Entropy: 0.313725.
Iteration 2103: Policy loss: 0.111075. Value loss: 0.023849. Entropy: 0.314195.
episode: 857   score: 260.0  epsilon: 1.0    steps: 208  evaluation reward: 234.45
episode: 858   score: 260.0  epsilon: 1.0    steps: 384  evaluation reward: 235.25
Training network. lr: 0.000234. clip: 0.093558
Iteration 2104: Policy loss: 0.266719. Value loss: 0.093514. Entropy: 0.311927.
Iteration 2105: Policy loss: 0.257548. Value loss: 0.038369. Entropy: 0.311845.
Iteration 2106: Policy loss: 0.255513. Value loss: 0.028374. Entropy: 0.311472.
episode: 859   sco

episode: 881   score: 410.0  epsilon: 1.0    steps: 1016  evaluation reward: 237.65
Training network. lr: 0.000234. clip: 0.093401
Iteration 2164: Policy loss: -0.081376. Value loss: 0.068085. Entropy: 0.317643.
Iteration 2165: Policy loss: -0.084689. Value loss: 0.046011. Entropy: 0.317793.
Iteration 2166: Policy loss: -0.089889. Value loss: 0.039383. Entropy: 0.318391.
episode: 882   score: 300.0  epsilon: 1.0    steps: 72  evaluation reward: 238.85
episode: 883   score: 285.0  epsilon: 1.0    steps: 112  evaluation reward: 240.9
episode: 884   score: 210.0  epsilon: 1.0    steps: 856  evaluation reward: 240.4
Training network. lr: 0.000234. clip: 0.093401
Iteration 2167: Policy loss: 0.040483. Value loss: 0.055373. Entropy: 0.312864.
Iteration 2168: Policy loss: 0.038685. Value loss: 0.028122. Entropy: 0.312237.
Iteration 2169: Policy loss: 0.035219. Value loss: 0.020851. Entropy: 0.313389.
episode: 885   score: 210.0  epsilon: 1.0    steps: 896  evaluation reward: 238.7
Training ne

episode: 907   score: 210.0  epsilon: 1.0    steps: 856  evaluation reward: 240.1
Training network. lr: 0.000233. clip: 0.093245
Iteration 2227: Policy loss: -0.055754. Value loss: 0.064109. Entropy: 0.309835.
Iteration 2228: Policy loss: -0.057385. Value loss: 0.027767. Entropy: 0.308416.
Iteration 2229: Policy loss: -0.058272. Value loss: 0.019794. Entropy: 0.307930.
episode: 908   score: 110.0  epsilon: 1.0    steps: 1016  evaluation reward: 238.15
Training network. lr: 0.000233. clip: 0.093245
Iteration 2230: Policy loss: 0.082338. Value loss: 0.059480. Entropy: 0.311577.
Iteration 2231: Policy loss: 0.072174. Value loss: 0.028261. Entropy: 0.311272.
Iteration 2232: Policy loss: 0.077377. Value loss: 0.021993. Entropy: 0.313319.
episode: 909   score: 110.0  epsilon: 1.0    steps: 464  evaluation reward: 236.35
Training network. lr: 0.000233. clip: 0.093245
Iteration 2233: Policy loss: 0.090755. Value loss: 0.090228. Entropy: 0.310182.
Iteration 2234: Policy loss: 0.086314. Value lo

Iteration 2292: Policy loss: -0.063155. Value loss: 0.025565. Entropy: 0.303731.
Training network. lr: 0.000233. clip: 0.093097
Iteration 2293: Policy loss: -0.006727. Value loss: 0.083363. Entropy: 0.302939.
Iteration 2294: Policy loss: -0.006923. Value loss: 0.035899. Entropy: 0.304224.
Iteration 2295: Policy loss: -0.009924. Value loss: 0.026893. Entropy: 0.303997.
episode: 931   score: 180.0  epsilon: 1.0    steps: 720  evaluation reward: 236.35
Training network. lr: 0.000233. clip: 0.093097
Iteration 2296: Policy loss: 0.102585. Value loss: 0.035325. Entropy: 0.312669.
Iteration 2297: Policy loss: 0.100874. Value loss: 0.014777. Entropy: 0.312647.
Iteration 2298: Policy loss: 0.098819. Value loss: 0.012489. Entropy: 0.312109.
episode: 932   score: 240.0  epsilon: 1.0    steps: 624  evaluation reward: 237.7
episode: 933   score: 135.0  epsilon: 1.0    steps: 904  evaluation reward: 237.4
Training network. lr: 0.000233. clip: 0.093097
Iteration 2299: Policy loss: 0.168693. Value los

episode: 957   score: 210.0  epsilon: 1.0    steps: 368  evaluation reward: 225.15
Training network. lr: 0.000232. clip: 0.092784
Iteration 2356: Policy loss: 0.035675. Value loss: 0.106984. Entropy: 0.309629.
Iteration 2357: Policy loss: 0.032156. Value loss: 0.036718. Entropy: 0.307967.
Iteration 2358: Policy loss: 0.025862. Value loss: 0.021291. Entropy: 0.307593.
episode: 958   score: 285.0  epsilon: 1.0    steps: 136  evaluation reward: 225.4
episode: 959   score: 210.0  epsilon: 1.0    steps: 944  evaluation reward: 225.7
Training network. lr: 0.000232. clip: 0.092784
Iteration 2359: Policy loss: 0.173690. Value loss: 0.101289. Entropy: 0.307883.
Iteration 2360: Policy loss: 0.171572. Value loss: 0.051215. Entropy: 0.308737.
Iteration 2361: Policy loss: 0.172703. Value loss: 0.037749. Entropy: 0.308606.
Training network. lr: 0.000232. clip: 0.092784
Iteration 2362: Policy loss: 0.128250. Value loss: 0.076492. Entropy: 0.306721.
Iteration 2363: Policy loss: 0.122346. Value loss: 0

Iteration 2418: Policy loss: 0.046958. Value loss: 0.026255. Entropy: 0.306460.
Training network. lr: 0.000232. clip: 0.092636
Iteration 2419: Policy loss: 0.067862. Value loss: 0.035420. Entropy: 0.308262.
Iteration 2420: Policy loss: 0.064136. Value loss: 0.016129. Entropy: 0.308356.
Iteration 2421: Policy loss: 0.065233. Value loss: 0.012950. Entropy: 0.307238.
episode: 985   score: 105.0  epsilon: 1.0    steps: 184  evaluation reward: 211.5
episode: 986   score: 85.0  epsilon: 1.0    steps: 200  evaluation reward: 209.3
episode: 987   score: 225.0  epsilon: 1.0    steps: 456  evaluation reward: 207.45
Training network. lr: 0.000232. clip: 0.092636
Iteration 2422: Policy loss: 0.083568. Value loss: 0.080277. Entropy: 0.308596.
Iteration 2423: Policy loss: 0.078489. Value loss: 0.039965. Entropy: 0.307918.
Iteration 2424: Policy loss: 0.078573. Value loss: 0.031324. Entropy: 0.308338.
episode: 988   score: 145.0  epsilon: 1.0    steps: 128  evaluation reward: 206.2
episode: 989   sco

Iteration 2477: Policy loss: 0.205135. Value loss: 0.021132. Entropy: 0.305368.
Iteration 2478: Policy loss: 0.207363. Value loss: 0.015880. Entropy: 0.304925.
episode: 1015   score: 215.0  epsilon: 1.0    steps: 552  evaluation reward: 190.05
episode: 1016   score: 35.0  epsilon: 1.0    steps: 904  evaluation reward: 188.15
Training network. lr: 0.000231. clip: 0.092480
Iteration 2479: Policy loss: 0.154079. Value loss: 0.071900. Entropy: 0.303259.
Iteration 2480: Policy loss: 0.151619. Value loss: 0.034289. Entropy: 0.303418.
Iteration 2481: Policy loss: 0.141205. Value loss: 0.029060. Entropy: 0.302845.
episode: 1017   score: 80.0  epsilon: 1.0    steps: 224  evaluation reward: 187.9
episode: 1018   score: 50.0  epsilon: 1.0    steps: 248  evaluation reward: 186.6
episode: 1019   score: 80.0  epsilon: 1.0    steps: 592  evaluation reward: 185.0
Training network. lr: 0.000231. clip: 0.092480
Iteration 2482: Policy loss: -0.155891. Value loss: 0.124803. Entropy: 0.306068.
Iteration 24

Iteration 2533: Policy loss: -0.131828. Value loss: 0.103553. Entropy: 0.306850.
Iteration 2534: Policy loss: -0.140427. Value loss: 0.051093. Entropy: 0.308858.
Iteration 2535: Policy loss: -0.146217. Value loss: 0.038257. Entropy: 0.309375.
Training network. lr: 0.000231. clip: 0.092323
Iteration 2536: Policy loss: -0.148410. Value loss: 0.083132. Entropy: 0.314422.
Iteration 2537: Policy loss: -0.151852. Value loss: 0.036000. Entropy: 0.314413.
Iteration 2538: Policy loss: -0.158816. Value loss: 0.030305. Entropy: 0.314861.
episode: 1049   score: 105.0  epsilon: 1.0    steps: 24  evaluation reward: 171.45
Training network. lr: 0.000231. clip: 0.092323
Iteration 2539: Policy loss: 0.036329. Value loss: 0.073877. Entropy: 0.312102.
Iteration 2540: Policy loss: 0.026498. Value loss: 0.036181. Entropy: 0.312134.
Iteration 2541: Policy loss: 0.022827. Value loss: 0.027163. Entropy: 0.312477.
episode: 1050   score: 120.0  epsilon: 1.0    steps: 344  evaluation reward: 170.55
now time :  2

Iteration 2594: Policy loss: 0.088154. Value loss: 0.030526. Entropy: 0.306330.
Iteration 2595: Policy loss: 0.078389. Value loss: 0.024217. Entropy: 0.306780.
episode: 1077   score: 155.0  epsilon: 1.0    steps: 480  evaluation reward: 166.3
Training network. lr: 0.000230. clip: 0.092176
Iteration 2596: Policy loss: -0.133946. Value loss: 0.059201. Entropy: 0.303826.
Iteration 2597: Policy loss: -0.137811. Value loss: 0.032445. Entropy: 0.302666.
Iteration 2598: Policy loss: -0.138765. Value loss: 0.026333. Entropy: 0.303166.
episode: 1078   score: 135.0  epsilon: 1.0    steps: 848  evaluation reward: 166.45
Training network. lr: 0.000230. clip: 0.092176
Iteration 2599: Policy loss: 0.209522. Value loss: 0.057969. Entropy: 0.311442.
Iteration 2600: Policy loss: 0.203806. Value loss: 0.020787. Entropy: 0.311484.
Iteration 2601: Policy loss: 0.200156. Value loss: 0.012073. Entropy: 0.310445.
episode: 1079   score: 265.0  epsilon: 1.0    steps: 56  evaluation reward: 167.75
Training netw

Iteration 2655: Policy loss: -0.167466. Value loss: 0.088160. Entropy: 0.308398.
episode: 1105   score: 170.0  epsilon: 1.0    steps: 656  evaluation reward: 164.1
Training network. lr: 0.000230. clip: 0.091862
Iteration 2656: Policy loss: 0.131384. Value loss: 0.116947. Entropy: 0.309920.
Iteration 2657: Policy loss: 0.123270. Value loss: 0.046998. Entropy: 0.309435.
Iteration 2658: Policy loss: 0.124173. Value loss: 0.036353. Entropy: 0.309580.
episode: 1106   score: 280.0  epsilon: 1.0    steps: 136  evaluation reward: 165.55
episode: 1107   score: 155.0  epsilon: 1.0    steps: 392  evaluation reward: 165.9
Training network. lr: 0.000230. clip: 0.091862
Iteration 2659: Policy loss: -0.068289. Value loss: 0.070705. Entropy: 0.308897.
Iteration 2660: Policy loss: -0.070755. Value loss: 0.032301. Entropy: 0.308335.
Iteration 2661: Policy loss: -0.072582. Value loss: 0.022950. Entropy: 0.309506.
episode: 1108   score: 210.0  epsilon: 1.0    steps: 40  evaluation reward: 166.6
episode: 1

episode: 1137   score: 180.0  epsilon: 1.0    steps: 584  evaluation reward: 175.55
Training network. lr: 0.000229. clip: 0.091715
Iteration 2713: Policy loss: 0.150807. Value loss: 0.092564. Entropy: 0.304272.
Iteration 2714: Policy loss: 0.144419. Value loss: 0.040186. Entropy: 0.303797.
Iteration 2715: Policy loss: 0.137442. Value loss: 0.026629. Entropy: 0.303686.
episode: 1138   score: 215.0  epsilon: 1.0    steps: 408  evaluation reward: 175.6
Training network. lr: 0.000229. clip: 0.091715
Iteration 2716: Policy loss: -0.087580. Value loss: 0.125708. Entropy: 0.306560.
Iteration 2717: Policy loss: -0.095231. Value loss: 0.063423. Entropy: 0.307571.
Iteration 2718: Policy loss: -0.094035. Value loss: 0.048372. Entropy: 0.307309.
episode: 1139   score: 180.0  epsilon: 1.0    steps: 584  evaluation reward: 175.8
Training network. lr: 0.000229. clip: 0.091715
Iteration 2719: Policy loss: -0.079490. Value loss: 0.116821. Entropy: 0.307413.
Iteration 2720: Policy loss: -0.081809. Value

Iteration 2775: Policy loss: -0.082765. Value loss: 0.043096. Entropy: 0.309514.
episode: 1164   score: 440.0  epsilon: 1.0    steps: 320  evaluation reward: 177.75
episode: 1165   score: 335.0  epsilon: 1.0    steps: 968  evaluation reward: 177.6
Training network. lr: 0.000229. clip: 0.091558
Iteration 2776: Policy loss: -0.306636. Value loss: 0.405849. Entropy: 0.304897.
Iteration 2777: Policy loss: -0.313148. Value loss: 0.216375. Entropy: 0.306354.
Iteration 2778: Policy loss: -0.325853. Value loss: 0.117472. Entropy: 0.305703.
Training network. lr: 0.000229. clip: 0.091558
Iteration 2779: Policy loss: 0.077285. Value loss: 0.100590. Entropy: 0.309373.
Iteration 2780: Policy loss: 0.060093. Value loss: 0.048435. Entropy: 0.307723.
Iteration 2781: Policy loss: 0.064781. Value loss: 0.033398. Entropy: 0.307334.
Training network. lr: 0.000229. clip: 0.091558
Iteration 2782: Policy loss: 0.090910. Value loss: 0.142448. Entropy: 0.308551.
Iteration 2783: Policy loss: 0.087172. Value los

Iteration 2838: Policy loss: -0.067564. Value loss: 0.051015. Entropy: 0.300259.
Training network. lr: 0.000229. clip: 0.091401
Iteration 2839: Policy loss: 0.116314. Value loss: 0.076566. Entropy: 0.306290.
Iteration 2840: Policy loss: 0.118478. Value loss: 0.027060. Entropy: 0.307295.
Iteration 2841: Policy loss: 0.117059. Value loss: 0.020523. Entropy: 0.307351.
episode: 1190   score: 115.0  epsilon: 1.0    steps: 184  evaluation reward: 200.45
episode: 1191   score: 180.0  epsilon: 1.0    steps: 272  evaluation reward: 200.9
episode: 1192   score: 180.0  epsilon: 1.0    steps: 272  evaluation reward: 201.3
episode: 1193   score: 125.0  epsilon: 1.0    steps: 328  evaluation reward: 199.95
Training network. lr: 0.000229. clip: 0.091401
Iteration 2842: Policy loss: 0.071886. Value loss: 0.103267. Entropy: 0.301996.
Iteration 2843: Policy loss: 0.066992. Value loss: 0.047478. Entropy: 0.300885.
Iteration 2844: Policy loss: 0.071509. Value loss: 0.033289. Entropy: 0.301769.
episode: 11

episode: 1218   score: 240.0  epsilon: 1.0    steps: 1008  evaluation reward: 206.7
Training network. lr: 0.000228. clip: 0.091254
Iteration 2899: Policy loss: -0.084231. Value loss: 0.101351. Entropy: 0.287313.
Iteration 2900: Policy loss: -0.082507. Value loss: 0.051323. Entropy: 0.285151.
Iteration 2901: Policy loss: -0.081416. Value loss: 0.037666. Entropy: 0.286726.
episode: 1219   score: 545.0  epsilon: 1.0    steps: 728  evaluation reward: 211.15
Training network. lr: 0.000228. clip: 0.091097
Iteration 2902: Policy loss: 0.060892. Value loss: 0.116322. Entropy: 0.307733.
Iteration 2903: Policy loss: 0.053944. Value loss: 0.052955. Entropy: 0.307649.
Iteration 2904: Policy loss: 0.046909. Value loss: 0.037261. Entropy: 0.307409.
episode: 1220   score: 210.0  epsilon: 1.0    steps: 80  evaluation reward: 211.45
Training network. lr: 0.000228. clip: 0.091097
Iteration 2905: Policy loss: 0.159037. Value loss: 0.100332. Entropy: 0.303588.
Iteration 2906: Policy loss: 0.157230. Value 

Iteration 2963: Policy loss: 0.131641. Value loss: 0.030800. Entropy: 0.305363.
Iteration 2964: Policy loss: 0.129936. Value loss: 0.021786. Entropy: 0.304983.
Training network. lr: 0.000227. clip: 0.090941
Iteration 2965: Policy loss: -0.021960. Value loss: 0.113944. Entropy: 0.306808.
Iteration 2966: Policy loss: -0.029928. Value loss: 0.058269. Entropy: 0.305670.
Iteration 2967: Policy loss: -0.032994. Value loss: 0.045611. Entropy: 0.304640.
episode: 1243   score: 210.0  epsilon: 1.0    steps: 576  evaluation reward: 226.7
episode: 1244   score: 550.0  epsilon: 1.0    steps: 1000  evaluation reward: 230.6
Training network. lr: 0.000227. clip: 0.090941
Iteration 2968: Policy loss: -0.364220. Value loss: 0.327139. Entropy: 0.308246.
Iteration 2969: Policy loss: -0.341147. Value loss: 0.234200. Entropy: 0.308481.
Iteration 2970: Policy loss: -0.376053. Value loss: 0.189966. Entropy: 0.308919.
episode: 1245   score: 285.0  epsilon: 1.0    steps: 896  evaluation reward: 230.05
Training 

Iteration 3027: Policy loss: 0.135243. Value loss: 0.022274. Entropy: 0.312505.
Training network. lr: 0.000227. clip: 0.090793
Iteration 3028: Policy loss: -0.020235. Value loss: 0.027603. Entropy: 0.312508.
Iteration 3029: Policy loss: -0.023239. Value loss: 0.011740. Entropy: 0.311300.
Iteration 3030: Policy loss: -0.023573. Value loss: 0.008882. Entropy: 0.312077.
episode: 1268   score: 270.0  epsilon: 1.0    steps: 680  evaluation reward: 239.75
Training network. lr: 0.000227. clip: 0.090793
Iteration 3031: Policy loss: -0.073749. Value loss: 0.056016. Entropy: 0.315688.
Iteration 3032: Policy loss: -0.081229. Value loss: 0.032357. Entropy: 0.315376.
Iteration 3033: Policy loss: -0.079559. Value loss: 0.024469. Entropy: 0.315669.
episode: 1269   score: 110.0  epsilon: 1.0    steps: 752  evaluation reward: 234.1
Training network. lr: 0.000227. clip: 0.090793
Iteration 3034: Policy loss: 0.111311. Value loss: 0.068697. Entropy: 0.314130.
Iteration 3035: Policy loss: 0.111206. Value l

Training network. lr: 0.000227. clip: 0.090637
Iteration 3091: Policy loss: -0.004307. Value loss: 0.029337. Entropy: 0.311855.
Iteration 3092: Policy loss: -0.007361. Value loss: 0.016709. Entropy: 0.310357.
Iteration 3093: Policy loss: -0.010134. Value loss: 0.010378. Entropy: 0.310828.
Training network. lr: 0.000227. clip: 0.090637
Iteration 3094: Policy loss: -0.001379. Value loss: 0.054047. Entropy: 0.314098.
Iteration 3095: Policy loss: -0.007095. Value loss: 0.022039. Entropy: 0.311336.
Iteration 3096: Policy loss: -0.009638. Value loss: 0.014217. Entropy: 0.312770.
episode: 1293   score: 265.0  epsilon: 1.0    steps: 184  evaluation reward: 238.85
episode: 1294   score: 110.0  epsilon: 1.0    steps: 352  evaluation reward: 236.9
episode: 1295   score: 180.0  epsilon: 1.0    steps: 360  evaluation reward: 236.6
Training network. lr: 0.000227. clip: 0.090637
Iteration 3097: Policy loss: -0.193539. Value loss: 0.163650. Entropy: 0.308770.
Iteration 3098: Policy loss: -0.226664. Va

episode: 1319   score: 180.0  epsilon: 1.0    steps: 264  evaluation reward: 238.0
Training network. lr: 0.000226. clip: 0.090332
Iteration 3154: Policy loss: 0.133795. Value loss: 0.046245. Entropy: 0.309546.
Iteration 3155: Policy loss: 0.126200. Value loss: 0.022030. Entropy: 0.308473.
Iteration 3156: Policy loss: 0.127240. Value loss: 0.018926. Entropy: 0.308964.
episode: 1320   score: 265.0  epsilon: 1.0    steps: 320  evaluation reward: 238.55
episode: 1321   score: 105.0  epsilon: 1.0    steps: 552  evaluation reward: 236.75
Training network. lr: 0.000226. clip: 0.090332
Iteration 3157: Policy loss: 0.111420. Value loss: 0.048483. Entropy: 0.312509.
Iteration 3158: Policy loss: 0.108330. Value loss: 0.022803. Entropy: 0.312198.
Iteration 3159: Policy loss: 0.101693. Value loss: 0.017869. Entropy: 0.312167.
episode: 1322   score: 120.0  epsilon: 1.0    steps: 176  evaluation reward: 236.15
episode: 1323   score: 155.0  epsilon: 1.0    steps: 688  evaluation reward: 236.95
Trainin

episode: 1349   score: 380.0  epsilon: 1.0    steps: 712  evaluation reward: 216.9
Training network. lr: 0.000225. clip: 0.090176
Iteration 3214: Policy loss: -0.099421. Value loss: 0.272553. Entropy: 0.309435.
Iteration 3215: Policy loss: -0.114367. Value loss: 0.210854. Entropy: 0.308441.
Iteration 3216: Policy loss: -0.101748. Value loss: 0.147264. Entropy: 0.309089.
episode: 1350   score: 110.0  epsilon: 1.0    steps: 184  evaluation reward: 213.3
Training network. lr: 0.000225. clip: 0.090176
Iteration 3217: Policy loss: -0.029500. Value loss: 0.061596. Entropy: 0.300521.
Iteration 3218: Policy loss: -0.034818. Value loss: 0.023173. Entropy: 0.303238.
Iteration 3219: Policy loss: -0.036024. Value loss: 0.017114. Entropy: 0.303364.
now time :  2019-09-05 17:35:09.979882
episode: 1351   score: 120.0  epsilon: 1.0    steps: 296  evaluation reward: 212.85
episode: 1352   score: 120.0  epsilon: 1.0    steps: 528  evaluation reward: 213.25
episode: 1353   score: 110.0  epsilon: 1.0    s

Training network. lr: 0.000225. clip: 0.090019
Iteration 3274: Policy loss: 0.300113. Value loss: 0.070862. Entropy: 0.304554.
Iteration 3275: Policy loss: 0.294186. Value loss: 0.026119. Entropy: 0.303051.
Iteration 3276: Policy loss: 0.289028. Value loss: 0.020910. Entropy: 0.302737.
Training network. lr: 0.000225. clip: 0.090019
Iteration 3277: Policy loss: 0.078076. Value loss: 0.034719. Entropy: 0.312003.
Iteration 3278: Policy loss: 0.080106. Value loss: 0.019772. Entropy: 0.311689.
Iteration 3279: Policy loss: 0.075565. Value loss: 0.015146. Entropy: 0.311672.
episode: 1378   score: 180.0  epsilon: 1.0    steps: 216  evaluation reward: 200.65
episode: 1379   score: 75.0  epsilon: 1.0    steps: 248  evaluation reward: 199.6
Training network. lr: 0.000225. clip: 0.090019
Iteration 3280: Policy loss: -0.019949. Value loss: 0.061797. Entropy: 0.311217.
Iteration 3281: Policy loss: -0.026405. Value loss: 0.033296. Entropy: 0.311893.
Iteration 3282: Policy loss: -0.026488. Value loss:

Iteration 3341: Policy loss: -0.154911. Value loss: 0.151528. Entropy: 0.304633.
Iteration 3342: Policy loss: -0.189403. Value loss: 0.104835. Entropy: 0.305456.
episode: 1399   score: 270.0  epsilon: 1.0    steps: 40  evaluation reward: 211.05
Training network. lr: 0.000225. clip: 0.089872
Iteration 3343: Policy loss: 0.157908. Value loss: 0.157212. Entropy: 0.301561.
Iteration 3344: Policy loss: 0.156158. Value loss: 0.075145. Entropy: 0.300812.
Iteration 3345: Policy loss: 0.148939. Value loss: 0.050316. Entropy: 0.301303.
episode: 1400   score: 135.0  epsilon: 1.0    steps: 512  evaluation reward: 209.75
now time :  2019-09-05 17:42:59.969142
episode: 1401   score: 155.0  epsilon: 1.0    steps: 856  evaluation reward: 209.75
Training network. lr: 0.000225. clip: 0.089872
Iteration 3346: Policy loss: 0.128987. Value loss: 0.109393. Entropy: 0.305104.
Iteration 3347: Policy loss: 0.116643. Value loss: 0.057488. Entropy: 0.304033.
Iteration 3348: Policy loss: 0.116856. Value loss: 0.0

Iteration 3401: Policy loss: 0.117273. Value loss: 0.036785. Entropy: 0.314503.
Iteration 3402: Policy loss: 0.113025. Value loss: 0.028240. Entropy: 0.313311.
episode: 1428   score: 165.0  epsilon: 1.0    steps: 496  evaluation reward: 208.85
Training network. lr: 0.000224. clip: 0.089558
Iteration 3403: Policy loss: 0.055046. Value loss: 0.085758. Entropy: 0.313917.
Iteration 3404: Policy loss: 0.048050. Value loss: 0.041475. Entropy: 0.312528.
Iteration 3405: Policy loss: 0.052669. Value loss: 0.026849. Entropy: 0.312715.
episode: 1429   score: 70.0  epsilon: 1.0    steps: 376  evaluation reward: 207.75
Training network. lr: 0.000224. clip: 0.089558
Iteration 3406: Policy loss: -0.159601. Value loss: 0.103547. Entropy: 0.312059.
Iteration 3407: Policy loss: -0.170926. Value loss: 0.049818. Entropy: 0.312987.
Iteration 3408: Policy loss: -0.168208. Value loss: 0.034028. Entropy: 0.312437.
Training network. lr: 0.000224. clip: 0.089558
Iteration 3409: Policy loss: -0.110814. Value los

episode: 1455   score: 125.0  epsilon: 1.0    steps: 648  evaluation reward: 220.7
Training network. lr: 0.000224. clip: 0.089411
Iteration 3463: Policy loss: 0.053064. Value loss: 0.087018. Entropy: 0.309147.
Iteration 3464: Policy loss: 0.041767. Value loss: 0.034698. Entropy: 0.309362.
Iteration 3465: Policy loss: 0.042323. Value loss: 0.026883. Entropy: 0.309320.
episode: 1456   score: 110.0  epsilon: 1.0    steps: 712  evaluation reward: 220.45
Training network. lr: 0.000224. clip: 0.089411
Iteration 3466: Policy loss: -0.188499. Value loss: 0.120551. Entropy: 0.307540.
Iteration 3467: Policy loss: -0.192988. Value loss: 0.060793. Entropy: 0.307020.
Iteration 3468: Policy loss: -0.189293. Value loss: 0.050564. Entropy: 0.306574.
episode: 1457   score: 95.0  epsilon: 1.0    steps: 320  evaluation reward: 219.75
episode: 1458   score: 240.0  epsilon: 1.0    steps: 536  evaluation reward: 219.75
Training network. lr: 0.000224. clip: 0.089411
Iteration 3469: Policy loss: 0.036346. Val

Iteration 3525: Policy loss: 0.075813. Value loss: 0.056780. Entropy: 0.312543.
episode: 1482   score: 45.0  epsilon: 1.0    steps: 872  evaluation reward: 217.5
Training network. lr: 0.000223. clip: 0.089254
Iteration 3526: Policy loss: 0.251776. Value loss: 0.150613. Entropy: 0.310113.
Iteration 3527: Policy loss: 0.235542. Value loss: 0.075736. Entropy: 0.308437.
Iteration 3528: Policy loss: 0.238958. Value loss: 0.057362. Entropy: 0.307533.
episode: 1483   score: 455.0  epsilon: 1.0    steps: 376  evaluation reward: 220.2
episode: 1484   score: 235.0  epsilon: 1.0    steps: 408  evaluation reward: 221.15
episode: 1485   score: 45.0  epsilon: 1.0    steps: 592  evaluation reward: 217.95
Training network. lr: 0.000223. clip: 0.089254
Iteration 3529: Policy loss: 0.345058. Value loss: 0.137155. Entropy: 0.304343.
Iteration 3530: Policy loss: 0.325249. Value loss: 0.055530. Entropy: 0.300954.
Iteration 3531: Policy loss: 0.315427. Value loss: 0.042589. Entropy: 0.301624.
episode: 1486 

Iteration 3585: Policy loss: 0.007074. Value loss: 0.046435. Entropy: 0.313248.
episode: 1511   score: 180.0  epsilon: 1.0    steps: 536  evaluation reward: 195.75
episode: 1512   score: 215.0  epsilon: 1.0    steps: 624  evaluation reward: 193.8
Training network. lr: 0.000223. clip: 0.089097
Iteration 3586: Policy loss: 0.324710. Value loss: 0.089796. Entropy: 0.301732.
Iteration 3587: Policy loss: 0.316629. Value loss: 0.032886. Entropy: 0.302263.
Iteration 3588: Policy loss: 0.310241. Value loss: 0.023603. Entropy: 0.301710.
episode: 1513   score: 105.0  epsilon: 1.0    steps: 96  evaluation reward: 192.2
episode: 1514   score: 350.0  epsilon: 1.0    steps: 504  evaluation reward: 193.55
episode: 1515   score: 195.0  epsilon: 1.0    steps: 1024  evaluation reward: 194.45
Training network. lr: 0.000223. clip: 0.089097
Iteration 3589: Policy loss: 0.141460. Value loss: 0.088751. Entropy: 0.307311.
Iteration 3590: Policy loss: 0.138878. Value loss: 0.041391. Entropy: 0.305583.
Iteratio

Iteration 3645: Policy loss: 0.244014. Value loss: 0.023272. Entropy: 0.304962.
episode: 1541   score: 170.0  epsilon: 1.0    steps: 168  evaluation reward: 209.35
Training network. lr: 0.000222. clip: 0.088950
Iteration 3646: Policy loss: 0.028212. Value loss: 0.115088. Entropy: 0.294956.
Iteration 3647: Policy loss: 0.020647. Value loss: 0.050148. Entropy: 0.293680.
Iteration 3648: Policy loss: 0.015985. Value loss: 0.038967. Entropy: 0.295525.
Training network. lr: 0.000222. clip: 0.088950
Iteration 3649: Policy loss: -0.193552. Value loss: 0.316432. Entropy: 0.305706.
Iteration 3650: Policy loss: -0.192746. Value loss: 0.150736. Entropy: 0.305591.
Iteration 3651: Policy loss: -0.211956. Value loss: 0.082111. Entropy: 0.306698.
episode: 1542   score: 215.0  epsilon: 1.0    steps: 88  evaluation reward: 205.3
episode: 1543   score: 50.0  epsilon: 1.0    steps: 864  evaluation reward: 203.7
episode: 1544   score: 120.0  epsilon: 1.0    steps: 944  evaluation reward: 202.55
Training ne

episode: 1569   score: 240.0  epsilon: 1.0    steps: 192  evaluation reward: 212.4
episode: 1570   score: 105.0  epsilon: 1.0    steps: 672  evaluation reward: 211.3
episode: 1571   score: 125.0  epsilon: 1.0    steps: 816  evaluation reward: 211.1
Training network. lr: 0.000222. clip: 0.088637
Iteration 3706: Policy loss: 0.092202. Value loss: 0.207010. Entropy: 0.294404.
Iteration 3707: Policy loss: 0.059726. Value loss: 0.125320. Entropy: 0.290657.
Iteration 3708: Policy loss: 0.051847. Value loss: 0.102046. Entropy: 0.289225.
episode: 1572   score: 715.0  epsilon: 1.0    steps: 816  evaluation reward: 215.8
Training network. lr: 0.000222. clip: 0.088637
Iteration 3709: Policy loss: 0.145706. Value loss: 0.112369. Entropy: 0.303970.
Iteration 3710: Policy loss: 0.142026. Value loss: 0.042241. Entropy: 0.300961.
Iteration 3711: Policy loss: 0.123893. Value loss: 0.034314. Entropy: 0.301465.
episode: 1573   score: 65.0  epsilon: 1.0    steps: 352  evaluation reward: 214.65
episode: 15

episode: 1599   score: 270.0  epsilon: 1.0    steps: 304  evaluation reward: 208.7
episode: 1600   score: 125.0  epsilon: 1.0    steps: 656  evaluation reward: 209.2
now time :  2019-09-05 18:09:05.471321
episode: 1601   score: 290.0  epsilon: 1.0    steps: 952  evaluation reward: 211.8
Training network. lr: 0.000221. clip: 0.088489
Iteration 3766: Policy loss: 0.012848. Value loss: 0.111114. Entropy: 0.278083.
Iteration 3767: Policy loss: 0.008256. Value loss: 0.056457. Entropy: 0.277805.
Iteration 3768: Policy loss: 0.001604. Value loss: 0.047326. Entropy: 0.276056.
episode: 1602   score: 160.0  epsilon: 1.0    steps: 264  evaluation reward: 212.45
episode: 1603   score: 210.0  epsilon: 1.0    steps: 888  evaluation reward: 213.35
Training network. lr: 0.000221. clip: 0.088489
Iteration 3769: Policy loss: -0.070339. Value loss: 0.067852. Entropy: 0.280204.
Iteration 3770: Policy loss: -0.068157. Value loss: 0.031027. Entropy: 0.282329.
Iteration 3771: Policy loss: -0.078059. Value lo

Training network. lr: 0.000221. clip: 0.088333
Iteration 3826: Policy loss: -0.007195. Value loss: 0.130759. Entropy: 0.292389.
Iteration 3827: Policy loss: -0.013108. Value loss: 0.043033. Entropy: 0.292133.
Iteration 3828: Policy loss: -0.015269. Value loss: 0.029916. Entropy: 0.290458.
episode: 1629   score: 240.0  epsilon: 1.0    steps: 136  evaluation reward: 203.15
episode: 1630   score: 210.0  epsilon: 1.0    steps: 240  evaluation reward: 204.0
episode: 1631   score: 215.0  epsilon: 1.0    steps: 256  evaluation reward: 204.6
episode: 1632   score: 75.0  epsilon: 1.0    steps: 784  evaluation reward: 201.6
Training network. lr: 0.000221. clip: 0.088333
Iteration 3829: Policy loss: 0.120638. Value loss: 0.066699. Entropy: 0.247401.
Iteration 3830: Policy loss: 0.114227. Value loss: 0.037506. Entropy: 0.252817.
Iteration 3831: Policy loss: 0.116463. Value loss: 0.030481. Entropy: 0.249189.
Training network. lr: 0.000221. clip: 0.088333
Iteration 3832: Policy loss: 0.155883. Value

Training network. lr: 0.000220. clip: 0.088176
Iteration 3886: Policy loss: -0.149572. Value loss: 0.072509. Entropy: 0.292866.
Iteration 3887: Policy loss: -0.154069. Value loss: 0.030878. Entropy: 0.292401.
Iteration 3888: Policy loss: -0.158255. Value loss: 0.024375. Entropy: 0.293168.
episode: 1658   score: 135.0  epsilon: 1.0    steps: 984  evaluation reward: 198.4
Training network. lr: 0.000220. clip: 0.088176
Iteration 3889: Policy loss: 0.126642. Value loss: 0.051692. Entropy: 0.306677.
Iteration 3890: Policy loss: 0.125306. Value loss: 0.019154. Entropy: 0.306370.
Iteration 3891: Policy loss: 0.123872. Value loss: 0.013942. Entropy: 0.306528.
episode: 1659   score: 150.0  epsilon: 1.0    steps: 336  evaluation reward: 196.4
episode: 1660   score: 180.0  epsilon: 1.0    steps: 448  evaluation reward: 196.8
Training network. lr: 0.000220. clip: 0.088176
Iteration 3892: Policy loss: 0.064001. Value loss: 0.058275. Entropy: 0.269641.
Iteration 3893: Policy loss: 0.063893. Value lo

Iteration 3950: Policy loss: -0.002286. Value loss: 0.046216. Entropy: 0.307704.
Iteration 3951: Policy loss: -0.007093. Value loss: 0.031152. Entropy: 0.306002.
episode: 1683   score: 300.0  epsilon: 1.0    steps: 240  evaluation reward: 203.35
episode: 1684   score: 490.0  epsilon: 1.0    steps: 952  evaluation reward: 204.75
Training network. lr: 0.000220. clip: 0.087872
Iteration 3952: Policy loss: -0.126781. Value loss: 0.091149. Entropy: 0.291628.
Iteration 3953: Policy loss: -0.128222. Value loss: 0.045720. Entropy: 0.292247.
Iteration 3954: Policy loss: -0.137355. Value loss: 0.036825. Entropy: 0.292324.
episode: 1685   score: 210.0  epsilon: 1.0    steps: 312  evaluation reward: 205.5
episode: 1686   score: 265.0  epsilon: 1.0    steps: 416  evaluation reward: 206.6
Training network. lr: 0.000220. clip: 0.087872
Iteration 3955: Policy loss: 0.109270. Value loss: 0.068462. Entropy: 0.271005.
Iteration 3956: Policy loss: 0.105819. Value loss: 0.034096. Entropy: 0.270388.
Iterati

episode: 1710   score: 360.0  epsilon: 1.0    steps: 832  evaluation reward: 209.9
Training network. lr: 0.000219. clip: 0.087715
Iteration 4012: Policy loss: 0.065593. Value loss: 0.057587. Entropy: 0.294711.
Iteration 4013: Policy loss: 0.061959. Value loss: 0.030797. Entropy: 0.292614.
Iteration 4014: Policy loss: 0.058022. Value loss: 0.024642. Entropy: 0.292232.
episode: 1711   score: 555.0  epsilon: 1.0    steps: 216  evaluation reward: 213.4
episode: 1712   score: 135.0  epsilon: 1.0    steps: 416  evaluation reward: 214.15
Training network. lr: 0.000219. clip: 0.087715
Iteration 4015: Policy loss: 0.101973. Value loss: 0.085238. Entropy: 0.277656.
Iteration 4016: Policy loss: 0.095878. Value loss: 0.036416. Entropy: 0.279318.
Iteration 4017: Policy loss: 0.099838. Value loss: 0.027982. Entropy: 0.279098.
Training network. lr: 0.000219. clip: 0.087715
Iteration 4018: Policy loss: 0.086066. Value loss: 0.104112. Entropy: 0.308995.
Iteration 4019: Policy loss: 0.088124. Value loss

Iteration 4074: Policy loss: -0.329597. Value loss: 0.049800. Entropy: 0.296713.
Training network. lr: 0.000219. clip: 0.087568
Iteration 4075: Policy loss: -0.218568. Value loss: 0.364597. Entropy: 0.312019.
Iteration 4076: Policy loss: -0.240991. Value loss: 0.109619. Entropy: 0.312526.
Iteration 4077: Policy loss: -0.265324. Value loss: 0.062711. Entropy: 0.313000.
episode: 1737   score: 180.0  epsilon: 1.0    steps: 88  evaluation reward: 221.0
episode: 1738   score: 265.0  epsilon: 1.0    steps: 312  evaluation reward: 222.85
episode: 1739   score: 180.0  epsilon: 1.0    steps: 1000  evaluation reward: 224.35
episode: 1740   score: 215.0  epsilon: 1.0    steps: 1016  evaluation reward: 223.85
Training network. lr: 0.000219. clip: 0.087568
Iteration 4078: Policy loss: 0.240650. Value loss: 0.077566. Entropy: 0.282143.
Iteration 4079: Policy loss: 0.233297. Value loss: 0.038839. Entropy: 0.283101.
Iteration 4080: Policy loss: 0.239294. Value loss: 0.026382. Entropy: 0.282159.
Traini

Iteration 4136: Policy loss: -0.060323. Value loss: 0.040896. Entropy: 0.296542.
Iteration 4137: Policy loss: -0.066963. Value loss: 0.028867. Entropy: 0.297375.
episode: 1764   score: 210.0  epsilon: 1.0    steps: 872  evaluation reward: 228.95
Training network. lr: 0.000219. clip: 0.087411
Iteration 4138: Policy loss: -0.670966. Value loss: 0.366866. Entropy: 0.303785.
Iteration 4139: Policy loss: -0.703337. Value loss: 0.134494. Entropy: 0.304524.
Iteration 4140: Policy loss: -0.693265. Value loss: 0.076898. Entropy: 0.302891.
episode: 1765   score: 250.0  epsilon: 1.0    steps: 328  evaluation reward: 229.35
Training network. lr: 0.000219. clip: 0.087411
Iteration 4141: Policy loss: -0.076715. Value loss: 0.101261. Entropy: 0.290097.
Iteration 4142: Policy loss: -0.080041. Value loss: 0.051772. Entropy: 0.290275.
Iteration 4143: Policy loss: -0.088494. Value loss: 0.039109. Entropy: 0.289527.
episode: 1766   score: 440.0  epsilon: 1.0    steps: 136  evaluation reward: 231.65
episod

Training network. lr: 0.000218. clip: 0.087254
Iteration 4198: Policy loss: -0.031681. Value loss: 0.056731. Entropy: 0.309265.
Iteration 4199: Policy loss: -0.034564. Value loss: 0.027014. Entropy: 0.309139.
Iteration 4200: Policy loss: -0.040164. Value loss: 0.025588. Entropy: 0.308023.
Training network. lr: 0.000218. clip: 0.087107
Iteration 4201: Policy loss: -0.102668. Value loss: 0.072650. Entropy: 0.304891.
Iteration 4202: Policy loss: -0.106344. Value loss: 0.034785. Entropy: 0.304633.
Iteration 4203: Policy loss: -0.106117. Value loss: 0.026902. Entropy: 0.305551.
episode: 1792   score: 180.0  epsilon: 1.0    steps: 632  evaluation reward: 229.0
episode: 1793   score: 105.0  epsilon: 1.0    steps: 784  evaluation reward: 227.95
episode: 1794   score: 185.0  epsilon: 1.0    steps: 952  evaluation reward: 225.7
Training network. lr: 0.000218. clip: 0.087107
Iteration 4204: Policy loss: 0.027300. Value loss: 0.125204. Entropy: 0.282884.
Iteration 4205: Policy loss: 0.018385. Valu

Iteration 4258: Policy loss: 0.020607. Value loss: 0.103367. Entropy: 0.273507.
Iteration 4259: Policy loss: 0.008595. Value loss: 0.041198. Entropy: 0.270273.
Iteration 4260: Policy loss: 0.004503. Value loss: 0.025111. Entropy: 0.269854.
Training network. lr: 0.000217. clip: 0.086950
Iteration 4261: Policy loss: 0.007979. Value loss: 0.151386. Entropy: 0.306496.
Iteration 4262: Policy loss: -0.004730. Value loss: 0.051584. Entropy: 0.306367.
Iteration 4263: Policy loss: -0.013339. Value loss: 0.037738. Entropy: 0.305368.
Training network. lr: 0.000217. clip: 0.086950
Iteration 4264: Policy loss: 0.112938. Value loss: 0.148129. Entropy: 0.306703.
Iteration 4265: Policy loss: 0.101902. Value loss: 0.059880. Entropy: 0.305864.
Iteration 4266: Policy loss: 0.099184. Value loss: 0.041849. Entropy: 0.305716.
episode: 1820   score: 215.0  epsilon: 1.0    steps: 64  evaluation reward: 234.0
episode: 1821   score: 285.0  epsilon: 1.0    steps: 392  evaluation reward: 232.25
episode: 1822   sc

Iteration 4320: Policy loss: -0.237503. Value loss: 0.082556. Entropy: 0.285914.
episode: 1848   score: 495.0  epsilon: 1.0    steps: 736  evaluation reward: 219.5
episode: 1849   score: 210.0  epsilon: 1.0    steps: 960  evaluation reward: 220.8
Training network. lr: 0.000217. clip: 0.086793
Iteration 4321: Policy loss: -0.066580. Value loss: 0.114805. Entropy: 0.297329.
Iteration 4322: Policy loss: -0.067931. Value loss: 0.044798. Entropy: 0.297944.
Iteration 4323: Policy loss: -0.074037. Value loss: 0.032048. Entropy: 0.297090.
Training network. lr: 0.000217. clip: 0.086793
Iteration 4324: Policy loss: -0.017955. Value loss: 0.117095. Entropy: 0.302032.
Iteration 4325: Policy loss: -0.011645. Value loss: 0.035593. Entropy: 0.303086.
Iteration 4326: Policy loss: -0.021710. Value loss: 0.022273. Entropy: 0.301661.
episode: 1850   score: 505.0  epsilon: 1.0    steps: 168  evaluation reward: 222.4
Training network. lr: 0.000217. clip: 0.086793
Iteration 4327: Policy loss: -0.001494. Val

Iteration 4382: Policy loss: 0.141673. Value loss: 0.034738. Entropy: 0.299157.
Iteration 4383: Policy loss: 0.141770. Value loss: 0.024891. Entropy: 0.298033.
episode: 1875   score: 300.0  epsilon: 1.0    steps: 688  evaluation reward: 218.7
Training network. lr: 0.000217. clip: 0.086646
Iteration 4384: Policy loss: -0.294478. Value loss: 0.255007. Entropy: 0.289714.
Iteration 4385: Policy loss: -0.283580. Value loss: 0.080416. Entropy: 0.289425.
Iteration 4386: Policy loss: -0.310762. Value loss: 0.059456. Entropy: 0.290659.
Training network. lr: 0.000217. clip: 0.086646
Iteration 4387: Policy loss: 0.191934. Value loss: 0.147653. Entropy: 0.302512.
Iteration 4388: Policy loss: 0.179541. Value loss: 0.061269. Entropy: 0.298874.
Iteration 4389: Policy loss: 0.174454. Value loss: 0.037109. Entropy: 0.297831.
episode: 1876   score: 390.0  epsilon: 1.0    steps: 40  evaluation reward: 220.8
episode: 1877   score: 305.0  epsilon: 1.0    steps: 376  evaluation reward: 223.1
Training networ

Training network. lr: 0.000216. clip: 0.086489
Iteration 4444: Policy loss: 0.049681. Value loss: 0.049177. Entropy: 0.298077.
Iteration 4445: Policy loss: 0.046389. Value loss: 0.024300. Entropy: 0.297891.
Iteration 4446: Policy loss: 0.042677. Value loss: 0.017109. Entropy: 0.297217.
episode: 1902   score: 210.0  epsilon: 1.0    steps: 176  evaluation reward: 221.4
Training network. lr: 0.000216. clip: 0.086489
Iteration 4447: Policy loss: -0.062261. Value loss: 0.069846. Entropy: 0.284719.
Iteration 4448: Policy loss: -0.066497. Value loss: 0.028873. Entropy: 0.281748.
Iteration 4449: Policy loss: -0.070685. Value loss: 0.019370. Entropy: 0.283638.
episode: 1903   score: 270.0  epsilon: 1.0    steps: 24  evaluation reward: 223.0
Training network. lr: 0.000216. clip: 0.086489
Iteration 4450: Policy loss: 0.190029. Value loss: 0.064358. Entropy: 0.291942.
Iteration 4451: Policy loss: 0.185603. Value loss: 0.029010. Entropy: 0.292261.
Iteration 4452: Policy loss: 0.182315. Value loss: 

Iteration 4506: Policy loss: 0.126217. Value loss: 0.026996. Entropy: 0.278411.
episode: 1929   score: 210.0  epsilon: 1.0    steps: 200  evaluation reward: 211.25
episode: 1930   score: 210.0  epsilon: 1.0    steps: 456  evaluation reward: 211.55
Training network. lr: 0.000215. clip: 0.086185
Iteration 4507: Policy loss: -0.038504. Value loss: 0.124574. Entropy: 0.261726.
Iteration 4508: Policy loss: -0.051995. Value loss: 0.060988. Entropy: 0.262168.
Iteration 4509: Policy loss: -0.042911. Value loss: 0.042969. Entropy: 0.262558.
episode: 1931   score: 180.0  epsilon: 1.0    steps: 616  evaluation reward: 211.55
Training network. lr: 0.000215. clip: 0.086185
Iteration 4510: Policy loss: -0.156713. Value loss: 0.131801. Entropy: 0.295201.
Iteration 4511: Policy loss: -0.162554. Value loss: 0.055581. Entropy: 0.293970.
Iteration 4512: Policy loss: -0.163275. Value loss: 0.040976. Entropy: 0.293491.
episode: 1932   score: 435.0  epsilon: 1.0    steps: 136  evaluation reward: 214.55
epis

Iteration 4566: Policy loss: 0.015791. Value loss: 0.032774. Entropy: 0.297691.
episode: 1958   score: 185.0  epsilon: 1.0    steps: 656  evaluation reward: 219.5
Training network. lr: 0.000215. clip: 0.086029
Iteration 4567: Policy loss: -0.024466. Value loss: 0.068519. Entropy: 0.295802.
Iteration 4568: Policy loss: -0.027434. Value loss: 0.030521. Entropy: 0.294232.
Iteration 4569: Policy loss: -0.031347. Value loss: 0.024439. Entropy: 0.294900.
episode: 1959   score: 155.0  epsilon: 1.0    steps: 192  evaluation reward: 219.85
episode: 1960   score: 160.0  epsilon: 1.0    steps: 304  evaluation reward: 219.8
Training network. lr: 0.000215. clip: 0.086029
Iteration 4570: Policy loss: -0.022365. Value loss: 0.091257. Entropy: 0.281768.
Iteration 4571: Policy loss: -0.025013. Value loss: 0.052845. Entropy: 0.282308.
Iteration 4572: Policy loss: -0.030739. Value loss: 0.039667. Entropy: 0.282447.
episode: 1961   score: 155.0  epsilon: 1.0    steps: 24  evaluation reward: 218.15
Trainin

episode: 1987   score: 410.0  epsilon: 1.0    steps: 768  evaluation reward: 210.85
episode: 1988   score: 270.0  epsilon: 1.0    steps: 960  evaluation reward: 211.75
Training network. lr: 0.000215. clip: 0.085872
Iteration 4627: Policy loss: 0.188276. Value loss: 0.109858. Entropy: 0.299495.
Iteration 4628: Policy loss: 0.187414. Value loss: 0.058216. Entropy: 0.298803.
Iteration 4629: Policy loss: 0.175221. Value loss: 0.042385. Entropy: 0.298045.
episode: 1989   score: 180.0  epsilon: 1.0    steps: 792  evaluation reward: 211.75
Training network. lr: 0.000215. clip: 0.085872
Iteration 4630: Policy loss: 0.215674. Value loss: 0.156944. Entropy: 0.293744.
Iteration 4631: Policy loss: 0.221254. Value loss: 0.057408. Entropy: 0.294680.
Iteration 4632: Policy loss: 0.214929. Value loss: 0.039783. Entropy: 0.292416.
Training network. lr: 0.000215. clip: 0.085872
Iteration 4633: Policy loss: 0.126910. Value loss: 0.096629. Entropy: 0.310189.
Iteration 4634: Policy loss: 0.119061. Value lo

Training network. lr: 0.000214. clip: 0.085724
Iteration 4687: Policy loss: 0.083504. Value loss: 0.049051. Entropy: 0.293637.
Iteration 4688: Policy loss: 0.079692. Value loss: 0.023764. Entropy: 0.295007.
Iteration 4689: Policy loss: 0.075649. Value loss: 0.018322. Entropy: 0.294048.
Training network. lr: 0.000214. clip: 0.085724
Iteration 4690: Policy loss: -0.054595. Value loss: 0.093748. Entropy: 0.311375.
Iteration 4691: Policy loss: -0.059340. Value loss: 0.039094. Entropy: 0.311995.
Iteration 4692: Policy loss: -0.067386. Value loss: 0.026891. Entropy: 0.310048.
episode: 2016   score: 55.0  epsilon: 1.0    steps: 816  evaluation reward: 206.85
Training network. lr: 0.000214. clip: 0.085724
Iteration 4693: Policy loss: 0.059946. Value loss: 0.092618. Entropy: 0.305802.
Iteration 4694: Policy loss: 0.059159. Value loss: 0.034664. Entropy: 0.304339.
Iteration 4695: Policy loss: 0.056100. Value loss: 0.026218. Entropy: 0.305042.
episode: 2017   score: 210.0  epsilon: 1.0    steps: 

Training network. lr: 0.000214. clip: 0.085411
Iteration 4753: Policy loss: 0.185553. Value loss: 0.094211. Entropy: 0.293916.
Iteration 4754: Policy loss: 0.171898. Value loss: 0.041087. Entropy: 0.292758.
Iteration 4755: Policy loss: 0.166828. Value loss: 0.027624. Entropy: 0.293595.
episode: 2039   score: 240.0  epsilon: 1.0    steps: 40  evaluation reward: 229.8
Training network. lr: 0.000214. clip: 0.085411
Iteration 4756: Policy loss: 0.009498. Value loss: 0.057772. Entropy: 0.290332.
Iteration 4757: Policy loss: 0.006347. Value loss: 0.027603. Entropy: 0.291023.
Iteration 4758: Policy loss: 0.003575. Value loss: 0.019544. Entropy: 0.290451.
episode: 2040   score: 235.0  epsilon: 1.0    steps: 536  evaluation reward: 229.25
Training network. lr: 0.000214. clip: 0.085411
Iteration 4759: Policy loss: -0.093708. Value loss: 0.083473. Entropy: 0.293555.
Iteration 4760: Policy loss: -0.098124. Value loss: 0.043872. Entropy: 0.294505.
Iteration 4761: Policy loss: -0.104875. Value loss:

Iteration 4817: Policy loss: 0.129565. Value loss: 0.023255. Entropy: 0.307062.
Iteration 4818: Policy loss: 0.123714. Value loss: 0.018018. Entropy: 0.307022.
episode: 2063   score: 100.0  epsilon: 1.0    steps: 56  evaluation reward: 242.15
episode: 2064   score: 325.0  epsilon: 1.0    steps: 112  evaluation reward: 243.3
episode: 2065   score: 260.0  epsilon: 1.0    steps: 280  evaluation reward: 244.3
Training network. lr: 0.000213. clip: 0.085264
Iteration 4819: Policy loss: -0.340211. Value loss: 0.303493. Entropy: 0.263564.
Iteration 4820: Policy loss: -0.326296. Value loss: 0.135881. Entropy: 0.263990.
Iteration 4821: Policy loss: -0.340266. Value loss: 0.070542. Entropy: 0.264361.
Training network. lr: 0.000213. clip: 0.085264
Iteration 4822: Policy loss: -0.204722. Value loss: 0.158060. Entropy: 0.308343.
Iteration 4823: Policy loss: -0.206469. Value loss: 0.070781. Entropy: 0.306102.
Iteration 4824: Policy loss: -0.220239. Value loss: 0.042958. Entropy: 0.307431.
Training ne

Iteration 4883: Policy loss: -0.215748. Value loss: 0.118512. Entropy: 0.292883.
Iteration 4884: Policy loss: -0.213055. Value loss: 0.063418. Entropy: 0.290383.
episode: 2086   score: 195.0  epsilon: 1.0    steps: 24  evaluation reward: 273.7
episode: 2087   score: 445.0  epsilon: 1.0    steps: 424  evaluation reward: 274.05
episode: 2088   score: 210.0  epsilon: 1.0    steps: 608  evaluation reward: 273.45
Training network. lr: 0.000213. clip: 0.085107
Iteration 4885: Policy loss: 0.100058. Value loss: 0.062946. Entropy: 0.263910.
Iteration 4886: Policy loss: 0.094480. Value loss: 0.026507. Entropy: 0.264721.
Iteration 4887: Policy loss: 0.086981. Value loss: 0.021334. Entropy: 0.265140.
Training network. lr: 0.000213. clip: 0.085107
Iteration 4888: Policy loss: -0.059209. Value loss: 0.067743. Entropy: 0.305193.
Iteration 4889: Policy loss: -0.062223. Value loss: 0.035631. Entropy: 0.306241.
Iteration 4890: Policy loss: -0.063426. Value loss: 0.023865. Entropy: 0.305260.
episode: 20

Iteration 4947: Policy loss: 0.361089. Value loss: 0.041319. Entropy: 0.269798.
Training network. lr: 0.000212. clip: 0.084950
Iteration 4948: Policy loss: 0.050818. Value loss: 0.117728. Entropy: 0.298395.
Iteration 4949: Policy loss: 0.044045. Value loss: 0.056505. Entropy: 0.298367.
Iteration 4950: Policy loss: 0.043624. Value loss: 0.047598. Entropy: 0.299422.
Training network. lr: 0.000212. clip: 0.084803
Iteration 4951: Policy loss: 0.018311. Value loss: 0.144309. Entropy: 0.307534.
Iteration 4952: Policy loss: 0.009424. Value loss: 0.059913. Entropy: 0.307761.
Iteration 4953: Policy loss: -0.002565. Value loss: 0.047691. Entropy: 0.307936.
episode: 2111   score: 210.0  epsilon: 1.0    steps: 672  evaluation reward: 304.4
Training network. lr: 0.000212. clip: 0.084803
Iteration 4954: Policy loss: 0.243635. Value loss: 0.090055. Entropy: 0.294173.
Iteration 4955: Policy loss: 0.232115. Value loss: 0.032055. Entropy: 0.293531.
Iteration 4956: Policy loss: 0.230352. Value loss: 0.02

Iteration 5014: Policy loss: 0.002714. Value loss: 0.128883. Entropy: 0.292077.
Iteration 5015: Policy loss: 0.000146. Value loss: 0.048709. Entropy: 0.293634.
Iteration 5016: Policy loss: -0.009912. Value loss: 0.035822. Entropy: 0.290839.
episode: 2132   score: 390.0  epsilon: 1.0    steps: 600  evaluation reward: 315.95
Training network. lr: 0.000212. clip: 0.084646
Iteration 5017: Policy loss: -0.049629. Value loss: 0.157047. Entropy: 0.292627.
Iteration 5018: Policy loss: -0.052832. Value loss: 0.070007. Entropy: 0.291554.
Iteration 5019: Policy loss: -0.058316. Value loss: 0.052322. Entropy: 0.290108.
episode: 2133   score: 120.0  epsilon: 1.0    steps: 544  evaluation reward: 313.5
Training network. lr: 0.000212. clip: 0.084646
Iteration 5020: Policy loss: 0.077881. Value loss: 0.110108. Entropy: 0.294941.
Iteration 5021: Policy loss: 0.065882. Value loss: 0.054650. Entropy: 0.294822.
Iteration 5022: Policy loss: 0.065323. Value loss: 0.037218. Entropy: 0.293871.
episode: 2134  

Iteration 5080: Policy loss: -0.037553. Value loss: 0.076185. Entropy: 0.271365.
Iteration 5081: Policy loss: -0.039732. Value loss: 0.048443. Entropy: 0.272780.
Iteration 5082: Policy loss: -0.043433. Value loss: 0.039313. Entropy: 0.271748.
episode: 2154   score: 210.0  epsilon: 1.0    steps: 968  evaluation reward: 344.0
episode: 2155   score: 180.0  epsilon: 1.0    steps: 976  evaluation reward: 341.2
Training network. lr: 0.000211. clip: 0.084489
Iteration 5083: Policy loss: 0.080371. Value loss: 0.082303. Entropy: 0.294591.
Iteration 5084: Policy loss: 0.080186. Value loss: 0.035444. Entropy: 0.292907.
Iteration 5085: Policy loss: 0.072922. Value loss: 0.029236. Entropy: 0.294206.
episode: 2156   score: 120.0  epsilon: 1.0    steps: 216  evaluation reward: 339.25
Training network. lr: 0.000211. clip: 0.084489
Iteration 5086: Policy loss: -0.075394. Value loss: 0.062924. Entropy: 0.265818.
Iteration 5087: Policy loss: -0.081651. Value loss: 0.038916. Entropy: 0.265807.
Iteration 5

Training network. lr: 0.000211. clip: 0.084342
Iteration 5146: Policy loss: 0.087739. Value loss: 0.187625. Entropy: 0.299016.
Iteration 5147: Policy loss: 0.063715. Value loss: 0.047148. Entropy: 0.297836.
Iteration 5148: Policy loss: 0.063899. Value loss: 0.033474. Entropy: 0.298443.
Training network. lr: 0.000211. clip: 0.084342
Iteration 5149: Policy loss: 0.030314. Value loss: 0.186495. Entropy: 0.306356.
Iteration 5150: Policy loss: 0.020497. Value loss: 0.064580. Entropy: 0.305996.
Iteration 5151: Policy loss: 0.006065. Value loss: 0.047922. Entropy: 0.307507.
Training network. lr: 0.000210. clip: 0.084185
Iteration 5152: Policy loss: 0.002924. Value loss: 0.078207. Entropy: 0.304531.
Iteration 5153: Policy loss: -0.006818. Value loss: 0.035828. Entropy: 0.304867.
Iteration 5154: Policy loss: -0.006101. Value loss: 0.021733. Entropy: 0.304866.
episode: 2177   score: 335.0  epsilon: 1.0    steps: 688  evaluation reward: 351.3
Training network. lr: 0.000210. clip: 0.084185
Iterati

episode: 2196   score: 80.0  epsilon: 1.0    steps: 1000  evaluation reward: 362.1
Training network. lr: 0.000210. clip: 0.084029
Iteration 5215: Policy loss: 0.058912. Value loss: 0.083321. Entropy: 0.296942.
Iteration 5216: Policy loss: 0.048133. Value loss: 0.044821. Entropy: 0.298115.
Iteration 5217: Policy loss: 0.046686. Value loss: 0.031913. Entropy: 0.298194.
episode: 2197   score: 590.0  epsilon: 1.0    steps: 168  evaluation reward: 365.75
episode: 2198   score: 285.0  epsilon: 1.0    steps: 792  evaluation reward: 362.95
episode: 2199   score: 590.0  epsilon: 1.0    steps: 808  evaluation reward: 363.35
episode: 2200   score: 395.0  epsilon: 1.0    steps: 896  evaluation reward: 364.2
Training network. lr: 0.000210. clip: 0.084029
Iteration 5218: Policy loss: 0.173790. Value loss: 0.104979. Entropy: 0.262954.
Iteration 5219: Policy loss: 0.170747. Value loss: 0.046778. Entropy: 0.261451.
Iteration 5220: Policy loss: 0.159786. Value loss: 0.035327. Entropy: 0.261501.
now time

Iteration 5283: Policy loss: -0.426356. Value loss: 0.053165. Entropy: 0.301864.
episode: 2216   score: 335.0  epsilon: 1.0    steps: 112  evaluation reward: 387.35
episode: 2217   score: 620.0  epsilon: 1.0    steps: 840  evaluation reward: 387.8
Training network. lr: 0.000210. clip: 0.083881
Iteration 5284: Policy loss: 0.243341. Value loss: 0.130143. Entropy: 0.285561.
Iteration 5285: Policy loss: 0.238396. Value loss: 0.057308. Entropy: 0.283189.
Iteration 5286: Policy loss: 0.222689. Value loss: 0.047019. Entropy: 0.284991.
episode: 2218   score: 680.0  epsilon: 1.0    steps: 312  evaluation reward: 390.9
episode: 2219   score: 240.0  epsilon: 1.0    steps: 1008  evaluation reward: 389.85
Training network. lr: 0.000210. clip: 0.083881
Iteration 5287: Policy loss: 0.207227. Value loss: 0.084806. Entropy: 0.284060.
Iteration 5288: Policy loss: 0.203890. Value loss: 0.040282. Entropy: 0.284717.
Iteration 5289: Policy loss: 0.198876. Value loss: 0.031829. Entropy: 0.281959.
episode: 2

episode: 2238   score: 240.0  epsilon: 1.0    steps: 488  evaluation reward: 400.75
Training network. lr: 0.000209. clip: 0.083725
Iteration 5350: Policy loss: 0.249842. Value loss: 0.156122. Entropy: 0.264457.
Iteration 5351: Policy loss: 0.237060. Value loss: 0.067922. Entropy: 0.262397.
Iteration 5352: Policy loss: 0.224422. Value loss: 0.049465. Entropy: 0.263823.
episode: 2239   score: 645.0  epsilon: 1.0    steps: 488  evaluation reward: 403.7
Training network. lr: 0.000209. clip: 0.083568
Iteration 5353: Policy loss: -0.006621. Value loss: 0.233397. Entropy: 0.281213.
Iteration 5354: Policy loss: -0.016963. Value loss: 0.119735. Entropy: 0.280597.
Iteration 5355: Policy loss: -0.027196. Value loss: 0.089123. Entropy: 0.279863.
episode: 2240   score: 340.0  epsilon: 1.0    steps: 528  evaluation reward: 405.0
Training network. lr: 0.000209. clip: 0.083568
Iteration 5356: Policy loss: 0.262024. Value loss: 0.088404. Entropy: 0.286696.
Iteration 5357: Policy loss: 0.254792. Value l

Training network. lr: 0.000209. clip: 0.083420
Iteration 5416: Policy loss: -0.017249. Value loss: 0.066397. Entropy: 0.298901.
Iteration 5417: Policy loss: -0.017665. Value loss: 0.033924. Entropy: 0.298383.
Iteration 5418: Policy loss: -0.020233. Value loss: 0.027115. Entropy: 0.297154.
episode: 2260   score: 310.0  epsilon: 1.0    steps: 56  evaluation reward: 397.85
episode: 2261   score: 290.0  epsilon: 1.0    steps: 416  evaluation reward: 398.35
Training network. lr: 0.000209. clip: 0.083420
Iteration 5419: Policy loss: -0.240484. Value loss: 0.321218. Entropy: 0.279361.
Iteration 5420: Policy loss: -0.262770. Value loss: 0.096741. Entropy: 0.279065.
Iteration 5421: Policy loss: -0.270508. Value loss: 0.041294. Entropy: 0.277152.
episode: 2262   score: 290.0  epsilon: 1.0    steps: 64  evaluation reward: 396.45
episode: 2263   score: 650.0  epsilon: 1.0    steps: 256  evaluation reward: 399.8
Training network. lr: 0.000209. clip: 0.083420
Iteration 5422: Policy loss: 0.364391. V

episode: 2283   score: 315.0  epsilon: 1.0    steps: 784  evaluation reward: 381.9
Training network. lr: 0.000208. clip: 0.083264
Iteration 5482: Policy loss: 0.018581. Value loss: 0.137950. Entropy: 0.301295.
Iteration 5483: Policy loss: 0.012309. Value loss: 0.061441. Entropy: 0.301423.
Iteration 5484: Policy loss: 0.006865. Value loss: 0.047192. Entropy: 0.301568.
episode: 2284   score: 690.0  epsilon: 1.0    steps: 128  evaluation reward: 386.7
episode: 2285   score: 420.0  epsilon: 1.0    steps: 248  evaluation reward: 384.7
Training network. lr: 0.000208. clip: 0.083264
Iteration 5485: Policy loss: 0.196019. Value loss: 0.044372. Entropy: 0.292512.
Iteration 5486: Policy loss: 0.189178. Value loss: 0.025082. Entropy: 0.293206.
Iteration 5487: Policy loss: 0.183071. Value loss: 0.020443. Entropy: 0.291732.
episode: 2286   score: 180.0  epsilon: 1.0    steps: 792  evaluation reward: 384.4
Training network. lr: 0.000208. clip: 0.083264
Iteration 5488: Policy loss: 0.037200. Value lo

Training network. lr: 0.000208. clip: 0.083107
Iteration 5548: Policy loss: 0.303836. Value loss: 0.086742. Entropy: 0.292794.
Iteration 5549: Policy loss: 0.294408. Value loss: 0.041133. Entropy: 0.291588.
Iteration 5550: Policy loss: 0.298013. Value loss: 0.030244. Entropy: 0.289658.
episode: 2305   score: 260.0  epsilon: 1.0    steps: 424  evaluation reward: 388.55
Training network. lr: 0.000207. clip: 0.082960
Iteration 5551: Policy loss: 0.128444. Value loss: 0.068656. Entropy: 0.295540.
Iteration 5552: Policy loss: 0.116741. Value loss: 0.033890. Entropy: 0.293253.
Iteration 5553: Policy loss: 0.118955. Value loss: 0.025721. Entropy: 0.292668.
episode: 2306   score: 295.0  epsilon: 1.0    steps: 768  evaluation reward: 388.75
Training network. lr: 0.000207. clip: 0.082960
Iteration 5554: Policy loss: 0.032538. Value loss: 0.080747. Entropy: 0.297356.
Iteration 5555: Policy loss: 0.027998. Value loss: 0.034688. Entropy: 0.297445.
Iteration 5556: Policy loss: 0.022512. Value loss: 

Iteration 5615: Policy loss: 0.128969. Value loss: 0.049795. Entropy: 0.296527.
Iteration 5616: Policy loss: 0.126740. Value loss: 0.036770. Entropy: 0.295059.
Training network. lr: 0.000207. clip: 0.082803
Iteration 5617: Policy loss: 0.144459. Value loss: 0.047718. Entropy: 0.308941.
Iteration 5618: Policy loss: 0.134382. Value loss: 0.022503. Entropy: 0.308644.
Iteration 5619: Policy loss: 0.133570. Value loss: 0.016917. Entropy: 0.309639.
Training network. lr: 0.000207. clip: 0.082803
Iteration 5620: Policy loss: 0.039234. Value loss: 0.068580. Entropy: 0.310589.
Iteration 5621: Policy loss: 0.030674. Value loss: 0.021271. Entropy: 0.311189.
Iteration 5622: Policy loss: 0.028070. Value loss: 0.017992. Entropy: 0.310759.
episode: 2326   score: 500.0  epsilon: 1.0    steps: 728  evaluation reward: 365.75
episode: 2327   score: 390.0  epsilon: 1.0    steps: 896  evaluation reward: 365.7
Training network. lr: 0.000207. clip: 0.082803
Iteration 5623: Policy loss: -0.025616. Value loss: 

Training network. lr: 0.000207. clip: 0.082646
Iteration 5683: Policy loss: 0.023746. Value loss: 0.198452. Entropy: 0.302599.
Iteration 5684: Policy loss: 0.039559. Value loss: 0.079856. Entropy: 0.300590.
Iteration 5685: Policy loss: 0.029241. Value loss: 0.057344. Entropy: 0.300118.
episode: 2347   score: 635.0  epsilon: 1.0    steps: 264  evaluation reward: 363.75
episode: 2348   score: 145.0  epsilon: 1.0    steps: 368  evaluation reward: 358.25
Training network. lr: 0.000207. clip: 0.082646
Iteration 5686: Policy loss: 0.060547. Value loss: 0.051985. Entropy: 0.274781.
Iteration 5687: Policy loss: 0.054979. Value loss: 0.025742. Entropy: 0.274112.
Iteration 5688: Policy loss: 0.053362. Value loss: 0.019459. Entropy: 0.272364.
episode: 2349   score: 290.0  epsilon: 1.0    steps: 960  evaluation reward: 359.05
Training network. lr: 0.000207. clip: 0.082646
Iteration 5689: Policy loss: 0.105015. Value loss: 0.169849. Entropy: 0.307340.
Iteration 5690: Policy loss: 0.095014. Value lo

Iteration 5751: Policy loss: -0.178905. Value loss: 0.050702. Entropy: 0.309861.
episode: 2367   score: 435.0  epsilon: 1.0    steps: 264  evaluation reward: 371.65
Training network. lr: 0.000206. clip: 0.082342
Iteration 5752: Policy loss: 0.485267. Value loss: 0.214746. Entropy: 0.296796.
Iteration 5753: Policy loss: 0.446147. Value loss: 0.067843. Entropy: 0.297099.
Iteration 5754: Policy loss: 0.445767. Value loss: 0.041592. Entropy: 0.297478.
episode: 2368   score: 525.0  epsilon: 1.0    steps: 192  evaluation reward: 373.45
episode: 2369   score: 655.0  epsilon: 1.0    steps: 600  evaluation reward: 375.0
episode: 2370   score: 345.0  epsilon: 1.0    steps: 864  evaluation reward: 375.6
Training network. lr: 0.000206. clip: 0.082342
Iteration 5755: Policy loss: 0.199180. Value loss: 0.111402. Entropy: 0.287040.
Iteration 5756: Policy loss: 0.186574. Value loss: 0.049994. Entropy: 0.288049.
Iteration 5757: Policy loss: 0.183143. Value loss: 0.032540. Entropy: 0.289567.
episode: 23

Training network. lr: 0.000205. clip: 0.082185
Iteration 5815: Policy loss: 0.169607. Value loss: 0.159754. Entropy: 0.280016.
Iteration 5816: Policy loss: 0.160311. Value loss: 0.065905. Entropy: 0.278981.
Iteration 5817: Policy loss: 0.151495. Value loss: 0.047828. Entropy: 0.278436.
episode: 2392   score: 400.0  epsilon: 1.0    steps: 304  evaluation reward: 376.25
Training network. lr: 0.000205. clip: 0.082185
Iteration 5818: Policy loss: 0.070557. Value loss: 0.134830. Entropy: 0.293721.
Iteration 5819: Policy loss: 0.064280. Value loss: 0.045064. Entropy: 0.293709.
Iteration 5820: Policy loss: 0.054434. Value loss: 0.032699. Entropy: 0.294053.
Training network. lr: 0.000205. clip: 0.082185
Iteration 5821: Policy loss: 0.040008. Value loss: 0.103036. Entropy: 0.304008.
Iteration 5822: Policy loss: 0.031202. Value loss: 0.041247. Entropy: 0.303815.
Iteration 5823: Policy loss: 0.027272. Value loss: 0.030454. Entropy: 0.303584.
episode: 2393   score: 425.0  epsilon: 1.0    steps: 72

Training network. lr: 0.000205. clip: 0.082038
Iteration 5881: Policy loss: -0.021835. Value loss: 0.098555. Entropy: 0.294413.
Iteration 5882: Policy loss: -0.035719. Value loss: 0.035983. Entropy: 0.295841.
Iteration 5883: Policy loss: -0.035205. Value loss: 0.024865. Entropy: 0.295903.
episode: 2414   score: 285.0  epsilon: 1.0    steps: 496  evaluation reward: 363.25
Training network. lr: 0.000205. clip: 0.082038
Iteration 5884: Policy loss: -0.054431. Value loss: 0.063986. Entropy: 0.296116.
Iteration 5885: Policy loss: -0.063349. Value loss: 0.026243. Entropy: 0.298432.
Iteration 5886: Policy loss: -0.060469. Value loss: 0.016264. Entropy: 0.300052.
episode: 2415   score: 310.0  epsilon: 1.0    steps: 72  evaluation reward: 363.5
episode: 2416   score: 240.0  epsilon: 1.0    steps: 872  evaluation reward: 362.75
Training network. lr: 0.000205. clip: 0.082038
Iteration 5887: Policy loss: -0.282935. Value loss: 0.323206. Entropy: 0.287376.
Iteration 5888: Policy loss: -0.290546. Va

Iteration 5945: Policy loss: -0.013752. Value loss: 0.041982. Entropy: 0.285748.
Iteration 5946: Policy loss: -0.019191. Value loss: 0.029462. Entropy: 0.286674.
episode: 2438   score: 335.0  epsilon: 1.0    steps: 96  evaluation reward: 354.15
Training network. lr: 0.000205. clip: 0.081881
Iteration 5947: Policy loss: -0.088685. Value loss: 0.091718. Entropy: 0.295082.
Iteration 5948: Policy loss: -0.090417. Value loss: 0.047778. Entropy: 0.294771.
Iteration 5949: Policy loss: -0.092056. Value loss: 0.033503. Entropy: 0.295230.
Training network. lr: 0.000205. clip: 0.081881
Iteration 5950: Policy loss: -0.119157. Value loss: 0.089439. Entropy: 0.313012.
Iteration 5951: Policy loss: -0.124311. Value loss: 0.035884. Entropy: 0.312698.
Iteration 5952: Policy loss: -0.133357. Value loss: 0.027394. Entropy: 0.313256.
episode: 2439   score: 155.0  epsilon: 1.0    steps: 544  evaluation reward: 351.5
episode: 2440   score: 210.0  epsilon: 1.0    steps: 728  evaluation reward: 347.35
Training

episode: 2462   score: 290.0  epsilon: 1.0    steps: 608  evaluation reward: 327.05
Training network. lr: 0.000204. clip: 0.081577
Iteration 6010: Policy loss: -0.111294. Value loss: 0.077230. Entropy: 0.267391.
Iteration 6011: Policy loss: -0.117004. Value loss: 0.042610. Entropy: 0.265889.
Iteration 6012: Policy loss: -0.122326. Value loss: 0.035189. Entropy: 0.266639.
Training network. lr: 0.000204. clip: 0.081577
Iteration 6013: Policy loss: -0.036735. Value loss: 0.080328. Entropy: 0.309951.
Iteration 6014: Policy loss: -0.045045. Value loss: 0.029166. Entropy: 0.309682.
Iteration 6015: Policy loss: -0.049088. Value loss: 0.019533. Entropy: 0.309970.
Training network. lr: 0.000204. clip: 0.081577
Iteration 6016: Policy loss: -0.412817. Value loss: 0.354609. Entropy: 0.305939.
Iteration 6017: Policy loss: -0.392253. Value loss: 0.159296. Entropy: 0.304058.
Iteration 6018: Policy loss: -0.429867. Value loss: 0.106015. Entropy: 0.304741.
episode: 2463   score: 365.0  epsilon: 1.0    

episode: 2485   score: 300.0  epsilon: 1.0    steps: 464  evaluation reward: 315.75
episode: 2486   score: 490.0  epsilon: 1.0    steps: 464  evaluation reward: 316.05
Training network. lr: 0.000204. clip: 0.081421
Iteration 6076: Policy loss: 0.075214. Value loss: 0.123376. Entropy: 0.246979.
Iteration 6077: Policy loss: 0.069343. Value loss: 0.050259. Entropy: 0.245272.
Iteration 6078: Policy loss: 0.061521. Value loss: 0.034687. Entropy: 0.244773.
Training network. lr: 0.000204. clip: 0.081421
Iteration 6079: Policy loss: 0.252333. Value loss: 0.104957. Entropy: 0.306697.
Iteration 6080: Policy loss: 0.248595. Value loss: 0.043224. Entropy: 0.303325.
Iteration 6081: Policy loss: 0.242365. Value loss: 0.027884. Entropy: 0.303824.
Training network. lr: 0.000204. clip: 0.081421
Iteration 6082: Policy loss: 0.129894. Value loss: 0.082512. Entropy: 0.301923.
Iteration 6083: Policy loss: 0.130492. Value loss: 0.042654. Entropy: 0.300149.
Iteration 6084: Policy loss: 0.126671. Value loss: 

Iteration 6143: Policy loss: -0.147282. Value loss: 0.036595. Entropy: 0.302767.
Iteration 6144: Policy loss: -0.152323. Value loss: 0.028946. Entropy: 0.303927.
episode: 2506   score: 415.0  epsilon: 1.0    steps: 24  evaluation reward: 316.8
Training network. lr: 0.000203. clip: 0.081264
Iteration 6145: Policy loss: 0.040132. Value loss: 0.092025. Entropy: 0.297222.
Iteration 6146: Policy loss: 0.028972. Value loss: 0.038422. Entropy: 0.296049.
Iteration 6147: Policy loss: 0.029878. Value loss: 0.025795. Entropy: 0.294902.
Training network. lr: 0.000203. clip: 0.081264
Iteration 6148: Policy loss: 0.095112. Value loss: 0.112623. Entropy: 0.311712.
Iteration 6149: Policy loss: 0.093715. Value loss: 0.050936. Entropy: 0.311810.
Iteration 6150: Policy loss: 0.090855. Value loss: 0.039658. Entropy: 0.310864.
episode: 2507   score: 225.0  epsilon: 1.0    steps: 344  evaluation reward: 315.85
episode: 2508   score: 250.0  epsilon: 1.0    steps: 1008  evaluation reward: 314.45
Training netw

Training network. lr: 0.000202. clip: 0.080960
Iteration 6211: Policy loss: -0.207902. Value loss: 0.236473. Entropy: 0.303570.
Iteration 6212: Policy loss: -0.204577. Value loss: 0.080382. Entropy: 0.302425.
Iteration 6213: Policy loss: -0.219622. Value loss: 0.052803. Entropy: 0.302631.
episode: 2527   score: 225.0  epsilon: 1.0    steps: 8  evaluation reward: 329.85
episode: 2528   score: 820.0  epsilon: 1.0    steps: 664  evaluation reward: 337.15
Training network. lr: 0.000202. clip: 0.080960
Iteration 6214: Policy loss: 0.117073. Value loss: 0.150979. Entropy: 0.279203.
Iteration 6215: Policy loss: 0.102818. Value loss: 0.050993. Entropy: 0.278330.
Iteration 6216: Policy loss: 0.097633. Value loss: 0.036854. Entropy: 0.278305.
Training network. lr: 0.000202. clip: 0.080960
Iteration 6217: Policy loss: -0.017585. Value loss: 0.337266. Entropy: 0.305554.
Iteration 6218: Policy loss: -0.029201. Value loss: 0.152652. Entropy: 0.304237.
Iteration 6219: Policy loss: -0.055836. Value lo

Iteration 6277: Policy loss: 0.042919. Value loss: 0.143844. Entropy: 0.294177.
Iteration 6278: Policy loss: 0.032479. Value loss: 0.044018. Entropy: 0.295026.
Iteration 6279: Policy loss: 0.027348. Value loss: 0.029159. Entropy: 0.293680.
episode: 2549   score: 285.0  epsilon: 1.0    steps: 1016  evaluation reward: 348.5
Training network. lr: 0.000202. clip: 0.080803
Iteration 6280: Policy loss: -0.140379. Value loss: 0.076851. Entropy: 0.305172.
Iteration 6281: Policy loss: -0.144033. Value loss: 0.029983. Entropy: 0.303111.
Iteration 6282: Policy loss: -0.149300. Value loss: 0.019209. Entropy: 0.303728.
episode: 2550   score: 260.0  epsilon: 1.0    steps: 472  evaluation reward: 349.0
Training network. lr: 0.000202. clip: 0.080803
Iteration 6283: Policy loss: -0.056839. Value loss: 0.145357. Entropy: 0.266692.
Iteration 6284: Policy loss: -0.071705. Value loss: 0.047115. Entropy: 0.264929.
Iteration 6285: Policy loss: -0.079259. Value loss: 0.032165. Entropy: 0.263902.
now time :  2

Iteration 6343: Policy loss: -0.000322. Value loss: 0.054578. Entropy: 0.304427.
Iteration 6344: Policy loss: -0.002708. Value loss: 0.022541. Entropy: 0.303299.
Iteration 6345: Policy loss: -0.002472. Value loss: 0.016293. Entropy: 0.303430.
episode: 2571   score: 240.0  epsilon: 1.0    steps: 696  evaluation reward: 354.55
episode: 2572   score: 225.0  epsilon: 1.0    steps: 776  evaluation reward: 354.4
Training network. lr: 0.000202. clip: 0.080656
Iteration 6346: Policy loss: -0.524441. Value loss: 0.531512. Entropy: 0.283170.
Iteration 6347: Policy loss: -0.562957. Value loss: 0.270356. Entropy: 0.278518.
Iteration 6348: Policy loss: -0.521485. Value loss: 0.131636. Entropy: 0.281039.
episode: 2573   score: 240.0  epsilon: 1.0    steps: 872  evaluation reward: 354.65
Training network. lr: 0.000202. clip: 0.080656
Iteration 6349: Policy loss: 0.184998. Value loss: 0.091239. Entropy: 0.299776.
Iteration 6350: Policy loss: 0.179950. Value loss: 0.042453. Entropy: 0.299029.
Iteration

Training network. lr: 0.000201. clip: 0.080342
Iteration 6409: Policy loss: 0.240498. Value loss: 0.120398. Entropy: 0.293316.
Iteration 6410: Policy loss: 0.236827. Value loss: 0.034815. Entropy: 0.292958.
Iteration 6411: Policy loss: 0.237935. Value loss: 0.022509. Entropy: 0.294284.
Training network. lr: 0.000201. clip: 0.080342
Iteration 6412: Policy loss: 0.127302. Value loss: 0.113949. Entropy: 0.304335.
Iteration 6413: Policy loss: 0.121603. Value loss: 0.049121. Entropy: 0.304756.
Iteration 6414: Policy loss: 0.118266. Value loss: 0.032596. Entropy: 0.303022.
Training network. lr: 0.000201. clip: 0.080342
Iteration 6415: Policy loss: -0.257278. Value loss: 0.352868. Entropy: 0.310772.
Iteration 6416: Policy loss: -0.275748. Value loss: 0.215377. Entropy: 0.310331.
Iteration 6417: Policy loss: -0.278353. Value loss: 0.162691. Entropy: 0.310467.
Training network. lr: 0.000201. clip: 0.080342
Iteration 6418: Policy loss: 0.048037. Value loss: 0.077238. Entropy: 0.296768.
Iteration

Iteration 6477: Policy loss: -0.107349. Value loss: 0.034867. Entropy: 0.306577.
episode: 2614   score: 245.0  epsilon: 1.0    steps: 384  evaluation reward: 367.35
episode: 2615   score: 415.0  epsilon: 1.0    steps: 632  evaluation reward: 368.35
Training network. lr: 0.000200. clip: 0.080195
Iteration 6478: Policy loss: 0.019741. Value loss: 0.062430. Entropy: 0.279561.
Iteration 6479: Policy loss: 0.009359. Value loss: 0.030134. Entropy: 0.276795.
Iteration 6480: Policy loss: 0.011340. Value loss: 0.023260. Entropy: 0.277113.
episode: 2616   score: 345.0  epsilon: 1.0    steps: 944  evaluation reward: 369.2
Training network. lr: 0.000200. clip: 0.080195
Iteration 6481: Policy loss: -0.110335. Value loss: 0.301756. Entropy: 0.302342.
Iteration 6482: Policy loss: -0.140563. Value loss: 0.142143. Entropy: 0.300122.
Iteration 6483: Policy loss: -0.137906. Value loss: 0.072794. Entropy: 0.302749.
Training network. lr: 0.000200. clip: 0.080195
Iteration 6484: Policy loss: 0.027126. Value

Training network. lr: 0.000200. clip: 0.080038
Iteration 6544: Policy loss: 0.167579. Value loss: 0.121722. Entropy: 0.306084.
Iteration 6545: Policy loss: 0.148066. Value loss: 0.047609. Entropy: 0.305834.
Iteration 6546: Policy loss: 0.154034. Value loss: 0.033923. Entropy: 0.306749.
Training network. lr: 0.000200. clip: 0.080038
Iteration 6547: Policy loss: 0.011901. Value loss: 0.076616. Entropy: 0.304742.
Iteration 6548: Policy loss: 0.005138. Value loss: 0.034872. Entropy: 0.304351.
Iteration 6549: Policy loss: -0.000071. Value loss: 0.025528. Entropy: 0.302809.
episode: 2636   score: 265.0  epsilon: 1.0    steps: 816  evaluation reward: 354.9
Training network. lr: 0.000200. clip: 0.080038
Iteration 6550: Policy loss: 0.029730. Value loss: 0.088080. Entropy: 0.299629.
Iteration 6551: Policy loss: 0.016468. Value loss: 0.042820. Entropy: 0.300015.
Iteration 6552: Policy loss: 0.007149. Value loss: 0.030932. Entropy: 0.298652.
Training network. lr: 0.000200. clip: 0.079881
Iteratio

Training network. lr: 0.000199. clip: 0.079734
Iteration 6610: Policy loss: 0.079680. Value loss: 0.070470. Entropy: 0.303763.
Iteration 6611: Policy loss: 0.075484. Value loss: 0.025035. Entropy: 0.302680.
Iteration 6612: Policy loss: 0.069796. Value loss: 0.018356. Entropy: 0.302352.
episode: 2658   score: 420.0  epsilon: 1.0    steps: 480  evaluation reward: 350.4
Training network. lr: 0.000199. clip: 0.079734
Iteration 6613: Policy loss: -0.320691. Value loss: 0.251304. Entropy: 0.294616.
Iteration 6614: Policy loss: -0.353436. Value loss: 0.118017. Entropy: 0.293885.
Iteration 6615: Policy loss: -0.352501. Value loss: 0.059894. Entropy: 0.296080.
episode: 2659   score: 245.0  epsilon: 1.0    steps: 656  evaluation reward: 348.95
Training network. lr: 0.000199. clip: 0.079734
Iteration 6616: Policy loss: 0.253541. Value loss: 0.178311. Entropy: 0.295766.
Iteration 6617: Policy loss: 0.233153. Value loss: 0.054268. Entropy: 0.294667.
Iteration 6618: Policy loss: 0.230799. Value loss

Iteration 6678: Policy loss: -0.286547. Value loss: 0.056445. Entropy: 0.301041.
episode: 2678   score: 335.0  epsilon: 1.0    steps: 720  evaluation reward: 363.55
episode: 2679   score: 460.0  epsilon: 1.0    steps: 1016  evaluation reward: 363.9
Training network. lr: 0.000199. clip: 0.079577
Iteration 6679: Policy loss: -0.065833. Value loss: 0.272634. Entropy: 0.294037.
Iteration 6680: Policy loss: -0.068609. Value loss: 0.146793. Entropy: 0.293303.
Iteration 6681: Policy loss: -0.074777. Value loss: 0.059450. Entropy: 0.292984.
episode: 2680   score: 750.0  epsilon: 1.0    steps: 632  evaluation reward: 366.55
Training network. lr: 0.000199. clip: 0.079577
Iteration 6682: Policy loss: -0.153017. Value loss: 0.392036. Entropy: 0.286447.
Iteration 6683: Policy loss: -0.159847. Value loss: 0.218660. Entropy: 0.283448.
Iteration 6684: Policy loss: -0.184885. Value loss: 0.101375. Entropy: 0.285282.
Training network. lr: 0.000199. clip: 0.079577
Iteration 6685: Policy loss: 0.238576. V

Iteration 6744: Policy loss: 0.190255. Value loss: 0.022557. Entropy: 0.296897.
Training network. lr: 0.000199. clip: 0.079421
Iteration 6745: Policy loss: 0.029967. Value loss: 0.166002. Entropy: 0.308066.
Iteration 6746: Policy loss: 0.019627. Value loss: 0.054791. Entropy: 0.308836.
Iteration 6747: Policy loss: 0.011677. Value loss: 0.040468. Entropy: 0.307630.
now time :  2019-09-05 21:13:55.411383
episode: 2701   score: 455.0  epsilon: 1.0    steps: 512  evaluation reward: 359.0
Training network. lr: 0.000199. clip: 0.079421
Iteration 6748: Policy loss: 0.123612. Value loss: 0.060912. Entropy: 0.292124.
Iteration 6749: Policy loss: 0.119567. Value loss: 0.027254. Entropy: 0.289713.
Iteration 6750: Policy loss: 0.119072. Value loss: 0.020455. Entropy: 0.290365.
episode: 2702   score: 215.0  epsilon: 1.0    steps: 864  evaluation reward: 356.95
Training network. lr: 0.000198. clip: 0.079273
Iteration 6751: Policy loss: 0.048739. Value loss: 0.095473. Entropy: 0.302572.
Iteration 675

episode: 2722   score: 480.0  epsilon: 1.0    steps: 360  evaluation reward: 358.05
Training network. lr: 0.000198. clip: 0.079117
Iteration 6811: Policy loss: -0.138974. Value loss: 0.170069. Entropy: 0.294463.
Iteration 6812: Policy loss: -0.148642. Value loss: 0.068764. Entropy: 0.293519.
Iteration 6813: Policy loss: -0.144223. Value loss: 0.049739. Entropy: 0.292938.
episode: 2723   score: 270.0  epsilon: 1.0    steps: 200  evaluation reward: 358.65
episode: 2724   score: 285.0  epsilon: 1.0    steps: 848  evaluation reward: 358.75
Training network. lr: 0.000198. clip: 0.079117
Iteration 6814: Policy loss: 0.102817. Value loss: 0.079451. Entropy: 0.288372.
Iteration 6815: Policy loss: 0.106880. Value loss: 0.030938. Entropy: 0.287111.
Iteration 6816: Policy loss: 0.096691. Value loss: 0.022782. Entropy: 0.286680.
episode: 2725   score: 520.0  epsilon: 1.0    steps: 744  evaluation reward: 357.4
Training network. lr: 0.000198. clip: 0.079117
Iteration 6817: Policy loss: -0.284679. V

Training network. lr: 0.000197. clip: 0.078960
Iteration 6877: Policy loss: -0.155723. Value loss: 0.153435. Entropy: 0.294665.
Iteration 6878: Policy loss: -0.154904. Value loss: 0.083133. Entropy: 0.294031.
Iteration 6879: Policy loss: -0.161788. Value loss: 0.058360. Entropy: 0.294632.
Training network. lr: 0.000197. clip: 0.078960
Iteration 6880: Policy loss: 0.584936. Value loss: 0.286450. Entropy: 0.308066.
Iteration 6881: Policy loss: 0.571497. Value loss: 0.079748. Entropy: 0.306601.
Iteration 6882: Policy loss: 0.550194. Value loss: 0.047436. Entropy: 0.306790.
episode: 2746   score: 210.0  epsilon: 1.0    steps: 264  evaluation reward: 359.7
episode: 2747   score: 530.0  epsilon: 1.0    steps: 560  evaluation reward: 362.35
episode: 2748   score: 230.0  epsilon: 1.0    steps: 960  evaluation reward: 362.15
Training network. lr: 0.000197. clip: 0.078960
Iteration 6883: Policy loss: 0.157600. Value loss: 0.124481. Entropy: 0.277915.
Iteration 6884: Policy loss: 0.148046. Value 

Training network. lr: 0.000197. clip: 0.078812
Iteration 6943: Policy loss: 0.191665. Value loss: 0.091963. Entropy: 0.296752.
Iteration 6944: Policy loss: 0.177624. Value loss: 0.031838. Entropy: 0.295516.
Iteration 6945: Policy loss: 0.178449. Value loss: 0.022606. Entropy: 0.293527.
Training network. lr: 0.000197. clip: 0.078812
Iteration 6946: Policy loss: 0.024471. Value loss: 0.082629. Entropy: 0.312686.
Iteration 6947: Policy loss: 0.024501. Value loss: 0.045262. Entropy: 0.312080.
Iteration 6948: Policy loss: 0.019736. Value loss: 0.036130. Entropy: 0.312287.
episode: 2769   score: 565.0  epsilon: 1.0    steps: 872  evaluation reward: 360.75
Training network. lr: 0.000197. clip: 0.078812
Iteration 6949: Policy loss: 0.164586. Value loss: 0.155465. Entropy: 0.302590.
Iteration 6950: Policy loss: 0.142604. Value loss: 0.059512. Entropy: 0.301399.
Iteration 6951: Policy loss: 0.139820. Value loss: 0.039181. Entropy: 0.300479.
episode: 2770   score: 240.0  epsilon: 1.0    steps: 48

episode: 2788   score: 590.0  epsilon: 1.0    steps: 920  evaluation reward: 361.95
Training network. lr: 0.000196. clip: 0.078499
Iteration 7012: Policy loss: -0.209362. Value loss: 0.288209. Entropy: 0.276993.
Iteration 7013: Policy loss: -0.218640. Value loss: 0.134835. Entropy: 0.276444.
Iteration 7014: Policy loss: -0.222215. Value loss: 0.076305. Entropy: 0.275395.
Training network. lr: 0.000196. clip: 0.078499
Iteration 7015: Policy loss: -0.100707. Value loss: 0.072090. Entropy: 0.306639.
Iteration 7016: Policy loss: -0.103475. Value loss: 0.036552. Entropy: 0.308015.
Iteration 7017: Policy loss: -0.109795. Value loss: 0.030492. Entropy: 0.306321.
episode: 2789   score: 445.0  epsilon: 1.0    steps: 904  evaluation reward: 363.55
Training network. lr: 0.000196. clip: 0.078499
Iteration 7018: Policy loss: -0.006016. Value loss: 0.107458. Entropy: 0.305319.
Iteration 7019: Policy loss: -0.011599. Value loss: 0.053414. Entropy: 0.306396.
Iteration 7020: Policy loss: -0.009338. Val

episode: 2807   score: 555.0  epsilon: 1.0    steps: 400  evaluation reward: 369.45
episode: 2808   score: 260.0  epsilon: 1.0    steps: 456  evaluation reward: 369.2
Training network. lr: 0.000196. clip: 0.078352
Iteration 7081: Policy loss: 0.032172. Value loss: 0.093709. Entropy: 0.282434.
Iteration 7082: Policy loss: 0.022628. Value loss: 0.056114. Entropy: 0.282188.
Iteration 7083: Policy loss: 0.015903. Value loss: 0.048120. Entropy: 0.281364.
Training network. lr: 0.000196. clip: 0.078352
Iteration 7084: Policy loss: -0.539555. Value loss: 0.357075. Entropy: 0.311869.
Iteration 7085: Policy loss: -0.538464. Value loss: 0.100751. Entropy: 0.312410.
Iteration 7086: Policy loss: -0.550501. Value loss: 0.049517. Entropy: 0.310798.
Training network. lr: 0.000196. clip: 0.078352
Iteration 7087: Policy loss: -0.340045. Value loss: 0.119589. Entropy: 0.310336.
Iteration 7088: Policy loss: -0.344601. Value loss: 0.046609. Entropy: 0.310619.
Iteration 7089: Policy loss: -0.356533. Value l

Training network. lr: 0.000195. clip: 0.078195
Iteration 7147: Policy loss: 0.227599. Value loss: 0.099031. Entropy: 0.304402.
Iteration 7148: Policy loss: 0.222808. Value loss: 0.026906. Entropy: 0.303328.
Iteration 7149: Policy loss: 0.221247. Value loss: 0.018071. Entropy: 0.303343.
Training network. lr: 0.000195. clip: 0.078195
Iteration 7150: Policy loss: -0.252650. Value loss: 0.272573. Entropy: 0.295057.
Iteration 7151: Policy loss: -0.318451. Value loss: 0.150231. Entropy: 0.295572.
Iteration 7152: Policy loss: -0.320124. Value loss: 0.080366. Entropy: 0.297591.
Training network. lr: 0.000195. clip: 0.078038
Iteration 7153: Policy loss: -0.085727. Value loss: 0.084171. Entropy: 0.306932.
Iteration 7154: Policy loss: -0.088692. Value loss: 0.043137. Entropy: 0.307247.
Iteration 7155: Policy loss: -0.092867. Value loss: 0.033072. Entropy: 0.307521.
Training network. lr: 0.000195. clip: 0.078038
Iteration 7156: Policy loss: -0.223099. Value loss: 0.261468. Entropy: 0.305389.
Itera

Iteration 7217: Policy loss: 0.346490. Value loss: 0.057362. Entropy: 0.291368.
Iteration 7218: Policy loss: 0.328435. Value loss: 0.043714. Entropy: 0.293130.
episode: 2848   score: 530.0  epsilon: 1.0    steps: 352  evaluation reward: 399.3
episode: 2849   score: 765.0  epsilon: 1.0    steps: 888  evaluation reward: 403.5
episode: 2850   score: 335.0  epsilon: 1.0    steps: 1016  evaluation reward: 404.05
Training network. lr: 0.000195. clip: 0.077891
Iteration 7219: Policy loss: 0.240766. Value loss: 0.115573. Entropy: 0.287469.
Iteration 7220: Policy loss: 0.229652. Value loss: 0.046706. Entropy: 0.286925.
Iteration 7221: Policy loss: 0.228321. Value loss: 0.032634. Entropy: 0.286179.
Training network. lr: 0.000195. clip: 0.077891
Iteration 7222: Policy loss: 0.252362. Value loss: 0.157117. Entropy: 0.293494.
Iteration 7223: Policy loss: 0.242450. Value loss: 0.051821. Entropy: 0.292337.
Iteration 7224: Policy loss: 0.233983. Value loss: 0.032264. Entropy: 0.291885.
Training networ

episode: 2868   score: 590.0  epsilon: 1.0    steps: 456  evaluation reward: 410.0
episode: 2869   score: 300.0  epsilon: 1.0    steps: 976  evaluation reward: 407.35
Training network. lr: 0.000194. clip: 0.077734
Iteration 7285: Policy loss: 0.011867. Value loss: 0.125596. Entropy: 0.295697.
Iteration 7286: Policy loss: 0.001233. Value loss: 0.060855. Entropy: 0.295758.
Iteration 7287: Policy loss: -0.008978. Value loss: 0.042850. Entropy: 0.294842.
episode: 2870   score: 390.0  epsilon: 1.0    steps: 112  evaluation reward: 408.85
Training network. lr: 0.000194. clip: 0.077734
Iteration 7288: Policy loss: -0.017072. Value loss: 0.070850. Entropy: 0.282552.
Iteration 7289: Policy loss: -0.027041. Value loss: 0.032025. Entropy: 0.281405.
Iteration 7290: Policy loss: -0.029482. Value loss: 0.024401. Entropy: 0.281779.
Training network. lr: 0.000194. clip: 0.077734
Iteration 7291: Policy loss: -0.065732. Value loss: 0.097807. Entropy: 0.308167.
Iteration 7292: Policy loss: -0.070978. Val

Iteration 7353: Policy loss: -0.004665. Value loss: 0.045966. Entropy: 0.305508.
episode: 2888   score: 100.0  epsilon: 1.0    steps: 296  evaluation reward: 405.5
episode: 2889   score: 360.0  epsilon: 1.0    steps: 592  evaluation reward: 404.65
episode: 2890   score: 625.0  epsilon: 1.0    steps: 1024  evaluation reward: 407.25
Training network. lr: 0.000194. clip: 0.077430
Iteration 7354: Policy loss: -0.004696. Value loss: 0.080390. Entropy: 0.279558.
Iteration 7355: Policy loss: -0.013649. Value loss: 0.040072. Entropy: 0.278886.
Iteration 7356: Policy loss: -0.021146. Value loss: 0.028573. Entropy: 0.278869.
Training network. lr: 0.000194. clip: 0.077430
Iteration 7357: Policy loss: 0.195523. Value loss: 0.121080. Entropy: 0.294406.
Iteration 7358: Policy loss: 0.187997. Value loss: 0.045216. Entropy: 0.294913.
Iteration 7359: Policy loss: 0.183804. Value loss: 0.026885. Entropy: 0.293697.
Training network. lr: 0.000194. clip: 0.077430
Iteration 7360: Policy loss: 0.125820. Valu

Training network. lr: 0.000193. clip: 0.077273
Iteration 7417: Policy loss: 0.137419. Value loss: 0.082484. Entropy: 0.288943.
Iteration 7418: Policy loss: 0.126593. Value loss: 0.045307. Entropy: 0.288427.
Iteration 7419: Policy loss: 0.123945. Value loss: 0.034955. Entropy: 0.287167.
episode: 2913   score: 260.0  epsilon: 1.0    steps: 984  evaluation reward: 394.95
Training network. lr: 0.000193. clip: 0.077273
Iteration 7420: Policy loss: -0.081359. Value loss: 0.149358. Entropy: 0.295733.
Iteration 7421: Policy loss: -0.089301. Value loss: 0.050834. Entropy: 0.295597.
Iteration 7422: Policy loss: -0.082059. Value loss: 0.035196. Entropy: 0.293961.
episode: 2914   score: 620.0  epsilon: 1.0    steps: 752  evaluation reward: 397.05
Training network. lr: 0.000193. clip: 0.077273
Iteration 7423: Policy loss: 0.117637. Value loss: 0.096331. Entropy: 0.284582.
Iteration 7424: Policy loss: 0.104034. Value loss: 0.044323. Entropy: 0.284290.
Iteration 7425: Policy loss: 0.111658. Value los

episode: 2932   score: 530.0  epsilon: 1.0    steps: 384  evaluation reward: 393.45
Training network. lr: 0.000193. clip: 0.077117
Iteration 7486: Policy loss: 0.008866. Value loss: 0.121076. Entropy: 0.297973.
Iteration 7487: Policy loss: -0.000047. Value loss: 0.065670. Entropy: 0.297206.
Iteration 7488: Policy loss: -0.003492. Value loss: 0.043874. Entropy: 0.295767.
Training network. lr: 0.000193. clip: 0.077117
Iteration 7489: Policy loss: -0.288476. Value loss: 0.454407. Entropy: 0.304355.
Iteration 7490: Policy loss: -0.322792. Value loss: 0.322313. Entropy: 0.303883.
Iteration 7491: Policy loss: -0.344718. Value loss: 0.262592. Entropy: 0.304179.
episode: 2933   score: 665.0  epsilon: 1.0    steps: 320  evaluation reward: 396.45
Training network. lr: 0.000193. clip: 0.077117
Iteration 7492: Policy loss: -0.005303. Value loss: 0.091988. Entropy: 0.301673.
Iteration 7493: Policy loss: 0.006960. Value loss: 0.042286. Entropy: 0.301884.
Iteration 7494: Policy loss: -0.014385. Value

Training network. lr: 0.000192. clip: 0.076813
Iteration 7552: Policy loss: 0.122342. Value loss: 0.068044. Entropy: 0.295827.
Iteration 7553: Policy loss: 0.115628. Value loss: 0.033655. Entropy: 0.295757.
Iteration 7554: Policy loss: 0.111707. Value loss: 0.027141. Entropy: 0.297592.
Training network. lr: 0.000192. clip: 0.076813
Iteration 7555: Policy loss: 0.121708. Value loss: 0.130916. Entropy: 0.309687.
Iteration 7556: Policy loss: 0.106241. Value loss: 0.042065. Entropy: 0.308974.
Iteration 7557: Policy loss: 0.105521. Value loss: 0.030844. Entropy: 0.308654.
episode: 2954   score: 265.0  epsilon: 1.0    steps: 216  evaluation reward: 385.7
Training network. lr: 0.000192. clip: 0.076813
Iteration 7558: Policy loss: -0.038054. Value loss: 0.050555. Entropy: 0.303327.
Iteration 7559: Policy loss: -0.046385. Value loss: 0.023374. Entropy: 0.304675.
Iteration 7560: Policy loss: -0.047949. Value loss: 0.019308. Entropy: 0.304196.
Training network. lr: 0.000192. clip: 0.076813
Iterat

Iteration 7619: Policy loss: 0.135495. Value loss: 0.024321. Entropy: 0.300101.
Iteration 7620: Policy loss: 0.133367. Value loss: 0.016388. Entropy: 0.300533.
episode: 2975   score: 240.0  epsilon: 1.0    steps: 600  evaluation reward: 381.9
Training network. lr: 0.000192. clip: 0.076656
Iteration 7621: Policy loss: -0.062099. Value loss: 0.141761. Entropy: 0.300899.
Iteration 7622: Policy loss: -0.056628. Value loss: 0.049939. Entropy: 0.301119.
Iteration 7623: Policy loss: -0.067701. Value loss: 0.033119. Entropy: 0.300559.
episode: 2976   score: 640.0  epsilon: 1.0    steps: 608  evaluation reward: 383.65
Training network. lr: 0.000192. clip: 0.076656
Iteration 7624: Policy loss: -0.073463. Value loss: 0.146394. Entropy: 0.302992.
Iteration 7625: Policy loss: -0.077870. Value loss: 0.055867. Entropy: 0.303716.
Iteration 7626: Policy loss: -0.081284. Value loss: 0.035725. Entropy: 0.303804.
Training network. lr: 0.000192. clip: 0.076656
Iteration 7627: Policy loss: -0.257237. Value 

Iteration 7686: Policy loss: -0.069341. Value loss: 0.037451. Entropy: 0.305934.
episode: 2997   score: 215.0  epsilon: 1.0    steps: 960  evaluation reward: 381.8
Training network. lr: 0.000191. clip: 0.076508
Iteration 7687: Policy loss: 0.348637. Value loss: 0.109411. Entropy: 0.303037.
Iteration 7688: Policy loss: 0.346765. Value loss: 0.050039. Entropy: 0.302599.
Iteration 7689: Policy loss: 0.339914. Value loss: 0.038271. Entropy: 0.302107.
episode: 2998   score: 590.0  epsilon: 1.0    steps: 312  evaluation reward: 386.35
Training network. lr: 0.000191. clip: 0.076508
Iteration 7690: Policy loss: 0.210377. Value loss: 0.055297. Entropy: 0.300057.
Iteration 7691: Policy loss: 0.209120. Value loss: 0.025428. Entropy: 0.300641.
Iteration 7692: Policy loss: 0.202212. Value loss: 0.019516. Entropy: 0.299686.
Training network. lr: 0.000191. clip: 0.076508
Iteration 7693: Policy loss: -0.072336. Value loss: 0.137180. Entropy: 0.307828.
Iteration 7694: Policy loss: -0.066628. Value loss

Iteration 7754: Policy loss: 0.185893. Value loss: 0.044644. Entropy: 0.299027.
Iteration 7755: Policy loss: 0.189429. Value loss: 0.030587. Entropy: 0.299996.
episode: 3017   score: 320.0  epsilon: 1.0    steps: 912  evaluation reward: 391.85
episode: 3018   score: 460.0  epsilon: 1.0    steps: 1024  evaluation reward: 394.15
Training network. lr: 0.000190. clip: 0.076195
Iteration 7756: Policy loss: -0.023968. Value loss: 0.345293. Entropy: 0.301096.
Iteration 7757: Policy loss: -0.059381. Value loss: 0.120123. Entropy: 0.300290.
Iteration 7758: Policy loss: -0.057011. Value loss: 0.060263. Entropy: 0.299949.
episode: 3019   score: 285.0  epsilon: 1.0    steps: 800  evaluation reward: 392.3
Training network. lr: 0.000190. clip: 0.076195
Iteration 7759: Policy loss: -0.066690. Value loss: 0.175662. Entropy: 0.299123.
Iteration 7760: Policy loss: -0.084498. Value loss: 0.066982. Entropy: 0.298786.
Iteration 7761: Policy loss: -0.095640. Value loss: 0.046931. Entropy: 0.298091.
episode:

episode: 3038   score: 905.0  epsilon: 1.0    steps: 104  evaluation reward: 397.65
Training network. lr: 0.000190. clip: 0.076048
Iteration 7822: Policy loss: 0.129552. Value loss: 0.165139. Entropy: 0.299401.
Iteration 7823: Policy loss: 0.112944. Value loss: 0.065121. Entropy: 0.299734.
Iteration 7824: Policy loss: 0.106087. Value loss: 0.045572. Entropy: 0.296274.
episode: 3039   score: 670.0  epsilon: 1.0    steps: 648  evaluation reward: 401.0
episode: 3040   score: 420.0  epsilon: 1.0    steps: 968  evaluation reward: 402.6
Training network. lr: 0.000190. clip: 0.076048
Iteration 7825: Policy loss: 0.093583. Value loss: 0.545675. Entropy: 0.302620.
Iteration 7826: Policy loss: 0.075987. Value loss: 0.296728. Entropy: 0.301728.
Iteration 7827: Policy loss: 0.058051. Value loss: 0.231439. Entropy: 0.301109.
episode: 3041   score: 745.0  epsilon: 1.0    steps: 648  evaluation reward: 407.65
Training network. lr: 0.000190. clip: 0.076048
Iteration 7828: Policy loss: 0.054219. Value 

Iteration 7890: Policy loss: -0.238149. Value loss: 0.078266. Entropy: 0.295286.
Training network. lr: 0.000190. clip: 0.075891
Iteration 7891: Policy loss: -0.004486. Value loss: 0.126283. Entropy: 0.311144.
Iteration 7892: Policy loss: -0.013687. Value loss: 0.053998. Entropy: 0.309958.
Iteration 7893: Policy loss: -0.021384. Value loss: 0.037984. Entropy: 0.309569.
episode: 3058   score: 260.0  epsilon: 1.0    steps: 24  evaluation reward: 393.35
episode: 3059   score: 310.0  epsilon: 1.0    steps: 752  evaluation reward: 393.8
Training network. lr: 0.000190. clip: 0.075891
Iteration 7894: Policy loss: 0.177059. Value loss: 0.108875. Entropy: 0.297962.
Iteration 7895: Policy loss: 0.160508. Value loss: 0.029580. Entropy: 0.297415.
Iteration 7896: Policy loss: 0.154393. Value loss: 0.024139. Entropy: 0.297565.
episode: 3060   score: 470.0  epsilon: 1.0    steps: 376  evaluation reward: 395.6
Training network. lr: 0.000190. clip: 0.075891
Iteration 7897: Policy loss: -0.166478. Value 

Training network. lr: 0.000189. clip: 0.075587
Iteration 7957: Policy loss: -0.446010. Value loss: 0.281024. Entropy: 0.282022.
Iteration 7958: Policy loss: -0.478966. Value loss: 0.097537. Entropy: 0.280673.
Iteration 7959: Policy loss: -0.484525. Value loss: 0.056052. Entropy: 0.281512.
Training network. lr: 0.000189. clip: 0.075587
Iteration 7960: Policy loss: -0.148859. Value loss: 0.236901. Entropy: 0.312493.
Iteration 7961: Policy loss: -0.159272. Value loss: 0.119983. Entropy: 0.310648.
Iteration 7962: Policy loss: -0.167485. Value loss: 0.074548. Entropy: 0.311238.
episode: 3080   score: 565.0  epsilon: 1.0    steps: 528  evaluation reward: 397.55
episode: 3081   score: 545.0  epsilon: 1.0    steps: 1016  evaluation reward: 400.5
Training network. lr: 0.000189. clip: 0.075587
Iteration 7963: Policy loss: 0.190439. Value loss: 0.201872. Entropy: 0.301907.
Iteration 7964: Policy loss: 0.191716. Value loss: 0.081875. Entropy: 0.303825.
Iteration 7965: Policy loss: 0.180167. Value 

episode: 3099   score: 260.0  epsilon: 1.0    steps: 432  evaluation reward: 403.75
episode: 3100   score: 535.0  epsilon: 1.0    steps: 808  evaluation reward: 406.7
Training network. lr: 0.000189. clip: 0.075430
Iteration 8026: Policy loss: -0.177271. Value loss: 0.240016. Entropy: 0.276031.
Iteration 8027: Policy loss: -0.188505. Value loss: 0.146078. Entropy: 0.274464.
Iteration 8028: Policy loss: -0.188411. Value loss: 0.070414. Entropy: 0.275327.
Training network. lr: 0.000189. clip: 0.075430
Iteration 8029: Policy loss: -0.022451. Value loss: 0.071845. Entropy: 0.311658.
Iteration 8030: Policy loss: -0.023845. Value loss: 0.035172. Entropy: 0.310418.
Iteration 8031: Policy loss: -0.025356. Value loss: 0.027159. Entropy: 0.310259.
Training network. lr: 0.000189. clip: 0.075430
Iteration 8032: Policy loss: -0.022985. Value loss: 0.194713. Entropy: 0.305457.
Iteration 8033: Policy loss: -0.027927. Value loss: 0.063671. Entropy: 0.306063.
Iteration 8034: Policy loss: -0.034198. Valu

Training network. lr: 0.000188. clip: 0.075273
Iteration 8095: Policy loss: 0.074608. Value loss: 0.057357. Entropy: 0.298878.
Iteration 8096: Policy loss: 0.068759. Value loss: 0.027321. Entropy: 0.297200.
Iteration 8097: Policy loss: 0.069071. Value loss: 0.020226. Entropy: 0.297020.
Training network. lr: 0.000188. clip: 0.075273
Iteration 8098: Policy loss: 0.084022. Value loss: 0.063624. Entropy: 0.303656.
Iteration 8099: Policy loss: 0.076726. Value loss: 0.029633. Entropy: 0.304064.
Iteration 8100: Policy loss: 0.074319. Value loss: 0.022220. Entropy: 0.304478.
Training network. lr: 0.000188. clip: 0.075126
Iteration 8101: Policy loss: -0.011917. Value loss: 0.068604. Entropy: 0.307349.
Iteration 8102: Policy loss: -0.014368. Value loss: 0.031730. Entropy: 0.305449.
Iteration 8103: Policy loss: -0.020373. Value loss: 0.021859. Entropy: 0.306005.
episode: 3118   score: 420.0  epsilon: 1.0    steps: 464  evaluation reward: 424.2
episode: 3119   score: 345.0  epsilon: 1.0    steps: 

Iteration 8163: Policy loss: 0.088906. Value loss: 0.022173. Entropy: 0.303879.
Training network. lr: 0.000187. clip: 0.074969
Iteration 8164: Policy loss: -0.139226. Value loss: 0.138944. Entropy: 0.304960.
Iteration 8165: Policy loss: -0.143792. Value loss: 0.052421. Entropy: 0.305028.
Iteration 8166: Policy loss: -0.159089. Value loss: 0.035698. Entropy: 0.304667.
episode: 3138   score: 335.0  epsilon: 1.0    steps: 168  evaluation reward: 407.1
Training network. lr: 0.000187. clip: 0.074969
Iteration 8167: Policy loss: -0.037112. Value loss: 0.096240. Entropy: 0.300381.
Iteration 8168: Policy loss: -0.049204. Value loss: 0.037342. Entropy: 0.301485.
Iteration 8169: Policy loss: -0.053664. Value loss: 0.027718. Entropy: 0.299977.
episode: 3139   score: 515.0  epsilon: 1.0    steps: 144  evaluation reward: 405.55
episode: 3140   score: 325.0  epsilon: 1.0    steps: 680  evaluation reward: 404.6
Training network. lr: 0.000187. clip: 0.074969
Iteration 8170: Policy loss: -0.301175. Val

Iteration 8231: Policy loss: 0.110439. Value loss: 0.065386. Entropy: 0.305387.
Iteration 8232: Policy loss: 0.118291. Value loss: 0.045232. Entropy: 0.306815.
episode: 3158   score: 330.0  epsilon: 1.0    steps: 952  evaluation reward: 418.4
Training network. lr: 0.000187. clip: 0.074813
Iteration 8233: Policy loss: 0.092191. Value loss: 0.137542. Entropy: 0.305606.
Iteration 8234: Policy loss: 0.095418. Value loss: 0.058891. Entropy: 0.304420.
Iteration 8235: Policy loss: 0.076364. Value loss: 0.040224. Entropy: 0.304532.
episode: 3159   score: 300.0  epsilon: 1.0    steps: 152  evaluation reward: 418.3
Training network. lr: 0.000187. clip: 0.074813
Iteration 8236: Policy loss: 0.170337. Value loss: 0.132299. Entropy: 0.292757.
Iteration 8237: Policy loss: 0.159038. Value loss: 0.051056. Entropy: 0.289232.
Iteration 8238: Policy loss: 0.163808. Value loss: 0.038552. Entropy: 0.291771.
episode: 3160   score: 360.0  epsilon: 1.0    steps: 896  evaluation reward: 417.2
Training network.

episode: 3179   score: 345.0  epsilon: 1.0    steps: 136  evaluation reward: 411.3
episode: 3180   score: 590.0  epsilon: 1.0    steps: 256  evaluation reward: 411.55
Training network. lr: 0.000187. clip: 0.074665
Iteration 8299: Policy loss: -0.081377. Value loss: 0.118748. Entropy: 0.279404.
Iteration 8300: Policy loss: -0.092090. Value loss: 0.045486. Entropy: 0.277698.
Iteration 8301: Policy loss: -0.097316. Value loss: 0.031992. Entropy: 0.278611.
Training network. lr: 0.000186. clip: 0.074509
Iteration 8302: Policy loss: -0.210529. Value loss: 0.292269. Entropy: 0.307892.
Iteration 8303: Policy loss: -0.217324. Value loss: 0.095973. Entropy: 0.306022.
Iteration 8304: Policy loss: -0.229815. Value loss: 0.055110. Entropy: 0.307936.
episode: 3181   score: 420.0  epsilon: 1.0    steps: 112  evaluation reward: 410.3
Training network. lr: 0.000186. clip: 0.074509
Iteration 8305: Policy loss: 0.174759. Value loss: 0.065657. Entropy: 0.293881.
Iteration 8306: Policy loss: 0.168824. Valu

Iteration 8366: Policy loss: -0.109836. Value loss: 0.064854. Entropy: 0.294619.
Iteration 8367: Policy loss: -0.129317. Value loss: 0.041329. Entropy: 0.294784.
episode: 3200   score: 695.0  epsilon: 1.0    steps: 816  evaluation reward: 410.45
now time :  2019-09-05 22:54:10.166291
episode: 3201   score: 420.0  epsilon: 1.0    steps: 928  evaluation reward: 410.75
Training network. lr: 0.000186. clip: 0.074352
Iteration 8368: Policy loss: 0.079482. Value loss: 0.222376. Entropy: 0.297406.
Iteration 8369: Policy loss: 0.075450. Value loss: 0.085234. Entropy: 0.297638.
Iteration 8370: Policy loss: 0.073203. Value loss: 0.060349. Entropy: 0.298001.
Training network. lr: 0.000186. clip: 0.074352
Iteration 8371: Policy loss: 0.104413. Value loss: 0.083514. Entropy: 0.300699.
Iteration 8372: Policy loss: 0.095598. Value loss: 0.039334. Entropy: 0.300520.
Iteration 8373: Policy loss: 0.093804. Value loss: 0.029117. Entropy: 0.301293.
episode: 3202   score: 285.0  epsilon: 1.0    steps: 688 

Training network. lr: 0.000186. clip: 0.074204
Iteration 8434: Policy loss: -0.035808. Value loss: 0.089350. Entropy: 0.311253.
Iteration 8435: Policy loss: -0.043248. Value loss: 0.041898. Entropy: 0.310090.
Iteration 8436: Policy loss: -0.047401. Value loss: 0.032384. Entropy: 0.309621.
Training network. lr: 0.000186. clip: 0.074204
Iteration 8437: Policy loss: 0.114468. Value loss: 0.103378. Entropy: 0.308389.
Iteration 8438: Policy loss: 0.101475. Value loss: 0.050508. Entropy: 0.306530.
Iteration 8439: Policy loss: 0.097541. Value loss: 0.033853. Entropy: 0.306678.
Training network. lr: 0.000186. clip: 0.074204
Iteration 8440: Policy loss: 0.120889. Value loss: 0.318322. Entropy: 0.313149.
Iteration 8441: Policy loss: 0.100445. Value loss: 0.171184. Entropy: 0.312218.
Iteration 8442: Policy loss: 0.085868. Value loss: 0.121646. Entropy: 0.312472.
episode: 3220   score: 615.0  epsilon: 1.0    steps: 328  evaluation reward: 409.95
episode: 3221   score: 335.0  epsilon: 1.0    steps:

Training network. lr: 0.000185. clip: 0.073891
Iteration 8506: Policy loss: -0.054027. Value loss: 0.054115. Entropy: 0.285917.
Iteration 8507: Policy loss: -0.058750. Value loss: 0.021986. Entropy: 0.287247.
Iteration 8508: Policy loss: -0.060072. Value loss: 0.015355. Entropy: 0.287358.
episode: 3238   score: 240.0  epsilon: 1.0    steps: 24  evaluation reward: 426.0
episode: 3239   score: 345.0  epsilon: 1.0    steps: 432  evaluation reward: 424.3
Training network. lr: 0.000185. clip: 0.073891
Iteration 8509: Policy loss: 0.148966. Value loss: 0.121342. Entropy: 0.288843.
Iteration 8510: Policy loss: 0.139363. Value loss: 0.041992. Entropy: 0.285611.
Iteration 8511: Policy loss: 0.136742. Value loss: 0.030447. Entropy: 0.284698.
episode: 3240   score: 480.0  epsilon: 1.0    steps: 464  evaluation reward: 425.85
Training network. lr: 0.000185. clip: 0.073891
Iteration 8512: Policy loss: 0.101146. Value loss: 0.095475. Entropy: 0.295623.
Iteration 8513: Policy loss: 0.084815. Value lo

Iteration 8569: Policy loss: 0.158421. Value loss: 0.092699. Entropy: 0.275151.
Iteration 8570: Policy loss: 0.144178. Value loss: 0.041441. Entropy: 0.270869.
Iteration 8571: Policy loss: 0.144413. Value loss: 0.034784. Entropy: 0.273083.
Training network. lr: 0.000184. clip: 0.073744
Iteration 8572: Policy loss: 0.133269. Value loss: 0.130821. Entropy: 0.296673.
Iteration 8573: Policy loss: 0.133166. Value loss: 0.057582. Entropy: 0.294739.
Iteration 8574: Policy loss: 0.124649. Value loss: 0.044061. Entropy: 0.296452.
Training network. lr: 0.000184. clip: 0.073744
Iteration 8575: Policy loss: 0.021411. Value loss: 0.148048. Entropy: 0.304727.
Iteration 8576: Policy loss: 0.009868. Value loss: 0.040683. Entropy: 0.305428.
Iteration 8577: Policy loss: 0.009827. Value loss: 0.029352. Entropy: 0.303988.
episode: 3263   score: 180.0  epsilon: 1.0    steps: 312  evaluation reward: 403.15
Training network. lr: 0.000184. clip: 0.073744
Iteration 8578: Policy loss: -0.346982. Value loss: 0.1

episode: 3283   score: 315.0  epsilon: 1.0    steps: 384  evaluation reward: 392.5
Training network. lr: 0.000184. clip: 0.073587
Iteration 8638: Policy loss: 0.204191. Value loss: 0.079368. Entropy: 0.299334.
Iteration 8639: Policy loss: 0.194668. Value loss: 0.025025. Entropy: 0.296375.
Iteration 8640: Policy loss: 0.187630. Value loss: 0.019584. Entropy: 0.296034.
episode: 3284   score: 285.0  epsilon: 1.0    steps: 752  evaluation reward: 391.4
Training network. lr: 0.000184. clip: 0.073587
Iteration 8641: Policy loss: 0.149945. Value loss: 0.096638. Entropy: 0.294100.
Iteration 8642: Policy loss: 0.149362. Value loss: 0.039739. Entropy: 0.294728.
Iteration 8643: Policy loss: 0.128432. Value loss: 0.029361. Entropy: 0.292730.
episode: 3285   score: 420.0  epsilon: 1.0    steps: 544  evaluation reward: 390.15
Training network. lr: 0.000184. clip: 0.073587
Iteration 8644: Policy loss: 0.011258. Value loss: 0.143167. Entropy: 0.292820.
Iteration 8645: Policy loss: -0.002648. Value los

Iteration 8706: Policy loss: -0.155368. Value loss: 0.038916. Entropy: 0.306182.
episode: 3303   score: 575.0  epsilon: 1.0    steps: 560  evaluation reward: 398.85
episode: 3304   score: 590.0  epsilon: 1.0    steps: 848  evaluation reward: 398.55
episode: 3305   score: 640.0  epsilon: 1.0    steps: 848  evaluation reward: 398.25
Training network. lr: 0.000183. clip: 0.073283
Iteration 8707: Policy loss: 0.319379. Value loss: 0.476497. Entropy: 0.283799.
Iteration 8708: Policy loss: 0.307521. Value loss: 0.177159. Entropy: 0.283548.
Iteration 8709: Policy loss: 0.287845. Value loss: 0.104175. Entropy: 0.283009.
Training network. lr: 0.000183. clip: 0.073283
Iteration 8710: Policy loss: 0.171026. Value loss: 0.121373. Entropy: 0.308229.
Iteration 8711: Policy loss: 0.160287. Value loss: 0.055526. Entropy: 0.307027.
Iteration 8712: Policy loss: 0.159322. Value loss: 0.039077. Entropy: 0.306176.
episode: 3306   score: 330.0  epsilon: 1.0    steps: 224  evaluation reward: 399.4
Training n

Iteration 8774: Policy loss: 0.155994. Value loss: 0.139115. Entropy: 0.308550.
Iteration 8775: Policy loss: 0.153892. Value loss: 0.107733. Entropy: 0.307948.
Training network. lr: 0.000183. clip: 0.073126
Iteration 8776: Policy loss: 0.194582. Value loss: 0.158297. Entropy: 0.309771.
Iteration 8777: Policy loss: 0.187848. Value loss: 0.053700. Entropy: 0.308903.
Iteration 8778: Policy loss: 0.173247. Value loss: 0.033671. Entropy: 0.309861.
episode: 3323   score: 680.0  epsilon: 1.0    steps: 488  evaluation reward: 400.5
episode: 3324   score: 650.0  epsilon: 1.0    steps: 576  evaluation reward: 401.3
Training network. lr: 0.000183. clip: 0.073126
Iteration 8779: Policy loss: 0.196234. Value loss: 0.090946. Entropy: 0.283164.
Iteration 8780: Policy loss: 0.183493. Value loss: 0.037459. Entropy: 0.282880.
Iteration 8781: Policy loss: 0.182057. Value loss: 0.026068. Entropy: 0.281773.
Training network. lr: 0.000183. clip: 0.073126
Iteration 8782: Policy loss: 0.097375. Value loss: 0.

Iteration 8843: Policy loss: 0.708038. Value loss: 0.062340. Entropy: 0.292227.
Iteration 8844: Policy loss: 0.702539. Value loss: 0.040703. Entropy: 0.291329.
episode: 3342   score: 605.0  epsilon: 1.0    steps: 928  evaluation reward: 407.05
Training network. lr: 0.000182. clip: 0.072969
Iteration 8845: Policy loss: -0.039755. Value loss: 0.178662. Entropy: 0.306557.
Iteration 8846: Policy loss: -0.042503. Value loss: 0.077675. Entropy: 0.306480.
Iteration 8847: Policy loss: -0.054366. Value loss: 0.054618. Entropy: 0.306741.
episode: 3343   score: 240.0  epsilon: 1.0    steps: 152  evaluation reward: 406.3
episode: 3344   score: 525.0  epsilon: 1.0    steps: 392  evaluation reward: 408.7
Training network. lr: 0.000182. clip: 0.072969
Iteration 8848: Policy loss: 0.082882. Value loss: 0.115004. Entropy: 0.274775.
Iteration 8849: Policy loss: 0.082660. Value loss: 0.034898. Entropy: 0.274215.
Iteration 8850: Policy loss: 0.076410. Value loss: 0.027609. Entropy: 0.274256.
Training netw

episode: 3366   score: 255.0  epsilon: 1.0    steps: 656  evaluation reward: 429.5
Training network. lr: 0.000182. clip: 0.072665
Iteration 8908: Policy loss: 0.128562. Value loss: 0.121771. Entropy: 0.295378.
Iteration 8909: Policy loss: 0.133038. Value loss: 0.057314. Entropy: 0.294620.
Iteration 8910: Policy loss: 0.118852. Value loss: 0.045557. Entropy: 0.297706.
episode: 3367   score: 150.0  epsilon: 1.0    steps: 104  evaluation reward: 428.4
Training network. lr: 0.000182. clip: 0.072665
Iteration 8911: Policy loss: 0.134449. Value loss: 0.113982. Entropy: 0.308757.
Iteration 8912: Policy loss: 0.125505. Value loss: 0.057828. Entropy: 0.309134.
Iteration 8913: Policy loss: 0.124272. Value loss: 0.042007. Entropy: 0.308354.
Training network. lr: 0.000182. clip: 0.072665
Iteration 8914: Policy loss: 0.031610. Value loss: 0.056253. Entropy: 0.311280.
Iteration 8915: Policy loss: 0.026415. Value loss: 0.027817. Entropy: 0.311796.
Iteration 8916: Policy loss: 0.025104. Value loss: 0.

Iteration 8974: Policy loss: -0.026612. Value loss: 0.367270. Entropy: 0.295769.
Iteration 8975: Policy loss: -0.027890. Value loss: 0.184733. Entropy: 0.298006.
Iteration 8976: Policy loss: -0.034087. Value loss: 0.053523. Entropy: 0.294430.
episode: 3388   score: 120.0  epsilon: 1.0    steps: 944  evaluation reward: 433.65
Training network. lr: 0.000181. clip: 0.072509
Iteration 8977: Policy loss: 0.236921. Value loss: 0.088097. Entropy: 0.313224.
Iteration 8978: Policy loss: 0.233802. Value loss: 0.032490. Entropy: 0.311851.
Iteration 8979: Policy loss: 0.233211. Value loss: 0.024596. Entropy: 0.311761.
episode: 3389   score: 210.0  epsilon: 1.0    steps: 104  evaluation reward: 432.1
Training network. lr: 0.000181. clip: 0.072509
Iteration 8980: Policy loss: -0.211281. Value loss: 0.106133. Entropy: 0.287294.
Iteration 8981: Policy loss: -0.224252. Value loss: 0.057001. Entropy: 0.287787.
Iteration 8982: Policy loss: -0.219289. Value loss: 0.040671. Entropy: 0.288460.
Training netw

Iteration 9041: Policy loss: 0.315213. Value loss: 0.049653. Entropy: 0.290747.
Iteration 9042: Policy loss: 0.305201. Value loss: 0.034418. Entropy: 0.290148.
episode: 3409   score: 395.0  epsilon: 1.0    steps: 408  evaluation reward: 419.9
Training network. lr: 0.000181. clip: 0.072361
Iteration 9043: Policy loss: 0.097858. Value loss: 0.081917. Entropy: 0.302261.
Iteration 9044: Policy loss: 0.093058. Value loss: 0.031290. Entropy: 0.302096.
Iteration 9045: Policy loss: 0.090953. Value loss: 0.024286. Entropy: 0.300918.
episode: 3410   score: 320.0  epsilon: 1.0    steps: 104  evaluation reward: 419.9
episode: 3411   score: 240.0  epsilon: 1.0    steps: 384  evaluation reward: 415.5
Training network. lr: 0.000181. clip: 0.072361
Iteration 9046: Policy loss: 0.022839. Value loss: 0.096332. Entropy: 0.288990.
Iteration 9047: Policy loss: 0.016562. Value loss: 0.041964. Entropy: 0.289617.
Iteration 9048: Policy loss: 0.010615. Value loss: 0.033623. Entropy: 0.288650.
episode: 3412   s

Iteration 9110: Policy loss: 0.140080. Value loss: 0.064155. Entropy: 0.308713.
Iteration 9111: Policy loss: 0.133173. Value loss: 0.045943. Entropy: 0.309421.
episode: 3428   score: 465.0  epsilon: 1.0    steps: 280  evaluation reward: 401.1
Training network. lr: 0.000180. clip: 0.072048
Iteration 9112: Policy loss: -0.187614. Value loss: 0.248562. Entropy: 0.300704.
Iteration 9113: Policy loss: -0.218142. Value loss: 0.121307. Entropy: 0.301329.
Iteration 9114: Policy loss: -0.198360. Value loss: 0.065854. Entropy: 0.301215.
episode: 3429   score: 425.0  epsilon: 1.0    steps: 376  evaluation reward: 401.2
Training network. lr: 0.000180. clip: 0.072048
Iteration 9115: Policy loss: 0.081358. Value loss: 0.047885. Entropy: 0.297163.
Iteration 9116: Policy loss: 0.075991. Value loss: 0.027994. Entropy: 0.297536.
Iteration 9117: Policy loss: 0.074409. Value loss: 0.022991. Entropy: 0.297480.
Training network. lr: 0.000180. clip: 0.072048
Iteration 9118: Policy loss: -0.155658. Value loss

Training network. lr: 0.000180. clip: 0.071900
Iteration 9178: Policy loss: 0.137938. Value loss: 0.075997. Entropy: 0.310260.
Iteration 9179: Policy loss: 0.138755. Value loss: 0.030090. Entropy: 0.308499.
Iteration 9180: Policy loss: 0.129032. Value loss: 0.024709. Entropy: 0.308154.
Training network. lr: 0.000180. clip: 0.071900
Iteration 9181: Policy loss: 0.054530. Value loss: 0.078729. Entropy: 0.311402.
Iteration 9182: Policy loss: 0.047211. Value loss: 0.038836. Entropy: 0.312044.
Iteration 9183: Policy loss: 0.047922. Value loss: 0.026404. Entropy: 0.312528.
episode: 3449   score: 695.0  epsilon: 1.0    steps: 248  evaluation reward: 398.3
episode: 3450   score: 365.0  epsilon: 1.0    steps: 736  evaluation reward: 396.95
Training network. lr: 0.000180. clip: 0.071900
Iteration 9184: Policy loss: -0.367985. Value loss: 0.317180. Entropy: 0.284576.
Iteration 9185: Policy loss: -0.386005. Value loss: 0.251440. Entropy: 0.287075.
Iteration 9186: Policy loss: -0.384357. Value loss

Iteration 9245: Policy loss: 0.088624. Value loss: 0.037074. Entropy: 0.314765.
Iteration 9246: Policy loss: 0.082806. Value loss: 0.024842. Entropy: 0.314070.
episode: 3470   score: 245.0  epsilon: 1.0    steps: 696  evaluation reward: 402.05
Training network. lr: 0.000179. clip: 0.071744
Iteration 9247: Policy loss: 0.256859. Value loss: 0.108182. Entropy: 0.310412.
Iteration 9248: Policy loss: 0.248103. Value loss: 0.049890. Entropy: 0.308705.
Iteration 9249: Policy loss: 0.252883. Value loss: 0.034833. Entropy: 0.308874.
episode: 3471   score: 260.0  epsilon: 1.0    steps: 792  evaluation reward: 402.1
Training network. lr: 0.000179. clip: 0.071744
Iteration 9250: Policy loss: 0.120581. Value loss: 0.070613. Entropy: 0.310181.
Iteration 9251: Policy loss: 0.113713. Value loss: 0.025204. Entropy: 0.310736.
Iteration 9252: Policy loss: 0.110421. Value loss: 0.020670. Entropy: 0.310502.
Training network. lr: 0.000179. clip: 0.071587
Iteration 9253: Policy loss: -0.214408. Value loss: 

Iteration 9315: Policy loss: -0.131731. Value loss: 0.046653. Entropy: 0.302986.
Training network. lr: 0.000179. clip: 0.071440
Iteration 9316: Policy loss: 0.267907. Value loss: 0.153962. Entropy: 0.312309.
Iteration 9317: Policy loss: 0.253072. Value loss: 0.067228. Entropy: 0.312488.
Iteration 9318: Policy loss: 0.247714. Value loss: 0.049729. Entropy: 0.311502.
episode: 3488   score: 905.0  epsilon: 1.0    steps: 136  evaluation reward: 423.2
Training network. lr: 0.000179. clip: 0.071440
Iteration 9319: Policy loss: -0.318576. Value loss: 0.448247. Entropy: 0.309767.
Iteration 9320: Policy loss: -0.365008. Value loss: 0.141469. Entropy: 0.309440.
Iteration 9321: Policy loss: -0.358128. Value loss: 0.089308. Entropy: 0.309775.
Training network. lr: 0.000179. clip: 0.071440
Iteration 9322: Policy loss: -0.075710. Value loss: 0.164859. Entropy: 0.310521.
Iteration 9323: Policy loss: -0.096776. Value loss: 0.064089. Entropy: 0.311725.
Iteration 9324: Policy loss: -0.101248. Value loss

Iteration 9383: Policy loss: -0.109646. Value loss: 0.054569. Entropy: 0.309734.
Iteration 9384: Policy loss: -0.113834. Value loss: 0.040265. Entropy: 0.309479.
episode: 3508   score: 345.0  epsilon: 1.0    steps: 416  evaluation reward: 447.75
Training network. lr: 0.000178. clip: 0.071283
Iteration 9385: Policy loss: -0.033499. Value loss: 0.299844. Entropy: 0.302516.
Iteration 9386: Policy loss: -0.036479. Value loss: 0.073143. Entropy: 0.302286.
Iteration 9387: Policy loss: -0.050200. Value loss: 0.048189. Entropy: 0.304495.
Training network. lr: 0.000178. clip: 0.071283
Iteration 9388: Policy loss: 0.034461. Value loss: 0.107575. Entropy: 0.310168.
Iteration 9389: Policy loss: 0.022723. Value loss: 0.037980. Entropy: 0.310206.
Iteration 9390: Policy loss: 0.020967. Value loss: 0.027984. Entropy: 0.311427.
Training network. lr: 0.000178. clip: 0.071283
Iteration 9391: Policy loss: 0.070536. Value loss: 0.123691. Entropy: 0.307458.
Iteration 9392: Policy loss: 0.062797. Value loss:

Iteration 9451: Policy loss: 0.104270. Value loss: 0.098574. Entropy: 0.299051.
Iteration 9452: Policy loss: 0.094912. Value loss: 0.033927. Entropy: 0.298160.
Iteration 9453: Policy loss: 0.090340. Value loss: 0.025729. Entropy: 0.297196.
episode: 3529   score: 265.0  epsilon: 1.0    steps: 32  evaluation reward: 448.35
Training network. lr: 0.000177. clip: 0.070979
Iteration 9454: Policy loss: 0.289625. Value loss: 0.102852. Entropy: 0.301639.
Iteration 9455: Policy loss: 0.275778. Value loss: 0.044812. Entropy: 0.298462.
Iteration 9456: Policy loss: 0.274806. Value loss: 0.034626. Entropy: 0.298571.
Training network. lr: 0.000177. clip: 0.070979
Iteration 9457: Policy loss: 0.044759. Value loss: 0.047673. Entropy: 0.309677.
Iteration 9458: Policy loss: 0.044914. Value loss: 0.021047. Entropy: 0.311171.
Iteration 9459: Policy loss: 0.040636. Value loss: 0.017388. Entropy: 0.311080.
Training network. lr: 0.000177. clip: 0.070979
Iteration 9460: Policy loss: 0.118996. Value loss: 0.110

Training network. lr: 0.000177. clip: 0.070822
Iteration 9520: Policy loss: 0.028780. Value loss: 0.063239. Entropy: 0.307616.
Iteration 9521: Policy loss: 0.024760. Value loss: 0.029967. Entropy: 0.306473.
Iteration 9522: Policy loss: 0.022655. Value loss: 0.021949. Entropy: 0.306425.
episode: 3549   score: 470.0  epsilon: 1.0    steps: 392  evaluation reward: 432.3
Training network. lr: 0.000177. clip: 0.070822
Iteration 9523: Policy loss: -0.018428. Value loss: 0.060581. Entropy: 0.297859.
Iteration 9524: Policy loss: -0.021563. Value loss: 0.024210. Entropy: 0.297489.
Iteration 9525: Policy loss: -0.026163. Value loss: 0.016913. Entropy: 0.296941.
Training network. lr: 0.000177. clip: 0.070822
Iteration 9526: Policy loss: -0.158901. Value loss: 0.345596. Entropy: 0.307302.
Iteration 9527: Policy loss: -0.164494. Value loss: 0.090527. Entropy: 0.306610.
Iteration 9528: Policy loss: -0.164566. Value loss: 0.047730. Entropy: 0.305716.
episode: 3550   score: 390.0  epsilon: 1.0    step

Iteration 9585: Policy loss: -0.418398. Value loss: 0.038000. Entropy: 0.291531.
Training network. lr: 0.000177. clip: 0.070665
Iteration 9586: Policy loss: 0.085737. Value loss: 0.143900. Entropy: 0.311439.
Iteration 9587: Policy loss: 0.068286. Value loss: 0.043766. Entropy: 0.309926.
Iteration 9588: Policy loss: 0.080637. Value loss: 0.028459. Entropy: 0.310100.
episode: 3572   score: 225.0  epsilon: 1.0    steps: 368  evaluation reward: 408.15
episode: 3573   score: 180.0  epsilon: 1.0    steps: 496  evaluation reward: 407.1
Training network. lr: 0.000177. clip: 0.070665
Iteration 9589: Policy loss: -0.032100. Value loss: 0.177907. Entropy: 0.299032.
Iteration 9590: Policy loss: -0.053029. Value loss: 0.058799. Entropy: 0.298223.
Iteration 9591: Policy loss: -0.062661. Value loss: 0.033012. Entropy: 0.299332.
episode: 3574   score: 180.0  epsilon: 1.0    steps: 960  evaluation reward: 402.5
episode: 3575   score: 800.0  epsilon: 1.0    steps: 1000  evaluation reward: 406.9
Training

Iteration 9651: Policy loss: 0.177717. Value loss: 0.028066. Entropy: 0.304425.
episode: 3595   score: 120.0  epsilon: 1.0    steps: 376  evaluation reward: 373.05
episode: 3596   score: 455.0  epsilon: 1.0    steps: 512  evaluation reward: 368.75
episode: 3597   score: 610.0  epsilon: 1.0    steps: 848  evaluation reward: 364.9
Training network. lr: 0.000176. clip: 0.070361
Iteration 9652: Policy loss: 0.251390. Value loss: 0.113790. Entropy: 0.280469.
Iteration 9653: Policy loss: 0.241876. Value loss: 0.047066. Entropy: 0.279815.
Iteration 9654: Policy loss: 0.239541. Value loss: 0.036431. Entropy: 0.279162.
episode: 3598   score: 270.0  epsilon: 1.0    steps: 400  evaluation reward: 360.95
Training network. lr: 0.000176. clip: 0.070361
Iteration 9655: Policy loss: 0.038172. Value loss: 0.112231. Entropy: 0.297351.
Iteration 9656: Policy loss: 0.027721. Value loss: 0.056840. Entropy: 0.300725.
Iteration 9657: Policy loss: 0.019212. Value loss: 0.041554. Entropy: 0.299219.
episode: 35

Iteration 9715: Policy loss: 0.054275. Value loss: 0.096761. Entropy: 0.305897.
Iteration 9716: Policy loss: 0.046756. Value loss: 0.039254. Entropy: 0.306012.
Iteration 9717: Policy loss: 0.039965. Value loss: 0.028999. Entropy: 0.305465.
episode: 3619   score: 485.0  epsilon: 1.0    steps: 736  evaluation reward: 347.95
episode: 3620   score: 275.0  epsilon: 1.0    steps: 736  evaluation reward: 347.55
episode: 3621   score: 670.0  epsilon: 1.0    steps: 968  evaluation reward: 351.65
Training network. lr: 0.000176. clip: 0.070205
Iteration 9718: Policy loss: 0.076058. Value loss: 0.114413. Entropy: 0.304160.
Iteration 9719: Policy loss: 0.066324. Value loss: 0.058006. Entropy: 0.303760.
Iteration 9720: Policy loss: 0.066401. Value loss: 0.040121. Entropy: 0.303566.
Training network. lr: 0.000176. clip: 0.070205
Iteration 9721: Policy loss: -0.010281. Value loss: 0.206184. Entropy: 0.312300.
Iteration 9722: Policy loss: -0.025743. Value loss: 0.035659. Entropy: 0.313985.
Iteration 97

Iteration 9785: Policy loss: 0.046116. Value loss: 0.038341. Entropy: 0.311626.
Iteration 9786: Policy loss: 0.045701. Value loss: 0.028605. Entropy: 0.312195.
episode: 3637   score: 500.0  epsilon: 1.0    steps: 312  evaluation reward: 352.15
Training network. lr: 0.000175. clip: 0.070057
Iteration 9787: Policy loss: -0.113680. Value loss: 0.131828. Entropy: 0.306410.
Iteration 9788: Policy loss: -0.126647. Value loss: 0.066892. Entropy: 0.306880.
Iteration 9789: Policy loss: -0.122296. Value loss: 0.049507. Entropy: 0.305855.
episode: 3638   score: 700.0  epsilon: 1.0    steps: 248  evaluation reward: 356.5
episode: 3639   score: 365.0  epsilon: 1.0    steps: 1016  evaluation reward: 358.05
Training network. lr: 0.000175. clip: 0.070057
Iteration 9790: Policy loss: -0.550761. Value loss: 0.643254. Entropy: 0.305490.
Iteration 9791: Policy loss: -0.569718. Value loss: 0.435715. Entropy: 0.307076.
Iteration 9792: Policy loss: -0.514030. Value loss: 0.303698. Entropy: 0.305993.
episode:

Iteration 9852: Policy loss: 0.290766. Value loss: 0.057390. Entropy: 0.305560.
Training network. lr: 0.000174. clip: 0.069744
Iteration 9853: Policy loss: 0.076630. Value loss: 0.092515. Entropy: 0.307496.
Iteration 9854: Policy loss: 0.066538. Value loss: 0.041750. Entropy: 0.307500.
Iteration 9855: Policy loss: 0.063941. Value loss: 0.029234. Entropy: 0.306309.
episode: 3658   score: 590.0  epsilon: 1.0    steps: 96  evaluation reward: 378.55
Training network. lr: 0.000174. clip: 0.069744
Iteration 9856: Policy loss: 0.036871. Value loss: 0.054985. Entropy: 0.304965.
Iteration 9857: Policy loss: 0.036580. Value loss: 0.023249. Entropy: 0.304023.
Iteration 9858: Policy loss: 0.027809. Value loss: 0.016075. Entropy: 0.304854.
episode: 3659   score: 210.0  epsilon: 1.0    steps: 656  evaluation reward: 378.55
episode: 3660   score: 285.0  epsilon: 1.0    steps: 928  evaluation reward: 375.75
Training network. lr: 0.000174. clip: 0.069744
Iteration 9859: Policy loss: 0.154271. Value los

Training network. lr: 0.000174. clip: 0.069596
Iteration 9919: Policy loss: 0.177018. Value loss: 0.111635. Entropy: 0.302461.
Iteration 9920: Policy loss: 0.174383. Value loss: 0.034369. Entropy: 0.302142.
Iteration 9921: Policy loss: 0.172399. Value loss: 0.022944. Entropy: 0.300998.
Training network. lr: 0.000174. clip: 0.069596
Iteration 9922: Policy loss: -0.107175. Value loss: 0.209956. Entropy: 0.310845.
Iteration 9923: Policy loss: -0.111131. Value loss: 0.052001. Entropy: 0.311165.
Iteration 9924: Policy loss: -0.133107. Value loss: 0.034565. Entropy: 0.310897.
episode: 3680   score: 820.0  epsilon: 1.0    steps: 880  evaluation reward: 398.35
Training network. lr: 0.000174. clip: 0.069596
Iteration 9925: Policy loss: 0.108236. Value loss: 0.166619. Entropy: 0.308123.
Iteration 9926: Policy loss: 0.096767. Value loss: 0.062794. Entropy: 0.308342.
Iteration 9927: Policy loss: 0.093394. Value loss: 0.044296. Entropy: 0.307436.
episode: 3681   score: 240.0  epsilon: 1.0    steps:

Training network. lr: 0.000174. clip: 0.069440
Iteration 9988: Policy loss: 0.116272. Value loss: 0.399381. Entropy: 0.307599.
Iteration 9989: Policy loss: 0.102691. Value loss: 0.097333. Entropy: 0.306687.
Iteration 9990: Policy loss: 0.086384. Value loss: 0.058653. Entropy: 0.306675.
Training network. lr: 0.000174. clip: 0.069440
Iteration 9991: Policy loss: -0.190750. Value loss: 0.416521. Entropy: 0.309118.
Iteration 9992: Policy loss: -0.220763. Value loss: 0.237532. Entropy: 0.308138.
Iteration 9993: Policy loss: -0.233064. Value loss: 0.156933. Entropy: 0.307738.
episode: 3699   score: 180.0  epsilon: 1.0    steps: 80  evaluation reward: 400.9
episode: 3700   score: 870.0  epsilon: 1.0    steps: 608  evaluation reward: 405.4
Training network. lr: 0.000174. clip: 0.069440
Iteration 9994: Policy loss: -0.015835. Value loss: 0.201874. Entropy: 0.289550.
Iteration 9995: Policy loss: -0.017647. Value loss: 0.081187. Entropy: 0.285005.
Iteration 9996: Policy loss: -0.024077. Value los

Training network. lr: 0.000173. clip: 0.069136
Iteration 10051: Policy loss: 0.034174. Value loss: 0.053370. Entropy: 0.310823.
Iteration 10052: Policy loss: 0.030410. Value loss: 0.029600. Entropy: 0.311139.
Iteration 10053: Policy loss: 0.029758. Value loss: 0.023448. Entropy: 0.310131.
Training network. lr: 0.000173. clip: 0.069136
Iteration 10054: Policy loss: 0.091443. Value loss: 0.166064. Entropy: 0.306769.
Iteration 10055: Policy loss: 0.083860. Value loss: 0.073983. Entropy: 0.306727.
Iteration 10056: Policy loss: 0.078636. Value loss: 0.056154. Entropy: 0.306041.
episode: 3724   score: 240.0  epsilon: 1.0    steps: 80  evaluation reward: 410.1
episode: 3725   score: 315.0  epsilon: 1.0    steps: 752  evaluation reward: 408.65
Training network. lr: 0.000173. clip: 0.069136
Iteration 10057: Policy loss: 0.102531. Value loss: 0.079356. Entropy: 0.296921.
Iteration 10058: Policy loss: 0.096990. Value loss: 0.033257. Entropy: 0.295212.
Iteration 10059: Policy loss: 0.095323. Value

Iteration 10115: Policy loss: 0.128178. Value loss: 0.056150. Entropy: 0.300231.
Iteration 10116: Policy loss: 0.126853. Value loss: 0.043459. Entropy: 0.300682.
episode: 3748   score: 125.0  epsilon: 1.0    steps: 664  evaluation reward: 373.95
episode: 3749   score: 260.0  epsilon: 1.0    steps: 720  evaluation reward: 369.55
Training network. lr: 0.000172. clip: 0.068979
Iteration 10117: Policy loss: 0.024572. Value loss: 0.077451. Entropy: 0.297141.
Iteration 10118: Policy loss: 0.025352. Value loss: 0.037855. Entropy: 0.298461.
Iteration 10119: Policy loss: 0.017933. Value loss: 0.028719. Entropy: 0.297371.
episode: 3750   score: 390.0  epsilon: 1.0    steps: 880  evaluation reward: 364.75
Training network. lr: 0.000172. clip: 0.068979
Iteration 10120: Policy loss: -0.005067. Value loss: 0.119553. Entropy: 0.301082.
Iteration 10121: Policy loss: -0.006983. Value loss: 0.046181. Entropy: 0.300912.
Iteration 10122: Policy loss: -0.016669. Value loss: 0.032895. Entropy: 0.299860.
Tra

Iteration 10175: Policy loss: 0.003524. Value loss: 0.023721. Entropy: 0.304315.
Iteration 10176: Policy loss: 0.003952. Value loss: 0.016150. Entropy: 0.302063.
Training network. lr: 0.000172. clip: 0.068822
Iteration 10177: Policy loss: 0.322706. Value loss: 0.163446. Entropy: 0.304910.
Iteration 10178: Policy loss: 0.319198. Value loss: 0.072038. Entropy: 0.303974.
Iteration 10179: Policy loss: 0.313516. Value loss: 0.056713. Entropy: 0.303482.
episode: 3776   score: 220.0  epsilon: 1.0    steps: 488  evaluation reward: 333.7
episode: 3777   score: 330.0  epsilon: 1.0    steps: 600  evaluation reward: 330.85
Training network. lr: 0.000172. clip: 0.068822
Iteration 10180: Policy loss: 0.131390. Value loss: 0.063235. Entropy: 0.296225.
Iteration 10181: Policy loss: 0.125291. Value loss: 0.026523. Entropy: 0.298073.
Iteration 10182: Policy loss: 0.120299. Value loss: 0.018768. Entropy: 0.295448.
episode: 3778   score: 395.0  epsilon: 1.0    steps: 640  evaluation reward: 329.15
Trainin

Training network. lr: 0.000172. clip: 0.068675
Iteration 10240: Policy loss: -0.073914. Value loss: 0.145959. Entropy: 0.310926.
Iteration 10241: Policy loss: -0.077488. Value loss: 0.052766. Entropy: 0.309034.
Iteration 10242: Policy loss: -0.082363. Value loss: 0.036810. Entropy: 0.309241.
Training network. lr: 0.000172. clip: 0.068675
Iteration 10243: Policy loss: -0.020219. Value loss: 0.176193. Entropy: 0.309004.
Iteration 10244: Policy loss: -0.035598. Value loss: 0.079773. Entropy: 0.307208.
Iteration 10245: Policy loss: -0.033312. Value loss: 0.052380. Entropy: 0.307894.
episode: 3799   score: 1005.0  epsilon: 1.0    steps: 616  evaluation reward: 319.95
Training network. lr: 0.000172. clip: 0.068675
Iteration 10246: Policy loss: 0.141963. Value loss: 0.159139. Entropy: 0.299453.
Iteration 10247: Policy loss: 0.132312. Value loss: 0.066790. Entropy: 0.298243.
Iteration 10248: Policy loss: 0.140485. Value loss: 0.046258. Entropy: 0.297220.
episode: 3800   score: 355.0  epsilon: 

Iteration 10305: Policy loss: 0.163178. Value loss: 0.026684. Entropy: 0.292359.
episode: 3821   score: 480.0  epsilon: 1.0    steps: 208  evaluation reward: 307.8
Training network. lr: 0.000171. clip: 0.068361
Iteration 10306: Policy loss: 0.188497. Value loss: 0.065720. Entropy: 0.280644.
Iteration 10307: Policy loss: 0.180294. Value loss: 0.021471. Entropy: 0.280748.
Iteration 10308: Policy loss: 0.177785. Value loss: 0.014887. Entropy: 0.279838.
episode: 3822   score: 180.0  epsilon: 1.0    steps: 88  evaluation reward: 305.85
Training network. lr: 0.000171. clip: 0.068361
Iteration 10309: Policy loss: -0.399014. Value loss: 0.404181. Entropy: 0.291281.
Iteration 10310: Policy loss: -0.418114. Value loss: 0.126032. Entropy: 0.289181.
Iteration 10311: Policy loss: -0.412260. Value loss: 0.075666. Entropy: 0.290013.
episode: 3823   score: 450.0  epsilon: 1.0    steps: 224  evaluation reward: 307.05
episode: 3824   score: 180.0  epsilon: 1.0    steps: 872  evaluation reward: 306.45
Tr

episode: 3849   score: 210.0  epsilon: 1.0    steps: 208  evaluation reward: 302.0
Training network. lr: 0.000171. clip: 0.068214
Iteration 10366: Policy loss: 0.251670. Value loss: 0.079540. Entropy: 0.296986.
Iteration 10367: Policy loss: 0.249114. Value loss: 0.038849. Entropy: 0.296894.
Iteration 10368: Policy loss: 0.244283. Value loss: 0.031583. Entropy: 0.295254.
Training network. lr: 0.000171. clip: 0.068214
Iteration 10369: Policy loss: 0.033246. Value loss: 0.079789. Entropy: 0.304586.
Iteration 10370: Policy loss: 0.028666. Value loss: 0.032232. Entropy: 0.304322.
Iteration 10371: Policy loss: 0.017095. Value loss: 0.026123. Entropy: 0.303544.
episode: 3850   score: 160.0  epsilon: 1.0    steps: 384  evaluation reward: 299.7
now time :  2019-09-06 00:58:31.695139
episode: 3851   score: 155.0  epsilon: 1.0    steps: 416  evaluation reward: 298.7
episode: 3852   score: 285.0  epsilon: 1.0    steps: 440  evaluation reward: 299.6
Training network. lr: 0.000171. clip: 0.068214
It

episode: 3874   score: 285.0  epsilon: 1.0    steps: 656  evaluation reward: 311.4
Training network. lr: 0.000170. clip: 0.068057
Iteration 10429: Policy loss: 0.196878. Value loss: 0.132578. Entropy: 0.288028.
Iteration 10430: Policy loss: 0.191167. Value loss: 0.057749. Entropy: 0.285699.
Iteration 10431: Policy loss: 0.181266. Value loss: 0.043015. Entropy: 0.285449.
Training network. lr: 0.000170. clip: 0.068057
Iteration 10432: Policy loss: -0.348180. Value loss: 0.158205. Entropy: 0.310552.
Iteration 10433: Policy loss: -0.354501. Value loss: 0.065641. Entropy: 0.311370.
Iteration 10434: Policy loss: -0.364253. Value loss: 0.040066. Entropy: 0.311879.
Training network. lr: 0.000170. clip: 0.068057
Iteration 10435: Policy loss: 0.174126. Value loss: 0.090686. Entropy: 0.308769.
Iteration 10436: Policy loss: 0.166868. Value loss: 0.038997. Entropy: 0.309294.
Iteration 10437: Policy loss: 0.171429. Value loss: 0.026068. Entropy: 0.308455.
episode: 3875   score: 410.0  epsilon: 1.0  

Iteration 10491: Policy loss: -0.177243. Value loss: 0.053581. Entropy: 0.274513.
Training network. lr: 0.000170. clip: 0.067901
Iteration 10492: Policy loss: 0.006356. Value loss: 0.296970. Entropy: 0.305423.
Iteration 10493: Policy loss: -0.002517. Value loss: 0.099397. Entropy: 0.305647.
Iteration 10494: Policy loss: -0.018945. Value loss: 0.037799. Entropy: 0.303608.
Training network. lr: 0.000170. clip: 0.067901
Iteration 10495: Policy loss: 0.151166. Value loss: 0.125143. Entropy: 0.308841.
Iteration 10496: Policy loss: 0.138488. Value loss: 0.040255. Entropy: 0.306331.
Iteration 10497: Policy loss: 0.142724. Value loss: 0.027736. Entropy: 0.307337.
episode: 3900   score: 215.0  epsilon: 1.0    steps: 872  evaluation reward: 300.3
Training network. lr: 0.000170. clip: 0.067901
Iteration 10498: Policy loss: 0.231062. Value loss: 0.069766. Entropy: 0.297097.
Iteration 10499: Policy loss: 0.228697. Value loss: 0.034569. Entropy: 0.295802.
Iteration 10500: Policy loss: 0.225317. Valu

Iteration 10554: Policy loss: 0.114908. Value loss: 0.035256. Entropy: 0.291796.
episode: 3925   score: 220.0  epsilon: 1.0    steps: 288  evaluation reward: 288.2
Training network. lr: 0.000169. clip: 0.067597
Iteration 10555: Policy loss: 0.174823. Value loss: 0.088296. Entropy: 0.297796.
Iteration 10556: Policy loss: 0.164220. Value loss: 0.036987. Entropy: 0.296592.
Iteration 10557: Policy loss: 0.161558. Value loss: 0.026487. Entropy: 0.296276.
Training network. lr: 0.000169. clip: 0.067597
Iteration 10558: Policy loss: 0.237331. Value loss: 0.085637. Entropy: 0.303619.
Iteration 10559: Policy loss: 0.228466. Value loss: 0.033623. Entropy: 0.305204.
Iteration 10560: Policy loss: 0.230329. Value loss: 0.022213. Entropy: 0.303896.
episode: 3926   score: 210.0  epsilon: 1.0    steps: 232  evaluation reward: 284.4
Training network. lr: 0.000169. clip: 0.067597
Iteration 10561: Policy loss: 0.018165. Value loss: 0.111531. Entropy: 0.292679.
Iteration 10562: Policy loss: 0.019233. Value

Iteration 10619: Policy loss: -0.126245. Value loss: 0.066583. Entropy: 0.306367.
Iteration 10620: Policy loss: -0.127109. Value loss: 0.046887. Entropy: 0.306056.
episode: 3948   score: 325.0  epsilon: 1.0    steps: 120  evaluation reward: 298.55
episode: 3949   score: 565.0  epsilon: 1.0    steps: 552  evaluation reward: 302.1
Training network. lr: 0.000169. clip: 0.067440
Iteration 10621: Policy loss: -0.245249. Value loss: 0.178127. Entropy: 0.274089.
Iteration 10622: Policy loss: -0.245315. Value loss: 0.064325. Entropy: 0.273784.
Iteration 10623: Policy loss: -0.256045. Value loss: 0.049123. Entropy: 0.272961.
episode: 3950   score: 450.0  epsilon: 1.0    steps: 168  evaluation reward: 305.0
Training network. lr: 0.000169. clip: 0.067440
Iteration 10624: Policy loss: 0.291898. Value loss: 0.163746. Entropy: 0.298158.
Iteration 10625: Policy loss: 0.281214. Value loss: 0.052728. Entropy: 0.296080.
Iteration 10626: Policy loss: 0.269162. Value loss: 0.035232. Entropy: 0.295575.
now

Training network. lr: 0.000168. clip: 0.067292
Iteration 10684: Policy loss: 0.097935. Value loss: 0.116807. Entropy: 0.298187.
Iteration 10685: Policy loss: 0.095753. Value loss: 0.053893. Entropy: 0.297852.
Iteration 10686: Policy loss: 0.094967. Value loss: 0.040421. Entropy: 0.297741.
episode: 3971   score: 410.0  epsilon: 1.0    steps: 856  evaluation reward: 333.2
Training network. lr: 0.000168. clip: 0.067292
Iteration 10687: Policy loss: -0.063565. Value loss: 0.111106. Entropy: 0.307166.
Iteration 10688: Policy loss: -0.072216. Value loss: 0.046509. Entropy: 0.307078.
Iteration 10689: Policy loss: -0.078259. Value loss: 0.031381. Entropy: 0.307463.
episode: 3972   score: 330.0  epsilon: 1.0    steps: 720  evaluation reward: 334.25
episode: 3973   score: 210.0  epsilon: 1.0    steps: 776  evaluation reward: 332.9
Training network. lr: 0.000168. clip: 0.067292
Iteration 10690: Policy loss: 0.033048. Value loss: 0.124996. Entropy: 0.296684.
Iteration 10691: Policy loss: 0.032487.

episode: 3993   score: 210.0  epsilon: 1.0    steps: 472  evaluation reward: 344.45
Training network. lr: 0.000168. clip: 0.067136
Iteration 10750: Policy loss: -0.141580. Value loss: 0.068386. Entropy: 0.297937.
Iteration 10751: Policy loss: -0.141960. Value loss: 0.032266. Entropy: 0.299345.
Iteration 10752: Policy loss: -0.146926. Value loss: 0.024821. Entropy: 0.299596.
episode: 3994   score: 330.0  epsilon: 1.0    steps: 1016  evaluation reward: 346.2
Training network. lr: 0.000167. clip: 0.066979
Iteration 10753: Policy loss: 0.138957. Value loss: 0.140088. Entropy: 0.309966.
Iteration 10754: Policy loss: 0.126470. Value loss: 0.045554. Entropy: 0.309924.
Iteration 10755: Policy loss: 0.119652. Value loss: 0.029498. Entropy: 0.309067.
episode: 3995   score: 240.0  epsilon: 1.0    steps: 600  evaluation reward: 343.2
Training network. lr: 0.000167. clip: 0.066979
Iteration 10756: Policy loss: 0.022146. Value loss: 0.092795. Entropy: 0.291384.
Iteration 10757: Policy loss: 0.014442

Training network. lr: 0.000167. clip: 0.066832
Iteration 10813: Policy loss: 0.081218. Value loss: 0.063215. Entropy: 0.293834.
Iteration 10814: Policy loss: 0.074684. Value loss: 0.029776. Entropy: 0.293107.
Iteration 10815: Policy loss: 0.075062. Value loss: 0.022876. Entropy: 0.293396.
Training network. lr: 0.000167. clip: 0.066832
Iteration 10816: Policy loss: -0.376173. Value loss: 0.319973. Entropy: 0.308601.
Iteration 10817: Policy loss: -0.380982. Value loss: 0.091997. Entropy: 0.308269.
Iteration 10818: Policy loss: -0.414899. Value loss: 0.051762. Entropy: 0.309259.
episode: 4018   score: 625.0  epsilon: 1.0    steps: 176  evaluation reward: 368.45
Training network. lr: 0.000167. clip: 0.066832
Iteration 10819: Policy loss: -0.086639. Value loss: 0.173252. Entropy: 0.297121.
Iteration 10820: Policy loss: -0.093630. Value loss: 0.060043. Entropy: 0.298440.
Iteration 10821: Policy loss: -0.094375. Value loss: 0.038077. Entropy: 0.299726.
Training network. lr: 0.000167. clip: 0.

Iteration 10877: Policy loss: 0.076102. Value loss: 0.040596. Entropy: 0.295907.
Iteration 10878: Policy loss: 0.075029. Value loss: 0.031579. Entropy: 0.296098.
episode: 4042   score: 320.0  epsilon: 1.0    steps: 1000  evaluation reward: 364.4
Training network. lr: 0.000167. clip: 0.066675
Iteration 10879: Policy loss: -0.140406. Value loss: 0.278586. Entropy: 0.314052.
Iteration 10880: Policy loss: -0.147312. Value loss: 0.099209. Entropy: 0.313556.
Iteration 10881: Policy loss: -0.144948. Value loss: 0.063809. Entropy: 0.313688.
episode: 4043   score: 350.0  epsilon: 1.0    steps: 56  evaluation reward: 365.8
episode: 4044   score: 560.0  epsilon: 1.0    steps: 352  evaluation reward: 365.5
episode: 4045   score: 275.0  epsilon: 1.0    steps: 520  evaluation reward: 365.45
Training network. lr: 0.000167. clip: 0.066675
Iteration 10882: Policy loss: 0.186221. Value loss: 0.222371. Entropy: 0.261448.
Iteration 10883: Policy loss: 0.183092. Value loss: 0.050339. Entropy: 0.263052.
Ite

episode: 4062   score: 335.0  epsilon: 1.0    steps: 168  evaluation reward: 357.6
Training network. lr: 0.000166. clip: 0.066518
Iteration 10945: Policy loss: -0.367295. Value loss: 0.322734. Entropy: 0.289506.
Iteration 10946: Policy loss: -0.382779. Value loss: 0.222956. Entropy: 0.289260.
Iteration 10947: Policy loss: -0.372050. Value loss: 0.190376. Entropy: 0.287811.
episode: 4063   score: 630.0  epsilon: 1.0    steps: 592  evaluation reward: 362.65
episode: 4064   score: 365.0  epsilon: 1.0    steps: 736  evaluation reward: 361.85
episode: 4065   score: 670.0  epsilon: 1.0    steps: 792  evaluation reward: 365.4
episode: 4066   score: 420.0  epsilon: 1.0    steps: 968  evaluation reward: 363.65
Training network. lr: 0.000166. clip: 0.066518
Iteration 10948: Policy loss: 0.076097. Value loss: 0.138904. Entropy: 0.274862.
Iteration 10949: Policy loss: 0.074869. Value loss: 0.054118. Entropy: 0.274379.
Iteration 10950: Policy loss: 0.078018. Value loss: 0.039984. Entropy: 0.272668.

Training network. lr: 0.000166. clip: 0.066214
Iteration 11011: Policy loss: 0.204866. Value loss: 0.099628. Entropy: 0.312724.
Iteration 11012: Policy loss: 0.196365. Value loss: 0.035575. Entropy: 0.312059.
Iteration 11013: Policy loss: 0.187064. Value loss: 0.023294. Entropy: 0.311207.
episode: 4084   score: 375.0  epsilon: 1.0    steps: 48  evaluation reward: 364.75
episode: 4085   score: 210.0  epsilon: 1.0    steps: 560  evaluation reward: 364.0
Training network. lr: 0.000166. clip: 0.066214
Iteration 11014: Policy loss: 0.154654. Value loss: 0.175939. Entropy: 0.284144.
Iteration 11015: Policy loss: 0.136435. Value loss: 0.082650. Entropy: 0.285053.
Iteration 11016: Policy loss: 0.138444. Value loss: 0.063599. Entropy: 0.284460.
episode: 4086   score: 470.0  epsilon: 1.0    steps: 248  evaluation reward: 363.55
episode: 4087   score: 315.0  epsilon: 1.0    steps: 880  evaluation reward: 362.3
Training network. lr: 0.000166. clip: 0.066214
Iteration 11017: Policy loss: 0.327475. 

Training network. lr: 0.000165. clip: 0.066057
Iteration 11074: Policy loss: -0.697412. Value loss: 0.411854. Entropy: 0.294807.
Iteration 11075: Policy loss: -0.751175. Value loss: 0.131727. Entropy: 0.295292.
Iteration 11076: Policy loss: -0.759243. Value loss: 0.057113. Entropy: 0.295617.
episode: 4109   score: 915.0  epsilon: 1.0    steps: 408  evaluation reward: 356.95
Training network. lr: 0.000165. clip: 0.066057
Iteration 11077: Policy loss: 0.323383. Value loss: 0.244537. Entropy: 0.293850.
Iteration 11078: Policy loss: 0.304386. Value loss: 0.087668. Entropy: 0.294487.
Iteration 11079: Policy loss: 0.295364. Value loss: 0.045642. Entropy: 0.293483.
Training network. lr: 0.000165. clip: 0.066057
Iteration 11080: Policy loss: 0.233911. Value loss: 0.146791. Entropy: 0.316358.
Iteration 11081: Policy loss: 0.226883. Value loss: 0.062888. Entropy: 0.315936.
Iteration 11082: Policy loss: 0.225781. Value loss: 0.040822. Entropy: 0.314715.
episode: 4110   score: 465.0  epsilon: 1.0 

Training network. lr: 0.000165. clip: 0.065910
Iteration 11140: Policy loss: 0.404287. Value loss: 0.198962. Entropy: 0.306237.
Iteration 11141: Policy loss: 0.400780. Value loss: 0.040318. Entropy: 0.304048.
Iteration 11142: Policy loss: 0.381163. Value loss: 0.020514. Entropy: 0.305061.
Training network. lr: 0.000165. clip: 0.065910
Iteration 11143: Policy loss: 0.108899. Value loss: 0.118271. Entropy: 0.313442.
Iteration 11144: Policy loss: 0.109793. Value loss: 0.054848. Entropy: 0.313049.
Iteration 11145: Policy loss: 0.099543. Value loss: 0.045686. Entropy: 0.313469.
episode: 4131   score: 315.0  epsilon: 1.0    steps: 928  evaluation reward: 366.65
Training network. lr: 0.000165. clip: 0.065910
Iteration 11146: Policy loss: 0.189363. Value loss: 0.130472. Entropy: 0.301365.
Iteration 11147: Policy loss: 0.186200. Value loss: 0.057595. Entropy: 0.300605.
Iteration 11148: Policy loss: 0.178899. Value loss: 0.041440. Entropy: 0.299499.
episode: 4132   score: 340.0  epsilon: 1.0    

Iteration 11207: Policy loss: 0.012123. Value loss: 0.036106. Entropy: 0.297220.
Iteration 11208: Policy loss: 0.012103. Value loss: 0.024477. Entropy: 0.296504.
now time :  2019-09-06 01:50:27.047729
episode: 4151   score: 600.0  epsilon: 1.0    steps: 640  evaluation reward: 388.05
Training network. lr: 0.000164. clip: 0.065597
Iteration 11209: Policy loss: 0.250180. Value loss: 0.097026. Entropy: 0.296510.
Iteration 11210: Policy loss: 0.250658. Value loss: 0.032550. Entropy: 0.295823.
Iteration 11211: Policy loss: 0.235939. Value loss: 0.024259. Entropy: 0.295834.
episode: 4152   score: 390.0  epsilon: 1.0    steps: 312  evaluation reward: 388.05
Training network. lr: 0.000164. clip: 0.065597
Iteration 11212: Policy loss: -0.064719. Value loss: 0.311763. Entropy: 0.299223.
Iteration 11213: Policy loss: -0.094234. Value loss: 0.221090. Entropy: 0.300475.
Iteration 11214: Policy loss: -0.084714. Value loss: 0.109815. Entropy: 0.298012.
episode: 4153   score: 210.0  epsilon: 1.0    st

Training network. lr: 0.000164. clip: 0.065449
Iteration 11275: Policy loss: -0.176935. Value loss: 0.281923. Entropy: 0.312813.
Iteration 11276: Policy loss: -0.191464. Value loss: 0.135701. Entropy: 0.312990.
Iteration 11277: Policy loss: -0.176820. Value loss: 0.054378. Entropy: 0.312691.
episode: 4170   score: 210.0  epsilon: 1.0    steps: 520  evaluation reward: 394.55
episode: 4171   score: 660.0  epsilon: 1.0    steps: 680  evaluation reward: 398.45
Training network. lr: 0.000164. clip: 0.065449
Iteration 11278: Policy loss: 0.505432. Value loss: 0.267027. Entropy: 0.289550.
Iteration 11279: Policy loss: 0.487006. Value loss: 0.081925. Entropy: 0.286994.
Iteration 11280: Policy loss: 0.478626. Value loss: 0.044330. Entropy: 0.288992.
episode: 4172   score: 470.0  epsilon: 1.0    steps: 88  evaluation reward: 399.25
episode: 4173   score: 700.0  epsilon: 1.0    steps: 256  evaluation reward: 403.65
Training network. lr: 0.000164. clip: 0.065449
Iteration 11281: Policy loss: 0.158

Iteration 11341: Policy loss: 0.204840. Value loss: 0.086489. Entropy: 0.305086.
Iteration 11342: Policy loss: 0.190927. Value loss: 0.032647. Entropy: 0.304648.
Iteration 11343: Policy loss: 0.190665. Value loss: 0.024254. Entropy: 0.303964.
episode: 4191   score: 485.0  epsilon: 1.0    steps: 792  evaluation reward: 415.95
Training network. lr: 0.000163. clip: 0.065293
Iteration 11344: Policy loss: 0.148839. Value loss: 0.096484. Entropy: 0.303420.
Iteration 11345: Policy loss: 0.145691. Value loss: 0.040848. Entropy: 0.300586.
Iteration 11346: Policy loss: 0.134597. Value loss: 0.030917. Entropy: 0.299467.
Training network. lr: 0.000163. clip: 0.065293
Iteration 11347: Policy loss: 0.171529. Value loss: 0.103612. Entropy: 0.313008.
Iteration 11348: Policy loss: 0.165680. Value loss: 0.033730. Entropy: 0.311911.
Iteration 11349: Policy loss: 0.155323. Value loss: 0.028370. Entropy: 0.312533.
Training network. lr: 0.000163. clip: 0.065293
Iteration 11350: Policy loss: 0.044756. Value 

Iteration 11409: Policy loss: -0.280377. Value loss: 0.128698. Entropy: 0.316435.
episode: 4210   score: 290.0  epsilon: 1.0    steps: 112  evaluation reward: 421.3
episode: 4211   score: 545.0  epsilon: 1.0    steps: 168  evaluation reward: 424.05
episode: 4212   score: 270.0  epsilon: 1.0    steps: 968  evaluation reward: 423.2
Training network. lr: 0.000162. clip: 0.064988
Iteration 11410: Policy loss: 0.184285. Value loss: 0.113294. Entropy: 0.273095.
Iteration 11411: Policy loss: 0.171261. Value loss: 0.035506. Entropy: 0.275021.
Iteration 11412: Policy loss: 0.172048. Value loss: 0.026265. Entropy: 0.274690.
episode: 4213   score: 470.0  epsilon: 1.0    steps: 808  evaluation reward: 424.15
Training network. lr: 0.000162. clip: 0.064988
Iteration 11413: Policy loss: 0.165678. Value loss: 0.169119. Entropy: 0.291706.
Iteration 11414: Policy loss: 0.156097. Value loss: 0.055265. Entropy: 0.291846.
Iteration 11415: Policy loss: 0.150913. Value loss: 0.033216. Entropy: 0.291557.
epis

Iteration 11474: Policy loss: -0.039943. Value loss: 0.031146. Entropy: 0.285991.
Iteration 11475: Policy loss: -0.044441. Value loss: 0.022713. Entropy: 0.285770.
episode: 4233   score: 325.0  epsilon: 1.0    steps: 472  evaluation reward: 427.6
episode: 4234   score: 365.0  epsilon: 1.0    steps: 608  evaluation reward: 428.55
Training network. lr: 0.000162. clip: 0.064832
Iteration 11476: Policy loss: -0.113839. Value loss: 0.322808. Entropy: 0.284714.
Iteration 11477: Policy loss: -0.121468. Value loss: 0.249457. Entropy: 0.284128.
Iteration 11478: Policy loss: -0.126334. Value loss: 0.217552. Entropy: 0.284975.
Training network. lr: 0.000162. clip: 0.064832
Iteration 11479: Policy loss: -0.008866. Value loss: 0.092846. Entropy: 0.312629.
Iteration 11480: Policy loss: -0.017518. Value loss: 0.045110. Entropy: 0.308985.
Iteration 11481: Policy loss: -0.014361. Value loss: 0.036834. Entropy: 0.308921.
episode: 4235   score: 510.0  epsilon: 1.0    steps: 928  evaluation reward: 426.55

Iteration 11540: Policy loss: -0.128501. Value loss: 0.077924. Entropy: 0.300782.
Iteration 11541: Policy loss: -0.132168. Value loss: 0.038803. Entropy: 0.300574.
episode: 4254   score: 620.0  epsilon: 1.0    steps: 736  evaluation reward: 405.8
episode: 4255   score: 525.0  epsilon: 1.0    steps: 736  evaluation reward: 407.0
Training network. lr: 0.000162. clip: 0.064675
Iteration 11542: Policy loss: 0.189789. Value loss: 0.125496. Entropy: 0.288240.
Iteration 11543: Policy loss: 0.181204. Value loss: 0.046931. Entropy: 0.287039.
Iteration 11544: Policy loss: 0.175928. Value loss: 0.033563. Entropy: 0.287041.
episode: 4256   score: 425.0  epsilon: 1.0    steps: 392  evaluation reward: 408.4
Training network. lr: 0.000162. clip: 0.064675
Iteration 11545: Policy loss: -0.028822. Value loss: 0.143886. Entropy: 0.298241.
Iteration 11546: Policy loss: -0.031788. Value loss: 0.053026. Entropy: 0.298820.
Iteration 11547: Policy loss: -0.039958. Value loss: 0.037546. Entropy: 0.297500.
epis

Training network. lr: 0.000161. clip: 0.064371
Iteration 11608: Policy loss: 0.164654. Value loss: 0.065460. Entropy: 0.301987.
Iteration 11609: Policy loss: 0.160265. Value loss: 0.032241. Entropy: 0.301950.
Iteration 11610: Policy loss: 0.157027. Value loss: 0.028357. Entropy: 0.302252.
episode: 4274   score: 335.0  epsilon: 1.0    steps: 256  evaluation reward: 403.85
episode: 4275   score: 260.0  epsilon: 1.0    steps: 664  evaluation reward: 403.15
episode: 4276   score: 425.0  epsilon: 1.0    steps: 744  evaluation reward: 400.9
Training network. lr: 0.000161. clip: 0.064371
Iteration 11611: Policy loss: 0.066132. Value loss: 0.075687. Entropy: 0.276817.
Iteration 11612: Policy loss: 0.059503. Value loss: 0.038406. Entropy: 0.276753.
Iteration 11613: Policy loss: 0.060366. Value loss: 0.030368. Entropy: 0.277854.
episode: 4277   score: 655.0  epsilon: 1.0    steps: 768  evaluation reward: 402.8
Training network. lr: 0.000161. clip: 0.064371
Iteration 11614: Policy loss: 0.192952.

episode: 4297   score: 285.0  epsilon: 1.0    steps: 392  evaluation reward: 389.95
Training network. lr: 0.000161. clip: 0.064214
Iteration 11674: Policy loss: -0.126729. Value loss: 0.117256. Entropy: 0.303164.
Iteration 11675: Policy loss: -0.123663. Value loss: 0.053949. Entropy: 0.303212.
Iteration 11676: Policy loss: -0.132778. Value loss: 0.039133. Entropy: 0.302639.
episode: 4298   score: 575.0  epsilon: 1.0    steps: 88  evaluation reward: 392.9
Training network. lr: 0.000161. clip: 0.064214
Iteration 11677: Policy loss: -0.100782. Value loss: 0.112642. Entropy: 0.299693.
Iteration 11678: Policy loss: -0.108414. Value loss: 0.037459. Entropy: 0.300264.
Iteration 11679: Policy loss: -0.108943. Value loss: 0.023605. Entropy: 0.301218.
Training network. lr: 0.000161. clip: 0.064214
Iteration 11680: Policy loss: 0.142553. Value loss: 0.111515. Entropy: 0.308845.
Iteration 11681: Policy loss: 0.138692. Value loss: 0.033073. Entropy: 0.308540.
Iteration 11682: Policy loss: 0.130347.

Training network. lr: 0.000160. clip: 0.064067
Iteration 11740: Policy loss: 0.334937. Value loss: 0.138419. Entropy: 0.312473.
Iteration 11741: Policy loss: 0.333741. Value loss: 0.047013. Entropy: 0.312925.
Iteration 11742: Policy loss: 0.317563. Value loss: 0.030922. Entropy: 0.312297.
episode: 4318   score: 360.0  epsilon: 1.0    steps: 632  evaluation reward: 394.15
Training network. lr: 0.000160. clip: 0.064067
Iteration 11743: Policy loss: 0.098329. Value loss: 0.069012. Entropy: 0.305269.
Iteration 11744: Policy loss: 0.097549. Value loss: 0.033493. Entropy: 0.304336.
Iteration 11745: Policy loss: 0.088906. Value loss: 0.026384. Entropy: 0.304285.
episode: 4319   score: 240.0  epsilon: 1.0    steps: 608  evaluation reward: 392.35
episode: 4320   score: 210.0  epsilon: 1.0    steps: 800  evaluation reward: 391.3
episode: 4321   score: 470.0  epsilon: 1.0    steps: 920  evaluation reward: 393.15
Training network. lr: 0.000160. clip: 0.064067
Iteration 11746: Policy loss: 0.077421

Iteration 11806: Policy loss: 0.045796. Value loss: 0.316726. Entropy: 0.305198.
Iteration 11807: Policy loss: 0.052873. Value loss: 0.233694. Entropy: 0.304673.
Iteration 11808: Policy loss: 0.028084. Value loss: 0.222220. Entropy: 0.304139.
episode: 4339   score: 330.0  epsilon: 1.0    steps: 744  evaluation reward: 377.4
Training network. lr: 0.000159. clip: 0.063753
Iteration 11809: Policy loss: -0.092116. Value loss: 0.088416. Entropy: 0.305762.
Iteration 11810: Policy loss: -0.097258. Value loss: 0.033205. Entropy: 0.305608.
Iteration 11811: Policy loss: -0.101218. Value loss: 0.018892. Entropy: 0.306195.
episode: 4340   score: 285.0  epsilon: 1.0    steps: 816  evaluation reward: 376.35
Training network. lr: 0.000159. clip: 0.063753
Iteration 11812: Policy loss: 0.036173. Value loss: 0.092647. Entropy: 0.300621.
Iteration 11813: Policy loss: 0.034666. Value loss: 0.039750. Entropy: 0.300794.
Iteration 11814: Policy loss: 0.032638. Value loss: 0.027893. Entropy: 0.300372.
episode

Iteration 11874: Policy loss: -0.069419. Value loss: 0.037050. Entropy: 0.303388.
episode: 4358   score: 540.0  epsilon: 1.0    steps: 416  evaluation reward: 384.8
Training network. lr: 0.000159. clip: 0.063606
Iteration 11875: Policy loss: 0.062210. Value loss: 0.116848. Entropy: 0.305514.
Iteration 11876: Policy loss: 0.049876. Value loss: 0.039194. Entropy: 0.304843.
Iteration 11877: Policy loss: 0.046016. Value loss: 0.031345. Entropy: 0.302733.
Training network. lr: 0.000159. clip: 0.063606
Iteration 11878: Policy loss: -0.687195. Value loss: 0.397792. Entropy: 0.308886.
Iteration 11879: Policy loss: -0.697319. Value loss: 0.109834. Entropy: 0.307905.
Iteration 11880: Policy loss: -0.696631. Value loss: 0.058818. Entropy: 0.308850.
Training network. lr: 0.000159. clip: 0.063606
Iteration 11881: Policy loss: 0.044789. Value loss: 0.181170. Entropy: 0.306634.
Iteration 11882: Policy loss: 0.021321. Value loss: 0.052397. Entropy: 0.305899.
Iteration 11883: Policy loss: 0.033378. Val

Iteration 11941: Policy loss: 0.111401. Value loss: 0.076879. Entropy: 0.306848.
Iteration 11942: Policy loss: 0.101975. Value loss: 0.030290. Entropy: 0.307015.
Iteration 11943: Policy loss: 0.104076. Value loss: 0.023324. Entropy: 0.308374.
episode: 4378   score: 550.0  epsilon: 1.0    steps: 200  evaluation reward: 387.55
Training network. lr: 0.000159. clip: 0.063449
Iteration 11944: Policy loss: -0.092968. Value loss: 0.245506. Entropy: 0.304692.
Iteration 11945: Policy loss: -0.107441. Value loss: 0.116537. Entropy: 0.305934.
Iteration 11946: Policy loss: -0.106198. Value loss: 0.062414. Entropy: 0.303235.
episode: 4379   score: 635.0  epsilon: 1.0    steps: 624  evaluation reward: 391.5
episode: 4380   score: 790.0  epsilon: 1.0    steps: 784  evaluation reward: 394.45
Training network. lr: 0.000159. clip: 0.063449
Iteration 11947: Policy loss: -0.215059. Value loss: 0.422743. Entropy: 0.291559.
Iteration 11948: Policy loss: -0.229435. Value loss: 0.231702. Entropy: 0.293556.
It

Iteration 12009: Policy loss: 0.230033. Value loss: 0.023968. Entropy: 0.286131.
Training network. lr: 0.000158. clip: 0.063145
Iteration 12010: Policy loss: 0.202384. Value loss: 0.100672. Entropy: 0.305522.
Iteration 12011: Policy loss: 0.195271. Value loss: 0.034216. Entropy: 0.303463.
Iteration 12012: Policy loss: 0.192043. Value loss: 0.023862. Entropy: 0.303963.
episode: 4398   score: 260.0  epsilon: 1.0    steps: 224  evaluation reward: 412.7
episode: 4399   score: 215.0  epsilon: 1.0    steps: 360  evaluation reward: 412.5
Training network. lr: 0.000158. clip: 0.063145
Iteration 12013: Policy loss: -0.119756. Value loss: 0.188873. Entropy: 0.284203.
Iteration 12014: Policy loss: -0.137521. Value loss: 0.123396. Entropy: 0.281615.
Iteration 12015: Policy loss: -0.131920. Value loss: 0.083905. Entropy: 0.281734.
Training network. lr: 0.000158. clip: 0.063145
Iteration 12016: Policy loss: -0.199014. Value loss: 0.408263. Entropy: 0.308532.
Iteration 12017: Policy loss: -0.213206. 

Iteration 12076: Policy loss: -0.519823. Value loss: 0.528891. Entropy: 0.312112.
Iteration 12077: Policy loss: -0.536226. Value loss: 0.232842. Entropy: 0.311506.
Iteration 12078: Policy loss: -0.553217. Value loss: 0.069650. Entropy: 0.310967.
episode: 4418   score: 635.0  epsilon: 1.0    steps: 192  evaluation reward: 432.7
Training network. lr: 0.000157. clip: 0.062989
Iteration 12079: Policy loss: 0.120815. Value loss: 0.069944. Entropy: 0.307010.
Iteration 12080: Policy loss: 0.108205. Value loss: 0.028859. Entropy: 0.305770.
Iteration 12081: Policy loss: 0.102291. Value loss: 0.021520. Entropy: 0.307509.
Training network. lr: 0.000157. clip: 0.062989
Iteration 12082: Policy loss: -0.445908. Value loss: 0.297843. Entropy: 0.307728.
Iteration 12083: Policy loss: -0.451398. Value loss: 0.088973. Entropy: 0.306550.
Iteration 12084: Policy loss: -0.449222. Value loss: 0.057381. Entropy: 0.306877.
episode: 4419   score: 215.0  epsilon: 1.0    steps: 816  evaluation reward: 432.45
Trai

episode: 4437   score: 240.0  epsilon: 1.0    steps: 760  evaluation reward: 458.5
episode: 4438   score: 240.0  epsilon: 1.0    steps: 1008  evaluation reward: 455.15
Training network. lr: 0.000157. clip: 0.062832
Iteration 12145: Policy loss: 0.155835. Value loss: 0.127158. Entropy: 0.298208.
Iteration 12146: Policy loss: 0.137748. Value loss: 0.062395. Entropy: 0.298895.
Iteration 12147: Policy loss: 0.138591. Value loss: 0.049457. Entropy: 0.298076.
episode: 4439   score: 335.0  epsilon: 1.0    steps: 488  evaluation reward: 455.2
Training network. lr: 0.000157. clip: 0.062832
Iteration 12148: Policy loss: 0.220740. Value loss: 0.119760. Entropy: 0.302933.
Iteration 12149: Policy loss: 0.210138. Value loss: 0.056757. Entropy: 0.303385.
Iteration 12150: Policy loss: 0.208565. Value loss: 0.045748. Entropy: 0.302484.
episode: 4440   score: 335.0  epsilon: 1.0    steps: 448  evaluation reward: 455.7
episode: 4441   score: 315.0  epsilon: 1.0    steps: 496  evaluation reward: 453.7
Tra

Iteration 12209: Policy loss: 0.185397. Value loss: 0.058663. Entropy: 0.283864.
Iteration 12210: Policy loss: 0.173097. Value loss: 0.045742. Entropy: 0.285340.
episode: 4460   score: 395.0  epsilon: 1.0    steps: 232  evaluation reward: 452.3
Training network. lr: 0.000156. clip: 0.062528
Iteration 12211: Policy loss: 0.143817. Value loss: 0.158685. Entropy: 0.300826.
Iteration 12212: Policy loss: 0.131084. Value loss: 0.061049. Entropy: 0.299756.
Iteration 12213: Policy loss: 0.127898. Value loss: 0.041966. Entropy: 0.299676.
Training network. lr: 0.000156. clip: 0.062528
Iteration 12214: Policy loss: 0.024285. Value loss: 0.146686. Entropy: 0.310116.
Iteration 12215: Policy loss: 0.008629. Value loss: 0.053044. Entropy: 0.309037.
Iteration 12216: Policy loss: 0.001364. Value loss: 0.036866. Entropy: 0.308259.
Training network. lr: 0.000156. clip: 0.062528
Iteration 12217: Policy loss: 0.033002. Value loss: 0.063192. Entropy: 0.309507.
Iteration 12218: Policy loss: 0.026375. Value l

Iteration 12278: Policy loss: 0.165184. Value loss: 0.048847. Entropy: 0.294830.
Iteration 12279: Policy loss: 0.153252. Value loss: 0.033939. Entropy: 0.295383.
episode: 4479   score: 360.0  epsilon: 1.0    steps: 408  evaluation reward: 453.65
Training network. lr: 0.000156. clip: 0.062371
Iteration 12280: Policy loss: 0.094285. Value loss: 0.141529. Entropy: 0.304615.
Iteration 12281: Policy loss: 0.082684. Value loss: 0.044544. Entropy: 0.303149.
Iteration 12282: Policy loss: 0.081037. Value loss: 0.030560. Entropy: 0.304546.
episode: 4480   score: 405.0  epsilon: 1.0    steps: 1008  evaluation reward: 449.8
Training network. lr: 0.000156. clip: 0.062371
Iteration 12283: Policy loss: -0.077655. Value loss: 0.353091. Entropy: 0.303119.
Iteration 12284: Policy loss: -0.106353. Value loss: 0.179022. Entropy: 0.304093.
Iteration 12285: Policy loss: -0.095481. Value loss: 0.117681. Entropy: 0.302151.
episode: 4481   score: 480.0  epsilon: 1.0    steps: 968  evaluation reward: 447.6
Trai

Training network. lr: 0.000156. clip: 0.062224
Iteration 12346: Policy loss: 0.040691. Value loss: 0.085056. Entropy: 0.271007.
Iteration 12347: Policy loss: 0.040461. Value loss: 0.037206. Entropy: 0.267648.
Iteration 12348: Policy loss: 0.041275. Value loss: 0.029004. Entropy: 0.268819.
episode: 4500   score: 650.0  epsilon: 1.0    steps: 472  evaluation reward: 457.55
Training network. lr: 0.000156. clip: 0.062224
Iteration 12349: Policy loss: 0.097470. Value loss: 0.057478. Entropy: 0.291893.
Iteration 12350: Policy loss: 0.100162. Value loss: 0.029802. Entropy: 0.293164.
Iteration 12351: Policy loss: 0.100383. Value loss: 0.020285. Entropy: 0.292506.
Training network. lr: 0.000155. clip: 0.062067
Iteration 12352: Policy loss: -0.178563. Value loss: 0.321462. Entropy: 0.309092.
Iteration 12353: Policy loss: -0.187944. Value loss: 0.099161. Entropy: 0.309138.
Iteration 12354: Policy loss: -0.201035. Value loss: 0.053099. Entropy: 0.308041.
Training network. lr: 0.000155. clip: 0.062

Iteration 12416: Policy loss: -0.093547. Value loss: 0.100291. Entropy: 0.313509.
Iteration 12417: Policy loss: -0.103700. Value loss: 0.068779. Entropy: 0.313520.
episode: 4516   score: 725.0  epsilon: 1.0    steps: 736  evaluation reward: 458.8
Training network. lr: 0.000155. clip: 0.061910
Iteration 12418: Policy loss: 0.167665. Value loss: 0.163623. Entropy: 0.305465.
Iteration 12419: Policy loss: 0.155014. Value loss: 0.055395. Entropy: 0.304825.
Iteration 12420: Policy loss: 0.157084. Value loss: 0.039206. Entropy: 0.305937.
episode: 4517   score: 415.0  epsilon: 1.0    steps: 160  evaluation reward: 456.6
episode: 4518   score: 525.0  epsilon: 1.0    steps: 216  evaluation reward: 455.5
episode: 4519   score: 525.0  epsilon: 1.0    steps: 992  evaluation reward: 458.6
Training network. lr: 0.000155. clip: 0.061910
Iteration 12421: Policy loss: -0.017607. Value loss: 0.128283. Entropy: 0.295858.
Iteration 12422: Policy loss: -0.024024. Value loss: 0.053556. Entropy: 0.293669.
Ite

episode: 4536   score: 480.0  epsilon: 1.0    steps: 392  evaluation reward: 455.0
Training network. lr: 0.000154. clip: 0.061763
Iteration 12484: Policy loss: 0.132981. Value loss: 0.149738. Entropy: 0.289158.
Iteration 12485: Policy loss: 0.137997. Value loss: 0.061084. Entropy: 0.290737.
Iteration 12486: Policy loss: 0.130241. Value loss: 0.040066. Entropy: 0.288868.
Training network. lr: 0.000154. clip: 0.061763
Iteration 12487: Policy loss: -0.098711. Value loss: 0.155011. Entropy: 0.311641.
Iteration 12488: Policy loss: -0.100493. Value loss: 0.068104. Entropy: 0.309457.
Iteration 12489: Policy loss: -0.100802. Value loss: 0.050997. Entropy: 0.310059.
episode: 4537   score: 390.0  epsilon: 1.0    steps: 584  evaluation reward: 456.5
Training network. lr: 0.000154. clip: 0.061763
Iteration 12490: Policy loss: -0.114755. Value loss: 0.099237. Entropy: 0.300270.
Iteration 12491: Policy loss: -0.124677. Value loss: 0.048712. Entropy: 0.300868.
Iteration 12492: Policy loss: -0.127354.

episode: 4558   score: 365.0  epsilon: 1.0    steps: 992  evaluation reward: 459.6
Training network. lr: 0.000154. clip: 0.061606
Iteration 12550: Policy loss: 0.216658. Value loss: 0.205165. Entropy: 0.297353.
Iteration 12551: Policy loss: 0.199712. Value loss: 0.070422. Entropy: 0.296186.
Iteration 12552: Policy loss: 0.202780. Value loss: 0.049320. Entropy: 0.297413.
episode: 4559   score: 330.0  epsilon: 1.0    steps: 200  evaluation reward: 458.7
Training network. lr: 0.000154. clip: 0.061449
Iteration 12553: Policy loss: 0.277264. Value loss: 0.139604. Entropy: 0.294231.
Iteration 12554: Policy loss: 0.272608. Value loss: 0.055881. Entropy: 0.292220.
Iteration 12555: Policy loss: 0.265727. Value loss: 0.038827. Entropy: 0.291804.
Training network. lr: 0.000154. clip: 0.061449
Iteration 12556: Policy loss: -0.164473. Value loss: 0.198470. Entropy: 0.301208.
Iteration 12557: Policy loss: -0.185269. Value loss: 0.065725. Entropy: 0.302616.
Iteration 12558: Policy loss: -0.190652. Va

Iteration 12616: Policy loss: -0.146566. Value loss: 0.230604. Entropy: 0.304040.
Iteration 12617: Policy loss: -0.149138. Value loss: 0.067151. Entropy: 0.303757.
Iteration 12618: Policy loss: -0.155873. Value loss: 0.043117. Entropy: 0.304586.
Training network. lr: 0.000153. clip: 0.061302
Iteration 12619: Policy loss: 0.033274. Value loss: 0.245471. Entropy: 0.316503.
Iteration 12620: Policy loss: 0.014375. Value loss: 0.127881. Entropy: 0.317201.
Iteration 12621: Policy loss: 0.014029. Value loss: 0.106909. Entropy: 0.316241.
episode: 4579   score: 695.0  epsilon: 1.0    steps: 288  evaluation reward: 466.4
Training network. lr: 0.000153. clip: 0.061302
Iteration 12622: Policy loss: 0.135709. Value loss: 0.176853. Entropy: 0.299706.
Iteration 12623: Policy loss: 0.134510. Value loss: 0.081250. Entropy: 0.296570.
Iteration 12624: Policy loss: 0.122406. Value loss: 0.051548. Entropy: 0.298354.
episode: 4580   score: 285.0  epsilon: 1.0    steps: 176  evaluation reward: 465.2
Training

Iteration 12684: Policy loss: 0.131183. Value loss: 0.035379. Entropy: 0.311153.
episode: 4599   score: 285.0  epsilon: 1.0    steps: 800  evaluation reward: 451.0
Training network. lr: 0.000153. clip: 0.061145
Iteration 12685: Policy loss: -0.142222. Value loss: 0.118017. Entropy: 0.305231.
Iteration 12686: Policy loss: -0.157329. Value loss: 0.049813. Entropy: 0.306566.
Iteration 12687: Policy loss: -0.159702. Value loss: 0.033987. Entropy: 0.305994.
episode: 4600   score: 420.0  epsilon: 1.0    steps: 800  evaluation reward: 448.7
Training network. lr: 0.000153. clip: 0.061145
Iteration 12688: Policy loss: -0.193936. Value loss: 0.328763. Entropy: 0.310787.
Iteration 12689: Policy loss: -0.195941. Value loss: 0.140331. Entropy: 0.311630.
Iteration 12690: Policy loss: -0.234032. Value loss: 0.075527. Entropy: 0.310235.
Training network. lr: 0.000153. clip: 0.061145
Iteration 12691: Policy loss: -0.077899. Value loss: 0.092053. Entropy: 0.309003.
Iteration 12692: Policy loss: -0.08712

Training network. lr: 0.000152. clip: 0.060841
Iteration 12751: Policy loss: -0.080564. Value loss: 0.155979. Entropy: 0.307060.
Iteration 12752: Policy loss: -0.075217. Value loss: 0.055383. Entropy: 0.307182.
Iteration 12753: Policy loss: -0.090258. Value loss: 0.035091. Entropy: 0.305528.
episode: 4619   score: 345.0  epsilon: 1.0    steps: 736  evaluation reward: 442.15
Training network. lr: 0.000152. clip: 0.060841
Iteration 12754: Policy loss: 0.333788. Value loss: 0.143880. Entropy: 0.305607.
Iteration 12755: Policy loss: 0.324676. Value loss: 0.053926. Entropy: 0.305499.
Iteration 12756: Policy loss: 0.309556. Value loss: 0.033181. Entropy: 0.304786.
episode: 4620   score: 620.0  epsilon: 1.0    steps: 672  evaluation reward: 441.85
Training network. lr: 0.000152. clip: 0.060841
Iteration 12757: Policy loss: -0.239754. Value loss: 0.113437. Entropy: 0.307584.
Iteration 12758: Policy loss: -0.237169. Value loss: 0.032929. Entropy: 0.307182.
Iteration 12759: Policy loss: -0.24515

Iteration 12817: Policy loss: -0.177775. Value loss: 0.332181. Entropy: 0.305527.
Iteration 12818: Policy loss: -0.201570. Value loss: 0.224995. Entropy: 0.306289.
Iteration 12819: Policy loss: -0.206595. Value loss: 0.167293. Entropy: 0.305842.
episode: 4640   score: 390.0  epsilon: 1.0    steps: 408  evaluation reward: 454.3
Training network. lr: 0.000152. clip: 0.060685
Iteration 12820: Policy loss: 0.175916. Value loss: 0.130211. Entropy: 0.299511.
Iteration 12821: Policy loss: 0.154646. Value loss: 0.054325. Entropy: 0.298782.
Iteration 12822: Policy loss: 0.163828. Value loss: 0.040936. Entropy: 0.299551.
episode: 4641   score: 525.0  epsilon: 1.0    steps: 232  evaluation reward: 453.65
Training network. lr: 0.000152. clip: 0.060685
Iteration 12823: Policy loss: -0.049483. Value loss: 0.174068. Entropy: 0.303182.
Iteration 12824: Policy loss: -0.055462. Value loss: 0.065240. Entropy: 0.303348.
Iteration 12825: Policy loss: -0.059399. Value loss: 0.042521. Entropy: 0.303287.
epis

Iteration 12885: Policy loss: 0.279939. Value loss: 0.036166. Entropy: 0.304389.
episode: 4660   score: 470.0  epsilon: 1.0    steps: 24  evaluation reward: 452.7
episode: 4661   score: 695.0  epsilon: 1.0    steps: 424  evaluation reward: 457.3
Training network. lr: 0.000151. clip: 0.060528
Iteration 12886: Policy loss: 0.220254. Value loss: 0.092425. Entropy: 0.297155.
Iteration 12887: Policy loss: 0.210911. Value loss: 0.046941. Entropy: 0.296090.
Iteration 12888: Policy loss: 0.201828. Value loss: 0.035416. Entropy: 0.295289.
episode: 4662   score: 395.0  epsilon: 1.0    steps: 576  evaluation reward: 454.75
Training network. lr: 0.000151. clip: 0.060528
Iteration 12889: Policy loss: -0.215781. Value loss: 0.305529. Entropy: 0.301101.
Iteration 12890: Policy loss: -0.236913. Value loss: 0.209992. Entropy: 0.301893.
Iteration 12891: Policy loss: -0.249246. Value loss: 0.164611. Entropy: 0.300417.
Training network. lr: 0.000151. clip: 0.060528
Iteration 12892: Policy loss: 0.037967. 

Iteration 12951: Policy loss: 0.089486. Value loss: 0.035042. Entropy: 0.301864.
Training network. lr: 0.000151. clip: 0.060224
Iteration 12952: Policy loss: -0.088193. Value loss: 0.103093. Entropy: 0.308161.
Iteration 12953: Policy loss: -0.093219. Value loss: 0.045027. Entropy: 0.309684.
Iteration 12954: Policy loss: -0.091804. Value loss: 0.029955. Entropy: 0.308408.
episode: 4682   score: 545.0  epsilon: 1.0    steps: 72  evaluation reward: 430.75
Training network. lr: 0.000151. clip: 0.060224
Iteration 12955: Policy loss: -0.060240. Value loss: 0.057997. Entropy: 0.305954.
Iteration 12956: Policy loss: -0.063013. Value loss: 0.027423. Entropy: 0.306637.
Iteration 12957: Policy loss: -0.060893. Value loss: 0.020338. Entropy: 0.306661.
Training network. lr: 0.000151. clip: 0.060224
Iteration 12958: Policy loss: -0.089340. Value loss: 0.273387. Entropy: 0.304799.
Iteration 12959: Policy loss: -0.090256. Value loss: 0.060808. Entropy: 0.305574.
Iteration 12960: Policy loss: -0.083546

Training network. lr: 0.000150. clip: 0.060067
Iteration 13018: Policy loss: -0.115650. Value loss: 0.081771. Entropy: 0.302297.
Iteration 13019: Policy loss: -0.120664. Value loss: 0.048369. Entropy: 0.302748.
Iteration 13020: Policy loss: -0.120077. Value loss: 0.037881. Entropy: 0.302395.
episode: 4702   score: 210.0  epsilon: 1.0    steps: 864  evaluation reward: 435.65
Training network. lr: 0.000150. clip: 0.060067
Iteration 13021: Policy loss: -0.169760. Value loss: 0.334976. Entropy: 0.298725.
Iteration 13022: Policy loss: -0.194571. Value loss: 0.243017. Entropy: 0.298567.
Iteration 13023: Policy loss: -0.191747. Value loss: 0.195061. Entropy: 0.298459.
episode: 4703   score: 315.0  epsilon: 1.0    steps: 80  evaluation reward: 434.55
Training network. lr: 0.000150. clip: 0.060067
Iteration 13024: Policy loss: 0.283928. Value loss: 0.061490. Entropy: 0.300003.
Iteration 13025: Policy loss: 0.275963. Value loss: 0.021085. Entropy: 0.299917.
Iteration 13026: Policy loss: 0.269857

Training network. lr: 0.000150. clip: 0.059920
Iteration 13084: Policy loss: -0.264583. Value loss: 0.239852. Entropy: 0.311697.
Iteration 13085: Policy loss: -0.275203. Value loss: 0.077849. Entropy: 0.311009.
Iteration 13086: Policy loss: -0.285596. Value loss: 0.049208. Entropy: 0.311869.
episode: 4724   score: 280.0  epsilon: 1.0    steps: 728  evaluation reward: 410.2
episode: 4725   score: 255.0  epsilon: 1.0    steps: 872  evaluation reward: 408.55
Training network. lr: 0.000150. clip: 0.059920
Iteration 13087: Policy loss: 0.150277. Value loss: 0.151109. Entropy: 0.295523.
Iteration 13088: Policy loss: 0.143358. Value loss: 0.065786. Entropy: 0.295962.
Iteration 13089: Policy loss: 0.123630. Value loss: 0.046628. Entropy: 0.295520.
episode: 4726   score: 265.0  epsilon: 1.0    steps: 72  evaluation reward: 406.2
episode: 4727   score: 180.0  epsilon: 1.0    steps: 88  evaluation reward: 403.6
episode: 4728   score: 665.0  epsilon: 1.0    steps: 552  evaluation reward: 406.9
Tra

Iteration 13149: Policy loss: -0.029995. Value loss: 0.034396. Entropy: 0.300074.
episode: 4747   score: 390.0  epsilon: 1.0    steps: 168  evaluation reward: 382.95
Training network. lr: 0.000149. clip: 0.059763
Iteration 13150: Policy loss: 0.212543. Value loss: 0.102724. Entropy: 0.294686.
Iteration 13151: Policy loss: 0.209083. Value loss: 0.049306. Entropy: 0.295433.
Iteration 13152: Policy loss: 0.203115. Value loss: 0.033731. Entropy: 0.294744.
Training network. lr: 0.000149. clip: 0.059606
Iteration 13153: Policy loss: 0.310591. Value loss: 0.143064. Entropy: 0.305237.
Iteration 13154: Policy loss: 0.314604. Value loss: 0.045054. Entropy: 0.303341.
Iteration 13155: Policy loss: 0.291419. Value loss: 0.031795. Entropy: 0.302823.
Training network. lr: 0.000149. clip: 0.059606
Iteration 13156: Policy loss: -0.078737. Value loss: 0.261728. Entropy: 0.308277.
Iteration 13157: Policy loss: -0.072828. Value loss: 0.063710. Entropy: 0.306779.
Iteration 13158: Policy loss: -0.105549. Va

Iteration 13215: Policy loss: 0.139761. Value loss: 0.039879. Entropy: 0.300475.
episode: 4769   score: 670.0  epsilon: 1.0    steps: 336  evaluation reward: 384.7
episode: 4770   score: 210.0  epsilon: 1.0    steps: 584  evaluation reward: 384.05
Training network. lr: 0.000149. clip: 0.059459
Iteration 13216: Policy loss: -0.399954. Value loss: 0.281286. Entropy: 0.272338.
Iteration 13217: Policy loss: -0.415489. Value loss: 0.166535. Entropy: 0.271434.
Iteration 13218: Policy loss: -0.400053. Value loss: 0.090002. Entropy: 0.271412.
Training network. lr: 0.000149. clip: 0.059459
Iteration 13219: Policy loss: 0.348743. Value loss: 0.249888. Entropy: 0.306530.
Iteration 13220: Policy loss: 0.325824. Value loss: 0.087974. Entropy: 0.305139.
Iteration 13221: Policy loss: 0.327955. Value loss: 0.059071. Entropy: 0.305765.
Training network. lr: 0.000149. clip: 0.059459
Iteration 13222: Policy loss: 0.381014. Value loss: 0.223434. Entropy: 0.310151.
Iteration 13223: Policy loss: 0.364051. V

Iteration 13281: Policy loss: 0.350238. Value loss: 0.040040. Entropy: 0.298029.
episode: 4791   score: 360.0  epsilon: 1.0    steps: 688  evaluation reward: 376.9
Training network. lr: 0.000148. clip: 0.059302
Iteration 13282: Policy loss: 0.276260. Value loss: 0.132523. Entropy: 0.293139.
Iteration 13283: Policy loss: 0.268364. Value loss: 0.059977. Entropy: 0.291900.
Iteration 13284: Policy loss: 0.261610. Value loss: 0.042240. Entropy: 0.292014.
episode: 4792   score: 390.0  epsilon: 1.0    steps: 600  evaluation reward: 370.0
Training network. lr: 0.000148. clip: 0.059302
Iteration 13285: Policy loss: 0.026552. Value loss: 0.117925. Entropy: 0.296951.
Iteration 13286: Policy loss: 0.025773. Value loss: 0.053997. Entropy: 0.297771.
Iteration 13287: Policy loss: 0.023884. Value loss: 0.041052. Entropy: 0.296848.
Training network. lr: 0.000148. clip: 0.059302
Iteration 13288: Policy loss: -0.154119. Value loss: 0.291868. Entropy: 0.308219.
Iteration 13289: Policy loss: -0.161703. Val

Training network. lr: 0.000148. clip: 0.059145
Iteration 13348: Policy loss: -0.105645. Value loss: 0.269599. Entropy: 0.286817.
Iteration 13349: Policy loss: -0.113336. Value loss: 0.168114. Entropy: 0.287007.
Iteration 13350: Policy loss: -0.119505. Value loss: 0.124045. Entropy: 0.286264.
Training network. lr: 0.000147. clip: 0.058998
Iteration 13351: Policy loss: 0.031157. Value loss: 0.060158. Entropy: 0.304908.
Iteration 13352: Policy loss: 0.028012. Value loss: 0.028098. Entropy: 0.306497.
Iteration 13353: Policy loss: 0.027154. Value loss: 0.019564. Entropy: 0.304714.
Training network. lr: 0.000147. clip: 0.058998
Iteration 13354: Policy loss: 0.072026. Value loss: 0.116775. Entropy: 0.301036.
Iteration 13355: Policy loss: 0.070229. Value loss: 0.035532. Entropy: 0.299151.
Iteration 13356: Policy loss: 0.060303. Value loss: 0.026119. Entropy: 0.299078.
episode: 4812   score: 465.0  epsilon: 1.0    steps: 856  evaluation reward: 372.95
Training network. lr: 0.000147. clip: 0.058

Iteration 13418: Policy loss: -0.022419. Value loss: 0.031019. Entropy: 0.311855.
Iteration 13419: Policy loss: -0.020889. Value loss: 0.022356. Entropy: 0.312287.
episode: 4829   score: 650.0  epsilon: 1.0    steps: 960  evaluation reward: 395.2
Training network. lr: 0.000147. clip: 0.058841
Iteration 13420: Policy loss: -0.151879. Value loss: 0.336565. Entropy: 0.307998.
Iteration 13421: Policy loss: -0.152494. Value loss: 0.176768. Entropy: 0.308525.
Iteration 13422: Policy loss: -0.154722. Value loss: 0.112291. Entropy: 0.307680.
episode: 4830   score: 290.0  epsilon: 1.0    steps: 520  evaluation reward: 396.3
Training network. lr: 0.000147. clip: 0.058841
Iteration 13423: Policy loss: 0.314799. Value loss: 0.107065. Entropy: 0.290565.
Iteration 13424: Policy loss: 0.298116. Value loss: 0.042326. Entropy: 0.290332.
Iteration 13425: Policy loss: 0.295015. Value loss: 0.028957. Entropy: 0.289739.
episode: 4831   score: 395.0  epsilon: 1.0    steps: 448  evaluation reward: 396.8
epis

Iteration 13484: Policy loss: 0.374229. Value loss: 0.062773. Entropy: 0.279806.
Iteration 13485: Policy loss: 0.379655. Value loss: 0.049094. Entropy: 0.278771.
now time :  2019-09-06 04:11:59.881842
episode: 4851   score: 360.0  epsilon: 1.0    steps: 160  evaluation reward: 410.15
Training network. lr: 0.000147. clip: 0.058685
Iteration 13486: Policy loss: 0.415706. Value loss: 0.148570. Entropy: 0.290778.
Iteration 13487: Policy loss: 0.420972. Value loss: 0.068815. Entropy: 0.292939.
Iteration 13488: Policy loss: 0.409977. Value loss: 0.050840. Entropy: 0.294066.
episode: 4852   score: 565.0  epsilon: 1.0    steps: 720  evaluation reward: 413.05
Training network. lr: 0.000147. clip: 0.058685
Iteration 13489: Policy loss: -0.200697. Value loss: 0.137990. Entropy: 0.297876.
Iteration 13490: Policy loss: -0.209752. Value loss: 0.053489. Entropy: 0.297385.
Iteration 13491: Policy loss: -0.207429. Value loss: 0.037061. Entropy: 0.297911.
episode: 4853   score: 670.0  epsilon: 1.0    st

Iteration 13551: Policy loss: -0.114384. Value loss: 0.055397. Entropy: 0.298861.
episode: 4871   score: 450.0  epsilon: 1.0    steps: 384  evaluation reward: 430.05
episode: 4872   score: 520.0  epsilon: 1.0    steps: 600  evaluation reward: 433.7
Training network. lr: 0.000146. clip: 0.058381
Iteration 13552: Policy loss: 0.161387. Value loss: 0.106854. Entropy: 0.293767.
Iteration 13553: Policy loss: 0.148537. Value loss: 0.042511. Entropy: 0.293314.
Iteration 13554: Policy loss: 0.144773. Value loss: 0.031101. Entropy: 0.292746.
episode: 4873   score: 290.0  epsilon: 1.0    steps: 424  evaluation reward: 433.45
Training network. lr: 0.000146. clip: 0.058381
Iteration 13555: Policy loss: -0.033609. Value loss: 0.310251. Entropy: 0.293293.
Iteration 13556: Policy loss: -0.056863. Value loss: 0.124477. Entropy: 0.295718.
Iteration 13557: Policy loss: -0.064127. Value loss: 0.063491. Entropy: 0.292859.
Training network. lr: 0.000146. clip: 0.058381
Iteration 13558: Policy loss: 0.12577

episode: 4892   score: 265.0  epsilon: 1.0    steps: 448  evaluation reward: 445.3
Training network. lr: 0.000146. clip: 0.058224
Iteration 13618: Policy loss: -0.235453. Value loss: 0.347320. Entropy: 0.299752.
Iteration 13619: Policy loss: -0.230404. Value loss: 0.127027. Entropy: 0.301023.
Iteration 13620: Policy loss: -0.246439. Value loss: 0.076233. Entropy: 0.298853.
Training network. lr: 0.000146. clip: 0.058224
Iteration 13621: Policy loss: 0.200218. Value loss: 0.110886. Entropy: 0.310666.
Iteration 13622: Policy loss: 0.192617. Value loss: 0.051543. Entropy: 0.309865.
Iteration 13623: Policy loss: 0.189925. Value loss: 0.037936. Entropy: 0.310097.
episode: 4893   score: 570.0  epsilon: 1.0    steps: 392  evaluation reward: 448.0
episode: 4894   score: 230.0  epsilon: 1.0    steps: 664  evaluation reward: 448.8
Training network. lr: 0.000146. clip: 0.058224
Iteration 13624: Policy loss: 0.038048. Value loss: 0.280083. Entropy: 0.285305.
Iteration 13625: Policy loss: 0.037535. 

Training network. lr: 0.000145. clip: 0.058076
Iteration 13684: Policy loss: 0.263702. Value loss: 0.094346. Entropy: 0.290135.
Iteration 13685: Policy loss: 0.257541. Value loss: 0.041321. Entropy: 0.290659.
Iteration 13686: Policy loss: 0.256620. Value loss: 0.027986. Entropy: 0.289648.
episode: 4914   score: 660.0  epsilon: 1.0    steps: 456  evaluation reward: 437.55
Training network. lr: 0.000145. clip: 0.058076
Iteration 13687: Policy loss: -0.020835. Value loss: 0.105588. Entropy: 0.301305.
Iteration 13688: Policy loss: -0.029640. Value loss: 0.043553. Entropy: 0.301355.
Iteration 13689: Policy loss: -0.028576. Value loss: 0.030553. Entropy: 0.300768.
episode: 4915   score: 525.0  epsilon: 1.0    steps: 72  evaluation reward: 436.9
episode: 4916   score: 425.0  epsilon: 1.0    steps: 1000  evaluation reward: 436.65
Training network. lr: 0.000145. clip: 0.058076
Iteration 13690: Policy loss: -0.099152. Value loss: 0.109663. Entropy: 0.291407.
Iteration 13691: Policy loss: -0.0992

Iteration 13750: Policy loss: -0.066928. Value loss: 0.074673. Entropy: 0.304340.
Iteration 13751: Policy loss: -0.070034. Value loss: 0.035704. Entropy: 0.304537.
Iteration 13752: Policy loss: -0.072209. Value loss: 0.027386. Entropy: 0.304149.
Training network. lr: 0.000144. clip: 0.057763
Iteration 13753: Policy loss: 0.159513. Value loss: 0.089757. Entropy: 0.309339.
Iteration 13754: Policy loss: 0.160290. Value loss: 0.030449. Entropy: 0.309333.
Iteration 13755: Policy loss: 0.152283. Value loss: 0.020818. Entropy: 0.308811.
Training network. lr: 0.000144. clip: 0.057763
Iteration 13756: Policy loss: -0.387531. Value loss: 0.337736. Entropy: 0.300533.
Iteration 13757: Policy loss: -0.408868. Value loss: 0.241963. Entropy: 0.303179.
Iteration 13758: Policy loss: -0.406500. Value loss: 0.200855. Entropy: 0.302783.
episode: 4935   score: 535.0  epsilon: 1.0    steps: 408  evaluation reward: 426.25
Training network. lr: 0.000144. clip: 0.057763
Iteration 13759: Policy loss: -0.233321.

Training network. lr: 0.000144. clip: 0.057616
Iteration 13819: Policy loss: 0.273240. Value loss: 0.195377. Entropy: 0.310326.
Iteration 13820: Policy loss: 0.260658. Value loss: 0.060758. Entropy: 0.309195.
Iteration 13821: Policy loss: 0.251218. Value loss: 0.033326. Entropy: 0.308990.
episode: 4953   score: 565.0  epsilon: 1.0    steps: 1008  evaluation reward: 425.35
Training network. lr: 0.000144. clip: 0.057616
Iteration 13822: Policy loss: 0.311992. Value loss: 0.164922. Entropy: 0.298685.
Iteration 13823: Policy loss: 0.292794. Value loss: 0.068820. Entropy: 0.296902.
Iteration 13824: Policy loss: 0.289781. Value loss: 0.044344. Entropy: 0.296111.
episode: 4954   score: 460.0  epsilon: 1.0    steps: 280  evaluation reward: 427.85
Training network. lr: 0.000144. clip: 0.057616
Iteration 13825: Policy loss: -0.281105. Value loss: 0.337679. Entropy: 0.274414.
Iteration 13826: Policy loss: -0.280813. Value loss: 0.180460. Entropy: 0.273885.
Iteration 13827: Policy loss: -0.304388.

Iteration 13887: Policy loss: 0.343204. Value loss: 0.029123. Entropy: 0.306091.
Training network. lr: 0.000144. clip: 0.057459
Iteration 13888: Policy loss: -0.019030. Value loss: 0.168571. Entropy: 0.308689.
Iteration 13889: Policy loss: -0.021450. Value loss: 0.064498. Entropy: 0.309123.
Iteration 13890: Policy loss: -0.025324. Value loss: 0.041114. Entropy: 0.309183.
episode: 4972   score: 340.0  epsilon: 1.0    steps: 936  evaluation reward: 435.2
Training network. lr: 0.000144. clip: 0.057459
Iteration 13891: Policy loss: 0.131167. Value loss: 0.115040. Entropy: 0.308491.
Iteration 13892: Policy loss: 0.130816. Value loss: 0.036986. Entropy: 0.309590.
Iteration 13893: Policy loss: 0.123065. Value loss: 0.025929. Entropy: 0.310033.
episode: 4973   score: 590.0  epsilon: 1.0    steps: 184  evaluation reward: 438.2
episode: 4974   score: 230.0  epsilon: 1.0    steps: 224  evaluation reward: 434.85
Training network. lr: 0.000144. clip: 0.057459
Iteration 13894: Policy loss: -0.032954

Iteration 13952: Policy loss: 0.009194. Value loss: 0.045191. Entropy: 0.317072.
Iteration 13953: Policy loss: 0.001637. Value loss: 0.033677. Entropy: 0.315841.
episode: 4995   score: 285.0  epsilon: 1.0    steps: 192  evaluation reward: 423.25
Training network. lr: 0.000143. clip: 0.057155
Iteration 13954: Policy loss: -0.471267. Value loss: 0.339699. Entropy: 0.302895.
Iteration 13955: Policy loss: -0.499427. Value loss: 0.141714. Entropy: 0.303934.
Iteration 13956: Policy loss: -0.516259. Value loss: 0.089423. Entropy: 0.304393.
Training network. lr: 0.000143. clip: 0.057155
Iteration 13957: Policy loss: 0.053315. Value loss: 0.259432. Entropy: 0.305157.
Iteration 13958: Policy loss: 0.042784. Value loss: 0.093029. Entropy: 0.304171.
Iteration 13959: Policy loss: 0.031710. Value loss: 0.048776. Entropy: 0.303602.
episode: 4996   score: 260.0  epsilon: 1.0    steps: 304  evaluation reward: 422.55
Training network. lr: 0.000143. clip: 0.057155
Iteration 13960: Policy loss: -0.231778.

Iteration 14017: Policy loss: -0.149772. Value loss: 0.245146. Entropy: 0.303678.
Iteration 14018: Policy loss: -0.147961. Value loss: 0.090253. Entropy: 0.303674.
Iteration 14019: Policy loss: -0.155505. Value loss: 0.056157. Entropy: 0.303617.
Training network. lr: 0.000142. clip: 0.056998
Iteration 14020: Policy loss: 0.098790. Value loss: 0.170559. Entropy: 0.302051.
Iteration 14021: Policy loss: 0.089167. Value loss: 0.068179. Entropy: 0.301805.
Iteration 14022: Policy loss: 0.082335. Value loss: 0.047614. Entropy: 0.300354.
episode: 5017   score: 210.0  epsilon: 1.0    steps: 544  evaluation reward: 435.9
Training network. lr: 0.000142. clip: 0.056998
Iteration 14023: Policy loss: -0.023430. Value loss: 0.115354. Entropy: 0.297356.
Iteration 14024: Policy loss: -0.017682. Value loss: 0.039104. Entropy: 0.297426.
Iteration 14025: Policy loss: -0.024908. Value loss: 0.027006. Entropy: 0.297044.
episode: 5018   score: 250.0  epsilon: 1.0    steps: 608  evaluation reward: 434.5
Train

Iteration 14084: Policy loss: 0.189425. Value loss: 0.051701. Entropy: 0.310718.
Iteration 14085: Policy loss: 0.185040. Value loss: 0.039148. Entropy: 0.310189.
episode: 5038   score: 775.0  epsilon: 1.0    steps: 480  evaluation reward: 428.55
episode: 5039   score: 315.0  epsilon: 1.0    steps: 1016  evaluation reward: 429.35
Training network. lr: 0.000142. clip: 0.056841
Iteration 14086: Policy loss: 0.081852. Value loss: 0.109350. Entropy: 0.302227.
Iteration 14087: Policy loss: 0.074060. Value loss: 0.053786. Entropy: 0.302421.
Iteration 14088: Policy loss: 0.075116. Value loss: 0.038493. Entropy: 0.303679.
episode: 5040   score: 260.0  epsilon: 1.0    steps: 48  evaluation reward: 426.45
Training network. lr: 0.000142. clip: 0.056841
Iteration 14089: Policy loss: -0.072136. Value loss: 0.124776. Entropy: 0.299273.
Iteration 14090: Policy loss: -0.078425. Value loss: 0.043907. Entropy: 0.297912.
Iteration 14091: Policy loss: -0.080907. Value loss: 0.033711. Entropy: 0.297893.
Tra

Training network. lr: 0.000141. clip: 0.056537
Iteration 14152: Policy loss: 0.459755. Value loss: 0.228411. Entropy: 0.302139.
Iteration 14153: Policy loss: 0.449273. Value loss: 0.075400. Entropy: 0.303998.
Iteration 14154: Policy loss: 0.440298. Value loss: 0.050822. Entropy: 0.302944.
Training network. lr: 0.000141. clip: 0.056537
Iteration 14155: Policy loss: 0.207239. Value loss: 0.133116. Entropy: 0.305333.
Iteration 14156: Policy loss: 0.197494. Value loss: 0.047521. Entropy: 0.305219.
Iteration 14157: Policy loss: 0.188617. Value loss: 0.033947. Entropy: 0.304602.
Training network. lr: 0.000141. clip: 0.056537
Iteration 14158: Policy loss: 0.146266. Value loss: 0.079311. Entropy: 0.312337.
Iteration 14159: Policy loss: 0.141162. Value loss: 0.036957. Entropy: 0.312185.
Iteration 14160: Policy loss: 0.133945. Value loss: 0.026114. Entropy: 0.311612.
episode: 5057   score: 290.0  epsilon: 1.0    steps: 696  evaluation reward: 420.45
episode: 5058   score: 550.0  epsilon: 1.0    

episode: 5076   score: 375.0  epsilon: 1.0    steps: 960  evaluation reward: 406.2
Training network. lr: 0.000141. clip: 0.056381
Iteration 14221: Policy loss: -0.123049. Value loss: 0.085358. Entropy: 0.298465.
Iteration 14222: Policy loss: -0.127811. Value loss: 0.038465. Entropy: 0.302352.
Iteration 14223: Policy loss: -0.131641. Value loss: 0.027366. Entropy: 0.301454.
Training network. lr: 0.000141. clip: 0.056381
Iteration 14224: Policy loss: -0.004266. Value loss: 0.110854. Entropy: 0.308584.
Iteration 14225: Policy loss: -0.007084. Value loss: 0.035324. Entropy: 0.308568.
Iteration 14226: Policy loss: -0.013107. Value loss: 0.023541. Entropy: 0.308551.
episode: 5077   score: 410.0  epsilon: 1.0    steps: 416  evaluation reward: 405.2
Training network. lr: 0.000141. clip: 0.056381
Iteration 14227: Policy loss: 0.159574. Value loss: 0.104824. Entropy: 0.308142.
Iteration 14228: Policy loss: 0.158594. Value loss: 0.051662. Entropy: 0.306152.
Iteration 14229: Policy loss: 0.145923.

Iteration 14287: Policy loss: 0.076737. Value loss: 0.119756. Entropy: 0.307273.
Iteration 14288: Policy loss: 0.067054. Value loss: 0.042658. Entropy: 0.307758.
Iteration 14289: Policy loss: 0.055291. Value loss: 0.026186. Entropy: 0.307439.
episode: 5097   score: 440.0  epsilon: 1.0    steps: 208  evaluation reward: 396.4
Training network. lr: 0.000141. clip: 0.056233
Iteration 14290: Policy loss: -0.019694. Value loss: 0.147178. Entropy: 0.303520.
Iteration 14291: Policy loss: -0.023719. Value loss: 0.064956. Entropy: 0.303768.
Iteration 14292: Policy loss: -0.034171. Value loss: 0.044830. Entropy: 0.304486.
episode: 5098   score: 420.0  epsilon: 1.0    steps: 520  evaluation reward: 396.7
Training network. lr: 0.000141. clip: 0.056233
Iteration 14293: Policy loss: 0.015646. Value loss: 0.090866. Entropy: 0.308295.
Iteration 14294: Policy loss: 0.015644. Value loss: 0.040067. Entropy: 0.308031.
Iteration 14295: Policy loss: 0.009522. Value loss: 0.028191. Entropy: 0.308548.
episode:

Iteration 14352: Policy loss: 0.033554. Value loss: 0.045472. Entropy: 0.304013.
episode: 5120   score: 330.0  epsilon: 1.0    steps: 528  evaluation reward: 382.35
Training network. lr: 0.000140. clip: 0.055920
Iteration 14353: Policy loss: -0.051892. Value loss: 0.078087. Entropy: 0.296208.
Iteration 14354: Policy loss: -0.059292. Value loss: 0.039194. Entropy: 0.295878.
Iteration 14355: Policy loss: -0.056710. Value loss: 0.030465. Entropy: 0.295647.
episode: 5121   score: 315.0  epsilon: 1.0    steps: 216  evaluation reward: 382.3
Training network. lr: 0.000140. clip: 0.055920
Iteration 14356: Policy loss: -0.539072. Value loss: 0.468196. Entropy: 0.287567.
Iteration 14357: Policy loss: -0.549631. Value loss: 0.237155. Entropy: 0.286948.
Iteration 14358: Policy loss: -0.567907. Value loss: 0.180925. Entropy: 0.288497.
episode: 5122   score: 215.0  epsilon: 1.0    steps: 704  evaluation reward: 382.35
episode: 5123   score: 365.0  epsilon: 1.0    steps: 896  evaluation reward: 379.7

Training network. lr: 0.000139. clip: 0.055772
Iteration 14419: Policy loss: 0.121521. Value loss: 0.082669. Entropy: 0.293068.
Iteration 14420: Policy loss: 0.112011. Value loss: 0.035575. Entropy: 0.292698.
Iteration 14421: Policy loss: 0.113728. Value loss: 0.027699. Entropy: 0.293116.
episode: 5141   score: 475.0  epsilon: 1.0    steps: 232  evaluation reward: 393.65
Training network. lr: 0.000139. clip: 0.055772
Iteration 14422: Policy loss: -0.527589. Value loss: 0.512845. Entropy: 0.303408.
Iteration 14423: Policy loss: -0.513948. Value loss: 0.331176. Entropy: 0.302588.
Iteration 14424: Policy loss: -0.533282. Value loss: 0.181932. Entropy: 0.302648.
Training network. lr: 0.000139. clip: 0.055772
Iteration 14425: Policy loss: 0.013581. Value loss: 0.063798. Entropy: 0.306241.
Iteration 14426: Policy loss: 0.005858. Value loss: 0.034777. Entropy: 0.306289.
Iteration 14427: Policy loss: 0.003384. Value loss: 0.026939. Entropy: 0.306247.
Training network. lr: 0.000139. clip: 0.055

Iteration 14488: Policy loss: 0.256546. Value loss: 0.086816. Entropy: 0.311939.
Iteration 14489: Policy loss: 0.252829. Value loss: 0.031478. Entropy: 0.310845.
Iteration 14490: Policy loss: 0.255796. Value loss: 0.022843. Entropy: 0.312225.
episode: 5158   score: 420.0  epsilon: 1.0    steps: 752  evaluation reward: 398.0
Training network. lr: 0.000139. clip: 0.055616
Iteration 14491: Policy loss: -0.125088. Value loss: 0.339781. Entropy: 0.303041.
Iteration 14492: Policy loss: -0.140594. Value loss: 0.152893. Entropy: 0.303349.
Iteration 14493: Policy loss: -0.121747. Value loss: 0.105609. Entropy: 0.304477.
episode: 5159   score: 880.0  epsilon: 1.0    steps: 192  evaluation reward: 402.9
Training network. lr: 0.000139. clip: 0.055616
Iteration 14494: Policy loss: 0.267877. Value loss: 0.137623. Entropy: 0.293865.
Iteration 14495: Policy loss: 0.254914. Value loss: 0.053802. Entropy: 0.291224.
Iteration 14496: Policy loss: 0.261589. Value loss: 0.042071. Entropy: 0.291240.
episode:

Iteration 14555: Policy loss: -0.150839. Value loss: 0.214371. Entropy: 0.290939.
Iteration 14556: Policy loss: -0.142720. Value loss: 0.129122. Entropy: 0.289934.
episode: 5179   score: 670.0  epsilon: 1.0    steps: 128  evaluation reward: 417.3
Training network. lr: 0.000138. clip: 0.055312
Iteration 14557: Policy loss: -0.007999. Value loss: 0.119262. Entropy: 0.299540.
Iteration 14558: Policy loss: -0.014632. Value loss: 0.042008. Entropy: 0.297787.
Iteration 14559: Policy loss: -0.015283. Value loss: 0.027376. Entropy: 0.299670.
episode: 5180   score: 190.0  epsilon: 1.0    steps: 824  evaluation reward: 414.65
episode: 5181   score: 805.0  epsilon: 1.0    steps: 936  evaluation reward: 419.25
Training network. lr: 0.000138. clip: 0.055312
Iteration 14560: Policy loss: -0.036369. Value loss: 0.488955. Entropy: 0.293930.
Iteration 14561: Policy loss: -0.046800. Value loss: 0.283490. Entropy: 0.294065.
Iteration 14562: Policy loss: -0.056193. Value loss: 0.189510. Entropy: 0.292887.

episode: 5200   score: 390.0  epsilon: 1.0    steps: 296  evaluation reward: 423.1
Training network. lr: 0.000138. clip: 0.055155
Iteration 14623: Policy loss: 0.126251. Value loss: 0.106447. Entropy: 0.296615.
Iteration 14624: Policy loss: 0.129388. Value loss: 0.045461. Entropy: 0.295226.
Iteration 14625: Policy loss: 0.126613. Value loss: 0.033737. Entropy: 0.295720.
now time :  2019-09-06 05:22:48.201755
episode: 5201   score: 415.0  epsilon: 1.0    steps: 480  evaluation reward: 421.05
Training network. lr: 0.000138. clip: 0.055155
Iteration 14626: Policy loss: -0.016539. Value loss: 0.092572. Entropy: 0.305789.
Iteration 14627: Policy loss: -0.018491. Value loss: 0.049541. Entropy: 0.305958.
Iteration 14628: Policy loss: -0.021685. Value loss: 0.036101. Entropy: 0.305992.
Training network. lr: 0.000138. clip: 0.055155
Iteration 14629: Policy loss: -0.298813. Value loss: 0.380087. Entropy: 0.312884.
Iteration 14630: Policy loss: -0.313164. Value loss: 0.286805. Entropy: 0.310790.


episode: 5221   score: 285.0  epsilon: 1.0    steps: 472  evaluation reward: 426.55
episode: 5222   score: 530.0  epsilon: 1.0    steps: 656  evaluation reward: 429.7
Training network. lr: 0.000137. clip: 0.054998
Iteration 14689: Policy loss: 0.212393. Value loss: 0.136196. Entropy: 0.292539.
Iteration 14690: Policy loss: 0.194307. Value loss: 0.046728. Entropy: 0.290298.
Iteration 14691: Policy loss: 0.189601. Value loss: 0.035216. Entropy: 0.290386.
Training network. lr: 0.000137. clip: 0.054998
Iteration 14692: Policy loss: 0.040353. Value loss: 0.144197. Entropy: 0.307658.
Iteration 14693: Policy loss: 0.034081. Value loss: 0.069106. Entropy: 0.306191.
Iteration 14694: Policy loss: 0.021967. Value loss: 0.051059. Entropy: 0.306683.
episode: 5223   score: 330.0  epsilon: 1.0    steps: 800  evaluation reward: 429.35
Training network. lr: 0.000137. clip: 0.054998
Iteration 14695: Policy loss: 0.125728. Value loss: 0.128840. Entropy: 0.304972.
Iteration 14696: Policy loss: 0.128676. V

Training network. lr: 0.000137. clip: 0.054694
Iteration 14755: Policy loss: -0.086558. Value loss: 0.203679. Entropy: 0.304779.
Iteration 14756: Policy loss: -0.105745. Value loss: 0.085736. Entropy: 0.304042.
Iteration 14757: Policy loss: -0.108599. Value loss: 0.063025. Entropy: 0.304196.
Training network. lr: 0.000137. clip: 0.054694
Iteration 14758: Policy loss: 0.121828. Value loss: 0.124117. Entropy: 0.315047.
Iteration 14759: Policy loss: 0.111008. Value loss: 0.048384. Entropy: 0.314234.
Iteration 14760: Policy loss: 0.110784. Value loss: 0.032211. Entropy: 0.314491.
episode: 5243   score: 330.0  epsilon: 1.0    steps: 744  evaluation reward: 415.6
Training network. lr: 0.000137. clip: 0.054694
Iteration 14761: Policy loss: 0.092039. Value loss: 0.145707. Entropy: 0.308730.
Iteration 14762: Policy loss: 0.095331. Value loss: 0.045498. Entropy: 0.307513.
Iteration 14763: Policy loss: 0.079706. Value loss: 0.027851. Entropy: 0.307350.
Training network. lr: 0.000137. clip: 0.0546

episode: 5261   score: 495.0  epsilon: 1.0    steps: 696  evaluation reward: 416.35
episode: 5262   score: 260.0  epsilon: 1.0    steps: 896  evaluation reward: 410.85
Training network. lr: 0.000136. clip: 0.054537
Iteration 14824: Policy loss: -0.116322. Value loss: 0.228719. Entropy: 0.302048.
Iteration 14825: Policy loss: -0.126480. Value loss: 0.072303. Entropy: 0.304386.
Iteration 14826: Policy loss: -0.122901. Value loss: 0.048354. Entropy: 0.302262.
episode: 5263   score: 315.0  epsilon: 1.0    steps: 192  evaluation reward: 408.0
episode: 5264   score: 620.0  epsilon: 1.0    steps: 480  evaluation reward: 412.1
Training network. lr: 0.000136. clip: 0.054537
Iteration 14827: Policy loss: 0.053939. Value loss: 0.126396. Entropy: 0.296480.
Iteration 14828: Policy loss: 0.049175. Value loss: 0.060063. Entropy: 0.295929.
Iteration 14829: Policy loss: 0.050565. Value loss: 0.042477. Entropy: 0.296083.
episode: 5265   score: 505.0  epsilon: 1.0    steps: 80  evaluation reward: 413.7
T

Iteration 14889: Policy loss: -0.038010. Value loss: 0.033124. Entropy: 0.288830.
episode: 5284   score: 300.0  epsilon: 1.0    steps: 760  evaluation reward: 404.0
Training network. lr: 0.000136. clip: 0.054390
Iteration 14890: Policy loss: 0.100438. Value loss: 0.167269. Entropy: 0.297565.
Iteration 14891: Policy loss: 0.093718. Value loss: 0.071638. Entropy: 0.295981.
Iteration 14892: Policy loss: 0.090030. Value loss: 0.049246. Entropy: 0.296033.
episode: 5285   score: 385.0  epsilon: 1.0    steps: 856  evaluation reward: 406.5
Training network. lr: 0.000136. clip: 0.054390
Iteration 14893: Policy loss: -0.076468. Value loss: 0.269418. Entropy: 0.302739.
Iteration 14894: Policy loss: -0.117292. Value loss: 0.108064. Entropy: 0.300783.
Iteration 14895: Policy loss: -0.112097. Value loss: 0.059586. Entropy: 0.301673.
Training network. lr: 0.000136. clip: 0.054390
Iteration 14896: Policy loss: -0.004285. Value loss: 0.072377. Entropy: 0.305981.
Iteration 14897: Policy loss: -0.005043.

episode: 5304   score: 365.0  epsilon: 1.0    steps: 936  evaluation reward: 412.2
Training network. lr: 0.000135. clip: 0.054077
Iteration 14956: Policy loss: 0.297953. Value loss: 0.121796. Entropy: 0.298643.
Iteration 14957: Policy loss: 0.295516. Value loss: 0.046722. Entropy: 0.297057.
Iteration 14958: Policy loss: 0.291842. Value loss: 0.037501. Entropy: 0.297129.
episode: 5305   score: 585.0  epsilon: 1.0    steps: 408  evaluation reward: 415.95
Training network. lr: 0.000135. clip: 0.054077
Iteration 14959: Policy loss: -0.286050. Value loss: 0.309210. Entropy: 0.287892.
Iteration 14960: Policy loss: -0.290609. Value loss: 0.201863. Entropy: 0.286340.
Iteration 14961: Policy loss: -0.311435. Value loss: 0.140086. Entropy: 0.286601.
Training network. lr: 0.000135. clip: 0.054077
Iteration 14962: Policy loss: -0.430441. Value loss: 0.194173. Entropy: 0.312183.
Iteration 14963: Policy loss: -0.433845. Value loss: 0.075662. Entropy: 0.312727.
Iteration 14964: Policy loss: -0.441808

Iteration 15022: Policy loss: 0.010150. Value loss: 0.072098. Entropy: 0.307051.
Iteration 15023: Policy loss: 0.011901. Value loss: 0.028731. Entropy: 0.306263.
Iteration 15024: Policy loss: 0.004979. Value loss: 0.021380. Entropy: 0.306840.
Training network. lr: 0.000135. clip: 0.053929
Iteration 15025: Policy loss: -0.466075. Value loss: 0.329894. Entropy: 0.310907.
Iteration 15026: Policy loss: -0.471653. Value loss: 0.141467. Entropy: 0.310815.
Iteration 15027: Policy loss: -0.464419. Value loss: 0.053332. Entropy: 0.311231.
episode: 5325   score: 535.0  epsilon: 1.0    steps: 184  evaluation reward: 425.1
Training network. lr: 0.000135. clip: 0.053929
Iteration 15028: Policy loss: 0.233258. Value loss: 0.204467. Entropy: 0.305013.
Iteration 15029: Policy loss: 0.234915. Value loss: 0.071027. Entropy: 0.305593.
Iteration 15030: Policy loss: 0.223000. Value loss: 0.041947. Entropy: 0.305344.
episode: 5326   score: 315.0  epsilon: 1.0    steps: 80  evaluation reward: 423.85
episode:

Training network. lr: 0.000134. clip: 0.053773
Iteration 15088: Policy loss: 0.061466. Value loss: 0.120890. Entropy: 0.304520.
Iteration 15089: Policy loss: 0.057180. Value loss: 0.053523. Entropy: 0.303487.
Iteration 15090: Policy loss: 0.056079. Value loss: 0.038664. Entropy: 0.302478.
Training network. lr: 0.000134. clip: 0.053773
Iteration 15091: Policy loss: 0.130603. Value loss: 0.138082. Entropy: 0.307122.
Iteration 15092: Policy loss: 0.129780. Value loss: 0.054979. Entropy: 0.307288.
Iteration 15093: Policy loss: 0.124445. Value loss: 0.032756. Entropy: 0.306573.
episode: 5347   score: 320.0  epsilon: 1.0    steps: 672  evaluation reward: 426.35
Training network. lr: 0.000134. clip: 0.053773
Iteration 15094: Policy loss: 0.314640. Value loss: 0.166851. Entropy: 0.306381.
Iteration 15095: Policy loss: 0.298679. Value loss: 0.043077. Entropy: 0.305594.
Iteration 15096: Policy loss: 0.294545. Value loss: 0.031328. Entropy: 0.305750.
episode: 5348   score: 320.0  epsilon: 1.0    

Iteration 15153: Policy loss: 0.322375. Value loss: 0.053536. Entropy: 0.295094.
episode: 5369   score: 425.0  epsilon: 1.0    steps: 728  evaluation reward: 410.85
Training network. lr: 0.000134. clip: 0.053468
Iteration 15154: Policy loss: 0.349210. Value loss: 0.138793. Entropy: 0.308883.
Iteration 15155: Policy loss: 0.341309. Value loss: 0.044051. Entropy: 0.307432.
Iteration 15156: Policy loss: 0.329837. Value loss: 0.030137. Entropy: 0.305923.
episode: 5370   score: 105.0  epsilon: 1.0    steps: 240  evaluation reward: 409.3
episode: 5371   score: 260.0  epsilon: 1.0    steps: 976  evaluation reward: 409.2
Training network. lr: 0.000134. clip: 0.053468
Iteration 15157: Policy loss: -0.130823. Value loss: 0.260189. Entropy: 0.302568.
Iteration 15158: Policy loss: -0.115715. Value loss: 0.087569. Entropy: 0.303491.
Iteration 15159: Policy loss: -0.150613. Value loss: 0.052899. Entropy: 0.303535.
Training network. lr: 0.000134. clip: 0.053468
Iteration 15160: Policy loss: -0.188477

Iteration 15219: Policy loss: -0.148866. Value loss: 0.068918. Entropy: 0.313655.
Training network. lr: 0.000133. clip: 0.053312
Iteration 15220: Policy loss: -0.072854. Value loss: 0.277407. Entropy: 0.311807.
Iteration 15221: Policy loss: -0.084190. Value loss: 0.078424. Entropy: 0.312498.
Iteration 15222: Policy loss: -0.068427. Value loss: 0.054864. Entropy: 0.311365.
Training network. lr: 0.000133. clip: 0.053312
Iteration 15223: Policy loss: 0.288722. Value loss: 0.150888. Entropy: 0.308394.
Iteration 15224: Policy loss: 0.290682. Value loss: 0.049447. Entropy: 0.306189.
Iteration 15225: Policy loss: 0.282215. Value loss: 0.030970. Entropy: 0.306628.
episode: 5391   score: 500.0  epsilon: 1.0    steps: 168  evaluation reward: 408.6
episode: 5392   score: 210.0  epsilon: 1.0    steps: 176  evaluation reward: 408.4
episode: 5393   score: 460.0  epsilon: 1.0    steps: 600  evaluation reward: 409.1
episode: 5394   score: 535.0  epsilon: 1.0    steps: 968  evaluation reward: 410.55
Tr

Training network. lr: 0.000133. clip: 0.053155
Iteration 15283: Policy loss: -0.213277. Value loss: 0.239710. Entropy: 0.291455.
Iteration 15284: Policy loss: -0.197392. Value loss: 0.109487. Entropy: 0.292136.
Iteration 15285: Policy loss: -0.217446. Value loss: 0.068437. Entropy: 0.292155.
Training network. lr: 0.000133. clip: 0.053155
Iteration 15286: Policy loss: -0.614220. Value loss: 0.263377. Entropy: 0.306910.
Iteration 15287: Policy loss: -0.619158. Value loss: 0.097061. Entropy: 0.309191.
Iteration 15288: Policy loss: -0.626119. Value loss: 0.082146. Entropy: 0.308558.
Training network. lr: 0.000133. clip: 0.053155
Iteration 15289: Policy loss: 0.069080. Value loss: 0.121199. Entropy: 0.300867.
Iteration 15290: Policy loss: 0.066692. Value loss: 0.048213. Entropy: 0.301551.
Iteration 15291: Policy loss: 0.060643. Value loss: 0.033596. Entropy: 0.300511.
episode: 5415   score: 375.0  epsilon: 1.0    steps: 72  evaluation reward: 404.2
Training network. lr: 0.000133. clip: 0.05

Iteration 15351: Policy loss: -0.293008. Value loss: 0.078268. Entropy: 0.297207.
Training network. lr: 0.000132. clip: 0.052851
Iteration 15352: Policy loss: 0.057693. Value loss: 0.129597. Entropy: 0.305200.
Iteration 15353: Policy loss: 0.053119. Value loss: 0.058373. Entropy: 0.304441.
Iteration 15354: Policy loss: 0.046549. Value loss: 0.038958. Entropy: 0.304664.
Training network. lr: 0.000132. clip: 0.052851
Iteration 15355: Policy loss: -0.744059. Value loss: 0.304182. Entropy: 0.311291.
Iteration 15356: Policy loss: -0.749257. Value loss: 0.098901. Entropy: 0.312874.
Iteration 15357: Policy loss: -0.754736. Value loss: 0.045535. Entropy: 0.312687.
episode: 5434   score: 745.0  epsilon: 1.0    steps: 328  evaluation reward: 422.25
episode: 5435   score: 340.0  epsilon: 1.0    steps: 808  evaluation reward: 422.15
Training network. lr: 0.000132. clip: 0.052851
Iteration 15358: Policy loss: 0.243949. Value loss: 0.191967. Entropy: 0.294596.
Iteration 15359: Policy loss: 0.223765.

Iteration 15419: Policy loss: -0.198061. Value loss: 0.103124. Entropy: 0.306578.
Iteration 15420: Policy loss: -0.206274. Value loss: 0.061199. Entropy: 0.306655.
Training network. lr: 0.000132. clip: 0.052694
Iteration 15421: Policy loss: 0.344864. Value loss: 0.290772. Entropy: 0.312201.
Iteration 15422: Policy loss: 0.317901. Value loss: 0.073074. Entropy: 0.311359.
Iteration 15423: Policy loss: 0.314260. Value loss: 0.037144. Entropy: 0.311534.
episode: 5453   score: 595.0  epsilon: 1.0    steps: 216  evaluation reward: 426.85
episode: 5454   score: 640.0  epsilon: 1.0    steps: 744  evaluation reward: 431.0
Training network. lr: 0.000132. clip: 0.052694
Iteration 15424: Policy loss: 0.185133. Value loss: 0.183689. Entropy: 0.291613.
Iteration 15425: Policy loss: 0.192555. Value loss: 0.081595. Entropy: 0.292776.
Iteration 15426: Policy loss: 0.187808. Value loss: 0.044845. Entropy: 0.291788.
episode: 5455   score: 345.0  epsilon: 1.0    steps: 200  evaluation reward: 429.0
episod

Iteration 15485: Policy loss: -0.241852. Value loss: 0.081792. Entropy: 0.290229.
Iteration 15486: Policy loss: -0.265183. Value loss: 0.052970. Entropy: 0.292873.
Training network. lr: 0.000131. clip: 0.052547
Iteration 15487: Policy loss: -0.042457. Value loss: 0.088114. Entropy: 0.310595.
Iteration 15488: Policy loss: -0.053214. Value loss: 0.044935. Entropy: 0.311107.
Iteration 15489: Policy loss: -0.054937. Value loss: 0.028999. Entropy: 0.310477.
Training network. lr: 0.000131. clip: 0.052547
Iteration 15490: Policy loss: -0.077509. Value loss: 0.211459. Entropy: 0.304412.
Iteration 15491: Policy loss: -0.102084. Value loss: 0.059149. Entropy: 0.304617.
Iteration 15492: Policy loss: -0.096733. Value loss: 0.033252. Entropy: 0.304951.
episode: 5475   score: 545.0  epsilon: 1.0    steps: 8  evaluation reward: 441.0
episode: 5476   score: 330.0  epsilon: 1.0    steps: 480  evaluation reward: 439.1
episode: 5477   score: 355.0  epsilon: 1.0    steps: 736  evaluation reward: 437.8
epi

Training network. lr: 0.000131. clip: 0.052233
Iteration 15553: Policy loss: 0.108291. Value loss: 0.148678. Entropy: 0.304753.
Iteration 15554: Policy loss: 0.115419. Value loss: 0.059515. Entropy: 0.305611.
Iteration 15555: Policy loss: 0.110543. Value loss: 0.039061. Entropy: 0.305036.
episode: 5495   score: 695.0  epsilon: 1.0    steps: 520  evaluation reward: 437.35
Training network. lr: 0.000131. clip: 0.052233
Iteration 15556: Policy loss: 0.068017. Value loss: 0.098190. Entropy: 0.302956.
Iteration 15557: Policy loss: 0.064385. Value loss: 0.044004. Entropy: 0.303525.
Iteration 15558: Policy loss: 0.064126. Value loss: 0.034116. Entropy: 0.303973.
episode: 5496   score: 240.0  epsilon: 1.0    steps: 232  evaluation reward: 433.85
episode: 5497   score: 345.0  epsilon: 1.0    steps: 520  evaluation reward: 433.2
Training network. lr: 0.000131. clip: 0.052233
Iteration 15559: Policy loss: 0.152436. Value loss: 0.244467. Entropy: 0.293024.
Iteration 15560: Policy loss: 0.153590. V

Iteration 15620: Policy loss: -0.049102. Value loss: 0.042995. Entropy: 0.306068.
Iteration 15621: Policy loss: -0.057743. Value loss: 0.030907. Entropy: 0.307009.
Training network. lr: 0.000130. clip: 0.052086
Iteration 15622: Policy loss: -0.006266. Value loss: 0.098679. Entropy: 0.308823.
Iteration 15623: Policy loss: -0.019003. Value loss: 0.048519. Entropy: 0.308508.
Iteration 15624: Policy loss: -0.011752. Value loss: 0.036316. Entropy: 0.309180.
episode: 5515   score: 410.0  epsilon: 1.0    steps: 16  evaluation reward: 436.0
Training network. lr: 0.000130. clip: 0.052086
Iteration 15625: Policy loss: 0.233843. Value loss: 0.118048. Entropy: 0.295816.
Iteration 15626: Policy loss: 0.222014. Value loss: 0.046530. Entropy: 0.294417.
Iteration 15627: Policy loss: 0.227889. Value loss: 0.034543. Entropy: 0.295058.
Training network. lr: 0.000130. clip: 0.052086
Iteration 15628: Policy loss: 0.156005. Value loss: 0.092098. Entropy: 0.308766.
Iteration 15629: Policy loss: 0.141902. Val

Iteration 15689: Policy loss: -0.306114. Value loss: 0.108616. Entropy: 0.308997.
Iteration 15690: Policy loss: -0.327061. Value loss: 0.066293. Entropy: 0.309892.
episode: 5533   score: 590.0  epsilon: 1.0    steps: 536  evaluation reward: 433.3
episode: 5534   score: 620.0  epsilon: 1.0    steps: 960  evaluation reward: 432.05
episode: 5535   score: 330.0  epsilon: 1.0    steps: 1016  evaluation reward: 431.95
Training network. lr: 0.000130. clip: 0.051929
Iteration 15691: Policy loss: 0.251071. Value loss: 0.103187. Entropy: 0.296618.
Iteration 15692: Policy loss: 0.237618. Value loss: 0.037476. Entropy: 0.295409.
Iteration 15693: Policy loss: 0.233109. Value loss: 0.026376. Entropy: 0.294563.
episode: 5536   score: 330.0  epsilon: 1.0    steps: 152  evaluation reward: 429.35
Training network. lr: 0.000130. clip: 0.051929
Iteration 15694: Policy loss: -0.014591. Value loss: 0.109814. Entropy: 0.288101.
Iteration 15695: Policy loss: -0.028033. Value loss: 0.059136. Entropy: 0.286493.

Training network. lr: 0.000129. clip: 0.051625
Iteration 15754: Policy loss: 0.200658. Value loss: 0.224468. Entropy: 0.308578.
Iteration 15755: Policy loss: 0.204687. Value loss: 0.074206. Entropy: 0.307577.
Iteration 15756: Policy loss: 0.179882. Value loss: 0.046240. Entropy: 0.307453.
Training network. lr: 0.000129. clip: 0.051625
Iteration 15757: Policy loss: 0.082397. Value loss: 0.167519. Entropy: 0.304939.
Iteration 15758: Policy loss: 0.058959. Value loss: 0.050431. Entropy: 0.304637.
Iteration 15759: Policy loss: 0.073494. Value loss: 0.030245. Entropy: 0.305479.
episode: 5556   score: 495.0  epsilon: 1.0    steps: 592  evaluation reward: 419.9
episode: 5557   score: 300.0  epsilon: 1.0    steps: 936  evaluation reward: 418.7
Training network. lr: 0.000129. clip: 0.051625
Iteration 15760: Policy loss: 0.234649. Value loss: 0.119771. Entropy: 0.299145.
Iteration 15761: Policy loss: 0.224028. Value loss: 0.050330. Entropy: 0.298701.
Iteration 15762: Policy loss: 0.222168. Value

Iteration 15820: Policy loss: -0.013716. Value loss: 0.132625. Entropy: 0.288023.
Iteration 15821: Policy loss: -0.005284. Value loss: 0.055842. Entropy: 0.287645.
Iteration 15822: Policy loss: -0.018852. Value loss: 0.042241. Entropy: 0.288307.
Training network. lr: 0.000129. clip: 0.051469
Iteration 15823: Policy loss: 0.123310. Value loss: 0.083717. Entropy: 0.315427.
Iteration 15824: Policy loss: 0.111017. Value loss: 0.036802. Entropy: 0.315900.
Iteration 15825: Policy loss: 0.111621. Value loss: 0.026338. Entropy: 0.315678.
episode: 5577   score: 605.0  epsilon: 1.0    steps: 16  evaluation reward: 418.35
episode: 5578   score: 265.0  epsilon: 1.0    steps: 896  evaluation reward: 417.35
Training network. lr: 0.000129. clip: 0.051469
Iteration 15826: Policy loss: 0.097220. Value loss: 0.075588. Entropy: 0.295655.
Iteration 15827: Policy loss: 0.087624. Value loss: 0.028125. Entropy: 0.294134.
Iteration 15828: Policy loss: 0.084647. Value loss: 0.021555. Entropy: 0.293152.
Trainin

Training network. lr: 0.000128. clip: 0.051312
Iteration 15886: Policy loss: 0.060926. Value loss: 0.102860. Entropy: 0.284635.
Iteration 15887: Policy loss: 0.059000. Value loss: 0.052414. Entropy: 0.287013.
Iteration 15888: Policy loss: 0.050217. Value loss: 0.037830. Entropy: 0.286911.
episode: 5599   score: 285.0  epsilon: 1.0    steps: 832  evaluation reward: 415.35
Training network. lr: 0.000128. clip: 0.051312
Iteration 15889: Policy loss: 0.237828. Value loss: 0.158838. Entropy: 0.300103.
Iteration 15890: Policy loss: 0.236257. Value loss: 0.079081. Entropy: 0.300267.
Iteration 15891: Policy loss: 0.229920. Value loss: 0.052191. Entropy: 0.298844.
Training network. lr: 0.000128. clip: 0.051312
Iteration 15892: Policy loss: -0.017594. Value loss: 0.083533. Entropy: 0.309422.
Iteration 15893: Policy loss: -0.019836. Value loss: 0.040872. Entropy: 0.309362.
Iteration 15894: Policy loss: -0.027619. Value loss: 0.030446. Entropy: 0.308987.
episode: 5600   score: 260.0  epsilon: 1.0 

Iteration 15951: Policy loss: -0.156460. Value loss: 0.051530. Entropy: 0.287650.
episode: 5621   score: 225.0  epsilon: 1.0    steps: 952  evaluation reward: 390.05
Training network. lr: 0.000128. clip: 0.051008
Iteration 15952: Policy loss: 0.639480. Value loss: 0.316008. Entropy: 0.306581.
Iteration 15953: Policy loss: 0.612150. Value loss: 0.086134. Entropy: 0.305851.
Iteration 15954: Policy loss: 0.617651. Value loss: 0.059098. Entropy: 0.305929.
episode: 5622   score: 295.0  epsilon: 1.0    steps: 392  evaluation reward: 389.35
episode: 5623   score: 335.0  epsilon: 1.0    steps: 496  evaluation reward: 389.85
Training network. lr: 0.000128. clip: 0.051008
Iteration 15955: Policy loss: 0.130658. Value loss: 0.102187. Entropy: 0.288329.
Iteration 15956: Policy loss: 0.122237. Value loss: 0.044457. Entropy: 0.288819.
Iteration 15957: Policy loss: 0.117245. Value loss: 0.032625. Entropy: 0.289283.
episode: 5624   score: 210.0  epsilon: 1.0    steps: 520  evaluation reward: 388.65
ep

Training network. lr: 0.000127. clip: 0.050851
Iteration 16015: Policy loss: 0.055269. Value loss: 0.121383. Entropy: 0.312797.
Iteration 16016: Policy loss: 0.054438. Value loss: 0.068795. Entropy: 0.312172.
Iteration 16017: Policy loss: 0.052129. Value loss: 0.049959. Entropy: 0.313446.
Training network. lr: 0.000127. clip: 0.050851
Iteration 16018: Policy loss: -0.164606. Value loss: 0.101972. Entropy: 0.305716.
Iteration 16019: Policy loss: -0.163570. Value loss: 0.049059. Entropy: 0.306415.
Iteration 16020: Policy loss: -0.169346. Value loss: 0.035092. Entropy: 0.306122.
episode: 5646   score: 640.0  epsilon: 1.0    steps: 376  evaluation reward: 372.15
Training network. lr: 0.000127. clip: 0.050851
Iteration 16021: Policy loss: -0.069867. Value loss: 0.099704. Entropy: 0.307660.
Iteration 16022: Policy loss: -0.073940. Value loss: 0.049492. Entropy: 0.307154.
Iteration 16023: Policy loss: -0.077550. Value loss: 0.034004. Entropy: 0.307155.
episode: 5647   score: 210.0  epsilon: 1

episode: 5667   score: 345.0  epsilon: 1.0    steps: 1008  evaluation reward: 356.2
Training network. lr: 0.000127. clip: 0.050704
Iteration 16081: Policy loss: 0.051944. Value loss: 0.100628. Entropy: 0.296741.
Iteration 16082: Policy loss: 0.040341. Value loss: 0.035451. Entropy: 0.297092.
Iteration 16083: Policy loss: 0.035627. Value loss: 0.026751. Entropy: 0.296647.
Training network. lr: 0.000127. clip: 0.050704
Iteration 16084: Policy loss: 0.097855. Value loss: 0.070424. Entropy: 0.302957.
Iteration 16085: Policy loss: 0.096092. Value loss: 0.038093. Entropy: 0.301951.
Iteration 16086: Policy loss: 0.086745. Value loss: 0.030439. Entropy: 0.302257.
Training network. lr: 0.000127. clip: 0.050704
Iteration 16087: Policy loss: 0.076121. Value loss: 0.131994. Entropy: 0.309955.
Iteration 16088: Policy loss: 0.070212. Value loss: 0.044831. Entropy: 0.309624.
Iteration 16089: Policy loss: 0.071255. Value loss: 0.026329. Entropy: 0.308511.
episode: 5668   score: 420.0  epsilon: 1.0    

Iteration 16148: Policy loss: -0.009396. Value loss: 0.080731. Entropy: 0.311259.
Iteration 16149: Policy loss: -0.010296. Value loss: 0.054405. Entropy: 0.311670.
episode: 5687   score: 575.0  epsilon: 1.0    steps: 208  evaluation reward: 350.85
Training network. lr: 0.000126. clip: 0.050547
Iteration 16150: Policy loss: 0.420277. Value loss: 0.138194. Entropy: 0.299420.
Iteration 16151: Policy loss: 0.405724. Value loss: 0.061064. Entropy: 0.299008.
Iteration 16152: Policy loss: 0.396814. Value loss: 0.038403. Entropy: 0.299053.
episode: 5688   score: 490.0  epsilon: 1.0    steps: 96  evaluation reward: 351.8
episode: 5689   score: 880.0  epsilon: 1.0    steps: 736  evaluation reward: 353.9
episode: 5690   score: 420.0  epsilon: 1.0    steps: 976  evaluation reward: 355.5
Training network. lr: 0.000126. clip: 0.050390
Iteration 16153: Policy loss: 0.344950. Value loss: 0.125000. Entropy: 0.293344.
Iteration 16154: Policy loss: 0.347522. Value loss: 0.047491. Entropy: 0.294848.
Itera

Iteration 16213: Policy loss: -0.367144. Value loss: 0.597153. Entropy: 0.293204.
Iteration 16214: Policy loss: -0.391051. Value loss: 0.417451. Entropy: 0.290782.
Iteration 16215: Policy loss: -0.370189. Value loss: 0.295391. Entropy: 0.292359.
Training network. lr: 0.000126. clip: 0.050243
Iteration 16216: Policy loss: -0.255288. Value loss: 0.249852. Entropy: 0.314541.
Iteration 16217: Policy loss: -0.267907. Value loss: 0.171195. Entropy: 0.314247.
Iteration 16218: Policy loss: -0.268394. Value loss: 0.132166. Entropy: 0.313965.
episode: 5709   score: 460.0  epsilon: 1.0    steps: 648  evaluation reward: 372.2
Training network. lr: 0.000126. clip: 0.050243
Iteration 16219: Policy loss: 0.008981. Value loss: 0.225490. Entropy: 0.287829.
Iteration 16220: Policy loss: -0.003909. Value loss: 0.114391. Entropy: 0.288346.
Iteration 16221: Policy loss: -0.018530. Value loss: 0.066608. Entropy: 0.288786.
episode: 5710   score: 700.0  epsilon: 1.0    steps: 840  evaluation reward: 375.05
Tr

episode: 5728   score: 330.0  epsilon: 1.0    steps: 120  evaluation reward: 407.6
episode: 5729   score: 365.0  epsilon: 1.0    steps: 1000  evaluation reward: 408.15
Training network. lr: 0.000125. clip: 0.050086
Iteration 16282: Policy loss: -0.071266. Value loss: 0.097726. Entropy: 0.289376.
Iteration 16283: Policy loss: -0.068080. Value loss: 0.038018. Entropy: 0.289805.
Iteration 16284: Policy loss: -0.070540. Value loss: 0.032590. Entropy: 0.290432.
episode: 5730   score: 260.0  epsilon: 1.0    steps: 936  evaluation reward: 409.5
Training network. lr: 0.000125. clip: 0.050086
Iteration 16285: Policy loss: 0.313711. Value loss: 0.113081. Entropy: 0.293139.
Iteration 16286: Policy loss: 0.300417. Value loss: 0.044555. Entropy: 0.293274.
Iteration 16287: Policy loss: 0.304018. Value loss: 0.029706. Entropy: 0.294655.
Training network. lr: 0.000125. clip: 0.050086
Iteration 16288: Policy loss: 0.097408. Value loss: 0.112642. Entropy: 0.299644.
Iteration 16289: Policy loss: 0.100927

Iteration 16348: Policy loss: 0.014464. Value loss: 0.388646. Entropy: 0.295687.
Iteration 16349: Policy loss: 0.039921. Value loss: 0.219292. Entropy: 0.295657.
Iteration 16350: Policy loss: 0.015173. Value loss: 0.135471. Entropy: 0.295404.
episode: 5749   score: 830.0  epsilon: 1.0    steps: 568  evaluation reward: 429.75
Training network. lr: 0.000124. clip: 0.049782
Iteration 16351: Policy loss: 0.103248. Value loss: 0.148480. Entropy: 0.286097.
Iteration 16352: Policy loss: 0.093095. Value loss: 0.074509. Entropy: 0.284977.
Iteration 16353: Policy loss: 0.094819. Value loss: 0.052687. Entropy: 0.287171.
Training network. lr: 0.000124. clip: 0.049782
Iteration 16354: Policy loss: 0.094343. Value loss: 0.212691. Entropy: 0.312811.
Iteration 16355: Policy loss: 0.072169. Value loss: 0.075429. Entropy: 0.312657.
Iteration 16356: Policy loss: 0.056856. Value loss: 0.049341. Entropy: 0.312309.
episode: 5750   score: 490.0  epsilon: 1.0    steps: 192  evaluation reward: 432.55
now time 

episode: 5771   score: 290.0  epsilon: 1.0    steps: 896  evaluation reward: 444.8
Training network. lr: 0.000124. clip: 0.049625
Iteration 16414: Policy loss: 0.452104. Value loss: 0.214438. Entropy: 0.300465.
Iteration 16415: Policy loss: 0.452180. Value loss: 0.053311. Entropy: 0.297916.
Iteration 16416: Policy loss: 0.451859. Value loss: 0.029737. Entropy: 0.299117.
episode: 5772   score: 620.0  epsilon: 1.0    steps: 168  evaluation reward: 449.2
Training network. lr: 0.000124. clip: 0.049625
Iteration 16417: Policy loss: -0.644393. Value loss: 0.539465. Entropy: 0.297575.
Iteration 16418: Policy loss: -0.629888. Value loss: 0.376635. Entropy: 0.297262.
Iteration 16419: Policy loss: -0.663558. Value loss: 0.309292. Entropy: 0.295965.
Training network. lr: 0.000124. clip: 0.049625
Iteration 16420: Policy loss: -0.046937. Value loss: 0.186465. Entropy: 0.296969.
Iteration 16421: Policy loss: -0.048818. Value loss: 0.083603. Entropy: 0.294872.
Iteration 16422: Policy loss: -0.062667.

Iteration 16481: Policy loss: 0.098507. Value loss: 0.035040. Entropy: 0.274232.
Iteration 16482: Policy loss: 0.098798. Value loss: 0.029518. Entropy: 0.273477.
Training network. lr: 0.000124. clip: 0.049469
Iteration 16483: Policy loss: 0.127530. Value loss: 0.372279. Entropy: 0.295507.
Iteration 16484: Policy loss: 0.127687. Value loss: 0.237803. Entropy: 0.293859.
Iteration 16485: Policy loss: 0.131487. Value loss: 0.210136. Entropy: 0.293395.
Training network. lr: 0.000124. clip: 0.049469
Iteration 16486: Policy loss: 0.152058. Value loss: 0.144723. Entropy: 0.301217.
Iteration 16487: Policy loss: 0.159872. Value loss: 0.052428. Entropy: 0.302154.
Iteration 16488: Policy loss: 0.149867. Value loss: 0.035580. Entropy: 0.301278.
episode: 5791   score: 210.0  epsilon: 1.0    steps: 48  evaluation reward: 452.45
episode: 5792   score: 670.0  epsilon: 1.0    steps: 848  evaluation reward: 455.1
episode: 5793   score: 535.0  epsilon: 1.0    steps: 888  evaluation reward: 454.75
Training

Iteration 16547: Policy loss: 0.171056. Value loss: 0.052003. Entropy: 0.283797.
Iteration 16548: Policy loss: 0.174368. Value loss: 0.039796. Entropy: 0.284028.
Training network. lr: 0.000123. clip: 0.049321
Iteration 16549: Policy loss: -0.159585. Value loss: 0.158843. Entropy: 0.298652.
Iteration 16550: Policy loss: -0.154780. Value loss: 0.061553. Entropy: 0.297975.
Iteration 16551: Policy loss: -0.169998. Value loss: 0.038915. Entropy: 0.299050.
Training network. lr: 0.000123. clip: 0.049165
Iteration 16552: Policy loss: 0.073421. Value loss: 0.158672. Entropy: 0.300524.
Iteration 16553: Policy loss: 0.065548. Value loss: 0.065520. Entropy: 0.300190.
Iteration 16554: Policy loss: 0.062824. Value loss: 0.041532. Entropy: 0.300124.
episode: 5812   score: 520.0  epsilon: 1.0    steps: 488  evaluation reward: 430.55
episode: 5813   score: 555.0  epsilon: 1.0    steps: 584  evaluation reward: 429.9
Training network. lr: 0.000123. clip: 0.049165
Iteration 16555: Policy loss: 0.368932. V

Iteration 16613: Policy loss: 0.018241. Value loss: 0.062434. Entropy: 0.298271.
Iteration 16614: Policy loss: 0.017862. Value loss: 0.038428. Entropy: 0.298822.
episode: 5834   score: 155.0  epsilon: 1.0    steps: 424  evaluation reward: 415.15
episode: 5835   score: 320.0  epsilon: 1.0    steps: 456  evaluation reward: 411.8
Training network. lr: 0.000123. clip: 0.049008
Iteration 16615: Policy loss: 0.038361. Value loss: 0.171111. Entropy: 0.293205.
Iteration 16616: Policy loss: 0.032948. Value loss: 0.049203. Entropy: 0.292998.
Iteration 16617: Policy loss: 0.021773. Value loss: 0.034552. Entropy: 0.292691.
Training network. lr: 0.000123. clip: 0.049008
Iteration 16618: Policy loss: 0.239104. Value loss: 0.155424. Entropy: 0.301037.
Iteration 16619: Policy loss: 0.230035. Value loss: 0.057848. Entropy: 0.300206.
Iteration 16620: Policy loss: 0.226524. Value loss: 0.033395. Entropy: 0.299063.
episode: 5836   score: 215.0  epsilon: 1.0    steps: 136  evaluation reward: 410.5
episode:

Training network. lr: 0.000122. clip: 0.048860
Iteration 16678: Policy loss: -0.091337. Value loss: 0.252169. Entropy: 0.290626.
Iteration 16679: Policy loss: -0.098126. Value loss: 0.187891. Entropy: 0.287871.
Iteration 16680: Policy loss: -0.108066. Value loss: 0.126815. Entropy: 0.288596.
episode: 5857   score: 830.0  epsilon: 1.0    steps: 104  evaluation reward: 390.2
episode: 5858   score: 290.0  epsilon: 1.0    steps: 1024  evaluation reward: 391.3
Training network. lr: 0.000122. clip: 0.048860
Iteration 16681: Policy loss: -0.046656. Value loss: 0.125581. Entropy: 0.303172.
Iteration 16682: Policy loss: -0.058042. Value loss: 0.057367. Entropy: 0.302176.
Iteration 16683: Policy loss: -0.050940. Value loss: 0.037736. Entropy: 0.302783.
Training network. lr: 0.000122. clip: 0.048860
Iteration 16684: Policy loss: -0.373376. Value loss: 0.325934. Entropy: 0.292003.
Iteration 16685: Policy loss: -0.391003. Value loss: 0.113107. Entropy: 0.291318.
Iteration 16686: Policy loss: -0.403

Iteration 16743: Policy loss: -0.201277. Value loss: 0.121057. Entropy: 0.312142.
episode: 5880   score: 305.0  epsilon: 1.0    steps: 184  evaluation reward: 370.65
episode: 5881   score: 515.0  epsilon: 1.0    steps: 544  evaluation reward: 372.5
Training network. lr: 0.000122. clip: 0.048704
Iteration 16744: Policy loss: 0.064560. Value loss: 0.131422. Entropy: 0.297973.
Iteration 16745: Policy loss: 0.067771. Value loss: 0.062044. Entropy: 0.298518.
Iteration 16746: Policy loss: 0.053540. Value loss: 0.046558. Entropy: 0.298025.
episode: 5882   score: 485.0  epsilon: 1.0    steps: 648  evaluation reward: 372.85
Training network. lr: 0.000122. clip: 0.048704
Iteration 16747: Policy loss: -0.315049. Value loss: 0.375038. Entropy: 0.296583.
Iteration 16748: Policy loss: -0.318110. Value loss: 0.160369. Entropy: 0.296799.
Iteration 16749: Policy loss: -0.331933. Value loss: 0.089219. Entropy: 0.296861.
episode: 5883   score: 590.0  epsilon: 1.0    steps: 512  evaluation reward: 375.8
e

Iteration 16807: Policy loss: 0.229956. Value loss: 0.130985. Entropy: 0.289038.
Iteration 16808: Policy loss: 0.235185. Value loss: 0.058801. Entropy: 0.288350.
Iteration 16809: Policy loss: 0.218724. Value loss: 0.047871. Entropy: 0.288165.
Training network. lr: 0.000121. clip: 0.048400
Iteration 16810: Policy loss: -0.312030. Value loss: 0.320109. Entropy: 0.295558.
Iteration 16811: Policy loss: -0.344929. Value loss: 0.131819. Entropy: 0.295540.
Iteration 16812: Policy loss: -0.346809. Value loss: 0.080724. Entropy: 0.295227.
episode: 5904   score: 330.0  epsilon: 1.0    steps: 376  evaluation reward: 361.2
episode: 5905   score: 700.0  epsilon: 1.0    steps: 680  evaluation reward: 365.8
Training network. lr: 0.000121. clip: 0.048400
Iteration 16813: Policy loss: -0.058550. Value loss: 0.154274. Entropy: 0.298080.
Iteration 16814: Policy loss: -0.069529. Value loss: 0.077603. Entropy: 0.295569.
Iteration 16815: Policy loss: -0.071092. Value loss: 0.054176. Entropy: 0.298063.
episo

Iteration 16872: Policy loss: 0.264121. Value loss: 0.046249. Entropy: 0.292004.
episode: 5927   score: 275.0  epsilon: 1.0    steps: 464  evaluation reward: 359.2
episode: 5928   score: 485.0  epsilon: 1.0    steps: 528  evaluation reward: 361.45
Training network. lr: 0.000121. clip: 0.048243
Iteration 16873: Policy loss: 0.296516. Value loss: 0.158609. Entropy: 0.282631.
Iteration 16874: Policy loss: 0.288976. Value loss: 0.073206. Entropy: 0.281024.
Iteration 16875: Policy loss: 0.293190. Value loss: 0.048140. Entropy: 0.281411.
episode: 5929   score: 360.0  epsilon: 1.0    steps: 376  evaluation reward: 361.75
Training network. lr: 0.000121. clip: 0.048243
Iteration 16876: Policy loss: 0.223060. Value loss: 0.084780. Entropy: 0.296941.
Iteration 16877: Policy loss: 0.221501. Value loss: 0.035533. Entropy: 0.297063.
Iteration 16878: Policy loss: 0.213498. Value loss: 0.024575. Entropy: 0.296715.
episode: 5930   score: 125.0  epsilon: 1.0    steps: 256  evaluation reward: 358.1
Train

Iteration 16938: Policy loss: -0.022127. Value loss: 0.027048. Entropy: 0.301720.
Training network. lr: 0.000120. clip: 0.048086
Iteration 16939: Policy loss: -0.024191. Value loss: 0.110199. Entropy: 0.304394.
Iteration 16940: Policy loss: -0.031073. Value loss: 0.047926. Entropy: 0.300125.
Iteration 16941: Policy loss: -0.034301. Value loss: 0.035780. Entropy: 0.300962.
episode: 5949   score: 245.0  epsilon: 1.0    steps: 896  evaluation reward: 372.6
Training network. lr: 0.000120. clip: 0.048086
Iteration 16942: Policy loss: -0.155284. Value loss: 0.372540. Entropy: 0.288213.
Iteration 16943: Policy loss: -0.146650. Value loss: 0.201333. Entropy: 0.291778.
Iteration 16944: Policy loss: -0.157871. Value loss: 0.091910. Entropy: 0.289815.
episode: 5950   score: 440.0  epsilon: 1.0    steps: 256  evaluation reward: 371.6
Training network. lr: 0.000120. clip: 0.048086
Iteration 16945: Policy loss: -0.007023. Value loss: 0.093344. Entropy: 0.292019.
Iteration 16946: Policy loss: -0.0125

Iteration 17004: Policy loss: 0.141167. Value loss: 0.028271. Entropy: 0.297380.
episode: 5970   score: 640.0  epsilon: 1.0    steps: 424  evaluation reward: 369.55
episode: 5971   score: 495.0  epsilon: 1.0    steps: 1024  evaluation reward: 373.15
Training network. lr: 0.000119. clip: 0.047782
Iteration 17005: Policy loss: -0.150835. Value loss: 0.338960. Entropy: 0.305289.
Iteration 17006: Policy loss: -0.175656. Value loss: 0.150884. Entropy: 0.303129.
Iteration 17007: Policy loss: -0.165888. Value loss: 0.089604. Entropy: 0.304123.
episode: 5972   score: 300.0  epsilon: 1.0    steps: 112  evaluation reward: 372.9
episode: 5973   score: 550.0  epsilon: 1.0    steps: 704  evaluation reward: 372.95
Training network. lr: 0.000119. clip: 0.047782
Iteration 17008: Policy loss: 0.155256. Value loss: 0.102395. Entropy: 0.284830.
Iteration 17009: Policy loss: 0.147022. Value loss: 0.040660. Entropy: 0.283573.
Iteration 17010: Policy loss: 0.142055. Value loss: 0.030255. Entropy: 0.284400.


Training network. lr: 0.000119. clip: 0.047625
Iteration 17068: Policy loss: 0.075251. Value loss: 0.113989. Entropy: 0.296599.
Iteration 17069: Policy loss: 0.055852. Value loss: 0.052216. Entropy: 0.295973.
Iteration 17070: Policy loss: 0.060898. Value loss: 0.035427. Entropy: 0.295104.
episode: 5994   score: 420.0  epsilon: 1.0    steps: 392  evaluation reward: 379.55
Training network. lr: 0.000119. clip: 0.047625
Iteration 17071: Policy loss: -0.284306. Value loss: 0.319931. Entropy: 0.296928.
Iteration 17072: Policy loss: -0.275206. Value loss: 0.162073. Entropy: 0.296623.
Iteration 17073: Policy loss: -0.276906. Value loss: 0.071645. Entropy: 0.296230.
episode: 5995   score: 420.0  epsilon: 1.0    steps: 456  evaluation reward: 381.15
episode: 5996   score: 350.0  epsilon: 1.0    steps: 736  evaluation reward: 382.7
Training network. lr: 0.000119. clip: 0.047625
Iteration 17074: Policy loss: -0.146457. Value loss: 0.101979. Entropy: 0.258163.
Iteration 17075: Policy loss: -0.1527

Iteration 17133: Policy loss: 0.041718. Value loss: 0.040474. Entropy: 0.279931.
episode: 6016   score: 355.0  epsilon: 1.0    steps: 960  evaluation reward: 379.3
Training network. lr: 0.000119. clip: 0.047478
Iteration 17134: Policy loss: -0.106789. Value loss: 0.117014. Entropy: 0.306142.
Iteration 17135: Policy loss: -0.111454. Value loss: 0.048059. Entropy: 0.305196.
Iteration 17136: Policy loss: -0.107889. Value loss: 0.032812. Entropy: 0.305106.
episode: 6017   score: 355.0  epsilon: 1.0    steps: 488  evaluation reward: 378.95
Training network. lr: 0.000119. clip: 0.047478
Iteration 17137: Policy loss: -0.215688. Value loss: 0.139625. Entropy: 0.273363.
Iteration 17138: Policy loss: -0.221238. Value loss: 0.056113. Entropy: 0.273055.
Iteration 17139: Policy loss: -0.219813. Value loss: 0.036992. Entropy: 0.272798.
Training network. lr: 0.000119. clip: 0.047478
Iteration 17140: Policy loss: 0.043245. Value loss: 0.137125. Entropy: 0.284343.
Iteration 17141: Policy loss: 0.032781

Training network. lr: 0.000118. clip: 0.047321
Iteration 17197: Policy loss: -0.257769. Value loss: 0.285911. Entropy: 0.291044.
Iteration 17198: Policy loss: -0.263711. Value loss: 0.123575. Entropy: 0.290510.
Iteration 17199: Policy loss: -0.272068. Value loss: 0.075916. Entropy: 0.290929.
episode: 6040   score: 165.0  epsilon: 1.0    steps: 216  evaluation reward: 374.45
Training network. lr: 0.000118. clip: 0.047321
Iteration 17200: Policy loss: 0.046707. Value loss: 0.466569. Entropy: 0.292164.
Iteration 17201: Policy loss: 0.015866. Value loss: 0.307577. Entropy: 0.287236.
Iteration 17202: Policy loss: 0.018170. Value loss: 0.228137. Entropy: 0.290205.
episode: 6041   score: 500.0  epsilon: 1.0    steps: 192  evaluation reward: 376.25
Training network. lr: 0.000118. clip: 0.047165
Iteration 17203: Policy loss: -0.164834. Value loss: 0.161824. Entropy: 0.277829.
Iteration 17204: Policy loss: -0.167687. Value loss: 0.077377. Entropy: 0.277553.
Iteration 17205: Policy loss: -0.16794

episode: 6061   score: 375.0  epsilon: 1.0    steps: 792  evaluation reward: 393.3
Training network. lr: 0.000118. clip: 0.047017
Iteration 17263: Policy loss: -0.272012. Value loss: 0.293800. Entropy: 0.284555.
Iteration 17264: Policy loss: -0.270245. Value loss: 0.112442. Entropy: 0.281512.
Iteration 17265: Policy loss: -0.285505. Value loss: 0.075496. Entropy: 0.281904.
Training network. lr: 0.000118. clip: 0.047017
Iteration 17266: Policy loss: -0.213420. Value loss: 0.314065. Entropy: 0.307002.
Iteration 17267: Policy loss: -0.218894. Value loss: 0.105345. Entropy: 0.306171.
Iteration 17268: Policy loss: -0.236527. Value loss: 0.045317. Entropy: 0.306950.
Training network. lr: 0.000118. clip: 0.047017
Iteration 17269: Policy loss: 0.008075. Value loss: 0.114632. Entropy: 0.296395.
Iteration 17270: Policy loss: -0.002894. Value loss: 0.045550. Entropy: 0.293268.
Iteration 17271: Policy loss: -0.003653. Value loss: 0.036575. Entropy: 0.294493.
episode: 6062   score: 440.0  epsilon: 

Iteration 17330: Policy loss: -0.079939. Value loss: 0.042450. Entropy: 0.305596.
Iteration 17331: Policy loss: -0.081977. Value loss: 0.029226. Entropy: 0.306909.
episode: 6081   score: 555.0  epsilon: 1.0    steps: 976  evaluation reward: 406.65
Training network. lr: 0.000117. clip: 0.046861
Iteration 17332: Policy loss: 0.125903. Value loss: 0.085601. Entropy: 0.302902.
Iteration 17333: Policy loss: 0.123830. Value loss: 0.043715. Entropy: 0.302481.
Iteration 17334: Policy loss: 0.119264. Value loss: 0.033395. Entropy: 0.302605.
episode: 6082   score: 530.0  epsilon: 1.0    steps: 872  evaluation reward: 410.4
Training network. lr: 0.000117. clip: 0.046861
Iteration 17335: Policy loss: 0.153997. Value loss: 0.088072. Entropy: 0.296162.
Iteration 17336: Policy loss: 0.151241. Value loss: 0.037451. Entropy: 0.295886.
Iteration 17337: Policy loss: 0.139815. Value loss: 0.022815. Entropy: 0.296857.
episode: 6083   score: 570.0  epsilon: 1.0    steps: 8  evaluation reward: 412.25
Trainin

Iteration 17397: Policy loss: -0.165217. Value loss: 0.043831. Entropy: 0.288363.
episode: 6102   score: 565.0  epsilon: 1.0    steps: 568  evaluation reward: 422.35
Training network. lr: 0.000117. clip: 0.046704
Iteration 17398: Policy loss: 0.156025. Value loss: 0.325453. Entropy: 0.293410.
Iteration 17399: Policy loss: 0.130224. Value loss: 0.181497. Entropy: 0.291356.
Iteration 17400: Policy loss: 0.148509. Value loss: 0.137772. Entropy: 0.292143.
episode: 6103   score: 855.0  epsilon: 1.0    steps: 952  evaluation reward: 427.25
episode: 6104   score: 260.0  epsilon: 1.0    steps: 1008  evaluation reward: 427.0
Training network. lr: 0.000116. clip: 0.046556
Iteration 17401: Policy loss: 0.355501. Value loss: 0.212218. Entropy: 0.305622.
Iteration 17402: Policy loss: 0.356947. Value loss: 0.061702. Entropy: 0.303072.
Iteration 17403: Policy loss: 0.349207. Value loss: 0.036040. Entropy: 0.304221.
Training network. lr: 0.000116. clip: 0.046556
Iteration 17404: Policy loss: 0.074509.

Iteration 17463: Policy loss: -0.127255. Value loss: 0.025339. Entropy: 0.304360.
Training network. lr: 0.000116. clip: 0.046400
Iteration 17464: Policy loss: 0.209736. Value loss: 0.171508. Entropy: 0.309493.
Iteration 17465: Policy loss: 0.190955. Value loss: 0.059394. Entropy: 0.307554.
Iteration 17466: Policy loss: 0.199543. Value loss: 0.041142. Entropy: 0.308796.
episode: 6124   score: 350.0  epsilon: 1.0    steps: 1024  evaluation reward: 449.55
Training network. lr: 0.000116. clip: 0.046400
Iteration 17467: Policy loss: 0.313630. Value loss: 0.179803. Entropy: 0.308460.
Iteration 17468: Policy loss: 0.302237. Value loss: 0.068948. Entropy: 0.307528.
Iteration 17469: Policy loss: 0.301773. Value loss: 0.046817. Entropy: 0.307657.
episode: 6125   score: 350.0  epsilon: 1.0    steps: 424  evaluation reward: 450.7
episode: 6126   score: 420.0  epsilon: 1.0    steps: 856  evaluation reward: 453.05
Training network. lr: 0.000116. clip: 0.046400
Iteration 17470: Policy loss: -0.007790

episode: 6145   score: 480.0  epsilon: 1.0    steps: 200  evaluation reward: 451.2
episode: 6146   score: 535.0  epsilon: 1.0    steps: 880  evaluation reward: 453.55
Training network. lr: 0.000116. clip: 0.046243
Iteration 17530: Policy loss: 0.045751. Value loss: 0.160343. Entropy: 0.284722.
Iteration 17531: Policy loss: 0.040813. Value loss: 0.077905. Entropy: 0.288236.
Iteration 17532: Policy loss: 0.036595. Value loss: 0.053927. Entropy: 0.286780.
episode: 6147   score: 310.0  epsilon: 1.0    steps: 784  evaluation reward: 449.7
Training network. lr: 0.000116. clip: 0.046243
Iteration 17533: Policy loss: 0.122014. Value loss: 0.172600. Entropy: 0.306164.
Iteration 17534: Policy loss: 0.119514. Value loss: 0.060622. Entropy: 0.306905.
Iteration 17535: Policy loss: 0.115497. Value loss: 0.044740. Entropy: 0.305721.
episode: 6148   score: 290.0  epsilon: 1.0    steps: 424  evaluation reward: 442.6
Training network. lr: 0.000116. clip: 0.046243
Iteration 17536: Policy loss: 0.039964. 

episode: 6166   score: 835.0  epsilon: 1.0    steps: 680  evaluation reward: 457.55
Training network. lr: 0.000115. clip: 0.046096
Iteration 17596: Policy loss: -0.068911. Value loss: 0.114418. Entropy: 0.260304.
Iteration 17597: Policy loss: -0.073704. Value loss: 0.046250. Entropy: 0.259563.
Iteration 17598: Policy loss: -0.079123. Value loss: 0.037756. Entropy: 0.258844.
Training network. lr: 0.000115. clip: 0.046096
Iteration 17599: Policy loss: 0.093746. Value loss: 0.189331. Entropy: 0.279744.
Iteration 17600: Policy loss: 0.092286. Value loss: 0.070793. Entropy: 0.281204.
Iteration 17601: Policy loss: 0.087931. Value loss: 0.042611. Entropy: 0.278265.
episode: 6167   score: 340.0  epsilon: 1.0    steps: 968  evaluation reward: 456.45
Training network. lr: 0.000115. clip: 0.045939
Iteration 17602: Policy loss: -0.282832. Value loss: 0.566038. Entropy: 0.307065.
Iteration 17603: Policy loss: -0.315155. Value loss: 0.425799. Entropy: 0.305777.
Iteration 17604: Policy loss: -0.33804

Iteration 17664: Policy loss: -0.152125. Value loss: 0.045762. Entropy: 0.306932.
episode: 6185   score: 465.0  epsilon: 1.0    steps: 464  evaluation reward: 452.1
episode: 6186   score: 335.0  epsilon: 1.0    steps: 472  evaluation reward: 452.3
Training network. lr: 0.000114. clip: 0.045782
Iteration 17665: Policy loss: 0.377558. Value loss: 0.160812. Entropy: 0.281952.
Iteration 17666: Policy loss: 0.376021. Value loss: 0.092982. Entropy: 0.278491.
Iteration 17667: Policy loss: 0.358671. Value loss: 0.068740. Entropy: 0.278024.
Training network. lr: 0.000114. clip: 0.045782
Iteration 17668: Policy loss: 0.003766. Value loss: 0.129221. Entropy: 0.287503.
Iteration 17669: Policy loss: -0.004482. Value loss: 0.040150. Entropy: 0.286723.
Iteration 17670: Policy loss: -0.011591. Value loss: 0.027223. Entropy: 0.285181.
episode: 6187   score: 615.0  epsilon: 1.0    steps: 680  evaluation reward: 452.65
Training network. lr: 0.000114. clip: 0.045782
Iteration 17671: Policy loss: 0.065887.

Iteration 17732: Policy loss: -0.323117. Value loss: 0.121173. Entropy: 0.297075.
Iteration 17733: Policy loss: -0.336415. Value loss: 0.071733. Entropy: 0.297008.
Training network. lr: 0.000114. clip: 0.045635
Iteration 17734: Policy loss: -0.120443. Value loss: 0.204766. Entropy: 0.295619.
Iteration 17735: Policy loss: -0.146317. Value loss: 0.065705. Entropy: 0.293687.
Iteration 17736: Policy loss: -0.158701. Value loss: 0.044203. Entropy: 0.292822.
episode: 6204   score: 680.0  epsilon: 1.0    steps: 288  evaluation reward: 465.05
episode: 6205   score: 695.0  epsilon: 1.0    steps: 408  evaluation reward: 468.95
Training network. lr: 0.000114. clip: 0.045635
Iteration 17737: Policy loss: 0.214990. Value loss: 0.226112. Entropy: 0.288844.
Iteration 17738: Policy loss: 0.218589. Value loss: 0.067453. Entropy: 0.287563.
Iteration 17739: Policy loss: 0.213760. Value loss: 0.047274. Entropy: 0.285689.
Training network. lr: 0.000114. clip: 0.045635
Iteration 17740: Policy loss: 0.250633

Iteration 17799: Policy loss: 0.115199. Value loss: 0.049166. Entropy: 0.298897.
Training network. lr: 0.000114. clip: 0.045478
Iteration 17800: Policy loss: -0.043237. Value loss: 0.091522. Entropy: 0.305393.
Iteration 17801: Policy loss: -0.054121. Value loss: 0.045151. Entropy: 0.306935.
Iteration 17802: Policy loss: -0.053799. Value loss: 0.034712. Entropy: 0.305038.
episode: 6225   score: 620.0  epsilon: 1.0    steps: 632  evaluation reward: 464.75
Training network. lr: 0.000113. clip: 0.045321
Iteration 17803: Policy loss: 0.084031. Value loss: 0.120997. Entropy: 0.298625.
Iteration 17804: Policy loss: 0.073638. Value loss: 0.050717. Entropy: 0.299502.
Iteration 17805: Policy loss: 0.073783. Value loss: 0.037213. Entropy: 0.299407.
Training network. lr: 0.000113. clip: 0.045321
Iteration 17806: Policy loss: -0.120035. Value loss: 0.374255. Entropy: 0.307562.
Iteration 17807: Policy loss: -0.121923. Value loss: 0.202939. Entropy: 0.305860.
Iteration 17808: Policy loss: -0.145611. 

Training network. lr: 0.000113. clip: 0.045174
Iteration 17866: Policy loss: -0.046558. Value loss: 0.581169. Entropy: 0.311081.
Iteration 17867: Policy loss: -0.064794. Value loss: 0.299698. Entropy: 0.311439.
Iteration 17868: Policy loss: -0.090153. Value loss: 0.184205. Entropy: 0.311930.
episode: 6246   score: 390.0  epsilon: 1.0    steps: 512  evaluation reward: 474.7
Training network. lr: 0.000113. clip: 0.045174
Iteration 17869: Policy loss: 0.160534. Value loss: 0.115746. Entropy: 0.305499.
Iteration 17870: Policy loss: 0.154537. Value loss: 0.059859. Entropy: 0.306475.
Iteration 17871: Policy loss: 0.155917. Value loss: 0.040115. Entropy: 0.305656.
Training network. lr: 0.000113. clip: 0.045174
Iteration 17872: Policy loss: 0.162472. Value loss: 0.188764. Entropy: 0.302470.
Iteration 17873: Policy loss: 0.151696. Value loss: 0.060123. Entropy: 0.303583.
Iteration 17874: Policy loss: 0.149252. Value loss: 0.032976. Entropy: 0.303578.
episode: 6247   score: 480.0  epsilon: 1.0  

Training network. lr: 0.000113. clip: 0.045017
Iteration 17932: Policy loss: -0.120568. Value loss: 0.400924. Entropy: 0.308029.
Iteration 17933: Policy loss: -0.139337. Value loss: 0.162740. Entropy: 0.306812.
Iteration 17934: Policy loss: -0.140550. Value loss: 0.089777. Entropy: 0.307224.
episode: 6267   score: 670.0  epsilon: 1.0    steps: 184  evaluation reward: 471.0
episode: 6268   score: 800.0  epsilon: 1.0    steps: 248  evaluation reward: 474.65
Training network. lr: 0.000113. clip: 0.045017
Iteration 17935: Policy loss: 0.335507. Value loss: 0.203803. Entropy: 0.301828.
Iteration 17936: Policy loss: 0.341358. Value loss: 0.095394. Entropy: 0.298594.
Iteration 17937: Policy loss: 0.345071. Value loss: 0.064595. Entropy: 0.298505.
Training network. lr: 0.000113. clip: 0.045017
Iteration 17938: Policy loss: 0.124627. Value loss: 0.165636. Entropy: 0.307223.
Iteration 17939: Policy loss: 0.120344. Value loss: 0.066787. Entropy: 0.307529.
Iteration 17940: Policy loss: 0.112756. V

Iteration 17998: Policy loss: -0.133107. Value loss: 0.224163. Entropy: 0.307419.
Iteration 17999: Policy loss: -0.153831. Value loss: 0.096958. Entropy: 0.306268.
Iteration 18000: Policy loss: -0.153215. Value loss: 0.064603. Entropy: 0.307300.
episode: 6288   score: 460.0  epsilon: 1.0    steps: 840  evaluation reward: 484.3
Training network. lr: 0.000112. clip: 0.044713
Iteration 18001: Policy loss: 0.155994. Value loss: 0.300211. Entropy: 0.299260.
Iteration 18002: Policy loss: 0.157626. Value loss: 0.091526. Entropy: 0.299280.
Iteration 18003: Policy loss: 0.150845. Value loss: 0.050316. Entropy: 0.298622.
Training network. lr: 0.000112. clip: 0.044713
Iteration 18004: Policy loss: 0.029899. Value loss: 0.092407. Entropy: 0.309880.
Iteration 18005: Policy loss: 0.024396. Value loss: 0.036708. Entropy: 0.309326.
Iteration 18006: Policy loss: 0.021662. Value loss: 0.024606. Entropy: 0.309513.
episode: 6289   score: 405.0  epsilon: 1.0    steps: 320  evaluation reward: 485.2
Training

Iteration 18068: Policy loss: 0.334818. Value loss: 0.064163. Entropy: 0.312586.
Iteration 18069: Policy loss: 0.328615. Value loss: 0.038414. Entropy: 0.312616.
Training network. lr: 0.000111. clip: 0.044557
Iteration 18070: Policy loss: 0.118440. Value loss: 0.134931. Entropy: 0.307247.
Iteration 18071: Policy loss: 0.117642. Value loss: 0.055487. Entropy: 0.307857.
Iteration 18072: Policy loss: 0.116029. Value loss: 0.037918. Entropy: 0.307562.
episode: 6305   score: 535.0  epsilon: 1.0    steps: 368  evaluation reward: 480.35
episode: 6306   score: 435.0  epsilon: 1.0    steps: 984  evaluation reward: 480.9
Training network. lr: 0.000111. clip: 0.044557
Iteration 18073: Policy loss: -0.498572. Value loss: 0.397192. Entropy: 0.294107.
Iteration 18074: Policy loss: -0.506012. Value loss: 0.128527. Entropy: 0.293255.
Iteration 18075: Policy loss: -0.499476. Value loss: 0.078942. Entropy: 0.294257.
episode: 6307   score: 270.0  epsilon: 1.0    steps: 432  evaluation reward: 479.35
epis

Iteration 18131: Policy loss: -0.195222. Value loss: 0.080999. Entropy: 0.297689.
Iteration 18132: Policy loss: -0.206884. Value loss: 0.055591. Entropy: 0.298311.
Training network. lr: 0.000111. clip: 0.044400
Iteration 18133: Policy loss: 0.008203. Value loss: 0.105115. Entropy: 0.312219.
Iteration 18134: Policy loss: 0.003199. Value loss: 0.053476. Entropy: 0.313305.
Iteration 18135: Policy loss: 0.001068. Value loss: 0.041022. Entropy: 0.312674.
episode: 6330   score: 300.0  epsilon: 1.0    steps: 616  evaluation reward: 478.75
Training network. lr: 0.000111. clip: 0.044400
Iteration 18136: Policy loss: 0.224614. Value loss: 0.138083. Entropy: 0.305801.
Iteration 18137: Policy loss: 0.227217. Value loss: 0.049814. Entropy: 0.303929.
Iteration 18138: Policy loss: 0.205636. Value loss: 0.035216. Entropy: 0.305114.
episode: 6331   score: 435.0  epsilon: 1.0    steps: 176  evaluation reward: 480.05
Training network. lr: 0.000111. clip: 0.044400
Iteration 18139: Policy loss: -0.323132. 

Iteration 18198: Policy loss: -0.065207. Value loss: 0.056286. Entropy: 0.303252.
now time :  2019-09-06 09:04:06.840218
episode: 6351   score: 225.0  epsilon: 1.0    steps: 552  evaluation reward: 471.0
Training network. lr: 0.000111. clip: 0.044252
Iteration 18199: Policy loss: 0.161728. Value loss: 0.145959. Entropy: 0.309250.
Iteration 18200: Policy loss: 0.156092. Value loss: 0.070710. Entropy: 0.307618.
Iteration 18201: Policy loss: 0.160566. Value loss: 0.043967. Entropy: 0.308209.
episode: 6352   score: 315.0  epsilon: 1.0    steps: 792  evaluation reward: 467.75
Training network. lr: 0.000110. clip: 0.044096
Iteration 18202: Policy loss: 0.170465. Value loss: 0.177205. Entropy: 0.304612.
Iteration 18203: Policy loss: 0.168485. Value loss: 0.081755. Entropy: 0.302873.
Iteration 18204: Policy loss: 0.159629. Value loss: 0.054273. Entropy: 0.303490.
Training network. lr: 0.000110. clip: 0.044096
Iteration 18205: Policy loss: -0.118907. Value loss: 0.093903. Entropy: 0.308314.
Ite

Iteration 18264: Policy loss: 0.096104. Value loss: 0.034754. Entropy: 0.311518.
episode: 6372   score: 725.0  epsilon: 1.0    steps: 512  evaluation reward: 444.1
episode: 6373   score: 655.0  epsilon: 1.0    steps: 600  evaluation reward: 446.7
Training network. lr: 0.000110. clip: 0.043939
Iteration 18265: Policy loss: 0.120130. Value loss: 0.088899. Entropy: 0.288004.
Iteration 18266: Policy loss: 0.108188. Value loss: 0.032817. Entropy: 0.287725.
Iteration 18267: Policy loss: 0.104763. Value loss: 0.022122. Entropy: 0.288558.
episode: 6374   score: 180.0  epsilon: 1.0    steps: 696  evaluation reward: 445.65
Training network. lr: 0.000110. clip: 0.043939
Iteration 18268: Policy loss: 0.087132. Value loss: 0.135917. Entropy: 0.296937.
Iteration 18269: Policy loss: 0.084254. Value loss: 0.060566. Entropy: 0.297663.
Iteration 18270: Policy loss: 0.084394. Value loss: 0.042580. Entropy: 0.297901.
episode: 6375   score: 305.0  epsilon: 1.0    steps: 224  evaluation reward: 444.7
Traini

Training network. lr: 0.000109. clip: 0.043792
Iteration 18331: Policy loss: -0.092648. Value loss: 0.132708. Entropy: 0.305874.
Iteration 18332: Policy loss: -0.098801. Value loss: 0.057204. Entropy: 0.307273.
Iteration 18333: Policy loss: -0.101402. Value loss: 0.044438. Entropy: 0.306140.
episode: 6393   score: 390.0  epsilon: 1.0    steps: 808  evaluation reward: 443.7
Training network. lr: 0.000109. clip: 0.043792
Iteration 18334: Policy loss: 0.045138. Value loss: 0.124024. Entropy: 0.299834.
Iteration 18335: Policy loss: 0.038611. Value loss: 0.062579. Entropy: 0.297209.
Iteration 18336: Policy loss: 0.035645. Value loss: 0.048631. Entropy: 0.298305.
Training network. lr: 0.000109. clip: 0.043792
Iteration 18337: Policy loss: 0.201547. Value loss: 0.131617. Entropy: 0.306202.
Iteration 18338: Policy loss: 0.195231. Value loss: 0.048072. Entropy: 0.305222.
Iteration 18339: Policy loss: 0.191659. Value loss: 0.033975. Entropy: 0.306072.
episode: 6394   score: 400.0  epsilon: 1.0  

Iteration 18396: Policy loss: -0.176641. Value loss: 0.076554. Entropy: 0.300818.
episode: 6415   score: 345.0  epsilon: 1.0    steps: 824  evaluation reward: 413.3
Training network. lr: 0.000109. clip: 0.043635
Iteration 18397: Policy loss: -0.043035. Value loss: 0.095315. Entropy: 0.294959.
Iteration 18398: Policy loss: -0.059744. Value loss: 0.031981. Entropy: 0.296285.
Iteration 18399: Policy loss: -0.054106. Value loss: 0.021904. Entropy: 0.295709.
Training network. lr: 0.000109. clip: 0.043635
Iteration 18400: Policy loss: 0.063128. Value loss: 0.307032. Entropy: 0.311576.
Iteration 18401: Policy loss: 0.066431. Value loss: 0.122975. Entropy: 0.310605.
Iteration 18402: Policy loss: 0.046931. Value loss: 0.089515. Entropy: 0.310798.
Training network. lr: 0.000109. clip: 0.043478
Iteration 18403: Policy loss: -0.207759. Value loss: 0.270103. Entropy: 0.307355.
Iteration 18404: Policy loss: -0.219592. Value loss: 0.100268. Entropy: 0.307607.
Iteration 18405: Policy loss: -0.224796. 

Iteration 18463: Policy loss: -0.114496. Value loss: 0.187288. Entropy: 0.306402.
Iteration 18464: Policy loss: -0.126788. Value loss: 0.082784. Entropy: 0.305936.
Iteration 18465: Policy loss: -0.134576. Value loss: 0.054001. Entropy: 0.306257.
episode: 6435   score: 735.0  epsilon: 1.0    steps: 760  evaluation reward: 428.05
Training network. lr: 0.000108. clip: 0.043331
Iteration 18466: Policy loss: -0.376373. Value loss: 0.371293. Entropy: 0.297737.
Iteration 18467: Policy loss: -0.357603. Value loss: 0.149980. Entropy: 0.299656.
Iteration 18468: Policy loss: -0.391964. Value loss: 0.101727. Entropy: 0.297804.
episode: 6436   score: 515.0  epsilon: 1.0    steps: 152  evaluation reward: 427.6
episode: 6437   score: 585.0  epsilon: 1.0    steps: 928  evaluation reward: 431.35
Training network. lr: 0.000108. clip: 0.043331
Iteration 18469: Policy loss: 0.295420. Value loss: 0.277489. Entropy: 0.295518.
Iteration 18470: Policy loss: 0.280492. Value loss: 0.100686. Entropy: 0.292022.
I

episode: 6457   score: 230.0  epsilon: 1.0    steps: 168  evaluation reward: 438.8
Training network. lr: 0.000108. clip: 0.043174
Iteration 18529: Policy loss: 0.051962. Value loss: 0.090220. Entropy: 0.294115.
Iteration 18530: Policy loss: 0.053523. Value loss: 0.056231. Entropy: 0.294263.
Iteration 18531: Policy loss: 0.051031. Value loss: 0.040435. Entropy: 0.293508.
Training network. lr: 0.000108. clip: 0.043174
Iteration 18532: Policy loss: -0.152087. Value loss: 0.147805. Entropy: 0.312636.
Iteration 18533: Policy loss: -0.159088. Value loss: 0.087740. Entropy: 0.314205.
Iteration 18534: Policy loss: -0.163305. Value loss: 0.064561. Entropy: 0.313685.
episode: 6458   score: 925.0  epsilon: 1.0    steps: 256  evaluation reward: 442.1
Training network. lr: 0.000108. clip: 0.043174
Iteration 18535: Policy loss: -0.179537. Value loss: 0.282149. Entropy: 0.282192.
Iteration 18536: Policy loss: -0.191951. Value loss: 0.156981. Entropy: 0.282962.
Iteration 18537: Policy loss: -0.211355.

episode: 6476   score: 360.0  epsilon: 1.0    steps: 592  evaluation reward: 466.0
Training network. lr: 0.000108. clip: 0.043017
Iteration 18598: Policy loss: 0.146693. Value loss: 0.134405. Entropy: 0.274843.
Iteration 18599: Policy loss: 0.132667. Value loss: 0.063884. Entropy: 0.273684.
Iteration 18600: Policy loss: 0.142056. Value loss: 0.045185. Entropy: 0.275406.
episode: 6477   score: 380.0  epsilon: 1.0    steps: 224  evaluation reward: 463.6
episode: 6478   score: 395.0  epsilon: 1.0    steps: 912  evaluation reward: 463.15
Training network. lr: 0.000107. clip: 0.042870
Iteration 18601: Policy loss: 0.001502. Value loss: 0.258712. Entropy: 0.298057.
Iteration 18602: Policy loss: 0.015026. Value loss: 0.126214. Entropy: 0.297777.
Iteration 18603: Policy loss: -0.008644. Value loss: 0.068492. Entropy: 0.298901.
Training network. lr: 0.000107. clip: 0.042870
Iteration 18604: Policy loss: 0.037969. Value loss: 0.148464. Entropy: 0.288506.
Iteration 18605: Policy loss: 0.031956. V

Training network. lr: 0.000107. clip: 0.042713
Iteration 18664: Policy loss: 0.177821. Value loss: 0.126782. Entropy: 0.304207.
Iteration 18665: Policy loss: 0.178261. Value loss: 0.057589. Entropy: 0.304370.
Iteration 18666: Policy loss: 0.175676. Value loss: 0.041383. Entropy: 0.303239.
episode: 6498   score: 335.0  epsilon: 1.0    steps: 120  evaluation reward: 458.65
episode: 6499   score: 210.0  epsilon: 1.0    steps: 408  evaluation reward: 457.05
Training network. lr: 0.000107. clip: 0.042713
Iteration 18667: Policy loss: -0.068039. Value loss: 0.084112. Entropy: 0.293004.
Iteration 18668: Policy loss: -0.071856. Value loss: 0.038510. Entropy: 0.291627.
Iteration 18669: Policy loss: -0.070307. Value loss: 0.029858. Entropy: 0.290888.
episode: 6500   score: 215.0  epsilon: 1.0    steps: 72  evaluation reward: 453.5
now time :  2019-09-06 09:32:41.706618
episode: 6501   score: 590.0  epsilon: 1.0    steps: 896  evaluation reward: 452.9
Training network. lr: 0.000107. clip: 0.04271

Iteration 18727: Policy loss: 0.020943. Value loss: 0.106732. Entropy: 0.316112.
Iteration 18728: Policy loss: 0.016208. Value loss: 0.044532. Entropy: 0.316502.
Iteration 18729: Policy loss: 0.011168. Value loss: 0.031825. Entropy: 0.315446.
episode: 6522   score: 385.0  epsilon: 1.0    steps: 256  evaluation reward: 430.6
Training network. lr: 0.000106. clip: 0.042557
Iteration 18730: Policy loss: -0.031302. Value loss: 0.095165. Entropy: 0.295838.
Iteration 18731: Policy loss: -0.032066. Value loss: 0.042407. Entropy: 0.295011.
Iteration 18732: Policy loss: -0.037245. Value loss: 0.031924. Entropy: 0.295295.
episode: 6523   score: 260.0  epsilon: 1.0    steps: 808  evaluation reward: 430.05
Training network. lr: 0.000106. clip: 0.042557
Iteration 18733: Policy loss: 0.061813. Value loss: 0.113301. Entropy: 0.309177.
Iteration 18734: Policy loss: 0.050745. Value loss: 0.061156. Entropy: 0.307426.
Iteration 18735: Policy loss: 0.043644. Value loss: 0.050510. Entropy: 0.307692.
episode

Training network. lr: 0.000106. clip: 0.042409
Iteration 18790: Policy loss: -0.159212. Value loss: 0.340302. Entropy: 0.276981.
Iteration 18791: Policy loss: -0.157724. Value loss: 0.246429. Entropy: 0.275008.
Iteration 18792: Policy loss: -0.172857. Value loss: 0.191630. Entropy: 0.275487.
Training network. lr: 0.000106. clip: 0.042409
Iteration 18793: Policy loss: -0.024465. Value loss: 0.119541. Entropy: 0.309160.
Iteration 18794: Policy loss: -0.031554. Value loss: 0.053574. Entropy: 0.308107.
Iteration 18795: Policy loss: -0.029857. Value loss: 0.039447. Entropy: 0.308163.
episode: 6548   score: 565.0  epsilon: 1.0    steps: 544  evaluation reward: 406.3
Training network. lr: 0.000106. clip: 0.042409
Iteration 18796: Policy loss: 0.201153. Value loss: 0.136642. Entropy: 0.308924.
Iteration 18797: Policy loss: 0.194008. Value loss: 0.078293. Entropy: 0.306138.
Iteration 18798: Policy loss: 0.197624. Value loss: 0.050360. Entropy: 0.306804.
Training network. lr: 0.000106. clip: 0.0

Iteration 18857: Policy loss: 0.061204. Value loss: 0.057952. Entropy: 0.307490.
Iteration 18858: Policy loss: 0.056946. Value loss: 0.041309. Entropy: 0.306058.
episode: 6568   score: 470.0  epsilon: 1.0    steps: 376  evaluation reward: 390.35
episode: 6569   score: 315.0  epsilon: 1.0    steps: 376  evaluation reward: 390.05
Training network. lr: 0.000105. clip: 0.042096
Iteration 18859: Policy loss: 0.275150. Value loss: 0.110751. Entropy: 0.291005.
Iteration 18860: Policy loss: 0.277391. Value loss: 0.052934. Entropy: 0.289989.
Iteration 18861: Policy loss: 0.268948. Value loss: 0.035777. Entropy: 0.290936.
episode: 6570   score: 320.0  epsilon: 1.0    steps: 560  evaluation reward: 387.0
episode: 6571   score: 500.0  epsilon: 1.0    steps: 904  evaluation reward: 384.15
episode: 6572   score: 390.0  epsilon: 1.0    steps: 904  evaluation reward: 385.35
Training network. lr: 0.000105. clip: 0.042096
Iteration 18862: Policy loss: 0.133004. Value loss: 0.107652. Entropy: 0.295675.
I

episode: 6591   score: 260.0  epsilon: 1.0    steps: 976  evaluation reward: 364.6
Training network. lr: 0.000105. clip: 0.041948
Iteration 18922: Policy loss: 0.021304. Value loss: 0.227369. Entropy: 0.312182.
Iteration 18923: Policy loss: 0.003377. Value loss: 0.100222. Entropy: 0.312572.
Iteration 18924: Policy loss: -0.001392. Value loss: 0.073384. Entropy: 0.312809.
episode: 6592   score: 300.0  epsilon: 1.0    steps: 680  evaluation reward: 365.2
episode: 6593   score: 365.0  epsilon: 1.0    steps: 816  evaluation reward: 365.3
Training network. lr: 0.000105. clip: 0.041948
Iteration 18925: Policy loss: -0.010451. Value loss: 0.340564. Entropy: 0.292340.
Iteration 18926: Policy loss: -0.013590. Value loss: 0.189287. Entropy: 0.290511.
Iteration 18927: Policy loss: -0.020433. Value loss: 0.131788. Entropy: 0.291575.
episode: 6594   score: 440.0  epsilon: 1.0    steps: 216  evaluation reward: 365.1
episode: 6595   score: 365.0  epsilon: 1.0    steps: 392  evaluation reward: 362.25


Iteration 18984: Policy loss: -0.131739. Value loss: 0.022181. Entropy: 0.302494.
episode: 6617   score: 305.0  epsilon: 1.0    steps: 944  evaluation reward: 353.3
Training network. lr: 0.000104. clip: 0.041792
Iteration 18985: Policy loss: -0.286082. Value loss: 0.103961. Entropy: 0.316970.
Iteration 18986: Policy loss: -0.289834. Value loss: 0.047799. Entropy: 0.317336.
Iteration 18987: Policy loss: -0.294075. Value loss: 0.034775. Entropy: 0.317007.
episode: 6618   score: 285.0  epsilon: 1.0    steps: 400  evaluation reward: 354.0
Training network. lr: 0.000104. clip: 0.041792
Iteration 18988: Policy loss: -0.305024. Value loss: 0.207532. Entropy: 0.303313.
Iteration 18989: Policy loss: -0.301057. Value loss: 0.093906. Entropy: 0.304783.
Iteration 18990: Policy loss: -0.318810. Value loss: 0.058545. Entropy: 0.304585.
Training network. lr: 0.000104. clip: 0.041792
Iteration 18991: Policy loss: 0.217752. Value loss: 0.168173. Entropy: 0.312741.
Iteration 18992: Policy loss: 0.205669

Iteration 19050: Policy loss: 0.044549. Value loss: 0.051585. Entropy: 0.315648.
Training network. lr: 0.000104. clip: 0.041488
Iteration 19051: Policy loss: 0.037380. Value loss: 0.101153. Entropy: 0.310332.
Iteration 19052: Policy loss: 0.030645. Value loss: 0.042266. Entropy: 0.309167.
Iteration 19053: Policy loss: 0.030520. Value loss: 0.028495. Entropy: 0.308351.
episode: 6639   score: 290.0  epsilon: 1.0    steps: 624  evaluation reward: 374.55
episode: 6640   score: 225.0  epsilon: 1.0    steps: 688  evaluation reward: 374.4
Training network. lr: 0.000104. clip: 0.041488
Iteration 19054: Policy loss: -0.052658. Value loss: 0.092875. Entropy: 0.292144.
Iteration 19055: Policy loss: -0.056286. Value loss: 0.045601. Entropy: 0.290669.
Iteration 19056: Policy loss: -0.054383. Value loss: 0.035024. Entropy: 0.289298.
episode: 6641   score: 360.0  epsilon: 1.0    steps: 560  evaluation reward: 374.85
episode: 6642   score: 315.0  epsilon: 1.0    steps: 688  evaluation reward: 371.8
Tr

Iteration 19114: Policy loss: 0.217035. Value loss: 0.104461. Entropy: 0.313129.
Iteration 19115: Policy loss: 0.216332. Value loss: 0.038593. Entropy: 0.312714.
Iteration 19116: Policy loss: 0.220547. Value loss: 0.029480. Entropy: 0.312129.
Training network. lr: 0.000103. clip: 0.041331
Iteration 19117: Policy loss: -0.022535. Value loss: 0.120137. Entropy: 0.313020.
Iteration 19118: Policy loss: -0.031164. Value loss: 0.055330. Entropy: 0.312608.
Iteration 19119: Policy loss: -0.028031. Value loss: 0.038533. Entropy: 0.312303.
episode: 6662   score: 600.0  epsilon: 1.0    steps: 392  evaluation reward: 359.85
episode: 6663   score: 355.0  epsilon: 1.0    steps: 960  evaluation reward: 358.25
Training network. lr: 0.000103. clip: 0.041331
Iteration 19120: Policy loss: 0.081527. Value loss: 0.092394. Entropy: 0.298981.
Iteration 19121: Policy loss: 0.076362. Value loss: 0.037024. Entropy: 0.298822.
Iteration 19122: Policy loss: 0.072917. Value loss: 0.025995. Entropy: 0.298048.
episod

Training network. lr: 0.000103. clip: 0.041174
Iteration 19180: Policy loss: -0.035786. Value loss: 0.085313. Entropy: 0.318469.
Iteration 19181: Policy loss: -0.033004. Value loss: 0.037115. Entropy: 0.318836.
Iteration 19182: Policy loss: -0.039465. Value loss: 0.025209. Entropy: 0.318007.
Training network. lr: 0.000103. clip: 0.041174
Iteration 19183: Policy loss: -0.261581. Value loss: 0.317859. Entropy: 0.312034.
Iteration 19184: Policy loss: -0.241246. Value loss: 0.178016. Entropy: 0.311466.
Iteration 19185: Policy loss: -0.254462. Value loss: 0.135217. Entropy: 0.311120.
episode: 6684   score: 225.0  epsilon: 1.0    steps: 168  evaluation reward: 354.0
episode: 6685   score: 215.0  epsilon: 1.0    steps: 520  evaluation reward: 353.0
episode: 6686   score: 355.0  epsilon: 1.0    steps: 800  evaluation reward: 353.85
Training network. lr: 0.000103. clip: 0.041174
Iteration 19186: Policy loss: 0.405749. Value loss: 0.199825. Entropy: 0.272841.
Iteration 19187: Policy loss: 0.4029

episode: 6705   score: 340.0  epsilon: 1.0    steps: 152  evaluation reward: 361.6
episode: 6706   score: 365.0  epsilon: 1.0    steps: 304  evaluation reward: 363.9
episode: 6707   score: 335.0  epsilon: 1.0    steps: 1024  evaluation reward: 362.95
Training network. lr: 0.000103. clip: 0.041027
Iteration 19246: Policy loss: 0.269790. Value loss: 0.141777. Entropy: 0.282250.
Iteration 19247: Policy loss: 0.258287. Value loss: 0.051608. Entropy: 0.281557.
Iteration 19248: Policy loss: 0.262354. Value loss: 0.031417. Entropy: 0.280510.
episode: 6708   score: 475.0  epsilon: 1.0    steps: 80  evaluation reward: 365.45
Training network. lr: 0.000103. clip: 0.041027
Iteration 19249: Policy loss: -0.081374. Value loss: 0.110832. Entropy: 0.289892.
Iteration 19250: Policy loss: -0.077419. Value loss: 0.052463. Entropy: 0.288392.
Iteration 19251: Policy loss: -0.089098. Value loss: 0.041025. Entropy: 0.288237.
episode: 6709   score: 305.0  epsilon: 1.0    steps: 736  evaluation reward: 366.0


Iteration 19313: Policy loss: 0.185184. Value loss: 0.054640. Entropy: 0.300869.
Iteration 19314: Policy loss: 0.185448. Value loss: 0.032420. Entropy: 0.301492.
Training network. lr: 0.000102. clip: 0.040713
Iteration 19315: Policy loss: -0.288434. Value loss: 0.403931. Entropy: 0.319674.
Iteration 19316: Policy loss: -0.299240. Value loss: 0.199586. Entropy: 0.320099.
Iteration 19317: Policy loss: -0.304310. Value loss: 0.142732. Entropy: 0.319772.
episode: 6725   score: 715.0  epsilon: 1.0    steps: 352  evaluation reward: 392.55
episode: 6726   score: 660.0  epsilon: 1.0    steps: 824  evaluation reward: 397.65
Training network. lr: 0.000102. clip: 0.040713
Iteration 19318: Policy loss: 0.151515. Value loss: 0.118626. Entropy: 0.282643.
Iteration 19319: Policy loss: 0.143891. Value loss: 0.051173. Entropy: 0.283012.
Iteration 19320: Policy loss: 0.143817. Value loss: 0.033220. Entropy: 0.282280.
episode: 6727   score: 300.0  epsilon: 1.0    steps: 8  evaluation reward: 397.1
episod

Iteration 19381: Policy loss: -0.123216. Value loss: 0.256472. Entropy: 0.312680.
Iteration 19382: Policy loss: -0.125880. Value loss: 0.086406. Entropy: 0.311278.
Iteration 19383: Policy loss: -0.134501. Value loss: 0.057033. Entropy: 0.312266.
episode: 6744   score: 535.0  epsilon: 1.0    steps: 496  evaluation reward: 427.2
Training network. lr: 0.000101. clip: 0.040566
Iteration 19384: Policy loss: 0.548653. Value loss: 0.458021. Entropy: 0.302401.
Iteration 19385: Policy loss: 0.545196. Value loss: 0.133484. Entropy: 0.304089.
Iteration 19386: Policy loss: 0.543712. Value loss: 0.083732. Entropy: 0.303793.
episode: 6745   score: 680.0  epsilon: 1.0    steps: 80  evaluation reward: 431.6
episode: 6746   score: 550.0  epsilon: 1.0    steps: 320  evaluation reward: 434.0
episode: 6747   score: 895.0  epsilon: 1.0    steps: 536  evaluation reward: 442.05
episode: 6748   score: 650.0  epsilon: 1.0    steps: 880  evaluation reward: 445.9
Training network. lr: 0.000101. clip: 0.040566
It

episode: 6766   score: 655.0  epsilon: 1.0    steps: 320  evaluation reward: 447.55
Training network. lr: 0.000101. clip: 0.040409
Iteration 19447: Policy loss: 0.054103. Value loss: 0.075703. Entropy: 0.283772.
Iteration 19448: Policy loss: 0.059927. Value loss: 0.045163. Entropy: 0.284828.
Iteration 19449: Policy loss: 0.051557. Value loss: 0.036240. Entropy: 0.283625.
episode: 6767   score: 695.0  epsilon: 1.0    steps: 664  evaluation reward: 449.25
Training network. lr: 0.000101. clip: 0.040409
Iteration 19450: Policy loss: -0.029665. Value loss: 0.266729. Entropy: 0.305520.
Iteration 19451: Policy loss: -0.039930. Value loss: 0.135824. Entropy: 0.304891.
Iteration 19452: Policy loss: -0.027109. Value loss: 0.091200. Entropy: 0.305345.
Training network. lr: 0.000101. clip: 0.040253
Iteration 19453: Policy loss: 0.093040. Value loss: 0.207928. Entropy: 0.315517.
Iteration 19454: Policy loss: 0.084580. Value loss: 0.122015. Entropy: 0.314692.
Iteration 19455: Policy loss: 0.092603. 

Iteration 19514: Policy loss: 0.102138. Value loss: 0.025833. Entropy: 0.310653.
Iteration 19515: Policy loss: 0.103468. Value loss: 0.020947. Entropy: 0.310368.
episode: 6786   score: 365.0  epsilon: 1.0    steps: 16  evaluation reward: 465.5
episode: 6787   score: 400.0  epsilon: 1.0    steps: 344  evaluation reward: 464.15
Training network. lr: 0.000100. clip: 0.040105
Iteration 19516: Policy loss: -0.052224. Value loss: 0.114022. Entropy: 0.278223.
Iteration 19517: Policy loss: -0.053373. Value loss: 0.050684. Entropy: 0.278477.
Iteration 19518: Policy loss: -0.057718. Value loss: 0.038591. Entropy: 0.278902.
episode: 6788   score: 315.0  epsilon: 1.0    steps: 208  evaluation reward: 463.7
episode: 6789   score: 495.0  epsilon: 1.0    steps: 872  evaluation reward: 465.8
Training network. lr: 0.000100. clip: 0.040105
Iteration 19519: Policy loss: -0.103325. Value loss: 0.114043. Entropy: 0.298626.
Iteration 19520: Policy loss: -0.115653. Value loss: 0.055302. Entropy: 0.300219.
It

Iteration 19580: Policy loss: -0.225067. Value loss: 0.174684. Entropy: 0.307716.
Iteration 19581: Policy loss: -0.219634. Value loss: 0.073012. Entropy: 0.307353.
episode: 6807   score: 390.0  epsilon: 1.0    steps: 376  evaluation reward: 475.5
Training network. lr: 0.000100. clip: 0.039949
Iteration 19582: Policy loss: 0.175025. Value loss: 0.126517. Entropy: 0.297868.
Iteration 19583: Policy loss: 0.171253. Value loss: 0.062695. Entropy: 0.297079.
Iteration 19584: Policy loss: 0.171534. Value loss: 0.040666. Entropy: 0.297206.
episode: 6808   score: 325.0  epsilon: 1.0    steps: 656  evaluation reward: 474.0
Training network. lr: 0.000100. clip: 0.039949
Iteration 19585: Policy loss: 0.324713. Value loss: 0.190419. Entropy: 0.310115.
Iteration 19586: Policy loss: 0.312556. Value loss: 0.101798. Entropy: 0.310911.
Iteration 19587: Policy loss: 0.308192. Value loss: 0.067676. Entropy: 0.310508.
episode: 6809   score: 410.0  epsilon: 1.0    steps: 80  evaluation reward: 475.05
Trainin

Training network. lr: 0.000099. clip: 0.039792
Iteration 19645: Policy loss: -0.379851. Value loss: 0.328217. Entropy: 0.309941.
Iteration 19646: Policy loss: -0.390311. Value loss: 0.209397. Entropy: 0.310050.
Iteration 19647: Policy loss: -0.374377. Value loss: 0.139164. Entropy: 0.309360.
episode: 6831   score: 390.0  epsilon: 1.0    steps: 440  evaluation reward: 445.75
Training network. lr: 0.000099. clip: 0.039792
Iteration 19648: Policy loss: -0.310352. Value loss: 0.280199. Entropy: 0.304267.
Iteration 19649: Policy loss: -0.301247. Value loss: 0.125271. Entropy: 0.302954.
Iteration 19650: Policy loss: -0.309858. Value loss: 0.079011. Entropy: 0.301937.
episode: 6832   score: 460.0  epsilon: 1.0    steps: 488  evaluation reward: 446.05
Training network. lr: 0.000099. clip: 0.039644
Iteration 19651: Policy loss: 0.153863. Value loss: 0.096692. Entropy: 0.299081.
Iteration 19652: Policy loss: 0.144048. Value loss: 0.034335. Entropy: 0.298795.
Iteration 19653: Policy loss: 0.13755

episode: 6852   score: 285.0  epsilon: 1.0    steps: 976  evaluation reward: 417.8
Training network. lr: 0.000099. clip: 0.039488
Iteration 19711: Policy loss: 0.301259. Value loss: 0.172884. Entropy: 0.305662.
Iteration 19712: Policy loss: 0.302015. Value loss: 0.058349. Entropy: 0.305050.
Iteration 19713: Policy loss: 0.307211. Value loss: 0.033308. Entropy: 0.303799.
episode: 6853   score: 210.0  epsilon: 1.0    steps: 784  evaluation reward: 416.25
Training network. lr: 0.000099. clip: 0.039488
Iteration 19714: Policy loss: 0.021793. Value loss: 0.165220. Entropy: 0.306251.
Iteration 19715: Policy loss: 0.013938. Value loss: 0.078693. Entropy: 0.305315.
Iteration 19716: Policy loss: 0.006822. Value loss: 0.056309. Entropy: 0.305144.
Training network. lr: 0.000099. clip: 0.039488
Iteration 19717: Policy loss: 0.164736. Value loss: 0.125468. Entropy: 0.315553.
Iteration 19718: Policy loss: 0.158836. Value loss: 0.052174. Entropy: 0.316151.
Iteration 19719: Policy loss: 0.146632. Valu

Training network. lr: 0.000098. clip: 0.039331
Iteration 19777: Policy loss: 0.088300. Value loss: 0.268030. Entropy: 0.305361.
Iteration 19778: Policy loss: 0.090195. Value loss: 0.152531. Entropy: 0.303943.
Iteration 19779: Policy loss: 0.090451. Value loss: 0.111491. Entropy: 0.304836.
Training network. lr: 0.000098. clip: 0.039331
Iteration 19780: Policy loss: -0.253155. Value loss: 0.305716. Entropy: 0.309202.
Iteration 19781: Policy loss: -0.250770. Value loss: 0.113559. Entropy: 0.308811.
Iteration 19782: Policy loss: -0.267260. Value loss: 0.055761. Entropy: 0.309909.
episode: 6874   score: 335.0  epsilon: 1.0    steps: 616  evaluation reward: 413.7
Training network. lr: 0.000098. clip: 0.039331
Iteration 19783: Policy loss: -0.078275. Value loss: 0.072441. Entropy: 0.312702.
Iteration 19784: Policy loss: -0.084369. Value loss: 0.035078. Entropy: 0.311859.
Iteration 19785: Policy loss: -0.084871. Value loss: 0.025764. Entropy: 0.311776.
episode: 6875   score: 515.0  epsilon: 1.

episode: 6896   score: 210.0  epsilon: 1.0    steps: 552  evaluation reward: 402.3
Training network. lr: 0.000098. clip: 0.039184
Iteration 19843: Policy loss: 0.063888. Value loss: 0.101569. Entropy: 0.309932.
Iteration 19844: Policy loss: 0.058005. Value loss: 0.055743. Entropy: 0.309503.
Iteration 19845: Policy loss: 0.058830. Value loss: 0.042242. Entropy: 0.309391.
Training network. lr: 0.000098. clip: 0.039184
Iteration 19846: Policy loss: 0.079135. Value loss: 0.102763. Entropy: 0.314326.
Iteration 19847: Policy loss: 0.067735. Value loss: 0.047811. Entropy: 0.313862.
Iteration 19848: Policy loss: 0.061522. Value loss: 0.034028. Entropy: 0.314360.
episode: 6897   score: 180.0  epsilon: 1.0    steps: 80  evaluation reward: 397.35
episode: 6898   score: 210.0  epsilon: 1.0    steps: 984  evaluation reward: 394.5
Training network. lr: 0.000098. clip: 0.039184
Iteration 19849: Policy loss: -0.044111. Value loss: 0.103766. Entropy: 0.304957.
Iteration 19850: Policy loss: -0.049724. V

Iteration 19907: Policy loss: -0.099960. Value loss: 0.037371. Entropy: 0.300201.
Iteration 19908: Policy loss: -0.102855. Value loss: 0.031030. Entropy: 0.299283.
Training network. lr: 0.000097. clip: 0.038870
Iteration 19909: Policy loss: -0.211924. Value loss: 0.327893. Entropy: 0.315959.
Iteration 19910: Policy loss: -0.197925. Value loss: 0.113384. Entropy: 0.316108.
Iteration 19911: Policy loss: -0.210124. Value loss: 0.057639. Entropy: 0.315555.
Training network. lr: 0.000097. clip: 0.038870
Iteration 19912: Policy loss: -0.113319. Value loss: 0.359648. Entropy: 0.311285.
Iteration 19913: Policy loss: -0.116749. Value loss: 0.216372. Entropy: 0.310971.
Iteration 19914: Policy loss: -0.117656. Value loss: 0.167340. Entropy: 0.311130.
episode: 6919   score: 540.0  epsilon: 1.0    steps: 664  evaluation reward: 391.05
Training network. lr: 0.000097. clip: 0.038870
Iteration 19915: Policy loss: -0.512927. Value loss: 0.567108. Entropy: 0.304191.
Iteration 19916: Policy loss: -0.5037

Iteration 19974: Policy loss: 0.307555. Value loss: 0.048872. Entropy: 0.302359.
episode: 6940   score: 450.0  epsilon: 1.0    steps: 408  evaluation reward: 404.3
Training network. lr: 0.000097. clip: 0.038723
Iteration 19975: Policy loss: 0.204309. Value loss: 0.212356. Entropy: 0.309302.
Iteration 19976: Policy loss: 0.190712. Value loss: 0.092134. Entropy: 0.308700.
Iteration 19977: Policy loss: 0.196297. Value loss: 0.064283. Entropy: 0.309334.
episode: 6941   score: 335.0  epsilon: 1.0    steps: 904  evaluation reward: 405.55
Training network. lr: 0.000097. clip: 0.038723
Iteration 19978: Policy loss: -0.298047. Value loss: 0.271012. Entropy: 0.307137.
Iteration 19979: Policy loss: -0.299856. Value loss: 0.115330. Entropy: 0.307263.
Iteration 19980: Policy loss: -0.317068. Value loss: 0.074110. Entropy: 0.308329.
Training network. lr: 0.000097. clip: 0.038723
Iteration 19981: Policy loss: 0.028774. Value loss: 0.118103. Entropy: 0.308476.
Iteration 19982: Policy loss: 0.030075. V

Iteration 20038: Policy loss: -0.194235. Value loss: 0.223619. Entropy: 0.311539.
Iteration 20039: Policy loss: -0.212414. Value loss: 0.092097. Entropy: 0.311728.
Iteration 20040: Policy loss: -0.210176. Value loss: 0.049625. Entropy: 0.310967.
episode: 6963   score: 185.0  epsilon: 1.0    steps: 688  evaluation reward: 388.2
Training network. lr: 0.000096. clip: 0.038566
Iteration 20041: Policy loss: 0.048187. Value loss: 0.245429. Entropy: 0.309139.
Iteration 20042: Policy loss: 0.062050. Value loss: 0.100801. Entropy: 0.307108.
Iteration 20043: Policy loss: 0.046867. Value loss: 0.061280. Entropy: 0.310081.
episode: 6964   score: 210.0  epsilon: 1.0    steps: 432  evaluation reward: 387.6
episode: 6965   score: 535.0  epsilon: 1.0    steps: 552  evaluation reward: 388.75
episode: 6966   score: 285.0  epsilon: 1.0    steps: 888  evaluation reward: 388.7
Training network. lr: 0.000096. clip: 0.038566
Iteration 20044: Policy loss: 0.162756. Value loss: 0.171010. Entropy: 0.306106.
Ite

episode: 6986   score: 285.0  epsilon: 1.0    steps: 544  evaluation reward: 384.15
Training network. lr: 0.000096. clip: 0.038262
Iteration 20104: Policy loss: 0.181820. Value loss: 0.092898. Entropy: 0.309542.
Iteration 20105: Policy loss: 0.174899. Value loss: 0.042415. Entropy: 0.308069.
Iteration 20106: Policy loss: 0.177370. Value loss: 0.029708. Entropy: 0.307576.
episode: 6987   score: 540.0  epsilon: 1.0    steps: 160  evaluation reward: 386.0
episode: 6988   score: 360.0  epsilon: 1.0    steps: 760  evaluation reward: 386.05
Training network. lr: 0.000096. clip: 0.038262
Iteration 20107: Policy loss: -0.096231. Value loss: 0.367682. Entropy: 0.310103.
Iteration 20108: Policy loss: -0.094034. Value loss: 0.124949. Entropy: 0.309170.
Iteration 20109: Policy loss: -0.102735. Value loss: 0.063751. Entropy: 0.309549.
Training network. lr: 0.000096. clip: 0.038262
Iteration 20110: Policy loss: 0.447980. Value loss: 0.144124. Entropy: 0.311961.
Iteration 20111: Policy loss: 0.445888

Iteration 20171: Policy loss: 0.259510. Value loss: 0.067839. Entropy: 0.309358.
Iteration 20172: Policy loss: 0.257566. Value loss: 0.046424. Entropy: 0.308554.
Training network. lr: 0.000095. clip: 0.038105
Iteration 20173: Policy loss: 0.338953. Value loss: 0.073569. Entropy: 0.307948.
Iteration 20174: Policy loss: 0.340225. Value loss: 0.028592. Entropy: 0.307364.
Iteration 20175: Policy loss: 0.330006. Value loss: 0.020028. Entropy: 0.307956.
episode: 7006   score: 475.0  epsilon: 1.0    steps: 704  evaluation reward: 406.2
Training network. lr: 0.000095. clip: 0.038105
Iteration 20176: Policy loss: -0.038350. Value loss: 0.135016. Entropy: 0.309459.
Iteration 20177: Policy loss: -0.044620. Value loss: 0.055368. Entropy: 0.309134.
Iteration 20178: Policy loss: -0.038861. Value loss: 0.038143. Entropy: 0.309645.
episode: 7007   score: 670.0  epsilon: 1.0    steps: 344  evaluation reward: 406.2
episode: 7008   score: 435.0  epsilon: 1.0    steps: 624  evaluation reward: 407.25
Train

Iteration 20238: Policy loss: -0.368692. Value loss: 0.241428. Entropy: 0.311361.
episode: 7027   score: 415.0  epsilon: 1.0    steps: 400  evaluation reward: 384.25
Training network. lr: 0.000095. clip: 0.037949
Iteration 20239: Policy loss: -0.066109. Value loss: 0.059691. Entropy: 0.303147.
Iteration 20240: Policy loss: -0.066766. Value loss: 0.029154. Entropy: 0.304852.
Iteration 20241: Policy loss: -0.072044. Value loss: 0.021302. Entropy: 0.305858.
episode: 7028   score: 400.0  epsilon: 1.0    steps: 968  evaluation reward: 386.1
Training network. lr: 0.000095. clip: 0.037949
Iteration 20242: Policy loss: 0.013608. Value loss: 0.109246. Entropy: 0.307970.
Iteration 20243: Policy loss: 0.013276. Value loss: 0.045955. Entropy: 0.307518.
Iteration 20244: Policy loss: 0.007955. Value loss: 0.033181. Entropy: 0.308672.
Training network. lr: 0.000095. clip: 0.037949
Iteration 20245: Policy loss: -0.152481. Value loss: 0.379364. Entropy: 0.313348.
Iteration 20246: Policy loss: -0.167884

Iteration 20303: Policy loss: 0.118877. Value loss: 0.046587. Entropy: 0.309423.
Iteration 20304: Policy loss: 0.124024. Value loss: 0.035642. Entropy: 0.309939.
episode: 7050   score: 285.0  epsilon: 1.0    steps: 136  evaluation reward: 388.3
Training network. lr: 0.000094. clip: 0.037645
Iteration 20305: Policy loss: -0.102200. Value loss: 0.108564. Entropy: 0.308955.
Iteration 20306: Policy loss: -0.106664. Value loss: 0.051144. Entropy: 0.308140.
Iteration 20307: Policy loss: -0.104637. Value loss: 0.036653. Entropy: 0.309550.
Training network. lr: 0.000094. clip: 0.037645
Iteration 20308: Policy loss: -0.451419. Value loss: 0.341824. Entropy: 0.312411.
Iteration 20309: Policy loss: -0.453716. Value loss: 0.174496. Entropy: 0.312929.
Iteration 20310: Policy loss: -0.447279. Value loss: 0.117736. Entropy: 0.312489.
now time :  2019-09-06 11:12:27.753938
episode: 7051   score: 365.0  epsilon: 1.0    steps: 648  evaluation reward: 386.5
Training network. lr: 0.000094. clip: 0.037645


Iteration 20370: Policy loss: -0.430704. Value loss: 0.124533. Entropy: 0.298313.
Training network. lr: 0.000094. clip: 0.037488
Iteration 20371: Policy loss: -0.174034. Value loss: 0.133225. Entropy: 0.309704.
Iteration 20372: Policy loss: -0.176155. Value loss: 0.062793. Entropy: 0.310299.
Iteration 20373: Policy loss: -0.176857. Value loss: 0.046958. Entropy: 0.309689.
Training network. lr: 0.000094. clip: 0.037488
Iteration 20374: Policy loss: 0.084760. Value loss: 0.164787. Entropy: 0.304522.
Iteration 20375: Policy loss: 0.081884. Value loss: 0.085316. Entropy: 0.303045.
Iteration 20376: Policy loss: 0.076283. Value loss: 0.065678. Entropy: 0.303016.
episode: 7070   score: 590.0  epsilon: 1.0    steps: 536  evaluation reward: 410.35
episode: 7071   score: 520.0  epsilon: 1.0    steps: 712  evaluation reward: 411.3
episode: 7072   score: 800.0  epsilon: 1.0    steps: 928  evaluation reward: 416.9
Training network. lr: 0.000094. clip: 0.037488
Iteration 20377: Policy loss: 0.117771

Iteration 20436: Policy loss: 0.270699. Value loss: 0.046820. Entropy: 0.304118.
Training network. lr: 0.000093. clip: 0.037340
Iteration 20437: Policy loss: -0.403275. Value loss: 0.400211. Entropy: 0.302465.
Iteration 20438: Policy loss: -0.405230. Value loss: 0.200785. Entropy: 0.303319.
Iteration 20439: Policy loss: -0.403289. Value loss: 0.128062. Entropy: 0.303167.
Training network. lr: 0.000093. clip: 0.037340
Iteration 20440: Policy loss: 0.068743. Value loss: 0.121101. Entropy: 0.304359.
Iteration 20441: Policy loss: 0.068482. Value loss: 0.057078. Entropy: 0.303012.
Iteration 20442: Policy loss: 0.066639. Value loss: 0.039524. Entropy: 0.304057.
Training network. lr: 0.000093. clip: 0.037340
Iteration 20443: Policy loss: 0.118854. Value loss: 0.176935. Entropy: 0.308745.
Iteration 20444: Policy loss: 0.122760. Value loss: 0.074774. Entropy: 0.308987.
Iteration 20445: Policy loss: 0.112311. Value loss: 0.051794. Entropy: 0.308052.
Training network. lr: 0.000093. clip: 0.037340

episode: 7109   score: 435.0  epsilon: 1.0    steps: 632  evaluation reward: 425.55
episode: 7110   score: 500.0  epsilon: 1.0    steps: 872  evaluation reward: 424.9
episode: 7111   score: 390.0  epsilon: 1.0    steps: 880  evaluation reward: 424.6
Training network. lr: 0.000093. clip: 0.037027
Iteration 20506: Policy loss: -0.031915. Value loss: 0.108419. Entropy: 0.284876.
Iteration 20507: Policy loss: -0.039020. Value loss: 0.058129. Entropy: 0.285303.
Iteration 20508: Policy loss: -0.039789. Value loss: 0.044790. Entropy: 0.283434.
Training network. lr: 0.000093. clip: 0.037027
Iteration 20509: Policy loss: -0.064276. Value loss: 0.107360. Entropy: 0.311987.
Iteration 20510: Policy loss: -0.066686. Value loss: 0.042971. Entropy: 0.312539.
Iteration 20511: Policy loss: -0.069780. Value loss: 0.027934. Entropy: 0.310694.
episode: 7112   score: 500.0  epsilon: 1.0    steps: 320  evaluation reward: 426.85
Training network. lr: 0.000093. clip: 0.037027
Iteration 20512: Policy loss: 0.1

Iteration 20572: Policy loss: 0.147771. Value loss: 0.120342. Entropy: 0.301584.
Iteration 20573: Policy loss: 0.151026. Value loss: 0.043727. Entropy: 0.300784.
Iteration 20574: Policy loss: 0.137263. Value loss: 0.030900. Entropy: 0.301122.
episode: 7130   score: 120.0  epsilon: 1.0    steps: 760  evaluation reward: 432.95
Training network. lr: 0.000092. clip: 0.036880
Iteration 20575: Policy loss: -0.111838. Value loss: 0.377716. Entropy: 0.306511.
Iteration 20576: Policy loss: -0.122438. Value loss: 0.232583. Entropy: 0.303249.
Iteration 20577: Policy loss: -0.133312. Value loss: 0.182737. Entropy: 0.302538.
episode: 7131   score: 345.0  epsilon: 1.0    steps: 224  evaluation reward: 432.45
Training network. lr: 0.000092. clip: 0.036880
Iteration 20578: Policy loss: -0.004705. Value loss: 0.130478. Entropy: 0.303050.
Iteration 20579: Policy loss: -0.013672. Value loss: 0.063903. Entropy: 0.302442.
Iteration 20580: Policy loss: -0.007586. Value loss: 0.048999. Entropy: 0.302809.
epi

Training network. lr: 0.000092. clip: 0.036723
Iteration 20638: Policy loss: 0.119927. Value loss: 0.162075. Entropy: 0.306804.
Iteration 20639: Policy loss: 0.123503. Value loss: 0.081479. Entropy: 0.304634.
Iteration 20640: Policy loss: 0.116734. Value loss: 0.060159. Entropy: 0.304460.
Training network. lr: 0.000092. clip: 0.036723
Iteration 20641: Policy loss: 0.460063. Value loss: 0.221492. Entropy: 0.310864.
Iteration 20642: Policy loss: 0.438280. Value loss: 0.075351. Entropy: 0.310887.
Iteration 20643: Policy loss: 0.444570. Value loss: 0.051961. Entropy: 0.308888.
episode: 7152   score: 300.0  epsilon: 1.0    steps: 376  evaluation reward: 444.6
episode: 7153   score: 760.0  epsilon: 1.0    steps: 576  evaluation reward: 447.15
Training network. lr: 0.000092. clip: 0.036723
Iteration 20644: Policy loss: 0.269905. Value loss: 0.194396. Entropy: 0.289603.
Iteration 20645: Policy loss: 0.262511. Value loss: 0.073210. Entropy: 0.288059.
Iteration 20646: Policy loss: 0.260849. Valu

Iteration 20706: Policy loss: 0.415077. Value loss: 0.059249. Entropy: 0.311908.
episode: 7171   score: 385.0  epsilon: 1.0    steps: 480  evaluation reward: 444.95
episode: 7172   score: 705.0  epsilon: 1.0    steps: 920  evaluation reward: 444.0
episode: 7173   score: 775.0  epsilon: 1.0    steps: 1008  evaluation reward: 444.8
Training network. lr: 0.000091. clip: 0.036419
Iteration 20707: Policy loss: 0.304352. Value loss: 0.246083. Entropy: 0.291269.
Iteration 20708: Policy loss: 0.292818. Value loss: 0.094704. Entropy: 0.288947.
Iteration 20709: Policy loss: 0.295617. Value loss: 0.065373. Entropy: 0.289343.
Training network. lr: 0.000091. clip: 0.036419
Iteration 20710: Policy loss: 0.253533. Value loss: 0.142501. Entropy: 0.299433.
Iteration 20711: Policy loss: 0.243353. Value loss: 0.045673. Entropy: 0.298348.
Iteration 20712: Policy loss: 0.247099. Value loss: 0.031018. Entropy: 0.298449.
Training network. lr: 0.000091. clip: 0.036419
Iteration 20713: Policy loss: 0.419945. V

Iteration 20774: Policy loss: 0.079228. Value loss: 0.030316. Entropy: 0.305921.
Iteration 20775: Policy loss: 0.077682. Value loss: 0.019878. Entropy: 0.307159.
episode: 7190   score: 345.0  epsilon: 1.0    steps: 112  evaluation reward: 454.45
episode: 7191   score: 620.0  epsilon: 1.0    steps: 656  evaluation reward: 457.25
Training network. lr: 0.000091. clip: 0.036262
Iteration 20776: Policy loss: 0.048410. Value loss: 0.105586. Entropy: 0.279664.
Iteration 20777: Policy loss: 0.045144. Value loss: 0.051775. Entropy: 0.280795.
Iteration 20778: Policy loss: 0.039347. Value loss: 0.037946. Entropy: 0.281000.
episode: 7192   score: 315.0  epsilon: 1.0    steps: 360  evaluation reward: 454.2
episode: 7193   score: 240.0  epsilon: 1.0    steps: 424  evaluation reward: 454.75
episode: 7194   score: 695.0  epsilon: 1.0    steps: 1000  evaluation reward: 458.1
Training network. lr: 0.000091. clip: 0.036262
Iteration 20779: Policy loss: 0.111858. Value loss: 0.136022. Entropy: 0.290806.
I

Iteration 20840: Policy loss: -0.283714. Value loss: 0.181634. Entropy: 0.301911.
Iteration 20841: Policy loss: -0.284962. Value loss: 0.127050. Entropy: 0.302302.
Training network. lr: 0.000090. clip: 0.036105
Iteration 20842: Policy loss: 0.086907. Value loss: 0.137245. Entropy: 0.301442.
Iteration 20843: Policy loss: 0.089432. Value loss: 0.060693. Entropy: 0.301518.
Iteration 20844: Policy loss: 0.091821. Value loss: 0.039914. Entropy: 0.301829.
Training network. lr: 0.000090. clip: 0.036105
Iteration 20845: Policy loss: -0.308974. Value loss: 0.366373. Entropy: 0.306984.
Iteration 20846: Policy loss: -0.306549. Value loss: 0.174226. Entropy: 0.307440.
Iteration 20847: Policy loss: -0.304615. Value loss: 0.115449. Entropy: 0.309583.
Training network. lr: 0.000090. clip: 0.036105
Iteration 20848: Policy loss: 0.162651. Value loss: 0.233931. Entropy: 0.308955.
Iteration 20849: Policy loss: 0.164812. Value loss: 0.087158. Entropy: 0.309363.
Iteration 20850: Policy loss: 0.164849. Valu

Iteration 20908: Policy loss: -0.031506. Value loss: 0.123444. Entropy: 0.295913.
Iteration 20909: Policy loss: -0.036726. Value loss: 0.057807. Entropy: 0.295601.
Iteration 20910: Policy loss: -0.040737. Value loss: 0.041876. Entropy: 0.295207.
episode: 7230   score: 850.0  epsilon: 1.0    steps: 8  evaluation reward: 498.8
episode: 7231   score: 410.0  epsilon: 1.0    steps: 736  evaluation reward: 499.45
Training network. lr: 0.000090. clip: 0.035801
Iteration 20911: Policy loss: 0.125936. Value loss: 0.083311. Entropy: 0.286720.
Iteration 20912: Policy loss: 0.121149. Value loss: 0.037018. Entropy: 0.284145.
Iteration 20913: Policy loss: 0.118173. Value loss: 0.026856. Entropy: 0.284702.
Training network. lr: 0.000090. clip: 0.035801
Iteration 20914: Policy loss: -0.169917. Value loss: 0.324397. Entropy: 0.305902.
Iteration 20915: Policy loss: -0.171237. Value loss: 0.204847. Entropy: 0.304410.
Iteration 20916: Policy loss: -0.158298. Value loss: 0.138615. Entropy: 0.306050.
episod

Training network. lr: 0.000089. clip: 0.035645
Iteration 20977: Policy loss: 0.237380. Value loss: 0.219370. Entropy: 0.302447.
Iteration 20978: Policy loss: 0.235221. Value loss: 0.068062. Entropy: 0.304024.
Iteration 20979: Policy loss: 0.229897. Value loss: 0.043427. Entropy: 0.302988.
episode: 7249   score: 330.0  epsilon: 1.0    steps: 624  evaluation reward: 514.8
Training network. lr: 0.000089. clip: 0.035645
Iteration 20980: Policy loss: 0.014222. Value loss: 0.262939. Entropy: 0.294589.
Iteration 20981: Policy loss: 0.018918. Value loss: 0.099297. Entropy: 0.295650.
Iteration 20982: Policy loss: -0.022420. Value loss: 0.050466. Entropy: 0.296638.
Training network. lr: 0.000089. clip: 0.035645
Iteration 20983: Policy loss: -0.071145. Value loss: 0.149020. Entropy: 0.305152.
Iteration 20984: Policy loss: -0.072696. Value loss: 0.056152. Entropy: 0.304773.
Iteration 20985: Policy loss: -0.078461. Value loss: 0.041555. Entropy: 0.303900.
episode: 7250   score: 530.0  epsilon: 1.0 

Iteration 21045: Policy loss: 0.030537. Value loss: 0.042639. Entropy: 0.294061.
Training network. lr: 0.000089. clip: 0.035497
Iteration 21046: Policy loss: 0.186272. Value loss: 0.105470. Entropy: 0.312320.
Iteration 21047: Policy loss: 0.182829. Value loss: 0.041799. Entropy: 0.309711.
Iteration 21048: Policy loss: 0.179933. Value loss: 0.030764. Entropy: 0.310950.
episode: 7268   score: 345.0  epsilon: 1.0    steps: 176  evaluation reward: 515.9
episode: 7269   score: 1000.0  epsilon: 1.0    steps: 792  evaluation reward: 521.1
Training network. lr: 0.000089. clip: 0.035497
Iteration 21049: Policy loss: 0.053880. Value loss: 0.087516. Entropy: 0.301138.
Iteration 21050: Policy loss: 0.059106. Value loss: 0.038657. Entropy: 0.300476.
Iteration 21051: Policy loss: 0.052452. Value loss: 0.025470. Entropy: 0.301341.
Training network. lr: 0.000088. clip: 0.035341
Iteration 21052: Policy loss: -0.264555. Value loss: 0.316373. Entropy: 0.299083.
Iteration 21053: Policy loss: -0.268201. Va

Training network. lr: 0.000088. clip: 0.035184
Iteration 21112: Policy loss: -0.109084. Value loss: 0.119788. Entropy: 0.312068.
Iteration 21113: Policy loss: -0.121756. Value loss: 0.072520. Entropy: 0.312411.
Iteration 21114: Policy loss: -0.128650. Value loss: 0.055786. Entropy: 0.312024.
Training network. lr: 0.000088. clip: 0.035184
Iteration 21115: Policy loss: 0.134583. Value loss: 0.195383. Entropy: 0.303009.
Iteration 21116: Policy loss: 0.131238. Value loss: 0.062273. Entropy: 0.302146.
Iteration 21117: Policy loss: 0.125293. Value loss: 0.042746. Entropy: 0.302685.
episode: 7289   score: 605.0  epsilon: 1.0    steps: 168  evaluation reward: 507.1
episode: 7290   score: 580.0  epsilon: 1.0    steps: 256  evaluation reward: 509.45
Training network. lr: 0.000088. clip: 0.035184
Iteration 21118: Policy loss: 0.229796. Value loss: 0.132829. Entropy: 0.287899.
Iteration 21119: Policy loss: 0.227960. Value loss: 0.044612. Entropy: 0.286372.
Iteration 21120: Policy loss: 0.222032. V

Iteration 21180: Policy loss: 0.114073. Value loss: 0.051064. Entropy: 0.312254.
episode: 7308   score: 350.0  epsilon: 1.0    steps: 472  evaluation reward: 504.3
episode: 7309   score: 320.0  epsilon: 1.0    steps: 824  evaluation reward: 504.05
Training network. lr: 0.000088. clip: 0.035036
Iteration 21181: Policy loss: 0.186868. Value loss: 0.120358. Entropy: 0.286683.
Iteration 21182: Policy loss: 0.189553. Value loss: 0.041884. Entropy: 0.287280.
Iteration 21183: Policy loss: 0.182471. Value loss: 0.025163. Entropy: 0.286348.
Training network. lr: 0.000088. clip: 0.035036
Iteration 21184: Policy loss: -0.548666. Value loss: 0.330166. Entropy: 0.311566.
Iteration 21185: Policy loss: -0.551332. Value loss: 0.166788. Entropy: 0.311682.
Iteration 21186: Policy loss: -0.558388. Value loss: 0.094888. Entropy: 0.312086.
episode: 7310   score: 450.0  epsilon: 1.0    steps: 240  evaluation reward: 505.35
episode: 7311   score: 395.0  epsilon: 1.0    steps: 960  evaluation reward: 504.6
Tr

Iteration 21246: Policy loss: 0.257961. Value loss: 0.033949. Entropy: 0.292006.
episode: 7330   score: 290.0  epsilon: 1.0    steps: 408  evaluation reward: 489.35
Training network. lr: 0.000087. clip: 0.034880
Iteration 21247: Policy loss: 0.142816. Value loss: 0.088578. Entropy: 0.286909.
Iteration 21248: Policy loss: 0.141144. Value loss: 0.042949. Entropy: 0.286066.
Iteration 21249: Policy loss: 0.141362. Value loss: 0.032773. Entropy: 0.286774.
episode: 7331   score: 535.0  epsilon: 1.0    steps: 280  evaluation reward: 490.6
episode: 7332   score: 260.0  epsilon: 1.0    steps: 1008  evaluation reward: 485.8
Training network. lr: 0.000087. clip: 0.034880
Iteration 21250: Policy loss: 0.010064. Value loss: 0.065829. Entropy: 0.293754.
Iteration 21251: Policy loss: 0.004412. Value loss: 0.038662. Entropy: 0.294241.
Iteration 21252: Policy loss: 0.005592. Value loss: 0.030896. Entropy: 0.294581.
Training network. lr: 0.000087. clip: 0.034723
Iteration 21253: Policy loss: -0.030556. 

Iteration 21315: Policy loss: -0.699032. Value loss: 0.181099. Entropy: 0.313344.
episode: 7348   score: 670.0  epsilon: 1.0    steps: 112  evaluation reward: 476.8
episode: 7349   score: 1200.0  epsilon: 1.0    steps: 320  evaluation reward: 485.5
episode: 7350   score: 390.0  epsilon: 1.0    steps: 728  evaluation reward: 484.1
now time :  2019-09-06 12:13:41.529406
episode: 7351   score: 650.0  epsilon: 1.0    steps: 1016  evaluation reward: 486.1
Training network. lr: 0.000086. clip: 0.034576
Iteration 21316: Policy loss: 0.169411. Value loss: 0.124701. Entropy: 0.279749.
Iteration 21317: Policy loss: 0.159000. Value loss: 0.069542. Entropy: 0.278492.
Iteration 21318: Policy loss: 0.161020. Value loss: 0.047356. Entropy: 0.278096.
episode: 7352   score: 605.0  epsilon: 1.0    steps: 200  evaluation reward: 488.25
episode: 7353   score: 750.0  epsilon: 1.0    steps: 264  evaluation reward: 489.05
Training network. lr: 0.000086. clip: 0.034576
Iteration 21319: Policy loss: 0.008681. 

Iteration 21381: Policy loss: -0.039286. Value loss: 0.044380. Entropy: 0.296407.
episode: 7369   score: 595.0  epsilon: 1.0    steps: 544  evaluation reward: 491.95
episode: 7370   score: 545.0  epsilon: 1.0    steps: 792  evaluation reward: 493.95
episode: 7371   score: 485.0  epsilon: 1.0    steps: 808  evaluation reward: 492.1
Training network. lr: 0.000086. clip: 0.034419
Iteration 21382: Policy loss: 0.290732. Value loss: 0.201298. Entropy: 0.281008.
Iteration 21383: Policy loss: 0.286918. Value loss: 0.065544. Entropy: 0.279478.
Iteration 21384: Policy loss: 0.279547. Value loss: 0.052298. Entropy: 0.280052.
episode: 7372   score: 695.0  epsilon: 1.0    steps: 96  evaluation reward: 493.35
episode: 7373   score: 345.0  epsilon: 1.0    steps: 352  evaluation reward: 493.9
Training network. lr: 0.000086. clip: 0.034419
Iteration 21385: Policy loss: -0.032910. Value loss: 0.176306. Entropy: 0.276202.
Iteration 21386: Policy loss: -0.029981. Value loss: 0.078870. Entropy: 0.275970.


Training network. lr: 0.000086. clip: 0.034262
Iteration 21448: Policy loss: 0.135193. Value loss: 0.149941. Entropy: 0.290755.
Iteration 21449: Policy loss: 0.139241. Value loss: 0.069731. Entropy: 0.290713.
Iteration 21450: Policy loss: 0.132942. Value loss: 0.049753. Entropy: 0.290606.
episode: 7390   score: 220.0  epsilon: 1.0    steps: 592  evaluation reward: 504.9
episode: 7391   score: 580.0  epsilon: 1.0    steps: 992  evaluation reward: 503.1
Training network. lr: 0.000085. clip: 0.034115
Iteration 21451: Policy loss: 0.243366. Value loss: 0.253280. Entropy: 0.287485.
Iteration 21452: Policy loss: 0.232031. Value loss: 0.117064. Entropy: 0.287294.
Iteration 21453: Policy loss: 0.224716. Value loss: 0.074830. Entropy: 0.288174.
Training network. lr: 0.000085. clip: 0.034115
Iteration 21454: Policy loss: -0.059419. Value loss: 0.143175. Entropy: 0.286633.
Iteration 21455: Policy loss: -0.072656. Value loss: 0.061790. Entropy: 0.287850.
Iteration 21456: Policy loss: -0.071137. Va

episode: 7408   score: 425.0  epsilon: 1.0    steps: 168  evaluation reward: 500.0
Training network. lr: 0.000085. clip: 0.033958
Iteration 21517: Policy loss: 0.010078. Value loss: 0.097267. Entropy: 0.304921.
Iteration 21518: Policy loss: 0.009936. Value loss: 0.037862. Entropy: 0.304198.
Iteration 21519: Policy loss: 0.005803. Value loss: 0.028630. Entropy: 0.303631.
Training network. lr: 0.000085. clip: 0.033958
Iteration 21520: Policy loss: 0.124930. Value loss: 0.166508. Entropy: 0.304696.
Iteration 21521: Policy loss: 0.127222. Value loss: 0.071764. Entropy: 0.305524.
Iteration 21522: Policy loss: 0.120294. Value loss: 0.053877. Entropy: 0.305054.
episode: 7409   score: 355.0  epsilon: 1.0    steps: 48  evaluation reward: 500.35
episode: 7410   score: 445.0  epsilon: 1.0    steps: 352  evaluation reward: 500.3
Training network. lr: 0.000085. clip: 0.033958
Iteration 21523: Policy loss: 0.153752. Value loss: 0.074034. Entropy: 0.281070.
Iteration 21524: Policy loss: 0.150128. Val

Iteration 21583: Policy loss: 0.145985. Value loss: 0.115357. Entropy: 0.300125.
Iteration 21584: Policy loss: 0.139494. Value loss: 0.051107. Entropy: 0.301756.
Iteration 21585: Policy loss: 0.136611. Value loss: 0.035508. Entropy: 0.300546.
episode: 7429   score: 985.0  epsilon: 1.0    steps: 192  evaluation reward: 503.1
episode: 7430   score: 670.0  epsilon: 1.0    steps: 432  evaluation reward: 506.9
Training network. lr: 0.000085. clip: 0.033801
Iteration 21586: Policy loss: -0.189468. Value loss: 0.375829. Entropy: 0.284485.
Iteration 21587: Policy loss: -0.208477. Value loss: 0.228196. Entropy: 0.283886.
Iteration 21588: Policy loss: -0.220033. Value loss: 0.175778. Entropy: 0.283169.
Training network. lr: 0.000085. clip: 0.033801
Iteration 21589: Policy loss: 0.074810. Value loss: 0.119932. Entropy: 0.305719.
Iteration 21590: Policy loss: 0.066897. Value loss: 0.046963. Entropy: 0.305485.
Iteration 21591: Policy loss: 0.072961. Value loss: 0.032517. Entropy: 0.305808.
Training

Iteration 21651: Policy loss: 0.285915. Value loss: 0.034711. Entropy: 0.306000.
episode: 7449   score: 685.0  epsilon: 1.0    steps: 352  evaluation reward: 501.6
episode: 7450   score: 410.0  epsilon: 1.0    steps: 576  evaluation reward: 501.8
Training network. lr: 0.000084. clip: 0.033497
Iteration 21652: Policy loss: 0.066833. Value loss: 0.051762. Entropy: 0.290195.
Iteration 21653: Policy loss: 0.067673. Value loss: 0.030031. Entropy: 0.289928.
Iteration 21654: Policy loss: 0.065934. Value loss: 0.024990. Entropy: 0.289588.
Training network. lr: 0.000084. clip: 0.033497
Iteration 21655: Policy loss: -0.268488. Value loss: 0.268866. Entropy: 0.307734.
Iteration 21656: Policy loss: -0.272231. Value loss: 0.172833. Entropy: 0.308102.
Iteration 21657: Policy loss: -0.278530. Value loss: 0.139914. Entropy: 0.308456.
now time :  2019-09-06 12:34:33.097809
episode: 7451   score: 475.0  epsilon: 1.0    steps: 816  evaluation reward: 500.05
Training network. lr: 0.000084. clip: 0.033497


Iteration 21718: Policy loss: -0.132319. Value loss: 0.195609. Entropy: 0.316237.
Iteration 21719: Policy loss: -0.133989. Value loss: 0.079747. Entropy: 0.314768.
Iteration 21720: Policy loss: -0.140491. Value loss: 0.054971. Entropy: 0.314892.
episode: 7469   score: 345.0  epsilon: 1.0    steps: 264  evaluation reward: 491.1
Training network. lr: 0.000083. clip: 0.033341
Iteration 21721: Policy loss: -0.047256. Value loss: 0.084479. Entropy: 0.294761.
Iteration 21722: Policy loss: -0.051432. Value loss: 0.032897. Entropy: 0.296175.
Iteration 21723: Policy loss: -0.054894. Value loss: 0.022002. Entropy: 0.294635.
Training network. lr: 0.000083. clip: 0.033341
Iteration 21724: Policy loss: 0.855454. Value loss: 0.243649. Entropy: 0.306563.
Iteration 21725: Policy loss: 0.852798. Value loss: 0.068948. Entropy: 0.306759.
Iteration 21726: Policy loss: 0.837635. Value loss: 0.040150. Entropy: 0.307379.
episode: 7470   score: 755.0  epsilon: 1.0    steps: 592  evaluation reward: 493.2
Train

Iteration 21785: Policy loss: 0.069466. Value loss: 0.057167. Entropy: 0.295042.
Iteration 21786: Policy loss: 0.072887. Value loss: 0.036725. Entropy: 0.293999.
Training network. lr: 0.000083. clip: 0.033193
Iteration 21787: Policy loss: -0.157091. Value loss: 0.290118. Entropy: 0.290118.
Iteration 21788: Policy loss: -0.156280. Value loss: 0.147581. Entropy: 0.289903.
Iteration 21789: Policy loss: -0.161962. Value loss: 0.085225. Entropy: 0.288681.
Training network. lr: 0.000083. clip: 0.033193
Iteration 21790: Policy loss: -0.259782. Value loss: 0.194593. Entropy: 0.302833.
Iteration 21791: Policy loss: -0.239142. Value loss: 0.082799. Entropy: 0.303106.
Iteration 21792: Policy loss: -0.262815. Value loss: 0.059257. Entropy: 0.303354.
Training network. lr: 0.000083. clip: 0.033193
Iteration 21793: Policy loss: -0.182233. Value loss: 0.151592. Entropy: 0.308917.
Iteration 21794: Policy loss: -0.185447. Value loss: 0.054322. Entropy: 0.309478.
Iteration 21795: Policy loss: -0.188787. 

Iteration 21856: Policy loss: 0.088594. Value loss: 0.176507. Entropy: 0.312744.
Iteration 21857: Policy loss: 0.088783. Value loss: 0.084290. Entropy: 0.312461.
Iteration 21858: Policy loss: 0.083117. Value loss: 0.058071. Entropy: 0.311714.
episode: 7506   score: 755.0  epsilon: 1.0    steps: 296  evaluation reward: 497.65
Training network. lr: 0.000082. clip: 0.032880
Iteration 21859: Policy loss: 0.174418. Value loss: 0.335853. Entropy: 0.306502.
Iteration 21860: Policy loss: 0.169637. Value loss: 0.124823. Entropy: 0.306691.
Iteration 21861: Policy loss: 0.164170. Value loss: 0.056948. Entropy: 0.307774.
episode: 7507   score: 820.0  epsilon: 1.0    steps: 264  evaluation reward: 498.25
episode: 7508   score: 390.0  epsilon: 1.0    steps: 728  evaluation reward: 497.9
episode: 7509   score: 585.0  epsilon: 1.0    steps: 848  evaluation reward: 500.2
Training network. lr: 0.000082. clip: 0.032880
Iteration 21862: Policy loss: 0.025961. Value loss: 0.160177. Entropy: 0.297209.
Itera

Iteration 21924: Policy loss: -0.165999. Value loss: 0.130046. Entropy: 0.290384.
Training network. lr: 0.000082. clip: 0.032732
Iteration 21925: Policy loss: 0.122892. Value loss: 0.107591. Entropy: 0.300658.
Iteration 21926: Policy loss: 0.118144. Value loss: 0.046003. Entropy: 0.300401.
Iteration 21927: Policy loss: 0.115054. Value loss: 0.029500. Entropy: 0.299266.
episode: 7526   score: 525.0  epsilon: 1.0    steps: 168  evaluation reward: 519.7
episode: 7527   score: 330.0  epsilon: 1.0    steps: 184  evaluation reward: 518.85
episode: 7528   score: 470.0  epsilon: 1.0    steps: 912  evaluation reward: 518.25
Training network. lr: 0.000082. clip: 0.032732
Iteration 21928: Policy loss: 0.036645. Value loss: 0.127131. Entropy: 0.290729.
Iteration 21929: Policy loss: 0.035697. Value loss: 0.057006. Entropy: 0.290739.
Iteration 21930: Policy loss: 0.033226. Value loss: 0.042281. Entropy: 0.289898.
episode: 7529   score: 335.0  epsilon: 1.0    steps: 888  evaluation reward: 511.75
Tra

Iteration 21992: Policy loss: 0.002064. Value loss: 0.048381. Entropy: 0.286894.
Iteration 21993: Policy loss: -0.010420. Value loss: 0.037268. Entropy: 0.285026.
episode: 7545   score: 590.0  epsilon: 1.0    steps: 96  evaluation reward: 522.15
Training network. lr: 0.000081. clip: 0.032576
Iteration 21994: Policy loss: -0.483542. Value loss: 0.244811. Entropy: 0.297065.
Iteration 21995: Policy loss: -0.490404. Value loss: 0.111685. Entropy: 0.299283.
Iteration 21996: Policy loss: -0.492357. Value loss: 0.068613. Entropy: 0.298102.
Training network. lr: 0.000081. clip: 0.032576
Iteration 21997: Policy loss: 0.660747. Value loss: 0.246670. Entropy: 0.305728.
Iteration 21998: Policy loss: 0.672051. Value loss: 0.065187. Entropy: 0.306661.
Iteration 21999: Policy loss: 0.654435. Value loss: 0.040080. Entropy: 0.305622.
Training network. lr: 0.000081. clip: 0.032576
Iteration 22000: Policy loss: 0.082799. Value loss: 0.111819. Entropy: 0.315453.
Iteration 22001: Policy loss: 0.075495. Val

Training network. lr: 0.000081. clip: 0.032272
Iteration 22060: Policy loss: -0.086945. Value loss: 0.115924. Entropy: 0.291431.
Iteration 22061: Policy loss: -0.080778. Value loss: 0.057611. Entropy: 0.289371.
Iteration 22062: Policy loss: -0.087568. Value loss: 0.042141. Entropy: 0.288550.
Training network. lr: 0.000081. clip: 0.032272
Iteration 22063: Policy loss: -0.003481. Value loss: 0.191923. Entropy: 0.300529.
Iteration 22064: Policy loss: -0.017358. Value loss: 0.080988. Entropy: 0.298929.
Iteration 22065: Policy loss: -0.015315. Value loss: 0.056644. Entropy: 0.297531.
episode: 7565   score: 420.0  epsilon: 1.0    steps: 728  evaluation reward: 532.3
episode: 7566   score: 730.0  epsilon: 1.0    steps: 784  evaluation reward: 534.75
Training network. lr: 0.000081. clip: 0.032272
Iteration 22066: Policy loss: 0.254074. Value loss: 0.180597. Entropy: 0.291831.
Iteration 22067: Policy loss: 0.238102. Value loss: 0.094500. Entropy: 0.291976.
Iteration 22068: Policy loss: 0.236842

Iteration 22128: Policy loss: 0.343928. Value loss: 0.056424. Entropy: 0.307287.
episode: 7584   score: 395.0  epsilon: 1.0    steps: 368  evaluation reward: 539.0
Training network. lr: 0.000080. clip: 0.032115
Iteration 22129: Policy loss: 0.120970. Value loss: 0.114859. Entropy: 0.282875.
Iteration 22130: Policy loss: 0.116071. Value loss: 0.047521. Entropy: 0.284765.
Iteration 22131: Policy loss: 0.117405. Value loss: 0.036166. Entropy: 0.284111.
Training network. lr: 0.000080. clip: 0.032115
Iteration 22132: Policy loss: 0.098119. Value loss: 0.080476. Entropy: 0.308333.
Iteration 22133: Policy loss: 0.091473. Value loss: 0.040617. Entropy: 0.308861.
Iteration 22134: Policy loss: 0.092692. Value loss: 0.030955. Entropy: 0.308169.
Training network. lr: 0.000080. clip: 0.032115
Iteration 22135: Policy loss: -0.060115. Value loss: 0.110754. Entropy: 0.306931.
Iteration 22136: Policy loss: -0.065720. Value loss: 0.053830. Entropy: 0.306150.
Iteration 22137: Policy loss: -0.066319. Valu

Iteration 22196: Policy loss: -0.073798. Value loss: 0.054692. Entropy: 0.298252.
Iteration 22197: Policy loss: -0.075381. Value loss: 0.037472. Entropy: 0.299681.
Training network. lr: 0.000080. clip: 0.031958
Iteration 22198: Policy loss: 0.090128. Value loss: 0.201850. Entropy: 0.311164.
Iteration 22199: Policy loss: 0.095594. Value loss: 0.084630. Entropy: 0.311263.
Iteration 22200: Policy loss: 0.094653. Value loss: 0.068946. Entropy: 0.310372.
Training network. lr: 0.000080. clip: 0.031811
Iteration 22201: Policy loss: 0.039884. Value loss: 0.201006. Entropy: 0.304897.
Iteration 22202: Policy loss: 0.042256. Value loss: 0.111139. Entropy: 0.304530.
Iteration 22203: Policy loss: 0.035508. Value loss: 0.077180. Entropy: 0.303224.
episode: 7603   score: 470.0  epsilon: 1.0    steps: 288  evaluation reward: 533.2
episode: 7604   score: 425.0  epsilon: 1.0    steps: 640  evaluation reward: 531.4
Training network. lr: 0.000080. clip: 0.031811
Iteration 22204: Policy loss: 0.198656. Val

Iteration 22262: Policy loss: -0.285083. Value loss: 0.185865. Entropy: 0.294000.
Iteration 22263: Policy loss: -0.283664. Value loss: 0.159291. Entropy: 0.295757.
Training network. lr: 0.000079. clip: 0.031654
Iteration 22264: Policy loss: 0.107994. Value loss: 0.138625. Entropy: 0.299679.
Iteration 22265: Policy loss: 0.110035. Value loss: 0.072218. Entropy: 0.299518.
Iteration 22266: Policy loss: 0.100987. Value loss: 0.048625. Entropy: 0.298567.
Training network. lr: 0.000079. clip: 0.031654
Iteration 22267: Policy loss: 0.033049. Value loss: 0.084462. Entropy: 0.310041.
Iteration 22268: Policy loss: 0.026639. Value loss: 0.036771. Entropy: 0.308373.
Iteration 22269: Policy loss: 0.035601. Value loss: 0.026966. Entropy: 0.308483.
episode: 7625   score: 385.0  epsilon: 1.0    steps: 936  evaluation reward: 502.95
Training network. lr: 0.000079. clip: 0.031654
Iteration 22270: Policy loss: 0.048207. Value loss: 0.099392. Entropy: 0.308247.
Iteration 22271: Policy loss: 0.049247. Valu

Iteration 22330: Policy loss: -0.104370. Value loss: 0.130549. Entropy: 0.297067.
Iteration 22331: Policy loss: -0.110784. Value loss: 0.049342. Entropy: 0.296466.
Iteration 22332: Policy loss: -0.117409. Value loss: 0.033732. Entropy: 0.297003.
Training network. lr: 0.000079. clip: 0.031497
Iteration 22333: Policy loss: -0.181539. Value loss: 0.316406. Entropy: 0.314602.
Iteration 22334: Policy loss: -0.180343. Value loss: 0.170365. Entropy: 0.313804.
Iteration 22335: Policy loss: -0.176515. Value loss: 0.064782. Entropy: 0.313012.
episode: 7644   score: 485.0  epsilon: 1.0    steps: 384  evaluation reward: 498.4
Training network. lr: 0.000079. clip: 0.031497
Iteration 22336: Policy loss: -0.002084. Value loss: 0.336619. Entropy: 0.295829.
Iteration 22337: Policy loss: 0.007708. Value loss: 0.168209. Entropy: 0.295847.
Iteration 22338: Policy loss: -0.011265. Value loss: 0.116301. Entropy: 0.295482.
Training network. lr: 0.000079. clip: 0.031497
Iteration 22339: Policy loss: 0.034087.

Iteration 22399: Policy loss: 0.123215. Value loss: 0.068096. Entropy: 0.315035.
Iteration 22400: Policy loss: 0.119096. Value loss: 0.039402. Entropy: 0.315262.
Iteration 22401: Policy loss: 0.118052. Value loss: 0.026869. Entropy: 0.314516.
Training network. lr: 0.000078. clip: 0.031193
Iteration 22402: Policy loss: 0.035153. Value loss: 0.060428. Entropy: 0.311156.
Iteration 22403: Policy loss: 0.031293. Value loss: 0.029572. Entropy: 0.311925.
Iteration 22404: Policy loss: 0.032765. Value loss: 0.021480. Entropy: 0.310705.
episode: 7662   score: 265.0  epsilon: 1.0    steps: 768  evaluation reward: 501.25
Training network. lr: 0.000078. clip: 0.031193
Iteration 22405: Policy loss: -0.396152. Value loss: 0.368434. Entropy: 0.303018.
Iteration 22406: Policy loss: -0.403164. Value loss: 0.234306. Entropy: 0.303735.
Iteration 22407: Policy loss: -0.395975. Value loss: 0.180662. Entropy: 0.303864.
episode: 7663   score: 620.0  epsilon: 1.0    steps: 416  evaluation reward: 501.4
episode

Iteration 22466: Policy loss: 0.155391. Value loss: 0.053643. Entropy: 0.292960.
Iteration 22467: Policy loss: 0.150427. Value loss: 0.040266. Entropy: 0.290949.
episode: 7683   score: 595.0  epsilon: 1.0    steps: 336  evaluation reward: 472.7
episode: 7684   score: 420.0  epsilon: 1.0    steps: 880  evaluation reward: 472.95
Training network. lr: 0.000078. clip: 0.031037
Iteration 22468: Policy loss: -0.135768. Value loss: 0.273305. Entropy: 0.290382.
Iteration 22469: Policy loss: -0.130168. Value loss: 0.099671. Entropy: 0.290364.
Iteration 22470: Policy loss: -0.149736. Value loss: 0.070893. Entropy: 0.289465.
Training network. lr: 0.000078. clip: 0.031037
Iteration 22471: Policy loss: -0.046754. Value loss: 0.209088. Entropy: 0.304267.
Iteration 22472: Policy loss: -0.050781. Value loss: 0.072356. Entropy: 0.305354.
Iteration 22473: Policy loss: -0.060518. Value loss: 0.047311. Entropy: 0.304634.
episode: 7685   score: 930.0  epsilon: 1.0    steps: 464  evaluation reward: 478.95
e

Training network. lr: 0.000077. clip: 0.030889
Iteration 22534: Policy loss: 0.239463. Value loss: 0.116008. Entropy: 0.306398.
Iteration 22535: Policy loss: 0.234212. Value loss: 0.056058. Entropy: 0.306004.
Iteration 22536: Policy loss: 0.233496. Value loss: 0.040803. Entropy: 0.305598.
episode: 7703   score: 450.0  epsilon: 1.0    steps: 656  evaluation reward: 475.4
Training network. lr: 0.000077. clip: 0.030889
Iteration 22537: Policy loss: 0.429819. Value loss: 0.174958. Entropy: 0.298722.
Iteration 22538: Policy loss: 0.437832. Value loss: 0.080669. Entropy: 0.298145.
Iteration 22539: Policy loss: 0.417410. Value loss: 0.058986. Entropy: 0.297986.
episode: 7704   score: 495.0  epsilon: 1.0    steps: 768  evaluation reward: 476.1
Training network. lr: 0.000077. clip: 0.030889
Iteration 22540: Policy loss: 0.084177. Value loss: 0.143619. Entropy: 0.303138.
Iteration 22541: Policy loss: 0.092655. Value loss: 0.078896. Entropy: 0.301861.
Iteration 22542: Policy loss: 0.085041. Value

Iteration 22602: Policy loss: 0.143711. Value loss: 0.039770. Entropy: 0.308504.
Training network. lr: 0.000076. clip: 0.030576
Iteration 22603: Policy loss: 0.208703. Value loss: 0.169661. Entropy: 0.306465.
Iteration 22604: Policy loss: 0.205095. Value loss: 0.073171. Entropy: 0.305851.
Iteration 22605: Policy loss: 0.213865. Value loss: 0.045100. Entropy: 0.305650.
episode: 7722   score: 185.0  epsilon: 1.0    steps: 64  evaluation reward: 476.85
Training network. lr: 0.000076. clip: 0.030576
Iteration 22606: Policy loss: -0.065908. Value loss: 0.270470. Entropy: 0.298646.
Iteration 22607: Policy loss: -0.068302. Value loss: 0.186406. Entropy: 0.298772.
Iteration 22608: Policy loss: -0.089378. Value loss: 0.161461. Entropy: 0.297888.
episode: 7723   score: 435.0  epsilon: 1.0    steps: 360  evaluation reward: 476.5
episode: 7724   score: 560.0  epsilon: 1.0    steps: 1008  evaluation reward: 478.65
Training network. lr: 0.000076. clip: 0.030576
Iteration 22609: Policy loss: 0.097067

episode: 7743   score: 560.0  epsilon: 1.0    steps: 872  evaluation reward: 474.8
Training network. lr: 0.000076. clip: 0.030428
Iteration 22669: Policy loss: -0.015279. Value loss: 0.167451. Entropy: 0.299707.
Iteration 22670: Policy loss: -0.017090. Value loss: 0.051529. Entropy: 0.299021.
Iteration 22671: Policy loss: -0.015595. Value loss: 0.032572. Entropy: 0.298515.
Training network. lr: 0.000076. clip: 0.030428
Iteration 22672: Policy loss: -0.070822. Value loss: 0.221725. Entropy: 0.304174.
Iteration 22673: Policy loss: -0.065830. Value loss: 0.089177. Entropy: 0.304027.
Iteration 22674: Policy loss: -0.072377. Value loss: 0.068390. Entropy: 0.303858.
Training network. lr: 0.000076. clip: 0.030428
Iteration 22675: Policy loss: -0.742796. Value loss: 0.531033. Entropy: 0.306377.
Iteration 22676: Policy loss: -0.741571. Value loss: 0.272618. Entropy: 0.306561.
Iteration 22677: Policy loss: -0.754975. Value loss: 0.182070. Entropy: 0.307021.
episode: 7744   score: 335.0  epsilon:

Iteration 22736: Policy loss: -0.274778. Value loss: 0.146103. Entropy: 0.282118.
Iteration 22737: Policy loss: -0.272945. Value loss: 0.078506. Entropy: 0.280706.
Training network. lr: 0.000076. clip: 0.030272
Iteration 22738: Policy loss: 0.200230. Value loss: 0.118639. Entropy: 0.307982.
Iteration 22739: Policy loss: 0.196631. Value loss: 0.044503. Entropy: 0.305373.
Iteration 22740: Policy loss: 0.186230. Value loss: 0.033451. Entropy: 0.305510.
Training network. lr: 0.000076. clip: 0.030272
Iteration 22741: Policy loss: 0.172804. Value loss: 0.142185. Entropy: 0.303478.
Iteration 22742: Policy loss: 0.164350. Value loss: 0.047363. Entropy: 0.302317.
Iteration 22743: Policy loss: 0.155015. Value loss: 0.032252. Entropy: 0.301600.
episode: 7763   score: 395.0  epsilon: 1.0    steps: 656  evaluation reward: 470.75
Training network. lr: 0.000076. clip: 0.030272
Iteration 22744: Policy loss: -0.008453. Value loss: 0.238316. Entropy: 0.299732.
Iteration 22745: Policy loss: -0.031502. Va

Iteration 22806: Policy loss: 0.354055. Value loss: 0.070472. Entropy: 0.308377.
episode: 7780   score: 570.0  epsilon: 1.0    steps: 496  evaluation reward: 491.75
Training network. lr: 0.000075. clip: 0.029968
Iteration 22807: Policy loss: 0.055858. Value loss: 0.262876. Entropy: 0.291669.
Iteration 22808: Policy loss: 0.059353. Value loss: 0.118567. Entropy: 0.291329.
Iteration 22809: Policy loss: 0.049906. Value loss: 0.072062. Entropy: 0.290787.
episode: 7781   score: 150.0  epsilon: 1.0    steps: 704  evaluation reward: 490.55
Training network. lr: 0.000075. clip: 0.029968
Iteration 22810: Policy loss: 0.218836. Value loss: 0.174711. Entropy: 0.294385.
Iteration 22811: Policy loss: 0.215007. Value loss: 0.079858. Entropy: 0.294248.
Iteration 22812: Policy loss: 0.210963. Value loss: 0.052327. Entropy: 0.293644.
episode: 7782   score: 910.0  epsilon: 1.0    steps: 248  evaluation reward: 494.25
episode: 7783   score: 420.0  epsilon: 1.0    steps: 272  evaluation reward: 492.5
epis

now time :  2019-09-06 13:49:29.228734
episode: 7801   score: 225.0  epsilon: 1.0    steps: 768  evaluation reward: 510.95
episode: 7802   score: 530.0  epsilon: 1.0    steps: 832  evaluation reward: 509.3
Training network. lr: 0.000075. clip: 0.029811
Iteration 22873: Policy loss: 0.305242. Value loss: 0.157610. Entropy: 0.293221.
Iteration 22874: Policy loss: 0.299539. Value loss: 0.074100. Entropy: 0.293112.
Iteration 22875: Policy loss: 0.308211. Value loss: 0.054562. Entropy: 0.291987.
episode: 7803   score: 210.0  epsilon: 1.0    steps: 648  evaluation reward: 506.9
Training network. lr: 0.000075. clip: 0.029811
Iteration 22876: Policy loss: 0.013163. Value loss: 0.273743. Entropy: 0.288004.
Iteration 22877: Policy loss: 0.007829. Value loss: 0.135571. Entropy: 0.288527.
Iteration 22878: Policy loss: -0.003242. Value loss: 0.098679. Entropy: 0.287020.
Training network. lr: 0.000075. clip: 0.029811
Iteration 22879: Policy loss: 0.061742. Value loss: 0.086399. Entropy: 0.308235.
It

episode: 7819   score: 435.0  epsilon: 1.0    steps: 232  evaluation reward: 515.45
episode: 7820   score: 835.0  epsilon: 1.0    steps: 752  evaluation reward: 514.05
episode: 7821   score: 285.0  epsilon: 1.0    steps: 808  evaluation reward: 512.2
Training network. lr: 0.000074. clip: 0.029654
Iteration 22942: Policy loss: -0.211149. Value loss: 0.531657. Entropy: 0.277918.
Iteration 22943: Policy loss: -0.217743. Value loss: 0.391250. Entropy: 0.278960.
Iteration 22944: Policy loss: -0.233529. Value loss: 0.350746. Entropy: 0.278497.
episode: 7822   score: 475.0  epsilon: 1.0    steps: 192  evaluation reward: 515.1
Training network. lr: 0.000074. clip: 0.029654
Iteration 22945: Policy loss: 0.014326. Value loss: 0.058199. Entropy: 0.293591.
Iteration 22946: Policy loss: 0.011831. Value loss: 0.033789. Entropy: 0.292970.
Iteration 22947: Policy loss: 0.010501. Value loss: 0.026670. Entropy: 0.293031.
Training network. lr: 0.000074. clip: 0.029654
Iteration 22948: Policy loss: 0.0268

Iteration 23007: Policy loss: 0.117993. Value loss: 0.034147. Entropy: 0.295875.
episode: 7842   score: 245.0  epsilon: 1.0    steps: 184  evaluation reward: 499.15
episode: 7843   score: 210.0  epsilon: 1.0    steps: 264  evaluation reward: 495.65
Training network. lr: 0.000073. clip: 0.029350
Iteration 23008: Policy loss: -0.053287. Value loss: 0.068433. Entropy: 0.282683.
Iteration 23009: Policy loss: -0.054550. Value loss: 0.032807. Entropy: 0.282144.
Iteration 23010: Policy loss: -0.056121. Value loss: 0.026282. Entropy: 0.281550.
episode: 7844   score: 565.0  epsilon: 1.0    steps: 96  evaluation reward: 497.95
episode: 7845   score: 485.0  epsilon: 1.0    steps: 976  evaluation reward: 495.9
Training network. lr: 0.000073. clip: 0.029350
Iteration 23011: Policy loss: -0.489300. Value loss: 0.360835. Entropy: 0.297325.
Iteration 23012: Policy loss: -0.501536. Value loss: 0.231025. Entropy: 0.296614.
Iteration 23013: Policy loss: -0.528467. Value loss: 0.190558. Entropy: 0.296274.

Iteration 23073: Policy loss: -0.169770. Value loss: 0.024968. Entropy: 0.308698.
episode: 7863   score: 310.0  epsilon: 1.0    steps: 360  evaluation reward: 478.1
Training network. lr: 0.000073. clip: 0.029193
Iteration 23074: Policy loss: -0.167007. Value loss: 0.134378. Entropy: 0.301601.
Iteration 23075: Policy loss: -0.170916. Value loss: 0.062589. Entropy: 0.301593.
Iteration 23076: Policy loss: -0.176190. Value loss: 0.044832. Entropy: 0.300693.
episode: 7864   score: 305.0  epsilon: 1.0    steps: 112  evaluation reward: 476.05
episode: 7865   score: 335.0  epsilon: 1.0    steps: 464  evaluation reward: 475.9
Training network. lr: 0.000073. clip: 0.029193
Iteration 23077: Policy loss: -0.069744. Value loss: 0.084969. Entropy: 0.284926.
Iteration 23078: Policy loss: -0.064538. Value loss: 0.045707. Entropy: 0.284010.
Iteration 23079: Policy loss: -0.071205. Value loss: 0.036320. Entropy: 0.284790.
episode: 7866   score: 400.0  epsilon: 1.0    steps: 312  evaluation reward: 474.8

Iteration 23137: Policy loss: -0.091047. Value loss: 0.261860. Entropy: 0.292193.
Iteration 23138: Policy loss: -0.116503. Value loss: 0.094201. Entropy: 0.293053.
Iteration 23139: Policy loss: -0.127305. Value loss: 0.062029. Entropy: 0.292818.
Training network. lr: 0.000073. clip: 0.029046
Iteration 23140: Policy loss: -0.343614. Value loss: 0.228831. Entropy: 0.296412.
Iteration 23141: Policy loss: -0.336993. Value loss: 0.082813. Entropy: 0.296889.
Iteration 23142: Policy loss: -0.339456. Value loss: 0.050411. Entropy: 0.297511.
episode: 7887   score: 390.0  epsilon: 1.0    steps: 552  evaluation reward: 434.7
Training network. lr: 0.000073. clip: 0.029046
Iteration 23143: Policy loss: -0.165162. Value loss: 0.073333. Entropy: 0.296644.
Iteration 23144: Policy loss: -0.164064. Value loss: 0.035830. Entropy: 0.294524.
Iteration 23145: Policy loss: -0.168462. Value loss: 0.025901. Entropy: 0.294540.
episode: 7888   score: 515.0  epsilon: 1.0    steps: 336  evaluation reward: 436.55
e

Iteration 23203: Policy loss: 0.088893. Value loss: 0.090262. Entropy: 0.293616.
Iteration 23204: Policy loss: 0.094024. Value loss: 0.033906. Entropy: 0.293548.
Iteration 23205: Policy loss: 0.086227. Value loss: 0.024682. Entropy: 0.294107.
episode: 7908   score: 420.0  epsilon: 1.0    steps: 456  evaluation reward: 430.45
Training network. lr: 0.000072. clip: 0.028733
Iteration 23206: Policy loss: 0.189849. Value loss: 0.113602. Entropy: 0.287490.
Iteration 23207: Policy loss: 0.189615. Value loss: 0.051981. Entropy: 0.287331.
Iteration 23208: Policy loss: 0.175865. Value loss: 0.031825. Entropy: 0.287741.
Training network. lr: 0.000072. clip: 0.028733
Iteration 23209: Policy loss: -0.146974. Value loss: 0.319974. Entropy: 0.306221.
Iteration 23210: Policy loss: -0.173846. Value loss: 0.236009. Entropy: 0.305737.
Iteration 23211: Policy loss: -0.155757. Value loss: 0.184846. Entropy: 0.306937.
Training network. lr: 0.000072. clip: 0.028733
Iteration 23212: Policy loss: 0.036219. Val

Iteration 23271: Policy loss: 0.039355. Value loss: 0.032247. Entropy: 0.279228.
Training network. lr: 0.000071. clip: 0.028585
Iteration 23272: Policy loss: 0.137370. Value loss: 0.191978. Entropy: 0.308166.
Iteration 23273: Policy loss: 0.130677. Value loss: 0.068547. Entropy: 0.306573.
Iteration 23274: Policy loss: 0.141486. Value loss: 0.042997. Entropy: 0.306621.
Training network. lr: 0.000071. clip: 0.028585
Iteration 23275: Policy loss: -0.138830. Value loss: 0.111545. Entropy: 0.311662.
Iteration 23276: Policy loss: -0.140227. Value loss: 0.052141. Entropy: 0.311263.
Iteration 23277: Policy loss: -0.141828. Value loss: 0.040750. Entropy: 0.311626.
Training network. lr: 0.000071. clip: 0.028585
Iteration 23278: Policy loss: -0.091135. Value loss: 0.167325. Entropy: 0.311072.
Iteration 23279: Policy loss: -0.097177. Value loss: 0.084433. Entropy: 0.311264.
Iteration 23280: Policy loss: -0.095823. Value loss: 0.064145. Entropy: 0.311185.
episode: 7928   score: 400.0  epsilon: 1.0 

Training network. lr: 0.000071. clip: 0.028429
Iteration 23338: Policy loss: 0.009231. Value loss: 0.275491. Entropy: 0.292225.
Iteration 23339: Policy loss: -0.008692. Value loss: 0.254967. Entropy: 0.292215.
Iteration 23340: Policy loss: -0.013980. Value loss: 0.243490. Entropy: 0.291013.
Training network. lr: 0.000071. clip: 0.028429
Iteration 23341: Policy loss: -0.106026. Value loss: 0.105918. Entropy: 0.308052.
Iteration 23342: Policy loss: -0.104485. Value loss: 0.062995. Entropy: 0.307707.
Iteration 23343: Policy loss: -0.113127. Value loss: 0.044891. Entropy: 0.307589.
Training network. lr: 0.000071. clip: 0.028429
Iteration 23344: Policy loss: 0.199950. Value loss: 0.172582. Entropy: 0.304775.
Iteration 23345: Policy loss: 0.194223. Value loss: 0.080948. Entropy: 0.303038.
Iteration 23346: Policy loss: 0.194898. Value loss: 0.058896. Entropy: 0.303080.
episode: 7949   score: 340.0  epsilon: 1.0    steps: 64  evaluation reward: 435.35
episode: 7950   score: 320.0  epsilon: 1.0

Iteration 23406: Policy loss: 0.213893. Value loss: 0.058949. Entropy: 0.298246.
Training network. lr: 0.000070. clip: 0.028124
Iteration 23407: Policy loss: 0.238223. Value loss: 0.135617. Entropy: 0.307958.
Iteration 23408: Policy loss: 0.240513. Value loss: 0.056705. Entropy: 0.306876.
Iteration 23409: Policy loss: 0.231511. Value loss: 0.039717. Entropy: 0.306737.
episode: 7968   score: 630.0  epsilon: 1.0    steps: 504  evaluation reward: 457.3
Training network. lr: 0.000070. clip: 0.028124
Iteration 23410: Policy loss: -0.344593. Value loss: 0.344050. Entropy: 0.297919.
Iteration 23411: Policy loss: -0.379908. Value loss: 0.250578. Entropy: 0.297257.
Iteration 23412: Policy loss: -0.380524. Value loss: 0.196173. Entropy: 0.297588.
episode: 7969   score: 480.0  epsilon: 1.0    steps: 256  evaluation reward: 455.45
episode: 7970   score: 610.0  epsilon: 1.0    steps: 696  evaluation reward: 453.7
Training network. lr: 0.000070. clip: 0.028124
Iteration 23413: Policy loss: 0.035900.

Iteration 23474: Policy loss: -0.131655. Value loss: 0.212009. Entropy: 0.307723.
Iteration 23475: Policy loss: -0.139122. Value loss: 0.158638. Entropy: 0.307962.
episode: 7987   score: 315.0  epsilon: 1.0    steps: 256  evaluation reward: 475.1
episode: 7988   score: 800.0  epsilon: 1.0    steps: 392  evaluation reward: 477.95
Training network. lr: 0.000070. clip: 0.027968
Iteration 23476: Policy loss: -0.458143. Value loss: 0.547460. Entropy: 0.280922.
Iteration 23477: Policy loss: -0.469707. Value loss: 0.285743. Entropy: 0.281741.
Iteration 23478: Policy loss: -0.467643. Value loss: 0.158620. Entropy: 0.282432.
episode: 7989   score: 590.0  epsilon: 1.0    steps: 736  evaluation reward: 478.5
Training network. lr: 0.000070. clip: 0.027968
Iteration 23479: Policy loss: 0.245812. Value loss: 0.117430. Entropy: 0.298815.
Iteration 23480: Policy loss: 0.246248. Value loss: 0.061347. Entropy: 0.298988.
Iteration 23481: Policy loss: 0.248889. Value loss: 0.047474. Entropy: 0.298010.
Tra

Iteration 23539: Policy loss: -0.168349. Value loss: 0.166368. Entropy: 0.309825.
Iteration 23540: Policy loss: -0.164324. Value loss: 0.066124. Entropy: 0.309149.
Iteration 23541: Policy loss: -0.174824. Value loss: 0.045470. Entropy: 0.309730.
Training network. lr: 0.000070. clip: 0.027811
Iteration 23542: Policy loss: 0.115452. Value loss: 0.091419. Entropy: 0.309744.
Iteration 23543: Policy loss: 0.111914. Value loss: 0.052622. Entropy: 0.309820.
Iteration 23544: Policy loss: 0.112730. Value loss: 0.041876. Entropy: 0.309991.
episode: 8009   score: 575.0  epsilon: 1.0    steps: 160  evaluation reward: 480.25
Training network. lr: 0.000070. clip: 0.027811
Iteration 23545: Policy loss: 0.238350. Value loss: 0.138801. Entropy: 0.297237.
Iteration 23546: Policy loss: 0.239158. Value loss: 0.051399. Entropy: 0.298272.
Iteration 23547: Policy loss: 0.235514. Value loss: 0.035668. Entropy: 0.297977.
Training network. lr: 0.000070. clip: 0.027811
Iteration 23548: Policy loss: -0.047945. Va

Iteration 23609: Policy loss: 0.110255. Value loss: 0.074357. Entropy: 0.300694.
Iteration 23610: Policy loss: 0.102507. Value loss: 0.053558. Entropy: 0.299421.
episode: 8027   score: 425.0  epsilon: 1.0    steps: 632  evaluation reward: 486.75
Training network. lr: 0.000069. clip: 0.027507
Iteration 23611: Policy loss: 0.074994. Value loss: 0.122660. Entropy: 0.292435.
Iteration 23612: Policy loss: 0.070727. Value loss: 0.054175. Entropy: 0.292286.
Iteration 23613: Policy loss: 0.075310. Value loss: 0.039712. Entropy: 0.291777.
episode: 8028   score: 390.0  epsilon: 1.0    steps: 344  evaluation reward: 486.65
episode: 8029   score: 405.0  epsilon: 1.0    steps: 432  evaluation reward: 479.5
Training network. lr: 0.000069. clip: 0.027507
Iteration 23614: Policy loss: 0.013614. Value loss: 0.087671. Entropy: 0.279373.
Iteration 23615: Policy loss: 0.007767. Value loss: 0.038459. Entropy: 0.279582.
Iteration 23616: Policy loss: 0.012843. Value loss: 0.028351. Entropy: 0.279711.
episode

Iteration 23675: Policy loss: 0.149706. Value loss: 0.049620. Entropy: 0.291444.
Iteration 23676: Policy loss: 0.151000. Value loss: 0.037773. Entropy: 0.291258.
episode: 8049   score: 385.0  epsilon: 1.0    steps: 184  evaluation reward: 482.1
episode: 8050   score: 495.0  epsilon: 1.0    steps: 856  evaluation reward: 483.85
Training network. lr: 0.000068. clip: 0.027350
Iteration 23677: Policy loss: 0.063349. Value loss: 0.132609. Entropy: 0.282402.
Iteration 23678: Policy loss: 0.049067. Value loss: 0.067608. Entropy: 0.283799.
Iteration 23679: Policy loss: 0.053524. Value loss: 0.047574. Entropy: 0.284018.
now time :  2019-09-06 14:39:36.004065
episode: 8051   score: 405.0  epsilon: 1.0    steps: 792  evaluation reward: 483.75
Training network. lr: 0.000068. clip: 0.027350
Iteration 23680: Policy loss: -0.289526. Value loss: 0.230517. Entropy: 0.298020.
Iteration 23681: Policy loss: -0.297869. Value loss: 0.094629. Entropy: 0.298530.
Iteration 23682: Policy loss: -0.302085. Value 

Iteration 23740: Policy loss: 0.277479. Value loss: 0.124526. Entropy: 0.281733.
Iteration 23741: Policy loss: 0.272823. Value loss: 0.052686. Entropy: 0.282491.
Iteration 23742: Policy loss: 0.276341. Value loss: 0.036820. Entropy: 0.282767.
Training network. lr: 0.000068. clip: 0.027203
Iteration 23743: Policy loss: 0.217061. Value loss: 0.140274. Entropy: 0.307491.
Iteration 23744: Policy loss: 0.210282. Value loss: 0.083862. Entropy: 0.305718.
Iteration 23745: Policy loss: 0.215145. Value loss: 0.064700. Entropy: 0.306086.
episode: 8071   score: 550.0  epsilon: 1.0    steps: 960  evaluation reward: 476.25
Training network. lr: 0.000068. clip: 0.027203
Iteration 23746: Policy loss: 0.124585. Value loss: 0.153357. Entropy: 0.302796.
Iteration 23747: Policy loss: 0.118803. Value loss: 0.072207. Entropy: 0.302302.
Iteration 23748: Policy loss: 0.111182. Value loss: 0.051751. Entropy: 0.303435.
Training network. lr: 0.000068. clip: 0.027203
Iteration 23749: Policy loss: -0.022909. Value

Training network. lr: 0.000067. clip: 0.026889
Iteration 23809: Policy loss: 0.253132. Value loss: 0.090613. Entropy: 0.300665.
Iteration 23810: Policy loss: 0.255122. Value loss: 0.043799. Entropy: 0.300751.
Iteration 23811: Policy loss: 0.257405. Value loss: 0.032133. Entropy: 0.301808.
Training network. lr: 0.000067. clip: 0.026889
Iteration 23812: Policy loss: -0.034851. Value loss: 0.117068. Entropy: 0.306741.
Iteration 23813: Policy loss: -0.040758. Value loss: 0.059514. Entropy: 0.306702.
Iteration 23814: Policy loss: -0.043984. Value loss: 0.042051. Entropy: 0.307309.
episode: 8090   score: 495.0  epsilon: 1.0    steps: 144  evaluation reward: 467.7
Training network. lr: 0.000067. clip: 0.026889
Iteration 23815: Policy loss: 0.226951. Value loss: 0.082954. Entropy: 0.297284.
Iteration 23816: Policy loss: 0.226946. Value loss: 0.040168. Entropy: 0.298758.
Iteration 23817: Policy loss: 0.222767. Value loss: 0.032862. Entropy: 0.298416.
episode: 8091   score: 600.0  epsilon: 1.0  

Iteration 23877: Policy loss: -0.156429. Value loss: 0.069770. Entropy: 0.293801.
episode: 8109   score: 725.0  epsilon: 1.0    steps: 640  evaluation reward: 474.95
episode: 8110   score: 420.0  epsilon: 1.0    steps: 936  evaluation reward: 475.8
Training network. lr: 0.000067. clip: 0.026742
Iteration 23878: Policy loss: 0.055875. Value loss: 0.239313. Entropy: 0.280897.
Iteration 23879: Policy loss: 0.050914. Value loss: 0.101135. Entropy: 0.278908.
Iteration 23880: Policy loss: 0.052390. Value loss: 0.067378. Entropy: 0.279252.
episode: 8111   score: 415.0  epsilon: 1.0    steps: 880  evaluation reward: 477.1
Training network. lr: 0.000067. clip: 0.026742
Iteration 23881: Policy loss: 0.221729. Value loss: 0.086509. Entropy: 0.298914.
Iteration 23882: Policy loss: 0.218525. Value loss: 0.037843. Entropy: 0.300490.
Iteration 23883: Policy loss: 0.218246. Value loss: 0.029119. Entropy: 0.298736.
Training network. lr: 0.000067. clip: 0.026742
Iteration 23884: Policy loss: -0.305399. 

Training network. lr: 0.000066. clip: 0.026585
Iteration 23944: Policy loss: 0.311367. Value loss: 0.213871. Entropy: 0.287294.
Iteration 23945: Policy loss: 0.301036. Value loss: 0.085116. Entropy: 0.287864.
Iteration 23946: Policy loss: 0.289094. Value loss: 0.067489. Entropy: 0.286909.
episode: 8130   score: 495.0  epsilon: 1.0    steps: 440  evaluation reward: 480.5
episode: 8131   score: 540.0  epsilon: 1.0    steps: 584  evaluation reward: 482.85
Training network. lr: 0.000066. clip: 0.026585
Iteration 23947: Policy loss: 0.007381. Value loss: 0.131185. Entropy: 0.274226.
Iteration 23948: Policy loss: -0.001548. Value loss: 0.049901. Entropy: 0.274143.
Iteration 23949: Policy loss: 0.011725. Value loss: 0.036129. Entropy: 0.274266.
episode: 8132   score: 450.0  epsilon: 1.0    steps: 728  evaluation reward: 482.15
Training network. lr: 0.000066. clip: 0.026585
Iteration 23950: Policy loss: 0.036825. Value loss: 0.125897. Entropy: 0.297405.
Iteration 23951: Policy loss: 0.037545. 

Iteration 24012: Policy loss: -0.088506. Value loss: 0.162557. Entropy: 0.306543.
episode: 8149   score: 620.0  epsilon: 1.0    steps: 360  evaluation reward: 492.5
episode: 8150   score: 670.0  epsilon: 1.0    steps: 400  evaluation reward: 494.25
Training network. lr: 0.000066. clip: 0.026281
Iteration 24013: Policy loss: 0.237382. Value loss: 0.095917. Entropy: 0.269268.
Iteration 24014: Policy loss: 0.241140. Value loss: 0.047891. Entropy: 0.269171.
Iteration 24015: Policy loss: 0.242660. Value loss: 0.038421. Entropy: 0.269422.
now time :  2019-09-06 15:00:28.051386
episode: 8151   score: 350.0  epsilon: 1.0    steps: 208  evaluation reward: 493.7
episode: 8152   score: 575.0  epsilon: 1.0    steps: 888  evaluation reward: 492.15
Training network. lr: 0.000066. clip: 0.026281
Iteration 24016: Policy loss: -0.050947. Value loss: 0.075410. Entropy: 0.291281.
Iteration 24017: Policy loss: -0.060181. Value loss: 0.039890. Entropy: 0.290407.
Iteration 24018: Policy loss: -0.057887. Val

episode: 8169   score: 445.0  epsilon: 1.0    steps: 136  evaluation reward: 499.85
episode: 8170   score: 520.0  epsilon: 1.0    steps: 832  evaluation reward: 497.65
Training network. lr: 0.000065. clip: 0.026125
Iteration 24079: Policy loss: -0.062788. Value loss: 0.143775. Entropy: 0.281454.
Iteration 24080: Policy loss: -0.061671. Value loss: 0.061914. Entropy: 0.282830.
Iteration 24081: Policy loss: -0.064479. Value loss: 0.040158. Entropy: 0.281695.
Training network. lr: 0.000065. clip: 0.026125
Iteration 24082: Policy loss: -0.003981. Value loss: 0.141493. Entropy: 0.311492.
Iteration 24083: Policy loss: -0.003267. Value loss: 0.078520. Entropy: 0.311050.
Iteration 24084: Policy loss: -0.007262. Value loss: 0.048195. Entropy: 0.310696.
episode: 8171   score: 420.0  epsilon: 1.0    steps: 736  evaluation reward: 496.35
episode: 8172   score: 775.0  epsilon: 1.0    steps: 872  evaluation reward: 500.95
Training network. lr: 0.000065. clip: 0.026125
Iteration 24085: Policy loss: -

Iteration 24145: Policy loss: -0.009392. Value loss: 0.100938. Entropy: 0.306183.
Iteration 24146: Policy loss: -0.011758. Value loss: 0.036473. Entropy: 0.305576.
Iteration 24147: Policy loss: -0.010866. Value loss: 0.025687. Entropy: 0.306567.
Training network. lr: 0.000065. clip: 0.025968
Iteration 24148: Policy loss: 0.037180. Value loss: 0.228977. Entropy: 0.308004.
Iteration 24149: Policy loss: 0.027417. Value loss: 0.094739. Entropy: 0.306724.
Iteration 24150: Policy loss: 0.026070. Value loss: 0.075546. Entropy: 0.306173.
episode: 8190   score: 640.0  epsilon: 1.0    steps: 400  evaluation reward: 507.65
episode: 8191   score: 315.0  epsilon: 1.0    steps: 488  evaluation reward: 504.8
Training network. lr: 0.000065. clip: 0.025820
Iteration 24151: Policy loss: 0.041552. Value loss: 0.130620. Entropy: 0.285773.
Iteration 24152: Policy loss: 0.039407. Value loss: 0.066788. Entropy: 0.284991.
Iteration 24153: Policy loss: 0.035917. Value loss: 0.049012. Entropy: 0.286143.
episode

Iteration 24213: Policy loss: -0.247719. Value loss: 0.045618. Entropy: 0.294308.
episode: 8209   score: 210.0  epsilon: 1.0    steps: 200  evaluation reward: 505.65
episode: 8210   score: 430.0  epsilon: 1.0    steps: 896  evaluation reward: 505.75
Training network. lr: 0.000064. clip: 0.025664
Iteration 24214: Policy loss: -0.221518. Value loss: 0.074599. Entropy: 0.286899.
Iteration 24215: Policy loss: -0.231684. Value loss: 0.038563. Entropy: 0.288355.
Iteration 24216: Policy loss: -0.233558. Value loss: 0.030276. Entropy: 0.288031.
Training network. lr: 0.000064. clip: 0.025664
Iteration 24217: Policy loss: -0.242535. Value loss: 0.547059. Entropy: 0.306943.
Iteration 24218: Policy loss: -0.270838. Value loss: 0.245465. Entropy: 0.306440.
Iteration 24219: Policy loss: -0.282456. Value loss: 0.119870. Entropy: 0.305052.
episode: 8211   score: 625.0  epsilon: 1.0    steps: 744  evaluation reward: 507.85
Training network. lr: 0.000064. clip: 0.025664
Iteration 24220: Policy loss: -0.

Iteration 24280: Policy loss: -0.020323. Value loss: 0.159462. Entropy: 0.297807.
Iteration 24281: Policy loss: -0.012968. Value loss: 0.063811. Entropy: 0.297190.
Iteration 24282: Policy loss: -0.020144. Value loss: 0.043409. Entropy: 0.296868.
episode: 8229   score: 440.0  epsilon: 1.0    steps: 472  evaluation reward: 517.3
Training network. lr: 0.000064. clip: 0.025507
Iteration 24283: Policy loss: 0.138452. Value loss: 0.140815. Entropy: 0.297825.
Iteration 24284: Policy loss: 0.142677. Value loss: 0.068288. Entropy: 0.296735.
Iteration 24285: Policy loss: 0.145489. Value loss: 0.046432. Entropy: 0.296569.
episode: 8230   score: 725.0  epsilon: 1.0    steps: 760  evaluation reward: 519.6
Training network. lr: 0.000064. clip: 0.025507
Iteration 24286: Policy loss: 0.429371. Value loss: 0.187955. Entropy: 0.303745.
Iteration 24287: Policy loss: 0.419106. Value loss: 0.050425. Entropy: 0.302069.
Iteration 24288: Policy loss: 0.423597. Value loss: 0.034413. Entropy: 0.302686.
episode:

Iteration 24348: Policy loss: 0.021055. Value loss: 0.036871. Entropy: 0.308211.
episode: 8249   score: 640.0  epsilon: 1.0    steps: 512  evaluation reward: 509.6
Training network. lr: 0.000063. clip: 0.025360
Iteration 24349: Policy loss: -0.120154. Value loss: 0.297896. Entropy: 0.295800.
Iteration 24350: Policy loss: -0.131526. Value loss: 0.225232. Entropy: 0.295728.
Iteration 24351: Policy loss: -0.122774. Value loss: 0.196740. Entropy: 0.294416.
episode: 8250   score: 420.0  epsilon: 1.0    steps: 184  evaluation reward: 507.1
now time :  2019-09-06 15:21:22.512782
episode: 8251   score: 440.0  epsilon: 1.0    steps: 760  evaluation reward: 508.0
episode: 8252   score: 420.0  epsilon: 1.0    steps: 888  evaluation reward: 506.45
Training network. lr: 0.000063. clip: 0.025203
Iteration 24352: Policy loss: 0.153101. Value loss: 0.114723. Entropy: 0.286973.
Iteration 24353: Policy loss: 0.148463. Value loss: 0.055324. Entropy: 0.286498.
Iteration 24354: Policy loss: 0.143427. Value

Iteration 24414: Policy loss: 0.090017. Value loss: 0.035248. Entropy: 0.279645.
Training network. lr: 0.000063. clip: 0.025046
Iteration 24415: Policy loss: 0.014776. Value loss: 0.128832. Entropy: 0.308530.
Iteration 24416: Policy loss: 0.010595. Value loss: 0.057898. Entropy: 0.308714.
Iteration 24417: Policy loss: 0.000551. Value loss: 0.040271. Entropy: 0.308631.
episode: 8270   score: 390.0  epsilon: 1.0    steps: 1008  evaluation reward: 498.9
Training network. lr: 0.000063. clip: 0.025046
Iteration 24418: Policy loss: -0.090048. Value loss: 0.119803. Entropy: 0.311917.
Iteration 24419: Policy loss: -0.093670. Value loss: 0.065531. Entropy: 0.311346.
Iteration 24420: Policy loss: -0.096585. Value loss: 0.050746. Entropy: 0.311243.
episode: 8271   score: 275.0  epsilon: 1.0    steps: 160  evaluation reward: 497.45
Training network. lr: 0.000063. clip: 0.025046
Iteration 24421: Policy loss: 0.146955. Value loss: 0.045665. Entropy: 0.283619.
Iteration 24422: Policy loss: 0.143858. 

episode: 8287   score: 580.0  epsilon: 1.0    steps: 376  evaluation reward: 496.85
episode: 8288   score: 335.0  epsilon: 1.0    steps: 672  evaluation reward: 496.85
episode: 8289   score: 400.0  epsilon: 1.0    steps: 896  evaluation reward: 498.0
Training network. lr: 0.000062. clip: 0.024899
Iteration 24484: Policy loss: -0.347422. Value loss: 0.351498. Entropy: 0.270661.
Iteration 24485: Policy loss: -0.339318. Value loss: 0.154337. Entropy: 0.270828.
Iteration 24486: Policy loss: -0.339086. Value loss: 0.112843. Entropy: 0.271011.
episode: 8290   score: 720.0  epsilon: 1.0    steps: 936  evaluation reward: 498.8
Training network. lr: 0.000062. clip: 0.024899
Iteration 24487: Policy loss: 0.186204. Value loss: 0.154071. Entropy: 0.304478.
Iteration 24488: Policy loss: 0.190796. Value loss: 0.072432. Entropy: 0.303174.
Iteration 24489: Policy loss: 0.185023. Value loss: 0.048480. Entropy: 0.302861.
Training network. lr: 0.000062. clip: 0.024899
Iteration 24490: Policy loss: 0.0303

Training network. lr: 0.000062. clip: 0.024742
Iteration 24550: Policy loss: 0.209590. Value loss: 0.424789. Entropy: 0.297083.
Iteration 24551: Policy loss: 0.214206. Value loss: 0.202050. Entropy: 0.297104.
Iteration 24552: Policy loss: 0.211050. Value loss: 0.098946. Entropy: 0.296472.
Training network. lr: 0.000061. clip: 0.024585
Iteration 24553: Policy loss: 0.080842. Value loss: 0.148883. Entropy: 0.303897.
Iteration 24554: Policy loss: 0.079248. Value loss: 0.078777. Entropy: 0.303485.
Iteration 24555: Policy loss: 0.075857. Value loss: 0.057344. Entropy: 0.304401.
episode: 8308   score: 530.0  epsilon: 1.0    steps: 536  evaluation reward: 499.6
episode: 8309   score: 270.0  epsilon: 1.0    steps: 632  evaluation reward: 500.2
Training network. lr: 0.000061. clip: 0.024585
Iteration 24556: Policy loss: 0.080488. Value loss: 0.148854. Entropy: 0.273447.
Iteration 24557: Policy loss: 0.086128. Value loss: 0.069177. Entropy: 0.273901.
Iteration 24558: Policy loss: 0.092705. Value

Iteration 24617: Policy loss: -0.191766. Value loss: 0.140903. Entropy: 0.306027.
Iteration 24618: Policy loss: -0.202964. Value loss: 0.085269. Entropy: 0.305784.
episode: 8328   score: 565.0  epsilon: 1.0    steps: 104  evaluation reward: 482.2
episode: 8329   score: 595.0  epsilon: 1.0    steps: 872  evaluation reward: 483.75
Training network. lr: 0.000061. clip: 0.024438
Iteration 24619: Policy loss: 0.141199. Value loss: 0.152142. Entropy: 0.300291.
Iteration 24620: Policy loss: 0.146375. Value loss: 0.053792. Entropy: 0.299839.
Iteration 24621: Policy loss: 0.141768. Value loss: 0.035099. Entropy: 0.299343.
Training network. lr: 0.000061. clip: 0.024438
Iteration 24622: Policy loss: 0.001728. Value loss: 0.090001. Entropy: 0.309952.
Iteration 24623: Policy loss: 0.006513. Value loss: 0.048013. Entropy: 0.309868.
Iteration 24624: Policy loss: 0.007602. Value loss: 0.036674. Entropy: 0.309985.
episode: 8330   score: 650.0  epsilon: 1.0    steps: 712  evaluation reward: 483.0
episod

Iteration 24684: Policy loss: 0.116256. Value loss: 0.053527. Entropy: 0.293941.
episode: 8349   score: 545.0  epsilon: 1.0    steps: 208  evaluation reward: 487.55
Training network. lr: 0.000061. clip: 0.024281
Iteration 24685: Policy loss: -0.166642. Value loss: 0.281432. Entropy: 0.295353.
Iteration 24686: Policy loss: -0.157152. Value loss: 0.122279. Entropy: 0.295162.
Iteration 24687: Policy loss: -0.168361. Value loss: 0.079756. Entropy: 0.296232.
Training network. lr: 0.000061. clip: 0.024281
Iteration 24688: Policy loss: -0.001016. Value loss: 0.138601. Entropy: 0.303282.
Iteration 24689: Policy loss: 0.008094. Value loss: 0.066677. Entropy: 0.304028.
Iteration 24690: Policy loss: 0.008127. Value loss: 0.044577. Entropy: 0.302488.
Training network. lr: 0.000061. clip: 0.024281
Iteration 24691: Policy loss: 0.007825. Value loss: 0.281477. Entropy: 0.305760.
Iteration 24692: Policy loss: -0.004129. Value loss: 0.125106. Entropy: 0.305885.
Iteration 24693: Policy loss: 0.002370. V