# Deep Q-Learning 

For this assignment we will implement the Deep Q-Learning algorithm with Experience Replay as described in breakthrough paper __"Playing Atari with Deep Reinforcement Learning"__. We will train an agent to play the famous game of __Breakout__.

In [1]:
import sys
import gym
import torch
import pylab
import random
import numpy as np
import time
from collections import deque
from datetime import datetime
from copy import deepcopy
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from utils import *
from agent import *
from model import *
from config import *
from env import GameEnv
%matplotlib inline
%load_ext autoreload
%autoreload 2

## Understanding the environment

In the following cell, we initialise our game of __Breakout__ and you can see how the environment looks like. For further documentation of the of the environment refer to https://gym.openai.com/envs. 

In [2]:
envs = []
for i in range(num_envs):
    envs.append(GameEnv('SpaceInvadersDeterministic-v4'))
#env.render()

  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "


In [3]:
number_lives = envs[0].life
state_size = envs[0].observation_space.shape
action_size = envs[0].action_space.n
rewards, episodes = [], []

## Creating a DQN Agent

Here we create a DQN Agent. This agent is defined in the __agent.py__. The corresponding neural network is defined in the __model.py__. 

__Evaluation Reward__ : The average reward received in the past 100 episodes/games.

__Frame__ : Number of frames processed in total.

__Memory Size__ : The current size of the replay memory.

In [None]:
agent = Agent(action_size)
evaluation_reward = deque(maxlen=evaluation_reward_length)
frame = 0
memory_size = 0


### Main Training Loop

In [None]:
vis_env_idx = 0
vis_env = envs[vis_env_idx]
e = 0
frame = 0
while (e < EPISODES):
    step = 0
    assert(num_envs * env_mem_size == train_frame)
    frame_next_vals = []
    for i in range(num_envs):
        env = envs[i]
        #history = env.history
        #life = env.life
        #state, reward, done, info = [env.state, env.reward, env.done, env.info]
        for j in range(env_mem_size):
            step += 1
            frame += 1
            
            curr_state = env.history[3,:,:]
            action, value = agent.get_action(np.float32(env.history[:4,:,:]) / 255.)
            
            next_state, env.reward, env.done, env.info = env.step(action)
            
            if (i == vis_env_idx):
                vis_env._env.render()
            
            frame_next_state = get_frame(next_state)
            env.history[4,:,:] = frame_next_state
            terminal_state = check_live(env.life, env.info['ale.lives'])
            
            env.life = env.info['ale.lives']
            r = env.reward
            
            agent.memory.push(i, deepcopy(curr_state), action, r, terminal_state, value, 0, 0)
            if (j == env_mem_size-1):
                _, frame_next_val = agent.get_action(np.float32(env.history[1:,:,:]) / 255.)
                frame_next_vals.append(frame_next_val)
            env.score += r
            env.history[:4, :, :] = env.history[1:,:,:]
            
            if (env.done):
                if (e % 50 == 0):
                    print('now time : ', datetime.now())
                    rewards.append(np.mean(evaluation_reward))
                    episodes.append(e)
                    pylab.plot(episodes, rewards, 'b')
                    pylab.savefig("./save_graph/spaceinvaders_ppo.png")
                    torch.save(agent.policy_net, "./save_model/spaceinvaders_ppo")
                e += 1
                print("episode:", e, "  score:", env.score,  " epsilon:", agent.epsilon, "   steps:", step,
                  " evaluation reward:", np.mean(evaluation_reward))
                env.done = False
                evaluation_reward.append(env.score)
                env.score = 0
                env.history = np.zeros([5,84,84], dtype=np.uint8)
                env.state = env.reset()
                env.life = number_lives
                get_init_state(env.history, env.state)
                
                
                
    agent.train_policy_net(frame, frame_next_vals)
    agent.update_target_net()

'''
for e in range(EPISODES):
    done = False
    score = 0

    history = np.zeros([5, 84, 84], dtype=np.uint8)
    step = 0
    d = False
    state = env.reset()
    life = number_lives

    get_init_state(history, state)

    while not done:
        step += 1
        frame += 1
        if render_breakout:
            env.render()

        # Select and perform an action
        curr_state = history[3,:,:]
        action, value = agent.get_action(np.float32(history[:4, :, :]) / 255.)

        
        next_state, reward, done, info = env.step(action)

        frame_next_state = get_frame(next_state)
        history[4, :, :] = frame_next_state
        terminal_state = check_live(life, info['ale.lives'])

        life = info['ale.lives']
        #r = np.clip(reward, -1, 1)
        r = reward
        """
        if terminal_state:
            r -= 20
        """
        # Store the transition in memory 
        
        agent.memory.push(deepcopy(curr_state), action, r, terminal_state, value, 0, 0)
        # Start training after random sample generation
        if(frame % train_frame == 0):
            _, frame_next_val = agent.get_action(np.float32(history[1:, :, :]) / 255.)
            agent.train_policy_net(frame, frame_next_val)
            # Update the target network
            agent.update_target_net()
        score += r
        history[:4, :, :] = history[1:, :, :]

        if frame % 50000 == 0:
            print('now time : ', datetime.now())
            rewards.append(np.mean(evaluation_reward))
            episodes.append(e)
            pylab.plot(episodes, rewards, 'b')
            pylab.savefig("./save_graph/spaceinvaders_ppo.png")
            torch.save(agent.policy_net, "./save_model/spaceinvaders_ppo")

        if done:
            evaluation_reward.append(score)
            # every episode, plot the play time
            print("episode:", e, "  score:", score, "  memory length:",
                  len(agent.memory), "  epsilon:", agent.epsilon, "   steps:", step,
                  "    evaluation reward:", np.mean(evaluation_reward))

            # if the mean of scores of last 10 episode is bigger than 400
            # stop training
            if np.mean(evaluation_reward) > 700 and len(evaluation_reward) > 40:
                torch.save(agent.policy_net, "./save_model/spaceinvaders_ppo")
                sys.exit()
'''

  probs = F.softmax(x[:,:self.action_size] - torch.max(x[:,:self.action_size],1)[0].unsqueeze(1))
  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "


Training network. lr: 0.000250. clip: 0.100000


  pol_loss += pol_avg.detach().cpu()[0]
  vf_loss += value_loss.detach().cpu()[0]
  ent_total += ent.detach().cpu()[0]


Iteration 1: Policy loss: 0.015861. Value loss: 1.545248. Entropy: 1.778339.
Iteration 2: Policy loss: 0.003832. Value loss: 1.441768. Entropy: 1.771869.
Iteration 3: Policy loss: 0.000260. Value loss: 1.198427. Entropy: 1.772347.
Training network. lr: 0.000250. clip: 0.100000
Iteration 4: Policy loss: 0.012576. Value loss: 4.035398. Entropy: 1.781936.
Iteration 5: Policy loss: 0.007540. Value loss: 3.012988. Entropy: 1.782049.
Iteration 6: Policy loss: 0.005987. Value loss: 2.157456. Entropy: 1.779896.
now time :  2019-02-22 23:36:31.975633
episode: 1   score: 50.0  epsilon: 1.0    steps: 375  evaluation reward: nan


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Training network. lr: 0.000250. clip: 0.100000
Iteration 7: Policy loss: 0.011531. Value loss: 2.992571. Entropy: 1.776144.
Iteration 8: Policy loss: 0.007103. Value loss: 2.172000. Entropy: 1.771981.
Iteration 9: Policy loss: 0.003141. Value loss: 1.637235. Entropy: 1.771698.
episode: 2   score: 55.0  epsilon: 1.0    steps: 414  evaluation reward: 50.0
Training network. lr: 0.000250. clip: 0.100000
Iteration 10: Policy loss: 0.011566. Value loss: 1.253117. Entropy: 1.771417.
Iteration 11: Policy loss: 0.011086. Value loss: 1.012391. Entropy: 1.761750.
Iteration 12: Policy loss: 0.004805. Value loss: 0.675911. Entropy: 1.764812.
Training network. lr: 0.000250. clip: 0.100000
Iteration 13: Policy loss: 0.015258. Value loss: 3.367350. Entropy: 1.719672.
Iteration 14: Policy loss: 0.013589. Value loss: 1.941004. Entropy: 1.728053.
Iteration 15: Policy loss: -0.002368. Value loss: 1.520332. Entropy: 1.731293.
episode: 3   score: 135.0  epsilon: 1.0    steps: 122  evaluation reward: 52.5
ep

episode: 28   score: 150.0  epsilon: 1.0    steps: 660  evaluation reward: 171.4814814814815
episode: 29   score: 210.0  epsilon: 1.0    steps: 876  evaluation reward: 170.71428571428572
episode: 30   score: 180.0  epsilon: 1.0    steps: 995  evaluation reward: 172.06896551724137
Training network. lr: 0.000250. clip: 0.099888
Iteration 70: Policy loss: 0.016925. Value loss: 2.424394. Entropy: 1.350628.
Iteration 71: Policy loss: 0.012597. Value loss: 1.868973. Entropy: 1.346933.
Iteration 72: Policy loss: 0.005398. Value loss: 1.649648. Entropy: 1.340101.
Training network. lr: 0.000250. clip: 0.099888
Iteration 73: Policy loss: 0.015878. Value loss: 2.809553. Entropy: 1.345127.
Iteration 74: Policy loss: 0.005624. Value loss: 2.105753. Entropy: 1.313123.
Iteration 75: Policy loss: 0.004090. Value loss: 1.902726. Entropy: 1.376369.
episode: 31   score: 135.0  epsilon: 1.0    steps: 233  evaluation reward: 172.33333333333334
episode: 32   score: 165.0  epsilon: 1.0    steps: 514  evaluat

Iteration 130: Policy loss: 0.006993. Value loss: 1.949746. Entropy: 1.396048.
Iteration 131: Policy loss: 0.008118. Value loss: 1.395005. Entropy: 1.444005.
Iteration 132: Policy loss: -0.000301. Value loss: 1.155280. Entropy: 1.422557.
episode: 55   score: 135.0  epsilon: 1.0    steps: 294  evaluation reward: 170.46296296296296
episode: 56   score: 155.0  epsilon: 1.0    steps: 533  evaluation reward: 169.8181818181818
episode: 57   score: 180.0  epsilon: 1.0    steps: 795  evaluation reward: 169.55357142857142
Training network. lr: 0.000249. clip: 0.099775
Iteration 133: Policy loss: 0.011786. Value loss: 4.819378. Entropy: 1.487226.
Iteration 134: Policy loss: 0.012830. Value loss: 3.655415. Entropy: 1.489974.
Iteration 135: Policy loss: 0.036134. Value loss: 3.137287. Entropy: 1.486282.
episode: 58   score: 50.0  epsilon: 1.0    steps: 416  evaluation reward: 169.73684210526315
Training network. lr: 0.000249. clip: 0.099775
Iteration 136: Policy loss: 0.014319. Value loss: 2.13220

Iteration 191: Policy loss: 0.003692. Value loss: 1.263352. Entropy: 1.477494.
Iteration 192: Policy loss: 0.010573. Value loss: 1.038341. Entropy: 1.485531.
episode: 82   score: 210.0  epsilon: 1.0    steps: 703  evaluation reward: 176.97530864197532
episode: 83   score: 180.0  epsilon: 1.0    steps: 844  evaluation reward: 177.3780487804878
Training network. lr: 0.000249. clip: 0.099663
Iteration 193: Policy loss: 0.006799. Value loss: 1.938038. Entropy: 1.411999.
Iteration 194: Policy loss: 0.005876. Value loss: 1.292757. Entropy: 1.422238.
Iteration 195: Policy loss: -0.003976. Value loss: 1.071682. Entropy: 1.432591.
Training network. lr: 0.000249. clip: 0.099663
Iteration 196: Policy loss: 0.005232. Value loss: 1.647711. Entropy: 1.433031.
Iteration 197: Policy loss: 0.004543. Value loss: 1.011537. Entropy: 1.459490.
Iteration 198: Policy loss: -0.004331. Value loss: 0.863351. Entropy: 1.447180.
episode: 84   score: 155.0  epsilon: 1.0    steps: 79  evaluation reward: 177.4096385

episode: 108   score: 140.0  epsilon: 1.0    steps: 463  evaluation reward: 186.05
episode: 109   score: 265.0  epsilon: 1.0    steps: 705  evaluation reward: 185.0
Training network. lr: 0.000249. clip: 0.099438
Iteration 253: Policy loss: 0.005663. Value loss: 2.342088. Entropy: 1.520561.
Iteration 254: Policy loss: 0.002078. Value loss: 1.403857. Entropy: 1.523123.
Iteration 255: Policy loss: -0.004375. Value loss: 1.136263. Entropy: 1.515924.
episode: 110   score: 410.0  epsilon: 1.0    steps: 901  evaluation reward: 186.1
Training network. lr: 0.000249. clip: 0.099438
Iteration 256: Policy loss: 0.009455. Value loss: 1.523094. Entropy: 1.578521.
Iteration 257: Policy loss: 0.007549. Value loss: 1.101897. Entropy: 1.564899.
Iteration 258: Policy loss: 0.006924. Value loss: 0.862210. Entropy: 1.566898.
Training network. lr: 0.000249. clip: 0.099438
Iteration 259: Policy loss: 0.012003. Value loss: 3.960396. Entropy: 1.602586.
Iteration 260: Policy loss: 0.027008. Value loss: 3.615917

Training network. lr: 0.000248. clip: 0.099325
Iteration 313: Policy loss: 0.006367. Value loss: 1.951225. Entropy: 1.405560.
Iteration 314: Policy loss: 0.000816. Value loss: 1.546871. Entropy: 1.378446.
Iteration 315: Policy loss: -0.006322. Value loss: 1.156101. Entropy: 1.388941.
episode: 139   score: 160.0  epsilon: 1.0    steps: 388  evaluation reward: 186.55
Training network. lr: 0.000248. clip: 0.099325
Iteration 316: Policy loss: 0.002427. Value loss: 0.936501. Entropy: 1.517485.
Iteration 317: Policy loss: 0.005362. Value loss: 0.695986. Entropy: 1.509080.
Iteration 318: Policy loss: -0.000776. Value loss: 0.543530. Entropy: 1.509692.
episode: 140   score: 105.0  epsilon: 1.0    steps: 209  evaluation reward: 187.1
Training network. lr: 0.000248. clip: 0.099325
Iteration 319: Policy loss: 0.016342. Value loss: 1.075956. Entropy: 1.531897.
Iteration 320: Policy loss: 0.008496. Value loss: 0.553112. Entropy: 1.520585.
Iteration 321: Policy loss: 0.008378. Value loss: 0.459153. 

Iteration 374: Policy loss: 0.015130. Value loss: 1.014819. Entropy: 1.212167.
Iteration 375: Policy loss: 0.000088. Value loss: 0.898371. Entropy: 1.214229.
Training network. lr: 0.000248. clip: 0.099213
Iteration 376: Policy loss: 0.007228. Value loss: 2.287537. Entropy: 1.174848.
Iteration 377: Policy loss: 0.008556. Value loss: 1.339106. Entropy: 1.191965.
Iteration 378: Policy loss: 0.006687. Value loss: 1.147631. Entropy: 1.212984.
Training network. lr: 0.000248. clip: 0.099213
Iteration 379: Policy loss: 0.006300. Value loss: 1.528726. Entropy: 1.286214.
Iteration 380: Policy loss: 0.006061. Value loss: 0.949052. Entropy: 1.278240.
Iteration 381: Policy loss: -0.004966. Value loss: 0.812440. Entropy: 1.297113.
episode: 168   score: 215.0  epsilon: 1.0    steps: 607  evaluation reward: 187.05
Training network. lr: 0.000248. clip: 0.099213
Iteration 382: Policy loss: 0.008856. Value loss: 1.345180. Entropy: 1.496516.
Iteration 383: Policy loss: 0.020210. Value loss: 0.861993. Entr

episode: 193   score: 110.0  epsilon: 1.0    steps: 178  evaluation reward: 184.45
episode: 194   score: 105.0  epsilon: 1.0    steps: 465  evaluation reward: 184.35
Training network. lr: 0.000248. clip: 0.099100
Iteration 439: Policy loss: 0.004353. Value loss: 1.464985. Entropy: 1.266081.
Iteration 440: Policy loss: 0.014287. Value loss: 0.926038. Entropy: 1.222895.
Iteration 441: Policy loss: -0.004222. Value loss: 0.786773. Entropy: 1.236573.
episode: 195   score: 155.0  epsilon: 1.0    steps: 892  evaluation reward: 183.6
episode: 196   score: 280.0  epsilon: 1.0    steps: 920  evaluation reward: 183.05
Training network. lr: 0.000248. clip: 0.099100
Iteration 442: Policy loss: 0.017882. Value loss: 1.962260. Entropy: 1.037172.
Iteration 443: Policy loss: 0.013887. Value loss: 1.124368. Entropy: 0.968137.
Iteration 444: Policy loss: 0.009462. Value loss: 0.954687. Entropy: 0.991295.
episode: 197   score: 210.0  epsilon: 1.0    steps: 352  evaluation reward: 183.0
episode: 198   sco

episode: 220   score: 210.0  epsilon: 1.0    steps: 427  evaluation reward: 191.7
Training network. lr: 0.000247. clip: 0.098875
Iteration 502: Policy loss: 0.011164. Value loss: 1.738742. Entropy: 1.063764.
Iteration 503: Policy loss: 0.015828. Value loss: 1.155811. Entropy: 1.087033.
Iteration 504: Policy loss: 0.006542. Value loss: 0.940331. Entropy: 1.116734.
episode: 221   score: 210.0  epsilon: 1.0    steps: 327  evaluation reward: 192.6
episode: 222   score: 210.0  epsilon: 1.0    steps: 716  evaluation reward: 193.0
episode: 223   score: 180.0  epsilon: 1.0    steps: 959  evaluation reward: 194.0
Training network. lr: 0.000247. clip: 0.098875
Iteration 505: Policy loss: 0.007552. Value loss: 1.393625. Entropy: 1.307959.
Iteration 506: Policy loss: 0.009824. Value loss: 0.827167. Entropy: 1.310900.
Iteration 507: Policy loss: 0.000704. Value loss: 0.732756. Entropy: 1.295059.
episode: 224   score: 210.0  epsilon: 1.0    steps: 11  evaluation reward: 194.75
Training network. lr: 

episode: 247   score: 110.0  epsilon: 1.0    steps: 811  evaluation reward: 194.75
Training network. lr: 0.000247. clip: 0.098763
Iteration 565: Policy loss: 0.013175. Value loss: 2.043343. Entropy: 1.405481.
Iteration 566: Policy loss: 0.009867. Value loss: 1.132200. Entropy: 1.413497.
Iteration 567: Policy loss: -0.005189. Value loss: 0.975688. Entropy: 1.388665.
episode: 248   score: 210.0  epsilon: 1.0    steps: 189  evaluation reward: 195.05
episode: 249   score: 265.0  epsilon: 1.0    steps: 649  evaluation reward: 194.5
Training network. lr: 0.000247. clip: 0.098763
Iteration 568: Policy loss: 0.018229. Value loss: 2.167974. Entropy: 1.501540.
Iteration 569: Policy loss: 0.023483. Value loss: 1.281470. Entropy: 1.488799.
Iteration 570: Policy loss: 0.009608. Value loss: 0.857026. Entropy: 1.501392.
episode: 250   score: 155.0  epsilon: 1.0    steps: 559  evaluation reward: 196.35
now time :  2019-02-22 23:48:17.659135
episode: 251   score: 105.0  epsilon: 1.0    steps: 987  eval

Iteration 626: Policy loss: -0.001886. Value loss: 1.759519. Entropy: 1.408956.
Iteration 627: Policy loss: 0.003721. Value loss: 1.585222. Entropy: 1.387250.
Training network. lr: 0.000247. clip: 0.098650
Iteration 628: Policy loss: 0.017619. Value loss: 3.680356. Entropy: 1.429558.
Iteration 629: Policy loss: 0.006567. Value loss: 2.581632. Entropy: 1.422739.
Iteration 630: Policy loss: -0.000431. Value loss: 1.894367. Entropy: 1.435892.
episode: 276   score: 180.0  epsilon: 1.0    steps: 509  evaluation reward: 205.35
Training network. lr: 0.000247. clip: 0.098650
Iteration 631: Policy loss: 0.005485. Value loss: 3.384349. Entropy: 1.479795.
Iteration 632: Policy loss: 0.003462. Value loss: 2.081682. Entropy: 1.471327.
Iteration 633: Policy loss: 0.000870. Value loss: 1.933886. Entropy: 1.478374.
episode: 277   score: 110.0  epsilon: 1.0    steps: 15  evaluation reward: 205.75
Training network. lr: 0.000247. clip: 0.098650
Iteration 634: Policy loss: 0.013425. Value loss: 4.276467. 

Iteration 687: Policy loss: 0.011555. Value loss: 1.833593. Entropy: 1.473767.
episode: 305   score: 155.0  epsilon: 1.0    steps: 82  evaluation reward: 196.5
episode: 306   score: 120.0  epsilon: 1.0    steps: 795  evaluation reward: 195.95
Training network. lr: 0.000246. clip: 0.098538
Iteration 688: Policy loss: 0.008238. Value loss: 2.742592. Entropy: 1.512217.
Iteration 689: Policy loss: -0.002090. Value loss: 1.759880. Entropy: 1.516942.
Iteration 690: Policy loss: -0.006981. Value loss: 1.315162. Entropy: 1.513577.
Training network. lr: 0.000246. clip: 0.098538
Iteration 691: Policy loss: 0.002532. Value loss: 2.457370. Entropy: 1.485353.
Iteration 692: Policy loss: -0.006213. Value loss: 1.272391. Entropy: 1.469463.
Iteration 693: Policy loss: -0.010031. Value loss: 0.926588. Entropy: 1.482947.
episode: 307   score: 90.0  epsilon: 1.0    steps: 158  evaluation reward: 195.35
episode: 308   score: 195.0  epsilon: 1.0    steps: 283  evaluation reward: 195.2
Training network. lr:

Iteration 748: Policy loss: 0.010521. Value loss: 4.566996. Entropy: 1.514440.
Iteration 749: Policy loss: -0.000428. Value loss: 3.617214. Entropy: 1.508724.
Iteration 750: Policy loss: 0.002679. Value loss: 3.285803. Entropy: 1.516002.
Training network. lr: 0.000246. clip: 0.098312
Iteration 751: Policy loss: 0.003708. Value loss: 3.209926. Entropy: 1.574941.
Iteration 752: Policy loss: 0.002703. Value loss: 2.095958. Entropy: 1.595861.
Iteration 753: Policy loss: -0.001762. Value loss: 1.650977. Entropy: 1.613165.
episode: 334   score: 290.0  epsilon: 1.0    steps: 160  evaluation reward: 193.5
episode: 335   score: 315.0  epsilon: 1.0    steps: 714  evaluation reward: 194.6
Training network. lr: 0.000246. clip: 0.098312
Iteration 754: Policy loss: 0.011505. Value loss: 3.065981. Entropy: 1.608184.
Iteration 755: Policy loss: 0.004523. Value loss: 1.885092. Entropy: 1.620156.
Iteration 756: Policy loss: 0.008170. Value loss: 1.604651. Entropy: 1.624303.
episode: 336   score: 60.0  e

episode: 365   score: 255.0  epsilon: 1.0    steps: 538  evaluation reward: 181.25
Training network. lr: 0.000246. clip: 0.098200
Iteration 808: Policy loss: 0.005516. Value loss: 2.828168. Entropy: 1.528585.
Iteration 809: Policy loss: 0.004130. Value loss: 1.614264. Entropy: 1.528201.
Iteration 810: Policy loss: -0.001471. Value loss: 1.396353. Entropy: 1.512451.
episode: 366   score: 140.0  epsilon: 1.0    steps: 770  evaluation reward: 182.0
episode: 367   score: 65.0  epsilon: 1.0    steps: 916  evaluation reward: 180.25
Training network. lr: 0.000246. clip: 0.098200
Iteration 811: Policy loss: 0.006243. Value loss: 3.218943. Entropy: 1.567700.
Iteration 812: Policy loss: -0.000283. Value loss: 1.897748. Entropy: 1.560246.
Iteration 813: Policy loss: -0.002058. Value loss: 1.557707. Entropy: 1.565959.
episode: 368   score: 535.0  epsilon: 1.0    steps: 426  evaluation reward: 179.35
Training network. lr: 0.000246. clip: 0.098200
Iteration 814: Policy loss: 0.008641. Value loss: 3.

Training network. lr: 0.000245. clip: 0.098088
Iteration 868: Policy loss: 0.009384. Value loss: 4.084013. Entropy: 1.576617.
Iteration 869: Policy loss: 0.011450. Value loss: 2.369496. Entropy: 1.575741.
Iteration 870: Policy loss: 0.003723. Value loss: 1.999916. Entropy: 1.557862.
episode: 396   score: 540.0  epsilon: 1.0    steps: 33  evaluation reward: 179.8
episode: 397   score: 45.0  epsilon: 1.0    steps: 976  evaluation reward: 184.45
Training network. lr: 0.000245. clip: 0.098088
Iteration 871: Policy loss: 0.014082. Value loss: 3.067236. Entropy: 1.551342.
Iteration 872: Policy loss: 0.002716. Value loss: 1.746921. Entropy: 1.558912.
Iteration 873: Policy loss: -0.002868. Value loss: 1.453970. Entropy: 1.561605.
episode: 398   score: 290.0  epsilon: 1.0    steps: 468  evaluation reward: 184.3
Training network. lr: 0.000245. clip: 0.098088
Iteration 874: Policy loss: 0.006385. Value loss: 2.597267. Entropy: 1.580780.
Iteration 875: Policy loss: 0.012382. Value loss: 1.617146. 

Training network. lr: 0.000245. clip: 0.097975
Iteration 928: Policy loss: 0.007395. Value loss: 2.827979. Entropy: 1.631865.
Iteration 929: Policy loss: 0.003775. Value loss: 1.627090. Entropy: 1.638118.
Iteration 930: Policy loss: -0.002629. Value loss: 1.321311. Entropy: 1.637475.
episode: 426   score: 200.0  epsilon: 1.0    steps: 1012  evaluation reward: 180.3
Training network. lr: 0.000245. clip: 0.097975
Iteration 931: Policy loss: 0.011271. Value loss: 3.933746. Entropy: 1.641233.
Iteration 932: Policy loss: 0.008839. Value loss: 2.523692. Entropy: 1.647132.
Iteration 933: Policy loss: 0.003887. Value loss: 1.927801. Entropy: 1.648885.
Training network. lr: 0.000245. clip: 0.097975
Iteration 934: Policy loss: 0.017711. Value loss: 2.834578. Entropy: 1.629531.
Iteration 935: Policy loss: 0.002499. Value loss: 1.819175. Entropy: 1.631090.
Iteration 936: Policy loss: -0.002278. Value loss: 1.459617. Entropy: 1.646811.
episode: 427   score: 140.0  epsilon: 1.0    steps: 433  evalua

Training network. lr: 0.000245. clip: 0.097863
Iteration 988: Policy loss: 0.007019. Value loss: 2.385115. Entropy: 1.480832.
Iteration 989: Policy loss: 0.004387. Value loss: 1.390515. Entropy: 1.469078.
Iteration 990: Policy loss: -0.004378. Value loss: 1.196443. Entropy: 1.473686.
episode: 456   score: 210.0  epsilon: 1.0    steps: 548  evaluation reward: 180.85
Training network. lr: 0.000245. clip: 0.097863
Iteration 991: Policy loss: 0.006479. Value loss: 3.199613. Entropy: 1.487814.
Iteration 992: Policy loss: 0.006678. Value loss: 2.070043. Entropy: 1.495141.
Iteration 993: Policy loss: -0.003540. Value loss: 1.671647. Entropy: 1.510925.
Training network. lr: 0.000245. clip: 0.097863
Iteration 994: Policy loss: 0.005048. Value loss: 2.828631. Entropy: 1.589116.
Iteration 995: Policy loss: -0.000634. Value loss: 1.693003. Entropy: 1.605364.
Iteration 996: Policy loss: -0.003457. Value loss: 1.438789. Entropy: 1.592112.
episode: 457   score: 30.0  epsilon: 1.0    steps: 307  evalu

episode: 486   score: 180.0  epsilon: 1.0    steps: 615  evaluation reward: 168.9
episode: 487   score: 210.0  epsilon: 1.0    steps: 675  evaluation reward: 169.6
episode: 488   score: 185.0  epsilon: 1.0    steps: 787  evaluation reward: 170.55
Training network. lr: 0.000244. clip: 0.097750
Iteration 1048: Policy loss: 0.008593. Value loss: 2.157304. Entropy: 1.476047.
Iteration 1049: Policy loss: 0.007136. Value loss: 1.331671. Entropy: 1.488256.
Iteration 1050: Policy loss: 0.008243. Value loss: 1.109892. Entropy: 1.493101.
episode: 489   score: 155.0  epsilon: 1.0    steps: 416  evaluation reward: 172.05
Training network. lr: 0.000244. clip: 0.097638
Iteration 1051: Policy loss: 0.013672. Value loss: 2.311729. Entropy: 1.476190.
Iteration 1052: Policy loss: 0.010256. Value loss: 1.430212. Entropy: 1.498430.
Iteration 1053: Policy loss: 0.004666. Value loss: 1.187218. Entropy: 1.465487.
Training network. lr: 0.000244. clip: 0.097638
Iteration 1054: Policy loss: 0.008804. Value loss

Training network. lr: 0.000244. clip: 0.097525
Iteration 1105: Policy loss: 0.015220. Value loss: 3.456893. Entropy: 1.504152.
Iteration 1106: Policy loss: 0.007204. Value loss: 2.503087. Entropy: 1.524000.
Iteration 1107: Policy loss: 0.003812. Value loss: 1.825124. Entropy: 1.525381.
episode: 519   score: 210.0  epsilon: 1.0    steps: 45  evaluation reward: 157.65
episode: 520   score: 155.0  epsilon: 1.0    steps: 781  evaluation reward: 158.2
Training network. lr: 0.000244. clip: 0.097525
Iteration 1108: Policy loss: 0.001177. Value loss: 2.633445. Entropy: 1.574539.
Iteration 1109: Policy loss: 0.001637. Value loss: 1.592695. Entropy: 1.562344.
Iteration 1110: Policy loss: -0.002070. Value loss: 1.318091. Entropy: 1.579884.
episode: 521   score: 110.0  epsilon: 1.0    steps: 600  evaluation reward: 158.8
Training network. lr: 0.000244. clip: 0.097525
Iteration 1111: Policy loss: 0.002060. Value loss: 2.331614. Entropy: 1.574189.
Iteration 1112: Policy loss: 0.001259. Value loss: 1

Iteration 1166: Policy loss: 0.013433. Value loss: 3.644739. Entropy: 1.636656.
Iteration 1167: Policy loss: 0.017404. Value loss: 2.923262. Entropy: 1.627652.
Training network. lr: 0.000244. clip: 0.097412
Iteration 1168: Policy loss: 0.008620. Value loss: 2.933997. Entropy: 1.590832.
Iteration 1169: Policy loss: 0.012090. Value loss: 1.820541. Entropy: 1.570131.
Iteration 1170: Policy loss: -0.003031. Value loss: 1.506107. Entropy: 1.566020.
episode: 548   score: 150.0  epsilon: 1.0    steps: 775  evaluation reward: 153.55
Training network. lr: 0.000244. clip: 0.097412
Iteration 1171: Policy loss: 0.012277. Value loss: 2.842219. Entropy: 1.568204.
Iteration 1172: Policy loss: -0.001231. Value loss: 1.689453. Entropy: 1.568555.
Iteration 1173: Policy loss: -0.000887. Value loss: 1.331713. Entropy: 1.583185.
episode: 549   score: 545.0  epsilon: 1.0    steps: 454  evaluation reward: 154.2
Training network. lr: 0.000244. clip: 0.097412
Iteration 1174: Policy loss: 0.001722. Value loss: 

episode: 572   score: 415.0  epsilon: 1.0    steps: 166  evaluation reward: 180.2
episode: 573   score: 260.0  epsilon: 1.0    steps: 317  evaluation reward: 182.25
episode: 574   score: 180.0  epsilon: 1.0    steps: 459  evaluation reward: 184.35
episode: 575   score: 210.0  epsilon: 1.0    steps: 604  evaluation reward: 184.6
Training network. lr: 0.000243. clip: 0.097300
Iteration 1231: Policy loss: 0.012733. Value loss: 3.282839. Entropy: 1.527752.
Iteration 1232: Policy loss: 0.011348. Value loss: 2.251497. Entropy: 1.516625.
Iteration 1233: Policy loss: -0.005754. Value loss: 1.921889. Entropy: 1.542510.
episode: 576   score: 290.0  epsilon: 1.0    steps: 703  evaluation reward: 185.6
Training network. lr: 0.000243. clip: 0.097300
Iteration 1234: Policy loss: 0.002561. Value loss: 3.073389. Entropy: 1.535089.
Iteration 1235: Policy loss: 0.009741. Value loss: 2.181249. Entropy: 1.531811.
Iteration 1236: Policy loss: 0.001520. Value loss: 1.876492. Entropy: 1.534360.
episode: 577 

Iteration 1293: Policy loss: 0.003070. Value loss: 1.065920. Entropy: 1.403428.
episode: 600   score: 420.0  epsilon: 1.0    steps: 51  evaluation reward: 207.6
now time :  2019-02-23 00:03:28.280288
episode: 601   score: 100.0  epsilon: 1.0    steps: 719  evaluation reward: 205.25
episode: 602   score: 320.0  epsilon: 1.0    steps: 1004  evaluation reward: 204.65
Training network. lr: 0.000243. clip: 0.097188
Iteration 1294: Policy loss: 0.009952. Value loss: 2.873676. Entropy: 1.497862.
Iteration 1295: Policy loss: -0.000798. Value loss: 1.910499. Entropy: 1.469675.
Iteration 1296: Policy loss: -0.011094. Value loss: 1.667460. Entropy: 1.487527.
episode: 603   score: 265.0  epsilon: 1.0    steps: 470  evaluation reward: 207.5
Training network. lr: 0.000243. clip: 0.097188
Iteration 1297: Policy loss: 0.005043. Value loss: 2.393355. Entropy: 1.482995.
Iteration 1298: Policy loss: -0.002801. Value loss: 1.867630. Entropy: 1.457605.
Iteration 1299: Policy loss: -0.010559. Value loss: 1.

Training network. lr: 0.000242. clip: 0.096963
Iteration 1354: Policy loss: 0.002710. Value loss: 3.182122. Entropy: 1.304065.
Iteration 1355: Policy loss: -0.002847. Value loss: 1.830396. Entropy: 1.309990.
Iteration 1356: Policy loss: -0.005434. Value loss: 1.301034. Entropy: 1.287969.
Training network. lr: 0.000242. clip: 0.096963
Iteration 1357: Policy loss: 0.001268. Value loss: 3.043266. Entropy: 1.410705.
Iteration 1358: Policy loss: 0.006161. Value loss: 1.844447. Entropy: 1.381512.
Iteration 1359: Policy loss: -0.007371. Value loss: 1.310705. Entropy: 1.390341.
episode: 629   score: 475.0  epsilon: 1.0    steps: 36  evaluation reward: 221.95
episode: 630   score: 210.0  epsilon: 1.0    steps: 265  evaluation reward: 224.3
episode: 631   score: 335.0  epsilon: 1.0    steps: 545  evaluation reward: 225.8
Training network. lr: 0.000242. clip: 0.096963
Iteration 1360: Policy loss: 0.012852. Value loss: 2.914844. Entropy: 1.333792.
Iteration 1361: Policy loss: 0.006702. Value loss:

Iteration 1416: Policy loss: -0.003809. Value loss: 1.212152. Entropy: 1.321504.
episode: 656   score: 135.0  epsilon: 1.0    steps: 834  evaluation reward: 234.65
Training network. lr: 0.000242. clip: 0.096850
Iteration 1417: Policy loss: 0.006285. Value loss: 2.368522. Entropy: 1.303730.
Iteration 1418: Policy loss: 0.007364. Value loss: 1.375442. Entropy: 1.303923.
Iteration 1419: Policy loss: -0.009130. Value loss: 1.197260. Entropy: 1.304307.
episode: 657   score: 105.0  epsilon: 1.0    steps: 89  evaluation reward: 227.45
Training network. lr: 0.000242. clip: 0.096850
Iteration 1420: Policy loss: 0.007622. Value loss: 2.438880. Entropy: 1.309323.
Iteration 1421: Policy loss: 0.006970. Value loss: 1.377514. Entropy: 1.286384.
Iteration 1422: Policy loss: -0.001130. Value loss: 1.139618. Entropy: 1.296869.
episode: 658   score: 185.0  epsilon: 1.0    steps: 325  evaluation reward: 223.6
Training network. lr: 0.000242. clip: 0.096850
Iteration 1423: Policy loss: 0.007983. Value loss

Iteration 1479: Policy loss: -0.007040. Value loss: 1.643306. Entropy: 1.154910.
episode: 682   score: 180.0  epsilon: 1.0    steps: 174  evaluation reward: 228.3
episode: 683   score: 130.0  epsilon: 1.0    steps: 421  evaluation reward: 225.95
Training network. lr: 0.000242. clip: 0.096738
Iteration 1480: Policy loss: 0.004422. Value loss: 4.809155. Entropy: 1.117133.
Iteration 1481: Policy loss: 0.003295. Value loss: 4.570579. Entropy: 1.129880.
Iteration 1482: Policy loss: 0.015101. Value loss: 3.936856. Entropy: 1.145323.
episode: 684   score: 395.0  epsilon: 1.0    steps: 679  evaluation reward: 224.8
episode: 685   score: 180.0  epsilon: 1.0    steps: 921  evaluation reward: 224.5
Training network. lr: 0.000242. clip: 0.096738
Iteration 1483: Policy loss: 0.010523. Value loss: 2.403260. Entropy: 1.215146.
Iteration 1484: Policy loss: 0.004137. Value loss: 1.577354. Entropy: 1.195083.
Iteration 1485: Policy loss: 0.000795. Value loss: 1.252151. Entropy: 1.225531.
Training network

Iteration 1541: Policy loss: -0.005132. Value loss: 1.942100. Entropy: 1.331888.
Iteration 1542: Policy loss: -0.011624. Value loss: 1.660689. Entropy: 1.351022.
episode: 709   score: 320.0  epsilon: 1.0    steps: 448  evaluation reward: 228.05
Training network. lr: 0.000242. clip: 0.096625
Iteration 1543: Policy loss: 0.006721. Value loss: 2.842098. Entropy: 1.301278.
Iteration 1544: Policy loss: 0.003293. Value loss: 1.662888. Entropy: 1.308871.
Iteration 1545: Policy loss: 0.000561. Value loss: 1.402794. Entropy: 1.305264.
episode: 710   score: 180.0  epsilon: 1.0    steps: 34  evaluation reward: 229.9
Training network. lr: 0.000242. clip: 0.096625
Iteration 1546: Policy loss: 0.005910. Value loss: 2.583258. Entropy: 1.212374.
Iteration 1547: Policy loss: -0.002027. Value loss: 1.548532. Entropy: 1.236277.
Iteration 1548: Policy loss: -0.006177. Value loss: 1.317708. Entropy: 1.209637.
episode: 711   score: 260.0  epsilon: 1.0    steps: 675  evaluation reward: 228.4
Training network

Iteration 1602: Policy loss: 0.002037. Value loss: 1.678718. Entropy: 1.285180.
episode: 738   score: 115.0  epsilon: 1.0    steps: 659  evaluation reward: 226.3
Training network. lr: 0.000241. clip: 0.096400
Iteration 1603: Policy loss: 0.010515. Value loss: 2.907350. Entropy: 1.167970.
Iteration 1604: Policy loss: 0.005152. Value loss: 1.804578. Entropy: 1.190621.
Iteration 1605: Policy loss: -0.000558. Value loss: 1.329495. Entropy: 1.202523.
episode: 739   score: 20.0  epsilon: 1.0    steps: 161  evaluation reward: 226.25
episode: 740   score: 265.0  epsilon: 1.0    steps: 420  evaluation reward: 225.25
episode: 741   score: 100.0  epsilon: 1.0    steps: 621  evaluation reward: 227.45
episode: 742   score: 50.0  epsilon: 1.0    steps: 987  evaluation reward: 226.35
Training network. lr: 0.000241. clip: 0.096400
Iteration 1606: Policy loss: 0.004003. Value loss: 2.621903. Entropy: 1.153624.
Iteration 1607: Policy loss: -0.002135. Value loss: 1.511318. Entropy: 1.135475.
Iteration 16

Iteration 1662: Policy loss: -0.002252. Value loss: 1.415186. Entropy: 1.338007.
episode: 768   score: 150.0  epsilon: 1.0    steps: 474  evaluation reward: 208.3
episode: 769   score: 155.0  epsilon: 1.0    steps: 824  evaluation reward: 204.7
Training network. lr: 0.000241. clip: 0.096287
Iteration 1663: Policy loss: 0.009612. Value loss: 5.126457. Entropy: 1.398013.
Iteration 1664: Policy loss: 0.014302. Value loss: 4.442392. Entropy: 1.366192.
Iteration 1665: Policy loss: 0.018502. Value loss: 3.120445. Entropy: 1.375651.
Training network. lr: 0.000241. clip: 0.096287
Iteration 1666: Policy loss: 0.014614. Value loss: 3.051777. Entropy: 1.399958.
Iteration 1667: Policy loss: 0.004919. Value loss: 1.592397. Entropy: 1.412017.
Iteration 1668: Policy loss: -0.002303. Value loss: 1.060196. Entropy: 1.423524.
episode: 770   score: 365.0  epsilon: 1.0    steps: 124  evaluation reward: 204.15
Training network. lr: 0.000241. clip: 0.096287
Iteration 1669: Policy loss: 0.007028. Value loss:

Training network. lr: 0.000240. clip: 0.096175
Iteration 1726: Policy loss: 0.009671. Value loss: 6.620612. Entropy: 1.097439.
Iteration 1727: Policy loss: 0.014861. Value loss: 5.193146. Entropy: 1.112308.
Iteration 1728: Policy loss: 0.019703. Value loss: 4.420551. Entropy: 1.116590.
episode: 795   score: 260.0  epsilon: 1.0    steps: 915  evaluation reward: 212.1
Training network. lr: 0.000240. clip: 0.096175
Iteration 1729: Policy loss: 0.006334. Value loss: 3.039223. Entropy: 0.976808.
Iteration 1730: Policy loss: -0.003359. Value loss: 1.753003. Entropy: 1.048035.
Iteration 1731: Policy loss: -0.005910. Value loss: 1.618977. Entropy: 1.028544.
episode: 796   score: 140.0  epsilon: 1.0    steps: 315  evaluation reward: 210.3
episode: 797   score: 690.0  epsilon: 1.0    steps: 461  evaluation reward: 207.8
Training network. lr: 0.000240. clip: 0.096175
Iteration 1732: Policy loss: 0.005444. Value loss: 2.725043. Entropy: 1.170671.
Iteration 1733: Policy loss: -0.001272. Value loss:

Iteration 1791: Policy loss: -0.006247. Value loss: 1.416614. Entropy: 1.134154.
episode: 819   score: 575.0  epsilon: 1.0    steps: 131  evaluation reward: 221.35
episode: 820   score: 150.0  epsilon: 1.0    steps: 946  evaluation reward: 221.7
Training network. lr: 0.000240. clip: 0.096063
Iteration 1792: Policy loss: -0.001684. Value loss: 2.034823. Entropy: 1.204243.
Iteration 1793: Policy loss: -0.005903. Value loss: 1.153113. Entropy: 1.244640.
Iteration 1794: Policy loss: -0.008542. Value loss: 0.790852. Entropy: 1.247087.
episode: 821   score: 105.0  epsilon: 1.0    steps: 297  evaluation reward: 221.4
Training network. lr: 0.000240. clip: 0.096063
Iteration 1795: Policy loss: 0.004579. Value loss: 2.283953. Entropy: 1.223547.
Iteration 1796: Policy loss: 0.016198. Value loss: 1.314171. Entropy: 1.258779.
Iteration 1797: Policy loss: -0.003136. Value loss: 1.034420. Entropy: 1.222753.
episode: 822   score: 410.0  epsilon: 1.0    steps: 884  evaluation reward: 221.45
Training ne

Iteration 1854: Policy loss: -0.012442. Value loss: 1.434865. Entropy: 1.343670.
episode: 845   score: 135.0  epsilon: 1.0    steps: 71  evaluation reward: 226.8
episode: 846   score: 265.0  epsilon: 1.0    steps: 398  evaluation reward: 227.35
Training network. lr: 0.000240. clip: 0.095838
Iteration 1855: Policy loss: 0.009255. Value loss: 2.127739. Entropy: 1.468367.
Iteration 1856: Policy loss: 0.004066. Value loss: 1.263652. Entropy: 1.458191.
Iteration 1857: Policy loss: 0.001055. Value loss: 1.004735. Entropy: 1.468062.
episode: 847   score: 210.0  epsilon: 1.0    steps: 535  evaluation reward: 229.5
Training network. lr: 0.000240. clip: 0.095838
Iteration 1858: Policy loss: 0.000475. Value loss: 1.911890. Entropy: 1.344682.
Iteration 1859: Policy loss: -0.004060. Value loss: 1.248076. Entropy: 1.366522.
Iteration 1860: Policy loss: -0.010593. Value loss: 1.046932. Entropy: 1.345988.
Training network. lr: 0.000240. clip: 0.095838
Iteration 1861: Policy loss: 0.012253. Value loss:

Iteration 1917: Policy loss: -0.004837. Value loss: 1.363789. Entropy: 1.270024.
episode: 871   score: 50.0  epsilon: 1.0    steps: 766  evaluation reward: 238.2
Training network. lr: 0.000239. clip: 0.095725
Iteration 1918: Policy loss: 0.008320. Value loss: 2.480316. Entropy: 1.120578.
Iteration 1919: Policy loss: 0.004845. Value loss: 1.393516. Entropy: 1.124023.
Iteration 1920: Policy loss: -0.003819. Value loss: 1.119747. Entropy: 1.131617.
episode: 872   score: 240.0  epsilon: 1.0    steps: 293  evaluation reward: 236.4
Training network. lr: 0.000239. clip: 0.095725
Iteration 1921: Policy loss: 0.004132. Value loss: 1.761928. Entropy: 1.128247.
Iteration 1922: Policy loss: -0.004104. Value loss: 1.175969. Entropy: 1.146863.
Iteration 1923: Policy loss: -0.003901. Value loss: 0.994654. Entropy: 1.160875.
episode: 873   score: 210.0  epsilon: 1.0    steps: 498  evaluation reward: 235.95
episode: 874   score: 155.0  epsilon: 1.0    steps: 770  evaluation reward: 231.8
Training netwo

Iteration 1979: Policy loss: 0.000415. Value loss: 1.668363. Entropy: 1.300866.
Iteration 1980: Policy loss: -0.005431. Value loss: 1.169802. Entropy: 1.311256.
episode: 899   score: 135.0  epsilon: 1.0    steps: 489  evaluation reward: 233.9
Training network. lr: 0.000239. clip: 0.095613
Iteration 1981: Policy loss: 0.004186. Value loss: 2.875654. Entropy: 1.345997.
Iteration 1982: Policy loss: -0.004967. Value loss: 1.775605. Entropy: 1.343846.
Iteration 1983: Policy loss: -0.004893. Value loss: 1.464714. Entropy: 1.360000.
Training network. lr: 0.000239. clip: 0.095613
Iteration 1984: Policy loss: 0.003918. Value loss: 2.644335. Entropy: 1.355738.
Iteration 1985: Policy loss: 0.003419. Value loss: 1.472326. Entropy: 1.348969.
Iteration 1986: Policy loss: 0.002908. Value loss: 1.051171. Entropy: 1.359463.
episode: 900   score: 210.0  epsilon: 1.0    steps: 177  evaluation reward: 229.85
now time :  2019-02-23 00:17:59.110030
episode: 901   score: 110.0  epsilon: 1.0    steps: 374  ev

Iteration 2038: Policy loss: 0.000978. Value loss: 2.629215. Entropy: 1.414518.
Iteration 2039: Policy loss: 0.002571. Value loss: 1.524729. Entropy: 1.414621.
Iteration 2040: Policy loss: -0.002085. Value loss: 1.406951. Entropy: 1.426052.
episode: 929   score: 210.0  epsilon: 1.0    steps: 650  evaluation reward: 202.35
Training network. lr: 0.000239. clip: 0.095500
Iteration 2041: Policy loss: 0.007840. Value loss: 2.353899. Entropy: 1.281548.
Iteration 2042: Policy loss: -0.005909. Value loss: 1.440749. Entropy: 1.305611.
Iteration 2043: Policy loss: -0.014467. Value loss: 1.114658. Entropy: 1.303363.
episode: 930   score: 185.0  epsilon: 1.0    steps: 990  evaluation reward: 199.95
Training network. lr: 0.000239. clip: 0.095500
Iteration 2044: Policy loss: 0.003510. Value loss: 2.507039. Entropy: 1.269452.
Iteration 2045: Policy loss: -0.001076. Value loss: 1.470780. Entropy: 1.264538.
Iteration 2046: Policy loss: -0.002609. Value loss: 1.256725. Entropy: 1.275928.
Training networ

Iteration 2102: Policy loss: 0.004697. Value loss: 0.960138. Entropy: 1.208620.
Iteration 2103: Policy loss: -0.012092. Value loss: 0.753158. Entropy: 1.201035.
episode: 954   score: 490.0  epsilon: 1.0    steps: 435  evaluation reward: 199.35
episode: 955   score: 460.0  epsilon: 1.0    steps: 737  evaluation reward: 201.65
Training network. lr: 0.000238. clip: 0.095275
Iteration 2104: Policy loss: 0.008623. Value loss: 2.953909. Entropy: 1.203744.
Iteration 2105: Policy loss: 0.005785. Value loss: 1.683019. Entropy: 1.224985.
Iteration 2106: Policy loss: -0.000956. Value loss: 1.489125. Entropy: 1.209877.
episode: 956   score: 210.0  epsilon: 1.0    steps: 514  evaluation reward: 202.8
Training network. lr: 0.000238. clip: 0.095275
Iteration 2107: Policy loss: -0.000024. Value loss: 2.385597. Entropy: 1.261530.
Iteration 2108: Policy loss: -0.000471. Value loss: 1.345854. Entropy: 1.252549.
Iteration 2109: Policy loss: 0.004725. Value loss: 1.156835. Entropy: 1.254603.
episode: 957  

Iteration 2166: Policy loss: -0.008034. Value loss: 1.147447. Entropy: 1.473727.
episode: 980   score: 240.0  epsilon: 1.0    steps: 235  evaluation reward: 219.35
episode: 981   score: 270.0  epsilon: 1.0    steps: 283  evaluation reward: 220.4
Training network. lr: 0.000238. clip: 0.095163
Iteration 2167: Policy loss: 0.007873. Value loss: 2.429178. Entropy: 1.398311.
Iteration 2168: Policy loss: -0.004106. Value loss: 1.247994. Entropy: 1.410175.
Iteration 2169: Policy loss: -0.003687. Value loss: 1.270589. Entropy: 1.401703.
episode: 982   score: 410.0  epsilon: 1.0    steps: 735  evaluation reward: 219.0
Training network. lr: 0.000238. clip: 0.095163
Iteration 2170: Policy loss: 0.005692. Value loss: 3.101099. Entropy: 1.366899.
Iteration 2171: Policy loss: 0.013192. Value loss: 1.778527. Entropy: 1.350847.
Iteration 2172: Policy loss: -0.000505. Value loss: 1.543411. Entropy: 1.362213.
episode: 983   score: 485.0  epsilon: 1.0    steps: 817  evaluation reward: 219.5
Training netw

Iteration 2231: Policy loss: 0.001417. Value loss: 1.760792. Entropy: 1.561530.
Iteration 2232: Policy loss: -0.006858. Value loss: 1.225378. Entropy: 1.565198.
episode: 1004   score: 365.0  epsilon: 1.0    steps: 281  evaluation reward: 225.4
episode: 1005   score: 120.0  epsilon: 1.0    steps: 473  evaluation reward: 228.2
episode: 1006   score: 280.0  epsilon: 1.0    steps: 602  evaluation reward: 227.65
episode: 1007   score: 655.0  epsilon: 1.0    steps: 824  evaluation reward: 227.05
Training network. lr: 0.000238. clip: 0.095050
Iteration 2233: Policy loss: 0.009735. Value loss: 2.152660. Entropy: 1.564250.
Iteration 2234: Policy loss: 0.001819. Value loss: 1.362972. Entropy: 1.546502.
Iteration 2235: Policy loss: -0.002961. Value loss: 1.134321. Entropy: 1.565813.
episode: 1008   score: 155.0  epsilon: 1.0    steps: 4  evaluation reward: 232.5
Training network. lr: 0.000238. clip: 0.095050
Iteration 2236: Policy loss: 0.011957. Value loss: 3.176293. Entropy: 1.545554.
Iteration

Iteration 2295: Policy loss: 0.007318. Value loss: 1.852140. Entropy: 1.567320.
episode: 1029   score: 240.0  epsilon: 1.0    steps: 42  evaluation reward: 250.85
Training network. lr: 0.000237. clip: 0.094938
Iteration 2296: Policy loss: 0.012784. Value loss: 1.295745. Entropy: 1.508181.
Iteration 2297: Policy loss: 0.006794. Value loss: 0.765292. Entropy: 1.513790.
Iteration 2298: Policy loss: 0.003341. Value loss: 0.646565. Entropy: 1.510506.
episode: 1030   score: 470.0  epsilon: 1.0    steps: 992  evaluation reward: 251.15
Training network. lr: 0.000237. clip: 0.094938
Iteration 2299: Policy loss: 0.008084. Value loss: 4.423546. Entropy: 1.496493.
Iteration 2300: Policy loss: 0.011884. Value loss: 3.944960. Entropy: 1.508143.
Iteration 2301: Policy loss: 0.013705. Value loss: 3.524084. Entropy: 1.519788.
episode: 1031   score: 210.0  epsilon: 1.0    steps: 470  evaluation reward: 254.0
Training network. lr: 0.000237. clip: 0.094825
Iteration 2302: Policy loss: 0.016897. Value loss

Training network. lr: 0.000237. clip: 0.094713
Iteration 2359: Policy loss: 0.006176. Value loss: 2.708296. Entropy: 1.457168.
Iteration 2360: Policy loss: 0.013449. Value loss: 1.427388. Entropy: 1.395797.
Iteration 2361: Policy loss: 0.007208. Value loss: 0.950861. Entropy: 1.437157.
Training network. lr: 0.000237. clip: 0.094713
Iteration 2362: Policy loss: 0.010166. Value loss: 1.966656. Entropy: 1.568460.
Iteration 2363: Policy loss: 0.005468. Value loss: 1.133264. Entropy: 1.582390.
Iteration 2364: Policy loss: 0.005714. Value loss: 0.926682. Entropy: 1.564989.
episode: 1054   score: 180.0  epsilon: 1.0    steps: 30  evaluation reward: 273.55
episode: 1055   score: 260.0  epsilon: 1.0    steps: 240  evaluation reward: 270.45
episode: 1056   score: 110.0  epsilon: 1.0    steps: 595  evaluation reward: 268.45
Training network. lr: 0.000237. clip: 0.094713
Iteration 2365: Policy loss: 0.000748. Value loss: 4.368702. Entropy: 1.582993.
Iteration 2366: Policy loss: 0.011118. Value los

Iteration 2423: Policy loss: 0.011244. Value loss: 1.192693. Entropy: 1.410981.
Iteration 2424: Policy loss: 0.001105. Value loss: 0.963249. Entropy: 1.427908.
Training network. lr: 0.000237. clip: 0.094600
Iteration 2425: Policy loss: 0.004495. Value loss: 2.161502. Entropy: 1.477724.
Iteration 2426: Policy loss: 0.000030. Value loss: 1.331204. Entropy: 1.494596.
Iteration 2427: Policy loss: -0.000933. Value loss: 1.036622. Entropy: 1.494519.
episode: 1079   score: 215.0  epsilon: 1.0    steps: 146  evaluation reward: 265.3
episode: 1080   score: 670.0  epsilon: 1.0    steps: 696  evaluation reward: 262.7
episode: 1081   score: 125.0  epsilon: 1.0    steps: 833  evaluation reward: 267.0
Training network. lr: 0.000237. clip: 0.094600
Iteration 2428: Policy loss: 0.014973. Value loss: 2.371556. Entropy: 1.465706.
Iteration 2429: Policy loss: 0.001593. Value loss: 1.608327. Entropy: 1.449148.
Iteration 2430: Policy loss: 0.002352. Value loss: 1.354711. Entropy: 1.451463.
episode: 1082   

Iteration 2483: Policy loss: 0.002661. Value loss: 1.662645. Entropy: 1.435373.
Iteration 2484: Policy loss: -0.003755. Value loss: 1.284365. Entropy: 1.424987.
episode: 1108   score: 35.0  epsilon: 1.0    steps: 283  evaluation reward: 252.75
episode: 1109   score: 135.0  epsilon: 1.0    steps: 487  evaluation reward: 251.55
episode: 1110   score: 185.0  epsilon: 1.0    steps: 560  evaluation reward: 249.75
episode: 1111   score: 150.0  epsilon: 1.0    steps: 986  evaluation reward: 249.0
Training network. lr: 0.000236. clip: 0.094488
Iteration 2485: Policy loss: 0.010956. Value loss: 2.137820. Entropy: 1.443183.
Iteration 2486: Policy loss: 0.007602. Value loss: 1.237928. Entropy: 1.463240.
Iteration 2487: Policy loss: 0.003001. Value loss: 1.086142. Entropy: 1.452171.
episode: 1112   score: 240.0  epsilon: 1.0    steps: 113  evaluation reward: 250.2
Training network. lr: 0.000236. clip: 0.094488
Iteration 2488: Policy loss: 0.015534. Value loss: 2.898962. Entropy: 1.509463.
Iteratio

Iteration 2544: Policy loss: -0.002987. Value loss: 1.339235. Entropy: 1.589587.
episode: 1137   score: 105.0  epsilon: 1.0    steps: 851  evaluation reward: 225.3
Training network. lr: 0.000236. clip: 0.094375
Iteration 2545: Policy loss: 0.015129. Value loss: 2.465254. Entropy: 1.560480.
Iteration 2546: Policy loss: 0.016905. Value loss: 1.289480. Entropy: 1.555240.
Iteration 2547: Policy loss: 0.004915. Value loss: 0.991749. Entropy: 1.558589.
episode: 1138   score: 215.0  epsilon: 1.0    steps: 204  evaluation reward: 223.45
episode: 1139   score: 125.0  epsilon: 1.0    steps: 720  evaluation reward: 224.0
Training network. lr: 0.000236. clip: 0.094375
Iteration 2548: Policy loss: 0.006420. Value loss: 1.610365. Entropy: 1.557593.
Iteration 2549: Policy loss: 0.002249. Value loss: 0.978165. Entropy: 1.554763.
Iteration 2550: Policy loss: 0.002345. Value loss: 0.810029. Entropy: 1.562878.
episode: 1140   score: 125.0  epsilon: 1.0    steps: 90  evaluation reward: 223.45
episode: 114

Training network. lr: 0.000235. clip: 0.094150
Iteration 2605: Policy loss: 0.005934. Value loss: 1.631590. Entropy: 1.410041.
Iteration 2606: Policy loss: 0.008940. Value loss: 1.033816. Entropy: 1.428730.
Iteration 2607: Policy loss: -0.003294. Value loss: 0.712925. Entropy: 1.447810.
episode: 1165   score: 115.0  epsilon: 1.0    steps: 173  evaluation reward: 200.9
episode: 1166   score: 135.0  epsilon: 1.0    steps: 471  evaluation reward: 198.4
episode: 1167   score: 85.0  epsilon: 1.0    steps: 653  evaluation reward: 196.45
episode: 1168   score: 75.0  epsilon: 1.0    steps: 942  evaluation reward: 195.2
Training network. lr: 0.000235. clip: 0.094150
Iteration 2608: Policy loss: 0.007530. Value loss: 1.807655. Entropy: 1.412366.
Iteration 2609: Policy loss: 0.011979. Value loss: 1.126470. Entropy: 1.399682.
Iteration 2610: Policy loss: 0.002196. Value loss: 0.862111. Entropy: 1.408176.
Training network. lr: 0.000235. clip: 0.094150
Iteration 2611: Policy loss: 0.007334. Value lo

Iteration 2665: Policy loss: 0.005662. Value loss: 3.709318. Entropy: 1.499527.
Iteration 2666: Policy loss: 0.006290. Value loss: 2.455227. Entropy: 1.492543.
Iteration 2667: Policy loss: -0.003863. Value loss: 1.877503. Entropy: 1.484287.
Training network. lr: 0.000235. clip: 0.094037
Iteration 2668: Policy loss: 0.005124. Value loss: 2.746043. Entropy: 1.520873.
Iteration 2669: Policy loss: 0.008332. Value loss: 1.782385. Entropy: 1.521561.
Iteration 2670: Policy loss: 0.002499. Value loss: 1.345136. Entropy: 1.513832.
episode: 1194   score: 165.0  epsilon: 1.0    steps: 402  evaluation reward: 187.1
episode: 1195   score: 210.0  epsilon: 1.0    steps: 993  evaluation reward: 182.55
Training network. lr: 0.000235. clip: 0.094037
Iteration 2671: Policy loss: 0.012219. Value loss: 2.750463. Entropy: 1.526155.
Iteration 2672: Policy loss: -0.003263. Value loss: 1.720133. Entropy: 1.535788.
Iteration 2673: Policy loss: -0.007649. Value loss: 1.311654. Entropy: 1.525502.
Training network

Iteration 2727: Policy loss: 0.002009. Value loss: 1.460117. Entropy: 1.507759.
Training network. lr: 0.000235. clip: 0.093925
Iteration 2728: Policy loss: 0.007545. Value loss: 2.523601. Entropy: 1.531272.
Iteration 2729: Policy loss: 0.000194. Value loss: 1.563240. Entropy: 1.523510.
Iteration 2730: Policy loss: -0.003574. Value loss: 1.265711. Entropy: 1.521547.
episode: 1221   score: 55.0  epsilon: 1.0    steps: 737  evaluation reward: 197.4
Training network. lr: 0.000235. clip: 0.093925
Iteration 2731: Policy loss: 0.015484. Value loss: 3.542590. Entropy: 1.496503.
Iteration 2732: Policy loss: 0.002203. Value loss: 1.962256. Entropy: 1.486191.
Iteration 2733: Policy loss: -0.002131. Value loss: 1.365656. Entropy: 1.494204.
episode: 1222   score: 185.0  epsilon: 1.0    steps: 832  evaluation reward: 197.0
Training network. lr: 0.000235. clip: 0.093925
Iteration 2734: Policy loss: 0.015212. Value loss: 2.599632. Entropy: 1.522958.
Iteration 2735: Policy loss: 0.006472. Value loss: 1

Iteration 2788: Policy loss: 0.003741. Value loss: 3.082844. Entropy: 1.487519.
Iteration 2789: Policy loss: 0.003851. Value loss: 1.800605. Entropy: 1.499710.
Iteration 2790: Policy loss: -0.003233. Value loss: 1.349037. Entropy: 1.505806.
episode: 1249   score: 180.0  epsilon: 1.0    steps: 792  evaluation reward: 202.1
episode: 1250   score: 265.0  epsilon: 1.0    steps: 1000  evaluation reward: 201.4
Training network. lr: 0.000235. clip: 0.093813
Iteration 2791: Policy loss: 0.005086. Value loss: 5.492764. Entropy: 1.440900.
Iteration 2792: Policy loss: 0.008363. Value loss: 4.921618. Entropy: 1.451609.
Iteration 2793: Policy loss: -0.001091. Value loss: 4.102877. Entropy: 1.452845.
now time :  2019-02-23 00:35:15.898996
episode: 1251   score: 120.0  epsilon: 1.0    steps: 656  evaluation reward: 202.25
Training network. lr: 0.000235. clip: 0.093813
Iteration 2794: Policy loss: 0.008859. Value loss: 2.257245. Entropy: 1.456228.
Iteration 2795: Policy loss: 0.005062. Value loss: 1.4

episode: 1275   score: 80.0  epsilon: 1.0    steps: 597  evaluation reward: 213.15
episode: 1276   score: 80.0  epsilon: 1.0    steps: 735  evaluation reward: 212.35
Training network. lr: 0.000234. clip: 0.093588
Iteration 2851: Policy loss: 0.010690. Value loss: 2.651407. Entropy: 1.433710.
Iteration 2852: Policy loss: 0.009894. Value loss: 1.565111. Entropy: 1.436285.
Iteration 2853: Policy loss: -0.003523. Value loss: 1.313401. Entropy: 1.439983.
episode: 1277   score: 315.0  epsilon: 1.0    steps: 125  evaluation reward: 211.05
Training network. lr: 0.000234. clip: 0.093588
Iteration 2854: Policy loss: 0.001746. Value loss: 3.282470. Entropy: 1.428090.
Iteration 2855: Policy loss: 0.003484. Value loss: 1.854596. Entropy: 1.409173.
Iteration 2856: Policy loss: -0.007149. Value loss: 1.544922. Entropy: 1.428783.
episode: 1278   score: 170.0  epsilon: 1.0    steps: 192  evaluation reward: 211.5
episode: 1279   score: 100.0  epsilon: 1.0    steps: 318  evaluation reward: 211.85
episode

Iteration 2912: Policy loss: 0.003418. Value loss: 1.641904. Entropy: 1.326275.
Iteration 2913: Policy loss: -0.008003. Value loss: 1.384684. Entropy: 1.323840.
episode: 1303   score: 180.0  epsilon: 1.0    steps: 24  evaluation reward: 208.1
episode: 1304   score: 380.0  epsilon: 1.0    steps: 621  evaluation reward: 207.8
episode: 1305   score: 155.0  epsilon: 1.0    steps: 721  evaluation reward: 209.8
Training network. lr: 0.000234. clip: 0.093475
Iteration 2914: Policy loss: 0.000080. Value loss: 2.756772. Entropy: 1.338453.
Iteration 2915: Policy loss: -0.000848. Value loss: 1.662007. Entropy: 1.326204.
Iteration 2916: Policy loss: 0.005818. Value loss: 1.217444. Entropy: 1.307563.
episode: 1306   score: 155.0  epsilon: 1.0    steps: 856  evaluation reward: 210.75
Training network. lr: 0.000234. clip: 0.093475
Iteration 2917: Policy loss: 0.000729. Value loss: 2.984957. Entropy: 1.316149.
Iteration 2918: Policy loss: 0.002950. Value loss: 2.050493. Entropy: 1.302536.
Iteration 29

Iteration 2974: Policy loss: 0.008120. Value loss: 2.955250. Entropy: 1.485047.
Iteration 2975: Policy loss: 0.003379. Value loss: 1.847244. Entropy: 1.502828.
Iteration 2976: Policy loss: 0.006192. Value loss: 1.118001. Entropy: 1.494094.
episode: 1331   score: 240.0  epsilon: 1.0    steps: 185  evaluation reward: 212.95
Training network. lr: 0.000233. clip: 0.093363
Iteration 2977: Policy loss: 0.012286. Value loss: 3.132396. Entropy: 1.385739.
Iteration 2978: Policy loss: 0.007419. Value loss: 1.601222. Entropy: 1.396372.
Iteration 2979: Policy loss: -0.001696. Value loss: 1.318127. Entropy: 1.373874.
Training network. lr: 0.000233. clip: 0.093363
Iteration 2980: Policy loss: 0.015352. Value loss: 3.266585. Entropy: 1.503244.
Iteration 2981: Policy loss: 0.011675. Value loss: 1.798614. Entropy: 1.499269.
Iteration 2982: Policy loss: 0.015302. Value loss: 1.163435. Entropy: 1.497381.
episode: 1332   score: 185.0  epsilon: 1.0    steps: 313  evaluation reward: 212.85
episode: 1333   s

Iteration 3034: Policy loss: 0.007863. Value loss: 3.303807. Entropy: 1.460175.
Iteration 3035: Policy loss: 0.003943. Value loss: 2.026137. Entropy: 1.432988.
Iteration 3036: Policy loss: 0.007303. Value loss: 1.520715. Entropy: 1.451148.
Training network. lr: 0.000233. clip: 0.093250
Iteration 3037: Policy loss: 0.008128. Value loss: 2.864710. Entropy: 1.463444.
Iteration 3038: Policy loss: 0.009594. Value loss: 1.847948. Entropy: 1.460320.
Iteration 3039: Policy loss: 0.008325. Value loss: 1.415505. Entropy: 1.471029.
episode: 1360   score: 35.0  epsilon: 1.0    steps: 52  evaluation reward: 214.2
episode: 1361   score: 85.0  epsilon: 1.0    steps: 481  evaluation reward: 212.75
episode: 1362   score: 150.0  epsilon: 1.0    steps: 572  evaluation reward: 213.35
Training network. lr: 0.000233. clip: 0.093250
Iteration 3040: Policy loss: 0.009433. Value loss: 2.872358. Entropy: 1.487718.
Iteration 3041: Policy loss: 0.002712. Value loss: 1.607383. Entropy: 1.498931.
Iteration 3042: Po

Training network. lr: 0.000233. clip: 0.093137
Iteration 3094: Policy loss: 0.005501. Value loss: 3.039372. Entropy: 1.418977.
Iteration 3095: Policy loss: 0.001106. Value loss: 2.006057. Entropy: 1.417908.
Iteration 3096: Policy loss: -0.001042. Value loss: 1.466897. Entropy: 1.429868.
episode: 1390   score: 140.0  epsilon: 1.0    steps: 82  evaluation reward: 215.45
episode: 1391   score: 135.0  epsilon: 1.0    steps: 658  evaluation reward: 215.3
Training network. lr: 0.000233. clip: 0.093137
Iteration 3097: Policy loss: 0.011866. Value loss: 2.218108. Entropy: 1.410427.
Iteration 3098: Policy loss: 0.018265. Value loss: 1.526942. Entropy: 1.428735.
Iteration 3099: Policy loss: -0.000713. Value loss: 1.178370. Entropy: 1.415910.
episode: 1392   score: 135.0  epsilon: 1.0    steps: 201  evaluation reward: 213.85
episode: 1393   score: 120.0  epsilon: 1.0    steps: 867  evaluation reward: 212.45
Training network. lr: 0.000233. clip: 0.093137
Iteration 3100: Policy loss: 0.008644. Valu

Training network. lr: 0.000232. clip: 0.092913
Iteration 3154: Policy loss: -0.001405. Value loss: 2.935454. Entropy: 1.373171.
Iteration 3155: Policy loss: -0.006697. Value loss: 1.805392. Entropy: 1.404423.
Iteration 3156: Policy loss: -0.013657. Value loss: 1.521097. Entropy: 1.389302.
episode: 1419   score: 125.0  epsilon: 1.0    steps: 279  evaluation reward: 215.3
episode: 1420   score: 215.0  epsilon: 1.0    steps: 1020  evaluation reward: 213.45
Training network. lr: 0.000232. clip: 0.092913
Iteration 3157: Policy loss: 0.009611. Value loss: 3.514093. Entropy: 1.377923.
Iteration 3158: Policy loss: 0.018745. Value loss: 1.934533. Entropy: 1.394893.
Iteration 3159: Policy loss: 0.000952. Value loss: 1.463970. Entropy: 1.391498.
episode: 1421   score: 345.0  epsilon: 1.0    steps: 592  evaluation reward: 211.25
Training network. lr: 0.000232. clip: 0.092913
Iteration 3160: Policy loss: 0.001662. Value loss: 6.264034. Entropy: 1.403450.
Iteration 3161: Policy loss: 0.014748. Value

Iteration 3215: Policy loss: 0.001729. Value loss: 1.152480. Entropy: 1.444484.
Iteration 3216: Policy loss: 0.010243. Value loss: 0.934486. Entropy: 1.422295.
episode: 1447   score: 420.0  epsilon: 1.0    steps: 737  evaluation reward: 211.4
episode: 1448   score: 135.0  epsilon: 1.0    steps: 936  evaluation reward: 215.15
Training network. lr: 0.000232. clip: 0.092800
Iteration 3217: Policy loss: 0.006800. Value loss: 2.192226. Entropy: 1.392149.
Iteration 3218: Policy loss: 0.005146. Value loss: 1.240162. Entropy: 1.383515.
Iteration 3219: Policy loss: -0.003380. Value loss: 1.016288. Entropy: 1.387266.
Training network. lr: 0.000232. clip: 0.092800
Iteration 3220: Policy loss: 0.009881. Value loss: 2.600304. Entropy: 1.378146.
Iteration 3221: Policy loss: -0.002370. Value loss: 1.294927. Entropy: 1.370800.
Iteration 3222: Policy loss: -0.004864. Value loss: 1.021306. Entropy: 1.372228.
episode: 1449   score: 165.0  epsilon: 1.0    steps: 63  evaluation reward: 214.95
episode: 1450

Iteration 3275: Policy loss: 0.002552. Value loss: 1.232489. Entropy: 1.360181.
Iteration 3276: Policy loss: 0.000944. Value loss: 0.828502. Entropy: 1.366508.
episode: 1476   score: 565.0  epsilon: 1.0    steps: 232  evaluation reward: 206.2
episode: 1477   score: 210.0  epsilon: 1.0    steps: 568  evaluation reward: 209.3
episode: 1478   score: 410.0  epsilon: 1.0    steps: 982  evaluation reward: 208.5
Training network. lr: 0.000232. clip: 0.092688
Iteration 3277: Policy loss: 0.001468. Value loss: 1.920512. Entropy: 1.386193.
Iteration 3278: Policy loss: -0.001936. Value loss: 1.146397. Entropy: 1.354037.
Iteration 3279: Policy loss: -0.005604. Value loss: 0.883344. Entropy: 1.358507.
episode: 1479   score: 135.0  epsilon: 1.0    steps: 6  evaluation reward: 210.45
Training network. lr: 0.000232. clip: 0.092688
Iteration 3280: Policy loss: 0.007512. Value loss: 2.507179. Entropy: 1.353790.
Iteration 3281: Policy loss: 0.000660. Value loss: 1.598550. Entropy: 1.353898.
Iteration 328

Iteration 3334: Policy loss: 0.011268. Value loss: 3.192226. Entropy: 1.348390.
Iteration 3335: Policy loss: -0.000195. Value loss: 1.986734. Entropy: 1.357931.
Iteration 3336: Policy loss: -0.006100. Value loss: 1.624401. Entropy: 1.377931.
episode: 1506   score: 325.0  epsilon: 1.0    steps: 266  evaluation reward: 219.25
episode: 1507   score: 380.0  epsilon: 1.0    steps: 846  evaluation reward: 221.0
Training network. lr: 0.000231. clip: 0.092575
Iteration 3337: Policy loss: 0.008014. Value loss: 2.930840. Entropy: 1.173060.
Iteration 3338: Policy loss: 0.008137. Value loss: 2.200387. Entropy: 1.167670.
Iteration 3339: Policy loss: -0.002969. Value loss: 1.742261. Entropy: 1.158130.
episode: 1508   score: 120.0  epsilon: 1.0    steps: 121  evaluation reward: 222.95
Training network. lr: 0.000231. clip: 0.092575
Iteration 3340: Policy loss: 0.008772. Value loss: 3.837866. Entropy: 1.202471.
Iteration 3341: Policy loss: 0.000946. Value loss: 2.333531. Entropy: 1.202432.
Iteration 33

Iteration 3396: Policy loss: 0.001551. Value loss: 2.044028. Entropy: 1.309201.
episode: 1534   score: 420.0  epsilon: 1.0    steps: 396  evaluation reward: 221.9
episode: 1535   score: 660.0  epsilon: 1.0    steps: 516  evaluation reward: 221.7
Training network. lr: 0.000231. clip: 0.092463
Iteration 3397: Policy loss: 0.016435. Value loss: 2.852136. Entropy: 1.155234.
Iteration 3398: Policy loss: 0.009496. Value loss: 1.869780. Entropy: 1.152386.
Iteration 3399: Policy loss: -0.004647. Value loss: 1.603326. Entropy: 1.140023.
Training network. lr: 0.000231. clip: 0.092463
Iteration 3400: Policy loss: 0.007413. Value loss: 3.265234. Entropy: 1.181861.
Iteration 3401: Policy loss: 0.014922. Value loss: 2.225699. Entropy: 1.177315.
Iteration 3402: Policy loss: 0.007909. Value loss: 1.632401. Entropy: 1.160211.
episode: 1536   score: 155.0  epsilon: 1.0    steps: 11  evaluation reward: 225.9
Training network. lr: 0.000231. clip: 0.092350
Iteration 3403: Policy loss: 0.007216. Value loss:

Iteration 3458: Policy loss: -0.003865. Value loss: 2.007413. Entropy: 1.275929.
Iteration 3459: Policy loss: 0.005418. Value loss: 1.498126. Entropy: 1.297217.
episode: 1561   score: 315.0  epsilon: 1.0    steps: 621  evaluation reward: 231.9
Training network. lr: 0.000231. clip: 0.092238
Iteration 3460: Policy loss: 0.009425. Value loss: 2.616131. Entropy: 1.264971.
Iteration 3461: Policy loss: 0.004862. Value loss: 1.371476. Entropy: 1.281476.
Iteration 3462: Policy loss: 0.005700. Value loss: 0.954670. Entropy: 1.309944.
episode: 1562   score: 370.0  epsilon: 1.0    steps: 260  evaluation reward: 232.8
episode: 1563   score: 155.0  epsilon: 1.0    steps: 836  evaluation reward: 234.4
episode: 1564   score: 215.0  epsilon: 1.0    steps: 1011  evaluation reward: 233.85
Training network. lr: 0.000231. clip: 0.092238
Iteration 3463: Policy loss: 0.008302. Value loss: 3.199937. Entropy: 1.270785.
Iteration 3464: Policy loss: 0.009804. Value loss: 1.923706. Entropy: 1.248230.
Iteration 3

Iteration 3518: Policy loss: -0.003704. Value loss: 1.285096. Entropy: 1.075449.
Iteration 3519: Policy loss: -0.008637. Value loss: 0.972500. Entropy: 1.084729.
episode: 1591   score: 210.0  epsilon: 1.0    steps: 274  evaluation reward: 231.35
Training network. lr: 0.000230. clip: 0.092125
Iteration 3520: Policy loss: 0.005915. Value loss: 2.705053. Entropy: 1.218722.
Iteration 3521: Policy loss: 0.011169. Value loss: 1.643982. Entropy: 1.227439.
Iteration 3522: Policy loss: 0.002723. Value loss: 1.528250. Entropy: 1.216694.
episode: 1592   score: 150.0  epsilon: 1.0    steps: 490  evaluation reward: 232.45
episode: 1593   score: 185.0  epsilon: 1.0    steps: 552  evaluation reward: 230.3
episode: 1594   score: 215.0  epsilon: 1.0    steps: 641  evaluation reward: 231.25
Training network. lr: 0.000230. clip: 0.092125
Iteration 3523: Policy loss: 0.010873. Value loss: 2.128646. Entropy: 1.146846.
Iteration 3524: Policy loss: 0.008127. Value loss: 1.303770. Entropy: 1.134536.
Iteration

Iteration 3579: Policy loss: -0.001551. Value loss: 1.644286. Entropy: 1.206408.
episode: 1619   score: 315.0  epsilon: 1.0    steps: 360  evaluation reward: 232.65
Training network. lr: 0.000230. clip: 0.092013
Iteration 3580: Policy loss: 0.016221. Value loss: 2.594381. Entropy: 1.377005.
Iteration 3581: Policy loss: 0.009333. Value loss: 1.460541. Entropy: 1.377756.
Iteration 3582: Policy loss: -0.000446. Value loss: 1.013340. Entropy: 1.357085.
Training network. lr: 0.000230. clip: 0.092013
Iteration 3583: Policy loss: 0.017530. Value loss: 1.946184. Entropy: 1.450657.
Iteration 3584: Policy loss: 0.002327. Value loss: 1.263986. Entropy: 1.427415.
Iteration 3585: Policy loss: 0.004683. Value loss: 0.925223. Entropy: 1.440290.
episode: 1620   score: 210.0  epsilon: 1.0    steps: 218  evaluation reward: 234.65
episode: 1621   score: 210.0  epsilon: 1.0    steps: 443  evaluation reward: 232.55
Training network. lr: 0.000230. clip: 0.092013
Iteration 3586: Policy loss: 0.012306. Value 

Iteration 3640: Policy loss: 0.005229. Value loss: 2.199677. Entropy: 1.338293.
Iteration 3641: Policy loss: 0.005978. Value loss: 1.447094. Entropy: 1.348752.
Iteration 3642: Policy loss: 0.013701. Value loss: 1.307638. Entropy: 1.343315.
episode: 1647   score: 135.0  epsilon: 1.0    steps: 891  evaluation reward: 229.1
Training network. lr: 0.000230. clip: 0.091900
Iteration 3643: Policy loss: 0.013013. Value loss: 3.135396. Entropy: 1.529842.
Iteration 3644: Policy loss: 0.011139. Value loss: 1.690266. Entropy: 1.534835.
Iteration 3645: Policy loss: 0.021339. Value loss: 1.215125. Entropy: 1.517370.
Training network. lr: 0.000230. clip: 0.091900
Iteration 3646: Policy loss: 0.005251. Value loss: 1.908706. Entropy: 1.424863.
Iteration 3647: Policy loss: 0.005128. Value loss: 1.063612. Entropy: 1.434104.
Iteration 3648: Policy loss: 0.005414. Value loss: 0.828046. Entropy: 1.447064.
episode: 1648   score: 185.0  epsilon: 1.0    steps: 224  evaluation reward: 228.3
episode: 1649   scor

episode: 1673   score: 110.0  epsilon: 1.0    steps: 995  evaluation reward: 234.0
Training network. lr: 0.000229. clip: 0.091675
Iteration 3703: Policy loss: 0.007009. Value loss: 2.451199. Entropy: 1.195730.
Iteration 3704: Policy loss: 0.002068. Value loss: 1.433336. Entropy: 1.231457.
Iteration 3705: Policy loss: -0.001650. Value loss: 1.200655. Entropy: 1.219078.
episode: 1674   score: 140.0  epsilon: 1.0    steps: 125  evaluation reward: 233.2
episode: 1675   score: 285.0  epsilon: 1.0    steps: 265  evaluation reward: 233.9
Training network. lr: 0.000229. clip: 0.091675
Iteration 3706: Policy loss: 0.008519. Value loss: 3.189157. Entropy: 1.186244.
Iteration 3707: Policy loss: 0.007277. Value loss: 2.042819. Entropy: 1.156833.
Iteration 3708: Policy loss: -0.004745. Value loss: 1.491022. Entropy: 1.165046.
episode: 1676   score: 630.0  epsilon: 1.0    steps: 219  evaluation reward: 235.95
Training network. lr: 0.000229. clip: 0.091675
Iteration 3709: Policy loss: 0.001539. Value

Iteration 3765: Policy loss: -0.004236. Value loss: 1.085862. Entropy: 1.372226.
episode: 1700   score: 210.0  epsilon: 1.0    steps: 19  evaluation reward: 249.15
now time :  2019-02-23 00:55:39.462550
episode: 1701   score: 390.0  epsilon: 1.0    steps: 602  evaluation reward: 249.15
episode: 1702   score: 210.0  epsilon: 1.0    steps: 648  evaluation reward: 249.25
Training network. lr: 0.000229. clip: 0.091563
Iteration 3766: Policy loss: 0.011217. Value loss: 2.580617. Entropy: 1.302680.
Iteration 3767: Policy loss: 0.004816. Value loss: 1.608381. Entropy: 1.294457.
Iteration 3768: Policy loss: -0.005333. Value loss: 1.274978. Entropy: 1.311985.
episode: 1703   score: 395.0  epsilon: 1.0    steps: 211  evaluation reward: 249.55
Training network. lr: 0.000229. clip: 0.091563
Iteration 3769: Policy loss: 0.012995. Value loss: 6.090239. Entropy: 1.321496.
Iteration 3770: Policy loss: 0.004214. Value loss: 2.954487. Entropy: 1.312176.
Iteration 3771: Policy loss: 0.012048. Value loss:

Iteration 3823: Policy loss: 0.009991. Value loss: 2.467679. Entropy: 1.188292.
Iteration 3824: Policy loss: -0.001848. Value loss: 1.523827. Entropy: 1.194225.
Iteration 3825: Policy loss: -0.005475. Value loss: 1.258902. Entropy: 1.193664.
episode: 1731   score: 475.0  epsilon: 1.0    steps: 41  evaluation reward: 238.15
Training network. lr: 0.000229. clip: 0.091450
Iteration 3826: Policy loss: 0.002794. Value loss: 1.783502. Entropy: 1.170698.
Iteration 3827: Policy loss: 0.001419. Value loss: 1.218612. Entropy: 1.169253.
Iteration 3828: Policy loss: 0.003554. Value loss: 0.966930. Entropy: 1.168098.
episode: 1732   score: 265.0  epsilon: 1.0    steps: 227  evaluation reward: 239.85
episode: 1733   score: 120.0  epsilon: 1.0    steps: 646  evaluation reward: 240.85
Training network. lr: 0.000229. clip: 0.091450
Iteration 3829: Policy loss: 0.013684. Value loss: 3.226769. Entropy: 1.254879.
Iteration 3830: Policy loss: 0.008764. Value loss: 2.040068. Entropy: 1.264734.
Iteration 383

episode: 1761   score: 210.0  epsilon: 1.0    steps: 738  evaluation reward: 236.7
Training network. lr: 0.000228. clip: 0.091338
Iteration 3883: Policy loss: 0.010390. Value loss: 3.028478. Entropy: 1.184773.
Iteration 3884: Policy loss: 0.007458. Value loss: 1.735124. Entropy: 1.229438.
Iteration 3885: Policy loss: 0.001787. Value loss: 1.357232. Entropy: 1.219123.
episode: 1762   score: 240.0  epsilon: 1.0    steps: 834  evaluation reward: 235.65
Training network. lr: 0.000228. clip: 0.091338
Iteration 3886: Policy loss: 0.009891. Value loss: 2.754103. Entropy: 1.202856.
Iteration 3887: Policy loss: 0.013511. Value loss: 1.765420. Entropy: 1.205014.
Iteration 3888: Policy loss: 0.001010. Value loss: 1.401479. Entropy: 1.200330.
episode: 1763   score: 210.0  epsilon: 1.0    steps: 141  evaluation reward: 233.85
Training network. lr: 0.000228. clip: 0.091338
Iteration 3889: Policy loss: 0.012065. Value loss: 5.764894. Entropy: 1.301494.
Iteration 3890: Policy loss: 0.013367. Value los

Training network. lr: 0.000228. clip: 0.091225
Iteration 3943: Policy loss: 0.005512. Value loss: 3.129339. Entropy: 1.262725.
Iteration 3944: Policy loss: 0.000903. Value loss: 1.794294. Entropy: 1.246382.
Iteration 3945: Policy loss: -0.009663. Value loss: 1.249651. Entropy: 1.245043.
episode: 1791   score: 210.0  epsilon: 1.0    steps: 449  evaluation reward: 238.25
Training network. lr: 0.000228. clip: 0.091225
Iteration 3946: Policy loss: 0.005189. Value loss: 3.595232. Entropy: 1.205104.
Iteration 3947: Policy loss: 0.009649. Value loss: 2.072289. Entropy: 1.210017.
Iteration 3948: Policy loss: -0.001073. Value loss: 1.622948. Entropy: 1.212211.
episode: 1792   score: 260.0  epsilon: 1.0    steps: 45  evaluation reward: 237.45
episode: 1793   score: 90.0  epsilon: 1.0    steps: 699  evaluation reward: 237.25
Training network. lr: 0.000228. clip: 0.091225
Iteration 3949: Policy loss: 0.010614. Value loss: 3.032457. Entropy: 1.262478.
Iteration 3950: Policy loss: 0.007336. Value lo

episode: 1813   score: 135.0  epsilon: 1.0    steps: 597  evaluation reward: 244.25
Training network. lr: 0.000228. clip: 0.091000
Iteration 4009: Policy loss: 0.001189. Value loss: 5.738624. Entropy: 1.293106.
Iteration 4010: Policy loss: 0.008376. Value loss: 3.920928. Entropy: 1.266181.
Iteration 4011: Policy loss: -0.006675. Value loss: 3.837406. Entropy: 1.275977.
episode: 1814   score: 605.0  epsilon: 1.0    steps: 1011  evaluation reward: 243.95
Training network. lr: 0.000228. clip: 0.091000
Iteration 4012: Policy loss: 0.001610. Value loss: 1.189404. Entropy: 1.391449.
Iteration 4013: Policy loss: -0.005724. Value loss: 0.735096. Entropy: 1.409599.
Iteration 4014: Policy loss: -0.008376. Value loss: 0.576322. Entropy: 1.399355.
Training network. lr: 0.000228. clip: 0.091000
Iteration 4015: Policy loss: 0.004224. Value loss: 2.904218. Entropy: 1.302519.
Iteration 4016: Policy loss: 0.000702. Value loss: 1.562689. Entropy: 1.309546.
Iteration 4017: Policy loss: 0.003448. Value lo

Iteration 4070: Policy loss: 0.006755. Value loss: 1.463745. Entropy: 1.110402.
Iteration 4071: Policy loss: -0.001017. Value loss: 1.080627. Entropy: 1.127130.
episode: 1841   score: 180.0  epsilon: 1.0    steps: 796  evaluation reward: 258.2
Training network. lr: 0.000227. clip: 0.090888
Iteration 4072: Policy loss: 0.000953. Value loss: 2.366630. Entropy: 1.298791.
Iteration 4073: Policy loss: 0.003482. Value loss: 1.493868. Entropy: 1.292804.
Iteration 4074: Policy loss: 0.004584. Value loss: 1.274211. Entropy: 1.311635.
episode: 1842   score: 180.0  epsilon: 1.0    steps: 146  evaluation reward: 256.3
Training network. lr: 0.000227. clip: 0.090888
Iteration 4075: Policy loss: 0.006528. Value loss: 3.254434. Entropy: 1.348325.
Iteration 4076: Policy loss: 0.005965. Value loss: 1.939939. Entropy: 1.360644.
Iteration 4077: Policy loss: 0.001011. Value loss: 1.389178. Entropy: 1.362444.
episode: 1843   score: 340.0  epsilon: 1.0    steps: 32  evaluation reward: 256.8
episode: 1844   s

episode: 1868   score: 215.0  epsilon: 1.0    steps: 471  evaluation reward: 257.75
episode: 1869   score: 325.0  epsilon: 1.0    steps: 567  evaluation reward: 257.8
Training network. lr: 0.000227. clip: 0.090775
Iteration 4132: Policy loss: 0.007305. Value loss: 2.903881. Entropy: 1.368872.
Iteration 4133: Policy loss: 0.005620. Value loss: 1.740875. Entropy: 1.344857.
Iteration 4134: Policy loss: -0.001320. Value loss: 1.340640. Entropy: 1.313121.
episode: 1870   score: 225.0  epsilon: 1.0    steps: 185  evaluation reward: 258.65
episode: 1871   score: 330.0  epsilon: 1.0    steps: 931  evaluation reward: 256.0
Training network. lr: 0.000227. clip: 0.090775
Iteration 4135: Policy loss: 0.009803. Value loss: 3.264563. Entropy: 1.231153.
Iteration 4136: Policy loss: 0.003660. Value loss: 2.116635. Entropy: 1.218791.
Iteration 4137: Policy loss: -0.005564. Value loss: 2.006723. Entropy: 1.228028.
Training network. lr: 0.000227. clip: 0.090775
Iteration 4138: Policy loss: 0.013317. Valu

Iteration 4191: Policy loss: 0.000166. Value loss: 1.239344. Entropy: 1.398200.
episode: 1899   score: 135.0  epsilon: 1.0    steps: 731  evaluation reward: 254.75
Training network. lr: 0.000227. clip: 0.090663
Iteration 4192: Policy loss: -0.000305. Value loss: 2.902998. Entropy: 1.380719.
Iteration 4193: Policy loss: 0.009193. Value loss: 1.475181. Entropy: 1.364991.
Iteration 4194: Policy loss: 0.011421. Value loss: 1.066687. Entropy: 1.353456.
episode: 1900   score: 245.0  epsilon: 1.0    steps: 253  evaluation reward: 250.45
now time :  2019-02-23 01:04:35.197457
episode: 1901   score: 100.0  epsilon: 1.0    steps: 319  evaluation reward: 250.65
episode: 1902   score: 155.0  epsilon: 1.0    steps: 417  evaluation reward: 250.15
episode: 1903   score: 210.0  epsilon: 1.0    steps: 795  evaluation reward: 246.6
Training network. lr: 0.000227. clip: 0.090663
Iteration 4195: Policy loss: 0.003969. Value loss: 2.810643. Entropy: 1.204004.
Iteration 4196: Policy loss: 0.000818. Value lo

episode: 1927   score: 240.0  epsilon: 1.0    steps: 840  evaluation reward: 225.75
Training network. lr: 0.000226. clip: 0.090438
Iteration 4252: Policy loss: -0.000309. Value loss: 2.670228. Entropy: 1.319795.
Iteration 4253: Policy loss: 0.007875. Value loss: 1.551692. Entropy: 1.325124.
Iteration 4254: Policy loss: -0.002358. Value loss: 1.162255. Entropy: 1.311308.
episode: 1928   score: 315.0  epsilon: 1.0    steps: 201  evaluation reward: 226.85
episode: 1929   score: 210.0  epsilon: 1.0    steps: 427  evaluation reward: 227.75
episode: 1930   score: 120.0  epsilon: 1.0    steps: 609  evaluation reward: 225.2
Training network. lr: 0.000226. clip: 0.090438
Iteration 4255: Policy loss: 0.008301. Value loss: 2.495831. Entropy: 1.362217.
Iteration 4256: Policy loss: -0.001212. Value loss: 1.795271. Entropy: 1.326276.
Iteration 4257: Policy loss: -0.011735. Value loss: 1.390060. Entropy: 1.346755.
Training network. lr: 0.000226. clip: 0.090438
Iteration 4258: Policy loss: 0.012776. V

Training network. lr: 0.000226. clip: 0.090325
Iteration 4315: Policy loss: 0.017344. Value loss: 3.239854. Entropy: 1.441289.
Iteration 4316: Policy loss: 0.016043. Value loss: 1.853100. Entropy: 1.430925.
Iteration 4317: Policy loss: 0.020545. Value loss: 1.743504. Entropy: 1.409014.
episode: 1953   score: 375.0  epsilon: 1.0    steps: 281  evaluation reward: 227.9
episode: 1954   score: 320.0  epsilon: 1.0    steps: 426  evaluation reward: 229.05
episode: 1955   score: 370.0  epsilon: 1.0    steps: 858  evaluation reward: 229.2
Training network. lr: 0.000226. clip: 0.090325
Iteration 4318: Policy loss: 0.015413. Value loss: 3.468445. Entropy: 1.414644.
Iteration 4319: Policy loss: 0.010836. Value loss: 2.272132. Entropy: 1.400698.
Iteration 4320: Policy loss: 0.009326. Value loss: 1.624943. Entropy: 1.384801.
episode: 1956   score: 330.0  epsilon: 1.0    steps: 34  evaluation reward: 230.8
episode: 1957   score: 510.0  epsilon: 1.0    steps: 921  evaluation reward: 232.9
Training ne

Training network. lr: 0.000226. clip: 0.090213
Iteration 4375: Policy loss: 0.009544. Value loss: 3.020653. Entropy: 1.309238.
Iteration 4376: Policy loss: 0.007858. Value loss: 1.641815. Entropy: 1.314529.
Iteration 4377: Policy loss: -0.000081. Value loss: 1.245968. Entropy: 1.322045.
episode: 1983   score: 120.0  epsilon: 1.0    steps: 345  evaluation reward: 231.55
episode: 1984   score: 210.0  epsilon: 1.0    steps: 445  evaluation reward: 231.2
Training network. lr: 0.000226. clip: 0.090213
Iteration 4378: Policy loss: 0.005501. Value loss: 2.114759. Entropy: 1.262799.
Iteration 4379: Policy loss: 0.004243. Value loss: 1.291757. Entropy: 1.258447.
Iteration 4380: Policy loss: -0.004181. Value loss: 1.023848. Entropy: 1.259746.
episode: 1985   score: 180.0  epsilon: 1.0    steps: 90  evaluation reward: 231.8
episode: 1986   score: 260.0  epsilon: 1.0    steps: 606  evaluation reward: 229.95
episode: 1987   score: 95.0  epsilon: 1.0    steps: 739  evaluation reward: 230.75
episode:

episode: 2012   score: 210.0  epsilon: 1.0    steps: 685  evaluation reward: 232.85
episode: 2013   score: 150.0  epsilon: 1.0    steps: 822  evaluation reward: 233.9
Training network. lr: 0.000225. clip: 0.090100
Iteration 4435: Policy loss: 0.003412. Value loss: 2.081026. Entropy: 1.244618.
Iteration 4436: Policy loss: 0.002750. Value loss: 1.539113. Entropy: 1.228249.
Iteration 4437: Policy loss: -0.002998. Value loss: 1.141111. Entropy: 1.255001.
episode: 2014   score: 65.0  epsilon: 1.0    steps: 279  evaluation reward: 233.55
Training network. lr: 0.000225. clip: 0.090100
Iteration 4438: Policy loss: 0.015488. Value loss: 2.819018. Entropy: 1.328998.
Iteration 4439: Policy loss: 0.018623. Value loss: 1.831178. Entropy: 1.349255.
Iteration 4440: Policy loss: 0.013485. Value loss: 1.466059. Entropy: 1.330234.
episode: 2015   score: 155.0  epsilon: 1.0    steps: 435  evaluation reward: 232.85
episode: 2016   score: 100.0  epsilon: 1.0    steps: 566  evaluation reward: 229.7
episode:

Iteration 4496: Policy loss: -0.000671. Value loss: 1.238623. Entropy: 1.334004.
Iteration 4497: Policy loss: 0.000134. Value loss: 1.212993. Entropy: 1.348865.
episode: 2040   score: 410.0  epsilon: 1.0    steps: 51  evaluation reward: 229.0
episode: 2041   score: 210.0  epsilon: 1.0    steps: 848  evaluation reward: 230.5
Training network. lr: 0.000225. clip: 0.089988
Iteration 4498: Policy loss: 0.003668. Value loss: 2.323880. Entropy: 1.310127.
Iteration 4499: Policy loss: 0.006832. Value loss: 1.411162. Entropy: 1.314561.
Iteration 4500: Policy loss: 0.010860. Value loss: 1.246713. Entropy: 1.319223.
Training network. lr: 0.000225. clip: 0.089875
Iteration 4501: Policy loss: 0.005118. Value loss: 3.660108. Entropy: 1.336510.
Iteration 4502: Policy loss: -0.000864. Value loss: 2.117166. Entropy: 1.337674.
Iteration 4503: Policy loss: -0.007132. Value loss: 1.632207. Entropy: 1.332355.
episode: 2042   score: 210.0  epsilon: 1.0    steps: 603  evaluation reward: 229.75
Training netwo

Iteration 4559: Policy loss: 0.005620. Value loss: 1.573749. Entropy: 1.405097.
Iteration 4560: Policy loss: -0.002768. Value loss: 1.157788. Entropy: 1.403670.
episode: 2066   score: 180.0  epsilon: 1.0    steps: 415  evaluation reward: 227.4
Training network. lr: 0.000224. clip: 0.089763
Iteration 4561: Policy loss: 0.010998. Value loss: 2.360318. Entropy: 1.476359.
Iteration 4562: Policy loss: 0.007018. Value loss: 1.573949. Entropy: 1.462886.
Iteration 4563: Policy loss: 0.002672. Value loss: 1.233550. Entropy: 1.476265.
episode: 2067   score: 210.0  epsilon: 1.0    steps: 590  evaluation reward: 227.1
Training network. lr: 0.000224. clip: 0.089763
Iteration 4564: Policy loss: 0.002054. Value loss: 3.214770. Entropy: 1.472024.
Iteration 4565: Policy loss: 0.009630. Value loss: 2.096840. Entropy: 1.462524.
Iteration 4566: Policy loss: 0.000511. Value loss: 1.705680. Entropy: 1.462087.
episode: 2068   score: 135.0  epsilon: 1.0    steps: 302  evaluation reward: 227.65
episode: 2069  

Iteration 4623: Policy loss: -0.007320. Value loss: 1.238412. Entropy: 1.333228.
Training network. lr: 0.000224. clip: 0.089650
Iteration 4624: Policy loss: 0.008534. Value loss: 3.014153. Entropy: 1.279094.
Iteration 4625: Policy loss: 0.008040. Value loss: 1.520900. Entropy: 1.286794.
Iteration 4626: Policy loss: 0.008941. Value loss: 1.017500. Entropy: 1.272141.
episode: 2091   score: 210.0  epsilon: 1.0    steps: 259  evaluation reward: 252.45
episode: 2092   score: 285.0  epsilon: 1.0    steps: 399  evaluation reward: 253.25
episode: 2093   score: 390.0  epsilon: 1.0    steps: 726  evaluation reward: 254.75
Training network. lr: 0.000224. clip: 0.089650
Iteration 4627: Policy loss: 0.012498. Value loss: 2.678126. Entropy: 1.197543.
Iteration 4628: Policy loss: 0.010111. Value loss: 1.747406. Entropy: 1.216544.
Iteration 4629: Policy loss: -0.002847. Value loss: 1.419169. Entropy: 1.217016.
episode: 2094   score: 285.0  epsilon: 1.0    steps: 187  evaluation reward: 256.4
episode: 

Iteration 4685: Policy loss: 0.000069. Value loss: 1.584239. Entropy: 1.325408.
Iteration 4686: Policy loss: -0.005088. Value loss: 1.209666. Entropy: 1.321477.
episode: 2118   score: 285.0  epsilon: 1.0    steps: 159  evaluation reward: 252.9
episode: 2119   score: 210.0  epsilon: 1.0    steps: 605  evaluation reward: 254.5
Training network. lr: 0.000224. clip: 0.089538
Iteration 4687: Policy loss: 0.006797. Value loss: 2.455755. Entropy: 1.345058.
Iteration 4688: Policy loss: -0.006958. Value loss: 1.348529. Entropy: 1.324414.
Iteration 4689: Policy loss: -0.006354. Value loss: 1.058922. Entropy: 1.328714.
episode: 2120   score: 260.0  epsilon: 1.0    steps: 900  evaluation reward: 255.8
Training network. lr: 0.000224. clip: 0.089538
Iteration 4690: Policy loss: 0.003713. Value loss: 2.389030. Entropy: 1.252606.
Iteration 4691: Policy loss: 0.006953. Value loss: 1.574880. Entropy: 1.256022.
Iteration 4692: Policy loss: 0.002355. Value loss: 1.347942. Entropy: 1.231811.
episode: 2121 

Iteration 4747: Policy loss: 0.007409. Value loss: 3.007355. Entropy: 1.413176.
Iteration 4748: Policy loss: 0.006207. Value loss: 1.692058. Entropy: 1.428662.
Iteration 4749: Policy loss: 0.001356. Value loss: 1.399520. Entropy: 1.404484.
Training network. lr: 0.000224. clip: 0.089425
Iteration 4750: Policy loss: 0.006105. Value loss: 3.113316. Entropy: 1.477534.
Iteration 4751: Policy loss: 0.004459. Value loss: 2.923242. Entropy: 1.495307.
Iteration 4752: Policy loss: -0.004112. Value loss: 2.650370. Entropy: 1.513623.
episode: 2145   score: 150.0  epsilon: 1.0    steps: 210  evaluation reward: 265.1
episode: 2146   score: 210.0  epsilon: 1.0    steps: 407  evaluation reward: 262.05
episode: 2147   score: 120.0  epsilon: 1.0    steps: 618  evaluation reward: 261.15
episode: 2148   score: 325.0  epsilon: 1.0    steps: 789  evaluation reward: 259.5
Training network. lr: 0.000223. clip: 0.089313
Iteration 4753: Policy loss: 0.003723. Value loss: 3.154935. Entropy: 1.349069.
Iteration 4

Iteration 4811: Policy loss: -0.003840. Value loss: 1.698244. Entropy: 1.295534.
Iteration 4812: Policy loss: -0.007021. Value loss: 1.302732. Entropy: 1.278190.
episode: 2170   score: 140.0  epsilon: 1.0    steps: 136  evaluation reward: 269.15
Training network. lr: 0.000223. clip: 0.089200
Iteration 4813: Policy loss: 0.009649. Value loss: 3.355712. Entropy: 1.274662.
Iteration 4814: Policy loss: -0.002308. Value loss: 2.747317. Entropy: 1.244726.
Iteration 4815: Policy loss: -0.002646. Value loss: 2.158174. Entropy: 1.262019.
episode: 2171   score: 135.0  epsilon: 1.0    steps: 622  evaluation reward: 268.15
Training network. lr: 0.000223. clip: 0.089200
Iteration 4816: Policy loss: 0.007676. Value loss: 2.374064. Entropy: 1.267843.
Iteration 4817: Policy loss: -0.000484. Value loss: 1.210296. Entropy: 1.289166.
Iteration 4818: Policy loss: 0.000143. Value loss: 0.932413. Entropy: 1.272633.
episode: 2172   score: 485.0  epsilon: 1.0    steps: 311  evaluation reward: 267.7
episode: 2

episode: 2198   score: 135.0  epsilon: 1.0    steps: 837  evaluation reward: 260.65
episode: 2199   score: 280.0  epsilon: 1.0    steps: 1023  evaluation reward: 259.9
Training network. lr: 0.000223. clip: 0.089088
Iteration 4873: Policy loss: 0.000398. Value loss: 3.122610. Entropy: 1.093032.
Iteration 4874: Policy loss: -0.006434. Value loss: 1.757870. Entropy: 1.067864.
Iteration 4875: Policy loss: -0.004690. Value loss: 1.369538. Entropy: 1.068364.
episode: 2200   score: 225.0  epsilon: 1.0    steps: 592  evaluation reward: 262.4
Training network. lr: 0.000223. clip: 0.089088
Iteration 4876: Policy loss: 0.005406. Value loss: 2.020825. Entropy: 1.068938.
Iteration 4877: Policy loss: 0.002288. Value loss: 1.295957. Entropy: 1.042733.
Iteration 4878: Policy loss: -0.004572. Value loss: 1.086534. Entropy: 1.037102.
Training network. lr: 0.000223. clip: 0.089088
Iteration 4879: Policy loss: 0.003978. Value loss: 2.189957. Entropy: 1.232139.
Iteration 4880: Policy loss: -0.002341. Value

episode: 2224   score: 180.0  epsilon: 1.0    steps: 516  evaluation reward: 275.1
Training network. lr: 0.000222. clip: 0.088975
Iteration 4936: Policy loss: 0.007934. Value loss: 2.354147. Entropy: 1.044820.
Iteration 4937: Policy loss: 0.002867. Value loss: 1.476547. Entropy: 1.029001.
Iteration 4938: Policy loss: -0.001260. Value loss: 1.108683. Entropy: 1.047294.
episode: 2225   score: 210.0  epsilon: 1.0    steps: 51  evaluation reward: 271.3
Training network. lr: 0.000222. clip: 0.088975
Iteration 4939: Policy loss: 0.000543. Value loss: 2.063927. Entropy: 1.175132.
Iteration 4940: Policy loss: -0.002931. Value loss: 1.292803. Entropy: 1.153091.
Iteration 4941: Policy loss: -0.011381. Value loss: 0.964636. Entropy: 1.180984.
Training network. lr: 0.000222. clip: 0.088975
Iteration 4942: Policy loss: 0.005229. Value loss: 2.622790. Entropy: 1.240730.
Iteration 4943: Policy loss: -0.000067. Value loss: 1.563565. Entropy: 1.249673.
Iteration 4944: Policy loss: 0.001416. Value loss:

episode: 2247   score: 345.0  epsilon: 1.0    steps: 679  evaluation reward: 270.8
Training network. lr: 0.000222. clip: 0.088750
Iteration 5002: Policy loss: -0.000697. Value loss: 2.368001. Entropy: 1.104037.
Iteration 5003: Policy loss: -0.000848. Value loss: 1.477916. Entropy: 1.141786.
Iteration 5004: Policy loss: -0.002777. Value loss: 1.311884. Entropy: 1.121091.
Training network. lr: 0.000222. clip: 0.088750
Iteration 5005: Policy loss: 0.012988. Value loss: 3.321256. Entropy: 1.202583.
Iteration 5006: Policy loss: 0.000605. Value loss: 2.244111. Entropy: 1.214117.
Iteration 5007: Policy loss: -0.001315. Value loss: 1.659052. Entropy: 1.219815.
episode: 2248   score: 75.0  epsilon: 1.0    steps: 58  evaluation reward: 273.05
Training network. lr: 0.000222. clip: 0.088750
Iteration 5008: Policy loss: 0.006149. Value loss: 2.428124. Entropy: 1.171834.
Iteration 5009: Policy loss: -0.001636. Value loss: 1.418910. Entropy: 1.163270.
Iteration 5010: Policy loss: -0.007055. Value los

Iteration 5065: Policy loss: 0.002023. Value loss: 3.220127. Entropy: 1.252829.
Iteration 5066: Policy loss: 0.001448. Value loss: 1.773671. Entropy: 1.252319.
Iteration 5067: Policy loss: -0.006011. Value loss: 1.409683. Entropy: 1.271341.
episode: 2272   score: 340.0  epsilon: 1.0    steps: 357  evaluation reward: 268.9
Training network. lr: 0.000222. clip: 0.088638
Iteration 5068: Policy loss: 0.007139. Value loss: 2.834099. Entropy: 1.264147.
Iteration 5069: Policy loss: 0.003868. Value loss: 1.671499. Entropy: 1.261293.
Iteration 5070: Policy loss: -0.005476. Value loss: 1.425729. Entropy: 1.272127.
episode: 2273   score: 215.0  epsilon: 1.0    steps: 689  evaluation reward: 267.45
episode: 2274   score: 140.0  epsilon: 1.0    steps: 1010  evaluation reward: 263.6
Training network. lr: 0.000222. clip: 0.088638
Iteration 5071: Policy loss: 0.008402. Value loss: 2.694821. Entropy: 1.196989.
Iteration 5072: Policy loss: 0.000355. Value loss: 1.609946. Entropy: 1.211633.
Iteration 507

Iteration 5126: Policy loss: 0.004028. Value loss: 1.520397. Entropy: 1.245073.
Iteration 5127: Policy loss: 0.009152. Value loss: 1.094307. Entropy: 1.219765.
now time :  2019-02-23 01:24:12.591755
episode: 2301   score: 170.0  epsilon: 1.0    steps: 70  evaluation reward: 264.0
episode: 2302   score: 190.0  epsilon: 1.0    steps: 720  evaluation reward: 264.15
Training network. lr: 0.000221. clip: 0.088525
Iteration 5128: Policy loss: 0.005848. Value loss: 2.292782. Entropy: 1.134201.
Iteration 5129: Policy loss: 0.005788. Value loss: 1.296998. Entropy: 1.132533.
Iteration 5130: Policy loss: 0.002115. Value loss: 1.059059. Entropy: 1.138243.
Training network. lr: 0.000221. clip: 0.088525
Iteration 5131: Policy loss: -0.000097. Value loss: 2.921494. Entropy: 1.266632.
Iteration 5132: Policy loss: -0.000761. Value loss: 1.863122. Entropy: 1.282288.
Iteration 5133: Policy loss: -0.004279. Value loss: 1.413684. Entropy: 1.280557.
episode: 2303   score: 200.0  epsilon: 1.0    steps: 283  

episode: 2325   score: 440.0  epsilon: 1.0    steps: 172  evaluation reward: 265.75
Training network. lr: 0.000221. clip: 0.088413
Iteration 5191: Policy loss: 0.013862. Value loss: 3.429447. Entropy: 1.096481.
Iteration 5192: Policy loss: 0.004070. Value loss: 1.953692. Entropy: 1.098289.
Iteration 5193: Policy loss: 0.000681. Value loss: 1.403035. Entropy: 1.090140.
episode: 2326   score: 215.0  epsilon: 1.0    steps: 570  evaluation reward: 268.05
episode: 2327   score: 320.0  epsilon: 1.0    steps: 789  evaluation reward: 267.2
episode: 2328   score: 300.0  epsilon: 1.0    steps: 940  evaluation reward: 268.3
Training network. lr: 0.000221. clip: 0.088413
Iteration 5194: Policy loss: 0.002759. Value loss: 2.289747. Entropy: 0.896345.
Iteration 5195: Policy loss: -0.008180. Value loss: 1.437835. Entropy: 0.902862.
Iteration 5196: Policy loss: -0.007842. Value loss: 1.280793. Entropy: 0.910197.
Training network. lr: 0.000221. clip: 0.088413
Iteration 5197: Policy loss: 0.004181. Valu

Iteration 5250: Policy loss: 0.008914. Value loss: 4.261166. Entropy: 1.158099.
Training network. lr: 0.000220. clip: 0.088188
Iteration 5251: Policy loss: 0.003820. Value loss: 3.222728. Entropy: 1.305527.
Iteration 5252: Policy loss: 0.001727. Value loss: 1.794559. Entropy: 1.274307.
Iteration 5253: Policy loss: -0.005788. Value loss: 1.218963. Entropy: 1.281104.
episode: 2355   score: 320.0  epsilon: 1.0    steps: 78  evaluation reward: 235.5
episode: 2356   score: 195.0  epsilon: 1.0    steps: 241  evaluation reward: 234.9
Training network. lr: 0.000220. clip: 0.088188
Iteration 5254: Policy loss: 0.004750. Value loss: 3.473761. Entropy: 1.187872.
Iteration 5255: Policy loss: 0.006018. Value loss: 1.964596. Entropy: 1.160895.
Iteration 5256: Policy loss: -0.005315. Value loss: 1.310006. Entropy: 1.191877.
episode: 2357   score: 195.0  epsilon: 1.0    steps: 516  evaluation reward: 234.7
episode: 2358   score: 210.0  epsilon: 1.0    steps: 817  evaluation reward: 235.5
Training netw

Iteration 5310: Policy loss: 0.000616. Value loss: 1.521449. Entropy: 1.124074.
Training network. lr: 0.000220. clip: 0.088075
Iteration 5311: Policy loss: 0.009172. Value loss: 3.572651. Entropy: 1.215289.
Iteration 5312: Policy loss: 0.005199. Value loss: 1.843001. Entropy: 1.219172.
Iteration 5313: Policy loss: 0.000497. Value loss: 1.489407. Entropy: 1.202063.
episode: 2385   score: 95.0  epsilon: 1.0    steps: 670  evaluation reward: 242.35
Training network. lr: 0.000220. clip: 0.088075
Iteration 5314: Policy loss: 0.009756. Value loss: 4.903692. Entropy: 1.255990.
Iteration 5315: Policy loss: 0.005528. Value loss: 2.998360. Entropy: 1.286279.
Iteration 5316: Policy loss: 0.002331. Value loss: 3.669084. Entropy: 1.269376.
episode: 2386   score: 245.0  epsilon: 1.0    steps: 294  evaluation reward: 242.15
Training network. lr: 0.000220. clip: 0.088075
Iteration 5317: Policy loss: 0.015652. Value loss: 5.021692. Entropy: 1.201983.
Iteration 5318: Policy loss: 0.017055. Value loss: 4

Training network. lr: 0.000220. clip: 0.087962
Iteration 5371: Policy loss: 0.002368. Value loss: 2.539128. Entropy: 1.139545.
Iteration 5372: Policy loss: -0.004057. Value loss: 1.501648. Entropy: 1.125021.
Iteration 5373: Policy loss: -0.005169. Value loss: 1.180776. Entropy: 1.141641.
Training network. lr: 0.000220. clip: 0.087962
Iteration 5374: Policy loss: -0.000523. Value loss: 2.349355. Entropy: 1.222115.
Iteration 5375: Policy loss: 0.004458. Value loss: 1.229030. Entropy: 1.216449.
Iteration 5376: Policy loss: -0.004357. Value loss: 0.889148. Entropy: 1.240258.
episode: 2413   score: 110.0  epsilon: 1.0    steps: 230  evaluation reward: 248.45
Training network. lr: 0.000220. clip: 0.087962
Iteration 5377: Policy loss: 0.009852. Value loss: 3.494663. Entropy: 1.285039.
Iteration 5378: Policy loss: 0.008318. Value loss: 1.929269. Entropy: 1.296928.
Iteration 5379: Policy loss: -0.002065. Value loss: 1.390982. Entropy: 1.278392.
episode: 2414   score: 470.0  epsilon: 1.0    step

Training network. lr: 0.000220. clip: 0.087850
Iteration 5434: Policy loss: 0.002986. Value loss: 3.232425. Entropy: 1.205961.
Iteration 5435: Policy loss: 0.005166. Value loss: 1.754918. Entropy: 1.191456.
Iteration 5436: Policy loss: 0.009893. Value loss: 1.321035. Entropy: 1.185478.
episode: 2439   score: 155.0  epsilon: 1.0    steps: 107  evaluation reward: 241.7
episode: 2440   score: 300.0  epsilon: 1.0    steps: 335  evaluation reward: 242.05
episode: 2441   score: 115.0  epsilon: 1.0    steps: 780  evaluation reward: 242.15
episode: 2442   score: 90.0  epsilon: 1.0    steps: 938  evaluation reward: 242.5
Training network. lr: 0.000220. clip: 0.087850
Iteration 5437: Policy loss: 0.003415. Value loss: 2.192363. Entropy: 1.062026.
Iteration 5438: Policy loss: -0.004174. Value loss: 1.240248. Entropy: 1.029638.
Iteration 5439: Policy loss: -0.006684. Value loss: 1.248096. Entropy: 1.046053.
episode: 2443   score: 410.0  epsilon: 1.0    steps: 685  evaluation reward: 242.05
Trainin

Iteration 5492: Policy loss: 0.014109. Value loss: 1.865859. Entropy: 1.350574.
Iteration 5493: Policy loss: 0.000765. Value loss: 1.357966. Entropy: 1.356216.
Training network. lr: 0.000219. clip: 0.087738
Iteration 5494: Policy loss: 0.003552. Value loss: 2.280006. Entropy: 1.246395.
Iteration 5495: Policy loss: 0.005763. Value loss: 1.333802. Entropy: 1.228384.
Iteration 5496: Policy loss: -0.005648. Value loss: 1.053388. Entropy: 1.247912.
episode: 2470   score: 260.0  epsilon: 1.0    steps: 241  evaluation reward: 221.4
episode: 2471   score: 285.0  epsilon: 1.0    steps: 387  evaluation reward: 221.75
Training network. lr: 0.000219. clip: 0.087738
Iteration 5497: Policy loss: 0.016031. Value loss: 3.280153. Entropy: 1.375138.
Iteration 5498: Policy loss: -0.000264. Value loss: 1.598867. Entropy: 1.356452.
Iteration 5499: Policy loss: -0.000043. Value loss: 1.131207. Entropy: 1.358320.
episode: 2472   score: 345.0  epsilon: 1.0    steps: 659  evaluation reward: 222.35
Training net

Iteration 5554: Policy loss: 0.011141. Value loss: 2.330247. Entropy: 1.250095.
Iteration 5555: Policy loss: 0.002455. Value loss: 1.290867. Entropy: 1.251570.
Iteration 5556: Policy loss: -0.004471. Value loss: 0.972025. Entropy: 1.273177.
episode: 2497   score: 160.0  epsilon: 1.0    steps: 569  evaluation reward: 232.6
Training network. lr: 0.000219. clip: 0.087513
Iteration 5557: Policy loss: 0.012033. Value loss: 3.190976. Entropy: 1.284437.
Iteration 5558: Policy loss: 0.012356. Value loss: 1.950260. Entropy: 1.254518.
Iteration 5559: Policy loss: -0.000387. Value loss: 1.617575. Entropy: 1.257141.
episode: 2498   score: 505.0  epsilon: 1.0    steps: 103  evaluation reward: 231.05
episode: 2499   score: 215.0  epsilon: 1.0    steps: 1001  evaluation reward: 234.85
Training network. lr: 0.000219. clip: 0.087513
Iteration 5560: Policy loss: 0.003840. Value loss: 2.960632. Entropy: 1.240639.
Iteration 5561: Policy loss: 0.001603. Value loss: 1.746395. Entropy: 1.217598.
Iteration 55

Training network. lr: 0.000218. clip: 0.087400
Iteration 5617: Policy loss: 0.011863. Value loss: 2.609689. Entropy: 1.308557.
Iteration 5618: Policy loss: 0.001064. Value loss: 1.331369. Entropy: 1.305582.
Iteration 5619: Policy loss: -0.007349. Value loss: 1.078918. Entropy: 1.296892.
episode: 2523   score: 135.0  epsilon: 1.0    steps: 196  evaluation reward: 241.9
Training network. lr: 0.000218. clip: 0.087400
Iteration 5620: Policy loss: 0.004910. Value loss: 6.189209. Entropy: 1.382640.
Iteration 5621: Policy loss: 0.003291. Value loss: 4.039350. Entropy: 1.359055.
Iteration 5622: Policy loss: 0.005606. Value loss: 2.886950. Entropy: 1.393302.
episode: 2524   score: 390.0  epsilon: 1.0    steps: 321  evaluation reward: 242.45
episode: 2525   score: 210.0  epsilon: 1.0    steps: 956  evaluation reward: 243.3
Training network. lr: 0.000218. clip: 0.087400
Iteration 5623: Policy loss: 0.006297. Value loss: 1.783126. Entropy: 1.305521.
Iteration 5624: Policy loss: 0.002069. Value los

Training network. lr: 0.000218. clip: 0.087288
Iteration 5680: Policy loss: 0.001541. Value loss: 2.126493. Entropy: 1.182247.
Iteration 5681: Policy loss: -0.001829. Value loss: 1.207785. Entropy: 1.183473.
Iteration 5682: Policy loss: -0.001668. Value loss: 0.984537. Entropy: 1.171695.
episode: 2549   score: 65.0  epsilon: 1.0    steps: 234  evaluation reward: 251.45
episode: 2550   score: 65.0  epsilon: 1.0    steps: 303  evaluation reward: 250.9
Training network. lr: 0.000218. clip: 0.087288
Iteration 5683: Policy loss: 0.008563. Value loss: 2.341146. Entropy: 1.250939.
Iteration 5684: Policy loss: 0.000892. Value loss: 1.590800. Entropy: 1.257410.
Iteration 5685: Policy loss: -0.004311. Value loss: 1.107361. Entropy: 1.255154.
now time :  2019-02-23 01:35:52.412164
episode: 2551   score: 315.0  epsilon: 1.0    steps: 402  evaluation reward: 249.4
Training network. lr: 0.000218. clip: 0.087288
Iteration 5686: Policy loss: 0.008359. Value loss: 2.699213. Entropy: 1.426385.
Iteration

episode: 2572   score: 290.0  epsilon: 1.0    steps: 553  evaluation reward: 280.55
Training network. lr: 0.000218. clip: 0.087175
Iteration 5746: Policy loss: 0.017278. Value loss: 5.596743. Entropy: 1.260415.
Iteration 5747: Policy loss: 0.016391. Value loss: 4.066823. Entropy: 1.257024.
Iteration 5748: Policy loss: 0.022165. Value loss: 3.310501. Entropy: 1.248188.
episode: 2573   score: 535.0  epsilon: 1.0    steps: 394  evaluation reward: 280.0
Training network. lr: 0.000218. clip: 0.087175
Iteration 5749: Policy loss: 0.005852. Value loss: 3.508898. Entropy: 1.342862.
Iteration 5750: Policy loss: 0.009269. Value loss: 1.806159. Entropy: 1.327979.
Iteration 5751: Policy loss: -0.001511. Value loss: 2.486731. Entropy: 1.350273.
episode: 2574   score: 485.0  epsilon: 1.0    steps: 167  evaluation reward: 282.2
Training network. lr: 0.000218. clip: 0.087063
Iteration 5752: Policy loss: 0.012093. Value loss: 2.115027. Entropy: 1.396205.
Iteration 5753: Policy loss: 0.007534. Value los

Iteration 5811: Policy loss: 0.008367. Value loss: 3.015254. Entropy: 1.296462.
episode: 2596   score: 135.0  epsilon: 1.0    steps: 108  evaluation reward: 299.85
episode: 2597   score: 260.0  epsilon: 1.0    steps: 558  evaluation reward: 295.65
episode: 2598   score: 120.0  epsilon: 1.0    steps: 896  evaluation reward: 296.65
Training network. lr: 0.000217. clip: 0.086950
Iteration 5812: Policy loss: 0.006669. Value loss: 2.079350. Entropy: 1.435898.
Iteration 5813: Policy loss: 0.007374. Value loss: 1.257457. Entropy: 1.425520.
Iteration 5814: Policy loss: 0.001804. Value loss: 0.999057. Entropy: 1.427490.
episode: 2599   score: 410.0  epsilon: 1.0    steps: 391  evaluation reward: 292.8
Training network. lr: 0.000217. clip: 0.086950
Iteration 5815: Policy loss: 0.000497. Value loss: 2.150706. Entropy: 1.389065.
Iteration 5816: Policy loss: -0.005895. Value loss: 1.629554. Entropy: 1.395281.
Iteration 5817: Policy loss: -0.004686. Value loss: 1.251436. Entropy: 1.396124.
episode: 

episode: 2621   score: 260.0  epsilon: 1.0    steps: 688  evaluation reward: 295.7
Training network. lr: 0.000217. clip: 0.086837
Iteration 5875: Policy loss: 0.006261. Value loss: 3.208373. Entropy: 1.408499.
Iteration 5876: Policy loss: 0.010289. Value loss: 2.012433. Entropy: 1.388658.
Iteration 5877: Policy loss: -0.003530. Value loss: 1.633609. Entropy: 1.362842.
episode: 2622   score: 130.0  epsilon: 1.0    steps: 489  evaluation reward: 296.45
Training network. lr: 0.000217. clip: 0.086837
Iteration 5878: Policy loss: 0.003191. Value loss: 2.258358. Entropy: 1.395430.
Iteration 5879: Policy loss: 0.010790. Value loss: 1.046523. Entropy: 1.404545.
Iteration 5880: Policy loss: -0.001675. Value loss: 0.717361. Entropy: 1.404541.
episode: 2623   score: 90.0  epsilon: 1.0    steps: 243  evaluation reward: 296.4
Training network. lr: 0.000217. clip: 0.086837
Iteration 5881: Policy loss: 0.015369. Value loss: 2.405271. Entropy: 1.463561.
Iteration 5882: Policy loss: 0.001525. Value los

Training network. lr: 0.000217. clip: 0.086725
Iteration 5938: Policy loss: 0.006339. Value loss: 2.589515. Entropy: 1.295656.
Iteration 5939: Policy loss: 0.000487. Value loss: 1.510394. Entropy: 1.286235.
Iteration 5940: Policy loss: -0.005078. Value loss: 1.234398. Entropy: 1.287896.
episode: 2647   score: 210.0  epsilon: 1.0    steps: 94  evaluation reward: 309.95
episode: 2648   score: 155.0  epsilon: 1.0    steps: 384  evaluation reward: 308.15
Training network. lr: 0.000217. clip: 0.086725
Iteration 5941: Policy loss: 0.007764. Value loss: 3.102929. Entropy: 1.396525.
Iteration 5942: Policy loss: 0.002235. Value loss: 1.820980. Entropy: 1.390786.
Iteration 5943: Policy loss: 0.000070. Value loss: 1.290824. Entropy: 1.394055.
Training network. lr: 0.000217. clip: 0.086725
Iteration 5944: Policy loss: 0.007314. Value loss: 4.234629. Entropy: 1.364691.
Iteration 5945: Policy loss: 0.017541. Value loss: 3.351852. Entropy: 1.325172.
Iteration 5946: Policy loss: 0.010585. Value loss: 

Training network. lr: 0.000216. clip: 0.086500
Iteration 6001: Policy loss: 0.005756. Value loss: 2.919947. Entropy: 1.404244.
Iteration 6002: Policy loss: -0.001066. Value loss: 1.548092. Entropy: 1.404886.
Iteration 6003: Policy loss: -0.009760. Value loss: 1.163202. Entropy: 1.389487.
episode: 2673   score: 210.0  epsilon: 1.0    steps: 899  evaluation reward: 301.1
Training network. lr: 0.000216. clip: 0.086500
Iteration 6004: Policy loss: 0.009234. Value loss: 2.559340. Entropy: 1.431560.
Iteration 6005: Policy loss: 0.001247. Value loss: 1.341853. Entropy: 1.415241.
Iteration 6006: Policy loss: -0.001024. Value loss: 1.019298. Entropy: 1.434953.
episode: 2674   score: 285.0  epsilon: 1.0    steps: 488  evaluation reward: 297.85
Training network. lr: 0.000216. clip: 0.086500
Iteration 6007: Policy loss: 0.001338. Value loss: 2.718837. Entropy: 1.409578.
Iteration 6008: Policy loss: 0.003414. Value loss: 1.534197. Entropy: 1.415942.
Iteration 6009: Policy loss: 0.001057. Value loss

Iteration 6067: Policy loss: 0.009241. Value loss: 2.524060. Entropy: 1.438144.
Iteration 6068: Policy loss: 0.009235. Value loss: 1.352283. Entropy: 1.421941.
Iteration 6069: Policy loss: 0.004960. Value loss: 0.988030. Entropy: 1.425419.
episode: 2695   score: 420.0  epsilon: 1.0    steps: 633  evaluation reward: 286.8
Training network. lr: 0.000216. clip: 0.086388
Iteration 6070: Policy loss: 0.002497. Value loss: 2.691075. Entropy: 1.478526.
Iteration 6071: Policy loss: -0.000934. Value loss: 1.732308. Entropy: 1.473427.
Iteration 6072: Policy loss: -0.005084. Value loss: 1.344854. Entropy: 1.467951.
episode: 2696   score: 225.0  epsilon: 1.0    steps: 895  evaluation reward: 286.55
Training network. lr: 0.000216. clip: 0.086388
Iteration 6073: Policy loss: 0.005113. Value loss: 1.541386. Entropy: 1.442675.
Iteration 6074: Policy loss: -0.003426. Value loss: 1.069950. Entropy: 1.441488.
Iteration 6075: Policy loss: -0.006905. Value loss: 0.793230. Entropy: 1.441237.
Training networ

Iteration 6129: Policy loss: 0.003796. Value loss: 1.227425. Entropy: 1.385218.
Training network. lr: 0.000216. clip: 0.086275
Iteration 6130: Policy loss: 0.002609. Value loss: 3.042099. Entropy: 1.568485.
Iteration 6131: Policy loss: 0.011788. Value loss: 1.751122. Entropy: 1.565303.
Iteration 6132: Policy loss: -0.000833. Value loss: 1.251480. Entropy: 1.547536.
Training network. lr: 0.000216. clip: 0.086275
Iteration 6133: Policy loss: 0.008796. Value loss: 5.661649. Entropy: 1.579218.
Iteration 6134: Policy loss: 0.013798. Value loss: 4.271190. Entropy: 1.580011.
Iteration 6135: Policy loss: 0.005942. Value loss: 3.835044. Entropy: 1.559164.
episode: 2722   score: 315.0  epsilon: 1.0    steps: 165  evaluation reward: 276.95
episode: 2723   score: 135.0  epsilon: 1.0    steps: 870  evaluation reward: 278.8
episode: 2724   score: 410.0  epsilon: 1.0    steps: 1016  evaluation reward: 279.25
Training network. lr: 0.000216. clip: 0.086275
Iteration 6136: Policy loss: 0.010958. Value l

Iteration 6194: Policy loss: -0.000854. Value loss: 1.891459. Entropy: 1.306859.
Iteration 6195: Policy loss: -0.005740. Value loss: 1.526805. Entropy: 1.320793.
episode: 2746   score: 345.0  epsilon: 1.0    steps: 124  evaluation reward: 272.65
episode: 2747   score: 485.0  epsilon: 1.0    steps: 533  evaluation reward: 272.5
Training network. lr: 0.000215. clip: 0.086163
Iteration 6196: Policy loss: 0.010200. Value loss: 2.950514. Entropy: 1.313638.
Iteration 6197: Policy loss: 0.002406. Value loss: 1.773150. Entropy: 1.328688.
Iteration 6198: Policy loss: -0.005737. Value loss: 1.447461. Entropy: 1.330605.
episode: 2748   score: 360.0  epsilon: 1.0    steps: 234  evaluation reward: 275.25
Training network. lr: 0.000215. clip: 0.086163
Iteration 6199: Policy loss: 0.000346. Value loss: 2.112200. Entropy: 1.381814.
Iteration 6200: Policy loss: -0.000881. Value loss: 1.359396. Entropy: 1.353575.
Iteration 6201: Policy loss: -0.008554. Value loss: 1.009230. Entropy: 1.371294.
Training n

Training network. lr: 0.000215. clip: 0.085938
Iteration 6259: Policy loss: 0.006720. Value loss: 4.257643. Entropy: 1.335793.
Iteration 6260: Policy loss: 0.018604. Value loss: 2.604580. Entropy: 1.349861.
Iteration 6261: Policy loss: 0.009335. Value loss: 1.973125. Entropy: 1.358406.
Training network. lr: 0.000215. clip: 0.085938
Iteration 6262: Policy loss: 0.004391. Value loss: 2.440320. Entropy: 1.339971.
Iteration 6263: Policy loss: 0.000896. Value loss: 1.227417. Entropy: 1.360585.
Iteration 6264: Policy loss: 0.001516. Value loss: 0.889476. Entropy: 1.345518.
episode: 2770   score: 210.0  epsilon: 1.0    steps: 484  evaluation reward: 285.3
episode: 2771   score: 460.0  epsilon: 1.0    steps: 594  evaluation reward: 285.6
Training network. lr: 0.000215. clip: 0.085938
Iteration 6265: Policy loss: -0.000866. Value loss: 3.696299. Entropy: 1.323067.
Iteration 6266: Policy loss: 0.003157. Value loss: 2.250907. Entropy: 1.298769.
Iteration 6267: Policy loss: -0.009014. Value loss: 

Training network. lr: 0.000215. clip: 0.085825
Iteration 6325: Policy loss: 0.008649. Value loss: 3.726058. Entropy: 1.338440.
Iteration 6326: Policy loss: 0.021025. Value loss: 1.880349. Entropy: 1.341706.
Iteration 6327: Policy loss: 0.002492. Value loss: 1.533259. Entropy: 1.377623.
Training network. lr: 0.000215. clip: 0.085825
Iteration 6328: Policy loss: 0.005187. Value loss: 2.568507. Entropy: 1.555304.
Iteration 6329: Policy loss: -0.001822. Value loss: 1.589838. Entropy: 1.555902.
Iteration 6330: Policy loss: -0.002291. Value loss: 1.199215. Entropy: 1.557877.
episode: 2793   score: 155.0  epsilon: 1.0    steps: 115  evaluation reward: 301.4
episode: 2794   score: 485.0  epsilon: 1.0    steps: 173  evaluation reward: 299.0
episode: 2795   score: 210.0  epsilon: 1.0    steps: 352  evaluation reward: 301.05
episode: 2796   score: 365.0  epsilon: 1.0    steps: 547  evaluation reward: 298.95
Training network. lr: 0.000215. clip: 0.085825
Iteration 6331: Policy loss: 0.003206. Valu

Iteration 6388: Policy loss: 0.008710. Value loss: 1.845788. Entropy: 1.153875.
Iteration 6389: Policy loss: 0.001861. Value loss: 1.119701. Entropy: 1.126025.
Iteration 6390: Policy loss: -0.005719. Value loss: 0.768240. Entropy: 1.135052.
Training network. lr: 0.000214. clip: 0.085713
Iteration 6391: Policy loss: 0.011566. Value loss: 2.664640. Entropy: 1.308596.
Iteration 6392: Policy loss: -0.002062. Value loss: 1.648202. Entropy: 1.316536.
Iteration 6393: Policy loss: -0.005062. Value loss: 1.287205. Entropy: 1.311702.
episode: 2818   score: 540.0  epsilon: 1.0    steps: 490  evaluation reward: 306.2
Training network. lr: 0.000214. clip: 0.085713
Iteration 6394: Policy loss: -0.001648. Value loss: 2.071586. Entropy: 1.372460.
Iteration 6395: Policy loss: 0.007700. Value loss: 1.209140. Entropy: 1.356982.
Iteration 6396: Policy loss: -0.005619. Value loss: 0.985771. Entropy: 1.380391.
Training network. lr: 0.000214. clip: 0.085713
Iteration 6397: Policy loss: 0.002187. Value loss: 

Training network. lr: 0.000214. clip: 0.085488
Iteration 6454: Policy loss: 0.004371. Value loss: 3.016928. Entropy: 1.403718.
Iteration 6455: Policy loss: -0.001147. Value loss: 1.677009. Entropy: 1.381604.
Iteration 6456: Policy loss: -0.004753. Value loss: 1.117743. Entropy: 1.392526.
episode: 2841   score: 795.0  epsilon: 1.0    steps: 299  evaluation reward: 306.0
episode: 2842   score: 225.0  epsilon: 1.0    steps: 640  evaluation reward: 311.85
episode: 2843   score: 365.0  epsilon: 1.0    steps: 1003  evaluation reward: 310.2
Training network. lr: 0.000214. clip: 0.085488
Iteration 6457: Policy loss: 0.007911. Value loss: 2.710355. Entropy: 1.173307.
Iteration 6458: Policy loss: -0.001968. Value loss: 1.636088. Entropy: 1.219585.
Iteration 6459: Policy loss: -0.013529. Value loss: 1.341865. Entropy: 1.202689.
episode: 2844   score: 325.0  epsilon: 1.0    steps: 202  evaluation reward: 311.75
Training network. lr: 0.000214. clip: 0.085488
Iteration 6460: Policy loss: 0.013131. V

Iteration 6519: Policy loss: 0.005045. Value loss: 2.797117. Entropy: 1.277838.
episode: 2864   score: 495.0  epsilon: 1.0    steps: 599  evaluation reward: 294.8
Training network. lr: 0.000213. clip: 0.085375
Iteration 6520: Policy loss: 0.002079. Value loss: 2.094338. Entropy: 1.189883.
Iteration 6521: Policy loss: 0.001341. Value loss: 1.100592. Entropy: 1.235238.
Iteration 6522: Policy loss: -0.008652. Value loss: 0.852939. Entropy: 1.210010.
Training network. lr: 0.000213. clip: 0.085375
Iteration 6523: Policy loss: 0.005815. Value loss: 2.825773. Entropy: 1.209226.
Iteration 6524: Policy loss: 0.006565. Value loss: 1.316131. Entropy: 1.188377.
Iteration 6525: Policy loss: -0.005431. Value loss: 0.881814. Entropy: 1.228602.
episode: 2865   score: 265.0  epsilon: 1.0    steps: 75  evaluation reward: 297.95
episode: 2866   score: 325.0  epsilon: 1.0    steps: 310  evaluation reward: 297.9
episode: 2867   score: 285.0  epsilon: 1.0    steps: 882  evaluation reward: 298.3
episode: 286

Iteration 6581: Policy loss: 0.011273. Value loss: 1.863546. Entropy: 1.421566.
Iteration 6582: Policy loss: 0.000129. Value loss: 1.508044. Entropy: 1.438536.
Training network. lr: 0.000213. clip: 0.085263
Iteration 6583: Policy loss: 0.005232. Value loss: 2.880797. Entropy: 1.312678.
Iteration 6584: Policy loss: 0.000776. Value loss: 1.547206. Entropy: 1.286511.
Iteration 6585: Policy loss: -0.004051. Value loss: 1.122536. Entropy: 1.280562.
episode: 2891   score: 375.0  epsilon: 1.0    steps: 220  evaluation reward: 291.1
episode: 2892   score: 140.0  epsilon: 1.0    steps: 333  evaluation reward: 293.35
episode: 2893   score: 530.0  epsilon: 1.0    steps: 648  evaluation reward: 289.3
episode: 2894   score: 445.0  epsilon: 1.0    steps: 857  evaluation reward: 293.05
Training network. lr: 0.000213. clip: 0.085263
Iteration 6586: Policy loss: 0.009836. Value loss: 3.003761. Entropy: 1.377938.
Iteration 6587: Policy loss: 0.002615. Value loss: 1.745268. Entropy: 1.365162.
Iteration 6

episode: 2915   score: 270.0  epsilon: 1.0    steps: 881  evaluation reward: 295.4
episode: 2916   score: 135.0  epsilon: 1.0    steps: 980  evaluation reward: 294.0
Training network. lr: 0.000213. clip: 0.085150
Iteration 6646: Policy loss: 0.001977. Value loss: 1.701137. Entropy: 1.321018.
Iteration 6647: Policy loss: -0.001326. Value loss: 0.999401. Entropy: 1.301197.
Iteration 6648: Policy loss: 0.004637. Value loss: 0.807476. Entropy: 1.322597.
Training network. lr: 0.000213. clip: 0.085150
Iteration 6649: Policy loss: 0.007356. Value loss: 2.470392. Entropy: 1.280284.
Iteration 6650: Policy loss: -0.000437. Value loss: 1.256084. Entropy: 1.287094.
Iteration 6651: Policy loss: -0.002245. Value loss: 0.970953. Entropy: 1.297682.
episode: 2917   score: 395.0  epsilon: 1.0    steps: 203  evaluation reward: 293.25
Training network. lr: 0.000213. clip: 0.085038
Iteration 6652: Policy loss: 0.007548. Value loss: 2.714049. Entropy: 1.385129.
Iteration 6653: Policy loss: 0.005611. Value l

episode: 2941   score: 285.0  epsilon: 1.0    steps: 427  evaluation reward: 295.55
Training network. lr: 0.000212. clip: 0.084925
Iteration 6709: Policy loss: 0.005940. Value loss: 3.268577. Entropy: 1.305248.
Iteration 6710: Policy loss: 0.001588. Value loss: 1.690385. Entropy: 1.295164.
Iteration 6711: Policy loss: -0.002983. Value loss: 1.458263. Entropy: 1.307721.
Training network. lr: 0.000212. clip: 0.084925
Iteration 6712: Policy loss: 0.005629. Value loss: 2.679152. Entropy: 1.371468.
Iteration 6713: Policy loss: 0.005927. Value loss: 1.551892. Entropy: 1.371778.
Iteration 6714: Policy loss: -0.010789. Value loss: 1.052228. Entropy: 1.371019.
episode: 2942   score: 370.0  epsilon: 1.0    steps: 713  evaluation reward: 290.45
episode: 2943   score: 275.0  epsilon: 1.0    steps: 919  evaluation reward: 291.9
Training network. lr: 0.000212. clip: 0.084925
Iteration 6715: Policy loss: 0.002968. Value loss: 2.158331. Entropy: 1.324240.
Iteration 6716: Policy loss: 0.003025. Value l

Training network. lr: 0.000212. clip: 0.084813
Iteration 6775: Policy loss: 0.004427. Value loss: 5.665738. Entropy: 1.355101.
Iteration 6776: Policy loss: 0.009165. Value loss: 3.908630. Entropy: 1.367918.
Iteration 6777: Policy loss: 0.008614. Value loss: 2.898695. Entropy: 1.379816.
episode: 2963   score: 260.0  epsilon: 1.0    steps: 450  evaluation reward: 315.1
Training network. lr: 0.000212. clip: 0.084813
Iteration 6778: Policy loss: 0.002016. Value loss: 2.834504. Entropy: 1.246042.
Iteration 6779: Policy loss: 0.004907. Value loss: 1.668106. Entropy: 1.247645.
Iteration 6780: Policy loss: -0.000966. Value loss: 1.277496. Entropy: 1.240852.
episode: 2964   score: 210.0  epsilon: 1.0    steps: 72  evaluation reward: 314.0
episode: 2965   score: 725.0  epsilon: 1.0    steps: 338  evaluation reward: 311.15
Training network. lr: 0.000212. clip: 0.084813
Iteration 6781: Policy loss: 0.010059. Value loss: 2.589528. Entropy: 1.321839.
Iteration 6782: Policy loss: -0.001815. Value los

Training network. lr: 0.000212. clip: 0.084700
Iteration 6838: Policy loss: 0.007982. Value loss: 2.872544. Entropy: 1.347337.
Iteration 6839: Policy loss: 0.003652. Value loss: 1.808436. Entropy: 1.336062.
Iteration 6840: Policy loss: -0.004161. Value loss: 1.528507. Entropy: 1.342940.
episode: 2989   score: 370.0  epsilon: 1.0    steps: 138  evaluation reward: 301.85
episode: 2990   score: 410.0  epsilon: 1.0    steps: 887  evaluation reward: 304.35
Training network. lr: 0.000212. clip: 0.084700
Iteration 6841: Policy loss: 0.012379. Value loss: 3.394601. Entropy: 1.391171.
Iteration 6842: Policy loss: 0.002573. Value loss: 2.002517. Entropy: 1.396923.
Iteration 6843: Policy loss: -0.002462. Value loss: 1.593102. Entropy: 1.385796.
episode: 2991   score: 150.0  epsilon: 1.0    steps: 20  evaluation reward: 305.55
Training network. lr: 0.000212. clip: 0.084700
Iteration 6844: Policy loss: 0.005621. Value loss: 3.173027. Entropy: 1.328517.
Iteration 6845: Policy loss: -0.004923. Value 

episode: 3015   score: 320.0  epsilon: 1.0    steps: 942  evaluation reward: 285.6
Training network. lr: 0.000211. clip: 0.084475
Iteration 6901: Policy loss: 0.010393. Value loss: 2.590175. Entropy: 1.339165.
Iteration 6902: Policy loss: 0.003349. Value loss: 1.427494. Entropy: 1.337875.
Iteration 6903: Policy loss: 0.001083. Value loss: 1.081248. Entropy: 1.335110.
episode: 3016   score: 390.0  epsilon: 1.0    steps: 331  evaluation reward: 286.1
episode: 3017   score: 415.0  epsilon: 1.0    steps: 460  evaluation reward: 288.65
episode: 3018   score: 210.0  epsilon: 1.0    steps: 575  evaluation reward: 288.85
Training network. lr: 0.000211. clip: 0.084475
Iteration 6904: Policy loss: 0.004290. Value loss: 2.528795. Entropy: 1.181041.
Iteration 6905: Policy loss: -0.004919. Value loss: 1.694062. Entropy: 1.184754.
Iteration 6906: Policy loss: -0.007469. Value loss: 1.202025. Entropy: 1.183335.
Training network. lr: 0.000211. clip: 0.084475
Iteration 6907: Policy loss: 0.000398. Valu

episode: 3041   score: 360.0  epsilon: 1.0    steps: 280  evaluation reward: 284.8
Training network. lr: 0.000211. clip: 0.084363
Iteration 6964: Policy loss: 0.002481. Value loss: 3.585668. Entropy: 1.323960.
Iteration 6965: Policy loss: 0.009605. Value loss: 1.915509. Entropy: 1.302483.
Iteration 6966: Policy loss: 0.003390. Value loss: 1.557084. Entropy: 1.333738.
Training network. lr: 0.000211. clip: 0.084363
Iteration 6967: Policy loss: 0.005134. Value loss: 2.715939. Entropy: 1.398131.
Iteration 6968: Policy loss: -0.001139. Value loss: 1.229168. Entropy: 1.384247.
Iteration 6969: Policy loss: -0.006726. Value loss: 0.882395. Entropy: 1.396335.
Training network. lr: 0.000211. clip: 0.084363
Iteration 6970: Policy loss: 0.003436. Value loss: 2.244738. Entropy: 1.443066.
Iteration 6971: Policy loss: 0.000915. Value loss: 1.055517. Entropy: 1.447352.
Iteration 6972: Policy loss: -0.002992. Value loss: 0.809967. Entropy: 1.448033.
episode: 3042   score: 375.0  epsilon: 1.0    steps: 

Iteration 7028: Policy loss: 0.000590. Value loss: 3.808190. Entropy: 1.407003.
Iteration 7029: Policy loss: 0.007572. Value loss: 2.779046. Entropy: 1.416480.
episode: 3065   score: 425.0  epsilon: 1.0    steps: 518  evaluation reward: 277.3
Training network. lr: 0.000211. clip: 0.084250
Iteration 7030: Policy loss: 0.006015. Value loss: 2.664936. Entropy: 1.166912.
Iteration 7031: Policy loss: 0.005826. Value loss: 1.750047. Entropy: 1.193215.
Iteration 7032: Policy loss: 0.006549. Value loss: 1.252146. Entropy: 1.162433.
episode: 3066   score: 255.0  epsilon: 1.0    steps: 826  evaluation reward: 274.3
Training network. lr: 0.000211. clip: 0.084250
Iteration 7033: Policy loss: -0.000108. Value loss: 2.627402. Entropy: 1.237810.
Iteration 7034: Policy loss: 0.001273. Value loss: 1.596748. Entropy: 1.234887.
Iteration 7035: Policy loss: -0.005617. Value loss: 1.165434. Entropy: 1.246532.
episode: 3067   score: 300.0  epsilon: 1.0    steps: 244  evaluation reward: 273.65
Training netwo

Iteration 7092: Policy loss: -0.006551. Value loss: 1.251477. Entropy: 1.075865.
episode: 3090   score: 460.0  epsilon: 1.0    steps: 589  evaluation reward: 284.05
Training network. lr: 0.000210. clip: 0.084138
Iteration 7093: Policy loss: 0.004804. Value loss: 5.796342. Entropy: 1.198807.
Iteration 7094: Policy loss: 0.003795. Value loss: 4.084196. Entropy: 1.200020.
Iteration 7095: Policy loss: 0.005270. Value loss: 3.542264. Entropy: 1.188290.
Training network. lr: 0.000210. clip: 0.084138
Iteration 7096: Policy loss: 0.003037. Value loss: 3.076378. Entropy: 1.246971.
Iteration 7097: Policy loss: 0.001039. Value loss: 1.871025. Entropy: 1.251870.
Iteration 7098: Policy loss: -0.004889. Value loss: 1.282606. Entropy: 1.245386.
Training network. lr: 0.000210. clip: 0.084138
Iteration 7099: Policy loss: 0.004515. Value loss: 2.834673. Entropy: 1.353823.
Iteration 7100: Policy loss: 0.001395. Value loss: 1.560739. Entropy: 1.348145.
Iteration 7101: Policy loss: 0.003792. Value loss: 1.

Iteration 7155: Policy loss: -0.001949. Value loss: 1.239706. Entropy: 1.359263.
Training network. lr: 0.000210. clip: 0.083913
Iteration 7156: Policy loss: 0.013833. Value loss: 5.848490. Entropy: 1.354752.
Iteration 7157: Policy loss: 0.017349. Value loss: 4.055991. Entropy: 1.361863.
Iteration 7158: Policy loss: 0.014498. Value loss: 3.296015. Entropy: 1.360515.
episode: 3116   score: 210.0  epsilon: 1.0    steps: 276  evaluation reward: 295.0
episode: 3117   score: 315.0  epsilon: 1.0    steps: 601  evaluation reward: 293.2
Training network. lr: 0.000210. clip: 0.083913
Iteration 7159: Policy loss: 0.010079. Value loss: 2.536676. Entropy: 1.276017.
Iteration 7160: Policy loss: 0.004749. Value loss: 1.209211. Entropy: 1.268456.
Iteration 7161: Policy loss: -0.004021. Value loss: 0.995129. Entropy: 1.254489.
episode: 3118   score: 105.0  epsilon: 1.0    steps: 6  evaluation reward: 292.2
episode: 3119   score: 120.0  epsilon: 1.0    steps: 181  evaluation reward: 291.15
Training netw

episode: 3141   score: 250.0  epsilon: 1.0    steps: 315  evaluation reward: 298.0
episode: 3142   score: 290.0  epsilon: 1.0    steps: 812  evaluation reward: 296.9
episode: 3143   score: 670.0  epsilon: 1.0    steps: 936  evaluation reward: 296.05
Training network. lr: 0.000209. clip: 0.083800
Iteration 7219: Policy loss: 0.001943. Value loss: 2.777251. Entropy: 0.878849.
Iteration 7220: Policy loss: 0.000819. Value loss: 1.798722. Entropy: 0.882603.
Iteration 7221: Policy loss: -0.001350. Value loss: 1.538745. Entropy: 0.871466.
Training network. lr: 0.000209. clip: 0.083800
Iteration 7222: Policy loss: 0.005116. Value loss: 2.887113. Entropy: 1.171349.
Iteration 7223: Policy loss: 0.006990. Value loss: 2.069308. Entropy: 1.164046.
Iteration 7224: Policy loss: 0.004391. Value loss: 1.546387. Entropy: 1.139320.
Training network. lr: 0.000209. clip: 0.083800
Iteration 7225: Policy loss: 0.001663. Value loss: 1.770571. Entropy: 1.467509.
Iteration 7226: Policy loss: 0.002148. Value los

Training network. lr: 0.000209. clip: 0.083687
Iteration 7285: Policy loss: 0.004918. Value loss: 1.993458. Entropy: 1.195961.
Iteration 7286: Policy loss: -0.003095. Value loss: 1.442724. Entropy: 1.182637.
Iteration 7287: Policy loss: -0.005072. Value loss: 1.021756. Entropy: 1.194393.
episode: 3163   score: 350.0  epsilon: 1.0    steps: 42  evaluation reward: 301.1
episode: 3164   score: 120.0  epsilon: 1.0    steps: 522  evaluation reward: 300.45
Training network. lr: 0.000209. clip: 0.083687
Iteration 7288: Policy loss: 0.002042. Value loss: 2.340042. Entropy: 1.208889.
Iteration 7289: Policy loss: 0.000292. Value loss: 1.374797. Entropy: 1.233864.
Iteration 7290: Policy loss: -0.002525. Value loss: 0.975842. Entropy: 1.229919.
episode: 3165   score: 495.0  epsilon: 1.0    steps: 402  evaluation reward: 300.55
episode: 3166   score: 240.0  epsilon: 1.0    steps: 646  evaluation reward: 301.25
Training network. lr: 0.000209. clip: 0.083687
Iteration 7291: Policy loss: 0.009132. Val

Iteration 7349: Policy loss: 0.006959. Value loss: 1.407322. Entropy: 1.372903.
Iteration 7350: Policy loss: 0.004652. Value loss: 1.031020. Entropy: 1.364646.
Training network. lr: 0.000209. clip: 0.083463
Iteration 7351: Policy loss: 0.006728. Value loss: 3.698767. Entropy: 1.416482.
Iteration 7352: Policy loss: 0.002025. Value loss: 2.234360. Entropy: 1.407316.
Iteration 7353: Policy loss: 0.001120. Value loss: 1.728964. Entropy: 1.408466.
episode: 3188   score: 180.0  epsilon: 1.0    steps: 1001  evaluation reward: 301.25
Training network. lr: 0.000209. clip: 0.083463
Iteration 7354: Policy loss: 0.003001. Value loss: 3.704187. Entropy: 1.433197.
Iteration 7355: Policy loss: 0.006741. Value loss: 2.093110. Entropy: 1.414486.
Iteration 7356: Policy loss: 0.002219. Value loss: 1.684564. Entropy: 1.412471.
episode: 3189   score: 365.0  epsilon: 1.0    steps: 104  evaluation reward: 300.45
episode: 3190   score: 225.0  epsilon: 1.0    steps: 341  evaluation reward: 299.75
episode: 3191

episode: 3213   score: 460.0  epsilon: 1.0    steps: 1013  evaluation reward: 301.45
Training network. lr: 0.000208. clip: 0.083350
Iteration 7414: Policy loss: 0.006878. Value loss: 2.430119. Entropy: 1.317639.
Iteration 7415: Policy loss: 0.001858. Value loss: 1.342591. Entropy: 1.321692.
Iteration 7416: Policy loss: -0.003708. Value loss: 0.965987. Entropy: 1.334296.
episode: 3214   score: 225.0  epsilon: 1.0    steps: 698  evaluation reward: 303.2
Training network. lr: 0.000208. clip: 0.083350
Iteration 7417: Policy loss: 0.003776. Value loss: 2.791612. Entropy: 1.326647.
Iteration 7418: Policy loss: 0.000155. Value loss: 1.532510. Entropy: 1.325910.
Iteration 7419: Policy loss: -0.001910. Value loss: 1.340928. Entropy: 1.335466.
Training network. lr: 0.000208. clip: 0.083350
Iteration 7420: Policy loss: 0.000259. Value loss: 2.116234. Entropy: 1.390838.
Iteration 7421: Policy loss: 0.000301. Value loss: 1.391610. Entropy: 1.397130.
Iteration 7422: Policy loss: -0.005305. Value los

Training network. lr: 0.000208. clip: 0.083238
Iteration 7480: Policy loss: 0.002931. Value loss: 2.280295. Entropy: 1.456284.
Iteration 7481: Policy loss: 0.000356. Value loss: 1.214801. Entropy: 1.459788.
Iteration 7482: Policy loss: 0.000848. Value loss: 0.829293. Entropy: 1.454498.
Training network. lr: 0.000208. clip: 0.083238
Iteration 7483: Policy loss: 0.003059. Value loss: 5.752724. Entropy: 1.395316.
Iteration 7484: Policy loss: -0.000718. Value loss: 3.807881. Entropy: 1.399104.
Iteration 7485: Policy loss: 0.001459. Value loss: 3.056210. Entropy: 1.397352.
episode: 3236   score: 600.0  epsilon: 1.0    steps: 174  evaluation reward: 307.8
episode: 3237   score: 80.0  epsilon: 1.0    steps: 953  evaluation reward: 309.3
Training network. lr: 0.000208. clip: 0.083238
Iteration 7486: Policy loss: 0.018601. Value loss: 2.860121. Entropy: 1.296058.
Iteration 7487: Policy loss: 0.007576. Value loss: 1.652504. Entropy: 1.299020.
Iteration 7488: Policy loss: 0.007412. Value loss: 1.

Iteration 7544: Policy loss: 0.005592. Value loss: 3.071760. Entropy: 1.437467.
Iteration 7545: Policy loss: 0.010705. Value loss: 2.288568. Entropy: 1.430073.
Training network. lr: 0.000208. clip: 0.083125
Iteration 7546: Policy loss: 0.008963. Value loss: 3.772331. Entropy: 1.384176.
Iteration 7547: Policy loss: 0.001760. Value loss: 1.813240. Entropy: 1.380160.
Iteration 7548: Policy loss: -0.003826. Value loss: 1.385911. Entropy: 1.390337.
episode: 3260   score: 315.0  epsilon: 1.0    steps: 826  evaluation reward: 312.15
Training network. lr: 0.000208. clip: 0.083125
Iteration 7549: Policy loss: 0.008038. Value loss: 3.265210. Entropy: 1.381726.
Iteration 7550: Policy loss: 0.008964. Value loss: 2.039435. Entropy: 1.378225.
Iteration 7551: Policy loss: -0.004216. Value loss: 1.377516. Entropy: 1.372000.
episode: 3261   score: 535.0  epsilon: 1.0    steps: 30  evaluation reward: 310.45
episode: 3262   score: 155.0  epsilon: 1.0    steps: 596  evaluation reward: 313.7
Training netwo

episode: 3284   score: 520.0  epsilon: 1.0    steps: 653  evaluation reward: 318.95
Training network. lr: 0.000207. clip: 0.082900
Iteration 7609: Policy loss: 0.006800. Value loss: 5.468406. Entropy: 1.360888.
Iteration 7610: Policy loss: -0.004776. Value loss: 3.892198. Entropy: 1.375183.
Iteration 7611: Policy loss: -0.005644. Value loss: 3.693981. Entropy: 1.358516.
episode: 3285   score: 210.0  epsilon: 1.0    steps: 249  evaluation reward: 321.3
episode: 3286   score: 240.0  epsilon: 1.0    steps: 800  evaluation reward: 321.3
Training network. lr: 0.000207. clip: 0.082900
Iteration 7612: Policy loss: 0.006589. Value loss: 3.395904. Entropy: 1.321626.
Iteration 7613: Policy loss: 0.009074. Value loss: 1.912958. Entropy: 1.327552.
Iteration 7614: Policy loss: -0.002946. Value loss: 1.513743. Entropy: 1.317264.
episode: 3287   score: 335.0  epsilon: 1.0    steps: 333  evaluation reward: 321.9
episode: 3288   score: 155.0  epsilon: 1.0    steps: 480  evaluation reward: 321.8
episode

Training network. lr: 0.000207. clip: 0.082787
Iteration 7672: Policy loss: 0.004704. Value loss: 4.403938. Entropy: 1.365137.
Iteration 7673: Policy loss: 0.003557. Value loss: 3.431848. Entropy: 1.383062.
Iteration 7674: Policy loss: 0.005938. Value loss: 1.898331. Entropy: 1.391437.
episode: 3310   score: 365.0  epsilon: 1.0    steps: 965  evaluation reward: 324.35
Training network. lr: 0.000207. clip: 0.082787
Iteration 7675: Policy loss: 0.003847. Value loss: 3.039028. Entropy: 1.431592.
Iteration 7676: Policy loss: 0.007930. Value loss: 2.012223. Entropy: 1.420611.
Iteration 7677: Policy loss: -0.004239. Value loss: 1.680282. Entropy: 1.428025.
episode: 3311   score: 315.0  epsilon: 1.0    steps: 14  evaluation reward: 326.2
episode: 3312   score: 210.0  epsilon: 1.0    steps: 178  evaluation reward: 327.8
episode: 3313   score: 515.0  epsilon: 1.0    steps: 858  evaluation reward: 328.6
Training network. lr: 0.000207. clip: 0.082787
Iteration 7678: Policy loss: 0.008812. Value l

episode: 3336   score: 295.0  epsilon: 1.0    steps: 489  evaluation reward: 315.6
episode: 3337   score: 290.0  epsilon: 1.0    steps: 741  evaluation reward: 312.55
Training network. lr: 0.000207. clip: 0.082675
Iteration 7735: Policy loss: 0.006522. Value loss: 2.683542. Entropy: 1.455517.
Iteration 7736: Policy loss: 0.009194. Value loss: 1.393742. Entropy: 1.442012.
Iteration 7737: Policy loss: -0.005395. Value loss: 1.063455. Entropy: 1.465508.
episode: 3338   score: 280.0  epsilon: 1.0    steps: 146  evaluation reward: 314.65
episode: 3339   score: 75.0  epsilon: 1.0    steps: 537  evaluation reward: 311.85
Training network. lr: 0.000207. clip: 0.082675
Iteration 7738: Policy loss: 0.008738. Value loss: 2.737683. Entropy: 1.400052.
Iteration 7739: Policy loss: 0.005207. Value loss: 1.850994. Entropy: 1.390128.
Iteration 7740: Policy loss: -0.004591. Value loss: 1.459217. Entropy: 1.399360.
Training network. lr: 0.000207. clip: 0.082675
Iteration 7741: Policy loss: 0.006408. Valu

Training network. lr: 0.000206. clip: 0.082563
Iteration 7798: Policy loss: 0.011212. Value loss: 3.768214. Entropy: 1.385327.
Iteration 7799: Policy loss: 0.014288. Value loss: 2.077833. Entropy: 1.404622.
Iteration 7800: Policy loss: 0.009503. Value loss: 1.583819. Entropy: 1.393040.
episode: 3362   score: 215.0  epsilon: 1.0    steps: 608  evaluation reward: 296.95
episode: 3363   score: 260.0  epsilon: 1.0    steps: 663  evaluation reward: 297.55
episode: 3364   score: 240.0  epsilon: 1.0    steps: 845  evaluation reward: 295.0
Training network. lr: 0.000206. clip: 0.082450
Iteration 7801: Policy loss: 0.009191. Value loss: 3.535811. Entropy: 1.435435.
Iteration 7802: Policy loss: 0.005332. Value loss: 1.950554. Entropy: 1.423597.
Iteration 7803: Policy loss: -0.004795. Value loss: 1.503903. Entropy: 1.424308.
Training network. lr: 0.000206. clip: 0.082450
Iteration 7804: Policy loss: 0.003752. Value loss: 1.940885. Entropy: 1.483580.
Iteration 7805: Policy loss: 0.001416. Value lo

episode: 3385   score: 80.0  epsilon: 1.0    steps: 275  evaluation reward: 281.45
episode: 3386   score: 210.0  epsilon: 1.0    steps: 756  evaluation reward: 280.15
episode: 3387   score: 265.0  epsilon: 1.0    steps: 868  evaluation reward: 279.85
Training network. lr: 0.000206. clip: 0.082337
Iteration 7864: Policy loss: 0.008327. Value loss: 2.303613. Entropy: 1.430076.
Iteration 7865: Policy loss: 0.003579. Value loss: 1.531536. Entropy: 1.426645.
Iteration 7866: Policy loss: -0.003213. Value loss: 1.108698. Entropy: 1.432005.
episode: 3388   score: 240.0  epsilon: 1.0    steps: 72  evaluation reward: 279.15
episode: 3389   score: 210.0  epsilon: 1.0    steps: 205  evaluation reward: 280.0
Training network. lr: 0.000206. clip: 0.082337
Iteration 7867: Policy loss: 0.001312. Value loss: 5.340828. Entropy: 1.344934.
Iteration 7868: Policy loss: 0.001704. Value loss: 4.699861. Entropy: 1.341301.
Iteration 7869: Policy loss: 0.004325. Value loss: 3.808312. Entropy: 1.329920.
Training

Iteration 7925: Policy loss: 0.014630. Value loss: 4.205901. Entropy: 1.363845.
Iteration 7926: Policy loss: 0.017115. Value loss: 2.506802. Entropy: 1.354417.
episode: 3413   score: 125.0  epsilon: 1.0    steps: 315  evaluation reward: 264.0
episode: 3414   score: 500.0  epsilon: 1.0    steps: 978  evaluation reward: 260.1
Training network. lr: 0.000206. clip: 0.082225
Iteration 7927: Policy loss: 0.005514. Value loss: 4.077139. Entropy: 1.371261.
Iteration 7928: Policy loss: 0.011215. Value loss: 2.413090. Entropy: 1.348131.
Iteration 7929: Policy loss: -0.005434. Value loss: 1.724311. Entropy: 1.376033.
episode: 3415   score: 60.0  epsilon: 1.0    steps: 878  evaluation reward: 264.2
Training network. lr: 0.000206. clip: 0.082225
Iteration 7930: Policy loss: 0.010284. Value loss: 2.879030. Entropy: 1.369110.
Iteration 7931: Policy loss: 0.006868. Value loss: 1.655815. Entropy: 1.364159.
Iteration 7932: Policy loss: 0.004590. Value loss: 1.058017. Entropy: 1.354796.
episode: 3416   s

Iteration 7987: Policy loss: 0.009571. Value loss: 3.439450. Entropy: 1.499137.
Iteration 7988: Policy loss: 0.001327. Value loss: 1.736912. Entropy: 1.482947.
Iteration 7989: Policy loss: -0.001600. Value loss: 1.360290. Entropy: 1.483605.
Training network. lr: 0.000205. clip: 0.082113
Iteration 7990: Policy loss: 0.003653. Value loss: 3.284884. Entropy: 1.463939.
Iteration 7991: Policy loss: 0.004955. Value loss: 1.631144. Entropy: 1.461385.
Iteration 7992: Policy loss: 0.000929. Value loss: 1.182219. Entropy: 1.467669.
episode: 3440   score: 430.0  epsilon: 1.0    steps: 749  evaluation reward: 266.35
episode: 3441   score: 210.0  epsilon: 1.0    steps: 914  evaluation reward: 268.55
Training network. lr: 0.000205. clip: 0.082113
Iteration 7993: Policy loss: 0.006686. Value loss: 2.733370. Entropy: 1.389923.
Iteration 7994: Policy loss: -0.004460. Value loss: 1.604616. Entropy: 1.398753.
Iteration 7995: Policy loss: -0.004834. Value loss: 1.207408. Entropy: 1.388812.
episode: 3442  

Iteration 8052: Policy loss: -0.008885. Value loss: 1.074022. Entropy: 1.490007.
episode: 3464   score: 210.0  epsilon: 1.0    steps: 317  evaluation reward: 271.5
episode: 3465   score: 340.0  epsilon: 1.0    steps: 826  evaluation reward: 271.2
Training network. lr: 0.000205. clip: 0.081888
Iteration 8053: Policy loss: 0.010842. Value loss: 3.609673. Entropy: 1.431456.
Iteration 8054: Policy loss: 0.003885. Value loss: 2.082092. Entropy: 1.427674.
Iteration 8055: Policy loss: 0.000156. Value loss: 1.522796. Entropy: 1.437115.
Training network. lr: 0.000205. clip: 0.081888
Iteration 8056: Policy loss: 0.014163. Value loss: 2.541378. Entropy: 1.526568.
Iteration 8057: Policy loss: 0.003954. Value loss: 1.438588. Entropy: 1.514933.
Iteration 8058: Policy loss: 0.000994. Value loss: 1.100595. Entropy: 1.524296.
episode: 3466   score: 365.0  epsilon: 1.0    steps: 464  evaluation reward: 272.15
episode: 3467   score: 360.0  epsilon: 1.0    steps: 757  evaluation reward: 274.0
Training net

Iteration 8115: Policy loss: 0.001561. Value loss: 0.888754. Entropy: 1.511286.
episode: 3490   score: 250.0  epsilon: 1.0    steps: 334  evaluation reward: 284.05
Training network. lr: 0.000204. clip: 0.081775
Iteration 8116: Policy loss: 0.007923. Value loss: 2.997608. Entropy: 1.438943.
Iteration 8117: Policy loss: 0.000001. Value loss: 1.800434. Entropy: 1.429442.
Iteration 8118: Policy loss: -0.001855. Value loss: 1.439970. Entropy: 1.413942.
episode: 3491   score: 255.0  epsilon: 1.0    steps: 142  evaluation reward: 283.15
episode: 3492   score: 210.0  epsilon: 1.0    steps: 816  evaluation reward: 284.15
Training network. lr: 0.000204. clip: 0.081775
Iteration 8119: Policy loss: 0.002226. Value loss: 4.920060. Entropy: 1.250403.
Iteration 8120: Policy loss: 0.004660. Value loss: 3.105248. Entropy: 1.219135.
Iteration 8121: Policy loss: 0.015345. Value loss: 3.082291. Entropy: 1.216632.
episode: 3493   score: 435.0  epsilon: 1.0    steps: 712  evaluation reward: 284.7
episode: 3

Iteration 8178: Policy loss: 0.005987. Value loss: 2.818418. Entropy: 1.197511.
episode: 3516   score: 630.0  epsilon: 1.0    steps: 400  evaluation reward: 299.3
Training network. lr: 0.000204. clip: 0.081662
Iteration 8179: Policy loss: 0.003447. Value loss: 3.864943. Entropy: 1.326322.
Iteration 8180: Policy loss: 0.000682. Value loss: 2.563228. Entropy: 1.343173.
Iteration 8181: Policy loss: 0.005943. Value loss: 2.235056. Entropy: 1.341561.
Training network. lr: 0.000204. clip: 0.081662
Iteration 8182: Policy loss: 0.018487. Value loss: 5.761041. Entropy: 1.404817.
Iteration 8183: Policy loss: 0.018061. Value loss: 2.730345. Entropy: 1.414642.
Iteration 8184: Policy loss: 0.011693. Value loss: 2.082740. Entropy: 1.405213.
episode: 3517   score: 440.0  epsilon: 1.0    steps: 701  evaluation reward: 305.05
Training network. lr: 0.000204. clip: 0.081662
Iteration 8185: Policy loss: 0.001581. Value loss: 2.536235. Entropy: 1.459625.
Iteration 8186: Policy loss: 0.000982. Value loss: 1

Iteration 8243: Policy loss: 0.011117. Value loss: 1.494665. Entropy: 1.437923.
Iteration 8244: Policy loss: 0.012173. Value loss: 1.247983. Entropy: 1.458802.
episode: 3540   score: 365.0  epsilon: 1.0    steps: 492  evaluation reward: 318.45
Training network. lr: 0.000204. clip: 0.081550
Iteration 8245: Policy loss: 0.007566. Value loss: 4.268523. Entropy: 1.274585.
Iteration 8246: Policy loss: 0.006522. Value loss: 4.259900. Entropy: 1.267994.
Iteration 8247: Policy loss: 0.000597. Value loss: 2.683438. Entropy: 1.257236.
episode: 3541   score: 240.0  epsilon: 1.0    steps: 37  evaluation reward: 317.8
Training network. lr: 0.000204. clip: 0.081550
Iteration 8248: Policy loss: 0.001864. Value loss: 2.522326. Entropy: 1.286138.
Iteration 8249: Policy loss: 0.011927. Value loss: 1.417335. Entropy: 1.296481.
Iteration 8250: Policy loss: 0.002468. Value loss: 1.208375. Entropy: 1.286857.
episode: 3542   score: 210.0  epsilon: 1.0    steps: 208  evaluation reward: 318.1
episode: 3543   s

Iteration 8308: Policy loss: 0.004468. Value loss: 5.283802. Entropy: 1.284279.
Iteration 8309: Policy loss: 0.001131. Value loss: 3.898086. Entropy: 1.294133.
Iteration 8310: Policy loss: 0.003513. Value loss: 3.189138. Entropy: 1.290030.
episode: 3563   score: 515.0  epsilon: 1.0    steps: 58  evaluation reward: 316.3
episode: 3564   score: 830.0  epsilon: 1.0    steps: 931  evaluation reward: 319.6
Training network. lr: 0.000203. clip: 0.081325
Iteration 8311: Policy loss: 0.005016. Value loss: 3.575765. Entropy: 1.280411.
Iteration 8312: Policy loss: -0.002210. Value loss: 1.971679. Entropy: 1.270693.
Iteration 8313: Policy loss: -0.001875. Value loss: 2.151436. Entropy: 1.268552.
episode: 3565   score: 175.0  epsilon: 1.0    steps: 397  evaluation reward: 325.8
Training network. lr: 0.000203. clip: 0.081325
Iteration 8314: Policy loss: 0.009137. Value loss: 2.619035. Entropy: 1.248570.
Iteration 8315: Policy loss: 0.002503. Value loss: 1.707112. Entropy: 1.242219.
Iteration 8316: 

Training network. lr: 0.000203. clip: 0.081213
Iteration 8374: Policy loss: 0.005224. Value loss: 2.898944. Entropy: 1.371479.
Iteration 8375: Policy loss: 0.003596. Value loss: 1.536399. Entropy: 1.373659.
Iteration 8376: Policy loss: -0.006300. Value loss: 1.127788. Entropy: 1.375145.
Training network. lr: 0.000203. clip: 0.081213
Iteration 8377: Policy loss: 0.003221. Value loss: 6.610985. Entropy: 1.471868.
Iteration 8378: Policy loss: 0.013134. Value loss: 4.746809. Entropy: 1.464304.
Iteration 8379: Policy loss: 0.005012. Value loss: 3.264165. Entropy: 1.465893.
episode: 3586   score: 645.0  epsilon: 1.0    steps: 7  evaluation reward: 328.4
episode: 3587   score: 505.0  epsilon: 1.0    steps: 183  evaluation reward: 333.2
Training network. lr: 0.000203. clip: 0.081213
Iteration 8380: Policy loss: 0.002693. Value loss: 2.498214. Entropy: 1.197150.
Iteration 8381: Policy loss: 0.001924. Value loss: 1.753551. Entropy: 1.220547.
Iteration 8382: Policy loss: -0.008191. Value loss: 1.

Iteration 8437: Policy loss: 0.003065. Value loss: 2.551492. Entropy: 1.302267.
Iteration 8438: Policy loss: 0.009534. Value loss: 1.413920. Entropy: 1.307082.
Iteration 8439: Policy loss: -0.002301. Value loss: 1.220878. Entropy: 1.304448.
Training network. lr: 0.000203. clip: 0.081100
Iteration 8440: Policy loss: 0.007569. Value loss: 3.344764. Entropy: 1.356642.
Iteration 8441: Policy loss: 0.013959. Value loss: 1.847340. Entropy: 1.361072.
Iteration 8442: Policy loss: -0.002307. Value loss: 1.443245. Entropy: 1.345838.
episode: 3612   score: 155.0  epsilon: 1.0    steps: 422  evaluation reward: 334.1
Training network. lr: 0.000203. clip: 0.081100
Iteration 8443: Policy loss: 0.007830. Value loss: 1.765343. Entropy: 1.447377.
Iteration 8444: Policy loss: 0.004860. Value loss: 0.928464. Entropy: 1.460684.
Iteration 8445: Policy loss: -0.001820. Value loss: 0.755790. Entropy: 1.442658.
Training network. lr: 0.000203. clip: 0.081100
Iteration 8446: Policy loss: 0.011604. Value loss: 3.

episode: 3635   score: 345.0  epsilon: 1.0    steps: 947  evaluation reward: 319.85
Training network. lr: 0.000202. clip: 0.080875
Iteration 8503: Policy loss: 0.006318. Value loss: 3.797695. Entropy: 1.235626.
Iteration 8504: Policy loss: 0.010186. Value loss: 2.358331. Entropy: 1.236170.
Iteration 8505: Policy loss: -0.003114. Value loss: 1.823788. Entropy: 1.241670.
episode: 3636   score: 530.0  epsilon: 1.0    steps: 274  evaluation reward: 321.45
episode: 3637   score: 420.0  epsilon: 1.0    steps: 604  evaluation reward: 324.9
episode: 3638   score: 335.0  epsilon: 1.0    steps: 872  evaluation reward: 327.0
Training network. lr: 0.000202. clip: 0.080875
Iteration 8506: Policy loss: 0.002340. Value loss: 3.165578. Entropy: 1.077982.
Iteration 8507: Policy loss: 0.001079. Value loss: 2.061696. Entropy: 1.071264.
Iteration 8508: Policy loss: -0.003004. Value loss: 1.608995. Entropy: 1.064917.
episode: 3639   score: 210.0  epsilon: 1.0    steps: 68  evaluation reward: 327.0
Training

Iteration 8568: Policy loss: -0.001652. Value loss: 1.305702. Entropy: 1.210174.
episode: 3658   score: 585.0  epsilon: 1.0    steps: 65  evaluation reward: 340.9
episode: 3659   score: 345.0  epsilon: 1.0    steps: 871  evaluation reward: 344.3
episode: 3660   score: 210.0  epsilon: 1.0    steps: 960  evaluation reward: 345.5
Training network. lr: 0.000202. clip: 0.080763
Iteration 8569: Policy loss: 0.005992. Value loss: 3.250786. Entropy: 0.865311.
Iteration 8570: Policy loss: 0.004411. Value loss: 2.206997. Entropy: 0.881336.
Iteration 8571: Policy loss: 0.005366. Value loss: 1.705423. Entropy: 0.863206.
episode: 3661   score: 425.0  epsilon: 1.0    steps: 174  evaluation reward: 342.3
episode: 3662   score: 210.0  epsilon: 1.0    steps: 732  evaluation reward: 345.2
Training network. lr: 0.000202. clip: 0.080763
Iteration 8572: Policy loss: 0.011799. Value loss: 3.239386. Entropy: 0.910294.
Iteration 8573: Policy loss: 0.007827. Value loss: 2.449047. Entropy: 0.889307.
Iteration 8

Iteration 8630: Policy loss: 0.008251. Value loss: 4.800167. Entropy: 1.367015.
Iteration 8631: Policy loss: 0.004015. Value loss: 3.985264. Entropy: 1.370970.
episode: 3685   score: 355.0  epsilon: 1.0    steps: 10  evaluation reward: 321.95
episode: 3686   score: 210.0  epsilon: 1.0    steps: 229  evaluation reward: 324.7
Training network. lr: 0.000202. clip: 0.080650
Iteration 8632: Policy loss: 0.000108. Value loss: 3.197247. Entropy: 1.250569.
Iteration 8633: Policy loss: 0.003813. Value loss: 1.935622. Entropy: 1.261416.
Iteration 8634: Policy loss: -0.004179. Value loss: 1.468884. Entropy: 1.248485.
episode: 3687   score: 310.0  epsilon: 1.0    steps: 283  evaluation reward: 320.35
Training network. lr: 0.000202. clip: 0.080650
Iteration 8635: Policy loss: 0.009702. Value loss: 2.793957. Entropy: 1.208333.
Iteration 8636: Policy loss: 0.003722. Value loss: 1.650415. Entropy: 1.188870.
Iteration 8637: Policy loss: -0.006472. Value loss: 1.268552. Entropy: 1.202235.
Training netwo

Iteration 8692: Policy loss: 0.004799. Value loss: 2.345892. Entropy: 1.232880.
Iteration 8693: Policy loss: -0.001116. Value loss: 1.346428. Entropy: 1.218072.
Iteration 8694: Policy loss: -0.000515. Value loss: 1.113100. Entropy: 1.204841.
episode: 3712   score: 120.0  epsilon: 1.0    steps: 827  evaluation reward: 330.2
Training network. lr: 0.000201. clip: 0.080537
Iteration 8695: Policy loss: 0.003168. Value loss: 3.451049. Entropy: 1.147658.
Iteration 8696: Policy loss: 0.007032. Value loss: 1.980398. Entropy: 1.136214.
Iteration 8697: Policy loss: -0.000679. Value loss: 1.311917. Entropy: 1.139735.
episode: 3713   score: 245.0  epsilon: 1.0    steps: 690  evaluation reward: 329.85
Training network. lr: 0.000201. clip: 0.080537
Iteration 8698: Policy loss: 0.006301. Value loss: 2.594079. Entropy: 1.263651.
Iteration 8699: Policy loss: 0.003945. Value loss: 1.408897. Entropy: 1.282637.
Iteration 8700: Policy loss: 0.005007. Value loss: 1.095091. Entropy: 1.251344.
episode: 3714   

Iteration 8753: Policy loss: 0.006554. Value loss: 3.570013. Entropy: 1.257685.
Iteration 8754: Policy loss: 0.010780. Value loss: 2.928618. Entropy: 1.259449.
Training network. lr: 0.000201. clip: 0.080313
Iteration 8755: Policy loss: 0.005161. Value loss: 2.899424. Entropy: 1.380599.
Iteration 8756: Policy loss: 0.006405. Value loss: 1.667340. Entropy: 1.391252.
Iteration 8757: Policy loss: 0.000179. Value loss: 1.359506. Entropy: 1.389771.
episode: 3741   score: 290.0  epsilon: 1.0    steps: 61  evaluation reward: 299.35
episode: 3742   score: 225.0  epsilon: 1.0    steps: 771  evaluation reward: 298.95
Training network. lr: 0.000201. clip: 0.080313
Iteration 8758: Policy loss: 0.003793. Value loss: 1.867180. Entropy: 1.283582.
Iteration 8759: Policy loss: -0.003459. Value loss: 1.267147. Entropy: 1.267719.
Iteration 8760: Policy loss: -0.000442. Value loss: 0.991264. Entropy: 1.275602.
episode: 3743   score: 300.0  epsilon: 1.0    steps: 721  evaluation reward: 296.35
Training netw

Iteration 8818: Policy loss: 0.003918. Value loss: 2.751893. Entropy: 0.878242.
Iteration 8819: Policy loss: -0.002561. Value loss: 1.567538. Entropy: 0.885263.
Iteration 8820: Policy loss: -0.000795. Value loss: 1.312027. Entropy: 0.870969.
episode: 3764   score: 155.0  epsilon: 1.0    steps: 134  evaluation reward: 304.45
Training network. lr: 0.000201. clip: 0.080200
Iteration 8821: Policy loss: 0.007016. Value loss: 2.890615. Entropy: 0.948851.
Iteration 8822: Policy loss: 0.011438. Value loss: 1.676716. Entropy: 0.939645.
Iteration 8823: Policy loss: 0.008411. Value loss: 1.346813. Entropy: 0.939256.
episode: 3765   score: 210.0  epsilon: 1.0    steps: 66  evaluation reward: 303.9
Training network. lr: 0.000201. clip: 0.080200
Iteration 8824: Policy loss: 0.003177. Value loss: 1.714984. Entropy: 1.236428.
Iteration 8825: Policy loss: -0.000988. Value loss: 0.955238. Entropy: 1.222605.
Iteration 8826: Policy loss: -0.007647. Value loss: 0.789692. Entropy: 1.231420.
Training network

Iteration 8886: Policy loss: -0.000661. Value loss: 1.280086. Entropy: 1.309908.
episode: 3785   score: 255.0  epsilon: 1.0    steps: 375  evaluation reward: 324.25
episode: 3786   score: 510.0  epsilon: 1.0    steps: 950  evaluation reward: 323.25
Training network. lr: 0.000200. clip: 0.080088
Iteration 8887: Policy loss: 0.003592. Value loss: 3.216720. Entropy: 1.109777.
Iteration 8888: Policy loss: 0.014248. Value loss: 1.900839. Entropy: 1.128379.
Iteration 8889: Policy loss: 0.003201. Value loss: 1.463578. Entropy: 1.124256.
Training network. lr: 0.000200. clip: 0.080088
Iteration 8890: Policy loss: 0.004987. Value loss: 3.716922. Entropy: 0.998804.
Iteration 8891: Policy loss: 0.006539. Value loss: 1.847900. Entropy: 1.000394.
Iteration 8892: Policy loss: -0.002153. Value loss: 1.563126. Entropy: 0.972084.
episode: 3787   score: 80.0  epsilon: 1.0    steps: 250  evaluation reward: 326.25
Training network. lr: 0.000200. clip: 0.080088
Iteration 8893: Policy loss: 0.014623. Value l

Iteration 8950: Policy loss: 0.003446. Value loss: 1.860276. Entropy: 1.110932.
Iteration 8951: Policy loss: 0.011119. Value loss: 1.183091. Entropy: 1.128029.
Iteration 8952: Policy loss: -0.005282. Value loss: 0.873375. Entropy: 1.112725.
episode: 3809   score: 215.0  epsilon: 1.0    steps: 912  evaluation reward: 311.1
Training network. lr: 0.000200. clip: 0.079863
Iteration 8953: Policy loss: 0.011311. Value loss: 4.623986. Entropy: 1.104360.
Iteration 8954: Policy loss: 0.014616. Value loss: 3.562162. Entropy: 1.098386.
Iteration 8955: Policy loss: 0.013585. Value loss: 2.987120. Entropy: 1.095133.
episode: 3810   score: 210.0  epsilon: 1.0    steps: 143  evaluation reward: 311.15
episode: 3811   score: 315.0  epsilon: 1.0    steps: 768  evaluation reward: 311.15
episode: 3812   score: 580.0  epsilon: 1.0    steps: 826  evaluation reward: 313.3
Training network. lr: 0.000200. clip: 0.079863
Iteration 8956: Policy loss: -0.000005. Value loss: 2.444253. Entropy: 1.073994.
Iteration 

Iteration 9013: Policy loss: 0.011520. Value loss: 3.700496. Entropy: 0.892892.
Iteration 9014: Policy loss: 0.001608. Value loss: 2.328009. Entropy: 0.935173.
Iteration 9015: Policy loss: -0.003396. Value loss: 1.814109. Entropy: 0.937332.
episode: 3835   score: 210.0  epsilon: 1.0    steps: 66  evaluation reward: 347.6
Training network. lr: 0.000199. clip: 0.079750
Iteration 9016: Policy loss: 0.010078. Value loss: 2.419148. Entropy: 0.971979.
Iteration 9017: Policy loss: 0.002674. Value loss: 1.570150. Entropy: 0.986556.
Iteration 9018: Policy loss: 0.001167. Value loss: 1.189459. Entropy: 0.982144.
episode: 3836   score: 180.0  epsilon: 1.0    steps: 337  evaluation reward: 344.65
episode: 3837   score: 345.0  epsilon: 1.0    steps: 953  evaluation reward: 343.75
Training network. lr: 0.000199. clip: 0.079750
Iteration 9019: Policy loss: 0.007932. Value loss: 3.344996. Entropy: 0.948673.
Iteration 9020: Policy loss: 0.010668. Value loss: 1.967169. Entropy: 0.949100.
Iteration 9021:

Training network. lr: 0.000199. clip: 0.079638
Iteration 9079: Policy loss: 0.002357. Value loss: 3.247206. Entropy: 0.946002.
Iteration 9080: Policy loss: -0.000877. Value loss: 2.149606. Entropy: 0.894108.
Iteration 9081: Policy loss: -0.005769. Value loss: 1.653455. Entropy: 0.906287.
Training network. lr: 0.000199. clip: 0.079638
Iteration 9082: Policy loss: 0.001330. Value loss: 2.762092. Entropy: 0.993573.
Iteration 9083: Policy loss: -0.003576. Value loss: 1.970160. Entropy: 1.020579.
Iteration 9084: Policy loss: -0.006266. Value loss: 1.616220. Entropy: 1.009468.
episode: 3858   score: 320.0  epsilon: 1.0    steps: 649  evaluation reward: 352.8
Training network. lr: 0.000199. clip: 0.079638
Iteration 9085: Policy loss: 0.007494. Value loss: 6.102263. Entropy: 1.184965.
Iteration 9086: Policy loss: 0.011630. Value loss: 4.373112. Entropy: 1.180763.
Iteration 9087: Policy loss: 0.012572. Value loss: 2.435221. Entropy: 1.152530.
episode: 3859   score: 440.0  epsilon: 1.0    steps:

episode: 3881   score: 210.0  epsilon: 1.0    steps: 963  evaluation reward: 336.9
Training network. lr: 0.000199. clip: 0.079525
Iteration 9145: Policy loss: 0.000489. Value loss: 2.330171. Entropy: 1.114892.
Iteration 9146: Policy loss: 0.001357. Value loss: 1.587308. Entropy: 1.133713.
Iteration 9147: Policy loss: -0.005693. Value loss: 1.304392. Entropy: 1.132394.
Training network. lr: 0.000199. clip: 0.079525
Iteration 9148: Policy loss: 0.008916. Value loss: 3.685792. Entropy: 1.043081.
Iteration 9149: Policy loss: 0.005390. Value loss: 2.043117. Entropy: 1.068363.
Iteration 9150: Policy loss: 0.000804. Value loss: 1.579061. Entropy: 1.051430.
Training network. lr: 0.000199. clip: 0.079412
Iteration 9151: Policy loss: 0.012305. Value loss: 5.285957. Entropy: 1.202356.
Iteration 9152: Policy loss: 0.029876. Value loss: 2.875123. Entropy: 1.190602.
Iteration 9153: Policy loss: 0.010159. Value loss: 1.980349. Entropy: 1.191896.
episode: 3882   score: 150.0  epsilon: 1.0    steps: 73

Iteration 9207: Policy loss: 0.002895. Value loss: 2.107908. Entropy: 0.860870.
episode: 3908   score: 345.0  epsilon: 1.0    steps: 620  evaluation reward: 325.3
Training network. lr: 0.000198. clip: 0.079300
Iteration 9208: Policy loss: 0.001978. Value loss: 3.184972. Entropy: 1.039421.
Iteration 9209: Policy loss: -0.002755. Value loss: 1.741317. Entropy: 1.034491.
Iteration 9210: Policy loss: 0.003469. Value loss: 1.325021. Entropy: 1.006556.
episode: 3909   score: 510.0  epsilon: 1.0    steps: 390  evaluation reward: 325.15
Training network. lr: 0.000198. clip: 0.079300
Iteration 9211: Policy loss: 0.003410. Value loss: 2.674015. Entropy: 0.902979.
Iteration 9212: Policy loss: -0.007148. Value loss: 1.672053. Entropy: 0.908521.
Iteration 9213: Policy loss: -0.009303. Value loss: 1.405985. Entropy: 0.890835.
episode: 3910   score: 225.0  epsilon: 1.0    steps: 195  evaluation reward: 328.1
episode: 3911   score: 180.0  epsilon: 1.0    steps: 676  evaluation reward: 328.25
Training 

Training network. lr: 0.000198. clip: 0.079188
Iteration 9274: Policy loss: 0.016622. Value loss: 5.395298. Entropy: 1.447789.
Iteration 9275: Policy loss: 0.008380. Value loss: 2.375107. Entropy: 1.475537.
Iteration 9276: Policy loss: -0.002339. Value loss: 1.533126. Entropy: 1.486164.
episode: 3930   score: 470.0  epsilon: 1.0    steps: 18  evaluation reward: 330.1
episode: 3931   score: 235.0  epsilon: 1.0    steps: 177  evaluation reward: 330.85
episode: 3932   score: 215.0  epsilon: 1.0    steps: 672  evaluation reward: 330.35
Training network. lr: 0.000198. clip: 0.079188
Iteration 9277: Policy loss: 0.006101. Value loss: 3.400077. Entropy: 1.118270.
Iteration 9278: Policy loss: 0.004323. Value loss: 2.010683. Entropy: 1.121440.
Iteration 9279: Policy loss: 0.001709. Value loss: 1.584285. Entropy: 1.106885.
episode: 3933   score: 370.0  epsilon: 1.0    steps: 540  evaluation reward: 327.4
episode: 3934   score: 215.0  epsilon: 1.0    steps: 809  evaluation reward: 326.25
episode:

Iteration 9337: Policy loss: -0.001096. Value loss: 2.862776. Entropy: 1.083031.
Iteration 9338: Policy loss: -0.009013. Value loss: 1.821380. Entropy: 1.078547.
Iteration 9339: Policy loss: -0.014291. Value loss: 1.531383. Entropy: 1.089756.
episode: 3955   score: 260.0  epsilon: 1.0    steps: 613  evaluation reward: 319.75
episode: 3956   score: 105.0  epsilon: 1.0    steps: 760  evaluation reward: 320.85
episode: 3957   score: 240.0  epsilon: 1.0    steps: 965  evaluation reward: 313.4
Training network. lr: 0.000198. clip: 0.079075
Iteration 9340: Policy loss: 0.004473. Value loss: 3.263644. Entropy: 1.219924.
Iteration 9341: Policy loss: 0.005081. Value loss: 1.766998. Entropy: 1.202465.
Iteration 9342: Policy loss: 0.003021. Value loss: 1.385270. Entropy: 1.221663.
episode: 3958   score: 370.0  epsilon: 1.0    steps: 125  evaluation reward: 312.9
episode: 3959   score: 305.0  epsilon: 1.0    steps: 289  evaluation reward: 313.4
Training network. lr: 0.000198. clip: 0.079075
Iterat

episode: 3978   score: 215.0  epsilon: 1.0    steps: 649  evaluation reward: 308.0
episode: 3979   score: 370.0  epsilon: 1.0    steps: 798  evaluation reward: 307.4
Training network. lr: 0.000197. clip: 0.078850
Iteration 9403: Policy loss: 0.003421. Value loss: 1.910588. Entropy: 0.721608.
Iteration 9404: Policy loss: 0.000599. Value loss: 1.249067. Entropy: 0.733423.
Iteration 9405: Policy loss: -0.001475. Value loss: 1.148821. Entropy: 0.731002.
episode: 3980   score: 215.0  epsilon: 1.0    steps: 182  evaluation reward: 306.25
episode: 3981   score: 210.0  epsilon: 1.0    steps: 422  evaluation reward: 304.65
Training network. lr: 0.000197. clip: 0.078850
Iteration 9406: Policy loss: 0.001756. Value loss: 2.046860. Entropy: 0.583187.
Iteration 9407: Policy loss: 0.005060. Value loss: 1.400966. Entropy: 0.593626.
Iteration 9408: Policy loss: 0.002198. Value loss: 1.180784. Entropy: 0.559666.
episode: 3982   score: 415.0  epsilon: 1.0    steps: 916  evaluation reward: 304.65
Trainin

now time :  2019-02-23 02:55:03.528020
episode: 4001   score: 650.0  epsilon: 1.0    steps: 264  evaluation reward: 321.8
Training network. lr: 0.000197. clip: 0.078738
Iteration 9469: Policy loss: 0.004123. Value loss: 2.152819. Entropy: 1.098708.
Iteration 9470: Policy loss: -0.004124. Value loss: 1.609457. Entropy: 1.115461.
Iteration 9471: Policy loss: -0.005211. Value loss: 1.197491. Entropy: 1.062757.
episode: 4002   score: 225.0  epsilon: 1.0    steps: 235  evaluation reward: 325.85
Training network. lr: 0.000197. clip: 0.078738
Iteration 9472: Policy loss: 0.004486. Value loss: 2.041685. Entropy: 1.306670.
Iteration 9473: Policy loss: 0.000235. Value loss: 1.283382. Entropy: 1.281916.
Iteration 9474: Policy loss: -0.001449. Value loss: 0.823929. Entropy: 1.278601.
Training network. lr: 0.000197. clip: 0.078738
Iteration 9475: Policy loss: 0.012088. Value loss: 5.200612. Entropy: 1.220974.
Iteration 9476: Policy loss: 0.016219. Value loss: 2.613789. Entropy: 1.212981.
Iteration 

Iteration 9533: Policy loss: 0.004865. Value loss: 2.353030. Entropy: 1.178885.
Iteration 9534: Policy loss: -0.008948. Value loss: 1.630695. Entropy: 1.187265.
Training network. lr: 0.000197. clip: 0.078625
Iteration 9535: Policy loss: 0.008296. Value loss: 3.464034. Entropy: 1.287323.
Iteration 9536: Policy loss: -0.002834. Value loss: 1.675805. Entropy: 1.286446.
Iteration 9537: Policy loss: -0.006403. Value loss: 1.177114. Entropy: 1.297313.
Training network. lr: 0.000197. clip: 0.078625
Iteration 9538: Policy loss: 0.004723. Value loss: 3.640090. Entropy: 1.404610.
Iteration 9539: Policy loss: 0.007111. Value loss: 2.427378. Entropy: 1.396502.
Iteration 9540: Policy loss: 0.000433. Value loss: 1.890087. Entropy: 1.389760.
episode: 4025   score: 525.0  epsilon: 1.0    steps: 112  evaluation reward: 340.2
episode: 4026   score: 155.0  epsilon: 1.0    steps: 231  evaluation reward: 343.65
episode: 4027   score: 260.0  epsilon: 1.0    steps: 919  evaluation reward: 342.2
Training netw

episode: 4049   score: 410.0  epsilon: 1.0    steps: 462  evaluation reward: 340.6
episode: 4050   score: 235.0  epsilon: 1.0    steps: 845  evaluation reward: 340.55
Training network. lr: 0.000196. clip: 0.078512
Iteration 9598: Policy loss: 0.015964. Value loss: 3.299386. Entropy: 0.915940.
Iteration 9599: Policy loss: 0.014930. Value loss: 1.792794. Entropy: 0.901283.
Iteration 9600: Policy loss: 0.007000. Value loss: 1.577375. Entropy: 0.902620.
now time :  2019-02-23 02:57:47.802386
episode: 4051   score: 210.0  epsilon: 1.0    steps: 759  evaluation reward: 339.7
Training network. lr: 0.000196. clip: 0.078400
Iteration 9601: Policy loss: 0.002094. Value loss: 2.916918. Entropy: 0.916949.
Iteration 9602: Policy loss: 0.000056. Value loss: 1.801702. Entropy: 0.903396.
Iteration 9603: Policy loss: 0.000052. Value loss: 1.381277. Entropy: 0.914425.
Training network. lr: 0.000196. clip: 0.078400
Iteration 9604: Policy loss: 0.006569. Value loss: 2.067096. Entropy: 1.042398.
Iteration 

Iteration 9660: Policy loss: -0.006959. Value loss: 1.210525. Entropy: 1.060334.
Training network. lr: 0.000196. clip: 0.078288
Iteration 9661: Policy loss: 0.006459. Value loss: 2.803361. Entropy: 1.067535.
Iteration 9662: Policy loss: 0.004685. Value loss: 1.699715. Entropy: 1.066979.
Iteration 9663: Policy loss: -0.001410. Value loss: 1.310308. Entropy: 1.066267.
episode: 4076   score: 210.0  epsilon: 1.0    steps: 163  evaluation reward: 332.8
episode: 4077   score: 155.0  epsilon: 1.0    steps: 493  evaluation reward: 332.35
episode: 4078   score: 240.0  epsilon: 1.0    steps: 661  evaluation reward: 331.6
Training network. lr: 0.000196. clip: 0.078288
Iteration 9664: Policy loss: 0.001712. Value loss: 2.537184. Entropy: 1.013288.
Iteration 9665: Policy loss: -0.001061. Value loss: 1.655298. Entropy: 1.019339.
Iteration 9666: Policy loss: -0.000554. Value loss: 1.373621. Entropy: 0.991055.
episode: 4079   score: 255.0  epsilon: 1.0    steps: 69  evaluation reward: 331.85
Training 

Training network. lr: 0.000195. clip: 0.078175
Iteration 9724: Policy loss: 0.003667. Value loss: 2.537157. Entropy: 1.127815.
Iteration 9725: Policy loss: 0.003879. Value loss: 1.774382. Entropy: 1.124076.
Iteration 9726: Policy loss: -0.005067. Value loss: 1.408029. Entropy: 1.143776.
now time :  2019-02-23 03:00:21.798454
episode: 4101   score: 180.0  epsilon: 1.0    steps: 110  evaluation reward: 323.95
episode: 4102   score: 310.0  epsilon: 1.0    steps: 260  evaluation reward: 319.25
episode: 4103   score: 355.0  epsilon: 1.0    steps: 418  evaluation reward: 320.1
Training network. lr: 0.000195. clip: 0.078175
Iteration 9727: Policy loss: 0.003813. Value loss: 2.813734. Entropy: 0.933443.
Iteration 9728: Policy loss: -0.004724. Value loss: 1.829808. Entropy: 0.961988.
Iteration 9729: Policy loss: 0.000347. Value loss: 1.629327. Entropy: 0.967701.
episode: 4104   score: 345.0  epsilon: 1.0    steps: 226  evaluation reward: 317.7
episode: 4105   score: 255.0  epsilon: 1.0    steps

Iteration 9787: Policy loss: 0.010102. Value loss: 2.691703. Entropy: 1.098373.
Iteration 9788: Policy loss: 0.003826. Value loss: 1.687110. Entropy: 1.131669.
Iteration 9789: Policy loss: 0.000199. Value loss: 1.210020. Entropy: 1.111207.
episode: 4126   score: 435.0  epsilon: 1.0    steps: 348  evaluation reward: 296.1
episode: 4127   score: 210.0  epsilon: 1.0    steps: 662  evaluation reward: 298.9
Training network. lr: 0.000195. clip: 0.078063
Iteration 9790: Policy loss: 0.008634. Value loss: 2.143074. Entropy: 0.946451.
Iteration 9791: Policy loss: 0.002039. Value loss: 1.210413. Entropy: 0.981243.
Iteration 9792: Policy loss: 0.000489. Value loss: 1.026971. Entropy: 0.962627.
episode: 4128   score: 300.0  epsilon: 1.0    steps: 73  evaluation reward: 298.4
Training network. lr: 0.000195. clip: 0.078063
Iteration 9793: Policy loss: 0.002236. Value loss: 1.765929. Entropy: 1.069116.
Iteration 9794: Policy loss: 0.004270. Value loss: 1.087614. Entropy: 1.066850.
Iteration 9795: Po

Iteration 9851: Policy loss: 0.004646. Value loss: 1.414804. Entropy: 0.693732.
Iteration 9852: Policy loss: 0.001396. Value loss: 1.130697. Entropy: 0.707383.
Training network. lr: 0.000195. clip: 0.077838
Iteration 9853: Policy loss: 0.004628. Value loss: 3.347683. Entropy: 0.774875.
Iteration 9854: Policy loss: 0.004697. Value loss: 1.848374. Entropy: 0.780947.
Iteration 9855: Policy loss: -0.003055. Value loss: 1.441602. Entropy: 0.789520.
now time :  2019-02-23 03:03:02.645520
episode: 4151   score: 485.0  epsilon: 1.0    steps: 856  evaluation reward: 297.35
Training network. lr: 0.000195. clip: 0.077838
Iteration 9856: Policy loss: 0.014439. Value loss: 6.311963. Entropy: 0.942976.
Iteration 9857: Policy loss: 0.015443. Value loss: 4.484074. Entropy: 0.954565.
Iteration 9858: Policy loss: 0.009513. Value loss: 3.143670. Entropy: 0.941603.
episode: 4152   score: 65.0  epsilon: 1.0    steps: 701  evaluation reward: 300.1
Training network. lr: 0.000195. clip: 0.077838
Iteration 985

episode: 4175   score: 305.0  epsilon: 1.0    steps: 235  evaluation reward: 312.2
episode: 4176   score: 330.0  epsilon: 1.0    steps: 885  evaluation reward: 312.1
episode: 4177   score: 210.0  epsilon: 1.0    steps: 1013  evaluation reward: 313.3
Training network. lr: 0.000194. clip: 0.077725
Iteration 9916: Policy loss: -0.000803. Value loss: 2.544955. Entropy: 0.889258.
Iteration 9917: Policy loss: -0.001627. Value loss: 1.838276. Entropy: 0.894326.
Iteration 9918: Policy loss: -0.007754. Value loss: 1.714997. Entropy: 0.904726.
Training network. lr: 0.000194. clip: 0.077725
Iteration 9919: Policy loss: 0.004126. Value loss: 2.748601. Entropy: 0.767209.
Iteration 9920: Policy loss: -0.001850. Value loss: 1.750801. Entropy: 0.779287.
Iteration 9921: Policy loss: -0.004124. Value loss: 1.498186. Entropy: 0.779279.
Training network. lr: 0.000194. clip: 0.077725
Iteration 9922: Policy loss: 0.006497. Value loss: 2.788355. Entropy: 1.082503.
Iteration 9923: Policy loss: 0.003299. Value

Iteration 9981: Policy loss: 0.015110. Value loss: 1.838894. Entropy: 1.141670.
episode: 4199   score: 365.0  epsilon: 1.0    steps: 294  evaluation reward: 322.8
episode: 4200   score: 440.0  epsilon: 1.0    steps: 568  evaluation reward: 323.15
now time :  2019-02-23 03:05:38.746510
episode: 4201   score: 345.0  epsilon: 1.0    steps: 998  evaluation reward: 323.7
Training network. lr: 0.000194. clip: 0.077613
Iteration 9982: Policy loss: 0.010637. Value loss: 2.697253. Entropy: 1.195847.
Iteration 9983: Policy loss: 0.008644. Value loss: 1.423331. Entropy: 1.168693.
Iteration 9984: Policy loss: 0.001957. Value loss: 1.199289. Entropy: 1.212506.
Training network. lr: 0.000194. clip: 0.077613
Iteration 9985: Policy loss: 0.000999. Value loss: 2.606996. Entropy: 0.975643.
Iteration 9986: Policy loss: 0.006840. Value loss: 1.793326. Entropy: 0.985651.
Iteration 9987: Policy loss: 0.000636. Value loss: 1.191461. Entropy: 1.009818.
Training network. lr: 0.000194. clip: 0.077613
Iteration 

Iteration 10043: Policy loss: 0.003114. Value loss: 1.751770. Entropy: 0.994005.
Iteration 10044: Policy loss: -0.003510. Value loss: 1.319997. Entropy: 1.012704.
episode: 4225   score: 135.0  epsilon: 1.0    steps: 398  evaluation reward: 331.85
Training network. lr: 0.000194. clip: 0.077500
Iteration 10045: Policy loss: 0.000518. Value loss: 2.452318. Entropy: 1.078351.
Iteration 10046: Policy loss: 0.001208. Value loss: 1.581655. Entropy: 1.087301.
Iteration 10047: Policy loss: -0.000433. Value loss: 1.218194. Entropy: 1.123617.
Training network. lr: 0.000194. clip: 0.077500
Iteration 10048: Policy loss: 0.008459. Value loss: 5.847159. Entropy: 1.366279.
Iteration 10049: Policy loss: 0.007523. Value loss: 3.087370. Entropy: 1.366306.
Iteration 10050: Policy loss: 0.015559. Value loss: 2.369345. Entropy: 1.359413.
episode: 4226   score: 365.0  epsilon: 1.0    steps: 1015  evaluation reward: 327.75
Training network. lr: 0.000193. clip: 0.077388
Iteration 10051: Policy loss: 0.010289. 

Training network. lr: 0.000193. clip: 0.077275
Iteration 10108: Policy loss: 0.001184. Value loss: 2.309011. Entropy: 1.230528.
Iteration 10109: Policy loss: 0.003231. Value loss: 1.248971. Entropy: 1.244274.
Iteration 10110: Policy loss: -0.002391. Value loss: 1.016530. Entropy: 1.240148.
episode: 4249   score: 315.0  epsilon: 1.0    steps: 122  evaluation reward: 326.25
Training network. lr: 0.000193. clip: 0.077275
Iteration 10111: Policy loss: 0.005939. Value loss: 3.210626. Entropy: 1.180965.
Iteration 10112: Policy loss: 0.003435. Value loss: 1.546698. Entropy: 1.179399.
Iteration 10113: Policy loss: -0.002972. Value loss: 1.109740. Entropy: 1.151392.
episode: 4250   score: 390.0  epsilon: 1.0    steps: 227  evaluation reward: 328.4
Training network. lr: 0.000193. clip: 0.077275
Iteration 10114: Policy loss: 0.005514. Value loss: 2.620107. Entropy: 1.107134.
Iteration 10115: Policy loss: 0.009404. Value loss: 1.188700. Entropy: 1.115651.
Iteration 10116: Policy loss: -0.002378. V

Training network. lr: 0.000193. clip: 0.077162
Iteration 10174: Policy loss: 0.000790. Value loss: 1.594903. Entropy: 1.073889.
Iteration 10175: Policy loss: 0.006564. Value loss: 0.974186. Entropy: 1.092974.
Iteration 10176: Policy loss: -0.004146. Value loss: 0.779570. Entropy: 1.099962.
Training network. lr: 0.000193. clip: 0.077162
Iteration 10177: Policy loss: 0.007175. Value loss: 5.799091. Entropy: 1.031866.
Iteration 10178: Policy loss: 0.016161. Value loss: 3.286626. Entropy: 1.053326.
Iteration 10179: Policy loss: 0.014088. Value loss: 2.198797. Entropy: 1.065365.
episode: 4271   score: 365.0  epsilon: 1.0    steps: 815  evaluation reward: 335.6
Training network. lr: 0.000193. clip: 0.077162
Iteration 10180: Policy loss: 0.002212. Value loss: 2.819746. Entropy: 1.129765.
Iteration 10181: Policy loss: 0.004259. Value loss: 1.731559. Entropy: 1.132409.
Iteration 10182: Policy loss: -0.005470. Value loss: 1.331163. Entropy: 1.116224.
Training network. lr: 0.000193. clip: 0.07716

Iteration 10240: Policy loss: 0.008507. Value loss: 2.252166. Entropy: 1.231001.
Iteration 10241: Policy loss: 0.008301. Value loss: 1.305691. Entropy: 1.231219.
Iteration 10242: Policy loss: -0.003244. Value loss: 1.015501. Entropy: 1.238757.
episode: 4292   score: 395.0  epsilon: 1.0    steps: 470  evaluation reward: 357.8
Training network. lr: 0.000193. clip: 0.077050
Iteration 10243: Policy loss: 0.020203. Value loss: 6.564686. Entropy: 1.180616.
Iteration 10244: Policy loss: 0.036144. Value loss: 2.826433. Entropy: 1.186092.
Iteration 10245: Policy loss: 0.033080. Value loss: 1.732558. Entropy: 1.177402.
Training network. lr: 0.000193. clip: 0.077050
Iteration 10246: Policy loss: 0.004368. Value loss: 2.826648. Entropy: 1.166200.
Iteration 10247: Policy loss: -0.000439. Value loss: 1.737123. Entropy: 1.159715.
Iteration 10248: Policy loss: -0.002895. Value loss: 1.277489. Entropy: 1.166849.
episode: 4293   score: 245.0  epsilon: 1.0    steps: 624  evaluation reward: 359.65
episode

Iteration 10305: Policy loss: -0.001864. Value loss: 0.946631. Entropy: 1.053923.
episode: 4315   score: 210.0  epsilon: 1.0    steps: 87  evaluation reward: 360.4
Training network. lr: 0.000192. clip: 0.076825
Iteration 10306: Policy loss: 0.005115. Value loss: 2.339224. Entropy: 1.228232.
Iteration 10307: Policy loss: 0.004300. Value loss: 1.280521. Entropy: 1.252771.
Iteration 10308: Policy loss: 0.002177. Value loss: 1.034773. Entropy: 1.238080.
episode: 4316   score: 610.0  epsilon: 1.0    steps: 700  evaluation reward: 360.4
Training network. lr: 0.000192. clip: 0.076825
Iteration 10309: Policy loss: 0.009739. Value loss: 2.377946. Entropy: 1.204810.
Iteration 10310: Policy loss: 0.002287. Value loss: 1.361138. Entropy: 1.179688.
Iteration 10311: Policy loss: -0.001958. Value loss: 1.205935. Entropy: 1.177214.
Training network. lr: 0.000192. clip: 0.076825
Iteration 10312: Policy loss: 0.008742. Value loss: 2.821602. Entropy: 1.263023.
Iteration 10313: Policy loss: 0.009151. Valu

episode: 4336   score: 330.0  epsilon: 1.0    steps: 467  evaluation reward: 372.6
Training network. lr: 0.000192. clip: 0.076713
Iteration 10372: Policy loss: 0.000636. Value loss: 5.200540. Entropy: 1.318760.
Iteration 10373: Policy loss: -0.000127. Value loss: 2.237965. Entropy: 1.288367.
Iteration 10374: Policy loss: -0.004663. Value loss: 1.359663. Entropy: 1.294077.
Training network. lr: 0.000192. clip: 0.076713
Iteration 10375: Policy loss: 0.014013. Value loss: 7.366160. Entropy: 1.201048.
Iteration 10376: Policy loss: 0.025922. Value loss: 3.911079. Entropy: 1.208931.
Iteration 10377: Policy loss: 0.007449. Value loss: 2.345636. Entropy: 1.231036.
episode: 4337   score: 485.0  epsilon: 1.0    steps: 371  evaluation reward: 373.05
episode: 4338   score: 575.0  epsilon: 1.0    steps: 997  evaluation reward: 370.3
Training network. lr: 0.000192. clip: 0.076713
Iteration 10378: Policy loss: 0.009994. Value loss: 3.119121. Entropy: 1.240334.
Iteration 10379: Policy loss: 0.008914. 

Training network. lr: 0.000191. clip: 0.076600
Iteration 10438: Policy loss: 0.009095. Value loss: 1.949856. Entropy: 1.286896.
Iteration 10439: Policy loss: 0.007135. Value loss: 1.138388. Entropy: 1.281977.
Iteration 10440: Policy loss: 0.001508. Value loss: 0.882135. Entropy: 1.256186.
episode: 4358   score: 270.0  epsilon: 1.0    steps: 773  evaluation reward: 371.35
Training network. lr: 0.000191. clip: 0.076600
Iteration 10441: Policy loss: 0.017462. Value loss: 4.052761. Entropy: 1.128861.
Iteration 10442: Policy loss: 0.014875. Value loss: 2.115635. Entropy: 1.165196.
Iteration 10443: Policy loss: 0.008628. Value loss: 1.397806. Entropy: 1.122970.
episode: 4359   score: 310.0  epsilon: 1.0    steps: 616  evaluation reward: 368.05
episode: 4360   score: 410.0  epsilon: 1.0    steps: 737  evaluation reward: 369.35
Training network. lr: 0.000191. clip: 0.076600
Iteration 10444: Policy loss: 0.014603. Value loss: 4.346347. Entropy: 1.265813.
Iteration 10445: Policy loss: 0.006967. 

Iteration 10503: Policy loss: -0.003368. Value loss: 1.455356. Entropy: 0.791082.
episode: 4381   score: 365.0  epsilon: 1.0    steps: 386  evaluation reward: 351.75
Training network. lr: 0.000191. clip: 0.076375
Iteration 10504: Policy loss: 0.007363. Value loss: 2.173409. Entropy: 0.992666.
Iteration 10505: Policy loss: 0.000694. Value loss: 1.395046. Entropy: 1.009387.
Iteration 10506: Policy loss: -0.000791. Value loss: 1.156272. Entropy: 1.007873.
Training network. lr: 0.000191. clip: 0.076375
Iteration 10507: Policy loss: 0.006727. Value loss: 2.733207. Entropy: 1.344791.
Iteration 10508: Policy loss: -0.001428. Value loss: 1.596681. Entropy: 1.340884.
Iteration 10509: Policy loss: 0.000316. Value loss: 1.366341. Entropy: 1.340748.
episode: 4382   score: 210.0  epsilon: 1.0    steps: 866  evaluation reward: 352.25
Training network. lr: 0.000191. clip: 0.076375
Iteration 10510: Policy loss: 0.002062. Value loss: 1.729756. Entropy: 1.441052.
Iteration 10511: Policy loss: -0.003548.

Iteration 10569: Policy loss: 0.007800. Value loss: 2.720717. Entropy: 1.091885.
episode: 4402   score: 225.0  epsilon: 1.0    steps: 418  evaluation reward: 347.3
episode: 4403   score: 435.0  epsilon: 1.0    steps: 866  evaluation reward: 346.35
Training network. lr: 0.000191. clip: 0.076262
Iteration 10570: Policy loss: 0.003057. Value loss: 2.795425. Entropy: 0.944030.
Iteration 10571: Policy loss: -0.001549. Value loss: 1.974757. Entropy: 0.968749.
Iteration 10572: Policy loss: -0.006224. Value loss: 1.497562. Entropy: 0.951438.
episode: 4404   score: 285.0  epsilon: 1.0    steps: 30  evaluation reward: 347.65
Training network. lr: 0.000191. clip: 0.076262
Iteration 10573: Policy loss: 0.003694. Value loss: 2.105488. Entropy: 0.862105.
Iteration 10574: Policy loss: 0.006499. Value loss: 1.280946. Entropy: 0.880002.
Iteration 10575: Policy loss: -0.003841. Value loss: 1.050809. Entropy: 0.868662.
episode: 4405   score: 300.0  epsilon: 1.0    steps: 536  evaluation reward: 348.35
Tr

Iteration 10633: Policy loss: 0.002002. Value loss: 2.881870. Entropy: 0.904553.
Iteration 10634: Policy loss: -0.001072. Value loss: 2.007319. Entropy: 0.886063.
Iteration 10635: Policy loss: -0.006097. Value loss: 1.657804. Entropy: 0.880999.
episode: 4426   score: 490.0  epsilon: 1.0    steps: 822  evaluation reward: 355.1
Training network. lr: 0.000190. clip: 0.076150
Iteration 10636: Policy loss: 0.006448. Value loss: 3.393391. Entropy: 0.984573.
Iteration 10637: Policy loss: 0.005615. Value loss: 1.845963. Entropy: 0.972226.
Iteration 10638: Policy loss: 0.000898. Value loss: 1.417338. Entropy: 0.978023.
Training network. lr: 0.000190. clip: 0.076150
Iteration 10639: Policy loss: 0.003863. Value loss: 2.986126. Entropy: 1.131166.
Iteration 10640: Policy loss: 0.005457. Value loss: 1.674351. Entropy: 1.138898.
Iteration 10641: Policy loss: -0.000967. Value loss: 1.335577. Entropy: 1.134487.
episode: 4427   score: 210.0  epsilon: 1.0    steps: 244  evaluation reward: 357.9
Training

Iteration 10701: Policy loss: 0.002184. Value loss: 1.223202. Entropy: 1.133248.
episode: 4446   score: 450.0  epsilon: 1.0    steps: 482  evaluation reward: 361.55
Training network. lr: 0.000190. clip: 0.075925
Iteration 10702: Policy loss: 0.001149. Value loss: 1.732304. Entropy: 0.745218.
Iteration 10703: Policy loss: 0.006467. Value loss: 1.271872. Entropy: 0.746511.
Iteration 10704: Policy loss: -0.003171. Value loss: 1.001219. Entropy: 0.737500.
episode: 4447   score: 365.0  epsilon: 1.0    steps: 301  evaluation reward: 364.25
Training network. lr: 0.000190. clip: 0.075925
Iteration 10705: Policy loss: 0.005416. Value loss: 2.674748. Entropy: 0.810561.
Iteration 10706: Policy loss: -0.003835. Value loss: 1.712995. Entropy: 0.825673.
Iteration 10707: Policy loss: 0.000286. Value loss: 1.504035. Entropy: 0.784489.
Training network. lr: 0.000190. clip: 0.075925
Iteration 10708: Policy loss: 0.005628. Value loss: 2.081175. Entropy: 1.158078.
Iteration 10709: Policy loss: 0.005623. V

Iteration 10766: Policy loss: 0.021382. Value loss: 3.072030. Entropy: 1.232854.
Iteration 10767: Policy loss: 0.016942. Value loss: 1.974941. Entropy: 1.217064.
episode: 4468   score: 665.0  epsilon: 1.0    steps: 232  evaluation reward: 352.1
Training network. lr: 0.000190. clip: 0.075813
Iteration 10768: Policy loss: 0.001577. Value loss: 2.624874. Entropy: 1.193312.
Iteration 10769: Policy loss: 0.001891. Value loss: 1.517012. Entropy: 1.200989.
Iteration 10770: Policy loss: -0.003763. Value loss: 1.209406. Entropy: 1.202811.
episode: 4469   score: 210.0  epsilon: 1.0    steps: 36  evaluation reward: 355.25
episode: 4470   score: 240.0  epsilon: 1.0    steps: 902  evaluation reward: 355.2
Training network. lr: 0.000190. clip: 0.075813
Iteration 10771: Policy loss: 0.005323. Value loss: 2.621618. Entropy: 0.963014.
Iteration 10772: Policy loss: -0.000444. Value loss: 1.794636. Entropy: 0.953749.
Iteration 10773: Policy loss: -0.006018. Value loss: 1.326118. Entropy: 0.926851.
episod

Iteration 10829: Policy loss: 0.006256. Value loss: 2.347337. Entropy: 1.213489.
Iteration 10830: Policy loss: 0.004171. Value loss: 1.665443. Entropy: 1.206648.
episode: 4493   score: 375.0  epsilon: 1.0    steps: 283  evaluation reward: 369.1
Training network. lr: 0.000189. clip: 0.075700
Iteration 10831: Policy loss: 0.000419. Value loss: 3.158779. Entropy: 0.972399.
Iteration 10832: Policy loss: 0.002569. Value loss: 1.931652. Entropy: 0.975957.
Iteration 10833: Policy loss: -0.004302. Value loss: 1.651740. Entropy: 0.964299.
Training network. lr: 0.000189. clip: 0.075700
Iteration 10834: Policy loss: 0.006412. Value loss: 3.477327. Entropy: 1.053956.
Iteration 10835: Policy loss: 0.004463. Value loss: 1.787925. Entropy: 1.041714.
Iteration 10836: Policy loss: 0.008902. Value loss: 1.088486. Entropy: 1.036189.
episode: 4494   score: 315.0  epsilon: 1.0    steps: 590  evaluation reward: 370.15
Training network. lr: 0.000189. clip: 0.075700
Iteration 10837: Policy loss: 0.008511. Val

episode: 4516   score: 320.0  epsilon: 1.0    steps: 595  evaluation reward: 370.8
Training network. lr: 0.000189. clip: 0.075588
Iteration 10894: Policy loss: 0.001305. Value loss: 2.529730. Entropy: 1.155469.
Iteration 10895: Policy loss: -0.001390. Value loss: 1.952432. Entropy: 1.113122.
Iteration 10896: Policy loss: -0.003710. Value loss: 1.582716. Entropy: 1.150529.
episode: 4517   score: 280.0  epsilon: 1.0    steps: 379  evaluation reward: 368.1
Training network. lr: 0.000189. clip: 0.075588
Iteration 10897: Policy loss: 0.011148. Value loss: 2.717889. Entropy: 0.938123.
Iteration 10898: Policy loss: 0.006875. Value loss: 1.411378. Entropy: 0.940500.
Iteration 10899: Policy loss: 0.002892. Value loss: 0.971619. Entropy: 0.967180.
episode: 4518   score: 260.0  epsilon: 1.0    steps: 829  evaluation reward: 368.8
Training network. lr: 0.000189. clip: 0.075588
Iteration 10900: Policy loss: 0.005462. Value loss: 2.334187. Entropy: 0.919670.
Iteration 10901: Policy loss: 0.002740. V

Iteration 10959: Policy loss: 0.001659. Value loss: 1.730901. Entropy: 1.132710.
episode: 4539   score: 265.0  epsilon: 1.0    steps: 145  evaluation reward: 347.8
episode: 4540   score: 330.0  epsilon: 1.0    steps: 282  evaluation reward: 347.7
episode: 4541   score: 330.0  epsilon: 1.0    steps: 766  evaluation reward: 347.85
Training network. lr: 0.000188. clip: 0.075362
Iteration 10960: Policy loss: 0.004480. Value loss: 2.130038. Entropy: 0.931808.
Iteration 10961: Policy loss: 0.003573. Value loss: 1.460713. Entropy: 0.890082.
Iteration 10962: Policy loss: 0.005128. Value loss: 1.149348. Entropy: 0.895211.
Training network. lr: 0.000188. clip: 0.075362
Iteration 10963: Policy loss: 0.007679. Value loss: 5.976480. Entropy: 0.848678.
Iteration 10964: Policy loss: 0.002195. Value loss: 3.708936. Entropy: 0.796332.
Iteration 10965: Policy loss: 0.009708. Value loss: 3.191661. Entropy: 0.807816.
episode: 4542   score: 220.0  epsilon: 1.0    steps: 586  evaluation reward: 348.45
episo

Training network. lr: 0.000188. clip: 0.075250
Iteration 11023: Policy loss: 0.001943. Value loss: 2.252580. Entropy: 1.291338.
Iteration 11024: Policy loss: -0.001902. Value loss: 0.950326. Entropy: 1.295547.
Iteration 11025: Policy loss: -0.004567. Value loss: 0.608825. Entropy: 1.293385.
episode: 4563   score: 210.0  epsilon: 1.0    steps: 157  evaluation reward: 346.4
Training network. lr: 0.000188. clip: 0.075250
Iteration 11026: Policy loss: 0.008419. Value loss: 2.731060. Entropy: 1.231681.
Iteration 11027: Policy loss: 0.006739. Value loss: 1.622246. Entropy: 1.213264.
Iteration 11028: Policy loss: 0.002171. Value loss: 1.363271. Entropy: 1.224440.
episode: 4564   score: 260.0  epsilon: 1.0    steps: 107  evaluation reward: 346.1
Training network. lr: 0.000188. clip: 0.075250
Iteration 11029: Policy loss: 0.001789. Value loss: 3.134084. Entropy: 1.205494.
Iteration 11030: Policy loss: -0.006070. Value loss: 2.068942. Entropy: 1.224740.
Iteration 11031: Policy loss: -0.005578. V

Iteration 11091: Policy loss: 0.006887. Value loss: 2.521616. Entropy: 1.102478.
Training network. lr: 0.000188. clip: 0.075138
Iteration 11092: Policy loss: 0.007251. Value loss: 5.235759. Entropy: 1.330361.
Iteration 11093: Policy loss: 0.010906. Value loss: 2.661055. Entropy: 1.301355.
Iteration 11094: Policy loss: 0.011117. Value loss: 1.685065. Entropy: 1.326195.
episode: 4583   score: 485.0  epsilon: 1.0    steps: 260  evaluation reward: 371.4
episode: 4584   score: 450.0  epsilon: 1.0    steps: 879  evaluation reward: 374.1
Training network. lr: 0.000188. clip: 0.075138
Iteration 11095: Policy loss: 0.009741. Value loss: 2.468916. Entropy: 1.161866.
Iteration 11096: Policy loss: 0.006307. Value loss: 1.363282. Entropy: 1.180487.
Iteration 11097: Policy loss: -0.004135. Value loss: 1.112844. Entropy: 1.164572.
Training network. lr: 0.000188. clip: 0.075138
Iteration 11098: Policy loss: 0.007563. Value loss: 4.211754. Entropy: 1.197138.
Iteration 11099: Policy loss: 0.002386. Valu

Iteration 11157: Policy loss: -0.005526. Value loss: 1.482442. Entropy: 1.047961.
episode: 4605   score: 320.0  epsilon: 1.0    steps: 293  evaluation reward: 377.35
episode: 4606   score: 465.0  epsilon: 1.0    steps: 578  evaluation reward: 378.45
Training network. lr: 0.000187. clip: 0.074912
Iteration 11158: Policy loss: 0.004321. Value loss: 2.772893. Entropy: 0.882106.
Iteration 11159: Policy loss: 0.008153. Value loss: 1.749331. Entropy: 0.874721.
Iteration 11160: Policy loss: -0.000165. Value loss: 1.463799. Entropy: 0.906871.
episode: 4607   score: 180.0  epsilon: 1.0    steps: 38  evaluation reward: 381.0
Training network. lr: 0.000187. clip: 0.074912
Iteration 11161: Policy loss: 0.003714. Value loss: 2.451641. Entropy: 1.040236.
Iteration 11162: Policy loss: -0.003825. Value loss: 1.495082. Entropy: 1.042226.
Iteration 11163: Policy loss: -0.003533. Value loss: 1.375578. Entropy: 1.009840.
Training network. lr: 0.000187. clip: 0.074912
Iteration 11164: Policy loss: 0.005042

Iteration 11223: Policy loss: -0.011576. Value loss: 1.271125. Entropy: 1.258762.
Training network. lr: 0.000187. clip: 0.074800
Iteration 11224: Policy loss: 0.011328. Value loss: 6.966179. Entropy: 1.233276.
Iteration 11225: Policy loss: 0.021165. Value loss: 5.106690. Entropy: 1.209982.
Iteration 11226: Policy loss: 0.022901. Value loss: 3.025918. Entropy: 1.228819.
Training network. lr: 0.000187. clip: 0.074800
Iteration 11227: Policy loss: 0.006598. Value loss: 1.933207. Entropy: 1.227174.
Iteration 11228: Policy loss: 0.002658. Value loss: 1.061687. Entropy: 1.231404.
Iteration 11229: Policy loss: -0.005502. Value loss: 0.900653. Entropy: 1.212994.
episode: 4627   score: 245.0  epsilon: 1.0    steps: 614  evaluation reward: 391.4
episode: 4628   score: 210.0  epsilon: 1.0    steps: 797  evaluation reward: 391.45
Training network. lr: 0.000187. clip: 0.074800
Iteration 11230: Policy loss: 0.003046. Value loss: 3.434452. Entropy: 1.221277.
Iteration 11231: Policy loss: 0.001071. Va

Iteration 11286: Policy loss: 0.001088. Value loss: 1.059576. Entropy: 1.059052.
episode: 4652   score: 210.0  epsilon: 1.0    steps: 89  evaluation reward: 376.85
episode: 4653   score: 460.0  epsilon: 1.0    steps: 479  evaluation reward: 377.1
episode: 4654   score: 425.0  epsilon: 1.0    steps: 721  evaluation reward: 379.6
episode: 4655   score: 240.0  epsilon: 1.0    steps: 851  evaluation reward: 379.5
Training network. lr: 0.000187. clip: 0.074688
Iteration 11287: Policy loss: 0.006665. Value loss: 2.840886. Entropy: 0.888114.
Iteration 11288: Policy loss: 0.004424. Value loss: 1.957439. Entropy: 0.927992.
Iteration 11289: Policy loss: -0.003394. Value loss: 1.541112. Entropy: 0.933061.
Training network. lr: 0.000187. clip: 0.074688
Iteration 11290: Policy loss: 0.000354. Value loss: 2.175805. Entropy: 0.875705.
Iteration 11291: Policy loss: -0.001247. Value loss: 1.305344. Entropy: 0.906361.
Iteration 11292: Policy loss: -0.008335. Value loss: 0.959978. Entropy: 0.893151.
Trai

Iteration 11352: Policy loss: 0.000395. Value loss: 1.281625. Entropy: 1.186678.
Training network. lr: 0.000186. clip: 0.074463
Iteration 11353: Policy loss: 0.003250. Value loss: 1.976186. Entropy: 1.271321.
Iteration 11354: Policy loss: -0.002883. Value loss: 1.163969. Entropy: 1.284395.
Iteration 11355: Policy loss: -0.006174. Value loss: 0.920021. Entropy: 1.278520.
episode: 4674   score: 395.0  epsilon: 1.0    steps: 134  evaluation reward: 372.5
episode: 4675   score: 155.0  epsilon: 1.0    steps: 700  evaluation reward: 370.95
Training network. lr: 0.000186. clip: 0.074463
Iteration 11356: Policy loss: 0.006003. Value loss: 1.941039. Entropy: 1.080132.
Iteration 11357: Policy loss: 0.011730. Value loss: 1.005586. Entropy: 1.081918.
Iteration 11358: Policy loss: 0.005380. Value loss: 0.749123. Entropy: 1.074338.
episode: 4676   score: 390.0  epsilon: 1.0    steps: 339  evaluation reward: 368.05
episode: 4677   score: 420.0  epsilon: 1.0    steps: 481  evaluation reward: 366.6
epi

Iteration 11416: Policy loss: 0.001581. Value loss: 2.165792. Entropy: 1.104342.
Iteration 11417: Policy loss: 0.002785. Value loss: 1.104070. Entropy: 1.102499.
Iteration 11418: Policy loss: -0.002579. Value loss: 0.795349. Entropy: 1.102870.
episode: 4698   score: 230.0  epsilon: 1.0    steps: 640  evaluation reward: 343.8
episode: 4699   score: 365.0  epsilon: 1.0    steps: 710  evaluation reward: 342.45
Training network. lr: 0.000186. clip: 0.074350
Iteration 11419: Policy loss: 0.005452. Value loss: 1.979808. Entropy: 1.174262.
Iteration 11420: Policy loss: 0.003751. Value loss: 1.125427. Entropy: 1.181727.
Iteration 11421: Policy loss: -0.003080. Value loss: 0.967713. Entropy: 1.156002.
episode: 4700   score: 215.0  epsilon: 1.0    steps: 202  evaluation reward: 340.35
Training network. lr: 0.000186. clip: 0.074350
Iteration 11422: Policy loss: 0.001424. Value loss: 2.298223. Entropy: 1.025110.
Iteration 11423: Policy loss: 0.004992. Value loss: 1.491078. Entropy: 1.010741.
Itera

Training network. lr: 0.000186. clip: 0.074237
Iteration 11479: Policy loss: 0.006151. Value loss: 2.896870. Entropy: 0.939721.
Iteration 11480: Policy loss: -0.000513. Value loss: 1.789544. Entropy: 0.947439.
Iteration 11481: Policy loss: 0.001569. Value loss: 1.393777. Entropy: 0.921981.
Training network. lr: 0.000186. clip: 0.074237
Iteration 11482: Policy loss: 0.008497. Value loss: 1.968501. Entropy: 1.251839.
Iteration 11483: Policy loss: 0.000372. Value loss: 1.168618. Entropy: 1.270353.
Iteration 11484: Policy loss: -0.006078. Value loss: 0.885057. Entropy: 1.255313.
episode: 4724   score: 315.0  epsilon: 1.0    steps: 56  evaluation reward: 311.4
episode: 4725   score: 210.0  epsilon: 1.0    steps: 279  evaluation reward: 311.4
episode: 4726   score: 390.0  epsilon: 1.0    steps: 623  evaluation reward: 308.8
episode: 4727   score: 260.0  epsilon: 1.0    steps: 833  evaluation reward: 309.55
Training network. lr: 0.000186. clip: 0.074237
Iteration 11485: Policy loss: 0.003102.

episode: 4746   score: 515.0  epsilon: 1.0    steps: 239  evaluation reward: 316.25
Training network. lr: 0.000185. clip: 0.074125
Iteration 11545: Policy loss: 0.011832. Value loss: 2.883712. Entropy: 1.228287.
Iteration 11546: Policy loss: 0.003189. Value loss: 1.731544. Entropy: 1.215761.
Iteration 11547: Policy loss: -0.000691. Value loss: 1.354274. Entropy: 1.211471.
episode: 4747   score: 330.0  epsilon: 1.0    steps: 12  evaluation reward: 318.4
episode: 4748   score: 260.0  epsilon: 1.0    steps: 456  evaluation reward: 318.5
Training network. lr: 0.000185. clip: 0.074125
Iteration 11548: Policy loss: 0.004450. Value loss: 1.873664. Entropy: 0.893512.
Iteration 11549: Policy loss: -0.000575. Value loss: 1.067288. Entropy: 0.877630.
Iteration 11550: Policy loss: 0.000523. Value loss: 0.929627. Entropy: 0.886217.
episode: 4749   score: 595.0  epsilon: 1.0    steps: 284  evaluation reward: 319.3
episode: 4750   score: 210.0  epsilon: 1.0    steps: 644  evaluation reward: 322.4
Tra

Iteration 11606: Policy loss: 0.014446. Value loss: 3.175552. Entropy: 1.153069.
Iteration 11607: Policy loss: 0.010130. Value loss: 1.998269. Entropy: 1.161110.
Training network. lr: 0.000185. clip: 0.073900
Iteration 11608: Policy loss: 0.002199. Value loss: 2.519535. Entropy: 1.148175.
Iteration 11609: Policy loss: -0.000129. Value loss: 1.496902. Entropy: 1.144575.
Iteration 11610: Policy loss: -0.004037. Value loss: 1.125348. Entropy: 1.146035.
Training network. lr: 0.000185. clip: 0.073900
Iteration 11611: Policy loss: 0.005207. Value loss: 3.356519. Entropy: 1.358170.
Iteration 11612: Policy loss: -0.002435. Value loss: 1.879799. Entropy: 1.359479.
Iteration 11613: Policy loss: 0.000517. Value loss: 1.501727. Entropy: 1.356598.
episode: 4773   score: 935.0  epsilon: 1.0    steps: 262  evaluation reward: 310.15
Training network. lr: 0.000185. clip: 0.073900
Iteration 11614: Policy loss: 0.008211. Value loss: 3.771535. Entropy: 1.150038.
Iteration 11615: Policy loss: 0.002597. Val

episode: 4793   score: 820.0  epsilon: 1.0    steps: 650  evaluation reward: 319.15
Training network. lr: 0.000184. clip: 0.073788
Iteration 11674: Policy loss: 0.002089. Value loss: 3.040841. Entropy: 1.208683.
Iteration 11675: Policy loss: 0.003544. Value loss: 1.574229. Entropy: 1.225536.
Iteration 11676: Policy loss: -0.004955. Value loss: 1.216137. Entropy: 1.203446.
episode: 4794   score: 210.0  epsilon: 1.0    steps: 15  evaluation reward: 323.7
episode: 4795   score: 670.0  epsilon: 1.0    steps: 947  evaluation reward: 321.45
Training network. lr: 0.000184. clip: 0.073788
Iteration 11677: Policy loss: 0.000858. Value loss: 2.089024. Entropy: 1.142999.
Iteration 11678: Policy loss: 0.003616. Value loss: 1.237272. Entropy: 1.134270.
Iteration 11679: Policy loss: -0.005110. Value loss: 0.952356. Entropy: 1.138765.
Training network. lr: 0.000184. clip: 0.073788
Iteration 11680: Policy loss: 0.002551. Value loss: 4.322924. Entropy: 1.311311.
Iteration 11681: Policy loss: 0.006838. 

episode: 4815   score: 650.0  epsilon: 1.0    steps: 294  evaluation reward: 338.85
Training network. lr: 0.000184. clip: 0.073675
Iteration 11740: Policy loss: 0.005016. Value loss: 2.285139. Entropy: 1.383918.
Iteration 11741: Policy loss: -0.000809. Value loss: 1.360546. Entropy: 1.352760.
Iteration 11742: Policy loss: -0.008727. Value loss: 0.979082. Entropy: 1.368178.
episode: 4816   score: 345.0  epsilon: 1.0    steps: 24  evaluation reward: 342.15
episode: 4817   score: 210.0  epsilon: 1.0    steps: 746  evaluation reward: 343.45
episode: 4818   score: 535.0  epsilon: 1.0    steps: 939  evaluation reward: 343.45
Training network. lr: 0.000184. clip: 0.073675
Iteration 11743: Policy loss: 0.003189. Value loss: 2.942460. Entropy: 1.082811.
Iteration 11744: Policy loss: -0.000829. Value loss: 1.760639. Entropy: 1.093865.
Iteration 11745: Policy loss: -0.003911. Value loss: 1.403440. Entropy: 1.089632.
episode: 4819   score: 395.0  epsilon: 1.0    steps: 482  evaluation reward: 345.

episode: 4840   score: 315.0  epsilon: 1.0    steps: 689  evaluation reward: 355.05
Training network. lr: 0.000184. clip: 0.073450
Iteration 11803: Policy loss: 0.005003. Value loss: 3.090980. Entropy: 1.076400.
Iteration 11804: Policy loss: -0.001807. Value loss: 1.956188. Entropy: 1.065484.
Iteration 11805: Policy loss: -0.003162. Value loss: 1.607809. Entropy: 1.081780.
Training network. lr: 0.000184. clip: 0.073450
Iteration 11806: Policy loss: 0.004578. Value loss: 2.480854. Entropy: 1.083168.
Iteration 11807: Policy loss: 0.002995. Value loss: 1.494982. Entropy: 1.069595.
Iteration 11808: Policy loss: -0.003378. Value loss: 1.074076. Entropy: 1.058624.
episode: 4841   score: 260.0  epsilon: 1.0    steps: 427  evaluation reward: 354.75
Training network. lr: 0.000184. clip: 0.073450
Iteration 11809: Policy loss: 0.002614. Value loss: 2.100806. Entropy: 1.249364.
Iteration 11810: Policy loss: -0.001979. Value loss: 1.473334. Entropy: 1.216021.
Iteration 11811: Policy loss: -0.003365

episode: 4861   score: 440.0  epsilon: 1.0    steps: 630  evaluation reward: 361.8
Training network. lr: 0.000183. clip: 0.073337
Iteration 11869: Policy loss: 0.010215. Value loss: 3.759090. Entropy: 1.276987.
Iteration 11870: Policy loss: 0.003949. Value loss: 2.226823. Entropy: 1.286308.
Iteration 11871: Policy loss: 0.004648. Value loss: 1.578799. Entropy: 1.283621.
episode: 4862   score: 285.0  epsilon: 1.0    steps: 189  evaluation reward: 364.1
Training network. lr: 0.000183. clip: 0.073337
Iteration 11872: Policy loss: 0.001990. Value loss: 4.940511. Entropy: 1.206135.
Iteration 11873: Policy loss: 0.006198. Value loss: 3.634040. Entropy: 1.194667.
Iteration 11874: Policy loss: -0.000049. Value loss: 2.352810. Entropy: 1.200598.
episode: 4863   score: 390.0  epsilon: 1.0    steps: 415  evaluation reward: 361.1
Training network. lr: 0.000183. clip: 0.073337
Iteration 11875: Policy loss: 0.006307. Value loss: 2.658474. Entropy: 1.120915.
Iteration 11876: Policy loss: 0.000197. Va

Iteration 11937: Policy loss: 0.001783. Value loss: 1.934323. Entropy: 1.259952.
episode: 4881   score: 260.0  epsilon: 1.0    steps: 177  evaluation reward: 371.95
Training network. lr: 0.000183. clip: 0.073225
Iteration 11938: Policy loss: 0.004551. Value loss: 3.469799. Entropy: 1.199126.
Iteration 11939: Policy loss: 0.002520. Value loss: 2.229485. Entropy: 1.209289.
Iteration 11940: Policy loss: 0.000575. Value loss: 1.847152. Entropy: 1.202484.
episode: 4882   score: 325.0  epsilon: 1.0    steps: 706  evaluation reward: 371.4
episode: 4883   score: 410.0  epsilon: 1.0    steps: 976  evaluation reward: 369.0
Training network. lr: 0.000183. clip: 0.073225
Iteration 11941: Policy loss: 0.004719. Value loss: 5.371810. Entropy: 1.101856.
Iteration 11942: Policy loss: 0.000569. Value loss: 2.935969. Entropy: 1.114075.
Iteration 11943: Policy loss: 0.000588. Value loss: 2.097619. Entropy: 1.117049.
episode: 4884   score: 770.0  epsilon: 1.0    steps: 538  evaluation reward: 369.45
episo

Training network. lr: 0.000182. clip: 0.073000
Iteration 12001: Policy loss: 0.003240. Value loss: 2.479072. Entropy: 1.175207.
Iteration 12002: Policy loss: 0.001082. Value loss: 1.262375. Entropy: 1.175354.
Iteration 12003: Policy loss: -0.002395. Value loss: 0.904287. Entropy: 1.180135.
Training network. lr: 0.000182. clip: 0.073000
Iteration 12004: Policy loss: 0.015480. Value loss: 2.616474. Entropy: 1.303401.
Iteration 12005: Policy loss: 0.006349. Value loss: 1.293683. Entropy: 1.290135.
Iteration 12006: Policy loss: -0.002607. Value loss: 0.757218. Entropy: 1.285147.
episode: 4905   score: 215.0  epsilon: 1.0    steps: 1001  evaluation reward: 380.4
Training network. lr: 0.000182. clip: 0.073000
Iteration 12007: Policy loss: 0.009046. Value loss: 5.874397. Entropy: 1.230839.
Iteration 12008: Policy loss: 0.009218. Value loss: 4.085537. Entropy: 1.247231.
Iteration 12009: Policy loss: 0.010538. Value loss: 3.825801. Entropy: 1.245831.
episode: 4906   score: 415.0  epsilon: 1.0  

Training network. lr: 0.000182. clip: 0.072887
Iteration 12067: Policy loss: -0.001643. Value loss: 5.224264. Entropy: 1.159349.
Iteration 12068: Policy loss: -0.002904. Value loss: 3.745922. Entropy: 1.169294.
Iteration 12069: Policy loss: 0.008492. Value loss: 2.796140. Entropy: 1.150427.
episode: 4927   score: 375.0  epsilon: 1.0    steps: 326  evaluation reward: 395.4
episode: 4928   score: 520.0  epsilon: 1.0    steps: 500  evaluation reward: 396.0
episode: 4929   score: 470.0  epsilon: 1.0    steps: 695  evaluation reward: 398.05
Training network. lr: 0.000182. clip: 0.072887
Iteration 12070: Policy loss: -0.000174. Value loss: 2.939386. Entropy: 1.155812.
Iteration 12071: Policy loss: -0.000485. Value loss: 1.892841. Entropy: 1.139087.
Iteration 12072: Policy loss: -0.004198. Value loss: 1.555934. Entropy: 1.143720.
Training network. lr: 0.000182. clip: 0.072887
Iteration 12073: Policy loss: 0.006599. Value loss: 2.832470. Entropy: 1.081959.
Iteration 12074: Policy loss: 0.00822

episode: 4952   score: 210.0  epsilon: 1.0    steps: 724  evaluation reward: 391.65
Training network. lr: 0.000182. clip: 0.072775
Iteration 12130: Policy loss: 0.008165. Value loss: 6.635999. Entropy: 1.047629.
Iteration 12131: Policy loss: 0.004865. Value loss: 4.111731. Entropy: 1.026913.
Iteration 12132: Policy loss: 0.004238. Value loss: 3.463506. Entropy: 1.059721.
Training network. lr: 0.000182. clip: 0.072775
Iteration 12133: Policy loss: 0.006066. Value loss: 3.598065. Entropy: 1.156952.
Iteration 12134: Policy loss: 0.003785. Value loss: 1.823752. Entropy: 1.163000.
Iteration 12135: Policy loss: -0.002870. Value loss: 1.305443. Entropy: 1.180022.
episode: 4953   score: 360.0  epsilon: 1.0    steps: 335  evaluation reward: 390.15
episode: 4954   score: 210.0  epsilon: 1.0    steps: 574  evaluation reward: 389.8
episode: 4955   score: 595.0  epsilon: 1.0    steps: 900  evaluation reward: 387.7
Training network. lr: 0.000182. clip: 0.072775
Iteration 12136: Policy loss: 0.009094

Iteration 12195: Policy loss: -0.010022. Value loss: 1.203175. Entropy: 1.238123.
episode: 4975   score: 645.0  epsilon: 1.0    steps: 573  evaluation reward: 371.9
episode: 4976   score: 365.0  epsilon: 1.0    steps: 844  evaluation reward: 377.0
Training network. lr: 0.000182. clip: 0.072663
Iteration 12196: Policy loss: 0.005921. Value loss: 2.472922. Entropy: 1.123705.
Iteration 12197: Policy loss: 0.003990. Value loss: 1.516873. Entropy: 1.119016.
Iteration 12198: Policy loss: 0.003312. Value loss: 1.032247. Entropy: 1.129447.
Training network. lr: 0.000182. clip: 0.072663
Iteration 12199: Policy loss: 0.005631. Value loss: 4.582081. Entropy: 1.194128.
Iteration 12200: Policy loss: 0.002080. Value loss: 3.586825. Entropy: 1.195692.
Iteration 12201: Policy loss: -0.004779. Value loss: 3.254438. Entropy: 1.194278.
episode: 4977   score: 260.0  epsilon: 1.0    steps: 24  evaluation reward: 377.0
episode: 4978   score: 870.0  epsilon: 1.0    steps: 950  evaluation reward: 375.35
Train

Iteration 12260: Policy loss: 0.011482. Value loss: 1.765594. Entropy: 1.218412.
Iteration 12261: Policy loss: 0.001078. Value loss: 1.353557. Entropy: 1.214991.
episode: 4998   score: 285.0  epsilon: 1.0    steps: 217  evaluation reward: 374.35
episode: 4999   score: 280.0  epsilon: 1.0    steps: 340  evaluation reward: 371.6
episode: 5000   score: 305.0  epsilon: 1.0    steps: 588  evaluation reward: 369.95
now time :  2019-02-23 03:52:30.537488
episode: 5001   score: 320.0  epsilon: 1.0    steps: 1007  evaluation reward: 370.9
Training network. lr: 0.000181. clip: 0.072438
Iteration 12262: Policy loss: 0.004890. Value loss: 2.743203. Entropy: 1.114924.
Iteration 12263: Policy loss: 0.001575. Value loss: 1.755346. Entropy: 1.100701.
Iteration 12264: Policy loss: -0.001952. Value loss: 1.220756. Entropy: 1.111656.
Training network. lr: 0.000181. clip: 0.072438
Iteration 12265: Policy loss: 0.010534. Value loss: 4.457872. Entropy: 1.048838.
Iteration 12266: Policy loss: 0.010974. Value

Iteration 12324: Policy loss: 0.003274. Value loss: 1.976009. Entropy: 1.093269.
Training network. lr: 0.000181. clip: 0.072325
Iteration 12325: Policy loss: -0.000265. Value loss: 3.015785. Entropy: 1.247338.
Iteration 12326: Policy loss: -0.003700. Value loss: 1.617648. Entropy: 1.230600.
Iteration 12327: Policy loss: -0.007701. Value loss: 1.218533. Entropy: 1.224437.
episode: 5022   score: 150.0  epsilon: 1.0    steps: 21  evaluation reward: 372.0
episode: 5023   score: 135.0  epsilon: 1.0    steps: 656  evaluation reward: 371.4
Training network. lr: 0.000181. clip: 0.072325
Iteration 12328: Policy loss: 0.006688. Value loss: 3.845883. Entropy: 1.121624.
Iteration 12329: Policy loss: 0.004397. Value loss: 2.178904. Entropy: 1.111310.
Iteration 12330: Policy loss: -0.001411. Value loss: 1.894174. Entropy: 1.111258.
Training network. lr: 0.000181. clip: 0.072325
Iteration 12331: Policy loss: 0.010793. Value loss: 3.219587. Entropy: 1.259227.
Iteration 12332: Policy loss: 0.001420. Va

Iteration 12388: Policy loss: 0.007884. Value loss: 3.194735. Entropy: 1.113849.
Iteration 12389: Policy loss: 0.011723. Value loss: 1.747297. Entropy: 1.109434.
Iteration 12390: Policy loss: 0.001924. Value loss: 1.335614. Entropy: 1.124588.
episode: 5046   score: 355.0  epsilon: 1.0    steps: 66  evaluation reward: 376.0
episode: 5047   score: 180.0  epsilon: 1.0    steps: 814  evaluation reward: 376.05
Training network. lr: 0.000181. clip: 0.072213
Iteration 12391: Policy loss: 0.007934. Value loss: 2.876260. Entropy: 0.961316.
Iteration 12392: Policy loss: 0.004758. Value loss: 1.685042. Entropy: 0.968782.
Iteration 12393: Policy loss: 0.002017. Value loss: 1.395870. Entropy: 0.993968.
Training network. lr: 0.000181. clip: 0.072213
Iteration 12394: Policy loss: 0.008010. Value loss: 6.492255. Entropy: 1.131620.
Iteration 12395: Policy loss: 0.007051. Value loss: 3.504361. Entropy: 1.151985.
Iteration 12396: Policy loss: 0.010625. Value loss: 2.799049. Entropy: 1.140620.
Training ne

episode: 5068   score: 225.0  epsilon: 1.0    steps: 103  evaluation reward: 392.6
episode: 5069   score: 345.0  epsilon: 1.0    steps: 966  evaluation reward: 391.55
Training network. lr: 0.000180. clip: 0.071988
Iteration 12454: Policy loss: 0.011309. Value loss: 4.142272. Entropy: 1.243440.
Iteration 12455: Policy loss: -0.002813. Value loss: 2.576488. Entropy: 1.233966.
Iteration 12456: Policy loss: -0.003472. Value loss: 1.978332. Entropy: 1.248857.
episode: 5070   score: 295.0  epsilon: 1.0    steps: 604  evaluation reward: 393.2
Training network. lr: 0.000180. clip: 0.071988
Iteration 12457: Policy loss: 0.004565. Value loss: 2.999007. Entropy: 1.121066.
Iteration 12458: Policy loss: -0.001596. Value loss: 1.866255. Entropy: 1.142876.
Iteration 12459: Policy loss: -0.004501. Value loss: 1.373902. Entropy: 1.142590.
episode: 5071   score: 530.0  epsilon: 1.0    steps: 355  evaluation reward: 394.05
Training network. lr: 0.000180. clip: 0.071988
Iteration 12460: Policy loss: 0.000

Iteration 12518: Policy loss: -0.002026. Value loss: 1.734860. Entropy: 1.230029.
Iteration 12519: Policy loss: -0.012687. Value loss: 1.384391. Entropy: 1.209746.
episode: 5092   score: 330.0  epsilon: 1.0    steps: 215  evaluation reward: 392.35
episode: 5093   score: 255.0  epsilon: 1.0    steps: 887  evaluation reward: 389.25
Training network. lr: 0.000180. clip: 0.071875
Iteration 12520: Policy loss: 0.000553. Value loss: 3.181374. Entropy: 1.027475.
Iteration 12521: Policy loss: 0.004933. Value loss: 1.974916. Entropy: 1.042081.
Iteration 12522: Policy loss: -0.004773. Value loss: 1.698319. Entropy: 1.031565.
episode: 5094   score: 325.0  epsilon: 1.0    steps: 437  evaluation reward: 386.65
Training network. lr: 0.000180. clip: 0.071875
Iteration 12523: Policy loss: -0.001939. Value loss: 3.480581. Entropy: 0.889427.
Iteration 12524: Policy loss: -0.001823. Value loss: 2.451645. Entropy: 0.875826.
Iteration 12525: Policy loss: -0.007065. Value loss: 1.801445. Entropy: 0.882767.


Iteration 12581: Policy loss: 0.009454. Value loss: 1.901514. Entropy: 1.219422.
Iteration 12582: Policy loss: 0.002338. Value loss: 1.464308. Entropy: 1.255416.
Training network. lr: 0.000179. clip: 0.071763
Iteration 12583: Policy loss: 0.003600. Value loss: 2.346568. Entropy: 1.187333.
Iteration 12584: Policy loss: 0.001312. Value loss: 1.510131. Entropy: 1.199225.
Iteration 12585: Policy loss: -0.002258. Value loss: 1.166745. Entropy: 1.208904.
Training network. lr: 0.000179. clip: 0.071763
Iteration 12586: Policy loss: 0.003081. Value loss: 2.451763. Entropy: 1.271935.
Iteration 12587: Policy loss: 0.000291. Value loss: 1.175437. Entropy: 1.275224.
Iteration 12588: Policy loss: -0.004244. Value loss: 0.799998. Entropy: 1.281591.
episode: 5117   score: 425.0  epsilon: 1.0    steps: 329  evaluation reward: 361.15
episode: 5118   score: 215.0  epsilon: 1.0    steps: 658  evaluation reward: 363.25
Training network. lr: 0.000179. clip: 0.071763
Iteration 12589: Policy loss: 0.016573. V

Iteration 12647: Policy loss: -0.003269. Value loss: 1.167229. Entropy: 1.278670.
Iteration 12648: Policy loss: -0.006253. Value loss: 0.951094. Entropy: 1.289867.
Training network. lr: 0.000179. clip: 0.071650
Iteration 12649: Policy loss: 0.006178. Value loss: 3.115799. Entropy: 1.169245.
Iteration 12650: Policy loss: 0.001329. Value loss: 1.747472. Entropy: 1.152887.
Iteration 12651: Policy loss: 0.002578. Value loss: 1.303797. Entropy: 1.170834.
episode: 5139   score: 820.0  epsilon: 1.0    steps: 249  evaluation reward: 372.55
Training network. lr: 0.000179. clip: 0.071538
Iteration 12652: Policy loss: 0.002500. Value loss: 3.160494. Entropy: 1.163693.
Iteration 12653: Policy loss: 0.000190. Value loss: 1.893955. Entropy: 1.168873.
Iteration 12654: Policy loss: -0.004768. Value loss: 1.347720. Entropy: 1.171956.
episode: 5140   score: 465.0  epsilon: 1.0    steps: 97  evaluation reward: 371.8
episode: 5141   score: 500.0  epsilon: 1.0    steps: 320  evaluation reward: 374.35
episo

Iteration 12711: Policy loss: -0.007266. Value loss: 1.253228. Entropy: 1.071546.
episode: 5163   score: 435.0  epsilon: 1.0    steps: 109  evaluation reward: 355.45
episode: 5164   score: 260.0  epsilon: 1.0    steps: 298  evaluation reward: 357.2
Training network. lr: 0.000179. clip: 0.071425
Iteration 12712: Policy loss: 0.001150. Value loss: 2.406087. Entropy: 0.905020.
Iteration 12713: Policy loss: 0.005900. Value loss: 1.530897. Entropy: 0.935026.
Iteration 12714: Policy loss: -0.002940. Value loss: 1.191021. Entropy: 0.906366.
episode: 5165   score: 180.0  epsilon: 1.0    steps: 662  evaluation reward: 352.8
Training network. lr: 0.000179. clip: 0.071425
Iteration 12715: Policy loss: 0.003677. Value loss: 7.257524. Entropy: 0.879499.
Iteration 12716: Policy loss: 0.005894. Value loss: 2.883360. Entropy: 0.911939.
Iteration 12717: Policy loss: 0.013382. Value loss: 1.603814. Entropy: 0.884479.
episode: 5166   score: 295.0  epsilon: 1.0    steps: 397  evaluation reward: 349.4
Trai

Iteration 12778: Policy loss: 0.003137. Value loss: 2.980830. Entropy: 1.249432.
Iteration 12779: Policy loss: 0.005047. Value loss: 1.926245. Entropy: 1.236267.
Iteration 12780: Policy loss: -0.000879. Value loss: 1.482186. Entropy: 1.223509.
episode: 5183   score: 500.0  epsilon: 1.0    steps: 155  evaluation reward: 363.7
episode: 5184   score: 395.0  epsilon: 1.0    steps: 785  evaluation reward: 362.2
Training network. lr: 0.000178. clip: 0.071313
Iteration 12781: Policy loss: 0.008109. Value loss: 2.494067. Entropy: 1.155179.
Iteration 12782: Policy loss: 0.001916. Value loss: 1.484737. Entropy: 1.161328.
Iteration 12783: Policy loss: 0.001603. Value loss: 1.356124. Entropy: 1.141979.
episode: 5185   score: 415.0  epsilon: 1.0    steps: 4  evaluation reward: 362.25
Training network. lr: 0.000178. clip: 0.071313
Iteration 12784: Policy loss: 0.004854. Value loss: 2.409960. Entropy: 1.088993.
Iteration 12785: Policy loss: -0.000457. Value loss: 1.483176. Entropy: 1.049945.
Iteratio

Iteration 12843: Policy loss: 0.012179. Value loss: 1.406029. Entropy: 1.215538.
episode: 5206   score: 210.0  epsilon: 1.0    steps: 69  evaluation reward: 361.25
Training network. lr: 0.000178. clip: 0.071200
Iteration 12844: Policy loss: 0.003918. Value loss: 2.629833. Entropy: 1.138970.
Iteration 12845: Policy loss: -0.001471. Value loss: 1.519197. Entropy: 1.150412.
Iteration 12846: Policy loss: -0.006106. Value loss: 0.984532. Entropy: 1.138891.
episode: 5207   score: 770.0  epsilon: 1.0    steps: 780  evaluation reward: 361.25
Training network. lr: 0.000178. clip: 0.071200
Iteration 12847: Policy loss: -0.001589. Value loss: 3.249610. Entropy: 1.247918.
Iteration 12848: Policy loss: 0.006708. Value loss: 2.063937. Entropy: 1.246297.
Iteration 12849: Policy loss: 0.002015. Value loss: 1.408670. Entropy: 1.239445.
episode: 5208   score: 375.0  epsilon: 1.0    steps: 362  evaluation reward: 364.85
episode: 5209   score: 575.0  epsilon: 1.0    steps: 595  evaluation reward: 362.85
T

Iteration 12909: Policy loss: -0.002246. Value loss: 1.112238. Entropy: 0.986661.
episode: 5228   score: 365.0  epsilon: 1.0    steps: 203  evaluation reward: 376.95
Training network. lr: 0.000177. clip: 0.070975
Iteration 12910: Policy loss: 0.004666. Value loss: 3.682490. Entropy: 1.088847.
Iteration 12911: Policy loss: 0.009032. Value loss: 1.958830. Entropy: 1.096331.
Iteration 12912: Policy loss: -0.003159. Value loss: 1.226066. Entropy: 1.079075.
Training network. lr: 0.000177. clip: 0.070975
Iteration 12913: Policy loss: 0.005777. Value loss: 2.503807. Entropy: 1.182686.
Iteration 12914: Policy loss: 0.008729. Value loss: 1.298548. Entropy: 1.189046.
Iteration 12915: Policy loss: 0.000974. Value loss: 0.965042. Entropy: 1.200137.
episode: 5229   score: 670.0  epsilon: 1.0    steps: 92  evaluation reward: 376.7
episode: 5230   score: 470.0  epsilon: 1.0    steps: 440  evaluation reward: 376.9
Training network. lr: 0.000177. clip: 0.070975
Iteration 12916: Policy loss: 0.006210. V

Iteration 12972: Policy loss: -0.003948. Value loss: 0.942646. Entropy: 1.140694.
Training network. lr: 0.000177. clip: 0.070863
Iteration 12973: Policy loss: 0.004211. Value loss: 3.077287. Entropy: 0.910466.
Iteration 12974: Policy loss: 0.002946. Value loss: 1.711355. Entropy: 0.906281.
Iteration 12975: Policy loss: -0.003327. Value loss: 1.331732. Entropy: 0.917584.
episode: 5253   score: 320.0  epsilon: 1.0    steps: 350  evaluation reward: 381.55
Training network. lr: 0.000177. clip: 0.070863
Iteration 12976: Policy loss: 0.003081. Value loss: 3.008253. Entropy: 1.286047.
Iteration 12977: Policy loss: 0.003144. Value loss: 1.686015. Entropy: 1.287165.
Iteration 12978: Policy loss: 0.000667. Value loss: 1.259875. Entropy: 1.285332.
episode: 5254   score: 435.0  epsilon: 1.0    steps: 701  evaluation reward: 382.65
Training network. lr: 0.000177. clip: 0.070863
Iteration 12979: Policy loss: 0.007243. Value loss: 4.206303. Entropy: 1.057654.
Iteration 12980: Policy loss: 0.017749. V

Iteration 13040: Policy loss: 0.007595. Value loss: 1.840033. Entropy: 0.954195.
Iteration 13041: Policy loss: -0.001358. Value loss: 1.467742. Entropy: 0.967515.
Training network. lr: 0.000177. clip: 0.070750
Iteration 13042: Policy loss: 0.007022. Value loss: 3.465759. Entropy: 1.227048.
Iteration 13043: Policy loss: 0.010475. Value loss: 1.611838. Entropy: 1.260090.
Iteration 13044: Policy loss: 0.001478. Value loss: 1.239120. Entropy: 1.260315.
episode: 5272   score: 580.0  epsilon: 1.0    steps: 65  evaluation reward: 392.15
episode: 5273   score: 355.0  epsilon: 1.0    steps: 934  evaluation reward: 392.7
Training network. lr: 0.000177. clip: 0.070750
Iteration 13045: Policy loss: 0.003362. Value loss: 3.700351. Entropy: 1.078810.
Iteration 13046: Policy loss: 0.006780. Value loss: 2.179698. Entropy: 1.095832.
Iteration 13047: Policy loss: -0.000552. Value loss: 1.791703. Entropy: 1.083726.
Training network. lr: 0.000177. clip: 0.070750
Iteration 13048: Policy loss: 0.007240. Val

Iteration 13106: Policy loss: 0.002338. Value loss: 1.904502. Entropy: 0.908226.
Iteration 13107: Policy loss: 0.002197. Value loss: 1.413988. Entropy: 0.885115.
episode: 5294   score: 390.0  epsilon: 1.0    steps: 302  evaluation reward: 388.4
episode: 5295   score: 695.0  epsilon: 1.0    steps: 791  evaluation reward: 388.1
Training network. lr: 0.000176. clip: 0.070525
Iteration 13108: Policy loss: 0.007340. Value loss: 2.878876. Entropy: 0.929977.
Iteration 13109: Policy loss: 0.004086. Value loss: 1.563267. Entropy: 0.948267.
Iteration 13110: Policy loss: -0.002499. Value loss: 1.204317. Entropy: 0.952891.
episode: 5296   score: 320.0  epsilon: 1.0    steps: 480  evaluation reward: 392.45
Training network. lr: 0.000176. clip: 0.070525
Iteration 13111: Policy loss: 0.009556. Value loss: 2.518656. Entropy: 1.097109.
Iteration 13112: Policy loss: 0.003619. Value loss: 1.387758. Entropy: 1.083673.
Iteration 13113: Policy loss: 0.001004. Value loss: 1.142677. Entropy: 1.076912.
episode

Iteration 13169: Policy loss: -0.001425. Value loss: 1.558303. Entropy: 1.108799.
Iteration 13170: Policy loss: -0.001803. Value loss: 1.289444. Entropy: 1.112770.
episode: 5319   score: 410.0  epsilon: 1.0    steps: 402  evaluation reward: 378.45
episode: 5320   score: 545.0  epsilon: 1.0    steps: 577  evaluation reward: 374.35
episode: 5321   score: 185.0  epsilon: 1.0    steps: 925  evaluation reward: 375.0
Training network. lr: 0.000176. clip: 0.070413
Iteration 13171: Policy loss: 0.005088. Value loss: 2.753335. Entropy: 0.682556.
Iteration 13172: Policy loss: 0.007582. Value loss: 1.812666. Entropy: 0.683093.
Iteration 13173: Policy loss: -0.000647. Value loss: 1.547790. Entropy: 0.672565.
episode: 5322   score: 180.0  epsilon: 1.0    steps: 781  evaluation reward: 375.05
Training network. lr: 0.000176. clip: 0.070413
Iteration 13174: Policy loss: 0.006710. Value loss: 2.451038. Entropy: 0.786487.
Iteration 13175: Policy loss: 0.003037. Value loss: 1.418485. Entropy: 0.780315.
I

Iteration 13234: Policy loss: 0.002802. Value loss: 2.453762. Entropy: 1.122283.
Iteration 13235: Policy loss: -0.000065. Value loss: 1.441885. Entropy: 1.089032.
Iteration 13236: Policy loss: -0.002695. Value loss: 0.984836. Entropy: 1.105171.
episode: 5342   score: 495.0  epsilon: 1.0    steps: 649  evaluation reward: 373.7
episode: 5343   score: 180.0  epsilon: 1.0    steps: 843  evaluation reward: 376.85
Training network. lr: 0.000176. clip: 0.070300
Iteration 13237: Policy loss: 0.006968. Value loss: 2.867692. Entropy: 1.290403.
Iteration 13238: Policy loss: 0.005836. Value loss: 2.009202. Entropy: 1.280048.
Iteration 13239: Policy loss: -0.001388. Value loss: 1.650734. Entropy: 1.290560.
episode: 5344   score: 370.0  epsilon: 1.0    steps: 938  evaluation reward: 376.4
Training network. lr: 0.000176. clip: 0.070300
Iteration 13240: Policy loss: 0.002104. Value loss: 4.076887. Entropy: 1.091887.
Iteration 13241: Policy loss: 0.002664. Value loss: 2.410523. Entropy: 1.072980.
Itera

Iteration 13299: Policy loss: 0.002933. Value loss: 3.742918. Entropy: 0.871560.
Training network. lr: 0.000175. clip: 0.070187
Iteration 13300: Policy loss: 0.003857. Value loss: 4.232433. Entropy: 1.207888.
Iteration 13301: Policy loss: 0.005307. Value loss: 2.210814. Entropy: 1.187936.
Iteration 13302: Policy loss: -0.004640. Value loss: 1.831383. Entropy: 1.209646.
Training network. lr: 0.000175. clip: 0.070075
Iteration 13303: Policy loss: 0.007726. Value loss: 2.495346. Entropy: 1.229422.
Iteration 13304: Policy loss: 0.000622. Value loss: 1.435497. Entropy: 1.228485.
Iteration 13305: Policy loss: -0.005299. Value loss: 1.054568. Entropy: 1.230374.
episode: 5365   score: 390.0  epsilon: 1.0    steps: 153  evaluation reward: 377.0
episode: 5366   score: 625.0  epsilon: 1.0    steps: 266  evaluation reward: 375.6
Training network. lr: 0.000175. clip: 0.070075
Iteration 13306: Policy loss: 0.008119. Value loss: 4.965828. Entropy: 1.164750.
Iteration 13307: Policy loss: -0.000677. Va

Iteration 13363: Policy loss: 0.005270. Value loss: 2.687200. Entropy: 1.166784.
Iteration 13364: Policy loss: 0.006311. Value loss: 1.564101. Entropy: 1.171637.
Iteration 13365: Policy loss: -0.001219. Value loss: 1.267702. Entropy: 1.169117.
episode: 5389   score: 260.0  epsilon: 1.0    steps: 1001  evaluation reward: 356.55
Training network. lr: 0.000175. clip: 0.069963
Iteration 13366: Policy loss: 0.010915. Value loss: 2.596575. Entropy: 1.219467.
Iteration 13367: Policy loss: 0.006060. Value loss: 1.633837. Entropy: 1.233594.
Iteration 13368: Policy loss: 0.001148. Value loss: 1.149367. Entropy: 1.221308.
episode: 5390   score: 495.0  epsilon: 1.0    steps: 532  evaluation reward: 356.2
Training network. lr: 0.000175. clip: 0.069963
Iteration 13369: Policy loss: 0.009755. Value loss: 2.472760. Entropy: 1.175129.
Iteration 13370: Policy loss: 0.002458. Value loss: 1.317856. Entropy: 1.151864.
Iteration 13371: Policy loss: -0.009801. Value loss: 1.101632. Entropy: 1.175744.
episode

Training network. lr: 0.000175. clip: 0.069850
Iteration 13429: Policy loss: -0.000165. Value loss: 2.952812. Entropy: 1.243699.
Iteration 13430: Policy loss: 0.000554. Value loss: 1.805502. Entropy: 1.244048.
Iteration 13431: Policy loss: 0.001055. Value loss: 1.536909. Entropy: 1.252102.
Training network. lr: 0.000175. clip: 0.069850
Iteration 13432: Policy loss: 0.007132. Value loss: 2.411326. Entropy: 1.284683.
Iteration 13433: Policy loss: 0.007657. Value loss: 1.411850. Entropy: 1.298013.
Iteration 13434: Policy loss: 0.007202. Value loss: 1.070140. Entropy: 1.304437.
episode: 5411   score: 175.0  epsilon: 1.0    steps: 196  evaluation reward: 369.5
episode: 5412   score: 245.0  epsilon: 1.0    steps: 829  evaluation reward: 365.95
Training network. lr: 0.000175. clip: 0.069850
Iteration 13435: Policy loss: 0.003538. Value loss: 3.943243. Entropy: 1.176346.
Iteration 13436: Policy loss: 0.004619. Value loss: 2.047166. Entropy: 1.169455.
Iteration 13437: Policy loss: -0.000116. Va

episode: 5433   score: 605.0  epsilon: 1.0    steps: 498  evaluation reward: 377.3
Training network. lr: 0.000174. clip: 0.069738
Iteration 13495: Policy loss: 0.012215. Value loss: 5.216525. Entropy: 1.157355.
Iteration 13496: Policy loss: 0.007851. Value loss: 4.211107. Entropy: 1.152016.
Iteration 13497: Policy loss: 0.016439. Value loss: 2.836829. Entropy: 1.157249.
episode: 5434   score: 670.0  epsilon: 1.0    steps: 269  evaluation reward: 380.5
Training network. lr: 0.000174. clip: 0.069738
Iteration 13498: Policy loss: 0.005621. Value loss: 1.604332. Entropy: 1.049445.
Iteration 13499: Policy loss: 0.003697. Value loss: 1.068228. Entropy: 1.028418.
Iteration 13500: Policy loss: 0.002460. Value loss: 0.831678. Entropy: 1.046130.
Training network. lr: 0.000174. clip: 0.069625
Iteration 13501: Policy loss: 0.011569. Value loss: 3.698737. Entropy: 1.204239.
Iteration 13502: Policy loss: 0.005440. Value loss: 1.796317. Entropy: 1.229629.
Iteration 13503: Policy loss: -0.000381. Valu

Iteration 13560: Policy loss: -0.007738. Value loss: 1.399065. Entropy: 0.946470.
episode: 5455   score: 80.0  epsilon: 1.0    steps: 843  evaluation reward: 389.35
Training network. lr: 0.000174. clip: 0.069513
Iteration 13561: Policy loss: 0.000075. Value loss: 3.469004. Entropy: 0.811303.
Iteration 13562: Policy loss: 0.009189. Value loss: 1.986420. Entropy: 0.801633.
Iteration 13563: Policy loss: -0.004614. Value loss: 1.525290. Entropy: 0.794438.
episode: 5456   score: 460.0  epsilon: 1.0    steps: 352  evaluation reward: 385.95
Training network. lr: 0.000174. clip: 0.069513
Iteration 13564: Policy loss: 0.003926. Value loss: 2.616227. Entropy: 1.109147.
Iteration 13565: Policy loss: -0.001168. Value loss: 1.690562. Entropy: 1.068103.
Iteration 13566: Policy loss: -0.001161. Value loss: 1.284166. Entropy: 1.078574.
episode: 5457   score: 135.0  epsilon: 1.0    steps: 505  evaluation reward: 387.95
episode: 5458   score: 380.0  epsilon: 1.0    steps: 935  evaluation reward: 383.95


Training network. lr: 0.000174. clip: 0.069400
Iteration 13624: Policy loss: 0.003943. Value loss: 2.405553. Entropy: 1.049798.
Iteration 13625: Policy loss: 0.002078. Value loss: 1.459840. Entropy: 1.039046.
Iteration 13626: Policy loss: 0.001927. Value loss: 1.077665. Entropy: 1.038591.
Training network. lr: 0.000174. clip: 0.069400
Iteration 13627: Policy loss: 0.005546. Value loss: 5.739398. Entropy: 1.026114.
Iteration 13628: Policy loss: 0.008402. Value loss: 2.388773. Entropy: 0.983566.
Iteration 13629: Policy loss: 0.002944. Value loss: 1.725535. Entropy: 1.011989.
episode: 5479   score: 210.0  epsilon: 1.0    steps: 489  evaluation reward: 376.5
episode: 5480   score: 210.0  epsilon: 1.0    steps: 630  evaluation reward: 376.45
episode: 5481   score: 240.0  epsilon: 1.0    steps: 723  evaluation reward: 372.45
episode: 5482   score: 565.0  epsilon: 1.0    steps: 863  evaluation reward: 372.45
Training network. lr: 0.000174. clip: 0.069400
Iteration 13630: Policy loss: 0.000808

Training network. lr: 0.000173. clip: 0.069288
Iteration 13687: Policy loss: 0.002911. Value loss: 2.056117. Entropy: 0.829857.
Iteration 13688: Policy loss: 0.009421. Value loss: 1.516918. Entropy: 0.814696.
Iteration 13689: Policy loss: 0.004597. Value loss: 1.245530. Entropy: 0.830101.
Training network. lr: 0.000173. clip: 0.069288
Iteration 13690: Policy loss: 0.000670. Value loss: 2.497062. Entropy: 1.085328.
Iteration 13691: Policy loss: 0.004448. Value loss: 1.677425. Entropy: 1.111962.
Iteration 13692: Policy loss: -0.001964. Value loss: 1.338751. Entropy: 1.136153.
episode: 5504   score: 545.0  epsilon: 1.0    steps: 418  evaluation reward: 374.65
Training network. lr: 0.000173. clip: 0.069288
Iteration 13693: Policy loss: 0.005086. Value loss: 4.413281. Entropy: 1.322582.
Iteration 13694: Policy loss: 0.002865. Value loss: 4.539032. Entropy: 1.313374.
Iteration 13695: Policy loss: 0.003171. Value loss: 3.827332. Entropy: 1.315173.
Training network. lr: 0.000173. clip: 0.06928

episode: 5523   score: 260.0  epsilon: 1.0    steps: 546  evaluation reward: 377.4
episode: 5524   score: 320.0  epsilon: 1.0    steps: 1005  evaluation reward: 376.15
Training network. lr: 0.000173. clip: 0.069062
Iteration 13756: Policy loss: 0.007511. Value loss: 2.267208. Entropy: 1.145416.
Iteration 13757: Policy loss: -0.000928. Value loss: 1.333862. Entropy: 1.154194.
Iteration 13758: Policy loss: -0.004498. Value loss: 1.035100. Entropy: 1.148667.
Training network. lr: 0.000173. clip: 0.069062
Iteration 13759: Policy loss: 0.003018. Value loss: 5.938465. Entropy: 1.171890.
Iteration 13760: Policy loss: 0.005918. Value loss: 3.382554. Entropy: 1.157338.
Iteration 13761: Policy loss: 0.003586. Value loss: 2.819751. Entropy: 1.145842.
episode: 5525   score: 410.0  epsilon: 1.0    steps: 252  evaluation reward: 373.65
episode: 5526   score: 760.0  epsilon: 1.0    steps: 889  evaluation reward: 373.05
Training network. lr: 0.000173. clip: 0.069062
Iteration 13762: Policy loss: -0.00

episode: 5545   score: 390.0  epsilon: 1.0    steps: 608  evaluation reward: 376.95
episode: 5546   score: 750.0  epsilon: 1.0    steps: 983  evaluation reward: 376.65
Training network. lr: 0.000172. clip: 0.068950
Iteration 13822: Policy loss: -0.000923. Value loss: 3.337269. Entropy: 1.151244.
Iteration 13823: Policy loss: -0.005237. Value loss: 1.814275. Entropy: 1.171709.
Iteration 13824: Policy loss: -0.005063. Value loss: 1.493148. Entropy: 1.169341.
Training network. lr: 0.000172. clip: 0.068950
Iteration 13825: Policy loss: 0.002146. Value loss: 2.797566. Entropy: 1.110117.
Iteration 13826: Policy loss: 0.004757. Value loss: 1.584984. Entropy: 1.127082.
Iteration 13827: Policy loss: 0.000354. Value loss: 1.212950. Entropy: 1.112730.
Training network. lr: 0.000172. clip: 0.068950
Iteration 13828: Policy loss: 0.007168. Value loss: 2.878911. Entropy: 1.196900.
Iteration 13829: Policy loss: -0.002394. Value loss: 1.775715. Entropy: 1.225164.
Iteration 13830: Policy loss: 0.001300.

Training network. lr: 0.000172. clip: 0.068838
Iteration 13885: Policy loss: 0.006148. Value loss: 6.105183. Entropy: 1.118286.
Iteration 13886: Policy loss: 0.007324. Value loss: 3.944289. Entropy: 1.100065.
Iteration 13887: Policy loss: 0.009175. Value loss: 3.432908. Entropy: 1.126819.
episode: 5570   score: 240.0  epsilon: 1.0    steps: 311  evaluation reward: 384.6
Training network. lr: 0.000172. clip: 0.068838
Iteration 13888: Policy loss: 0.001605. Value loss: 3.040894. Entropy: 0.915539.
Iteration 13889: Policy loss: 0.002757. Value loss: 1.950182. Entropy: 0.925143.
Iteration 13890: Policy loss: -0.004907. Value loss: 1.554270. Entropy: 0.923468.
episode: 5571   score: 515.0  epsilon: 1.0    steps: 142  evaluation reward: 386.0
Training network. lr: 0.000172. clip: 0.068838
Iteration 13891: Policy loss: 0.005498. Value loss: 3.928585. Entropy: 1.094748.
Iteration 13892: Policy loss: -0.004421. Value loss: 2.142611. Entropy: 1.060560.
Iteration 13893: Policy loss: -0.005005. Va

Iteration 13951: Policy loss: 0.004932. Value loss: 2.290496. Entropy: 1.389803.
Iteration 13952: Policy loss: -0.000072. Value loss: 1.349396. Entropy: 1.379576.
Iteration 13953: Policy loss: -0.005210. Value loss: 0.987324. Entropy: 1.374210.
episode: 5591   score: 375.0  epsilon: 1.0    steps: 624  evaluation reward: 397.9
episode: 5592   score: 210.0  epsilon: 1.0    steps: 908  evaluation reward: 397.75
Training network. lr: 0.000172. clip: 0.068613
Iteration 13954: Policy loss: 0.004155. Value loss: 4.008450. Entropy: 1.186071.
Iteration 13955: Policy loss: 0.011652. Value loss: 2.011017. Entropy: 1.158715.
Iteration 13956: Policy loss: 0.001407. Value loss: 1.462985. Entropy: 1.169589.
episode: 5593   score: 260.0  epsilon: 1.0    steps: 249  evaluation reward: 395.0
episode: 5594   score: 285.0  epsilon: 1.0    steps: 294  evaluation reward: 395.3
episode: 5595   score: 385.0  epsilon: 1.0    steps: 690  evaluation reward: 393.1
Training network. lr: 0.000172. clip: 0.068613
It

episode: 5613   score: 240.0  epsilon: 1.0    steps: 993  evaluation reward: 390.55
Training network. lr: 0.000171. clip: 0.068500
Iteration 14017: Policy loss: 0.003248. Value loss: 2.140931. Entropy: 1.375750.
Iteration 14018: Policy loss: 0.007616. Value loss: 1.260977. Entropy: 1.361804.
Iteration 14019: Policy loss: -0.000399. Value loss: 1.127267. Entropy: 1.367744.
episode: 5614   score: 405.0  epsilon: 1.0    steps: 573  evaluation reward: 390.35
episode: 5615   score: 260.0  epsilon: 1.0    steps: 892  evaluation reward: 391.8
Training network. lr: 0.000171. clip: 0.068500
Iteration 14020: Policy loss: 0.002909. Value loss: 2.313667. Entropy: 1.252228.
Iteration 14021: Policy loss: 0.008375. Value loss: 1.529133. Entropy: 1.275518.
Iteration 14022: Policy loss: -0.000744. Value loss: 1.220613. Entropy: 1.228129.
Training network. lr: 0.000171. clip: 0.068500
Iteration 14023: Policy loss: 0.011313. Value loss: 3.058589. Entropy: 1.250462.
Iteration 14024: Policy loss: 0.014254.

episode: 5638   score: 320.0  epsilon: 1.0    steps: 608  evaluation reward: 372.95
Training network. lr: 0.000171. clip: 0.068388
Iteration 14080: Policy loss: 0.005017. Value loss: 4.166945. Entropy: 1.220092.
Iteration 14081: Policy loss: -0.004366. Value loss: 2.983660. Entropy: 1.207933.
Iteration 14082: Policy loss: -0.002902. Value loss: 2.843014. Entropy: 1.225446.
Training network. lr: 0.000171. clip: 0.068388
Iteration 14083: Policy loss: 0.002879. Value loss: 2.698042. Entropy: 1.127786.
Iteration 14084: Policy loss: 0.001394. Value loss: 1.498569. Entropy: 1.124830.
Iteration 14085: Policy loss: 0.000949. Value loss: 1.115310. Entropy: 1.122381.
Training network. lr: 0.000171. clip: 0.068388
Iteration 14086: Policy loss: 0.009744. Value loss: 3.230117. Entropy: 1.298334.
Iteration 14087: Policy loss: 0.006203. Value loss: 2.321453. Entropy: 1.308925.
Iteration 14088: Policy loss: -0.000296. Value loss: 1.877698. Entropy: 1.308744.
episode: 5639   score: 360.0  epsilon: 1.0 

Iteration 14146: Policy loss: 0.012748. Value loss: 4.305968. Entropy: 1.238539.
Iteration 14147: Policy loss: 0.011503. Value loss: 2.220562. Entropy: 1.239468.
Iteration 14148: Policy loss: 0.016061. Value loss: 1.619060. Entropy: 1.261066.
episode: 5659   score: 380.0  epsilon: 1.0    steps: 218  evaluation reward: 372.2
episode: 5660   score: 290.0  epsilon: 1.0    steps: 569  evaluation reward: 373.55
Training network. lr: 0.000171. clip: 0.068275
Iteration 14149: Policy loss: 0.003086. Value loss: 2.048754. Entropy: 1.042601.
Iteration 14150: Policy loss: 0.002566. Value loss: 1.411631. Entropy: 1.057839.
Iteration 14151: Policy loss: -0.003531. Value loss: 1.237648. Entropy: 1.070818.
Training network. lr: 0.000170. clip: 0.068163
Iteration 14152: Policy loss: 0.003203. Value loss: 3.289687. Entropy: 1.208804.
Iteration 14153: Policy loss: -0.000206. Value loss: 2.146510. Entropy: 1.199795.
Iteration 14154: Policy loss: -0.004107. Value loss: 1.504576. Entropy: 1.211578.
episode

Training network. lr: 0.000170. clip: 0.068050
Iteration 14212: Policy loss: 0.006460. Value loss: 2.113339. Entropy: 1.005634.
Iteration 14213: Policy loss: 0.004222. Value loss: 1.282126. Entropy: 0.999890.
Iteration 14214: Policy loss: -0.000185. Value loss: 1.164658. Entropy: 1.038283.
episode: 5682   score: 485.0  epsilon: 1.0    steps: 823  evaluation reward: 371.85
Training network. lr: 0.000170. clip: 0.068050
Iteration 14215: Policy loss: 0.002045. Value loss: 2.159642. Entropy: 1.134649.
Iteration 14216: Policy loss: 0.000470. Value loss: 1.858401. Entropy: 1.111657.
Iteration 14217: Policy loss: -0.002601. Value loss: 1.994342. Entropy: 1.118848.
episode: 5683   score: 155.0  epsilon: 1.0    steps: 570  evaluation reward: 371.25
episode: 5684   score: 325.0  epsilon: 1.0    steps: 703  evaluation reward: 370.2
Training network. lr: 0.000170. clip: 0.068050
Iteration 14218: Policy loss: 0.001096. Value loss: 4.441742. Entropy: 1.199869.
Iteration 14219: Policy loss: -0.002676

Iteration 14280: Policy loss: -0.003255. Value loss: 1.159495. Entropy: 1.179699.
now time :  2019-02-23 04:33:56.254405
episode: 5701   score: 310.0  epsilon: 1.0    steps: 313  evaluation reward: 383.15
episode: 5702   score: 260.0  epsilon: 1.0    steps: 554  evaluation reward: 383.85
Training network. lr: 0.000170. clip: 0.067938
Iteration 14281: Policy loss: 0.007498. Value loss: 2.139731. Entropy: 1.156426.
Iteration 14282: Policy loss: -0.001255. Value loss: 1.224150. Entropy: 1.153491.
Iteration 14283: Policy loss: -0.006696. Value loss: 0.958157. Entropy: 1.139639.
episode: 5703   score: 345.0  epsilon: 1.0    steps: 405  evaluation reward: 382.8
Training network. lr: 0.000170. clip: 0.067938
Iteration 14284: Policy loss: 0.006446. Value loss: 3.678202. Entropy: 0.966119.
Iteration 14285: Policy loss: 0.003016. Value loss: 2.368316. Entropy: 0.979195.
Iteration 14286: Policy loss: 0.004641. Value loss: 1.185726. Entropy: 0.967207.
episode: 5704   score: 260.0  epsilon: 1.0    

Iteration 14344: Policy loss: 0.005981. Value loss: 2.024574. Entropy: 1.215167.
Iteration 14345: Policy loss: 0.005227. Value loss: 1.476650. Entropy: 1.190634.
Iteration 14346: Policy loss: 0.002682. Value loss: 1.420941. Entropy: 1.179415.
episode: 5724   score: 155.0  epsilon: 1.0    steps: 243  evaluation reward: 385.2
episode: 5725   score: 210.0  epsilon: 1.0    steps: 492  evaluation reward: 384.6
episode: 5726   score: 210.0  epsilon: 1.0    steps: 698  evaluation reward: 383.85
Training network. lr: 0.000170. clip: 0.067825
Iteration 14347: Policy loss: 0.001349. Value loss: 1.904738. Entropy: 1.180797.
Iteration 14348: Policy loss: 0.000606. Value loss: 1.393451. Entropy: 1.190962.
Iteration 14349: Policy loss: 0.001975. Value loss: 1.032230. Entropy: 1.178238.
Training network. lr: 0.000170. clip: 0.067825
Iteration 14350: Policy loss: 0.001432. Value loss: 2.906112. Entropy: 0.999665.
Iteration 14351: Policy loss: 0.002944. Value loss: 1.424745. Entropy: 1.004648.
Iteratio

episode: 5743   score: 365.0  epsilon: 1.0    steps: 287  evaluation reward: 387.2
Training network. lr: 0.000169. clip: 0.067600
Iteration 14413: Policy loss: 0.006228. Value loss: 2.548310. Entropy: 1.139163.
Iteration 14414: Policy loss: 0.008104. Value loss: 1.556243. Entropy: 1.137841.
Iteration 14415: Policy loss: 0.003608. Value loss: 1.159204. Entropy: 1.130437.
episode: 5744   score: 390.0  epsilon: 1.0    steps: 452  evaluation reward: 387.65
Training network. lr: 0.000169. clip: 0.067600
Iteration 14416: Policy loss: 0.008227. Value loss: 2.424701. Entropy: 1.154293.
Iteration 14417: Policy loss: -0.000822. Value loss: 1.496796. Entropy: 1.151453.
Iteration 14418: Policy loss: -0.002730. Value loss: 1.132682. Entropy: 1.180689.
episode: 5745   score: 610.0  epsilon: 1.0    steps: 44  evaluation reward: 386.7
Training network. lr: 0.000169. clip: 0.067600
Iteration 14419: Policy loss: 0.001752. Value loss: 2.841089. Entropy: 1.041765.
Iteration 14420: Policy loss: 0.007443. V

episode: 5765   score: 270.0  epsilon: 1.0    steps: 990  evaluation reward: 387.35
Training network. lr: 0.000169. clip: 0.067488
Iteration 14479: Policy loss: -0.000172. Value loss: 4.413785. Entropy: 1.269429.
Iteration 14480: Policy loss: 0.004826. Value loss: 3.406509. Entropy: 1.275388.
Iteration 14481: Policy loss: 0.002610. Value loss: 2.569363. Entropy: 1.269179.
Training network. lr: 0.000169. clip: 0.067488
Iteration 14482: Policy loss: 0.004652. Value loss: 2.174838. Entropy: 1.113986.
Iteration 14483: Policy loss: 0.001969. Value loss: 1.363310. Entropy: 1.117546.
Iteration 14484: Policy loss: -0.000712. Value loss: 1.126101. Entropy: 1.109242.
Training network. lr: 0.000169. clip: 0.067488
Iteration 14485: Policy loss: 0.003122. Value loss: 2.823295. Entropy: 1.106503.
Iteration 14486: Policy loss: 0.000276. Value loss: 1.806695. Entropy: 1.123852.
Iteration 14487: Policy loss: -0.001657. Value loss: 1.432952. Entropy: 1.112886.
episode: 5766   score: 315.0  epsilon: 1.0 

Iteration 14547: Policy loss: -0.001998. Value loss: 1.119469. Entropy: 1.399122.
Training network. lr: 0.000168. clip: 0.067375
Iteration 14548: Policy loss: 0.004886. Value loss: 1.808852. Entropy: 1.285069.
Iteration 14549: Policy loss: 0.000267. Value loss: 1.255443. Entropy: 1.278009.
Iteration 14550: Policy loss: -0.000865. Value loss: 0.994845. Entropy: 1.278531.
episode: 5785   score: 525.0  epsilon: 1.0    steps: 857  evaluation reward: 406.85
Training network. lr: 0.000168. clip: 0.067263
Iteration 14551: Policy loss: 0.004571. Value loss: 2.470530. Entropy: 1.213548.
Iteration 14552: Policy loss: 0.003864. Value loss: 1.492644. Entropy: 1.218387.
Iteration 14553: Policy loss: 0.000688. Value loss: 1.039623. Entropy: 1.222246.
episode: 5786   score: 315.0  epsilon: 1.0    steps: 980  evaluation reward: 409.25
Training network. lr: 0.000168. clip: 0.067263
Iteration 14554: Policy loss: 0.008618. Value loss: 2.052527. Entropy: 1.149485.
Iteration 14555: Policy loss: 0.003593. V

Training network. lr: 0.000168. clip: 0.067150
Iteration 14611: Policy loss: 0.001995. Value loss: 1.885963. Entropy: 1.359379.
Iteration 14612: Policy loss: -0.000376. Value loss: 1.159576. Entropy: 1.352304.
Iteration 14613: Policy loss: -0.001336. Value loss: 0.923853. Entropy: 1.358593.
Training network. lr: 0.000168. clip: 0.067150
Iteration 14614: Policy loss: -0.000001. Value loss: 1.662684. Entropy: 1.519429.
Iteration 14615: Policy loss: 0.000773. Value loss: 1.015281. Entropy: 1.505430.
Iteration 14616: Policy loss: -0.004617. Value loss: 0.900066. Entropy: 1.513280.
episode: 5809   score: 210.0  epsilon: 1.0    steps: 262  evaluation reward: 398.5
Training network. lr: 0.000168. clip: 0.067150
Iteration 14617: Policy loss: 0.005863. Value loss: 1.703836. Entropy: 1.322283.
Iteration 14618: Policy loss: 0.001325. Value loss: 1.397045. Entropy: 1.323370.
Iteration 14619: Policy loss: -0.000558. Value loss: 1.056919. Entropy: 1.309732.
episode: 5810   score: 320.0  epsilon: 1.0

Iteration 14677: Policy loss: 0.006919. Value loss: 5.286727. Entropy: 0.926955.
Iteration 14678: Policy loss: 0.005748. Value loss: 3.704409. Entropy: 0.929955.
Iteration 14679: Policy loss: 0.001039. Value loss: 2.745198. Entropy: 0.945121.
Training network. lr: 0.000168. clip: 0.067038
Iteration 14680: Policy loss: 0.001251. Value loss: 2.232579. Entropy: 1.183624.
Iteration 14681: Policy loss: 0.003420. Value loss: 1.390281. Entropy: 1.184649.
Iteration 14682: Policy loss: 0.000626. Value loss: 1.016650. Entropy: 1.173700.
episode: 5830   score: 315.0  epsilon: 1.0    steps: 418  evaluation reward: 406.8
episode: 5831   score: 395.0  epsilon: 1.0    steps: 701  evaluation reward: 405.1
Training network. lr: 0.000168. clip: 0.067038
Iteration 14683: Policy loss: 0.015627. Value loss: 4.039338. Entropy: 1.373180.
Iteration 14684: Policy loss: 0.010357. Value loss: 2.149532. Entropy: 1.374028.
Iteration 14685: Policy loss: 0.001484. Value loss: 1.824144. Entropy: 1.378812.
Training ne

episode: 5852   score: 480.0  epsilon: 1.0    steps: 216  evaluation reward: 400.1
Training network. lr: 0.000167. clip: 0.066925
Iteration 14743: Policy loss: 0.013057. Value loss: 4.676654. Entropy: 1.001620.
Iteration 14744: Policy loss: 0.022514. Value loss: 2.071024. Entropy: 1.001554.
Iteration 14745: Policy loss: 0.022663. Value loss: 1.261640. Entropy: 0.984072.
episode: 5853   score: 465.0  epsilon: 1.0    steps: 760  evaluation reward: 398.95
Training network. lr: 0.000167. clip: 0.066925
Iteration 14746: Policy loss: 0.009011. Value loss: 3.876573. Entropy: 1.167526.
Iteration 14747: Policy loss: 0.009176. Value loss: 2.324921. Entropy: 1.184539.
Iteration 14748: Policy loss: 0.001876. Value loss: 1.620627. Entropy: 1.173079.
Training network. lr: 0.000167. clip: 0.066925
Iteration 14749: Policy loss: 0.007506. Value loss: 5.648565. Entropy: 1.180699.
Iteration 14750: Policy loss: 0.004746. Value loss: 2.406998. Entropy: 1.172767.
Iteration 14751: Policy loss: 0.005015. Valu

Iteration 14809: Policy loss: 0.006307. Value loss: 3.105565. Entropy: 1.121188.
Iteration 14810: Policy loss: -0.000442. Value loss: 1.836170. Entropy: 1.135456.
Iteration 14811: Policy loss: -0.000917. Value loss: 1.450602. Entropy: 1.147492.
episode: 5873   score: 475.0  epsilon: 1.0    steps: 1000  evaluation reward: 402.55
Training network. lr: 0.000167. clip: 0.066700
Iteration 14812: Policy loss: 0.003458. Value loss: 2.452019. Entropy: 1.402505.
Iteration 14813: Policy loss: 0.002683. Value loss: 1.646537. Entropy: 1.413273.
Iteration 14814: Policy loss: -0.006446. Value loss: 1.301330. Entropy: 1.395013.
episode: 5874   score: 215.0  epsilon: 1.0    steps: 182  evaluation reward: 405.2
episode: 5875   score: 210.0  epsilon: 1.0    steps: 576  evaluation reward: 402.65
Training network. lr: 0.000167. clip: 0.066700
Iteration 14815: Policy loss: 0.004296. Value loss: 2.577076. Entropy: 1.256196.
Iteration 14816: Policy loss: 0.003490. Value loss: 1.542960. Entropy: 1.283890.
Ite

Iteration 14874: Policy loss: -0.005435. Value loss: 0.830818. Entropy: 1.362493.
Training network. lr: 0.000166. clip: 0.066588
Iteration 14875: Policy loss: 0.008371. Value loss: 2.237020. Entropy: 1.016023.
Iteration 14876: Policy loss: 0.003211. Value loss: 1.488956. Entropy: 1.005123.
Iteration 14877: Policy loss: 0.004483. Value loss: 1.174194. Entropy: 1.013520.
episode: 5896   score: 315.0  epsilon: 1.0    steps: 543  evaluation reward: 382.85
episode: 5897   score: 280.0  epsilon: 1.0    steps: 696  evaluation reward: 382.95
Training network. lr: 0.000166. clip: 0.066588
Iteration 14878: Policy loss: 0.005300. Value loss: 2.516328. Entropy: 1.073353.
Iteration 14879: Policy loss: 0.001657. Value loss: 1.555474. Entropy: 1.084012.
Iteration 14880: Policy loss: -0.003913. Value loss: 1.210372. Entropy: 1.075939.
Training network. lr: 0.000166. clip: 0.066588
Iteration 14881: Policy loss: 0.004886. Value loss: 5.967122. Entropy: 1.105513.
Iteration 14882: Policy loss: 0.003664. V

Training network. lr: 0.000166. clip: 0.066475
Iteration 14941: Policy loss: 0.007031. Value loss: 5.748244. Entropy: 1.307921.
Iteration 14942: Policy loss: 0.011183. Value loss: 3.154399. Entropy: 1.270151.
Iteration 14943: Policy loss: 0.001823. Value loss: 2.152087. Entropy: 1.289505.
episode: 5917   score: 620.0  epsilon: 1.0    steps: 986  evaluation reward: 389.2
Training network. lr: 0.000166. clip: 0.066475
Iteration 14944: Policy loss: 0.004574. Value loss: 3.247048. Entropy: 1.287320.
Iteration 14945: Policy loss: 0.002413. Value loss: 1.830673. Entropy: 1.312241.
Iteration 14946: Policy loss: -0.000780. Value loss: 1.379781. Entropy: 1.284006.
episode: 5918   score: 620.0  epsilon: 1.0    steps: 318  evaluation reward: 392.25
Training network. lr: 0.000166. clip: 0.066475
Iteration 14947: Policy loss: 0.007219. Value loss: 5.148067. Entropy: 1.184047.
Iteration 14948: Policy loss: 0.005423. Value loss: 2.942267. Entropy: 1.202263.
Iteration 14949: Policy loss: 0.007615. Val

episode: 5939   score: 265.0  epsilon: 1.0    steps: 549  evaluation reward: 393.8
Training network. lr: 0.000166. clip: 0.066250
Iteration 15007: Policy loss: 0.002634. Value loss: 2.170056. Entropy: 1.106673.
Iteration 15008: Policy loss: -0.002548. Value loss: 1.485434. Entropy: 1.094676.
Iteration 15009: Policy loss: 0.001082. Value loss: 1.164347. Entropy: 1.104266.
Training network. lr: 0.000166. clip: 0.066250
Iteration 15010: Policy loss: -0.001121. Value loss: 2.869929. Entropy: 1.146225.
Iteration 15011: Policy loss: 0.000839. Value loss: 1.676567. Entropy: 1.163795.
Iteration 15012: Policy loss: -0.003683. Value loss: 1.369977. Entropy: 1.154931.
episode: 5940   score: 365.0  epsilon: 1.0    steps: 226  evaluation reward: 394.35
Training network. lr: 0.000166. clip: 0.066250
Iteration 15013: Policy loss: 0.007593. Value loss: 4.265192. Entropy: 1.417257.
Iteration 15014: Policy loss: 0.010845. Value loss: 3.244019. Entropy: 1.422384.
Iteration 15015: Policy loss: 0.024765. V

Iteration 15071: Policy loss: -0.004132. Value loss: 1.667233. Entropy: 1.122164.
Iteration 15072: Policy loss: -0.005498. Value loss: 1.474737. Entropy: 1.152816.
Training network. lr: 0.000165. clip: 0.066138
Iteration 15073: Policy loss: 0.004476. Value loss: 2.624226. Entropy: 1.270391.
Iteration 15074: Policy loss: 0.005558. Value loss: 1.461468. Entropy: 1.286539.
Iteration 15075: Policy loss: 0.002755. Value loss: 1.094530. Entropy: 1.293427.
Training network. lr: 0.000165. clip: 0.066138
Iteration 15076: Policy loss: 0.004979. Value loss: 4.550685. Entropy: 1.311526.
Iteration 15077: Policy loss: 0.005507. Value loss: 2.707427. Entropy: 1.307322.
Iteration 15078: Policy loss: 0.003474. Value loss: 1.742500. Entropy: 1.322392.
Training network. lr: 0.000165. clip: 0.066138
Iteration 15079: Policy loss: 0.006112. Value loss: 5.259109. Entropy: 1.323409.
Iteration 15080: Policy loss: 0.012638. Value loss: 2.617494. Entropy: 1.342762.
Iteration 15081: Policy loss: 0.003236. Value l

episode: 5986   score: 210.0  epsilon: 1.0    steps: 201  evaluation reward: 382.35
Training network. lr: 0.000165. clip: 0.066025
Iteration 15136: Policy loss: 0.004334. Value loss: 3.685670. Entropy: 1.269517.
Iteration 15137: Policy loss: -0.001061. Value loss: 1.937684. Entropy: 1.277999.
Iteration 15138: Policy loss: -0.003839. Value loss: 1.507065. Entropy: 1.261400.
episode: 5987   score: 565.0  epsilon: 1.0    steps: 904  evaluation reward: 381.6
Training network. lr: 0.000165. clip: 0.066025
Iteration 15139: Policy loss: 0.007854. Value loss: 4.729338. Entropy: 1.122341.
Iteration 15140: Policy loss: 0.002506. Value loss: 3.483071. Entropy: 1.144875.
Iteration 15141: Policy loss: -0.001766. Value loss: 2.785599. Entropy: 1.138304.
Training network. lr: 0.000165. clip: 0.066025
Iteration 15142: Policy loss: 0.003425. Value loss: 3.241254. Entropy: 1.103585.
Iteration 15143: Policy loss: 0.001074. Value loss: 2.312711. Entropy: 1.142858.
Iteration 15144: Policy loss: -0.002490. 

Iteration 15202: Policy loss: 0.001486. Value loss: 2.625729. Entropy: 1.346914.
Iteration 15203: Policy loss: 0.001889. Value loss: 1.526704. Entropy: 1.337830.
Iteration 15204: Policy loss: -0.002108. Value loss: 1.143826. Entropy: 1.336254.
episode: 6007   score: 450.0  epsilon: 1.0    steps: 237  evaluation reward: 399.1
episode: 6008   score: 565.0  epsilon: 1.0    steps: 452  evaluation reward: 401.5
Training network. lr: 0.000165. clip: 0.065800
Iteration 15205: Policy loss: 0.002299. Value loss: 4.420660. Entropy: 1.165021.
Iteration 15206: Policy loss: -0.001992. Value loss: 2.908759. Entropy: 1.155661.
Iteration 15207: Policy loss: -0.003929. Value loss: 1.928130. Entropy: 1.152205.
episode: 6009   score: 495.0  epsilon: 1.0    steps: 282  evaluation reward: 403.4
episode: 6010   score: 240.0  epsilon: 1.0    steps: 933  evaluation reward: 404.7
Training network. lr: 0.000165. clip: 0.065800
Iteration 15208: Policy loss: 0.004666. Value loss: 3.402380. Entropy: 1.131383.
Iter

Training network. lr: 0.000164. clip: 0.065688
Iteration 15268: Policy loss: 0.007085. Value loss: 3.506250. Entropy: 1.348961.
Iteration 15269: Policy loss: 0.003309. Value loss: 2.105005. Entropy: 1.348700.
Iteration 15270: Policy loss: 0.004857. Value loss: 1.582456. Entropy: 1.354286.
episode: 6030   score: 495.0  epsilon: 1.0    steps: 113  evaluation reward: 401.7
Training network. lr: 0.000164. clip: 0.065688
Iteration 15271: Policy loss: 0.009126. Value loss: 5.478826. Entropy: 1.332895.
Iteration 15272: Policy loss: 0.009111. Value loss: 3.358135. Entropy: 1.328130.
Iteration 15273: Policy loss: 0.007606. Value loss: 3.168724. Entropy: 1.335961.
Training network. lr: 0.000164. clip: 0.065688
Iteration 15274: Policy loss: 0.009837. Value loss: 2.431597. Entropy: 1.406056.
Iteration 15275: Policy loss: 0.002865. Value loss: 1.420978. Entropy: 1.412450.
Iteration 15276: Policy loss: 0.003210. Value loss: 1.145131. Entropy: 1.423082.
episode: 6031   score: 575.0  epsilon: 1.0    s

Training network. lr: 0.000164. clip: 0.065575
Iteration 15337: Policy loss: 0.005731. Value loss: 4.347013. Entropy: 1.400967.
Iteration 15338: Policy loss: 0.004065. Value loss: 2.368450. Entropy: 1.381809.
Iteration 15339: Policy loss: 0.001662. Value loss: 1.932863. Entropy: 1.402764.
episode: 6049   score: 365.0  epsilon: 1.0    steps: 4  evaluation reward: 412.2
episode: 6050   score: 285.0  epsilon: 1.0    steps: 817  evaluation reward: 412.95
Training network. lr: 0.000164. clip: 0.065575
Iteration 15340: Policy loss: 0.002407. Value loss: 3.203336. Entropy: 1.304698.
Iteration 15341: Policy loss: -0.002449. Value loss: 1.939847. Entropy: 1.323067.
Iteration 15342: Policy loss: -0.005158. Value loss: 1.477288. Entropy: 1.320579.
now time :  2019-02-23 04:55:46.161486
episode: 6051   score: 375.0  epsilon: 1.0    steps: 322  evaluation reward: 411.6
episode: 6052   score: 345.0  epsilon: 1.0    steps: 757  evaluation reward: 410.2
Training network. lr: 0.000164. clip: 0.065575
I

Iteration 15401: Policy loss: 0.009838. Value loss: 2.531704. Entropy: 1.315098.
Iteration 15402: Policy loss: 0.003803. Value loss: 2.222414. Entropy: 1.332995.
Training network. lr: 0.000163. clip: 0.065350
Iteration 15403: Policy loss: 0.009266. Value loss: 4.333629. Entropy: 1.368863.
Iteration 15404: Policy loss: 0.009023. Value loss: 2.513722. Entropy: 1.366713.
Iteration 15405: Policy loss: 0.002280. Value loss: 1.786063. Entropy: 1.375709.
episode: 6073   score: 560.0  epsilon: 1.0    steps: 131  evaluation reward: 404.9
Training network. lr: 0.000163. clip: 0.065350
Iteration 15406: Policy loss: 0.002297. Value loss: 2.211628. Entropy: 1.271739.
Iteration 15407: Policy loss: 0.001106. Value loss: 1.376020. Entropy: 1.267222.
Iteration 15408: Policy loss: -0.003694. Value loss: 1.137232. Entropy: 1.289025.
episode: 6074   score: 345.0  epsilon: 1.0    steps: 842  evaluation reward: 406.4
Training network. lr: 0.000163. clip: 0.065350
Iteration 15409: Policy loss: 0.001981. Valu

Iteration 15466: Policy loss: 0.004734. Value loss: 2.366107. Entropy: 1.370756.
Iteration 15467: Policy loss: 0.000827. Value loss: 1.347993. Entropy: 1.357905.
Iteration 15468: Policy loss: -0.001618. Value loss: 1.030288. Entropy: 1.364181.
episode: 6096   score: 290.0  epsilon: 1.0    steps: 470  evaluation reward: 421.1
Training network. lr: 0.000163. clip: 0.065238
Iteration 15469: Policy loss: 0.005821. Value loss: 4.985777. Entropy: 1.233060.
Iteration 15470: Policy loss: 0.002789. Value loss: 4.337726. Entropy: 1.215107.
Iteration 15471: Policy loss: 0.016964. Value loss: 2.754759. Entropy: 1.187447.
Training network. lr: 0.000163. clip: 0.065238
Iteration 15472: Policy loss: 0.005625. Value loss: 2.274888. Entropy: 1.145460.
Iteration 15473: Policy loss: -0.001256. Value loss: 1.509134. Entropy: 1.144271.
Iteration 15474: Policy loss: 0.002335. Value loss: 1.137660. Entropy: 1.174790.
episode: 6097   score: 260.0  epsilon: 1.0    steps: 139  evaluation reward: 419.5
Training 

Iteration 15533: Policy loss: 0.007194. Value loss: 2.526280. Entropy: 1.224172.
Iteration 15534: Policy loss: 0.003997. Value loss: 2.436949. Entropy: 1.228236.
episode: 6117   score: 575.0  epsilon: 1.0    steps: 5  evaluation reward: 414.95
episode: 6118   score: 110.0  epsilon: 1.0    steps: 246  evaluation reward: 416.35
episode: 6119   score: 785.0  epsilon: 1.0    steps: 405  evaluation reward: 412.5
episode: 6120   score: 240.0  epsilon: 1.0    steps: 684  evaluation reward: 414.75
Training network. lr: 0.000163. clip: 0.065125
Iteration 15535: Policy loss: -0.000327. Value loss: 2.062058. Entropy: 1.123648.
Iteration 15536: Policy loss: 0.000003. Value loss: 1.462175. Entropy: 1.101933.
Iteration 15537: Policy loss: -0.006980. Value loss: 1.215843. Entropy: 1.125920.
episode: 6121   score: 225.0  epsilon: 1.0    steps: 902  evaluation reward: 411.8
Training network. lr: 0.000163. clip: 0.065125
Iteration 15538: Policy loss: 0.003632. Value loss: 2.635111. Entropy: 0.955285.
It

Iteration 15597: Policy loss: -0.004078. Value loss: 1.061085. Entropy: 1.275088.
episode: 6141   score: 210.0  epsilon: 1.0    steps: 250  evaluation reward: 403.2
Training network. lr: 0.000163. clip: 0.065013
Iteration 15598: Policy loss: -0.000626. Value loss: 2.921565. Entropy: 1.073263.
Iteration 15599: Policy loss: -0.004913. Value loss: 1.734616. Entropy: 1.095819.
Iteration 15600: Policy loss: -0.005717. Value loss: 1.331187. Entropy: 1.092175.
episode: 6142   score: 330.0  epsilon: 1.0    steps: 340  evaluation reward: 400.3
episode: 6143   score: 360.0  epsilon: 1.0    steps: 667  evaluation reward: 399.65
Training network. lr: 0.000162. clip: 0.064900
Iteration 15601: Policy loss: -0.001962. Value loss: 2.794965. Entropy: 1.218831.
Iteration 15602: Policy loss: -0.001668. Value loss: 1.802126. Entropy: 1.186314.
Iteration 15603: Policy loss: -0.007370. Value loss: 1.453021. Entropy: 1.209829.
episode: 6144   score: 335.0  epsilon: 1.0    steps: 1011  evaluation reward: 401.

Iteration 15663: Policy loss: -0.004717. Value loss: 1.189744. Entropy: 1.205086.
episode: 6162   score: 465.0  epsilon: 1.0    steps: 101  evaluation reward: 393.45
episode: 6163   score: 530.0  epsilon: 1.0    steps: 396  evaluation reward: 395.4
Training network. lr: 0.000162. clip: 0.064788
Iteration 15664: Policy loss: 0.006003. Value loss: 2.417587. Entropy: 1.040107.
Iteration 15665: Policy loss: 0.006989. Value loss: 1.385111. Entropy: 1.066435.
Iteration 15666: Policy loss: 0.001031. Value loss: 0.974873. Entropy: 1.062500.
episode: 6164   score: 555.0  epsilon: 1.0    steps: 364  evaluation reward: 395.5
Training network. lr: 0.000162. clip: 0.064788
Iteration 15667: Policy loss: 0.007749. Value loss: 2.556514. Entropy: 1.253145.
Iteration 15668: Policy loss: 0.008486. Value loss: 1.354254. Entropy: 1.263213.
Iteration 15669: Policy loss: 0.005249. Value loss: 1.146876. Entropy: 1.246663.
episode: 6165   score: 420.0  epsilon: 1.0    steps: 791  evaluation reward: 398.95
Trai

Iteration 15729: Policy loss: 0.000952. Value loss: 1.058918. Entropy: 1.372424.
Training network. lr: 0.000162. clip: 0.064675
Iteration 15730: Policy loss: 0.002547. Value loss: 3.529071. Entropy: 1.319224.
Iteration 15731: Policy loss: -0.002426. Value loss: 2.091141. Entropy: 1.286958.
Iteration 15732: Policy loss: 0.000338. Value loss: 1.644861. Entropy: 1.299891.
episode: 6184   score: 585.0  epsilon: 1.0    steps: 517  evaluation reward: 381.95
Training network. lr: 0.000162. clip: 0.064675
Iteration 15733: Policy loss: 0.002614. Value loss: 3.046698. Entropy: 1.222377.
Iteration 15734: Policy loss: 0.002287. Value loss: 1.977513. Entropy: 1.202946.
Iteration 15735: Policy loss: -0.002971. Value loss: 1.575603. Entropy: 1.216653.
episode: 6185   score: 150.0  epsilon: 1.0    steps: 275  evaluation reward: 383.45
Training network. lr: 0.000162. clip: 0.064675
Iteration 15736: Policy loss: 0.013430. Value loss: 4.442863. Entropy: 1.149310.
Iteration 15737: Policy loss: 0.004087. V

Iteration 15792: Policy loss: 0.005915. Value loss: 2.055403. Entropy: 1.203072.
Training network. lr: 0.000161. clip: 0.064562
Iteration 15793: Policy loss: 0.005769. Value loss: 4.522347. Entropy: 1.144589.
Iteration 15794: Policy loss: 0.018801. Value loss: 2.260260. Entropy: 1.172530.
Iteration 15795: Policy loss: 0.019899. Value loss: 1.298721. Entropy: 1.148068.
episode: 6209   score: 285.0  epsilon: 1.0    steps: 524  evaluation reward: 378.35
Training network. lr: 0.000161. clip: 0.064562
Iteration 15796: Policy loss: 0.010243. Value loss: 4.763000. Entropy: 1.130821.
Iteration 15797: Policy loss: 0.018250. Value loss: 2.151794. Entropy: 1.115480.
Iteration 15798: Policy loss: 0.015485. Value loss: 1.657158. Entropy: 1.124200.
episode: 6210   score: 180.0  epsilon: 1.0    steps: 116  evaluation reward: 377.6
episode: 6211   score: 290.0  epsilon: 1.0    steps: 291  evaluation reward: 374.75
Training network. lr: 0.000161. clip: 0.064562
Iteration 15799: Policy loss: 0.002374. V

Iteration 15858: Policy loss: -0.002991. Value loss: 1.741638. Entropy: 1.200450.
Training network. lr: 0.000161. clip: 0.064338
Iteration 15859: Policy loss: 0.003691. Value loss: 2.870585. Entropy: 1.152662.
Iteration 15860: Policy loss: 0.001893. Value loss: 1.596884. Entropy: 1.152413.
Iteration 15861: Policy loss: 0.001564. Value loss: 1.185638. Entropy: 1.127430.
episode: 6231   score: 405.0  epsilon: 1.0    steps: 716  evaluation reward: 375.55
episode: 6232   score: 210.0  epsilon: 1.0    steps: 813  evaluation reward: 376.7
Training network. lr: 0.000161. clip: 0.064338
Iteration 15862: Policy loss: 0.002048. Value loss: 2.938055. Entropy: 1.247453.
Iteration 15863: Policy loss: 0.007722. Value loss: 1.981740. Entropy: 1.234383.
Iteration 15864: Policy loss: 0.001447. Value loss: 1.439152. Entropy: 1.247504.
episode: 6233   score: 590.0  epsilon: 1.0    steps: 321  evaluation reward: 376.55
Training network. lr: 0.000161. clip: 0.064338
Iteration 15865: Policy loss: 0.006794. 

Training network. lr: 0.000161. clip: 0.064225
Iteration 15925: Policy loss: 0.001956. Value loss: 3.400501. Entropy: 1.083032.
Iteration 15926: Policy loss: 0.000129. Value loss: 1.948378. Entropy: 1.044923.
Iteration 15927: Policy loss: -0.006028. Value loss: 1.609113. Entropy: 1.082353.
episode: 6252   score: 320.0  epsilon: 1.0    steps: 782  evaluation reward: 398.2
episode: 6253   score: 515.0  epsilon: 1.0    steps: 980  evaluation reward: 398.55
Training network. lr: 0.000161. clip: 0.064225
Iteration 15928: Policy loss: 0.001243. Value loss: 2.860003. Entropy: 1.076683.
Iteration 15929: Policy loss: -0.002417. Value loss: 1.897342. Entropy: 1.088451.
Iteration 15930: Policy loss: -0.005110. Value loss: 1.450245. Entropy: 1.072330.
episode: 6254   score: 310.0  epsilon: 1.0    steps: 369  evaluation reward: 400.5
Training network. lr: 0.000161. clip: 0.064225
Iteration 15931: Policy loss: 0.013264. Value loss: 4.434763. Entropy: 1.072712.
Iteration 15932: Policy loss: 0.003765.

Training network. lr: 0.000160. clip: 0.064113
Iteration 15991: Policy loss: 0.002222. Value loss: 3.038974. Entropy: 1.258518.
Iteration 15992: Policy loss: 0.000794. Value loss: 1.871904. Entropy: 1.273684.
Iteration 15993: Policy loss: -0.002293. Value loss: 1.540816. Entropy: 1.267669.
episode: 6274   score: 210.0  epsilon: 1.0    steps: 16  evaluation reward: 400.7
Training network. lr: 0.000160. clip: 0.064113
Iteration 15994: Policy loss: 0.002467. Value loss: 3.256055. Entropy: 1.125731.
Iteration 15995: Policy loss: 0.001666. Value loss: 2.008783. Entropy: 1.115267.
Iteration 15996: Policy loss: -0.004428. Value loss: 1.519944. Entropy: 1.122808.
episode: 6275   score: 395.0  epsilon: 1.0    steps: 137  evaluation reward: 399.35
episode: 6276   score: 210.0  epsilon: 1.0    steps: 581  evaluation reward: 400.0
episode: 6277   score: 630.0  epsilon: 1.0    steps: 894  evaluation reward: 398.7
Training network. lr: 0.000160. clip: 0.064113
Iteration 15997: Policy loss: 0.002199.

Iteration 16054: Policy loss: 0.000665. Value loss: 4.894886. Entropy: 1.169319.
Iteration 16055: Policy loss: 0.009487. Value loss: 2.756593. Entropy: 1.181305.
Iteration 16056: Policy loss: 0.006192. Value loss: 2.076047. Entropy: 1.145770.
Training network. lr: 0.000160. clip: 0.063887
Iteration 16057: Policy loss: -0.000588. Value loss: 3.595785. Entropy: 1.129892.
Iteration 16058: Policy loss: 0.008439. Value loss: 2.234847. Entropy: 1.098085.
Iteration 16059: Policy loss: 0.000296. Value loss: 1.764421. Entropy: 1.112410.
episode: 6299   score: 845.0  epsilon: 1.0    steps: 81  evaluation reward: 395.25
episode: 6300   score: 180.0  epsilon: 1.0    steps: 885  evaluation reward: 399.1
Training network. lr: 0.000160. clip: 0.063887
Iteration 16060: Policy loss: 0.001362. Value loss: 2.634748. Entropy: 1.174910.
Iteration 16061: Policy loss: 0.003553. Value loss: 1.702852. Entropy: 1.163130.
Iteration 16062: Policy loss: -0.003637. Value loss: 1.263522. Entropy: 1.173699.
Training 

Iteration 16122: Policy loss: 0.008812. Value loss: 4.627936. Entropy: 1.042470.
episode: 6319   score: 210.0  epsilon: 1.0    steps: 931  evaluation reward: 416.95
Training network. lr: 0.000159. clip: 0.063775
Iteration 16123: Policy loss: 0.007517. Value loss: 3.444665. Entropy: 1.132802.
Iteration 16124: Policy loss: 0.005681. Value loss: 1.757953. Entropy: 1.115484.
Iteration 16125: Policy loss: 0.002755. Value loss: 1.394230. Entropy: 1.121721.
episode: 6320   score: 365.0  epsilon: 1.0    steps: 111  evaluation reward: 413.15
episode: 6321   score: 785.0  epsilon: 1.0    steps: 739  evaluation reward: 413.3
Training network. lr: 0.000159. clip: 0.063775
Iteration 16126: Policy loss: -0.000302. Value loss: 3.975267. Entropy: 1.189056.
Iteration 16127: Policy loss: -0.002007. Value loss: 2.607448. Entropy: 1.188814.
Iteration 16128: Policy loss: -0.001502. Value loss: 2.082359. Entropy: 1.173544.
episode: 6322   score: 305.0  epsilon: 1.0    steps: 305  evaluation reward: 417.5
Tr

Iteration 16188: Policy loss: -0.001223. Value loss: 1.655674. Entropy: 1.009301.
episode: 6341   score: 470.0  epsilon: 1.0    steps: 973  evaluation reward: 424.9
Training network. lr: 0.000159. clip: 0.063662
Iteration 16189: Policy loss: 0.004950. Value loss: 3.054079. Entropy: 0.994984.
Iteration 16190: Policy loss: 0.002435. Value loss: 1.918489. Entropy: 1.014915.
Iteration 16191: Policy loss: -0.000202. Value loss: 1.679654. Entropy: 1.014093.
Training network. lr: 0.000159. clip: 0.063662
Iteration 16192: Policy loss: 0.001734. Value loss: 3.131108. Entropy: 1.143764.
Iteration 16193: Policy loss: 0.003035. Value loss: 1.696048. Entropy: 1.132934.
Iteration 16194: Policy loss: 0.000831. Value loss: 1.377931. Entropy: 1.124461.
episode: 6342   score: 335.0  epsilon: 1.0    steps: 868  evaluation reward: 425.4
Training network. lr: 0.000159. clip: 0.063662
Iteration 16195: Policy loss: 0.002801. Value loss: 2.457730. Entropy: 1.246077.
Iteration 16196: Policy loss: 0.000131. Val

Iteration 16256: Policy loss: 0.006833. Value loss: 1.828480. Entropy: 1.201316.
Iteration 16257: Policy loss: 0.006333. Value loss: 1.472368. Entropy: 1.191683.
episode: 6360   score: 415.0  epsilon: 1.0    steps: 715  evaluation reward: 442.05
Training network. lr: 0.000159. clip: 0.063438
Iteration 16258: Policy loss: 0.003014. Value loss: 5.411162. Entropy: 1.166345.
Iteration 16259: Policy loss: 0.011021. Value loss: 3.735023. Entropy: 1.191585.
Iteration 16260: Policy loss: 0.024260. Value loss: 2.447985. Entropy: 1.189603.
episode: 6361   score: 500.0  epsilon: 1.0    steps: 498  evaluation reward: 443.6
episode: 6362   score: 305.0  epsilon: 1.0    steps: 1010  evaluation reward: 444.7
Training network. lr: 0.000159. clip: 0.063438
Iteration 16261: Policy loss: 0.008620. Value loss: 2.670841. Entropy: 1.128386.
Iteration 16262: Policy loss: 0.005726. Value loss: 1.573822. Entropy: 1.133161.
Iteration 16263: Policy loss: 0.005615. Value loss: 1.335410. Entropy: 1.115100.
episode

Iteration 16321: Policy loss: 0.006789. Value loss: 3.919479. Entropy: 0.811224.
Iteration 16322: Policy loss: 0.006373. Value loss: 2.529889. Entropy: 0.807891.
Iteration 16323: Policy loss: 0.001607. Value loss: 1.849717. Entropy: 0.800492.
Training network. lr: 0.000158. clip: 0.063325
Iteration 16324: Policy loss: 0.007309. Value loss: 3.525192. Entropy: 0.972162.
Iteration 16325: Policy loss: 0.003951. Value loss: 2.142217. Entropy: 0.995206.
Iteration 16326: Policy loss: 0.002534. Value loss: 1.496732. Entropy: 1.006517.
episode: 6383   score: 885.0  epsilon: 1.0    steps: 138  evaluation reward: 438.8
Training network. lr: 0.000158. clip: 0.063325
Iteration 16327: Policy loss: 0.009772. Value loss: 7.951569. Entropy: 1.004486.
Iteration 16328: Policy loss: 0.011853. Value loss: 3.939834. Entropy: 1.018002.
Iteration 16329: Policy loss: 0.008774. Value loss: 3.556782. Entropy: 1.017604.
episode: 6384   score: 920.0  epsilon: 1.0    steps: 634  evaluation reward: 443.35
episode: 6

episode: 6405   score: 265.0  epsilon: 1.0    steps: 388  evaluation reward: 453.85
episode: 6406   score: 360.0  epsilon: 1.0    steps: 650  evaluation reward: 451.3
Training network. lr: 0.000158. clip: 0.063213
Iteration 16387: Policy loss: 0.004877. Value loss: 3.194709. Entropy: 0.837335.
Iteration 16388: Policy loss: 0.012570. Value loss: 1.830175. Entropy: 0.865827.
Iteration 16389: Policy loss: -0.003380. Value loss: 1.498702. Entropy: 0.857451.
episode: 6407   score: 330.0  epsilon: 1.0    steps: 623  evaluation reward: 448.7
Training network. lr: 0.000158. clip: 0.063213
Iteration 16390: Policy loss: 0.003220. Value loss: 2.492819. Entropy: 0.934277.
Iteration 16391: Policy loss: -0.003115. Value loss: 1.390240. Entropy: 0.942820.
Iteration 16392: Policy loss: -0.003594. Value loss: 1.094791. Entropy: 0.930845.
episode: 6408   score: 365.0  epsilon: 1.0    steps: 83  evaluation reward: 449.4
Training network. lr: 0.000158. clip: 0.063213
Iteration 16393: Policy loss: 0.004715

Iteration 16452: Policy loss: 0.017535. Value loss: 3.391446. Entropy: 1.179538.
episode: 6428   score: 345.0  epsilon: 1.0    steps: 95  evaluation reward: 431.9
episode: 6429   score: 620.0  epsilon: 1.0    steps: 633  evaluation reward: 429.95
Training network. lr: 0.000157. clip: 0.062988
Iteration 16453: Policy loss: 0.001712. Value loss: 2.882986. Entropy: 0.933799.
Iteration 16454: Policy loss: 0.003830. Value loss: 1.944325. Entropy: 0.913794.
Iteration 16455: Policy loss: -0.001851. Value loss: 1.514528. Entropy: 0.927801.
episode: 6430   score: 210.0  epsilon: 1.0    steps: 372  evaluation reward: 432.65
Training network. lr: 0.000157. clip: 0.062988
Iteration 16456: Policy loss: 0.008816. Value loss: 4.062296. Entropy: 0.991773.
Iteration 16457: Policy loss: 0.013554. Value loss: 3.069148. Entropy: 1.033570.
Iteration 16458: Policy loss: 0.008369. Value loss: 2.113648. Entropy: 0.997813.
episode: 6431   score: 955.0  epsilon: 1.0    steps: 850  evaluation reward: 430.4
Train

episode: 6452   score: 635.0  epsilon: 1.0    steps: 816  evaluation reward: 420.65
Training network. lr: 0.000157. clip: 0.062875
Iteration 16516: Policy loss: 0.002452. Value loss: 2.307971. Entropy: 0.967081.
Iteration 16517: Policy loss: -0.001458. Value loss: 1.599165. Entropy: 0.978979.
Iteration 16518: Policy loss: -0.001852. Value loss: 1.194955. Entropy: 0.992751.
episode: 6453   score: 480.0  epsilon: 1.0    steps: 70  evaluation reward: 421.75
episode: 6454   score: 380.0  epsilon: 1.0    steps: 695  evaluation reward: 420.9
Training network. lr: 0.000157. clip: 0.062875
Iteration 16519: Policy loss: 0.002625. Value loss: 4.169870. Entropy: 1.013399.
Iteration 16520: Policy loss: 0.006885. Value loss: 3.481657. Entropy: 1.038346.
Iteration 16521: Policy loss: 0.003332. Value loss: 3.321507. Entropy: 1.005419.
episode: 6455   score: 210.0  epsilon: 1.0    steps: 198  evaluation reward: 419.3
Training network. lr: 0.000157. clip: 0.062875
Iteration 16522: Policy loss: 0.006445

Training network. lr: 0.000157. clip: 0.062763
Iteration 16579: Policy loss: 0.002097. Value loss: 4.477788. Entropy: 1.186883.
Iteration 16580: Policy loss: 0.002242. Value loss: 2.994904. Entropy: 1.186133.
Iteration 16581: Policy loss: -0.000407. Value loss: 2.988701. Entropy: 1.171383.
episode: 6477   score: 655.0  epsilon: 1.0    steps: 441  evaluation reward: 394.2
episode: 6478   score: 500.0  epsilon: 1.0    steps: 779  evaluation reward: 393.0
Training network. lr: 0.000157. clip: 0.062763
Iteration 16582: Policy loss: 0.003260. Value loss: 1.561792. Entropy: 0.938279.
Iteration 16583: Policy loss: 0.001520. Value loss: 0.971392. Entropy: 0.920981.
Iteration 16584: Policy loss: -0.002404. Value loss: 0.957671. Entropy: 0.966669.
episode: 6479   score: 285.0  epsilon: 1.0    steps: 726  evaluation reward: 396.6
Training network. lr: 0.000157. clip: 0.062763
Iteration 16585: Policy loss: 0.004908. Value loss: 2.205635. Entropy: 1.032203.
Iteration 16586: Policy loss: 0.003134. V

Iteration 16644: Policy loss: -0.002226. Value loss: 1.315677. Entropy: 0.822080.
Training network. lr: 0.000157. clip: 0.062650
Iteration 16645: Policy loss: 0.003614. Value loss: 4.419087. Entropy: 1.286240.
Iteration 16646: Policy loss: 0.000492. Value loss: 3.047335. Entropy: 1.276686.
Iteration 16647: Policy loss: 0.004361. Value loss: 2.941552. Entropy: 1.273840.
episode: 6500   score: 430.0  epsilon: 1.0    steps: 158  evaluation reward: 381.25
Training network. lr: 0.000157. clip: 0.062650
Iteration 16648: Policy loss: 0.006476. Value loss: 2.713774. Entropy: 1.149145.
Iteration 16649: Policy loss: 0.001722. Value loss: 1.679949. Entropy: 1.116556.
Iteration 16650: Policy loss: 0.000003. Value loss: 1.289546. Entropy: 1.152459.
now time :  2019-02-23 05:22:38.986803
episode: 6501   score: 845.0  epsilon: 1.0    steps: 469  evaluation reward: 381.7
episode: 6502   score: 180.0  epsilon: 1.0    steps: 560  evaluation reward: 386.85
Training network. lr: 0.000156. clip: 0.062538
I

Iteration 16710: Policy loss: -0.005577. Value loss: 1.145089. Entropy: 0.643062.
Training network. lr: 0.000156. clip: 0.062425
Iteration 16711: Policy loss: 0.007050. Value loss: 2.232349. Entropy: 1.120778.
Iteration 16712: Policy loss: 0.004169. Value loss: 1.582224. Entropy: 1.099990.
Iteration 16713: Policy loss: 0.000376. Value loss: 1.352886. Entropy: 1.105267.
episode: 6522   score: 450.0  epsilon: 1.0    steps: 1012  evaluation reward: 397.6
Training network. lr: 0.000156. clip: 0.062425
Iteration 16714: Policy loss: 0.002481. Value loss: 2.047112. Entropy: 1.142833.
Iteration 16715: Policy loss: 0.000344. Value loss: 1.102604. Entropy: 1.150668.
Iteration 16716: Policy loss: -0.001622. Value loss: 0.815627. Entropy: 1.159178.
episode: 6523   score: 180.0  epsilon: 1.0    steps: 119  evaluation reward: 400.0
Training network. lr: 0.000156. clip: 0.062425
Iteration 16717: Policy loss: 0.003626. Value loss: 1.684197. Entropy: 1.300954.
Iteration 16718: Policy loss: 0.000426. Va

episode: 6543   score: 350.0  epsilon: 1.0    steps: 491  evaluation reward: 387.0
episode: 6544   score: 785.0  epsilon: 1.0    steps: 714  evaluation reward: 384.6
Training network. lr: 0.000156. clip: 0.062313
Iteration 16777: Policy loss: -0.001062. Value loss: 4.445024. Entropy: 1.075909.
Iteration 16778: Policy loss: 0.001378. Value loss: 3.063106. Entropy: 1.082806.
Iteration 16779: Policy loss: 0.001009. Value loss: 2.659774. Entropy: 1.071938.
episode: 6545   score: 285.0  epsilon: 1.0    steps: 525  evaluation reward: 389.55
Training network. lr: 0.000156. clip: 0.062313
Iteration 16780: Policy loss: 0.001904. Value loss: 2.158420. Entropy: 0.655278.
Iteration 16781: Policy loss: -0.001586. Value loss: 1.641753. Entropy: 0.662941.
Iteration 16782: Policy loss: -0.003139. Value loss: 1.184133. Entropy: 0.647230.
episode: 6546   score: 520.0  epsilon: 1.0    steps: 260  evaluation reward: 388.6
Training network. lr: 0.000156. clip: 0.062313
Iteration 16783: Policy loss: 0.01151

Iteration 16842: Policy loss: -0.000461. Value loss: 1.339188. Entropy: 1.143713.
episode: 6566   score: 290.0  epsilon: 1.0    steps: 287  evaluation reward: 416.1
Training network. lr: 0.000156. clip: 0.062200
Iteration 16843: Policy loss: 0.002889. Value loss: 3.285549. Entropy: 1.056486.
Iteration 16844: Policy loss: 0.003370. Value loss: 1.951889. Entropy: 1.065493.
Iteration 16845: Policy loss: -0.002371. Value loss: 1.401320. Entropy: 1.069086.
episode: 6567   score: 350.0  epsilon: 1.0    steps: 167  evaluation reward: 415.65
episode: 6568   score: 315.0  epsilon: 1.0    steps: 738  evaluation reward: 417.35
Training network. lr: 0.000156. clip: 0.062200
Iteration 16846: Policy loss: 0.007437. Value loss: 3.315376. Entropy: 1.011887.
Iteration 16847: Policy loss: -0.001587. Value loss: 2.086309. Entropy: 0.994302.
Iteration 16848: Policy loss: -0.000991. Value loss: 1.421399. Entropy: 1.013238.
episode: 6569   score: 295.0  epsilon: 1.0    steps: 84  evaluation reward: 417.2
Tr

Iteration 16908: Policy loss: 0.012981. Value loss: 2.074619. Entropy: 1.129525.
episode: 6588   score: 660.0  epsilon: 1.0    steps: 286  evaluation reward: 421.35
Training network. lr: 0.000155. clip: 0.061975
Iteration 16909: Policy loss: 0.010951. Value loss: 4.770668. Entropy: 0.973570.
Iteration 16910: Policy loss: 0.012956. Value loss: 2.428649. Entropy: 0.979532.
Iteration 16911: Policy loss: 0.004301. Value loss: 2.005118. Entropy: 0.989096.
episode: 6589   score: 210.0  epsilon: 1.0    steps: 916  evaluation reward: 422.75
Training network. lr: 0.000155. clip: 0.061975
Iteration 16912: Policy loss: 0.003278. Value loss: 5.604122. Entropy: 1.070819.
Iteration 16913: Policy loss: 0.012346. Value loss: 2.885488. Entropy: 1.044648.
Iteration 16914: Policy loss: 0.005401. Value loss: 1.724360. Entropy: 1.064364.
Training network. lr: 0.000155. clip: 0.061975
Iteration 16915: Policy loss: 0.006045. Value loss: 3.053671. Entropy: 1.013632.
Iteration 16916: Policy loss: 0.006042. Val

Training network. lr: 0.000155. clip: 0.061863
Iteration 16972: Policy loss: 0.003050. Value loss: 2.484580. Entropy: 1.046771.
Iteration 16973: Policy loss: 0.002802. Value loss: 1.528433. Entropy: 1.019013.
Iteration 16974: Policy loss: -0.006444. Value loss: 1.130250. Entropy: 1.025665.
episode: 6612   score: 210.0  epsilon: 1.0    steps: 249  evaluation reward: 415.65
Training network. lr: 0.000155. clip: 0.061863
Iteration 16975: Policy loss: 0.002058. Value loss: 2.309786. Entropy: 1.032744.
Iteration 16976: Policy loss: 0.000676. Value loss: 1.338386. Entropy: 1.045690.
Iteration 16977: Policy loss: -0.004126. Value loss: 0.987724. Entropy: 1.055285.
episode: 6613   score: 745.0  epsilon: 1.0    steps: 836  evaluation reward: 414.3
Training network. lr: 0.000155. clip: 0.061863
Iteration 16978: Policy loss: 0.002387. Value loss: 3.018224. Entropy: 0.878411.
Iteration 16979: Policy loss: -0.001049. Value loss: 1.728615. Entropy: 0.887940.
Iteration 16980: Policy loss: -0.006387. 

episode: 6637   score: 290.0  epsilon: 1.0    steps: 735  evaluation reward: 391.15
Training network. lr: 0.000154. clip: 0.061750
Iteration 17035: Policy loss: 0.005315. Value loss: 3.773471. Entropy: 0.955191.
Iteration 17036: Policy loss: 0.002761. Value loss: 2.366675. Entropy: 0.971178.
Iteration 17037: Policy loss: 0.002256. Value loss: 1.865971. Entropy: 0.960841.
Training network. lr: 0.000154. clip: 0.061750
Iteration 17038: Policy loss: 0.005413. Value loss: 2.977103. Entropy: 0.991090.
Iteration 17039: Policy loss: 0.009380. Value loss: 1.833618. Entropy: 0.968940.
Iteration 17040: Policy loss: 0.004241. Value loss: 1.429125. Entropy: 0.998264.
episode: 6638   score: 335.0  epsilon: 1.0    steps: 384  evaluation reward: 392.25
episode: 6639   score: 140.0  epsilon: 1.0    steps: 405  evaluation reward: 391.05
episode: 6640   score: 425.0  epsilon: 1.0    steps: 820  evaluation reward: 390.35
Training network. lr: 0.000154. clip: 0.061750
Iteration 17041: Policy loss: 0.00176

Training network. lr: 0.000154. clip: 0.061525
Iteration 17101: Policy loss: 0.005632. Value loss: 3.233341. Entropy: 1.164863.
Iteration 17102: Policy loss: -0.002855. Value loss: 1.752671. Entropy: 1.151517.
Iteration 17103: Policy loss: -0.003068. Value loss: 1.368542. Entropy: 1.156219.
episode: 6659   score: 710.0  epsilon: 1.0    steps: 464  evaluation reward: 371.4
episode: 6660   score: 315.0  epsilon: 1.0    steps: 899  evaluation reward: 375.9
Training network. lr: 0.000154. clip: 0.061525
Iteration 17104: Policy loss: 0.003055. Value loss: 3.526918. Entropy: 1.038460.
Iteration 17105: Policy loss: 0.006044. Value loss: 2.246157. Entropy: 1.081880.
Iteration 17106: Policy loss: 0.001153. Value loss: 1.784748. Entropy: 1.039054.
episode: 6661   score: 825.0  epsilon: 1.0    steps: 205  evaluation reward: 373.75
episode: 6662   score: 515.0  epsilon: 1.0    steps: 731  evaluation reward: 375.3
Training network. lr: 0.000154. clip: 0.061525
Iteration 17107: Policy loss: 0.003606

Training network. lr: 0.000154. clip: 0.061413
Iteration 17167: Policy loss: 0.005363. Value loss: 2.227280. Entropy: 1.176730.
Iteration 17168: Policy loss: 0.005460. Value loss: 1.322361. Entropy: 1.182558.
Iteration 17169: Policy loss: -0.003104. Value loss: 1.147578. Entropy: 1.213005.
episode: 6681   score: 240.0  epsilon: 1.0    steps: 444  evaluation reward: 376.25
episode: 6682   score: 580.0  epsilon: 1.0    steps: 726  evaluation reward: 372.85
Training network. lr: 0.000154. clip: 0.061413
Iteration 17170: Policy loss: 0.002521. Value loss: 2.685651. Entropy: 1.034177.
Iteration 17171: Policy loss: 0.008353. Value loss: 2.054288. Entropy: 1.043174.
Iteration 17172: Policy loss: -0.000536. Value loss: 1.560441. Entropy: 1.012310.
Training network. lr: 0.000154. clip: 0.061413
Iteration 17173: Policy loss: 0.005038. Value loss: 3.275363. Entropy: 1.093506.
Iteration 17174: Policy loss: 0.005549. Value loss: 1.981949. Entropy: 1.099215.
Iteration 17175: Policy loss: 0.004294. V

Iteration 17233: Policy loss: 0.002095. Value loss: 2.440267. Entropy: 1.087456.
Iteration 17234: Policy loss: -0.001246. Value loss: 1.681348. Entropy: 1.100568.
Iteration 17235: Policy loss: 0.001624. Value loss: 1.361991. Entropy: 1.089516.
Training network. lr: 0.000153. clip: 0.061300
Iteration 17236: Policy loss: 0.006331. Value loss: 2.812016. Entropy: 1.242338.
Iteration 17237: Policy loss: 0.005607. Value loss: 1.499222. Entropy: 1.232802.
Iteration 17238: Policy loss: 0.002407. Value loss: 1.073567. Entropy: 1.219478.
episode: 6702   score: 560.0  epsilon: 1.0    steps: 427  evaluation reward: 363.35
episode: 6703   score: 195.0  epsilon: 1.0    steps: 664  evaluation reward: 366.55
episode: 6704   score: 245.0  epsilon: 1.0    steps: 916  evaluation reward: 366.7
Training network. lr: 0.000153. clip: 0.061300
Iteration 17239: Policy loss: 0.000836. Value loss: 2.147696. Entropy: 1.049357.
Iteration 17240: Policy loss: -0.003113. Value loss: 1.440576. Entropy: 1.026128.
Itera

Iteration 17299: Policy loss: 0.001922. Value loss: 2.434766. Entropy: 1.224677.
Iteration 17300: Policy loss: -0.001110. Value loss: 1.561686. Entropy: 1.217588.
Iteration 17301: Policy loss: -0.000757. Value loss: 1.173817. Entropy: 1.251124.
episode: 6724   score: 240.0  epsilon: 1.0    steps: 982  evaluation reward: 378.8
Training network. lr: 0.000153. clip: 0.061075
Iteration 17302: Policy loss: 0.001149. Value loss: 3.846630. Entropy: 1.238812.
Iteration 17303: Policy loss: -0.004507. Value loss: 2.017438. Entropy: 1.241863.
Iteration 17304: Policy loss: -0.000280. Value loss: 1.617114. Entropy: 1.239182.
episode: 6725   score: 365.0  epsilon: 1.0    steps: 234  evaluation reward: 375.6
Training network. lr: 0.000153. clip: 0.061075
Iteration 17305: Policy loss: 0.008018. Value loss: 3.664213. Entropy: 1.124788.
Iteration 17306: Policy loss: 0.003359. Value loss: 2.005766. Entropy: 1.164459.
Iteration 17307: Policy loss: 0.005751. Value loss: 1.458426. Entropy: 1.157762.
Trainin

Iteration 17363: Policy loss: -0.000419. Value loss: 2.336006. Entropy: 1.103278.
Iteration 17364: Policy loss: -0.008427. Value loss: 1.749815. Entropy: 1.059314.
Training network. lr: 0.000152. clip: 0.060963
Iteration 17365: Policy loss: 0.000148. Value loss: 3.103303. Entropy: 1.183445.
Iteration 17366: Policy loss: 0.003581. Value loss: 2.063436. Entropy: 1.219170.
Iteration 17367: Policy loss: -0.003533. Value loss: 1.685332. Entropy: 1.193300.
episode: 6749   score: 215.0  epsilon: 1.0    steps: 725  evaluation reward: 398.0
episode: 6750   score: 155.0  epsilon: 1.0    steps: 938  evaluation reward: 393.6
Training network. lr: 0.000152. clip: 0.060963
Iteration 17368: Policy loss: 0.004016. Value loss: 3.420558. Entropy: 1.070104.
Iteration 17369: Policy loss: -0.000138. Value loss: 2.147903. Entropy: 1.099129.
Iteration 17370: Policy loss: 0.000474. Value loss: 1.768513. Entropy: 1.091660.
now time :  2019-02-23 05:37:26.745854
episode: 6751   score: 245.0  epsilon: 1.0    ste

Iteration 17428: Policy loss: 0.000900. Value loss: 2.978474. Entropy: 0.968071.
Iteration 17429: Policy loss: -0.002382. Value loss: 2.245626. Entropy: 0.965337.
Iteration 17430: Policy loss: -0.005799. Value loss: 1.863060. Entropy: 0.983637.
episode: 6772   score: 805.0  epsilon: 1.0    steps: 77  evaluation reward: 381.15
episode: 6773   score: 210.0  epsilon: 1.0    steps: 273  evaluation reward: 384.65
episode: 6774   score: 275.0  epsilon: 1.0    steps: 692  evaluation reward: 381.45
Training network. lr: 0.000152. clip: 0.060850
Iteration 17431: Policy loss: 0.006546. Value loss: 2.801791. Entropy: 0.903211.
Iteration 17432: Policy loss: 0.009050. Value loss: 1.709976. Entropy: 0.925872.
Iteration 17433: Policy loss: 0.001563. Value loss: 1.381303. Entropy: 0.899771.
episode: 6775   score: 285.0  epsilon: 1.0    steps: 242  evaluation reward: 381.6
Training network. lr: 0.000152. clip: 0.060850
Iteration 17434: Policy loss: 0.009035. Value loss: 2.939712. Entropy: 1.055609.
Ite

episode: 6798   score: 590.0  epsilon: 1.0    steps: 789  evaluation reward: 360.05
Training network. lr: 0.000152. clip: 0.060738
Iteration 17491: Policy loss: 0.005466. Value loss: 2.682184. Entropy: 0.715028.
Iteration 17492: Policy loss: -0.001295. Value loss: 1.939263. Entropy: 0.713006.
Iteration 17493: Policy loss: -0.002753. Value loss: 1.688361. Entropy: 0.740449.
Training network. lr: 0.000152. clip: 0.060738
Iteration 17494: Policy loss: 0.001731. Value loss: 2.424245. Entropy: 0.873824.
Iteration 17495: Policy loss: -0.000676. Value loss: 1.738811. Entropy: 0.862520.
Iteration 17496: Policy loss: 0.002174. Value loss: 1.317685. Entropy: 0.865267.
Training network. lr: 0.000152. clip: 0.060738
Iteration 17497: Policy loss: 0.002903. Value loss: 6.008281. Entropy: 1.113459.
Iteration 17498: Policy loss: 0.007540. Value loss: 4.050930. Entropy: 1.094518.
Iteration 17499: Policy loss: 0.003186. Value loss: 3.045068. Entropy: 1.107422.
episode: 6799   score: 180.0  epsilon: 1.0 

Iteration 17557: Policy loss: 0.006985. Value loss: 3.690816. Entropy: 1.022290.
Iteration 17558: Policy loss: 0.008918. Value loss: 2.241565. Entropy: 1.027590.
Iteration 17559: Policy loss: 0.006740. Value loss: 1.620660. Entropy: 1.021649.
episode: 6819   score: 360.0  epsilon: 1.0    steps: 668  evaluation reward: 364.15
episode: 6820   score: 225.0  epsilon: 1.0    steps: 881  evaluation reward: 361.75
Training network. lr: 0.000151. clip: 0.060513
Iteration 17560: Policy loss: 0.001741. Value loss: 3.340672. Entropy: 0.990367.
Iteration 17561: Policy loss: -0.002179. Value loss: 2.240215. Entropy: 0.986289.
Iteration 17562: Policy loss: -0.000655. Value loss: 1.902619. Entropy: 1.000398.
Training network. lr: 0.000151. clip: 0.060513
Iteration 17563: Policy loss: 0.007637. Value loss: 2.916748. Entropy: 1.073202.
Iteration 17564: Policy loss: 0.006156. Value loss: 1.698358. Entropy: 1.107126.
Iteration 17565: Policy loss: 0.001134. Value loss: 1.313691. Entropy: 1.088200.
episode

Training network. lr: 0.000151. clip: 0.060400
Iteration 17623: Policy loss: 0.001158. Value loss: 2.927215. Entropy: 1.054664.
Iteration 17624: Policy loss: 0.003892. Value loss: 1.800135. Entropy: 1.066462.
Iteration 17625: Policy loss: -0.000528. Value loss: 1.362770. Entropy: 1.049547.
Training network. lr: 0.000151. clip: 0.060400
Iteration 17626: Policy loss: 0.002314. Value loss: 3.924898. Entropy: 1.018893.
Iteration 17627: Policy loss: -0.000607. Value loss: 2.574708. Entropy: 1.021646.
Iteration 17628: Policy loss: -0.004670. Value loss: 1.915850. Entropy: 1.001540.
episode: 6842   score: 290.0  epsilon: 1.0    steps: 343  evaluation reward: 363.2
episode: 6843   score: 315.0  epsilon: 1.0    steps: 735  evaluation reward: 364.0
Training network. lr: 0.000151. clip: 0.060400
Iteration 17629: Policy loss: 0.002766. Value loss: 3.963552. Entropy: 1.071676.
Iteration 17630: Policy loss: 0.001708. Value loss: 2.626593. Entropy: 1.080494.
Iteration 17631: Policy loss: -0.003205. V

Iteration 17687: Policy loss: -0.003525. Value loss: 2.237822. Entropy: 1.030836.
Iteration 17688: Policy loss: -0.008491. Value loss: 1.864083. Entropy: 1.041656.
episode: 6865   score: 210.0  epsilon: 1.0    steps: 287  evaluation reward: 364.3
episode: 6866   score: 690.0  epsilon: 1.0    steps: 661  evaluation reward: 360.0
Training network. lr: 0.000151. clip: 0.060288
Iteration 17689: Policy loss: 0.007640. Value loss: 3.433086. Entropy: 0.973949.
Iteration 17690: Policy loss: 0.000652. Value loss: 2.175148. Entropy: 0.917693.
Iteration 17691: Policy loss: 0.002315. Value loss: 1.695728. Entropy: 0.930713.
Training network. lr: 0.000151. clip: 0.060288
Iteration 17692: Policy loss: -0.002349. Value loss: 2.153520. Entropy: 1.069878.
Iteration 17693: Policy loss: -0.002086. Value loss: 1.344068. Entropy: 1.069660.
Iteration 17694: Policy loss: -0.002660. Value loss: 1.030515. Entropy: 1.093374.
episode: 6867   score: 285.0  epsilon: 1.0    steps: 885  evaluation reward: 362.2
epis

Iteration 17753: Policy loss: -0.000169. Value loss: 1.774420. Entropy: 1.092247.
Iteration 17754: Policy loss: -0.003120. Value loss: 1.360065. Entropy: 1.077548.
episode: 6887   score: 285.0  epsilon: 1.0    steps: 82  evaluation reward: 376.35
Training network. lr: 0.000150. clip: 0.060063
Iteration 17755: Policy loss: 0.001621. Value loss: 3.007339. Entropy: 1.143912.
Iteration 17756: Policy loss: 0.003080. Value loss: 1.824927. Entropy: 1.180445.
Iteration 17757: Policy loss: -0.003131. Value loss: 1.394606. Entropy: 1.148031.
Training network. lr: 0.000150. clip: 0.060063
Iteration 17758: Policy loss: 0.002179. Value loss: 5.505985. Entropy: 1.208596.
Iteration 17759: Policy loss: 0.002392. Value loss: 4.875138. Entropy: 1.208203.
Iteration 17760: Policy loss: -0.001659. Value loss: 3.872419. Entropy: 1.193569.
episode: 6888   score: 380.0  epsilon: 1.0    steps: 215  evaluation reward: 374.45
Training network. lr: 0.000150. clip: 0.060063
Iteration 17761: Policy loss: 0.004626. 

Iteration 17816: Policy loss: 0.007558. Value loss: 1.670208. Entropy: 1.157300.
Iteration 17817: Policy loss: 0.000874. Value loss: 1.297224. Entropy: 1.144299.
Training network. lr: 0.000150. clip: 0.059950
Iteration 17818: Policy loss: 0.003810. Value loss: 3.085384. Entropy: 1.118070.
Iteration 17819: Policy loss: 0.000907. Value loss: 1.832726. Entropy: 1.137389.
Iteration 17820: Policy loss: -0.002965. Value loss: 1.438055. Entropy: 1.137303.
Training network. lr: 0.000150. clip: 0.059950
Iteration 17821: Policy loss: 0.005433. Value loss: 2.681771. Entropy: 1.338037.
Iteration 17822: Policy loss: 0.001634. Value loss: 1.630671. Entropy: 1.325676.
Iteration 17823: Policy loss: 0.000700. Value loss: 1.223750. Entropy: 1.333825.
episode: 6912   score: 285.0  epsilon: 1.0    steps: 803  evaluation reward: 369.75
Training network. lr: 0.000150. clip: 0.059950
Iteration 17824: Policy loss: 0.001834. Value loss: 2.691360. Entropy: 1.249327.
Iteration 17825: Policy loss: 0.003655. Value

Iteration 17883: Policy loss: -0.002200. Value loss: 1.394453. Entropy: 1.251930.
Training network. lr: 0.000150. clip: 0.059838
Iteration 17884: Policy loss: 0.004959. Value loss: 2.915824. Entropy: 1.368351.
Iteration 17885: Policy loss: -0.001924. Value loss: 1.786882. Entropy: 1.386423.
Iteration 17886: Policy loss: -0.002469. Value loss: 1.489479. Entropy: 1.389876.
episode: 6933   score: 210.0  epsilon: 1.0    steps: 60  evaluation reward: 371.9
episode: 6934   score: 260.0  epsilon: 1.0    steps: 512  evaluation reward: 369.9
episode: 6935   score: 465.0  epsilon: 1.0    steps: 658  evaluation reward: 369.35
Training network. lr: 0.000150. clip: 0.059838
Iteration 17887: Policy loss: 0.005623. Value loss: 3.698312. Entropy: 1.186528.
Iteration 17888: Policy loss: 0.000685. Value loss: 2.222616. Entropy: 1.161914.
Iteration 17889: Policy loss: 0.002032. Value loss: 1.629636. Entropy: 1.178208.
episode: 6936   score: 330.0  epsilon: 1.0    steps: 200  evaluation reward: 367.15
epi

Iteration 17946: Policy loss: -0.003777. Value loss: 1.449606. Entropy: 1.082763.
Training network. lr: 0.000149. clip: 0.059725
Iteration 17947: Policy loss: 0.002972. Value loss: 3.815447. Entropy: 1.144599.
Iteration 17948: Policy loss: 0.006350. Value loss: 2.490271. Entropy: 1.171185.
Iteration 17949: Policy loss: 0.001041. Value loss: 1.762240. Entropy: 1.140855.
episode: 6958   score: 210.0  epsilon: 1.0    steps: 99  evaluation reward: 358.75
Training network. lr: 0.000149. clip: 0.059725
Iteration 17950: Policy loss: 0.000885. Value loss: 2.222928. Entropy: 1.287120.
Iteration 17951: Policy loss: 0.001421. Value loss: 1.343472. Entropy: 1.298192.
Iteration 17952: Policy loss: -0.005812. Value loss: 1.200251. Entropy: 1.320499.
episode: 6959   score: 555.0  epsilon: 1.0    steps: 732  evaluation reward: 357.65
episode: 6960   score: 520.0  epsilon: 1.0    steps: 895  evaluation reward: 359.85
Training network. lr: 0.000149. clip: 0.059613
Iteration 17953: Policy loss: 0.004585.

episode: 6983   score: 260.0  epsilon: 1.0    steps: 479  evaluation reward: 347.6
episode: 6984   score: 210.0  epsilon: 1.0    steps: 736  evaluation reward: 344.35
Training network. lr: 0.000149. clip: 0.059500
Iteration 18010: Policy loss: 0.001840. Value loss: 3.936499. Entropy: 1.214269.
Iteration 18011: Policy loss: 0.008482. Value loss: 2.409691. Entropy: 1.197904.
Iteration 18012: Policy loss: -0.000548. Value loss: 2.064465. Entropy: 1.195928.
episode: 6985   score: 245.0  epsilon: 1.0    steps: 364  evaluation reward: 342.85
Training network. lr: 0.000149. clip: 0.059500
Iteration 18013: Policy loss: 0.005704. Value loss: 2.652694. Entropy: 1.147095.
Iteration 18014: Policy loss: 0.006302. Value loss: 1.682274. Entropy: 1.151914.
Iteration 18015: Policy loss: 0.000123. Value loss: 1.385595. Entropy: 1.145841.
episode: 6986   score: 375.0  epsilon: 1.0    steps: 174  evaluation reward: 339.3
episode: 6987   score: 260.0  epsilon: 1.0    steps: 521  evaluation reward: 337.65
T

Iteration 18074: Policy loss: -0.002284. Value loss: 1.735739. Entropy: 1.285615.
Iteration 18075: Policy loss: -0.004089. Value loss: 1.196166. Entropy: 1.297145.
episode: 7007   score: 160.0  epsilon: 1.0    steps: 386  evaluation reward: 339.75
episode: 7008   score: 140.0  epsilon: 1.0    steps: 646  evaluation reward: 335.9
episode: 7009   score: 300.0  epsilon: 1.0    steps: 1007  evaluation reward: 335.5
Training network. lr: 0.000148. clip: 0.059388
Iteration 18076: Policy loss: 0.007673. Value loss: 4.001570. Entropy: 1.161247.
Iteration 18077: Policy loss: 0.012004. Value loss: 2.267965. Entropy: 1.170240.
Iteration 18078: Policy loss: 0.007510. Value loss: 1.726235. Entropy: 1.158009.
Training network. lr: 0.000148. clip: 0.059388
Iteration 18079: Policy loss: 0.002550. Value loss: 3.119976. Entropy: 1.197486.
Iteration 18080: Policy loss: 0.001352. Value loss: 1.917133. Entropy: 1.221616.
Iteration 18081: Policy loss: -0.000133. Value loss: 1.482121. Entropy: 1.204704.
epis

Iteration 18137: Policy loss: 0.007751. Value loss: 2.073766. Entropy: 1.396726.
Iteration 18138: Policy loss: -0.000062. Value loss: 1.523908. Entropy: 1.399211.
episode: 7033   score: 315.0  epsilon: 1.0    steps: 664  evaluation reward: 307.8
episode: 7034   score: 365.0  epsilon: 1.0    steps: 824  evaluation reward: 308.85
Training network. lr: 0.000148. clip: 0.059275
Iteration 18139: Policy loss: 0.003580. Value loss: 3.203878. Entropy: 1.238005.
Iteration 18140: Policy loss: -0.000392. Value loss: 2.155927. Entropy: 1.226330.
Iteration 18141: Policy loss: -0.000113. Value loss: 1.566203. Entropy: 1.243110.
episode: 7035   score: 210.0  epsilon: 1.0    steps: 6  evaluation reward: 309.9
Training network. lr: 0.000148. clip: 0.059275
Iteration 18142: Policy loss: -0.000509. Value loss: 2.852366. Entropy: 1.236110.
Iteration 18143: Policy loss: 0.003438. Value loss: 1.723184. Entropy: 1.270471.
Iteration 18144: Policy loss: -0.001466. Value loss: 1.289773. Entropy: 1.262064.
episo

Training network. lr: 0.000148. clip: 0.059163
Iteration 18199: Policy loss: 0.004002. Value loss: 6.132699. Entropy: 1.137260.
Iteration 18200: Policy loss: 0.007254. Value loss: 4.383751. Entropy: 1.170669.
Iteration 18201: Policy loss: 0.016983. Value loss: 3.648519. Entropy: 1.126526.
episode: 7060   score: 315.0  epsilon: 1.0    steps: 81  evaluation reward: 294.95
episode: 7061   score: 225.0  epsilon: 1.0    steps: 947  evaluation reward: 292.9
Training network. lr: 0.000148. clip: 0.059050
Iteration 18202: Policy loss: 0.001214. Value loss: 3.526818. Entropy: 1.066343.
Iteration 18203: Policy loss: 0.002491. Value loss: 2.239601. Entropy: 1.069136.
Iteration 18204: Policy loss: 0.000229. Value loss: 1.963025. Entropy: 1.051682.
Training network. lr: 0.000148. clip: 0.059050
Iteration 18205: Policy loss: 0.002906. Value loss: 6.035412. Entropy: 1.174934.
Iteration 18206: Policy loss: 0.000141. Value loss: 4.059577. Entropy: 1.144243.
Iteration 18207: Policy loss: 0.006254. Value

Iteration 18263: Policy loss: 0.002656. Value loss: 1.643558. Entropy: 1.380909.
Iteration 18264: Policy loss: 0.006804. Value loss: 1.268465. Entropy: 1.411234.
Training network. lr: 0.000147. clip: 0.058938
Iteration 18265: Policy loss: 0.009206. Value loss: 3.633813. Entropy: 1.470270.
Iteration 18266: Policy loss: 0.001556. Value loss: 2.263080. Entropy: 1.473445.
Iteration 18267: Policy loss: -0.000439. Value loss: 1.830439. Entropy: 1.479667.
episode: 7084   score: 305.0  epsilon: 1.0    steps: 120  evaluation reward: 297.2
Training network. lr: 0.000147. clip: 0.058938
Iteration 18268: Policy loss: 0.005654. Value loss: 3.674185. Entropy: 1.292318.
Iteration 18269: Policy loss: 0.004072. Value loss: 1.916996. Entropy: 1.282794.
Iteration 18270: Policy loss: -0.000698. Value loss: 1.614973. Entropy: 1.281801.
episode: 7085   score: 315.0  epsilon: 1.0    steps: 995  evaluation reward: 298.15
Training network. lr: 0.000147. clip: 0.058938
Iteration 18271: Policy loss: 0.001895. Va

Training network. lr: 0.000147. clip: 0.058825
Iteration 18328: Policy loss: 0.003289. Value loss: 5.157448. Entropy: 1.389757.
Iteration 18329: Policy loss: 0.011755. Value loss: 3.373149. Entropy: 1.379841.
Iteration 18330: Policy loss: 0.007319. Value loss: 2.645672. Entropy: 1.381670.
episode: 7107   score: 210.0  epsilon: 1.0    steps: 1009  evaluation reward: 307.2
Training network. lr: 0.000147. clip: 0.058825
Iteration 18331: Policy loss: 0.001863. Value loss: 2.687322. Entropy: 1.427577.
Iteration 18332: Policy loss: 0.001091. Value loss: 1.643616. Entropy: 1.419764.
Iteration 18333: Policy loss: 0.000839. Value loss: 1.228345. Entropy: 1.417101.
episode: 7108   score: 755.0  epsilon: 1.0    steps: 234  evaluation reward: 307.7
episode: 7109   score: 210.0  epsilon: 1.0    steps: 295  evaluation reward: 313.85
episode: 7110   score: 155.0  epsilon: 1.0    steps: 569  evaluation reward: 312.95
Training network. lr: 0.000147. clip: 0.058825
Iteration 18334: Policy loss: 0.002205

Iteration 18393: Policy loss: 0.001808. Value loss: 2.481197. Entropy: 1.442199.
episode: 7130   score: 135.0  epsilon: 1.0    steps: 51  evaluation reward: 326.65
episode: 7131   score: 830.0  epsilon: 1.0    steps: 134  evaluation reward: 324.85
episode: 7132   score: 620.0  epsilon: 1.0    steps: 737  evaluation reward: 331.9
Training network. lr: 0.000147. clip: 0.058713
Iteration 18394: Policy loss: 0.006558. Value loss: 3.387093. Entropy: 1.244362.
Iteration 18395: Policy loss: 0.003855. Value loss: 2.188529. Entropy: 1.249059.
Iteration 18396: Policy loss: 0.002583. Value loss: 1.860798. Entropy: 1.232363.
episode: 7133   score: 270.0  epsilon: 1.0    steps: 498  evaluation reward: 335.2
episode: 7134   score: 435.0  epsilon: 1.0    steps: 919  evaluation reward: 334.75
Training network. lr: 0.000147. clip: 0.058713
Iteration 18397: Policy loss: 0.000716. Value loss: 2.855903. Entropy: 1.193701.
Iteration 18398: Policy loss: 0.005175. Value loss: 1.770183. Entropy: 1.179430.
Ite

Iteration 18454: Policy loss: 0.001538. Value loss: 2.631586. Entropy: 1.135569.
Iteration 18455: Policy loss: 0.000371. Value loss: 1.607045. Entropy: 1.117408.
Iteration 18456: Policy loss: -0.001513. Value loss: 1.367314. Entropy: 1.129195.
episode: 7157   score: 210.0  epsilon: 1.0    steps: 744  evaluation reward: 339.7
Training network. lr: 0.000146. clip: 0.058488
Iteration 18457: Policy loss: 0.002697. Value loss: 2.465953. Entropy: 1.463464.
Iteration 18458: Policy loss: 0.001466. Value loss: 1.582221. Entropy: 1.448691.
Iteration 18459: Policy loss: -0.001141. Value loss: 1.258901. Entropy: 1.427780.
Training network. lr: 0.000146. clip: 0.058488
Iteration 18460: Policy loss: 0.010022. Value loss: 2.843756. Entropy: 1.412071.
Iteration 18461: Policy loss: 0.008848. Value loss: 1.306980. Entropy: 1.390473.
Iteration 18462: Policy loss: 0.009239. Value loss: 0.943643. Entropy: 1.407777.
episode: 7158   score: 180.0  epsilon: 1.0    steps: 964  evaluation reward: 339.45
Training

Iteration 18520: Policy loss: 0.005868. Value loss: 1.886187. Entropy: 1.168071.
Iteration 18521: Policy loss: -0.000943. Value loss: 1.262277. Entropy: 1.185910.
Iteration 18522: Policy loss: -0.000350. Value loss: 1.069517. Entropy: 1.182554.
Training network. lr: 0.000146. clip: 0.058375
Iteration 18523: Policy loss: 0.011079. Value loss: 6.228754. Entropy: 1.385358.
Iteration 18524: Policy loss: 0.012408. Value loss: 2.630963. Entropy: 1.365203.
Iteration 18525: Policy loss: 0.011941. Value loss: 1.649969. Entropy: 1.366075.
Training network. lr: 0.000146. clip: 0.058375
Iteration 18526: Policy loss: 0.004863. Value loss: 4.116778. Entropy: 1.286079.
Iteration 18527: Policy loss: 0.006946. Value loss: 2.034671. Entropy: 1.285533.
Iteration 18528: Policy loss: 0.001336. Value loss: 1.741385. Entropy: 1.286506.
Training network. lr: 0.000146. clip: 0.058375
Iteration 18529: Policy loss: 0.008832. Value loss: 3.858563. Entropy: 1.347049.
Iteration 18530: Policy loss: 0.000815. Value l

Iteration 18589: Policy loss: 0.004209. Value loss: 7.164600. Entropy: 1.400370.
Iteration 18590: Policy loss: 0.007558. Value loss: 4.629952. Entropy: 1.392263.
Iteration 18591: Policy loss: 0.012501. Value loss: 3.224394. Entropy: 1.393137.
episode: 7197   score: 465.0  epsilon: 1.0    steps: 797  evaluation reward: 381.1
Training network. lr: 0.000146. clip: 0.058263
Iteration 18592: Policy loss: 0.005281. Value loss: 3.050287. Entropy: 1.263544.
Iteration 18593: Policy loss: 0.009434. Value loss: 1.872818. Entropy: 1.283782.
Iteration 18594: Policy loss: 0.005558. Value loss: 1.497068. Entropy: 1.272151.
episode: 7198   score: 695.0  epsilon: 1.0    steps: 364  evaluation reward: 378.15
episode: 7199   score: 695.0  epsilon: 1.0    steps: 704  evaluation reward: 381.9
Training network. lr: 0.000146. clip: 0.058263
Iteration 18595: Policy loss: 0.004413. Value loss: 3.111936. Entropy: 1.108984.
Iteration 18596: Policy loss: 0.004319. Value loss: 2.162935. Entropy: 1.112916.
Iteratio

Iteration 18654: Policy loss: -0.000642. Value loss: 1.363996. Entropy: 1.164626.
episode: 7220   score: 600.0  epsilon: 1.0    steps: 644  evaluation reward: 400.8
episode: 7221   score: 260.0  epsilon: 1.0    steps: 998  evaluation reward: 402.6
Training network. lr: 0.000145. clip: 0.058038
Iteration 18655: Policy loss: 0.002050. Value loss: 2.478624. Entropy: 1.207401.
Iteration 18656: Policy loss: 0.000784. Value loss: 1.489433. Entropy: 1.213509.
Iteration 18657: Policy loss: 0.002861. Value loss: 1.229473. Entropy: 1.209379.
Training network. lr: 0.000145. clip: 0.058038
Iteration 18658: Policy loss: 0.004178. Value loss: 3.043180. Entropy: 1.192688.
Iteration 18659: Policy loss: 0.003550. Value loss: 1.889527. Entropy: 1.200003.
Iteration 18660: Policy loss: 0.003311. Value loss: 1.374884. Entropy: 1.202143.
episode: 7222   score: 85.0  epsilon: 1.0    steps: 73  evaluation reward: 399.95
Training network. lr: 0.000145. clip: 0.058038
Iteration 18661: Policy loss: 0.002836. Val

Iteration 18719: Policy loss: 0.004672. Value loss: 1.295197. Entropy: 1.243580.
Iteration 18720: Policy loss: -0.000881. Value loss: 1.070483. Entropy: 1.245146.
Training network. lr: 0.000145. clip: 0.057925
Iteration 18721: Policy loss: 0.001182. Value loss: 2.803940. Entropy: 1.255517.
Iteration 18722: Policy loss: 0.001353. Value loss: 1.603357. Entropy: 1.264896.
Iteration 18723: Policy loss: -0.002831. Value loss: 1.277954. Entropy: 1.257643.
episode: 7243   score: 315.0  epsilon: 1.0    steps: 281  evaluation reward: 407.1
episode: 7244   score: 400.0  epsilon: 1.0    steps: 724  evaluation reward: 407.35
Training network. lr: 0.000145. clip: 0.057925
Iteration 18724: Policy loss: 0.007538. Value loss: 5.986269. Entropy: 1.285079.
Iteration 18725: Policy loss: 0.006562. Value loss: 4.704171. Entropy: 1.249076.
Iteration 18726: Policy loss: 0.021375. Value loss: 2.790889. Entropy: 1.248928.
episode: 7245   score: 430.0  epsilon: 1.0    steps: 237  evaluation reward: 406.1
Traini

Iteration 18784: Policy loss: 0.004466. Value loss: 3.724045. Entropy: 1.256949.
Iteration 18785: Policy loss: 0.005410. Value loss: 2.432667. Entropy: 1.276187.
Iteration 18786: Policy loss: 0.000527. Value loss: 2.018699. Entropy: 1.271798.
episode: 7265   score: 300.0  epsilon: 1.0    steps: 451  evaluation reward: 433.4
episode: 7266   score: 395.0  epsilon: 1.0    steps: 625  evaluation reward: 434.3
Training network. lr: 0.000145. clip: 0.057813
Iteration 18787: Policy loss: 0.002014. Value loss: 3.032077. Entropy: 1.165286.
Iteration 18788: Policy loss: -0.000733. Value loss: 1.906958. Entropy: 1.188192.
Iteration 18789: Policy loss: 0.000706. Value loss: 1.601321. Entropy: 1.169687.
Training network. lr: 0.000145. clip: 0.057813
Iteration 18790: Policy loss: 0.005101. Value loss: 2.397772. Entropy: 1.107583.
Iteration 18791: Policy loss: 0.008226. Value loss: 1.778144. Entropy: 1.104912.
Iteration 18792: Policy loss: 0.001771. Value loss: 1.213971. Entropy: 1.104191.
Training n

Iteration 18850: Policy loss: 0.002791. Value loss: 1.934859. Entropy: 1.013533.
Iteration 18851: Policy loss: -0.000978. Value loss: 1.366963. Entropy: 1.013209.
Iteration 18852: Policy loss: -0.000499. Value loss: 1.037545. Entropy: 1.012743.
Training network. lr: 0.000144. clip: 0.057588
Iteration 18853: Policy loss: 0.002440. Value loss: 5.334144. Entropy: 1.199231.
Iteration 18854: Policy loss: 0.009551. Value loss: 3.338947. Entropy: 1.172953.
Iteration 18855: Policy loss: 0.014763. Value loss: 1.893077. Entropy: 1.188149.
Training network. lr: 0.000144. clip: 0.057588
Iteration 18856: Policy loss: 0.008308. Value loss: 4.103987. Entropy: 1.286146.
Iteration 18857: Policy loss: 0.005872. Value loss: 2.452459. Entropy: 1.288913.
Iteration 18858: Policy loss: 0.005260. Value loss: 1.831227. Entropy: 1.296889.
episode: 7287   score: 345.0  epsilon: 1.0    steps: 672  evaluation reward: 423.65
episode: 7288   score: 420.0  epsilon: 1.0    steps: 806  evaluation reward: 420.25
Trainin

Iteration 18916: Policy loss: 0.002233. Value loss: 2.875703. Entropy: 1.043277.
Iteration 18917: Policy loss: -0.001535. Value loss: 2.028211. Entropy: 1.038417.
Iteration 18918: Policy loss: -0.002560. Value loss: 1.622129. Entropy: 1.055787.
Training network. lr: 0.000144. clip: 0.057475
Iteration 18919: Policy loss: 0.001227. Value loss: 2.742319. Entropy: 1.254900.
Iteration 18920: Policy loss: 0.007401. Value loss: 1.756268. Entropy: 1.263437.
Iteration 18921: Policy loss: 0.005448. Value loss: 1.346018. Entropy: 1.237035.
episode: 7308   score: 365.0  epsilon: 1.0    steps: 230  evaluation reward: 411.15
Training network. lr: 0.000144. clip: 0.057475
Iteration 18922: Policy loss: -0.001947. Value loss: 2.043668. Entropy: 1.290198.
Iteration 18923: Policy loss: 0.000645. Value loss: 1.434689. Entropy: 1.286039.
Iteration 18924: Policy loss: -0.004151. Value loss: 1.025998. Entropy: 1.276706.
episode: 7309   score: 885.0  epsilon: 1.0    steps: 495  evaluation reward: 410.55
Train

Iteration 18980: Policy loss: 0.003460. Value loss: 2.093505. Entropy: 1.109977.
Iteration 18981: Policy loss: 0.001965. Value loss: 1.655053. Entropy: 1.117921.
Training network. lr: 0.000143. clip: 0.057363
Iteration 18982: Policy loss: 0.001666. Value loss: 1.516917. Entropy: 1.058001.
Iteration 18983: Policy loss: -0.001914. Value loss: 1.162436. Entropy: 1.059736.
Iteration 18984: Policy loss: -0.004715. Value loss: 0.912566. Entropy: 1.073464.
Training network. lr: 0.000143. clip: 0.057363
Iteration 18985: Policy loss: 0.001767. Value loss: 2.773630. Entropy: 1.051781.
Iteration 18986: Policy loss: 0.001025. Value loss: 2.007097. Entropy: 1.039787.
Iteration 18987: Policy loss: -0.005340. Value loss: 1.501622. Entropy: 1.022634.
episode: 7332   score: 285.0  epsilon: 1.0    steps: 563  evaluation reward: 403.95
Training network. lr: 0.000143. clip: 0.057363
Iteration 18988: Policy loss: 0.002947. Value loss: 3.519697. Entropy: 1.240061.
Iteration 18989: Policy loss: 0.003815. Val

episode: 7355   score: 695.0  epsilon: 1.0    steps: 839  evaluation reward: 391.65
Training network. lr: 0.000143. clip: 0.057250
Iteration 19045: Policy loss: 0.008539. Value loss: 2.579236. Entropy: 1.011097.
Iteration 19046: Policy loss: -0.002108. Value loss: 1.697536. Entropy: 0.999722.
Iteration 19047: Policy loss: -0.004274. Value loss: 1.363647. Entropy: 1.004007.
episode: 7356   score: 230.0  epsilon: 1.0    steps: 120  evaluation reward: 394.75
Training network. lr: 0.000143. clip: 0.057250
Iteration 19048: Policy loss: 0.004856. Value loss: 3.512765. Entropy: 1.092679.
Iteration 19049: Policy loss: 0.006753. Value loss: 2.176421. Entropy: 1.069456.
Iteration 19050: Policy loss: 0.005274. Value loss: 1.662567. Entropy: 1.068983.
Training network. lr: 0.000143. clip: 0.057138
Iteration 19051: Policy loss: 0.001552. Value loss: 2.338988. Entropy: 1.039441.
Iteration 19052: Policy loss: -0.000800. Value loss: 1.482572. Entropy: 1.045737.
Iteration 19053: Policy loss: -0.004423.

Iteration 19111: Policy loss: 0.002174. Value loss: 4.936991. Entropy: 1.225549.
Iteration 19112: Policy loss: -0.002796. Value loss: 2.733327. Entropy: 1.253577.
Iteration 19113: Policy loss: -0.001650. Value loss: 1.856452. Entropy: 1.227959.
episode: 7376   score: 700.0  epsilon: 1.0    steps: 552  evaluation reward: 383.7
Training network. lr: 0.000143. clip: 0.057025
Iteration 19114: Policy loss: 0.003968. Value loss: 3.774208. Entropy: 1.174139.
Iteration 19115: Policy loss: 0.005052. Value loss: 2.409465. Entropy: 1.182375.
Iteration 19116: Policy loss: -0.002681. Value loss: 2.122928. Entropy: 1.174273.
episode: 7377   score: 510.0  epsilon: 1.0    steps: 102  evaluation reward: 387.85
episode: 7378   score: 500.0  epsilon: 1.0    steps: 219  evaluation reward: 390.8
episode: 7379   score: 605.0  epsilon: 1.0    steps: 672  evaluation reward: 392.35
Training network. lr: 0.000143. clip: 0.057025
Iteration 19117: Policy loss: 0.007344. Value loss: 5.659170. Entropy: 1.051554.
It

Training network. lr: 0.000142. clip: 0.056913
Iteration 19174: Policy loss: 0.003946. Value loss: 2.738211. Entropy: 1.090836.
Iteration 19175: Policy loss: 0.002078. Value loss: 1.538285. Entropy: 1.105230.
Iteration 19176: Policy loss: -0.002268. Value loss: 1.346354. Entropy: 1.083813.
episode: 7402   score: 210.0  epsilon: 1.0    steps: 840  evaluation reward: 374.2
Training network. lr: 0.000142. clip: 0.056913
Iteration 19177: Policy loss: 0.001163. Value loss: 3.494455. Entropy: 1.163431.
Iteration 19178: Policy loss: -0.003593. Value loss: 2.064071. Entropy: 1.146853.
Iteration 19179: Policy loss: -0.004399. Value loss: 1.612978. Entropy: 1.146791.
episode: 7403   score: 270.0  epsilon: 1.0    steps: 13  evaluation reward: 373.0
Training network. lr: 0.000142. clip: 0.056913
Iteration 19180: Policy loss: -0.002959. Value loss: 2.551485. Entropy: 1.194789.
Iteration 19181: Policy loss: -0.001272. Value loss: 1.607648. Entropy: 1.195711.
Iteration 19182: Policy loss: -0.005485. 

episode: 7427   score: 815.0  epsilon: 1.0    steps: 1012  evaluation reward: 364.5
Training network. lr: 0.000142. clip: 0.056800
Iteration 19237: Policy loss: -0.000882. Value loss: 2.862375. Entropy: 0.906153.
Iteration 19238: Policy loss: -0.005213. Value loss: 1.841774. Entropy: 0.914963.
Iteration 19239: Policy loss: -0.006209. Value loss: 1.497337. Entropy: 0.922872.
Training network. lr: 0.000142. clip: 0.056800
Iteration 19240: Policy loss: -0.001116. Value loss: 2.564372. Entropy: 0.738560.
Iteration 19241: Policy loss: -0.004234. Value loss: 1.742196. Entropy: 0.742218.
Iteration 19242: Policy loss: -0.005949. Value loss: 1.369896. Entropy: 0.717040.
episode: 7428   score: 360.0  epsilon: 1.0    steps: 502  evaluation reward: 369.5
Training network. lr: 0.000142. clip: 0.056800
Iteration 19243: Policy loss: 0.003299. Value loss: 3.593449. Entropy: 1.195853.
Iteration 19244: Policy loss: 0.002202. Value loss: 1.923333. Entropy: 1.200003.
Iteration 19245: Policy loss: -0.00062

Training network. lr: 0.000141. clip: 0.056575
Iteration 19303: Policy loss: 0.000603. Value loss: 2.521180. Entropy: 1.251237.
Iteration 19304: Policy loss: 0.001057. Value loss: 1.475464. Entropy: 1.254238.
Iteration 19305: Policy loss: -0.000893. Value loss: 1.178920. Entropy: 1.257479.
episode: 7449   score: 625.0  epsilon: 1.0    steps: 290  evaluation reward: 358.65
episode: 7450   score: 350.0  epsilon: 1.0    steps: 560  evaluation reward: 362.05
now time :  2019-02-23 06:17:18.626453
episode: 7451   score: 350.0  epsilon: 1.0    steps: 1000  evaluation reward: 363.4
Training network. lr: 0.000141. clip: 0.056575
Iteration 19306: Policy loss: 0.003712. Value loss: 3.253071. Entropy: 1.153730.
Iteration 19307: Policy loss: 0.000765. Value loss: 2.029782. Entropy: 1.163698.
Iteration 19308: Policy loss: 0.000576. Value loss: 1.578928. Entropy: 1.158186.
Training network. lr: 0.000141. clip: 0.056575
Iteration 19309: Policy loss: 0.004962. Value loss: 3.835732. Entropy: 1.163826.


Iteration 19366: Policy loss: 0.004244. Value loss: 2.975587. Entropy: 1.154269.
Iteration 19367: Policy loss: 0.009433. Value loss: 1.794129. Entropy: 1.152838.
Iteration 19368: Policy loss: 0.002959. Value loss: 1.366298. Entropy: 1.167214.
episode: 7473   score: 450.0  epsilon: 1.0    steps: 579  evaluation reward: 362.3
episode: 7474   score: 185.0  epsilon: 1.0    steps: 802  evaluation reward: 362.9
episode: 7475   score: 370.0  epsilon: 1.0    steps: 997  evaluation reward: 361.05
Training network. lr: 0.000141. clip: 0.056463
Iteration 19369: Policy loss: 0.005280. Value loss: 2.711929. Entropy: 0.903570.
Iteration 19370: Policy loss: 0.002405. Value loss: 1.415155. Entropy: 0.926807.
Iteration 19371: Policy loss: -0.003570. Value loss: 1.117002. Entropy: 0.911388.
Training network. lr: 0.000141. clip: 0.056463
Iteration 19372: Policy loss: 0.001317. Value loss: 2.603564. Entropy: 1.222768.
Iteration 19373: Policy loss: -0.002007. Value loss: 1.531059. Entropy: 1.225749.
Iterat

Iteration 19429: Policy loss: 0.004327. Value loss: 3.133839. Entropy: 0.959359.
Iteration 19430: Policy loss: -0.000093. Value loss: 2.118949. Entropy: 0.963717.
Iteration 19431: Policy loss: -0.000976. Value loss: 1.849480. Entropy: 0.981827.
episode: 7498   score: 290.0  epsilon: 1.0    steps: 992  evaluation reward: 344.55
Training network. lr: 0.000141. clip: 0.056350
Iteration 19432: Policy loss: 0.002261. Value loss: 3.128446. Entropy: 1.076985.
Iteration 19433: Policy loss: 0.000166. Value loss: 1.963897. Entropy: 1.091874.
Iteration 19434: Policy loss: -0.004094. Value loss: 1.468858. Entropy: 1.090364.
Training network. lr: 0.000141. clip: 0.056350
Iteration 19435: Policy loss: -0.000157. Value loss: 2.751007. Entropy: 1.119897.
Iteration 19436: Policy loss: 0.003956. Value loss: 1.701896. Entropy: 1.108319.
Iteration 19437: Policy loss: -0.001478. Value loss: 1.163902. Entropy: 1.115964.
episode: 7499   score: 425.0  epsilon: 1.0    steps: 444  evaluation reward: 343.05
epis

episode: 7520   score: 320.0  epsilon: 1.0    steps: 270  evaluation reward: 363.75
Training network. lr: 0.000141. clip: 0.056238
Iteration 19495: Policy loss: 0.000676. Value loss: 2.640708. Entropy: 1.069545.
Iteration 19496: Policy loss: 0.004326. Value loss: 1.705426. Entropy: 1.088180.
Iteration 19497: Policy loss: -0.003891. Value loss: 1.247758. Entropy: 1.088467.
Training network. lr: 0.000141. clip: 0.056238
Iteration 19498: Policy loss: 0.004942. Value loss: 3.615841. Entropy: 1.149359.
Iteration 19499: Policy loss: 0.003536. Value loss: 2.153581. Entropy: 1.154567.
Iteration 19500: Policy loss: 0.006624. Value loss: 1.508855. Entropy: 1.155084.
episode: 7521   score: 345.0  epsilon: 1.0    steps: 97  evaluation reward: 364.5
Training network. lr: 0.000140. clip: 0.056125
Iteration 19501: Policy loss: 0.004297. Value loss: 7.841836. Entropy: 1.279160.
Iteration 19502: Policy loss: 0.002676. Value loss: 6.347660. Entropy: 1.271978.
Iteration 19503: Policy loss: 0.007731. Valu

episode: 7545   score: 665.0  epsilon: 1.0    steps: 743  evaluation reward: 350.6
Training network. lr: 0.000140. clip: 0.056013
Iteration 19558: Policy loss: 0.003601. Value loss: 3.015661. Entropy: 0.932676.
Iteration 19559: Policy loss: 0.001678. Value loss: 1.958013. Entropy: 0.915358.
Iteration 19560: Policy loss: 0.003037. Value loss: 1.588986. Entropy: 0.935458.
episode: 7546   score: 50.0  epsilon: 1.0    steps: 265  evaluation reward: 353.35
Training network. lr: 0.000140. clip: 0.056013
Iteration 19561: Policy loss: 0.008242. Value loss: 2.715945. Entropy: 0.941501.
Iteration 19562: Policy loss: 0.002808. Value loss: 1.628015. Entropy: 0.935791.
Iteration 19563: Policy loss: -0.000229. Value loss: 1.374080. Entropy: 0.951305.
Training network. lr: 0.000140. clip: 0.056013
Iteration 19564: Policy loss: 0.003644. Value loss: 2.299881. Entropy: 1.278222.
Iteration 19565: Policy loss: 0.002980. Value loss: 1.364361. Entropy: 1.313632.
Iteration 19566: Policy loss: 0.000433. Valu

Iteration 19623: Policy loss: 0.000425. Value loss: 1.513867. Entropy: 1.017128.
episode: 7568   score: 360.0  epsilon: 1.0    steps: 175  evaluation reward: 353.5
Training network. lr: 0.000140. clip: 0.055900
Iteration 19624: Policy loss: 0.002697. Value loss: 2.488625. Entropy: 1.105625.
Iteration 19625: Policy loss: 0.001334. Value loss: 1.601663. Entropy: 1.159857.
Iteration 19626: Policy loss: -0.004457. Value loss: 1.182105. Entropy: 1.107282.
episode: 7569   score: 470.0  epsilon: 1.0    steps: 906  evaluation reward: 353.95
Training network. lr: 0.000140. clip: 0.055900
Iteration 19627: Policy loss: 0.004636. Value loss: 2.992229. Entropy: 1.137510.
Iteration 19628: Policy loss: 0.004994. Value loss: 1.669287. Entropy: 1.107813.
Iteration 19629: Policy loss: 0.003456. Value loss: 1.149071. Entropy: 1.110820.
episode: 7570   score: 225.0  epsilon: 1.0    steps: 78  evaluation reward: 355.0
Training network. lr: 0.000140. clip: 0.055900
Iteration 19630: Policy loss: 0.006350. Va

episode: 7592   score: 380.0  epsilon: 1.0    steps: 616  evaluation reward: 351.05
Training network. lr: 0.000139. clip: 0.055788
Iteration 19687: Policy loss: 0.001450. Value loss: 2.381932. Entropy: 1.253653.
Iteration 19688: Policy loss: 0.003958. Value loss: 1.406347. Entropy: 1.242228.
Iteration 19689: Policy loss: 0.000695. Value loss: 1.026494. Entropy: 1.242142.
episode: 7593   score: 250.0  epsilon: 1.0    steps: 251  evaluation reward: 351.0
episode: 7594   score: 315.0  epsilon: 1.0    steps: 337  evaluation reward: 351.7
Training network. lr: 0.000139. clip: 0.055788
Iteration 19690: Policy loss: 0.008579. Value loss: 2.948994. Entropy: 1.203413.
Iteration 19691: Policy loss: 0.008876. Value loss: 1.794880. Entropy: 1.216516.
Iteration 19692: Policy loss: -0.000740. Value loss: 1.514795. Entropy: 1.188417.
episode: 7595   score: 245.0  epsilon: 1.0    steps: 12  evaluation reward: 352.7
episode: 7596   score: 395.0  epsilon: 1.0    steps: 830  evaluation reward: 353.05
Tra

episode: 7614   score: 365.0  epsilon: 1.0    steps: 82  evaluation reward: 350.8
episode: 7615   score: 780.0  epsilon: 1.0    steps: 476  evaluation reward: 348.65
episode: 7616   score: 505.0  epsilon: 1.0    steps: 844  evaluation reward: 354.3
Training network. lr: 0.000139. clip: 0.055563
Iteration 19753: Policy loss: -0.000252. Value loss: 3.376598. Entropy: 1.191979.
Iteration 19754: Policy loss: 0.001113. Value loss: 2.262182. Entropy: 1.199026.
Iteration 19755: Policy loss: -0.002830. Value loss: 1.924283. Entropy: 1.184257.
episode: 7617   score: 210.0  epsilon: 1.0    steps: 298  evaluation reward: 355.9
episode: 7618   score: 475.0  epsilon: 1.0    steps: 657  evaluation reward: 355.9
episode: 7619   score: 210.0  epsilon: 1.0    steps: 991  evaluation reward: 355.6
Training network. lr: 0.000139. clip: 0.055563
Iteration 19756: Policy loss: 0.003057. Value loss: 2.993541. Entropy: 0.747050.
Iteration 19757: Policy loss: 0.002782. Value loss: 2.106544. Entropy: 0.740185.
I

Iteration 19817: Policy loss: -0.003122. Value loss: 1.832149. Entropy: 1.264514.
Iteration 19818: Policy loss: -0.006790. Value loss: 1.283795. Entropy: 1.257039.
Training network. lr: 0.000139. clip: 0.055450
Iteration 19819: Policy loss: 0.002255. Value loss: 3.128783. Entropy: 1.197102.
Iteration 19820: Policy loss: 0.000811. Value loss: 1.936899. Entropy: 1.178930.
Iteration 19821: Policy loss: 0.006313. Value loss: 1.279164. Entropy: 1.189071.
episode: 7638   score: 320.0  epsilon: 1.0    steps: 817  evaluation reward: 363.2
Training network. lr: 0.000139. clip: 0.055450
Iteration 19822: Policy loss: 0.003329. Value loss: 6.161335. Entropy: 1.176425.
Iteration 19823: Policy loss: 0.010743. Value loss: 3.830317. Entropy: 1.164139.
Iteration 19824: Policy loss: 0.005896. Value loss: 2.468381. Entropy: 1.152436.
episode: 7639   score: 515.0  epsilon: 1.0    steps: 240  evaluation reward: 363.1
episode: 7640   score: 210.0  epsilon: 1.0    steps: 554  evaluation reward: 366.15
episod

Iteration 19882: Policy loss: 0.003858. Value loss: 3.720584. Entropy: 1.118323.
Iteration 19883: Policy loss: 0.002456. Value loss: 2.547309. Entropy: 1.111531.
Iteration 19884: Policy loss: 0.002501. Value loss: 1.863522. Entropy: 1.122711.
episode: 7660   score: 455.0  epsilon: 1.0    steps: 446  evaluation reward: 381.25
Training network. lr: 0.000138. clip: 0.055338
Iteration 19885: Policy loss: 0.000558. Value loss: 2.568781. Entropy: 1.112283.
Iteration 19886: Policy loss: 0.003005. Value loss: 1.835396. Entropy: 1.109053.
Iteration 19887: Policy loss: -0.001091. Value loss: 1.262097. Entropy: 1.117389.
episode: 7661   score: 365.0  epsilon: 1.0    steps: 533  evaluation reward: 381.35
Training network. lr: 0.000138. clip: 0.055338
Iteration 19888: Policy loss: 0.005626. Value loss: 2.412534. Entropy: 0.993046.
Iteration 19889: Policy loss: 0.004682. Value loss: 1.506799. Entropy: 0.995972.
Iteration 19890: Policy loss: 0.000469. Value loss: 1.227718. Entropy: 0.969832.
episode:

Iteration 19950: Policy loss: -0.002058. Value loss: 1.328763. Entropy: 1.041263.
episode: 7680   score: 360.0  epsilon: 1.0    steps: 46  evaluation reward: 388.05
episode: 7681   score: 470.0  epsilon: 1.0    steps: 405  evaluation reward: 389.35
episode: 7682   score: 350.0  epsilon: 1.0    steps: 594  evaluation reward: 391.95
episode: 7683   score: 335.0  epsilon: 1.0    steps: 870  evaluation reward: 388.7
episode: 7684   score: 210.0  epsilon: 1.0    steps: 965  evaluation reward: 390.25
Training network. lr: 0.000138. clip: 0.055113
Iteration 19951: Policy loss: 0.003246. Value loss: 3.674425. Entropy: 0.913753.
Iteration 19952: Policy loss: 0.010908. Value loss: 2.366445. Entropy: 0.925694.
Iteration 19953: Policy loss: 0.007573. Value loss: 1.852665. Entropy: 0.905577.
episode: 7685   score: 485.0  epsilon: 1.0    steps: 646  evaluation reward: 390.6
Training network. lr: 0.000138. clip: 0.055113
Iteration 19954: Policy loss: 0.004114. Value loss: 2.562467. Entropy: 0.726817.

Iteration 20016: Policy loss: -0.002831. Value loss: 1.454955. Entropy: 0.984543.
episode: 7702   score: 425.0  epsilon: 1.0    steps: 53  evaluation reward: 413.6
episode: 7703   score: 555.0  epsilon: 1.0    steps: 169  evaluation reward: 414.65
episode: 7704   score: 315.0  epsilon: 1.0    steps: 502  evaluation reward: 417.0
Training network. lr: 0.000138. clip: 0.055000
Iteration 20017: Policy loss: 0.005878. Value loss: 4.698685. Entropy: 0.922585.
Iteration 20018: Policy loss: 0.019945. Value loss: 2.538283. Entropy: 0.937987.
Iteration 20019: Policy loss: 0.000414. Value loss: 2.227612. Entropy: 0.959992.
episode: 7705   score: 470.0  epsilon: 1.0    steps: 606  evaluation reward: 416.4
Training network. lr: 0.000138. clip: 0.055000
Iteration 20020: Policy loss: 0.000469. Value loss: 3.202709. Entropy: 0.900037.
Iteration 20021: Policy loss: 0.002029. Value loss: 1.919124. Entropy: 0.915948.
Iteration 20022: Policy loss: -0.001690. Value loss: 1.520379. Entropy: 0.924880.
episo

Training network. lr: 0.000137. clip: 0.054888
Iteration 20080: Policy loss: 0.004024. Value loss: 3.316352. Entropy: 1.306629.
Iteration 20081: Policy loss: 0.003723. Value loss: 1.778302. Entropy: 1.305326.
Iteration 20082: Policy loss: 0.004093. Value loss: 1.363927. Entropy: 1.295010.
episode: 7727   score: 435.0  epsilon: 1.0    steps: 387  evaluation reward: 409.15
episode: 7728   score: 610.0  epsilon: 1.0    steps: 559  evaluation reward: 409.15
Training network. lr: 0.000137. clip: 0.054888
Iteration 20083: Policy loss: 0.002234. Value loss: 2.665489. Entropy: 0.828472.
Iteration 20084: Policy loss: -0.000284. Value loss: 1.838034. Entropy: 0.845483.
Iteration 20085: Policy loss: -0.002385. Value loss: 1.504172. Entropy: 0.838396.
Training network. lr: 0.000137. clip: 0.054888
Iteration 20086: Policy loss: 0.009834. Value loss: 3.794420. Entropy: 1.037060.
Iteration 20087: Policy loss: 0.004036. Value loss: 2.295611. Entropy: 1.050432.
Iteration 20088: Policy loss: 0.004946. V

Iteration 20144: Policy loss: 0.001866. Value loss: 1.559821. Entropy: 1.059501.
Iteration 20145: Policy loss: 0.001617. Value loss: 1.368849. Entropy: 1.048509.
Training network. lr: 0.000137. clip: 0.054775
Iteration 20146: Policy loss: 0.003028. Value loss: 5.227982. Entropy: 1.112250.
Iteration 20147: Policy loss: 0.003268. Value loss: 4.108709. Entropy: 1.131023.
Iteration 20148: Policy loss: 0.003422. Value loss: 3.673189. Entropy: 1.118042.
now time :  2019-02-23 06:34:37.774437
episode: 7751   score: 155.0  epsilon: 1.0    steps: 80  evaluation reward: 395.85
episode: 7752   score: 315.0  epsilon: 1.0    steps: 400  evaluation reward: 393.9
episode: 7753   score: 350.0  epsilon: 1.0    steps: 849  evaluation reward: 395.2
Training network. lr: 0.000137. clip: 0.054775
Iteration 20149: Policy loss: 0.003678. Value loss: 2.530427. Entropy: 0.884068.
Iteration 20150: Policy loss: 0.004264. Value loss: 1.708849. Entropy: 0.916784.
Iteration 20151: Policy loss: -0.000768. Value loss

Iteration 20207: Policy loss: 0.004303. Value loss: 2.040005. Entropy: 1.017493.
Iteration 20208: Policy loss: 0.003569. Value loss: 1.484232. Entropy: 1.042862.
Training network. lr: 0.000136. clip: 0.054550
Iteration 20209: Policy loss: 0.000953. Value loss: 2.872115. Entropy: 0.940630.
Iteration 20210: Policy loss: -0.000646. Value loss: 1.924432. Entropy: 0.966595.
Iteration 20211: Policy loss: -0.004109. Value loss: 1.559593. Entropy: 0.941920.
Training network. lr: 0.000136. clip: 0.054550
Iteration 20212: Policy loss: 0.003229. Value loss: 5.937138. Entropy: 1.174758.
Iteration 20213: Policy loss: 0.008709. Value loss: 4.306128. Entropy: 1.180768.
Iteration 20214: Policy loss: 0.004646. Value loss: 3.283254. Entropy: 1.162765.
episode: 7776   score: 305.0  epsilon: 1.0    steps: 108  evaluation reward: 380.9
episode: 7777   score: 415.0  epsilon: 1.0    steps: 1002  evaluation reward: 379.9
Training network. lr: 0.000136. clip: 0.054550
Iteration 20215: Policy loss: 0.004732. Va

Training network. lr: 0.000136. clip: 0.054438
Iteration 20272: Policy loss: 0.000801. Value loss: 3.469283. Entropy: 0.771475.
Iteration 20273: Policy loss: 0.002809. Value loss: 2.518290. Entropy: 0.785263.
Iteration 20274: Policy loss: -0.003307. Value loss: 1.858716. Entropy: 0.784680.
episode: 7800   score: 375.0  epsilon: 1.0    steps: 73  evaluation reward: 373.45
Training network. lr: 0.000136. clip: 0.054438
Iteration 20275: Policy loss: 0.000551. Value loss: 3.691623. Entropy: 0.660859.
Iteration 20276: Policy loss: 0.002646. Value loss: 2.529294. Entropy: 0.657325.
Iteration 20277: Policy loss: -0.004771. Value loss: 2.097586. Entropy: 0.646138.
Training network. lr: 0.000136. clip: 0.054438
Iteration 20278: Policy loss: 0.002986. Value loss: 2.958985. Entropy: 1.019157.
Iteration 20279: Policy loss: 0.002258. Value loss: 1.913388. Entropy: 1.038368.
Iteration 20280: Policy loss: 0.000613. Value loss: 1.445521. Entropy: 1.021635.
now time :  2019-02-23 06:37:21.771019
episod

Iteration 20337: Policy loss: 0.003775. Value loss: 1.245501. Entropy: 0.955051.
episode: 7823   score: 445.0  epsilon: 1.0    steps: 626  evaluation reward: 365.15
Training network. lr: 0.000136. clip: 0.054325
Iteration 20338: Policy loss: 0.003140. Value loss: 2.867115. Entropy: 0.904320.
Iteration 20339: Policy loss: 0.001464. Value loss: 1.824658. Entropy: 0.882979.
Iteration 20340: Policy loss: 0.003075. Value loss: 1.403527. Entropy: 0.898085.
episode: 7824   score: 180.0  epsilon: 1.0    steps: 290  evaluation reward: 364.95
episode: 7825   score: 495.0  epsilon: 1.0    steps: 704  evaluation reward: 364.35
episode: 7826   score: 360.0  epsilon: 1.0    steps: 975  evaluation reward: 365.15
Training network. lr: 0.000136. clip: 0.054325
Iteration 20341: Policy loss: 0.001792. Value loss: 5.628055. Entropy: 0.930048.
Iteration 20342: Policy loss: 0.004463. Value loss: 4.693977. Entropy: 0.922330.
Iteration 20343: Policy loss: 0.003660. Value loss: 3.911119. Entropy: 0.908349.
Tra

Iteration 20403: Policy loss: -0.002194. Value loss: 1.645040. Entropy: 1.225218.
episode: 7845   score: 230.0  epsilon: 1.0    steps: 431  evaluation reward: 364.15
episode: 7846   score: 375.0  epsilon: 1.0    steps: 627  evaluation reward: 364.3
episode: 7847   score: 490.0  epsilon: 1.0    steps: 924  evaluation reward: 363.8
Training network. lr: 0.000135. clip: 0.054100
Iteration 20404: Policy loss: 0.008507. Value loss: 4.212052. Entropy: 0.963020.
Iteration 20405: Policy loss: 0.002703. Value loss: 2.654162. Entropy: 0.952973.
Iteration 20406: Policy loss: -0.001924. Value loss: 2.114307. Entropy: 0.938606.
episode: 7848   score: 540.0  epsilon: 1.0    steps: 63  evaluation reward: 366.8
Training network. lr: 0.000135. clip: 0.054100
Iteration 20407: Policy loss: -0.002575. Value loss: 2.949515. Entropy: 0.753104.
Iteration 20408: Policy loss: 0.001843. Value loss: 2.179512. Entropy: 0.765028.
Iteration 20409: Policy loss: -0.001320. Value loss: 1.755735. Entropy: 0.763091.
epi

Iteration 20465: Policy loss: 0.002731. Value loss: 2.274228. Entropy: 0.801972.
Iteration 20466: Policy loss: -0.000636. Value loss: 1.956911. Entropy: 0.815004.
episode: 7871   score: 460.0  epsilon: 1.0    steps: 821  evaluation reward: 381.45
Training network. lr: 0.000135. clip: 0.053988
Iteration 20467: Policy loss: 0.006621. Value loss: 4.495644. Entropy: 0.764712.
Iteration 20468: Policy loss: 0.015230. Value loss: 2.364630. Entropy: 0.781688.
Iteration 20469: Policy loss: 0.012033. Value loss: 1.676053. Entropy: 0.777810.
Training network. lr: 0.000135. clip: 0.053988
Iteration 20470: Policy loss: 0.000027. Value loss: 2.706271. Entropy: 0.886019.
Iteration 20471: Policy loss: -0.004036. Value loss: 1.808701. Entropy: 0.854076.
Iteration 20472: Policy loss: -0.003314. Value loss: 1.315303. Entropy: 0.894171.
episode: 7872   score: 485.0  epsilon: 1.0    steps: 242  evaluation reward: 381.8
episode: 7873   score: 335.0  epsilon: 1.0    steps: 529  evaluation reward: 381.95
Trai

Iteration 20530: Policy loss: 0.005312. Value loss: 3.476668. Entropy: 0.911869.
Iteration 20531: Policy loss: 0.000810. Value loss: 2.176086. Entropy: 0.938721.
Iteration 20532: Policy loss: -0.001656. Value loss: 1.719918. Entropy: 0.914694.
episode: 7894   score: 395.0  epsilon: 1.0    steps: 755  evaluation reward: 363.95
Training network. lr: 0.000135. clip: 0.053875
Iteration 20533: Policy loss: 0.008003. Value loss: 3.099516. Entropy: 0.843390.
Iteration 20534: Policy loss: 0.025921. Value loss: 1.537317. Entropy: 0.815429.
Iteration 20535: Policy loss: 0.005583. Value loss: 1.217116. Entropy: 0.812646.
episode: 7895   score: 395.0  epsilon: 1.0    steps: 129  evaluation reward: 367.1
Training network. lr: 0.000135. clip: 0.053875
Iteration 20536: Policy loss: 0.003045. Value loss: 4.096486. Entropy: 0.784793.
Iteration 20537: Policy loss: 0.003837. Value loss: 1.999499. Entropy: 0.796338.
Iteration 20538: Policy loss: -0.001271. Value loss: 1.499260. Entropy: 0.774412.
episode:

Iteration 20598: Policy loss: -0.002862. Value loss: 1.739093. Entropy: 1.131677.
episode: 7914   score: 760.0  epsilon: 1.0    steps: 169  evaluation reward: 384.25
episode: 7915   score: 420.0  epsilon: 1.0    steps: 531  evaluation reward: 386.95
episode: 7916   score: 260.0  epsilon: 1.0    steps: 792  evaluation reward: 388.55
Training network. lr: 0.000134. clip: 0.053763
Iteration 20599: Policy loss: 0.004089. Value loss: 2.379787. Entropy: 0.863406.
Iteration 20600: Policy loss: 0.000622. Value loss: 1.664352. Entropy: 0.878291.
Iteration 20601: Policy loss: 0.000388. Value loss: 1.382996. Entropy: 0.854028.
episode: 7917   score: 155.0  epsilon: 1.0    steps: 96  evaluation reward: 387.95
episode: 7918   score: 425.0  epsilon: 1.0    steps: 930  evaluation reward: 387.4
Training network. lr: 0.000134. clip: 0.053650
Iteration 20602: Policy loss: 0.001811. Value loss: 2.814442. Entropy: 0.837599.
Iteration 20603: Policy loss: 0.003660. Value loss: 1.966900. Entropy: 0.841430.
I

Iteration 20662: Policy loss: 0.002258. Value loss: 2.960603. Entropy: 1.011516.
Iteration 20663: Policy loss: 0.002721. Value loss: 1.799021. Entropy: 1.002617.
Iteration 20664: Policy loss: 0.002923. Value loss: 1.480920. Entropy: 1.004513.
Training network. lr: 0.000134. clip: 0.053538
Iteration 20665: Policy loss: 0.001926. Value loss: 2.329565. Entropy: 1.191961.
Iteration 20666: Policy loss: 0.006459. Value loss: 1.375692. Entropy: 1.205557.
Iteration 20667: Policy loss: 0.002145. Value loss: 0.997956. Entropy: 1.185609.
episode: 7938   score: 180.0  epsilon: 1.0    steps: 133  evaluation reward: 394.75
episode: 7939   score: 225.0  epsilon: 1.0    steps: 1006  evaluation reward: 392.3
Training network. lr: 0.000134. clip: 0.053538
Iteration 20668: Policy loss: 0.004462. Value loss: 3.645494. Entropy: 1.004489.
Iteration 20669: Policy loss: 0.002633. Value loss: 2.224957. Entropy: 1.024220.
Iteration 20670: Policy loss: -0.003363. Value loss: 1.699528. Entropy: 1.009881.
episode:

Training network. lr: 0.000134. clip: 0.053425
Iteration 20728: Policy loss: 0.006743. Value loss: 3.372856. Entropy: 1.036018.
Iteration 20729: Policy loss: 0.004227. Value loss: 2.049540. Entropy: 1.020730.
Iteration 20730: Policy loss: -0.000450. Value loss: 1.578666. Entropy: 1.024706.
Training network. lr: 0.000134. clip: 0.053425
Iteration 20731: Policy loss: 0.002563. Value loss: 3.481398. Entropy: 0.780424.
Iteration 20732: Policy loss: -0.001561. Value loss: 2.237883. Entropy: 0.820941.
Iteration 20733: Policy loss: 0.000294. Value loss: 1.697946. Entropy: 0.795710.
episode: 7960   score: 425.0  epsilon: 1.0    steps: 322  evaluation reward: 397.25
Training network. lr: 0.000134. clip: 0.053425
Iteration 20734: Policy loss: 0.000634. Value loss: 3.479249. Entropy: 0.959681.
Iteration 20735: Policy loss: 0.003541. Value loss: 2.200537. Entropy: 0.949778.
Iteration 20736: Policy loss: -0.000887. Value loss: 1.750703. Entropy: 0.976189.
Training network. lr: 0.000134. clip: 0.053

Iteration 20795: Policy loss: 0.009414. Value loss: 2.360226. Entropy: 0.923495.
Iteration 20796: Policy loss: 0.005476. Value loss: 1.854419. Entropy: 0.962446.
episode: 7981   score: 515.0  epsilon: 1.0    steps: 761  evaluation reward: 404.1
Training network. lr: 0.000133. clip: 0.053313
Iteration 20797: Policy loss: 0.010941. Value loss: 4.561200. Entropy: 1.089600.
Iteration 20798: Policy loss: 0.004385. Value loss: 2.665808. Entropy: 1.086836.
Iteration 20799: Policy loss: 0.001260. Value loss: 1.814864. Entropy: 1.038078.
episode: 7982   score: 725.0  epsilon: 1.0    steps: 790  evaluation reward: 405.9
Training network. lr: 0.000133. clip: 0.053313
Iteration 20800: Policy loss: 0.006042. Value loss: 3.849471. Entropy: 0.760700.
Iteration 20801: Policy loss: 0.005420. Value loss: 2.857663. Entropy: 0.774273.
Iteration 20802: Policy loss: 0.007608. Value loss: 2.067071. Entropy: 0.765938.
Training network. lr: 0.000133. clip: 0.053200
Iteration 20803: Policy loss: 0.002806. Value

now time :  2019-02-23 06:49:18.248588
episode: 8001   score: 225.0  epsilon: 1.0    steps: 374  evaluation reward: 419.7
Training network. lr: 0.000133. clip: 0.053088
Iteration 20863: Policy loss: 0.003867. Value loss: 3.838627. Entropy: 1.061490.
Iteration 20864: Policy loss: -0.001044. Value loss: 2.329324. Entropy: 1.066360.
Iteration 20865: Policy loss: -0.000569. Value loss: 2.013415. Entropy: 1.053867.
Training network. lr: 0.000133. clip: 0.053088
Iteration 20866: Policy loss: 0.000136. Value loss: 3.238316. Entropy: 1.073138.
Iteration 20867: Policy loss: -0.001839. Value loss: 1.573698. Entropy: 1.066988.
Iteration 20868: Policy loss: 0.002068. Value loss: 1.239821. Entropy: 1.071484.
episode: 8002   score: 625.0  epsilon: 1.0    steps: 113  evaluation reward: 417.9
Training network. lr: 0.000133. clip: 0.053088
Iteration 20869: Policy loss: 0.006698. Value loss: 3.364654. Entropy: 1.160884.
Iteration 20870: Policy loss: 0.003628. Value loss: 2.018657. Entropy: 1.128426.
Ite

Iteration 20930: Policy loss: 0.004569. Value loss: 2.488709. Entropy: 0.857540.
Iteration 20931: Policy loss: -0.003092. Value loss: 1.916343. Entropy: 0.831979.
Training network. lr: 0.000132. clip: 0.052975
Iteration 20932: Policy loss: 0.006093. Value loss: 3.958356. Entropy: 1.096384.
Iteration 20933: Policy loss: 0.000502. Value loss: 2.443741. Entropy: 1.097890.
Iteration 20934: Policy loss: 0.002272. Value loss: 1.979966. Entropy: 1.097248.
episode: 8021   score: 640.0  epsilon: 1.0    steps: 180  evaluation reward: 446.6
Training network. lr: 0.000132. clip: 0.052975
Iteration 20935: Policy loss: 0.002206. Value loss: 3.792760. Entropy: 0.961325.
Iteration 20936: Policy loss: 0.003549. Value loss: 2.495908. Entropy: 0.998032.
Iteration 20937: Policy loss: -0.002088. Value loss: 1.932568. Entropy: 0.967708.
episode: 8022   score: 405.0  epsilon: 1.0    steps: 71  evaluation reward: 450.4
episode: 8023   score: 280.0  epsilon: 1.0    steps: 463  evaluation reward: 452.65
Trainin

episode: 8041   score: 760.0  epsilon: 1.0    steps: 692  evaluation reward: 451.0
Training network. lr: 0.000132. clip: 0.052863
Iteration 20998: Policy loss: 0.002298. Value loss: 4.110208. Entropy: 0.936588.
Iteration 20999: Policy loss: 0.003385. Value loss: 2.922893. Entropy: 0.921150.
Iteration 21000: Policy loss: -0.000452. Value loss: 2.096087. Entropy: 0.915559.
episode: 8042   score: 470.0  epsilon: 1.0    steps: 635  evaluation reward: 456.75
Training network. lr: 0.000132. clip: 0.052750
Iteration 21001: Policy loss: 0.006883. Value loss: 3.363646. Entropy: 1.005818.
Iteration 21002: Policy loss: 0.000956. Value loss: 1.946562. Entropy: 0.988890.
Iteration 21003: Policy loss: -0.002657. Value loss: 1.282908. Entropy: 0.996845.
episode: 8043   score: 650.0  epsilon: 1.0    steps: 212  evaluation reward: 457.0
Training network. lr: 0.000132. clip: 0.052750
Iteration 21004: Policy loss: 0.003658. Value loss: 3.100038. Entropy: 0.923667.
Iteration 21005: Policy loss: 0.004475. 

Iteration 21064: Policy loss: -0.002157. Value loss: 2.750977. Entropy: 0.940619.
Iteration 21065: Policy loss: -0.004263. Value loss: 1.580918. Entropy: 0.924815.
Iteration 21066: Policy loss: 0.000202. Value loss: 1.305137. Entropy: 0.934601.
Training network. lr: 0.000132. clip: 0.052638
Iteration 21067: Policy loss: 0.004534. Value loss: 5.285325. Entropy: 1.174859.
Iteration 21068: Policy loss: 0.007087. Value loss: 3.565638. Entropy: 1.194830.
Iteration 21069: Policy loss: -0.000514. Value loss: 3.279943. Entropy: 1.168533.
episode: 8062   score: 425.0  epsilon: 1.0    steps: 231  evaluation reward: 464.6
episode: 8063   score: 985.0  epsilon: 1.0    steps: 361  evaluation reward: 462.2
Training network. lr: 0.000132. clip: 0.052638
Iteration 21070: Policy loss: 0.008241. Value loss: 2.125298. Entropy: 1.260290.
Iteration 21071: Policy loss: 0.004712. Value loss: 1.765175. Entropy: 1.287318.
Iteration 21072: Policy loss: 0.000279. Value loss: 1.323008. Entropy: 1.273587.
episode:

Iteration 21131: Policy loss: 0.006185. Value loss: 1.920039. Entropy: 1.026026.
Iteration 21132: Policy loss: 0.003120. Value loss: 1.471192. Entropy: 0.982868.
episode: 8083   score: 520.0  epsilon: 1.0    steps: 116  evaluation reward: 458.7
Training network. lr: 0.000131. clip: 0.052525
Iteration 21133: Policy loss: 0.004480. Value loss: 3.385014. Entropy: 1.136477.
Iteration 21134: Policy loss: 0.001790. Value loss: 2.024560. Entropy: 1.132761.
Iteration 21135: Policy loss: 0.000942. Value loss: 1.625326. Entropy: 1.135790.
Training network. lr: 0.000131. clip: 0.052525
Iteration 21136: Policy loss: 0.002575. Value loss: 2.422909. Entropy: 1.099277.
Iteration 21137: Policy loss: 0.001672. Value loss: 1.249869. Entropy: 1.082012.
Iteration 21138: Policy loss: -0.000184. Value loss: 0.892407. Entropy: 1.098354.
Training network. lr: 0.000131. clip: 0.052525
Iteration 21139: Policy loss: 0.003830. Value loss: 3.667699. Entropy: 1.222235.
Iteration 21140: Policy loss: 0.005995. Value 

Iteration 21198: Policy loss: 0.000495. Value loss: 1.911390. Entropy: 1.152430.
episode: 8104   score: 700.0  epsilon: 1.0    steps: 703  evaluation reward: 449.55
episode: 8105   score: 210.0  epsilon: 1.0    steps: 1007  evaluation reward: 452.95
Training network. lr: 0.000131. clip: 0.052413
Iteration 21199: Policy loss: 0.003642. Value loss: 4.166242. Entropy: 1.112409.
Iteration 21200: Policy loss: 0.007525. Value loss: 2.561689. Entropy: 1.128495.
Iteration 21201: Policy loss: 0.002811. Value loss: 1.975743. Entropy: 1.127492.
Training network. lr: 0.000131. clip: 0.052300
Iteration 21202: Policy loss: 0.001266. Value loss: 2.519619. Entropy: 1.097866.
Iteration 21203: Policy loss: 0.003250. Value loss: 1.605544. Entropy: 1.099922.
Iteration 21204: Policy loss: 0.002394. Value loss: 1.218741. Entropy: 1.081133.
episode: 8106   score: 285.0  epsilon: 1.0    steps: 807  evaluation reward: 450.65
Training network. lr: 0.000131. clip: 0.052300
Iteration 21205: Policy loss: 0.002639.

episode: 8125   score: 270.0  epsilon: 1.0    steps: 328  evaluation reward: 418.5
episode: 8126   score: 905.0  epsilon: 1.0    steps: 417  evaluation reward: 415.65
Training network. lr: 0.000130. clip: 0.052188
Iteration 21265: Policy loss: 0.003690. Value loss: 2.798168. Entropy: 0.949274.
Iteration 21266: Policy loss: 0.006626. Value loss: 1.588354. Entropy: 0.950543.
Iteration 21267: Policy loss: 0.000040. Value loss: 1.306396. Entropy: 0.932653.
episode: 8127   score: 425.0  epsilon: 1.0    steps: 986  evaluation reward: 421.55
Training network. lr: 0.000130. clip: 0.052188
Iteration 21268: Policy loss: 0.001834. Value loss: 3.295230. Entropy: 1.056683.
Iteration 21269: Policy loss: -0.002437. Value loss: 1.718521. Entropy: 1.052268.
Iteration 21270: Policy loss: -0.004441. Value loss: 1.268005. Entropy: 1.037205.
episode: 8128   score: 495.0  epsilon: 1.0    steps: 225  evaluation reward: 422.0
episode: 8129   score: 265.0  epsilon: 1.0    steps: 539  evaluation reward: 422.3
T

Training network. lr: 0.000130. clip: 0.052075
Iteration 21331: Policy loss: 0.002414. Value loss: 3.730410. Entropy: 0.963905.
Iteration 21332: Policy loss: -0.001329. Value loss: 2.514505. Entropy: 0.955710.
Iteration 21333: Policy loss: -0.003501. Value loss: 2.082674. Entropy: 0.993547.
episode: 8147   score: 215.0  epsilon: 1.0    steps: 549  evaluation reward: 430.45
Training network. lr: 0.000130. clip: 0.052075
Iteration 21334: Policy loss: 0.002900. Value loss: 3.141896. Entropy: 1.078035.
Iteration 21335: Policy loss: 0.000441. Value loss: 2.158778. Entropy: 1.093408.
Iteration 21336: Policy loss: -0.003381. Value loss: 1.761086. Entropy: 1.043420.
episode: 8148   score: 810.0  epsilon: 1.0    steps: 480  evaluation reward: 426.75
Training network. lr: 0.000130. clip: 0.052075
Iteration 21337: Policy loss: 0.000102. Value loss: 3.056895. Entropy: 1.130760.
Iteration 21338: Policy loss: 0.004676. Value loss: 1.987714. Entropy: 1.143019.
Iteration 21339: Policy loss: 0.000450. 

Iteration 21399: Policy loss: -0.000178. Value loss: 1.523150. Entropy: 1.029848.
episode: 8166   score: 420.0  epsilon: 1.0    steps: 721  evaluation reward: 426.7
Training network. lr: 0.000130. clip: 0.051963
Iteration 21400: Policy loss: 0.001992. Value loss: 4.324728. Entropy: 1.125270.
Iteration 21401: Policy loss: 0.005556. Value loss: 2.367232. Entropy: 1.133177.
Iteration 21402: Policy loss: 0.001201. Value loss: 1.896183. Entropy: 1.120528.
episode: 8167   score: 440.0  epsilon: 1.0    steps: 587  evaluation reward: 427.45
Training network. lr: 0.000130. clip: 0.051850
Iteration 21403: Policy loss: 0.001431. Value loss: 4.278871. Entropy: 1.076685.
Iteration 21404: Policy loss: 0.004007. Value loss: 2.925011. Entropy: 1.077429.
Iteration 21405: Policy loss: 0.007463. Value loss: 2.253872. Entropy: 1.069629.
episode: 8168   score: 635.0  epsilon: 1.0    steps: 5  evaluation reward: 427.65
episode: 8169   score: 555.0  epsilon: 1.0    steps: 845  evaluation reward: 427.95
Train

Training network. lr: 0.000129. clip: 0.051738
Iteration 21466: Policy loss: 0.002340. Value loss: 4.127398. Entropy: 1.149889.
Iteration 21467: Policy loss: 0.000905. Value loss: 2.326357. Entropy: 1.156348.
Iteration 21468: Policy loss: -0.000690. Value loss: 1.806413. Entropy: 1.143876.
episode: 8187   score: 380.0  epsilon: 1.0    steps: 488  evaluation reward: 438.45
episode: 8188   score: 650.0  epsilon: 1.0    steps: 677  evaluation reward: 440.45
Training network. lr: 0.000129. clip: 0.051738
Iteration 21469: Policy loss: 0.001116. Value loss: 3.234935. Entropy: 1.064292.
Iteration 21470: Policy loss: 0.003109. Value loss: 2.201535. Entropy: 1.057929.
Iteration 21471: Policy loss: 0.003619. Value loss: 1.560136. Entropy: 1.061437.
Training network. lr: 0.000129. clip: 0.051738
Iteration 21472: Policy loss: 0.006839. Value loss: 4.865756. Entropy: 1.096157.
Iteration 21473: Policy loss: 0.007226. Value loss: 2.728434. Entropy: 1.078548.
Iteration 21474: Policy loss: 0.009988. Va

episode: 8205   score: 360.0  epsilon: 1.0    steps: 871  evaluation reward: 449.25
Training network. lr: 0.000129. clip: 0.051625
Iteration 21535: Policy loss: 0.003125. Value loss: 3.684685. Entropy: 1.081464.
Iteration 21536: Policy loss: -0.000017. Value loss: 2.131191. Entropy: 1.056897.
Iteration 21537: Policy loss: 0.000389. Value loss: 1.912499. Entropy: 1.070882.
episode: 8206   score: 450.0  epsilon: 1.0    steps: 183  evaluation reward: 450.75
Training network. lr: 0.000129. clip: 0.051625
Iteration 21538: Policy loss: 0.005543. Value loss: 3.538170. Entropy: 1.113669.
Iteration 21539: Policy loss: 0.001851. Value loss: 2.139915. Entropy: 1.143001.
Iteration 21540: Policy loss: 0.004878. Value loss: 1.763316. Entropy: 1.112366.
episode: 8207   score: 475.0  epsilon: 1.0    steps: 1012  evaluation reward: 452.4
Training network. lr: 0.000129. clip: 0.051625
Iteration 21541: Policy loss: 0.003380. Value loss: 3.413222. Entropy: 1.227437.
Iteration 21542: Policy loss: 0.005614.

Iteration 21601: Policy loss: 0.002759. Value loss: 5.180990. Entropy: 1.160213.
Iteration 21602: Policy loss: 0.004595. Value loss: 3.422193. Entropy: 1.157346.
Iteration 21603: Policy loss: 0.003065. Value loss: 2.382499. Entropy: 1.163699.
Training network. lr: 0.000129. clip: 0.051400
Iteration 21604: Policy loss: 0.002677. Value loss: 3.069689. Entropy: 1.110608.
Iteration 21605: Policy loss: 0.002527. Value loss: 2.042709. Entropy: 1.125140.
Iteration 21606: Policy loss: -0.005209. Value loss: 1.408928. Entropy: 1.124892.
episode: 8226   score: 600.0  epsilon: 1.0    steps: 252  evaluation reward: 465.0
episode: 8227   score: 855.0  epsilon: 1.0    steps: 386  evaluation reward: 461.95
Training network. lr: 0.000129. clip: 0.051400
Iteration 21607: Policy loss: 0.004881. Value loss: 2.734408. Entropy: 1.008172.
Iteration 21608: Policy loss: 0.007617. Value loss: 1.507226. Entropy: 0.983401.
Iteration 21609: Policy loss: 0.002233. Value loss: 1.070689. Entropy: 1.004997.
episode: 

Iteration 21669: Policy loss: -0.004624. Value loss: 1.297436. Entropy: 0.976424.
episode: 8246   score: 575.0  epsilon: 1.0    steps: 130  evaluation reward: 470.3
Training network. lr: 0.000128. clip: 0.051288
Iteration 21670: Policy loss: 0.003072. Value loss: 2.633551. Entropy: 0.840297.
Iteration 21671: Policy loss: 0.000592. Value loss: 1.523837. Entropy: 0.869713.
Iteration 21672: Policy loss: 0.004474. Value loss: 1.220021. Entropy: 0.842632.
Training network. lr: 0.000128. clip: 0.051288
Iteration 21673: Policy loss: 0.004065. Value loss: 4.516934. Entropy: 1.074157.
Iteration 21674: Policy loss: 0.005705. Value loss: 2.501913. Entropy: 1.103732.
Iteration 21675: Policy loss: -0.000177. Value loss: 1.923311. Entropy: 1.102157.
episode: 8247   score: 395.0  epsilon: 1.0    steps: 374  evaluation reward: 472.55
Training network. lr: 0.000128. clip: 0.051288
Iteration 21676: Policy loss: 0.015853. Value loss: 6.051835. Entropy: 1.182389.
Iteration 21677: Policy loss: 0.009546. Va

episode: 8267   score: 315.0  epsilon: 1.0    steps: 160  evaluation reward: 459.25
episode: 8268   score: 260.0  epsilon: 1.0    steps: 402  evaluation reward: 458.0
Training network. lr: 0.000128. clip: 0.051175
Iteration 21736: Policy loss: 0.004256. Value loss: 1.929742. Entropy: 0.837809.
Iteration 21737: Policy loss: 0.000572. Value loss: 1.255858. Entropy: 0.825105.
Iteration 21738: Policy loss: -0.002258. Value loss: 1.088365. Entropy: 0.833037.
Training network. lr: 0.000128. clip: 0.051175
Iteration 21739: Policy loss: 0.008867. Value loss: 3.567622. Entropy: 1.066245.
Iteration 21740: Policy loss: 0.004282. Value loss: 2.377105. Entropy: 1.025408.
Iteration 21741: Policy loss: -0.000356. Value loss: 1.843708. Entropy: 1.038189.
episode: 8269   score: 625.0  epsilon: 1.0    steps: 852  evaluation reward: 454.25
Training network. lr: 0.000128. clip: 0.051175
Iteration 21742: Policy loss: 0.005628. Value loss: 3.317023. Entropy: 1.035986.
Iteration 21743: Policy loss: 0.005286.

Training network. lr: 0.000127. clip: 0.050950
Iteration 21802: Policy loss: 0.005953. Value loss: 5.080420. Entropy: 0.950561.
Iteration 21803: Policy loss: 0.004989. Value loss: 4.138946. Entropy: 0.954921.
Iteration 21804: Policy loss: 0.008030. Value loss: 2.757968. Entropy: 0.958002.
episode: 8289   score: 595.0  epsilon: 1.0    steps: 931  evaluation reward: 450.8
Training network. lr: 0.000127. clip: 0.050950
Iteration 21805: Policy loss: 0.000778. Value loss: 2.033175. Entropy: 0.921679.
Iteration 21806: Policy loss: -0.002762. Value loss: 1.319154. Entropy: 0.908803.
Iteration 21807: Policy loss: -0.002495. Value loss: 1.123774. Entropy: 0.919917.
Training network. lr: 0.000127. clip: 0.050950
Iteration 21808: Policy loss: 0.002609. Value loss: 1.848807. Entropy: 1.038158.
Iteration 21809: Policy loss: 0.001701. Value loss: 1.177153. Entropy: 1.047514.
Iteration 21810: Policy loss: -0.004574. Value loss: 0.871014. Entropy: 1.037353.
Training network. lr: 0.000127. clip: 0.0509

episode: 8311   score: 410.0  epsilon: 1.0    steps: 713  evaluation reward: 438.85
Training network. lr: 0.000127. clip: 0.050838
Iteration 21868: Policy loss: 0.007039. Value loss: 3.164248. Entropy: 0.909322.
Iteration 21869: Policy loss: 0.005227. Value loss: 1.892681. Entropy: 0.896320.
Iteration 21870: Policy loss: 0.008513. Value loss: 1.674323. Entropy: 0.896801.
episode: 8312   score: 750.0  epsilon: 1.0    steps: 97  evaluation reward: 439.5
episode: 8313   score: 530.0  epsilon: 1.0    steps: 971  evaluation reward: 440.95
Training network. lr: 0.000127. clip: 0.050838
Iteration 21871: Policy loss: 0.004677. Value loss: 3.048174. Entropy: 1.008370.
Iteration 21872: Policy loss: 0.005663. Value loss: 2.076540. Entropy: 1.002199.
Iteration 21873: Policy loss: 0.003747. Value loss: 1.687826. Entropy: 1.015017.
episode: 8314   score: 285.0  epsilon: 1.0    steps: 459  evaluation reward: 442.45
Training network. lr: 0.000127. clip: 0.050838
Iteration 21874: Policy loss: 0.001061.

Iteration 21933: Policy loss: -0.002482. Value loss: 1.674582. Entropy: 1.043487.
episode: 8334   score: 315.0  epsilon: 1.0    steps: 46  evaluation reward: 426.85
Training network. lr: 0.000127. clip: 0.050725
Iteration 21934: Policy loss: 0.003219. Value loss: 3.447889. Entropy: 1.014621.
Iteration 21935: Policy loss: 0.001155. Value loss: 2.180270. Entropy: 0.998798.
Iteration 21936: Policy loss: 0.003777. Value loss: 1.743928. Entropy: 1.035984.
episode: 8335   score: 415.0  epsilon: 1.0    steps: 383  evaluation reward: 426.85
episode: 8336   score: 485.0  epsilon: 1.0    steps: 951  evaluation reward: 427.05
Training network. lr: 0.000127. clip: 0.050725
Iteration 21937: Policy loss: 0.005666. Value loss: 2.990210. Entropy: 0.990817.
Iteration 21938: Policy loss: 0.006953. Value loss: 1.842019. Entropy: 0.957737.
Iteration 21939: Policy loss: 0.002213. Value loss: 1.411254. Entropy: 0.975765.
Training network. lr: 0.000127. clip: 0.050725
Iteration 21940: Policy loss: 0.002679. 

Iteration 21998: Policy loss: 0.003175. Value loss: 3.774724. Entropy: 0.968974.
Iteration 21999: Policy loss: 0.000135. Value loss: 3.010418. Entropy: 0.946663.
episode: 8357   score: 570.0  epsilon: 1.0    steps: 320  evaluation reward: 415.9
Training network. lr: 0.000127. clip: 0.050613
Iteration 22000: Policy loss: 0.005502. Value loss: 3.173442. Entropy: 0.720984.
Iteration 22001: Policy loss: 0.000269. Value loss: 2.230451. Entropy: 0.688394.
Iteration 22002: Policy loss: 0.001361. Value loss: 1.855380. Entropy: 0.706718.
Training network. lr: 0.000126. clip: 0.050500
Iteration 22003: Policy loss: 0.001025. Value loss: 3.561045. Entropy: 0.886081.
Iteration 22004: Policy loss: 0.002275. Value loss: 2.854220. Entropy: 0.870055.
Iteration 22005: Policy loss: 0.001239. Value loss: 2.553640. Entropy: 0.870407.
episode: 8358   score: 285.0  epsilon: 1.0    steps: 528  evaluation reward: 418.15
Training network. lr: 0.000126. clip: 0.050500
Iteration 22006: Policy loss: 0.004341. Valu

Training network. lr: 0.000126. clip: 0.050388
Iteration 22063: Policy loss: 0.005842. Value loss: 3.707758. Entropy: 1.010886.
Iteration 22064: Policy loss: 0.004351. Value loss: 2.448259. Entropy: 1.011201.
Iteration 22065: Policy loss: 0.002126. Value loss: 1.809900. Entropy: 1.025146.
Training network. lr: 0.000126. clip: 0.050388
Iteration 22066: Policy loss: 0.002553. Value loss: 2.857490. Entropy: 1.175354.
Iteration 22067: Policy loss: -0.002156. Value loss: 1.660828. Entropy: 1.168974.
Iteration 22068: Policy loss: -0.005661. Value loss: 1.357339. Entropy: 1.182869.
episode: 8381   score: 630.0  epsilon: 1.0    steps: 299  evaluation reward: 394.2
episode: 8382   score: 405.0  epsilon: 1.0    steps: 947  evaluation reward: 397.4
Training network. lr: 0.000126. clip: 0.050388
Iteration 22069: Policy loss: 0.000833. Value loss: 2.691637. Entropy: 0.989811.
Iteration 22070: Policy loss: -0.002826. Value loss: 1.735398. Entropy: 0.999813.
Iteration 22071: Policy loss: -0.004198. V

episode: 8403   score: 255.0  epsilon: 1.0    steps: 345  evaluation reward: 369.55
Training network. lr: 0.000126. clip: 0.050275
Iteration 22129: Policy loss: 0.006151. Value loss: 4.909884. Entropy: 1.125754.
Iteration 22130: Policy loss: 0.004794. Value loss: 3.050139. Entropy: 1.118748.
Iteration 22131: Policy loss: 0.001322. Value loss: 2.932806. Entropy: 1.099557.
episode: 8404   score: 285.0  epsilon: 1.0    steps: 221  evaluation reward: 367.1
Training network. lr: 0.000126. clip: 0.050275
Iteration 22132: Policy loss: 0.000652. Value loss: 2.965812. Entropy: 1.109695.
Iteration 22133: Policy loss: 0.005978. Value loss: 1.538231. Entropy: 1.097176.
Iteration 22134: Policy loss: -0.001558. Value loss: 1.243976. Entropy: 1.139780.
episode: 8405   score: 290.0  epsilon: 1.0    steps: 492  evaluation reward: 366.85
episode: 8406   score: 660.0  epsilon: 1.0    steps: 524  evaluation reward: 367.95
episode: 8407   score: 230.0  epsilon: 1.0    steps: 866  evaluation reward: 372.2
T

episode: 8425   score: 580.0  epsilon: 1.0    steps: 270  evaluation reward: 372.6
Training network. lr: 0.000125. clip: 0.050163
Iteration 22195: Policy loss: 0.003465. Value loss: 3.292003. Entropy: 1.119425.
Iteration 22196: Policy loss: 0.001456. Value loss: 2.169946. Entropy: 1.145308.
Iteration 22197: Policy loss: 0.001724. Value loss: 1.783565. Entropy: 1.114405.
Training network. lr: 0.000125. clip: 0.050163
Iteration 22198: Policy loss: 0.002349. Value loss: 2.082225. Entropy: 1.138555.
Iteration 22199: Policy loss: -0.000082. Value loss: 1.490093. Entropy: 1.136600.
Iteration 22200: Policy loss: -0.002325. Value loss: 1.223031. Entropy: 1.153141.
episode: 8426   score: 310.0  epsilon: 1.0    steps: 562  evaluation reward: 376.05
episode: 8427   score: 800.0  epsilon: 1.0    steps: 787  evaluation reward: 374.75
Training network. lr: 0.000125. clip: 0.050050
Iteration 22201: Policy loss: 0.002508. Value loss: 2.890224. Entropy: 1.048772.
Iteration 22202: Policy loss: -0.000594

Iteration 22259: Policy loss: 0.006120. Value loss: 1.346517. Entropy: 1.211198.
Iteration 22260: Policy loss: 0.000505. Value loss: 1.139766. Entropy: 1.206724.
episode: 8449   score: 400.0  epsilon: 1.0    steps: 111  evaluation reward: 360.65
episode: 8450   score: 260.0  epsilon: 1.0    steps: 342  evaluation reward: 360.25
Training network. lr: 0.000125. clip: 0.049938
Iteration 22261: Policy loss: 0.001337. Value loss: 4.204939. Entropy: 1.230767.
Iteration 22262: Policy loss: 0.002867. Value loss: 2.634465. Entropy: 1.224766.
Iteration 22263: Policy loss: -0.000642. Value loss: 1.932464. Entropy: 1.235539.
Training network. lr: 0.000125. clip: 0.049938
Iteration 22264: Policy loss: 0.001675. Value loss: 3.002040. Entropy: 1.103934.
Iteration 22265: Policy loss: 0.002071. Value loss: 1.721490. Entropy: 1.109257.
Iteration 22266: Policy loss: -0.000658. Value loss: 1.451129. Entropy: 1.123773.
now time :  2019-02-23 07:18:06.616486
episode: 8451   score: 325.0  epsilon: 1.0    ste

Iteration 22324: Policy loss: -0.000208. Value loss: 3.919378. Entropy: 1.182359.
Iteration 22325: Policy loss: -0.002137. Value loss: 2.164424. Entropy: 1.182496.
Iteration 22326: Policy loss: -0.003172. Value loss: 1.849260. Entropy: 1.179266.
episode: 8471   score: 315.0  epsilon: 1.0    steps: 332  evaluation reward: 364.6
Training network. lr: 0.000125. clip: 0.049825
Iteration 22327: Policy loss: 0.002845. Value loss: 2.523254. Entropy: 1.207600.
Iteration 22328: Policy loss: 0.003441. Value loss: 1.747499. Entropy: 1.233743.
Iteration 22329: Policy loss: -0.001502. Value loss: 1.714963. Entropy: 1.202048.
Training network. lr: 0.000125. clip: 0.049825
Iteration 22330: Policy loss: 0.006079. Value loss: 5.018435. Entropy: 1.289366.
Iteration 22331: Policy loss: 0.013516. Value loss: 3.859804. Entropy: 1.282963.
Iteration 22332: Policy loss: 0.018037. Value loss: 1.636702. Entropy: 1.287501.
episode: 8472   score: 520.0  epsilon: 1.0    steps: 227  evaluation reward: 366.2
episode

Iteration 22392: Policy loss: -0.000823. Value loss: 1.339297. Entropy: 1.027998.
episode: 8491   score: 150.0  epsilon: 1.0    steps: 857  evaluation reward: 385.7
Training network. lr: 0.000124. clip: 0.049713
Iteration 22393: Policy loss: 0.003830. Value loss: 3.350498. Entropy: 1.276591.
Iteration 22394: Policy loss: 0.006798. Value loss: 1.981699. Entropy: 1.278958.
Iteration 22395: Policy loss: 0.004668. Value loss: 1.501114. Entropy: 1.286515.
episode: 8492   score: 360.0  epsilon: 1.0    steps: 1000  evaluation reward: 385.05
Training network. lr: 0.000124. clip: 0.049713
Iteration 22396: Policy loss: 0.003264. Value loss: 4.373123. Entropy: 1.243708.
Iteration 22397: Policy loss: 0.001252. Value loss: 2.682899. Entropy: 1.252642.
Iteration 22398: Policy loss: 0.001701. Value loss: 2.156071. Entropy: 1.237968.
episode: 8493   score: 520.0  epsilon: 1.0    steps: 240  evaluation reward: 384.65
episode: 8494   score: 315.0  epsilon: 1.0    steps: 275  evaluation reward: 385.25
Tr

episode: 8512   score: 420.0  epsilon: 1.0    steps: 111  evaluation reward: 396.25
Training network. lr: 0.000124. clip: 0.049488
Iteration 22459: Policy loss: 0.005898. Value loss: 4.272606. Entropy: 1.314515.
Iteration 22460: Policy loss: 0.006491. Value loss: 2.327257. Entropy: 1.289077.
Iteration 22461: Policy loss: 0.004616. Value loss: 1.581314. Entropy: 1.300288.
Training network. lr: 0.000124. clip: 0.049488
Iteration 22462: Policy loss: 0.003388. Value loss: 3.380656. Entropy: 1.287027.
Iteration 22463: Policy loss: 0.002774. Value loss: 1.882252. Entropy: 1.273168.
Iteration 22464: Policy loss: 0.002633. Value loss: 1.421364. Entropy: 1.288403.
episode: 8513   score: 320.0  epsilon: 1.0    steps: 259  evaluation reward: 395.55
Training network. lr: 0.000124. clip: 0.049488
Iteration 22465: Policy loss: 0.000907. Value loss: 2.680698. Entropy: 1.143378.
Iteration 22466: Policy loss: 0.002547. Value loss: 1.809142. Entropy: 1.163317.
Iteration 22467: Policy loss: -0.001900. Va

Iteration 22526: Policy loss: 0.006515. Value loss: 2.960152. Entropy: 0.943199.
Iteration 22527: Policy loss: 0.000772. Value loss: 2.521980. Entropy: 0.975003.
episode: 8533   score: 335.0  epsilon: 1.0    steps: 95  evaluation reward: 391.75
episode: 8534   score: 820.0  epsilon: 1.0    steps: 382  evaluation reward: 391.25
Training network. lr: 0.000123. clip: 0.049375
Iteration 22528: Policy loss: 0.008573. Value loss: 6.038840. Entropy: 1.238738.
Iteration 22529: Policy loss: 0.008000. Value loss: 3.969915. Entropy: 1.250342.
Iteration 22530: Policy loss: 0.006839. Value loss: 2.947513. Entropy: 1.259114.
Training network. lr: 0.000123. clip: 0.049375
Iteration 22531: Policy loss: 0.003110. Value loss: 3.331101. Entropy: 1.024932.
Iteration 22532: Policy loss: 0.003947. Value loss: 2.168255. Entropy: 1.013220.
Iteration 22533: Policy loss: 0.001805. Value loss: 1.719320. Entropy: 1.004798.
episode: 8535   score: 240.0  epsilon: 1.0    steps: 475  evaluation reward: 397.3
episode:

Training network. lr: 0.000123. clip: 0.049263
Iteration 22591: Policy loss: 0.005411. Value loss: 3.530763. Entropy: 1.027840.
Iteration 22592: Policy loss: 0.004280. Value loss: 2.448866. Entropy: 1.042093.
Iteration 22593: Policy loss: 0.001738. Value loss: 1.898431. Entropy: 1.037407.
Training network. lr: 0.000123. clip: 0.049263
Iteration 22594: Policy loss: 0.004547. Value loss: 3.260741. Entropy: 1.026476.
Iteration 22595: Policy loss: 0.001920. Value loss: 2.121609. Entropy: 1.041577.
Iteration 22596: Policy loss: 0.000002. Value loss: 1.667841. Entropy: 1.031094.
Training network. lr: 0.000123. clip: 0.049263
Iteration 22597: Policy loss: 0.000447. Value loss: 3.040668. Entropy: 1.290433.
Iteration 22598: Policy loss: -0.001854. Value loss: 2.002151. Entropy: 1.267699.
Iteration 22599: Policy loss: -0.002592. Value loss: 1.608588. Entropy: 1.268194.
Training network. lr: 0.000123. clip: 0.049263
Iteration 22600: Policy loss: 0.003776. Value loss: 7.906883. Entropy: 1.387956.


Iteration 22659: Policy loss: 0.001995. Value loss: 1.406736. Entropy: 1.132709.
episode: 8576   score: 330.0  epsilon: 1.0    steps: 805  evaluation reward: 430.45
episode: 8577   score: 395.0  epsilon: 1.0    steps: 913  evaluation reward: 429.15
Training network. lr: 0.000123. clip: 0.049038
Iteration 22660: Policy loss: 0.001056. Value loss: 4.762589. Entropy: 0.902323.
Iteration 22661: Policy loss: 0.009443. Value loss: 2.917150. Entropy: 0.917797.
Iteration 22662: Policy loss: 0.010164. Value loss: 1.885442. Entropy: 0.895023.
episode: 8578   score: 350.0  epsilon: 1.0    steps: 227  evaluation reward: 421.75
Training network. lr: 0.000123. clip: 0.049038
Iteration 22663: Policy loss: 0.005423. Value loss: 3.735515. Entropy: 1.180155.
Iteration 22664: Policy loss: 0.006945. Value loss: 2.027424. Entropy: 1.179167.
Iteration 22665: Policy loss: 0.000292. Value loss: 1.521520. Entropy: 1.184345.
Training network. lr: 0.000123. clip: 0.049038
Iteration 22666: Policy loss: 0.003939. 

Iteration 22723: Policy loss: 0.002915. Value loss: 2.582839. Entropy: 1.075263.
Iteration 22724: Policy loss: -0.000210. Value loss: 1.853775. Entropy: 1.073716.
Iteration 22725: Policy loss: -0.002693. Value loss: 1.512307. Entropy: 1.073629.
episode: 8600   score: 405.0  epsilon: 1.0    steps: 18  evaluation reward: 420.15
Training network. lr: 0.000122. clip: 0.048925
Iteration 22726: Policy loss: 0.005543. Value loss: 4.350276. Entropy: 1.076388.
Iteration 22727: Policy loss: 0.009531. Value loss: 2.755901. Entropy: 1.085611.
Iteration 22728: Policy loss: 0.006762. Value loss: 1.913974. Entropy: 1.107677.
now time :  2019-02-23 07:27:34.375306
episode: 8601   score: 415.0  epsilon: 1.0    steps: 345  evaluation reward: 418.35
episode: 8602   score: 210.0  epsilon: 1.0    steps: 583  evaluation reward: 420.2
Training network. lr: 0.000122. clip: 0.048925
Iteration 22729: Policy loss: 0.004891. Value loss: 3.370844. Entropy: 1.060421.
Iteration 22730: Policy loss: 0.008942. Value lo

Iteration 22790: Policy loss: 0.002188. Value loss: 1.455191. Entropy: 1.247825.
Iteration 22791: Policy loss: 0.004317. Value loss: 1.089859. Entropy: 1.195418.
Training network. lr: 0.000122. clip: 0.048813
Iteration 22792: Policy loss: 0.004971. Value loss: 2.626476. Entropy: 1.070547.
Iteration 22793: Policy loss: 0.002132. Value loss: 1.546813. Entropy: 1.106697.
Iteration 22794: Policy loss: 0.001569. Value loss: 1.323535. Entropy: 1.082006.
episode: 8621   score: 280.0  epsilon: 1.0    steps: 142  evaluation reward: 421.2
episode: 8622   score: 370.0  epsilon: 1.0    steps: 890  evaluation reward: 421.15
Training network. lr: 0.000122. clip: 0.048813
Iteration 22795: Policy loss: 0.004870. Value loss: 6.858940. Entropy: 1.040760.
Iteration 22796: Policy loss: 0.017617. Value loss: 3.822878. Entropy: 1.058727.
Iteration 22797: Policy loss: 0.010696. Value loss: 2.653640. Entropy: 1.064178.
Training network. lr: 0.000122. clip: 0.048813
Iteration 22798: Policy loss: 0.005171. Valu

Training network. lr: 0.000121. clip: 0.048588
Iteration 22858: Policy loss: 0.002672. Value loss: 5.025150. Entropy: 0.999789.
Iteration 22859: Policy loss: 0.003963. Value loss: 4.611066. Entropy: 1.026319.
Iteration 22860: Policy loss: 0.004032. Value loss: 3.655351. Entropy: 1.026153.
Training network. lr: 0.000121. clip: 0.048588
Iteration 22861: Policy loss: 0.004979. Value loss: 2.436698. Entropy: 1.095248.
Iteration 22862: Policy loss: 0.003466. Value loss: 1.591857. Entropy: 1.116014.
Iteration 22863: Policy loss: 0.002706. Value loss: 1.362339. Entropy: 1.105819.
Training network. lr: 0.000121. clip: 0.048588
Iteration 22864: Policy loss: 0.006176. Value loss: 3.487367. Entropy: 1.390742.
Iteration 22865: Policy loss: 0.007414. Value loss: 1.983963. Entropy: 1.381982.
Iteration 22866: Policy loss: 0.007363. Value loss: 1.622729. Entropy: 1.391492.
episode: 8641   score: 335.0  epsilon: 1.0    steps: 695  evaluation reward: 440.45
Training network. lr: 0.000121. clip: 0.048588

Iteration 22924: Policy loss: 0.000854. Value loss: 4.723237. Entropy: 1.152817.
Iteration 22925: Policy loss: 0.001186. Value loss: 2.649450. Entropy: 1.161633.
Iteration 22926: Policy loss: 0.000805. Value loss: 2.090380. Entropy: 1.166208.
episode: 8662   score: 210.0  epsilon: 1.0    steps: 920  evaluation reward: 428.1
Training network. lr: 0.000121. clip: 0.048475
Iteration 22927: Policy loss: 0.012217. Value loss: 5.266551. Entropy: 1.229046.
Iteration 22928: Policy loss: 0.008103. Value loss: 3.333277. Entropy: 1.229298.
Iteration 22929: Policy loss: 0.010838. Value loss: 2.541031. Entropy: 1.224104.
Training network. lr: 0.000121. clip: 0.048475
Iteration 22930: Policy loss: 0.002218. Value loss: 4.040861. Entropy: 1.322166.
Iteration 22931: Policy loss: 0.004147. Value loss: 2.727400. Entropy: 1.332526.
Iteration 22932: Policy loss: -0.000472. Value loss: 1.962475. Entropy: 1.318682.
episode: 8663   score: 405.0  epsilon: 1.0    steps: 742  evaluation reward: 426.25
Training 

Training network. lr: 0.000121. clip: 0.048363
Iteration 22990: Policy loss: 0.004023. Value loss: 2.972799. Entropy: 1.154795.
Iteration 22991: Policy loss: 0.003024. Value loss: 1.977773. Entropy: 1.179131.
Iteration 22992: Policy loss: 0.000417. Value loss: 1.575731. Entropy: 1.163216.
episode: 8685   score: 365.0  epsilon: 1.0    steps: 20  evaluation reward: 418.05
Training network. lr: 0.000121. clip: 0.048363
Iteration 22993: Policy loss: 0.006124. Value loss: 3.450215. Entropy: 1.056281.
Iteration 22994: Policy loss: 0.005027. Value loss: 2.154333. Entropy: 1.049601.
Iteration 22995: Policy loss: 0.001066. Value loss: 1.834354. Entropy: 1.060970.
episode: 8686   score: 440.0  epsilon: 1.0    steps: 345  evaluation reward: 419.25
episode: 8687   score: 550.0  epsilon: 1.0    steps: 918  evaluation reward: 421.85
Training network. lr: 0.000121. clip: 0.048363
Iteration 22996: Policy loss: 0.004196. Value loss: 5.946615. Entropy: 1.149050.
Iteration 22997: Policy loss: 0.003021. V

Training network. lr: 0.000120. clip: 0.048138
Iteration 23056: Policy loss: 0.001357. Value loss: 6.000458. Entropy: 1.132994.
Iteration 23057: Policy loss: 0.000746. Value loss: 4.180043. Entropy: 1.148650.
Iteration 23058: Policy loss: 0.001667. Value loss: 3.021799. Entropy: 1.152512.
Training network. lr: 0.000120. clip: 0.048138
Iteration 23059: Policy loss: 0.005875. Value loss: 5.734313. Entropy: 1.228836.
Iteration 23060: Policy loss: 0.008039. Value loss: 3.685821. Entropy: 1.223148.
Iteration 23061: Policy loss: 0.013170. Value loss: 2.400879. Entropy: 1.230221.
episode: 8707   score: 440.0  epsilon: 1.0    steps: 200  evaluation reward: 443.4
episode: 8708   score: 425.0  epsilon: 1.0    steps: 696  evaluation reward: 440.4
Training network. lr: 0.000120. clip: 0.048138
Iteration 23062: Policy loss: 0.001816. Value loss: 2.714214. Entropy: 0.974482.
Iteration 23063: Policy loss: 0.002025. Value loss: 2.152414. Entropy: 0.971218.
Iteration 23064: Policy loss: -0.004641. Valu

Iteration 23123: Policy loss: 0.009004. Value loss: 4.748100. Entropy: 1.156340.
Iteration 23124: Policy loss: 0.004775. Value loss: 3.603166. Entropy: 1.178529.
episode: 8727   score: 315.0  epsilon: 1.0    steps: 155  evaluation reward: 441.25
episode: 8728   score: 515.0  epsilon: 1.0    steps: 801  evaluation reward: 438.65
Training network. lr: 0.000120. clip: 0.048025
Iteration 23125: Policy loss: 0.004294. Value loss: 3.585990. Entropy: 1.038618.
Iteration 23126: Policy loss: 0.006753. Value loss: 2.403269. Entropy: 1.081275.
Iteration 23127: Policy loss: 0.000754. Value loss: 2.192709. Entropy: 1.038837.
episode: 8729   score: 725.0  epsilon: 1.0    steps: 508  evaluation reward: 436.4
Training network. lr: 0.000120. clip: 0.048025
Iteration 23128: Policy loss: 0.001276. Value loss: 3.538751. Entropy: 1.218384.
Iteration 23129: Policy loss: 0.004714. Value loss: 2.451866. Entropy: 1.209547.
Iteration 23130: Policy loss: 0.004268. Value loss: 1.854607. Entropy: 1.208808.
Trainin

Iteration 23189: Policy loss: 0.003560. Value loss: 2.334821. Entropy: 1.262582.
Iteration 23190: Policy loss: 0.000765. Value loss: 1.927012. Entropy: 1.271813.
episode: 8749   score: 355.0  epsilon: 1.0    steps: 307  evaluation reward: 435.5
Training network. lr: 0.000120. clip: 0.047913
Iteration 23191: Policy loss: 0.002944. Value loss: 6.054779. Entropy: 1.150011.
Iteration 23192: Policy loss: -0.000836. Value loss: 4.831490. Entropy: 1.146569.
Iteration 23193: Policy loss: -0.003298. Value loss: 3.792480. Entropy: 1.155643.
episode: 8750   score: 335.0  epsilon: 1.0    steps: 221  evaluation reward: 431.5
now time :  2019-02-23 07:37:07.821349
episode: 8751   score: 415.0  epsilon: 1.0    steps: 681  evaluation reward: 430.2
Training network. lr: 0.000120. clip: 0.047913
Iteration 23194: Policy loss: -0.002018. Value loss: 4.202270. Entropy: 1.103356.
Iteration 23195: Policy loss: -0.000274. Value loss: 2.626769. Entropy: 1.102879.
Iteration 23196: Policy loss: 0.000439. Value l

Iteration 23252: Policy loss: 0.002485. Value loss: 4.792107. Entropy: 1.106991.
Iteration 23253: Policy loss: 0.000479. Value loss: 4.520433. Entropy: 1.114501.
episode: 8774   score: 420.0  epsilon: 1.0    steps: 193  evaluation reward: 416.05
episode: 8775   score: 395.0  epsilon: 1.0    steps: 403  evaluation reward: 416.75
episode: 8776   score: 180.0  epsilon: 1.0    steps: 1003  evaluation reward: 416.7
Training network. lr: 0.000119. clip: 0.047688
Iteration 23254: Policy loss: 0.001062. Value loss: 2.193188. Entropy: 1.064742.
Iteration 23255: Policy loss: -0.000795. Value loss: 1.478392. Entropy: 1.086022.
Iteration 23256: Policy loss: -0.002035. Value loss: 1.119191. Entropy: 1.081630.
episode: 8777   score: 645.0  epsilon: 1.0    steps: 658  evaluation reward: 412.15
Training network. lr: 0.000119. clip: 0.047688
Iteration 23257: Policy loss: 0.007478. Value loss: 2.067042. Entropy: 1.045677.
Iteration 23258: Policy loss: 0.005486. Value loss: 1.292972. Entropy: 1.054538.
I

Training network. lr: 0.000119. clip: 0.047575
Iteration 23314: Policy loss: 0.005786. Value loss: 4.755011. Entropy: 1.038263.
Iteration 23315: Policy loss: 0.006349. Value loss: 3.294453. Entropy: 1.023410.
Iteration 23316: Policy loss: 0.001574. Value loss: 2.674186. Entropy: 0.983856.
Training network. lr: 0.000119. clip: 0.047575
Iteration 23317: Policy loss: 0.002050. Value loss: 2.229035. Entropy: 0.971415.
Iteration 23318: Policy loss: 0.002350. Value loss: 1.492007. Entropy: 0.988112.
Iteration 23319: Policy loss: -0.000219. Value loss: 1.277626. Entropy: 0.959749.
Training network. lr: 0.000119. clip: 0.047575
Iteration 23320: Policy loss: 0.002231. Value loss: 3.383809. Entropy: 1.175210.
Iteration 23321: Policy loss: 0.003835. Value loss: 1.613652. Entropy: 1.199262.
Iteration 23322: Policy loss: -0.004254. Value loss: 1.172268. Entropy: 1.179314.
now time :  2019-02-23 07:39:47.738605
episode: 8801   score: 320.0  epsilon: 1.0    steps: 857  evaluation reward: 394.4
Traini

Iteration 23381: Policy loss: 0.010426. Value loss: 2.988611. Entropy: 1.192234.
Iteration 23382: Policy loss: 0.004681. Value loss: 2.408374. Entropy: 1.163291.
episode: 8821   score: 290.0  epsilon: 1.0    steps: 736  evaluation reward: 391.65
Training network. lr: 0.000119. clip: 0.047463
Iteration 23383: Policy loss: 0.002302. Value loss: 4.475223. Entropy: 1.232912.
Iteration 23384: Policy loss: 0.005843. Value loss: 3.277102. Entropy: 1.249739.
Iteration 23385: Policy loss: 0.008114. Value loss: 2.492263. Entropy: 1.251765.
episode: 8822   score: 610.0  epsilon: 1.0    steps: 780  evaluation reward: 391.25
Training network. lr: 0.000119. clip: 0.047463
Iteration 23386: Policy loss: 0.003336. Value loss: 3.366498. Entropy: 1.173457.
Iteration 23387: Policy loss: 0.004936. Value loss: 2.392866. Entropy: 1.202040.
Iteration 23388: Policy loss: 0.000215. Value loss: 2.033462. Entropy: 1.190500.
episode: 8823   score: 210.0  epsilon: 1.0    steps: 556  evaluation reward: 391.15
Traini

Iteration 23447: Policy loss: 0.000434. Value loss: 2.246036. Entropy: 1.202400.
Iteration 23448: Policy loss: -0.002454. Value loss: 1.655594. Entropy: 1.204273.
Training network. lr: 0.000118. clip: 0.047350
Iteration 23449: Policy loss: 0.001278. Value loss: 3.599215. Entropy: 1.170450.
Iteration 23450: Policy loss: 0.005542. Value loss: 2.240913. Entropy: 1.173445.
Iteration 23451: Policy loss: -0.000766. Value loss: 1.637154. Entropy: 1.176114.
episode: 8843   score: 740.0  epsilon: 1.0    steps: 323  evaluation reward: 380.65
episode: 8844   score: 360.0  epsilon: 1.0    steps: 911  evaluation reward: 381.55
Training network. lr: 0.000118. clip: 0.047238
Iteration 23452: Policy loss: 0.001917. Value loss: 2.582610. Entropy: 0.864917.
Iteration 23453: Policy loss: 0.000385. Value loss: 1.847263. Entropy: 0.869351.
Iteration 23454: Policy loss: -0.001318. Value loss: 1.527351. Entropy: 0.878353.
episode: 8845   score: 710.0  epsilon: 1.0    steps: 521  evaluation reward: 380.15
Tra

Iteration 23513: Policy loss: 0.001816. Value loss: 2.156339. Entropy: 1.169611.
Iteration 23514: Policy loss: -0.001476. Value loss: 1.690660. Entropy: 1.142486.
episode: 8864   score: 655.0  epsilon: 1.0    steps: 290  evaluation reward: 391.6
Training network. lr: 0.000118. clip: 0.047125
Iteration 23515: Policy loss: 0.001664. Value loss: 2.448773. Entropy: 0.842866.
Iteration 23516: Policy loss: 0.000738. Value loss: 1.784868. Entropy: 0.837925.
Iteration 23517: Policy loss: -0.001504. Value loss: 1.381039. Entropy: 0.828429.
Training network. lr: 0.000118. clip: 0.047125
Iteration 23518: Policy loss: 0.001965. Value loss: 3.274219. Entropy: 0.934205.
Iteration 23519: Policy loss: 0.000350. Value loss: 1.780265. Entropy: 0.938239.
Iteration 23520: Policy loss: -0.002266. Value loss: 1.400805. Entropy: 0.930018.
episode: 8865   score: 715.0  epsilon: 1.0    steps: 524  evaluation reward: 396.3
Training network. lr: 0.000118. clip: 0.047125
Iteration 23521: Policy loss: 0.001474. Va

Training network. lr: 0.000118. clip: 0.047013
Iteration 23581: Policy loss: 0.001871. Value loss: 3.045398. Entropy: 1.003289.
Iteration 23582: Policy loss: -0.001215. Value loss: 1.857001. Entropy: 1.008100.
Iteration 23583: Policy loss: -0.004741. Value loss: 1.604798. Entropy: 1.009575.
episode: 8884   score: 480.0  epsilon: 1.0    steps: 85  evaluation reward: 419.4
Training network. lr: 0.000118. clip: 0.047013
Iteration 23584: Policy loss: 0.004310. Value loss: 3.496281. Entropy: 1.093040.
Iteration 23585: Policy loss: 0.003902. Value loss: 1.870038. Entropy: 1.086398.
Iteration 23586: Policy loss: 0.003417. Value loss: 1.514743. Entropy: 1.119288.
episode: 8885   score: 470.0  epsilon: 1.0    steps: 756  evaluation reward: 421.0
Training network. lr: 0.000118. clip: 0.047013
Iteration 23587: Policy loss: 0.002711. Value loss: 3.396800. Entropy: 1.303180.
Iteration 23588: Policy loss: 0.002273. Value loss: 2.000512. Entropy: 1.329635.
Iteration 23589: Policy loss: 0.001232. Valu

Iteration 23648: Policy loss: 0.001845. Value loss: 1.993268. Entropy: 1.236547.
Iteration 23649: Policy loss: -0.002083. Value loss: 1.655290. Entropy: 1.229518.
episode: 8904   score: 295.0  epsilon: 1.0    steps: 693  evaluation reward: 423.0
episode: 8905   score: 470.0  epsilon: 1.0    steps: 998  evaluation reward: 420.3
Training network. lr: 0.000117. clip: 0.046900
Iteration 23650: Policy loss: 0.003618. Value loss: 6.793022. Entropy: 1.372275.
Iteration 23651: Policy loss: 0.007825. Value loss: 5.092857. Entropy: 1.343163.
Iteration 23652: Policy loss: 0.008878. Value loss: 3.743892. Entropy: 1.336752.
episode: 8906   score: 310.0  epsilon: 1.0    steps: 177  evaluation reward: 421.25
Training network. lr: 0.000117. clip: 0.046788
Iteration 23653: Policy loss: 0.004504. Value loss: 1.896856. Entropy: 1.015498.
Iteration 23654: Policy loss: 0.001484. Value loss: 1.259441. Entropy: 1.024413.
Iteration 23655: Policy loss: 0.000977. Value loss: 1.011747. Entropy: 1.015010.
Trainin

Iteration 23717: Policy loss: 0.002356. Value loss: 1.997898. Entropy: 1.319388.
Iteration 23718: Policy loss: -0.001576. Value loss: 1.399757. Entropy: 1.310139.
episode: 8923   score: 270.0  epsilon: 1.0    steps: 30  evaluation reward: 425.7
Training network. lr: 0.000117. clip: 0.046675
Iteration 23719: Policy loss: -0.002068. Value loss: 3.754442. Entropy: 1.254267.
Iteration 23720: Policy loss: -0.002339. Value loss: 2.124462. Entropy: 1.255198.
Iteration 23721: Policy loss: -0.005768. Value loss: 1.797524. Entropy: 1.248240.
episode: 8924   score: 495.0  epsilon: 1.0    steps: 452  evaluation reward: 426.3
Training network. lr: 0.000117. clip: 0.046675
Iteration 23722: Policy loss: 0.003714. Value loss: 5.687889. Entropy: 1.264592.
Iteration 23723: Policy loss: 0.001766. Value loss: 4.087478. Entropy: 1.240467.
Iteration 23724: Policy loss: 0.002532. Value loss: 3.648647. Entropy: 1.255674.
episode: 8925   score: 420.0  epsilon: 1.0    steps: 295  evaluation reward: 428.25
Train

episode: 8944   score: 495.0  epsilon: 1.0    steps: 245  evaluation reward: 441.05
episode: 8945   score: 80.0  epsilon: 1.0    steps: 499  evaluation reward: 442.4
episode: 8946   score: 670.0  epsilon: 1.0    steps: 835  evaluation reward: 436.1
Training network. lr: 0.000116. clip: 0.046563
Iteration 23785: Policy loss: 0.002316. Value loss: 3.573619. Entropy: 1.217621.
Iteration 23786: Policy loss: 0.000858. Value loss: 2.146039. Entropy: 1.197911.
Iteration 23787: Policy loss: -0.003605. Value loss: 1.671078. Entropy: 1.200734.
Training network. lr: 0.000116. clip: 0.046563
Iteration 23788: Policy loss: 0.001904. Value loss: 3.238707. Entropy: 1.067542.
Iteration 23789: Policy loss: -0.002976. Value loss: 2.056989. Entropy: 1.046661.
Iteration 23790: Policy loss: 0.000208. Value loss: 1.714429. Entropy: 1.051177.
Training network. lr: 0.000116. clip: 0.046563
Iteration 23791: Policy loss: 0.001362. Value loss: 4.033117. Entropy: 1.092893.
Iteration 23792: Policy loss: 0.000266. V

Iteration 23847: Policy loss: -0.001212. Value loss: 1.836353. Entropy: 0.908551.
Training network. lr: 0.000116. clip: 0.046450
Iteration 23848: Policy loss: 0.002557. Value loss: 6.082843. Entropy: 1.118494.
Iteration 23849: Policy loss: 0.008402. Value loss: 3.779521. Entropy: 1.107490.
Iteration 23850: Policy loss: 0.012394. Value loss: 2.610643. Entropy: 1.116695.
episode: 8970   score: 510.0  epsilon: 1.0    steps: 430  evaluation reward: 419.45
Training network. lr: 0.000116. clip: 0.046338
Iteration 23851: Policy loss: 0.004488. Value loss: 4.910743. Entropy: 1.130859.
Iteration 23852: Policy loss: 0.001409. Value loss: 3.285384. Entropy: 1.126330.
Iteration 23853: Policy loss: 0.002058. Value loss: 2.757050. Entropy: 1.126444.
episode: 8971   score: 450.0  epsilon: 1.0    steps: 1017  evaluation reward: 418.75
Training network. lr: 0.000116. clip: 0.046338
Iteration 23854: Policy loss: 0.005369. Value loss: 3.952050. Entropy: 1.127900.
Iteration 23855: Policy loss: 0.003069. V

Iteration 23909: Policy loss: 0.003962. Value loss: 1.592648. Entropy: 0.945082.
Iteration 23910: Policy loss: 0.006797. Value loss: 1.318938. Entropy: 0.948604.
Training network. lr: 0.000116. clip: 0.046225
Iteration 23911: Policy loss: 0.001595. Value loss: 3.353733. Entropy: 0.998434.
Iteration 23912: Policy loss: 0.001829. Value loss: 2.155993. Entropy: 0.997818.
Iteration 23913: Policy loss: -0.002175. Value loss: 1.718851. Entropy: 1.012064.
episode: 8996   score: 345.0  epsilon: 1.0    steps: 94  evaluation reward: 382.2
Training network. lr: 0.000116. clip: 0.046225
Iteration 23914: Policy loss: 0.005652. Value loss: 3.369184. Entropy: 1.278965.
Iteration 23915: Policy loss: 0.007245. Value loss: 2.001838. Entropy: 1.252224.
Iteration 23916: Policy loss: 0.002360. Value loss: 1.492523. Entropy: 1.283763.
episode: 8997   score: 180.0  epsilon: 1.0    steps: 943  evaluation reward: 382.8
Training network. lr: 0.000116. clip: 0.046225
Iteration 23917: Policy loss: 0.003022. Value

Training network. lr: 0.000115. clip: 0.046113
Iteration 23974: Policy loss: 0.002054. Value loss: 2.569409. Entropy: 0.978654.
Iteration 23975: Policy loss: 0.001652. Value loss: 1.794583. Entropy: 0.975753.
Iteration 23976: Policy loss: -0.000344. Value loss: 1.442371. Entropy: 1.002737.
episode: 9019   score: 285.0  epsilon: 1.0    steps: 4  evaluation reward: 372.6
episode: 9020   score: 405.0  epsilon: 1.0    steps: 1007  evaluation reward: 370.5
Training network. lr: 0.000115. clip: 0.046113
Iteration 23977: Policy loss: 0.001009. Value loss: 2.852601. Entropy: 1.080433.
Iteration 23978: Policy loss: 0.001298. Value loss: 1.874445. Entropy: 1.086990.
Iteration 23979: Policy loss: -0.000531. Value loss: 1.616955. Entropy: 1.079605.
Training network. lr: 0.000115. clip: 0.046113
Iteration 23980: Policy loss: 0.002662. Value loss: 2.685353. Entropy: 1.011201.
Iteration 23981: Policy loss: 0.001492. Value loss: 1.645039. Entropy: 1.028416.
Iteration 23982: Policy loss: 0.001699. Valu

Iteration 24039: Policy loss: -0.005602. Value loss: 1.166211. Entropy: 0.962850.
Training network. lr: 0.000115. clip: 0.046000
Iteration 24040: Policy loss: 0.001514. Value loss: 2.407685. Entropy: 1.027552.
Iteration 24041: Policy loss: -0.001001. Value loss: 1.663639. Entropy: 1.034321.
Iteration 24042: Policy loss: -0.003544. Value loss: 1.301279. Entropy: 1.033744.
episode: 9042   score: 315.0  epsilon: 1.0    steps: 427  evaluation reward: 348.1
episode: 9043   score: 405.0  epsilon: 1.0    steps: 646  evaluation reward: 348.35
Training network. lr: 0.000115. clip: 0.046000
Iteration 24043: Policy loss: 0.003103. Value loss: 2.956500. Entropy: 1.057570.
Iteration 24044: Policy loss: 0.001015. Value loss: 1.826057. Entropy: 1.068863.
Iteration 24045: Policy loss: -0.000883. Value loss: 1.469042. Entropy: 1.049737.
Training network. lr: 0.000115. clip: 0.046000
Iteration 24046: Policy loss: 0.003472. Value loss: 7.186761. Entropy: 1.335399.
Iteration 24047: Policy loss: 0.005738. 

Iteration 24104: Policy loss: 0.003106. Value loss: 1.389874. Entropy: 1.303995.
Iteration 24105: Policy loss: -0.000370. Value loss: 1.100603. Entropy: 1.309323.
episode: 9065   score: 450.0  epsilon: 1.0    steps: 852  evaluation reward: 348.5
episode: 9066   score: 260.0  epsilon: 1.0    steps: 998  evaluation reward: 345.9
Training network. lr: 0.000114. clip: 0.045775
Iteration 24106: Policy loss: 0.003563. Value loss: 5.653505. Entropy: 1.291442.
Iteration 24107: Policy loss: 0.008976. Value loss: 3.400585. Entropy: 1.276833.
Iteration 24108: Policy loss: 0.006384. Value loss: 2.425291. Entropy: 1.275192.
Training network. lr: 0.000114. clip: 0.045775
Iteration 24109: Policy loss: 0.002817. Value loss: 3.567725. Entropy: 1.007209.
Iteration 24110: Policy loss: 0.003767. Value loss: 2.725701. Entropy: 1.047867.
Iteration 24111: Policy loss: 0.004596. Value loss: 1.734874. Entropy: 1.004601.
episode: 9067   score: 530.0  epsilon: 1.0    steps: 128  evaluation reward: 345.65
Trainin

Iteration 24168: Policy loss: -0.000420. Value loss: 1.500769. Entropy: 0.770603.
Training network. lr: 0.000114. clip: 0.045663
Iteration 24169: Policy loss: 0.002942. Value loss: 2.875561. Entropy: 0.829740.
Iteration 24170: Policy loss: 0.003085. Value loss: 1.903928. Entropy: 0.811690.
Iteration 24171: Policy loss: 0.001940. Value loss: 1.533085. Entropy: 0.830433.
Training network. lr: 0.000114. clip: 0.045663
Iteration 24172: Policy loss: 0.003565. Value loss: 4.344626. Entropy: 1.216095.
Iteration 24173: Policy loss: 0.009491. Value loss: 2.907685. Entropy: 1.256000.
Iteration 24174: Policy loss: 0.004731. Value loss: 2.342343. Entropy: 1.235300.
episode: 9090   score: 380.0  epsilon: 1.0    steps: 383  evaluation reward: 362.75
episode: 9091   score: 180.0  epsilon: 1.0    steps: 991  evaluation reward: 364.75
Training network. lr: 0.000114. clip: 0.045663
Iteration 24175: Policy loss: 0.001654. Value loss: 4.946142. Entropy: 1.157408.
Iteration 24176: Policy loss: 0.000467. Va

episode: 9111   score: 30.0  epsilon: 1.0    steps: 381  evaluation reward: 349.8
episode: 9112   score: 495.0  epsilon: 1.0    steps: 884  evaluation reward: 348.3
Training network. lr: 0.000114. clip: 0.045550
Iteration 24235: Policy loss: -0.000952. Value loss: 2.675028. Entropy: 1.273944.
Iteration 24236: Policy loss: -0.000134. Value loss: 1.663777. Entropy: 1.269448.
Iteration 24237: Policy loss: -0.007856. Value loss: 1.318597. Entropy: 1.276370.
episode: 9113   score: 495.0  epsilon: 1.0    steps: 7  evaluation reward: 351.45
Training network. lr: 0.000114. clip: 0.045550
Iteration 24238: Policy loss: 0.000595. Value loss: 2.407494. Entropy: 0.979544.
Iteration 24239: Policy loss: -0.001875. Value loss: 1.592201. Entropy: 0.987643.
Iteration 24240: Policy loss: -0.004818. Value loss: 1.354345. Entropy: 0.999013.
Training network. lr: 0.000114. clip: 0.045550
Iteration 24241: Policy loss: 0.005196. Value loss: 3.794199. Entropy: 1.152705.
Iteration 24242: Policy loss: 0.001790. 

Iteration 24298: Policy loss: 0.006895. Value loss: 5.452858. Entropy: 0.901305.
Iteration 24299: Policy loss: 0.010857. Value loss: 2.930095. Entropy: 0.873092.
Iteration 24300: Policy loss: 0.009294. Value loss: 2.382576. Entropy: 0.892914.
episode: 9136   score: 925.0  epsilon: 1.0    steps: 606  evaluation reward: 361.45
Training network. lr: 0.000113. clip: 0.045325
Iteration 24301: Policy loss: 0.001132. Value loss: 3.673647. Entropy: 0.731829.
Iteration 24302: Policy loss: 0.000746. Value loss: 2.485616. Entropy: 0.744942.
Iteration 24303: Policy loss: -0.001485. Value loss: 1.851108. Entropy: 0.732665.
episode: 9137   score: 180.0  epsilon: 1.0    steps: 225  evaluation reward: 367.55
episode: 9138   score: 35.0  epsilon: 1.0    steps: 962  evaluation reward: 366.15
Training network. lr: 0.000113. clip: 0.045325
Iteration 24304: Policy loss: 0.000608. Value loss: 3.726528. Entropy: 1.109479.
Iteration 24305: Policy loss: 0.000832. Value loss: 2.590438. Entropy: 1.116404.
Iterat

Training network. lr: 0.000113. clip: 0.045213
Iteration 24364: Policy loss: 0.002839. Value loss: 3.181307. Entropy: 0.963750.
Iteration 24365: Policy loss: 0.004076. Value loss: 1.847865. Entropy: 0.983514.
Iteration 24366: Policy loss: 0.000601. Value loss: 1.556383. Entropy: 0.987753.
episode: 9158   score: 570.0  epsilon: 1.0    steps: 785  evaluation reward: 360.9
Training network. lr: 0.000113. clip: 0.045213
Iteration 24367: Policy loss: 0.004628. Value loss: 3.140014. Entropy: 0.958840.
Iteration 24368: Policy loss: 0.001555. Value loss: 2.215841. Entropy: 1.007122.
Iteration 24369: Policy loss: -0.000005. Value loss: 1.687965. Entropy: 0.988438.
Training network. lr: 0.000113. clip: 0.045213
Iteration 24370: Policy loss: 0.003537. Value loss: 6.064445. Entropy: 1.118089.
Iteration 24371: Policy loss: 0.012723. Value loss: 4.426116. Entropy: 1.109991.
Iteration 24372: Policy loss: 0.005772. Value loss: 3.798092. Entropy: 1.126690.
episode: 9159   score: 375.0  epsilon: 1.0    

Training network. lr: 0.000113. clip: 0.045100
Iteration 24430: Policy loss: 0.003438. Value loss: 3.327462. Entropy: 0.956850.
Iteration 24431: Policy loss: 0.004860. Value loss: 2.207688. Entropy: 0.960205.
Iteration 24432: Policy loss: 0.003101. Value loss: 1.779283. Entropy: 0.970272.
Training network. lr: 0.000113. clip: 0.045100
Iteration 24433: Policy loss: 0.004853. Value loss: 5.475815. Entropy: 1.029763.
Iteration 24434: Policy loss: 0.014684. Value loss: 3.338394. Entropy: 1.038309.
Iteration 24435: Policy loss: 0.014080. Value loss: 2.253884. Entropy: 1.006405.
Training network. lr: 0.000113. clip: 0.045100
Iteration 24436: Policy loss: 0.002292. Value loss: 4.260205. Entropy: 1.153149.
Iteration 24437: Policy loss: 0.006799. Value loss: 2.393112. Entropy: 1.149111.
Iteration 24438: Policy loss: 0.002836. Value loss: 1.722914. Entropy: 1.164468.
Training network. lr: 0.000113. clip: 0.045100
Iteration 24439: Policy loss: 0.006803. Value loss: 4.358074. Entropy: 1.342630.
It

episode: 9199   score: 810.0  epsilon: 1.0    steps: 694  evaluation reward: 397.85
episode: 9200   score: 695.0  epsilon: 1.0    steps: 919  evaluation reward: 401.75
Training network. lr: 0.000112. clip: 0.044988
Iteration 24499: Policy loss: -0.000239. Value loss: 3.144904. Entropy: 1.153506.
Iteration 24500: Policy loss: 0.000936. Value loss: 2.601258. Entropy: 1.171237.
Iteration 24501: Policy loss: -0.000958. Value loss: 1.959082. Entropy: 1.136370.
now time :  2019-02-23 08:04:00.491094
episode: 9201   score: 500.0  epsilon: 1.0    steps: 855  evaluation reward: 406.3
Training network. lr: 0.000112. clip: 0.044875
Iteration 24502: Policy loss: 0.004592. Value loss: 2.849137. Entropy: 1.085499.
Iteration 24503: Policy loss: 0.000532. Value loss: 1.837119. Entropy: 1.081608.
Iteration 24504: Policy loss: 0.002529. Value loss: 1.359679. Entropy: 1.057063.
Training network. lr: 0.000112. clip: 0.044875
Iteration 24505: Policy loss: 0.003132. Value loss: 3.609363. Entropy: 0.949177.


Training network. lr: 0.000112. clip: 0.044763
Iteration 24562: Policy loss: 0.003267. Value loss: 3.380991. Entropy: 0.809135.
Iteration 24563: Policy loss: 0.000213. Value loss: 2.447741. Entropy: 0.795637.
Iteration 24564: Policy loss: -0.003306. Value loss: 2.104753. Entropy: 0.810186.
episode: 9225   score: 300.0  epsilon: 1.0    steps: 517  evaluation reward: 415.1
Training network. lr: 0.000112. clip: 0.044763
Iteration 24565: Policy loss: 0.002906. Value loss: 2.574464. Entropy: 0.745892.
Iteration 24566: Policy loss: 0.004021. Value loss: 1.953725. Entropy: 0.779056.
Iteration 24567: Policy loss: 0.001075. Value loss: 1.696186. Entropy: 0.782330.
episode: 9226   score: 415.0  epsilon: 1.0    steps: 420  evaluation reward: 413.85
Training network. lr: 0.000112. clip: 0.044763
Iteration 24568: Policy loss: 0.005198. Value loss: 2.978235. Entropy: 0.884946.
Iteration 24569: Policy loss: 0.002032. Value loss: 2.036661. Entropy: 0.884778.
Iteration 24570: Policy loss: 0.004459. Val

Iteration 24629: Policy loss: -0.000214. Value loss: 2.029194. Entropy: 1.169151.
Iteration 24630: Policy loss: -0.002879. Value loss: 1.634646. Entropy: 1.172119.
Training network. lr: 0.000112. clip: 0.044650
Iteration 24631: Policy loss: 0.003513. Value loss: 3.410643. Entropy: 1.071496.
Iteration 24632: Policy loss: 0.002502. Value loss: 1.996932. Entropy: 1.060423.
Iteration 24633: Policy loss: -0.002451. Value loss: 1.648297. Entropy: 1.077260.
episode: 9246   score: 355.0  epsilon: 1.0    steps: 195  evaluation reward: 423.2
Training network. lr: 0.000112. clip: 0.044650
Iteration 24634: Policy loss: 0.001705. Value loss: 2.268119. Entropy: 1.293538.
Iteration 24635: Policy loss: -0.002963. Value loss: 1.514786. Entropy: 1.283288.
Iteration 24636: Policy loss: -0.004616. Value loss: 1.105999. Entropy: 1.297660.
episode: 9247   score: 650.0  epsilon: 1.0    steps: 594  evaluation reward: 422.7
Training network. lr: 0.000112. clip: 0.044650
Iteration 24637: Policy loss: -0.000609.

Iteration 24694: Policy loss: 0.003937. Value loss: 3.761987. Entropy: 1.111244.
Iteration 24695: Policy loss: -0.001128. Value loss: 2.400280. Entropy: 1.105280.
Iteration 24696: Policy loss: -0.002274. Value loss: 1.792170. Entropy: 1.098447.
episode: 9269   score: 445.0  epsilon: 1.0    steps: 40  evaluation reward: 411.0
episode: 9270   score: 395.0  epsilon: 1.0    steps: 543  evaluation reward: 411.5
Training network. lr: 0.000111. clip: 0.044538
Iteration 24697: Policy loss: 0.000467. Value loss: 6.483400. Entropy: 0.982254.
Iteration 24698: Policy loss: 0.003179. Value loss: 4.540731. Entropy: 0.996640.
Iteration 24699: Policy loss: 0.000808. Value loss: 3.929934. Entropy: 0.981684.
episode: 9271   score: 410.0  epsilon: 1.0    steps: 262  evaluation reward: 407.55
episode: 9272   score: 210.0  epsilon: 1.0    steps: 718  evaluation reward: 406.7
Training network. lr: 0.000111. clip: 0.044538
Iteration 24700: Policy loss: 0.001000. Value loss: 2.143481. Entropy: 0.960947.
Itera

Iteration 24759: Policy loss: 0.000299. Value loss: 4.329719. Entropy: 1.041068.
episode: 9293   score: 375.0  epsilon: 1.0    steps: 738  evaluation reward: 392.3
Training network. lr: 0.000111. clip: 0.044313
Iteration 24760: Policy loss: -0.000143. Value loss: 3.465563. Entropy: 1.177524.
Iteration 24761: Policy loss: 0.002018. Value loss: 2.216433. Entropy: 1.175089.
Iteration 24762: Policy loss: -0.001493. Value loss: 1.737308. Entropy: 1.176357.
Training network. lr: 0.000111. clip: 0.044313
Iteration 24763: Policy loss: 0.002292. Value loss: 3.182178. Entropy: 0.921575.
Iteration 24764: Policy loss: -0.000750. Value loss: 2.188025. Entropy: 0.926689.
Iteration 24765: Policy loss: -0.003244. Value loss: 1.734313. Entropy: 0.922429.
episode: 9294   score: 260.0  epsilon: 1.0    steps: 120  evaluation reward: 392.45
episode: 9295   score: 470.0  epsilon: 1.0    steps: 592  evaluation reward: 389.65
Training network. lr: 0.000111. clip: 0.044313
Iteration 24766: Policy loss: 0.00649

Training network. lr: 0.000111. clip: 0.044200
Iteration 24823: Policy loss: 0.000685. Value loss: 3.670750. Entropy: 0.919022.
Iteration 24824: Policy loss: 0.000031. Value loss: 2.321106. Entropy: 0.920979.
Iteration 24825: Policy loss: -0.003503. Value loss: 1.827875. Entropy: 0.905087.
Training network. lr: 0.000111. clip: 0.044200
Iteration 24826: Policy loss: 0.005294. Value loss: 8.622255. Entropy: 1.413486.
Iteration 24827: Policy loss: 0.013608. Value loss: 5.286757. Entropy: 1.412976.
Iteration 24828: Policy loss: 0.015554. Value loss: 3.386594. Entropy: 1.397321.
episode: 9317   score: 385.0  epsilon: 1.0    steps: 693  evaluation reward: 382.1
Training network. lr: 0.000111. clip: 0.044200
Iteration 24829: Policy loss: 0.005212. Value loss: 6.394903. Entropy: 1.302427.
Iteration 24830: Policy loss: 0.009537. Value loss: 4.232533. Entropy: 1.305805.
Iteration 24831: Policy loss: 0.005512. Value loss: 3.034796. Entropy: 1.298634.
episode: 9318   score: 570.0  epsilon: 1.0    

Iteration 24890: Policy loss: 0.008520. Value loss: 3.079499. Entropy: 1.068249.
Iteration 24891: Policy loss: 0.010601. Value loss: 2.374528. Entropy: 1.051771.
episode: 9338   score: 470.0  epsilon: 1.0    steps: 1000  evaluation reward: 387.65
Training network. lr: 0.000110. clip: 0.044088
Iteration 24892: Policy loss: 0.000944. Value loss: 2.839231. Entropy: 1.207869.
Iteration 24893: Policy loss: 0.005306. Value loss: 2.006983. Entropy: 1.190618.
Iteration 24894: Policy loss: 0.001886. Value loss: 1.499666. Entropy: 1.163452.
Training network. lr: 0.000110. clip: 0.044088
Iteration 24895: Policy loss: -0.000078. Value loss: 3.970951. Entropy: 1.235708.
Iteration 24896: Policy loss: 0.000901. Value loss: 2.813179. Entropy: 1.255153.
Iteration 24897: Policy loss: 0.000953. Value loss: 2.167100. Entropy: 1.235287.
episode: 9339   score: 240.0  epsilon: 1.0    steps: 448  evaluation reward: 388.75
episode: 9340   score: 765.0  epsilon: 1.0    steps: 532  evaluation reward: 387.45
Trai

Iteration 24956: Policy loss: 0.000317. Value loss: 1.739624. Entropy: 1.065968.
Iteration 24957: Policy loss: -0.000755. Value loss: 1.264428. Entropy: 1.072330.
episode: 9360   score: 580.0  epsilon: 1.0    steps: 430  evaluation reward: 392.4
Training network. lr: 0.000110. clip: 0.043863
Iteration 24958: Policy loss: 0.001033. Value loss: 4.478946. Entropy: 0.933595.
Iteration 24959: Policy loss: 0.002594. Value loss: 2.913296. Entropy: 0.953846.
Iteration 24960: Policy loss: 0.003239. Value loss: 2.347268. Entropy: 0.952125.
episode: 9361   score: 260.0  epsilon: 1.0    steps: 24  evaluation reward: 394.95
episode: 9362   score: 270.0  epsilon: 1.0    steps: 302  evaluation reward: 395.75
Training network. lr: 0.000110. clip: 0.043863
Iteration 24961: Policy loss: 0.003875. Value loss: 3.565578. Entropy: 1.054221.
Iteration 24962: Policy loss: 0.004694. Value loss: 2.324950. Entropy: 1.041951.
Iteration 24963: Policy loss: -0.000270. Value loss: 2.083725. Entropy: 1.027167.
episod

Iteration 25021: Policy loss: 0.004972. Value loss: 3.479054. Entropy: 1.002127.
Iteration 25022: Policy loss: -0.002146. Value loss: 2.460064. Entropy: 0.985142.
Iteration 25023: Policy loss: 0.000372. Value loss: 2.045765. Entropy: 1.002218.
episode: 9383   score: 290.0  epsilon: 1.0    steps: 181  evaluation reward: 421.35
Training network. lr: 0.000109. clip: 0.043750
Iteration 25024: Policy loss: 0.004292. Value loss: 4.253223. Entropy: 0.859426.
Iteration 25025: Policy loss: -0.000868. Value loss: 3.101911. Entropy: 0.832269.
Iteration 25026: Policy loss: -0.000058. Value loss: 2.475568. Entropy: 0.827677.
Training network. lr: 0.000109. clip: 0.043750
Iteration 25027: Policy loss: 0.001702. Value loss: 3.810622. Entropy: 0.929905.
Iteration 25028: Policy loss: 0.002847. Value loss: 2.923641. Entropy: 0.930575.
Iteration 25029: Policy loss: -0.002936. Value loss: 2.174452. Entropy: 0.916778.
episode: 9384   score: 320.0  epsilon: 1.0    steps: 55  evaluation reward: 419.2
episode

Iteration 25086: Policy loss: 0.005975. Value loss: 2.182917. Entropy: 0.792573.
episode: 9406   score: 265.0  epsilon: 1.0    steps: 22  evaluation reward: 416.75
Training network. lr: 0.000109. clip: 0.043638
Iteration 25087: Policy loss: 0.001584. Value loss: 2.937175. Entropy: 0.877393.
Iteration 25088: Policy loss: 0.003709. Value loss: 1.589102. Entropy: 0.906504.
Iteration 25089: Policy loss: -0.000391. Value loss: 1.264675. Entropy: 0.851175.
Training network. lr: 0.000109. clip: 0.043638
Iteration 25090: Policy loss: 0.003907. Value loss: 3.197001. Entropy: 1.328112.
Iteration 25091: Policy loss: 0.001914. Value loss: 2.152624. Entropy: 1.301041.
Iteration 25092: Policy loss: 0.000084. Value loss: 2.001292. Entropy: 1.322867.
Training network. lr: 0.000109. clip: 0.043638
Iteration 25093: Policy loss: 0.001343. Value loss: 4.884935. Entropy: 1.150068.
Iteration 25094: Policy loss: 0.007692. Value loss: 2.832644. Entropy: 1.164404.
Iteration 25095: Policy loss: 0.003739. Value 

Iteration 25150: Policy loss: -0.000483. Value loss: 2.972394. Entropy: 0.788150.
Iteration 25151: Policy loss: -0.003714. Value loss: 2.240705. Entropy: 0.794098.
Iteration 25152: Policy loss: -0.004414. Value loss: 1.688256. Entropy: 0.822829.
episode: 9430   score: 350.0  epsilon: 1.0    steps: 707  evaluation reward: 408.35
Training network. lr: 0.000109. clip: 0.043413
Iteration 25153: Policy loss: 0.000796. Value loss: 3.943662. Entropy: 1.114456.
Iteration 25154: Policy loss: 0.002458. Value loss: 2.486440. Entropy: 1.106348.
Iteration 25155: Policy loss: 0.002818. Value loss: 1.831318. Entropy: 1.110539.
Training network. lr: 0.000109. clip: 0.043413
Iteration 25156: Policy loss: 0.003341. Value loss: 4.962716. Entropy: 1.173833.
Iteration 25157: Policy loss: 0.007043. Value loss: 3.814701. Entropy: 1.157059.
Iteration 25158: Policy loss: 0.003576. Value loss: 3.399912. Entropy: 1.173637.
Training network. lr: 0.000109. clip: 0.043413
Iteration 25159: Policy loss: 0.004956. Val

episode: 9452   score: 185.0  epsilon: 1.0    steps: 873  evaluation reward: 396.65
Training network. lr: 0.000108. clip: 0.043300
Iteration 25216: Policy loss: 0.002190. Value loss: 2.567406. Entropy: 0.935168.
Iteration 25217: Policy loss: 0.000031. Value loss: 1.801067. Entropy: 0.959987.
Iteration 25218: Policy loss: 0.002694. Value loss: 1.446703. Entropy: 0.936859.
episode: 9453   score: 360.0  epsilon: 1.0    steps: 726  evaluation reward: 395.2
Training network. lr: 0.000108. clip: 0.043300
Iteration 25219: Policy loss: 0.001608. Value loss: 3.637951. Entropy: 1.049265.
Iteration 25220: Policy loss: 0.000607. Value loss: 2.548671. Entropy: 1.023478.
Iteration 25221: Policy loss: -0.001938. Value loss: 1.908721. Entropy: 1.034240.
Training network. lr: 0.000108. clip: 0.043300
Iteration 25222: Policy loss: 0.011899. Value loss: 5.906149. Entropy: 1.130506.
Iteration 25223: Policy loss: 0.009504. Value loss: 3.551161. Entropy: 1.147933.
Iteration 25224: Policy loss: 0.008135. Val

Training network. lr: 0.000108. clip: 0.043188
Iteration 25282: Policy loss: 0.002772. Value loss: 4.163620. Entropy: 1.163001.
Iteration 25283: Policy loss: 0.002032. Value loss: 2.466509. Entropy: 1.206173.
Iteration 25284: Policy loss: -0.003512. Value loss: 1.834560. Entropy: 1.161171.
episode: 9474   score: 395.0  epsilon: 1.0    steps: 65  evaluation reward: 395.9
episode: 9475   score: 330.0  epsilon: 1.0    steps: 381  evaluation reward: 397.6
Training network. lr: 0.000108. clip: 0.043188
Iteration 25285: Policy loss: 0.007574. Value loss: 8.554938. Entropy: 1.212332.
Iteration 25286: Policy loss: 0.016959. Value loss: 5.068779. Entropy: 1.209930.
Iteration 25287: Policy loss: 0.011389. Value loss: 3.742453. Entropy: 1.210694.
episode: 9476   score: 865.0  epsilon: 1.0    steps: 253  evaluation reward: 394.55
episode: 9477   score: 710.0  epsilon: 1.0    steps: 514  evaluation reward: 400.05
Training network. lr: 0.000108. clip: 0.043188
Iteration 25288: Policy loss: 0.001716.

Training network. lr: 0.000108. clip: 0.043075
Iteration 25348: Policy loss: 0.007017. Value loss: 4.068329. Entropy: 0.903158.
Iteration 25349: Policy loss: 0.009626. Value loss: 2.534119. Entropy: 0.887717.
Iteration 25350: Policy loss: 0.002907. Value loss: 2.277598. Entropy: 0.890171.
episode: 9496   score: 210.0  epsilon: 1.0    steps: 235  evaluation reward: 389.95
episode: 9497   score: 405.0  epsilon: 1.0    steps: 480  evaluation reward: 387.85
Training network. lr: 0.000107. clip: 0.042963
Iteration 25351: Policy loss: 0.001056. Value loss: 3.046126. Entropy: 1.111864.
Iteration 25352: Policy loss: -0.003150. Value loss: 2.222945. Entropy: 1.132646.
Iteration 25353: Policy loss: -0.000193. Value loss: 1.827840. Entropy: 1.126393.
episode: 9498   score: 730.0  epsilon: 1.0    steps: 841  evaluation reward: 385.25
episode: 9499   score: 355.0  epsilon: 1.0    steps: 917  evaluation reward: 390.45
Training network. lr: 0.000107. clip: 0.042963
Iteration 25354: Policy loss: 0.002

Iteration 25414: Policy loss: 0.003633. Value loss: 4.085648. Entropy: 1.146162.
Iteration 25415: Policy loss: 0.005957. Value loss: 2.984548. Entropy: 1.140359.
Iteration 25416: Policy loss: 0.002365. Value loss: 2.690689. Entropy: 1.143139.
episode: 9517   score: 465.0  epsilon: 1.0    steps: 262  evaluation reward: 410.8
episode: 9518   score: 400.0  epsilon: 1.0    steps: 502  evaluation reward: 413.2
Training network. lr: 0.000107. clip: 0.042850
Iteration 25417: Policy loss: -0.000910. Value loss: 4.236310. Entropy: 0.857159.
Iteration 25418: Policy loss: 0.001146. Value loss: 2.402772. Entropy: 0.856884.
Iteration 25419: Policy loss: -0.002551. Value loss: 2.281368. Entropy: 0.828271.
episode: 9519   score: 345.0  epsilon: 1.0    steps: 870  evaluation reward: 413.25
Training network. lr: 0.000107. clip: 0.042850
Iteration 25420: Policy loss: 0.001659. Value loss: 3.053635. Entropy: 1.131725.
Iteration 25421: Policy loss: 0.002266. Value loss: 2.026982. Entropy: 1.130865.
Iterat

Iteration 25478: Policy loss: 0.010942. Value loss: 3.107335. Entropy: 1.105311.
Iteration 25479: Policy loss: 0.006209. Value loss: 1.986432. Entropy: 1.068828.
Training network. lr: 0.000107. clip: 0.042738
Iteration 25480: Policy loss: 0.003293. Value loss: 3.698549. Entropy: 1.164435.
Iteration 25481: Policy loss: 0.002605. Value loss: 2.126542. Entropy: 1.159174.
Iteration 25482: Policy loss: 0.000152. Value loss: 1.719414. Entropy: 1.170145.
episode: 9542   score: 475.0  epsilon: 1.0    steps: 115  evaluation reward: 401.1
episode: 9543   score: 315.0  epsilon: 1.0    steps: 570  evaluation reward: 402.25
episode: 9544   score: 545.0  epsilon: 1.0    steps: 886  evaluation reward: 403.3
Training network. lr: 0.000107. clip: 0.042738
Iteration 25483: Policy loss: 0.002758. Value loss: 7.701483. Entropy: 1.076332.
Iteration 25484: Policy loss: 0.006603. Value loss: 5.729456. Entropy: 1.079755.
Iteration 25485: Policy loss: 0.003071. Value loss: 4.515965. Entropy: 1.074312.
episode:

Training network. lr: 0.000107. clip: 0.042625
Iteration 25543: Policy loss: 0.003940. Value loss: 6.255556. Entropy: 1.038154.
Iteration 25544: Policy loss: 0.012305. Value loss: 3.392224. Entropy: 1.023401.
Iteration 25545: Policy loss: 0.005170. Value loss: 2.417774. Entropy: 1.039017.
episode: 9565   score: 180.0  epsilon: 1.0    steps: 876  evaluation reward: 408.65
Training network. lr: 0.000107. clip: 0.042625
Iteration 25546: Policy loss: 0.002704. Value loss: 5.219071. Entropy: 1.091587.
Iteration 25547: Policy loss: 0.003765. Value loss: 3.283576. Entropy: 1.105514.
Iteration 25548: Policy loss: 0.003978. Value loss: 2.367676. Entropy: 1.096744.
episode: 9566   score: 515.0  epsilon: 1.0    steps: 549  evaluation reward: 404.15
Training network. lr: 0.000107. clip: 0.042625
Iteration 25549: Policy loss: 0.003446. Value loss: 3.534985. Entropy: 1.129655.
Iteration 25550: Policy loss: 0.005488. Value loss: 2.158423. Entropy: 1.113170.
Iteration 25551: Policy loss: -0.000461. Va

Iteration 25605: Policy loss: 0.002867. Value loss: 1.423421. Entropy: 1.171969.
episode: 9592   score: 210.0  epsilon: 1.0    steps: 101  evaluation reward: 373.95
episode: 9593   score: 180.0  epsilon: 1.0    steps: 199  evaluation reward: 372.5
episode: 9594   score: 135.0  epsilon: 1.0    steps: 674  evaluation reward: 370.05
Training network. lr: 0.000106. clip: 0.042400
Iteration 25606: Policy loss: 0.003900. Value loss: 2.645177. Entropy: 1.113669.
Iteration 25607: Policy loss: 0.000051. Value loss: 1.860093. Entropy: 1.111001.
Iteration 25608: Policy loss: 0.001695. Value loss: 1.508210. Entropy: 1.111623.
episode: 9595   score: 300.0  epsilon: 1.0    steps: 507  evaluation reward: 367.7
Training network. lr: 0.000106. clip: 0.042400
Iteration 25609: Policy loss: 0.005639. Value loss: 3.861615. Entropy: 0.987180.
Iteration 25610: Policy loss: 0.004475. Value loss: 2.249281. Entropy: 0.978274.
Iteration 25611: Policy loss: 0.005782. Value loss: 1.630069. Entropy: 0.984492.
episo

Iteration 25668: Policy loss: 0.008988. Value loss: 3.458907. Entropy: 1.156808.
episode: 9617   score: 215.0  epsilon: 1.0    steps: 676  evaluation reward: 340.55
Training network. lr: 0.000106. clip: 0.042288
Iteration 25669: Policy loss: 0.008731. Value loss: 2.952253. Entropy: 0.947210.
Iteration 25670: Policy loss: 0.010925. Value loss: 1.582754. Entropy: 0.953053.
Iteration 25671: Policy loss: 0.008631. Value loss: 1.388870. Entropy: 0.929884.
Training network. lr: 0.000106. clip: 0.042288
Iteration 25672: Policy loss: 0.000382. Value loss: 3.243475. Entropy: 0.948694.
Iteration 25673: Policy loss: 0.003136. Value loss: 1.860196. Entropy: 0.933208.
Iteration 25674: Policy loss: -0.001239. Value loss: 1.358523. Entropy: 0.932173.
Training network. lr: 0.000106. clip: 0.042288
Iteration 25675: Policy loss: 0.005840. Value loss: 4.857901. Entropy: 1.083180.
Iteration 25676: Policy loss: 0.010147. Value loss: 3.244252. Entropy: 1.089627.
Iteration 25677: Policy loss: 0.004998. Value

Iteration 25734: Policy loss: 0.003861. Value loss: 1.214073. Entropy: 1.029144.
episode: 9639   score: 530.0  epsilon: 1.0    steps: 501  evaluation reward: 343.3
Training network. lr: 0.000105. clip: 0.042175
Iteration 25735: Policy loss: 0.003031. Value loss: 3.780195. Entropy: 1.152655.
Iteration 25736: Policy loss: 0.002708. Value loss: 2.500410. Entropy: 1.145472.
Iteration 25737: Policy loss: 0.000537. Value loss: 1.906735. Entropy: 1.174979.
episode: 9640   score: 210.0  epsilon: 1.0    steps: 894  evaluation reward: 346.35
episode: 9641   score: 315.0  epsilon: 1.0    steps: 901  evaluation reward: 346.65
Training network. lr: 0.000105. clip: 0.042175
Iteration 25738: Policy loss: 0.008165. Value loss: 7.624489. Entropy: 0.995655.
Iteration 25739: Policy loss: 0.023874. Value loss: 3.683284. Entropy: 0.954946.
Iteration 25740: Policy loss: 0.011030. Value loss: 2.651864. Entropy: 0.959170.
episode: 9642   score: 700.0  epsilon: 1.0    steps: 181  evaluation reward: 345.95
Trai

Iteration 25797: Policy loss: 0.006004. Value loss: 2.475058. Entropy: 0.901142.
Training network. lr: 0.000105. clip: 0.042063
Iteration 25798: Policy loss: 0.001610. Value loss: 5.146406. Entropy: 1.048423.
Iteration 25799: Policy loss: 0.005114. Value loss: 2.969271. Entropy: 1.052519.
Iteration 25800: Policy loss: 0.006995. Value loss: 2.458067. Entropy: 1.058642.
episode: 9664   score: 440.0  epsilon: 1.0    steps: 157  evaluation reward: 341.45
Training network. lr: 0.000105. clip: 0.041950
Iteration 25801: Policy loss: 0.004459. Value loss: 2.389985. Entropy: 0.976483.
Iteration 25802: Policy loss: 0.003344. Value loss: 1.463302. Entropy: 0.959060.
Iteration 25803: Policy loss: 0.002361. Value loss: 1.145988. Entropy: 0.979405.
episode: 9665   score: 390.0  epsilon: 1.0    steps: 429  evaluation reward: 339.1
Training network. lr: 0.000105. clip: 0.041950
Iteration 25804: Policy loss: 0.004674. Value loss: 2.944237. Entropy: 1.179821.
Iteration 25805: Policy loss: 0.003126. Valu

Iteration 25862: Policy loss: 0.002905. Value loss: 2.149906. Entropy: 1.174485.
Iteration 25863: Policy loss: 0.003865. Value loss: 1.621765. Entropy: 1.187828.
episode: 9687   score: 210.0  epsilon: 1.0    steps: 191  evaluation reward: 355.75
Training network. lr: 0.000105. clip: 0.041838
Iteration 25864: Policy loss: 0.002635. Value loss: 3.704041. Entropy: 1.091887.
Iteration 25865: Policy loss: 0.002371. Value loss: 2.288869. Entropy: 1.090160.
Iteration 25866: Policy loss: 0.000019. Value loss: 1.952571. Entropy: 1.099509.
episode: 9688   score: 385.0  epsilon: 1.0    steps: 99  evaluation reward: 355.7
episode: 9689   score: 320.0  epsilon: 1.0    steps: 599  evaluation reward: 356.65
Training network. lr: 0.000105. clip: 0.041838
Iteration 25867: Policy loss: 0.003385. Value loss: 2.406749. Entropy: 1.185395.
Iteration 25868: Policy loss: 0.005748. Value loss: 1.792595. Entropy: 1.175484.
Iteration 25869: Policy loss: 0.003532. Value loss: 1.431741. Entropy: 1.175623.
Training

Iteration 25925: Policy loss: 0.004130. Value loss: 2.295701. Entropy: 1.191239.
Iteration 25926: Policy loss: 0.005100. Value loss: 2.072201. Entropy: 1.200706.
episode: 9712   score: 315.0  epsilon: 1.0    steps: 479  evaluation reward: 357.0
Training network. lr: 0.000104. clip: 0.041725
Iteration 25927: Policy loss: 0.003699. Value loss: 3.971376. Entropy: 1.105983.
Iteration 25928: Policy loss: 0.004831. Value loss: 2.971756. Entropy: 1.089697.
Iteration 25929: Policy loss: 0.002632. Value loss: 2.462856. Entropy: 1.097793.
episode: 9713   score: 210.0  epsilon: 1.0    steps: 741  evaluation reward: 355.45
Training network. lr: 0.000104. clip: 0.041725
Iteration 25930: Policy loss: 0.003079. Value loss: 3.099453. Entropy: 1.306258.
Iteration 25931: Policy loss: -0.000428. Value loss: 2.074943. Entropy: 1.285125.
Iteration 25932: Policy loss: 0.005602. Value loss: 1.481937. Entropy: 1.292019.
episode: 9714   score: 375.0  epsilon: 1.0    steps: 926  evaluation reward: 355.0
Trainin

Training network. lr: 0.000104. clip: 0.041613
Iteration 25990: Policy loss: 0.003381. Value loss: 2.203468. Entropy: 1.033061.
Iteration 25991: Policy loss: 0.005923. Value loss: 1.707957. Entropy: 1.032829.
Iteration 25992: Policy loss: 0.004319. Value loss: 1.219046. Entropy: 1.028882.
Training network. lr: 0.000104. clip: 0.041613
Iteration 25993: Policy loss: 0.004824. Value loss: 3.770249. Entropy: 1.010002.
Iteration 25994: Policy loss: 0.002246. Value loss: 2.794648. Entropy: 1.021478.
Iteration 25995: Policy loss: 0.002542. Value loss: 2.151999. Entropy: 1.023947.
episode: 9736   score: 285.0  epsilon: 1.0    steps: 63  evaluation reward: 356.15
episode: 9737   score: 420.0  epsilon: 1.0    steps: 999  evaluation reward: 355.25
Training network. lr: 0.000104. clip: 0.041613
Iteration 25996: Policy loss: 0.003677. Value loss: 3.307193. Entropy: 1.206345.
Iteration 25997: Policy loss: -0.001284. Value loss: 2.168503. Entropy: 1.197360.
Iteration 25998: Policy loss: -0.002482. Va

Iteration 26054: Policy loss: 0.001998. Value loss: 2.412573. Entropy: 0.978750.
Iteration 26055: Policy loss: 0.001404. Value loss: 1.950583. Entropy: 0.977763.
Training network. lr: 0.000103. clip: 0.041388
Iteration 26056: Policy loss: 0.008057. Value loss: 3.645044. Entropy: 1.251712.
Iteration 26057: Policy loss: 0.005305. Value loss: 2.214069. Entropy: 1.247287.
Iteration 26058: Policy loss: 0.004797. Value loss: 1.731839. Entropy: 1.246804.
episode: 9760   score: 405.0  epsilon: 1.0    steps: 666  evaluation reward: 357.15
Training network. lr: 0.000103. clip: 0.041388
Iteration 26059: Policy loss: 0.000880. Value loss: 4.272599. Entropy: 1.093377.
Iteration 26060: Policy loss: 0.004039. Value loss: 3.387739. Entropy: 1.094190.
Iteration 26061: Policy loss: 0.000602. Value loss: 3.126349. Entropy: 1.109520.
episode: 9761   score: 420.0  epsilon: 1.0    steps: 54  evaluation reward: 355.7
episode: 9762   score: 180.0  epsilon: 1.0    steps: 896  evaluation reward: 356.65
Training

Iteration 26121: Policy loss: 0.009827. Value loss: 3.505480. Entropy: 1.388477.
episode: 9781   score: 255.0  epsilon: 1.0    steps: 124  evaluation reward: 360.2
episode: 9782   score: 320.0  epsilon: 1.0    steps: 591  evaluation reward: 359.9
Training network. lr: 0.000103. clip: 0.041275
Iteration 26122: Policy loss: 0.006732. Value loss: 6.360060. Entropy: 1.101896.
Iteration 26123: Policy loss: 0.004488. Value loss: 3.625180. Entropy: 1.092268.
Iteration 26124: Policy loss: 0.003649. Value loss: 2.934155. Entropy: 1.098158.
episode: 9783   score: 565.0  epsilon: 1.0    steps: 943  evaluation reward: 360.85
Training network. lr: 0.000103. clip: 0.041275
Iteration 26125: Policy loss: 0.001019. Value loss: 3.782237. Entropy: 1.107982.
Iteration 26126: Policy loss: 0.002668. Value loss: 2.539824. Entropy: 1.115923.
Iteration 26127: Policy loss: 0.001920. Value loss: 2.137096. Entropy: 1.085133.
episode: 9784   score: 625.0  epsilon: 1.0    steps: 302  evaluation reward: 364.7
Traini

Iteration 26186: Policy loss: 0.004634. Value loss: 3.452543. Entropy: 1.171921.
Iteration 26187: Policy loss: 0.002586. Value loss: 2.812353. Entropy: 1.161312.
Training network. lr: 0.000103. clip: 0.041163
Iteration 26188: Policy loss: 0.007506. Value loss: 3.040035. Entropy: 1.075452.
Iteration 26189: Policy loss: 0.004590. Value loss: 2.057298. Entropy: 1.066757.
Iteration 26190: Policy loss: 0.004669. Value loss: 1.753190. Entropy: 1.081773.
episode: 9804   score: 390.0  epsilon: 1.0    steps: 645  evaluation reward: 380.4
Training network. lr: 0.000103. clip: 0.041163
Iteration 26191: Policy loss: 0.003380. Value loss: 3.526007. Entropy: 1.142956.
Iteration 26192: Policy loss: 0.000168. Value loss: 2.318732. Entropy: 1.155586.
Iteration 26193: Policy loss: -0.002193. Value loss: 1.971005. Entropy: 1.145459.
episode: 9805   score: 620.0  epsilon: 1.0    steps: 62  evaluation reward: 381.6
Training network. lr: 0.000103. clip: 0.041163
Iteration 26194: Policy loss: 0.002535. Value

Iteration 26251: Policy loss: 0.001774. Value loss: 4.316994. Entropy: 1.055184.
Iteration 26252: Policy loss: 0.006335. Value loss: 3.028791. Entropy: 1.066732.
Iteration 26253: Policy loss: -0.000725. Value loss: 2.298656. Entropy: 1.062713.
episode: 9827   score: 665.0  epsilon: 1.0    steps: 311  evaluation reward: 400.3
Training network. lr: 0.000102. clip: 0.040938
Iteration 26254: Policy loss: 0.003174. Value loss: 2.635687. Entropy: 1.111225.
Iteration 26255: Policy loss: 0.000615. Value loss: 1.662379. Entropy: 1.122965.
Iteration 26256: Policy loss: 0.000279. Value loss: 1.475772. Entropy: 1.150972.
episode: 9828   score: 240.0  epsilon: 1.0    steps: 476  evaluation reward: 400.75
Training network. lr: 0.000102. clip: 0.040938
Iteration 26257: Policy loss: 0.004187. Value loss: 4.435583. Entropy: 1.187875.
Iteration 26258: Policy loss: 0.010267. Value loss: 2.957245. Entropy: 1.198212.
Iteration 26259: Policy loss: 0.001639. Value loss: 2.393193. Entropy: 1.210518.
episode: 

episode: 9850   score: 775.0  epsilon: 1.0    steps: 411  evaluation reward: 395.55
Training network. lr: 0.000102. clip: 0.040825
Iteration 26317: Policy loss: 0.006381. Value loss: 4.445912. Entropy: 1.049268.
Iteration 26318: Policy loss: 0.010933. Value loss: 3.073505. Entropy: 1.076211.
Iteration 26319: Policy loss: 0.007160. Value loss: 2.688091. Entropy: 1.076923.
Training network. lr: 0.000102. clip: 0.040825
Iteration 26320: Policy loss: 0.002926. Value loss: 4.068388. Entropy: 1.127537.
Iteration 26321: Policy loss: 0.002743. Value loss: 2.732796. Entropy: 1.117135.
Iteration 26322: Policy loss: 0.001555. Value loss: 2.155295. Entropy: 1.145850.
now time :  2019-02-23 08:41:24.916836
episode: 9851   score: 390.0  epsilon: 1.0    steps: 585  evaluation reward: 401.05
episode: 9852   score: 425.0  epsilon: 1.0    steps: 934  evaluation reward: 401.05
Training network. lr: 0.000102. clip: 0.040825
Iteration 26323: Policy loss: 0.000841. Value loss: 3.413009. Entropy: 1.133722.
I

Training network. lr: 0.000102. clip: 0.040713
Iteration 26383: Policy loss: 0.002004. Value loss: 5.551008. Entropy: 1.070301.
Iteration 26384: Policy loss: 0.004306. Value loss: 3.723672. Entropy: 1.103318.
Iteration 26385: Policy loss: 0.000945. Value loss: 2.613656. Entropy: 1.081445.
episode: 9872   score: 375.0  epsilon: 1.0    steps: 224  evaluation reward: 407.45
episode: 9873   score: 520.0  epsilon: 1.0    steps: 299  evaluation reward: 409.1
Training network. lr: 0.000102. clip: 0.040713
Iteration 26386: Policy loss: 0.001633. Value loss: 4.041707. Entropy: 0.942421.
Iteration 26387: Policy loss: 0.005832. Value loss: 2.403665. Entropy: 0.913149.
Iteration 26388: Policy loss: 0.003851. Value loss: 1.902561. Entropy: 0.925422.
episode: 9874   score: 330.0  epsilon: 1.0    steps: 401  evaluation reward: 411.55
Training network. lr: 0.000102. clip: 0.040713
Iteration 26389: Policy loss: 0.006600. Value loss: 4.090553. Entropy: 1.048283.
Iteration 26390: Policy loss: 0.008771. V

Iteration 26447: Policy loss: 0.001216. Value loss: 2.154522. Entropy: 0.920640.
Iteration 26448: Policy loss: -0.000150. Value loss: 1.728859. Entropy: 0.931199.
episode: 9896   score: 180.0  epsilon: 1.0    steps: 863  evaluation reward: 414.65
Training network. lr: 0.000102. clip: 0.040600
Iteration 26449: Policy loss: 0.012334. Value loss: 3.844946. Entropy: 1.009349.
Iteration 26450: Policy loss: 0.011943. Value loss: 2.591125. Entropy: 1.022964.
Iteration 26451: Policy loss: 0.005223. Value loss: 2.041189. Entropy: 1.019027.
episode: 9897   score: 210.0  epsilon: 1.0    steps: 74  evaluation reward: 410.65
episode: 9898   score: 155.0  epsilon: 1.0    steps: 338  evaluation reward: 410.65
Training network. lr: 0.000101. clip: 0.040488
Iteration 26452: Policy loss: 0.012606. Value loss: 4.385262. Entropy: 0.983736.
Iteration 26453: Policy loss: 0.019366. Value loss: 2.271928. Entropy: 1.011660.
Iteration 26454: Policy loss: 0.012252. Value loss: 1.787877. Entropy: 1.008674.
episod

episode: 9919   score: 525.0  epsilon: 1.0    steps: 52  evaluation reward: 393.05
episode: 9920   score: 450.0  epsilon: 1.0    steps: 309  evaluation reward: 394.2
Training network. lr: 0.000101. clip: 0.040375
Iteration 26512: Policy loss: 0.000995. Value loss: 5.651121. Entropy: 0.982204.
Iteration 26513: Policy loss: 0.005009. Value loss: 4.944417. Entropy: 0.971601.
Iteration 26514: Policy loss: 0.001910. Value loss: 3.912855. Entropy: 0.985281.
episode: 9921   score: 105.0  epsilon: 1.0    steps: 826  evaluation reward: 395.1
Training network. lr: 0.000101. clip: 0.040375
Iteration 26515: Policy loss: 0.001374. Value loss: 5.248280. Entropy: 1.161534.
Iteration 26516: Policy loss: 0.000558. Value loss: 5.108922. Entropy: 1.133528.
Iteration 26517: Policy loss: 0.002543. Value loss: 3.964929. Entropy: 1.155325.
Training network. lr: 0.000101. clip: 0.040375
Iteration 26518: Policy loss: 0.005028. Value loss: 2.694654. Entropy: 1.214577.
Iteration 26519: Policy loss: 0.004155. Val

Iteration 26576: Policy loss: 0.005528. Value loss: 4.728282. Entropy: 1.115176.
Iteration 26577: Policy loss: 0.002090. Value loss: 3.421716. Entropy: 1.115988.
episode: 9943   score: 445.0  epsilon: 1.0    steps: 469  evaluation reward: 395.0
Training network. lr: 0.000101. clip: 0.040263
Iteration 26578: Policy loss: 0.000200. Value loss: 4.228733. Entropy: 1.155774.
Iteration 26579: Policy loss: 0.003709. Value loss: 2.794733. Entropy: 1.166859.
Iteration 26580: Policy loss: 0.004491. Value loss: 2.038574. Entropy: 1.164676.
episode: 9944   score: 635.0  epsilon: 1.0    steps: 918  evaluation reward: 397.3
Training network. lr: 0.000101. clip: 0.040263
Iteration 26581: Policy loss: 0.004481. Value loss: 5.163321. Entropy: 1.137260.
Iteration 26582: Policy loss: 0.026835. Value loss: 3.396184. Entropy: 1.133643.
Iteration 26583: Policy loss: 0.017450. Value loss: 2.806939. Entropy: 1.126824.
episode: 9945   score: 345.0  epsilon: 1.0    steps: 231  evaluation reward: 400.5
episode: 

Training network. lr: 0.000100. clip: 0.040150
Iteration 26641: Policy loss: 0.002898. Value loss: 3.417350. Entropy: 0.965036.
Iteration 26642: Policy loss: 0.004332. Value loss: 1.986741. Entropy: 0.965119.
Iteration 26643: Policy loss: 0.001316. Value loss: 1.470386. Entropy: 0.974013.
episode: 9966   score: 320.0  epsilon: 1.0    steps: 468  evaluation reward: 387.35
Training network. lr: 0.000100. clip: 0.040150
Iteration 26644: Policy loss: 0.003178. Value loss: 4.299562. Entropy: 1.122717.
Iteration 26645: Policy loss: 0.006132. Value loss: 2.493999. Entropy: 1.102286.
Iteration 26646: Policy loss: 0.011002. Value loss: 2.023323. Entropy: 1.107777.
episode: 9967   score: 355.0  epsilon: 1.0    steps: 834  evaluation reward: 386.6
Training network. lr: 0.000100. clip: 0.040150
Iteration 26647: Policy loss: 0.001167. Value loss: 3.358204. Entropy: 0.917274.
Iteration 26648: Policy loss: 0.000473. Value loss: 2.303446. Entropy: 0.921674.
Iteration 26649: Policy loss: -0.001806. Val

episode: 9988   score: 290.0  epsilon: 1.0    steps: 853  evaluation reward: 374.6
Training network. lr: 0.000100. clip: 0.039925
Iteration 26707: Policy loss: 0.001998. Value loss: 6.160027. Entropy: 1.187015.
Iteration 26708: Policy loss: 0.007731. Value loss: 3.435548. Entropy: 1.197972.
Iteration 26709: Policy loss: 0.010540. Value loss: 2.423987. Entropy: 1.175260.
Training network. lr: 0.000100. clip: 0.039925
Iteration 26710: Policy loss: 0.000279. Value loss: 1.947487. Entropy: 1.104743.
Iteration 26711: Policy loss: -0.000233. Value loss: 1.240779. Entropy: 1.080570.
Iteration 26712: Policy loss: -0.002509. Value loss: 0.925430. Entropy: 1.127882.
Training network. lr: 0.000100. clip: 0.039925
Iteration 26713: Policy loss: 0.002400. Value loss: 5.738947. Entropy: 1.100742.
Iteration 26714: Policy loss: 0.004185. Value loss: 3.228379. Entropy: 1.097228.
Iteration 26715: Policy loss: 0.003648. Value loss: 2.744723. Entropy: 1.115285.
episode: 9989   score: 615.0  epsilon: 1.0   

Training network. lr: 0.000100. clip: 0.039813
Iteration 26770: Policy loss: 0.000836. Value loss: 3.541026. Entropy: 0.966534.
Iteration 26771: Policy loss: 0.000397. Value loss: 2.252709. Entropy: 0.945948.
Iteration 26772: Policy loss: -0.001341. Value loss: 1.885980. Entropy: 0.958071.
episode: 10013   score: 285.0  epsilon: 1.0    steps: 926  evaluation reward: 393.8
Training network. lr: 0.000100. clip: 0.039813
Iteration 26773: Policy loss: 0.002425. Value loss: 3.251651. Entropy: 0.770292.
Iteration 26774: Policy loss: 0.002944. Value loss: 2.270867. Entropy: 0.786234.
Iteration 26775: Policy loss: 0.003110. Value loss: 1.952430. Entropy: 0.776028.
episode: 10014   score: 330.0  epsilon: 1.0    steps: 598  evaluation reward: 393.3
episode: 10015   score: 515.0  epsilon: 1.0    steps: 732  evaluation reward: 392.4
Training network. lr: 0.000100. clip: 0.039813
Iteration 26776: Policy loss: 0.007551. Value loss: 3.575634. Entropy: 0.748849.
Iteration 26777: Policy loss: 0.006281.

Training network. lr: 0.000099. clip: 0.039700
Iteration 26836: Policy loss: 0.014412. Value loss: 5.299890. Entropy: 1.003450.
Iteration 26837: Policy loss: 0.011746. Value loss: 3.434105. Entropy: 0.991351.
Iteration 26838: Policy loss: 0.013132. Value loss: 2.767382. Entropy: 1.010697.
episode: 10035   score: 595.0  epsilon: 1.0    steps: 741  evaluation reward: 401.35
episode: 10036   score: 670.0  epsilon: 1.0    steps: 803  evaluation reward: 401.85
Training network. lr: 0.000099. clip: 0.039700
Iteration 26839: Policy loss: 0.002613. Value loss: 4.536152. Entropy: 0.989791.
Iteration 26840: Policy loss: 0.006999. Value loss: 3.119835. Entropy: 0.984160.
Iteration 26841: Policy loss: 0.002372. Value loss: 2.571368. Entropy: 0.975734.
Training network. lr: 0.000099. clip: 0.039700
Iteration 26842: Policy loss: 0.006904. Value loss: 3.508210. Entropy: 0.835493.
Iteration 26843: Policy loss: 0.002008. Value loss: 2.130365. Entropy: 0.832700.
Iteration 26844: Policy loss: -0.001759. 

Iteration 26903: Policy loss: 0.008202. Value loss: 2.554788. Entropy: 1.026994.
Iteration 26904: Policy loss: 0.007488. Value loss: 2.007959. Entropy: 1.047906.
episode: 10055   score: 140.0  epsilon: 1.0    steps: 889  evaluation reward: 399.65
Training network. lr: 0.000099. clip: 0.039475
Iteration 26905: Policy loss: 0.003812. Value loss: 2.662422. Entropy: 1.053295.
Iteration 26906: Policy loss: 0.002662. Value loss: 1.727555. Entropy: 1.066607.
Iteration 26907: Policy loss: 0.000014. Value loss: 1.289512. Entropy: 1.059051.
Training network. lr: 0.000099. clip: 0.039475
Iteration 26908: Policy loss: 0.004975. Value loss: 5.976877. Entropy: 1.249434.
Iteration 26909: Policy loss: 0.003073. Value loss: 3.507135. Entropy: 1.244483.
Iteration 26910: Policy loss: 0.004242. Value loss: 2.900591. Entropy: 1.239167.
episode: 10056   score: 360.0  epsilon: 1.0    steps: 112  evaluation reward: 400.05
episode: 10057   score: 180.0  epsilon: 1.0    steps: 733  evaluation reward: 399.15
Tra

Iteration 26966: Policy loss: 0.007107. Value loss: 4.235573. Entropy: 0.709301.
Iteration 26967: Policy loss: 0.006317. Value loss: 3.907107. Entropy: 0.730509.
Training network. lr: 0.000098. clip: 0.039363
Iteration 26968: Policy loss: 0.002790. Value loss: 3.300535. Entropy: 1.161085.
Iteration 26969: Policy loss: 0.004978. Value loss: 2.071779. Entropy: 1.164083.
Iteration 26970: Policy loss: 0.006013. Value loss: 1.653278. Entropy: 1.166320.
episode: 10080   score: 210.0  epsilon: 1.0    steps: 508  evaluation reward: 383.75
Training network. lr: 0.000098. clip: 0.039363
Iteration 26971: Policy loss: 0.002930. Value loss: 3.182757. Entropy: 1.217955.
Iteration 26972: Policy loss: 0.007010. Value loss: 2.157928. Entropy: 1.199185.
Iteration 26973: Policy loss: 0.003111. Value loss: 1.919260. Entropy: 1.208188.
episode: 10081   score: 230.0  epsilon: 1.0    steps: 765  evaluation reward: 384.3
episode: 10082   score: 260.0  epsilon: 1.0    steps: 810  evaluation reward: 384.2
episo

Iteration 27030: Policy loss: -0.003233. Value loss: 1.662784. Entropy: 0.979498.
Training network. lr: 0.000098. clip: 0.039250
Iteration 27031: Policy loss: 0.002618. Value loss: 7.031391. Entropy: 1.084399.
Iteration 27032: Policy loss: -0.000083. Value loss: 5.042979. Entropy: 1.099533.
Iteration 27033: Policy loss: 0.000976. Value loss: 4.949442. Entropy: 1.096714.
episode: 10104   score: 55.0  epsilon: 1.0    steps: 929  evaluation reward: 356.25
Training network. lr: 0.000098. clip: 0.039250
Iteration 27034: Policy loss: 0.002088. Value loss: 4.125659. Entropy: 1.106112.
Iteration 27035: Policy loss: 0.010003. Value loss: 2.680422. Entropy: 1.106435.
Iteration 27036: Policy loss: 0.004907. Value loss: 2.397660. Entropy: 1.138744.
episode: 10105   score: 45.0  epsilon: 1.0    steps: 167  evaluation reward: 354.65
Training network. lr: 0.000098. clip: 0.039250
Iteration 27037: Policy loss: 0.001236. Value loss: 2.478594. Entropy: 1.199186.
Iteration 27038: Policy loss: 0.004556. V

Iteration 27096: Policy loss: 0.004736. Value loss: 1.804476. Entropy: 1.146444.
episode: 10126   score: 325.0  epsilon: 1.0    steps: 956  evaluation reward: 349.75
Training network. lr: 0.000098. clip: 0.039137
Iteration 27097: Policy loss: 0.004134. Value loss: 6.732702. Entropy: 1.099785.
Iteration 27098: Policy loss: 0.003372. Value loss: 4.768571. Entropy: 1.098668.
Iteration 27099: Policy loss: 0.007116. Value loss: 3.676055. Entropy: 1.097576.
episode: 10127   score: 685.0  epsilon: 1.0    steps: 135  evaluation reward: 351.1
episode: 10128   score: 360.0  epsilon: 1.0    steps: 370  evaluation reward: 349.25
Training network. lr: 0.000098. clip: 0.039137
Iteration 27100: Policy loss: 0.002366. Value loss: 3.252355. Entropy: 0.992731.
Iteration 27101: Policy loss: 0.003171. Value loss: 2.358560. Entropy: 1.018863.
Iteration 27102: Policy loss: 0.005194. Value loss: 1.803118. Entropy: 0.999768.
Training network. lr: 0.000098. clip: 0.039025
Iteration 27103: Policy loss: 0.001315

Iteration 27162: Policy loss: 0.005570. Value loss: 1.619264. Entropy: 1.115120.
episode: 10148   score: 295.0  epsilon: 1.0    steps: 116  evaluation reward: 335.2
episode: 10149   score: 270.0  epsilon: 1.0    steps: 888  evaluation reward: 335.75
Training network. lr: 0.000097. clip: 0.038913
Iteration 27163: Policy loss: 0.003889. Value loss: 5.753804. Entropy: 1.225837.
Iteration 27164: Policy loss: 0.007471. Value loss: 3.854684. Entropy: 1.229682.
Iteration 27165: Policy loss: 0.005378. Value loss: 2.815280. Entropy: 1.228312.
episode: 10150   score: 650.0  epsilon: 1.0    steps: 371  evaluation reward: 334.25
now time :  2019-02-23 08:58:45.014008
episode: 10151   score: 440.0  epsilon: 1.0    steps: 477  evaluation reward: 336.6
episode: 10152   score: 310.0  epsilon: 1.0    steps: 619  evaluation reward: 337.85
Training network. lr: 0.000097. clip: 0.038913
Iteration 27166: Policy loss: 0.002195. Value loss: 3.118434. Entropy: 0.914821.
Iteration 27167: Policy loss: 0.001530.

Iteration 27225: Policy loss: 0.009996. Value loss: 1.678375. Entropy: 1.233376.
Training network. lr: 0.000097. clip: 0.038800
Iteration 27226: Policy loss: 0.005871. Value loss: 2.911541. Entropy: 1.168158.
Iteration 27227: Policy loss: 0.007121. Value loss: 1.898520. Entropy: 1.179527.
Iteration 27228: Policy loss: 0.008961. Value loss: 1.384581. Entropy: 1.168353.
episode: 10173   score: 495.0  epsilon: 1.0    steps: 804  evaluation reward: 335.6
Training network. lr: 0.000097. clip: 0.038800
Iteration 27229: Policy loss: 0.008877. Value loss: 4.408925. Entropy: 1.159999.
Iteration 27230: Policy loss: 0.006919. Value loss: 2.728046. Entropy: 1.177159.
Iteration 27231: Policy loss: 0.007460. Value loss: 1.860849. Entropy: 1.166920.
episode: 10174   score: 255.0  epsilon: 1.0    steps: 409  evaluation reward: 337.35
episode: 10175   score: 230.0  epsilon: 1.0    steps: 1001  evaluation reward: 335.3
Training network. lr: 0.000097. clip: 0.038800
Iteration 27232: Policy loss: 0.004889

Iteration 27291: Policy loss: -0.000071. Value loss: 1.698538. Entropy: 1.172242.
episode: 10195   score: 490.0  epsilon: 1.0    steps: 926  evaluation reward: 336.8
Training network. lr: 0.000097. clip: 0.038687
Iteration 27292: Policy loss: 0.005126. Value loss: 4.638548. Entropy: 1.188550.
Iteration 27293: Policy loss: 0.003257. Value loss: 2.880229. Entropy: 1.170709.
Iteration 27294: Policy loss: 0.001283. Value loss: 2.151808. Entropy: 1.179190.
Training network. lr: 0.000097. clip: 0.038687
Iteration 27295: Policy loss: 0.002488. Value loss: 4.034517. Entropy: 1.195965.
Iteration 27296: Policy loss: 0.001918. Value loss: 2.369623. Entropy: 1.172324.
Iteration 27297: Policy loss: -0.000341. Value loss: 1.915653. Entropy: 1.188640.
episode: 10196   score: 260.0  epsilon: 1.0    steps: 368  evaluation reward: 339.1
episode: 10197   score: 460.0  epsilon: 1.0    steps: 500  evaluation reward: 338.7
episode: 10198   score: 435.0  epsilon: 1.0    steps: 536  evaluation reward: 341.2
e

Iteration 27354: Policy loss: 0.002210. Value loss: 1.765233. Entropy: 1.070033.
episode: 10220   score: 320.0  epsilon: 1.0    steps: 661  evaluation reward: 346.45
Training network. lr: 0.000096. clip: 0.038463
Iteration 27355: Policy loss: 0.000291. Value loss: 2.331656. Entropy: 1.121040.
Iteration 27356: Policy loss: -0.001907. Value loss: 1.539907. Entropy: 1.132645.
Iteration 27357: Policy loss: -0.001108. Value loss: 1.179052. Entropy: 1.126327.
episode: 10221   score: 280.0  epsilon: 1.0    steps: 861  evaluation reward: 345.1
Training network. lr: 0.000096. clip: 0.038463
Iteration 27358: Policy loss: 0.003939. Value loss: 6.538485. Entropy: 1.235909.
Iteration 27359: Policy loss: 0.005165. Value loss: 4.802296. Entropy: 1.230514.
Iteration 27360: Policy loss: 0.006655. Value loss: 3.938270. Entropy: 1.237643.
episode: 10222   score: 280.0  epsilon: 1.0    steps: 489  evaluation reward: 345.3
episode: 10223   score: 405.0  epsilon: 1.0    steps: 960  evaluation reward: 342.1


Training network. lr: 0.000096. clip: 0.038350
Iteration 27418: Policy loss: -0.000033. Value loss: 2.600525. Entropy: 1.197542.
Iteration 27419: Policy loss: 0.000207. Value loss: 1.818740. Entropy: 1.183518.
Iteration 27420: Policy loss: -0.000057. Value loss: 1.381077. Entropy: 1.195181.
episode: 10244   score: 670.0  epsilon: 1.0    steps: 78  evaluation reward: 323.95
episode: 10245   score: 650.0  epsilon: 1.0    steps: 186  evaluation reward: 329.35
episode: 10246   score: 260.0  epsilon: 1.0    steps: 324  evaluation reward: 334.1
episode: 10247   score: 185.0  epsilon: 1.0    steps: 782  evaluation reward: 331.85
Training network. lr: 0.000096. clip: 0.038350
Iteration 27421: Policy loss: 0.005515. Value loss: 4.047038. Entropy: 0.988767.
Iteration 27422: Policy loss: 0.004371. Value loss: 3.008810. Entropy: 0.972116.
Iteration 27423: Policy loss: 0.003879. Value loss: 2.337996. Entropy: 1.001485.
episode: 10248   score: 635.0  epsilon: 1.0    steps: 988  evaluation reward: 32

Iteration 27479: Policy loss: 0.001643. Value loss: 1.978333. Entropy: 0.927538.
Iteration 27480: Policy loss: -0.001464. Value loss: 1.749056. Entropy: 0.932758.
episode: 10271   score: 210.0  epsilon: 1.0    steps: 195  evaluation reward: 324.15
Training network. lr: 0.000096. clip: 0.038238
Iteration 27481: Policy loss: 0.003143. Value loss: 2.476424. Entropy: 0.858058.
Iteration 27482: Policy loss: 0.000987. Value loss: 2.065466. Entropy: 0.841237.
Iteration 27483: Policy loss: -0.001333. Value loss: 1.788654. Entropy: 0.849530.
Training network. lr: 0.000096. clip: 0.038238
Iteration 27484: Policy loss: 0.002217. Value loss: 3.890816. Entropy: 0.822944.
Iteration 27485: Policy loss: 0.004068. Value loss: 2.387207. Entropy: 0.823804.
Iteration 27486: Policy loss: 0.001083. Value loss: 1.980871. Entropy: 0.831102.
Training network. lr: 0.000096. clip: 0.038238
Iteration 27487: Policy loss: 0.005092. Value loss: 2.872322. Entropy: 1.038220.
Iteration 27488: Policy loss: 0.005305. Val

Training network. lr: 0.000095. clip: 0.038125
Iteration 27544: Policy loss: 0.004273. Value loss: 2.688044. Entropy: 1.005276.
Iteration 27545: Policy loss: 0.002332. Value loss: 1.905121. Entropy: 1.005599.
Iteration 27546: Policy loss: 0.002210. Value loss: 1.379966. Entropy: 0.997602.
episode: 10294   score: 225.0  epsilon: 1.0    steps: 199  evaluation reward: 330.35
Training network. lr: 0.000095. clip: 0.038125
Iteration 27547: Policy loss: 0.003408. Value loss: 3.394167. Entropy: 1.078803.
Iteration 27548: Policy loss: 0.002536. Value loss: 2.506606. Entropy: 1.076797.
Iteration 27549: Policy loss: -0.000169. Value loss: 1.763116. Entropy: 1.063555.
episode: 10295   score: 380.0  epsilon: 1.0    steps: 548  evaluation reward: 329.45
Training network. lr: 0.000095. clip: 0.038125
Iteration 27550: Policy loss: 0.004126. Value loss: 2.975683. Entropy: 0.963070.
Iteration 27551: Policy loss: 0.002976. Value loss: 1.941534. Entropy: 0.976723.
Iteration 27552: Policy loss: 0.003192. 

episode: 10319   score: 190.0  epsilon: 1.0    steps: 445  evaluation reward: 314.55
Training network. lr: 0.000095. clip: 0.037900
Iteration 27607: Policy loss: 0.000322. Value loss: 2.822739. Entropy: 1.054150.
Iteration 27608: Policy loss: 0.002307. Value loss: 1.822117. Entropy: 1.040696.
Iteration 27609: Policy loss: -0.002233. Value loss: 1.394297. Entropy: 1.079645.
episode: 10320   score: 95.0  epsilon: 1.0    steps: 125  evaluation reward: 313.6
Training network. lr: 0.000095. clip: 0.037900
Iteration 27610: Policy loss: 0.000425. Value loss: 3.716314. Entropy: 1.260377.
Iteration 27611: Policy loss: 0.002395. Value loss: 2.316950. Entropy: 1.265973.
Iteration 27612: Policy loss: 0.003586. Value loss: 1.700450. Entropy: 1.270571.
episode: 10321   score: 400.0  epsilon: 1.0    steps: 304  evaluation reward: 311.35
episode: 10322   score: 285.0  epsilon: 1.0    steps: 990  evaluation reward: 312.55
Training network. lr: 0.000095. clip: 0.037900
Iteration 27613: Policy loss: 0.00

Iteration 27672: Policy loss: 0.002762. Value loss: 1.301006. Entropy: 1.352600.
episode: 10342   score: 310.0  epsilon: 1.0    steps: 43  evaluation reward: 315.25
episode: 10343   score: 360.0  epsilon: 1.0    steps: 194  evaluation reward: 315.95
episode: 10344   score: 185.0  epsilon: 1.0    steps: 309  evaluation reward: 314.9
episode: 10345   score: 225.0  epsilon: 1.0    steps: 868  evaluation reward: 310.05
Training network. lr: 0.000094. clip: 0.037788
Iteration 27673: Policy loss: 0.001548. Value loss: 2.726274. Entropy: 0.932394.
Iteration 27674: Policy loss: 0.007453. Value loss: 2.029805. Entropy: 0.947511.
Iteration 27675: Policy loss: 0.011556. Value loss: 1.516611. Entropy: 0.937102.
episode: 10346   score: 275.0  epsilon: 1.0    steps: 481  evaluation reward: 305.8
Training network. lr: 0.000094. clip: 0.037788
Iteration 27676: Policy loss: 0.001149. Value loss: 2.275940. Entropy: 1.005782.
Iteration 27677: Policy loss: 0.002889. Value loss: 1.746448. Entropy: 0.989643

episode: 10366   score: 420.0  epsilon: 1.0    steps: 334  evaluation reward: 296.75
episode: 10367   score: 470.0  epsilon: 1.0    steps: 532  evaluation reward: 298.6
episode: 10368   score: 390.0  epsilon: 1.0    steps: 761  evaluation reward: 300.55
Training network. lr: 0.000094. clip: 0.037675
Iteration 27736: Policy loss: 0.001571. Value loss: 5.778236. Entropy: 1.101396.
Iteration 27737: Policy loss: 0.003241. Value loss: 3.428800. Entropy: 1.091384.
Iteration 27738: Policy loss: 0.000056. Value loss: 2.783144. Entropy: 1.123925.
episode: 10369   score: 650.0  epsilon: 1.0    steps: 845  evaluation reward: 303.05
Training network. lr: 0.000094. clip: 0.037675
Iteration 27739: Policy loss: 0.002776. Value loss: 4.815560. Entropy: 0.948277.
Iteration 27740: Policy loss: 0.011616. Value loss: 3.065348. Entropy: 0.945963.
Iteration 27741: Policy loss: 0.010317. Value loss: 2.204199. Entropy: 0.950604.
episode: 10370   score: 285.0  epsilon: 1.0    steps: 927  evaluation reward: 306

episode: 10391   score: 345.0  epsilon: 1.0    steps: 606  evaluation reward: 304.45
episode: 10392   score: 250.0  epsilon: 1.0    steps: 966  evaluation reward: 304.65
Training network. lr: 0.000094. clip: 0.037563
Iteration 27799: Policy loss: 0.003379. Value loss: 3.514952. Entropy: 1.240210.
Iteration 27800: Policy loss: 0.006203. Value loss: 2.338486. Entropy: 1.275311.
Iteration 27801: Policy loss: 0.004908. Value loss: 1.920953. Entropy: 1.232194.
episode: 10393   score: 265.0  epsilon: 1.0    steps: 269  evaluation reward: 304.35
episode: 10394   score: 305.0  epsilon: 1.0    steps: 697  evaluation reward: 302.75
Training network. lr: 0.000094. clip: 0.037450
Iteration 27802: Policy loss: 0.003246. Value loss: 3.556631. Entropy: 0.951137.
Iteration 27803: Policy loss: 0.006628. Value loss: 2.518951. Entropy: 0.928602.
Iteration 27804: Policy loss: 0.007241. Value loss: 1.973649. Entropy: 0.915560.
episode: 10395   score: 330.0  epsilon: 1.0    steps: 10  evaluation reward: 303

Iteration 27861: Policy loss: -0.004018. Value loss: 1.607554. Entropy: 0.913806.
Training network. lr: 0.000093. clip: 0.037338
Iteration 27862: Policy loss: 0.002623. Value loss: 3.219593. Entropy: 1.024125.
Iteration 27863: Policy loss: 0.006226. Value loss: 2.261444. Entropy: 1.040562.
Iteration 27864: Policy loss: 0.005354. Value loss: 1.942780. Entropy: 1.037999.
episode: 10417   score: 290.0  epsilon: 1.0    steps: 210  evaluation reward: 317.6
episode: 10418   score: 165.0  epsilon: 1.0    steps: 415  evaluation reward: 318.7
Training network. lr: 0.000093. clip: 0.037338
Iteration 27865: Policy loss: -0.000182. Value loss: 2.114236. Entropy: 0.984045.
Iteration 27866: Policy loss: 0.001183. Value loss: 1.480965. Entropy: 1.016123.
Iteration 27867: Policy loss: -0.000245. Value loss: 1.266833. Entropy: 1.007355.
episode: 10419   score: 185.0  epsilon: 1.0    steps: 991  evaluation reward: 318.1
Training network. lr: 0.000093. clip: 0.037338
Iteration 27868: Policy loss: 0.00232

Iteration 27922: Policy loss: 0.000629. Value loss: 3.803903. Entropy: 0.746507.
Iteration 27923: Policy loss: 0.001059. Value loss: 2.708155. Entropy: 0.712022.
Iteration 27924: Policy loss: -0.003006. Value loss: 2.396703. Entropy: 0.714422.
Training network. lr: 0.000093. clip: 0.037225
Iteration 27925: Policy loss: -0.000168. Value loss: 2.466634. Entropy: 0.889680.
Iteration 27926: Policy loss: 0.000247. Value loss: 1.542095. Entropy: 0.897095.
Iteration 27927: Policy loss: 0.001393. Value loss: 1.480995. Entropy: 0.928160.
Training network. lr: 0.000093. clip: 0.037225
Iteration 27928: Policy loss: 0.008518. Value loss: 3.802514. Entropy: 1.115834.
Iteration 27929: Policy loss: 0.006966. Value loss: 2.084284. Entropy: 1.097970.
Iteration 27930: Policy loss: 0.003845. Value loss: 1.608384. Entropy: 1.105632.
episode: 10445   score: 475.0  epsilon: 1.0    steps: 803  evaluation reward: 313.85
Training network. lr: 0.000093. clip: 0.037225
Iteration 27931: Policy loss: 0.006984. Val

episode: 10467   score: 365.0  epsilon: 1.0    steps: 674  evaluation reward: 323.4
Training network. lr: 0.000093. clip: 0.037113
Iteration 27988: Policy loss: -0.000139. Value loss: 3.682517. Entropy: 1.252328.
Iteration 27989: Policy loss: 0.002022. Value loss: 2.419685. Entropy: 1.244375.
Iteration 27990: Policy loss: 0.000329. Value loss: 1.937634. Entropy: 1.248438.
episode: 10468   score: 345.0  epsilon: 1.0    steps: 527  evaluation reward: 322.35
Training network. lr: 0.000093. clip: 0.037113
Iteration 27991: Policy loss: 0.002237. Value loss: 3.014858. Entropy: 0.925200.
Iteration 27992: Policy loss: 0.004800. Value loss: 2.218131. Entropy: 0.929092.
Iteration 27993: Policy loss: 0.005529. Value loss: 1.511206. Entropy: 0.940832.
episode: 10469   score: 330.0  epsilon: 1.0    steps: 891  evaluation reward: 321.9
Training network. lr: 0.000093. clip: 0.037113
Iteration 27994: Policy loss: 0.001465. Value loss: 4.416487. Entropy: 1.193850.
Iteration 27995: Policy loss: 0.004507

Iteration 28048: Policy loss: 0.000769. Value loss: 2.486701. Entropy: 1.125560.
Iteration 28049: Policy loss: -0.000453. Value loss: 1.773912. Entropy: 1.147417.
Iteration 28050: Policy loss: 0.001542. Value loss: 1.360173. Entropy: 1.135796.
Training network. lr: 0.000092. clip: 0.036888
Iteration 28051: Policy loss: 0.001978. Value loss: 3.547908. Entropy: 1.119677.
Iteration 28052: Policy loss: -0.000394. Value loss: 2.286128. Entropy: 1.121100.
Iteration 28053: Policy loss: 0.002469. Value loss: 1.873062. Entropy: 1.125077.
episode: 10495   score: 125.0  epsilon: 1.0    steps: 482  evaluation reward: 306.95
episode: 10496   score: 345.0  epsilon: 1.0    steps: 814  evaluation reward: 304.9
Training network. lr: 0.000092. clip: 0.036888
Iteration 28054: Policy loss: 0.002295. Value loss: 4.045846. Entropy: 1.099275.
Iteration 28055: Policy loss: 0.001701. Value loss: 2.399906. Entropy: 1.118780.
Iteration 28056: Policy loss: 0.001252. Value loss: 2.152042. Entropy: 1.095837.
episod

episode: 10520   score: 510.0  epsilon: 1.0    steps: 351  evaluation reward: 295.5
episode: 10521   score: 215.0  epsilon: 1.0    steps: 760  evaluation reward: 298.5
Training network. lr: 0.000092. clip: 0.036775
Iteration 28111: Policy loss: 0.007158. Value loss: 4.469931. Entropy: 1.222722.
Iteration 28112: Policy loss: 0.008875. Value loss: 3.104902. Entropy: 1.222465.
Iteration 28113: Policy loss: 0.009387. Value loss: 2.560525. Entropy: 1.220735.
Training network. lr: 0.000092. clip: 0.036775
Iteration 28114: Policy loss: 0.002952. Value loss: 3.832596. Entropy: 0.973248.
Iteration 28115: Policy loss: 0.006385. Value loss: 2.659621. Entropy: 0.920180.
Iteration 28116: Policy loss: 0.000184. Value loss: 2.050885. Entropy: 0.960080.
episode: 10522   score: 320.0  epsilon: 1.0    steps: 392  evaluation reward: 298.25
episode: 10523   score: 120.0  epsilon: 1.0    steps: 638  evaluation reward: 297.8
Training network. lr: 0.000092. clip: 0.036775
Iteration 28117: Policy loss: 0.0048

Iteration 28171: Policy loss: 0.004543. Value loss: 3.130682. Entropy: 1.209649.
Iteration 28172: Policy loss: 0.002244. Value loss: 2.063289. Entropy: 1.209190.
Iteration 28173: Policy loss: -0.000434. Value loss: 1.653024. Entropy: 1.201043.
episode: 10548   score: 180.0  epsilon: 1.0    steps: 31  evaluation reward: 290.0
episode: 10549   score: 185.0  epsilon: 1.0    steps: 872  evaluation reward: 289.85
Training network. lr: 0.000092. clip: 0.036663
Iteration 28174: Policy loss: 0.002929. Value loss: 2.412539. Entropy: 0.973911.
Iteration 28175: Policy loss: 0.001600. Value loss: 1.543112. Entropy: 0.956305.
Iteration 28176: Policy loss: -0.000661. Value loss: 1.219820. Entropy: 0.982056.
episode: 10550   score: 195.0  epsilon: 1.0    steps: 422  evaluation reward: 288.5
Training network. lr: 0.000092. clip: 0.036663
Iteration 28177: Policy loss: 0.001013. Value loss: 2.647222. Entropy: 1.167741.
Iteration 28178: Policy loss: 0.001930. Value loss: 1.966591. Entropy: 1.165688.
Iter

episode: 10574   score: 315.0  epsilon: 1.0    steps: 573  evaluation reward: 274.3
Training network. lr: 0.000091. clip: 0.036550
Iteration 28234: Policy loss: 0.004339. Value loss: 2.888090. Entropy: 1.045623.
Iteration 28235: Policy loss: 0.003640. Value loss: 1.972826. Entropy: 1.058666.
Iteration 28236: Policy loss: 0.002204. Value loss: 1.623171. Entropy: 1.053479.
Training network. lr: 0.000091. clip: 0.036550
Iteration 28237: Policy loss: 0.002585. Value loss: 3.414291. Entropy: 1.188400.
Iteration 28238: Policy loss: 0.003711. Value loss: 2.578584. Entropy: 1.191352.
Iteration 28239: Policy loss: 0.003979. Value loss: 2.251223. Entropy: 1.168410.
episode: 10575   score: 190.0  epsilon: 1.0    steps: 971  evaluation reward: 275.1
Training network. lr: 0.000091. clip: 0.036550
Iteration 28240: Policy loss: 0.002692. Value loss: 3.439543. Entropy: 1.153102.
Iteration 28241: Policy loss: 0.005608. Value loss: 2.336394. Entropy: 1.098599.
Iteration 28242: Policy loss: 0.003994. Val

Training network. lr: 0.000091. clip: 0.036438
Iteration 28297: Policy loss: 0.004543. Value loss: 3.197160. Entropy: 1.186666.
Iteration 28298: Policy loss: 0.003887. Value loss: 1.799149. Entropy: 1.157597.
Iteration 28299: Policy loss: 0.003466. Value loss: 1.524561. Entropy: 1.158656.
episode: 10599   score: 330.0  epsilon: 1.0    steps: 168  evaluation reward: 276.6
episode: 10600   score: 360.0  epsilon: 1.0    steps: 867  evaluation reward: 276.0
Training network. lr: 0.000091. clip: 0.036438
Iteration 28300: Policy loss: 0.003031. Value loss: 4.348864. Entropy: 1.192227.
Iteration 28301: Policy loss: 0.002152. Value loss: 3.577408. Entropy: 1.227253.
Iteration 28302: Policy loss: 0.002777. Value loss: 2.749778. Entropy: 1.189273.
now time :  2019-02-23 09:22:06.177873
episode: 10601   score: 225.0  epsilon: 1.0    steps: 10  evaluation reward: 277.3
episode: 10602   score: 230.0  epsilon: 1.0    steps: 360  evaluation reward: 276.2
episode: 10603   score: 565.0  epsilon: 1.0   

Iteration 28359: Policy loss: 0.004298. Value loss: 2.676804. Entropy: 1.322210.
episode: 10625   score: 305.0  epsilon: 1.0    steps: 41  evaluation reward: 281.4
Training network. lr: 0.000091. clip: 0.036213
Iteration 28360: Policy loss: 0.001894. Value loss: 3.689387. Entropy: 1.251173.
Iteration 28361: Policy loss: 0.003326. Value loss: 2.263331. Entropy: 1.226653.
Iteration 28362: Policy loss: 0.001864. Value loss: 1.862699. Entropy: 1.240070.
Training network. lr: 0.000091. clip: 0.036213
Iteration 28363: Policy loss: 0.000818. Value loss: 4.778573. Entropy: 1.378661.
Iteration 28364: Policy loss: 0.002222. Value loss: 2.943126. Entropy: 1.386101.
Iteration 28365: Policy loss: -0.000059. Value loss: 2.400476. Entropy: 1.394134.
episode: 10626   score: 400.0  epsilon: 1.0    steps: 329  evaluation reward: 282.05
episode: 10627   score: 515.0  epsilon: 1.0    steps: 924  evaluation reward: 283.95
Training network. lr: 0.000091. clip: 0.036213
Iteration 28366: Policy loss: 0.001824

episode: 10649   score: 210.0  epsilon: 1.0    steps: 581  evaluation reward: 310.4
Training network. lr: 0.000090. clip: 0.036100
Iteration 28423: Policy loss: 0.003290. Value loss: 4.466074. Entropy: 1.179435.
Iteration 28424: Policy loss: 0.007413. Value loss: 3.076030. Entropy: 1.165154.
Iteration 28425: Policy loss: 0.003356. Value loss: 2.782105. Entropy: 1.172515.
episode: 10650   score: 210.0  epsilon: 1.0    steps: 877  evaluation reward: 310.65
Training network. lr: 0.000090. clip: 0.036100
Iteration 28426: Policy loss: 0.006736. Value loss: 2.471773. Entropy: 1.084282.
Iteration 28427: Policy loss: 0.006320. Value loss: 1.446907. Entropy: 1.095847.
Iteration 28428: Policy loss: 0.003784. Value loss: 1.244919. Entropy: 1.074899.
Training network. lr: 0.000090. clip: 0.036100
Iteration 28429: Policy loss: 0.007052. Value loss: 4.187778. Entropy: 1.081155.
Iteration 28430: Policy loss: 0.004596. Value loss: 2.530590. Entropy: 1.100504.
Iteration 28431: Policy loss: 0.003855. Va

episode: 10674   score: 180.0  epsilon: 1.0    steps: 202  evaluation reward: 309.7
Training network. lr: 0.000090. clip: 0.035988
Iteration 28486: Policy loss: 0.002728. Value loss: 2.944165. Entropy: 1.183165.
Iteration 28487: Policy loss: 0.000843. Value loss: 2.180697. Entropy: 1.180448.
Iteration 28488: Policy loss: 0.000278. Value loss: 1.691499. Entropy: 1.190311.
Training network. lr: 0.000090. clip: 0.035988
Iteration 28489: Policy loss: 0.002892. Value loss: 5.563981. Entropy: 1.338102.
Iteration 28490: Policy loss: 0.010858. Value loss: 3.254664. Entropy: 1.338194.
Iteration 28491: Policy loss: 0.008653. Value loss: 2.330561. Entropy: 1.345850.
episode: 10675   score: 350.0  epsilon: 1.0    steps: 584  evaluation reward: 308.35
episode: 10676   score: 305.0  epsilon: 1.0    steps: 903  evaluation reward: 309.95
Training network. lr: 0.000090. clip: 0.035988
Iteration 28492: Policy loss: 0.002844. Value loss: 3.512307. Entropy: 1.143684.
Iteration 28493: Policy loss: 0.004510

Iteration 28550: Policy loss: 0.013437. Value loss: 4.039085. Entropy: 1.243738.
Iteration 28551: Policy loss: 0.008077. Value loss: 3.248122. Entropy: 1.251153.
episode: 10698   score: 600.0  epsilon: 1.0    steps: 445  evaluation reward: 320.45
episode: 10699   score: 390.0  epsilon: 1.0    steps: 824  evaluation reward: 324.95
Training network. lr: 0.000089. clip: 0.035763
Iteration 28552: Policy loss: 0.004107. Value loss: 5.273517. Entropy: 1.066911.
Iteration 28553: Policy loss: 0.004076. Value loss: 3.146451. Entropy: 1.085427.
Iteration 28554: Policy loss: 0.006586. Value loss: 2.825390. Entropy: 1.058466.
Training network. lr: 0.000089. clip: 0.035763
Iteration 28555: Policy loss: 0.005644. Value loss: 5.308281. Entropy: 1.204730.
Iteration 28556: Policy loss: 0.005471. Value loss: 3.632864. Entropy: 1.225126.
Iteration 28557: Policy loss: 0.005213. Value loss: 2.712972. Entropy: 1.179536.
episode: 10700   score: 240.0  epsilon: 1.0    steps: 134  evaluation reward: 325.55
Tra

Iteration 28611: Policy loss: 0.002716. Value loss: 1.667039. Entropy: 0.845105.
episode: 10725   score: 335.0  epsilon: 1.0    steps: 298  evaluation reward: 315.45
episode: 10726   score: 315.0  epsilon: 1.0    steps: 614  evaluation reward: 315.75
Training network. lr: 0.000089. clip: 0.035650
Iteration 28612: Policy loss: 0.003174. Value loss: 3.899920. Entropy: 0.969318.
Iteration 28613: Policy loss: 0.004729. Value loss: 2.746222. Entropy: 0.974151.
Iteration 28614: Policy loss: 0.007676. Value loss: 2.465888. Entropy: 0.964635.
Training network. lr: 0.000089. clip: 0.035650
Iteration 28615: Policy loss: 0.003125. Value loss: 2.947357. Entropy: 1.070599.
Iteration 28616: Policy loss: 0.001835. Value loss: 2.087404. Entropy: 1.062078.
Iteration 28617: Policy loss: 0.000458. Value loss: 1.617885. Entropy: 1.047640.
episode: 10727   score: 240.0  epsilon: 1.0    steps: 880  evaluation reward: 314.9
episode: 10728   score: 210.0  epsilon: 1.0    steps: 1024  evaluation reward: 312.15

Iteration 28676: Policy loss: 0.002261. Value loss: 3.481226. Entropy: 1.367566.
Iteration 28677: Policy loss: 0.004318. Value loss: 2.762078. Entropy: 1.350652.
episode: 10748   score: 320.0  epsilon: 1.0    steps: 113  evaluation reward: 310.3
episode: 10749   score: 290.0  epsilon: 1.0    steps: 183  evaluation reward: 307.5
Training network. lr: 0.000089. clip: 0.035538
Iteration 28678: Policy loss: 0.003939. Value loss: 5.673341. Entropy: 1.270367.
Iteration 28679: Policy loss: 0.006614. Value loss: 3.890945. Entropy: 1.267883.
Iteration 28680: Policy loss: 0.007692. Value loss: 3.341241. Entropy: 1.269807.
episode: 10750   score: 430.0  epsilon: 1.0    steps: 377  evaluation reward: 308.3
now time :  2019-02-23 09:29:55.380038
episode: 10751   score: 180.0  epsilon: 1.0    steps: 875  evaluation reward: 310.5
Training network. lr: 0.000089. clip: 0.035538
Iteration 28681: Policy loss: 0.007195. Value loss: 3.811466. Entropy: 1.093702.
Iteration 28682: Policy loss: 0.008297. Value

Iteration 28740: Policy loss: 0.009846. Value loss: 2.900607. Entropy: 1.101910.
Training network. lr: 0.000089. clip: 0.035425
Iteration 28741: Policy loss: 0.008174. Value loss: 5.979471. Entropy: 1.191928.
Iteration 28742: Policy loss: 0.005432. Value loss: 3.135723. Entropy: 1.196861.
Iteration 28743: Policy loss: 0.005451. Value loss: 2.259995. Entropy: 1.209912.
episode: 10772   score: 360.0  epsilon: 1.0    steps: 6  evaluation reward: 314.05
Training network. lr: 0.000089. clip: 0.035425
Iteration 28744: Policy loss: 0.004121. Value loss: 5.284210. Entropy: 1.143550.
Iteration 28745: Policy loss: 0.003857. Value loss: 3.513972. Entropy: 1.182048.
Iteration 28746: Policy loss: 0.002108. Value loss: 2.729464. Entropy: 1.178884.
episode: 10773   score: 410.0  epsilon: 1.0    steps: 284  evaluation reward: 314.75
episode: 10774   score: 375.0  epsilon: 1.0    steps: 988  evaluation reward: 314.8
Training network. lr: 0.000089. clip: 0.035425
Iteration 28747: Policy loss: 0.003678. 

Iteration 28805: Policy loss: 0.003425. Value loss: 2.843046. Entropy: 1.229428.
Iteration 28806: Policy loss: 0.005577. Value loss: 2.548292. Entropy: 1.235193.
episode: 10795   score: 445.0  epsilon: 1.0    steps: 113  evaluation reward: 322.9
episode: 10796   score: 520.0  epsilon: 1.0    steps: 547  evaluation reward: 325.25
episode: 10797   score: 620.0  epsilon: 1.0    steps: 704  evaluation reward: 326.45
Training network. lr: 0.000088. clip: 0.035200
Iteration 28807: Policy loss: -0.001724. Value loss: 3.177875. Entropy: 0.987580.
Iteration 28808: Policy loss: -0.000814. Value loss: 2.210156. Entropy: 1.007919.
Iteration 28809: Policy loss: -0.000565. Value loss: 1.802639. Entropy: 0.988199.
episode: 10798   score: 220.0  epsilon: 1.0    steps: 205  evaluation reward: 331.15
Training network. lr: 0.000088. clip: 0.035200
Iteration 28810: Policy loss: 0.003354. Value loss: 2.901270. Entropy: 0.896208.
Iteration 28811: Policy loss: 0.002804. Value loss: 1.765863. Entropy: 0.89128

Iteration 28868: Policy loss: 0.002571. Value loss: 3.954707. Entropy: 1.210685.
Iteration 28869: Policy loss: 0.000348. Value loss: 3.067517. Entropy: 1.201034.
episode: 10820   score: 520.0  epsilon: 1.0    steps: 379  evaluation reward: 327.95
episode: 10821   score: 320.0  epsilon: 1.0    steps: 883  evaluation reward: 329.45
Training network. lr: 0.000088. clip: 0.035088
Iteration 28870: Policy loss: 0.004806. Value loss: 5.750593. Entropy: 1.122617.
Iteration 28871: Policy loss: 0.008546. Value loss: 3.216753. Entropy: 1.119478.
Iteration 28872: Policy loss: 0.010153. Value loss: 2.479047. Entropy: 1.084966.
episode: 10822   score: 250.0  epsilon: 1.0    steps: 527  evaluation reward: 330.5
Training network. lr: 0.000088. clip: 0.035088
Iteration 28873: Policy loss: 0.000225. Value loss: 2.551579. Entropy: 1.019409.
Iteration 28874: Policy loss: 0.000436. Value loss: 1.612524. Entropy: 1.014578.
Iteration 28875: Policy loss: -0.001950. Value loss: 1.374645. Entropy: 1.025841.
epi

episode: 10843   score: 475.0  epsilon: 1.0    steps: 526  evaluation reward: 323.2
episode: 10844   score: 550.0  epsilon: 1.0    steps: 968  evaluation reward: 326.15
Training network. lr: 0.000087. clip: 0.034975
Iteration 28933: Policy loss: 0.002295. Value loss: 2.627829. Entropy: 1.174120.
Iteration 28934: Policy loss: 0.002350. Value loss: 2.144251. Entropy: 1.179335.
Iteration 28935: Policy loss: 0.000224. Value loss: 1.767461. Entropy: 1.174752.
Training network. lr: 0.000087. clip: 0.034975
Iteration 28936: Policy loss: 0.001853. Value loss: 2.986911. Entropy: 1.224723.
Iteration 28937: Policy loss: -0.000167. Value loss: 1.964981. Entropy: 1.207158.
Iteration 28938: Policy loss: -0.002175. Value loss: 1.322362. Entropy: 1.230475.
Training network. lr: 0.000087. clip: 0.034975
Iteration 28939: Policy loss: 0.004186. Value loss: 3.864110. Entropy: 1.296980.
Iteration 28940: Policy loss: 0.005737. Value loss: 2.620086. Entropy: 1.305506.
Iteration 28941: Policy loss: 0.002078. 

episode: 10868   score: 290.0  epsilon: 1.0    steps: 637  evaluation reward: 332.5
episode: 10869   score: 305.0  epsilon: 1.0    steps: 873  evaluation reward: 330.55
Training network. lr: 0.000087. clip: 0.034863
Iteration 28996: Policy loss: 0.005120. Value loss: 3.499274. Entropy: 1.105919.
Iteration 28997: Policy loss: 0.003343. Value loss: 2.174938. Entropy: 1.115130.
Iteration 28998: Policy loss: -0.001018. Value loss: 1.789002. Entropy: 1.101371.
Training network. lr: 0.000087. clip: 0.034863
Iteration 28999: Policy loss: 0.003629. Value loss: 2.705104. Entropy: 1.000899.
Iteration 29000: Policy loss: 0.008411. Value loss: 1.817057. Entropy: 1.026258.
Iteration 29001: Policy loss: 0.001215. Value loss: 1.511176. Entropy: 0.992850.
Training network. lr: 0.000087. clip: 0.034750
Iteration 29002: Policy loss: 0.003151. Value loss: 3.796525. Entropy: 1.140040.
Iteration 29003: Policy loss: 0.001658. Value loss: 2.463727. Entropy: 1.124124.
Iteration 29004: Policy loss: 0.000189. V

Iteration 29060: Policy loss: 0.006902. Value loss: 2.628522. Entropy: 1.265631.
Iteration 29061: Policy loss: 0.010811. Value loss: 2.221471. Entropy: 1.269393.
Training network. lr: 0.000087. clip: 0.034638
Iteration 29062: Policy loss: 0.003762. Value loss: 3.786825. Entropy: 1.338972.
Iteration 29063: Policy loss: 0.001652. Value loss: 2.569957. Entropy: 1.340301.
Iteration 29064: Policy loss: -0.001932. Value loss: 2.216096. Entropy: 1.334157.
episode: 10892   score: 445.0  epsilon: 1.0    steps: 56  evaluation reward: 316.2
episode: 10893   score: 180.0  epsilon: 1.0    steps: 267  evaluation reward: 315.9
Training network. lr: 0.000087. clip: 0.034638
Iteration 29065: Policy loss: 0.002916. Value loss: 3.057002. Entropy: 1.212053.
Iteration 29066: Policy loss: 0.008237. Value loss: 2.228536. Entropy: 1.226017.
Iteration 29067: Policy loss: 0.004342. Value loss: 1.801856. Entropy: 1.206014.
episode: 10894   score: 500.0  epsilon: 1.0    steps: 610  evaluation reward: 313.5
Traini

Iteration 29124: Policy loss: 0.002624. Value loss: 1.805470. Entropy: 1.142906.
Training network. lr: 0.000086. clip: 0.034525
Iteration 29125: Policy loss: 0.002179. Value loss: 3.563580. Entropy: 1.369457.
Iteration 29126: Policy loss: 0.003259. Value loss: 2.560352. Entropy: 1.379071.
Iteration 29127: Policy loss: 0.000703. Value loss: 1.977429. Entropy: 1.375785.
episode: 10916   score: 195.0  epsilon: 1.0    steps: 239  evaluation reward: 321.3
episode: 10917   score: 180.0  epsilon: 1.0    steps: 408  evaluation reward: 321.05
episode: 10918   score: 190.0  epsilon: 1.0    steps: 585  evaluation reward: 317.9
episode: 10919   score: 450.0  epsilon: 1.0    steps: 1012  evaluation reward: 318.0
Training network. lr: 0.000086. clip: 0.034525
Iteration 29128: Policy loss: 0.004796. Value loss: 4.016628. Entropy: 1.247315.
Iteration 29129: Policy loss: 0.007160. Value loss: 2.525906. Entropy: 1.274459.
Iteration 29130: Policy loss: 0.002615. Value loss: 1.782107. Entropy: 1.276222.
e

Iteration 29187: Policy loss: 0.001693. Value loss: 2.155193. Entropy: 1.176054.
episode: 10941   score: 285.0  epsilon: 1.0    steps: 180  evaluation reward: 312.8
Training network. lr: 0.000086. clip: 0.034413
Iteration 29188: Policy loss: 0.004165. Value loss: 5.687877. Entropy: 1.190075.
Iteration 29189: Policy loss: 0.002200. Value loss: 3.828263. Entropy: 1.189318.
Iteration 29190: Policy loss: 0.000903. Value loss: 3.279562. Entropy: 1.194495.
episode: 10942   score: 180.0  epsilon: 1.0    steps: 372  evaluation reward: 311.7
episode: 10943   score: 430.0  epsilon: 1.0    steps: 460  evaluation reward: 311.75
episode: 10944   score: 385.0  epsilon: 1.0    steps: 788  evaluation reward: 311.3
Training network. lr: 0.000086. clip: 0.034413
Iteration 29191: Policy loss: 0.002348. Value loss: 4.468504. Entropy: 1.140506.
Iteration 29192: Policy loss: 0.004932. Value loss: 3.085714. Entropy: 1.156528.
Iteration 29193: Policy loss: 0.002762. Value loss: 2.553540. Entropy: 1.137869.
ep

episode: 10968   score: 310.0  epsilon: 1.0    steps: 164  evaluation reward: 300.45
Training network. lr: 0.000086. clip: 0.034300
Iteration 29248: Policy loss: 0.003134. Value loss: 2.827866. Entropy: 1.317965.
Iteration 29249: Policy loss: 0.004506. Value loss: 2.249296. Entropy: 1.305318.
Iteration 29250: Policy loss: 0.009511. Value loss: 1.826552. Entropy: 1.303028.
episode: 10969   score: 190.0  epsilon: 1.0    steps: 872  evaluation reward: 300.65
Training network. lr: 0.000085. clip: 0.034188
Iteration 29251: Policy loss: 0.001828. Value loss: 4.554092. Entropy: 1.285318.
Iteration 29252: Policy loss: -0.000268. Value loss: 3.146494. Entropy: 1.295676.
Iteration 29253: Policy loss: 0.000760. Value loss: 2.426407. Entropy: 1.292778.
episode: 10970   score: 285.0  epsilon: 1.0    steps: 80  evaluation reward: 299.5
episode: 10971   score: 180.0  epsilon: 1.0    steps: 381  evaluation reward: 300.6
episode: 10972   score: 200.0  epsilon: 1.0    steps: 488  evaluation reward: 297.

Iteration 29308: Policy loss: 0.002217. Value loss: 2.990672. Entropy: 1.158815.
Iteration 29309: Policy loss: 0.001789. Value loss: 1.962289. Entropy: 1.131742.
Iteration 29310: Policy loss: 0.000691. Value loss: 1.617197. Entropy: 1.139071.
episode: 10996   score: 125.0  epsilon: 1.0    steps: 41  evaluation reward: 282.6
episode: 10997   score: 295.0  epsilon: 1.0    steps: 300  evaluation reward: 281.6
Training network. lr: 0.000085. clip: 0.034075
Iteration 29311: Policy loss: 0.002668. Value loss: 3.372577. Entropy: 1.064394.
Iteration 29312: Policy loss: 0.004977. Value loss: 2.268414. Entropy: 1.052731.
Iteration 29313: Policy loss: 0.002433. Value loss: 2.086016. Entropy: 1.049426.
episode: 10998   score: 210.0  epsilon: 1.0    steps: 864  evaluation reward: 280.0
Training network. lr: 0.000085. clip: 0.034075
Iteration 29314: Policy loss: 0.002889. Value loss: 2.422162. Entropy: 1.126375.
Iteration 29315: Policy loss: 0.003952. Value loss: 1.504755. Entropy: 1.133265.
Iterati

Iteration 29371: Policy loss: 0.003605. Value loss: 6.182277. Entropy: 1.232201.
Iteration 29372: Policy loss: 0.010161. Value loss: 4.408231. Entropy: 1.220065.
Iteration 29373: Policy loss: 0.010871. Value loss: 3.573681. Entropy: 1.210501.
episode: 11021   score: 210.0  epsilon: 1.0    steps: 965  evaluation reward: 277.15
Training network. lr: 0.000085. clip: 0.033962
Iteration 29374: Policy loss: 0.002526. Value loss: 6.015060. Entropy: 1.169629.
Iteration 29375: Policy loss: 0.006619. Value loss: 4.036854. Entropy: 1.189724.
Iteration 29376: Policy loss: 0.005162. Value loss: 3.295872. Entropy: 1.170673.
episode: 11022   score: 180.0  epsilon: 1.0    steps: 394  evaluation reward: 276.85
Training network. lr: 0.000085. clip: 0.033962
Iteration 29377: Policy loss: 0.003811. Value loss: 3.226113. Entropy: 1.198443.
Iteration 29378: Policy loss: 0.005741. Value loss: 2.348400. Entropy: 1.213176.
Iteration 29379: Policy loss: 0.005643. Value loss: 1.901301. Entropy: 1.195101.
episode

Iteration 29435: Policy loss: 0.003274. Value loss: 1.893820. Entropy: 1.229464.
Iteration 29436: Policy loss: 0.001503. Value loss: 1.496882. Entropy: 1.223875.
episode: 11045   score: 185.0  epsilon: 1.0    steps: 569  evaluation reward: 284.6
Training network. lr: 0.000085. clip: 0.033850
Iteration 29437: Policy loss: 0.002187. Value loss: 4.290815. Entropy: 1.331685.
Iteration 29438: Policy loss: 0.006325. Value loss: 3.457426. Entropy: 1.318527.
Iteration 29439: Policy loss: 0.007372. Value loss: 2.495702. Entropy: 1.326645.
episode: 11046   score: 565.0  epsilon: 1.0    steps: 239  evaluation reward: 284.65
Training network. lr: 0.000085. clip: 0.033850
Iteration 29440: Policy loss: 0.007437. Value loss: 7.742820. Entropy: 1.374719.
Iteration 29441: Policy loss: 0.005835. Value loss: 5.345329. Entropy: 1.366937.
Iteration 29442: Policy loss: 0.012214. Value loss: 3.626478. Entropy: 1.367342.
Training network. lr: 0.000085. clip: 0.033850
Iteration 29443: Policy loss: 0.000845. Va

Training network. lr: 0.000084. clip: 0.033738
Iteration 29500: Policy loss: 0.000943. Value loss: 6.372995. Entropy: 1.351257.
Iteration 29501: Policy loss: 0.005888. Value loss: 4.777320. Entropy: 1.340153.
Iteration 29502: Policy loss: 0.001602. Value loss: 3.976276. Entropy: 1.342494.
episode: 11068   score: 175.0  epsilon: 1.0    steps: 139  evaluation reward: 305.9
episode: 11069   score: 365.0  epsilon: 1.0    steps: 508  evaluation reward: 304.55
episode: 11070   score: 315.0  epsilon: 1.0    steps: 712  evaluation reward: 306.3
episode: 11071   score: 80.0  epsilon: 1.0    steps: 894  evaluation reward: 306.6
Training network. lr: 0.000084. clip: 0.033625
Iteration 29503: Policy loss: 0.005007. Value loss: 6.085585. Entropy: 1.200396.
Iteration 29504: Policy loss: 0.005673. Value loss: 4.320504. Entropy: 1.198322.
Iteration 29505: Policy loss: 0.007128. Value loss: 3.201725. Entropy: 1.190309.
episode: 11072   score: 275.0  epsilon: 1.0    steps: 381  evaluation reward: 305.6


Training network. lr: 0.000084. clip: 0.033513
Iteration 29563: Policy loss: 0.001965. Value loss: 4.606858. Entropy: 1.305212.
Iteration 29564: Policy loss: 0.002482. Value loss: 3.244793. Entropy: 1.309688.
Iteration 29565: Policy loss: 0.003215. Value loss: 2.251662. Entropy: 1.308562.
episode: 11093   score: 510.0  epsilon: 1.0    steps: 10  evaluation reward: 321.2
episode: 11094   score: 345.0  epsilon: 1.0    steps: 448  evaluation reward: 323.05
Training network. lr: 0.000084. clip: 0.033513
Iteration 29566: Policy loss: 0.003571. Value loss: 3.094961. Entropy: 1.189348.
Iteration 29567: Policy loss: 0.004409. Value loss: 2.264629. Entropy: 1.199209.
Iteration 29568: Policy loss: 0.002110. Value loss: 1.846829. Entropy: 1.193861.
episode: 11095   score: 370.0  epsilon: 1.0    steps: 252  evaluation reward: 324.6
Training network. lr: 0.000084. clip: 0.033513
Iteration 29569: Policy loss: 0.009248. Value loss: 3.875371. Entropy: 1.305520.
Iteration 29570: Policy loss: 0.009566. 

Training network. lr: 0.000083. clip: 0.033400
Iteration 29626: Policy loss: 0.004531. Value loss: 4.549809. Entropy: 1.263636.
Iteration 29627: Policy loss: 0.003410. Value loss: 3.160276. Entropy: 1.246512.
Iteration 29628: Policy loss: 0.002475. Value loss: 2.754027. Entropy: 1.267535.
Training network. lr: 0.000083. clip: 0.033400
Iteration 29629: Policy loss: 0.006526. Value loss: 8.638297. Entropy: 1.244697.
Iteration 29630: Policy loss: 0.016161. Value loss: 4.552818. Entropy: 1.240341.
Iteration 29631: Policy loss: 0.010229. Value loss: 4.018625. Entropy: 1.231792.
episode: 11118   score: 895.0  epsilon: 1.0    steps: 52  evaluation reward: 326.55
episode: 11119   score: 215.0  epsilon: 1.0    steps: 206  evaluation reward: 331.85
Training network. lr: 0.000083. clip: 0.033400
Iteration 29632: Policy loss: 0.003585. Value loss: 3.671025. Entropy: 1.257251.
Iteration 29633: Policy loss: 0.005935. Value loss: 3.190615. Entropy: 1.265591.
Iteration 29634: Policy loss: 0.007596. Va

Iteration 29687: Policy loss: 0.001450. Value loss: 3.024868. Entropy: 1.235295.
Iteration 29688: Policy loss: 0.003158. Value loss: 2.583884. Entropy: 1.236716.
episode: 11145   score: 185.0  epsilon: 1.0    steps: 505  evaluation reward: 309.95
episode: 11146   score: 385.0  epsilon: 1.0    steps: 632  evaluation reward: 309.95
Training network. lr: 0.000083. clip: 0.033288
Iteration 29689: Policy loss: 0.000684. Value loss: 3.276663. Entropy: 1.278594.
Iteration 29690: Policy loss: -0.000108. Value loss: 2.177311. Entropy: 1.261557.
Iteration 29691: Policy loss: 0.002980. Value loss: 1.626841. Entropy: 1.263810.
Training network. lr: 0.000083. clip: 0.033288
Iteration 29692: Policy loss: 0.002878. Value loss: 2.187024. Entropy: 1.189050.
Iteration 29693: Policy loss: 0.004804. Value loss: 1.517529. Entropy: 1.215425.
Iteration 29694: Policy loss: 0.002929. Value loss: 1.214754. Entropy: 1.186331.
Training network. lr: 0.000083. clip: 0.033288
Iteration 29695: Policy loss: 0.003243. 

Iteration 29752: Policy loss: 0.002074. Value loss: 4.097557. Entropy: 1.276206.
Iteration 29753: Policy loss: 0.002341. Value loss: 3.021971. Entropy: 1.283654.
Iteration 29754: Policy loss: 0.001628. Value loss: 2.289696. Entropy: 1.266235.
episode: 11167   score: 185.0  epsilon: 1.0    steps: 329  evaluation reward: 287.25
episode: 11168   score: 150.0  epsilon: 1.0    steps: 394  evaluation reward: 284.4
episode: 11169   score: 430.0  epsilon: 1.0    steps: 624  evaluation reward: 284.15
episode: 11170   score: 225.0  epsilon: 1.0    steps: 853  evaluation reward: 284.8
Training network. lr: 0.000083. clip: 0.033063
Iteration 29755: Policy loss: 0.003338. Value loss: 4.535810. Entropy: 1.061759.
Iteration 29756: Policy loss: 0.006191. Value loss: 2.943694. Entropy: 1.077379.
Iteration 29757: Policy loss: 0.002968. Value loss: 2.298557. Entropy: 1.082839.
episode: 11171   score: 135.0  epsilon: 1.0    steps: 233  evaluation reward: 283.9
episode: 11172   score: 290.0  epsilon: 1.0  

Iteration 29813: Policy loss: 0.009745. Value loss: 4.123646. Entropy: 1.188373.
Iteration 29814: Policy loss: 0.004803. Value loss: 3.265711. Entropy: 1.199594.
episode: 11195   score: 300.0  epsilon: 1.0    steps: 449  evaluation reward: 275.05
Training network. lr: 0.000082. clip: 0.032950
Iteration 29815: Policy loss: 0.003199. Value loss: 3.854320. Entropy: 1.191167.
Iteration 29816: Policy loss: 0.011324. Value loss: 2.458180. Entropy: 1.201765.
Iteration 29817: Policy loss: 0.008172. Value loss: 1.983917. Entropy: 1.193484.
episode: 11196   score: 370.0  epsilon: 1.0    steps: 148  evaluation reward: 274.35
Training network. lr: 0.000082. clip: 0.032950
Iteration 29818: Policy loss: 0.004105. Value loss: 4.018688. Entropy: 1.265915.
Iteration 29819: Policy loss: 0.002708. Value loss: 2.660674. Entropy: 1.283034.
Iteration 29820: Policy loss: 0.001089. Value loss: 2.092292. Entropy: 1.279143.
episode: 11197   score: 90.0  epsilon: 1.0    steps: 587  evaluation reward: 274.5
Train

Iteration 29877: Policy loss: 0.008616. Value loss: 1.962193. Entropy: 1.258676.
episode: 11219   score: 245.0  epsilon: 1.0    steps: 430  evaluation reward: 263.45
episode: 11220   score: 335.0  epsilon: 1.0    steps: 632  evaluation reward: 263.75
Training network. lr: 0.000082. clip: 0.032837
Iteration 29878: Policy loss: 0.003075. Value loss: 4.702371. Entropy: 1.191150.
Iteration 29879: Policy loss: 0.005555. Value loss: 3.440221. Entropy: 1.220268.
Iteration 29880: Policy loss: 0.007383. Value loss: 2.600724. Entropy: 1.213206.
episode: 11221   score: 335.0  epsilon: 1.0    steps: 60  evaluation reward: 262.2
episode: 11222   score: 290.0  epsilon: 1.0    steps: 830  evaluation reward: 262.3
episode: 11223   score: 280.0  epsilon: 1.0    steps: 954  evaluation reward: 262.05
Training network. lr: 0.000082. clip: 0.032837
Iteration 29881: Policy loss: 0.001698. Value loss: 2.298791. Entropy: 1.035524.
Iteration 29882: Policy loss: 0.001435. Value loss: 1.845291. Entropy: 1.027510

Iteration 29940: Policy loss: 0.009158. Value loss: 4.890034. Entropy: 1.158962.
Training network. lr: 0.000082. clip: 0.032725
Iteration 29941: Policy loss: 0.001132. Value loss: 7.168963. Entropy: 1.205601.
Iteration 29942: Policy loss: 0.005274. Value loss: 4.833342. Entropy: 1.189647.
Iteration 29943: Policy loss: 0.007882. Value loss: 3.321702. Entropy: 1.209279.
episode: 11244   score: 545.0  epsilon: 1.0    steps: 928  evaluation reward: 287.4
Training network. lr: 0.000082. clip: 0.032725
Iteration 29944: Policy loss: 0.008442. Value loss: 8.934764. Entropy: 1.264482.
Iteration 29945: Policy loss: 0.012355. Value loss: 6.344247. Entropy: 1.244763.
Iteration 29946: Policy loss: 0.013299. Value loss: 5.097500. Entropy: 1.252866.
episode: 11245   score: 415.0  epsilon: 1.0    steps: 88  evaluation reward: 289.8
episode: 11246   score: 160.0  epsilon: 1.0    steps: 828  evaluation reward: 292.1
Training network. lr: 0.000082. clip: 0.032725
Iteration 29947: Policy loss: 0.007799. V

Iteration 30003: Policy loss: 0.002369. Value loss: 1.262120. Entropy: 1.322057.
episode: 11269   score: 280.0  epsilon: 1.0    steps: 828  evaluation reward: 296.0
Training network. lr: 0.000081. clip: 0.032500
Iteration 30004: Policy loss: 0.005730. Value loss: 3.002085. Entropy: 1.188540.
Iteration 30005: Policy loss: 0.002104. Value loss: 1.839538. Entropy: 1.176505.
Iteration 30006: Policy loss: 0.002683. Value loss: 1.558091. Entropy: 1.206163.
Training network. lr: 0.000081. clip: 0.032500
Iteration 30007: Policy loss: 0.004040. Value loss: 2.905601. Entropy: 1.347769.
Iteration 30008: Policy loss: 0.004028. Value loss: 1.927359. Entropy: 1.350228.
Iteration 30009: Policy loss: 0.007842. Value loss: 1.451640. Entropy: 1.345922.
episode: 11270   score: 220.0  epsilon: 1.0    steps: 191  evaluation reward: 294.5
episode: 11271   score: 395.0  epsilon: 1.0    steps: 455  evaluation reward: 294.45
episode: 11272   score: 155.0  epsilon: 1.0    steps: 619  evaluation reward: 297.05
T

Iteration 30066: Policy loss: 0.001297. Value loss: 2.541037. Entropy: 1.302508.
episode: 11294   score: 225.0  epsilon: 1.0    steps: 203  evaluation reward: 297.75
episode: 11295   score: 440.0  epsilon: 1.0    steps: 1000  evaluation reward: 291.95
Training network. lr: 0.000081. clip: 0.032387
Iteration 30067: Policy loss: 0.004416. Value loss: 4.769301. Entropy: 1.273795.
Iteration 30068: Policy loss: 0.005925. Value loss: 2.929365. Entropy: 1.277081.
Iteration 30069: Policy loss: 0.002623. Value loss: 2.397785. Entropy: 1.264797.
episode: 11296   score: 385.0  epsilon: 1.0    steps: 507  evaluation reward: 293.35
episode: 11297   score: 175.0  epsilon: 1.0    steps: 645  evaluation reward: 293.5
Training network. lr: 0.000081. clip: 0.032387
Iteration 30070: Policy loss: 0.001572. Value loss: 6.802842. Entropy: 1.190871.
Iteration 30071: Policy loss: 0.003835. Value loss: 4.732667. Entropy: 1.181820.
Iteration 30072: Policy loss: 0.006747. Value loss: 4.061563. Entropy: 1.159595.

episode: 11319   score: 395.0  epsilon: 1.0    steps: 808  evaluation reward: 298.7
Training network. lr: 0.000081. clip: 0.032275
Iteration 30130: Policy loss: 0.002236. Value loss: 8.486310. Entropy: 1.181466.
Iteration 30131: Policy loss: 0.004010. Value loss: 5.785229. Entropy: 1.186575.
Iteration 30132: Policy loss: 0.001757. Value loss: 4.335158. Entropy: 1.178206.
episode: 11320   score: 315.0  epsilon: 1.0    steps: 963  evaluation reward: 300.2
Training network. lr: 0.000081. clip: 0.032275
Iteration 30133: Policy loss: 0.003692. Value loss: 4.329759. Entropy: 1.169560.
Iteration 30134: Policy loss: 0.005193. Value loss: 2.677589. Entropy: 1.170525.
Iteration 30135: Policy loss: 0.002430. Value loss: 2.231311. Entropy: 1.177608.
episode: 11321   score: 635.0  epsilon: 1.0    steps: 538  evaluation reward: 300.0
Training network. lr: 0.000081. clip: 0.032275
Iteration 30136: Policy loss: 0.004063. Value loss: 2.590463. Entropy: 1.215046.
Iteration 30137: Policy loss: 0.002648. 

episode: 11344   score: 515.0  epsilon: 1.0    steps: 891  evaluation reward: 293.3
Training network. lr: 0.000080. clip: 0.032163
Iteration 30193: Policy loss: 0.004965. Value loss: 5.004192. Entropy: 1.236228.
Iteration 30194: Policy loss: 0.005796. Value loss: 3.243192. Entropy: 1.235520.
Iteration 30195: Policy loss: 0.001874. Value loss: 2.549420. Entropy: 1.216546.
episode: 11345   score: 175.0  epsilon: 1.0    steps: 997  evaluation reward: 293.0
Training network. lr: 0.000080. clip: 0.032163
Iteration 30196: Policy loss: 0.001004. Value loss: 4.013227. Entropy: 1.095677.
Iteration 30197: Policy loss: 0.002454. Value loss: 2.776505. Entropy: 1.095432.
Iteration 30198: Policy loss: 0.000255. Value loss: 2.213395. Entropy: 1.102996.
episode: 11346   score: 215.0  epsilon: 1.0    steps: 238  evaluation reward: 290.6
episode: 11347   score: 340.0  epsilon: 1.0    steps: 667  evaluation reward: 291.15
Training network. lr: 0.000080. clip: 0.032163
Iteration 30199: Policy loss: 0.0014

episode: 11369   score: 275.0  epsilon: 1.0    steps: 351  evaluation reward: 281.7
episode: 11370   score: 315.0  epsilon: 1.0    steps: 925  evaluation reward: 281.65
Training network. lr: 0.000080. clip: 0.031938
Iteration 30256: Policy loss: 0.003601. Value loss: 4.701484. Entropy: 1.128308.
Iteration 30257: Policy loss: 0.002883. Value loss: 3.531459. Entropy: 1.109501.
Iteration 30258: Policy loss: 0.002133. Value loss: 3.124669. Entropy: 1.129378.
Training network. lr: 0.000080. clip: 0.031938
Iteration 30259: Policy loss: 0.006737. Value loss: 7.016162. Entropy: 1.212991.
Iteration 30260: Policy loss: 0.013814. Value loss: 4.513094. Entropy: 1.213398.
Iteration 30261: Policy loss: 0.009292. Value loss: 3.683583. Entropy: 1.205742.
episode: 11371   score: 765.0  epsilon: 1.0    steps: 835  evaluation reward: 282.6
Training network. lr: 0.000080. clip: 0.031938
Iteration 30262: Policy loss: -0.001113. Value loss: 5.704727. Entropy: 1.303334.
Iteration 30263: Policy loss: -0.00142

Iteration 30316: Policy loss: 0.003448. Value loss: 4.913442. Entropy: 1.143254.
Iteration 30317: Policy loss: 0.005787. Value loss: 3.190662. Entropy: 1.157723.
Iteration 30318: Policy loss: 0.003929. Value loss: 2.669047. Entropy: 1.139184.
Training network. lr: 0.000080. clip: 0.031825
Iteration 30319: Policy loss: 0.004330. Value loss: 2.847298. Entropy: 1.351860.
Iteration 30320: Policy loss: 0.006927. Value loss: 1.993587. Entropy: 1.361123.
Iteration 30321: Policy loss: 0.002343. Value loss: 1.576133. Entropy: 1.355060.
Training network. lr: 0.000080. clip: 0.031825
Iteration 30322: Policy loss: 0.000480. Value loss: 3.834592. Entropy: 1.366544.
Iteration 30323: Policy loss: 0.001685. Value loss: 2.764980. Entropy: 1.363326.
Iteration 30324: Policy loss: 0.001827. Value loss: 2.272407. Entropy: 1.375554.
episode: 11397   score: 260.0  epsilon: 1.0    steps: 110  evaluation reward: 285.55
episode: 11398   score: 320.0  epsilon: 1.0    steps: 741  evaluation reward: 286.4
Training

Iteration 30376: Policy loss: 0.005164. Value loss: 2.650309. Entropy: 0.981294.
Iteration 30377: Policy loss: 0.009998. Value loss: 1.922623. Entropy: 0.975671.
Iteration 30378: Policy loss: 0.006133. Value loss: 1.509508. Entropy: 0.974573.
episode: 11425   score: 180.0  epsilon: 1.0    steps: 205  evaluation reward: 268.85
Training network. lr: 0.000079. clip: 0.031713
Iteration 30379: Policy loss: 0.004849. Value loss: 3.102765. Entropy: 1.197557.
Iteration 30380: Policy loss: 0.006514. Value loss: 2.136764. Entropy: 1.199860.
Iteration 30381: Policy loss: 0.006161. Value loss: 1.833221. Entropy: 1.186852.
Training network. lr: 0.000079. clip: 0.031713
Iteration 30382: Policy loss: 0.006083. Value loss: 2.859475. Entropy: 1.280802.
Iteration 30383: Policy loss: 0.008068. Value loss: 1.864553. Entropy: 1.262320.
Iteration 30384: Policy loss: 0.004442. Value loss: 1.435986. Entropy: 1.294820.
episode: 11426   score: 180.0  epsilon: 1.0    steps: 982  evaluation reward: 268.3
Training

now time :  2019-02-23 10:06:05.204388
episode: 11451   score: 165.0  epsilon: 1.0    steps: 678  evaluation reward: 254.8
Training network. lr: 0.000079. clip: 0.031600
Iteration 30439: Policy loss: 0.006783. Value loss: 4.343801. Entropy: 1.180571.
Iteration 30440: Policy loss: 0.004997. Value loss: 2.455200. Entropy: 1.180731.
Iteration 30441: Policy loss: 0.004312. Value loss: 1.967604. Entropy: 1.205576.
Training network. lr: 0.000079. clip: 0.031600
Iteration 30442: Policy loss: -0.002054. Value loss: 4.172931. Entropy: 1.348176.
Iteration 30443: Policy loss: 0.000084. Value loss: 3.349483. Entropy: 1.343527.
Iteration 30444: Policy loss: 0.000175. Value loss: 2.541387. Entropy: 1.339189.
episode: 11452   score: 290.0  epsilon: 1.0    steps: 633  evaluation reward: 254.7
episode: 11453   score: 210.0  epsilon: 1.0    steps: 826  evaluation reward: 254.55
Training network. lr: 0.000079. clip: 0.031600
Iteration 30445: Policy loss: 0.001027. Value loss: 3.495057. Entropy: 1.304694.

Training network. lr: 0.000078. clip: 0.031375
Iteration 30502: Policy loss: 0.003932. Value loss: 2.981409. Entropy: 1.308038.
Iteration 30503: Policy loss: 0.004271. Value loss: 1.732343. Entropy: 1.333091.
Iteration 30504: Policy loss: 0.002637. Value loss: 1.456082. Entropy: 1.308275.
episode: 11476   score: 215.0  epsilon: 1.0    steps: 602  evaluation reward: 260.8
episode: 11477   score: 180.0  epsilon: 1.0    steps: 713  evaluation reward: 259.75
episode: 11478   score: 180.0  epsilon: 1.0    steps: 851  evaluation reward: 258.55
Training network. lr: 0.000078. clip: 0.031375
Iteration 30505: Policy loss: 0.000461. Value loss: 3.557743. Entropy: 1.175436.
Iteration 30506: Policy loss: 0.003564. Value loss: 2.207551. Entropy: 1.171544.
Iteration 30507: Policy loss: 0.001652. Value loss: 1.790725. Entropy: 1.180240.
episode: 11479   score: 240.0  epsilon: 1.0    steps: 243  evaluation reward: 258.55
Training network. lr: 0.000078. clip: 0.031375
Iteration 30508: Policy loss: 0.00

episode: 11501   score: 390.0  epsilon: 1.0    steps: 272  evaluation reward: 271.65
episode: 11502   score: 285.0  epsilon: 1.0    steps: 911  evaluation reward: 274.15
Training network. lr: 0.000078. clip: 0.031262
Iteration 30565: Policy loss: 0.001474. Value loss: 3.186053. Entropy: 1.011292.
Iteration 30566: Policy loss: 0.004549. Value loss: 2.100070. Entropy: 0.989772.
Iteration 30567: Policy loss: 0.002565. Value loss: 1.715075. Entropy: 0.986293.
Training network. lr: 0.000078. clip: 0.031262
Iteration 30568: Policy loss: 0.002980. Value loss: 3.545491. Entropy: 1.260339.
Iteration 30569: Policy loss: 0.006347. Value loss: 2.317274. Entropy: 1.251331.
Iteration 30570: Policy loss: 0.002332. Value loss: 1.917964. Entropy: 1.263041.
episode: 11503   score: 370.0  epsilon: 1.0    steps: 858  evaluation reward: 273.55
Training network. lr: 0.000078. clip: 0.031262
Iteration 30571: Policy loss: 0.003186. Value loss: 3.363779. Entropy: 1.244253.
Iteration 30572: Policy loss: -0.0004

Iteration 30627: Policy loss: 0.002407. Value loss: 1.524711. Entropy: 1.336318.
episode: 11527   score: 210.0  epsilon: 1.0    steps: 644  evaluation reward: 283.4
Training network. lr: 0.000078. clip: 0.031150
Iteration 30628: Policy loss: -0.000074. Value loss: 6.673621. Entropy: 1.305680.
Iteration 30629: Policy loss: 0.006702. Value loss: 4.177675. Entropy: 1.308354.
Iteration 30630: Policy loss: 0.005600. Value loss: 2.955616. Entropy: 1.312295.
episode: 11528   score: 515.0  epsilon: 1.0    steps: 241  evaluation reward: 283.75
Training network. lr: 0.000078. clip: 0.031150
Iteration 30631: Policy loss: 0.005839. Value loss: 6.446510. Entropy: 1.369636.
Iteration 30632: Policy loss: 0.005766. Value loss: 4.514991. Entropy: 1.369856.
Iteration 30633: Policy loss: 0.004340. Value loss: 4.364203. Entropy: 1.372441.
episode: 11529   score: 200.0  epsilon: 1.0    steps: 329  evaluation reward: 287.8
episode: 11530   score: 275.0  epsilon: 1.0    steps: 460  evaluation reward: 284.25


now time :  2019-02-23 10:11:17.782724
episode: 11551   score: 330.0  epsilon: 1.0    steps: 996  evaluation reward: 303.2
Training network. lr: 0.000078. clip: 0.031038
Iteration 30691: Policy loss: 0.003808. Value loss: 3.302649. Entropy: 1.428343.
Iteration 30692: Policy loss: 0.005043. Value loss: 2.386162. Entropy: 1.440969.
Iteration 30693: Policy loss: 0.003258. Value loss: 2.000722. Entropy: 1.428197.
episode: 11552   score: 180.0  epsilon: 1.0    steps: 162  evaluation reward: 304.85
Training network. lr: 0.000078. clip: 0.031038
Iteration 30694: Policy loss: 0.001257. Value loss: 3.768785. Entropy: 1.303784.
Iteration 30695: Policy loss: 0.001912. Value loss: 2.516155. Entropy: 1.313737.
Iteration 30696: Policy loss: 0.001686. Value loss: 2.261715. Entropy: 1.306162.
episode: 11553   score: 435.0  epsilon: 1.0    steps: 126  evaluation reward: 303.75
episode: 11554   score: 345.0  epsilon: 1.0    steps: 258  evaluation reward: 306.0
episode: 11555   score: 210.0  epsilon: 1.0

Training network. lr: 0.000077. clip: 0.030813
Iteration 30754: Policy loss: 0.001146. Value loss: 3.648337. Entropy: 1.278421.
Iteration 30755: Policy loss: -0.000069. Value loss: 2.759731. Entropy: 1.281206.
Iteration 30756: Policy loss: -0.000857. Value loss: 2.211466. Entropy: 1.275996.
episode: 11576   score: 370.0  epsilon: 1.0    steps: 561  evaluation reward: 320.35
Training network. lr: 0.000077. clip: 0.030813
Iteration 30757: Policy loss: 0.003917. Value loss: 4.387768. Entropy: 1.218137.
Iteration 30758: Policy loss: 0.009305. Value loss: 2.621480. Entropy: 1.205055.
Iteration 30759: Policy loss: 0.006988. Value loss: 2.261293. Entropy: 1.208413.
episode: 11577   score: 375.0  epsilon: 1.0    steps: 782  evaluation reward: 321.9
episode: 11578   score: 215.0  epsilon: 1.0    steps: 968  evaluation reward: 323.85
Training network. lr: 0.000077. clip: 0.030813
Iteration 30760: Policy loss: 0.003804. Value loss: 3.419941. Entropy: 1.255407.
Iteration 30761: Policy loss: 0.0017

Iteration 30816: Policy loss: 0.004964. Value loss: 3.505509. Entropy: 1.285980.
episode: 11602   score: 245.0  epsilon: 1.0    steps: 483  evaluation reward: 310.7
episode: 11603   score: 185.0  epsilon: 1.0    steps: 557  evaluation reward: 310.3
Training network. lr: 0.000077. clip: 0.030700
Iteration 30817: Policy loss: 0.012288. Value loss: 5.157222. Entropy: 1.237131.
Iteration 30818: Policy loss: 0.006868. Value loss: 4.094318. Entropy: 1.241620.
Iteration 30819: Policy loss: 0.011528. Value loss: 2.651619. Entropy: 1.236219.
episode: 11604   score: 545.0  epsilon: 1.0    steps: 866  evaluation reward: 308.45
Training network. lr: 0.000077. clip: 0.030700
Iteration 30820: Policy loss: 0.005193. Value loss: 4.681050. Entropy: 1.318249.
Iteration 30821: Policy loss: 0.002995. Value loss: 3.901122. Entropy: 1.324768.
Iteration 30822: Policy loss: 0.005405. Value loss: 2.884729. Entropy: 1.303132.
episode: 11605   score: 195.0  epsilon: 1.0    steps: 1022  evaluation reward: 312.1
T

episode: 11626   score: 265.0  epsilon: 1.0    steps: 791  evaluation reward: 323.55
Training network. lr: 0.000076. clip: 0.030588
Iteration 30880: Policy loss: 0.003746. Value loss: 3.483250. Entropy: 1.156116.
Iteration 30881: Policy loss: 0.005949. Value loss: 2.534888. Entropy: 1.188761.
Iteration 30882: Policy loss: 0.004782. Value loss: 2.135622. Entropy: 1.161774.
episode: 11627   score: 425.0  epsilon: 1.0    steps: 1012  evaluation reward: 323.75
Training network. lr: 0.000076. clip: 0.030588
Iteration 30883: Policy loss: 0.001575. Value loss: 4.903366. Entropy: 1.252508.
Iteration 30884: Policy loss: 0.008896. Value loss: 3.773299. Entropy: 1.279144.
Iteration 30885: Policy loss: 0.006545. Value loss: 3.028589. Entropy: 1.266817.
episode: 11628   score: 110.0  epsilon: 1.0    steps: 318  evaluation reward: 325.9
episode: 11629   score: 300.0  epsilon: 1.0    steps: 504  evaluation reward: 321.85
episode: 11630   score: 245.0  epsilon: 1.0    steps: 665  evaluation reward: 32

episode: 11648   score: 260.0  epsilon: 1.0    steps: 831  evaluation reward: 325.8
Training network. lr: 0.000076. clip: 0.030475
Iteration 30946: Policy loss: 0.001274. Value loss: 3.258650. Entropy: 1.268993.
Iteration 30947: Policy loss: 0.001343. Value loss: 2.381086. Entropy: 1.266643.
Iteration 30948: Policy loss: -0.001183. Value loss: 2.005728. Entropy: 1.273679.
episode: 11649   score: 355.0  epsilon: 1.0    steps: 967  evaluation reward: 324.35
Training network. lr: 0.000076. clip: 0.030475
Iteration 30949: Policy loss: 0.007008. Value loss: 7.033693. Entropy: 1.269392.
Iteration 30950: Policy loss: 0.010882. Value loss: 3.872059. Entropy: 1.290458.
Iteration 30951: Policy loss: 0.008620. Value loss: 3.195102. Entropy: 1.263887.
Training network. lr: 0.000076. clip: 0.030363
Iteration 30952: Policy loss: 0.003332. Value loss: 6.075070. Entropy: 1.281473.
Iteration 30953: Policy loss: 0.006341. Value loss: 3.444117. Entropy: 1.239232.
Iteration 30954: Policy loss: 0.001766. V

Iteration 31010: Policy loss: 0.009218. Value loss: 4.122745. Entropy: 1.148187.
Iteration 31011: Policy loss: 0.008755. Value loss: 3.026905. Entropy: 1.122874.
episode: 11671   score: 450.0  epsilon: 1.0    steps: 505  evaluation reward: 332.0
episode: 11672   score: 365.0  epsilon: 1.0    steps: 895  evaluation reward: 332.4
Training network. lr: 0.000076. clip: 0.030250
Iteration 31012: Policy loss: 0.000952. Value loss: 7.100275. Entropy: 1.260885.
Iteration 31013: Policy loss: 0.006538. Value loss: 5.864874. Entropy: 1.228232.
Iteration 31014: Policy loss: 0.002199. Value loss: 4.115368. Entropy: 1.248294.
episode: 11673   score: 150.0  epsilon: 1.0    steps: 916  evaluation reward: 331.3
Training network. lr: 0.000076. clip: 0.030250
Iteration 31015: Policy loss: 0.004240. Value loss: 6.184289. Entropy: 1.046285.
Iteration 31016: Policy loss: 0.000728. Value loss: 4.192740. Entropy: 1.054983.
Iteration 31017: Policy loss: 0.001804. Value loss: 3.456382. Entropy: 1.043989.
episod

Iteration 31072: Policy loss: 0.001112. Value loss: 3.592333. Entropy: 1.032258.
Iteration 31073: Policy loss: -0.002183. Value loss: 2.980490. Entropy: 1.046638.
Iteration 31074: Policy loss: -0.003205. Value loss: 2.199194. Entropy: 1.055032.
episode: 11697   score: 265.0  epsilon: 1.0    steps: 903  evaluation reward: 350.9
Training network. lr: 0.000075. clip: 0.030138
Iteration 31075: Policy loss: 0.004025. Value loss: 4.740363. Entropy: 1.023038.
Iteration 31076: Policy loss: 0.002135. Value loss: 3.258823. Entropy: 1.020795.
Iteration 31077: Policy loss: 0.005692. Value loss: 2.543187. Entropy: 1.012659.
Training network. lr: 0.000075. clip: 0.030138
Iteration 31078: Policy loss: 0.003386. Value loss: 3.942934. Entropy: 1.183295.
Iteration 31079: Policy loss: 0.012095. Value loss: 2.401386. Entropy: 1.179361.
Iteration 31080: Policy loss: 0.006994. Value loss: 1.783090. Entropy: 1.187791.
episode: 11698   score: 210.0  epsilon: 1.0    steps: 250  evaluation reward: 351.7
Trainin

Iteration 31137: Policy loss: 0.006862. Value loss: 2.301516. Entropy: 1.192003.
episode: 11720   score: 210.0  epsilon: 1.0    steps: 13  evaluation reward: 345.95
episode: 11721   score: 345.0  epsilon: 1.0    steps: 219  evaluation reward: 342.6
episode: 11722   score: 185.0  epsilon: 1.0    steps: 696  evaluation reward: 344.7
Training network. lr: 0.000075. clip: 0.030025
Iteration 31138: Policy loss: 0.001804. Value loss: 2.522579. Entropy: 1.143112.
Iteration 31139: Policy loss: 0.002813. Value loss: 1.717976. Entropy: 1.151038.
Iteration 31140: Policy loss: 0.001630. Value loss: 1.683460. Entropy: 1.134081.
Training network. lr: 0.000075. clip: 0.030025
Iteration 31141: Policy loss: 0.003062. Value loss: 2.637292. Entropy: 1.162900.
Iteration 31142: Policy loss: 0.002648. Value loss: 1.899655. Entropy: 1.169560.
Iteration 31143: Policy loss: 0.003531. Value loss: 1.634441. Entropy: 1.152750.
episode: 11723   score: 155.0  epsilon: 1.0    steps: 339  evaluation reward: 342.65
ep

Iteration 31198: Policy loss: 0.002846. Value loss: 3.480118. Entropy: 1.251291.
Iteration 31199: Policy loss: 0.004442. Value loss: 2.247971. Entropy: 1.250275.
Iteration 31200: Policy loss: 0.001455. Value loss: 1.942395. Entropy: 1.248027.
episode: 11747   score: 260.0  epsilon: 1.0    steps: 152  evaluation reward: 332.6
episode: 11748   score: 210.0  epsilon: 1.0    steps: 445  evaluation reward: 330.25
Training network. lr: 0.000074. clip: 0.029800
Iteration 31201: Policy loss: 0.004895. Value loss: 6.505001. Entropy: 1.233847.
Iteration 31202: Policy loss: 0.011242. Value loss: 4.695529. Entropy: 1.245150.
Iteration 31203: Policy loss: 0.011905. Value loss: 4.075420. Entropy: 1.239609.
Training network. lr: 0.000074. clip: 0.029800
Iteration 31204: Policy loss: 0.001509. Value loss: 2.712433. Entropy: 1.303259.
Iteration 31205: Policy loss: 0.002705. Value loss: 1.676998. Entropy: 1.309546.
Iteration 31206: Policy loss: 0.002126. Value loss: 1.510230. Entropy: 1.297226.
episode:

episode: 11772   score: 595.0  epsilon: 1.0    steps: 993  evaluation reward: 321.2
Training network. lr: 0.000074. clip: 0.029688
Iteration 31261: Policy loss: 0.003105. Value loss: 3.060133. Entropy: 0.932134.
Iteration 31262: Policy loss: 0.005131. Value loss: 2.187616. Entropy: 0.930828.
Iteration 31263: Policy loss: 0.006491. Value loss: 1.637396. Entropy: 0.933864.
Training network. lr: 0.000074. clip: 0.029688
Iteration 31264: Policy loss: 0.005420. Value loss: 4.188976. Entropy: 1.054985.
Iteration 31265: Policy loss: 0.004509. Value loss: 2.880194. Entropy: 1.061785.
Iteration 31266: Policy loss: 0.005827. Value loss: 2.231098. Entropy: 1.032241.
episode: 11773   score: 550.0  epsilon: 1.0    steps: 214  evaluation reward: 323.5
episode: 11774   score: 260.0  epsilon: 1.0    steps: 713  evaluation reward: 327.5
Training network. lr: 0.000074. clip: 0.029688
Iteration 31267: Policy loss: 0.004414. Value loss: 2.860201. Entropy: 1.184534.
Iteration 31268: Policy loss: 0.007798. 

episode: 11794   score: 300.0  epsilon: 1.0    steps: 605  evaluation reward: 311.0
episode: 11795   score: 320.0  epsilon: 1.0    steps: 645  evaluation reward: 311.6
Training network. lr: 0.000074. clip: 0.029575
Iteration 31327: Policy loss: 0.003504. Value loss: 5.595667. Entropy: 1.163047.
Iteration 31328: Policy loss: 0.012210. Value loss: 3.371445. Entropy: 1.179796.
Iteration 31329: Policy loss: 0.011216. Value loss: 2.538388. Entropy: 1.173841.
Training network. lr: 0.000074. clip: 0.029575
Iteration 31330: Policy loss: 0.001720. Value loss: 5.884597. Entropy: 1.209411.
Iteration 31331: Policy loss: -0.001606. Value loss: 4.752511. Entropy: 1.234559.
Iteration 31332: Policy loss: 0.001147. Value loss: 3.829329. Entropy: 1.220701.
episode: 11796   score: 285.0  epsilon: 1.0    steps: 324  evaluation reward: 313.3
episode: 11797   score: 320.0  epsilon: 1.0    steps: 786  evaluation reward: 313.55
Training network. lr: 0.000074. clip: 0.029575
Iteration 31333: Policy loss: 0.001

Iteration 31391: Policy loss: 0.010023. Value loss: 4.637268. Entropy: 1.127442.
Iteration 31392: Policy loss: 0.023633. Value loss: 3.417948. Entropy: 1.146439.
Training network. lr: 0.000074. clip: 0.029463
Iteration 31393: Policy loss: 0.003045. Value loss: 4.215802. Entropy: 1.313680.
Iteration 31394: Policy loss: 0.002561. Value loss: 3.053444. Entropy: 1.326129.
Iteration 31395: Policy loss: 0.001740. Value loss: 2.665823. Entropy: 1.309140.
episode: 11817   score: 390.0  epsilon: 1.0    steps: 1  evaluation reward: 339.3
episode: 11818   score: 210.0  epsilon: 1.0    steps: 546  evaluation reward: 337.75
Training network. lr: 0.000074. clip: 0.029463
Iteration 31396: Policy loss: 0.003698. Value loss: 2.775787. Entropy: 1.161789.
Iteration 31397: Policy loss: 0.003639. Value loss: 1.892162. Entropy: 1.168768.
Iteration 31398: Policy loss: 0.003134. Value loss: 1.528499. Entropy: 1.164618.
Training network. lr: 0.000074. clip: 0.029463
Iteration 31399: Policy loss: 0.001905. Valu

Iteration 31456: Policy loss: 0.004666. Value loss: 4.502454. Entropy: 1.045175.
Iteration 31457: Policy loss: 0.004977. Value loss: 3.317403. Entropy: 1.037904.
Iteration 31458: Policy loss: 0.004842. Value loss: 2.636294. Entropy: 1.033924.
episode: 11840   score: 330.0  epsilon: 1.0    steps: 493  evaluation reward: 364.9
Training network. lr: 0.000073. clip: 0.029237
Iteration 31459: Policy loss: 0.003793. Value loss: 4.308290. Entropy: 1.174008.
Iteration 31460: Policy loss: 0.005463. Value loss: 3.174292. Entropy: 1.173529.
Iteration 31461: Policy loss: 0.005084. Value loss: 2.434914. Entropy: 1.171179.
Training network. lr: 0.000073. clip: 0.029237
Iteration 31462: Policy loss: 0.003518. Value loss: 5.690457. Entropy: 1.297699.
Iteration 31463: Policy loss: 0.005478. Value loss: 4.310668. Entropy: 1.295881.
Iteration 31464: Policy loss: 0.006694. Value loss: 4.035643. Entropy: 1.326333.
episode: 11841   score: 285.0  epsilon: 1.0    steps: 135  evaluation reward: 363.55
episode:

Iteration 31521: Policy loss: 0.008841. Value loss: 4.236764. Entropy: 1.036287.
episode: 11863   score: 285.0  epsilon: 1.0    steps: 646  evaluation reward: 365.55
episode: 11864   score: 595.0  epsilon: 1.0    steps: 989  evaluation reward: 365.25
Training network. lr: 0.000073. clip: 0.029125
Iteration 31522: Policy loss: 0.001027. Value loss: 4.388317. Entropy: 1.260824.
Iteration 31523: Policy loss: 0.001851. Value loss: 3.206364. Entropy: 1.248996.
Iteration 31524: Policy loss: -0.000589. Value loss: 2.577135. Entropy: 1.259992.
episode: 11865   score: 215.0  epsilon: 1.0    steps: 604  evaluation reward: 367.15
Training network. lr: 0.000073. clip: 0.029125
Iteration 31525: Policy loss: 0.001597. Value loss: 2.847565. Entropy: 1.107836.
Iteration 31526: Policy loss: 0.000143. Value loss: 1.903492. Entropy: 1.111452.
Iteration 31527: Policy loss: 0.004493. Value loss: 1.540035. Entropy: 1.107659.
Training network. lr: 0.000073. clip: 0.029125
Iteration 31528: Policy loss: 0.0038

Iteration 31584: Policy loss: 0.001999. Value loss: 4.670307. Entropy: 1.236164.
episode: 11888   score: 210.0  epsilon: 1.0    steps: 325  evaluation reward: 369.0
episode: 11889   score: 585.0  epsilon: 1.0    steps: 1000  evaluation reward: 365.45
Training network. lr: 0.000073. clip: 0.029013
Iteration 31585: Policy loss: 0.006067. Value loss: 5.158154. Entropy: 1.203297.
Iteration 31586: Policy loss: 0.002919. Value loss: 3.543557. Entropy: 1.210085.
Iteration 31587: Policy loss: 0.003368. Value loss: 4.067076. Entropy: 1.201006.
Training network. lr: 0.000073. clip: 0.029013
Iteration 31588: Policy loss: 0.002201. Value loss: 3.702837. Entropy: 1.152040.
Iteration 31589: Policy loss: 0.003522. Value loss: 2.684112. Entropy: 1.154543.
Iteration 31590: Policy loss: 0.004292. Value loss: 2.242053. Entropy: 1.142382.
episode: 11890   score: 315.0  epsilon: 1.0    steps: 843  evaluation reward: 369.0
Training network. lr: 0.000073. clip: 0.029013
Iteration 31591: Policy loss: 0.008516

episode: 11912   score: 460.0  epsilon: 1.0    steps: 795  evaluation reward: 359.95
Training network. lr: 0.000072. clip: 0.028900
Iteration 31648: Policy loss: 0.001783. Value loss: 2.677404. Entropy: 1.194571.
Iteration 31649: Policy loss: 0.002282. Value loss: 2.029838. Entropy: 1.217037.
Iteration 31650: Policy loss: 0.003714. Value loss: 1.707953. Entropy: 1.212503.
episode: 11913   score: 320.0  epsilon: 1.0    steps: 43  evaluation reward: 356.35
Training network. lr: 0.000072. clip: 0.028788
Iteration 31651: Policy loss: 0.004857. Value loss: 2.685125. Entropy: 1.295010.
Iteration 31652: Policy loss: 0.006722. Value loss: 1.957816. Entropy: 1.303953.
Iteration 31653: Policy loss: 0.004409. Value loss: 1.649853. Entropy: 1.300261.
episode: 11914   score: 425.0  epsilon: 1.0    steps: 507  evaluation reward: 355.15
Training network. lr: 0.000072. clip: 0.028788
Iteration 31654: Policy loss: 0.003044. Value loss: 3.592681. Entropy: 1.265941.
Iteration 31655: Policy loss: 0.004349

Iteration 31712: Policy loss: 0.008989. Value loss: 3.213746. Entropy: 1.258298.
Iteration 31713: Policy loss: 0.013852. Value loss: 2.513613. Entropy: 1.264038.
episode: 11936   score: 485.0  epsilon: 1.0    steps: 189  evaluation reward: 343.85
episode: 11937   score: 310.0  epsilon: 1.0    steps: 470  evaluation reward: 346.15
Training network. lr: 0.000072. clip: 0.028675
Iteration 31714: Policy loss: 0.001936. Value loss: 2.879850. Entropy: 1.123006.
Iteration 31715: Policy loss: 0.001894. Value loss: 2.101329. Entropy: 1.097524.
Iteration 31716: Policy loss: 0.001024. Value loss: 1.985525. Entropy: 1.131281.
episode: 11938   score: 590.0  epsilon: 1.0    steps: 1020  evaluation reward: 342.75
Training network. lr: 0.000072. clip: 0.028675
Iteration 31717: Policy loss: 0.003811. Value loss: 2.976213. Entropy: 1.140025.
Iteration 31718: Policy loss: 0.004901. Value loss: 1.947793. Entropy: 1.100609.
Iteration 31719: Policy loss: 0.003587. Value loss: 1.687692. Entropy: 1.104988.
ep

Iteration 31776: Policy loss: 0.005001. Value loss: 4.097369. Entropy: 1.210303.
Training network. lr: 0.000071. clip: 0.028563
Iteration 31777: Policy loss: 0.002770. Value loss: 2.857117. Entropy: 1.288954.
Iteration 31778: Policy loss: 0.004316. Value loss: 1.823120. Entropy: 1.277448.
Iteration 31779: Policy loss: 0.004564. Value loss: 1.518850. Entropy: 1.282128.
Training network. lr: 0.000071. clip: 0.028563
Iteration 31780: Policy loss: 0.004350. Value loss: 7.584894. Entropy: 1.301236.
Iteration 31781: Policy loss: 0.005698. Value loss: 5.169545. Entropy: 1.321332.
Iteration 31782: Policy loss: 0.008374. Value loss: 3.508497. Entropy: 1.294699.
episode: 11960   score: 305.0  epsilon: 1.0    steps: 21  evaluation reward: 339.7
episode: 11961   score: 535.0  epsilon: 1.0    steps: 420  evaluation reward: 340.15
episode: 11962   score: 365.0  epsilon: 1.0    steps: 963  evaluation reward: 342.35
Training network. lr: 0.000071. clip: 0.028563
Iteration 31783: Policy loss: 0.004295.

Iteration 31842: Policy loss: 0.003120. Value loss: 1.689090. Entropy: 1.303049.
Training network. lr: 0.000071. clip: 0.028450
Iteration 31843: Policy loss: 0.002298. Value loss: 7.822978. Entropy: 1.446503.
Iteration 31844: Policy loss: 0.009263. Value loss: 5.507011. Entropy: 1.440796.
Iteration 31845: Policy loss: 0.010068. Value loss: 4.481500. Entropy: 1.440092.
Training network. lr: 0.000071. clip: 0.028450
Iteration 31846: Policy loss: 0.005400. Value loss: 4.881535. Entropy: 1.358288.
Iteration 31847: Policy loss: 0.005691. Value loss: 3.576915. Entropy: 1.350796.
Iteration 31848: Policy loss: 0.005777. Value loss: 3.121726. Entropy: 1.353759.
episode: 11982   score: 290.0  epsilon: 1.0    steps: 75  evaluation reward: 351.8
episode: 11983   score: 155.0  epsilon: 1.0    steps: 436  evaluation reward: 353.6
Training network. lr: 0.000071. clip: 0.028450
Iteration 31849: Policy loss: 0.002285. Value loss: 9.095831. Entropy: 1.385538.
Iteration 31850: Policy loss: 0.012439. Valu

Training network. lr: 0.000071. clip: 0.028225
Iteration 31906: Policy loss: 0.007564. Value loss: 5.519616. Entropy: 1.214859.
Iteration 31907: Policy loss: 0.004555. Value loss: 3.876285. Entropy: 1.217608.
Iteration 31908: Policy loss: 0.004265. Value loss: 3.168462. Entropy: 1.214502.
episode: 12006   score: 345.0  epsilon: 1.0    steps: 160  evaluation reward: 342.45
Training network. lr: 0.000071. clip: 0.028225
Iteration 31909: Policy loss: -0.000455. Value loss: 3.995251. Entropy: 1.349845.
Iteration 31910: Policy loss: 0.000233. Value loss: 3.032169. Entropy: 1.339913.
Iteration 31911: Policy loss: 0.000779. Value loss: 2.342016. Entropy: 1.338111.
Training network. lr: 0.000071. clip: 0.028225
Iteration 31912: Policy loss: 0.001139. Value loss: 4.974420. Entropy: 1.187259.
Iteration 31913: Policy loss: 0.002286. Value loss: 3.057929. Entropy: 1.188885.
Iteration 31914: Policy loss: 0.001044. Value loss: 2.740331. Entropy: 1.190964.
episode: 12007   score: 225.0  epsilon: 1.0 

Iteration 31971: Policy loss: 0.001802. Value loss: 1.395391. Entropy: 1.357414.
episode: 12029   score: 295.0  epsilon: 1.0    steps: 30  evaluation reward: 338.55
episode: 12030   score: 270.0  epsilon: 1.0    steps: 448  evaluation reward: 338.9
Training network. lr: 0.000070. clip: 0.028113
Iteration 31972: Policy loss: 0.001478. Value loss: 6.380616. Entropy: 1.162900.
Iteration 31973: Policy loss: 0.007348. Value loss: 4.624004. Entropy: 1.184775.
Iteration 31974: Policy loss: 0.007383. Value loss: 3.870105. Entropy: 1.155983.
episode: 12031   score: 225.0  epsilon: 1.0    steps: 660  evaluation reward: 336.75
Training network. lr: 0.000070. clip: 0.028113
Iteration 31975: Policy loss: 0.004582. Value loss: 8.236678. Entropy: 1.122409.
Iteration 31976: Policy loss: 0.005121. Value loss: 5.583421. Entropy: 1.155378.
Iteration 31977: Policy loss: 0.004858. Value loss: 4.882114. Entropy: 1.115790.
episode: 12032   score: 425.0  epsilon: 1.0    steps: 527  evaluation reward: 335.5
Tr

episode: 12056   score: 180.0  epsilon: 1.0    steps: 319  evaluation reward: 321.3
Training network. lr: 0.000070. clip: 0.028000
Iteration 32032: Policy loss: 0.006216. Value loss: 3.423004. Entropy: 1.231045.
Iteration 32033: Policy loss: 0.008673. Value loss: 2.285359. Entropy: 1.237787.
Iteration 32034: Policy loss: 0.007156. Value loss: 1.684643. Entropy: 1.232942.
episode: 12057   score: 310.0  epsilon: 1.0    steps: 518  evaluation reward: 319.9
Training network. lr: 0.000070. clip: 0.028000
Iteration 32035: Policy loss: 0.005213. Value loss: 3.750655. Entropy: 1.142148.
Iteration 32036: Policy loss: 0.007489. Value loss: 2.342548. Entropy: 1.165148.
Iteration 32037: Policy loss: 0.005977. Value loss: 1.792812. Entropy: 1.166407.
Training network. lr: 0.000070. clip: 0.028000
Iteration 32038: Policy loss: 0.001048. Value loss: 4.408526. Entropy: 1.343711.
Iteration 32039: Policy loss: -0.002025. Value loss: 3.624374. Entropy: 1.354210.
Iteration 32040: Policy loss: -0.002298. V

Iteration 32096: Policy loss: 0.004861. Value loss: 2.088529. Entropy: 1.290690.
Iteration 32097: Policy loss: 0.005637. Value loss: 1.642608. Entropy: 1.282296.
episode: 12080   score: 290.0  epsilon: 1.0    steps: 739  evaluation reward: 303.2
episode: 12081   score: 570.0  epsilon: 1.0    steps: 803  evaluation reward: 302.8
Training network. lr: 0.000070. clip: 0.027888
Iteration 32098: Policy loss: 0.001392. Value loss: 2.792761. Entropy: 1.216632.
Iteration 32099: Policy loss: -0.000455. Value loss: 2.136655. Entropy: 1.227225.
Iteration 32100: Policy loss: 0.002996. Value loss: 1.707311. Entropy: 1.219895.
Training network. lr: 0.000069. clip: 0.027775
Iteration 32101: Policy loss: 0.003639. Value loss: 4.229035. Entropy: 1.277948.
Iteration 32102: Policy loss: 0.003819. Value loss: 3.262273. Entropy: 1.284955.
Iteration 32103: Policy loss: 0.003353. Value loss: 2.917655. Entropy: 1.275488.
episode: 12082   score: 420.0  epsilon: 1.0    steps: 382  evaluation reward: 299.7
episo

Iteration 32163: Policy loss: 0.002294. Value loss: 1.806849. Entropy: 1.301639.
Training network. lr: 0.000069. clip: 0.027662
Iteration 32164: Policy loss: 0.003281. Value loss: 4.508890. Entropy: 1.459970.
Iteration 32165: Policy loss: 0.006401. Value loss: 3.366433. Entropy: 1.467273.
Iteration 32166: Policy loss: 0.008836. Value loss: 2.451509. Entropy: 1.458034.
now time :  2019-02-23 10:41:39.464241
episode: 12101   score: 215.0  epsilon: 1.0    steps: 816  evaluation reward: 302.4
Training network. lr: 0.000069. clip: 0.027662
Iteration 32167: Policy loss: 0.000163. Value loss: 4.853500. Entropy: 1.385906.
Iteration 32168: Policy loss: 0.001149. Value loss: 3.875321. Entropy: 1.394615.
Iteration 32169: Policy loss: 0.002947. Value loss: 3.230024. Entropy: 1.396196.
episode: 12102   score: 555.0  epsilon: 1.0    steps: 102  evaluation reward: 301.0
episode: 12103   score: 670.0  epsilon: 1.0    steps: 747  evaluation reward: 303.95
Training network. lr: 0.000069. clip: 0.027662


Iteration 32227: Policy loss: 0.005464. Value loss: 4.435447. Entropy: 1.143059.
Iteration 32228: Policy loss: 0.006375. Value loss: 2.864616. Entropy: 1.124521.
Iteration 32229: Policy loss: 0.007051. Value loss: 2.207802. Entropy: 1.131214.
episode: 12124   score: 670.0  epsilon: 1.0    steps: 369  evaluation reward: 326.15
episode: 12125   score: 210.0  epsilon: 1.0    steps: 463  evaluation reward: 330.7
Training network. lr: 0.000069. clip: 0.027550
Iteration 32230: Policy loss: 0.006195. Value loss: 5.164610. Entropy: 1.138871.
Iteration 32231: Policy loss: 0.009718. Value loss: 2.936849. Entropy: 1.133328.
Iteration 32232: Policy loss: 0.010176. Value loss: 2.273957. Entropy: 1.118299.
episode: 12126   score: 180.0  epsilon: 1.0    steps: 537  evaluation reward: 330.5
episode: 12127   score: 315.0  epsilon: 1.0    steps: 944  evaluation reward: 329.7
Training network. lr: 0.000069. clip: 0.027550
Iteration 32233: Policy loss: 0.003040. Value loss: 6.568777. Entropy: 1.060737.
It

Iteration 32292: Policy loss: 0.007183. Value loss: 2.881588. Entropy: 1.150356.
episode: 12147   score: 330.0  epsilon: 1.0    steps: 557  evaluation reward: 336.6
episode: 12148   score: 330.0  epsilon: 1.0    steps: 1011  evaluation reward: 337.05
Training network. lr: 0.000069. clip: 0.027438
Iteration 32293: Policy loss: 0.002101. Value loss: 2.438066. Entropy: 1.245290.
Iteration 32294: Policy loss: 0.003835. Value loss: 1.820695. Entropy: 1.262293.
Iteration 32295: Policy loss: 0.002810. Value loss: 1.477741. Entropy: 1.242086.
episode: 12149   score: 420.0  epsilon: 1.0    steps: 72  evaluation reward: 338.25
episode: 12150   score: 180.0  epsilon: 1.0    steps: 493  evaluation reward: 339.25
now time :  2019-02-23 10:44:18.725792
episode: 12151   score: 435.0  epsilon: 1.0    steps: 816  evaluation reward: 337.3
Training network. lr: 0.000069. clip: 0.027438
Iteration 32296: Policy loss: 0.004225. Value loss: 6.851241. Entropy: 1.082370.
Iteration 32297: Policy loss: 0.003556.

episode: 12171   score: 445.0  epsilon: 1.0    steps: 745  evaluation reward: 350.0
episode: 12172   score: 315.0  epsilon: 1.0    steps: 791  evaluation reward: 351.7
Training network. lr: 0.000068. clip: 0.027213
Iteration 32356: Policy loss: 0.003056. Value loss: 4.650086. Entropy: 1.066433.
Iteration 32357: Policy loss: 0.001897. Value loss: 3.443888. Entropy: 1.073051.
Iteration 32358: Policy loss: 0.003320. Value loss: 2.581227. Entropy: 1.063082.
episode: 12173   score: 625.0  epsilon: 1.0    steps: 525  evaluation reward: 352.75
Training network. lr: 0.000068. clip: 0.027213
Iteration 32359: Policy loss: 0.004873. Value loss: 5.195400. Entropy: 1.148005.
Iteration 32360: Policy loss: 0.003386. Value loss: 3.401644. Entropy: 1.145555.
Iteration 32361: Policy loss: 0.007445. Value loss: 3.018402. Entropy: 1.158758.
Training network. lr: 0.000068. clip: 0.027213
Iteration 32362: Policy loss: 0.000616. Value loss: 3.885079. Entropy: 1.144175.
Iteration 32363: Policy loss: 0.005280.

Iteration 32420: Policy loss: 0.002529. Value loss: 2.768101. Entropy: 1.188969.
Iteration 32421: Policy loss: 0.001192. Value loss: 2.343112. Entropy: 1.216318.
episode: 12195   score: 340.0  epsilon: 1.0    steps: 571  evaluation reward: 364.35
episode: 12196   score: 595.0  epsilon: 1.0    steps: 1023  evaluation reward: 365.45
Training network. lr: 0.000068. clip: 0.027100
Iteration 32422: Policy loss: 0.003523. Value loss: 3.817893. Entropy: 1.158810.
Iteration 32423: Policy loss: 0.006261. Value loss: 2.732369. Entropy: 1.165186.
Iteration 32424: Policy loss: 0.007321. Value loss: 2.282631. Entropy: 1.162652.
episode: 12197   score: 465.0  epsilon: 1.0    steps: 68  evaluation reward: 368.1
Training network. lr: 0.000068. clip: 0.027100
Iteration 32425: Policy loss: 0.000691. Value loss: 4.743402. Entropy: 0.984452.
Iteration 32426: Policy loss: 0.000080. Value loss: 3.985291. Entropy: 1.008369.
Iteration 32427: Policy loss: -0.000740. Value loss: 3.702815. Entropy: 0.976972.
Tra

episode: 12218   score: 420.0  epsilon: 1.0    steps: 655  evaluation reward: 364.85
Training network. lr: 0.000067. clip: 0.026988
Iteration 32485: Policy loss: 0.000272. Value loss: 5.422747. Entropy: 1.003656.
Iteration 32486: Policy loss: 0.006170. Value loss: 3.727945. Entropy: 1.011500.
Iteration 32487: Policy loss: 0.003751. Value loss: 2.716640. Entropy: 1.007858.
Training network. lr: 0.000067. clip: 0.026988
Iteration 32488: Policy loss: 0.004297. Value loss: 5.541049. Entropy: 1.270558.
Iteration 32489: Policy loss: 0.003074. Value loss: 4.456751. Entropy: 1.272821.
Iteration 32490: Policy loss: 0.002861. Value loss: 3.828214. Entropy: 1.278859.
episode: 12219   score: 365.0  epsilon: 1.0    steps: 36  evaluation reward: 366.05
Training network. lr: 0.000067. clip: 0.026988
Iteration 32491: Policy loss: 0.003178. Value loss: 8.019855. Entropy: 1.171958.
Iteration 32492: Policy loss: 0.006042. Value loss: 5.981712. Entropy: 1.171447.
Iteration 32493: Policy loss: 0.003391. Va

Iteration 32549: Policy loss: 0.005931. Value loss: 6.744399. Entropy: 1.242127.
Iteration 32550: Policy loss: 0.009567. Value loss: 5.238746. Entropy: 1.241280.
Training network. lr: 0.000067. clip: 0.026763
Iteration 32551: Policy loss: 0.002874. Value loss: 3.015495. Entropy: 1.263166.
Iteration 32552: Policy loss: 0.003079. Value loss: 2.154772. Entropy: 1.271610.
Iteration 32553: Policy loss: 0.001547. Value loss: 1.673340. Entropy: 1.265259.
episode: 12242   score: 310.0  epsilon: 1.0    steps: 163  evaluation reward: 372.65
episode: 12243   score: 480.0  epsilon: 1.0    steps: 310  evaluation reward: 372.0
Training network. lr: 0.000067. clip: 0.026763
Iteration 32554: Policy loss: 0.005304. Value loss: 8.147526. Entropy: 1.214785.
Iteration 32555: Policy loss: 0.011210. Value loss: 4.441220. Entropy: 1.200095.
Iteration 32556: Policy loss: 0.011915. Value loss: 3.643254. Entropy: 1.200094.
episode: 12244   score: 210.0  epsilon: 1.0    steps: 703  evaluation reward: 371.45
Trai

Iteration 32613: Policy loss: 0.004300. Value loss: 1.940784. Entropy: 1.201435.
episode: 12266   score: 320.0  epsilon: 1.0    steps: 218  evaluation reward: 381.95
episode: 12267   score: 345.0  epsilon: 1.0    steps: 461  evaluation reward: 379.0
Training network. lr: 0.000067. clip: 0.026650
Iteration 32614: Policy loss: 0.001992. Value loss: 3.730318. Entropy: 1.162499.
Iteration 32615: Policy loss: 0.003069. Value loss: 2.727161. Entropy: 1.165065.
Iteration 32616: Policy loss: 0.001546. Value loss: 2.484275. Entropy: 1.161447.
episode: 12268   score: 315.0  epsilon: 1.0    steps: 867  evaluation reward: 380.35
Training network. lr: 0.000067. clip: 0.026650
Iteration 32617: Policy loss: 0.002823. Value loss: 4.044592. Entropy: 1.149294.
Iteration 32618: Policy loss: 0.006363. Value loss: 3.035421. Entropy: 1.159932.
Iteration 32619: Policy loss: 0.006926. Value loss: 2.683026. Entropy: 1.129143.
episode: 12269   score: 760.0  epsilon: 1.0    steps: 547  evaluation reward: 380.05


episode: 12290   score: 215.0  epsilon: 1.0    steps: 864  evaluation reward: 359.7
Training network. lr: 0.000066. clip: 0.026537
Iteration 32677: Policy loss: 0.001527. Value loss: 4.823791. Entropy: 1.315945.
Iteration 32678: Policy loss: 0.000953. Value loss: 2.900095. Entropy: 1.322800.
Iteration 32679: Policy loss: 0.005012. Value loss: 2.471558. Entropy: 1.316885.
episode: 12291   score: 465.0  epsilon: 1.0    steps: 384  evaluation reward: 356.45
Training network. lr: 0.000066. clip: 0.026537
Iteration 32680: Policy loss: 0.002149. Value loss: 5.033035. Entropy: 1.302441.
Iteration 32681: Policy loss: 0.006206. Value loss: 3.667304. Entropy: 1.298700.
Iteration 32682: Policy loss: 0.007680. Value loss: 2.700859. Entropy: 1.298844.
episode: 12292   score: 280.0  epsilon: 1.0    steps: 51  evaluation reward: 359.0
Training network. lr: 0.000066. clip: 0.026537
Iteration 32683: Policy loss: 0.003149. Value loss: 2.847232. Entropy: 1.339373.
Iteration 32684: Policy loss: 0.004754. 

Iteration 32742: Policy loss: 0.002231. Value loss: 2.516744. Entropy: 1.377845.
Training network. lr: 0.000066. clip: 0.026425
Iteration 32743: Policy loss: 0.002007. Value loss: 5.521002. Entropy: 1.259811.
Iteration 32744: Policy loss: 0.005138. Value loss: 4.175926. Entropy: 1.278726.
Iteration 32745: Policy loss: 0.004666. Value loss: 3.287737. Entropy: 1.268672.
episode: 12312   score: 240.0  epsilon: 1.0    steps: 345  evaluation reward: 354.85
episode: 12313   score: 250.0  epsilon: 1.0    steps: 647  evaluation reward: 350.0
episode: 12314   score: 230.0  epsilon: 1.0    steps: 940  evaluation reward: 349.3
Training network. lr: 0.000066. clip: 0.026425
Iteration 32746: Policy loss: 0.001944. Value loss: 4.195775. Entropy: 1.175218.
Iteration 32747: Policy loss: 0.000986. Value loss: 3.166045. Entropy: 1.179918.
Iteration 32748: Policy loss: 0.000338. Value loss: 2.775352. Entropy: 1.180177.
Training network. lr: 0.000066. clip: 0.026425
Iteration 32749: Policy loss: 0.003441.

Iteration 32804: Policy loss: -0.001180. Value loss: 2.560743. Entropy: 1.198096.
Iteration 32805: Policy loss: 0.000461. Value loss: 2.171559. Entropy: 1.178432.
episode: 12338   score: 585.0  epsilon: 1.0    steps: 593  evaluation reward: 324.05
Training network. lr: 0.000065. clip: 0.026200
Iteration 32806: Policy loss: -0.000763. Value loss: 5.668933. Entropy: 1.227640.
Iteration 32807: Policy loss: -0.000714. Value loss: 4.329032. Entropy: 1.235161.
Iteration 32808: Policy loss: -0.000620. Value loss: 3.823868. Entropy: 1.229387.
episode: 12339   score: 805.0  epsilon: 1.0    steps: 1017  evaluation reward: 326.1
Training network. lr: 0.000065. clip: 0.026200
Iteration 32809: Policy loss: -0.000053. Value loss: 4.153649. Entropy: 1.293296.
Iteration 32810: Policy loss: 0.004620. Value loss: 2.932950. Entropy: 1.294150.
Iteration 32811: Policy loss: 0.003263. Value loss: 2.579614. Entropy: 1.303448.
Training network. lr: 0.000065. clip: 0.026200
Iteration 32812: Policy loss: 0.0020

Iteration 32868: Policy loss: 0.006664. Value loss: 4.664829. Entropy: 1.152330.
Training network. lr: 0.000065. clip: 0.026087
Iteration 32869: Policy loss: 0.003889. Value loss: 4.433223. Entropy: 1.274632.
Iteration 32870: Policy loss: 0.007858. Value loss: 3.191175. Entropy: 1.263789.
Iteration 32871: Policy loss: 0.003437. Value loss: 2.746778. Entropy: 1.267735.
episode: 12362   score: 290.0  epsilon: 1.0    steps: 174  evaluation reward: 326.75
Training network. lr: 0.000065. clip: 0.026087
Iteration 32872: Policy loss: 0.004224. Value loss: 3.399099. Entropy: 1.247313.
Iteration 32873: Policy loss: 0.006691. Value loss: 2.333619. Entropy: 1.229725.
Iteration 32874: Policy loss: 0.003807. Value loss: 1.830456. Entropy: 1.249599.
episode: 12363   score: 235.0  epsilon: 1.0    steps: 514  evaluation reward: 326.6
episode: 12364   score: 265.0  epsilon: 1.0    steps: 774  evaluation reward: 326.85
Training network. lr: 0.000065. clip: 0.026087
Iteration 32875: Policy loss: -0.00023

Iteration 32933: Policy loss: 0.003310. Value loss: 4.693391. Entropy: 1.202378.
Iteration 32934: Policy loss: 0.001885. Value loss: 3.753462. Entropy: 1.201730.
episode: 12385   score: 420.0  epsilon: 1.0    steps: 335  evaluation reward: 320.3
Training network. lr: 0.000065. clip: 0.025975
Iteration 32935: Policy loss: 0.001605. Value loss: 4.995844. Entropy: 1.067182.
Iteration 32936: Policy loss: 0.000490. Value loss: 3.556554. Entropy: 1.088929.
Iteration 32937: Policy loss: -0.001195. Value loss: 2.992513. Entropy: 1.082452.
episode: 12386   score: 255.0  epsilon: 1.0    steps: 599  evaluation reward: 322.25
Training network. lr: 0.000065. clip: 0.025975
Iteration 32938: Policy loss: 0.003607. Value loss: 4.202334. Entropy: 1.255080.
Iteration 32939: Policy loss: 0.005141. Value loss: 2.833128. Entropy: 1.255301.
Iteration 32940: Policy loss: 0.004008. Value loss: 2.209094. Entropy: 1.249158.
episode: 12387   score: 555.0  epsilon: 1.0    steps: 126  evaluation reward: 322.15
epi

Training network. lr: 0.000065. clip: 0.025863
Iteration 32998: Policy loss: 0.003835. Value loss: 5.982909. Entropy: 1.177304.
Iteration 32999: Policy loss: 0.006217. Value loss: 4.032313. Entropy: 1.170820.
Iteration 33000: Policy loss: 0.004965. Value loss: 3.568703. Entropy: 1.174790.
episode: 12408   score: 300.0  epsilon: 1.0    steps: 333  evaluation reward: 334.75
Training network. lr: 0.000064. clip: 0.025750
Iteration 33001: Policy loss: 0.004218. Value loss: 4.044554. Entropy: 1.208766.
Iteration 33002: Policy loss: 0.004895. Value loss: 2.994970. Entropy: 1.212528.
Iteration 33003: Policy loss: 0.006350. Value loss: 2.497052. Entropy: 1.205804.
episode: 12409   score: 270.0  epsilon: 1.0    steps: 532  evaluation reward: 335.1
Training network. lr: 0.000064. clip: 0.025750
Iteration 33004: Policy loss: 0.002091. Value loss: 2.656119. Entropy: 1.235502.
Iteration 33005: Policy loss: 0.003405. Value loss: 1.889350. Entropy: 1.262077.
Iteration 33006: Policy loss: 0.002240. Va

Iteration 33063: Policy loss: -0.000343. Value loss: 1.377200. Entropy: 1.205865.
Training network. lr: 0.000064. clip: 0.025638
Iteration 33064: Policy loss: 0.002454. Value loss: 3.699849. Entropy: 1.229681.
Iteration 33065: Policy loss: 0.007334. Value loss: 2.121011. Entropy: 1.197538.
Iteration 33066: Policy loss: 0.004270. Value loss: 1.820227. Entropy: 1.212719.
episode: 12431   score: 335.0  epsilon: 1.0    steps: 894  evaluation reward: 350.4
Training network. lr: 0.000064. clip: 0.025638
Iteration 33067: Policy loss: 0.003606. Value loss: 5.384815. Entropy: 1.260696.
Iteration 33068: Policy loss: 0.001198. Value loss: 3.841344. Entropy: 1.243952.
Iteration 33069: Policy loss: 0.001777. Value loss: 3.044461. Entropy: 1.265020.
episode: 12432   score: 380.0  epsilon: 1.0    steps: 71  evaluation reward: 350.55
episode: 12433   score: 275.0  epsilon: 1.0    steps: 593  evaluation reward: 350.9
Training network. lr: 0.000064. clip: 0.025638
Iteration 33070: Policy loss: 0.001191.

Training network. lr: 0.000064. clip: 0.025525
Iteration 33127: Policy loss: -0.000618. Value loss: 3.790052. Entropy: 1.081876.
Iteration 33128: Policy loss: -0.000472. Value loss: 2.676671. Entropy: 1.060414.
Iteration 33129: Policy loss: 0.001088. Value loss: 2.380086. Entropy: 1.083160.
episode: 12455   score: 480.0  epsilon: 1.0    steps: 652  evaluation reward: 351.55
Training network. lr: 0.000064. clip: 0.025525
Iteration 33130: Policy loss: 0.003994. Value loss: 5.497814. Entropy: 1.088616.
Iteration 33131: Policy loss: 0.005190. Value loss: 4.060951. Entropy: 1.076249.
Iteration 33132: Policy loss: 0.005779. Value loss: 4.314273. Entropy: 1.080678.
Training network. lr: 0.000064. clip: 0.025525
Iteration 33133: Policy loss: 0.001255. Value loss: 3.662541. Entropy: 1.246116.
Iteration 33134: Policy loss: 0.001446. Value loss: 2.441255. Entropy: 1.278905.
Iteration 33135: Policy loss: 0.001366. Value loss: 1.977264. Entropy: 1.252761.
episode: 12456   score: 500.0  epsilon: 1.0

Iteration 33192: Policy loss: 0.002003. Value loss: 1.844250. Entropy: 1.037743.
episode: 12478   score: 155.0  epsilon: 1.0    steps: 29  evaluation reward: 353.75
episode: 12479   score: 210.0  epsilon: 1.0    steps: 264  evaluation reward: 352.45
episode: 12480   score: 365.0  epsilon: 1.0    steps: 802  evaluation reward: 353.0
Training network. lr: 0.000064. clip: 0.025413
Iteration 33193: Policy loss: -0.001245. Value loss: 3.269381. Entropy: 1.059482.
Iteration 33194: Policy loss: 0.003319. Value loss: 2.667495. Entropy: 1.058993.
Iteration 33195: Policy loss: 0.000634. Value loss: 2.340883. Entropy: 1.051844.
episode: 12481   score: 470.0  epsilon: 1.0    steps: 967  evaluation reward: 354.55
Training network. lr: 0.000064. clip: 0.025413
Iteration 33196: Policy loss: 0.003821. Value loss: 8.858761. Entropy: 1.194883.
Iteration 33197: Policy loss: 0.009201. Value loss: 6.064241. Entropy: 1.203552.
Iteration 33198: Policy loss: 0.015839. Value loss: 4.896817. Entropy: 1.204642.


Iteration 33254: Policy loss: 0.012624. Value loss: 4.022137. Entropy: 1.005139.
Iteration 33255: Policy loss: 0.009450. Value loss: 3.031794. Entropy: 1.025097.
episode: 12504   score: 155.0  epsilon: 1.0    steps: 5  evaluation reward: 337.65
episode: 12505   score: 505.0  epsilon: 1.0    steps: 505  evaluation reward: 335.75
episode: 12506   score: 315.0  epsilon: 1.0    steps: 520  evaluation reward: 338.05
Training network. lr: 0.000063. clip: 0.025188
Iteration 33256: Policy loss: 0.004724. Value loss: 3.535846. Entropy: 0.998261.
Iteration 33257: Policy loss: 0.004211. Value loss: 2.593768. Entropy: 0.986531.
Iteration 33258: Policy loss: 0.005334. Value loss: 2.366107. Entropy: 0.995768.
episode: 12507   score: 495.0  epsilon: 1.0    steps: 240  evaluation reward: 337.3
Training network. lr: 0.000063. clip: 0.025188
Iteration 33259: Policy loss: 0.003480. Value loss: 3.870123. Entropy: 1.065098.
Iteration 33260: Policy loss: 0.004587. Value loss: 3.069632. Entropy: 1.066604.
It

Iteration 33318: Policy loss: 0.006040. Value loss: 6.097290. Entropy: 1.114823.
episode: 12528   score: 760.0  epsilon: 1.0    steps: 583  evaluation reward: 339.3
episode: 12529   score: 255.0  epsilon: 1.0    steps: 823  evaluation reward: 344.65
episode: 12530   score: 620.0  epsilon: 1.0    steps: 950  evaluation reward: 343.55
Training network. lr: 0.000063. clip: 0.025075
Iteration 33319: Policy loss: 0.001910. Value loss: 3.857870. Entropy: 1.135509.
Iteration 33320: Policy loss: 0.004773. Value loss: 2.661272. Entropy: 1.125546.
Iteration 33321: Policy loss: 0.006289. Value loss: 2.039537. Entropy: 1.143064.
episode: 12531   score: 665.0  epsilon: 1.0    steps: 457  evaluation reward: 343.7
Training network. lr: 0.000063. clip: 0.025075
Iteration 33322: Policy loss: 0.002466. Value loss: 3.368345. Entropy: 1.158904.
Iteration 33323: Policy loss: 0.002439. Value loss: 2.580102. Entropy: 1.154009.
Iteration 33324: Policy loss: 0.001199. Value loss: 2.105023. Entropy: 1.154264.
T

Iteration 33383: Policy loss: 0.003097. Value loss: 4.503724. Entropy: 1.054282.
Iteration 33384: Policy loss: 0.005517. Value loss: 3.816552. Entropy: 1.063075.
now time :  2019-02-23 11:06:40.981735
episode: 12551   score: 655.0  epsilon: 1.0    steps: 334  evaluation reward: 366.15
Training network. lr: 0.000062. clip: 0.024963
Iteration 33385: Policy loss: 0.004449. Value loss: 5.338065. Entropy: 1.229425.
Iteration 33386: Policy loss: 0.009399. Value loss: 3.042770. Entropy: 1.226428.
Iteration 33387: Policy loss: 0.007464. Value loss: 2.950799. Entropy: 1.220241.
Training network. lr: 0.000062. clip: 0.024963
Iteration 33388: Policy loss: 0.002618. Value loss: 2.881326. Entropy: 1.078228.
Iteration 33389: Policy loss: 0.006039. Value loss: 1.940037. Entropy: 1.054325.
Iteration 33390: Policy loss: 0.003292. Value loss: 1.503680. Entropy: 1.063344.
Training network. lr: 0.000062. clip: 0.024963
Iteration 33391: Policy loss: 0.004141. Value loss: 6.550535. Entropy: 1.248263.
Iterat

episode: 12574   score: 155.0  epsilon: 1.0    steps: 837  evaluation reward: 371.85
Training network. lr: 0.000062. clip: 0.024850
Iteration 33448: Policy loss: 0.003337. Value loss: 5.125115. Entropy: 1.056783.
Iteration 33449: Policy loss: 0.001324. Value loss: 4.847893. Entropy: 1.069683.
Iteration 33450: Policy loss: 0.001575. Value loss: 3.949740. Entropy: 1.061697.
Training network. lr: 0.000062. clip: 0.024738
Iteration 33451: Policy loss: 0.003312. Value loss: 3.437949. Entropy: 1.109846.
Iteration 33452: Policy loss: 0.007347. Value loss: 2.090229. Entropy: 1.090164.
Iteration 33453: Policy loss: 0.004051. Value loss: 1.736539. Entropy: 1.100546.
episode: 12575   score: 240.0  epsilon: 1.0    steps: 419  evaluation reward: 369.15
episode: 12576   score: 775.0  epsilon: 1.0    steps: 513  evaluation reward: 368.6
Training network. lr: 0.000062. clip: 0.024738
Iteration 33454: Policy loss: 0.003179. Value loss: 3.044629. Entropy: 1.064426.
Iteration 33455: Policy loss: 0.002722

Iteration 33514: Policy loss: 0.001546. Value loss: 5.141396. Entropy: 1.150965.
Iteration 33515: Policy loss: 0.001292. Value loss: 3.583956. Entropy: 1.135925.
Iteration 33516: Policy loss: 0.004038. Value loss: 3.366383. Entropy: 1.137316.
episode: 12595   score: 215.0  epsilon: 1.0    steps: 810  evaluation reward: 386.8
Training network. lr: 0.000062. clip: 0.024625
Iteration 33517: Policy loss: -0.000114. Value loss: 4.431026. Entropy: 1.146555.
Iteration 33518: Policy loss: 0.004077. Value loss: 2.439176. Entropy: 1.158107.
Iteration 33519: Policy loss: 0.000385. Value loss: 1.935144. Entropy: 1.166066.
episode: 12596   score: 185.0  epsilon: 1.0    steps: 18  evaluation reward: 388.15
episode: 12597   score: 555.0  epsilon: 1.0    steps: 652  evaluation reward: 384.35
Training network. lr: 0.000062. clip: 0.024625
Iteration 33520: Policy loss: 0.001984. Value loss: 2.942035. Entropy: 1.208045.
Iteration 33521: Policy loss: 0.003157. Value loss: 2.077231. Entropy: 1.196444.
Iter

Iteration 33578: Policy loss: 0.003672. Value loss: 5.102238. Entropy: 1.183575.
Iteration 33579: Policy loss: 0.006230. Value loss: 3.478460. Entropy: 1.163460.
episode: 12619   score: 465.0  epsilon: 1.0    steps: 175  evaluation reward: 381.4
episode: 12620   score: 245.0  epsilon: 1.0    steps: 495  evaluation reward: 382.3
Training network. lr: 0.000061. clip: 0.024513
Iteration 33580: Policy loss: 0.002001. Value loss: 4.764921. Entropy: 1.192318.
Iteration 33581: Policy loss: 0.001267. Value loss: 3.489985. Entropy: 1.155626.
Iteration 33582: Policy loss: -0.002262. Value loss: 2.948390. Entropy: 1.177679.
episode: 12621   score: 210.0  epsilon: 1.0    steps: 640  evaluation reward: 380.85
episode: 12622   score: 210.0  epsilon: 1.0    steps: 692  evaluation reward: 380.2
Training network. lr: 0.000061. clip: 0.024513
Iteration 33583: Policy loss: 0.001451. Value loss: 4.229367. Entropy: 1.109652.
Iteration 33584: Policy loss: 0.002223. Value loss: 2.898079. Entropy: 1.093976.
I

Iteration 33640: Policy loss: 0.002348. Value loss: 6.628022. Entropy: 1.096090.
Iteration 33641: Policy loss: 0.004023. Value loss: 5.231472. Entropy: 1.089609.
Iteration 33642: Policy loss: 0.002524. Value loss: 4.394954. Entropy: 1.097982.
Training network. lr: 0.000061. clip: 0.024400
Iteration 33643: Policy loss: 0.001811. Value loss: 3.691838. Entropy: 1.303447.
Iteration 33644: Policy loss: 0.004708. Value loss: 2.555876. Entropy: 1.306411.
Iteration 33645: Policy loss: 0.001353. Value loss: 2.117920. Entropy: 1.307968.
episode: 12645   score: 290.0  epsilon: 1.0    steps: 406  evaluation reward: 348.55
episode: 12646   score: 185.0  epsilon: 1.0    steps: 832  evaluation reward: 344.75
Training network. lr: 0.000061. clip: 0.024400
Iteration 33646: Policy loss: 0.001827. Value loss: 1.837033. Entropy: 1.248330.
Iteration 33647: Policy loss: 0.001983. Value loss: 1.433782. Entropy: 1.253713.
Iteration 33648: Policy loss: 0.000582. Value loss: 1.136222. Entropy: 1.264627.
Trainin

Iteration 33704: Policy loss: 0.004096. Value loss: 3.339457. Entropy: 1.072097.
Iteration 33705: Policy loss: 0.003292. Value loss: 2.671712. Entropy: 1.067415.
episode: 12669   score: 530.0  epsilon: 1.0    steps: 143  evaluation reward: 320.3
Training network. lr: 0.000060. clip: 0.024175
Iteration 33706: Policy loss: 0.001833. Value loss: 4.226707. Entropy: 1.004504.
Iteration 33707: Policy loss: 0.003369. Value loss: 3.245560. Entropy: 1.001735.
Iteration 33708: Policy loss: 0.001503. Value loss: 2.952622. Entropy: 1.002402.
episode: 12670   score: 185.0  epsilon: 1.0    steps: 639  evaluation reward: 323.0
Training network. lr: 0.000060. clip: 0.024175
Iteration 33709: Policy loss: -0.000375. Value loss: 3.542034. Entropy: 1.229135.
Iteration 33710: Policy loss: 0.003590. Value loss: 2.371022. Entropy: 1.227891.
Iteration 33711: Policy loss: 0.000737. Value loss: 1.765036. Entropy: 1.236150.
Training network. lr: 0.000060. clip: 0.024175
Iteration 33712: Policy loss: 0.003513. Va

episode: 12692   score: 360.0  epsilon: 1.0    steps: 806  evaluation reward: 313.15
Training network. lr: 0.000060. clip: 0.024063
Iteration 33769: Policy loss: 0.001084. Value loss: 4.625842. Entropy: 1.179137.
Iteration 33770: Policy loss: 0.002414. Value loss: 3.371418. Entropy: 1.166163.
Iteration 33771: Policy loss: 0.003031. Value loss: 2.886619. Entropy: 1.175739.
episode: 12693   score: 230.0  epsilon: 1.0    steps: 644  evaluation reward: 313.55
Training network. lr: 0.000060. clip: 0.024063
Iteration 33772: Policy loss: 0.003524. Value loss: 5.741993. Entropy: 1.187793.
Iteration 33773: Policy loss: 0.008171. Value loss: 3.879917. Entropy: 1.175304.
Iteration 33774: Policy loss: 0.006755. Value loss: 3.168609. Entropy: 1.182909.
episode: 12694   score: 180.0  epsilon: 1.0    steps: 559  evaluation reward: 312.5
episode: 12695   score: 225.0  epsilon: 1.0    steps: 935  evaluation reward: 308.95
Training network. lr: 0.000060. clip: 0.024063
Iteration 33775: Policy loss: 0.00

Training network. lr: 0.000060. clip: 0.023950
Iteration 33832: Policy loss: 0.002582. Value loss: 4.811219. Entropy: 1.209618.
Iteration 33833: Policy loss: 0.005967. Value loss: 3.463119. Entropy: 1.204144.
Iteration 33834: Policy loss: 0.007264. Value loss: 1.805903. Entropy: 1.213202.
Training network. lr: 0.000060. clip: 0.023950
Iteration 33835: Policy loss: 0.000586. Value loss: 2.443598. Entropy: 1.288755.
Iteration 33836: Policy loss: 0.000543. Value loss: 1.979293. Entropy: 1.289294.
Iteration 33837: Policy loss: 0.000695. Value loss: 1.449914. Entropy: 1.279274.
episode: 12717   score: 575.0  epsilon: 1.0    steps: 249  evaluation reward: 315.5
episode: 12718   score: 305.0  epsilon: 1.0    steps: 760  evaluation reward: 318.8
Training network. lr: 0.000060. clip: 0.023950
Iteration 33838: Policy loss: 0.001929. Value loss: 5.876278. Entropy: 1.359588.
Iteration 33839: Policy loss: 0.003924. Value loss: 4.516353. Entropy: 1.349327.
Iteration 33840: Policy loss: 0.006171. Val

episode: 12739   score: 340.0  epsilon: 1.0    steps: 359  evaluation reward: 334.9
episode: 12740   score: 230.0  epsilon: 1.0    steps: 785  evaluation reward: 335.9
Training network. lr: 0.000060. clip: 0.023838
Iteration 33898: Policy loss: 0.001469. Value loss: 1.913783. Entropy: 1.192867.
Iteration 33899: Policy loss: 0.002101. Value loss: 1.409504. Entropy: 1.197343.
Iteration 33900: Policy loss: 0.003285. Value loss: 1.158173. Entropy: 1.182761.
episode: 12741   score: 165.0  epsilon: 1.0    steps: 719  evaluation reward: 336.4
Training network. lr: 0.000059. clip: 0.023725
Iteration 33901: Policy loss: 0.004073. Value loss: 5.672970. Entropy: 1.156502.
Iteration 33902: Policy loss: 0.004414. Value loss: 4.387005. Entropy: 1.158551.
Iteration 33903: Policy loss: 0.007781. Value loss: 3.711474. Entropy: 1.158794.
episode: 12742   score: 295.0  epsilon: 1.0    steps: 207  evaluation reward: 331.15
episode: 12743   score: 195.0  epsilon: 1.0    steps: 400  evaluation reward: 332.0

Iteration 33961: Policy loss: 0.001478. Value loss: 4.248141. Entropy: 1.242185.
Iteration 33962: Policy loss: 0.000769. Value loss: 3.274929. Entropy: 1.244682.
Iteration 33963: Policy loss: 0.001511. Value loss: 2.641920. Entropy: 1.246770.
episode: 12764   score: 375.0  epsilon: 1.0    steps: 366  evaluation reward: 335.85
episode: 12765   score: 170.0  epsilon: 1.0    steps: 830  evaluation reward: 336.8
Training network. lr: 0.000059. clip: 0.023613
Iteration 33964: Policy loss: 0.001948. Value loss: 6.637195. Entropy: 1.160458.
Iteration 33965: Policy loss: 0.006244. Value loss: 4.636086. Entropy: 1.131023.
Iteration 33966: Policy loss: 0.004004. Value loss: 4.071915. Entropy: 1.162654.
episode: 12766   score: 535.0  epsilon: 1.0    steps: 664  evaluation reward: 334.95
Training network. lr: 0.000059. clip: 0.023613
Iteration 33967: Policy loss: 0.001598. Value loss: 3.990437. Entropy: 0.884657.
Iteration 33968: Policy loss: 0.000095. Value loss: 3.664787. Entropy: 0.903817.
Iter

Iteration 34024: Policy loss: 0.001423. Value loss: 5.870284. Entropy: 1.057130.
Iteration 34025: Policy loss: 0.002453. Value loss: 4.626364. Entropy: 1.083454.
Iteration 34026: Policy loss: 0.003483. Value loss: 3.600277. Entropy: 1.044506.
episode: 12789   score: 340.0  epsilon: 1.0    steps: 489  evaluation reward: 323.1
Training network. lr: 0.000059. clip: 0.023500
Iteration 34027: Policy loss: 0.001786. Value loss: 4.606967. Entropy: 1.023453.
Iteration 34028: Policy loss: 0.003220. Value loss: 3.540778. Entropy: 1.045915.
Iteration 34029: Policy loss: -0.000255. Value loss: 2.867576. Entropy: 1.013375.
Training network. lr: 0.000059. clip: 0.023500
Iteration 34030: Policy loss: 0.001889. Value loss: 6.072585. Entropy: 1.077224.
Iteration 34031: Policy loss: 0.001538. Value loss: 4.273397. Entropy: 1.064057.
Iteration 34032: Policy loss: 0.001733. Value loss: 3.225711. Entropy: 1.073398.
Training network. lr: 0.000059. clip: 0.023500
Iteration 34033: Policy loss: 0.002481. Value

Iteration 34088: Policy loss: 0.004040. Value loss: 1.407879. Entropy: 1.054608.
Iteration 34089: Policy loss: 0.002906. Value loss: 1.196227. Entropy: 1.057615.
episode: 12813   score: 220.0  epsilon: 1.0    steps: 285  evaluation reward: 321.9
Training network. lr: 0.000058. clip: 0.023388
Iteration 34090: Policy loss: 0.003434. Value loss: 5.463272. Entropy: 1.035246.
Iteration 34091: Policy loss: 0.008905. Value loss: 4.103642. Entropy: 1.037142.
Iteration 34092: Policy loss: 0.003782. Value loss: 3.528707. Entropy: 1.039266.
episode: 12814   score: 315.0  epsilon: 1.0    steps: 700  evaluation reward: 320.3
episode: 12815   score: 330.0  epsilon: 1.0    steps: 875  evaluation reward: 320.9
Training network. lr: 0.000058. clip: 0.023388
Iteration 34093: Policy loss: 0.003810. Value loss: 5.018399. Entropy: 1.052135.
Iteration 34094: Policy loss: 0.002605. Value loss: 3.277689. Entropy: 1.054455.
Iteration 34095: Policy loss: 0.007106. Value loss: 2.508372. Entropy: 1.055229.
Traini

Iteration 34151: Policy loss: 0.004119. Value loss: 2.421368. Entropy: 1.083784.
Iteration 34152: Policy loss: 0.005400. Value loss: 1.919209. Entropy: 1.058143.
Training network. lr: 0.000058. clip: 0.023163
Iteration 34153: Policy loss: 0.002224. Value loss: 5.314233. Entropy: 1.245349.
Iteration 34154: Policy loss: 0.006218. Value loss: 4.204492. Entropy: 1.265883.
Iteration 34155: Policy loss: 0.010797. Value loss: 3.291358. Entropy: 1.266557.
episode: 12838   score: 155.0  epsilon: 1.0    steps: 257  evaluation reward: 301.4
episode: 12839   score: 260.0  epsilon: 1.0    steps: 802  evaluation reward: 297.15
Training network. lr: 0.000058. clip: 0.023163
Iteration 34156: Policy loss: 0.004641. Value loss: 3.552868. Entropy: 1.045388.
Iteration 34157: Policy loss: 0.004823. Value loss: 2.541003. Entropy: 1.037120.
Iteration 34158: Policy loss: 0.002338. Value loss: 2.081601. Entropy: 1.031424.
episode: 12840   score: 415.0  epsilon: 1.0    steps: 488  evaluation reward: 296.35
Trai

Iteration 34214: Policy loss: 0.001578. Value loss: 2.423541. Entropy: 1.035564.
Iteration 34215: Policy loss: 0.002741. Value loss: 2.026480. Entropy: 1.034693.
episode: 12863   score: 225.0  epsilon: 1.0    steps: 112  evaluation reward: 309.85
Training network. lr: 0.000058. clip: 0.023050
Iteration 34216: Policy loss: 0.001858. Value loss: 3.476628. Entropy: 1.111963.
Iteration 34217: Policy loss: 0.005674. Value loss: 2.378113. Entropy: 1.106672.
Iteration 34218: Policy loss: 0.006325. Value loss: 2.240278. Entropy: 1.110172.
Training network. lr: 0.000058. clip: 0.023050
Iteration 34219: Policy loss: 0.002940. Value loss: 3.296907. Entropy: 1.035111.
Iteration 34220: Policy loss: 0.002080. Value loss: 2.116405. Entropy: 1.050341.
Iteration 34221: Policy loss: 0.001229. Value loss: 1.744447. Entropy: 1.029724.
episode: 12864   score: 330.0  epsilon: 1.0    steps: 604  evaluation reward: 308.9
episode: 12865   score: 315.0  epsilon: 1.0    steps: 881  evaluation reward: 308.45
Trai

Iteration 34277: Policy loss: 0.000018. Value loss: 3.000802. Entropy: 1.150822.
Iteration 34278: Policy loss: 0.003382. Value loss: 2.613430. Entropy: 1.147187.
episode: 12888   score: 390.0  epsilon: 1.0    steps: 801  evaluation reward: 307.25
Training network. lr: 0.000057. clip: 0.022937
Iteration 34279: Policy loss: 0.001325. Value loss: 3.579775. Entropy: 1.243725.
Iteration 34280: Policy loss: 0.001152. Value loss: 2.555331. Entropy: 1.236835.
Iteration 34281: Policy loss: 0.002951. Value loss: 2.151278. Entropy: 1.242450.
Training network. lr: 0.000057. clip: 0.022937
Iteration 34282: Policy loss: 0.003334. Value loss: 6.513633. Entropy: 1.178521.
Iteration 34283: Policy loss: 0.005113. Value loss: 5.086147. Entropy: 1.181840.
Iteration 34284: Policy loss: 0.005293. Value loss: 4.365797. Entropy: 1.188362.
episode: 12889   score: 315.0  epsilon: 1.0    steps: 132  evaluation reward: 305.95
Training network. lr: 0.000057. clip: 0.022937
Iteration 34285: Policy loss: 0.003019. V

Iteration 34341: Policy loss: 0.005636. Value loss: 4.834935. Entropy: 1.205752.
episode: 12912   score: 285.0  epsilon: 1.0    steps: 886  evaluation reward: 303.55
episode: 12913   score: 345.0  epsilon: 1.0    steps: 972  evaluation reward: 303.55
Training network. lr: 0.000057. clip: 0.022825
Iteration 34342: Policy loss: 0.000404. Value loss: 3.927525. Entropy: 1.173373.
Iteration 34343: Policy loss: 0.000925. Value loss: 2.874913. Entropy: 1.156873.
Iteration 34344: Policy loss: 0.000731. Value loss: 2.113821. Entropy: 1.177396.
Training network. lr: 0.000057. clip: 0.022825
Iteration 34345: Policy loss: 0.001036. Value loss: 4.908978. Entropy: 1.243423.
Iteration 34346: Policy loss: 0.003341. Value loss: 4.090321. Entropy: 1.241647.
Iteration 34347: Policy loss: 0.003085. Value loss: 3.056796. Entropy: 1.260180.
Training network. lr: 0.000057. clip: 0.022825
Iteration 34348: Policy loss: 0.002796. Value loss: 6.489005. Entropy: 1.242316.
Iteration 34349: Policy loss: 0.003919. V

Iteration 34402: Policy loss: 0.001119. Value loss: 3.769684. Entropy: 1.094505.
Iteration 34403: Policy loss: 0.007425. Value loss: 2.514569. Entropy: 1.091787.
Iteration 34404: Policy loss: 0.007128. Value loss: 1.933353. Entropy: 1.070653.
Training network. lr: 0.000057. clip: 0.022600
Iteration 34405: Policy loss: 0.003799. Value loss: 7.362671. Entropy: 1.126772.
Iteration 34406: Policy loss: 0.006394. Value loss: 5.199449. Entropy: 1.104663.
Iteration 34407: Policy loss: 0.006814. Value loss: 4.744705. Entropy: 1.093210.
episode: 12939   score: 515.0  epsilon: 1.0    steps: 253  evaluation reward: 303.9
episode: 12940   score: 445.0  epsilon: 1.0    steps: 384  evaluation reward: 306.45
episode: 12941   score: 250.0  epsilon: 1.0    steps: 534  evaluation reward: 306.75
Training network. lr: 0.000057. clip: 0.022600
Iteration 34408: Policy loss: 0.001768. Value loss: 3.183638. Entropy: 1.051095.
Iteration 34409: Policy loss: 0.002658. Value loss: 2.278893. Entropy: 1.052532.
Iter

Iteration 34467: Policy loss: 0.002946. Value loss: 3.216646. Entropy: 1.238732.
episode: 12962   score: 225.0  epsilon: 1.0    steps: 650  evaluation reward: 298.75
Training network. lr: 0.000056. clip: 0.022488
Iteration 34468: Policy loss: 0.001323. Value loss: 9.197399. Entropy: 1.130505.
Iteration 34469: Policy loss: 0.002961. Value loss: 6.267999. Entropy: 1.124826.
Iteration 34470: Policy loss: -0.000060. Value loss: 5.335645. Entropy: 1.125362.
episode: 12963   score: 345.0  epsilon: 1.0    steps: 603  evaluation reward: 298.15
Training network. lr: 0.000056. clip: 0.022488
Iteration 34471: Policy loss: 0.002727. Value loss: 6.632037. Entropy: 1.239276.
Iteration 34472: Policy loss: 0.005652. Value loss: 4.986685. Entropy: 1.216303.
Iteration 34473: Policy loss: 0.003930. Value loss: 4.247537. Entropy: 1.214719.
episode: 12964   score: 380.0  epsilon: 1.0    steps: 90  evaluation reward: 299.35
episode: 12965   score: 345.0  epsilon: 1.0    steps: 312  evaluation reward: 299.85

Training network. lr: 0.000056. clip: 0.022375
Iteration 34531: Policy loss: 0.001713. Value loss: 6.623026. Entropy: 1.080117.
Iteration 34532: Policy loss: 0.002921. Value loss: 4.955997. Entropy: 1.083258.
Iteration 34533: Policy loss: 0.005637. Value loss: 4.154135. Entropy: 1.081296.
Training network. lr: 0.000056. clip: 0.022375
Iteration 34534: Policy loss: 0.001332. Value loss: 6.125779. Entropy: 1.228688.
Iteration 34535: Policy loss: 0.004237. Value loss: 4.440810. Entropy: 1.232383.
Iteration 34536: Policy loss: 0.007772. Value loss: 4.131789. Entropy: 1.231631.
episode: 12987   score: 210.0  epsilon: 1.0    steps: 549  evaluation reward: 314.45
Training network. lr: 0.000056. clip: 0.022375
Iteration 34537: Policy loss: 0.003323. Value loss: 3.290149. Entropy: 1.233227.
Iteration 34538: Policy loss: 0.004684. Value loss: 2.099050. Entropy: 1.216262.
Iteration 34539: Policy loss: 0.005112. Value loss: 1.764170. Entropy: 1.242324.
episode: 12988   score: 215.0  epsilon: 1.0  

Iteration 34596: Policy loss: 0.001465. Value loss: 2.854747. Entropy: 1.047958.
episode: 13009   score: 285.0  epsilon: 1.0    steps: 106  evaluation reward: 324.0
episode: 13010   score: 290.0  epsilon: 1.0    steps: 335  evaluation reward: 323.7
episode: 13011   score: 190.0  epsilon: 1.0    steps: 1011  evaluation reward: 323.75
Training network. lr: 0.000056. clip: 0.022263
Iteration 34597: Policy loss: -0.000200. Value loss: 5.412881. Entropy: 0.937756.
Iteration 34598: Policy loss: -0.000064. Value loss: 4.427145. Entropy: 0.924564.
Iteration 34599: Policy loss: 0.000100. Value loss: 3.921883. Entropy: 0.930088.
Training network. lr: 0.000056. clip: 0.022263
Iteration 34600: Policy loss: 0.002415. Value loss: 3.586339. Entropy: 1.035543.
Iteration 34601: Policy loss: 0.002109. Value loss: 2.831505. Entropy: 1.040350.
Iteration 34602: Policy loss: 0.001865. Value loss: 2.404502. Entropy: 1.041316.
episode: 13012   score: 100.0  epsilon: 1.0    steps: 435  evaluation reward: 320.7

episode: 13033   score: 375.0  epsilon: 1.0    steps: 166  evaluation reward: 333.15
Training network. lr: 0.000055. clip: 0.022038
Iteration 34660: Policy loss: 0.000406. Value loss: 4.620499. Entropy: 1.113080.
Iteration 34661: Policy loss: 0.000463. Value loss: 3.350839. Entropy: 1.121141.
Iteration 34662: Policy loss: 0.001998. Value loss: 2.892621. Entropy: 1.127452.
Training network. lr: 0.000055. clip: 0.022038
Iteration 34663: Policy loss: 0.002989. Value loss: 4.355869. Entropy: 1.199036.
Iteration 34664: Policy loss: 0.002207. Value loss: 2.839684. Entropy: 1.219479.
Iteration 34665: Policy loss: -0.000098. Value loss: 2.478857. Entropy: 1.203217.
episode: 13034   score: 295.0  epsilon: 1.0    steps: 464  evaluation reward: 335.05
episode: 13035   score: 305.0  epsilon: 1.0    steps: 903  evaluation reward: 336.45
Training network. lr: 0.000055. clip: 0.022038
Iteration 34666: Policy loss: 0.002481. Value loss: 3.264162. Entropy: 1.112305.
Iteration 34667: Policy loss: 0.0017

Iteration 34724: Policy loss: 0.006171. Value loss: 2.303982. Entropy: 1.087354.
Iteration 34725: Policy loss: 0.006142. Value loss: 1.950360. Entropy: 1.086231.
episode: 13056   score: 600.0  epsilon: 1.0    steps: 474  evaluation reward: 350.05
Training network. lr: 0.000055. clip: 0.021925
Iteration 34726: Policy loss: 0.001923. Value loss: 3.924837. Entropy: 1.094061.
Iteration 34727: Policy loss: 0.003532. Value loss: 2.787784. Entropy: 1.109319.
Iteration 34728: Policy loss: 0.005472. Value loss: 2.320346. Entropy: 1.098322.
episode: 13057   score: 210.0  epsilon: 1.0    steps: 36  evaluation reward: 352.15
Training network. lr: 0.000055. clip: 0.021925
Iteration 34729: Policy loss: 0.001449. Value loss: 2.714201. Entropy: 1.188244.
Iteration 34730: Policy loss: -0.000171. Value loss: 2.142579. Entropy: 1.195202.
Iteration 34731: Policy loss: 0.001581. Value loss: 1.904731. Entropy: 1.205177.
Training network. lr: 0.000055. clip: 0.021925
Iteration 34732: Policy loss: 0.004479. V

Iteration 34786: Policy loss: 0.003236. Value loss: 3.457374. Entropy: 1.181808.
Iteration 34787: Policy loss: 0.003429. Value loss: 2.664928. Entropy: 1.184579.
Iteration 34788: Policy loss: 0.002253. Value loss: 2.453118. Entropy: 1.189400.
Training network. lr: 0.000055. clip: 0.021813
Iteration 34789: Policy loss: 0.000919. Value loss: 3.570514. Entropy: 1.203683.
Iteration 34790: Policy loss: 0.002429. Value loss: 2.437035. Entropy: 1.214294.
Iteration 34791: Policy loss: 0.004163. Value loss: 2.039384. Entropy: 1.201227.
episode: 13082   score: 260.0  epsilon: 1.0    steps: 7  evaluation reward: 344.25
Training network. lr: 0.000055. clip: 0.021813
Iteration 34792: Policy loss: 0.001003. Value loss: 5.856792. Entropy: 1.193769.
Iteration 34793: Policy loss: 0.004559. Value loss: 3.697963. Entropy: 1.177832.
Iteration 34794: Policy loss: 0.003825. Value loss: 3.734968. Entropy: 1.182577.
Training network. lr: 0.000055. clip: 0.021813
Iteration 34795: Policy loss: -0.000146. Value 

episode: 13104   score: 290.0  epsilon: 1.0    steps: 752  evaluation reward: 364.7
Training network. lr: 0.000054. clip: 0.021588
Iteration 34852: Policy loss: 0.003193. Value loss: 6.724369. Entropy: 1.115550.
Iteration 34853: Policy loss: 0.002661. Value loss: 5.293622. Entropy: 1.104491.
Iteration 34854: Policy loss: 0.002849. Value loss: 4.388647. Entropy: 1.127575.
episode: 13105   score: 330.0  epsilon: 1.0    steps: 408  evaluation reward: 362.6
Training network. lr: 0.000054. clip: 0.021588
Iteration 34855: Policy loss: 0.000873. Value loss: 4.203455. Entropy: 1.140897.
Iteration 34856: Policy loss: 0.000238. Value loss: 3.516296. Entropy: 1.169622.
Iteration 34857: Policy loss: 0.000061. Value loss: 2.920470. Entropy: 1.142048.
Training network. lr: 0.000054. clip: 0.021588
Iteration 34858: Policy loss: 0.001407. Value loss: 3.501471. Entropy: 1.174254.
Iteration 34859: Policy loss: 0.001592. Value loss: 2.453286. Entropy: 1.166508.
Iteration 34860: Policy loss: 0.001641. Val

episode: 13129   score: 260.0  epsilon: 1.0    steps: 972  evaluation reward: 358.15
Training network. lr: 0.000054. clip: 0.021475
Iteration 34915: Policy loss: -0.000056. Value loss: 6.613806. Entropy: 0.978172.
Iteration 34916: Policy loss: 0.002073. Value loss: 4.368144. Entropy: 0.971367.
Iteration 34917: Policy loss: 0.002093. Value loss: 3.527599. Entropy: 1.000982.
Training network. lr: 0.000054. clip: 0.021475
Iteration 34918: Policy loss: 0.004992. Value loss: 4.423460. Entropy: 1.007896.
Iteration 34919: Policy loss: 0.006706. Value loss: 2.651450. Entropy: 0.988345.
Iteration 34920: Policy loss: 0.005903. Value loss: 2.178329. Entropy: 0.985458.
Training network. lr: 0.000054. clip: 0.021475
Iteration 34921: Policy loss: 0.002494. Value loss: 6.223121. Entropy: 1.215793.
Iteration 34922: Policy loss: 0.003409. Value loss: 4.928393. Entropy: 1.218069.
Iteration 34923: Policy loss: 0.006359. Value loss: 3.872187. Entropy: 1.224965.
Training network. lr: 0.000054. clip: 0.0214

now time :  2019-02-23 11:39:32.223844
episode: 13151   score: 380.0  epsilon: 1.0    steps: 952  evaluation reward: 374.4
Training network. lr: 0.000053. clip: 0.021362
Iteration 34981: Policy loss: 0.006863. Value loss: 5.535109. Entropy: 1.144527.
Iteration 34982: Policy loss: 0.009433. Value loss: 3.693465. Entropy: 1.161988.
Iteration 34983: Policy loss: 0.007078. Value loss: 2.930965. Entropy: 1.151142.
episode: 13152   score: 290.0  epsilon: 1.0    steps: 814  evaluation reward: 375.35
Training network. lr: 0.000053. clip: 0.021362
Iteration 34984: Policy loss: 0.001059. Value loss: 2.251919. Entropy: 1.006167.
Iteration 34985: Policy loss: 0.000818. Value loss: 1.657657. Entropy: 1.020986.
Iteration 34986: Policy loss: 0.000070. Value loss: 1.665532. Entropy: 1.007095.
Training network. lr: 0.000053. clip: 0.021362
Iteration 34987: Policy loss: 0.000579. Value loss: 2.499879. Entropy: 0.932870.
Iteration 34988: Policy loss: 0.001334. Value loss: 2.104309. Entropy: 0.926360.
Ite

Iteration 35045: Policy loss: 0.003696. Value loss: 2.516736. Entropy: 1.187456.
Iteration 35046: Policy loss: 0.001898. Value loss: 2.368403. Entropy: 1.184942.
Training network. lr: 0.000053. clip: 0.021250
Iteration 35047: Policy loss: 0.001351. Value loss: 3.173778. Entropy: 1.263329.
Iteration 35048: Policy loss: 0.002389. Value loss: 2.263999. Entropy: 1.275729.
Iteration 35049: Policy loss: 0.004892. Value loss: 1.598715. Entropy: 1.256591.
episode: 13174   score: 375.0  epsilon: 1.0    steps: 240  evaluation reward: 356.7
Training network. lr: 0.000053. clip: 0.021250
Iteration 35050: Policy loss: 0.002296. Value loss: 5.945897. Entropy: 1.275810.
Iteration 35051: Policy loss: 0.003840. Value loss: 5.353001. Entropy: 1.254895.
Iteration 35052: Policy loss: 0.002317. Value loss: 4.483696. Entropy: 1.262123.
episode: 13175   score: 220.0  epsilon: 1.0    steps: 257  evaluation reward: 356.75
Training network. lr: 0.000053. clip: 0.021138
Iteration 35053: Policy loss: 0.004104. Va

Iteration 35108: Policy loss: 0.010670. Value loss: 3.897601. Entropy: 1.086575.
Iteration 35109: Policy loss: 0.008795. Value loss: 2.683493. Entropy: 1.058005.
episode: 13199   score: 215.0  epsilon: 1.0    steps: 840  evaluation reward: 361.95
Training network. lr: 0.000053. clip: 0.021025
Iteration 35110: Policy loss: 0.001046. Value loss: 3.383536. Entropy: 0.899131.
Iteration 35111: Policy loss: 0.002889. Value loss: 2.488144. Entropy: 0.901497.
Iteration 35112: Policy loss: 0.001068. Value loss: 2.325776. Entropy: 0.897352.
episode: 13200   score: 195.0  epsilon: 1.0    steps: 78  evaluation reward: 360.95
Training network. lr: 0.000053. clip: 0.021025
Iteration 35113: Policy loss: 0.002094. Value loss: 3.241012. Entropy: 1.157842.
Iteration 35114: Policy loss: 0.004249. Value loss: 2.610858. Entropy: 1.159321.
Iteration 35115: Policy loss: 0.004215. Value loss: 2.192297. Entropy: 1.155805.
now time :  2019-02-23 11:42:17.904516
episode: 13201   score: 285.0  epsilon: 1.0    ste

Iteration 35171: Policy loss: 0.004801. Value loss: 3.178051. Entropy: 1.102950.
Iteration 35172: Policy loss: 0.004215. Value loss: 3.358826. Entropy: 1.104240.
Training network. lr: 0.000052. clip: 0.020913
Iteration 35173: Policy loss: 0.001637. Value loss: 3.591867. Entropy: 1.038090.
Iteration 35174: Policy loss: 0.001845. Value loss: 2.571111. Entropy: 1.036999.
Iteration 35175: Policy loss: 0.001385. Value loss: 2.251336. Entropy: 1.044379.
episode: 13224   score: 350.0  epsilon: 1.0    steps: 109  evaluation reward: 354.45
episode: 13225   score: 305.0  epsilon: 1.0    steps: 298  evaluation reward: 354.7
episode: 13226   score: 185.0  epsilon: 1.0    steps: 538  evaluation reward: 354.35
Training network. lr: 0.000052. clip: 0.020913
Iteration 35176: Policy loss: 0.002319. Value loss: 3.495101. Entropy: 1.069936.
Iteration 35177: Policy loss: 0.003702. Value loss: 2.434768. Entropy: 1.066651.
Iteration 35178: Policy loss: 0.004700. Value loss: 1.939003. Entropy: 1.078957.
Trai

Iteration 35235: Policy loss: 0.003542. Value loss: 3.395787. Entropy: 1.035423.
Training network. lr: 0.000052. clip: 0.020800
Iteration 35236: Policy loss: 0.001192. Value loss: 3.455667. Entropy: 1.144711.
Iteration 35237: Policy loss: 0.002000. Value loss: 2.434485. Entropy: 1.140026.
Iteration 35238: Policy loss: 0.002881. Value loss: 2.006968. Entropy: 1.154185.
Training network. lr: 0.000052. clip: 0.020800
Iteration 35239: Policy loss: 0.002692. Value loss: 5.610778. Entropy: 1.142612.
Iteration 35240: Policy loss: 0.004124. Value loss: 4.445004. Entropy: 1.144921.
Iteration 35241: Policy loss: 0.005094. Value loss: 3.442750. Entropy: 1.155884.
episode: 13248   score: 470.0  epsilon: 1.0    steps: 122  evaluation reward: 340.4
episode: 13249   score: 220.0  epsilon: 1.0    steps: 704  evaluation reward: 338.4
episode: 13250   score: 265.0  epsilon: 1.0    steps: 1006  evaluation reward: 337.4
Training network. lr: 0.000052. clip: 0.020800
Iteration 35242: Policy loss: 0.000954.

Iteration 35300: Policy loss: 0.000077. Value loss: 2.596197. Entropy: 1.194032.
Iteration 35301: Policy loss: 0.000086. Value loss: 2.035061. Entropy: 1.204039.
episode: 13270   score: 365.0  epsilon: 1.0    steps: 51  evaluation reward: 341.45
episode: 13271   score: 515.0  epsilon: 1.0    steps: 928  evaluation reward: 341.8
Training network. lr: 0.000051. clip: 0.020575
Iteration 35302: Policy loss: 0.002311. Value loss: 4.310902. Entropy: 0.933432.
Iteration 35303: Policy loss: 0.002193. Value loss: 3.266839. Entropy: 0.949397.
Iteration 35304: Policy loss: 0.004052. Value loss: 2.853713. Entropy: 0.975067.
episode: 13272   score: 315.0  epsilon: 1.0    steps: 150  evaluation reward: 346.25
episode: 13273   score: 345.0  epsilon: 1.0    steps: 703  evaluation reward: 346.15
Training network. lr: 0.000051. clip: 0.020575
Iteration 35305: Policy loss: 0.000941. Value loss: 3.374764. Entropy: 0.932746.
Iteration 35306: Policy loss: 0.005044. Value loss: 2.588058. Entropy: 0.919589.
I

Iteration 35363: Policy loss: 0.008918. Value loss: 3.474508. Entropy: 0.894346.
Iteration 35364: Policy loss: 0.003374. Value loss: 3.202740. Entropy: 0.913486.
episode: 13295   score: 80.0  epsilon: 1.0    steps: 686  evaluation reward: 346.5
Training network. lr: 0.000051. clip: 0.020463
Iteration 35365: Policy loss: 0.000468. Value loss: 3.913113. Entropy: 0.983285.
Iteration 35366: Policy loss: 0.000661. Value loss: 2.809026. Entropy: 1.003429.
Iteration 35367: Policy loss: 0.001369. Value loss: 2.453480. Entropy: 0.981792.
Training network. lr: 0.000051. clip: 0.020463
Iteration 35368: Policy loss: 0.006821. Value loss: 4.467202. Entropy: 1.106413.
Iteration 35369: Policy loss: 0.017739. Value loss: 2.766753. Entropy: 1.090270.
Iteration 35370: Policy loss: 0.011343. Value loss: 2.279022. Entropy: 1.098948.
episode: 13296   score: 400.0  epsilon: 1.0    steps: 65  evaluation reward: 345.75
episode: 13297   score: 210.0  epsilon: 1.0    steps: 808  evaluation reward: 344.6
Trainin

episode: 13318   score: 435.0  epsilon: 1.0    steps: 419  evaluation reward: 339.35
Training network. lr: 0.000051. clip: 0.020350
Iteration 35428: Policy loss: 0.001632. Value loss: 2.726901. Entropy: 1.192855.
Iteration 35429: Policy loss: 0.003165. Value loss: 1.989059. Entropy: 1.194444.
Iteration 35430: Policy loss: 0.000908. Value loss: 1.658483. Entropy: 1.185719.
episode: 13319   score: 670.0  epsilon: 1.0    steps: 805  evaluation reward: 340.35
episode: 13320   score: 295.0  epsilon: 1.0    steps: 939  evaluation reward: 343.45
Training network. lr: 0.000051. clip: 0.020350
Iteration 35431: Policy loss: 0.001453. Value loss: 1.515204. Entropy: 1.059619.
Iteration 35432: Policy loss: 0.000355. Value loss: 1.273764. Entropy: 1.049266.
Iteration 35433: Policy loss: -0.000778. Value loss: 1.082044. Entropy: 1.064716.
Training network. lr: 0.000051. clip: 0.020350
Iteration 35434: Policy loss: 0.001638. Value loss: 2.378143. Entropy: 0.939040.
Iteration 35435: Policy loss: 0.0005

Training network. lr: 0.000051. clip: 0.020238
Iteration 35491: Policy loss: 0.000565. Value loss: 2.758723. Entropy: 1.130740.
Iteration 35492: Policy loss: 0.003044. Value loss: 1.988833. Entropy: 1.110029.
Iteration 35493: Policy loss: 0.000660. Value loss: 1.520070. Entropy: 1.128927.
episode: 13343   score: 290.0  epsilon: 1.0    steps: 342  evaluation reward: 335.7
episode: 13344   score: 290.0  epsilon: 1.0    steps: 646  evaluation reward: 335.45
Training network. lr: 0.000051. clip: 0.020238
Iteration 35494: Policy loss: -0.000332. Value loss: 6.761997. Entropy: 0.900156.
Iteration 35495: Policy loss: 0.000372. Value loss: 4.987068. Entropy: 0.888456.
Iteration 35496: Policy loss: 0.000431. Value loss: 3.928347. Entropy: 0.905269.
Training network. lr: 0.000051. clip: 0.020238
Iteration 35497: Policy loss: 0.001406. Value loss: 4.698212. Entropy: 1.040196.
Iteration 35498: Policy loss: 0.001496. Value loss: 4.095612. Entropy: 1.028862.
Iteration 35499: Policy loss: 0.003368. V

Iteration 35554: Policy loss: 0.002044. Value loss: 3.169374. Entropy: 1.179861.
Iteration 35555: Policy loss: 0.004081. Value loss: 2.142585. Entropy: 1.195686.
Iteration 35556: Policy loss: 0.002548. Value loss: 1.551252. Entropy: 1.184935.
Training network. lr: 0.000050. clip: 0.020013
Iteration 35557: Policy loss: 0.001566. Value loss: 4.205737. Entropy: 1.260701.
Iteration 35558: Policy loss: 0.007418. Value loss: 3.204811. Entropy: 1.255303.
Iteration 35559: Policy loss: 0.005866. Value loss: 2.289595. Entropy: 1.249666.
episode: 13367   score: 390.0  epsilon: 1.0    steps: 693  evaluation reward: 340.9
Training network. lr: 0.000050. clip: 0.020013
Iteration 35560: Policy loss: 0.003067. Value loss: 5.100045. Entropy: 1.189131.
Iteration 35561: Policy loss: 0.002865. Value loss: 3.491231. Entropy: 1.183772.
Iteration 35562: Policy loss: 0.007079. Value loss: 3.200035. Entropy: 1.182128.
episode: 13368   score: 330.0  epsilon: 1.0    steps: 946  evaluation reward: 340.75
Training

Training network. lr: 0.000050. clip: 0.019900
Iteration 35617: Policy loss: 0.003274. Value loss: 3.404889. Entropy: 0.954130.
Iteration 35618: Policy loss: 0.004552. Value loss: 2.707944. Entropy: 0.957195.
Iteration 35619: Policy loss: 0.005938. Value loss: 2.249494. Entropy: 0.962808.
episode: 13393   score: 385.0  epsilon: 1.0    steps: 234  evaluation reward: 330.9
Training network. lr: 0.000050. clip: 0.019900
Iteration 35620: Policy loss: 0.000818. Value loss: 2.673610. Entropy: 0.871721.
Iteration 35621: Policy loss: 0.002241. Value loss: 1.898344. Entropy: 0.865145.
Iteration 35622: Policy loss: 0.000627. Value loss: 1.547398. Entropy: 0.873265.
Training network. lr: 0.000050. clip: 0.019900
Iteration 35623: Policy loss: 0.000935. Value loss: 1.958597. Entropy: 0.998732.
Iteration 35624: Policy loss: 0.002818. Value loss: 1.623871. Entropy: 1.002817.
Iteration 35625: Policy loss: 0.003355. Value loss: 1.372192. Entropy: 1.000884.
episode: 13394   score: 255.0  epsilon: 1.0   

episode: 13415   score: 135.0  epsilon: 1.0    steps: 162  evaluation reward: 334.75
episode: 13416   score: 225.0  epsilon: 1.0    steps: 502  evaluation reward: 331.35
episode: 13417   score: 555.0  epsilon: 1.0    steps: 963  evaluation reward: 331.8
Training network. lr: 0.000049. clip: 0.019787
Iteration 35683: Policy loss: 0.000614. Value loss: 4.126324. Entropy: 0.934883.
Iteration 35684: Policy loss: 0.001560. Value loss: 3.180742. Entropy: 0.916539.
Iteration 35685: Policy loss: 0.000624. Value loss: 3.180064. Entropy: 0.918032.
episode: 13418   score: 120.0  epsilon: 1.0    steps: 694  evaluation reward: 334.4
episode: 13419   score: 455.0  epsilon: 1.0    steps: 858  evaluation reward: 331.25
Training network. lr: 0.000049. clip: 0.019787
Iteration 35686: Policy loss: 0.002441. Value loss: 3.645321. Entropy: 0.881443.
Iteration 35687: Policy loss: 0.001923. Value loss: 2.853537. Entropy: 0.874310.
Iteration 35688: Policy loss: 0.001591. Value loss: 2.451669. Entropy: 0.89027

Iteration 35745: Policy loss: 0.001185. Value loss: 2.145740. Entropy: 0.941453.
episode: 13441   score: 240.0  epsilon: 1.0    steps: 378  evaluation reward: 321.75
Training network. lr: 0.000049. clip: 0.019675
Iteration 35746: Policy loss: 0.002431. Value loss: 5.374047. Entropy: 1.299817.
Iteration 35747: Policy loss: 0.004482. Value loss: 4.145014. Entropy: 1.316922.
Iteration 35748: Policy loss: 0.003914. Value loss: 4.071613. Entropy: 1.302399.
episode: 13442   score: 350.0  epsilon: 1.0    steps: 902  evaluation reward: 321.0
Training network. lr: 0.000049. clip: 0.019675
Iteration 35749: Policy loss: 0.001564. Value loss: 2.476012. Entropy: 1.132120.
Iteration 35750: Policy loss: 0.000867. Value loss: 2.256987. Entropy: 1.109555.
Iteration 35751: Policy loss: 0.001133. Value loss: 1.829079. Entropy: 1.111021.
episode: 13443   score: 215.0  epsilon: 1.0    steps: 250  evaluation reward: 320.95
episode: 13444   score: 235.0  epsilon: 1.0    steps: 817  evaluation reward: 320.2
T

episode: 13465   score: 290.0  epsilon: 1.0    steps: 706  evaluation reward: 319.85
Training network. lr: 0.000049. clip: 0.019450
Iteration 35809: Policy loss: 0.001963. Value loss: 2.831065. Entropy: 1.053450.
Iteration 35810: Policy loss: 0.003158. Value loss: 2.011029. Entropy: 1.054312.
Iteration 35811: Policy loss: 0.004726. Value loss: 1.612832. Entropy: 1.036322.
episode: 13466   score: 675.0  epsilon: 1.0    steps: 535  evaluation reward: 318.85
Training network. lr: 0.000049. clip: 0.019450
Iteration 35812: Policy loss: 0.001835. Value loss: 2.333977. Entropy: 0.955941.
Iteration 35813: Policy loss: 0.003345. Value loss: 1.926974. Entropy: 0.931785.
Iteration 35814: Policy loss: 0.003212. Value loss: 1.652904. Entropy: 0.961146.
episode: 13467   score: 450.0  epsilon: 1.0    steps: 785  evaluation reward: 323.4
Training network. lr: 0.000049. clip: 0.019450
Iteration 35815: Policy loss: 0.000965. Value loss: 3.463871. Entropy: 0.998787.
Iteration 35816: Policy loss: 0.001247

Iteration 35873: Policy loss: 0.003953. Value loss: 2.854946. Entropy: 1.047476.
Iteration 35874: Policy loss: 0.002068. Value loss: 2.174478. Entropy: 1.052958.
episode: 13489   score: 315.0  epsilon: 1.0    steps: 405  evaluation reward: 327.45
Training network. lr: 0.000048. clip: 0.019338
Iteration 35875: Policy loss: 0.000761. Value loss: 2.549898. Entropy: 1.154351.
Iteration 35876: Policy loss: -0.001117. Value loss: 1.912395. Entropy: 1.157986.
Iteration 35877: Policy loss: 0.002587. Value loss: 1.777475. Entropy: 1.143471.
episode: 13490   score: 435.0  epsilon: 1.0    steps: 105  evaluation reward: 327.3
episode: 13491   score: 300.0  epsilon: 1.0    steps: 522  evaluation reward: 327.1
Training network. lr: 0.000048. clip: 0.019338
Iteration 35878: Policy loss: 0.000679. Value loss: 3.078661. Entropy: 1.017649.
Iteration 35879: Policy loss: -0.000638. Value loss: 2.400785. Entropy: 1.013788.
Iteration 35880: Policy loss: 0.000573. Value loss: 2.022758. Entropy: 1.006312.
epi

Iteration 35936: Policy loss: 0.001264. Value loss: 2.890972. Entropy: 0.934186.
Iteration 35937: Policy loss: 0.000412. Value loss: 2.561659. Entropy: 0.940248.
episode: 13514   score: 365.0  epsilon: 1.0    steps: 685  evaluation reward: 323.75
Training network. lr: 0.000048. clip: 0.019225
Iteration 35938: Policy loss: 0.003106. Value loss: 2.265590. Entropy: 1.228202.
Iteration 35939: Policy loss: 0.001996. Value loss: 1.582953. Entropy: 1.233754.
Iteration 35940: Policy loss: 0.000443. Value loss: 1.320250. Entropy: 1.231548.
Training network. lr: 0.000048. clip: 0.019225
Iteration 35941: Policy loss: 0.001967. Value loss: 1.840682. Entropy: 1.186675.
Iteration 35942: Policy loss: 0.001236. Value loss: 1.271368. Entropy: 1.179148.
Iteration 35943: Policy loss: 0.000565. Value loss: 1.093955. Entropy: 1.196270.
episode: 13515   score: 225.0  epsilon: 1.0    steps: 124  evaluation reward: 325.0
Training network. lr: 0.000048. clip: 0.019225
Iteration 35944: Policy loss: 0.003416. Va

episode: 13538   score: 600.0  epsilon: 1.0    steps: 228  evaluation reward: 339.25
Training network. lr: 0.000048. clip: 0.019000
Iteration 36001: Policy loss: 0.001051. Value loss: 3.193871. Entropy: 0.860022.
Iteration 36002: Policy loss: 0.001559. Value loss: 2.428634. Entropy: 0.849597.
Iteration 36003: Policy loss: 0.001578. Value loss: 2.082795. Entropy: 0.846662.
episode: 13539   score: 150.0  epsilon: 1.0    steps: 639  evaluation reward: 343.3
Training network. lr: 0.000048. clip: 0.019000
Iteration 36004: Policy loss: 0.002138. Value loss: 6.503424. Entropy: 1.156591.
Iteration 36005: Policy loss: 0.005890. Value loss: 5.209289. Entropy: 1.150841.
Iteration 36006: Policy loss: 0.008908. Value loss: 3.898873. Entropy: 1.154385.
Training network. lr: 0.000048. clip: 0.019000
Iteration 36007: Policy loss: 0.000169. Value loss: 2.262050. Entropy: 1.079252.
Iteration 36008: Policy loss: 0.002817. Value loss: 1.797423. Entropy: 1.090150.
Iteration 36009: Policy loss: 0.001769. Va

episode: 13563   score: 600.0  epsilon: 1.0    steps: 785  evaluation reward: 334.5
Training network. lr: 0.000047. clip: 0.018888
Iteration 36064: Policy loss: 0.001274. Value loss: 2.644094. Entropy: 0.966817.
Iteration 36065: Policy loss: 0.001986. Value loss: 1.965422. Entropy: 0.963681.
Iteration 36066: Policy loss: 0.001293. Value loss: 1.681558. Entropy: 0.954759.
Training network. lr: 0.000047. clip: 0.018888
Iteration 36067: Policy loss: 0.003700. Value loss: 4.747744. Entropy: 1.064912.
Iteration 36068: Policy loss: 0.005911. Value loss: 3.511554. Entropy: 1.059236.
Iteration 36069: Policy loss: 0.004404. Value loss: 3.080534. Entropy: 1.087395.
Training network. lr: 0.000047. clip: 0.018888
Iteration 36070: Policy loss: 0.002936. Value loss: 5.420211. Entropy: 1.176672.
Iteration 36071: Policy loss: 0.003358. Value loss: 3.454268. Entropy: 1.194302.
Iteration 36072: Policy loss: 0.004946. Value loss: 3.041545. Entropy: 1.200870.
Training network. lr: 0.000047. clip: 0.018888

Iteration 36132: Policy loss: 0.007529. Value loss: 5.162828. Entropy: 1.318612.
episode: 13582   score: 775.0  epsilon: 1.0    steps: 461  evaluation reward: 335.15
Training network. lr: 0.000047. clip: 0.018775
Iteration 36133: Policy loss: 0.002460. Value loss: 5.538535. Entropy: 1.312522.
Iteration 36134: Policy loss: 0.002837. Value loss: 4.417517. Entropy: 1.306870.
Iteration 36135: Policy loss: 0.002664. Value loss: 3.952647. Entropy: 1.307087.
episode: 13583   score: 605.0  epsilon: 1.0    steps: 84  evaluation reward: 338.95
episode: 13584   score: 270.0  epsilon: 1.0    steps: 311  evaluation reward: 342.85
episode: 13585   score: 345.0  epsilon: 1.0    steps: 629  evaluation reward: 343.45
episode: 13586   score: 315.0  epsilon: 1.0    steps: 1009  evaluation reward: 343.35
Training network. lr: 0.000047. clip: 0.018775
Iteration 36136: Policy loss: 0.002007. Value loss: 4.330506. Entropy: 1.140789.
Iteration 36137: Policy loss: 0.003796. Value loss: 3.598758. Entropy: 1.143

Iteration 36195: Policy loss: 0.004964. Value loss: 4.445856. Entropy: 1.183227.
episode: 13607   score: 255.0  epsilon: 1.0    steps: 437  evaluation reward: 341.8
episode: 13608   score: 370.0  epsilon: 1.0    steps: 632  evaluation reward: 339.15
episode: 13609   score: 330.0  epsilon: 1.0    steps: 800  evaluation reward: 341.05
Training network. lr: 0.000047. clip: 0.018663
Iteration 36196: Policy loss: 0.001623. Value loss: 12.310850. Entropy: 0.986529.
Iteration 36197: Policy loss: 0.005643. Value loss: 8.927509. Entropy: 0.996783.
Iteration 36198: Policy loss: 0.006190. Value loss: 8.512404. Entropy: 0.979913.
Training network. lr: 0.000047. clip: 0.018663
Iteration 36199: Policy loss: 0.001860. Value loss: 5.889292. Entropy: 0.872828.
Iteration 36200: Policy loss: 0.000967. Value loss: 4.161056. Entropy: 0.871296.
Iteration 36201: Policy loss: 0.002971. Value loss: 3.520813. Entropy: 0.903903.
episode: 13610   score: 470.0  epsilon: 1.0    steps: 125  evaluation reward: 340.85

episode: 13631   score: 655.0  epsilon: 1.0    steps: 494  evaluation reward: 336.85
Training network. lr: 0.000046. clip: 0.018438
Iteration 36259: Policy loss: 0.001387. Value loss: 5.569375. Entropy: 1.121890.
Iteration 36260: Policy loss: 0.002929. Value loss: 4.876835. Entropy: 1.129515.
Iteration 36261: Policy loss: 0.002682. Value loss: 3.661356. Entropy: 1.130823.
episode: 13632   score: 240.0  epsilon: 1.0    steps: 307  evaluation reward: 341.6
episode: 13633   score: 215.0  epsilon: 1.0    steps: 963  evaluation reward: 339.9
Training network. lr: 0.000046. clip: 0.018438
Iteration 36262: Policy loss: 0.002717. Value loss: 2.369247. Entropy: 1.090137.
Iteration 36263: Policy loss: 0.003076. Value loss: 1.730373. Entropy: 1.122951.
Iteration 36264: Policy loss: 0.005324. Value loss: 1.613022. Entropy: 1.117805.
Training network. lr: 0.000046. clip: 0.018438
Iteration 36265: Policy loss: 0.003044. Value loss: 4.891087. Entropy: 1.127631.
Iteration 36266: Policy loss: 0.005820.

episode: 13656   score: 235.0  epsilon: 1.0    steps: 344  evaluation reward: 328.35
Training network. lr: 0.000046. clip: 0.018325
Iteration 36322: Policy loss: 0.003266. Value loss: 6.050423. Entropy: 1.132793.
Iteration 36323: Policy loss: 0.005111. Value loss: 4.203022. Entropy: 1.134520.
Iteration 36324: Policy loss: 0.003591. Value loss: 4.107675. Entropy: 1.137196.
episode: 13657   score: 260.0  epsilon: 1.0    steps: 122  evaluation reward: 327.55
episode: 13658   score: 155.0  epsilon: 1.0    steps: 970  evaluation reward: 326.8
Training network. lr: 0.000046. clip: 0.018325
Iteration 36325: Policy loss: 0.002739. Value loss: 3.999660. Entropy: 1.187526.
Iteration 36326: Policy loss: 0.004155. Value loss: 2.500275. Entropy: 1.188155.
Iteration 36327: Policy loss: 0.005407. Value loss: 2.076298. Entropy: 1.218661.
Training network. lr: 0.000046. clip: 0.018325
Iteration 36328: Policy loss: 0.004861. Value loss: 5.246266. Entropy: 1.042400.
Iteration 36329: Policy loss: 0.005744

Iteration 36384: Policy loss: 0.004397. Value loss: 1.800636. Entropy: 1.009375.
Training network. lr: 0.000046. clip: 0.018212
Iteration 36385: Policy loss: 0.003024. Value loss: 4.587146. Entropy: 1.026019.
Iteration 36386: Policy loss: 0.001484. Value loss: 4.239549. Entropy: 1.056230.
Iteration 36387: Policy loss: 0.004258. Value loss: 3.018897. Entropy: 1.034987.
episode: 13682   score: 275.0  epsilon: 1.0    steps: 679  evaluation reward: 321.5
Training network. lr: 0.000046. clip: 0.018212
Iteration 36388: Policy loss: 0.003228. Value loss: 3.931555. Entropy: 1.104860.
Iteration 36389: Policy loss: 0.002215. Value loss: 3.051312. Entropy: 1.086040.
Iteration 36390: Policy loss: 0.001957. Value loss: 2.488121. Entropy: 1.095390.
episode: 13683   score: 350.0  epsilon: 1.0    steps: 44  evaluation reward: 316.5
Training network. lr: 0.000046. clip: 0.018212
Iteration 36391: Policy loss: 0.002259. Value loss: 4.078888. Entropy: 1.191070.
Iteration 36392: Policy loss: 0.005327. Valu

Iteration 36450: Policy loss: 0.003776. Value loss: 4.229165. Entropy: 1.219486.
episode: 13703   score: 375.0  epsilon: 1.0    steps: 74  evaluation reward: 320.25
Training network. lr: 0.000045. clip: 0.017988
Iteration 36451: Policy loss: 0.002608. Value loss: 7.266584. Entropy: 1.025944.
Iteration 36452: Policy loss: 0.002895. Value loss: 5.374768. Entropy: 1.018757.
Iteration 36453: Policy loss: 0.003783. Value loss: 4.729918. Entropy: 1.023372.
episode: 13704   score: 310.0  epsilon: 1.0    steps: 492  evaluation reward: 322.45
Training network. lr: 0.000045. clip: 0.017988
Iteration 36454: Policy loss: 0.001473. Value loss: 11.606517. Entropy: 1.157987.
Iteration 36455: Policy loss: 0.004848. Value loss: 9.209951. Entropy: 1.170846.
Iteration 36456: Policy loss: 0.004449. Value loss: 7.158261. Entropy: 1.169540.
episode: 13705   score: 665.0  epsilon: 1.0    steps: 671  evaluation reward: 322.4
episode: 13706   score: 590.0  epsilon: 1.0    steps: 795  evaluation reward: 327.05


episode: 13727   score: 240.0  epsilon: 1.0    steps: 248  evaluation reward: 322.85
episode: 13728   score: 300.0  epsilon: 1.0    steps: 297  evaluation reward: 322.3
episode: 13729   score: 345.0  epsilon: 1.0    steps: 938  evaluation reward: 323.5
Training network. lr: 0.000045. clip: 0.017875
Iteration 36514: Policy loss: 0.004092. Value loss: 3.464495. Entropy: 1.164799.
Iteration 36515: Policy loss: 0.002418. Value loss: 2.370884. Entropy: 1.143653.
Iteration 36516: Policy loss: 0.001462. Value loss: 2.023689. Entropy: 1.158056.
Training network. lr: 0.000045. clip: 0.017875
Iteration 36517: Policy loss: 0.002119. Value loss: 4.858627. Entropy: 1.202847.
Iteration 36518: Policy loss: 0.007940. Value loss: 3.726699. Entropy: 1.208399.
Iteration 36519: Policy loss: 0.008061. Value loss: 2.732494. Entropy: 1.207321.
episode: 13730   score: 440.0  epsilon: 1.0    steps: 424  evaluation reward: 323.45
episode: 13731   score: 215.0  epsilon: 1.0    steps: 845  evaluation reward: 324.

Iteration 36575: Policy loss: 0.002019. Value loss: 3.862443. Entropy: 1.073341.
Iteration 36576: Policy loss: 0.003596. Value loss: 3.152862. Entropy: 1.045861.
Training network. lr: 0.000044. clip: 0.017763
Iteration 36577: Policy loss: 0.002629. Value loss: 7.317425. Entropy: 1.037434.
Iteration 36578: Policy loss: 0.003345. Value loss: 5.549327. Entropy: 1.018914.
Iteration 36579: Policy loss: 0.005991. Value loss: 4.697643. Entropy: 1.041256.
episode: 13754   score: 310.0  epsilon: 1.0    steps: 814  evaluation reward: 326.65
Training network. lr: 0.000044. clip: 0.017763
Iteration 36580: Policy loss: 0.002104. Value loss: 5.505028. Entropy: 1.271157.
Iteration 36581: Policy loss: 0.005929. Value loss: 3.762910. Entropy: 1.254124.
Iteration 36582: Policy loss: 0.006350. Value loss: 3.097659. Entropy: 1.272104.
episode: 13755   score: 305.0  epsilon: 1.0    steps: 493  evaluation reward: 327.4
episode: 13756   score: 225.0  epsilon: 1.0    steps: 580  evaluation reward: 328.3
Train

Iteration 36637: Policy loss: 0.000814. Value loss: 4.858340. Entropy: 1.048524.
Iteration 36638: Policy loss: 0.002017. Value loss: 4.933986. Entropy: 1.067724.
Iteration 36639: Policy loss: 0.002648. Value loss: 4.766446. Entropy: 1.055203.
episode: 13780   score: 520.0  epsilon: 1.0    steps: 484  evaluation reward: 323.25
Training network. lr: 0.000044. clip: 0.017650
Iteration 36640: Policy loss: 0.001278. Value loss: 2.543884. Entropy: 1.131062.
Iteration 36641: Policy loss: 0.002084. Value loss: 1.855553. Entropy: 1.123102.
Iteration 36642: Policy loss: 0.002816. Value loss: 1.553359. Entropy: 1.147282.
episode: 13781   score: 230.0  epsilon: 1.0    steps: 711  evaluation reward: 327.4
episode: 13782   score: 210.0  epsilon: 1.0    steps: 967  evaluation reward: 326.65
Training network. lr: 0.000044. clip: 0.017650
Iteration 36643: Policy loss: 0.000048. Value loss: 2.574914. Entropy: 1.005833.
Iteration 36644: Policy loss: 0.002304. Value loss: 2.056587. Entropy: 1.007322.
Iter

episode: 13802   score: 675.0  epsilon: 1.0    steps: 601  evaluation reward: 328.85
episode: 13803   score: 245.0  epsilon: 1.0    steps: 996  evaluation reward: 333.0
Training network. lr: 0.000044. clip: 0.017425
Iteration 36703: Policy loss: 0.000999. Value loss: 4.590215. Entropy: 1.159720.
Iteration 36704: Policy loss: 0.002551. Value loss: 3.715362. Entropy: 1.172586.
Iteration 36705: Policy loss: 0.005317. Value loss: 3.184719. Entropy: 1.159279.
Training network. lr: 0.000044. clip: 0.017425
Iteration 36706: Policy loss: 0.003482. Value loss: 5.831584. Entropy: 1.031037.
Iteration 36707: Policy loss: 0.006106. Value loss: 3.520990. Entropy: 1.019647.
Iteration 36708: Policy loss: 0.003652. Value loss: 3.046121. Entropy: 1.023550.
episode: 13804   score: 295.0  epsilon: 1.0    steps: 434  evaluation reward: 331.7
Training network. lr: 0.000044. clip: 0.017425
Iteration 36709: Policy loss: 0.001877. Value loss: 6.648513. Entropy: 1.055087.
Iteration 36710: Policy loss: 0.003414.

Iteration 36764: Policy loss: 0.005211. Value loss: 2.524151. Entropy: 1.021875.
Iteration 36765: Policy loss: 0.003857. Value loss: 2.178330. Entropy: 1.012039.
episode: 13829   score: 435.0  epsilon: 1.0    steps: 157  evaluation reward: 317.7
Training network. lr: 0.000043. clip: 0.017313
Iteration 36766: Policy loss: 0.001448. Value loss: 3.201016. Entropy: 1.171685.
Iteration 36767: Policy loss: 0.003935. Value loss: 2.228387. Entropy: 1.185266.
Iteration 36768: Policy loss: 0.003220. Value loss: 1.863081. Entropy: 1.166672.
episode: 13830   score: 260.0  epsilon: 1.0    steps: 524  evaluation reward: 318.6
Training network. lr: 0.000043. clip: 0.017313
Iteration 36769: Policy loss: 0.000646. Value loss: 2.242042. Entropy: 1.023593.
Iteration 36770: Policy loss: 0.002166. Value loss: 1.661898. Entropy: 1.037667.
Iteration 36771: Policy loss: 0.001123. Value loss: 1.359563. Entropy: 1.012051.
Training network. lr: 0.000043. clip: 0.017313
Iteration 36772: Policy loss: -0.000073. Va

Iteration 36828: Policy loss: 0.010673. Value loss: 6.257650. Entropy: 0.903202.
episode: 13853   score: 725.0  epsilon: 1.0    steps: 71  evaluation reward: 321.75
Training network. lr: 0.000043. clip: 0.017200
Iteration 36829: Policy loss: 0.001225. Value loss: 3.530197. Entropy: 0.992457.
Iteration 36830: Policy loss: 0.000306. Value loss: 2.621091. Entropy: 0.975016.
Iteration 36831: Policy loss: 0.001530. Value loss: 2.322540. Entropy: 0.991607.
Training network. lr: 0.000043. clip: 0.017200
Iteration 36832: Policy loss: 0.001823. Value loss: 4.043053. Entropy: 1.043964.
Iteration 36833: Policy loss: 0.001680. Value loss: 3.118841. Entropy: 1.026435.
Iteration 36834: Policy loss: 0.003397. Value loss: 2.366049. Entropy: 1.045916.
episode: 13854   score: 210.0  epsilon: 1.0    steps: 375  evaluation reward: 326.4
Training network. lr: 0.000043. clip: 0.017200
Iteration 36835: Policy loss: 0.002967. Value loss: 5.503080. Entropy: 1.280972.
Iteration 36836: Policy loss: 0.003596. Val

Iteration 36893: Policy loss: 0.003865. Value loss: 3.934113. Entropy: 1.213742.
Iteration 36894: Policy loss: 0.005265. Value loss: 3.164849. Entropy: 1.200583.
episode: 13876   score: 210.0  epsilon: 1.0    steps: 93  evaluation reward: 333.3
Training network. lr: 0.000043. clip: 0.017088
Iteration 36895: Policy loss: 0.002455. Value loss: 3.792959. Entropy: 1.274861.
Iteration 36896: Policy loss: 0.003287. Value loss: 2.618674. Entropy: 1.275329.
Iteration 36897: Policy loss: 0.003577. Value loss: 1.976297. Entropy: 1.275143.
episode: 13877   score: 375.0  epsilon: 1.0    steps: 617  evaluation reward: 331.65
Training network. lr: 0.000043. clip: 0.017088
Iteration 36898: Policy loss: 0.000761. Value loss: 3.168100. Entropy: 1.214734.
Iteration 36899: Policy loss: 0.001625. Value loss: 2.313856. Entropy: 1.229703.
Iteration 36900: Policy loss: 0.001095. Value loss: 2.048121. Entropy: 1.232920.
episode: 13878   score: 180.0  epsilon: 1.0    steps: 164  evaluation reward: 331.95
episo

Iteration 36956: Policy loss: 0.001785. Value loss: 2.673490. Entropy: 0.823765.
Iteration 36957: Policy loss: 0.002264. Value loss: 2.202577. Entropy: 0.836406.
Training network. lr: 0.000042. clip: 0.016863
Iteration 36958: Policy loss: 0.000953. Value loss: 2.559748. Entropy: 0.857000.
Iteration 36959: Policy loss: 0.001567. Value loss: 2.178431. Entropy: 0.871064.
Iteration 36960: Policy loss: 0.001142. Value loss: 1.939477. Entropy: 0.851810.
Training network. lr: 0.000042. clip: 0.016863
Iteration 36961: Policy loss: 0.001612. Value loss: 9.406862. Entropy: 0.891473.
Iteration 36962: Policy loss: 0.003622. Value loss: 7.235344. Entropy: 0.909874.
Iteration 36963: Policy loss: 0.003856. Value loss: 7.125414. Entropy: 0.871035.
Training network. lr: 0.000042. clip: 0.016863
Iteration 36964: Policy loss: 0.001322. Value loss: 4.032770. Entropy: 1.207742.
Iteration 36965: Policy loss: 0.002454. Value loss: 3.964314. Entropy: 1.206137.
Iteration 36966: Policy loss: 0.002953. Value los

Training network. lr: 0.000042. clip: 0.016750
Iteration 37024: Policy loss: 0.000989. Value loss: 6.558714. Entropy: 0.926168.
Iteration 37025: Policy loss: 0.003700. Value loss: 5.236289. Entropy: 0.928274.
Iteration 37026: Policy loss: 0.005606. Value loss: 3.899211. Entropy: 0.921122.
episode: 13921   score: 150.0  epsilon: 1.0    steps: 723  evaluation reward: 369.8
episode: 13922   score: 155.0  epsilon: 1.0    steps: 783  evaluation reward: 369.2
episode: 13923   score: 210.0  epsilon: 1.0    steps: 937  evaluation reward: 368.65
Training network. lr: 0.000042. clip: 0.016750
Iteration 37027: Policy loss: 0.000961. Value loss: 5.212463. Entropy: 0.996927.
Iteration 37028: Policy loss: 0.004521. Value loss: 3.785289. Entropy: 1.003264.
Iteration 37029: Policy loss: 0.005090. Value loss: 3.281815. Entropy: 0.998435.
episode: 13924   score: 625.0  epsilon: 1.0    steps: 148  evaluation reward: 366.7
Training network. lr: 0.000042. clip: 0.016750
Iteration 37030: Policy loss: 0.0009

episode: 13946   score: 210.0  epsilon: 1.0    steps: 596  evaluation reward: 378.7
Training network. lr: 0.000042. clip: 0.016637
Iteration 37087: Policy loss: 0.001980. Value loss: 5.176777. Entropy: 1.115568.
Iteration 37088: Policy loss: 0.002674. Value loss: 3.535038. Entropy: 1.083150.
Iteration 37089: Policy loss: 0.005696. Value loss: 3.460173. Entropy: 1.112262.
episode: 13947   score: 805.0  epsilon: 1.0    steps: 296  evaluation reward: 378.75
Training network. lr: 0.000042. clip: 0.016637
Iteration 37090: Policy loss: 0.002111. Value loss: 2.041806. Entropy: 0.964215.
Iteration 37091: Policy loss: 0.001463. Value loss: 1.658230. Entropy: 1.009043.
Iteration 37092: Policy loss: 0.001298. Value loss: 1.488460. Entropy: 0.980405.
episode: 13948   score: 135.0  epsilon: 1.0    steps: 400  evaluation reward: 383.8
episode: 13949   score: 420.0  epsilon: 1.0    steps: 816  evaluation reward: 380.0
Training network. lr: 0.000042. clip: 0.016637
Iteration 37093: Policy loss: 0.0006

episode: 13971   score: 230.0  epsilon: 1.0    steps: 103  evaluation reward: 362.25
episode: 13972   score: 210.0  epsilon: 1.0    steps: 591  evaluation reward: 359.85
episode: 13973   score: 215.0  epsilon: 1.0    steps: 705  evaluation reward: 359.85
Training network. lr: 0.000041. clip: 0.016525
Iteration 37150: Policy loss: 0.001009. Value loss: 5.897489. Entropy: 1.094141.
Iteration 37151: Policy loss: 0.000848. Value loss: 4.975441. Entropy: 1.081043.
Iteration 37152: Policy loss: -0.000091. Value loss: 3.967864. Entropy: 1.091194.
episode: 13974   score: 575.0  epsilon: 1.0    steps: 211  evaluation reward: 360.2
Training network. lr: 0.000041. clip: 0.016413
Iteration 37153: Policy loss: -0.000193. Value loss: 2.856324. Entropy: 0.937858.
Iteration 37154: Policy loss: -0.001074. Value loss: 2.219219. Entropy: 0.934230.
Iteration 37155: Policy loss: -0.000502. Value loss: 1.922822. Entropy: 0.935649.
Training network. lr: 0.000041. clip: 0.016413
Iteration 37156: Policy loss: 

Training network. lr: 0.000041. clip: 0.016300
Iteration 37216: Policy loss: 0.000192. Value loss: 4.547732. Entropy: 1.065700.
Iteration 37217: Policy loss: -0.001664. Value loss: 3.850602. Entropy: 1.071394.
Iteration 37218: Policy loss: -0.000467. Value loss: 3.440153. Entropy: 1.066448.
episode: 13993   score: 230.0  epsilon: 1.0    steps: 393  evaluation reward: 370.2
episode: 13994   score: 775.0  epsilon: 1.0    steps: 729  evaluation reward: 368.35
episode: 13995   score: 430.0  epsilon: 1.0    steps: 1005  evaluation reward: 372.05
Training network. lr: 0.000041. clip: 0.016300
Iteration 37219: Policy loss: 0.000131. Value loss: 2.653503. Entropy: 0.962595.
Iteration 37220: Policy loss: 0.002096. Value loss: 2.018211. Entropy: 0.945319.
Iteration 37221: Policy loss: 0.000240. Value loss: 1.766703. Entropy: 0.982304.
episode: 13996   score: 210.0  epsilon: 1.0    steps: 76  evaluation reward: 371.7
Training network. lr: 0.000041. clip: 0.016300
Iteration 37222: Policy loss: 0.0

Iteration 37279: Policy loss: 0.001960. Value loss: 6.171412. Entropy: 0.840973.
Iteration 37280: Policy loss: 0.004604. Value loss: 4.584203. Entropy: 0.849533.
Iteration 37281: Policy loss: 0.002324. Value loss: 3.866027. Entropy: 0.847508.
episode: 14017   score: 335.0  epsilon: 1.0    steps: 764  evaluation reward: 348.0
Training network. lr: 0.000040. clip: 0.016188
Iteration 37282: Policy loss: 0.000913. Value loss: 6.552809. Entropy: 1.078619.
Iteration 37283: Policy loss: 0.004225. Value loss: 5.521440. Entropy: 1.070489.
Iteration 37284: Policy loss: 0.003406. Value loss: 4.796396. Entropy: 1.075303.
episode: 14018   score: 320.0  epsilon: 1.0    steps: 1014  evaluation reward: 345.3
Training network. lr: 0.000040. clip: 0.016188
Iteration 37285: Policy loss: 0.000382. Value loss: 3.365114. Entropy: 1.167139.
Iteration 37286: Policy loss: 0.002499. Value loss: 2.373540. Entropy: 1.156613.
Iteration 37287: Policy loss: 0.000993. Value loss: 1.881932. Entropy: 1.158189.
episode:

Iteration 37344: Policy loss: 0.002125. Value loss: 2.271497. Entropy: 1.171110.
episode: 14040   score: 215.0  epsilon: 1.0    steps: 616  evaluation reward: 356.3
episode: 14041   score: 265.0  epsilon: 1.0    steps: 997  evaluation reward: 354.95
Training network. lr: 0.000040. clip: 0.016075
Iteration 37345: Policy loss: 0.002056. Value loss: 3.246882. Entropy: 1.077855.
Iteration 37346: Policy loss: 0.003326. Value loss: 2.897764. Entropy: 1.099783.
Iteration 37347: Policy loss: 0.003564. Value loss: 2.459715. Entropy: 1.062656.
episode: 14042   score: 285.0  epsilon: 1.0    steps: 3  evaluation reward: 352.5
Training network. lr: 0.000040. clip: 0.016075
Iteration 37348: Policy loss: 0.001518. Value loss: 3.112678. Entropy: 1.103230.
Iteration 37349: Policy loss: 0.002924. Value loss: 2.454810. Entropy: 1.094821.
Iteration 37350: Policy loss: 0.004138. Value loss: 2.266120. Entropy: 1.094616.
episode: 14043   score: 300.0  epsilon: 1.0    steps: 489  evaluation reward: 349.35
Tra

Iteration 37406: Policy loss: 0.002078. Value loss: 8.222265. Entropy: 1.035424.
Iteration 37407: Policy loss: 0.002081. Value loss: 6.545106. Entropy: 1.038878.
episode: 14066   score: 335.0  epsilon: 1.0    steps: 25  evaluation reward: 341.8
episode: 14067   score: 515.0  epsilon: 1.0    steps: 366  evaluation reward: 342.6
Training network. lr: 0.000040. clip: 0.015850
Iteration 37408: Policy loss: -0.000795. Value loss: 6.642590. Entropy: 0.943470.
Iteration 37409: Policy loss: -0.000098. Value loss: 5.121636. Entropy: 0.944607.
Iteration 37410: Policy loss: 0.001552. Value loss: 4.730482. Entropy: 0.950495.
Training network. lr: 0.000040. clip: 0.015850
Iteration 37411: Policy loss: 0.002241. Value loss: 3.572057. Entropy: 0.937555.
Iteration 37412: Policy loss: 0.003277. Value loss: 2.789890. Entropy: 0.942823.
Iteration 37413: Policy loss: 0.003064. Value loss: 2.363337. Entropy: 0.942648.
episode: 14068   score: 375.0  epsilon: 1.0    steps: 214  evaluation reward: 344.75
Trai

episode: 14093   score: 320.0  epsilon: 1.0    steps: 716  evaluation reward: 324.15
Training network. lr: 0.000039. clip: 0.015738
Iteration 37468: Policy loss: 0.000388. Value loss: 6.173978. Entropy: 1.044477.
Iteration 37469: Policy loss: 0.002488. Value loss: 5.161361. Entropy: 1.033073.
Iteration 37470: Policy loss: 0.006987. Value loss: 4.375197. Entropy: 1.022280.
episode: 14094   score: 335.0  epsilon: 1.0    steps: 91  evaluation reward: 325.05
episode: 14095   score: 450.0  epsilon: 1.0    steps: 164  evaluation reward: 320.65
episode: 14096   score: 260.0  epsilon: 1.0    steps: 353  evaluation reward: 320.85
Training network. lr: 0.000039. clip: 0.015738
Iteration 37471: Policy loss: 0.001686. Value loss: 3.194193. Entropy: 1.002940.
Iteration 37472: Policy loss: 0.004298. Value loss: 2.221467. Entropy: 0.990433.
Iteration 37473: Policy loss: 0.005074. Value loss: 1.679018. Entropy: 1.010006.
episode: 14097   score: 210.0  epsilon: 1.0    steps: 1002  evaluation reward: 32

Training network. lr: 0.000039. clip: 0.015625
Iteration 37531: Policy loss: 0.002637. Value loss: 6.395716. Entropy: 1.203896.
Iteration 37532: Policy loss: 0.003203. Value loss: 5.087736. Entropy: 1.214527.
Iteration 37533: Policy loss: 0.002728. Value loss: 4.217197. Entropy: 1.204444.
episode: 14118   score: 330.0  epsilon: 1.0    steps: 283  evaluation reward: 313.5
episode: 14119   score: 175.0  epsilon: 1.0    steps: 957  evaluation reward: 313.6
Training network. lr: 0.000039. clip: 0.015625
Iteration 37534: Policy loss: 0.000941. Value loss: 3.681753. Entropy: 1.107904.
Iteration 37535: Policy loss: 0.001937. Value loss: 2.935040. Entropy: 1.112942.
Iteration 37536: Policy loss: 0.002548. Value loss: 2.630565. Entropy: 1.114586.
episode: 14120   score: 500.0  epsilon: 1.0    steps: 697  evaluation reward: 311.1
Training network. lr: 0.000039. clip: 0.015625
Iteration 37537: Policy loss: 0.000914. Value loss: 2.972416. Entropy: 1.186513.
Iteration 37538: Policy loss: 0.003610. 

Iteration 37595: Policy loss: 0.002077. Value loss: 4.469029. Entropy: 0.921131.
Iteration 37596: Policy loss: 0.001655. Value loss: 4.018589. Entropy: 0.930249.
Training network. lr: 0.000039. clip: 0.015513
Iteration 37597: Policy loss: 0.001242. Value loss: 4.186212. Entropy: 1.092175.
Iteration 37598: Policy loss: 0.002884. Value loss: 2.864455. Entropy: 1.096650.
Iteration 37599: Policy loss: 0.001574. Value loss: 2.366963. Entropy: 1.081525.
episode: 14142   score: 405.0  epsilon: 1.0    steps: 553  evaluation reward: 309.6
episode: 14143   score: 435.0  epsilon: 1.0    steps: 808  evaluation reward: 310.8
episode: 14144   score: 120.0  epsilon: 1.0    steps: 930  evaluation reward: 312.15
Training network. lr: 0.000039. clip: 0.015513
Iteration 37600: Policy loss: 0.001957. Value loss: 5.243457. Entropy: 1.007085.
Iteration 37601: Policy loss: 0.004106. Value loss: 4.186271. Entropy: 1.021108.
Iteration 37602: Policy loss: 0.005476. Value loss: 3.669014. Entropy: 1.037740.
episo

Training network. lr: 0.000038. clip: 0.015288
Iteration 37660: Policy loss: 0.001072. Value loss: 2.661829. Entropy: 1.092960.
Iteration 37661: Policy loss: 0.001354. Value loss: 2.282434. Entropy: 1.050363.
Iteration 37662: Policy loss: 0.000674. Value loss: 2.126758. Entropy: 1.048764.
episode: 14165   score: 380.0  epsilon: 1.0    steps: 338  evaluation reward: 319.2
episode: 14166   score: 230.0  epsilon: 1.0    steps: 525  evaluation reward: 319.65
Training network. lr: 0.000038. clip: 0.015288
Iteration 37663: Policy loss: 0.001741. Value loss: 3.524352. Entropy: 0.874227.
Iteration 37664: Policy loss: 0.002424. Value loss: 2.792620. Entropy: 0.867047.
Iteration 37665: Policy loss: 0.002102. Value loss: 2.474919. Entropy: 0.876105.
episode: 14167   score: 335.0  epsilon: 1.0    steps: 472  evaluation reward: 318.6
Training network. lr: 0.000038. clip: 0.015288
Iteration 37666: Policy loss: 0.001759. Value loss: 3.247358. Entropy: 1.077269.
Iteration 37667: Policy loss: 0.002903.

episode: 14187   score: 210.0  epsilon: 1.0    steps: 681  evaluation reward: 324.55
Training network. lr: 0.000038. clip: 0.015175
Iteration 37726: Policy loss: 0.000080. Value loss: 2.998219. Entropy: 1.262883.
Iteration 37727: Policy loss: 0.000509. Value loss: 2.449126. Entropy: 1.279495.
Iteration 37728: Policy loss: 0.000736. Value loss: 2.382913. Entropy: 1.279761.
episode: 14188   score: 210.0  epsilon: 1.0    steps: 237  evaluation reward: 324.85
episode: 14189   score: 180.0  epsilon: 1.0    steps: 490  evaluation reward: 324.8
Training network. lr: 0.000038. clip: 0.015175
Iteration 37729: Policy loss: 0.000712. Value loss: 4.071135. Entropy: 1.170723.
Iteration 37730: Policy loss: 0.001784. Value loss: 3.228714. Entropy: 1.173127.
Iteration 37731: Policy loss: 0.002302. Value loss: 2.759287. Entropy: 1.159789.
episode: 14190   score: 600.0  epsilon: 1.0    steps: 524  evaluation reward: 324.5
episode: 14191   score: 280.0  epsilon: 1.0    steps: 936  evaluation reward: 329.

Training network. lr: 0.000038. clip: 0.015063
Iteration 37789: Policy loss: 0.002132. Value loss: 8.284268. Entropy: 0.891703.
Iteration 37790: Policy loss: 0.005459. Value loss: 4.957994. Entropy: 0.904471.
Iteration 37791: Policy loss: 0.005640. Value loss: 5.288877. Entropy: 0.888363.
episode: 14212   score: 670.0  epsilon: 1.0    steps: 700  evaluation reward: 340.25
Training network. lr: 0.000038. clip: 0.015063
Iteration 37792: Policy loss: 0.001493. Value loss: 7.505167. Entropy: 0.981686.
Iteration 37793: Policy loss: 0.001365. Value loss: 6.373425. Entropy: 0.969802.
Iteration 37794: Policy loss: 0.003152. Value loss: 4.741880. Entropy: 0.973010.
episode: 14213   score: 595.0  epsilon: 1.0    steps: 215  evaluation reward: 344.1
Training network. lr: 0.000038. clip: 0.015063
Iteration 37795: Policy loss: 0.000678. Value loss: 4.318520. Entropy: 1.130200.
Iteration 37796: Policy loss: 0.002077. Value loss: 3.690996. Entropy: 1.122929.
Iteration 37797: Policy loss: 0.002935. Va

Iteration 37854: Policy loss: 0.003170. Value loss: 2.344774. Entropy: 1.133648.
Training network. lr: 0.000037. clip: 0.014837
Iteration 37855: Policy loss: 0.001722. Value loss: 4.910789. Entropy: 1.267565.
Iteration 37856: Policy loss: 0.004434. Value loss: 3.794739. Entropy: 1.260996.
Iteration 37857: Policy loss: 0.006013. Value loss: 3.230119. Entropy: 1.270795.
episode: 14235   score: 395.0  epsilon: 1.0    steps: 890  evaluation reward: 344.35
Training network. lr: 0.000037. clip: 0.014837
Iteration 37858: Policy loss: 0.001967. Value loss: 8.178335. Entropy: 1.301565.
Iteration 37859: Policy loss: 0.003621. Value loss: 6.715619. Entropy: 1.295957.
Iteration 37860: Policy loss: 0.003931. Value loss: 5.786038. Entropy: 1.299540.
episode: 14236   score: 390.0  epsilon: 1.0    steps: 295  evaluation reward: 344.15
episode: 14237   score: 460.0  epsilon: 1.0    steps: 922  evaluation reward: 344.9
Training network. lr: 0.000037. clip: 0.014837
Iteration 37861: Policy loss: 0.001724

Iteration 37918: Policy loss: 0.002947. Value loss: 3.487998. Entropy: 1.453910.
Iteration 37919: Policy loss: 0.005267. Value loss: 2.460137. Entropy: 1.459281.
Iteration 37920: Policy loss: 0.004963. Value loss: 1.944973. Entropy: 1.456145.
Training network. lr: 0.000037. clip: 0.014725
Iteration 37921: Policy loss: 0.002043. Value loss: 9.070148. Entropy: 1.189634.
Iteration 37922: Policy loss: 0.003666. Value loss: 6.968913. Entropy: 1.166085.
Iteration 37923: Policy loss: 0.003570. Value loss: 5.985229. Entropy: 1.178162.
episode: 14258   score: 185.0  epsilon: 1.0    steps: 424  evaluation reward: 340.45
episode: 14259   score: 405.0  epsilon: 1.0    steps: 683  evaluation reward: 338.95
Training network. lr: 0.000037. clip: 0.014725
Iteration 37924: Policy loss: 0.001290. Value loss: 9.249369. Entropy: 1.226948.
Iteration 37925: Policy loss: 0.003193. Value loss: 6.385458. Entropy: 1.233281.
Iteration 37926: Policy loss: 0.004299. Value loss: 5.479905. Entropy: 1.223870.
Trainin

Training network. lr: 0.000037. clip: 0.014613
Iteration 37981: Policy loss: 0.000671. Value loss: 4.764647. Entropy: 1.042971.
Iteration 37982: Policy loss: 0.001619. Value loss: 3.697396. Entropy: 1.040364.
Iteration 37983: Policy loss: 0.005605. Value loss: 3.006178. Entropy: 1.038867.
episode: 14284   score: 315.0  epsilon: 1.0    steps: 160  evaluation reward: 325.75
Training network. lr: 0.000037. clip: 0.014613
Iteration 37984: Policy loss: 0.001157. Value loss: 6.704843. Entropy: 1.043076.
Iteration 37985: Policy loss: 0.003729. Value loss: 6.726460. Entropy: 1.046267.
Iteration 37986: Policy loss: 0.004544. Value loss: 5.032404. Entropy: 1.049026.
episode: 14285   score: 210.0  epsilon: 1.0    steps: 600  evaluation reward: 326.8
Training network. lr: 0.000037. clip: 0.014613
Iteration 37987: Policy loss: 0.000587. Value loss: 4.421172. Entropy: 1.142055.
Iteration 37988: Policy loss: 0.001026. Value loss: 3.512788. Entropy: 1.119241.
Iteration 37989: Policy loss: 0.002745. Va

Iteration 38044: Policy loss: 0.002284. Value loss: 3.568919. Entropy: 1.312010.
Iteration 38045: Policy loss: 0.004964. Value loss: 2.687752. Entropy: 1.309501.
Iteration 38046: Policy loss: 0.004951. Value loss: 2.108728. Entropy: 1.306432.
episode: 14308   score: 305.0  epsilon: 1.0    steps: 33  evaluation reward: 318.45
Training network. lr: 0.000036. clip: 0.014500
Iteration 38047: Policy loss: 0.002119. Value loss: 6.352291. Entropy: 1.190102.
Iteration 38048: Policy loss: 0.004894. Value loss: 4.385016. Entropy: 1.165761.
Iteration 38049: Policy loss: 0.005750. Value loss: 4.211678. Entropy: 1.188192.
episode: 14309   score: 485.0  epsilon: 1.0    steps: 347  evaluation reward: 317.05
episode: 14310   score: 80.0  epsilon: 1.0    steps: 629  evaluation reward: 317.15
episode: 14311   score: 155.0  epsilon: 1.0    steps: 669  evaluation reward: 316.95
Training network. lr: 0.000036. clip: 0.014500
Iteration 38050: Policy loss: 0.001446. Value loss: 6.743295. Entropy: 1.124339.
I

Iteration 38107: Policy loss: 0.002329. Value loss: 4.222019. Entropy: 1.192575.
Iteration 38108: Policy loss: 0.005272. Value loss: 3.416437. Entropy: 1.196401.
Iteration 38109: Policy loss: 0.004898. Value loss: 2.993842. Entropy: 1.197805.
episode: 14333   score: 170.0  epsilon: 1.0    steps: 217  evaluation reward: 294.7
episode: 14334   score: 465.0  epsilon: 1.0    steps: 621  evaluation reward: 293.15
Training network. lr: 0.000036. clip: 0.014275
Iteration 38110: Policy loss: 0.000393. Value loss: 5.414144. Entropy: 1.225328.
Iteration 38111: Policy loss: 0.005910. Value loss: 4.466335. Entropy: 1.254955.
Iteration 38112: Policy loss: 0.005492. Value loss: 3.804533. Entropy: 1.235174.
episode: 14335   score: 230.0  epsilon: 1.0    steps: 880  evaluation reward: 295.45
Training network. lr: 0.000036. clip: 0.014275
Iteration 38113: Policy loss: 0.001391. Value loss: 5.688277. Entropy: 1.304106.
Iteration 38114: Policy loss: 0.005752. Value loss: 4.546417. Entropy: 1.301154.
Iter

Iteration 38169: Policy loss: 0.002286. Value loss: 2.323340. Entropy: 1.166319.
episode: 14359   score: 180.0  epsilon: 1.0    steps: 277  evaluation reward: 288.6
episode: 14360   score: 235.0  epsilon: 1.0    steps: 769  evaluation reward: 286.35
episode: 14361   score: 265.0  epsilon: 1.0    steps: 908  evaluation reward: 283.4
Training network. lr: 0.000035. clip: 0.014163
Iteration 38170: Policy loss: 0.000707. Value loss: 2.201181. Entropy: 0.960242.
Iteration 38171: Policy loss: 0.000856. Value loss: 1.880139. Entropy: 0.956593.
Iteration 38172: Policy loss: -0.000098. Value loss: 1.626382. Entropy: 0.944292.
Training network. lr: 0.000035. clip: 0.014163
Iteration 38173: Policy loss: 0.000847. Value loss: 3.369913. Entropy: 1.075661.
Iteration 38174: Policy loss: 0.001104. Value loss: 3.036232. Entropy: 1.079008.
Iteration 38175: Policy loss: 0.002261. Value loss: 2.750315. Entropy: 1.079324.
episode: 14362   score: 160.0  epsilon: 1.0    steps: 207  evaluation reward: 281.4
e

Training network. lr: 0.000035. clip: 0.014050
Iteration 38233: Policy loss: 0.002863. Value loss: 3.937489. Entropy: 1.271527.
Iteration 38234: Policy loss: 0.005853. Value loss: 3.142257. Entropy: 1.263313.
Iteration 38235: Policy loss: 0.007568. Value loss: 2.786891. Entropy: 1.269506.
episode: 14383   score: 90.0  epsilon: 1.0    steps: 174  evaluation reward: 280.7
Training network. lr: 0.000035. clip: 0.014050
Iteration 38236: Policy loss: 0.005809. Value loss: 10.002869. Entropy: 1.220840.
Iteration 38237: Policy loss: 0.006641. Value loss: 7.943767. Entropy: 1.214299.
Iteration 38238: Policy loss: 0.007152. Value loss: 6.638159. Entropy: 1.220880.
episode: 14384   score: 245.0  epsilon: 1.0    steps: 18  evaluation reward: 278.1
episode: 14385   score: 695.0  epsilon: 1.0    steps: 336  evaluation reward: 277.4
Training network. lr: 0.000035. clip: 0.014050
Iteration 38239: Policy loss: 0.002667. Value loss: 6.072775. Entropy: 1.084717.
Iteration 38240: Policy loss: 0.003706. V

Iteration 38298: Policy loss: 0.002155. Value loss: 4.278844. Entropy: 1.028822.
Training network. lr: 0.000035. clip: 0.013938
Iteration 38299: Policy loss: 0.002014. Value loss: 3.400581. Entropy: 1.151379.
Iteration 38300: Policy loss: 0.004780. Value loss: 2.392036. Entropy: 1.137701.
Iteration 38301: Policy loss: 0.006020. Value loss: 1.931432. Entropy: 1.127605.
episode: 14405   score: 210.0  epsilon: 1.0    steps: 683  evaluation reward: 294.2
Training network. lr: 0.000035. clip: 0.013825
Iteration 38302: Policy loss: 0.001963. Value loss: 4.014302. Entropy: 1.215939.
Iteration 38303: Policy loss: 0.004586. Value loss: 3.124506. Entropy: 1.179006.
Iteration 38304: Policy loss: 0.006054. Value loss: 2.997973. Entropy: 1.199327.
episode: 14406   score: 370.0  epsilon: 1.0    steps: 263  evaluation reward: 292.8
Training network. lr: 0.000035. clip: 0.013825
Iteration 38305: Policy loss: 0.003003. Value loss: 9.959768. Entropy: 1.149092.
Iteration 38306: Policy loss: 0.005989. Val

episode: 14429   score: 320.0  epsilon: 1.0    steps: 345  evaluation reward: 305.45
episode: 14430   score: 325.0  epsilon: 1.0    steps: 863  evaluation reward: 306.6
Training network. lr: 0.000034. clip: 0.013713
Iteration 38362: Policy loss: 0.000956. Value loss: 3.176497. Entropy: 1.143712.
Iteration 38363: Policy loss: 0.002384. Value loss: 2.781834. Entropy: 1.153691.
Iteration 38364: Policy loss: 0.004426. Value loss: 2.491930. Entropy: 1.156291.
Training network. lr: 0.000034. clip: 0.013713
Iteration 38365: Policy loss: 0.001661. Value loss: 4.563541. Entropy: 1.081478.
Iteration 38366: Policy loss: 0.004359. Value loss: 3.396430. Entropy: 1.088319.
Iteration 38367: Policy loss: 0.004051. Value loss: 3.137265. Entropy: 1.104272.
episode: 14431   score: 650.0  epsilon: 1.0    steps: 199  evaluation reward: 308.3
Training network. lr: 0.000034. clip: 0.013713
Iteration 38368: Policy loss: 0.001512. Value loss: 5.120491. Entropy: 1.127049.
Iteration 38369: Policy loss: 0.001063.

Iteration 38423: Policy loss: 0.001924. Value loss: 2.808675. Entropy: 1.218595.
Iteration 38424: Policy loss: 0.001463. Value loss: 2.507992. Entropy: 1.206999.
Training network. lr: 0.000034. clip: 0.013600
Iteration 38425: Policy loss: 0.001136. Value loss: 2.352502. Entropy: 1.275447.
Iteration 38426: Policy loss: 0.003262. Value loss: 1.804424. Entropy: 1.276800.
Iteration 38427: Policy loss: 0.001056. Value loss: 1.621299. Entropy: 1.284798.
episode: 14456   score: 345.0  epsilon: 1.0    steps: 827  evaluation reward: 308.95
Training network. lr: 0.000034. clip: 0.013600
Iteration 38428: Policy loss: 0.002090. Value loss: 4.002502. Entropy: 1.322286.
Iteration 38429: Policy loss: 0.003243. Value loss: 3.231546. Entropy: 1.331338.
Iteration 38430: Policy loss: 0.003749. Value loss: 2.806601. Entropy: 1.329423.
episode: 14457   score: 180.0  epsilon: 1.0    steps: 186  evaluation reward: 309.5
episode: 14458   score: 290.0  epsilon: 1.0    steps: 504  evaluation reward: 306.75
epis

Training network. lr: 0.000034. clip: 0.013488
Iteration 38488: Policy loss: 0.000954. Value loss: 3.386096. Entropy: 1.113145.
Iteration 38489: Policy loss: 0.002467. Value loss: 2.884462. Entropy: 1.102535.
Iteration 38490: Policy loss: 0.002613. Value loss: 2.465813. Entropy: 1.107241.
episode: 14479   score: 520.0  epsilon: 1.0    steps: 66  evaluation reward: 323.75
episode: 14480   score: 210.0  epsilon: 1.0    steps: 139  evaluation reward: 324.25
Training network. lr: 0.000034. clip: 0.013488
Iteration 38491: Policy loss: 0.001268. Value loss: 2.844536. Entropy: 1.035304.
Iteration 38492: Policy loss: 0.002849. Value loss: 2.306201. Entropy: 1.039683.
Iteration 38493: Policy loss: 0.001979. Value loss: 2.124437. Entropy: 1.030247.
episode: 14481   score: 315.0  epsilon: 1.0    steps: 339  evaluation reward: 322.15
episode: 14482   score: 800.0  epsilon: 1.0    steps: 396  evaluation reward: 321.2
Training network. lr: 0.000034. clip: 0.013488
Iteration 38494: Policy loss: 0.001

Iteration 38551: Policy loss: 0.003042. Value loss: 3.425851. Entropy: 1.183040.
Iteration 38552: Policy loss: 0.006637. Value loss: 2.783361. Entropy: 1.183261.
Iteration 38553: Policy loss: 0.005392. Value loss: 2.272343. Entropy: 1.186621.
episode: 14503   score: 55.0  epsilon: 1.0    steps: 238  evaluation reward: 328.5
Training network. lr: 0.000033. clip: 0.013262
Iteration 38554: Policy loss: 0.001133. Value loss: 5.720405. Entropy: 1.064511.
Iteration 38555: Policy loss: 0.002311. Value loss: 4.165244. Entropy: 1.039590.
Iteration 38556: Policy loss: 0.001995. Value loss: 3.805877. Entropy: 1.030343.
episode: 14504   score: 330.0  epsilon: 1.0    steps: 796  evaluation reward: 323.8
Training network. lr: 0.000033. clip: 0.013262
Iteration 38557: Policy loss: 0.000672. Value loss: 3.393468. Entropy: 1.127112.
Iteration 38558: Policy loss: 0.002061. Value loss: 2.618300. Entropy: 1.131410.
Iteration 38559: Policy loss: 0.000921. Value loss: 2.411083. Entropy: 1.107687.
episode: 1

episode: 14529   score: 260.0  epsilon: 1.0    steps: 646  evaluation reward: 326.3
Training network. lr: 0.000033. clip: 0.013150
Iteration 38614: Policy loss: 0.000423. Value loss: 2.930375. Entropy: 0.963898.
Iteration 38615: Policy loss: 0.004502. Value loss: 2.251938. Entropy: 0.970974.
Iteration 38616: Policy loss: 0.006132. Value loss: 1.898030. Entropy: 0.962850.
episode: 14530   score: 375.0  epsilon: 1.0    steps: 846  evaluation reward: 325.7
Training network. lr: 0.000033. clip: 0.013150
Iteration 38617: Policy loss: 0.002070. Value loss: 3.192203. Entropy: 1.055519.
Iteration 38618: Policy loss: 0.003978. Value loss: 2.553393. Entropy: 1.051852.
Iteration 38619: Policy loss: 0.004669. Value loss: 2.203706. Entropy: 1.046727.
Training network. lr: 0.000033. clip: 0.013150
Iteration 38620: Policy loss: 0.001773. Value loss: 3.154428. Entropy: 1.178007.
Iteration 38621: Policy loss: 0.002751. Value loss: 2.729562. Entropy: 1.174226.
Iteration 38622: Policy loss: 0.003591. Val

Training network. lr: 0.000033. clip: 0.013038
Iteration 38677: Policy loss: 0.005008. Value loss: 3.529803. Entropy: 1.141329.
Iteration 38678: Policy loss: 0.007145. Value loss: 2.538878. Entropy: 1.184514.
Iteration 38679: Policy loss: 0.004687. Value loss: 2.182248. Entropy: 1.153605.
episode: 14554   score: 285.0  epsilon: 1.0    steps: 523  evaluation reward: 335.35
Training network. lr: 0.000033. clip: 0.013038
Iteration 38680: Policy loss: 0.003638. Value loss: 4.007261. Entropy: 1.199890.
Iteration 38681: Policy loss: 0.009554. Value loss: 2.857976. Entropy: 1.193935.
Iteration 38682: Policy loss: 0.011809. Value loss: 2.708090. Entropy: 1.185381.
episode: 14555   score: 185.0  epsilon: 1.0    steps: 641  evaluation reward: 333.25
Training network. lr: 0.000033. clip: 0.013038
Iteration 38683: Policy loss: 0.001674. Value loss: 2.846952. Entropy: 1.112470.
Iteration 38684: Policy loss: 0.003691. Value loss: 2.207070. Entropy: 1.094375.
Iteration 38685: Policy loss: 0.003970. V

Iteration 38742: Policy loss: 0.002518. Value loss: 2.431662. Entropy: 1.171031.
episode: 14577   score: 175.0  epsilon: 1.0    steps: 494  evaluation reward: 326.2
episode: 14578   score: 365.0  epsilon: 1.0    steps: 852  evaluation reward: 324.8
Training network. lr: 0.000032. clip: 0.012925
Iteration 38743: Policy loss: 0.001121. Value loss: 2.940307. Entropy: 1.247184.
Iteration 38744: Policy loss: 0.001592. Value loss: 2.330767. Entropy: 1.261192.
Iteration 38745: Policy loss: 0.001920. Value loss: 1.995260. Entropy: 1.269493.
episode: 14579   score: 395.0  epsilon: 1.0    steps: 135  evaluation reward: 323.95
Training network. lr: 0.000032. clip: 0.012925
Iteration 38746: Policy loss: 0.001591. Value loss: 3.793959. Entropy: 1.168705.
Iteration 38747: Policy loss: 0.002920. Value loss: 2.798446. Entropy: 1.174099.
Iteration 38748: Policy loss: 0.001846. Value loss: 2.435616. Entropy: 1.164159.
episode: 14580   score: 315.0  epsilon: 1.0    steps: 102  evaluation reward: 322.7
Tr

Iteration 38807: Policy loss: 0.002940. Value loss: 2.211605. Entropy: 1.172895.
Iteration 38808: Policy loss: 0.003100. Value loss: 2.012748. Entropy: 1.165506.
Training network. lr: 0.000032. clip: 0.012700
Iteration 38809: Policy loss: 0.002498. Value loss: 6.810022. Entropy: 1.173658.
Iteration 38810: Policy loss: 0.004725. Value loss: 5.269797. Entropy: 1.170336.
Iteration 38811: Policy loss: 0.006030. Value loss: 4.466549. Entropy: 1.210206.
episode: 14600   score: 255.0  epsilon: 1.0    steps: 635  evaluation reward: 313.3
now time :  2019-02-23 12:58:22.465673
episode: 14601   score: 315.0  epsilon: 1.0    steps: 712  evaluation reward: 313.9
Training network. lr: 0.000032. clip: 0.012700
Iteration 38812: Policy loss: 0.000846. Value loss: 4.472424. Entropy: 1.147680.
Iteration 38813: Policy loss: 0.002565. Value loss: 3.711264. Entropy: 1.145539.
Iteration 38814: Policy loss: 0.001742. Value loss: 3.327906. Entropy: 1.164262.
episode: 14602   score: 155.0  epsilon: 1.0    step

Iteration 38872: Policy loss: 0.000733. Value loss: 5.802796. Entropy: 1.124888.
Iteration 38873: Policy loss: 0.003579. Value loss: 4.458784. Entropy: 1.127105.
Iteration 38874: Policy loss: 0.000835. Value loss: 4.912866. Entropy: 1.117100.
episode: 14622   score: 355.0  epsilon: 1.0    steps: 442  evaluation reward: 321.25
Training network. lr: 0.000031. clip: 0.012588
Iteration 38875: Policy loss: 0.002195. Value loss: 10.176338. Entropy: 0.966903.
Iteration 38876: Policy loss: 0.005411. Value loss: 8.203424. Entropy: 0.963821.
Iteration 38877: Policy loss: 0.005827. Value loss: 7.215561. Entropy: 0.950362.
Training network. lr: 0.000031. clip: 0.012588
Iteration 38878: Policy loss: 0.001520. Value loss: 6.874476. Entropy: 0.997137.
Iteration 38879: Policy loss: 0.003172. Value loss: 5.306501. Entropy: 1.000479.
Iteration 38880: Policy loss: 0.001605. Value loss: 4.567904. Entropy: 1.004783.
episode: 14623   score: 400.0  epsilon: 1.0    steps: 105  evaluation reward: 322.2
Trainin

Iteration 38940: Policy loss: 0.005835. Value loss: 6.687326. Entropy: 1.229235.
episode: 14642   score: 270.0  epsilon: 1.0    steps: 422  evaluation reward: 331.8
episode: 14643   score: 570.0  epsilon: 1.0    steps: 629  evaluation reward: 329.65
Training network. lr: 0.000031. clip: 0.012475
Iteration 38941: Policy loss: 0.002210. Value loss: 5.016416. Entropy: 0.973709.
Iteration 38942: Policy loss: 0.002678. Value loss: 3.519186. Entropy: 0.948447.
Iteration 38943: Policy loss: 0.002982. Value loss: 2.761468. Entropy: 0.972872.
episode: 14644   score: 380.0  epsilon: 1.0    steps: 93  evaluation reward: 333.75
episode: 14645   score: 460.0  epsilon: 1.0    steps: 342  evaluation reward: 334.9
episode: 14646   score: 635.0  epsilon: 1.0    steps: 682  evaluation reward: 336.05
Training network. lr: 0.000031. clip: 0.012475
Iteration 38944: Policy loss: 0.001184. Value loss: 6.280181. Entropy: 0.963113.
Iteration 38945: Policy loss: 0.003908. Value loss: 3.849513. Entropy: 1.012811

Iteration 39002: Policy loss: 0.001287. Value loss: 2.698990. Entropy: 1.210600.
Iteration 39003: Policy loss: 0.002327. Value loss: 2.439084. Entropy: 1.205412.
episode: 14668   score: 155.0  epsilon: 1.0    steps: 421  evaluation reward: 351.25
episode: 14669   score: 300.0  epsilon: 1.0    steps: 656  evaluation reward: 347.05
Training network. lr: 0.000031. clip: 0.012250
Iteration 39004: Policy loss: 0.001557. Value loss: 3.374843. Entropy: 1.134114.
Iteration 39005: Policy loss: 0.004077. Value loss: 2.444342. Entropy: 1.135198.
Iteration 39006: Policy loss: 0.003699. Value loss: 2.553558. Entropy: 1.126041.
episode: 14670   score: 210.0  epsilon: 1.0    steps: 798  evaluation reward: 348.25
episode: 14671   score: 155.0  epsilon: 1.0    steps: 910  evaluation reward: 348.4
Training network. lr: 0.000031. clip: 0.012250
Iteration 39007: Policy loss: 0.000813. Value loss: 2.709102. Entropy: 1.104407.
Iteration 39008: Policy loss: 0.001806. Value loss: 2.150946. Entropy: 1.093363.


Iteration 39065: Policy loss: 0.002379. Value loss: 5.374455. Entropy: 1.312968.
Iteration 39066: Policy loss: 0.003842. Value loss: 4.662792. Entropy: 1.316286.
episode: 14693   score: 335.0  epsilon: 1.0    steps: 264  evaluation reward: 342.6
Training network. lr: 0.000030. clip: 0.012138
Iteration 39067: Policy loss: 0.003396. Value loss: 2.983958. Entropy: 1.179935.
Iteration 39068: Policy loss: 0.006523. Value loss: 2.122736. Entropy: 1.212937.
Iteration 39069: Policy loss: 0.005215. Value loss: 1.659774. Entropy: 1.196300.
episode: 14694   score: 165.0  epsilon: 1.0    steps: 875  evaluation reward: 342.5
Training network. lr: 0.000030. clip: 0.012138
Iteration 39070: Policy loss: 0.002259. Value loss: 3.561762. Entropy: 1.328334.
Iteration 39071: Policy loss: 0.007490. Value loss: 2.865908. Entropy: 1.310428.
Iteration 39072: Policy loss: 0.007907. Value loss: 2.492399. Entropy: 1.328359.
Training network. lr: 0.000030. clip: 0.012138
Iteration 39073: Policy loss: 0.001276. Val

Training network. lr: 0.000030. clip: 0.012025
Iteration 39130: Policy loss: 0.000483. Value loss: 8.771635. Entropy: 1.215574.
Iteration 39131: Policy loss: 0.002735. Value loss: 7.711721. Entropy: 1.192169.
Iteration 39132: Policy loss: 0.003304. Value loss: 7.190226. Entropy: 1.203432.
Training network. lr: 0.000030. clip: 0.012025
Iteration 39133: Policy loss: 0.002591. Value loss: 2.960387. Entropy: 1.225842.
Iteration 39134: Policy loss: 0.002165. Value loss: 2.068706. Entropy: 1.228368.
Iteration 39135: Policy loss: 0.001934. Value loss: 1.810532. Entropy: 1.235360.
Training network. lr: 0.000030. clip: 0.012025
Iteration 39136: Policy loss: 0.001667. Value loss: 9.623652. Entropy: 1.321424.
Iteration 39137: Policy loss: 0.004468. Value loss: 8.210206. Entropy: 1.329316.
Iteration 39138: Policy loss: 0.005317. Value loss: 7.837428. Entropy: 1.325569.
episode: 14716   score: 405.0  epsilon: 1.0    steps: 373  evaluation reward: 351.4
episode: 14717   score: 305.0  epsilon: 1.0   

episode: 14738   score: 155.0  epsilon: 1.0    steps: 828  evaluation reward: 344.9
episode: 14739   score: 255.0  epsilon: 1.0    steps: 965  evaluation reward: 339.95
Training network. lr: 0.000030. clip: 0.011913
Iteration 39196: Policy loss: 0.001163. Value loss: 5.807758. Entropy: 1.156350.
Iteration 39197: Policy loss: 0.003824. Value loss: 4.021011. Entropy: 1.146961.
Iteration 39198: Policy loss: 0.004363. Value loss: 3.881501. Entropy: 1.158296.
episode: 14740   score: 400.0  epsilon: 1.0    steps: 242  evaluation reward: 339.65
Training network. lr: 0.000030. clip: 0.011913
Iteration 39199: Policy loss: 0.001679. Value loss: 3.345918. Entropy: 1.142655.
Iteration 39200: Policy loss: 0.002055. Value loss: 2.409879. Entropy: 1.152287.
Iteration 39201: Policy loss: 0.001999. Value loss: 2.199962. Entropy: 1.143118.
episode: 14741   score: 365.0  epsilon: 1.0    steps: 105  evaluation reward: 339.0
episode: 14742   score: 230.0  epsilon: 1.0    steps: 360  evaluation reward: 339.

Iteration 39258: Policy loss: 0.000958. Value loss: 1.551275. Entropy: 1.197777.
Training network. lr: 0.000029. clip: 0.011687
Iteration 39259: Policy loss: 0.001356. Value loss: 3.289931. Entropy: 1.310542.
Iteration 39260: Policy loss: 0.002340. Value loss: 2.520788. Entropy: 1.319741.
Iteration 39261: Policy loss: 0.002429. Value loss: 2.548342. Entropy: 1.325926.
Training network. lr: 0.000029. clip: 0.011687
Iteration 39262: Policy loss: 0.001429. Value loss: 7.978537. Entropy: 1.205622.
Iteration 39263: Policy loss: 0.005059. Value loss: 5.646172. Entropy: 1.195794.
Iteration 39264: Policy loss: 0.005394. Value loss: 5.602322. Entropy: 1.209535.
episode: 14764   score: 545.0  epsilon: 1.0    steps: 191  evaluation reward: 333.6
episode: 14765   score: 560.0  epsilon: 1.0    steps: 709  evaluation reward: 335.9
episode: 14766   score: 230.0  epsilon: 1.0    steps: 908  evaluation reward: 338.7
Training network. lr: 0.000029. clip: 0.011687
Iteration 39265: Policy loss: 0.000608. 

Training network. lr: 0.000029. clip: 0.011575
Iteration 39322: Policy loss: 0.001125. Value loss: 4.045600. Entropy: 1.129634.
Iteration 39323: Policy loss: 0.004884. Value loss: 3.080840. Entropy: 1.136497.
Iteration 39324: Policy loss: 0.006493. Value loss: 2.853575. Entropy: 1.133030.
episode: 14788   score: 315.0  epsilon: 1.0    steps: 972  evaluation reward: 354.65
Training network. lr: 0.000029. clip: 0.011575
Iteration 39325: Policy loss: 0.000608. Value loss: 3.635046. Entropy: 1.336909.
Iteration 39326: Policy loss: 0.003028. Value loss: 3.097116. Entropy: 1.345463.
Iteration 39327: Policy loss: 0.002481. Value loss: 2.545396. Entropy: 1.341390.
Training network. lr: 0.000029. clip: 0.011575
Iteration 39328: Policy loss: 0.001331. Value loss: 3.891766. Entropy: 1.368519.
Iteration 39329: Policy loss: 0.002638. Value loss: 2.998116. Entropy: 1.364846.
Iteration 39330: Policy loss: 0.006013. Value loss: 2.387316. Entropy: 1.366161.
episode: 14789   score: 280.0  epsilon: 1.0  

Iteration 39387: Policy loss: 0.005783. Value loss: 3.002656. Entropy: 1.142971.
Training network. lr: 0.000029. clip: 0.011463
Iteration 39388: Policy loss: 0.003877. Value loss: 5.007714. Entropy: 1.075988.
Iteration 39389: Policy loss: 0.006227. Value loss: 3.770694. Entropy: 1.062243.
Iteration 39390: Policy loss: 0.005457. Value loss: 3.165437. Entropy: 1.068266.
episode: 14810   score: 315.0  epsilon: 1.0    steps: 414  evaluation reward: 361.85
Training network. lr: 0.000029. clip: 0.011463
Iteration 39391: Policy loss: 0.000916. Value loss: 3.567747. Entropy: 1.131962.
Iteration 39392: Policy loss: 0.002117. Value loss: 3.001289. Entropy: 1.147877.
Iteration 39393: Policy loss: 0.003681. Value loss: 2.493669. Entropy: 1.126549.
episode: 14811   score: 155.0  epsilon: 1.0    steps: 564  evaluation reward: 357.3
Training network. lr: 0.000029. clip: 0.011463
Iteration 39394: Policy loss: 0.000904. Value loss: 5.494977. Entropy: 1.261983.
Iteration 39395: Policy loss: 0.002746. Va

Iteration 39452: Policy loss: 0.002985. Value loss: 3.297991. Entropy: 1.106777.
Iteration 39453: Policy loss: 0.004163. Value loss: 3.044711. Entropy: 1.117347.
Training network. lr: 0.000028. clip: 0.011238
Iteration 39454: Policy loss: 0.000124. Value loss: 4.641446. Entropy: 1.331306.
Iteration 39455: Policy loss: 0.001341. Value loss: 4.131561. Entropy: 1.348397.
Iteration 39456: Policy loss: 0.003472. Value loss: 3.736887. Entropy: 1.348611.
episode: 14833   score: 215.0  epsilon: 1.0    steps: 347  evaluation reward: 372.1
Training network. lr: 0.000028. clip: 0.011238
Iteration 39457: Policy loss: 0.001859. Value loss: 7.099303. Entropy: 1.120135.
Iteration 39458: Policy loss: 0.002474. Value loss: 5.753838. Entropy: 1.124735.
Iteration 39459: Policy loss: 0.000923. Value loss: 5.553114. Entropy: 1.122091.
episode: 14834   score: 330.0  epsilon: 1.0    steps: 701  evaluation reward: 370.85
episode: 14835   score: 355.0  epsilon: 1.0    steps: 970  evaluation reward: 372.05
Trai

Training network. lr: 0.000028. clip: 0.011125
Iteration 39517: Policy loss: 0.000963. Value loss: 3.476323. Entropy: 1.190452.
Iteration 39518: Policy loss: 0.002240. Value loss: 2.711940. Entropy: 1.193409.
Iteration 39519: Policy loss: 0.002785. Value loss: 2.590659. Entropy: 1.215191.
episode: 14856   score: 635.0  epsilon: 1.0    steps: 108  evaluation reward: 367.8
Training network. lr: 0.000028. clip: 0.011125
Iteration 39520: Policy loss: 0.002199. Value loss: 7.712337. Entropy: 1.051960.
Iteration 39521: Policy loss: 0.005017. Value loss: 6.928712. Entropy: 1.076641.
Iteration 39522: Policy loss: 0.004444. Value loss: 6.144937. Entropy: 1.070209.
episode: 14857   score: 380.0  epsilon: 1.0    steps: 685  evaluation reward: 371.0
Training network. lr: 0.000028. clip: 0.011125
Iteration 39523: Policy loss: 0.002214. Value loss: 5.429946. Entropy: 1.179642.
Iteration 39524: Policy loss: 0.004092. Value loss: 3.763874. Entropy: 1.180998.
Iteration 39525: Policy loss: 0.004596. Val

Iteration 39582: Policy loss: 0.003576. Value loss: 3.087112. Entropy: 1.090164.
episode: 14879   score: 180.0  epsilon: 1.0    steps: 799  evaluation reward: 367.45
Training network. lr: 0.000028. clip: 0.011012
Iteration 39583: Policy loss: 0.001805. Value loss: 4.116220. Entropy: 1.089283.
Iteration 39584: Policy loss: 0.002220. Value loss: 3.252872. Entropy: 1.095703.
Iteration 39585: Policy loss: 0.004173. Value loss: 2.876067. Entropy: 1.086551.
episode: 14880   score: 295.0  epsilon: 1.0    steps: 133  evaluation reward: 366.4
episode: 14881   score: 155.0  epsilon: 1.0    steps: 516  evaluation reward: 365.4
Training network. lr: 0.000028. clip: 0.011012
Iteration 39586: Policy loss: 0.001212. Value loss: 2.918408. Entropy: 0.985107.
Iteration 39587: Policy loss: 0.001080. Value loss: 2.378402. Entropy: 0.991331.
Iteration 39588: Policy loss: 0.001314. Value loss: 2.159177. Entropy: 0.980209.
episode: 14882   score: 180.0  epsilon: 1.0    steps: 115  evaluation reward: 364.05
e

Iteration 39645: Policy loss: 0.003904. Value loss: 3.453832. Entropy: 1.229051.
episode: 14904   score: 330.0  epsilon: 1.0    steps: 730  evaluation reward: 351.2
Training network. lr: 0.000027. clip: 0.010900
Iteration 39646: Policy loss: 0.001494. Value loss: 5.506274. Entropy: 1.290958.
Iteration 39647: Policy loss: 0.003227. Value loss: 4.589604. Entropy: 1.297906.
Iteration 39648: Policy loss: 0.003445. Value loss: 3.999827. Entropy: 1.283238.
episode: 14905   score: 465.0  epsilon: 1.0    steps: 260  evaluation reward: 350.8
Training network. lr: 0.000027. clip: 0.010900
Iteration 39649: Policy loss: 0.002155. Value loss: 9.995708. Entropy: 1.185164.
Iteration 39650: Policy loss: 0.003813. Value loss: 8.417804. Entropy: 1.131253.
Iteration 39651: Policy loss: 0.003035. Value loss: 7.257774. Entropy: 1.180510.
Training network. lr: 0.000027. clip: 0.010788
Iteration 39652: Policy loss: 0.000547. Value loss: 8.282780. Entropy: 0.970005.
Iteration 39653: Policy loss: 0.001534. Val

Iteration 39710: Policy loss: 0.004550. Value loss: 5.373477. Entropy: 1.125483.
Iteration 39711: Policy loss: 0.004794. Value loss: 5.128686. Entropy: 1.135341.
Training network. lr: 0.000027. clip: 0.010675
Iteration 39712: Policy loss: 0.002159. Value loss: 6.303378. Entropy: 1.156843.
Iteration 39713: Policy loss: 0.004807. Value loss: 4.909992. Entropy: 1.173794.
Iteration 39714: Policy loss: 0.006242. Value loss: 4.096968. Entropy: 1.163016.
Training network. lr: 0.000027. clip: 0.010675
Iteration 39715: Policy loss: 0.000545. Value loss: 6.317126. Entropy: 1.181513.
Iteration 39716: Policy loss: 0.002015. Value loss: 5.975118. Entropy: 1.169516.
Iteration 39717: Policy loss: 0.003980. Value loss: 4.952942. Entropy: 1.171917.
episode: 14927   score: 395.0  epsilon: 1.0    steps: 374  evaluation reward: 353.95
episode: 14928   score: 165.0  epsilon: 1.0    steps: 890  evaluation reward: 354.3
Training network. lr: 0.000027. clip: 0.010675
Iteration 39718: Policy loss: 0.001615. Va

Training network. lr: 0.000026. clip: 0.010563
Iteration 39775: Policy loss: 0.001100. Value loss: 2.396843. Entropy: 1.145740.
Iteration 39776: Policy loss: 0.002195. Value loss: 1.929607. Entropy: 1.142088.
Iteration 39777: Policy loss: 0.001648. Value loss: 1.692327. Entropy: 1.135082.
Training network. lr: 0.000026. clip: 0.010563
Iteration 39778: Policy loss: 0.001372. Value loss: 5.683999. Entropy: 1.053435.
Iteration 39779: Policy loss: 0.003016. Value loss: 4.419631. Entropy: 1.054974.
Iteration 39780: Policy loss: 0.002118. Value loss: 4.094511. Entropy: 1.051093.
episode: 14950   score: 230.0  epsilon: 1.0    steps: 81  evaluation reward: 350.35
now time :  2019-02-23 13:18:17.730322
episode: 14951   score: 150.0  epsilon: 1.0    steps: 771  evaluation reward: 349.65
Training network. lr: 0.000026. clip: 0.010563
Iteration 39781: Policy loss: 0.001707. Value loss: 4.578623. Entropy: 1.089821.
Iteration 39782: Policy loss: 0.003578. Value loss: 3.374209. Entropy: 1.085770.
Ite

Iteration 39840: Policy loss: 0.004153. Value loss: 2.339720. Entropy: 1.221283.
Training network. lr: 0.000026. clip: 0.010450
Iteration 39841: Policy loss: 0.002086. Value loss: 5.970107. Entropy: 1.258116.
Iteration 39842: Policy loss: 0.003219. Value loss: 5.198437. Entropy: 1.255890.
Iteration 39843: Policy loss: 0.004343. Value loss: 4.924119. Entropy: 1.250942.
Training network. lr: 0.000026. clip: 0.010450
Iteration 39844: Policy loss: 0.002045. Value loss: 10.649441. Entropy: 1.326755.
Iteration 39845: Policy loss: 0.005000. Value loss: 9.076127. Entropy: 1.333812.
Iteration 39846: Policy loss: 0.006171. Value loss: 7.183452. Entropy: 1.341049.
Training network. lr: 0.000026. clip: 0.010450
Iteration 39847: Policy loss: 0.003249. Value loss: 7.324173. Entropy: 1.201214.
Iteration 39848: Policy loss: 0.004481. Value loss: 5.891182. Entropy: 1.206241.
Iteration 39849: Policy loss: 0.006089. Value loss: 5.636402. Entropy: 1.193440.
episode: 14972   score: 155.0  epsilon: 1.0    s

episode: 14994   score: 290.0  epsilon: 1.0    steps: 278  evaluation reward: 361.9
episode: 14995   score: 260.0  epsilon: 1.0    steps: 474  evaluation reward: 361.65
Training network. lr: 0.000026. clip: 0.010225
Iteration 39907: Policy loss: 0.001084. Value loss: 2.288209. Entropy: 1.096343.
Iteration 39908: Policy loss: 0.002279. Value loss: 1.920512. Entropy: 1.119778.
Iteration 39909: Policy loss: 0.003505. Value loss: 1.673748. Entropy: 1.110992.
Training network. lr: 0.000026. clip: 0.010225
Iteration 39910: Policy loss: 0.001220. Value loss: 4.470530. Entropy: 1.043340.
Iteration 39911: Policy loss: 0.004503. Value loss: 3.294636. Entropy: 1.038531.
Iteration 39912: Policy loss: 0.004261. Value loss: 2.659961. Entropy: 1.038732.
Training network. lr: 0.000026. clip: 0.010225
Iteration 39913: Policy loss: 0.002306. Value loss: 7.290182. Entropy: 1.109939.
Iteration 39914: Policy loss: 0.002563. Value loss: 6.180979. Entropy: 1.116956.
Iteration 39915: Policy loss: 0.002164. Va

Iteration 39972: Policy loss: 0.003928. Value loss: 2.473309. Entropy: 1.156797.
Training network. lr: 0.000025. clip: 0.010112
Iteration 39973: Policy loss: 0.000558. Value loss: 3.319672. Entropy: 1.288109.
Iteration 39974: Policy loss: 0.000912. Value loss: 2.770927. Entropy: 1.292627.
Iteration 39975: Policy loss: 0.001815. Value loss: 2.579614. Entropy: 1.290767.
episode: 15016   score: 450.0  epsilon: 1.0    steps: 110  evaluation reward: 367.55
Training network. lr: 0.000025. clip: 0.010112
Iteration 39976: Policy loss: 0.002096. Value loss: 5.351385. Entropy: 1.335212.
Iteration 39977: Policy loss: 0.003708. Value loss: 4.376274. Entropy: 1.320263.
Iteration 39978: Policy loss: 0.006661. Value loss: 4.131668. Entropy: 1.331338.
episode: 15017   score: 325.0  epsilon: 1.0    steps: 845  evaluation reward: 368.45
Training network. lr: 0.000025. clip: 0.010112
Iteration 39979: Policy loss: 0.001793. Value loss: 9.447397. Entropy: 1.032772.
Iteration 39980: Policy loss: 0.002624. V

Training network. lr: 0.000025. clip: 0.010112
Iteration 40039: Policy loss: 0.002243. Value loss: 3.852190. Entropy: 1.184353.
Iteration 40040: Policy loss: 0.002332. Value loss: 3.180876. Entropy: 1.185422.
Iteration 40041: Policy loss: 0.003073. Value loss: 2.761787. Entropy: 1.208896.
episode: 15037   score: 575.0  epsilon: 1.0    steps: 109  evaluation reward: 373.4
episode: 15038   score: 180.0  epsilon: 1.0    steps: 346  evaluation reward: 374.8
episode: 15039   score: 135.0  epsilon: 1.0    steps: 616  evaluation reward: 370.85
episode: 15040   score: 125.0  epsilon: 1.0    steps: 913  evaluation reward: 370.3
Training network. lr: 0.000025. clip: 0.010112
Iteration 40042: Policy loss: 0.000661. Value loss: 4.020823. Entropy: 1.170453.
Iteration 40043: Policy loss: 0.002000. Value loss: 3.358610. Entropy: 1.172994.
Iteration 40044: Policy loss: 0.002116. Value loss: 2.852433. Entropy: 1.177705.
Training network. lr: 0.000025. clip: 0.010112
Iteration 40045: Policy loss: 0.0004

episode: 15062   score: 285.0  epsilon: 1.0    steps: 774  evaluation reward: 368.35
Training network. lr: 0.000025. clip: 0.010112
Iteration 40102: Policy loss: 0.002213. Value loss: 3.919701. Entropy: 1.124865.
Iteration 40103: Policy loss: 0.004530. Value loss: 3.396584. Entropy: 1.146138.
Iteration 40104: Policy loss: 0.004643. Value loss: 2.877869. Entropy: 1.135063.
episode: 15063   score: 440.0  epsilon: 1.0    steps: 189  evaluation reward: 363.75
Training network. lr: 0.000025. clip: 0.010112
Iteration 40105: Policy loss: 0.000881. Value loss: 7.427625. Entropy: 1.026582.
Iteration 40106: Policy loss: 0.003375. Value loss: 7.133285. Entropy: 1.042374.
Iteration 40107: Policy loss: 0.005200. Value loss: 6.061017. Entropy: 1.053593.
episode: 15064   score: 380.0  epsilon: 1.0    steps: 409  evaluation reward: 365.0
Training network. lr: 0.000025. clip: 0.010112
Iteration 40108: Policy loss: 0.001057. Value loss: 3.513656. Entropy: 1.314176.
Iteration 40109: Policy loss: 0.002909

Iteration 40167: Policy loss: 0.001546. Value loss: 4.192460. Entropy: 1.272400.
episode: 15085   score: 180.0  epsilon: 1.0    steps: 313  evaluation reward: 362.95
episode: 15086   score: 425.0  epsilon: 1.0    steps: 731  evaluation reward: 360.05
Training network. lr: 0.000025. clip: 0.010112
Iteration 40168: Policy loss: 0.001116. Value loss: 4.478427. Entropy: 1.179735.
Iteration 40169: Policy loss: 0.003696. Value loss: 3.671718. Entropy: 1.183204.
Iteration 40170: Policy loss: 0.003068. Value loss: 2.827554. Entropy: 1.189427.
episode: 15087   score: 470.0  epsilon: 1.0    steps: 16  evaluation reward: 361.35
Training network. lr: 0.000025. clip: 0.010112
Iteration 40171: Policy loss: 0.002299. Value loss: 5.776539. Entropy: 1.144068.
Iteration 40172: Policy loss: 0.005279. Value loss: 4.880340. Entropy: 1.153510.
Iteration 40173: Policy loss: 0.006755. Value loss: 4.137109. Entropy: 1.151353.
episode: 15088   score: 520.0  epsilon: 1.0    steps: 141  evaluation reward: 364.5
e

episode: 15109   score: 265.0  epsilon: 1.0    steps: 90  evaluation reward: 358.4
episode: 15110   score: 545.0  epsilon: 1.0    steps: 586  evaluation reward: 358.95
Training network. lr: 0.000025. clip: 0.010112
Iteration 40231: Policy loss: 0.000852. Value loss: 4.090071. Entropy: 1.273777.
Iteration 40232: Policy loss: 0.001446. Value loss: 3.404028. Entropy: 1.262042.
Iteration 40233: Policy loss: 0.002690. Value loss: 2.910499. Entropy: 1.282124.
episode: 15111   score: 280.0  epsilon: 1.0    steps: 276  evaluation reward: 358.4
episode: 15112   score: 420.0  epsilon: 1.0    steps: 393  evaluation reward: 356.95
episode: 15113   score: 430.0  epsilon: 1.0    steps: 666  evaluation reward: 357.3
Training network. lr: 0.000025. clip: 0.010112
Iteration 40234: Policy loss: 0.001799. Value loss: 7.405483. Entropy: 1.121247.
Iteration 40235: Policy loss: 0.005865. Value loss: 6.171382. Entropy: 1.135416.
Iteration 40236: Policy loss: 0.002744. Value loss: 5.840158. Entropy: 1.122115.

Iteration 40296: Policy loss: 0.005308. Value loss: 5.802340. Entropy: 1.207541.
episode: 15132   score: 860.0  epsilon: 1.0    steps: 195  evaluation reward: 347.95
episode: 15133   score: 325.0  epsilon: 1.0    steps: 882  evaluation reward: 354.45
Training network. lr: 0.000025. clip: 0.010112
Iteration 40297: Policy loss: 0.000064. Value loss: 8.467047. Entropy: 1.196634.
Iteration 40298: Policy loss: 0.003436. Value loss: 6.661886. Entropy: 1.182881.
Iteration 40299: Policy loss: 0.003444. Value loss: 7.249355. Entropy: 1.222860.
episode: 15134   score: 625.0  epsilon: 1.0    steps: 720  evaluation reward: 355.1
episode: 15135   score: 495.0  epsilon: 1.0    steps: 930  evaluation reward: 358.25
Training network. lr: 0.000025. clip: 0.010112
Iteration 40300: Policy loss: 0.000288. Value loss: 2.156789. Entropy: 0.994943.
Iteration 40301: Policy loss: 0.001301. Value loss: 2.017128. Entropy: 0.956136.
Iteration 40302: Policy loss: 0.000862. Value loss: 1.785903. Entropy: 0.977417.


Iteration 40362: Policy loss: 0.004912. Value loss: 4.452838. Entropy: 1.170193.
episode: 15153   score: 515.0  epsilon: 1.0    steps: 83  evaluation reward: 380.1
episode: 15154   score: 310.0  epsilon: 1.0    steps: 331  evaluation reward: 381.0
episode: 15155   score: 575.0  epsilon: 1.0    steps: 734  evaluation reward: 381.75
Training network. lr: 0.000025. clip: 0.010112
Iteration 40363: Policy loss: 0.001332. Value loss: 3.420948. Entropy: 1.028484.
Iteration 40364: Policy loss: 0.004996. Value loss: 2.833578. Entropy: 1.011128.
Iteration 40365: Policy loss: 0.005982. Value loss: 2.271844. Entropy: 1.027931.
Training network. lr: 0.000025. clip: 0.010112
Iteration 40366: Policy loss: -0.000073. Value loss: 3.484949. Entropy: 1.107826.
Iteration 40367: Policy loss: 0.000833. Value loss: 2.664403. Entropy: 1.111301.
Iteration 40368: Policy loss: 0.001956. Value loss: 2.345710. Entropy: 1.091323.
episode: 15156   score: 560.0  epsilon: 1.0    steps: 852  evaluation reward: 384.5
Tr

Iteration 40427: Policy loss: 0.004227. Value loss: 2.596890. Entropy: 0.993059.
Iteration 40428: Policy loss: 0.003682. Value loss: 2.319110. Entropy: 0.976295.
episode: 15176   score: 155.0  epsilon: 1.0    steps: 166  evaluation reward: 397.25
episode: 15177   score: 425.0  epsilon: 1.0    steps: 479  evaluation reward: 395.15
episode: 15178   score: 335.0  epsilon: 1.0    steps: 635  evaluation reward: 397.6
Training network. lr: 0.000025. clip: 0.010112
Iteration 40429: Policy loss: 0.003933. Value loss: 7.697074. Entropy: 1.041226.
Iteration 40430: Policy loss: 0.002949. Value loss: 7.288107. Entropy: 1.033883.
Iteration 40431: Policy loss: 0.003871. Value loss: 6.980863. Entropy: 1.035817.
Training network. lr: 0.000025. clip: 0.010112
Iteration 40432: Policy loss: 0.000924. Value loss: 2.984976. Entropy: 1.007650.
Iteration 40433: Policy loss: 0.001035. Value loss: 2.572408. Entropy: 1.022748.
Iteration 40434: Policy loss: 0.002553. Value loss: 2.221972. Entropy: 1.007564.
epis

Training network. lr: 0.000025. clip: 0.010112
Iteration 40492: Policy loss: 0.001001. Value loss: 5.138715. Entropy: 1.036383.
Iteration 40493: Policy loss: 0.003156. Value loss: 5.615273. Entropy: 1.054456.
Iteration 40494: Policy loss: 0.005162. Value loss: 4.296498. Entropy: 1.027596.
Training network. lr: 0.000025. clip: 0.010112
Iteration 40495: Policy loss: 0.000220. Value loss: 2.891772. Entropy: 1.178302.
Iteration 40496: Policy loss: 0.001465. Value loss: 2.291741. Entropy: 1.147615.
Iteration 40497: Policy loss: 0.002398. Value loss: 1.992517. Entropy: 1.170693.
Training network. lr: 0.000025. clip: 0.010112
Iteration 40498: Policy loss: 0.004940. Value loss: 4.131884. Entropy: 1.220383.
Iteration 40499: Policy loss: 0.008391. Value loss: 3.355211. Entropy: 1.212091.
Iteration 40500: Policy loss: 0.009875. Value loss: 3.086642. Entropy: 1.214367.
episode: 15199   score: 290.0  epsilon: 1.0    steps: 628  evaluation reward: 423.25
Training network. lr: 0.000025. clip: 0.01011

Training network. lr: 0.000025. clip: 0.010112
Iteration 40558: Policy loss: 0.001597. Value loss: 5.678568. Entropy: 1.129954.
Iteration 40559: Policy loss: 0.003171. Value loss: 4.701866. Entropy: 1.150528.
Iteration 40560: Policy loss: 0.002523. Value loss: 4.060820. Entropy: 1.134439.
episode: 15220   score: 590.0  epsilon: 1.0    steps: 74  evaluation reward: 416.05
episode: 15221   score: 255.0  epsilon: 1.0    steps: 542  evaluation reward: 418.9
episode: 15222   score: 275.0  epsilon: 1.0    steps: 737  evaluation reward: 415.85
Training network. lr: 0.000025. clip: 0.010112
Iteration 40561: Policy loss: 0.000357. Value loss: 2.599003. Entropy: 1.129974.
Iteration 40562: Policy loss: 0.000923. Value loss: 2.152764. Entropy: 1.110590.
Iteration 40563: Policy loss: 0.002006. Value loss: 1.873548. Entropy: 1.128113.
Training network. lr: 0.000025. clip: 0.010112
Iteration 40564: Policy loss: 0.000868. Value loss: 6.122190. Entropy: 1.054516.
Iteration 40565: Policy loss: 0.003735.

Training network. lr: 0.000025. clip: 0.010112
Iteration 40621: Policy loss: 0.001337. Value loss: 3.001532. Entropy: 1.082426.
Iteration 40622: Policy loss: 0.001786. Value loss: 2.628988. Entropy: 1.081017.
Iteration 40623: Policy loss: 0.001760. Value loss: 2.487772. Entropy: 1.077372.
episode: 15245   score: 375.0  epsilon: 1.0    steps: 269  evaluation reward: 406.4
Training network. lr: 0.000025. clip: 0.010112
Iteration 40624: Policy loss: 0.001201. Value loss: 6.518784. Entropy: 1.143274.
Iteration 40625: Policy loss: 0.001471. Value loss: 6.053663. Entropy: 1.137349.
Iteration 40626: Policy loss: 0.002988. Value loss: 5.037669. Entropy: 1.128462.
episode: 15246   score: 165.0  epsilon: 1.0    steps: 100  evaluation reward: 401.5
episode: 15247   score: 180.0  epsilon: 1.0    steps: 875  evaluation reward: 397.9
Training network. lr: 0.000025. clip: 0.010112
Iteration 40627: Policy loss: 0.002061. Value loss: 3.091071. Entropy: 1.194057.
Iteration 40628: Policy loss: 0.004799. 

Iteration 40685: Policy loss: 0.001923. Value loss: 2.396623. Entropy: 1.146591.
Iteration 40686: Policy loss: 0.001477. Value loss: 2.185196. Entropy: 1.129846.
episode: 15268   score: 210.0  epsilon: 1.0    steps: 132  evaluation reward: 384.0
Training network. lr: 0.000025. clip: 0.010112
Iteration 40687: Policy loss: 0.001799. Value loss: 3.785155. Entropy: 0.989781.
Iteration 40688: Policy loss: 0.002195. Value loss: 2.986480. Entropy: 0.981626.
Iteration 40689: Policy loss: 0.002703. Value loss: 3.103119. Entropy: 0.989339.
Training network. lr: 0.000025. clip: 0.010112
Iteration 40690: Policy loss: 0.001776. Value loss: 3.229281. Entropy: 1.166486.
Iteration 40691: Policy loss: 0.004148. Value loss: 2.699156. Entropy: 1.163159.
Iteration 40692: Policy loss: 0.005417. Value loss: 2.302670. Entropy: 1.168311.
episode: 15269   score: 290.0  epsilon: 1.0    steps: 120  evaluation reward: 378.5
episode: 15270   score: 210.0  epsilon: 1.0    steps: 658  evaluation reward: 378.4
Traini

Iteration 40751: Policy loss: 0.001275. Value loss: 4.093468. Entropy: 1.303261.
Iteration 40752: Policy loss: 0.003130. Value loss: 3.624253. Entropy: 1.305599.
episode: 15290   score: 365.0  epsilon: 1.0    steps: 282  evaluation reward: 371.8
Training network. lr: 0.000025. clip: 0.010112
Iteration 40753: Policy loss: 0.002557. Value loss: 4.515406. Entropy: 1.327601.
Iteration 40754: Policy loss: 0.006139. Value loss: 3.473526. Entropy: 1.329144.
Iteration 40755: Policy loss: 0.004801. Value loss: 3.232816. Entropy: 1.322554.
episode: 15291   score: 340.0  epsilon: 1.0    steps: 1004  evaluation reward: 369.8
Training network. lr: 0.000025. clip: 0.010112
Iteration 40756: Policy loss: 0.002363. Value loss: 9.280307. Entropy: 1.191639.
Iteration 40757: Policy loss: 0.004100. Value loss: 7.243769. Entropy: 1.192897.
Iteration 40758: Policy loss: 0.003651. Value loss: 6.170316. Entropy: 1.209479.
Training network. lr: 0.000025. clip: 0.010112
Iteration 40759: Policy loss: 0.001045. Va

Iteration 40816: Policy loss: 0.001591. Value loss: 4.635643. Entropy: 1.191327.
Iteration 40817: Policy loss: 0.002422. Value loss: 3.409964. Entropy: 1.182576.
Iteration 40818: Policy loss: 0.002014. Value loss: 3.325947. Entropy: 1.180404.
episode: 15312   score: 320.0  epsilon: 1.0    steps: 298  evaluation reward: 375.35
Training network. lr: 0.000025. clip: 0.010112
Iteration 40819: Policy loss: 0.000829. Value loss: 6.739764. Entropy: 0.918951.
Iteration 40820: Policy loss: 0.001821. Value loss: 5.120527. Entropy: 0.916835.
Iteration 40821: Policy loss: 0.002425. Value loss: 5.236246. Entropy: 0.943524.
episode: 15313   score: 540.0  epsilon: 1.0    steps: 756  evaluation reward: 374.95
Training network. lr: 0.000025. clip: 0.010112
Iteration 40822: Policy loss: 0.001110. Value loss: 5.299294. Entropy: 1.000637.
Iteration 40823: Policy loss: 0.001720. Value loss: 4.511609. Entropy: 1.037274.
Iteration 40824: Policy loss: 0.002736. Value loss: 3.993642. Entropy: 1.006897.
episode

Training network. lr: 0.000025. clip: 0.010112
Iteration 40879: Policy loss: 0.000460. Value loss: 2.960077. Entropy: 1.068319.
Iteration 40880: Policy loss: 0.002340. Value loss: 2.079168. Entropy: 1.064608.
Iteration 40881: Policy loss: 0.002458. Value loss: 1.913237. Entropy: 1.057499.
Training network. lr: 0.000025. clip: 0.010112
Iteration 40882: Policy loss: -0.000346. Value loss: 3.562191. Entropy: 1.197580.
Iteration 40883: Policy loss: 0.000918. Value loss: 2.779247. Entropy: 1.197991.
Iteration 40884: Policy loss: 0.001429. Value loss: 2.548139. Entropy: 1.186297.
episode: 15338   score: 315.0  epsilon: 1.0    steps: 226  evaluation reward: 371.9
episode: 15339   score: 210.0  epsilon: 1.0    steps: 602  evaluation reward: 373.8
Training network. lr: 0.000025. clip: 0.010112
Iteration 40885: Policy loss: 0.001049. Value loss: 7.668694. Entropy: 1.187863.
Iteration 40886: Policy loss: 0.005339. Value loss: 7.618456. Entropy: 1.204804.
Iteration 40887: Policy loss: 0.005520. Va

Training network. lr: 0.000025. clip: 0.010112
Iteration 40945: Policy loss: 0.003376. Value loss: 6.035605. Entropy: 1.342105.
Iteration 40946: Policy loss: 0.005784. Value loss: 4.479182. Entropy: 1.338279.
Iteration 40947: Policy loss: 0.005107. Value loss: 3.887109. Entropy: 1.340686.
episode: 15359   score: 410.0  epsilon: 1.0    steps: 657  evaluation reward: 377.95
episode: 15360   score: 330.0  epsilon: 1.0    steps: 990  evaluation reward: 376.3
Training network. lr: 0.000025. clip: 0.010112
Iteration 40948: Policy loss: 0.002267. Value loss: 8.179639. Entropy: 1.101530.
Iteration 40949: Policy loss: 0.002727. Value loss: 6.834061. Entropy: 1.101001.
Iteration 40950: Policy loss: 0.005683. Value loss: 6.015808. Entropy: 1.097301.
episode: 15361   score: 515.0  epsilon: 1.0    steps: 808  evaluation reward: 376.75
Training network. lr: 0.000025. clip: 0.010112
Iteration 40951: Policy loss: 0.000562. Value loss: 5.083426. Entropy: 1.061210.
Iteration 40952: Policy loss: 0.002362

Iteration 41010: Policy loss: 0.003798. Value loss: 2.643860. Entropy: 1.031514.
episode: 15382   score: 230.0  epsilon: 1.0    steps: 267  evaluation reward: 396.4
episode: 15383   score: 330.0  epsilon: 1.0    steps: 487  evaluation reward: 393.65
episode: 15384   score: 345.0  epsilon: 1.0    steps: 887  evaluation reward: 393.8
Training network. lr: 0.000025. clip: 0.010112
Iteration 41011: Policy loss: 0.000463. Value loss: 4.179463. Entropy: 1.038798.
Iteration 41012: Policy loss: 0.000705. Value loss: 3.239061. Entropy: 1.044099.
Iteration 41013: Policy loss: 0.002037. Value loss: 3.075311. Entropy: 1.035708.
Training network. lr: 0.000025. clip: 0.010112
Iteration 41014: Policy loss: 0.000803. Value loss: 2.136666. Entropy: 1.140924.
Iteration 41015: Policy loss: 0.001927. Value loss: 1.987522. Entropy: 1.149574.
Iteration 41016: Policy loss: 0.002391. Value loss: 1.679778. Entropy: 1.150896.
Training network. lr: 0.000025. clip: 0.010112
Iteration 41017: Policy loss: 0.000867.

Iteration 41074: Policy loss: 0.003840. Value loss: 7.691717. Entropy: 1.161056.
Iteration 41075: Policy loss: 0.007094. Value loss: 7.695669. Entropy: 1.145690.
Iteration 41076: Policy loss: 0.009137. Value loss: 5.625531. Entropy: 1.149734.
episode: 15405   score: 255.0  epsilon: 1.0    steps: 338  evaluation reward: 378.85
Training network. lr: 0.000025. clip: 0.010112
Iteration 41077: Policy loss: 0.000602. Value loss: 4.066917. Entropy: 1.134688.
Iteration 41078: Policy loss: 0.004399. Value loss: 3.445852. Entropy: 1.145694.
Iteration 41079: Policy loss: 0.004672. Value loss: 3.048641. Entropy: 1.122888.
episode: 15406   score: 280.0  epsilon: 1.0    steps: 130  evaluation reward: 374.85
episode: 15407   score: 320.0  epsilon: 1.0    steps: 400  evaluation reward: 372.95
Training network. lr: 0.000025. clip: 0.010112
Iteration 41080: Policy loss: 0.001037. Value loss: 2.978074. Entropy: 1.092670.
Iteration 41081: Policy loss: 0.003089. Value loss: 2.424762. Entropy: 1.104856.
Ite

Iteration 41138: Policy loss: 0.002384. Value loss: 3.694105. Entropy: 0.976070.
Iteration 41139: Policy loss: 0.004117. Value loss: 3.181437. Entropy: 0.994299.
episode: 15429   score: 290.0  epsilon: 1.0    steps: 363  evaluation reward: 369.35
Training network. lr: 0.000025. clip: 0.010112
Iteration 41140: Policy loss: 0.000615. Value loss: 3.010099. Entropy: 1.167747.
Iteration 41141: Policy loss: 0.002739. Value loss: 2.253751. Entropy: 1.170264.
Iteration 41142: Policy loss: 0.002885. Value loss: 1.935643. Entropy: 1.173571.
Training network. lr: 0.000025. clip: 0.010112
Iteration 41143: Policy loss: 0.001486. Value loss: 3.242678. Entropy: 1.193596.
Iteration 41144: Policy loss: 0.003064. Value loss: 2.553733. Entropy: 1.218037.
Iteration 41145: Policy loss: 0.005251. Value loss: 2.324492. Entropy: 1.210644.
episode: 15430   score: 395.0  epsilon: 1.0    steps: 826  evaluation reward: 369.1
Training network. lr: 0.000025. clip: 0.010112
Iteration 41146: Policy loss: 0.001281. Va

Iteration 41205: Policy loss: 0.009597. Value loss: 5.231010. Entropy: 1.313594.
episode: 15450   score: 265.0  epsilon: 1.0    steps: 104  evaluation reward: 383.75
Training network. lr: 0.000025. clip: 0.010112
Iteration 41206: Policy loss: 0.001772. Value loss: 6.331213. Entropy: 1.329405.
Iteration 41207: Policy loss: 0.003332. Value loss: 5.286844. Entropy: 1.347780.
Iteration 41208: Policy loss: 0.002216. Value loss: 4.603458. Entropy: 1.324699.
Training network. lr: 0.000025. clip: 0.010112
Iteration 41209: Policy loss: 0.002297. Value loss: 10.096068. Entropy: 1.289353.
Iteration 41210: Policy loss: 0.005616. Value loss: 7.565207. Entropy: 1.286189.
Iteration 41211: Policy loss: 0.006618. Value loss: 6.868932. Entropy: 1.294564.
Training network. lr: 0.000025. clip: 0.010112
Iteration 41212: Policy loss: 0.001817. Value loss: 6.569222. Entropy: 1.218834.
Iteration 41213: Policy loss: 0.003855. Value loss: 5.632514. Entropy: 1.220199.
Iteration 41214: Policy loss: 0.004552. Valu

Training network. lr: 0.000025. clip: 0.010112
Iteration 41269: Policy loss: 0.000434. Value loss: 2.753966. Entropy: 1.062569.
Iteration 41270: Policy loss: 0.002662. Value loss: 2.188266. Entropy: 1.036926.
Iteration 41271: Policy loss: 0.002780. Value loss: 2.460899. Entropy: 1.067433.
episode: 15474   score: 435.0  epsilon: 1.0    steps: 361  evaluation reward: 386.45
Training network. lr: 0.000025. clip: 0.010112
Iteration 41272: Policy loss: -0.000068. Value loss: 3.253613. Entropy: 1.233272.
Iteration 41273: Policy loss: 0.001214. Value loss: 2.806562. Entropy: 1.228636.
Iteration 41274: Policy loss: 0.002002. Value loss: 2.253466. Entropy: 1.254251.
episode: 15475   score: 210.0  epsilon: 1.0    steps: 603  evaluation reward: 387.6
Training network. lr: 0.000025. clip: 0.010112
Iteration 41275: Policy loss: 0.000303. Value loss: 2.598588. Entropy: 1.341330.
Iteration 41276: Policy loss: 0.002015. Value loss: 1.987480. Entropy: 1.350629.
Iteration 41277: Policy loss: 0.002760. V

Iteration 41336: Policy loss: 0.004681. Value loss: 7.720467. Entropy: 1.305146.
Iteration 41337: Policy loss: 0.007595. Value loss: 6.640478. Entropy: 1.312875.
Training network. lr: 0.000025. clip: 0.010112
Iteration 41338: Policy loss: 0.001875. Value loss: 5.439988. Entropy: 1.279025.
Iteration 41339: Policy loss: 0.004533. Value loss: 4.599776. Entropy: 1.278507.
Iteration 41340: Policy loss: 0.004898. Value loss: 3.995576. Entropy: 1.273252.
episode: 15494   score: 620.0  epsilon: 1.0    steps: 473  evaluation reward: 369.15
episode: 15495   score: 445.0  epsilon: 1.0    steps: 676  evaluation reward: 371.95
episode: 15496   score: 380.0  epsilon: 1.0    steps: 950  evaluation reward: 372.75
Training network. lr: 0.000025. clip: 0.010112
Iteration 41341: Policy loss: 0.000869. Value loss: 4.560922. Entropy: 1.188169.
Iteration 41342: Policy loss: 0.002106. Value loss: 3.531798. Entropy: 1.212198.
Iteration 41343: Policy loss: 0.002879. Value loss: 2.969841. Entropy: 1.221489.
epi

Training network. lr: 0.000025. clip: 0.010112
Iteration 41401: Policy loss: 0.000987. Value loss: 4.283679. Entropy: 1.105449.
Iteration 41402: Policy loss: 0.001414. Value loss: 3.404352. Entropy: 1.101171.
Iteration 41403: Policy loss: 0.002078. Value loss: 3.006226. Entropy: 1.102028.
Training network. lr: 0.000025. clip: 0.010112
Iteration 41404: Policy loss: 0.001320. Value loss: 2.844353. Entropy: 1.155166.
Iteration 41405: Policy loss: 0.002300. Value loss: 2.308478. Entropy: 1.160235.
Iteration 41406: Policy loss: 0.002109. Value loss: 1.907667. Entropy: 1.145667.
Training network. lr: 0.000025. clip: 0.010112
Iteration 41407: Policy loss: -0.000027. Value loss: 5.233221. Entropy: 1.337030.
Iteration 41408: Policy loss: 0.000871. Value loss: 4.953130. Entropy: 1.339452.
Iteration 41409: Policy loss: 0.002097. Value loss: 4.789748. Entropy: 1.342476.
episode: 15517   score: 305.0  epsilon: 1.0    steps: 983  evaluation reward: 390.1
Training network. lr: 0.000025. clip: 0.01011

Iteration 41468: Policy loss: 0.005651. Value loss: 4.248486. Entropy: 1.127665.
Iteration 41469: Policy loss: 0.007430. Value loss: 4.057127. Entropy: 1.138741.
episode: 15537   score: 320.0  epsilon: 1.0    steps: 622  evaluation reward: 391.55
Training network. lr: 0.000025. clip: 0.010112
Iteration 41470: Policy loss: 0.001469. Value loss: 6.617336. Entropy: 1.397111.
Iteration 41471: Policy loss: 0.001252. Value loss: 5.806144. Entropy: 1.374193.
Iteration 41472: Policy loss: 0.003636. Value loss: 4.554537. Entropy: 1.392059.
episode: 15538   score: 360.0  epsilon: 1.0    steps: 282  evaluation reward: 391.65
episode: 15539   score: 325.0  epsilon: 1.0    steps: 437  evaluation reward: 394.0
Training network. lr: 0.000025. clip: 0.010112
Iteration 41473: Policy loss: 0.001048. Value loss: 2.666348. Entropy: 1.298588.
Iteration 41474: Policy loss: 0.001664. Value loss: 2.363974. Entropy: 1.282626.
Iteration 41475: Policy loss: 0.001881. Value loss: 2.075769. Entropy: 1.295018.
epis

episode: 15560   score: 345.0  epsilon: 1.0    steps: 1008  evaluation reward: 369.45
Training network. lr: 0.000025. clip: 0.010112
Iteration 41533: Policy loss: 0.000488. Value loss: 4.643476. Entropy: 1.305926.
Iteration 41534: Policy loss: 0.003115. Value loss: 3.877406. Entropy: 1.289542.
Iteration 41535: Policy loss: 0.004504. Value loss: 3.300936. Entropy: 1.299057.
Training network. lr: 0.000025. clip: 0.010112
Iteration 41536: Policy loss: 0.002377. Value loss: 9.368741. Entropy: 1.171480.
Iteration 41537: Policy loss: 0.004388. Value loss: 7.833711. Entropy: 1.159413.
Iteration 41538: Policy loss: 0.003352. Value loss: 7.174758. Entropy: 1.178623.
episode: 15561   score: 490.0  epsilon: 1.0    steps: 118  evaluation reward: 369.3
episode: 15562   score: 540.0  epsilon: 1.0    steps: 385  evaluation reward: 370.0
Training network. lr: 0.000025. clip: 0.010112
Iteration 41539: Policy loss: 0.002163. Value loss: 6.448310. Entropy: 0.999832.
Iteration 41540: Policy loss: 0.003156

episode: 15583   score: 1030.0  epsilon: 1.0    steps: 490  evaluation reward: 372.7
Training network. lr: 0.000025. clip: 0.010112
Iteration 41599: Policy loss: 0.001469. Value loss: 3.788002. Entropy: 1.340730.
Iteration 41600: Policy loss: 0.002487. Value loss: 2.910325. Entropy: 1.348293.
Iteration 41601: Policy loss: 0.003617. Value loss: 2.541091. Entropy: 1.347746.
episode: 15584   score: 150.0  epsilon: 1.0    steps: 281  evaluation reward: 378.35
episode: 15585   score: 220.0  epsilon: 1.0    steps: 740  evaluation reward: 375.05
Training network. lr: 0.000025. clip: 0.010112
Iteration 41602: Policy loss: 0.001660. Value loss: 5.057870. Entropy: 1.292167.
Iteration 41603: Policy loss: 0.003786. Value loss: 4.781813. Entropy: 1.245545.
Iteration 41604: Policy loss: 0.005436. Value loss: 4.276298. Entropy: 1.280379.
Training network. lr: 0.000025. clip: 0.010112
Iteration 41605: Policy loss: 0.000355. Value loss: 8.742558. Entropy: 1.097043.
Iteration 41606: Policy loss: 0.00376

Iteration 41663: Policy loss: 0.000907. Value loss: 2.746305. Entropy: 1.355941.
Iteration 41664: Policy loss: 0.001863. Value loss: 2.189634. Entropy: 1.351690.
Training network. lr: 0.000025. clip: 0.010112
Iteration 41665: Policy loss: 0.001600. Value loss: 5.929290. Entropy: 1.337149.
Iteration 41666: Policy loss: 0.005042. Value loss: 5.623607. Entropy: 1.339534.
Iteration 41667: Policy loss: 0.004760. Value loss: 4.564518. Entropy: 1.345975.
episode: 15606   score: 290.0  epsilon: 1.0    steps: 752  evaluation reward: 379.4
Training network. lr: 0.000025. clip: 0.010112
Iteration 41668: Policy loss: 0.001718. Value loss: 10.939229. Entropy: 1.359358.
Iteration 41669: Policy loss: 0.003428. Value loss: 9.567560. Entropy: 1.362497.
Iteration 41670: Policy loss: 0.003785. Value loss: 8.637146. Entropy: 1.357528.
episode: 15607   score: 460.0  epsilon: 1.0    steps: 558  evaluation reward: 376.85
Training network. lr: 0.000025. clip: 0.010112
Iteration 41671: Policy loss: 0.001112. V

Iteration 41728: Policy loss: 0.002707. Value loss: 6.911738. Entropy: 1.236204.
Iteration 41729: Policy loss: 0.005365. Value loss: 5.725714. Entropy: 1.220121.
Iteration 41730: Policy loss: 0.004745. Value loss: 4.862843. Entropy: 1.239789.
episode: 15629   score: 575.0  epsilon: 1.0    steps: 328  evaluation reward: 372.45
episode: 15630   score: 225.0  epsilon: 1.0    steps: 770  evaluation reward: 375.0
Training network. lr: 0.000025. clip: 0.010112
Iteration 41731: Policy loss: 0.002786. Value loss: 4.488823. Entropy: 1.094208.
Iteration 41732: Policy loss: 0.004858. Value loss: 3.306524. Entropy: 1.081648.
Iteration 41733: Policy loss: 0.003517. Value loss: 3.015028. Entropy: 1.093943.
episode: 15631   score: 335.0  epsilon: 1.0    steps: 159  evaluation reward: 373.65
episode: 15632   score: 520.0  epsilon: 1.0    steps: 927  evaluation reward: 372.4
Training network. lr: 0.000025. clip: 0.010112
Iteration 41734: Policy loss: 0.001568. Value loss: 3.732178. Entropy: 1.055317.
I

Iteration 41791: Policy loss: 0.001576. Value loss: 8.644302. Entropy: 1.244953.
Iteration 41792: Policy loss: 0.003784. Value loss: 7.667468. Entropy: 1.247660.
Iteration 41793: Policy loss: 0.006473. Value loss: 6.702034. Entropy: 1.242877.
episode: 15654   score: 180.0  epsilon: 1.0    steps: 749  evaluation reward: 362.35
Training network. lr: 0.000025. clip: 0.010112
Iteration 41794: Policy loss: 0.000953. Value loss: 4.564106. Entropy: 1.270796.
Iteration 41795: Policy loss: 0.002704. Value loss: 3.988755. Entropy: 1.270683.
Iteration 41796: Policy loss: 0.005027. Value loss: 3.634017. Entropy: 1.269967.
episode: 15655   score: 315.0  epsilon: 1.0    steps: 170  evaluation reward: 362.05
episode: 15656   score: 580.0  epsilon: 1.0    steps: 806  evaluation reward: 363.7
Training network. lr: 0.000025. clip: 0.010112
Iteration 41797: Policy loss: 0.000487. Value loss: 3.178874. Entropy: 1.256149.
Iteration 41798: Policy loss: 0.001344. Value loss: 2.930494. Entropy: 1.240373.
Iter

Iteration 41855: Policy loss: 0.002284. Value loss: 2.595529. Entropy: 1.353694.
Iteration 41856: Policy loss: 0.002331. Value loss: 2.455423. Entropy: 1.365993.
episode: 15678   score: 155.0  epsilon: 1.0    steps: 16  evaluation reward: 372.45
episode: 15679   score: 180.0  epsilon: 1.0    steps: 186  evaluation reward: 370.65
episode: 15680   score: 315.0  epsilon: 1.0    steps: 682  evaluation reward: 366.35
episode: 15681   score: 480.0  epsilon: 1.0    steps: 772  evaluation reward: 367.95
Training network. lr: 0.000025. clip: 0.010112
Iteration 41857: Policy loss: 0.000938. Value loss: 2.084807. Entropy: 1.165379.
Iteration 41858: Policy loss: 0.001161. Value loss: 1.873548. Entropy: 1.164236.
Iteration 41859: Policy loss: 0.001677. Value loss: 1.799857. Entropy: 1.162131.
episode: 15682   score: 215.0  epsilon: 1.0    steps: 409  evaluation reward: 367.2
Training network. lr: 0.000025. clip: 0.010112
Iteration 41860: Policy loss: 0.001172. Value loss: 4.797898. Entropy: 1.06639

Iteration 41918: Policy loss: 0.000837. Value loss: 3.860071. Entropy: 1.201485.
Iteration 41919: Policy loss: 0.001200. Value loss: 3.259482. Entropy: 1.210645.
Training network. lr: 0.000025. clip: 0.010112
Iteration 41920: Policy loss: 0.000693. Value loss: 3.296883. Entropy: 1.215798.
Iteration 41921: Policy loss: 0.003017. Value loss: 2.667769. Entropy: 1.216238.
Iteration 41922: Policy loss: 0.004182. Value loss: 2.165795. Entropy: 1.224126.
episode: 15703   score: 390.0  epsilon: 1.0    steps: 32  evaluation reward: 349.9
episode: 15704   score: 330.0  epsilon: 1.0    steps: 521  evaluation reward: 350.25
Training network. lr: 0.000025. clip: 0.010112
Iteration 41923: Policy loss: 0.001268. Value loss: 3.478349. Entropy: 1.274350.
Iteration 41924: Policy loss: 0.002869. Value loss: 2.889469. Entropy: 1.266452.
Iteration 41925: Policy loss: 0.003392. Value loss: 2.648183. Entropy: 1.265806.
episode: 15705   score: 315.0  epsilon: 1.0    steps: 345  evaluation reward: 348.6
Traini

Iteration 41984: Policy loss: 0.003756. Value loss: 4.650237. Entropy: 1.090431.
Iteration 41985: Policy loss: 0.004963. Value loss: 4.102877. Entropy: 1.086814.
Training network. lr: 0.000025. clip: 0.010112
Iteration 41986: Policy loss: 0.000712. Value loss: 2.155170. Entropy: 1.331238.
Iteration 41987: Policy loss: 0.001617. Value loss: 1.655686. Entropy: 1.340353.
Iteration 41988: Policy loss: 0.001156. Value loss: 1.602537. Entropy: 1.337394.
episode: 15725   score: 775.0  epsilon: 1.0    steps: 120  evaluation reward: 348.85
episode: 15726   score: 460.0  epsilon: 1.0    steps: 418  evaluation reward: 353.25
Training network. lr: 0.000025. clip: 0.010112
Iteration 41989: Policy loss: 0.001121. Value loss: 6.183677. Entropy: 1.219654.
Iteration 41990: Policy loss: 0.004294. Value loss: 5.211694. Entropy: 1.234018.
Iteration 41991: Policy loss: 0.006149. Value loss: 5.087153. Entropy: 1.223699.
Training network. lr: 0.000025. clip: 0.010112
Iteration 41992: Policy loss: 0.000617. V

Iteration 42050: Policy loss: 0.006670. Value loss: 4.344097. Entropy: 1.163867.
Iteration 42051: Policy loss: 0.010611. Value loss: 3.356877. Entropy: 1.160553.
episode: 15747   score: 305.0  epsilon: 1.0    steps: 688  evaluation reward: 373.5
Training network. lr: 0.000025. clip: 0.010112
Iteration 42052: Policy loss: 0.001320. Value loss: 2.857720. Entropy: 1.181033.
Iteration 42053: Policy loss: 0.002409. Value loss: 2.135697. Entropy: 1.188862.
Iteration 42054: Policy loss: 0.001983. Value loss: 2.155355. Entropy: 1.191008.
episode: 15748   score: 565.0  epsilon: 1.0    steps: 436  evaluation reward: 374.45
episode: 15749   score: 270.0  epsilon: 1.0    steps: 950  evaluation reward: 378.0
Training network. lr: 0.000025. clip: 0.010112
Iteration 42055: Policy loss: 0.001571. Value loss: 7.579741. Entropy: 1.245962.
Iteration 42056: Policy loss: 0.004334. Value loss: 6.728219. Entropy: 1.249016.
Iteration 42057: Policy loss: 0.005201. Value loss: 6.452814. Entropy: 1.220804.
Train

Iteration 42114: Policy loss: 0.002735. Value loss: 2.387236. Entropy: 1.114899.
Training network. lr: 0.000025. clip: 0.010112
Iteration 42115: Policy loss: 0.001509. Value loss: 3.404797. Entropy: 1.231441.
Iteration 42116: Policy loss: 0.004526. Value loss: 2.806156. Entropy: 1.253499.
Iteration 42117: Policy loss: 0.005822. Value loss: 2.501154. Entropy: 1.225595.
Training network. lr: 0.000025. clip: 0.010112
Iteration 42118: Policy loss: 0.001742. Value loss: 2.316957. Entropy: 1.237041.
Iteration 42119: Policy loss: 0.003218. Value loss: 1.753569. Entropy: 1.254550.
Iteration 42120: Policy loss: 0.003501. Value loss: 1.539495. Entropy: 1.239143.
Training network. lr: 0.000025. clip: 0.010112
Iteration 42121: Policy loss: 0.001155. Value loss: 6.645782. Entropy: 1.435876.
Iteration 42122: Policy loss: 0.005693. Value loss: 5.719375. Entropy: 1.428869.
Iteration 42123: Policy loss: 0.008204. Value loss: 4.717092. Entropy: 1.428429.
episode: 15771   score: 320.0  epsilon: 1.0    st

Iteration 42179: Policy loss: 0.002431. Value loss: 2.076668. Entropy: 1.224021.
Iteration 42180: Policy loss: 0.002448. Value loss: 1.838512. Entropy: 1.235354.
episode: 15794   score: 240.0  epsilon: 1.0    steps: 487  evaluation reward: 383.85
Training network. lr: 0.000025. clip: 0.010112
Iteration 42181: Policy loss: 0.001150. Value loss: 3.978585. Entropy: 1.268857.
Iteration 42182: Policy loss: 0.004266. Value loss: 3.152947. Entropy: 1.276920.
Iteration 42183: Policy loss: 0.003320. Value loss: 2.863256. Entropy: 1.265186.
episode: 15795   score: 355.0  epsilon: 1.0    steps: 299  evaluation reward: 382.1
episode: 15796   score: 360.0  epsilon: 1.0    steps: 933  evaluation reward: 382.8
Training network. lr: 0.000025. clip: 0.010112
Iteration 42184: Policy loss: 0.001869. Value loss: 2.399942. Entropy: 1.282406.
Iteration 42185: Policy loss: 0.002024. Value loss: 1.858123. Entropy: 1.274840.
Iteration 42186: Policy loss: 0.002284. Value loss: 1.615042. Entropy: 1.273695.
episo

Iteration 42243: Policy loss: 0.003595. Value loss: 2.295722. Entropy: 1.256502.
episode: 15818   score: 335.0  epsilon: 1.0    steps: 378  evaluation reward: 372.6
episode: 15819   score: 305.0  epsilon: 1.0    steps: 713  evaluation reward: 372.3
Training network. lr: 0.000025. clip: 0.010112
Iteration 42244: Policy loss: 0.001426. Value loss: 6.688711. Entropy: 1.351301.
Iteration 42245: Policy loss: 0.004526. Value loss: 5.424012. Entropy: 1.334403.
Iteration 42246: Policy loss: 0.006664. Value loss: 4.673616. Entropy: 1.355928.
Training network. lr: 0.000025. clip: 0.010112
Iteration 42247: Policy loss: 0.002898. Value loss: 6.881046. Entropy: 1.181823.
Iteration 42248: Policy loss: 0.005439. Value loss: 4.589503. Entropy: 1.170357.
Iteration 42249: Policy loss: 0.005863. Value loss: 4.428086. Entropy: 1.188880.
episode: 15820   score: 225.0  epsilon: 1.0    steps: 75  evaluation reward: 371.25
episode: 15821   score: 515.0  epsilon: 1.0    steps: 396  evaluation reward: 370.6
Tra

episode: 15839   score: 510.0  epsilon: 1.0    steps: 192  evaluation reward: 362.65
episode: 15840   score: 725.0  epsilon: 1.0    steps: 947  evaluation reward: 362.85
Training network. lr: 0.000025. clip: 0.010112
Iteration 42310: Policy loss: 0.000399. Value loss: 4.549516. Entropy: 1.083702.
Iteration 42311: Policy loss: 0.004467. Value loss: 3.491033. Entropy: 1.084652.
Iteration 42312: Policy loss: 0.005271. Value loss: 2.819414. Entropy: 1.066266.
episode: 15841   score: 315.0  epsilon: 1.0    steps: 110  evaluation reward: 367.2
Training network. lr: 0.000025. clip: 0.010112
Iteration 42313: Policy loss: 0.000798. Value loss: 4.041746. Entropy: 1.113471.
Iteration 42314: Policy loss: 0.003809. Value loss: 3.057512. Entropy: 1.117865.
Iteration 42315: Policy loss: 0.002980. Value loss: 2.674007. Entropy: 1.105057.
episode: 15842   score: 540.0  epsilon: 1.0    steps: 769  evaluation reward: 364.05
Training network. lr: 0.000025. clip: 0.010112
Iteration 42316: Policy loss: 0.00

Iteration 42374: Policy loss: 0.004743. Value loss: 5.657909. Entropy: 1.111032.
Iteration 42375: Policy loss: 0.006848. Value loss: 4.807675. Entropy: 1.106943.
episode: 15862   score: 420.0  epsilon: 1.0    steps: 462  evaluation reward: 364.35
Training network. lr: 0.000025. clip: 0.010112
Iteration 42376: Policy loss: 0.000812. Value loss: 7.171581. Entropy: 1.126679.
Iteration 42377: Policy loss: 0.001879. Value loss: 6.271347. Entropy: 1.136524.
Iteration 42378: Policy loss: 0.001436. Value loss: 6.161718. Entropy: 1.130932.
episode: 15863   score: 470.0  epsilon: 1.0    steps: 883  evaluation reward: 365.45
Training network. lr: 0.000025. clip: 0.010112
Iteration 42379: Policy loss: 0.001000. Value loss: 4.473331. Entropy: 1.192590.
Iteration 42380: Policy loss: 0.002456. Value loss: 3.808370. Entropy: 1.184532.
Iteration 42381: Policy loss: 0.002352. Value loss: 3.046715. Entropy: 1.187039.
episode: 15864   score: 360.0  epsilon: 1.0    steps: 904  evaluation reward: 367.25
Tra

Iteration 42440: Policy loss: 0.005954. Value loss: 6.227713. Entropy: 1.102559.
Iteration 42441: Policy loss: 0.008128. Value loss: 5.211830. Entropy: 1.054991.
Training network. lr: 0.000025. clip: 0.010112
Iteration 42442: Policy loss: 0.001543. Value loss: 5.839639. Entropy: 1.279047.
Iteration 42443: Policy loss: 0.003635. Value loss: 4.630628. Entropy: 1.290837.
Iteration 42444: Policy loss: 0.003901. Value loss: 3.918363. Entropy: 1.283926.
episode: 15884   score: 350.0  epsilon: 1.0    steps: 65  evaluation reward: 382.65
episode: 15885   score: 315.0  epsilon: 1.0    steps: 528  evaluation reward: 381.9
Training network. lr: 0.000025. clip: 0.010112
Iteration 42445: Policy loss: 0.001308. Value loss: 5.027032. Entropy: 1.100398.
Iteration 42446: Policy loss: 0.003821. Value loss: 3.971528. Entropy: 1.076114.
Iteration 42447: Policy loss: 0.005722. Value loss: 3.717055. Entropy: 1.081060.
Training network. lr: 0.000025. clip: 0.010112
Iteration 42448: Policy loss: 0.001141. Val

Training network. lr: 0.000025. clip: 0.010112
Iteration 42505: Policy loss: 0.001899. Value loss: 2.759898. Entropy: 1.268039.
Iteration 42506: Policy loss: 0.002116. Value loss: 2.053869. Entropy: 1.288014.
Iteration 42507: Policy loss: 0.002399. Value loss: 1.923854. Entropy: 1.278657.
Training network. lr: 0.000025. clip: 0.010112
Iteration 42508: Policy loss: 0.001375. Value loss: 3.953035. Entropy: 1.281616.
Iteration 42509: Policy loss: 0.002749. Value loss: 3.335115. Entropy: 1.285465.
Iteration 42510: Policy loss: 0.003703. Value loss: 3.080397. Entropy: 1.279812.
episode: 15907   score: 335.0  epsilon: 1.0    steps: 224  evaluation reward: 394.6
episode: 15908   score: 180.0  epsilon: 1.0    steps: 344  evaluation reward: 394.05
Training network. lr: 0.000025. clip: 0.010112
Iteration 42511: Policy loss: 0.001610. Value loss: 5.332483. Entropy: 1.259873.
Iteration 42512: Policy loss: 0.002966. Value loss: 4.811290. Entropy: 1.262643.
Iteration 42513: Policy loss: 0.004251. Va

episode: 15929   score: 180.0  epsilon: 1.0    steps: 92  evaluation reward: 395.45
episode: 15930   score: 135.0  epsilon: 1.0    steps: 416  evaluation reward: 393.65
episode: 15931   score: 645.0  epsilon: 1.0    steps: 640  evaluation reward: 391.75
Training network. lr: 0.000025. clip: 0.010112
Iteration 42571: Policy loss: 0.001588. Value loss: 4.660359. Entropy: 1.190578.
Iteration 42572: Policy loss: 0.002095. Value loss: 3.916391. Entropy: 1.188003.
Iteration 42573: Policy loss: 0.004028. Value loss: 3.309490. Entropy: 1.194120.
episode: 15932   score: 280.0  epsilon: 1.0    steps: 274  evaluation reward: 394.45
episode: 15933   score: 210.0  epsilon: 1.0    steps: 702  evaluation reward: 391.75
Training network. lr: 0.000025. clip: 0.010112
Iteration 42574: Policy loss: 0.000472. Value loss: 3.551723. Entropy: 0.980704.
Iteration 42575: Policy loss: 0.001652. Value loss: 2.717440. Entropy: 0.966050.
Iteration 42576: Policy loss: 0.003412. Value loss: 2.461412. Entropy: 0.9755

episode: 15954   score: 405.0  epsilon: 1.0    steps: 521  evaluation reward: 378.15
Training network. lr: 0.000025. clip: 0.010112
Iteration 42634: Policy loss: 0.001162. Value loss: 4.504813. Entropy: 1.163585.
Iteration 42635: Policy loss: 0.002739. Value loss: 3.535552. Entropy: 1.126000.
Iteration 42636: Policy loss: 0.002341. Value loss: 3.209723. Entropy: 1.140029.
episode: 15955   score: 390.0  epsilon: 1.0    steps: 838  evaluation reward: 377.7
Training network. lr: 0.000025. clip: 0.010112
Iteration 42637: Policy loss: 0.001748. Value loss: 7.662657. Entropy: 1.297720.
Iteration 42638: Policy loss: 0.005906. Value loss: 5.849904. Entropy: 1.286700.
Iteration 42639: Policy loss: 0.008657. Value loss: 5.361269. Entropy: 1.308409.
Training network. lr: 0.000025. clip: 0.010112
Iteration 42640: Policy loss: 0.000876. Value loss: 3.734611. Entropy: 1.308383.
Iteration 42641: Policy loss: 0.002037. Value loss: 3.034439. Entropy: 1.289336.
Iteration 42642: Policy loss: 0.001537. Va

Iteration 42699: Policy loss: 0.006875. Value loss: 4.983131. Entropy: 1.274072.
Training network. lr: 0.000025. clip: 0.010112
Iteration 42700: Policy loss: 0.000761. Value loss: 3.059042. Entropy: 1.279528.
Iteration 42701: Policy loss: 0.001551. Value loss: 2.548738. Entropy: 1.285938.
Iteration 42702: Policy loss: 0.001295. Value loss: 2.297123. Entropy: 1.288720.
episode: 15977   score: 635.0  epsilon: 1.0    steps: 301  evaluation reward: 355.8
Training network. lr: 0.000025. clip: 0.010112
Iteration 42703: Policy loss: 0.000346. Value loss: 3.105695. Entropy: 1.381998.
Iteration 42704: Policy loss: 0.001827. Value loss: 2.625186. Entropy: 1.378200.
Iteration 42705: Policy loss: 0.001899. Value loss: 2.233175. Entropy: 1.381790.
episode: 15978   score: 420.0  epsilon: 1.0    steps: 5  evaluation reward: 359.0
episode: 15979   score: 490.0  epsilon: 1.0    steps: 850  evaluation reward: 360.0
episode: 15980   score: 285.0  epsilon: 1.0    steps: 938  evaluation reward: 362.55
Trai

Training network. lr: 0.000025. clip: 0.010112
Iteration 42763: Policy loss: 0.000948. Value loss: 4.232555. Entropy: 0.971476.
Iteration 42764: Policy loss: 0.000675. Value loss: 3.390482. Entropy: 0.974644.
Iteration 42765: Policy loss: 0.002024. Value loss: 3.095367. Entropy: 0.975824.
now time :  2019-02-23 14:19:52.778515
episode: 16001   score: 615.0  epsilon: 1.0    steps: 780  evaluation reward: 350.45
Training network. lr: 0.000025. clip: 0.010112
Iteration 42766: Policy loss: 0.001729. Value loss: 8.632474. Entropy: 0.923833.
Iteration 42767: Policy loss: 0.002407. Value loss: 7.363304. Entropy: 0.917673.
Iteration 42768: Policy loss: 0.003344. Value loss: 6.685234. Entropy: 0.924593.
episode: 16002   score: 575.0  epsilon: 1.0    steps: 195  evaluation reward: 352.5
episode: 16003   score: 300.0  epsilon: 1.0    steps: 1003  evaluation reward: 354.75
Training network. lr: 0.000025. clip: 0.010112
Iteration 42769: Policy loss: -0.000249. Value loss: 3.279787. Entropy: 1.23321

Iteration 42828: Policy loss: 0.002402. Value loss: 2.189319. Entropy: 1.391891.
episode: 16023   score: 285.0  epsilon: 1.0    steps: 198  evaluation reward: 354.6
episode: 16024   score: 420.0  epsilon: 1.0    steps: 552  evaluation reward: 352.25
Training network. lr: 0.000025. clip: 0.010112
Iteration 42829: Policy loss: 0.002103. Value loss: 7.919713. Entropy: 1.091361.
Iteration 42830: Policy loss: 0.003896. Value loss: 6.273179. Entropy: 1.083237.
Iteration 42831: Policy loss: 0.005176. Value loss: 5.523204. Entropy: 1.079673.
episode: 16025   score: 465.0  epsilon: 1.0    steps: 511  evaluation reward: 353.05
Training network. lr: 0.000025. clip: 0.010112
Iteration 42832: Policy loss: 0.000859. Value loss: 4.419936. Entropy: 1.122627.
Iteration 42833: Policy loss: 0.004967. Value loss: 2.909539. Entropy: 1.125288.
Iteration 42834: Policy loss: 0.003536. Value loss: 2.549416. Entropy: 1.110096.
episode: 16026   score: 320.0  epsilon: 1.0    steps: 831  evaluation reward: 355.8
e

Training network. lr: 0.000025. clip: 0.010112
Iteration 42892: Policy loss: 0.001083. Value loss: 3.180392. Entropy: 1.208713.
Iteration 42893: Policy loss: 0.000485. Value loss: 2.807947. Entropy: 1.229712.
Iteration 42894: Policy loss: 0.001475. Value loss: 2.313392. Entropy: 1.197828.
Training network. lr: 0.000025. clip: 0.010112
Iteration 42895: Policy loss: 0.000002. Value loss: 2.919965. Entropy: 1.346479.
Iteration 42896: Policy loss: 0.002212. Value loss: 2.888144. Entropy: 1.328497.
Iteration 42897: Policy loss: 0.002699. Value loss: 2.413810. Entropy: 1.350518.
episode: 16048   score: 230.0  epsilon: 1.0    steps: 351  evaluation reward: 367.45
Training network. lr: 0.000025. clip: 0.010112
Iteration 42898: Policy loss: 0.000642. Value loss: 3.295203. Entropy: 1.320478.


In [None]:
torch.save(agent.policy_net, "./save_model/spaceinvaders_ppo")