# Deep Q-Learning 

For this assignment we will implement the Deep Q-Learning algorithm with Experience Replay as described in breakthrough paper __"Playing Atari with Deep Reinforcement Learning"__. We will train an agent to play the famous game of __Breakout__.

In [1]:
import sys
import gym
import torch
import pylab
import random
import numpy as np
import time
from collections import deque
from datetime import datetime
from copy import deepcopy
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from utils import *
from agent import *
from model import *
from config import *
from env import GameEnv
%matplotlib inline
%load_ext autoreload
%autoreload 2

## Understanding the environment

In the following cell, we initialise our game of __Breakout__ and you can see how the environment looks like. For further documentation of the of the environment refer to https://gym.openai.com/envs. 

In [2]:
envs = []
for i in range(num_envs):
    envs.append(GameEnv('SpaceInvadersDeterministic-v4'))
#env.render()

  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "


In [3]:
number_lives = envs[0].life
state_size = envs[0].observation_space.shape
action_size = envs[0].action_space.n
rewards, episodes = [], []

## Creating a DQN Agent

Here we create a DQN Agent. This agent is defined in the __agent.py__. The corresponding neural network is defined in the __model.py__. 

__Evaluation Reward__ : The average reward received in the past 100 episodes/games.

__Frame__ : Number of frames processed in total.

__Memory Size__ : The current size of the replay memory.

In [4]:
agent = Agent(action_size)
torch.save(agent.policy_net.state_dict(), "./save_model/spaceinvaders_ppo_best")
evaluation_reward = deque(maxlen=evaluation_reward_length)
frame = 0
memory_size = 0
reset_max = 10

### Main Training Loop

In [2]:
vis_env_idx = 0
vis_env = envs[vis_env_idx]
e = 0
frame = 0
max_eval = -np.inf
reset_count = 0

while (frame < 10000000):
    step = 0
    assert(num_envs * env_mem_size == train_frame)
    frame_next_vals = []
    for i in range(num_envs):
        env = envs[i]
        #history = env.history
        #life = env.life
        #state, reward, done, info = [env.state, env.reward, env.done, env.info]
        for j in range(env_mem_size):
            step += 1
            frame += 1
            
            curr_state = env.history[HISTORY_SIZE-1,:,:]
            action, value = agent.get_action(np.float32(env.history[:HISTORY_SIZE,:,:]) / 255.)
            
            next_state, env.reward, env.done, env.info = env.step(action)
            
            if (i == vis_env_idx):
                vis_env._env.render()
            
            frame_next_state = get_frame(next_state)
            env.history[HISTORY_SIZE,:,:] = frame_next_state
            terminal_state = check_live(env.life, env.info['ale.lives'])
            
            env.life = env.info['ale.lives']
            r = env.reward
            
            agent.memory.push(i, deepcopy(curr_state), action, r, terminal_state, value, 0, 0)
            if (j == env_mem_size-1):
                _, frame_next_val = agent.get_action(np.float32(env.history[1:,:,:]) / 255.)
                frame_next_vals.append(frame_next_val)
            env.score += r
            env.history[:HISTORY_SIZE, :, :] = env.history[1:,:,:]
            
            if (env.done):
                if (e % 50 == 0):
                    print('now time : ', datetime.now())
                    rewards.append(np.mean(evaluation_reward))
                    episodes.append(e)
                    pylab.plot(episodes, rewards, 'b')
                    pylab.savefig("./save_graph/spaceinvaders_ppo.png")
                    torch.save(agent.policy_net, "./save_model/spaceinvaders_ppo")
                    
                    if np.mean(evaluation_reward) > max_eval:
                        torch.save(agent.policy_net.state_dict(), "./save_model/spaceinvaders_ppo_best")
                        max_eval = float(np.mean(evaluation_reward))
                        reset_count = 0
                    elif e > 5000:
                        reset_count += 1
                        """
                        if (reset_count == reset_max):
                            print("Training went nowhere, starting again at best model")
                            agent.policy_net.load_state_dict(torch.load("./save_model/spaceinvaders_ppo_best"))
                            agent.update_target_net()
                            reset_count = 0
                        """
                e += 1
                evaluation_reward.append(env.score)
                print("episode:", e, "  score:", env.score,  " epsilon:", agent.epsilon, "   steps:", step,
                  " evaluation reward:", np.mean(evaluation_reward))
                
                env.done = False
                env.score = 0
                env.history = np.zeros([HISTORY_SIZE+1,84,84], dtype=np.uint8)
                env.state = env.reset()
                env.life = number_lives
                get_init_state(env.history, env.state)
                
                
                
    agent.train_policy_net(frame, frame_next_vals)
    agent.update_target_net()


SyntaxError: EOF while scanning triple-quoted string literal (<ipython-input-2-34ccbcc99089>, line 82)

In [None]:
torch.save(agent.policy_net, "./save_model/spaceinvaders_ppo")

In [None]:
### Loop through all environments and run PPO on them

env_names = ['Asterix-v0', 'Asteroids-v0', 'Atlantis-v0', 'Alien-v0', 'Amidar-v0', 'Assault-v0', 'BankHeist-v0', 'MsPacman-v0']

for a in range(len(env_names)):
    name = env_names[a]
    print("\n\n\n ------- STARTING TRAINING FOR %s ------- \n\n\n" % (name))
    
    envs = []
    for i in range(num_envs):
        envs.append(GameEnv(name))
    #env.render()

    number_lives = envs[0].life
    state_size = envs[0].observation_space.shape
    action_size = envs[0].action_space.n
    rewards, episodes = [], []

    vis_env_idx = 0
    vis_env = envs[vis_env_idx]
    e = 0
    frame = 0
    max_eval = -np.inf
    reset_count = 0


    agent = Agent(action_size)
    torch.save(agent.policy_net.state_dict(), "./save_model/" + name + "_best")
    evaluation_reward = deque(maxlen=evaluation_reward_length)
    frame = 0
    memory_size = 0
    reset_max = 10
    
    print("Determing min/max rewards of environment")
    [low, high] = score_range = get_score_range(name)
    print("Min: %d. Max: %d." % (low, high))

    while (frame < 10000000):
        step = 0
        assert(num_envs * env_mem_size == train_frame)
        frame_next_vals = []
        for i in range(num_envs):
            env = envs[i]
            #history = env.history
            #life = env.life
            #state, reward, done, info = [env.state, env.reward, env.done, env.info]
            for j in range(env_mem_size):
                step += 1
                frame += 1

                curr_state = env.history[HISTORY_SIZE-1,:,:]
                action, value = agent.get_action(np.float32(env.history[:HISTORY_SIZE,:,:]) / 255.)

                next_state, env.reward, env.done, env.info = env.step(action)

                if (i == vis_env_idx):
                    vis_env._env.render()

                frame_next_state = get_frame(next_state)
                env.history[HISTORY_SIZE,:,:] = frame_next_state
                terminal_state = check_live(env.life, env.info['ale.lives'])

                env.life = env.info['ale.lives']
                r = ((env.reward - low) / (high - low)) * 10

                agent.memory.push(i, deepcopy(curr_state), action, r, terminal_state, value, 0, 0)
                if (j == env_mem_size-1):
                    _, frame_next_val = agent.get_action(np.float32(env.history[1:,:,:]) / 255.)
                    frame_next_vals.append(frame_next_val)
                env.score += env.reward
                env.history[:HISTORY_SIZE, :, :] = env.history[1:,:,:]

                if (env.done):
                    if (e % 50 == 0):
                        print('now time : ', datetime.now())
                        rewards.append(np.mean(evaluation_reward))
                        episodes.append(e)
                        pylab.plot(episodes, rewards, 'b')
                        pylab.savefig("./save_graph/" + name + "_ppo.png")
                        torch.save(agent.policy_net, "./save_model/" + name + "_ppo")

                        if np.mean(evaluation_reward) > max_eval:
                            torch.save(agent.policy_net.state_dict(), "./save_model/"  + name + "_ppo_best")
                            max_eval = float(np.mean(evaluation_reward))
                            reset_count = 0
                        elif e > 5000:
                            reset_count += 1
                            """
                            if (reset_count == reset_max):
                                print("Training went nowhere, starting again at best model")
                                agent.policy_net.load_state_dict(torch.load("./save_model/spaceinvaders_ppo_best"))
                                agent.update_target_net()
                                reset_count = 0
                            """
                    e += 1
                    evaluation_reward.append(env.score)
                    print("episode:", e, "  score:", env.score,  " epsilon:", agent.epsilon, "   steps:", step,
                      " evaluation reward:", np.mean(evaluation_reward))

                    env.done = False
                    env.score = 0
                    env.history = np.zeros([HISTORY_SIZE+1,84,84], dtype=np.uint8)
                    env.state = env.reset()
                    env.life = number_lives
                    get_init_state(env.history, env.state)



        agent.train_policy_net(frame, frame_next_vals)
        agent.update_target_net()
    print("FINISHED TRAINING FOR %s" % (name))
    pylab.figure()

  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "
  probs = F.softmax(x[:,:self.action_size] - torch.max(x[:,:self.action_size],1)[0].unsqueeze(1))


Training network. lr: 0.000250. clip: 0.100000


  pol_loss += pol_avg.detach().cpu()[0]
  vf_loss += value_loss.detach().cpu()[0]
  ent_total += ent.detach().cpu()[0]


Iteration 1: Policy loss: -0.134242. Value loss: 0.025672. Entropy: 1.384760.
Iteration 2: Policy loss: -0.134622. Value loss: 0.024417. Entropy: 1.382654.
Iteration 3: Policy loss: -0.138663. Value loss: 0.025462. Entropy: 1.380519.
Training network. lr: 0.000250. clip: 0.100000
Iteration 4: Policy loss: -0.166682. Value loss: 0.051994. Entropy: 1.374561.
Iteration 5: Policy loss: -0.153367. Value loss: 0.041330. Entropy: 1.375694.
Iteration 6: Policy loss: -0.167738. Value loss: 0.047337. Entropy: 1.376785.
Training network. lr: 0.000250. clip: 0.100000
Iteration 7: Policy loss: -0.256462. Value loss: 0.727763. Entropy: 1.370962.
Iteration 8: Policy loss: -0.249125. Value loss: 0.479165. Entropy: 1.374830.
Iteration 9: Policy loss: -0.237247. Value loss: 0.508010. Entropy: 1.382150.
Training network. lr: 0.000250. clip: 0.100000
Iteration 10: Policy loss: -0.563716. Value loss: 2.374101. Entropy: 1.376569.
Iteration 11: Policy loss: -0.543814. Value loss: 2.030069. Entropy: 1.379307.

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Training network. lr: 0.000250. clip: 0.100000
Iteration 28: Policy loss: -0.035380. Value loss: 1.883276. Entropy: 1.367360.
Iteration 29: Policy loss: -0.036834. Value loss: 1.759964. Entropy: 1.370840.
Iteration 30: Policy loss: -0.036598. Value loss: 1.746900. Entropy: 1.371950.
Training network. lr: 0.000250. clip: 0.100000
Iteration 31: Policy loss: -0.058636. Value loss: 1.006419. Entropy: 1.369001.
Iteration 32: Policy loss: -0.033320. Value loss: 0.862457. Entropy: 1.364149.
Iteration 33: Policy loss: -0.065974. Value loss: 0.884775. Entropy: 1.361571.
Training network. lr: 0.000250. clip: 0.100000
Iteration 34: Policy loss: 0.000143. Value loss: 1.397101. Entropy: 1.370030.
Iteration 35: Policy loss: -0.000328. Value loss: 1.279787. Entropy: 1.359337.
Iteration 36: Policy loss: -0.010333. Value loss: 1.356722. Entropy: 1.366242.
Training network. lr: 0.000250. clip: 0.100000
Iteration 37: Policy loss: 0.151668. Value loss: 0.326707. Entropy: 1.362033.
Iteration 38: Policy los

Iteration 103: Policy loss: 0.104650. Value loss: 0.487069. Entropy: 1.310694.
Iteration 104: Policy loss: 0.121857. Value loss: 0.390418. Entropy: 1.308963.
Iteration 105: Policy loss: 0.107936. Value loss: 0.398923. Entropy: 1.303240.
Training network. lr: 0.000249. clip: 0.099696
Iteration 106: Policy loss: -0.019600. Value loss: 0.810460. Entropy: 1.291222.
Iteration 107: Policy loss: -0.043799. Value loss: 0.813314. Entropy: 1.305367.
Iteration 108: Policy loss: -0.041227. Value loss: 0.788016. Entropy: 1.295246.
Training network. lr: 0.000249. clip: 0.099696
Iteration 109: Policy loss: -0.053537. Value loss: 1.527595. Entropy: 1.300061.
Iteration 110: Policy loss: -0.172127. Value loss: 1.685091. Entropy: 1.297742.
Iteration 111: Policy loss: -0.111701. Value loss: 1.285365. Entropy: 1.288221.
Training network. lr: 0.000249. clip: 0.099696
Iteration 112: Policy loss: 0.182973. Value loss: 0.846592. Entropy: 1.276863.
Iteration 113: Policy loss: 0.123608. Value loss: 0.738449. Ent

episode: 27   score: 15100.0  epsilon: 1.0    steps: 652  evaluation reward: 20807.40740740741
Training network. lr: 0.000249. clip: 0.099548
Iteration 178: Policy loss: 0.024522. Value loss: 1.066570. Entropy: 1.251285.
Iteration 179: Policy loss: 0.002568. Value loss: 0.774038. Entropy: 1.244492.
Iteration 180: Policy loss: 0.035311. Value loss: 0.542031. Entropy: 1.251379.
Training network. lr: 0.000249. clip: 0.099548
Iteration 181: Policy loss: 0.348849. Value loss: 0.860963. Entropy: 1.250753.
Iteration 182: Policy loss: 0.360043. Value loss: 0.543085. Entropy: 1.267107.
Iteration 183: Policy loss: 0.340807. Value loss: 0.445778. Entropy: 1.261358.
Training network. lr: 0.000249. clip: 0.099548
Iteration 184: Policy loss: -0.182871. Value loss: 1.296346. Entropy: 1.253929.
Iteration 185: Policy loss: -0.199198. Value loss: 1.305865. Entropy: 1.246943.
Iteration 186: Policy loss: -0.209038. Value loss: 1.193934. Entropy: 1.243795.
Training network. lr: 0.000249. clip: 0.099548
Ite

Training network. lr: 0.000248. clip: 0.099235
Iteration 253: Policy loss: -0.319049. Value loss: 1.359535. Entropy: 1.225098.
Iteration 254: Policy loss: -0.315179. Value loss: 1.013290. Entropy: 1.227338.
Iteration 255: Policy loss: -0.313552. Value loss: 0.808596. Entropy: 1.219360.
Training network. lr: 0.000248. clip: 0.099235
Iteration 256: Policy loss: 0.308583. Value loss: 0.695461. Entropy: 1.197481.
Iteration 257: Policy loss: 0.283490. Value loss: 0.415248. Entropy: 1.207493.
Iteration 258: Policy loss: 0.289186. Value loss: 0.344825. Entropy: 1.209764.
Training network. lr: 0.000248. clip: 0.099235
Iteration 259: Policy loss: 0.365938. Value loss: 0.413320. Entropy: 1.214309.
Iteration 260: Policy loss: 0.279146. Value loss: 0.306038. Entropy: 1.213354.
Iteration 261: Policy loss: 0.287859. Value loss: 0.188810. Entropy: 1.214358.
episode: 39   score: 19400.0  epsilon: 1.0    steps: 255  evaluation reward: 21571.79487179487
Training network. lr: 0.000248. clip: 0.099235
Ite

now time :  2019-02-26 18:34:21.096261
episode: 51   score: 15300.0  epsilon: 1.0    steps: 763  evaluation reward: 21550.980392156864
Training network. lr: 0.000248. clip: 0.099088
Iteration 328: Policy loss: -0.078063. Value loss: 1.641254. Entropy: 1.260863.
Iteration 329: Policy loss: -0.147065. Value loss: 1.556226. Entropy: 1.270547.
Iteration 330: Policy loss: -0.147428. Value loss: 1.345164. Entropy: 1.262586.
episode: 52   score: 25400.0  epsilon: 1.0    steps: 438  evaluation reward: 21625.0
Training network. lr: 0.000248. clip: 0.099088
Iteration 331: Policy loss: 0.094786. Value loss: 0.580274. Entropy: 1.262394.
Iteration 332: Policy loss: 0.068061. Value loss: 0.451178. Entropy: 1.273560.
Iteration 333: Policy loss: 0.106265. Value loss: 0.303274. Entropy: 1.273206.
episode: 53   score: 16800.0  epsilon: 1.0    steps: 920  evaluation reward: 21533.962264150945
Training network. lr: 0.000248. clip: 0.099088
Iteration 334: Policy loss: 0.005600. Value loss: 0.746157. Entrop

Iteration 404: Policy loss: -0.031507. Value loss: 0.612422. Entropy: 1.264108.
Iteration 405: Policy loss: -0.022770. Value loss: 0.508492. Entropy: 1.254439.
episode: 61   score: 44800.0  epsilon: 1.0    steps: 962  evaluation reward: 22045.90163934426
Training network. lr: 0.000247. clip: 0.098774
Iteration 406: Policy loss: -0.192920. Value loss: 0.919145. Entropy: 1.230061.
Iteration 407: Policy loss: -0.211573. Value loss: 0.718112. Entropy: 1.193738.
Iteration 408: Policy loss: -0.213745. Value loss: 0.583244. Entropy: 1.195256.
episode: 62   score: 38600.0  epsilon: 1.0    steps: 350  evaluation reward: 22312.90322580645
Training network. lr: 0.000247. clip: 0.098774
Iteration 409: Policy loss: -0.126742. Value loss: 0.927364. Entropy: 1.263301.
Iteration 410: Policy loss: -0.107851. Value loss: 0.700501. Entropy: 1.260257.
Iteration 411: Policy loss: -0.122900. Value loss: 0.716371. Entropy: 1.255312.
Training network. lr: 0.000247. clip: 0.098774
Iteration 412: Policy loss: 0

episode: 74   score: 13200.0  epsilon: 1.0    steps: 82  evaluation reward: 22947.297297297297
Training network. lr: 0.000247. clip: 0.098627
Iteration 478: Policy loss: 0.145831. Value loss: 0.734072. Entropy: 1.184595.
Iteration 479: Policy loss: 0.131309. Value loss: 0.521540. Entropy: 1.200800.
Iteration 480: Policy loss: 0.123923. Value loss: 0.416268. Entropy: 1.190076.
Training network. lr: 0.000247. clip: 0.098627
Iteration 481: Policy loss: 0.128315. Value loss: 0.600195. Entropy: 1.159994.
Iteration 482: Policy loss: 0.123820. Value loss: 0.377291. Entropy: 1.152957.
Iteration 483: Policy loss: 0.127348. Value loss: 0.273005. Entropy: 1.142003.
Training network. lr: 0.000247. clip: 0.098627
Iteration 484: Policy loss: -0.044792. Value loss: 1.398033. Entropy: 1.161748.
Iteration 485: Policy loss: 0.023031. Value loss: 0.742627. Entropy: 1.155169.
Iteration 486: Policy loss: -0.038201. Value loss: 0.576356. Entropy: 1.136416.
episode: 75   score: 29900.0  epsilon: 1.0    steps

Training network. lr: 0.000246. clip: 0.098313
Iteration 553: Policy loss: -0.123705. Value loss: 0.343663. Entropy: 1.188173.
Iteration 554: Policy loss: -0.140674. Value loss: 0.234884. Entropy: 1.182800.
Iteration 555: Policy loss: -0.106437. Value loss: 0.186553. Entropy: 1.176732.
Training network. lr: 0.000246. clip: 0.098313
Iteration 556: Policy loss: -0.058256. Value loss: 1.180871. Entropy: 1.187319.
Iteration 557: Policy loss: -0.071887. Value loss: 0.863398. Entropy: 1.177067.
Iteration 558: Policy loss: -0.103179. Value loss: 0.919431. Entropy: 1.179204.
episode: 86   score: 16300.0  epsilon: 1.0    steps: 514  evaluation reward: 23610.46511627907
Training network. lr: 0.000246. clip: 0.098313
Iteration 559: Policy loss: 0.047139. Value loss: 0.668316. Entropy: 1.106681.
Iteration 560: Policy loss: 0.046547. Value loss: 0.370490. Entropy: 1.113377.
Iteration 561: Policy loss: 0.030040. Value loss: 0.300642. Entropy: 1.097455.
Training network. lr: 0.000246. clip: 0.098313


Iteration 628: Policy loss: 0.184687. Value loss: 0.295518. Entropy: 1.129871.
Iteration 629: Policy loss: 0.189942. Value loss: 0.158017. Entropy: 1.159451.
Iteration 630: Policy loss: 0.189218. Value loss: 0.112671. Entropy: 1.161508.
Training network. lr: 0.000245. clip: 0.098166
Iteration 631: Policy loss: 0.118640. Value loss: 0.743704. Entropy: 1.193213.
Iteration 632: Policy loss: 0.113013. Value loss: 0.478432. Entropy: 1.188289.
Iteration 633: Policy loss: 0.078979. Value loss: 0.357397. Entropy: 1.193517.
Training network. lr: 0.000245. clip: 0.098166
Iteration 634: Policy loss: -0.017169. Value loss: 1.121971. Entropy: 1.123940.
Iteration 635: Policy loss: 0.051235. Value loss: 0.592665. Entropy: 1.121765.
Iteration 636: Policy loss: -0.009278. Value loss: 0.447004. Entropy: 1.121657.
episode: 97   score: 18500.0  epsilon: 1.0    steps: 654  evaluation reward: 23518.556701030928
Training network. lr: 0.000245. clip: 0.098166
Iteration 637: Policy loss: -0.028000. Value loss:

Iteration 704: Policy loss: 0.009614. Value loss: 0.512811. Entropy: 1.074824.
Iteration 705: Policy loss: 0.025747. Value loss: 0.362102. Entropy: 1.074686.
episode: 108   score: 36700.0  epsilon: 1.0    steps: 851  evaluation reward: 23394.0
Training network. lr: 0.000245. clip: 0.097853
Iteration 706: Policy loss: -0.009360. Value loss: 1.063123. Entropy: 1.143048.
Iteration 707: Policy loss: -0.031568. Value loss: 0.781523. Entropy: 1.120737.
Iteration 708: Policy loss: -0.047583. Value loss: 0.641145. Entropy: 1.109197.
Training network. lr: 0.000245. clip: 0.097853
Iteration 709: Policy loss: -0.163909. Value loss: 1.154352. Entropy: 1.143426.
Iteration 710: Policy loss: -0.120494. Value loss: 0.784809. Entropy: 1.130848.
Iteration 711: Policy loss: -0.187408. Value loss: 0.763533. Entropy: 1.124867.
Training network. lr: 0.000245. clip: 0.097853
Iteration 712: Policy loss: 0.043609. Value loss: 1.219809. Entropy: 0.979583.
Iteration 713: Policy loss: 0.035005. Value loss: 0.6615

episode: 119   score: 24200.0  epsilon: 1.0    steps: 547  evaluation reward: 23487.0
episode: 120   score: 38100.0  epsilon: 1.0    steps: 849  evaluation reward: 23786.0
Training network. lr: 0.000244. clip: 0.097705
Iteration 781: Policy loss: -0.048157. Value loss: 1.447751. Entropy: 1.098205.
Iteration 782: Policy loss: -0.031625. Value loss: 0.800205. Entropy: 1.108155.
Iteration 783: Policy loss: -0.107017. Value loss: 0.632029. Entropy: 1.095137.
Training network. lr: 0.000244. clip: 0.097705
Iteration 784: Policy loss: -0.142154. Value loss: 1.614519. Entropy: 1.077729.
Iteration 785: Policy loss: -0.223700. Value loss: 0.960065. Entropy: 1.055352.
Iteration 786: Policy loss: -0.201910. Value loss: 0.807556. Entropy: 1.045791.
episode: 121   score: 19700.0  epsilon: 1.0    steps: 312  evaluation reward: 23839.0
Training network. lr: 0.000244. clip: 0.097705
Iteration 787: Policy loss: -0.049586. Value loss: 0.688110. Entropy: 1.072974.
Iteration 788: Policy loss: -0.079100. Va

Training network. lr: 0.000243. clip: 0.097392
Iteration 856: Policy loss: 0.127739. Value loss: 1.074453. Entropy: 0.978679.
Iteration 857: Policy loss: 0.129910. Value loss: 0.654368. Entropy: 0.989372.
Iteration 858: Policy loss: 0.137831. Value loss: 0.473542. Entropy: 0.987758.
episode: 132   score: 24400.0  epsilon: 1.0    steps: 2  evaluation reward: 23857.0
Training network. lr: 0.000243. clip: 0.097392
Iteration 859: Policy loss: -0.312837. Value loss: 1.596105. Entropy: 1.011684.
Iteration 860: Policy loss: -0.304874. Value loss: 1.031320. Entropy: 1.032305.
Iteration 861: Policy loss: -0.254054. Value loss: 0.753730. Entropy: 0.981950.
Training network. lr: 0.000243. clip: 0.097392
Iteration 862: Policy loss: -0.236215. Value loss: 1.299252. Entropy: 1.029333.
Iteration 863: Policy loss: -0.301605. Value loss: 0.851390. Entropy: 1.007083.
Iteration 864: Policy loss: -0.256442. Value loss: 0.699381. Entropy: 1.010248.
Training network. lr: 0.000243. clip: 0.097392
Iteration 8

Iteration 932: Policy loss: -0.075648. Value loss: 0.438848. Entropy: 0.931886.
Iteration 933: Policy loss: -0.118035. Value loss: 0.389917. Entropy: 0.904911.
Training network. lr: 0.000243. clip: 0.097244
Iteration 934: Policy loss: -0.206477. Value loss: 0.956160. Entropy: 0.907300.
Iteration 935: Policy loss: -0.152583. Value loss: 0.448888. Entropy: 0.919837.
Iteration 936: Policy loss: -0.203128. Value loss: 0.402025. Entropy: 0.898515.
Training network. lr: 0.000243. clip: 0.097244
Iteration 937: Policy loss: 0.128310. Value loss: 0.814875. Entropy: 0.966540.
Iteration 938: Policy loss: 0.059925. Value loss: 0.646787. Entropy: 0.968961.
Iteration 939: Policy loss: 0.105374. Value loss: 0.514699. Entropy: 0.964756.
Training network. lr: 0.000243. clip: 0.097244
Iteration 940: Policy loss: -0.144646. Value loss: 1.590381. Entropy: 0.977069.
Iteration 941: Policy loss: -0.078161. Value loss: 1.049862. Entropy: 0.989064.
Iteration 942: Policy loss: -0.102949. Value loss: 0.655219. E

Training network. lr: 0.000242. clip: 0.096931
Iteration 1009: Policy loss: 0.001327. Value loss: 0.557905. Entropy: 0.962947.
Iteration 1010: Policy loss: 0.016338. Value loss: 0.328680. Entropy: 0.966208.
Iteration 1011: Policy loss: 0.019873. Value loss: 0.250507. Entropy: 0.952147.
episode: 153   score: 17700.0  epsilon: 1.0    steps: 307  evaluation reward: 25125.0
Training network. lr: 0.000242. clip: 0.096931
Iteration 1012: Policy loss: -0.046640. Value loss: 0.369705. Entropy: 0.917552.
Iteration 1013: Policy loss: -0.039661. Value loss: 0.155020. Entropy: 0.926076.
Iteration 1014: Policy loss: -0.047885. Value loss: 0.123702. Entropy: 0.923986.
episode: 154   score: 10000.0  epsilon: 1.0    steps: 230  evaluation reward: 25041.0
episode: 155   score: 22200.0  epsilon: 1.0    steps: 1015  evaluation reward: 25095.0
Training network. lr: 0.000242. clip: 0.096931
Iteration 1015: Policy loss: -0.041895. Value loss: 0.722853. Entropy: 0.955868.
Iteration 1016: Policy loss: -0.0760

Training network. lr: 0.000242. clip: 0.096784
Iteration 1084: Policy loss: 0.063399. Value loss: 0.794299. Entropy: 1.007933.
Iteration 1085: Policy loss: 0.089291. Value loss: 0.531485. Entropy: 1.028481.
Iteration 1086: Policy loss: 0.091212. Value loss: 0.527014. Entropy: 1.021030.
Training network. lr: 0.000242. clip: 0.096784
Iteration 1087: Policy loss: -0.499620. Value loss: 1.226755. Entropy: 0.877790.
Iteration 1088: Policy loss: -0.488551. Value loss: 0.616425. Entropy: 0.866155.
Iteration 1089: Policy loss: -0.561668. Value loss: 0.458807. Entropy: 0.865465.
Training network. lr: 0.000242. clip: 0.096784
Iteration 1090: Policy loss: -0.092323. Value loss: 0.948403. Entropy: 0.933332.
Iteration 1091: Policy loss: -0.091181. Value loss: 0.554349. Entropy: 0.953991.
Iteration 1092: Policy loss: -0.108637. Value loss: 0.565370. Entropy: 0.951645.
Training network. lr: 0.000242. clip: 0.096784
Iteration 1093: Policy loss: 0.217435. Value loss: 1.090174. Entropy: 0.866456.
Iterat

Training network. lr: 0.000241. clip: 0.096470
Iteration 1159: Policy loss: 0.306844. Value loss: 0.770071. Entropy: 0.946885.
Iteration 1160: Policy loss: 0.330598. Value loss: 0.448586. Entropy: 0.938883.
Iteration 1161: Policy loss: 0.277542. Value loss: 0.296407. Entropy: 0.941276.
Training network. lr: 0.000241. clip: 0.096470
Iteration 1162: Policy loss: -0.191615. Value loss: 0.706451. Entropy: 0.959674.
Iteration 1163: Policy loss: -0.206714. Value loss: 0.523701. Entropy: 0.927166.
Iteration 1164: Policy loss: -0.216965. Value loss: 0.367454. Entropy: 0.919653.
Training network. lr: 0.000241. clip: 0.096470
Iteration 1165: Policy loss: -0.089175. Value loss: 0.909919. Entropy: 1.031176.
Iteration 1166: Policy loss: -0.091273. Value loss: 0.542905. Entropy: 1.025719.
Iteration 1167: Policy loss: -0.103446. Value loss: 0.388850. Entropy: 1.018131.
Training network. lr: 0.000241. clip: 0.096470
Iteration 1168: Policy loss: -0.307931. Value loss: 1.596060. Entropy: 0.974654.
Itera

Iteration 1235: Policy loss: 0.215010. Value loss: 0.128708. Entropy: 1.009411.
Iteration 1236: Policy loss: 0.190628. Value loss: 0.101652. Entropy: 1.025356.
Training network. lr: 0.000241. clip: 0.096323
Iteration 1237: Policy loss: -0.275513. Value loss: 1.031775. Entropy: 0.942727.
Iteration 1238: Policy loss: -0.258835. Value loss: 0.812852. Entropy: 0.907313.
Iteration 1239: Policy loss: -0.318530. Value loss: 0.681856. Entropy: 0.902985.
Training network. lr: 0.000241. clip: 0.096323
Iteration 1240: Policy loss: -0.051327. Value loss: 1.127395. Entropy: 0.993752.
Iteration 1241: Policy loss: -0.066948. Value loss: 0.792831. Entropy: 1.007643.
Iteration 1242: Policy loss: -0.096389. Value loss: 0.560579. Entropy: 0.988823.
Training network. lr: 0.000241. clip: 0.096323
Iteration 1243: Policy loss: 0.163913. Value loss: 1.064397. Entropy: 0.921063.
Iteration 1244: Policy loss: 0.120170. Value loss: 0.676804. Entropy: 0.946344.
Iteration 1245: Policy loss: 0.156288. Value loss: 0.

Training network. lr: 0.000240. clip: 0.096009
Iteration 1312: Policy loss: -0.032330. Value loss: 0.879523. Entropy: 1.060517.
Iteration 1313: Policy loss: -0.032286. Value loss: 0.557464. Entropy: 1.036597.
Iteration 1314: Policy loss: -0.020595. Value loss: 0.469353. Entropy: 1.027820.
episode: 197   score: 16000.0  epsilon: 1.0    steps: 682  evaluation reward: 26147.0
Training network. lr: 0.000240. clip: 0.096009
Iteration 1315: Policy loss: 0.244096. Value loss: 0.873914. Entropy: 1.040894.
Iteration 1316: Policy loss: 0.238711. Value loss: 0.608642. Entropy: 1.056974.
Iteration 1317: Policy loss: 0.220665. Value loss: 0.465108. Entropy: 1.045044.
Training network. lr: 0.000240. clip: 0.096009
Iteration 1318: Policy loss: -0.051065. Value loss: 0.464524. Entropy: 1.014868.
Iteration 1319: Policy loss: -0.035889. Value loss: 0.230671. Entropy: 0.997226.
Iteration 1320: Policy loss: -0.018084. Value loss: 0.165676. Entropy: 1.008326.
episode: 198   score: 24000.0  epsilon: 1.0    

Training network. lr: 0.000240. clip: 0.095862
Iteration 1387: Policy loss: -0.055134. Value loss: 1.137029. Entropy: 0.880778.
Iteration 1388: Policy loss: -0.091353. Value loss: 0.504261. Entropy: 0.872057.
Iteration 1389: Policy loss: -0.076133. Value loss: 0.401518. Entropy: 0.846442.
episode: 208   score: 26300.0  epsilon: 1.0    steps: 932  evaluation reward: 26766.0
Training network. lr: 0.000240. clip: 0.095862
Iteration 1390: Policy loss: 0.214452. Value loss: 0.320641. Entropy: 0.942977.
Iteration 1391: Policy loss: 0.210719. Value loss: 0.216659. Entropy: 0.946436.
Iteration 1392: Policy loss: 0.171709. Value loss: 0.136952. Entropy: 0.956246.
Training network. lr: 0.000240. clip: 0.095862
Iteration 1393: Policy loss: -0.216466. Value loss: 1.059518. Entropy: 0.795823.
Iteration 1394: Policy loss: -0.251828. Value loss: 0.602094. Entropy: 0.775601.
Iteration 1395: Policy loss: -0.273751. Value loss: 0.422587. Entropy: 0.773482.
Training network. lr: 0.000240. clip: 0.095862


Iteration 1463: Policy loss: -0.173143. Value loss: 0.751367. Entropy: 0.956360.
Iteration 1464: Policy loss: -0.201546. Value loss: 0.487671. Entropy: 0.961692.
Training network. lr: 0.000239. clip: 0.095549
Iteration 1465: Policy loss: 0.454134. Value loss: 1.188513. Entropy: 0.962206.
Iteration 1466: Policy loss: 0.370611. Value loss: 0.666320. Entropy: 0.975067.
Iteration 1467: Policy loss: 0.372779. Value loss: 0.458427. Entropy: 0.982516.
Training network. lr: 0.000239. clip: 0.095549
Iteration 1468: Policy loss: 0.037813. Value loss: 0.994678. Entropy: 0.939709.
Iteration 1469: Policy loss: -0.009544. Value loss: 0.787927. Entropy: 0.939905.
Iteration 1470: Policy loss: 0.008697. Value loss: 0.548150. Entropy: 0.933425.
Training network. lr: 0.000239. clip: 0.095549
Iteration 1471: Policy loss: 0.112661. Value loss: 1.427309. Entropy: 1.024886.
Iteration 1472: Policy loss: 0.092201. Value loss: 1.084846. Entropy: 1.011006.
Iteration 1473: Policy loss: 0.081382. Value loss: 0.801

Iteration 1538: Policy loss: -0.220526. Value loss: 1.329366. Entropy: 0.941869.
Iteration 1539: Policy loss: -0.194427. Value loss: 1.160975. Entropy: 0.933319.
Training network. lr: 0.000239. clip: 0.095401
Iteration 1540: Policy loss: -0.204871. Value loss: 1.770694. Entropy: 1.012446.
Iteration 1541: Policy loss: -0.176802. Value loss: 1.111081. Entropy: 1.011443.
Iteration 1542: Policy loss: -0.173859. Value loss: 0.720571. Entropy: 1.008268.
episode: 230   score: 28600.0  epsilon: 1.0    steps: 856  evaluation reward: 27843.0
Training network. lr: 0.000239. clip: 0.095401
Iteration 1543: Policy loss: 0.002575. Value loss: 0.881508. Entropy: 1.051295.
Iteration 1544: Policy loss: 0.040660. Value loss: 0.594865. Entropy: 1.066922.
Iteration 1545: Policy loss: 0.003353. Value loss: 0.437252. Entropy: 1.053680.
Training network. lr: 0.000239. clip: 0.095401
Iteration 1546: Policy loss: 0.369009. Value loss: 0.776747. Entropy: 1.032408.
Iteration 1547: Policy loss: 0.354492. Value los

Iteration 1616: Policy loss: 0.103353. Value loss: 0.390182. Entropy: 0.978935.
Iteration 1617: Policy loss: 0.117847. Value loss: 0.293861. Entropy: 0.982502.
Training network. lr: 0.000238. clip: 0.095088
Iteration 1618: Policy loss: -0.344992. Value loss: 1.386098. Entropy: 0.935231.
Iteration 1619: Policy loss: -0.362992. Value loss: 0.689171. Entropy: 0.917077.
Iteration 1620: Policy loss: -0.352044. Value loss: 0.426854. Entropy: 0.912001.
Training network. lr: 0.000238. clip: 0.095088
Iteration 1621: Policy loss: -0.233612. Value loss: 1.835528. Entropy: 0.960539.
Iteration 1622: Policy loss: -0.238587. Value loss: 1.473977. Entropy: 0.952795.
Iteration 1623: Policy loss: -0.246246. Value loss: 0.983269. Entropy: 0.948247.
episode: 239   score: 9500.0  epsilon: 1.0    steps: 711  evaluation reward: 28355.0
Training network. lr: 0.000238. clip: 0.095088
Iteration 1624: Policy loss: -0.008504. Value loss: 1.976823. Entropy: 0.901478.
Iteration 1625: Policy loss: -0.004450. Value l

Iteration 1690: Policy loss: -0.271394. Value loss: 0.888822. Entropy: 1.154159.
Iteration 1691: Policy loss: -0.221524. Value loss: 0.339113. Entropy: 1.150293.
Iteration 1692: Policy loss: -0.280310. Value loss: 0.396517. Entropy: 1.147092.
Training network. lr: 0.000237. clip: 0.094940
Iteration 1693: Policy loss: 0.085848. Value loss: 0.339002. Entropy: 1.132569.
Iteration 1694: Policy loss: 0.094356. Value loss: 0.191201. Entropy: 1.152832.
Iteration 1695: Policy loss: 0.094010. Value loss: 0.125879. Entropy: 1.158264.
Training network. lr: 0.000237. clip: 0.094940
Iteration 1696: Policy loss: 0.179283. Value loss: 0.560625. Entropy: 1.127506.
Iteration 1697: Policy loss: 0.166750. Value loss: 0.335808. Entropy: 1.148589.
Iteration 1698: Policy loss: 0.137700. Value loss: 0.252876. Entropy: 1.146796.
episode: 252   score: 16600.0  epsilon: 1.0    steps: 312  evaluation reward: 27914.0
Training network. lr: 0.000237. clip: 0.094940
Iteration 1699: Policy loss: 0.169295. Value loss:

Iteration 1767: Policy loss: 0.052763. Value loss: 0.259531. Entropy: 1.097384.
episode: 262   score: 28500.0  epsilon: 1.0    steps: 193  evaluation reward: 28081.0
episode: 263   score: 18400.0  epsilon: 1.0    steps: 650  evaluation reward: 27942.0
Training network. lr: 0.000237. clip: 0.094627
Iteration 1768: Policy loss: 0.105199. Value loss: 0.495276. Entropy: 1.136034.
Iteration 1769: Policy loss: 0.097994. Value loss: 0.370196. Entropy: 1.140321.
Iteration 1770: Policy loss: 0.089760. Value loss: 0.289810. Entropy: 1.144815.
episode: 264   score: 30400.0  epsilon: 1.0    steps: 457  evaluation reward: 27789.0
Training network. lr: 0.000237. clip: 0.094627
Iteration 1771: Policy loss: 0.045212. Value loss: 0.422451. Entropy: 1.146236.
Iteration 1772: Policy loss: 0.050433. Value loss: 0.295003. Entropy: 1.157680.
Iteration 1773: Policy loss: 0.028881. Value loss: 0.224690. Entropy: 1.139816.
Training network. lr: 0.000237. clip: 0.094627
Iteration 1774: Policy loss: -0.057700. V

episode: 273   score: 24300.0  epsilon: 1.0    steps: 762  evaluation reward: 27090.0
Training network. lr: 0.000236. clip: 0.094480
Iteration 1843: Policy loss: 0.315155. Value loss: 0.506667. Entropy: 1.080378.
Iteration 1844: Policy loss: 0.298985. Value loss: 0.421192. Entropy: 1.090378.
Iteration 1845: Policy loss: 0.307492. Value loss: 0.287037. Entropy: 1.090887.
Training network. lr: 0.000236. clip: 0.094480
Iteration 1846: Policy loss: -0.223010. Value loss: 0.985122. Entropy: 1.079501.
Iteration 1847: Policy loss: -0.240709. Value loss: 0.552212. Entropy: 1.064292.
Iteration 1848: Policy loss: -0.250604. Value loss: 0.434134. Entropy: 1.066610.
episode: 274   score: 41500.0  epsilon: 1.0    steps: 468  evaluation reward: 27103.0
episode: 275   score: 43200.0  epsilon: 1.0    steps: 975  evaluation reward: 27428.0
Training network. lr: 0.000236. clip: 0.094480
Iteration 1849: Policy loss: 0.186847. Value loss: 0.555506. Entropy: 1.113829.
Iteration 1850: Policy loss: 0.188972.

Iteration 1917: Policy loss: 0.036406. Value loss: 0.168588. Entropy: 1.098702.
episode: 286   score: 23600.0  epsilon: 1.0    steps: 861  evaluation reward: 27136.0
Training network. lr: 0.000235. clip: 0.094166
Iteration 1918: Policy loss: -0.064894. Value loss: 0.378287. Entropy: 1.069622.
Iteration 1919: Policy loss: -0.059117. Value loss: 0.139654. Entropy: 1.095244.
Iteration 1920: Policy loss: -0.046436. Value loss: 0.084926. Entropy: 1.098096.
Training network. lr: 0.000235. clip: 0.094166
Iteration 1921: Policy loss: -0.059771. Value loss: 0.295736. Entropy: 1.049020.
Iteration 1922: Policy loss: -0.081804. Value loss: 0.131585. Entropy: 1.052298.
Iteration 1923: Policy loss: -0.062521. Value loss: 0.084918. Entropy: 1.043235.
Training network. lr: 0.000235. clip: 0.094166
Iteration 1924: Policy loss: -0.211374. Value loss: 0.619876. Entropy: 1.138084.
Iteration 1925: Policy loss: -0.227543. Value loss: 0.389005. Entropy: 1.115300.
Iteration 1926: Policy loss: -0.191547. Value

Iteration 1992: Policy loss: -0.475034. Value loss: 1.330563. Entropy: 1.029596.
Training network. lr: 0.000235. clip: 0.094019
Iteration 1993: Policy loss: 0.068024. Value loss: 2.045343. Entropy: 1.085611.
Iteration 1994: Policy loss: 0.091553. Value loss: 1.362905. Entropy: 1.082451.
Iteration 1995: Policy loss: 0.062622. Value loss: 0.980541. Entropy: 1.065944.
Training network. lr: 0.000235. clip: 0.094019
Iteration 1996: Policy loss: 0.191584. Value loss: 0.766560. Entropy: 1.126898.
Iteration 1997: Policy loss: 0.164707. Value loss: 0.494931. Entropy: 1.139628.
Iteration 1998: Policy loss: 0.195762. Value loss: 0.399920. Entropy: 1.124783.
episode: 298   score: 14200.0  epsilon: 1.0    steps: 486  evaluation reward: 26440.0
Training network. lr: 0.000235. clip: 0.094019
Iteration 1999: Policy loss: 0.065072. Value loss: 0.854200. Entropy: 1.163264.
Iteration 2000: Policy loss: -0.007569. Value loss: 0.924741. Entropy: 1.181427.
Iteration 2001: Policy loss: 0.029829. Value loss: 

Training network. lr: 0.000234. clip: 0.093705
Iteration 2068: Policy loss: -0.067593. Value loss: 1.318424. Entropy: 1.005637.
Iteration 2069: Policy loss: -0.081279. Value loss: 0.935653. Entropy: 1.017034.
Iteration 2070: Policy loss: -0.053555. Value loss: 0.699220. Entropy: 1.010035.
Training network. lr: 0.000234. clip: 0.093705
Iteration 2071: Policy loss: 0.307749. Value loss: 1.140168. Entropy: 1.018384.
Iteration 2072: Policy loss: 0.304494. Value loss: 0.777372. Entropy: 1.042602.
Iteration 2073: Policy loss: 0.276209. Value loss: 0.602204. Entropy: 1.050726.
episode: 309   score: 14200.0  epsilon: 1.0    steps: 674  evaluation reward: 26384.0
Training network. lr: 0.000234. clip: 0.093705
Iteration 2074: Policy loss: 0.008919. Value loss: 0.366637. Entropy: 1.089051.
Iteration 2075: Policy loss: -0.013424. Value loss: 0.232525. Entropy: 1.056661.
Iteration 2076: Policy loss: 0.017388. Value loss: 0.115432. Entropy: 1.061461.
Training network. lr: 0.000234. clip: 0.093705
It

Iteration 2142: Policy loss: -0.043710. Value loss: 0.226779. Entropy: 1.086617.
Training network. lr: 0.000234. clip: 0.093558
Iteration 2143: Policy loss: -0.014552. Value loss: 0.444630. Entropy: 1.116626.
Iteration 2144: Policy loss: -0.021469. Value loss: 0.287042. Entropy: 1.118919.
Iteration 2145: Policy loss: -0.018920. Value loss: 0.209012. Entropy: 1.115874.
Training network. lr: 0.000234. clip: 0.093558
Iteration 2146: Policy loss: -0.052022. Value loss: 0.770784. Entropy: 1.112526.
Iteration 2147: Policy loss: -0.052222. Value loss: 0.343885. Entropy: 1.113631.
Iteration 2148: Policy loss: -0.044744. Value loss: 0.207191. Entropy: 1.102540.
Training network. lr: 0.000234. clip: 0.093558
Iteration 2149: Policy loss: -0.018720. Value loss: 0.650729. Entropy: 1.092017.
Iteration 2150: Policy loss: -0.021623. Value loss: 0.369682. Entropy: 1.091038.
Iteration 2151: Policy loss: 0.033516. Value loss: 0.233288. Entropy: 1.084072.
Training network. lr: 0.000234. clip: 0.093401
Ite

Iteration 2218: Policy loss: -0.053004. Value loss: 1.450286. Entropy: 1.016820.
Iteration 2219: Policy loss: -0.035440. Value loss: 0.830844. Entropy: 1.003386.
Iteration 2220: Policy loss: -0.058279. Value loss: 0.706350. Entropy: 1.001137.
episode: 332   score: 10000.0  epsilon: 1.0    steps: 574  evaluation reward: 24639.0
episode: 333   score: 36000.0  epsilon: 1.0    steps: 909  evaluation reward: 24723.0
Training network. lr: 0.000233. clip: 0.093245
Iteration 2221: Policy loss: 0.276470. Value loss: 0.832167. Entropy: 0.988220.
Iteration 2222: Policy loss: 0.271693. Value loss: 0.401121. Entropy: 0.989982.
Iteration 2223: Policy loss: 0.277072. Value loss: 0.300246. Entropy: 1.000000.
Training network. lr: 0.000233. clip: 0.093245
Iteration 2224: Policy loss: 0.007430. Value loss: 1.013557. Entropy: 1.001028.
Iteration 2225: Policy loss: 0.016916. Value loss: 0.547848. Entropy: 1.017629.
Iteration 2226: Policy loss: -0.006820. Value loss: 0.434966. Entropy: 1.027899.
Training n

Iteration 2293: Policy loss: 0.070721. Value loss: 0.371694. Entropy: 1.098978.
Iteration 2294: Policy loss: 0.069635. Value loss: 0.179331. Entropy: 1.107667.
Iteration 2295: Policy loss: 0.056512. Value loss: 0.144252. Entropy: 1.107585.
episode: 344   score: 10200.0  epsilon: 1.0    steps: 117  evaluation reward: 24363.0
Training network. lr: 0.000233. clip: 0.093097
Iteration 2296: Policy loss: 0.041515. Value loss: 1.104361. Entropy: 1.117005.
Iteration 2297: Policy loss: 0.033297. Value loss: 0.804936. Entropy: 1.111684.
Iteration 2298: Policy loss: -0.009026. Value loss: 0.613757. Entropy: 1.120499.
episode: 345   score: 21200.0  epsilon: 1.0    steps: 364  evaluation reward: 24305.0
episode: 346   score: 41200.0  epsilon: 1.0    steps: 556  evaluation reward: 24294.0
Training network. lr: 0.000233. clip: 0.093097
Iteration 2299: Policy loss: 0.008785. Value loss: 0.748979. Entropy: 1.087337.
Iteration 2300: Policy loss: -0.004842. Value loss: 0.541865. Entropy: 1.070123.
Iterat

Training network. lr: 0.000232. clip: 0.092784
Iteration 2368: Policy loss: 0.062499. Value loss: 1.101542. Entropy: 0.983878.
Iteration 2369: Policy loss: 0.108426. Value loss: 0.687041. Entropy: 0.999402.
Iteration 2370: Policy loss: 0.071982. Value loss: 0.477845. Entropy: 0.990035.
Training network. lr: 0.000232. clip: 0.092784
Iteration 2371: Policy loss: -0.240429. Value loss: 1.816646. Entropy: 1.037873.
Iteration 2372: Policy loss: -0.262171. Value loss: 1.232982. Entropy: 1.064201.
Iteration 2373: Policy loss: -0.220630. Value loss: 1.029618. Entropy: 1.043975.
Training network. lr: 0.000232. clip: 0.092784
Iteration 2374: Policy loss: 0.079877. Value loss: 0.855081. Entropy: 1.006151.
Iteration 2375: Policy loss: 0.003736. Value loss: 0.574205. Entropy: 0.991091.
Iteration 2376: Policy loss: 0.058495. Value loss: 0.407365. Entropy: 0.980834.
Training network. lr: 0.000232. clip: 0.092784
Iteration 2377: Policy loss: 0.289259. Value loss: 0.697244. Entropy: 1.039489.
Iteration

Iteration 2446: Policy loss: 0.137001. Value loss: 1.156412. Entropy: 1.054001.
Iteration 2447: Policy loss: 0.124192. Value loss: 0.724368. Entropy: 1.062372.
Iteration 2448: Policy loss: 0.092393. Value loss: 0.507465. Entropy: 1.059583.
episode: 364   score: 61500.0  epsilon: 1.0    steps: 11  evaluation reward: 26081.0
Training network. lr: 0.000232. clip: 0.092636
Iteration 2449: Policy loss: 0.066066. Value loss: 0.832650. Entropy: 1.083173.
Iteration 2450: Policy loss: 0.081774. Value loss: 0.535230. Entropy: 1.090194.
Iteration 2451: Policy loss: 0.027535. Value loss: 0.416831. Entropy: 1.080376.
Training network. lr: 0.000231. clip: 0.092480
Iteration 2452: Policy loss: 0.185122. Value loss: 0.552272. Entropy: 1.083414.
Iteration 2453: Policy loss: 0.192681. Value loss: 0.345542. Entropy: 1.073259.
Iteration 2454: Policy loss: 0.153649. Value loss: 0.272632. Entropy: 1.078152.
episode: 365   score: 19900.0  epsilon: 1.0    steps: 898  evaluation reward: 25875.0
Training networ

Iteration 2522: Policy loss: 0.000485. Value loss: 0.370450. Entropy: 1.079147.
Iteration 2523: Policy loss: -0.064928. Value loss: 0.407966. Entropy: 1.082978.
episode: 375   score: 37800.0  epsilon: 1.0    steps: 956  evaluation reward: 26375.0
Training network. lr: 0.000231. clip: 0.092323
Iteration 2524: Policy loss: -0.121264. Value loss: 1.250165. Entropy: 1.064980.
Iteration 2525: Policy loss: -0.199186. Value loss: 0.805853. Entropy: 1.067359.
Iteration 2526: Policy loss: -0.193346. Value loss: 0.649597. Entropy: 1.060284.
Training network. lr: 0.000231. clip: 0.092323
Iteration 2527: Policy loss: 0.537976. Value loss: 0.951595. Entropy: 1.103131.
Iteration 2528: Policy loss: 0.438499. Value loss: 0.508772. Entropy: 1.129415.
Iteration 2529: Policy loss: 0.455536. Value loss: 0.375715. Entropy: 1.116011.
Training network. lr: 0.000231. clip: 0.092323
Iteration 2530: Policy loss: -0.003842. Value loss: 1.220248. Entropy: 1.072904.
Iteration 2531: Policy loss: -0.107030. Value lo

Iteration 2597: Policy loss: -0.002811. Value loss: 0.523768. Entropy: 1.103258.
Iteration 2598: Policy loss: -0.015669. Value loss: 0.354681. Entropy: 1.089939.
Training network. lr: 0.000230. clip: 0.092176
Iteration 2599: Policy loss: -0.140292. Value loss: 0.582795. Entropy: 1.039226.
Iteration 2600: Policy loss: -0.166966. Value loss: 0.341562. Entropy: 1.032458.
Iteration 2601: Policy loss: -0.160244. Value loss: 0.234574. Entropy: 1.035401.
Training network. lr: 0.000230. clip: 0.092019
Iteration 2602: Policy loss: 0.096741. Value loss: 0.566164. Entropy: 1.097844.
Iteration 2603: Policy loss: 0.115148. Value loss: 0.255575. Entropy: 1.107020.
Iteration 2604: Policy loss: 0.117339. Value loss: 0.207104. Entropy: 1.100548.
episode: 387   score: 18600.0  epsilon: 1.0    steps: 737  evaluation reward: 26444.0
Training network. lr: 0.000230. clip: 0.092019
Iteration 2605: Policy loss: 0.168369. Value loss: 0.366277. Entropy: 1.113543.
Iteration 2606: Policy loss: 0.162252. Value los

Iteration 2672: Policy loss: -0.191954. Value loss: 0.541140. Entropy: 1.123427.
Iteration 2673: Policy loss: -0.239273. Value loss: 0.357295. Entropy: 1.123935.
Training network. lr: 0.000230. clip: 0.091862
Iteration 2674: Policy loss: -0.133540. Value loss: 0.950103. Entropy: 1.120155.
Iteration 2675: Policy loss: -0.219649. Value loss: 0.456425. Entropy: 1.134229.
Iteration 2676: Policy loss: -0.125502. Value loss: 0.271291. Entropy: 1.121947.
Training network. lr: 0.000230. clip: 0.091862
Iteration 2677: Policy loss: 0.162243. Value loss: 0.397512. Entropy: 1.144106.
Iteration 2678: Policy loss: 0.149946. Value loss: 0.201392. Entropy: 1.153731.
Iteration 2679: Policy loss: 0.152183. Value loss: 0.118810. Entropy: 1.157986.
Training network. lr: 0.000230. clip: 0.091862
Iteration 2680: Policy loss: -0.120648. Value loss: 0.948575. Entropy: 1.148942.
Iteration 2681: Policy loss: -0.121495. Value loss: 0.436931. Entropy: 1.135522.
Iteration 2682: Policy loss: -0.087778. Value loss: 

Iteration 2748: Policy loss: -0.039422. Value loss: 0.372106. Entropy: 1.052293.
episode: 409   score: 18800.0  epsilon: 1.0    steps: 558  evaluation reward: 26651.0
Training network. lr: 0.000229. clip: 0.091715
Iteration 2749: Policy loss: -0.052282. Value loss: 1.484184. Entropy: 1.081444.
Iteration 2750: Policy loss: -0.097523. Value loss: 0.707635. Entropy: 1.072140.
Iteration 2751: Policy loss: -0.023989. Value loss: 0.468320. Entropy: 1.057519.
Training network. lr: 0.000229. clip: 0.091558
Iteration 2752: Policy loss: -0.002417. Value loss: 1.247566. Entropy: 1.080440.
Iteration 2753: Policy loss: 0.000831. Value loss: 0.634060. Entropy: 1.065221.
Iteration 2754: Policy loss: -0.013202. Value loss: 0.562658. Entropy: 1.058711.
Training network. lr: 0.000229. clip: 0.091558
Iteration 2755: Policy loss: 0.268231. Value loss: 0.844193. Entropy: 1.034746.
Iteration 2756: Policy loss: 0.348727. Value loss: 0.474760. Entropy: 1.047433.
Iteration 2757: Policy loss: 0.272223. Value lo

episode: 421   score: 19800.0  epsilon: 1.0    steps: 832  evaluation reward: 27659.0
Training network. lr: 0.000229. clip: 0.091401
Iteration 2824: Policy loss: 0.037838. Value loss: 0.426764. Entropy: 1.009029.
Iteration 2825: Policy loss: 0.014013. Value loss: 0.225513. Entropy: 1.005856.
Iteration 2826: Policy loss: 0.002895. Value loss: 0.150299. Entropy: 1.019892.
Training network. lr: 0.000229. clip: 0.091401
Iteration 2827: Policy loss: -0.392426. Value loss: 1.701659. Entropy: 0.995840.
Iteration 2828: Policy loss: -0.328085. Value loss: 1.039574. Entropy: 0.990765.
Iteration 2829: Policy loss: -0.396722. Value loss: 1.045666. Entropy: 0.978643.
episode: 422   score: 49000.0  epsilon: 1.0    steps: 574  evaluation reward: 27848.0
episode: 423   score: 23000.0  epsilon: 1.0    steps: 705  evaluation reward: 27955.0
Training network. lr: 0.000229. clip: 0.091401
Iteration 2830: Policy loss: 0.159367. Value loss: 0.338496. Entropy: 1.002496.
Iteration 2831: Policy loss: 0.154099.

Iteration 2899: Policy loss: -0.255600. Value loss: 0.944962. Entropy: 0.985769.
Iteration 2900: Policy loss: -0.306392. Value loss: 0.668720. Entropy: 0.988316.
Iteration 2901: Policy loss: -0.232394. Value loss: 0.476674. Entropy: 0.971883.
Training network. lr: 0.000228. clip: 0.091097
Iteration 2902: Policy loss: 0.397893. Value loss: 0.753721. Entropy: 1.004204.
Iteration 2903: Policy loss: 0.362117. Value loss: 0.495170. Entropy: 1.014802.
Iteration 2904: Policy loss: 0.360006. Value loss: 0.340415. Entropy: 1.013502.
Training network. lr: 0.000228. clip: 0.091097
Iteration 2905: Policy loss: 0.113105. Value loss: 0.566920. Entropy: 1.012365.
Iteration 2906: Policy loss: 0.067219. Value loss: 0.391850. Entropy: 1.045503.
Iteration 2907: Policy loss: 0.097791. Value loss: 0.313102. Entropy: 1.021691.
Training network. lr: 0.000228. clip: 0.091097
Iteration 2908: Policy loss: -0.176478. Value loss: 1.404504. Entropy: 1.082303.
Iteration 2909: Policy loss: -0.179063. Value loss: 0.7

Iteration 2974: Policy loss: -0.124496. Value loss: 0.864767. Entropy: 1.027648.
Iteration 2975: Policy loss: -0.093082. Value loss: 0.744242. Entropy: 1.032375.
Iteration 2976: Policy loss: -0.127188. Value loss: 0.476897. Entropy: 1.019714.
Training network. lr: 0.000227. clip: 0.090941
Iteration 2977: Policy loss: 0.246310. Value loss: 0.379349. Entropy: 1.113528.
Iteration 2978: Policy loss: 0.206768. Value loss: 0.149502. Entropy: 1.116347.
Iteration 2979: Policy loss: 0.220428. Value loss: 0.105587. Entropy: 1.117789.
Training network. lr: 0.000227. clip: 0.090941
Iteration 2980: Policy loss: -0.073140. Value loss: 0.725715. Entropy: 1.072300.
Iteration 2981: Policy loss: -0.127422. Value loss: 0.541079. Entropy: 1.087970.
Iteration 2982: Policy loss: -0.089168. Value loss: 0.315953. Entropy: 1.075073.
Training network. lr: 0.000227. clip: 0.090941
Iteration 2983: Policy loss: 0.132105. Value loss: 0.397779. Entropy: 1.082800.
Iteration 2984: Policy loss: 0.160739. Value loss: 0.

episode: 456   score: 9000.0  epsilon: 1.0    steps: 539  evaluation reward: 27747.0
Training network. lr: 0.000227. clip: 0.090793
Iteration 3049: Policy loss: -0.127617. Value loss: 0.657906. Entropy: 1.126771.
Iteration 3050: Policy loss: -0.204442. Value loss: 0.474417. Entropy: 1.114477.
Iteration 3051: Policy loss: -0.171538. Value loss: 0.273853. Entropy: 1.110146.
episode: 457   score: 11400.0  epsilon: 1.0    steps: 326  evaluation reward: 27505.0
Training network. lr: 0.000227. clip: 0.090637
Iteration 3052: Policy loss: -0.151075. Value loss: 1.232231. Entropy: 1.130249.
Iteration 3053: Policy loss: -0.145632. Value loss: 0.960678. Entropy: 1.128635.
Iteration 3054: Policy loss: -0.173303. Value loss: 0.751621. Entropy: 1.130148.
episode: 458   score: 26300.0  epsilon: 1.0    steps: 1000  evaluation reward: 27162.0
Training network. lr: 0.000227. clip: 0.090637
Iteration 3055: Policy loss: 0.204125. Value loss: 0.750744. Entropy: 1.105129.
Iteration 3056: Policy loss: 0.2026

Iteration 3123: Policy loss: -0.009596. Value loss: 0.381990. Entropy: 1.070564.
Training network. lr: 0.000226. clip: 0.090480
Iteration 3124: Policy loss: -0.122159. Value loss: 0.700306. Entropy: 1.049578.
Iteration 3125: Policy loss: -0.149787. Value loss: 0.316565. Entropy: 1.051993.
Iteration 3126: Policy loss: -0.161220. Value loss: 0.277609. Entropy: 1.056635.
episode: 469   score: 27200.0  epsilon: 1.0    steps: 872  evaluation reward: 26285.0
Training network. lr: 0.000226. clip: 0.090480
Iteration 3127: Policy loss: 0.071406. Value loss: 1.027124. Entropy: 1.078805.
Iteration 3128: Policy loss: 0.042191. Value loss: 0.501856. Entropy: 1.103928.
Iteration 3129: Policy loss: 0.038027. Value loss: 0.385195. Entropy: 1.084350.
Training network. lr: 0.000226. clip: 0.090480
Iteration 3130: Policy loss: -0.132666. Value loss: 0.803722. Entropy: 1.093443.
Iteration 3131: Policy loss: -0.120395. Value loss: 0.367539. Entropy: 1.074132.
Iteration 3132: Policy loss: -0.105232. Value l

Training network. lr: 0.000226. clip: 0.090332
Iteration 3199: Policy loss: 0.039486. Value loss: 0.296164. Entropy: 1.093208.
Iteration 3200: Policy loss: 0.041072. Value loss: 0.208796. Entropy: 1.096178.
Iteration 3201: Policy loss: 0.045952. Value loss: 0.139478. Entropy: 1.094036.
Training network. lr: 0.000225. clip: 0.090176
Iteration 3202: Policy loss: 0.051244. Value loss: 0.356010. Entropy: 1.077672.
Iteration 3203: Policy loss: 0.048921. Value loss: 0.208892. Entropy: 1.080166.
Iteration 3204: Policy loss: 0.030026. Value loss: 0.148518. Entropy: 1.084157.
episode: 480   score: 14800.0  epsilon: 1.0    steps: 632  evaluation reward: 25948.0
Training network. lr: 0.000225. clip: 0.090176
Iteration 3205: Policy loss: -0.269593. Value loss: 1.168882. Entropy: 1.069829.
Iteration 3206: Policy loss: -0.241210. Value loss: 0.530953. Entropy: 1.073758.
Iteration 3207: Policy loss: -0.326608. Value loss: 0.478557. Entropy: 1.063838.
episode: 481   score: 24000.0  epsilon: 1.0    ste

Training network. lr: 0.000225. clip: 0.090019
Iteration 3274: Policy loss: 0.124669. Value loss: 0.685106. Entropy: 0.947459.
Iteration 3275: Policy loss: 0.129400. Value loss: 0.345244. Entropy: 0.958922.
Iteration 3276: Policy loss: 0.100146. Value loss: 0.255677. Entropy: 0.937065.
Training network. lr: 0.000225. clip: 0.090019
Iteration 3277: Policy loss: -0.492774. Value loss: 1.315004. Entropy: 0.931356.
Iteration 3278: Policy loss: -0.492182. Value loss: 0.638804. Entropy: 0.902806.
Iteration 3279: Policy loss: -0.506410. Value loss: 0.407312. Entropy: 0.893178.
Training network. lr: 0.000225. clip: 0.090019
Iteration 3280: Policy loss: 0.146208. Value loss: 1.020036. Entropy: 0.969051.
Iteration 3281: Policy loss: 0.079218. Value loss: 0.502799. Entropy: 0.992829.
Iteration 3282: Policy loss: 0.113994. Value loss: 0.365959. Entropy: 0.989597.
Training network. lr: 0.000225. clip: 0.090019
Iteration 3283: Policy loss: -0.026162. Value loss: 0.983077. Entropy: 0.998035.
Iteratio

Iteration 3348: Policy loss: -0.142506. Value loss: 0.473819. Entropy: 1.094298.
Training network. lr: 0.000225. clip: 0.089872
Iteration 3349: Policy loss: -0.030950. Value loss: 0.696501. Entropy: 1.101373.
Iteration 3350: Policy loss: -0.044204. Value loss: 0.330233. Entropy: 1.109774.
Iteration 3351: Policy loss: -0.057531. Value loss: 0.228685. Entropy: 1.117254.
Training network. lr: 0.000224. clip: 0.089715
Iteration 3352: Policy loss: -0.281303. Value loss: 1.686554. Entropy: 1.142106.
Iteration 3353: Policy loss: -0.164021. Value loss: 0.893630. Entropy: 1.131182.
Iteration 3354: Policy loss: -0.231876. Value loss: 0.834372. Entropy: 1.116896.
Training network. lr: 0.000224. clip: 0.089715
Iteration 3355: Policy loss: -0.049272. Value loss: 1.158059. Entropy: 1.089922.
Iteration 3356: Policy loss: -0.104693. Value loss: 0.623352. Entropy: 1.085921.
Iteration 3357: Policy loss: -0.063265. Value loss: 0.435420. Entropy: 1.091194.
episode: 504   score: 18000.0  epsilon: 1.0    st

Iteration 3424: Policy loss: -0.020211. Value loss: 0.893951. Entropy: 1.104280.
Iteration 3425: Policy loss: -0.049646. Value loss: 0.485281. Entropy: 1.095613.
Iteration 3426: Policy loss: -0.033479. Value loss: 0.348728. Entropy: 1.096951.
episode: 514   score: 31600.0  epsilon: 1.0    steps: 181  evaluation reward: 26276.0
Training network. lr: 0.000224. clip: 0.089558
Iteration 3427: Policy loss: -0.067828. Value loss: 0.557210. Entropy: 1.116769.
Iteration 3428: Policy loss: -0.047035. Value loss: 0.392519. Entropy: 1.127981.
Iteration 3429: Policy loss: -0.045176. Value loss: 0.293172. Entropy: 1.122703.
Training network. lr: 0.000224. clip: 0.089558
Iteration 3430: Policy loss: -0.143358. Value loss: 1.141421. Entropy: 1.134987.
Iteration 3431: Policy loss: -0.146670. Value loss: 0.578067. Entropy: 1.138771.
Iteration 3432: Policy loss: -0.216141. Value loss: 0.341801. Entropy: 1.118229.
Training network. lr: 0.000224. clip: 0.089558
Iteration 3433: Policy loss: -0.032596. Valu

Training network. lr: 0.000224. clip: 0.089411
Iteration 3499: Policy loss: 0.047813. Value loss: 0.696293. Entropy: 1.112862.
Iteration 3500: Policy loss: 0.011603. Value loss: 0.340547. Entropy: 1.117364.
Iteration 3501: Policy loss: 0.018315. Value loss: 0.223854. Entropy: 1.122617.
Training network. lr: 0.000223. clip: 0.089254
Iteration 3502: Policy loss: 0.239469. Value loss: 0.550703. Entropy: 1.104887.
Iteration 3503: Policy loss: 0.204056. Value loss: 0.351261. Entropy: 1.116806.
Iteration 3504: Policy loss: 0.206853. Value loss: 0.261858. Entropy: 1.110244.
Training network. lr: 0.000223. clip: 0.089254
Iteration 3505: Policy loss: -0.121088. Value loss: 0.997647. Entropy: 1.126553.
Iteration 3506: Policy loss: -0.177658. Value loss: 0.496232. Entropy: 1.106562.
Iteration 3507: Policy loss: -0.112538. Value loss: 0.284304. Entropy: 1.106024.
episode: 526   score: 18700.0  epsilon: 1.0    steps: 717  evaluation reward: 25744.0
Training network. lr: 0.000223. clip: 0.089254
Ite

Iteration 3574: Policy loss: -0.204901. Value loss: 1.453907. Entropy: 1.069928.
Iteration 3575: Policy loss: -0.205166. Value loss: 0.859831. Entropy: 1.090533.
Iteration 3576: Policy loss: -0.245858. Value loss: 0.548474. Entropy: 1.074532.
episode: 537   score: 32300.0  epsilon: 1.0    steps: 94  evaluation reward: 25667.0
Training network. lr: 0.000223. clip: 0.089097
Iteration 3577: Policy loss: 0.012718. Value loss: 1.191248. Entropy: 1.066805.
Iteration 3578: Policy loss: -0.020999. Value loss: 0.695496. Entropy: 1.066946.
Iteration 3579: Policy loss: 0.054371. Value loss: 0.487086. Entropy: 1.069325.
episode: 538   score: 15000.0  epsilon: 1.0    steps: 467  evaluation reward: 25578.0
Training network. lr: 0.000223. clip: 0.089097
Iteration 3580: Policy loss: 0.072806. Value loss: 0.541148. Entropy: 1.090289.
Iteration 3581: Policy loss: 0.110744. Value loss: 0.267314. Entropy: 1.106621.
Iteration 3582: Policy loss: 0.091833. Value loss: 0.181227. Entropy: 1.106824.
episode: 53

Iteration 3649: Policy loss: 0.029334. Value loss: 1.022565. Entropy: 1.068712.
Iteration 3650: Policy loss: 0.025449. Value loss: 0.462072. Entropy: 1.058951.
Iteration 3651: Policy loss: 0.055325. Value loss: 0.337496. Entropy: 1.054959.
Training network. lr: 0.000222. clip: 0.088793
Iteration 3652: Policy loss: 0.337220. Value loss: 0.969316. Entropy: 1.033328.
Iteration 3653: Policy loss: 0.328264. Value loss: 0.635619. Entropy: 1.038704.
Iteration 3654: Policy loss: 0.312020. Value loss: 0.496374. Entropy: 1.045238.
Training network. lr: 0.000222. clip: 0.088793
Iteration 3655: Policy loss: -0.038640. Value loss: 1.234845. Entropy: 1.041570.
Iteration 3656: Policy loss: -0.064249. Value loss: 0.919824. Entropy: 1.052983.
Iteration 3657: Policy loss: -0.020595. Value loss: 0.501491. Entropy: 1.061763.
Training network. lr: 0.000222. clip: 0.088793
Iteration 3658: Policy loss: 0.052320. Value loss: 1.177163. Entropy: 1.077375.
Iteration 3659: Policy loss: 0.079101. Value loss: 0.692

Training network. lr: 0.000222. clip: 0.088637
Iteration 3727: Policy loss: -0.017892. Value loss: 0.631214. Entropy: 1.053094.
Iteration 3728: Policy loss: -0.059151. Value loss: 0.347556. Entropy: 1.068154.
Iteration 3729: Policy loss: -0.033378. Value loss: 0.262031. Entropy: 1.067997.
episode: 558   score: 32800.0  epsilon: 1.0    steps: 86  evaluation reward: 26623.0
episode: 559   score: 21100.0  epsilon: 1.0    steps: 789  evaluation reward: 26736.0
Training network. lr: 0.000222. clip: 0.088637
Iteration 3730: Policy loss: -0.164696. Value loss: 0.619802. Entropy: 1.127862.
Iteration 3731: Policy loss: -0.166657. Value loss: 0.384124. Entropy: 1.131251.
Iteration 3732: Policy loss: -0.176871. Value loss: 0.326368. Entropy: 1.116089.
episode: 560   score: 23800.0  epsilon: 1.0    steps: 327  evaluation reward: 26782.0
episode: 561   score: 25300.0  epsilon: 1.0    steps: 434  evaluation reward: 26778.0
episode: 562   score: 27100.0  epsilon: 1.0    steps: 899  evaluation reward:

Training network. lr: 0.000221. clip: 0.088333
Iteration 3802: Policy loss: 0.145482. Value loss: 1.078404. Entropy: 1.071941.
Iteration 3803: Policy loss: 0.166751. Value loss: 0.592854. Entropy: 1.051741.
Iteration 3804: Policy loss: 0.130134. Value loss: 0.411395. Entropy: 1.056379.
Training network. lr: 0.000221. clip: 0.088333
Iteration 3805: Policy loss: 0.481212. Value loss: 0.895492. Entropy: 1.065177.
Iteration 3806: Policy loss: 0.453421. Value loss: 0.338776. Entropy: 1.083538.
Iteration 3807: Policy loss: 0.471590. Value loss: 0.210325. Entropy: 1.072307.
Training network. lr: 0.000221. clip: 0.088333
Iteration 3808: Policy loss: -0.240037. Value loss: 1.060301. Entropy: 1.088289.
Iteration 3809: Policy loss: -0.236589. Value loss: 0.669370. Entropy: 1.086958.
Iteration 3810: Policy loss: -0.258032. Value loss: 0.449874. Entropy: 1.082039.
episode: 570   score: 45600.0  epsilon: 1.0    steps: 624  evaluation reward: 27328.0
episode: 571   score: 49100.0  epsilon: 1.0    ste

Iteration 3879: Policy loss: -0.008928. Value loss: 0.285542. Entropy: 1.084593.
Training network. lr: 0.000220. clip: 0.088176
Iteration 3880: Policy loss: 0.140047. Value loss: 0.634725. Entropy: 1.086322.
Iteration 3881: Policy loss: 0.087914. Value loss: 0.429755. Entropy: 1.086721.
Iteration 3882: Policy loss: 0.118094. Value loss: 0.275302. Entropy: 1.094600.
episode: 579   score: 34800.0  epsilon: 1.0    steps: 927  evaluation reward: 27720.0
Training network. lr: 0.000220. clip: 0.088176
Iteration 3883: Policy loss: 0.169284. Value loss: 0.662778. Entropy: 1.050506.
Iteration 3884: Policy loss: 0.206316. Value loss: 0.377437. Entropy: 1.039271.
Iteration 3885: Policy loss: 0.181950. Value loss: 0.292059. Entropy: 1.032810.
Training network. lr: 0.000220. clip: 0.088176
Iteration 3886: Policy loss: 0.318906. Value loss: 0.698854. Entropy: 1.032587.
Iteration 3887: Policy loss: 0.296979. Value loss: 0.373250. Entropy: 1.054263.
Iteration 3888: Policy loss: 0.335117. Value loss: 0

Iteration 3954: Policy loss: 0.111986. Value loss: 0.106153. Entropy: 1.083664.
Training network. lr: 0.000220. clip: 0.087872
Iteration 3955: Policy loss: -0.196754. Value loss: 0.801049. Entropy: 1.102081.
Iteration 3956: Policy loss: -0.137908. Value loss: 0.329107. Entropy: 1.094177.
Iteration 3957: Policy loss: -0.202732. Value loss: 0.187288. Entropy: 1.077827.
Training network. lr: 0.000220. clip: 0.087872
Iteration 3958: Policy loss: -0.053105. Value loss: 1.566743. Entropy: 1.065360.
Iteration 3959: Policy loss: -0.054695. Value loss: 0.929113. Entropy: 1.068293.
Iteration 3960: Policy loss: -0.106080. Value loss: 0.664869. Entropy: 1.046936.
Training network. lr: 0.000220. clip: 0.087872
Iteration 3961: Policy loss: 0.069732. Value loss: 1.128420. Entropy: 1.105149.
Iteration 3962: Policy loss: 0.071991. Value loss: 0.546028. Entropy: 1.099147.
Iteration 3963: Policy loss: 0.067401. Value loss: 0.397813. Entropy: 1.083345.
episode: 591   score: 29500.0  epsilon: 1.0    steps:

Training network. lr: 0.000219. clip: 0.087715
Iteration 4030: Policy loss: 0.062008. Value loss: 0.473552. Entropy: 1.127022.
Iteration 4031: Policy loss: -0.003083. Value loss: 0.382704. Entropy: 1.140020.
Iteration 4032: Policy loss: 0.033040. Value loss: 0.228770. Entropy: 1.128316.
episode: 602   score: 24300.0  epsilon: 1.0    steps: 512  evaluation reward: 28158.0
Training network. lr: 0.000219. clip: 0.087715
Iteration 4033: Policy loss: 0.333680. Value loss: 0.482069. Entropy: 1.141605.
Iteration 4034: Policy loss: 0.315021. Value loss: 0.349893. Entropy: 1.153398.
Iteration 4035: Policy loss: 0.275853. Value loss: 0.246373. Entropy: 1.152773.
Training network. lr: 0.000219. clip: 0.087715
Iteration 4036: Policy loss: 0.000264. Value loss: 0.832212. Entropy: 1.141812.
Iteration 4037: Policy loss: 0.027469. Value loss: 0.431358. Entropy: 1.138044.
Iteration 4038: Policy loss: -0.008876. Value loss: 0.336070. Entropy: 1.139063.
episode: 603   score: 36900.0  epsilon: 1.0    step

Iteration 4103: Policy loss: 0.092447. Value loss: 0.260922. Entropy: 1.163957.
Iteration 4104: Policy loss: 0.108180. Value loss: 0.149201. Entropy: 1.156279.
Training network. lr: 0.000219. clip: 0.087411
Iteration 4105: Policy loss: 0.052432. Value loss: 0.515970. Entropy: 1.102313.
Iteration 4106: Policy loss: 0.058903. Value loss: 0.344111. Entropy: 1.112785.
Iteration 4107: Policy loss: 0.021631. Value loss: 0.277918. Entropy: 1.101958.
Training network. lr: 0.000219. clip: 0.087411
Iteration 4108: Policy loss: -0.109772. Value loss: 0.482424. Entropy: 1.125556.
Iteration 4109: Policy loss: -0.114593. Value loss: 0.342116. Entropy: 1.128321.
Iteration 4110: Policy loss: -0.106355. Value loss: 0.212669. Entropy: 1.120577.
Training network. lr: 0.000219. clip: 0.087411
Iteration 4111: Policy loss: 0.066244. Value loss: 0.593169. Entropy: 1.113478.
Iteration 4112: Policy loss: 0.083833. Value loss: 0.332293. Entropy: 1.122141.
Iteration 4113: Policy loss: 0.016440. Value loss: 0.211

episode: 626   score: 25400.0  epsilon: 1.0    steps: 981  evaluation reward: 28135.0
Training network. lr: 0.000218. clip: 0.087254
Iteration 4180: Policy loss: 0.036133. Value loss: 0.520589. Entropy: 1.122883.
Iteration 4181: Policy loss: 0.050316. Value loss: 0.305474. Entropy: 1.128691.
Iteration 4182: Policy loss: 0.090105. Value loss: 0.195058. Entropy: 1.131243.
Training network. lr: 0.000218. clip: 0.087254
Iteration 4183: Policy loss: -0.125210. Value loss: 1.214803. Entropy: 1.134451.
Iteration 4184: Policy loss: -0.122177. Value loss: 0.625850. Entropy: 1.106438.
Iteration 4185: Policy loss: -0.150549. Value loss: 0.491496. Entropy: 1.105076.
Training network. lr: 0.000218. clip: 0.087254
Iteration 4186: Policy loss: 0.116679. Value loss: 0.978884. Entropy: 1.120719.
Iteration 4187: Policy loss: 0.106399. Value loss: 0.542850. Entropy: 1.131958.
Iteration 4188: Policy loss: 0.078664. Value loss: 0.352374. Entropy: 1.137312.
Training network. lr: 0.000218. clip: 0.087254
Ite

Training network. lr: 0.000217. clip: 0.086950
Iteration 4255: Policy loss: -0.279844. Value loss: 1.335636. Entropy: 1.115635.
Iteration 4256: Policy loss: -0.201880. Value loss: 0.695058. Entropy: 1.110380.
Iteration 4257: Policy loss: -0.288720. Value loss: 0.534488. Entropy: 1.111978.
Training network. lr: 0.000217. clip: 0.086950
Iteration 4258: Policy loss: -0.004249. Value loss: 1.359044. Entropy: 1.101411.
Iteration 4259: Policy loss: 0.000316. Value loss: 0.796715. Entropy: 1.082685.
Iteration 4260: Policy loss: -0.021902. Value loss: 0.714413. Entropy: 1.077582.
Training network. lr: 0.000217. clip: 0.086950
Iteration 4261: Policy loss: 0.128722. Value loss: 0.851320. Entropy: 1.106924.
Iteration 4262: Policy loss: 0.117265. Value loss: 0.551635. Entropy: 1.103413.
Iteration 4263: Policy loss: 0.101724. Value loss: 0.399320. Entropy: 1.095466.
Training network. lr: 0.000217. clip: 0.086950
Iteration 4264: Policy loss: 0.230244. Value loss: 0.842521. Entropy: 1.053165.
Iterati

Training network. lr: 0.000217. clip: 0.086793
Iteration 4330: Policy loss: -0.119524. Value loss: 0.849046. Entropy: 1.102532.
Iteration 4331: Policy loss: -0.120906. Value loss: 0.317417. Entropy: 1.101691.
Iteration 4332: Policy loss: -0.096816. Value loss: 0.201645. Entropy: 1.092111.
episode: 650   score: 24400.0  epsilon: 1.0    steps: 341  evaluation reward: 28240.0
Training network. lr: 0.000217. clip: 0.086793
Iteration 4333: Policy loss: -0.018615. Value loss: 0.759181. Entropy: 1.094937.
Iteration 4334: Policy loss: -0.012893. Value loss: 0.449422. Entropy: 1.107224.
Iteration 4335: Policy loss: -0.027226. Value loss: 0.328162. Entropy: 1.101365.
Training network. lr: 0.000217. clip: 0.086793
Iteration 4336: Policy loss: -0.133443. Value loss: 1.724385. Entropy: 1.058991.
Iteration 4337: Policy loss: -0.138062. Value loss: 1.088379. Entropy: 1.062992.
Iteration 4338: Policy loss: -0.169115. Value loss: 0.955262. Entropy: 1.054096.
Training network. lr: 0.000217. clip: 0.0867

episode: 661   score: 26500.0  epsilon: 1.0    steps: 544  evaluation reward: 28457.0
Training network. lr: 0.000216. clip: 0.086489
Iteration 4405: Policy loss: -0.007167. Value loss: 0.643224. Entropy: 1.031776.
Iteration 4406: Policy loss: 0.032563. Value loss: 0.364340. Entropy: 1.059524.
Iteration 4407: Policy loss: 0.022208. Value loss: 0.322895. Entropy: 1.047518.
Training network. lr: 0.000216. clip: 0.086489
Iteration 4408: Policy loss: -0.064481. Value loss: 0.949310. Entropy: 1.065447.
Iteration 4409: Policy loss: -0.088345. Value loss: 0.425317. Entropy: 1.081332.
Iteration 4410: Policy loss: -0.073061. Value loss: 0.270412. Entropy: 1.068838.
Training network. lr: 0.000216. clip: 0.086489
Iteration 4411: Policy loss: -0.086902. Value loss: 1.763364. Entropy: 1.063233.
Iteration 4412: Policy loss: -0.027460. Value loss: 0.859012. Entropy: 1.051341.
Iteration 4413: Policy loss: -0.105050. Value loss: 0.598819. Entropy: 1.040894.
Training network. lr: 0.000216. clip: 0.086489

Iteration 4482: Policy loss: -0.059200. Value loss: 0.284508. Entropy: 1.092640.
Training network. lr: 0.000216. clip: 0.086333
Iteration 4483: Policy loss: 0.006082. Value loss: 1.250286. Entropy: 1.128253.
Iteration 4484: Policy loss: -0.034237. Value loss: 0.631900. Entropy: 1.115809.
Iteration 4485: Policy loss: -0.043101. Value loss: 0.425580. Entropy: 1.111485.
Training network. lr: 0.000216. clip: 0.086333
Iteration 4486: Policy loss: -0.091405. Value loss: 1.120953. Entropy: 1.127783.
Iteration 4487: Policy loss: -0.125644. Value loss: 0.473265. Entropy: 1.139924.
Iteration 4488: Policy loss: -0.119485. Value loss: 0.372202. Entropy: 1.127324.
episode: 671   score: 29900.0  epsilon: 1.0    steps: 36  evaluation reward: 28933.0
Training network. lr: 0.000216. clip: 0.086333
Iteration 4489: Policy loss: -0.039737. Value loss: 0.548342. Entropy: 1.106377.
Iteration 4490: Policy loss: -0.027378. Value loss: 0.359036. Entropy: 1.096623.
Iteration 4491: Policy loss: -0.065198. Value 

Training network. lr: 0.000215. clip: 0.086029
Iteration 4558: Policy loss: -0.208074. Value loss: 0.800872. Entropy: 1.172445.
Iteration 4559: Policy loss: -0.236447. Value loss: 0.428718. Entropy: 1.154552.
Iteration 4560: Policy loss: -0.226283. Value loss: 0.267164. Entropy: 1.160282.
Training network. lr: 0.000215. clip: 0.086029
Iteration 4561: Policy loss: 0.027930. Value loss: 0.672889. Entropy: 1.115562.
Iteration 4562: Policy loss: -0.004204. Value loss: 0.369541. Entropy: 1.121531.
Iteration 4563: Policy loss: 0.007547. Value loss: 0.303566. Entropy: 1.121399.
episode: 682   score: 27400.0  epsilon: 1.0    steps: 325  evaluation reward: 28474.0
Training network. lr: 0.000215. clip: 0.086029
Iteration 4564: Policy loss: -0.060669. Value loss: 0.930956. Entropy: 1.157977.
Iteration 4565: Policy loss: -0.065589. Value loss: 0.574457. Entropy: 1.149043.
Iteration 4566: Policy loss: 0.004514. Value loss: 0.369326. Entropy: 1.152557.
Training network. lr: 0.000215. clip: 0.086029


Training network. lr: 0.000215. clip: 0.085872
Iteration 4636: Policy loss: 0.244551. Value loss: 0.484020. Entropy: 1.238853.
Iteration 4637: Policy loss: 0.236093. Value loss: 0.267603. Entropy: 1.240899.
Iteration 4638: Policy loss: 0.236917. Value loss: 0.194456. Entropy: 1.240734.
episode: 691   score: 30500.0  epsilon: 1.0    steps: 120  evaluation reward: 28480.0
episode: 692   score: 44500.0  epsilon: 1.0    steps: 365  evaluation reward: 28627.0
Training network. lr: 0.000215. clip: 0.085872
Iteration 4639: Policy loss: 0.038872. Value loss: 0.618380. Entropy: 1.222203.
Iteration 4640: Policy loss: 0.069948. Value loss: 0.341436. Entropy: 1.227862.
Iteration 4641: Policy loss: 0.069993. Value loss: 0.275392. Entropy: 1.228507.
Training network. lr: 0.000215. clip: 0.085872
Iteration 4642: Policy loss: -0.013803. Value loss: 0.898877. Entropy: 1.239783.
Iteration 4643: Policy loss: -0.086342. Value loss: 0.415181. Entropy: 1.229756.
Iteration 4644: Policy loss: -0.018507. Value

Iteration 4711: Policy loss: -0.056811. Value loss: 0.412187. Entropy: 1.198659.
Iteration 4712: Policy loss: -0.060691. Value loss: 0.235401. Entropy: 1.198256.
Iteration 4713: Policy loss: -0.063846. Value loss: 0.190044. Entropy: 1.195134.
episode: 703   score: 41700.0  epsilon: 1.0    steps: 344  evaluation reward: 29414.0
Training network. lr: 0.000214. clip: 0.085568
Iteration 4714: Policy loss: 0.002959. Value loss: 0.429468. Entropy: 1.194244.
Iteration 4715: Policy loss: -0.025085. Value loss: 0.165336. Entropy: 1.187365.
Iteration 4716: Policy loss: 0.001516. Value loss: 0.129018. Entropy: 1.180588.
Training network. lr: 0.000214. clip: 0.085568
Iteration 4717: Policy loss: 0.004695. Value loss: 0.216897. Entropy: 1.208035.
Iteration 4718: Policy loss: -0.017867. Value loss: 0.139629. Entropy: 1.216308.
Iteration 4719: Policy loss: -0.001699. Value loss: 0.083694. Entropy: 1.215089.
Training network. lr: 0.000214. clip: 0.085568
Iteration 4720: Policy loss: -0.073639. Value l

Iteration 4788: Policy loss: 0.139367. Value loss: 0.157438. Entropy: 1.125319.
Training network. lr: 0.000214. clip: 0.085411
Iteration 4789: Policy loss: -0.062635. Value loss: 0.869327. Entropy: 1.163413.
Iteration 4790: Policy loss: -0.018969. Value loss: 0.406428. Entropy: 1.169854.
Iteration 4791: Policy loss: -0.068131. Value loss: 0.317024. Entropy: 1.165306.
Training network. lr: 0.000214. clip: 0.085411
Iteration 4792: Policy loss: -0.032159. Value loss: 0.987094. Entropy: 1.112706.
Iteration 4793: Policy loss: -0.110903. Value loss: 0.708157. Entropy: 1.103489.
Iteration 4794: Policy loss: -0.064060. Value loss: 0.356928. Entropy: 1.119068.
episode: 713   score: 30600.0  epsilon: 1.0    steps: 97  evaluation reward: 29778.0
Training network. lr: 0.000214. clip: 0.085411
Iteration 4795: Policy loss: -0.098515. Value loss: 0.899553. Entropy: 1.116352.
Iteration 4796: Policy loss: -0.038586. Value loss: 0.450460. Entropy: 1.113519.
Iteration 4797: Policy loss: -0.075535. Value 

Training network. lr: 0.000213. clip: 0.085107
Iteration 4867: Policy loss: 0.038861. Value loss: 0.621522. Entropy: 1.130331.
Iteration 4868: Policy loss: 0.029935. Value loss: 0.386307. Entropy: 1.142158.
Iteration 4869: Policy loss: 0.082496. Value loss: 0.243505. Entropy: 1.132458.
Training network. lr: 0.000213. clip: 0.085107
Iteration 4870: Policy loss: -0.054920. Value loss: 1.277164. Entropy: 1.211344.
Iteration 4871: Policy loss: -0.059547. Value loss: 0.726840. Entropy: 1.189718.
Iteration 4872: Policy loss: -0.123422. Value loss: 0.470341. Entropy: 1.196036.
episode: 721   score: 25900.0  epsilon: 1.0    steps: 757  evaluation reward: 30898.0
Training network. lr: 0.000213. clip: 0.085107
Iteration 4873: Policy loss: 0.090350. Value loss: 0.997438. Entropy: 1.142115.
Iteration 4874: Policy loss: 0.058185. Value loss: 0.652209. Entropy: 1.147765.
Iteration 4875: Policy loss: 0.073762. Value loss: 0.506642. Entropy: 1.143057.
episode: 722   score: 50600.0  epsilon: 1.0    ste

Iteration 4943: Policy loss: -0.092089. Value loss: 0.452187. Entropy: 1.147913.
Iteration 4944: Policy loss: -0.061450. Value loss: 0.310212. Entropy: 1.136126.
episode: 731   score: 43800.0  epsilon: 1.0    steps: 707  evaluation reward: 31210.0
Training network. lr: 0.000212. clip: 0.084950
Iteration 4945: Policy loss: 0.081173. Value loss: 0.828256. Entropy: 1.138924.
Iteration 4946: Policy loss: 0.067868. Value loss: 0.417923. Entropy: 1.127963.
Iteration 4947: Policy loss: 0.103570. Value loss: 0.303401. Entropy: 1.135338.
episode: 732   score: 34800.0  epsilon: 1.0    steps: 266  evaluation reward: 31375.0
Training network. lr: 0.000212. clip: 0.084950
Iteration 4948: Policy loss: 0.292579. Value loss: 0.757468. Entropy: 1.134411.
Iteration 4949: Policy loss: 0.270711. Value loss: 0.449073. Entropy: 1.142097.
Iteration 4950: Policy loss: 0.256703. Value loss: 0.301155. Entropy: 1.145242.
Training network. lr: 0.000212. clip: 0.084803
Iteration 4951: Policy loss: -0.052625. Value

Iteration 5019: Policy loss: 0.037561. Value loss: 0.533519. Entropy: 1.070105.
Training network. lr: 0.000212. clip: 0.084646
Iteration 5020: Policy loss: -0.262091. Value loss: 1.159897. Entropy: 1.078388.
Iteration 5021: Policy loss: -0.339104. Value loss: 0.786278. Entropy: 1.075808.
Iteration 5022: Policy loss: -0.320740. Value loss: 0.477804. Entropy: 1.069033.
Training network. lr: 0.000212. clip: 0.084646
Iteration 5023: Policy loss: 0.330046. Value loss: 0.707342. Entropy: 1.085807.
Iteration 5024: Policy loss: 0.337082. Value loss: 0.316367. Entropy: 1.096761.
Iteration 5025: Policy loss: 0.334336. Value loss: 0.200332. Entropy: 1.101351.
episode: 742   score: 21200.0  epsilon: 1.0    steps: 410  evaluation reward: 32090.0
Training network. lr: 0.000212. clip: 0.084646
Iteration 5026: Policy loss: 0.305132. Value loss: 0.396239. Entropy: 1.128676.
Iteration 5027: Policy loss: 0.304419. Value loss: 0.165132. Entropy: 1.138643.
Iteration 5028: Policy loss: 0.287566. Value loss:

Iteration 5095: Policy loss: -0.114062. Value loss: 1.432249. Entropy: 1.136930.
Iteration 5096: Policy loss: -0.090358. Value loss: 0.852890. Entropy: 1.152947.
Iteration 5097: Policy loss: -0.103264. Value loss: 0.706820. Entropy: 1.129220.
Training network. lr: 0.000211. clip: 0.084489
Iteration 5098: Policy loss: 0.254765. Value loss: 0.688431. Entropy: 1.151199.
Iteration 5099: Policy loss: 0.259426. Value loss: 0.503540. Entropy: 1.156938.
Iteration 5100: Policy loss: 0.256280. Value loss: 0.391309. Entropy: 1.160494.
Training network. lr: 0.000211. clip: 0.084342
Iteration 5101: Policy loss: -0.006407. Value loss: 1.076880. Entropy: 1.146526.
Iteration 5102: Policy loss: 0.008622. Value loss: 0.757280. Entropy: 1.157899.
Iteration 5103: Policy loss: -0.010540. Value loss: 0.646124. Entropy: 1.158025.
Training network. lr: 0.000211. clip: 0.084342
Iteration 5104: Policy loss: 0.067614. Value loss: 0.498386. Entropy: 1.168706.
Iteration 5105: Policy loss: 0.084161. Value loss: 0.3

Iteration 5172: Policy loss: -0.032734. Value loss: 0.569650. Entropy: 1.175471.
Training network. lr: 0.000210. clip: 0.084185
Iteration 5173: Policy loss: 0.161275. Value loss: 0.464249. Entropy: 1.227490.
Iteration 5174: Policy loss: 0.162361. Value loss: 0.334064. Entropy: 1.230841.
Iteration 5175: Policy loss: 0.160037. Value loss: 0.244383. Entropy: 1.231335.
episode: 762   score: 46700.0  epsilon: 1.0    steps: 99  evaluation reward: 32735.0
Training network. lr: 0.000210. clip: 0.084185
Iteration 5176: Policy loss: -0.194121. Value loss: 0.735314. Entropy: 1.169074.
Iteration 5177: Policy loss: -0.167146. Value loss: 0.378832. Entropy: 1.171000.
Iteration 5178: Policy loss: -0.196828. Value loss: 0.313830. Entropy: 1.166826.
episode: 763   score: 30200.0  epsilon: 1.0    steps: 369  evaluation reward: 32554.0
Training network. lr: 0.000210. clip: 0.084185
Iteration 5179: Policy loss: -0.051803. Value loss: 0.790044. Entropy: 1.175574.
Iteration 5180: Policy loss: -0.056756. Val

Iteration 5250: Policy loss: 0.057554. Value loss: 0.165617. Entropy: 1.191579.
episode: 771   score: 42600.0  epsilon: 1.0    steps: 344  evaluation reward: 32739.0
Training network. lr: 0.000210. clip: 0.083881
Iteration 5251: Policy loss: -0.176386. Value loss: 2.442615. Entropy: 1.177966.
Iteration 5252: Policy loss: -0.218175. Value loss: 1.381222. Entropy: 1.156469.
Iteration 5253: Policy loss: -0.215463. Value loss: 0.891128. Entropy: 1.151136.
Training network. lr: 0.000210. clip: 0.083881
Iteration 5254: Policy loss: -0.032175. Value loss: 0.990446. Entropy: 1.202768.
Iteration 5255: Policy loss: -0.003339. Value loss: 0.549677. Entropy: 1.214632.
Iteration 5256: Policy loss: -0.009511. Value loss: 0.325392. Entropy: 1.217805.
Training network. lr: 0.000210. clip: 0.083881
Iteration 5257: Policy loss: -0.027243. Value loss: 1.107647. Entropy: 1.134105.
Iteration 5258: Policy loss: -0.046292. Value loss: 0.708133. Entropy: 1.117104.
Iteration 5259: Policy loss: -0.060216. Value

Iteration 5326: Policy loss: -0.143141. Value loss: 1.293199. Entropy: 1.092596.
Iteration 5327: Policy loss: -0.150257. Value loss: 0.924181. Entropy: 1.086260.
Iteration 5328: Policy loss: -0.165784. Value loss: 0.768610. Entropy: 1.073634.
Training network. lr: 0.000209. clip: 0.083725
Iteration 5329: Policy loss: -0.526133. Value loss: 1.664281. Entropy: 1.147598.
Iteration 5330: Policy loss: -0.531290. Value loss: 0.895618. Entropy: 1.143586.
Iteration 5331: Policy loss: -0.476645. Value loss: 0.523634. Entropy: 1.138623.
Training network. lr: 0.000209. clip: 0.083725
Iteration 5332: Policy loss: -0.125384. Value loss: 0.964801. Entropy: 1.164968.
Iteration 5333: Policy loss: -0.118979. Value loss: 0.771569. Entropy: 1.150846.
Iteration 5334: Policy loss: -0.120942. Value loss: 0.528990. Entropy: 1.158107.
Training network. lr: 0.000209. clip: 0.083725
Iteration 5335: Policy loss: -0.187013. Value loss: 1.602743. Entropy: 1.079119.
Iteration 5336: Policy loss: -0.158155. Value los

Iteration 5403: Policy loss: -0.028046. Value loss: 0.356707. Entropy: 1.115180.
Training network. lr: 0.000209. clip: 0.083420
Iteration 5404: Policy loss: -0.011042. Value loss: 0.364152. Entropy: 1.168497.
Iteration 5405: Policy loss: -0.028909. Value loss: 0.260505. Entropy: 1.169544.
Iteration 5406: Policy loss: -0.010807. Value loss: 0.193703. Entropy: 1.171744.
episode: 791   score: 33800.0  epsilon: 1.0    steps: 485  evaluation reward: 33937.0
Training network. lr: 0.000209. clip: 0.083420
Iteration 5407: Policy loss: -0.024709. Value loss: 0.603315. Entropy: 1.136571.
Iteration 5408: Policy loss: -0.010749. Value loss: 0.265994. Entropy: 1.123792.
Iteration 5409: Policy loss: -0.026701. Value loss: 0.219575. Entropy: 1.125273.
Training network. lr: 0.000209. clip: 0.083420
Iteration 5410: Policy loss: -0.072196. Value loss: 1.108140. Entropy: 1.137570.
Iteration 5411: Policy loss: -0.049689. Value loss: 0.444699. Entropy: 1.133807.
Iteration 5412: Policy loss: -0.043953. Valu

Iteration 5479: Policy loss: -0.215141. Value loss: 0.522562. Entropy: 1.118676.
Iteration 5480: Policy loss: -0.203320. Value loss: 0.272529. Entropy: 1.114895.
Iteration 5481: Policy loss: -0.201049. Value loss: 0.200822. Entropy: 1.114855.
Training network. lr: 0.000208. clip: 0.083264
Iteration 5482: Policy loss: 0.031878. Value loss: 1.291847. Entropy: 1.080082.
Iteration 5483: Policy loss: 0.060448. Value loss: 0.662827. Entropy: 1.074182.
Iteration 5484: Policy loss: 0.028044. Value loss: 0.429723. Entropy: 1.047009.
Training network. lr: 0.000208. clip: 0.083264
Iteration 5485: Policy loss: -0.241934. Value loss: 0.729153. Entropy: 1.101115.
Iteration 5486: Policy loss: -0.202557. Value loss: 0.487262. Entropy: 1.085232.
Iteration 5487: Policy loss: -0.218103. Value loss: 0.336040. Entropy: 1.083844.
now time :  2019-02-26 20:09:54.903983
episode: 801   score: 14500.0  epsilon: 1.0    steps: 556  evaluation reward: 33835.0
episode: 802   score: 29800.0  epsilon: 1.0    steps: 9

Iteration 5556: Policy loss: 0.490075. Value loss: 0.219239. Entropy: 1.154986.
episode: 811   score: 17600.0  epsilon: 1.0    steps: 679  evaluation reward: 34638.0
Training network. lr: 0.000207. clip: 0.082960
Iteration 5557: Policy loss: 0.025185. Value loss: 0.335948. Entropy: 1.116311.
Iteration 5558: Policy loss: 0.025869. Value loss: 0.176324. Entropy: 1.112301.
Iteration 5559: Policy loss: -0.003054. Value loss: 0.118502. Entropy: 1.106271.
episode: 812   score: 28900.0  epsilon: 1.0    steps: 261  evaluation reward: 34537.0
Training network. lr: 0.000207. clip: 0.082960
Iteration 5560: Policy loss: -0.293705. Value loss: 1.393889. Entropy: 1.134353.
Iteration 5561: Policy loss: -0.313293. Value loss: 0.765790. Entropy: 1.143723.
Iteration 5562: Policy loss: -0.285003. Value loss: 0.512486. Entropy: 1.147658.
Training network. lr: 0.000207. clip: 0.082960
Iteration 5563: Policy loss: -0.090365. Value loss: 0.811113. Entropy: 1.112207.
Iteration 5564: Policy loss: -0.129201. Va

Iteration 5631: Policy loss: 0.051458. Value loss: 0.169905. Entropy: 1.088507.
episode: 823   score: 41000.0  epsilon: 1.0    steps: 633  evaluation reward: 33344.0
Training network. lr: 0.000207. clip: 0.082803
Iteration 5632: Policy loss: 0.070637. Value loss: 1.583015. Entropy: 1.048135.
Iteration 5633: Policy loss: 0.095985. Value loss: 0.829816. Entropy: 1.059310.
Iteration 5634: Policy loss: 0.097844. Value loss: 0.534297. Entropy: 1.055916.
Training network. lr: 0.000207. clip: 0.082803
Iteration 5635: Policy loss: -0.112659. Value loss: 0.920629. Entropy: 1.094232.
Iteration 5636: Policy loss: -0.064665. Value loss: 0.437256. Entropy: 1.085105.
Iteration 5637: Policy loss: -0.129589. Value loss: 0.316276. Entropy: 1.092458.
Training network. lr: 0.000207. clip: 0.082803
Iteration 5638: Policy loss: 0.150743. Value loss: 0.583428. Entropy: 1.040565.
Iteration 5639: Policy loss: 0.130505. Value loss: 0.361103. Entropy: 1.051811.
Iteration 5640: Policy loss: 0.142545. Value loss:

Iteration 5706: Policy loss: -0.103535. Value loss: 0.270288. Entropy: 1.106112.
Training network. lr: 0.000206. clip: 0.082499
Iteration 5707: Policy loss: 0.325904. Value loss: 0.312015. Entropy: 1.115708.
Iteration 5708: Policy loss: 0.316614. Value loss: 0.173168. Entropy: 1.116864.
Iteration 5709: Policy loss: 0.303826. Value loss: 0.139488. Entropy: 1.117116.
Training network. lr: 0.000206. clip: 0.082499
Iteration 5710: Policy loss: 0.085107. Value loss: 0.566993. Entropy: 1.104391.
Iteration 5711: Policy loss: 0.066095. Value loss: 0.498787. Entropy: 1.107846.
Iteration 5712: Policy loss: 0.085322. Value loss: 0.318999. Entropy: 1.107889.
episode: 835   score: 22200.0  epsilon: 1.0    steps: 1015  evaluation reward: 32424.0
Training network. lr: 0.000206. clip: 0.082499
Iteration 5713: Policy loss: -0.116215. Value loss: 1.299280. Entropy: 1.103169.
Iteration 5714: Policy loss: -0.153227. Value loss: 0.865070. Entropy: 1.108649.
Iteration 5715: Policy loss: -0.153034. Value los

Training network. lr: 0.000206. clip: 0.082342
Iteration 5782: Policy loss: 0.202363. Value loss: 0.438283. Entropy: 1.157533.
Iteration 5783: Policy loss: 0.192721. Value loss: 0.188347. Entropy: 1.145007.
Iteration 5784: Policy loss: 0.169106. Value loss: 0.203726. Entropy: 1.148402.
Training network. lr: 0.000206. clip: 0.082342
Iteration 5785: Policy loss: -0.010327. Value loss: 0.590128. Entropy: 1.165140.
Iteration 5786: Policy loss: -0.048231. Value loss: 0.366950. Entropy: 1.156620.
Iteration 5787: Policy loss: -0.021784. Value loss: 0.232694. Entropy: 1.149928.
Training network. lr: 0.000206. clip: 0.082342
Iteration 5788: Policy loss: 0.193063. Value loss: 0.676510. Entropy: 1.096074.
Iteration 5789: Policy loss: 0.206285. Value loss: 0.266896. Entropy: 1.093204.
Iteration 5790: Policy loss: 0.158954. Value loss: 0.193221. Entropy: 1.082482.
Training network. lr: 0.000206. clip: 0.082342
Iteration 5791: Policy loss: -0.082673. Value loss: 1.400998. Entropy: 1.189420.
Iteratio

Training network. lr: 0.000205. clip: 0.082038
Iteration 5860: Policy loss: -0.225116. Value loss: 1.100953. Entropy: 1.077797.
Iteration 5861: Policy loss: -0.289214. Value loss: 0.682963. Entropy: 1.107350.
Iteration 5862: Policy loss: -0.239324. Value loss: 0.455948. Entropy: 1.082371.
episode: 854   score: 37600.0  epsilon: 1.0    steps: 524  evaluation reward: 32737.0
episode: 855   score: 27600.0  epsilon: 1.0    steps: 840  evaluation reward: 32718.0
Training network. lr: 0.000205. clip: 0.082038
Iteration 5863: Policy loss: 0.037964. Value loss: 1.296570. Entropy: 1.148157.
Iteration 5864: Policy loss: 0.082489. Value loss: 0.640603. Entropy: 1.139067.
Iteration 5865: Policy loss: 0.031120. Value loss: 0.476700. Entropy: 1.123661.
episode: 856   score: 28100.0  epsilon: 1.0    steps: 244  evaluation reward: 32770.0
Training network. lr: 0.000205. clip: 0.082038
Iteration 5866: Policy loss: -0.183930. Value loss: 0.829238. Entropy: 1.090066.
Iteration 5867: Policy loss: -0.21749

Training network. lr: 0.000205. clip: 0.081881
Iteration 5935: Policy loss: 0.016192. Value loss: 0.774485. Entropy: 1.169900.
Iteration 5936: Policy loss: -0.012922. Value loss: 0.474138. Entropy: 1.172968.
Iteration 5937: Policy loss: -0.001131. Value loss: 0.284595. Entropy: 1.177531.
Training network. lr: 0.000205. clip: 0.081881
Iteration 5938: Policy loss: -0.090318. Value loss: 1.209646. Entropy: 1.056587.
Iteration 5939: Policy loss: -0.056122. Value loss: 0.562115. Entropy: 1.055853.
Iteration 5940: Policy loss: -0.082652. Value loss: 0.459121. Entropy: 1.046865.
Training network. lr: 0.000205. clip: 0.081881
Iteration 5941: Policy loss: 0.171461. Value loss: 1.033739. Entropy: 1.074278.
Iteration 5942: Policy loss: 0.166778. Value loss: 0.600787. Entropy: 1.063504.
Iteration 5943: Policy loss: 0.137551. Value loss: 0.444375. Entropy: 1.066161.
Training network. lr: 0.000205. clip: 0.081881
Iteration 5944: Policy loss: 0.214805. Value loss: 1.082771. Entropy: 1.173148.
Iterati

Iteration 6013: Policy loss: -0.117059. Value loss: 0.827991. Entropy: 1.046931.
Iteration 6014: Policy loss: -0.134417. Value loss: 0.469038. Entropy: 1.036155.
Iteration 6015: Policy loss: -0.115967. Value loss: 0.289391. Entropy: 1.046048.
episode: 874   score: 37000.0  epsilon: 1.0    steps: 276  evaluation reward: 32094.0
Training network. lr: 0.000204. clip: 0.081577
Iteration 6016: Policy loss: -0.153657. Value loss: 0.931696. Entropy: 1.141420.
Iteration 6017: Policy loss: -0.119941. Value loss: 0.508276. Entropy: 1.139095.
Iteration 6018: Policy loss: -0.153635. Value loss: 0.396869. Entropy: 1.138508.
Training network. lr: 0.000204. clip: 0.081577
Iteration 6019: Policy loss: -0.096082. Value loss: 0.573958. Entropy: 1.048604.
Iteration 6020: Policy loss: -0.093662. Value loss: 0.291069. Entropy: 1.051747.
Iteration 6021: Policy loss: -0.097232. Value loss: 0.188955. Entropy: 1.055085.
Training network. lr: 0.000204. clip: 0.081577
Iteration 6022: Policy loss: 0.104713. Value

Iteration 6092: Policy loss: 0.353782. Value loss: 0.514127. Entropy: 1.084100.
Iteration 6093: Policy loss: 0.383479. Value loss: 0.368447. Entropy: 1.097146.
episode: 882   score: 31500.0  epsilon: 1.0    steps: 154  evaluation reward: 32192.0
Training network. lr: 0.000204. clip: 0.081421
Iteration 6094: Policy loss: 0.321758. Value loss: 0.511961. Entropy: 1.052170.
Iteration 6095: Policy loss: 0.341409. Value loss: 0.325020. Entropy: 1.063632.
Iteration 6096: Policy loss: 0.304946. Value loss: 0.271089. Entropy: 1.042453.
episode: 883   score: 53000.0  epsilon: 1.0    steps: 282  evaluation reward: 32272.0
episode: 884   score: 38900.0  epsilon: 1.0    steps: 529  evaluation reward: 32234.0
Training network. lr: 0.000204. clip: 0.081421
Iteration 6097: Policy loss: -0.004857. Value loss: 0.392503. Entropy: 1.093850.
Iteration 6098: Policy loss: 0.000824. Value loss: 0.347306. Entropy: 1.082381.
Iteration 6099: Policy loss: -0.035192. Value loss: 0.237301. Entropy: 1.078889.
Traini

Training network. lr: 0.000203. clip: 0.081116
Iteration 6166: Policy loss: -0.070401. Value loss: 0.473434. Entropy: 1.177494.
Iteration 6167: Policy loss: -0.079787. Value loss: 0.159167. Entropy: 1.177791.
Iteration 6168: Policy loss: -0.090421. Value loss: 0.098308. Entropy: 1.168463.
Training network. lr: 0.000203. clip: 0.081116
Iteration 6169: Policy loss: -0.050218. Value loss: 1.008685. Entropy: 1.147061.
Iteration 6170: Policy loss: -0.126086. Value loss: 0.640500. Entropy: 1.138812.
Iteration 6171: Policy loss: -0.069492. Value loss: 0.387050. Entropy: 1.144265.
Training network. lr: 0.000203. clip: 0.081116
Iteration 6172: Policy loss: 0.209801. Value loss: 1.051528. Entropy: 1.136035.
Iteration 6173: Policy loss: 0.198234. Value loss: 0.355424. Entropy: 1.127206.
Iteration 6174: Policy loss: 0.191685. Value loss: 0.229373. Entropy: 1.139608.
Training network. lr: 0.000203. clip: 0.081116
Iteration 6175: Policy loss: -0.028418. Value loss: 0.756608. Entropy: 1.153827.
Itera

Training network. lr: 0.000202. clip: 0.080960
Iteration 6241: Policy loss: -0.276309. Value loss: 0.764172. Entropy: 1.100023.
Iteration 6242: Policy loss: -0.242349. Value loss: 0.467242. Entropy: 1.083341.
Iteration 6243: Policy loss: -0.243307. Value loss: 0.337237. Entropy: 1.085103.
Training network. lr: 0.000202. clip: 0.080960
Iteration 6244: Policy loss: 0.125926. Value loss: 0.350966. Entropy: 1.134583.
Iteration 6245: Policy loss: 0.107495. Value loss: 0.164297. Entropy: 1.116386.
Iteration 6246: Policy loss: 0.095549. Value loss: 0.122343. Entropy: 1.135910.
Training network. lr: 0.000202. clip: 0.080960
Iteration 6247: Policy loss: -0.373056. Value loss: 1.517715. Entropy: 1.170803.
Iteration 6248: Policy loss: -0.444985. Value loss: 1.246134. Entropy: 1.163842.
Iteration 6249: Policy loss: -0.306915. Value loss: 0.670169. Entropy: 1.155510.
Training network. lr: 0.000202. clip: 0.080960
Iteration 6250: Policy loss: 0.119876. Value loss: 0.382165. Entropy: 1.153017.
Iterat

episode: 916   score: 20300.0  epsilon: 1.0    steps: 258  evaluation reward: 31481.0
episode: 917   score: 47800.0  epsilon: 1.0    steps: 774  evaluation reward: 31767.0
Training network. lr: 0.000202. clip: 0.080656
Iteration 6319: Policy loss: 0.067908. Value loss: 0.667906. Entropy: 1.154851.
Iteration 6320: Policy loss: 0.072750. Value loss: 0.298424. Entropy: 1.166412.
Iteration 6321: Policy loss: 0.044603. Value loss: 0.194032. Entropy: 1.166092.
Training network. lr: 0.000202. clip: 0.080656
Iteration 6322: Policy loss: 0.345460. Value loss: 0.333703. Entropy: 1.166439.
Iteration 6323: Policy loss: 0.328453. Value loss: 0.114002. Entropy: 1.184693.
Iteration 6324: Policy loss: 0.328371. Value loss: 0.085087. Entropy: 1.171472.
episode: 918   score: 11800.0  epsilon: 1.0    steps: 691  evaluation reward: 31568.0
Training network. lr: 0.000202. clip: 0.080656
Iteration 6325: Policy loss: -0.111930. Value loss: 0.797695. Entropy: 1.108183.
Iteration 6326: Policy loss: -0.112409. 

Iteration 6392: Policy loss: -0.214636. Value loss: 0.416371. Entropy: 1.147369.
Iteration 6393: Policy loss: -0.184168. Value loss: 0.284640. Entropy: 1.132016.
Training network. lr: 0.000201. clip: 0.080499
Iteration 6394: Policy loss: 0.072751. Value loss: 0.600379. Entropy: 1.191655.
Iteration 6395: Policy loss: 0.012871. Value loss: 0.208237. Entropy: 1.198964.
Iteration 6396: Policy loss: 0.030305. Value loss: 0.116559. Entropy: 1.197444.
Training network. lr: 0.000201. clip: 0.080499
Iteration 6397: Policy loss: -0.112198. Value loss: 0.472575. Entropy: 1.201774.
Iteration 6398: Policy loss: -0.089977. Value loss: 0.328861. Entropy: 1.221705.
Iteration 6399: Policy loss: -0.059370. Value loss: 0.173465. Entropy: 1.212104.
Training network. lr: 0.000201. clip: 0.080499
Iteration 6400: Policy loss: 0.037062. Value loss: 0.892852. Entropy: 1.162645.
Iteration 6401: Policy loss: -0.027954. Value loss: 0.703819. Entropy: 1.167317.
Iteration 6402: Policy loss: -0.016359. Value loss: 0

Iteration 6470: Policy loss: 0.064230. Value loss: 0.488151. Entropy: 1.145459.
Iteration 6471: Policy loss: -0.054368. Value loss: 0.276951. Entropy: 1.152268.
Training network. lr: 0.000200. clip: 0.080195
Iteration 6472: Policy loss: 0.369782. Value loss: 0.888252. Entropy: 1.149184.
Iteration 6473: Policy loss: 0.356980. Value loss: 0.621051. Entropy: 1.152915.
Iteration 6474: Policy loss: 0.379774. Value loss: 0.482272. Entropy: 1.149590.
Training network. lr: 0.000200. clip: 0.080195
Iteration 6475: Policy loss: -0.109686. Value loss: 0.737340. Entropy: 1.084069.
Iteration 6476: Policy loss: -0.098625. Value loss: 0.494627. Entropy: 1.079803.
Iteration 6477: Policy loss: -0.136709. Value loss: 0.359865. Entropy: 1.083705.
Training network. lr: 0.000200. clip: 0.080195
Iteration 6478: Policy loss: 0.061564. Value loss: 0.831708. Entropy: 1.158184.
Iteration 6479: Policy loss: 0.036211. Value loss: 0.628933. Entropy: 1.167917.
Iteration 6480: Policy loss: 0.014272. Value loss: 0.44

Training network. lr: 0.000200. clip: 0.080038
Iteration 6547: Policy loss: -0.017502. Value loss: 1.359119. Entropy: 1.133713.
Iteration 6548: Policy loss: -0.010406. Value loss: 0.771149. Entropy: 1.130352.
Iteration 6549: Policy loss: 0.008115. Value loss: 0.558355. Entropy: 1.146599.
Training network. lr: 0.000200. clip: 0.080038
Iteration 6550: Policy loss: -0.083794. Value loss: 1.029234. Entropy: 1.132984.
Iteration 6551: Policy loss: -0.087656. Value loss: 0.718573. Entropy: 1.132409.
Iteration 6552: Policy loss: -0.108484. Value loss: 0.533028. Entropy: 1.135676.
Training network. lr: 0.000200. clip: 0.079881
Iteration 6553: Policy loss: 0.082345. Value loss: 1.243909. Entropy: 1.102263.
Iteration 6554: Policy loss: 0.077087. Value loss: 0.880990. Entropy: 1.108219.
Iteration 6555: Policy loss: 0.075702. Value loss: 0.668789. Entropy: 1.122651.
episode: 949   score: 38400.0  epsilon: 1.0    steps: 222  evaluation reward: 31176.0
episode: 950   score: 35900.0  epsilon: 1.0    s

Iteration 6620: Policy loss: 0.232809. Value loss: 0.411785. Entropy: 1.060815.
Iteration 6621: Policy loss: 0.242211. Value loss: 0.342832. Entropy: 1.063287.
Training network. lr: 0.000199. clip: 0.079734
Iteration 6622: Policy loss: 0.054376. Value loss: 0.661655. Entropy: 1.133461.
Iteration 6623: Policy loss: 0.028386. Value loss: 0.270529. Entropy: 1.145089.
Iteration 6624: Policy loss: 0.003961. Value loss: 0.239692. Entropy: 1.125600.
Training network. lr: 0.000199. clip: 0.079734
Iteration 6625: Policy loss: -0.070295. Value loss: 0.874813. Entropy: 1.124169.
Iteration 6626: Policy loss: -0.143328. Value loss: 0.829472. Entropy: 1.121403.
Iteration 6627: Policy loss: -0.126518. Value loss: 0.547813. Entropy: 1.140478.
episode: 962   score: 20300.0  epsilon: 1.0    steps: 83  evaluation reward: 30120.0
episode: 963   score: 33200.0  epsilon: 1.0    steps: 616  evaluation reward: 30123.0
Training network. lr: 0.000199. clip: 0.079734
Iteration 6628: Policy loss: -0.003015. Value

Iteration 6695: Policy loss: 0.084654. Value loss: 0.552639. Entropy: 1.072364.
Iteration 6696: Policy loss: 0.059758. Value loss: 0.321280. Entropy: 1.058305.
Training network. lr: 0.000199. clip: 0.079577
Iteration 6697: Policy loss: 0.260791. Value loss: 0.456013. Entropy: 1.074905.
Iteration 6698: Policy loss: 0.267301. Value loss: 0.288645. Entropy: 1.062818.
Iteration 6699: Policy loss: 0.260513. Value loss: 0.223174. Entropy: 1.073278.
episode: 974   score: 22000.0  epsilon: 1.0    steps: 940  evaluation reward: 29075.0
Training network. lr: 0.000199. clip: 0.079577
Iteration 6700: Policy loss: 0.218736. Value loss: 0.470393. Entropy: 1.134436.
Iteration 6701: Policy loss: 0.214163. Value loss: 0.287626. Entropy: 1.123629.
Iteration 6702: Policy loss: 0.194715. Value loss: 0.177782. Entropy: 1.133299.
episode: 975   score: 17700.0  epsilon: 1.0    steps: 265  evaluation reward: 28992.0
Training network. lr: 0.000199. clip: 0.079421
Iteration 6703: Policy loss: -0.076834. Value l

episode: 987   score: 17800.0  epsilon: 1.0    steps: 776  evaluation reward: 27295.0
Training network. lr: 0.000198. clip: 0.079273
Iteration 6769: Policy loss: 0.093072. Value loss: 0.456934. Entropy: 1.157195.
Iteration 6770: Policy loss: 0.098007. Value loss: 0.247792. Entropy: 1.173331.
Iteration 6771: Policy loss: 0.072268. Value loss: 0.158427. Entropy: 1.174210.
Training network. lr: 0.000198. clip: 0.079273
Iteration 6772: Policy loss: 0.000389. Value loss: 0.574401. Entropy: 1.163258.
Iteration 6773: Policy loss: -0.007683. Value loss: 0.279105. Entropy: 1.176600.
Iteration 6774: Policy loss: -0.053327. Value loss: 0.177378. Entropy: 1.181643.
Training network. lr: 0.000198. clip: 0.079273
Iteration 6775: Policy loss: 0.155663. Value loss: 0.609610. Entropy: 1.183794.
Iteration 6776: Policy loss: 0.160078. Value loss: 0.339955. Entropy: 1.177794.
Iteration 6777: Policy loss: 0.173852. Value loss: 0.267153. Entropy: 1.183997.
Training network. lr: 0.000198. clip: 0.079273
Iter

Training network. lr: 0.000198. clip: 0.079117
Iteration 6847: Policy loss: 0.123745. Value loss: 0.837246. Entropy: 1.127066.
Iteration 6848: Policy loss: 0.167589. Value loss: 0.391484. Entropy: 1.144711.
Iteration 6849: Policy loss: 0.130854. Value loss: 0.323997. Entropy: 1.140404.
episode: 996   score: 33900.0  epsilon: 1.0    steps: 1022  evaluation reward: 27218.0
Training network. lr: 0.000198. clip: 0.079117
Iteration 6850: Policy loss: 0.159855. Value loss: 0.791227. Entropy: 1.139255.
Iteration 6851: Policy loss: 0.148191. Value loss: 0.591334. Entropy: 1.145338.
Iteration 6852: Policy loss: 0.113402. Value loss: 0.400115. Entropy: 1.149098.
Training network. lr: 0.000197. clip: 0.078960
Iteration 6853: Policy loss: 0.342893. Value loss: 1.155650. Entropy: 1.109792.
Iteration 6854: Policy loss: 0.320504. Value loss: 0.588488. Entropy: 1.100813.
Iteration 6855: Policy loss: 0.334261. Value loss: 0.486994. Entropy: 1.097346.
episode: 997   score: 34900.0  epsilon: 1.0    steps

Iteration 6921: Policy loss: 0.261831. Value loss: 0.156036. Entropy: 1.180668.
Training network. lr: 0.000197. clip: 0.078812
Iteration 6922: Policy loss: 0.063967. Value loss: 0.616195. Entropy: 1.145516.
Iteration 6923: Policy loss: 0.022551. Value loss: 0.511570. Entropy: 1.172418.
Iteration 6924: Policy loss: 0.039770. Value loss: 0.320511. Entropy: 1.150019.
episode: 1008   score: 28600.0  epsilon: 1.0    steps: 24  evaluation reward: 27027.0
Training network. lr: 0.000197. clip: 0.078812
Iteration 6925: Policy loss: -0.067923. Value loss: 0.921857. Entropy: 1.171350.
Iteration 6926: Policy loss: -0.117180. Value loss: 0.811586. Entropy: 1.162359.
Iteration 6927: Policy loss: -0.131305. Value loss: 0.499087. Entropy: 1.149039.
Training network. lr: 0.000197. clip: 0.078812
Iteration 6928: Policy loss: -0.157854. Value loss: 0.576597. Entropy: 1.180602.
Iteration 6929: Policy loss: -0.151161. Value loss: 0.299494. Entropy: 1.167187.
Iteration 6930: Policy loss: -0.172358. Value lo

Iteration 6998: Policy loss: -0.033924. Value loss: 0.828534. Entropy: 1.196251.
Iteration 6999: Policy loss: -0.076134. Value loss: 0.566183. Entropy: 1.173096.
Training network. lr: 0.000197. clip: 0.078656
Iteration 7000: Policy loss: -0.139859. Value loss: 0.486841. Entropy: 1.194879.
Iteration 7001: Policy loss: -0.156561. Value loss: 0.353804. Entropy: 1.184019.
Iteration 7002: Policy loss: -0.195944. Value loss: 0.219800. Entropy: 1.204789.
Training network. lr: 0.000196. clip: 0.078499
Iteration 7003: Policy loss: -0.041114. Value loss: 1.040229. Entropy: 1.150540.
Iteration 7004: Policy loss: -0.005613. Value loss: 0.646470. Entropy: 1.144255.
Iteration 7005: Policy loss: -0.033253. Value loss: 0.490117. Entropy: 1.147538.
Training network. lr: 0.000196. clip: 0.078499
Iteration 7006: Policy loss: 0.099783. Value loss: 0.808493. Entropy: 1.164183.
Iteration 7007: Policy loss: 0.078811. Value loss: 0.490437. Entropy: 1.168264.
Iteration 7008: Policy loss: 0.106547. Value loss: 

Iteration 7075: Policy loss: 0.151174. Value loss: 0.879891. Entropy: 1.136018.
Iteration 7076: Policy loss: 0.165337. Value loss: 0.534485. Entropy: 1.143603.
Iteration 7077: Policy loss: 0.156011. Value loss: 0.368053. Entropy: 1.150043.
Training network. lr: 0.000196. clip: 0.078352
Iteration 7078: Policy loss: 0.084030. Value loss: 1.213414. Entropy: 1.132150.
Iteration 7079: Policy loss: 0.031283. Value loss: 0.929375. Entropy: 1.128940.
Iteration 7080: Policy loss: 0.061704. Value loss: 0.651871. Entropy: 1.126362.
episode: 1026   score: 63000.0  epsilon: 1.0    steps: 762  evaluation reward: 28954.0
Training network. lr: 0.000196. clip: 0.078352
Iteration 7081: Policy loss: -0.014712. Value loss: 0.927119. Entropy: 1.112557.
Iteration 7082: Policy loss: -0.009205. Value loss: 0.644444. Entropy: 1.104124.
Iteration 7083: Policy loss: 0.010014. Value loss: 0.466548. Entropy: 1.098956.
Training network. lr: 0.000196. clip: 0.078352
Iteration 7084: Policy loss: 0.103318. Value loss:

Iteration 7152: Policy loss: 0.052992. Value loss: 0.721159. Entropy: 1.129765.
Training network. lr: 0.000195. clip: 0.078038
Iteration 7153: Policy loss: -0.065479. Value loss: 1.456577. Entropy: 1.132248.
Iteration 7154: Policy loss: -0.075850. Value loss: 0.888786. Entropy: 1.136995.
Iteration 7155: Policy loss: -0.079478. Value loss: 0.616662. Entropy: 1.120664.
episode: 1036   score: 37200.0  epsilon: 1.0    steps: 651  evaluation reward: 30001.0
Training network. lr: 0.000195. clip: 0.078038
Iteration 7156: Policy loss: 0.109479. Value loss: 1.231833. Entropy: 1.149342.
Iteration 7157: Policy loss: 0.104042. Value loss: 0.864325. Entropy: 1.161755.
Iteration 7158: Policy loss: 0.081715. Value loss: 0.543296. Entropy: 1.156811.
Training network. lr: 0.000195. clip: 0.078038
Iteration 7159: Policy loss: 0.190540. Value loss: 0.726338. Entropy: 1.166544.
Iteration 7160: Policy loss: 0.176734. Value loss: 0.386643. Entropy: 1.157148.
Iteration 7161: Policy loss: 0.173731. Value loss

episode: 1047   score: 35400.0  epsilon: 1.0    steps: 779  evaluation reward: 30340.0
Training network. lr: 0.000195. clip: 0.077891
Iteration 7228: Policy loss: 0.064613. Value loss: 0.841761. Entropy: 1.076642.
Iteration 7229: Policy loss: 0.067544. Value loss: 0.480759. Entropy: 1.089976.
Iteration 7230: Policy loss: 0.041361. Value loss: 0.316838. Entropy: 1.088637.
Training network. lr: 0.000195. clip: 0.077891
Iteration 7231: Policy loss: 0.101976. Value loss: 0.618861. Entropy: 1.108069.
Iteration 7232: Policy loss: 0.107665. Value loss: 0.311894. Entropy: 1.108279.
Iteration 7233: Policy loss: 0.082815. Value loss: 0.227428. Entropy: 1.107452.
Training network. lr: 0.000195. clip: 0.077891
Iteration 7234: Policy loss: -0.008136. Value loss: 0.638625. Entropy: 1.131127.
Iteration 7235: Policy loss: -0.052134. Value loss: 0.385311. Entropy: 1.121407.
Iteration 7236: Policy loss: -0.010436. Value loss: 0.257566. Entropy: 1.128654.
Training network. lr: 0.000195. clip: 0.077891
It

Training network. lr: 0.000194. clip: 0.077577
Iteration 7306: Policy loss: 0.041836. Value loss: 0.704507. Entropy: 1.055706.
Iteration 7307: Policy loss: 0.029749. Value loss: 0.358226. Entropy: 1.048082.
Iteration 7308: Policy loss: 0.045918. Value loss: 0.269538. Entropy: 1.048931.
episode: 1055   score: 28800.0  epsilon: 1.0    steps: 439  evaluation reward: 30425.0
episode: 1056   score: 51200.0  epsilon: 1.0    steps: 802  evaluation reward: 30756.0
Training network. lr: 0.000194. clip: 0.077577
Iteration 7309: Policy loss: 0.255406. Value loss: 0.450510. Entropy: 1.136507.
Iteration 7310: Policy loss: 0.282899. Value loss: 0.247226. Entropy: 1.142156.
Iteration 7311: Policy loss: 0.253437. Value loss: 0.173335. Entropy: 1.140593.
Training network. lr: 0.000194. clip: 0.077577
Iteration 7312: Policy loss: -0.242226. Value loss: 0.823992. Entropy: 1.106119.
Iteration 7313: Policy loss: -0.208204. Value loss: 0.349378. Entropy: 1.099295.
Iteration 7314: Policy loss: -0.195085. Val

Iteration 7379: Policy loss: 0.045826. Value loss: 0.162015. Entropy: 1.214873.
Iteration 7380: Policy loss: 0.042053. Value loss: 0.121249. Entropy: 1.221687.
Training network. lr: 0.000194. clip: 0.077430
Iteration 7381: Policy loss: 0.191746. Value loss: 0.694510. Entropy: 1.177781.
Iteration 7382: Policy loss: 0.183750. Value loss: 0.310750. Entropy: 1.179223.
Iteration 7383: Policy loss: 0.175117. Value loss: 0.272951. Entropy: 1.181624.
Training network. lr: 0.000194. clip: 0.077430
Iteration 7384: Policy loss: 0.184794. Value loss: 0.874860. Entropy: 1.178528.
Iteration 7385: Policy loss: 0.153576. Value loss: 0.554469. Entropy: 1.182241.
Iteration 7386: Policy loss: 0.115232. Value loss: 0.241722. Entropy: 1.189060.
episode: 1069   score: 26000.0  epsilon: 1.0    steps: 608  evaluation reward: 32128.0
Training network. lr: 0.000194. clip: 0.077430
Iteration 7387: Policy loss: 0.267898. Value loss: 0.723314. Entropy: 1.164128.
Iteration 7388: Policy loss: 0.212224. Value loss: 0

Training network. lr: 0.000193. clip: 0.077117
Iteration 7456: Policy loss: 0.262700. Value loss: 0.757571. Entropy: 1.169408.
Iteration 7457: Policy loss: 0.282596. Value loss: 0.430475. Entropy: 1.173406.
Iteration 7458: Policy loss: 0.236681. Value loss: 0.346301. Entropy: 1.176686.
Training network. lr: 0.000193. clip: 0.077117
Iteration 7459: Policy loss: 0.067625. Value loss: 0.663958. Entropy: 1.155913.
Iteration 7460: Policy loss: 0.082427. Value loss: 0.386809. Entropy: 1.160129.
Iteration 7461: Policy loss: 0.061435. Value loss: 0.348252. Entropy: 1.160478.
Training network. lr: 0.000193. clip: 0.077117
Iteration 7462: Policy loss: -0.256887. Value loss: 0.814602. Entropy: 1.162049.
Iteration 7463: Policy loss: -0.324255. Value loss: 0.700844. Entropy: 1.158420.
Iteration 7464: Policy loss: -0.233342. Value loss: 0.328520. Entropy: 1.152022.
Training network. lr: 0.000193. clip: 0.077117
Iteration 7465: Policy loss: -0.169341. Value loss: 1.205854. Entropy: 1.185277.
Iteratio

Training network. lr: 0.000192. clip: 0.076969
Iteration 7534: Policy loss: -0.062394. Value loss: 0.444973. Entropy: 1.112569.
Iteration 7535: Policy loss: -0.074069. Value loss: 0.235004. Entropy: 1.103967.
Iteration 7536: Policy loss: -0.063379. Value loss: 0.156887. Entropy: 1.115210.
Training network. lr: 0.000192. clip: 0.076969
Iteration 7537: Policy loss: 0.182824. Value loss: 2.033633. Entropy: 1.125187.
Iteration 7538: Policy loss: 0.031597. Value loss: 1.258598. Entropy: 1.126966.
Iteration 7539: Policy loss: 0.122406. Value loss: 0.777224. Entropy: 1.132595.
Training network. lr: 0.000192. clip: 0.076969
Iteration 7540: Policy loss: 0.059223. Value loss: 1.166544. Entropy: 1.141118.
Iteration 7541: Policy loss: 0.106703. Value loss: 0.511983. Entropy: 1.113535.
Iteration 7542: Policy loss: 0.052353. Value loss: 0.448593. Entropy: 1.119028.
Training network. lr: 0.000192. clip: 0.076969
Iteration 7543: Policy loss: 0.302915. Value loss: 0.616063. Entropy: 1.130097.
Iteration

Training network. lr: 0.000192. clip: 0.076656
Iteration 7612: Policy loss: 0.046253. Value loss: 0.796680. Entropy: 1.049976.
Iteration 7613: Policy loss: 0.013309. Value loss: 0.499362. Entropy: 1.050765.
Iteration 7614: Policy loss: 0.000003. Value loss: 0.310621. Entropy: 1.070141.
Training network. lr: 0.000192. clip: 0.076656
Iteration 7615: Policy loss: -0.101746. Value loss: 1.147800. Entropy: 1.093269.
Iteration 7616: Policy loss: -0.132911. Value loss: 0.721314. Entropy: 1.072711.
Iteration 7617: Policy loss: -0.151608. Value loss: 0.652628. Entropy: 1.066122.
episode: 1097   score: 53800.0  epsilon: 1.0    steps: 417  evaluation reward: 33422.0
Training network. lr: 0.000192. clip: 0.076656
Iteration 7618: Policy loss: -0.298323. Value loss: 1.044308. Entropy: 1.074187.
Iteration 7619: Policy loss: -0.304323. Value loss: 0.606218. Entropy: 1.057239.
Iteration 7620: Policy loss: -0.295140. Value loss: 0.478303. Entropy: 1.081339.
Training network. lr: 0.000192. clip: 0.076656

Iteration 7688: Policy loss: -0.018930. Value loss: 0.542776. Entropy: 1.123051.
Iteration 7689: Policy loss: 0.001105. Value loss: 0.362024. Entropy: 1.107550.
Training network. lr: 0.000191. clip: 0.076508
Iteration 7690: Policy loss: 0.107978. Value loss: 0.987203. Entropy: 1.088838.
Iteration 7691: Policy loss: 0.159802. Value loss: 0.547129. Entropy: 1.082984.
Iteration 7692: Policy loss: 0.168359. Value loss: 0.423877. Entropy: 1.094324.
episode: 1107   score: 31000.0  epsilon: 1.0    steps: 42  evaluation reward: 34624.0
Training network. lr: 0.000191. clip: 0.076508
Iteration 7693: Policy loss: 0.063160. Value loss: 0.195889. Entropy: 1.167415.
Iteration 7694: Policy loss: 0.055322. Value loss: 0.125008. Entropy: 1.175454.
Iteration 7695: Policy loss: 0.058080. Value loss: 0.095424. Entropy: 1.168005.
Training network. lr: 0.000191. clip: 0.076508
Iteration 7696: Policy loss: 0.038955. Value loss: 0.639818. Entropy: 1.113361.
Iteration 7697: Policy loss: 0.037247. Value loss: 0

Training network. lr: 0.000190. clip: 0.076195
Iteration 7765: Policy loss: 0.114233. Value loss: 0.551817. Entropy: 1.158857.
Iteration 7766: Policy loss: 0.180717. Value loss: 0.389302. Entropy: 1.140969.
Iteration 7767: Policy loss: 0.138035. Value loss: 0.312595. Entropy: 1.147058.
Training network. lr: 0.000190. clip: 0.076195
Iteration 7768: Policy loss: 0.060481. Value loss: 0.719907. Entropy: 1.161109.
Iteration 7769: Policy loss: 0.031356. Value loss: 0.548640. Entropy: 1.166316.
Iteration 7770: Policy loss: 0.051195. Value loss: 0.461508. Entropy: 1.153741.
episode: 1117   score: 21200.0  epsilon: 1.0    steps: 998  evaluation reward: 33929.0
Training network. lr: 0.000190. clip: 0.076195
Iteration 7771: Policy loss: 0.185449. Value loss: 1.300494. Entropy: 1.106375.
Iteration 7772: Policy loss: 0.164061. Value loss: 0.648548. Entropy: 1.093828.
Iteration 7773: Policy loss: 0.172292. Value loss: 0.457777. Entropy: 1.100106.
Training network. lr: 0.000190. clip: 0.076195
Itera

Iteration 7839: Policy loss: 0.242229. Value loss: 0.199238. Entropy: 1.239712.
Training network. lr: 0.000190. clip: 0.076048
Iteration 7840: Policy loss: -0.216859. Value loss: 1.122642. Entropy: 1.230061.
Iteration 7841: Policy loss: -0.177558. Value loss: 0.411711. Entropy: 1.240741.
Iteration 7842: Policy loss: -0.177039. Value loss: 0.397007. Entropy: 1.245354.
episode: 1130   score: 31100.0  epsilon: 1.0    steps: 311  evaluation reward: 32364.0
Training network. lr: 0.000190. clip: 0.076048
Iteration 7843: Policy loss: 0.103638. Value loss: 0.465353. Entropy: 1.252284.
Iteration 7844: Policy loss: 0.130293. Value loss: 0.212940. Entropy: 1.250453.
Iteration 7845: Policy loss: 0.133108. Value loss: 0.171115. Entropy: 1.251226.
Training network. lr: 0.000190. clip: 0.076048
Iteration 7846: Policy loss: 0.262198. Value loss: 1.256577. Entropy: 1.256182.
Iteration 7847: Policy loss: 0.288075. Value loss: 0.533771. Entropy: 1.244908.
Iteration 7848: Policy loss: 0.215663. Value loss

Iteration 7916: Policy loss: 0.001637. Value loss: 0.387496. Entropy: 1.188274.
Iteration 7917: Policy loss: -0.002423. Value loss: 0.296049. Entropy: 1.178118.
episode: 1139   score: 20000.0  epsilon: 1.0    steps: 873  evaluation reward: 31710.0
Training network. lr: 0.000189. clip: 0.075734
Iteration 7918: Policy loss: -0.213804. Value loss: 1.419923. Entropy: 1.170124.
Iteration 7919: Policy loss: -0.143843. Value loss: 0.572577. Entropy: 1.161852.
Iteration 7920: Policy loss: -0.155740. Value loss: 0.453247. Entropy: 1.162366.
episode: 1140   score: 34400.0  epsilon: 1.0    steps: 603  evaluation reward: 31812.0
Training network. lr: 0.000189. clip: 0.075734
Iteration 7921: Policy loss: 0.318012. Value loss: 0.495769. Entropy: 1.220514.
Iteration 7922: Policy loss: 0.306042. Value loss: 0.232779. Entropy: 1.223472.
Iteration 7923: Policy loss: 0.309422. Value loss: 0.168486. Entropy: 1.228912.
episode: 1141   score: 25900.0  epsilon: 1.0    steps: 484  evaluation reward: 31531.0
T

Iteration 7991: Policy loss: 0.187203. Value loss: 0.361260. Entropy: 1.226367.
Iteration 7992: Policy loss: 0.213132. Value loss: 0.292676. Entropy: 1.216426.
Training network. lr: 0.000189. clip: 0.075587
Iteration 7993: Policy loss: -0.076905. Value loss: 1.109083. Entropy: 1.150957.
Iteration 7994: Policy loss: -0.032661. Value loss: 0.704112. Entropy: 1.149933.
Iteration 7995: Policy loss: -0.040308. Value loss: 0.489364. Entropy: 1.141804.
Training network. lr: 0.000189. clip: 0.075587
Iteration 7996: Policy loss: -0.046251. Value loss: 0.790889. Entropy: 1.222533.
Iteration 7997: Policy loss: -0.053764. Value loss: 0.480672. Entropy: 1.231049.
Iteration 7998: Policy loss: -0.049983. Value loss: 0.406363. Entropy: 1.223360.
Training network. lr: 0.000189. clip: 0.075587
Iteration 7999: Policy loss: 0.157067. Value loss: 0.488969. Entropy: 1.187669.
Iteration 8000: Policy loss: 0.121617. Value loss: 0.312132. Entropy: 1.183235.
Iteration 8001: Policy loss: 0.113410. Value loss: 0.

Training network. lr: 0.000188. clip: 0.075273
Iteration 8068: Policy loss: 0.162523. Value loss: 0.924731. Entropy: 1.233772.
Iteration 8069: Policy loss: 0.138461. Value loss: 0.500504. Entropy: 1.235554.
Iteration 8070: Policy loss: 0.148936. Value loss: 0.364910. Entropy: 1.234571.
episode: 1161   score: 20100.0  epsilon: 1.0    steps: 536  evaluation reward: 30523.0
Training network. lr: 0.000188. clip: 0.075273
Iteration 8071: Policy loss: 0.252015. Value loss: 0.314882. Entropy: 1.258425.
Iteration 8072: Policy loss: 0.235076. Value loss: 0.156134. Entropy: 1.253594.
Iteration 8073: Policy loss: 0.226856. Value loss: 0.115355. Entropy: 1.260579.
episode: 1162   score: 15400.0  epsilon: 1.0    steps: 784  evaluation reward: 30462.0
Training network. lr: 0.000188. clip: 0.075273
Iteration 8074: Policy loss: -0.064427. Value loss: 0.787136. Entropy: 1.258161.
Iteration 8075: Policy loss: -0.027020. Value loss: 0.372930. Entropy: 1.264899.
Iteration 8076: Policy loss: -0.055167. Val

Iteration 8144: Policy loss: 0.189907. Value loss: 0.398275. Entropy: 1.242492.
Iteration 8145: Policy loss: 0.176944. Value loss: 0.309493. Entropy: 1.239127.
Training network. lr: 0.000188. clip: 0.075126
Iteration 8146: Policy loss: 0.384158. Value loss: 0.837673. Entropy: 1.238946.
Iteration 8147: Policy loss: 0.403427. Value loss: 0.344779. Entropy: 1.243565.
Iteration 8148: Policy loss: 0.394713. Value loss: 0.291700. Entropy: 1.242400.
Training network. lr: 0.000188. clip: 0.075126
Iteration 8149: Policy loss: -0.142654. Value loss: 1.521748. Entropy: 1.242732.
Iteration 8150: Policy loss: -0.212722. Value loss: 1.187231. Entropy: 1.231161.
Iteration 8151: Policy loss: -0.226588. Value loss: 0.944139. Entropy: 1.244354.
episode: 1171   score: 28800.0  epsilon: 1.0    steps: 962  evaluation reward: 30977.0
Training network. lr: 0.000187. clip: 0.074969
Iteration 8152: Policy loss: 0.010877. Value loss: 0.606547. Entropy: 1.203247.
Iteration 8153: Policy loss: -0.046915. Value los

Iteration 8220: Policy loss: 0.119275. Value loss: 0.298149. Entropy: 1.221221.
Training network. lr: 0.000187. clip: 0.074813
Iteration 8221: Policy loss: -0.018145. Value loss: 0.491454. Entropy: 1.242092.
Iteration 8222: Policy loss: -0.026265. Value loss: 0.202496. Entropy: 1.249246.
Iteration 8223: Policy loss: -0.024039. Value loss: 0.113778. Entropy: 1.243700.
episode: 1182   score: 55000.0  epsilon: 1.0    steps: 13  evaluation reward: 31256.0
Training network. lr: 0.000187. clip: 0.074813
Iteration 8224: Policy loss: 0.079165. Value loss: 1.168758. Entropy: 1.263587.
Iteration 8225: Policy loss: 0.037668. Value loss: 0.617465. Entropy: 1.259456.
Iteration 8226: Policy loss: 0.102037. Value loss: 0.436721. Entropy: 1.252521.
Training network. lr: 0.000187. clip: 0.074813
Iteration 8227: Policy loss: 0.278869. Value loss: 0.425461. Entropy: 1.259578.
Iteration 8228: Policy loss: 0.286681. Value loss: 0.257659. Entropy: 1.244330.
Iteration 8229: Policy loss: 0.311461. Value loss:

Iteration 8296: Policy loss: 0.349020. Value loss: 0.684054. Entropy: 1.264644.
Iteration 8297: Policy loss: 0.366778. Value loss: 0.347428. Entropy: 1.265645.
Iteration 8298: Policy loss: 0.350292. Value loss: 0.260987. Entropy: 1.269939.
Training network. lr: 0.000187. clip: 0.074665
Iteration 8299: Policy loss: -0.020121. Value loss: 1.394651. Entropy: 1.233598.
Iteration 8300: Policy loss: 0.000402. Value loss: 0.831868. Entropy: 1.232563.
Iteration 8301: Policy loss: 0.002907. Value loss: 0.500292. Entropy: 1.228670.
Training network. lr: 0.000186. clip: 0.074509
Iteration 8302: Policy loss: 0.033741. Value loss: 1.002093. Entropy: 1.217081.
Iteration 8303: Policy loss: 0.027014. Value loss: 0.820102. Entropy: 1.203676.
Iteration 8304: Policy loss: -0.017894. Value loss: 0.666564. Entropy: 1.207994.
Training network. lr: 0.000186. clip: 0.074509
Iteration 8305: Policy loss: 0.236725. Value loss: 0.507026. Entropy: 1.213456.
Iteration 8306: Policy loss: 0.258505. Value loss: 0.2970

Iteration 8372: Policy loss: 0.109699. Value loss: 0.460067. Entropy: 1.229327.
Iteration 8373: Policy loss: 0.091958. Value loss: 0.352456. Entropy: 1.233595.
Training network. lr: 0.000186. clip: 0.074352
Iteration 8374: Policy loss: -0.087034. Value loss: 0.630479. Entropy: 1.234248.
Iteration 8375: Policy loss: -0.085030. Value loss: 0.321115. Entropy: 1.243234.
Iteration 8376: Policy loss: -0.088426. Value loss: 0.256060. Entropy: 1.247744.
episode: 1202   score: 55200.0  epsilon: 1.0    steps: 571  evaluation reward: 29181.0
Training network. lr: 0.000186. clip: 0.074352
Iteration 8377: Policy loss: 0.007891. Value loss: 0.598629. Entropy: 1.247362.
Iteration 8378: Policy loss: -0.089862. Value loss: 0.376780. Entropy: 1.237942.
Iteration 8379: Policy loss: -0.065994. Value loss: 0.220196. Entropy: 1.247444.
Training network. lr: 0.000186. clip: 0.074352
Iteration 8380: Policy loss: -0.273751. Value loss: 1.641825. Entropy: 1.252526.
Iteration 8381: Policy loss: -0.242901. Value 

Iteration 8449: Policy loss: -0.041452. Value loss: 0.332203. Entropy: 1.260769.
Iteration 8450: Policy loss: -0.048201. Value loss: 0.189043. Entropy: 1.249205.
Iteration 8451: Policy loss: -0.042368. Value loss: 0.136836. Entropy: 1.257491.
Training network. lr: 0.000185. clip: 0.074048
Iteration 8452: Policy loss: -0.119822. Value loss: 0.696274. Entropy: 1.193481.
Iteration 8453: Policy loss: -0.129383. Value loss: 0.380327. Entropy: 1.192912.
Iteration 8454: Policy loss: -0.132869. Value loss: 0.352340. Entropy: 1.197994.
Training network. lr: 0.000185. clip: 0.074048
Iteration 8455: Policy loss: -0.108017. Value loss: 1.118819. Entropy: 1.198311.
Iteration 8456: Policy loss: -0.158770. Value loss: 0.434183. Entropy: 1.188051.
Iteration 8457: Policy loss: -0.089331. Value loss: 0.391279. Entropy: 1.184547.
Training network. lr: 0.000185. clip: 0.074048
Iteration 8458: Policy loss: -0.357388. Value loss: 1.103050. Entropy: 1.213989.
Iteration 8459: Policy loss: -0.298240. Value los

Iteration 8526: Policy loss: 0.061484. Value loss: 0.251558. Entropy: 1.224358.
Training network. lr: 0.000185. clip: 0.073891
Iteration 8527: Policy loss: 0.352007. Value loss: 0.358087. Entropy: 1.221098.
Iteration 8528: Policy loss: 0.369944. Value loss: 0.154179. Entropy: 1.221390.
Iteration 8529: Policy loss: 0.360226. Value loss: 0.119576. Entropy: 1.229093.
Training network. lr: 0.000185. clip: 0.073891
Iteration 8530: Policy loss: -0.168373. Value loss: 1.141555. Entropy: 1.224537.
Iteration 8531: Policy loss: -0.150598. Value loss: 0.686366. Entropy: 1.234470.
Iteration 8532: Policy loss: -0.190935. Value loss: 0.511953. Entropy: 1.227857.
Training network. lr: 0.000185. clip: 0.073891
Iteration 8533: Policy loss: -0.374312. Value loss: 1.423158. Entropy: 1.198066.
Iteration 8534: Policy loss: -0.404991. Value loss: 0.783493. Entropy: 1.206334.
Iteration 8535: Policy loss: -0.360938. Value loss: 0.496346. Entropy: 1.207370.
Training network. lr: 0.000185. clip: 0.073891
Iterat

episode: 1229   score: 58700.0  epsilon: 1.0    steps: 460  evaluation reward: 30602.0
Training network. lr: 0.000184. clip: 0.073587
Iteration 8605: Policy loss: 0.202286. Value loss: 1.286768. Entropy: 1.218253.
Iteration 8606: Policy loss: 0.207819. Value loss: 0.766445. Entropy: 1.212621.
Iteration 8607: Policy loss: 0.248198. Value loss: 0.496314. Entropy: 1.210874.
episode: 1230   score: 30800.0  epsilon: 1.0    steps: 620  evaluation reward: 30599.0
Training network. lr: 0.000184. clip: 0.073587
Iteration 8608: Policy loss: 0.108739. Value loss: 0.472255. Entropy: 1.264510.
Iteration 8609: Policy loss: 0.056948. Value loss: 0.262236. Entropy: 1.264768.
Iteration 8610: Policy loss: 0.087003. Value loss: 0.167867. Entropy: 1.265247.
Training network. lr: 0.000184. clip: 0.073587
Iteration 8611: Policy loss: 0.207415. Value loss: 0.327027. Entropy: 1.247139.
Iteration 8612: Policy loss: 0.187662. Value loss: 0.186731. Entropy: 1.240699.
Iteration 8613: Policy loss: 0.179189. Value 

Iteration 8680: Policy loss: 0.173722. Value loss: 0.499428. Entropy: 1.217242.
Iteration 8681: Policy loss: 0.167477. Value loss: 0.310370. Entropy: 1.216302.
Iteration 8682: Policy loss: 0.172929. Value loss: 0.191599. Entropy: 1.219296.
Training network. lr: 0.000184. clip: 0.073430
Iteration 8683: Policy loss: -0.241590. Value loss: 1.284481. Entropy: 1.236933.
Iteration 8684: Policy loss: -0.249207. Value loss: 0.761747. Entropy: 1.234384.
Iteration 8685: Policy loss: -0.256874. Value loss: 0.548596. Entropy: 1.235801.
episode: 1240   score: 32000.0  epsilon: 1.0    steps: 872  evaluation reward: 30864.0
Training network. lr: 0.000184. clip: 0.073430
Iteration 8686: Policy loss: 0.306197. Value loss: 0.453865. Entropy: 1.239468.
Iteration 8687: Policy loss: 0.308414. Value loss: 0.309989. Entropy: 1.246334.
Iteration 8688: Policy loss: 0.303078. Value loss: 0.231820. Entropy: 1.245185.
episode: 1241   score: 37900.0  epsilon: 1.0    steps: 261  evaluation reward: 30984.0
Training 

Iteration 8756: Policy loss: 0.130149. Value loss: 0.426393. Entropy: 1.253234.
Iteration 8757: Policy loss: 0.158593. Value loss: 0.242375. Entropy: 1.256044.
Training network. lr: 0.000183. clip: 0.073126
Iteration 8758: Policy loss: 0.387167. Value loss: 0.902632. Entropy: 1.234012.
Iteration 8759: Policy loss: 0.384593. Value loss: 0.560577. Entropy: 1.242167.
Iteration 8760: Policy loss: 0.381858. Value loss: 0.397316. Entropy: 1.242319.
Training network. lr: 0.000183. clip: 0.073126
Iteration 8761: Policy loss: 0.246812. Value loss: 1.232256. Entropy: 1.282827.
Iteration 8762: Policy loss: 0.215895. Value loss: 0.571570. Entropy: 1.278526.
Iteration 8763: Policy loss: 0.217968. Value loss: 0.369941. Entropy: 1.280065.
Training network. lr: 0.000183. clip: 0.073126
Iteration 8764: Policy loss: 0.055971. Value loss: 1.120311. Entropy: 1.240091.
Iteration 8765: Policy loss: 0.076430. Value loss: 0.558866. Entropy: 1.224218.
Iteration 8766: Policy loss: 0.072519. Value loss: 0.475856

Iteration 8834: Policy loss: -0.184556. Value loss: 0.876024. Entropy: 1.208093.
Iteration 8835: Policy loss: -0.178814. Value loss: 0.552347. Entropy: 1.211460.
episode: 1259   score: 28100.0  epsilon: 1.0    steps: 495  evaluation reward: 31702.0
Training network. lr: 0.000182. clip: 0.072969
Iteration 8836: Policy loss: 0.176972. Value loss: 0.778431. Entropy: 1.232181.
Iteration 8837: Policy loss: 0.184040. Value loss: 0.415147. Entropy: 1.236657.
Iteration 8838: Policy loss: 0.180547. Value loss: 0.268508. Entropy: 1.231864.
Training network. lr: 0.000182. clip: 0.072969
Iteration 8839: Policy loss: -0.099853. Value loss: 1.109875. Entropy: 1.198067.
Iteration 8840: Policy loss: -0.110576. Value loss: 0.627709. Entropy: 1.212701.
Iteration 8841: Policy loss: -0.111720. Value loss: 0.469349. Entropy: 1.210148.
Training network. lr: 0.000182. clip: 0.072969
Iteration 8842: Policy loss: -0.032925. Value loss: 0.959059. Entropy: 1.241148.
Iteration 8843: Policy loss: -0.080343. Value 

episode: 1269   score: 36000.0  epsilon: 1.0    steps: 443  evaluation reward: 32067.0
Training network. lr: 0.000182. clip: 0.072665
Iteration 8911: Policy loss: -0.056400. Value loss: 0.622901. Entropy: 1.119819.
Iteration 8912: Policy loss: -0.073320. Value loss: 0.367716. Entropy: 1.130794.
Iteration 8913: Policy loss: -0.068806. Value loss: 0.286815. Entropy: 1.123183.
episode: 1270   score: 31700.0  epsilon: 1.0    steps: 902  evaluation reward: 32029.0
Training network. lr: 0.000182. clip: 0.072665
Iteration 8914: Policy loss: -0.116800. Value loss: 0.769989. Entropy: 1.157296.
Iteration 8915: Policy loss: -0.118080. Value loss: 0.432999. Entropy: 1.162525.
Iteration 8916: Policy loss: -0.165822. Value loss: 0.329443. Entropy: 1.147542.
Training network. lr: 0.000182. clip: 0.072665
Iteration 8917: Policy loss: 0.202276. Value loss: 0.757412. Entropy: 1.136928.
Iteration 8918: Policy loss: 0.194301. Value loss: 0.430833. Entropy: 1.144549.
Iteration 8919: Policy loss: 0.184494. 

Training network. lr: 0.000181. clip: 0.072509
Iteration 8986: Policy loss: 0.101734. Value loss: 0.999450. Entropy: 1.175715.
Iteration 8987: Policy loss: 0.083682. Value loss: 0.509789. Entropy: 1.174458.
Iteration 8988: Policy loss: 0.076970. Value loss: 0.314023. Entropy: 1.180861.
Training network. lr: 0.000181. clip: 0.072509
Iteration 8989: Policy loss: 0.042230. Value loss: 0.968015. Entropy: 1.162263.
Iteration 8990: Policy loss: 0.044743. Value loss: 0.537401. Entropy: 1.150458.
Iteration 8991: Policy loss: -0.017473. Value loss: 0.407108. Entropy: 1.152590.
Training network. lr: 0.000181. clip: 0.072509
Iteration 8992: Policy loss: 0.309226. Value loss: 0.796398. Entropy: 1.150899.
Iteration 8993: Policy loss: 0.292886. Value loss: 0.460435. Entropy: 1.149619.
Iteration 8994: Policy loss: 0.279393. Value loss: 0.319207. Entropy: 1.142034.
Training network. lr: 0.000181. clip: 0.072509
Iteration 8995: Policy loss: -0.320229. Value loss: 1.127051. Entropy: 1.137341.
Iteration 

Training network. lr: 0.000181. clip: 0.072205
Iteration 9064: Policy loss: -0.071767. Value loss: 0.673814. Entropy: 1.165897.
Iteration 9065: Policy loss: 0.019959. Value loss: 0.323170. Entropy: 1.169297.
Iteration 9066: Policy loss: -0.038019. Value loss: 0.246201. Entropy: 1.166009.
episode: 1290   score: 20000.0  epsilon: 1.0    steps: 994  evaluation reward: 31619.0
Training network. lr: 0.000181. clip: 0.072205
Iteration 9067: Policy loss: 0.023967. Value loss: 0.534289. Entropy: 1.141339.
Iteration 9068: Policy loss: 0.057842. Value loss: 0.302159. Entropy: 1.145590.
Iteration 9069: Policy loss: 0.034706. Value loss: 0.273520. Entropy: 1.132255.
episode: 1291   score: 19300.0  epsilon: 1.0    steps: 758  evaluation reward: 31551.0
Training network. lr: 0.000181. clip: 0.072205
Iteration 9070: Policy loss: 0.069913. Value loss: 0.911833. Entropy: 1.165116.
Iteration 9071: Policy loss: -0.036170. Value loss: 0.694725. Entropy: 1.166768.
Iteration 9072: Policy loss: 0.053625. Val

Iteration 9139: Policy loss: 0.184893. Value loss: 0.599060. Entropy: 1.123248.
Iteration 9140: Policy loss: 0.150186. Value loss: 0.593232. Entropy: 1.127053.
Iteration 9141: Policy loss: 0.143921. Value loss: 0.481961. Entropy: 1.139022.
Training network. lr: 0.000180. clip: 0.072048
Iteration 9142: Policy loss: 0.117059. Value loss: 0.822344. Entropy: 1.126361.
Iteration 9143: Policy loss: 0.146741. Value loss: 0.359828. Entropy: 1.125301.
Iteration 9144: Policy loss: 0.200492. Value loss: 0.248361. Entropy: 1.119400.
now time :  2019-02-26 21:17:50.910405
episode: 1301   score: 57200.0  epsilon: 1.0    steps: 385  evaluation reward: 32475.0
Training network. lr: 0.000180. clip: 0.072048
Iteration 9145: Policy loss: 0.068279. Value loss: 1.203862. Entropy: 1.145620.
Iteration 9146: Policy loss: 0.065645. Value loss: 0.561716. Entropy: 1.137164.
Iteration 9147: Policy loss: 0.042842. Value loss: 0.426170. Entropy: 1.134603.
episode: 1302   score: 22000.0  epsilon: 1.0    steps: 698  

Iteration 9216: Policy loss: 0.306274. Value loss: 0.129443. Entropy: 1.133591.
Training network. lr: 0.000179. clip: 0.071744
Iteration 9217: Policy loss: -0.272625. Value loss: 1.232803. Entropy: 1.106244.
Iteration 9218: Policy loss: -0.269044. Value loss: 0.757772. Entropy: 1.115817.
Iteration 9219: Policy loss: -0.343900. Value loss: 0.549699. Entropy: 1.098266.
episode: 1311   score: 34900.0  epsilon: 1.0    steps: 374  evaluation reward: 31434.0
episode: 1312   score: 49100.0  epsilon: 1.0    steps: 929  evaluation reward: 31676.0
Training network. lr: 0.000179. clip: 0.071744
Iteration 9220: Policy loss: 0.077418. Value loss: 0.654913. Entropy: 1.156723.
Iteration 9221: Policy loss: 0.087512. Value loss: 0.325320. Entropy: 1.164472.
Iteration 9222: Policy loss: 0.046041. Value loss: 0.266725. Entropy: 1.165384.
Training network. lr: 0.000179. clip: 0.071744
Iteration 9223: Policy loss: -0.142041. Value loss: 1.017846. Entropy: 1.182993.
Iteration 9224: Policy loss: -0.157657. V

Iteration 9294: Policy loss: 0.175463. Value loss: 0.526988. Entropy: 1.129977.
Training network. lr: 0.000179. clip: 0.071587
Iteration 9295: Policy loss: 0.150443. Value loss: 1.409575. Entropy: 1.117416.
Iteration 9296: Policy loss: 0.139294. Value loss: 0.896376. Entropy: 1.120382.
Iteration 9297: Policy loss: 0.132721. Value loss: 0.643617. Entropy: 1.119731.
Training network. lr: 0.000179. clip: 0.071587
Iteration 9298: Policy loss: 0.026406. Value loss: 1.149646. Entropy: 1.123322.
Iteration 9299: Policy loss: 0.075199. Value loss: 0.630606. Entropy: 1.111359.
Iteration 9300: Policy loss: 0.057895. Value loss: 0.524598. Entropy: 1.122933.
episode: 1320   score: 27900.0  epsilon: 1.0    steps: 20  evaluation reward: 32011.0
episode: 1321   score: 48300.0  epsilon: 1.0    steps: 259  evaluation reward: 32097.0
Training network. lr: 0.000179. clip: 0.071440
Iteration 9301: Policy loss: 0.165282. Value loss: 0.541760. Entropy: 1.184363.
Iteration 9302: Policy loss: 0.168543. Value l

Iteration 9371: Policy loss: 0.073956. Value loss: 0.299853. Entropy: 1.143893.
Iteration 9372: Policy loss: 0.034573. Value loss: 0.249124. Entropy: 1.142867.
episode: 1329   score: 29500.0  epsilon: 1.0    steps: 840  evaluation reward: 31915.0
Training network. lr: 0.000178. clip: 0.071283
Iteration 9373: Policy loss: 0.003566. Value loss: 0.652795. Entropy: 1.136326.
Iteration 9374: Policy loss: 0.045820. Value loss: 0.330648. Entropy: 1.133212.
Iteration 9375: Policy loss: 0.029857. Value loss: 0.288959. Entropy: 1.135646.
episode: 1330   score: 40100.0  epsilon: 1.0    steps: 105  evaluation reward: 32008.0
Training network. lr: 0.000178. clip: 0.071283
Iteration 9376: Policy loss: 0.034890. Value loss: 0.482192. Entropy: 1.150059.
Iteration 9377: Policy loss: 0.037707. Value loss: 0.346177. Entropy: 1.151365.
Iteration 9378: Policy loss: 0.019335. Value loss: 0.240550. Entropy: 1.147075.
episode: 1331   score: 43400.0  epsilon: 1.0    steps: 301  evaluation reward: 31966.0
Train

Iteration 9446: Policy loss: 0.291516. Value loss: 0.269993. Entropy: 1.219806.
Iteration 9447: Policy loss: 0.290994. Value loss: 0.171578. Entropy: 1.223274.
episode: 1341   score: 35200.0  epsilon: 1.0    steps: 419  evaluation reward: 32243.0
episode: 1342   score: 38000.0  epsilon: 1.0    steps: 909  evaluation reward: 32153.0
Training network. lr: 0.000178. clip: 0.071126
Iteration 9448: Policy loss: 0.023779. Value loss: 0.352351. Entropy: 1.272118.
Iteration 9449: Policy loss: 0.028726. Value loss: 0.208549. Entropy: 1.271254.
Iteration 9450: Policy loss: 0.016360. Value loss: 0.151431. Entropy: 1.271194.
episode: 1343   score: 10100.0  epsilon: 1.0    steps: 860  evaluation reward: 31834.0
Training network. lr: 0.000177. clip: 0.070979
Iteration 9451: Policy loss: -0.359727. Value loss: 1.092950. Entropy: 1.222167.
Iteration 9452: Policy loss: -0.384620. Value loss: 0.686598. Entropy: 1.217263.
Iteration 9453: Policy loss: -0.359415. Value loss: 0.504056. Entropy: 1.218955.
ep

Iteration 9520: Policy loss: -0.164264. Value loss: 1.129393. Entropy: 1.176685.
Iteration 9521: Policy loss: -0.193815. Value loss: 0.905113. Entropy: 1.189844.
Iteration 9522: Policy loss: -0.225553. Value loss: 0.641485. Entropy: 1.178186.
Training network. lr: 0.000177. clip: 0.070822
Iteration 9523: Policy loss: -0.019514. Value loss: 1.318500. Entropy: 1.201745.
Iteration 9524: Policy loss: -0.050642. Value loss: 0.577271. Entropy: 1.202978.
Iteration 9525: Policy loss: -0.008052. Value loss: 0.343642. Entropy: 1.210103.
Training network. lr: 0.000177. clip: 0.070822
Iteration 9526: Policy loss: 0.080184. Value loss: 0.386325. Entropy: 1.204382.
Iteration 9527: Policy loss: 0.081051. Value loss: 0.215884. Entropy: 1.205515.
Iteration 9528: Policy loss: 0.079753. Value loss: 0.158922. Entropy: 1.204108.
episode: 1353   score: 18100.0  epsilon: 1.0    steps: 374  evaluation reward: 30862.0
Training network. lr: 0.000177. clip: 0.070822
Iteration 9529: Policy loss: -0.163819. Value 

Training network. lr: 0.000177. clip: 0.070665
Iteration 9598: Policy loss: -0.000719. Value loss: 0.852814. Entropy: 1.188015.
Iteration 9599: Policy loss: -0.021968. Value loss: 0.514813. Entropy: 1.199885.
Iteration 9600: Policy loss: -0.004084. Value loss: 0.411988. Entropy: 1.189072.
Training network. lr: 0.000176. clip: 0.070518
Iteration 9601: Policy loss: 0.230368. Value loss: 0.518776. Entropy: 1.152022.
Iteration 9602: Policy loss: 0.239265. Value loss: 0.354018. Entropy: 1.147672.
Iteration 9603: Policy loss: 0.203719. Value loss: 0.247603. Entropy: 1.158521.
episode: 1362   score: 23500.0  epsilon: 1.0    steps: 88  evaluation reward: 30627.0
episode: 1363   score: 37700.0  epsilon: 1.0    steps: 273  evaluation reward: 30653.0
Training network. lr: 0.000176. clip: 0.070518
Iteration 9604: Policy loss: 0.253331. Value loss: 0.488316. Entropy: 1.239274.
Iteration 9605: Policy loss: 0.241545. Value loss: 0.265882. Entropy: 1.238488.
Iteration 9606: Policy loss: 0.198946. Valu

Iteration 9673: Policy loss: 0.051775. Value loss: 0.697075. Entropy: 1.218223.
Iteration 9674: Policy loss: 0.069064. Value loss: 0.442768. Entropy: 1.227794.
Iteration 9675: Policy loss: 0.043521. Value loss: 0.286412. Entropy: 1.220465.
episode: 1373   score: 27500.0  epsilon: 1.0    steps: 532  evaluation reward: 31515.0
Training network. lr: 0.000176. clip: 0.070361
Iteration 9676: Policy loss: 0.004863. Value loss: 1.071271. Entropy: 1.218178.
Iteration 9677: Policy loss: 0.055923. Value loss: 0.532926. Entropy: 1.208505.
Iteration 9678: Policy loss: -0.006894. Value loss: 0.333594. Entropy: 1.208784.
Training network. lr: 0.000176. clip: 0.070361
Iteration 9679: Policy loss: 0.091882. Value loss: 0.977261. Entropy: 1.201626.
Iteration 9680: Policy loss: 0.081790. Value loss: 0.702213. Entropy: 1.212878.
Iteration 9681: Policy loss: 0.038367. Value loss: 0.551273. Entropy: 1.203589.
Training network. lr: 0.000176. clip: 0.070361
Iteration 9682: Policy loss: 0.031618. Value loss: 

Iteration 9752: Policy loss: -0.163073. Value loss: 0.506343. Entropy: 1.214708.
Iteration 9753: Policy loss: -0.151269. Value loss: 0.390276. Entropy: 1.212137.
Training network. lr: 0.000175. clip: 0.070057
Iteration 9754: Policy loss: -0.319180. Value loss: 1.255017. Entropy: 1.206740.
Iteration 9755: Policy loss: -0.305829. Value loss: 0.851503. Entropy: 1.220949.
Iteration 9756: Policy loss: -0.285558. Value loss: 0.657584. Entropy: 1.218080.
episode: 1381   score: 53000.0  epsilon: 1.0    steps: 531  evaluation reward: 32076.0
Training network. lr: 0.000175. clip: 0.070057
Iteration 9757: Policy loss: 0.071119. Value loss: 0.640894. Entropy: 1.213191.
Iteration 9758: Policy loss: 0.125948. Value loss: 0.386857. Entropy: 1.209505.
Iteration 9759: Policy loss: 0.095041. Value loss: 0.201394. Entropy: 1.215714.
episode: 1382   score: 30300.0  epsilon: 1.0    steps: 819  evaluation reward: 32178.0
Training network. lr: 0.000175. clip: 0.070057
Iteration 9760: Policy loss: 0.176370. V

Training network. lr: 0.000175. clip: 0.069901
Iteration 9829: Policy loss: -0.163547. Value loss: 0.872679. Entropy: 1.212665.
Iteration 9830: Policy loss: -0.209960. Value loss: 0.522579. Entropy: 1.219260.
Iteration 9831: Policy loss: -0.166484. Value loss: 0.297033. Entropy: 1.217126.
Training network. lr: 0.000175. clip: 0.069901
Iteration 9832: Policy loss: 0.144371. Value loss: 0.643164. Entropy: 1.211767.
Iteration 9833: Policy loss: 0.125281. Value loss: 0.399848. Entropy: 1.223181.
Iteration 9834: Policy loss: 0.174764. Value loss: 0.235408. Entropy: 1.220380.
Training network. lr: 0.000175. clip: 0.069901
Iteration 9835: Policy loss: 0.257403. Value loss: 1.238410. Entropy: 1.259338.
Iteration 9836: Policy loss: 0.236216. Value loss: 0.625818. Entropy: 1.265211.
Iteration 9837: Policy loss: 0.251894. Value loss: 0.553971. Entropy: 1.255437.
episode: 1391   score: 56500.0  epsilon: 1.0    steps: 927  evaluation reward: 33167.0
Training network. lr: 0.000175. clip: 0.069901
It

Iteration 9906: Policy loss: -0.047164. Value loss: 0.171323. Entropy: 1.249523.
episode: 1400   score: 33400.0  epsilon: 1.0    steps: 744  evaluation reward: 32945.0
Training network. lr: 0.000174. clip: 0.069596
Iteration 9907: Policy loss: -0.058458. Value loss: 0.650459. Entropy: 1.246168.
Iteration 9908: Policy loss: -0.018202. Value loss: 0.383735. Entropy: 1.242413.
Iteration 9909: Policy loss: 0.000660. Value loss: 0.245590. Entropy: 1.246772.
Training network. lr: 0.000174. clip: 0.069596
Iteration 9910: Policy loss: -0.137688. Value loss: 1.294609. Entropy: 1.261270.
Iteration 9911: Policy loss: -0.198034. Value loss: 0.751587. Entropy: 1.273487.
Iteration 9912: Policy loss: -0.142264. Value loss: 0.528349. Entropy: 1.260706.
now time :  2019-02-26 21:32:05.329494
episode: 1401   score: 37800.0  epsilon: 1.0    steps: 898  evaluation reward: 32751.0
Training network. lr: 0.000174. clip: 0.069596
Iteration 9913: Policy loss: -0.301580. Value loss: 0.702566. Entropy: 1.241692.

Iteration 9984: Policy loss: 0.196982. Value loss: 0.476128. Entropy: 1.175729.
episode: 1408   score: 26800.0  epsilon: 1.0    steps: 394  evaluation reward: 33311.0
episode: 1409   score: 65200.0  epsilon: 1.0    steps: 519  evaluation reward: 33748.0
Training network. lr: 0.000174. clip: 0.069440
Iteration 9985: Policy loss: -0.022337. Value loss: 0.770918. Entropy: 1.180457.
Iteration 9986: Policy loss: 0.027855. Value loss: 0.374760. Entropy: 1.184037.
Iteration 9987: Policy loss: 0.008738. Value loss: 0.290006. Entropy: 1.186154.
episode: 1410   score: 33100.0  epsilon: 1.0    steps: 243  evaluation reward: 33722.0
Training network. lr: 0.000174. clip: 0.069440
Iteration 9988: Policy loss: -0.030039. Value loss: 0.696971. Entropy: 1.160131.
Iteration 9989: Policy loss: -0.070492. Value loss: 0.506386. Entropy: 1.168200.
Iteration 9990: Policy loss: -0.049817. Value loss: 0.311794. Entropy: 1.158365.
Training network. lr: 0.000174. clip: 0.069440
Iteration 9991: Policy loss: 0.226

Iteration 10059: Policy loss: 0.270911. Value loss: 0.447665. Entropy: 1.203333.
Training network. lr: 0.000173. clip: 0.069136
Iteration 10060: Policy loss: 0.119452. Value loss: 0.346674. Entropy: 1.213079.
Iteration 10061: Policy loss: 0.125294. Value loss: 0.169234. Entropy: 1.225872.
Iteration 10062: Policy loss: 0.124529. Value loss: 0.133902. Entropy: 1.214267.
episode: 1419   score: 51700.0  epsilon: 1.0    steps: 503  evaluation reward: 33391.0
Training network. lr: 0.000173. clip: 0.069136
Iteration 10063: Policy loss: 0.073836. Value loss: 0.470129. Entropy: 1.216427.
Iteration 10064: Policy loss: 0.093464. Value loss: 0.223769. Entropy: 1.219106.
Iteration 10065: Policy loss: 0.095938. Value loss: 0.169888. Entropy: 1.218511.
Training network. lr: 0.000173. clip: 0.069136
Iteration 10066: Policy loss: -0.293179. Value loss: 1.104671. Entropy: 1.208842.
Iteration 10067: Policy loss: -0.290478. Value loss: 0.584909. Entropy: 1.207982.
Iteration 10068: Policy loss: -0.296727. 

Iteration 10136: Policy loss: 0.081059. Value loss: 0.475327. Entropy: 1.200605.
Iteration 10137: Policy loss: 0.035493. Value loss: 0.449563. Entropy: 1.196512.
Training network. lr: 0.000172. clip: 0.068979
Iteration 10138: Policy loss: 0.380057. Value loss: 0.863808. Entropy: 1.217345.
Iteration 10139: Policy loss: 0.392107. Value loss: 0.585031. Entropy: 1.217087.
Iteration 10140: Policy loss: 0.395478. Value loss: 0.416982. Entropy: 1.219098.
episode: 1427   score: 48100.0  epsilon: 1.0    steps: 389  evaluation reward: 33702.0
Training network. lr: 0.000172. clip: 0.068979
Iteration 10141: Policy loss: -0.217517. Value loss: 0.950997. Entropy: 1.206409.
Iteration 10142: Policy loss: -0.256255. Value loss: 0.466212. Entropy: 1.215037.
Iteration 10143: Policy loss: -0.233495. Value loss: 0.346088. Entropy: 1.215626.
Training network. lr: 0.000172. clip: 0.068979
Iteration 10144: Policy loss: 0.217294. Value loss: 0.671591. Entropy: 1.268909.
Iteration 10145: Policy loss: 0.260757. 

Iteration 10211: Policy loss: 0.018035. Value loss: 0.442297. Entropy: 1.251653.
Iteration 10212: Policy loss: -0.001241. Value loss: 0.278397. Entropy: 1.259811.
Training network. lr: 0.000172. clip: 0.068675
Iteration 10213: Policy loss: -0.112235. Value loss: 1.108209. Entropy: 1.272300.
Iteration 10214: Policy loss: -0.092679. Value loss: 0.474798. Entropy: 1.278770.
Iteration 10215: Policy loss: -0.090600. Value loss: 0.398647. Entropy: 1.276295.
episode: 1438   score: 19600.0  epsilon: 1.0    steps: 835  evaluation reward: 33592.0
Training network. lr: 0.000172. clip: 0.068675
Iteration 10216: Policy loss: 0.159664. Value loss: 0.850164. Entropy: 1.261122.
Iteration 10217: Policy loss: 0.153740. Value loss: 0.518666. Entropy: 1.261693.
Iteration 10218: Policy loss: 0.132632. Value loss: 0.369523. Entropy: 1.259435.
Training network. lr: 0.000172. clip: 0.068675
Iteration 10219: Policy loss: 0.004663. Value loss: 0.761065. Entropy: 1.249125.
Iteration 10220: Policy loss: -0.014848

Training network. lr: 0.000171. clip: 0.068518
Iteration 10285: Policy loss: 0.114186. Value loss: 0.847376. Entropy: 1.231590.
Iteration 10286: Policy loss: 0.099424. Value loss: 0.547816. Entropy: 1.226447.
Iteration 10287: Policy loss: 0.126562. Value loss: 0.266818. Entropy: 1.233594.
episode: 1450   score: 46300.0  epsilon: 1.0    steps: 579  evaluation reward: 34177.0
Training network. lr: 0.000171. clip: 0.068518
Iteration 10288: Policy loss: -0.066526. Value loss: 0.666695. Entropy: 1.255939.
Iteration 10289: Policy loss: -0.058919. Value loss: 0.344618. Entropy: 1.255373.
Iteration 10290: Policy loss: -0.077291. Value loss: 0.258085. Entropy: 1.255851.
Training network. lr: 0.000171. clip: 0.068518
Iteration 10291: Policy loss: -0.014189. Value loss: 0.433483. Entropy: 1.233897.
Iteration 10292: Policy loss: -0.010489. Value loss: 0.269908. Entropy: 1.234394.
Iteration 10293: Policy loss: -0.030494. Value loss: 0.196905. Entropy: 1.229710.
Training network. lr: 0.000171. clip:

Iteration 10363: Policy loss: -0.262680. Value loss: 1.575233. Entropy: 1.204789.
Iteration 10364: Policy loss: -0.261589. Value loss: 1.150435. Entropy: 1.220083.
Iteration 10365: Policy loss: -0.235622. Value loss: 0.866168. Entropy: 1.210605.
Training network. lr: 0.000171. clip: 0.068214
Iteration 10366: Policy loss: -0.049332. Value loss: 1.084104. Entropy: 1.158699.
Iteration 10367: Policy loss: -0.102991. Value loss: 0.861090. Entropy: 1.167286.
Iteration 10368: Policy loss: -0.086679. Value loss: 0.564127. Entropy: 1.169844.
Training network. lr: 0.000171. clip: 0.068214
Iteration 10369: Policy loss: 0.156146. Value loss: 0.286310. Entropy: 1.131531.
Iteration 10370: Policy loss: 0.176007. Value loss: 0.154517. Entropy: 1.136481.
Iteration 10371: Policy loss: 0.132544. Value loss: 0.141292. Entropy: 1.130167.
Training network. lr: 0.000171. clip: 0.068214
Iteration 10372: Policy loss: 0.190680. Value loss: 0.721009. Entropy: 1.134030.
Iteration 10373: Policy loss: 0.198756. Val

Training network. lr: 0.000170. clip: 0.068057
Iteration 10441: Policy loss: -0.005775. Value loss: 1.218415. Entropy: 1.173702.
Iteration 10442: Policy loss: -0.007877. Value loss: 0.613688. Entropy: 1.168075.
Iteration 10443: Policy loss: -0.005794. Value loss: 0.455712. Entropy: 1.168702.
Training network. lr: 0.000170. clip: 0.068057
Iteration 10444: Policy loss: 0.221894. Value loss: 0.227080. Entropy: 1.196431.
Iteration 10445: Policy loss: 0.219041. Value loss: 0.127211. Entropy: 1.196516.
Iteration 10446: Policy loss: 0.205670. Value loss: 0.113773. Entropy: 1.193244.
episode: 1465   score: 53000.0  epsilon: 1.0    steps: 1002  evaluation reward: 36291.0
Training network. lr: 0.000170. clip: 0.068057
Iteration 10447: Policy loss: -0.093908. Value loss: 0.939570. Entropy: 1.179386.
Iteration 10448: Policy loss: -0.022935. Value loss: 0.432563. Entropy: 1.160802.
Iteration 10449: Policy loss: -0.055136. Value loss: 0.319425. Entropy: 1.167277.
Training network. lr: 0.000170. clip

Iteration 10518: Policy loss: -0.033188. Value loss: 0.479255. Entropy: 1.233064.
Training network. lr: 0.000169. clip: 0.067753
Iteration 10519: Policy loss: 0.103571. Value loss: 1.902148. Entropy: 1.192427.
Iteration 10520: Policy loss: 0.161580. Value loss: 1.130038. Entropy: 1.195709.
Iteration 10521: Policy loss: 0.075756. Value loss: 0.969437. Entropy: 1.195599.
Training network. lr: 0.000169. clip: 0.067753
Iteration 10522: Policy loss: 0.277171. Value loss: 0.567812. Entropy: 1.242965.
Iteration 10523: Policy loss: 0.292381. Value loss: 0.321889. Entropy: 1.239553.
Iteration 10524: Policy loss: 0.291290. Value loss: 0.256353. Entropy: 1.240524.
Training network. lr: 0.000169. clip: 0.067753
Iteration 10525: Policy loss: 0.136560. Value loss: 1.037564. Entropy: 1.196463.
Iteration 10526: Policy loss: 0.100242. Value loss: 0.573859. Entropy: 1.208441.
Iteration 10527: Policy loss: 0.140045. Value loss: 0.481817. Entropy: 1.207953.
episode: 1474   score: 30200.0  epsilon: 1.0    

Iteration 10592: Policy loss: 0.164068. Value loss: 0.501929. Entropy: 1.201241.
Iteration 10593: Policy loss: 0.176553. Value loss: 0.369596. Entropy: 1.199449.
Training network. lr: 0.000169. clip: 0.067597
Iteration 10594: Policy loss: -0.232713. Value loss: 0.783167. Entropy: 1.201162.
Iteration 10595: Policy loss: -0.256703. Value loss: 0.467835. Entropy: 1.199448.
Iteration 10596: Policy loss: -0.294619. Value loss: 0.305964. Entropy: 1.186508.
episode: 1486   score: 21500.0  epsilon: 1.0    steps: 878  evaluation reward: 35019.0
Training network. lr: 0.000169. clip: 0.067597
Iteration 10597: Policy loss: 0.272903. Value loss: 0.589623. Entropy: 1.195625.
Iteration 10598: Policy loss: 0.302666. Value loss: 0.298877. Entropy: 1.198069.
Iteration 10599: Policy loss: 0.281475. Value loss: 0.201155. Entropy: 1.196019.
Training network. lr: 0.000169. clip: 0.067597
Iteration 10600: Policy loss: -0.005435. Value loss: 0.576671. Entropy: 1.254928.
Iteration 10601: Policy loss: 0.029253.

Iteration 10667: Policy loss: 0.223887. Value loss: 0.183987. Entropy: 1.192133.
Iteration 10668: Policy loss: 0.227515. Value loss: 0.144184. Entropy: 1.186198.
Training network. lr: 0.000168. clip: 0.067292
Iteration 10669: Policy loss: 0.068320. Value loss: 0.533562. Entropy: 1.234106.
Iteration 10670: Policy loss: 0.117522. Value loss: 0.193951. Entropy: 1.232035.
Iteration 10671: Policy loss: 0.078900. Value loss: 0.118312. Entropy: 1.230897.
Training network. lr: 0.000168. clip: 0.067292
Iteration 10672: Policy loss: -0.294013. Value loss: 1.336523. Entropy: 1.194196.
Iteration 10673: Policy loss: -0.272024. Value loss: 0.707191. Entropy: 1.184131.
Iteration 10674: Policy loss: -0.240704. Value loss: 0.529993. Entropy: 1.183610.
episode: 1497   score: 32600.0  epsilon: 1.0    steps: 499  evaluation reward: 35071.0
episode: 1498   score: 16400.0  epsilon: 1.0    steps: 957  evaluation reward: 34948.0
Training network. lr: 0.000168. clip: 0.067292
Iteration 10675: Policy loss: 0.08

Training network. lr: 0.000168. clip: 0.067136
Iteration 10744: Policy loss: -0.066308. Value loss: 0.817593. Entropy: 1.176673.
Iteration 10745: Policy loss: -0.075684. Value loss: 0.523243. Entropy: 1.179031.
Iteration 10746: Policy loss: -0.051581. Value loss: 0.381869. Entropy: 1.181544.
Training network. lr: 0.000168. clip: 0.067136
Iteration 10747: Policy loss: -0.104360. Value loss: 1.326819. Entropy: 1.166903.
Iteration 10748: Policy loss: -0.135540. Value loss: 0.866911. Entropy: 1.170812.
Iteration 10749: Policy loss: -0.174458. Value loss: 0.610692. Entropy: 1.175224.
Training network. lr: 0.000168. clip: 0.067136
Iteration 10750: Policy loss: 0.121719. Value loss: 1.281521. Entropy: 1.221220.
Iteration 10751: Policy loss: 0.103006. Value loss: 0.924586. Entropy: 1.221171.
Iteration 10752: Policy loss: 0.106110. Value loss: 0.725658. Entropy: 1.225460.
Training network. lr: 0.000167. clip: 0.066979
Iteration 10753: Policy loss: 0.111909. Value loss: 1.262397. Entropy: 1.1830

episode: 1516   score: 26300.0  epsilon: 1.0    steps: 996  evaluation reward: 33926.0
Training network. lr: 0.000167. clip: 0.066832
Iteration 10819: Policy loss: 0.112415. Value loss: 0.819266. Entropy: 1.216084.
Iteration 10820: Policy loss: 0.076588. Value loss: 0.438205. Entropy: 1.217326.
Iteration 10821: Policy loss: 0.068574. Value loss: 0.324420. Entropy: 1.213858.
Training network. lr: 0.000167. clip: 0.066832
Iteration 10822: Policy loss: 0.044269. Value loss: 0.409708. Entropy: 1.216145.
Iteration 10823: Policy loss: 0.072892. Value loss: 0.275326. Entropy: 1.213866.
Iteration 10824: Policy loss: 0.035605. Value loss: 0.177120. Entropy: 1.217160.
Training network. lr: 0.000167. clip: 0.066832
Iteration 10825: Policy loss: -0.150785. Value loss: 0.957827. Entropy: 1.244473.
Iteration 10826: Policy loss: -0.131225. Value loss: 0.673837. Entropy: 1.226099.
Iteration 10827: Policy loss: -0.169635. Value loss: 0.497239. Entropy: 1.232493.
Training network. lr: 0.000167. clip: 0.

Iteration 10894: Policy loss: -0.016627. Value loss: 0.485751. Entropy: 1.275940.
Iteration 10895: Policy loss: -0.077307. Value loss: 0.381982. Entropy: 1.279501.
Iteration 10896: Policy loss: -0.044958. Value loss: 0.227253. Entropy: 1.268684.
episode: 1526   score: 25200.0  epsilon: 1.0    steps: 148  evaluation reward: 33824.0
Training network. lr: 0.000167. clip: 0.066675
Iteration 10897: Policy loss: -0.007686. Value loss: 0.640309. Entropy: 1.212364.
Iteration 10898: Policy loss: -0.036412. Value loss: 0.389779. Entropy: 1.209481.
Iteration 10899: Policy loss: 0.002892. Value loss: 0.253839. Entropy: 1.217066.
Training network. lr: 0.000167. clip: 0.066675
Iteration 10900: Policy loss: 0.035373. Value loss: 0.591057. Entropy: 1.234906.
Iteration 10901: Policy loss: 0.007931. Value loss: 0.385750. Entropy: 1.235793.
Iteration 10902: Policy loss: 0.024826. Value loss: 0.284115. Entropy: 1.223148.
Training network. lr: 0.000166. clip: 0.066518
Iteration 10903: Policy loss: -0.27776

Iteration 10970: Policy loss: -0.006540. Value loss: 0.559064. Entropy: 1.160817.
Iteration 10971: Policy loss: 0.007303. Value loss: 0.375657. Entropy: 1.154223.
Training network. lr: 0.000166. clip: 0.066371
Iteration 10972: Policy loss: -0.054393. Value loss: 1.110184. Entropy: 1.179626.
Iteration 10973: Policy loss: -0.050212. Value loss: 0.684148. Entropy: 1.191336.
Iteration 10974: Policy loss: -0.113193. Value loss: 0.550141. Entropy: 1.188691.
episode: 1536   score: 39300.0  epsilon: 1.0    steps: 222  evaluation reward: 33865.0
Training network. lr: 0.000166. clip: 0.066371
Iteration 10975: Policy loss: 0.053648. Value loss: 0.555369. Entropy: 1.191997.
Iteration 10976: Policy loss: 0.085145. Value loss: 0.351284. Entropy: 1.193732.
Iteration 10977: Policy loss: 0.083956. Value loss: 0.274291. Entropy: 1.189847.
Training network. lr: 0.000166. clip: 0.066371
Iteration 10978: Policy loss: -0.317480. Value loss: 1.526516. Entropy: 1.189166.
Iteration 10979: Policy loss: -0.32637

Iteration 11047: Policy loss: 0.657141. Value loss: 1.525466. Entropy: 1.214564.
Iteration 11048: Policy loss: 0.590288. Value loss: 0.534573. Entropy: 1.213850.
Iteration 11049: Policy loss: 0.512073. Value loss: 0.366418. Entropy: 1.208997.
Training network. lr: 0.000166. clip: 0.066214
Iteration 11050: Policy loss: 0.277843. Value loss: 1.004937. Entropy: 1.217761.
Iteration 11051: Policy loss: 0.265890. Value loss: 0.653855. Entropy: 1.219725.
Iteration 11052: Policy loss: 0.223124. Value loss: 0.507403. Entropy: 1.229434.
episode: 1544   score: 23900.0  epsilon: 1.0    steps: 645  evaluation reward: 34540.0
Training network. lr: 0.000165. clip: 0.066057
Iteration 11053: Policy loss: 0.154723. Value loss: 0.560229. Entropy: 1.242004.
Iteration 11054: Policy loss: 0.196510. Value loss: 0.301953. Entropy: 1.241892.
Iteration 11055: Policy loss: 0.164855. Value loss: 0.195909. Entropy: 1.241521.
Training network. lr: 0.000165. clip: 0.066057
Iteration 11056: Policy loss: 0.048072. Val

Iteration 11120: Policy loss: 0.080990. Value loss: 0.208893. Entropy: 1.257563.
Iteration 11121: Policy loss: 0.038565. Value loss: 0.193615. Entropy: 1.257584.
episode: 1557   score: 38700.0  epsilon: 1.0    steps: 60  evaluation reward: 33685.0
Training network. lr: 0.000165. clip: 0.065910
Iteration 11122: Policy loss: -0.135588. Value loss: 0.924678. Entropy: 1.240356.
Iteration 11123: Policy loss: -0.167608. Value loss: 0.526902. Entropy: 1.230604.
Iteration 11124: Policy loss: -0.188238. Value loss: 0.381511. Entropy: 1.243062.
Training network. lr: 0.000165. clip: 0.065910
Iteration 11125: Policy loss: 0.147309. Value loss: 0.293367. Entropy: 1.248946.
Iteration 11126: Policy loss: 0.118994. Value loss: 0.161175. Entropy: 1.251752.
Iteration 11127: Policy loss: 0.120617. Value loss: 0.121288. Entropy: 1.253561.
episode: 1558   score: 34200.0  epsilon: 1.0    steps: 293  evaluation reward: 33199.0
Training network. lr: 0.000165. clip: 0.065910
Iteration 11128: Policy loss: 0.176

Iteration 11196: Policy loss: -0.016233. Value loss: 0.255608. Entropy: 1.231569.
Training network. lr: 0.000164. clip: 0.065753
Iteration 11197: Policy loss: -0.039066. Value loss: 1.608010. Entropy: 1.194589.
Iteration 11198: Policy loss: -0.020415. Value loss: 0.959761. Entropy: 1.169153.
Iteration 11199: Policy loss: -0.059874. Value loss: 0.980536. Entropy: 1.188946.
Training network. lr: 0.000164. clip: 0.065753
Iteration 11200: Policy loss: -0.012737. Value loss: 0.504364. Entropy: 1.200713.
Iteration 11201: Policy loss: -0.078713. Value loss: 0.337392. Entropy: 1.206248.
Iteration 11202: Policy loss: -0.062043. Value loss: 0.243128. Entropy: 1.201480.
Training network. lr: 0.000164. clip: 0.065597
Iteration 11203: Policy loss: -0.166659. Value loss: 1.122989. Entropy: 1.187979.
Iteration 11204: Policy loss: -0.145034. Value loss: 0.774531. Entropy: 1.197690.
Iteration 11205: Policy loss: -0.149466. Value loss: 0.529935. Entropy: 1.198930.
Training network. lr: 0.000164. clip: 0

Iteration 11272: Policy loss: 0.078335. Value loss: 1.063662. Entropy: 1.184534.
Iteration 11273: Policy loss: 0.028952. Value loss: 0.821521. Entropy: 1.175907.
Iteration 11274: Policy loss: 0.008688. Value loss: 0.461282. Entropy: 1.174200.
episode: 1576   score: 26000.0  epsilon: 1.0    steps: 642  evaluation reward: 32044.0
Training network. lr: 0.000164. clip: 0.065449
Iteration 11275: Policy loss: 0.029310. Value loss: 0.778401. Entropy: 1.213088.
Iteration 11276: Policy loss: 0.057192. Value loss: 0.409078. Entropy: 1.217384.
Iteration 11277: Policy loss: 0.037694. Value loss: 0.356364. Entropy: 1.209821.
Training network. lr: 0.000164. clip: 0.065449
Iteration 11278: Policy loss: 0.139038. Value loss: 1.516648. Entropy: 1.195621.
Iteration 11279: Policy loss: 0.127787. Value loss: 0.887944. Entropy: 1.190323.
Iteration 11280: Policy loss: 0.117440. Value loss: 0.461796. Entropy: 1.191051.
Training network. lr: 0.000164. clip: 0.065449
Iteration 11281: Policy loss: 0.058989. Val

Iteration 11348: Policy loss: -0.180294. Value loss: 0.827103. Entropy: 1.138270.
Iteration 11349: Policy loss: -0.165158. Value loss: 0.733178. Entropy: 1.148275.
episode: 1586   score: 20200.0  epsilon: 1.0    steps: 397  evaluation reward: 32173.0
Training network. lr: 0.000163. clip: 0.065293
Iteration 11350: Policy loss: 0.071018. Value loss: 0.424505. Entropy: 1.175527.
Iteration 11351: Policy loss: 0.130583. Value loss: 0.222905. Entropy: 1.172945.
Iteration 11352: Policy loss: 0.078945. Value loss: 0.151656. Entropy: 1.171979.
episode: 1587   score: 29400.0  epsilon: 1.0    steps: 134  evaluation reward: 32127.0
Training network. lr: 0.000163. clip: 0.065136
Iteration 11353: Policy loss: -0.225716. Value loss: 0.716978. Entropy: 1.183536.
Iteration 11354: Policy loss: -0.258506. Value loss: 0.341588. Entropy: 1.158344.
Iteration 11355: Policy loss: -0.238400. Value loss: 0.251767. Entropy: 1.169163.
Training network. lr: 0.000163. clip: 0.065136
Iteration 11356: Policy loss: -0

Iteration 11423: Policy loss: 0.185979. Value loss: 0.183877. Entropy: 1.191439.
Iteration 11424: Policy loss: 0.171735. Value loss: 0.156116. Entropy: 1.199234.
Training network. lr: 0.000162. clip: 0.064988
Iteration 11425: Policy loss: -0.166669. Value loss: 1.005499. Entropy: 1.203243.
Iteration 11426: Policy loss: -0.199209. Value loss: 0.758955. Entropy: 1.198278.
Iteration 11427: Policy loss: -0.182810. Value loss: 0.440785. Entropy: 1.186739.
Training network. lr: 0.000162. clip: 0.064988
Iteration 11428: Policy loss: -0.218313. Value loss: 1.055046. Entropy: 1.215512.
Iteration 11429: Policy loss: -0.239994. Value loss: 0.618720. Entropy: 1.212262.
Iteration 11430: Policy loss: -0.238296. Value loss: 0.487897. Entropy: 1.210354.
Training network. lr: 0.000162. clip: 0.064988
Iteration 11431: Policy loss: -0.303083. Value loss: 1.077780. Entropy: 1.194674.
Iteration 11432: Policy loss: -0.325984. Value loss: 0.727515. Entropy: 1.186848.
Iteration 11433: Policy loss: -0.304362. 

Iteration 11499: Policy loss: 0.046449. Value loss: 0.510110. Entropy: 1.164788.
Training network. lr: 0.000162. clip: 0.064832
Iteration 11500: Policy loss: -0.051088. Value loss: 1.159001. Entropy: 1.122911.
Iteration 11501: Policy loss: -0.014790. Value loss: 0.791312. Entropy: 1.114300.
Iteration 11502: Policy loss: -0.044894. Value loss: 0.659462. Entropy: 1.121482.
Training network. lr: 0.000162. clip: 0.064675
Iteration 11503: Policy loss: 0.192974. Value loss: 0.615154. Entropy: 1.124318.
Iteration 11504: Policy loss: 0.180101. Value loss: 0.410541. Entropy: 1.133636.
Iteration 11505: Policy loss: 0.190302. Value loss: 0.321066. Entropy: 1.133626.
Training network. lr: 0.000162. clip: 0.064675
Iteration 11506: Policy loss: 0.051186. Value loss: 0.668678. Entropy: 1.159855.
Iteration 11507: Policy loss: 0.035656. Value loss: 0.335464. Entropy: 1.166549.
Iteration 11508: Policy loss: 0.076443. Value loss: 0.233183. Entropy: 1.163264.
episode: 1606   score: 27900.0  epsilon: 1.0  

Training network. lr: 0.000161. clip: 0.064528
Iteration 11575: Policy loss: -0.032438. Value loss: 0.844880. Entropy: 1.179120.
Iteration 11576: Policy loss: 0.016004. Value loss: 0.243299. Entropy: 1.177587.
Iteration 11577: Policy loss: -0.031209. Value loss: 0.169225. Entropy: 1.179034.
episode: 1616   score: 26600.0  epsilon: 1.0    steps: 813  evaluation reward: 32440.0
Training network. lr: 0.000161. clip: 0.064528
Iteration 11578: Policy loss: 0.107325. Value loss: 0.866166. Entropy: 1.185091.
Iteration 11579: Policy loss: 0.068199. Value loss: 0.399412. Entropy: 1.194352.
Iteration 11580: Policy loss: 0.085464. Value loss: 0.261175. Entropy: 1.185221.
Training network. lr: 0.000161. clip: 0.064528
Iteration 11581: Policy loss: 0.197744. Value loss: 0.590577. Entropy: 1.169365.
Iteration 11582: Policy loss: 0.218797. Value loss: 0.333713. Entropy: 1.167735.
Iteration 11583: Policy loss: 0.197222. Value loss: 0.253414. Entropy: 1.171552.
Training network. lr: 0.000161. clip: 0.0

Iteration 11650: Policy loss: -0.062886. Value loss: 0.487484. Entropy: 1.118131.
Iteration 11651: Policy loss: -0.052876. Value loss: 0.377445. Entropy: 1.121396.
Iteration 11652: Policy loss: -0.033092. Value loss: 0.246730. Entropy: 1.130311.
Training network. lr: 0.000161. clip: 0.064214
Iteration 11653: Policy loss: 0.048631. Value loss: 0.939182. Entropy: 1.096118.
Iteration 11654: Policy loss: 0.003729. Value loss: 0.816788. Entropy: 1.093280.
Iteration 11655: Policy loss: 0.020001. Value loss: 0.606100. Entropy: 1.099013.
Training network. lr: 0.000161. clip: 0.064214
Iteration 11656: Policy loss: 0.489769. Value loss: 2.085103. Entropy: 1.104529.
Iteration 11657: Policy loss: 0.460324. Value loss: 1.290356. Entropy: 1.100136.
Iteration 11658: Policy loss: 0.447049. Value loss: 0.948991. Entropy: 1.106347.
episode: 1626   score: 36200.0  epsilon: 1.0    steps: 1005  evaluation reward: 31662.0
Training network. lr: 0.000161. clip: 0.064214
Iteration 11659: Policy loss: -0.072168

Iteration 11725: Policy loss: -0.094302. Value loss: 0.379562. Entropy: 1.169751.
Iteration 11726: Policy loss: -0.049977. Value loss: 0.208426. Entropy: 1.170309.
Iteration 11727: Policy loss: -0.087681. Value loss: 0.154741. Entropy: 1.171589.
episode: 1637   score: 25300.0  epsilon: 1.0    steps: 205  evaluation reward: 31485.0
Training network. lr: 0.000160. clip: 0.064067
Iteration 11728: Policy loss: -0.188920. Value loss: 0.453421. Entropy: 1.163119.
Iteration 11729: Policy loss: -0.167315. Value loss: 0.226917. Entropy: 1.165880.
Iteration 11730: Policy loss: -0.168367. Value loss: 0.155620. Entropy: 1.162354.
Training network. lr: 0.000160. clip: 0.064067
Iteration 11731: Policy loss: 0.200831. Value loss: 0.348269. Entropy: 1.188492.
Iteration 11732: Policy loss: 0.212448. Value loss: 0.221822. Entropy: 1.181627.
Iteration 11733: Policy loss: 0.196320. Value loss: 0.170533. Entropy: 1.192267.
episode: 1638   score: 30200.0  epsilon: 1.0    steps: 716  evaluation reward: 31502

episode: 1648   score: 24900.0  epsilon: 1.0    steps: 85  evaluation reward: 30772.0
Training network. lr: 0.000160. clip: 0.063910
Iteration 11800: Policy loss: 0.019579. Value loss: 0.521381. Entropy: 1.198307.
Iteration 11801: Policy loss: 0.009283. Value loss: 0.300041. Entropy: 1.207773.
Iteration 11802: Policy loss: 0.026580. Value loss: 0.268533. Entropy: 1.206039.
Training network. lr: 0.000159. clip: 0.063753
Iteration 11803: Policy loss: 0.222252. Value loss: 1.652861. Entropy: 1.172745.
Iteration 11804: Policy loss: 0.280843. Value loss: 0.955198. Entropy: 1.197088.
Iteration 11805: Policy loss: 0.195549. Value loss: 0.921705. Entropy: 1.194780.
Training network. lr: 0.000159. clip: 0.063753
Iteration 11806: Policy loss: 0.096741. Value loss: 0.525041. Entropy: 1.203512.
Iteration 11807: Policy loss: 0.088603. Value loss: 0.292994. Entropy: 1.203484.
Iteration 11808: Policy loss: 0.118606. Value loss: 0.259349. Entropy: 1.203796.
Training network. lr: 0.000159. clip: 0.0637

Iteration 11875: Policy loss: 0.227843. Value loss: 0.471368. Entropy: 1.183813.
Iteration 11876: Policy loss: 0.222352. Value loss: 0.298023. Entropy: 1.187204.
Iteration 11877: Policy loss: 0.225037. Value loss: 0.249633. Entropy: 1.189957.
Training network. lr: 0.000159. clip: 0.063606
Iteration 11878: Policy loss: 0.173888. Value loss: 1.344820. Entropy: 1.144990.
Iteration 11879: Policy loss: 0.180270. Value loss: 0.798590. Entropy: 1.144711.
Iteration 11880: Policy loss: 0.168638. Value loss: 0.526281. Entropy: 1.146983.
Training network. lr: 0.000159. clip: 0.063606
Iteration 11881: Policy loss: 0.108507. Value loss: 0.740512. Entropy: 1.205148.
Iteration 11882: Policy loss: 0.083151. Value loss: 0.444935. Entropy: 1.208037.
Iteration 11883: Policy loss: 0.082358. Value loss: 0.354945. Entropy: 1.208545.
Training network. lr: 0.000159. clip: 0.063606
Iteration 11884: Policy loss: 0.064365. Value loss: 0.788907. Entropy: 1.155697.
Iteration 11885: Policy loss: 0.059860. Value los

Iteration 11950: Policy loss: 0.091402. Value loss: 0.112264. Entropy: 1.231946.
Iteration 11951: Policy loss: 0.085563. Value loss: 0.051724. Entropy: 1.230754.
Iteration 11952: Policy loss: 0.098154. Value loss: 0.045949. Entropy: 1.228298.
Training network. lr: 0.000158. clip: 0.063293
Iteration 11953: Policy loss: -0.178032. Value loss: 1.256644. Entropy: 1.201666.
Iteration 11954: Policy loss: -0.112530. Value loss: 0.606662. Entropy: 1.202136.
Iteration 11955: Policy loss: -0.131457. Value loss: 0.499145. Entropy: 1.199605.
Training network. lr: 0.000158. clip: 0.063293
Iteration 11956: Policy loss: 0.245278. Value loss: 1.555115. Entropy: 1.229123.
Iteration 11957: Policy loss: 0.253418. Value loss: 0.872230. Entropy: 1.221680.
Iteration 11958: Policy loss: 0.234923. Value loss: 0.584829. Entropy: 1.223501.
Training network. lr: 0.000158. clip: 0.063293
Iteration 11959: Policy loss: 0.178318. Value loss: 0.771285. Entropy: 1.224494.
Iteration 11960: Policy loss: 0.149711. Value 

Training network. lr: 0.000158. clip: 0.063145
Iteration 12028: Policy loss: 0.078059. Value loss: 1.158112. Entropy: 1.233235.
Iteration 12029: Policy loss: 0.154991. Value loss: 0.489710. Entropy: 1.237098.
Iteration 12030: Policy loss: 0.097317. Value loss: 0.397469. Entropy: 1.234852.
Training network. lr: 0.000158. clip: 0.063145
Iteration 12031: Policy loss: -0.161884. Value loss: 1.514290. Entropy: 1.205196.
Iteration 12032: Policy loss: -0.154378. Value loss: 0.862534. Entropy: 1.205387.
Iteration 12033: Policy loss: -0.169402. Value loss: 0.686596. Entropy: 1.198770.
Training network. lr: 0.000158. clip: 0.063145
Iteration 12034: Policy loss: 0.007285. Value loss: 0.556820. Entropy: 1.177031.
Iteration 12035: Policy loss: 0.012949. Value loss: 0.315852. Entropy: 1.176376.
Iteration 12036: Policy loss: -0.036411. Value loss: 0.250197. Entropy: 1.181549.
Training network. lr: 0.000158. clip: 0.063145
Iteration 12037: Policy loss: 0.211504. Value loss: 0.967209. Entropy: 1.163037

Iteration 12102: Policy loss: -0.268012. Value loss: 0.597361. Entropy: 1.140085.
Training network. lr: 0.000157. clip: 0.062832
Iteration 12103: Policy loss: 0.214238. Value loss: 0.255469. Entropy: 1.137806.
Iteration 12104: Policy loss: 0.210542. Value loss: 0.149545. Entropy: 1.150523.
Iteration 12105: Policy loss: 0.211175. Value loss: 0.111314. Entropy: 1.147946.
episode: 1689   score: 28800.0  epsilon: 1.0    steps: 134  evaluation reward: 30515.0
Training network. lr: 0.000157. clip: 0.062832
Iteration 12106: Policy loss: -0.087475. Value loss: 0.406792. Entropy: 1.175410.
Iteration 12107: Policy loss: -0.105547. Value loss: 0.243050. Entropy: 1.164720.
Iteration 12108: Policy loss: -0.099312. Value loss: 0.177472. Entropy: 1.166419.
Training network. lr: 0.000157. clip: 0.062832
Iteration 12109: Policy loss: 0.119753. Value loss: 0.704764. Entropy: 1.165161.
Iteration 12110: Policy loss: 0.123448. Value loss: 0.391616. Entropy: 1.169465.
Iteration 12111: Policy loss: 0.075390.

Training network. lr: 0.000157. clip: 0.062684
Iteration 12178: Policy loss: -0.277642. Value loss: 0.936767. Entropy: 1.177263.
Iteration 12179: Policy loss: -0.291514. Value loss: 0.519406. Entropy: 1.186841.
Iteration 12180: Policy loss: -0.267989. Value loss: 0.335705. Entropy: 1.172473.
episode: 1699   score: 34800.0  epsilon: 1.0    steps: 606  evaluation reward: 29813.0
Training network. lr: 0.000157. clip: 0.062684
Iteration 12181: Policy loss: 0.156726. Value loss: 0.576531. Entropy: 1.205597.
Iteration 12182: Policy loss: 0.170467. Value loss: 0.290871. Entropy: 1.205657.
Iteration 12183: Policy loss: 0.140343. Value loss: 0.191854. Entropy: 1.207847.
Training network. lr: 0.000157. clip: 0.062684
Iteration 12184: Policy loss: -0.000423. Value loss: 0.471917. Entropy: 1.217633.
Iteration 12185: Policy loss: -0.003747. Value loss: 0.401761. Entropy: 1.212767.
Iteration 12186: Policy loss: 0.026754. Value loss: 0.238285. Entropy: 1.225388.
Training network. lr: 0.000157. clip: 

Iteration 12254: Policy loss: -0.118680. Value loss: 0.651812. Entropy: 1.235570.
Iteration 12255: Policy loss: -0.115655. Value loss: 0.378623. Entropy: 1.239559.
Training network. lr: 0.000156. clip: 0.062371
Iteration 12256: Policy loss: 0.026407. Value loss: 1.236202. Entropy: 1.253531.
Iteration 12257: Policy loss: 0.011072. Value loss: 0.737076. Entropy: 1.249810.
Iteration 12258: Policy loss: 0.022339. Value loss: 0.626825. Entropy: 1.246351.
Training network. lr: 0.000156. clip: 0.062371
Iteration 12259: Policy loss: 0.038835. Value loss: 0.795983. Entropy: 1.237414.
Iteration 12260: Policy loss: 0.021831. Value loss: 0.418647. Entropy: 1.229448.
Iteration 12261: Policy loss: -0.013809. Value loss: 0.350453. Entropy: 1.237761.
Training network. lr: 0.000156. clip: 0.062371
Iteration 12262: Policy loss: -0.244440. Value loss: 1.198066. Entropy: 1.244053.
Iteration 12263: Policy loss: -0.267328. Value loss: 0.568367. Entropy: 1.239974.
Iteration 12264: Policy loss: -0.214989. Val

Iteration 12332: Policy loss: 0.143445. Value loss: 0.591936. Entropy: 1.217285.
Iteration 12333: Policy loss: 0.066067. Value loss: 0.510067. Entropy: 1.210899.
episode: 1716   score: 21700.0  epsilon: 1.0    steps: 42  evaluation reward: 30583.0
Training network. lr: 0.000156. clip: 0.062224
Iteration 12334: Policy loss: 0.346311. Value loss: 0.582425. Entropy: 1.202118.
Iteration 12335: Policy loss: 0.340868. Value loss: 0.357701. Entropy: 1.198250.
Iteration 12336: Policy loss: 0.343913. Value loss: 0.257904. Entropy: 1.205422.
episode: 1717   score: 34300.0  epsilon: 1.0    steps: 268  evaluation reward: 30529.0
Training network. lr: 0.000156. clip: 0.062224
Iteration 12337: Policy loss: -0.197855. Value loss: 0.547431. Entropy: 1.259921.
Iteration 12338: Policy loss: -0.217647. Value loss: 0.307313. Entropy: 1.254806.
Iteration 12339: Policy loss: -0.240891. Value loss: 0.254847. Entropy: 1.256493.
Training network. lr: 0.000156. clip: 0.062224
Iteration 12340: Policy loss: -0.04

Training network. lr: 0.000155. clip: 0.061910
Iteration 12409: Policy loss: -0.089777. Value loss: 0.617761. Entropy: 1.230816.
Iteration 12410: Policy loss: -0.089217. Value loss: 0.359316. Entropy: 1.227497.
Iteration 12411: Policy loss: -0.075571. Value loss: 0.304354. Entropy: 1.224490.
Training network. lr: 0.000155. clip: 0.061910
Iteration 12412: Policy loss: 0.066879. Value loss: 0.619782. Entropy: 1.254959.
Iteration 12413: Policy loss: 0.069425. Value loss: 0.404940. Entropy: 1.249796.
Iteration 12414: Policy loss: 0.078852. Value loss: 0.293433. Entropy: 1.245229.
Training network. lr: 0.000155. clip: 0.061910
Iteration 12415: Policy loss: 0.044353. Value loss: 0.948404. Entropy: 1.221772.
Iteration 12416: Policy loss: 0.116809. Value loss: 0.449613. Entropy: 1.228970.
Iteration 12417: Policy loss: 0.111673. Value loss: 0.380259. Entropy: 1.225784.
episode: 1726   score: 26800.0  epsilon: 1.0    steps: 597  evaluation reward: 31384.0
Training network. lr: 0.000155. clip: 0.

Training network. lr: 0.000154. clip: 0.061763
Iteration 12487: Policy loss: 0.294627. Value loss: 0.634176. Entropy: 1.259394.
Iteration 12488: Policy loss: 0.354744. Value loss: 0.364832. Entropy: 1.245970.
Iteration 12489: Policy loss: 0.294622. Value loss: 0.274697. Entropy: 1.251588.
episode: 1734   score: 60500.0  epsilon: 1.0    steps: 653  evaluation reward: 31894.0
episode: 1735   score: 43400.0  epsilon: 1.0    steps: 962  evaluation reward: 32045.0
Training network. lr: 0.000154. clip: 0.061763
Iteration 12490: Policy loss: -0.081610. Value loss: 0.823757. Entropy: 1.240540.
Iteration 12491: Policy loss: -0.087881. Value loss: 0.598592. Entropy: 1.239887.
Iteration 12492: Policy loss: -0.100299. Value loss: 0.408117. Entropy: 1.243093.
Training network. lr: 0.000154. clip: 0.061763
Iteration 12493: Policy loss: 0.386390. Value loss: 0.339258. Entropy: 1.229603.
Iteration 12494: Policy loss: 0.380470. Value loss: 0.178819. Entropy: 1.233750.
Iteration 12495: Policy loss: 0.39

Training network. lr: 0.000154. clip: 0.061449
Iteration 12562: Policy loss: 0.222942. Value loss: 0.558785. Entropy: 1.242679.
Iteration 12563: Policy loss: 0.239822. Value loss: 0.433709. Entropy: 1.246768.
Iteration 12564: Policy loss: 0.225014. Value loss: 0.221925. Entropy: 1.251576.
Training network. lr: 0.000154. clip: 0.061449
Iteration 12565: Policy loss: -0.152924. Value loss: 1.210722. Entropy: 1.236363.
Iteration 12566: Policy loss: -0.141498. Value loss: 0.556207. Entropy: 1.235801.
Iteration 12567: Policy loss: -0.150342. Value loss: 0.354767. Entropy: 1.233487.
episode: 1745   score: 22100.0  epsilon: 1.0    steps: 350  evaluation reward: 31919.0
Training network. lr: 0.000154. clip: 0.061449
Iteration 12568: Policy loss: -0.090625. Value loss: 1.436526. Entropy: 1.250916.
Iteration 12569: Policy loss: -0.051695. Value loss: 0.743578. Entropy: 1.255129.
Iteration 12570: Policy loss: -0.060940. Value loss: 0.536446. Entropy: 1.253777.
Training network. lr: 0.000154. clip:

Iteration 12639: Policy loss: 0.356887. Value loss: 0.495006. Entropy: 1.208283.
Training network. lr: 0.000153. clip: 0.061302
Iteration 12640: Policy loss: 0.035465. Value loss: 0.374277. Entropy: 1.193768.
Iteration 12641: Policy loss: 0.029914. Value loss: 0.208547. Entropy: 1.184646.
Iteration 12642: Policy loss: 0.017944. Value loss: 0.156066. Entropy: 1.194725.
Training network. lr: 0.000153. clip: 0.061302
Iteration 12643: Policy loss: 0.146862. Value loss: 0.937096. Entropy: 1.183036.
Iteration 12644: Policy loss: 0.130466. Value loss: 0.591141. Entropy: 1.179904.
Iteration 12645: Policy loss: 0.130657. Value loss: 0.460038. Entropy: 1.186058.
Training network. lr: 0.000153. clip: 0.061302
Iteration 12646: Policy loss: -0.291468. Value loss: 1.165290. Entropy: 1.170733.
Iteration 12647: Policy loss: -0.302899. Value loss: 0.830838. Entropy: 1.176086.
Iteration 12648: Policy loss: -0.316221. Value loss: 0.743904. Entropy: 1.170149.
episode: 1753   score: 28400.0  epsilon: 1.0  

Iteration 12716: Policy loss: 0.498265. Value loss: 0.464555. Entropy: 1.245396.
Iteration 12717: Policy loss: 0.504492. Value loss: 0.412836. Entropy: 1.243396.
Training network. lr: 0.000152. clip: 0.060989
Iteration 12718: Policy loss: 0.187970. Value loss: 1.405871. Entropy: 1.172278.
Iteration 12719: Policy loss: 0.202933. Value loss: 0.813271. Entropy: 1.183766.
Iteration 12720: Policy loss: 0.159408. Value loss: 0.661362. Entropy: 1.174781.
episode: 1762   score: 28700.0  epsilon: 1.0    steps: 612  evaluation reward: 34176.0
Training network. lr: 0.000152. clip: 0.060989
Iteration 12721: Policy loss: 0.203839. Value loss: 0.781549. Entropy: 1.188866.
Iteration 12722: Policy loss: 0.196941. Value loss: 0.491071. Entropy: 1.190236.
Iteration 12723: Policy loss: 0.246725. Value loss: 0.336320. Entropy: 1.185368.
Training network. lr: 0.000152. clip: 0.060989
Iteration 12724: Policy loss: -0.128131. Value loss: 1.090540. Entropy: 1.231243.
Iteration 12725: Policy loss: -0.144290. V

Iteration 12791: Policy loss: 0.066227. Value loss: 0.670744. Entropy: 1.265377.
Iteration 12792: Policy loss: 0.025987. Value loss: 0.627019. Entropy: 1.269606.
Training network. lr: 0.000152. clip: 0.060841
Iteration 12793: Policy loss: 0.080419. Value loss: 1.300164. Entropy: 1.251708.
Iteration 12794: Policy loss: 0.054960. Value loss: 0.699523. Entropy: 1.250229.
Iteration 12795: Policy loss: 0.046710. Value loss: 0.540402. Entropy: 1.248441.
Training network. lr: 0.000152. clip: 0.060841
Iteration 12796: Policy loss: 0.153923. Value loss: 1.066431. Entropy: 1.228152.
Iteration 12797: Policy loss: 0.140243. Value loss: 0.634559. Entropy: 1.222843.
Iteration 12798: Policy loss: 0.172234. Value loss: 0.441128. Entropy: 1.222897.
Training network. lr: 0.000152. clip: 0.060841
Iteration 12799: Policy loss: 0.319834. Value loss: 0.958139. Entropy: 1.249865.
Iteration 12800: Policy loss: 0.299232. Value loss: 0.374015. Entropy: 1.243958.
Iteration 12801: Policy loss: 0.274862. Value los

Training network. lr: 0.000151. clip: 0.060528
Iteration 12868: Policy loss: -0.079802. Value loss: 1.284686. Entropy: 1.175207.
Iteration 12869: Policy loss: -0.045793. Value loss: 0.598411. Entropy: 1.179945.
Iteration 12870: Policy loss: -0.099774. Value loss: 0.456005. Entropy: 1.169075.
Training network. lr: 0.000151. clip: 0.060528
Iteration 12871: Policy loss: -0.167258. Value loss: 0.845087. Entropy: 1.252269.
Iteration 12872: Policy loss: -0.202897. Value loss: 0.508701. Entropy: 1.241304.
Iteration 12873: Policy loss: -0.188419. Value loss: 0.418645. Entropy: 1.245332.
episode: 1782   score: 42400.0  epsilon: 1.0    steps: 770  evaluation reward: 35009.0
Training network. lr: 0.000151. clip: 0.060528
Iteration 12874: Policy loss: 0.157416. Value loss: 0.556393. Entropy: 1.224926.
Iteration 12875: Policy loss: 0.134816. Value loss: 0.300235. Entropy: 1.228089.
Iteration 12876: Policy loss: 0.163718. Value loss: 0.263630. Entropy: 1.229863.
Training network. lr: 0.000151. clip:

Iteration 12945: Policy loss: -0.207759. Value loss: 0.444672. Entropy: 1.236536.
episode: 1790   score: 44300.0  epsilon: 1.0    steps: 175  evaluation reward: 36157.0
episode: 1791   score: 21400.0  epsilon: 1.0    steps: 673  evaluation reward: 36155.0
Training network. lr: 0.000151. clip: 0.060380
Iteration 12946: Policy loss: 0.043471. Value loss: 0.681549. Entropy: 1.247744.
Iteration 12947: Policy loss: 0.041393. Value loss: 0.385481. Entropy: 1.258288.
Iteration 12948: Policy loss: 0.054508. Value loss: 0.301123. Entropy: 1.258319.
Training network. lr: 0.000151. clip: 0.060380
Iteration 12949: Policy loss: 0.043088. Value loss: 0.302216. Entropy: 1.206109.
Iteration 12950: Policy loss: 0.033670. Value loss: 0.181020. Entropy: 1.211875.
Iteration 12951: Policy loss: 0.029773. Value loss: 0.130032. Entropy: 1.210614.
Training network. lr: 0.000151. clip: 0.060224
Iteration 12952: Policy loss: 0.011458. Value loss: 0.432564. Entropy: 1.265176.
Iteration 12953: Policy loss: -0.027

Iteration 13020: Policy loss: -0.099322. Value loss: 0.398740. Entropy: 1.239102.
Training network. lr: 0.000150. clip: 0.060067
Iteration 13021: Policy loss: 0.220654. Value loss: 0.719624. Entropy: 1.235382.
Iteration 13022: Policy loss: 0.231144. Value loss: 0.378907. Entropy: 1.228126.
Iteration 13023: Policy loss: 0.232743. Value loss: 0.264118. Entropy: 1.231848.
now time :  2019-02-26 22:29:50.244973
episode: 1801   score: 45000.0  epsilon: 1.0    steps: 250  evaluation reward: 36959.0
Training network. lr: 0.000150. clip: 0.060067
Iteration 13024: Policy loss: 0.100729. Value loss: 0.872347. Entropy: 1.274046.
Iteration 13025: Policy loss: 0.076148. Value loss: 0.510060. Entropy: 1.266707.
Iteration 13026: Policy loss: 0.050001. Value loss: 0.375786. Entropy: 1.274805.
Training network. lr: 0.000150. clip: 0.060067
Iteration 13027: Policy loss: -0.163014. Value loss: 0.698835. Entropy: 1.248781.
Iteration 13028: Policy loss: -0.174541. Value loss: 0.451124. Entropy: 1.241009.
I

Iteration 13095: Policy loss: -0.065092. Value loss: 0.537169. Entropy: 1.264794.
episode: 1811   score: 46600.0  epsilon: 1.0    steps: 23  evaluation reward: 36327.0
Training network. lr: 0.000150. clip: 0.059920
Iteration 13096: Policy loss: 0.211952. Value loss: 0.641842. Entropy: 1.280342.
Iteration 13097: Policy loss: 0.226179. Value loss: 0.399976. Entropy: 1.273317.
Iteration 13098: Policy loss: 0.210858. Value loss: 0.311764. Entropy: 1.275433.
Training network. lr: 0.000150. clip: 0.059920
Iteration 13099: Policy loss: 0.229532. Value loss: 1.039712. Entropy: 1.246367.
Iteration 13100: Policy loss: 0.241718. Value loss: 0.599404. Entropy: 1.238370.
Iteration 13101: Policy loss: 0.256986. Value loss: 0.431262. Entropy: 1.236645.
episode: 1812   score: 37700.0  epsilon: 1.0    steps: 598  evaluation reward: 36287.0
Training network. lr: 0.000149. clip: 0.059763
Iteration 13102: Policy loss: -0.152891. Value loss: 0.725610. Entropy: 1.250215.
Iteration 13103: Policy loss: -0.151

Iteration 13170: Policy loss: 0.047384. Value loss: 0.246446. Entropy: 1.248792.
Training network. lr: 0.000149. clip: 0.059606
Iteration 13171: Policy loss: 0.173652. Value loss: 1.232975. Entropy: 1.252779.
Iteration 13172: Policy loss: 0.148921. Value loss: 0.551498. Entropy: 1.247553.
Iteration 13173: Policy loss: 0.192977. Value loss: 0.328933. Entropy: 1.253341.
episode: 1822   score: 19100.0  epsilon: 1.0    steps: 844  evaluation reward: 35711.0
Training network. lr: 0.000149. clip: 0.059606
Iteration 13174: Policy loss: 0.084745. Value loss: 0.795105. Entropy: 1.288174.
Iteration 13175: Policy loss: 0.142042. Value loss: 0.451106. Entropy: 1.286164.
Iteration 13176: Policy loss: 0.097016. Value loss: 0.322895. Entropy: 1.285881.
Training network. lr: 0.000149. clip: 0.059606
Iteration 13177: Policy loss: -0.025267. Value loss: 1.023306. Entropy: 1.272223.
Iteration 13178: Policy loss: -0.108327. Value loss: 0.782741. Entropy: 1.269174.
Iteration 13179: Policy loss: -0.024666. 

Iteration 13245: Policy loss: -0.235058. Value loss: 0.334570. Entropy: 1.260571.
Training network. lr: 0.000149. clip: 0.059459
Iteration 13246: Policy loss: 0.191931. Value loss: 1.335708. Entropy: 1.258433.
Iteration 13247: Policy loss: 0.162422. Value loss: 0.866150. Entropy: 1.264380.
Iteration 13248: Policy loss: 0.209351. Value loss: 0.408054. Entropy: 1.261290.
Training network. lr: 0.000149. clip: 0.059459
Iteration 13249: Policy loss: -0.001572. Value loss: 0.624727. Entropy: 1.273301.
Iteration 13250: Policy loss: 0.046374. Value loss: 0.332756. Entropy: 1.275065.
Iteration 13251: Policy loss: 0.016320. Value loss: 0.268440. Entropy: 1.271544.
Training network. lr: 0.000148. clip: 0.059302
Iteration 13252: Policy loss: 0.206397. Value loss: 0.668578. Entropy: 1.217779.
Iteration 13253: Policy loss: 0.201565. Value loss: 0.366767. Entropy: 1.207981.
Iteration 13254: Policy loss: 0.212660. Value loss: 0.296482. Entropy: 1.214316.
Training network. lr: 0.000148. clip: 0.059302


Training network. lr: 0.000148. clip: 0.059145
Iteration 13321: Policy loss: -0.148240. Value loss: 0.874031. Entropy: 1.248520.
Iteration 13322: Policy loss: -0.164667. Value loss: 0.500344. Entropy: 1.251132.
Iteration 13323: Policy loss: -0.182176. Value loss: 0.368944. Entropy: 1.248006.
episode: 1843   score: 19900.0  epsilon: 1.0    steps: 356  evaluation reward: 33457.0
episode: 1844   score: 18000.0  epsilon: 1.0    steps: 1012  evaluation reward: 33439.0
Training network. lr: 0.000148. clip: 0.059145
Iteration 13324: Policy loss: -0.265509. Value loss: 0.878890. Entropy: 1.275516.
Iteration 13325: Policy loss: -0.270292. Value loss: 0.738266. Entropy: 1.277441.
Iteration 13326: Policy loss: -0.310039. Value loss: 0.514734. Entropy: 1.283022.
Training network. lr: 0.000148. clip: 0.059145
Iteration 13327: Policy loss: -0.052825. Value loss: 0.456584. Entropy: 1.273267.
Iteration 13328: Policy loss: -0.075232. Value loss: 0.337984. Entropy: 1.275969.
Iteration 13329: Policy loss

Iteration 13395: Policy loss: 0.015669. Value loss: 0.402680. Entropy: 1.252635.
episode: 1854   score: 20400.0  epsilon: 1.0    steps: 829  evaluation reward: 32949.0
Training network. lr: 0.000147. clip: 0.058998
Iteration 13396: Policy loss: 0.167201. Value loss: 0.613513. Entropy: 1.277084.
Iteration 13397: Policy loss: 0.179753. Value loss: 0.373402. Entropy: 1.273392.
Iteration 13398: Policy loss: 0.170289. Value loss: 0.269878. Entropy: 1.277617.
Training network. lr: 0.000147. clip: 0.058998
Iteration 13399: Policy loss: 0.236513. Value loss: 0.432710. Entropy: 1.262494.
Iteration 13400: Policy loss: 0.168418. Value loss: 0.241037. Entropy: 1.267595.
Iteration 13401: Policy loss: 0.234817. Value loss: 0.173393. Entropy: 1.264299.
Training network. lr: 0.000147. clip: 0.058841
Iteration 13402: Policy loss: 0.249875. Value loss: 0.955849. Entropy: 1.264790.
Iteration 13403: Policy loss: 0.203790. Value loss: 0.623361. Entropy: 1.265678.
Iteration 13404: Policy loss: 0.236289. Val

Training network. lr: 0.000147. clip: 0.058685
Iteration 13471: Policy loss: 0.077565. Value loss: 0.965275. Entropy: 1.263281.
Iteration 13472: Policy loss: 0.057663. Value loss: 0.494525. Entropy: 1.256783.
Iteration 13473: Policy loss: 0.097019. Value loss: 0.388497. Entropy: 1.270214.
Training network. lr: 0.000147. clip: 0.058685
Iteration 13474: Policy loss: 0.088807. Value loss: 0.339869. Entropy: 1.274683.
Iteration 13475: Policy loss: 0.068114. Value loss: 0.238929. Entropy: 1.268626.
Iteration 13476: Policy loss: 0.074744. Value loss: 0.215548. Entropy: 1.273459.
episode: 1864   score: 37400.0  epsilon: 1.0    steps: 739  evaluation reward: 32184.0
Training network. lr: 0.000147. clip: 0.058685
Iteration 13477: Policy loss: 0.200332. Value loss: 0.478439. Entropy: 1.256125.
Iteration 13478: Policy loss: 0.151602. Value loss: 0.344302. Entropy: 1.259746.
Iteration 13479: Policy loss: 0.172909. Value loss: 0.217320. Entropy: 1.263385.
episode: 1865   score: 20100.0  epsilon: 1.

Iteration 13548: Policy loss: 0.270015. Value loss: 0.250655. Entropy: 1.229851.
Training network. lr: 0.000146. clip: 0.058537
Iteration 13549: Policy loss: -0.059071. Value loss: 1.022490. Entropy: 1.257881.
Iteration 13550: Policy loss: -0.085857. Value loss: 0.753092. Entropy: 1.259929.
Iteration 13551: Policy loss: -0.036656. Value loss: 0.485165. Entropy: 1.262144.
Training network. lr: 0.000146. clip: 0.058381
Iteration 13552: Policy loss: 0.194523. Value loss: 0.816464. Entropy: 1.276481.
Iteration 13553: Policy loss: 0.172426. Value loss: 0.457794. Entropy: 1.276991.
Iteration 13554: Policy loss: 0.187226. Value loss: 0.338060. Entropy: 1.276064.
episode: 1873   score: 41500.0  epsilon: 1.0    steps: 681  evaluation reward: 32108.0
Training network. lr: 0.000146. clip: 0.058381
Iteration 13555: Policy loss: -0.005148. Value loss: 0.726899. Entropy: 1.277023.
Iteration 13556: Policy loss: -0.052358. Value loss: 0.417903. Entropy: 1.268020.
Iteration 13557: Policy loss: 0.004371

Iteration 13626: Policy loss: 0.101993. Value loss: 1.300518. Entropy: 1.197356.
episode: 1880   score: 46200.0  epsilon: 1.0    steps: 121  evaluation reward: 31757.0
Training network. lr: 0.000146. clip: 0.058224
Iteration 13627: Policy loss: -0.089029. Value loss: 1.348617. Entropy: 1.199813.
Iteration 13628: Policy loss: -0.075430. Value loss: 1.096648. Entropy: 1.203085.
Iteration 13629: Policy loss: 0.012041. Value loss: 0.651189. Entropy: 1.204798.
Training network. lr: 0.000146. clip: 0.058224
Iteration 13630: Policy loss: 0.367163. Value loss: 0.777215. Entropy: 1.251457.
Iteration 13631: Policy loss: 0.364525. Value loss: 0.529305. Entropy: 1.236123.
Iteration 13632: Policy loss: 0.404104. Value loss: 0.475142. Entropy: 1.242780.
episode: 1881   score: 45700.0  epsilon: 1.0    steps: 276  evaluation reward: 31929.0
Training network. lr: 0.000146. clip: 0.058224
Iteration 13633: Policy loss: -0.039327. Value loss: 1.051478. Entropy: 1.230807.
Iteration 13634: Policy loss: -0.0

Iteration 13702: Policy loss: -0.015712. Value loss: 1.126101. Entropy: 1.217257.
Iteration 13703: Policy loss: -0.014287. Value loss: 0.835126. Entropy: 1.232500.
Iteration 13704: Policy loss: 0.016869. Value loss: 0.579468. Entropy: 1.214255.
Training network. lr: 0.000145. clip: 0.057920
Iteration 13705: Policy loss: 0.204834. Value loss: 0.617142. Entropy: 1.227566.
Iteration 13706: Policy loss: 0.167369. Value loss: 0.411243. Entropy: 1.228609.
Iteration 13707: Policy loss: 0.169777. Value loss: 0.292618. Entropy: 1.234975.
episode: 1889   score: 48900.0  epsilon: 1.0    steps: 278  evaluation reward: 32478.0
Training network. lr: 0.000145. clip: 0.057920
Iteration 13708: Policy loss: -0.016187. Value loss: 0.423829. Entropy: 1.238972.
Iteration 13709: Policy loss: -0.006237. Value loss: 0.353596. Entropy: 1.240293.
Iteration 13710: Policy loss: -0.015928. Value loss: 0.233075. Entropy: 1.242352.
episode: 1890   score: 35200.0  epsilon: 1.0    steps: 388  evaluation reward: 32387.

Iteration 13778: Policy loss: -0.415039. Value loss: 0.874860. Entropy: 1.275329.
Iteration 13779: Policy loss: -0.409857. Value loss: 0.756136. Entropy: 1.272218.
Training network. lr: 0.000144. clip: 0.057763
Iteration 13780: Policy loss: 0.137305. Value loss: 0.893786. Entropy: 1.262025.
Iteration 13781: Policy loss: 0.098533. Value loss: 0.480103. Entropy: 1.265170.
Iteration 13782: Policy loss: 0.158042. Value loss: 0.375793. Entropy: 1.255646.
Training network. lr: 0.000144. clip: 0.057763
Iteration 13783: Policy loss: -0.013939. Value loss: 0.837001. Entropy: 1.266343.
Iteration 13784: Policy loss: -0.046224. Value loss: 0.660523. Entropy: 1.252458.
Iteration 13785: Policy loss: -0.058465. Value loss: 0.519554. Entropy: 1.257837.
episode: 1899   score: 37600.0  epsilon: 1.0    steps: 856  evaluation reward: 32560.0
Training network. lr: 0.000144. clip: 0.057763
Iteration 13786: Policy loss: 0.227605. Value loss: 0.463338. Entropy: 1.247388.
Iteration 13787: Policy loss: 0.247629

Training network. lr: 0.000144. clip: 0.057459
Iteration 13855: Policy loss: -0.052097. Value loss: 0.716210. Entropy: 1.188627.
Iteration 13856: Policy loss: 0.011751. Value loss: 0.375556. Entropy: 1.195133.
Iteration 13857: Policy loss: -0.059189. Value loss: 0.296769. Entropy: 1.196937.
episode: 1909   score: 35200.0  epsilon: 1.0    steps: 625  evaluation reward: 33082.0
Training network. lr: 0.000144. clip: 0.057459
Iteration 13858: Policy loss: 0.052399. Value loss: 0.560806. Entropy: 1.221484.
Iteration 13859: Policy loss: 0.073409. Value loss: 0.370261. Entropy: 1.216196.
Iteration 13860: Policy loss: 0.048382. Value loss: 0.349137. Entropy: 1.221715.
episode: 1910   score: 28000.0  epsilon: 1.0    steps: 133  evaluation reward: 33081.0
Training network. lr: 0.000144. clip: 0.057459
Iteration 13861: Policy loss: 0.010289. Value loss: 0.248678. Entropy: 1.262858.
Iteration 13862: Policy loss: 0.007912. Value loss: 0.139486. Entropy: 1.269419.
Iteration 13863: Policy loss: -0.00

Iteration 13931: Policy loss: 0.032599. Value loss: 0.427146. Entropy: 1.240368.
Iteration 13932: Policy loss: 0.075486. Value loss: 0.281589. Entropy: 1.238849.
Training network. lr: 0.000143. clip: 0.057302
Iteration 13933: Policy loss: -0.150083. Value loss: 1.017464. Entropy: 1.199153.
Iteration 13934: Policy loss: -0.162364. Value loss: 0.599584. Entropy: 1.208709.
Iteration 13935: Policy loss: -0.161158. Value loss: 0.438951. Entropy: 1.200423.
episode: 1918   score: 44300.0  epsilon: 1.0    steps: 343  evaluation reward: 32759.0
episode: 1919   score: 28400.0  epsilon: 1.0    steps: 907  evaluation reward: 32719.0
Training network. lr: 0.000143. clip: 0.057302
Iteration 13936: Policy loss: -0.031330. Value loss: 0.780703. Entropy: 1.240242.
Iteration 13937: Policy loss: -0.062132. Value loss: 0.427393. Entropy: 1.228696.
Iteration 13938: Policy loss: -0.030161. Value loss: 0.296043. Entropy: 1.232494.
Training network. lr: 0.000143. clip: 0.057302
Iteration 13939: Policy loss: 0

Iteration 14006: Policy loss: -0.018684. Value loss: 0.552181. Entropy: 1.252305.
Iteration 14007: Policy loss: -0.055047. Value loss: 0.522734. Entropy: 1.252894.
Training network. lr: 0.000142. clip: 0.056998
Iteration 14008: Policy loss: -0.010765. Value loss: 0.640222. Entropy: 1.247108.
Iteration 14009: Policy loss: -0.035918. Value loss: 0.411377. Entropy: 1.236317.
Iteration 14010: Policy loss: -0.027753. Value loss: 0.299865. Entropy: 1.245000.
Training network. lr: 0.000142. clip: 0.056998
Iteration 14011: Policy loss: -0.271614. Value loss: 1.553150. Entropy: 1.224118.
Iteration 14012: Policy loss: -0.219315. Value loss: 1.021700. Entropy: 1.220742.
Iteration 14013: Policy loss: -0.285652. Value loss: 0.779839. Entropy: 1.224325.
Training network. lr: 0.000142. clip: 0.056998
Iteration 14014: Policy loss: -0.087073. Value loss: 1.612784. Entropy: 1.217623.
Iteration 14015: Policy loss: -0.098417. Value loss: 1.195344. Entropy: 1.218615.
Iteration 14016: Policy loss: -0.035383

Iteration 14082: Policy loss: 0.083623. Value loss: 0.322976. Entropy: 1.230202.
episode: 1939   score: 22700.0  epsilon: 1.0    steps: 178  evaluation reward: 33873.0
Training network. lr: 0.000142. clip: 0.056841
Iteration 14083: Policy loss: 0.102446. Value loss: 0.665588. Entropy: 1.263899.
Iteration 14084: Policy loss: 0.111556. Value loss: 0.308571. Entropy: 1.269302.
Iteration 14085: Policy loss: 0.094223. Value loss: 0.174284. Entropy: 1.272200.
Training network. lr: 0.000142. clip: 0.056841
Iteration 14086: Policy loss: 0.190398. Value loss: 0.486381. Entropy: 1.250561.
Iteration 14087: Policy loss: 0.199293. Value loss: 0.281278. Entropy: 1.252093.
Iteration 14088: Policy loss: 0.211739. Value loss: 0.227341. Entropy: 1.249449.
Training network. lr: 0.000142. clip: 0.056841
Iteration 14089: Policy loss: -0.167629. Value loss: 0.733634. Entropy: 1.281273.
Iteration 14090: Policy loss: -0.165373. Value loss: 0.436406. Entropy: 1.273714.
Iteration 14091: Policy loss: -0.165918. 

Training network. lr: 0.000141. clip: 0.056537
Iteration 14155: Policy loss: 0.029882. Value loss: 0.957682. Entropy: 1.254146.
Iteration 14156: Policy loss: -0.013690. Value loss: 0.545281. Entropy: 1.249180.
Iteration 14157: Policy loss: 0.012199. Value loss: 0.445699. Entropy: 1.254625.
Training network. lr: 0.000141. clip: 0.056537
Iteration 14158: Policy loss: 0.069749. Value loss: 0.281844. Entropy: 1.271265.
Iteration 14159: Policy loss: 0.079033. Value loss: 0.180925. Entropy: 1.287063.
Iteration 14160: Policy loss: 0.074343. Value loss: 0.138712. Entropy: 1.281444.
Training network. lr: 0.000141. clip: 0.056537
Iteration 14161: Policy loss: 0.026946. Value loss: 0.951064. Entropy: 1.234903.
Iteration 14162: Policy loss: 0.004939. Value loss: 0.630075. Entropy: 1.233138.
Iteration 14163: Policy loss: 0.021191. Value loss: 0.377323. Entropy: 1.231377.
Training network. lr: 0.000141. clip: 0.056537
Iteration 14164: Policy loss: -0.149401. Value loss: 0.781125. Entropy: 1.254410.


Iteration 14231: Policy loss: -0.118626. Value loss: 0.906342. Entropy: 1.286946.
Iteration 14232: Policy loss: -0.125517. Value loss: 0.734435. Entropy: 1.288076.
episode: 1961   score: 48100.0  epsilon: 1.0    steps: 965  evaluation reward: 34165.0
Training network. lr: 0.000141. clip: 0.056381
Iteration 14233: Policy loss: 0.117463. Value loss: 1.016034. Entropy: 1.274084.
Iteration 14234: Policy loss: 0.072615. Value loss: 0.586720. Entropy: 1.275483.
Iteration 14235: Policy loss: 0.121428. Value loss: 0.374977. Entropy: 1.277184.
Training network. lr: 0.000141. clip: 0.056381
Iteration 14236: Policy loss: 0.037506. Value loss: 0.724203. Entropy: 1.277178.
Iteration 14237: Policy loss: 0.047497. Value loss: 0.433118. Entropy: 1.280188.
Iteration 14238: Policy loss: 0.011998. Value loss: 0.359440. Entropy: 1.279064.
Training network. lr: 0.000141. clip: 0.056381
Iteration 14239: Policy loss: -0.157333. Value loss: 0.806672. Entropy: 1.265887.
Iteration 14240: Policy loss: -0.230003.

Training network. lr: 0.000140. clip: 0.056077
Iteration 14308: Policy loss: -0.072062. Value loss: 0.702560. Entropy: 1.245860.
Iteration 14309: Policy loss: -0.025352. Value loss: 0.372222. Entropy: 1.256046.
Iteration 14310: Policy loss: -0.064209. Value loss: 0.328817. Entropy: 1.245370.
Training network. lr: 0.000140. clip: 0.056077
Iteration 14311: Policy loss: -0.077258. Value loss: 0.447798. Entropy: 1.257158.
Iteration 14312: Policy loss: -0.078682. Value loss: 0.292045. Entropy: 1.254118.
Iteration 14313: Policy loss: -0.056332. Value loss: 0.252986. Entropy: 1.255692.
Training network. lr: 0.000140. clip: 0.056077
Iteration 14314: Policy loss: -0.116953. Value loss: 0.864455. Entropy: 1.236493.
Iteration 14315: Policy loss: -0.106234. Value loss: 0.551345. Entropy: 1.228339.
Iteration 14316: Policy loss: -0.128344. Value loss: 0.384539. Entropy: 1.235317.
Training network. lr: 0.000140. clip: 0.056077
Iteration 14317: Policy loss: -0.126398. Value loss: 0.693812. Entropy: 1.

Training network. lr: 0.000140. clip: 0.055920
Iteration 14386: Policy loss: -0.160237. Value loss: 1.231596. Entropy: 1.281583.
Iteration 14387: Policy loss: -0.169686. Value loss: 0.701471. Entropy: 1.278064.
Iteration 14388: Policy loss: -0.190893. Value loss: 0.470354. Entropy: 1.279896.
Training network. lr: 0.000140. clip: 0.055920
Iteration 14389: Policy loss: 0.311483. Value loss: 1.096287. Entropy: 1.210478.
Iteration 14390: Policy loss: 0.312632. Value loss: 0.582114. Entropy: 1.207027.
Iteration 14391: Policy loss: 0.315342. Value loss: 0.363241. Entropy: 1.210361.
Training network. lr: 0.000140. clip: 0.055920
Iteration 14392: Policy loss: -0.189337. Value loss: 1.088442. Entropy: 1.256066.
Iteration 14393: Policy loss: -0.098670. Value loss: 0.704716. Entropy: 1.250953.
Iteration 14394: Policy loss: -0.222057. Value loss: 0.604408. Entropy: 1.255489.
episode: 1978   score: 27800.0  epsilon: 1.0    steps: 691  evaluation reward: 34909.0
Training network. lr: 0.000140. clip:

Iteration 14462: Policy loss: 0.148994. Value loss: 0.257865. Entropy: 1.251778.
Iteration 14463: Policy loss: 0.118429. Value loss: 0.217263. Entropy: 1.252683.
Training network. lr: 0.000139. clip: 0.055616
Iteration 14464: Policy loss: 0.099508. Value loss: 0.469281. Entropy: 1.266888.
Iteration 14465: Policy loss: 0.139556. Value loss: 0.304853. Entropy: 1.261191.
Iteration 14466: Policy loss: 0.125527. Value loss: 0.269599. Entropy: 1.265155.
episode: 1988   score: 19500.0  epsilon: 1.0    steps: 143  evaluation reward: 33646.0
Training network. lr: 0.000139. clip: 0.055616
Iteration 14467: Policy loss: -0.079974. Value loss: 0.920168. Entropy: 1.281504.
Iteration 14468: Policy loss: -0.066425. Value loss: 0.640536. Entropy: 1.279345.
Iteration 14469: Policy loss: -0.079335. Value loss: 0.475981. Entropy: 1.283576.
episode: 1989   score: 24700.0  epsilon: 1.0    steps: 289  evaluation reward: 33404.0
episode: 1990   score: 27000.0  epsilon: 1.0    steps: 625  evaluation reward: 33

Iteration 14537: Policy loss: 0.105135. Value loss: 0.382776. Entropy: 1.293780.
Iteration 14538: Policy loss: 0.059165. Value loss: 0.276554. Entropy: 1.295149.
Training network. lr: 0.000139. clip: 0.055459
Iteration 14539: Policy loss: -0.040657. Value loss: 0.580710. Entropy: 1.279741.
Iteration 14540: Policy loss: -0.019572. Value loss: 0.389417. Entropy: 1.279756.
Iteration 14541: Policy loss: -0.026643. Value loss: 0.249263. Entropy: 1.271912.
Training network. lr: 0.000139. clip: 0.055459
Iteration 14542: Policy loss: -0.025893. Value loss: 0.596578. Entropy: 1.257692.
Iteration 14543: Policy loss: -0.026229. Value loss: 0.468390. Entropy: 1.261307.
Iteration 14544: Policy loss: -0.044446. Value loss: 0.294391. Entropy: 1.256453.
Training network. lr: 0.000139. clip: 0.055459
Iteration 14545: Policy loss: 0.369475. Value loss: 0.677848. Entropy: 1.235098.
Iteration 14546: Policy loss: 0.424363. Value loss: 0.396632. Entropy: 1.237700.
Iteration 14547: Policy loss: 0.421039. Val

Training network. lr: 0.000138. clip: 0.055155
Iteration 14614: Policy loss: -0.127459. Value loss: 0.835608. Entropy: 1.258134.
Iteration 14615: Policy loss: -0.139923. Value loss: 0.436078. Entropy: 1.255712.
Iteration 14616: Policy loss: -0.192534. Value loss: 0.340759. Entropy: 1.254632.
Training network. lr: 0.000138. clip: 0.055155
Iteration 14617: Policy loss: -0.129995. Value loss: 0.807832. Entropy: 1.306213.
Iteration 14618: Policy loss: -0.121251. Value loss: 0.601036. Entropy: 1.304752.
Iteration 14619: Policy loss: -0.102753. Value loss: 0.431516. Entropy: 1.305336.
episode: 2008   score: 20000.0  epsilon: 1.0    steps: 838  evaluation reward: 32757.0
Training network. lr: 0.000138. clip: 0.055155
Iteration 14620: Policy loss: -0.144819. Value loss: 0.844975. Entropy: 1.275667.
Iteration 14621: Policy loss: -0.078020. Value loss: 0.425600. Entropy: 1.278161.
Iteration 14622: Policy loss: -0.143620. Value loss: 0.356509. Entropy: 1.279077.
Training network. lr: 0.000138. cl

Iteration 14690: Policy loss: 0.371415. Value loss: 0.200366. Entropy: 1.274285.
Iteration 14691: Policy loss: 0.389197. Value loss: 0.155656. Entropy: 1.271082.
Training network. lr: 0.000137. clip: 0.054998
Iteration 14692: Policy loss: 0.085796. Value loss: 0.979105. Entropy: 1.278040.
Iteration 14693: Policy loss: 0.079964. Value loss: 0.585637. Entropy: 1.280311.
Iteration 14694: Policy loss: 0.044109. Value loss: 0.514878. Entropy: 1.281054.
Training network. lr: 0.000137. clip: 0.054998
Iteration 14695: Policy loss: -0.348565. Value loss: 1.146014. Entropy: 1.263667.
Iteration 14696: Policy loss: -0.274742. Value loss: 0.624015. Entropy: 1.258152.
Iteration 14697: Policy loss: -0.314325. Value loss: 0.461187. Entropy: 1.264548.
Training network. lr: 0.000137. clip: 0.054998
Iteration 14698: Policy loss: -0.039604. Value loss: 0.643156. Entropy: 1.262207.
Iteration 14699: Policy loss: 0.008227. Value loss: 0.435647. Entropy: 1.263505.
Iteration 14700: Policy loss: -0.017555. Valu

Iteration 14768: Policy loss: -0.072266. Value loss: 0.536562. Entropy: 1.281677.
Iteration 14769: Policy loss: -0.050659. Value loss: 0.392448. Entropy: 1.290706.
Training network. lr: 0.000137. clip: 0.054694
Iteration 14770: Policy loss: 0.168923. Value loss: 1.051031. Entropy: 1.295464.
Iteration 14771: Policy loss: 0.173698. Value loss: 0.526797. Entropy: 1.293772.
Iteration 14772: Policy loss: 0.187421. Value loss: 0.333428. Entropy: 1.291318.
Training network. lr: 0.000137. clip: 0.054694
Iteration 14773: Policy loss: 0.293442. Value loss: 0.965214. Entropy: 1.293053.
Iteration 14774: Policy loss: 0.288974. Value loss: 0.609414. Entropy: 1.292516.
Iteration 14775: Policy loss: 0.318308. Value loss: 0.422021. Entropy: 1.296543.
Training network. lr: 0.000137. clip: 0.054694
Iteration 14776: Policy loss: 0.079402. Value loss: 0.723414. Entropy: 1.268713.
Iteration 14777: Policy loss: 0.043254. Value loss: 0.466246. Entropy: 1.274828.
Iteration 14778: Policy loss: 0.020474. Value l

Training network. lr: 0.000136. clip: 0.054537
Iteration 14845: Policy loss: -0.207851. Value loss: 1.717959. Entropy: 1.275927.
Iteration 14846: Policy loss: -0.263236. Value loss: 1.170779. Entropy: 1.275061.
Iteration 14847: Policy loss: -0.267430. Value loss: 0.912147. Entropy: 1.273266.
Training network. lr: 0.000136. clip: 0.054537
Iteration 14848: Policy loss: 0.030913. Value loss: 0.754151. Entropy: 1.259044.
Iteration 14849: Policy loss: 0.002940. Value loss: 0.510723. Entropy: 1.253744.
Iteration 14850: Policy loss: 0.011736. Value loss: 0.393169. Entropy: 1.254523.
Training network. lr: 0.000136. clip: 0.054390
Iteration 14851: Policy loss: 0.032306. Value loss: 1.344970. Entropy: 1.270347.
Iteration 14852: Policy loss: 0.003635. Value loss: 1.111256. Entropy: 1.271398.
Iteration 14853: Policy loss: 0.010005. Value loss: 0.974931. Entropy: 1.273339.
Training network. lr: 0.000136. clip: 0.054390
Iteration 14854: Policy loss: -0.049408. Value loss: 0.786655. Entropy: 1.284592

Iteration 14921: Policy loss: -0.019215. Value loss: 0.572566. Entropy: 1.282598.
Iteration 14922: Policy loss: -0.037317. Value loss: 0.387223. Entropy: 1.283040.
episode: 2043   score: 25100.0  epsilon: 1.0    steps: 463  evaluation reward: 34928.0
Training network. lr: 0.000136. clip: 0.054233
Iteration 14923: Policy loss: 0.245402. Value loss: 0.373656. Entropy: 1.299512.
Iteration 14924: Policy loss: 0.256099. Value loss: 0.203888. Entropy: 1.298542.
Iteration 14925: Policy loss: 0.235666. Value loss: 0.155844. Entropy: 1.302856.
Training network. lr: 0.000136. clip: 0.054233
Iteration 14926: Policy loss: -0.065244. Value loss: 1.505025. Entropy: 1.286748.
Iteration 14927: Policy loss: -0.086538. Value loss: 0.815198. Entropy: 1.281853.
Iteration 14928: Policy loss: -0.077008. Value loss: 0.566642. Entropy: 1.282764.
episode: 2044   score: 33600.0  epsilon: 1.0    steps: 9  evaluation reward: 35021.0
Training network. lr: 0.000136. clip: 0.054233
Iteration 14929: Policy loss: 0.10

episode: 2052   score: 18500.0  epsilon: 1.0    steps: 963  evaluation reward: 35432.0
Training network. lr: 0.000135. clip: 0.054077
Iteration 14998: Policy loss: 0.160446. Value loss: 2.020250. Entropy: 1.239192.
Iteration 14999: Policy loss: 0.153976. Value loss: 1.023603. Entropy: 1.246413.
Iteration 15000: Policy loss: 0.147727. Value loss: 0.730953. Entropy: 1.239207.
Training network. lr: 0.000135. clip: 0.053929
Iteration 15001: Policy loss: 0.306541. Value loss: 0.583197. Entropy: 1.277685.
Iteration 15002: Policy loss: 0.292781. Value loss: 0.357713. Entropy: 1.279082.
Iteration 15003: Policy loss: 0.311241. Value loss: 0.319581. Entropy: 1.279544.
Training network. lr: 0.000135. clip: 0.053929
Iteration 15004: Policy loss: 0.142247. Value loss: 0.405723. Entropy: 1.262020.
Iteration 15005: Policy loss: 0.118327. Value loss: 0.300517. Entropy: 1.257513.
Iteration 15006: Policy loss: 0.150804. Value loss: 0.216844. Entropy: 1.264460.
Training network. lr: 0.000135. clip: 0.053

Iteration 15073: Policy loss: 0.024887. Value loss: 0.519799. Entropy: 1.265064.
Iteration 15074: Policy loss: 0.015629. Value loss: 0.316922. Entropy: 1.278744.
Iteration 15075: Policy loss: 0.025443. Value loss: 0.263609. Entropy: 1.273701.
episode: 2062   score: 30700.0  epsilon: 1.0    steps: 364  evaluation reward: 35500.0
Training network. lr: 0.000134. clip: 0.053773
Iteration 15076: Policy loss: -0.015122. Value loss: 0.558557. Entropy: 1.279352.
Iteration 15077: Policy loss: -0.025550. Value loss: 0.404351. Entropy: 1.278074.
Iteration 15078: Policy loss: -0.014430. Value loss: 0.290939. Entropy: 1.274365.
Training network. lr: 0.000134. clip: 0.053773
Iteration 15079: Policy loss: -0.364898. Value loss: 1.066293. Entropy: 1.290759.
Iteration 15080: Policy loss: -0.303114. Value loss: 0.696873. Entropy: 1.286207.
Iteration 15081: Policy loss: -0.426228. Value loss: 0.462477. Entropy: 1.276771.
Training network. lr: 0.000134. clip: 0.053773
Iteration 15082: Policy loss: -0.0760

Iteration 15150: Policy loss: -0.281297. Value loss: 0.391055. Entropy: 1.279791.
Training network. lr: 0.000134. clip: 0.053468
Iteration 15151: Policy loss: 0.163619. Value loss: 1.377067. Entropy: 1.289343.
Iteration 15152: Policy loss: 0.187366. Value loss: 1.014570. Entropy: 1.290875.
Iteration 15153: Policy loss: 0.162300. Value loss: 0.778874. Entropy: 1.292208.
episode: 2071   score: 26400.0  epsilon: 1.0    steps: 483  evaluation reward: 35915.0
Training network. lr: 0.000134. clip: 0.053468
Iteration 15154: Policy loss: 0.073891. Value loss: 0.519141. Entropy: 1.270094.
Iteration 15155: Policy loss: 0.096572. Value loss: 0.300764. Entropy: 1.283014.
Iteration 15156: Policy loss: 0.056368. Value loss: 0.301909. Entropy: 1.269438.
Training network. lr: 0.000134. clip: 0.053468
Iteration 15157: Policy loss: -0.138533. Value loss: 0.564778. Entropy: 1.263383.
Iteration 15158: Policy loss: -0.135279. Value loss: 0.322315. Entropy: 1.260643.
Iteration 15159: Policy loss: -0.091251.

Iteration 15224: Policy loss: 0.228500. Value loss: 0.270004. Entropy: 1.282226.
Iteration 15225: Policy loss: 0.222870. Value loss: 0.244041. Entropy: 1.283311.
Training network. lr: 0.000133. clip: 0.053312
Iteration 15226: Policy loss: 0.154806. Value loss: 1.222365. Entropy: 1.254456.
Iteration 15227: Policy loss: 0.131610. Value loss: 0.792712. Entropy: 1.265001.
Iteration 15228: Policy loss: 0.157838. Value loss: 0.530954. Entropy: 1.259737.
Training network. lr: 0.000133. clip: 0.053312
Iteration 15229: Policy loss: -0.173382. Value loss: 0.658595. Entropy: 1.270391.
Iteration 15230: Policy loss: -0.165074. Value loss: 0.409241. Entropy: 1.270674.
Iteration 15231: Policy loss: -0.161935. Value loss: 0.298118. Entropy: 1.258692.
Training network. lr: 0.000133. clip: 0.053312
Iteration 15232: Policy loss: 0.086675. Value loss: 1.184307. Entropy: 1.247774.
Iteration 15233: Policy loss: 0.077732. Value loss: 0.778027. Entropy: 1.242220.
Iteration 15234: Policy loss: 0.051460. Value 

Iteration 15300: Policy loss: 0.211727. Value loss: 0.499405. Entropy: 1.279733.
Training network. lr: 0.000133. clip: 0.053008
Iteration 15301: Policy loss: 0.240572. Value loss: 0.349962. Entropy: 1.312078.
Iteration 15302: Policy loss: 0.226308. Value loss: 0.236613. Entropy: 1.311872.
Iteration 15303: Policy loss: 0.237850. Value loss: 0.182718. Entropy: 1.312693.
Training network. lr: 0.000133. clip: 0.053008
Iteration 15304: Policy loss: 0.290218. Value loss: 0.975578. Entropy: 1.286119.
Iteration 15305: Policy loss: 0.220842. Value loss: 0.646887. Entropy: 1.288561.
Iteration 15306: Policy loss: 0.204388. Value loss: 0.541940. Entropy: 1.295909.
Training network. lr: 0.000133. clip: 0.053008
Iteration 15307: Policy loss: 0.265422. Value loss: 0.363421. Entropy: 1.273005.
Iteration 15308: Policy loss: 0.261599. Value loss: 0.249072. Entropy: 1.269646.
Iteration 15309: Policy loss: 0.266425. Value loss: 0.182998. Entropy: 1.279363.
Training network. lr: 0.000133. clip: 0.053008
It

Training network. lr: 0.000132. clip: 0.052851
Iteration 15376: Policy loss: 0.063121. Value loss: 1.021193. Entropy: 1.271209.
Iteration 15377: Policy loss: 0.063780. Value loss: 0.728318. Entropy: 1.275493.
Iteration 15378: Policy loss: 0.083608. Value loss: 0.549669. Entropy: 1.270362.
episode: 2103   score: 19000.0  epsilon: 1.0    steps: 28  evaluation reward: 33910.0
Training network. lr: 0.000132. clip: 0.052851
Iteration 15379: Policy loss: 0.209183. Value loss: 0.153736. Entropy: 1.293841.
Iteration 15380: Policy loss: 0.212241. Value loss: 0.103831. Entropy: 1.295770.
Iteration 15381: Policy loss: 0.205444. Value loss: 0.084695. Entropy: 1.299423.
episode: 2104   score: 31700.0  epsilon: 1.0    steps: 230  evaluation reward: 33831.0
episode: 2105   score: 32000.0  epsilon: 1.0    steps: 884  evaluation reward: 33446.0
Training network. lr: 0.000132. clip: 0.052851
Iteration 15382: Policy loss: 0.124123. Value loss: 0.745112. Entropy: 1.297342.
Iteration 15383: Policy loss: 0.

episode: 2114   score: 32400.0  epsilon: 1.0    steps: 934  evaluation reward: 32530.0
Training network. lr: 0.000131. clip: 0.052547
Iteration 15451: Policy loss: 0.060468. Value loss: 0.874960. Entropy: 1.286978.
Iteration 15452: Policy loss: 0.070362. Value loss: 0.571674. Entropy: 1.283275.
Iteration 15453: Policy loss: 0.050115. Value loss: 0.404220. Entropy: 1.287971.
episode: 2115   score: 19200.0  epsilon: 1.0    steps: 674  evaluation reward: 32436.0
Training network. lr: 0.000131. clip: 0.052547
Iteration 15454: Policy loss: -0.090816. Value loss: 0.665296. Entropy: 1.322200.
Iteration 15455: Policy loss: -0.136691. Value loss: 0.516044. Entropy: 1.319661.
Iteration 15456: Policy loss: -0.108730. Value loss: 0.391743. Entropy: 1.320978.
Training network. lr: 0.000131. clip: 0.052547
Iteration 15457: Policy loss: -0.029008. Value loss: 0.474809. Entropy: 1.305014.
Iteration 15458: Policy loss: -0.024066. Value loss: 0.377849. Entropy: 1.309714.
Iteration 15459: Policy loss: -0

Iteration 15525: Policy loss: 0.250019. Value loss: 0.158827. Entropy: 1.285491.
episode: 2126   score: 42200.0  epsilon: 1.0    steps: 746  evaluation reward: 31064.0
Training network. lr: 0.000131. clip: 0.052390
Iteration 15526: Policy loss: 0.012054. Value loss: 0.985611. Entropy: 1.304523.
Iteration 15527: Policy loss: 0.006760. Value loss: 0.621087. Entropy: 1.305562.
Iteration 15528: Policy loss: 0.017688. Value loss: 0.480935. Entropy: 1.306822.
Training network. lr: 0.000131. clip: 0.052390
Iteration 15529: Policy loss: -0.031124. Value loss: 0.741937. Entropy: 1.289657.
Iteration 15530: Policy loss: -0.003948. Value loss: 0.438697. Entropy: 1.293671.
Iteration 15531: Policy loss: -0.014002. Value loss: 0.363485. Entropy: 1.294055.
episode: 2127   score: 31200.0  epsilon: 1.0    steps: 529  evaluation reward: 31113.0
Training network. lr: 0.000131. clip: 0.052390
Iteration 15532: Policy loss: -0.050391. Value loss: 1.223604. Entropy: 1.288847.
Iteration 15533: Policy loss: -0.

Training network. lr: 0.000130. clip: 0.052086
Iteration 15601: Policy loss: 0.073104. Value loss: 0.679251. Entropy: 1.265758.
Iteration 15602: Policy loss: 0.055348. Value loss: 0.402897. Entropy: 1.269822.
Iteration 15603: Policy loss: 0.056896. Value loss: 0.298124. Entropy: 1.269947.
Training network. lr: 0.000130. clip: 0.052086
Iteration 15604: Policy loss: -0.053052. Value loss: 0.949142. Entropy: 1.276456.
Iteration 15605: Policy loss: -0.037761. Value loss: 0.407234. Entropy: 1.265549.
Iteration 15606: Policy loss: -0.059304. Value loss: 0.298784. Entropy: 1.273101.
Training network. lr: 0.000130. clip: 0.052086
Iteration 15607: Policy loss: 0.104261. Value loss: 0.852946. Entropy: 1.283896.
Iteration 15608: Policy loss: 0.074186. Value loss: 0.461747. Entropy: 1.285465.
Iteration 15609: Policy loss: 0.009343. Value loss: 0.386035. Entropy: 1.281843.
episode: 2136   score: 28000.0  epsilon: 1.0    steps: 280  evaluation reward: 30159.0
episode: 2137   score: 18800.0  epsilon:

Iteration 15676: Policy loss: 0.304074. Value loss: 0.387313. Entropy: 1.296927.
Iteration 15677: Policy loss: 0.325690. Value loss: 0.207771. Entropy: 1.298811.
Iteration 15678: Policy loss: 0.303254. Value loss: 0.183024. Entropy: 1.296266.
Training network. lr: 0.000130. clip: 0.051929
Iteration 15679: Policy loss: -0.104206. Value loss: 0.727599. Entropy: 1.300953.
Iteration 15680: Policy loss: -0.117900. Value loss: 0.359562. Entropy: 1.302514.
Iteration 15681: Policy loss: -0.122202. Value loss: 0.232476. Entropy: 1.299115.
episode: 2146   score: 25200.0  epsilon: 1.0    steps: 50  evaluation reward: 29308.0
Training network. lr: 0.000130. clip: 0.051929
Iteration 15682: Policy loss: -0.421647. Value loss: 1.233194. Entropy: 1.255173.
Iteration 15683: Policy loss: -0.425702. Value loss: 0.932436. Entropy: 1.255922.
Iteration 15684: Policy loss: -0.409964. Value loss: 0.647691. Entropy: 1.262296.
Training network. lr: 0.000130. clip: 0.051929
Iteration 15685: Policy loss: -0.01659

Training network. lr: 0.000129. clip: 0.051625
Iteration 15751: Policy loss: 0.238887. Value loss: 0.669906. Entropy: 1.304127.
Iteration 15752: Policy loss: 0.220041. Value loss: 0.340203. Entropy: 1.307480.
Iteration 15753: Policy loss: 0.242369. Value loss: 0.278481. Entropy: 1.300823.
Training network. lr: 0.000129. clip: 0.051625
Iteration 15754: Policy loss: -0.068460. Value loss: 0.636758. Entropy: 1.307297.
Iteration 15755: Policy loss: -0.086353. Value loss: 0.435306. Entropy: 1.302629.
Iteration 15756: Policy loss: -0.059670. Value loss: 0.322583. Entropy: 1.304296.
Training network. lr: 0.000129. clip: 0.051625
Iteration 15757: Policy loss: 0.053714. Value loss: 0.567481. Entropy: 1.313327.
Iteration 15758: Policy loss: 0.084030. Value loss: 0.287961. Entropy: 1.309034.
Iteration 15759: Policy loss: 0.028418. Value loss: 0.269134. Entropy: 1.310266.
episode: 2157   score: 19500.0  epsilon: 1.0    steps: 213  evaluation reward: 28820.0
Training network. lr: 0.000129. clip: 0.

Training network. lr: 0.000129. clip: 0.051469
Iteration 15826: Policy loss: -0.245078. Value loss: 1.265157. Entropy: 1.303095.
Iteration 15827: Policy loss: -0.259744. Value loss: 0.605605. Entropy: 1.308700.
Iteration 15828: Policy loss: -0.243204. Value loss: 0.341078. Entropy: 1.308523.
Training network. lr: 0.000129. clip: 0.051469
Iteration 15829: Policy loss: -0.072650. Value loss: 0.908624. Entropy: 1.317927.
Iteration 15830: Policy loss: -0.123458. Value loss: 0.466702. Entropy: 1.317855.
Iteration 15831: Policy loss: -0.097017. Value loss: 0.432505. Entropy: 1.316189.
episode: 2168   score: 18200.0  epsilon: 1.0    steps: 42  evaluation reward: 27242.0
episode: 2169   score: 16900.0  epsilon: 1.0    steps: 301  evaluation reward: 27138.0
episode: 2170   score: 29100.0  epsilon: 1.0    steps: 489  evaluation reward: 26916.0
Training network. lr: 0.000129. clip: 0.051469
Iteration 15832: Policy loss: -0.154209. Value loss: 1.015037. Entropy: 1.307730.
Iteration 15833: Policy l

Iteration 15902: Policy loss: 0.107519. Value loss: 0.292963. Entropy: 1.295979.
Iteration 15903: Policy loss: 0.107901. Value loss: 0.228382. Entropy: 1.291267.
episode: 2178   score: 18500.0  epsilon: 1.0    steps: 144  evaluation reward: 27020.0
episode: 2179   score: 38300.0  epsilon: 1.0    steps: 385  evaluation reward: 27170.0
episode: 2180   score: 29900.0  epsilon: 1.0    steps: 530  evaluation reward: 27249.0
Training network. lr: 0.000128. clip: 0.051164
Iteration 15904: Policy loss: 0.067644. Value loss: 0.467746. Entropy: 1.298997.
Iteration 15905: Policy loss: 0.044498. Value loss: 0.234828. Entropy: 1.306055.
Iteration 15906: Policy loss: 0.069597. Value loss: 0.149258. Entropy: 1.306371.
Training network. lr: 0.000128. clip: 0.051164
Iteration 15907: Policy loss: 0.084369. Value loss: 0.623348. Entropy: 1.301230.
Iteration 15908: Policy loss: 0.104229. Value loss: 0.373442. Entropy: 1.296343.
Iteration 15909: Policy loss: 0.084010. Value loss: 0.230017. Entropy: 1.29600

Training network. lr: 0.000128. clip: 0.051008
Iteration 15976: Policy loss: 0.225486. Value loss: 0.399179. Entropy: 1.290940.
Iteration 15977: Policy loss: 0.218455. Value loss: 0.287980. Entropy: 1.297452.
Iteration 15978: Policy loss: 0.215208. Value loss: 0.249631. Entropy: 1.295523.
episode: 2190   score: 22200.0  epsilon: 1.0    steps: 893  evaluation reward: 26711.0
Training network. lr: 0.000128. clip: 0.051008
Iteration 15979: Policy loss: 0.095077. Value loss: 0.591014. Entropy: 1.294811.
Iteration 15980: Policy loss: 0.072424. Value loss: 0.367160. Entropy: 1.291342.
Iteration 15981: Policy loss: 0.071743. Value loss: 0.215119. Entropy: 1.291306.
episode: 2191   score: 26900.0  epsilon: 1.0    steps: 955  evaluation reward: 26807.0
Training network. lr: 0.000128. clip: 0.051008
Iteration 15982: Policy loss: -0.138323. Value loss: 1.130998. Entropy: 1.301605.
Iteration 15983: Policy loss: -0.133430. Value loss: 0.495716. Entropy: 1.307019.
Iteration 15984: Policy loss: -0.09

Iteration 16053: Policy loss: 0.256116. Value loss: 0.330189. Entropy: 1.280347.
Training network. lr: 0.000127. clip: 0.050704
Iteration 16054: Policy loss: 0.262954. Value loss: 1.272395. Entropy: 1.265906.
Iteration 16055: Policy loss: 0.240190. Value loss: 0.830584. Entropy: 1.266072.
Iteration 16056: Policy loss: 0.254246. Value loss: 0.588717. Entropy: 1.261999.
episode: 2198   score: 34700.0  epsilon: 1.0    steps: 110  evaluation reward: 26911.0
episode: 2199   score: 55400.0  epsilon: 1.0    steps: 661  evaluation reward: 27128.0
Training network. lr: 0.000127. clip: 0.050704
Iteration 16057: Policy loss: 0.181400. Value loss: 0.196668. Entropy: 1.304129.
Iteration 16058: Policy loss: 0.198308. Value loss: 0.149107. Entropy: 1.298358.
Iteration 16059: Policy loss: 0.197096. Value loss: 0.116168. Entropy: 1.301782.
Training network. lr: 0.000127. clip: 0.050704
Iteration 16060: Policy loss: -0.160043. Value loss: 1.052371. Entropy: 1.278355.
Iteration 16061: Policy loss: -0.175

Iteration 16128: Policy loss: 0.068783. Value loss: 0.531675. Entropy: 1.285214.
Training network. lr: 0.000126. clip: 0.050547
Iteration 16129: Policy loss: -0.034061. Value loss: 1.349056. Entropy: 1.262768.
Iteration 16130: Policy loss: -0.081844. Value loss: 0.927262. Entropy: 1.260041.
Iteration 16131: Policy loss: -0.071408. Value loss: 0.690468. Entropy: 1.250057.
episode: 2209   score: 46200.0  epsilon: 1.0    steps: 97  evaluation reward: 28127.0
episode: 2210   score: 32200.0  epsilon: 1.0    steps: 840  evaluation reward: 28400.0
Training network. lr: 0.000126. clip: 0.050547
Iteration 16132: Policy loss: 0.082303. Value loss: 1.013147. Entropy: 1.262523.
Iteration 16133: Policy loss: 0.137731. Value loss: 0.615996. Entropy: 1.257295.
Iteration 16134: Policy loss: 0.116380. Value loss: 0.347898. Entropy: 1.253408.
Training network. lr: 0.000126. clip: 0.050547
Iteration 16135: Policy loss: 0.125358. Value loss: 0.600810. Entropy: 1.264352.
Iteration 16136: Policy loss: 0.140

Iteration 16202: Policy loss: 0.356119. Value loss: 0.258599. Entropy: 1.262778.
Iteration 16203: Policy loss: 0.359575. Value loss: 0.212678. Entropy: 1.258375.
episode: 2221   score: 27900.0  epsilon: 1.0    steps: 1014  evaluation reward: 29127.0
Training network. lr: 0.000126. clip: 0.050243
Iteration 16204: Policy loss: 0.222123. Value loss: 0.266062. Entropy: 1.304018.
Iteration 16205: Policy loss: 0.217843. Value loss: 0.139927. Entropy: 1.302938.
Iteration 16206: Policy loss: 0.196959. Value loss: 0.112590. Entropy: 1.302903.
Training network. lr: 0.000126. clip: 0.050243
Iteration 16207: Policy loss: -0.213560. Value loss: 0.989966. Entropy: 1.300037.
Iteration 16208: Policy loss: -0.146002. Value loss: 0.486724. Entropy: 1.299881.
Iteration 16209: Policy loss: -0.161272. Value loss: 0.387260. Entropy: 1.288981.
Training network. lr: 0.000126. clip: 0.050243
Iteration 16210: Policy loss: 0.196289. Value loss: 0.528041. Entropy: 1.297943.
Iteration 16211: Policy loss: 0.140870.

Iteration 16278: Policy loss: 0.307110. Value loss: 0.168053. Entropy: 1.291700.
Training network. lr: 0.000125. clip: 0.050086
Iteration 16279: Policy loss: -0.077787. Value loss: 0.816235. Entropy: 1.285728.
Iteration 16280: Policy loss: -0.039963. Value loss: 0.516572. Entropy: 1.285000.
Iteration 16281: Policy loss: -0.025097. Value loss: 0.307079. Entropy: 1.292347.
episode: 2231   score: 35000.0  epsilon: 1.0    steps: 131  evaluation reward: 28703.0
Training network. lr: 0.000125. clip: 0.050086
Iteration 16282: Policy loss: 0.009421. Value loss: 0.571958. Entropy: 1.278003.
Iteration 16283: Policy loss: -0.028433. Value loss: 0.301401. Entropy: 1.280398.
Iteration 16284: Policy loss: 0.012450. Value loss: 0.264159. Entropy: 1.279453.
Training network. lr: 0.000125. clip: 0.050086
Iteration 16285: Policy loss: 0.085039. Value loss: 0.963275. Entropy: 1.287244.
Iteration 16286: Policy loss: 0.104635. Value loss: 0.580137. Entropy: 1.287657.
Iteration 16287: Policy loss: 0.080152.

Iteration 16353: Policy loss: -0.132633. Value loss: 0.343990. Entropy: 1.327083.
episode: 2242   score: 27400.0  epsilon: 1.0    steps: 728  evaluation reward: 28609.0
episode: 2243   score: 17600.0  epsilon: 1.0    steps: 1019  evaluation reward: 28320.0
Training network. lr: 0.000124. clip: 0.049782
Iteration 16354: Policy loss: -0.159600. Value loss: 0.975566. Entropy: 1.304641.
Iteration 16355: Policy loss: -0.144924. Value loss: 0.620627. Entropy: 1.304605.
Iteration 16356: Policy loss: -0.144176. Value loss: 0.518009. Entropy: 1.300476.
Training network. lr: 0.000124. clip: 0.049782
Iteration 16357: Policy loss: -0.174113. Value loss: 1.155151. Entropy: 1.314968.
Iteration 16358: Policy loss: -0.217812. Value loss: 0.622964. Entropy: 1.307715.
Iteration 16359: Policy loss: -0.177598. Value loss: 0.322256. Entropy: 1.315219.
Training network. lr: 0.000124. clip: 0.049782
Iteration 16360: Policy loss: -0.058171. Value loss: 0.732295. Entropy: 1.308331.
Iteration 16361: Policy loss

Iteration 16427: Policy loss: 0.046208. Value loss: 0.504609. Entropy: 1.287378.
Iteration 16428: Policy loss: 0.004164. Value loss: 0.280443. Entropy: 1.284863.
Training network. lr: 0.000124. clip: 0.049625
Iteration 16429: Policy loss: 0.190610. Value loss: 1.504513. Entropy: 1.283167.
Iteration 16430: Policy loss: 0.196538. Value loss: 1.098015. Entropy: 1.285490.
Iteration 16431: Policy loss: 0.122597. Value loss: 0.790112. Entropy: 1.289423.
Training network. lr: 0.000124. clip: 0.049625
Iteration 16432: Policy loss: -0.224439. Value loss: 1.284835. Entropy: 1.294108.
Iteration 16433: Policy loss: -0.211924. Value loss: 0.819069. Entropy: 1.296493.
Iteration 16434: Policy loss: -0.236083. Value loss: 0.566613. Entropy: 1.288079.
episode: 2253   score: 22000.0  epsilon: 1.0    steps: 416  evaluation reward: 28390.0
Training network. lr: 0.000124. clip: 0.049625
Iteration 16435: Policy loss: 0.112588. Value loss: 1.150208. Entropy: 1.302884.
Iteration 16436: Policy loss: 0.147265. 

Iteration 16505: Policy loss: -0.118266. Value loss: 1.609322. Entropy: 1.274577.
Iteration 16506: Policy loss: -0.102942. Value loss: 1.293183. Entropy: 1.272918.
Training network. lr: 0.000123. clip: 0.049321
Iteration 16507: Policy loss: -0.106506. Value loss: 2.405113. Entropy: 1.230076.
Iteration 16508: Policy loss: -0.159327. Value loss: 1.444849. Entropy: 1.224302.
Iteration 16509: Policy loss: -0.137854. Value loss: 1.204377. Entropy: 1.225511.
Training network. lr: 0.000123. clip: 0.049321
Iteration 16510: Policy loss: 0.072548. Value loss: 0.810911. Entropy: 1.250863.
Iteration 16511: Policy loss: 0.066562. Value loss: 0.566647. Entropy: 1.244659.
Iteration 16512: Policy loss: 0.110037. Value loss: 0.435224. Entropy: 1.249651.
episode: 2261   score: 67500.0  epsilon: 1.0    steps: 296  evaluation reward: 29141.0
Training network. lr: 0.000123. clip: 0.049321
Iteration 16513: Policy loss: 0.370672. Value loss: 0.948930. Entropy: 1.231071.
Iteration 16514: Policy loss: 0.379394

Iteration 16581: Policy loss: 0.268047. Value loss: 0.520291. Entropy: 1.213777.
Training network. lr: 0.000123. clip: 0.049165
Iteration 16582: Policy loss: 0.144040. Value loss: 0.639171. Entropy: 1.267730.
Iteration 16583: Policy loss: 0.031522. Value loss: 0.434248. Entropy: 1.266513.
Iteration 16584: Policy loss: 0.099545. Value loss: 0.307422. Entropy: 1.268707.
Training network. lr: 0.000123. clip: 0.049165
Iteration 16585: Policy loss: -0.203564. Value loss: 0.921473. Entropy: 1.248744.
Iteration 16586: Policy loss: -0.233936. Value loss: 0.638043. Entropy: 1.253237.
Iteration 16587: Policy loss: -0.215861. Value loss: 0.485377. Entropy: 1.243893.
Training network. lr: 0.000123. clip: 0.049165
Iteration 16588: Policy loss: 0.004360. Value loss: 0.868693. Entropy: 1.282360.
Iteration 16589: Policy loss: 0.051592. Value loss: 0.517162. Entropy: 1.281373.
Iteration 16590: Policy loss: -0.001759. Value loss: 0.411992. Entropy: 1.280420.
episode: 2271   score: 39100.0  epsilon: 1.0 

Iteration 16656: Policy loss: -0.216566. Value loss: 0.472761. Entropy: 1.293174.
Training network. lr: 0.000122. clip: 0.048860
Iteration 16657: Policy loss: 0.136652. Value loss: 0.905141. Entropy: 1.283995.
Iteration 16658: Policy loss: 0.111516. Value loss: 0.495123. Entropy: 1.283858.
Iteration 16659: Policy loss: 0.129479. Value loss: 0.378370. Entropy: 1.288649.
Training network. lr: 0.000122. clip: 0.048860
Iteration 16660: Policy loss: -0.061782. Value loss: 0.998883. Entropy: 1.280195.
Iteration 16661: Policy loss: -0.018915. Value loss: 0.630959. Entropy: 1.278144.
Iteration 16662: Policy loss: -0.035611. Value loss: 0.497119. Entropy: 1.281928.
Training network. lr: 0.000122. clip: 0.048860
Iteration 16663: Policy loss: 0.031684. Value loss: 0.721677. Entropy: 1.268836.
Iteration 16664: Policy loss: 0.004733. Value loss: 0.345542. Entropy: 1.277617.
Iteration 16665: Policy loss: 0.023421. Value loss: 0.278524. Entropy: 1.273893.
Training network. lr: 0.000122. clip: 0.04886

Iteration 16732: Policy loss: -0.047786. Value loss: 1.065419. Entropy: 1.239115.
Iteration 16733: Policy loss: -0.094842. Value loss: 0.696526. Entropy: 1.224634.
Iteration 16734: Policy loss: -0.053433. Value loss: 0.475286. Entropy: 1.241362.
Training network. lr: 0.000122. clip: 0.048704
Iteration 16735: Policy loss: -0.026226. Value loss: 0.521433. Entropy: 1.242591.
Iteration 16736: Policy loss: -0.005234. Value loss: 0.226428. Entropy: 1.250806.
Iteration 16737: Policy loss: -0.063585. Value loss: 0.186818. Entropy: 1.248077.
episode: 2291   score: 33800.0  epsilon: 1.0    steps: 341  evaluation reward: 31896.0
Training network. lr: 0.000122. clip: 0.048704
Iteration 16738: Policy loss: 0.144103. Value loss: 0.319912. Entropy: 1.215540.
Iteration 16739: Policy loss: 0.115215. Value loss: 0.277708. Entropy: 1.212279.
Iteration 16740: Policy loss: 0.126142. Value loss: 0.186980. Entropy: 1.223576.
Training network. lr: 0.000122. clip: 0.048704
Iteration 16741: Policy loss: -0.1672

Training network. lr: 0.000121. clip: 0.048400
Iteration 16810: Policy loss: 0.020511. Value loss: 1.245921. Entropy: 1.238827.
Iteration 16811: Policy loss: -0.056566. Value loss: 0.865903. Entropy: 1.237298.
Iteration 16812: Policy loss: 0.026448. Value loss: 0.685330. Entropy: 1.239301.
Training network. lr: 0.000121. clip: 0.048400
Iteration 16813: Policy loss: 0.191915. Value loss: 0.859230. Entropy: 1.244619.
Iteration 16814: Policy loss: 0.187467. Value loss: 0.507186. Entropy: 1.241370.
Iteration 16815: Policy loss: 0.193871. Value loss: 0.407673. Entropy: 1.243694.
episode: 2299   score: 41500.0  epsilon: 1.0    steps: 322  evaluation reward: 32267.0
Training network. lr: 0.000121. clip: 0.048400
Iteration 16816: Policy loss: 0.208847. Value loss: 0.633259. Entropy: 1.234761.
Iteration 16817: Policy loss: 0.183594. Value loss: 0.432300. Entropy: 1.229053.
Iteration 16818: Policy loss: 0.224993. Value loss: 0.364224. Entropy: 1.241285.
Training network. lr: 0.000121. clip: 0.04

Training network. lr: 0.000121. clip: 0.048243
Iteration 16885: Policy loss: -0.234427. Value loss: 0.893488. Entropy: 1.203075.
Iteration 16886: Policy loss: -0.269233. Value loss: 0.656737. Entropy: 1.198152.
Iteration 16887: Policy loss: -0.232776. Value loss: 0.549482. Entropy: 1.199817.
Training network. lr: 0.000121. clip: 0.048243
Iteration 16888: Policy loss: -0.161371. Value loss: 0.659419. Entropy: 1.201943.
Iteration 16889: Policy loss: -0.150530. Value loss: 0.405971. Entropy: 1.202371.
Iteration 16890: Policy loss: -0.141308. Value loss: 0.328353. Entropy: 1.198950.
Training network. lr: 0.000121. clip: 0.048243
Iteration 16891: Policy loss: 0.102051. Value loss: 0.502974. Entropy: 1.214809.
Iteration 16892: Policy loss: 0.095759. Value loss: 0.259911. Entropy: 1.216379.
Iteration 16893: Policy loss: 0.088446. Value loss: 0.210037. Entropy: 1.219150.
episode: 2309   score: 42000.0  epsilon: 1.0    steps: 423  evaluation reward: 32584.0
Training network. lr: 0.000121. clip:

Iteration 16961: Policy loss: -0.070570. Value loss: 0.211437. Entropy: 1.240403.
Iteration 16962: Policy loss: -0.102860. Value loss: 0.160482. Entropy: 1.240859.
Training network. lr: 0.000120. clip: 0.047939
Iteration 16963: Policy loss: 0.079986. Value loss: 0.737314. Entropy: 1.224500.
Iteration 16964: Policy loss: 0.056291. Value loss: 0.439605. Entropy: 1.210309.
Iteration 16965: Policy loss: 0.059175. Value loss: 0.279296. Entropy: 1.210299.
Training network. lr: 0.000120. clip: 0.047939
Iteration 16966: Policy loss: 0.055168. Value loss: 0.764241. Entropy: 1.250332.
Iteration 16967: Policy loss: 0.041580. Value loss: 0.464729. Entropy: 1.247584.
Iteration 16968: Policy loss: 0.015264. Value loss: 0.382108. Entropy: 1.245359.
episode: 2318   score: 32600.0  epsilon: 1.0    steps: 114  evaluation reward: 32778.0
Training network. lr: 0.000120. clip: 0.047939
Iteration 16969: Policy loss: 0.078877. Value loss: 0.646623. Entropy: 1.227311.
Iteration 16970: Policy loss: 0.072898. V

Iteration 17040: Policy loss: 0.064003. Value loss: 0.331648. Entropy: 1.249263.
Training network. lr: 0.000119. clip: 0.047782
Iteration 17041: Policy loss: 0.191999. Value loss: 0.759159. Entropy: 1.249881.
Iteration 17042: Policy loss: 0.179889. Value loss: 0.475502. Entropy: 1.245853.
Iteration 17043: Policy loss: 0.208089. Value loss: 0.366725. Entropy: 1.246858.
episode: 2325   score: 38800.0  epsilon: 1.0    steps: 321  evaluation reward: 33848.0
episode: 2326   score: 25500.0  epsilon: 1.0    steps: 578  evaluation reward: 33863.0
Training network. lr: 0.000119. clip: 0.047782
Iteration 17044: Policy loss: 0.028467. Value loss: 0.706656. Entropy: 1.233264.
Iteration 17045: Policy loss: 0.011008. Value loss: 0.454120. Entropy: 1.224207.
Iteration 17046: Policy loss: 0.022766. Value loss: 0.363175. Entropy: 1.224559.
Training network. lr: 0.000119. clip: 0.047782
Iteration 17047: Policy loss: 0.058501. Value loss: 0.939696. Entropy: 1.205355.
Iteration 17048: Policy loss: 0.08369

Iteration 17115: Policy loss: -0.347922. Value loss: 0.522211. Entropy: 1.229041.
Training network. lr: 0.000119. clip: 0.047478
Iteration 17116: Policy loss: -0.052446. Value loss: 0.952966. Entropy: 1.252988.
Iteration 17117: Policy loss: -0.099894. Value loss: 0.727731. Entropy: 1.250883.
Iteration 17118: Policy loss: -0.022876. Value loss: 0.405587. Entropy: 1.252636.
episode: 2336   score: 27400.0  epsilon: 1.0    steps: 247  evaluation reward: 34594.0
Training network. lr: 0.000119. clip: 0.047478
Iteration 17119: Policy loss: 0.003919. Value loss: 0.660237. Entropy: 1.231308.
Iteration 17120: Policy loss: 0.018461. Value loss: 0.381218. Entropy: 1.230240.
Iteration 17121: Policy loss: 0.004786. Value loss: 0.348562. Entropy: 1.233799.
episode: 2337   score: 39000.0  epsilon: 1.0    steps: 706  evaluation reward: 34863.0
Training network. lr: 0.000119. clip: 0.047478
Iteration 17122: Policy loss: 0.101707. Value loss: 1.121222. Entropy: 1.262357.
Iteration 17123: Policy loss: 0.0

episode: 2343   score: 67800.0  epsilon: 1.0    steps: 382  evaluation reward: 35974.0
Training network. lr: 0.000118. clip: 0.047321
Iteration 17194: Policy loss: -0.092467. Value loss: 0.774446. Entropy: 1.204004.
Iteration 17195: Policy loss: -0.077189. Value loss: 0.653417. Entropy: 1.203963.
Iteration 17196: Policy loss: -0.074355. Value loss: 0.507199. Entropy: 1.202876.
episode: 2344   score: 69900.0  epsilon: 1.0    steps: 831  evaluation reward: 36496.0
Training network. lr: 0.000118. clip: 0.047321
Iteration 17197: Policy loss: 0.131253. Value loss: 1.119159. Entropy: 1.220120.
Iteration 17198: Policy loss: 0.138970. Value loss: 0.784426. Entropy: 1.232881.
Iteration 17199: Policy loss: 0.169800. Value loss: 0.432458. Entropy: 1.226579.
Training network. lr: 0.000118. clip: 0.047321
Iteration 17200: Policy loss: 0.426611. Value loss: 0.536841. Entropy: 1.245026.
Iteration 17201: Policy loss: 0.373708. Value loss: 0.388890. Entropy: 1.247513.
Iteration 17202: Policy loss: 0.40

Iteration 17269: Policy loss: -0.160043. Value loss: 1.031288. Entropy: 1.244115.
Iteration 17270: Policy loss: -0.170026. Value loss: 0.693739. Entropy: 1.246034.
Iteration 17271: Policy loss: -0.171868. Value loss: 0.560097. Entropy: 1.249979.
episode: 2353   score: 15900.0  epsilon: 1.0    steps: 94  evaluation reward: 36518.0
Training network. lr: 0.000118. clip: 0.047017
Iteration 17272: Policy loss: 0.024379. Value loss: 0.573467. Entropy: 1.258581.
Iteration 17273: Policy loss: 0.051062. Value loss: 0.436307. Entropy: 1.262722.
Iteration 17274: Policy loss: 0.051902. Value loss: 0.392936. Entropy: 1.263131.
Training network. lr: 0.000118. clip: 0.047017
Iteration 17275: Policy loss: -0.142576. Value loss: 0.853079. Entropy: 1.253191.
Iteration 17276: Policy loss: -0.144645. Value loss: 0.713457. Entropy: 1.255852.
Iteration 17277: Policy loss: -0.149985. Value loss: 0.649135. Entropy: 1.250670.
episode: 2354   score: 25000.0  epsilon: 1.0    steps: 866  evaluation reward: 36514.

Iteration 17343: Policy loss: -0.241047. Value loss: 0.763015. Entropy: 1.247771.
Training network. lr: 0.000117. clip: 0.046861
Iteration 17344: Policy loss: -0.156468. Value loss: 0.650865. Entropy: 1.243552.
Iteration 17345: Policy loss: -0.150821. Value loss: 0.462935. Entropy: 1.240569.
Iteration 17346: Policy loss: -0.153732. Value loss: 0.418624. Entropy: 1.239412.
Training network. lr: 0.000117. clip: 0.046861
Iteration 17347: Policy loss: 0.067225. Value loss: 0.427923. Entropy: 1.262707.
Iteration 17348: Policy loss: 0.094106. Value loss: 0.345225. Entropy: 1.257740.
Iteration 17349: Policy loss: 0.060506. Value loss: 0.248636. Entropy: 1.266646.
Training network. lr: 0.000117. clip: 0.046861
Iteration 17350: Policy loss: -0.309770. Value loss: 0.667813. Entropy: 1.245822.
Iteration 17351: Policy loss: -0.288890. Value loss: 0.453308. Entropy: 1.248729.
Iteration 17352: Policy loss: -0.306748. Value loss: 0.385922. Entropy: 1.249602.
episode: 2365   score: 34200.0  epsilon: 1

episode: 2372   score: 63900.0  epsilon: 1.0    steps: 995  evaluation reward: 36297.0
Training network. lr: 0.000116. clip: 0.046556
Iteration 17422: Policy loss: 0.338300. Value loss: 0.500352. Entropy: 1.226973.
Iteration 17423: Policy loss: 0.324329. Value loss: 0.274431. Entropy: 1.230328.
Iteration 17424: Policy loss: 0.304218. Value loss: 0.210249. Entropy: 1.230521.
Training network. lr: 0.000116. clip: 0.046556
Iteration 17425: Policy loss: 0.210885. Value loss: 0.425327. Entropy: 1.224360.
Iteration 17426: Policy loss: 0.177406. Value loss: 0.299500. Entropy: 1.223245.
Iteration 17427: Policy loss: 0.188594. Value loss: 0.231381. Entropy: 1.228232.
episode: 2373   score: 56400.0  epsilon: 1.0    steps: 412  evaluation reward: 36531.0
Training network. lr: 0.000116. clip: 0.046556
Iteration 17428: Policy loss: -0.005268. Value loss: 0.488011. Entropy: 1.237815.
Iteration 17429: Policy loss: 0.015987. Value loss: 0.307483. Entropy: 1.241186.
Iteration 17430: Policy loss: 0.0130

Iteration 17497: Policy loss: 0.134651. Value loss: 0.868997. Entropy: 1.191519.
Iteration 17498: Policy loss: 0.089226. Value loss: 0.585978. Entropy: 1.192626.
Iteration 17499: Policy loss: 0.086246. Value loss: 0.514862. Entropy: 1.193792.
Training network. lr: 0.000116. clip: 0.046400
Iteration 17500: Policy loss: 0.089357. Value loss: 0.726652. Entropy: 1.209912.
Iteration 17501: Policy loss: 0.116827. Value loss: 0.570835. Entropy: 1.204179.
Iteration 17502: Policy loss: 0.175369. Value loss: 0.400643. Entropy: 1.201409.
Training network. lr: 0.000116. clip: 0.046243
Iteration 17503: Policy loss: -0.039923. Value loss: 0.740669. Entropy: 1.234858.
Iteration 17504: Policy loss: -0.002179. Value loss: 0.370519. Entropy: 1.247243.
Iteration 17505: Policy loss: -0.017827. Value loss: 0.324234. Entropy: 1.240616.
Training network. lr: 0.000116. clip: 0.046243
Iteration 17506: Policy loss: 0.310851. Value loss: 0.922529. Entropy: 1.212075.
Iteration 17507: Policy loss: 0.263845. Value 

Iteration 17572: Policy loss: -0.073798. Value loss: 0.592502. Entropy: 1.216102.
Iteration 17573: Policy loss: -0.106220. Value loss: 0.323080. Entropy: 1.212467.
Iteration 17574: Policy loss: -0.127211. Value loss: 0.262939. Entropy: 1.207333.
episode: 2393   score: 48900.0  epsilon: 1.0    steps: 610  evaluation reward: 37371.0
Training network. lr: 0.000115. clip: 0.046096
Iteration 17575: Policy loss: 0.180375. Value loss: 0.482640. Entropy: 1.248359.
Iteration 17576: Policy loss: 0.165620. Value loss: 0.291958. Entropy: 1.256389.
Iteration 17577: Policy loss: 0.172609. Value loss: 0.255772. Entropy: 1.257303.
Training network. lr: 0.000115. clip: 0.046096
Iteration 17578: Policy loss: -0.041324. Value loss: 0.539535. Entropy: 1.263913.
Iteration 17579: Policy loss: -0.061085. Value loss: 0.330933. Entropy: 1.265773.
Iteration 17580: Policy loss: -0.045170. Value loss: 0.219338. Entropy: 1.263360.
Training network. lr: 0.000115. clip: 0.046096
Iteration 17581: Policy loss: 0.05652

episode: 2404   score: 19800.0  epsilon: 1.0    steps: 399  evaluation reward: 36119.0
Training network. lr: 0.000115. clip: 0.045939
Iteration 17647: Policy loss: 0.247201. Value loss: 0.631454. Entropy: 1.269288.
Iteration 17648: Policy loss: 0.238015. Value loss: 0.393425. Entropy: 1.275169.
Iteration 17649: Policy loss: 0.255912. Value loss: 0.298474. Entropy: 1.268870.
Training network. lr: 0.000115. clip: 0.045939
Iteration 17650: Policy loss: 0.031565. Value loss: 0.611970. Entropy: 1.263638.
Iteration 17651: Policy loss: 0.013127. Value loss: 0.474832. Entropy: 1.261241.
Iteration 17652: Policy loss: 0.018322. Value loss: 0.365271. Entropy: 1.265953.
Training network. lr: 0.000114. clip: 0.045782
Iteration 17653: Policy loss: 0.077857. Value loss: 0.539455. Entropy: 1.278994.
Iteration 17654: Policy loss: 0.061341. Value loss: 0.313830. Entropy: 1.267927.
Iteration 17655: Policy loss: 0.057462. Value loss: 0.231019. Entropy: 1.271408.
Training network. lr: 0.000114. clip: 0.045

Iteration 17724: Policy loss: -0.162332. Value loss: 0.863765. Entropy: 1.285036.
Training network. lr: 0.000114. clip: 0.045635
Iteration 17725: Policy loss: 0.267316. Value loss: 0.476616. Entropy: 1.273778.
Iteration 17726: Policy loss: 0.258690. Value loss: 0.268923. Entropy: 1.272757.
Iteration 17727: Policy loss: 0.263140. Value loss: 0.198124. Entropy: 1.268127.
Training network. lr: 0.000114. clip: 0.045635
Iteration 17728: Policy loss: 0.185998. Value loss: 0.519367. Entropy: 1.257594.
Iteration 17729: Policy loss: 0.180896. Value loss: 0.363001. Entropy: 1.253025.
Iteration 17730: Policy loss: 0.190031. Value loss: 0.304224. Entropy: 1.253720.
Training network. lr: 0.000114. clip: 0.045635
Iteration 17731: Policy loss: -0.218041. Value loss: 1.054883. Entropy: 1.251110.
Iteration 17732: Policy loss: -0.232276. Value loss: 0.752310. Entropy: 1.249635.
Iteration 17733: Policy loss: -0.239863. Value loss: 0.654560. Entropy: 1.249126.
episode: 2413   score: 35100.0  epsilon: 1.0 

Training network. lr: 0.000114. clip: 0.045478
Iteration 17797: Policy loss: -0.023252. Value loss: 0.839956. Entropy: 1.295407.
Iteration 17798: Policy loss: -0.047752. Value loss: 0.524768. Entropy: 1.302209.
Iteration 17799: Policy loss: -0.010305. Value loss: 0.336282. Entropy: 1.297967.
Training network. lr: 0.000114. clip: 0.045478
Iteration 17800: Policy loss: 0.067435. Value loss: 0.677798. Entropy: 1.301828.
Iteration 17801: Policy loss: 0.068950. Value loss: 0.642388. Entropy: 1.292388.
Iteration 17802: Policy loss: 0.060424. Value loss: 0.417355. Entropy: 1.294106.
Training network. lr: 0.000113. clip: 0.045321
Iteration 17803: Policy loss: 0.212846. Value loss: 1.644619. Entropy: 1.276538.
Iteration 17804: Policy loss: 0.115375. Value loss: 1.491156. Entropy: 1.277339.
Iteration 17805: Policy loss: 0.174178. Value loss: 1.202270. Entropy: 1.270718.
Training network. lr: 0.000113. clip: 0.045321
Iteration 17806: Policy loss: 0.244670. Value loss: 0.742007. Entropy: 1.278156.

Iteration 17873: Policy loss: 0.042274. Value loss: 0.317715. Entropy: 1.267729.
Iteration 17874: Policy loss: 0.008578. Value loss: 0.244550. Entropy: 1.267510.
Training network. lr: 0.000113. clip: 0.045174
Iteration 17875: Policy loss: -0.101527. Value loss: 0.945697. Entropy: 1.249561.
Iteration 17876: Policy loss: -0.108147. Value loss: 0.628109. Entropy: 1.261036.
Iteration 17877: Policy loss: -0.123010. Value loss: 0.469683. Entropy: 1.247256.
episode: 2436   score: 22100.0  epsilon: 1.0    steps: 477  evaluation reward: 33308.0
episode: 2437   score: 20800.0  epsilon: 1.0    steps: 739  evaluation reward: 33126.0
Training network. lr: 0.000113. clip: 0.045174
Iteration 17878: Policy loss: -0.006691. Value loss: 0.649352. Entropy: 1.264760.
Iteration 17879: Policy loss: 0.024892. Value loss: 0.451887. Entropy: 1.255208.
Iteration 17880: Policy loss: 0.002870. Value loss: 0.323132. Entropy: 1.253203.
Training network. lr: 0.000113. clip: 0.045174
Iteration 17881: Policy loss: -0.

Iteration 17949: Policy loss: -0.095736. Value loss: 0.306282. Entropy: 1.304920.
Training network. lr: 0.000113. clip: 0.045017
Iteration 17950: Policy loss: 0.098905. Value loss: 0.657227. Entropy: 1.277387.
Iteration 17951: Policy loss: 0.042545. Value loss: 0.531346. Entropy: 1.277068.
Iteration 17952: Policy loss: 0.051998. Value loss: 0.370965. Entropy: 1.281483.
Training network. lr: 0.000112. clip: 0.044861
Iteration 17953: Policy loss: -0.067121. Value loss: 0.988803. Entropy: 1.241663.
Iteration 17954: Policy loss: -0.080195. Value loss: 0.652314. Entropy: 1.238890.
Iteration 17955: Policy loss: -0.116431. Value loss: 0.533433. Entropy: 1.239369.
Training network. lr: 0.000112. clip: 0.044861
Iteration 17956: Policy loss: -0.401292. Value loss: 1.941224. Entropy: 1.273075.
Iteration 17957: Policy loss: -0.334191. Value loss: 1.014372. Entropy: 1.275349.
Iteration 17958: Policy loss: -0.339130. Value loss: 0.892301. Entropy: 1.278093.
Training network. lr: 0.000112. clip: 0.04

Iteration 18024: Policy loss: 0.166787. Value loss: 0.197588. Entropy: 1.282814.
Training network. lr: 0.000112. clip: 0.044713
Iteration 18025: Policy loss: 0.082235. Value loss: 0.529524. Entropy: 1.287009.
Iteration 18026: Policy loss: 0.062706. Value loss: 0.313605. Entropy: 1.295647.
Iteration 18027: Policy loss: 0.100821. Value loss: 0.193238. Entropy: 1.294195.
Training network. lr: 0.000112. clip: 0.044713
Iteration 18028: Policy loss: -0.303048. Value loss: 1.044121. Entropy: 1.269654.
Iteration 18029: Policy loss: -0.284015. Value loss: 0.642756. Entropy: 1.269669.
Iteration 18030: Policy loss: -0.292177. Value loss: 0.539856. Entropy: 1.262041.
Training network. lr: 0.000112. clip: 0.044713
Iteration 18031: Policy loss: -0.095249. Value loss: 1.496270. Entropy: 1.256381.
Iteration 18032: Policy loss: -0.102644. Value loss: 0.997689. Entropy: 1.249958.
Iteration 18033: Policy loss: -0.102763. Value loss: 0.681717. Entropy: 1.255030.
Training network. lr: 0.000112. clip: 0.044

Iteration 18101: Policy loss: -0.097280. Value loss: 0.421106. Entropy: 1.248238.
Iteration 18102: Policy loss: -0.086876. Value loss: 0.358691. Entropy: 1.246430.
Training network. lr: 0.000111. clip: 0.044400
Iteration 18103: Policy loss: -0.070474. Value loss: 0.561960. Entropy: 1.254687.
Iteration 18104: Policy loss: -0.046139. Value loss: 0.364159. Entropy: 1.251047.
Iteration 18105: Policy loss: -0.066263. Value loss: 0.329996. Entropy: 1.249063.
Training network. lr: 0.000111. clip: 0.044400
Iteration 18106: Policy loss: 0.462263. Value loss: 0.538868. Entropy: 1.253089.
Iteration 18107: Policy loss: 0.452216. Value loss: 0.373617. Entropy: 1.259419.
Iteration 18108: Policy loss: 0.483769. Value loss: 0.261163. Entropy: 1.255221.
Training network. lr: 0.000111. clip: 0.044400
Iteration 18109: Policy loss: -0.076819. Value loss: 1.004620. Entropy: 1.242001.
Iteration 18110: Policy loss: -0.082636. Value loss: 0.690775. Entropy: 1.238622.
Iteration 18111: Policy loss: -0.059916. V

Iteration 18177: Policy loss: 0.144354. Value loss: 0.423738. Entropy: 1.233876.
Training network. lr: 0.000111. clip: 0.044252
Iteration 18178: Policy loss: -0.099394. Value loss: 1.349820. Entropy: 1.260018.
Iteration 18179: Policy loss: -0.053937. Value loss: 0.810075. Entropy: 1.260738.
Iteration 18180: Policy loss: -0.044319. Value loss: 0.608030. Entropy: 1.259179.
Training network. lr: 0.000111. clip: 0.044252
Iteration 18181: Policy loss: 0.032345. Value loss: 0.789744. Entropy: 1.272627.
Iteration 18182: Policy loss: 0.029251. Value loss: 0.433092. Entropy: 1.274534.
Iteration 18183: Policy loss: 0.068635. Value loss: 0.349299. Entropy: 1.272705.
episode: 2476   score: 22900.0  epsilon: 1.0    steps: 628  evaluation reward: 30759.0
episode: 2477   score: 40400.0  epsilon: 1.0    steps: 862  evaluation reward: 30885.0
Training network. lr: 0.000111. clip: 0.044252
Iteration 18184: Policy loss: 0.025768. Value loss: 0.418097. Entropy: 1.289868.
Iteration 18185: Policy loss: 0.04

Iteration 18255: Policy loss: 0.041572. Value loss: 0.342302. Entropy: 1.221655.
episode: 2484   score: 44100.0  epsilon: 1.0    steps: 134  evaluation reward: 30726.0
episode: 2485   score: 17900.0  epsilon: 1.0    steps: 478  evaluation reward: 30609.0
Training network. lr: 0.000110. clip: 0.043939
Iteration 18256: Policy loss: 0.062707. Value loss: 0.516646. Entropy: 1.207654.
Iteration 18257: Policy loss: 0.040483. Value loss: 0.313195. Entropy: 1.214750.
Iteration 18258: Policy loss: 0.056125. Value loss: 0.268077. Entropy: 1.206600.
Training network. lr: 0.000110. clip: 0.043939
Iteration 18259: Policy loss: -0.165958. Value loss: 0.713832. Entropy: 1.245328.
Iteration 18260: Policy loss: -0.121861. Value loss: 0.421734. Entropy: 1.245001.
Iteration 18261: Policy loss: -0.118470. Value loss: 0.347786. Entropy: 1.245970.
episode: 2486   score: 28400.0  epsilon: 1.0    steps: 369  evaluation reward: 30203.0
episode: 2487   score: 49200.0  epsilon: 1.0    steps: 600  evaluation rewa

Training network. lr: 0.000109. clip: 0.043792
Iteration 18331: Policy loss: -0.285192. Value loss: 1.195792. Entropy: 1.170164.
Iteration 18332: Policy loss: -0.242724. Value loss: 0.854988. Entropy: 1.169437.
Iteration 18333: Policy loss: -0.260417. Value loss: 0.710845. Entropy: 1.173096.
Training network. lr: 0.000109. clip: 0.043792
Iteration 18334: Policy loss: 0.363369. Value loss: 0.480855. Entropy: 1.213129.
Iteration 18335: Policy loss: 0.384566. Value loss: 0.280846. Entropy: 1.210414.
Iteration 18336: Policy loss: 0.343834. Value loss: 0.252140. Entropy: 1.224612.
Training network. lr: 0.000109. clip: 0.043792
Iteration 18337: Policy loss: -0.051506. Value loss: 1.498973. Entropy: 1.161725.
Iteration 18338: Policy loss: -0.024360. Value loss: 1.197857. Entropy: 1.159410.
Iteration 18339: Policy loss: -0.115761. Value loss: 0.958462. Entropy: 1.159494.
episode: 2494   score: 49700.0  epsilon: 1.0    steps: 1022  evaluation reward: 31127.0
Training network. lr: 0.000109. clip

Iteration 18406: Policy loss: 0.164283. Value loss: 0.421053. Entropy: 1.226282.
Iteration 18407: Policy loss: 0.158345. Value loss: 0.253176. Entropy: 1.224796.
Iteration 18408: Policy loss: 0.157786. Value loss: 0.200283. Entropy: 1.224432.
episode: 2504   score: 27500.0  epsilon: 1.0    steps: 679  evaluation reward: 31982.0
episode: 2505   score: 28500.0  epsilon: 1.0    steps: 839  evaluation reward: 32000.0
Training network. lr: 0.000109. clip: 0.043478
Iteration 18409: Policy loss: 0.024003. Value loss: 0.388488. Entropy: 1.236094.
Iteration 18410: Policy loss: 0.032038. Value loss: 0.228265. Entropy: 1.230326.
Iteration 18411: Policy loss: 0.061928. Value loss: 0.177549. Entropy: 1.228833.
Training network. lr: 0.000109. clip: 0.043478
Iteration 18412: Policy loss: -0.004142. Value loss: 0.415585. Entropy: 1.233699.
Iteration 18413: Policy loss: -0.000269. Value loss: 0.248339. Entropy: 1.215125.
Iteration 18414: Policy loss: -0.031998. Value loss: 0.209817. Entropy: 1.219761.


Training network. lr: 0.000108. clip: 0.043331
Iteration 18481: Policy loss: 0.118437. Value loss: 0.618054. Entropy: 1.265105.
Iteration 18482: Policy loss: 0.129126. Value loss: 0.412349. Entropy: 1.264799.
Iteration 18483: Policy loss: 0.098604. Value loss: 0.303146. Entropy: 1.265836.
Training network. lr: 0.000108. clip: 0.043331
Iteration 18484: Policy loss: -0.097236. Value loss: 0.803554. Entropy: 1.265413.
Iteration 18485: Policy loss: -0.156840. Value loss: 0.404453. Entropy: 1.273398.
Iteration 18486: Policy loss: -0.132117. Value loss: 0.325977. Entropy: 1.271379.
Training network. lr: 0.000108. clip: 0.043331
Iteration 18487: Policy loss: 0.029501. Value loss: 0.698906. Entropy: 1.265943.
Iteration 18488: Policy loss: 0.046868. Value loss: 0.614461. Entropy: 1.266719.
Iteration 18489: Policy loss: 0.051612. Value loss: 0.364427. Entropy: 1.273629.
Training network. lr: 0.000108. clip: 0.043331
Iteration 18490: Policy loss: -0.066222. Value loss: 0.789767. Entropy: 1.276857

Iteration 18558: Policy loss: 0.172205. Value loss: 0.277998. Entropy: 1.268601.
episode: 2524   score: 25800.0  epsilon: 1.0    steps: 6  evaluation reward: 32667.0
Training network. lr: 0.000108. clip: 0.043017
Iteration 18559: Policy loss: -0.247029. Value loss: 0.623226. Entropy: 1.270831.
Iteration 18560: Policy loss: -0.244297. Value loss: 0.463144. Entropy: 1.266769.
Iteration 18561: Policy loss: -0.267767. Value loss: 0.391288. Entropy: 1.265666.
episode: 2525   score: 29500.0  epsilon: 1.0    steps: 965  evaluation reward: 32760.0
Training network. lr: 0.000108. clip: 0.043017
Iteration 18562: Policy loss: 0.094657. Value loss: 0.346298. Entropy: 1.267405.
Iteration 18563: Policy loss: 0.111514. Value loss: 0.230136. Entropy: 1.279379.
Iteration 18564: Policy loss: 0.106982. Value loss: 0.145809. Entropy: 1.265949.
episode: 2526   score: 11300.0  epsilon: 1.0    steps: 132  evaluation reward: 32538.0
episode: 2527   score: 20300.0  epsilon: 1.0    steps: 619  evaluation reward

episode: 2537   score: 33200.0  epsilon: 1.0    steps: 481  evaluation reward: 32337.0
Training network. lr: 0.000107. clip: 0.042870
Iteration 18631: Policy loss: 0.107462. Value loss: 0.602667. Entropy: 1.301554.
Iteration 18632: Policy loss: 0.100884. Value loss: 0.292239. Entropy: 1.304488.
Iteration 18633: Policy loss: 0.101043. Value loss: 0.196893. Entropy: 1.302058.
Training network. lr: 0.000107. clip: 0.042870
Iteration 18634: Policy loss: -0.036088. Value loss: 0.798352. Entropy: 1.275062.
Iteration 18635: Policy loss: -0.062948. Value loss: 0.606626. Entropy: 1.276145.
Iteration 18636: Policy loss: -0.010298. Value loss: 0.466326. Entropy: 1.268766.
Training network. lr: 0.000107. clip: 0.042870
Iteration 18637: Policy loss: 0.259120. Value loss: 0.389854. Entropy: 1.284414.
Iteration 18638: Policy loss: 0.227777. Value loss: 0.203585. Entropy: 1.287788.
Iteration 18639: Policy loss: 0.258836. Value loss: 0.135601. Entropy: 1.285633.
Training network. lr: 0.000107. clip: 0.

Iteration 18706: Policy loss: 0.208691. Value loss: 0.663131. Entropy: 1.273204.
Iteration 18707: Policy loss: 0.201886. Value loss: 0.418095. Entropy: 1.271139.
Iteration 18708: Policy loss: 0.213663. Value loss: 0.278086. Entropy: 1.271027.
Training network. lr: 0.000106. clip: 0.042557
Iteration 18709: Policy loss: -0.180162. Value loss: 0.971425. Entropy: 1.253569.
Iteration 18710: Policy loss: -0.193663. Value loss: 0.527657. Entropy: 1.251460.
Iteration 18711: Policy loss: -0.178308. Value loss: 0.402744. Entropy: 1.262511.
Training network. lr: 0.000106. clip: 0.042557
Iteration 18712: Policy loss: 0.033274. Value loss: 0.839833. Entropy: 1.252269.
Iteration 18713: Policy loss: 0.125225. Value loss: 0.558346. Entropy: 1.242979.
Iteration 18714: Policy loss: 0.014201. Value loss: 0.412995. Entropy: 1.250663.
Training network. lr: 0.000106. clip: 0.042557
Iteration 18715: Policy loss: -0.277304. Value loss: 0.806185. Entropy: 1.246735.
Iteration 18716: Policy loss: -0.231919. Valu

Iteration 18782: Policy loss: -0.025272. Value loss: 0.264780. Entropy: 1.256005.
Iteration 18783: Policy loss: -0.044800. Value loss: 0.178671. Entropy: 1.252128.
Training network. lr: 0.000106. clip: 0.042409
Iteration 18784: Policy loss: -0.163970. Value loss: 0.673246. Entropy: 1.263534.
Iteration 18785: Policy loss: -0.151009. Value loss: 0.372501. Entropy: 1.271924.
Iteration 18786: Policy loss: -0.182545. Value loss: 0.272884. Entropy: 1.263816.
Training network. lr: 0.000106. clip: 0.042409
Iteration 18787: Policy loss: 0.092839. Value loss: 0.371769. Entropy: 1.261763.
Iteration 18788: Policy loss: 0.102781. Value loss: 0.262780. Entropy: 1.261861.
Iteration 18789: Policy loss: 0.111023. Value loss: 0.225911. Entropy: 1.266613.
Training network. lr: 0.000106. clip: 0.042409
Iteration 18790: Policy loss: -0.261884. Value loss: 0.882177. Entropy: 1.247636.
Iteration 18791: Policy loss: -0.246661. Value loss: 0.720810. Entropy: 1.239846.
Iteration 18792: Policy loss: -0.232798. V

episode: 2566   score: 23000.0  epsilon: 1.0    steps: 158  evaluation reward: 31570.0
Training network. lr: 0.000105. clip: 0.042096
Iteration 18859: Policy loss: -0.194705. Value loss: 0.755336. Entropy: 1.259777.
Iteration 18860: Policy loss: -0.180272. Value loss: 0.619952. Entropy: 1.261883.
Iteration 18861: Policy loss: -0.212839. Value loss: 0.402567. Entropy: 1.263465.
Training network. lr: 0.000105. clip: 0.042096
Iteration 18862: Policy loss: 0.009627. Value loss: 0.443123. Entropy: 1.277203.
Iteration 18863: Policy loss: 0.023067. Value loss: 0.351108. Entropy: 1.273102.
Iteration 18864: Policy loss: 0.002684. Value loss: 0.300862. Entropy: 1.283096.
Training network. lr: 0.000105. clip: 0.042096
Iteration 18865: Policy loss: -0.109709. Value loss: 1.367778. Entropy: 1.232177.
Iteration 18866: Policy loss: -0.140164. Value loss: 1.046597. Entropy: 1.240994.
Iteration 18867: Policy loss: -0.126475. Value loss: 0.618344. Entropy: 1.240294.
episode: 2567   score: 39900.0  epsil

Training network. lr: 0.000105. clip: 0.041948
Iteration 18934: Policy loss: 0.194013. Value loss: 0.716513. Entropy: 1.267197.
Iteration 18935: Policy loss: 0.194414. Value loss: 0.458364. Entropy: 1.263189.
Iteration 18936: Policy loss: 0.207336. Value loss: 0.339585. Entropy: 1.263797.
Training network. lr: 0.000105. clip: 0.041948
Iteration 18937: Policy loss: -0.072064. Value loss: 0.381521. Entropy: 1.293954.
Iteration 18938: Policy loss: 0.003639. Value loss: 0.201252. Entropy: 1.299589.
Iteration 18939: Policy loss: -0.066192. Value loss: 0.198782. Entropy: 1.296135.
episode: 2577   score: 28300.0  epsilon: 1.0    steps: 686  evaluation reward: 31586.0
Training network. lr: 0.000105. clip: 0.041948
Iteration 18940: Policy loss: -0.170855. Value loss: 0.740258. Entropy: 1.281422.
Iteration 18941: Policy loss: -0.190440. Value loss: 0.566679. Entropy: 1.278990.
Iteration 18942: Policy loss: -0.180708. Value loss: 0.433201. Entropy: 1.281062.
episode: 2578   score: 23300.0  epsilo

episode: 2588   score: 29800.0  epsilon: 1.0    steps: 710  evaluation reward: 30865.0
Training network. lr: 0.000104. clip: 0.041635
Iteration 19009: Policy loss: -0.406686. Value loss: 1.266146. Entropy: 1.261832.
Iteration 19010: Policy loss: -0.442869. Value loss: 0.737981. Entropy: 1.265529.
Iteration 19011: Policy loss: -0.354581. Value loss: 0.505536. Entropy: 1.260379.
Training network. lr: 0.000104. clip: 0.041635
Iteration 19012: Policy loss: 0.411483. Value loss: 0.591765. Entropy: 1.276950.
Iteration 19013: Policy loss: 0.406006. Value loss: 0.337517. Entropy: 1.274136.
Iteration 19014: Policy loss: 0.400075. Value loss: 0.290233. Entropy: 1.274593.
Training network. lr: 0.000104. clip: 0.041635
Iteration 19015: Policy loss: 0.293063. Value loss: 1.181542. Entropy: 1.265959.
Iteration 19016: Policy loss: 0.237545. Value loss: 0.740236. Entropy: 1.270714.
Iteration 19017: Policy loss: 0.305604. Value loss: 0.509128. Entropy: 1.269580.
Training network. lr: 0.000104. clip: 0.

Iteration 19084: Policy loss: 0.040719. Value loss: 0.475641. Entropy: 1.275580.
Iteration 19085: Policy loss: 0.039784. Value loss: 0.239191. Entropy: 1.276646.
Iteration 19086: Policy loss: 0.088607. Value loss: 0.174276. Entropy: 1.272648.
episode: 2598   score: 26400.0  epsilon: 1.0    steps: 329  evaluation reward: 29792.0
Training network. lr: 0.000104. clip: 0.041488
Iteration 19087: Policy loss: -0.194024. Value loss: 1.011404. Entropy: 1.268810.
Iteration 19088: Policy loss: -0.155099. Value loss: 0.404262. Entropy: 1.258143.
Iteration 19089: Policy loss: -0.144355. Value loss: 0.327902. Entropy: 1.267460.
Training network. lr: 0.000104. clip: 0.041488
Iteration 19090: Policy loss: 0.060520. Value loss: 0.830380. Entropy: 1.251895.
Iteration 19091: Policy loss: 0.050744. Value loss: 0.491093. Entropy: 1.243455.
Iteration 19092: Policy loss: 0.000217. Value loss: 0.364562. Entropy: 1.245152.
Training network. lr: 0.000104. clip: 0.041488
Iteration 19093: Policy loss: 0.039500. 

Training network. lr: 0.000103. clip: 0.041174
Iteration 19159: Policy loss: -0.143429. Value loss: 0.854719. Entropy: 1.232922.
Iteration 19160: Policy loss: -0.149722. Value loss: 0.534851. Entropy: 1.214778.
Iteration 19161: Policy loss: -0.163121. Value loss: 0.455098. Entropy: 1.230996.
Training network. lr: 0.000103. clip: 0.041174
Iteration 19162: Policy loss: -0.139390. Value loss: 1.858559. Entropy: 1.212197.
Iteration 19163: Policy loss: -0.191571. Value loss: 1.195754. Entropy: 1.224873.
Iteration 19164: Policy loss: -0.182511. Value loss: 1.079911. Entropy: 1.211790.
Training network. lr: 0.000103. clip: 0.041174
Iteration 19165: Policy loss: 0.221707. Value loss: 0.903028. Entropy: 1.228714.
Iteration 19166: Policy loss: 0.240444. Value loss: 0.586602. Entropy: 1.226343.
Iteration 19167: Policy loss: 0.250943. Value loss: 0.482048. Entropy: 1.225777.
Training network. lr: 0.000103. clip: 0.041174
Iteration 19168: Policy loss: 0.158513. Value loss: 0.692442. Entropy: 1.2061

Iteration 19235: Policy loss: -0.061732. Value loss: 0.646898. Entropy: 1.200920.
Iteration 19236: Policy loss: 0.006156. Value loss: 0.401240. Entropy: 1.205149.
Training network. lr: 0.000103. clip: 0.041027
Iteration 19237: Policy loss: -0.240566. Value loss: 0.659601. Entropy: 1.196467.
Iteration 19238: Policy loss: -0.255776. Value loss: 0.453302. Entropy: 1.200832.
Iteration 19239: Policy loss: -0.247535. Value loss: 0.332882. Entropy: 1.193707.
Training network. lr: 0.000103. clip: 0.041027
Iteration 19240: Policy loss: 0.075478. Value loss: 0.608699. Entropy: 1.202064.
Iteration 19241: Policy loss: 0.115036. Value loss: 0.446470. Entropy: 1.202271.
Iteration 19242: Policy loss: 0.096830. Value loss: 0.408570. Entropy: 1.195040.
Training network. lr: 0.000103. clip: 0.041027
Iteration 19243: Policy loss: 0.153825. Value loss: 0.405474. Entropy: 1.241584.
Iteration 19244: Policy loss: 0.147074. Value loss: 0.322556. Entropy: 1.231657.
Iteration 19245: Policy loss: 0.144688. Value

Iteration 19312: Policy loss: 0.312659. Value loss: 0.783423. Entropy: 1.187444.
Iteration 19313: Policy loss: 0.299612. Value loss: 0.699873. Entropy: 1.182853.
Iteration 19314: Policy loss: 0.323480. Value loss: 0.496715. Entropy: 1.186044.
Training network. lr: 0.000102. clip: 0.040713
Iteration 19315: Policy loss: -0.520578. Value loss: 2.180794. Entropy: 1.156988.
Iteration 19316: Policy loss: -0.440762. Value loss: 1.471281. Entropy: 1.139754.
Iteration 19317: Policy loss: -0.475382. Value loss: 1.222000. Entropy: 1.153322.
Training network. lr: 0.000102. clip: 0.040713
Iteration 19318: Policy loss: 0.178765. Value loss: 0.952352. Entropy: 1.181875.
Iteration 19319: Policy loss: 0.214194. Value loss: 0.713244. Entropy: 1.178138.
Iteration 19320: Policy loss: 0.213399. Value loss: 0.509274. Entropy: 1.179452.
Training network. lr: 0.000102. clip: 0.040713
Iteration 19321: Policy loss: 0.246123. Value loss: 0.839163. Entropy: 1.174576.
Iteration 19322: Policy loss: 0.241021. Value 

Iteration 19388: Policy loss: 0.225121. Value loss: 0.247657. Entropy: 1.175872.
Iteration 19389: Policy loss: 0.231831. Value loss: 0.246307. Entropy: 1.179034.
Training network. lr: 0.000101. clip: 0.040566
Iteration 19390: Policy loss: 0.205678. Value loss: 0.830399. Entropy: 1.200872.
Iteration 19391: Policy loss: 0.175906. Value loss: 0.448437. Entropy: 1.201146.
Iteration 19392: Policy loss: 0.208408. Value loss: 0.284485. Entropy: 1.203486.
episode: 2637   score: 22500.0  epsilon: 1.0    steps: 286  evaluation reward: 31526.0
episode: 2638   score: 33100.0  epsilon: 1.0    steps: 533  evaluation reward: 31654.0
Training network. lr: 0.000101. clip: 0.040566
Iteration 19393: Policy loss: 0.062934. Value loss: 0.723521. Entropy: 1.199741.
Iteration 19394: Policy loss: 0.083495. Value loss: 0.481665. Entropy: 1.199441.
Iteration 19395: Policy loss: 0.057563. Value loss: 0.360609. Entropy: 1.206901.
episode: 2639   score: 36800.0  epsilon: 1.0    steps: 1001  evaluation reward: 3172

Training network. lr: 0.000101. clip: 0.040253
Iteration 19465: Policy loss: -0.055107. Value loss: 0.449629. Entropy: 1.184173.
Iteration 19466: Policy loss: -0.083766. Value loss: 0.334947. Entropy: 1.178107.
Iteration 19467: Policy loss: -0.093642. Value loss: 0.295993. Entropy: 1.191216.
Training network. lr: 0.000101. clip: 0.040253
Iteration 19468: Policy loss: 0.214277. Value loss: 0.315567. Entropy: 1.180035.
Iteration 19469: Policy loss: 0.209870. Value loss: 0.219746. Entropy: 1.191677.
Iteration 19470: Policy loss: 0.216886. Value loss: 0.198807. Entropy: 1.190538.
episode: 2646   score: 31800.0  epsilon: 1.0    steps: 479  evaluation reward: 32569.0
episode: 2647   score: 50900.0  epsilon: 1.0    steps: 990  evaluation reward: 32655.0
Training network. lr: 0.000101. clip: 0.040253
Iteration 19471: Policy loss: 0.093121. Value loss: 1.379218. Entropy: 1.149057.
Iteration 19472: Policy loss: 0.083848. Value loss: 1.015329. Entropy: 1.149526.
Iteration 19473: Policy loss: 0.08

Iteration 19541: Policy loss: -0.064019. Value loss: 0.720265. Entropy: 1.184303.
Iteration 19542: Policy loss: -0.106763. Value loss: 0.560753. Entropy: 1.176745.
episode: 2655   score: 38700.0  epsilon: 1.0    steps: 651  evaluation reward: 33384.0
Training network. lr: 0.000100. clip: 0.040105
Iteration 19543: Policy loss: 0.033498. Value loss: 0.694818. Entropy: 1.165906.
Iteration 19544: Policy loss: 0.002443. Value loss: 0.476650. Entropy: 1.171371.
Iteration 19545: Policy loss: 0.020133. Value loss: 0.400690. Entropy: 1.169033.
Training network. lr: 0.000100. clip: 0.040105
Iteration 19546: Policy loss: 0.043296. Value loss: 0.943806. Entropy: 1.201730.
Iteration 19547: Policy loss: 0.018227. Value loss: 0.696926. Entropy: 1.202569.
Iteration 19548: Policy loss: 0.057621. Value loss: 0.509947. Entropy: 1.194001.
Training network. lr: 0.000100. clip: 0.040105
Iteration 19549: Policy loss: 0.094724. Value loss: 0.417751. Entropy: 1.159940.
Iteration 19550: Policy loss: 0.089614. V

Iteration 19619: Policy loss: 0.147262. Value loss: 0.197033. Entropy: 1.225929.
Iteration 19620: Policy loss: 0.116616. Value loss: 0.159510. Entropy: 1.230351.
episode: 2663   score: 26400.0  epsilon: 1.0    steps: 769  evaluation reward: 34453.0
Training network. lr: 0.000099. clip: 0.039792
Iteration 19621: Policy loss: -0.030474. Value loss: 0.769594. Entropy: 1.179361.
Iteration 19622: Policy loss: 0.010180. Value loss: 0.397049. Entropy: 1.179551.
Iteration 19623: Policy loss: 0.007103. Value loss: 0.296626. Entropy: 1.184481.
episode: 2664   score: 54800.0  epsilon: 1.0    steps: 972  evaluation reward: 34562.0
Training network. lr: 0.000099. clip: 0.039792
Iteration 19624: Policy loss: -0.111915. Value loss: 0.552542. Entropy: 1.212881.
Iteration 19625: Policy loss: -0.052400. Value loss: 0.343798. Entropy: 1.200968.
Iteration 19626: Policy loss: -0.110211. Value loss: 0.231764. Entropy: 1.197559.
Training network. lr: 0.000099. clip: 0.039792
Iteration 19627: Policy loss: -0.

Training network. lr: 0.000099. clip: 0.039644
Iteration 19696: Policy loss: -0.090285. Value loss: 0.787937. Entropy: 1.202053.
Iteration 19697: Policy loss: -0.071988. Value loss: 0.535053. Entropy: 1.199155.
Iteration 19698: Policy loss: -0.075224. Value loss: 0.483937. Entropy: 1.205490.
Training network. lr: 0.000099. clip: 0.039644
Iteration 19699: Policy loss: -0.251557. Value loss: 0.898930. Entropy: 1.210269.
Iteration 19700: Policy loss: -0.205782. Value loss: 0.383824. Entropy: 1.221114.
Iteration 19701: Policy loss: -0.183047. Value loss: 0.317540. Entropy: 1.220724.
Training network. lr: 0.000099. clip: 0.039488
Iteration 19702: Policy loss: -0.531310. Value loss: 1.361518. Entropy: 1.181283.
Iteration 19703: Policy loss: -0.530566. Value loss: 1.058951. Entropy: 1.182500.
Iteration 19704: Policy loss: -0.563689. Value loss: 0.807562. Entropy: 1.168648.
Training network. lr: 0.000099. clip: 0.039488
Iteration 19705: Policy loss: 0.329768. Value loss: 0.651573. Entropy: 1.2

Iteration 19772: Policy loss: -0.062673. Value loss: 0.821916. Entropy: 1.150254.
Iteration 19773: Policy loss: -0.050972. Value loss: 0.697577. Entropy: 1.155390.
Training network. lr: 0.000098. clip: 0.039331
Iteration 19774: Policy loss: -0.067445. Value loss: 0.895052. Entropy: 1.147230.
Iteration 19775: Policy loss: -0.057446. Value loss: 0.711177. Entropy: 1.147236.
Iteration 19776: Policy loss: -0.051980. Value loss: 0.541730. Entropy: 1.147687.
Training network. lr: 0.000098. clip: 0.039331
Iteration 19777: Policy loss: 0.055099. Value loss: 0.655923. Entropy: 1.140097.
Iteration 19778: Policy loss: 0.071455. Value loss: 0.367385. Entropy: 1.152989.
Iteration 19779: Policy loss: 0.081350. Value loss: 0.308788. Entropy: 1.149208.
Training network. lr: 0.000098. clip: 0.039331
Iteration 19780: Policy loss: 0.138528. Value loss: 0.640109. Entropy: 1.150324.
Iteration 19781: Policy loss: 0.123255. Value loss: 0.609707. Entropy: 1.160465.
Iteration 19782: Policy loss: 0.149089. Valu

Iteration 19850: Policy loss: 0.300172. Value loss: 0.466669. Entropy: 1.165119.
Iteration 19851: Policy loss: 0.270952. Value loss: 0.364450. Entropy: 1.161242.
Training network. lr: 0.000098. clip: 0.039027
Iteration 19852: Policy loss: -0.101081. Value loss: 0.703272. Entropy: 1.185085.
Iteration 19853: Policy loss: -0.126589. Value loss: 0.426919. Entropy: 1.181695.
Iteration 19854: Policy loss: -0.130884. Value loss: 0.377949. Entropy: 1.185012.
Training network. lr: 0.000098. clip: 0.039027
Iteration 19855: Policy loss: 0.112928. Value loss: 0.957660. Entropy: 1.121663.
Iteration 19856: Policy loss: 0.101666. Value loss: 0.660810. Entropy: 1.118982.
Iteration 19857: Policy loss: 0.070111. Value loss: 0.556602. Entropy: 1.135181.
Training network. lr: 0.000098. clip: 0.039027
Iteration 19858: Policy loss: 0.079853. Value loss: 0.746769. Entropy: 1.133208.
Iteration 19859: Policy loss: 0.051797. Value loss: 0.517583. Entropy: 1.123538.
Iteration 19860: Policy loss: 0.043928. Value 

episode: 2698   score: 22300.0  epsilon: 1.0    steps: 289  evaluation reward: 38118.0
Training network. lr: 0.000097. clip: 0.038870
Iteration 19927: Policy loss: -0.099767. Value loss: 0.605428. Entropy: 1.165163.
Iteration 19928: Policy loss: -0.111270. Value loss: 0.547271. Entropy: 1.169728.
Iteration 19929: Policy loss: -0.117990. Value loss: 0.425080. Entropy: 1.172764.
Training network. lr: 0.000097. clip: 0.038870
Iteration 19930: Policy loss: -0.070774. Value loss: 0.971452. Entropy: 1.167185.
Iteration 19931: Policy loss: -0.056494. Value loss: 0.639758. Entropy: 1.164416.
Iteration 19932: Policy loss: -0.080042. Value loss: 0.523073. Entropy: 1.162396.
Training network. lr: 0.000097. clip: 0.038870
Iteration 19933: Policy loss: -0.170364. Value loss: 0.768294. Entropy: 1.164274.
Iteration 19934: Policy loss: -0.162697. Value loss: 0.427849. Entropy: 1.174704.
Iteration 19935: Policy loss: -0.193313. Value loss: 0.375211. Entropy: 1.166622.
Training network. lr: 0.000097. cl

Training network. lr: 0.000096. clip: 0.038566
Iteration 20002: Policy loss: 0.299513. Value loss: 0.984472. Entropy: 1.179781.
Iteration 20003: Policy loss: 0.196062. Value loss: 0.662804. Entropy: 1.177395.
Iteration 20004: Policy loss: 0.200778. Value loss: 0.550195. Entropy: 1.192328.
Training network. lr: 0.000096. clip: 0.038566
Iteration 20005: Policy loss: -0.041825. Value loss: 0.832157. Entropy: 1.167831.
Iteration 20006: Policy loss: 0.034196. Value loss: 0.534788. Entropy: 1.162941.
Iteration 20007: Policy loss: 0.000127. Value loss: 0.482798. Entropy: 1.164627.
Training network. lr: 0.000096. clip: 0.038566
Iteration 20008: Policy loss: -0.124108. Value loss: 1.041719. Entropy: 1.128156.
Iteration 20009: Policy loss: -0.098251. Value loss: 0.781218. Entropy: 1.132421.
Iteration 20010: Policy loss: -0.120655. Value loss: 0.636124. Entropy: 1.133437.
Training network. lr: 0.000096. clip: 0.038566
Iteration 20011: Policy loss: 0.244617. Value loss: 0.403564. Entropy: 1.163132

Iteration 20078: Policy loss: -0.009517. Value loss: 0.706130. Entropy: 1.128077.
Iteration 20079: Policy loss: -0.038138. Value loss: 0.638952. Entropy: 1.123428.
Training network. lr: 0.000096. clip: 0.038409
Iteration 20080: Policy loss: -0.122880. Value loss: 0.971260. Entropy: 1.112372.
Iteration 20081: Policy loss: -0.129363. Value loss: 0.756071. Entropy: 1.129980.
Iteration 20082: Policy loss: -0.120997. Value loss: 0.635763. Entropy: 1.112748.
episode: 2717   score: 24900.0  epsilon: 1.0    steps: 177  evaluation reward: 38691.0
Training network. lr: 0.000096. clip: 0.038409
Iteration 20083: Policy loss: -0.021456. Value loss: 0.681050. Entropy: 1.161177.
Iteration 20084: Policy loss: -0.033192. Value loss: 0.487831. Entropy: 1.156056.
Iteration 20085: Policy loss: -0.020157. Value loss: 0.408556. Entropy: 1.162171.
episode: 2718   score: 61700.0  epsilon: 1.0    steps: 258  evaluation reward: 39064.0
episode: 2719   score: 29400.0  epsilon: 1.0    steps: 1008  evaluation rewa

Iteration 20156: Policy loss: 0.233760. Value loss: 0.374630. Entropy: 1.171192.
Iteration 20157: Policy loss: 0.241046. Value loss: 0.311902. Entropy: 1.168008.
episode: 2725   score: 80200.0  epsilon: 1.0    steps: 498  evaluation reward: 39434.0
episode: 2726   score: 34000.0  epsilon: 1.0    steps: 739  evaluation reward: 39414.0
episode: 2727   score: 45400.0  epsilon: 1.0    steps: 1019  evaluation reward: 39170.0
Training network. lr: 0.000095. clip: 0.038105
Iteration 20158: Policy loss: 0.433805. Value loss: 0.718698. Entropy: 1.168234.
Iteration 20159: Policy loss: 0.444900. Value loss: 0.444698. Entropy: 1.162388.
Iteration 20160: Policy loss: 0.463313. Value loss: 0.344109. Entropy: 1.170673.
episode: 2728   score: 42300.0  epsilon: 1.0    steps: 283  evaluation reward: 39117.0
Training network. lr: 0.000095. clip: 0.038105
Iteration 20161: Policy loss: -0.113545. Value loss: 0.442006. Entropy: 1.213662.
Iteration 20162: Policy loss: -0.121130. Value loss: 0.278284. Entropy

episode: 2737   score: 17400.0  epsilon: 1.0    steps: 623  evaluation reward: 38188.0
Training network. lr: 0.000095. clip: 0.037949
Iteration 20230: Policy loss: 0.330345. Value loss: 0.582541. Entropy: 1.176997.
Iteration 20231: Policy loss: 0.360386. Value loss: 0.375709. Entropy: 1.171005.
Iteration 20232: Policy loss: 0.364748. Value loss: 0.249162. Entropy: 1.177943.
Training network. lr: 0.000095. clip: 0.037949
Iteration 20233: Policy loss: 0.204249. Value loss: 0.527187. Entropy: 1.202038.
Iteration 20234: Policy loss: 0.224977. Value loss: 0.368964. Entropy: 1.198482.
Iteration 20235: Policy loss: 0.199579. Value loss: 0.282207. Entropy: 1.199347.
Training network. lr: 0.000095. clip: 0.037949
Iteration 20236: Policy loss: -0.211435. Value loss: 0.806350. Entropy: 1.183032.
Iteration 20237: Policy loss: -0.216911. Value loss: 0.520403. Entropy: 1.196680.
Iteration 20238: Policy loss: -0.184780. Value loss: 0.371214. Entropy: 1.186556.
episode: 2738   score: 44000.0  epsilon:

Iteration 20307: Policy loss: -0.055898. Value loss: 0.376857. Entropy: 1.207908.
episode: 2746   score: 59800.0  epsilon: 1.0    steps: 662  evaluation reward: 37938.0
Training network. lr: 0.000094. clip: 0.037645
Iteration 20308: Policy loss: 0.152768. Value loss: 0.453443. Entropy: 1.211766.
Iteration 20309: Policy loss: 0.091443. Value loss: 0.422213. Entropy: 1.221140.
Iteration 20310: Policy loss: 0.132435. Value loss: 0.357418. Entropy: 1.217511.
episode: 2747   score: 42000.0  epsilon: 1.0    steps: 886  evaluation reward: 37849.0
Training network. lr: 0.000094. clip: 0.037645
Iteration 20311: Policy loss: -0.009283. Value loss: 0.491687. Entropy: 1.235087.
Iteration 20312: Policy loss: -0.047321. Value loss: 0.467514. Entropy: 1.239749.
Iteration 20313: Policy loss: -0.039141. Value loss: 0.312444. Entropy: 1.237105.
episode: 2748   score: 53100.0  epsilon: 1.0    steps: 418  evaluation reward: 38017.0
Training network. lr: 0.000094. clip: 0.037645
Iteration 20314: Policy los

Training network. lr: 0.000094. clip: 0.037488
Iteration 20383: Policy loss: -0.133696. Value loss: 1.178444. Entropy: 1.243658.
Iteration 20384: Policy loss: -0.124380. Value loss: 1.027538. Entropy: 1.244896.
Iteration 20385: Policy loss: -0.121756. Value loss: 0.843218. Entropy: 1.238146.
episode: 2756   score: 20000.0  epsilon: 1.0    steps: 215  evaluation reward: 37624.0
Training network. lr: 0.000094. clip: 0.037488
Iteration 20386: Policy loss: -0.031312. Value loss: 1.089085. Entropy: 1.226655.
Iteration 20387: Policy loss: -0.033048. Value loss: 0.766320. Entropy: 1.235846.
Iteration 20388: Policy loss: -0.004202. Value loss: 0.663088. Entropy: 1.232026.
episode: 2757   score: 47900.0  epsilon: 1.0    steps: 788  evaluation reward: 37549.0
Training network. lr: 0.000094. clip: 0.037488
Iteration 20389: Policy loss: 0.091340. Value loss: 0.588452. Entropy: 1.284872.
Iteration 20390: Policy loss: 0.095658. Value loss: 0.414219. Entropy: 1.285217.
Iteration 20391: Policy loss: 0

Iteration 20460: Policy loss: -0.112968. Value loss: 0.541789. Entropy: 1.219315.
Training network. lr: 0.000093. clip: 0.037184
Iteration 20461: Policy loss: 0.162717. Value loss: 0.599874. Entropy: 1.223594.
Iteration 20462: Policy loss: 0.150447. Value loss: 0.404314. Entropy: 1.240165.
Iteration 20463: Policy loss: 0.135169. Value loss: 0.351272. Entropy: 1.225163.
episode: 2765   score: 46600.0  epsilon: 1.0    steps: 736  evaluation reward: 37359.0
Training network. lr: 0.000093. clip: 0.037184
Iteration 20464: Policy loss: 0.069173. Value loss: 2.168772. Entropy: 1.236242.
Iteration 20465: Policy loss: -0.014641. Value loss: 1.952123. Entropy: 1.231481.
Iteration 20466: Policy loss: 0.136950. Value loss: 1.173999. Entropy: 1.241340.
Training network. lr: 0.000093. clip: 0.037184
Iteration 20467: Policy loss: -0.133201. Value loss: 1.237389. Entropy: 1.209336.
Iteration 20468: Policy loss: -0.102827. Value loss: 0.792443. Entropy: 1.215943.
Iteration 20469: Policy loss: -0.112959

Iteration 20535: Policy loss: 0.009357. Value loss: 0.285069. Entropy: 1.242497.
episode: 2776   score: 36100.0  epsilon: 1.0    steps: 219  evaluation reward: 36061.0
Training network. lr: 0.000093. clip: 0.037027
Iteration 20536: Policy loss: -0.076931. Value loss: 0.845286. Entropy: 1.284273.
Iteration 20537: Policy loss: -0.057851. Value loss: 0.556641. Entropy: 1.287428.
Iteration 20538: Policy loss: -0.035630. Value loss: 0.446647. Entropy: 1.282590.
episode: 2777   score: 23400.0  epsilon: 1.0    steps: 542  evaluation reward: 35925.0
Training network. lr: 0.000093. clip: 0.037027
Iteration 20539: Policy loss: 0.123970. Value loss: 0.242845. Entropy: 1.310433.
Iteration 20540: Policy loss: 0.135268. Value loss: 0.141555. Entropy: 1.306695.
Iteration 20541: Policy loss: 0.133601. Value loss: 0.151400. Entropy: 1.313370.
episode: 2778   score: 28300.0  epsilon: 1.0    steps: 118  evaluation reward: 35710.0
Training network. lr: 0.000093. clip: 0.037027
Iteration 20542: Policy loss

Iteration 20613: Policy loss: 0.028418. Value loss: 0.610124. Entropy: 1.230263.
episode: 2784   score: 57300.0  epsilon: 1.0    steps: 154  evaluation reward: 35715.0
Training network. lr: 0.000092. clip: 0.036723
Iteration 20614: Policy loss: 0.026339. Value loss: 1.019671. Entropy: 1.258713.
Iteration 20615: Policy loss: 0.048249. Value loss: 0.782828. Entropy: 1.248675.
Iteration 20616: Policy loss: 0.028576. Value loss: 0.730899. Entropy: 1.251098.
Training network. lr: 0.000092. clip: 0.036723
Iteration 20617: Policy loss: 0.266118. Value loss: 0.642283. Entropy: 1.235600.
Iteration 20618: Policy loss: 0.306692. Value loss: 0.436170. Entropy: 1.240639.
Iteration 20619: Policy loss: 0.286628. Value loss: 0.358608. Entropy: 1.240904.
Training network. lr: 0.000092. clip: 0.036723
Iteration 20620: Policy loss: -0.157873. Value loss: 1.687342. Entropy: 1.224671.
Iteration 20621: Policy loss: -0.156835. Value loss: 1.252779. Entropy: 1.224072.
Iteration 20622: Policy loss: -0.098959. 

episode: 2794   score: 46900.0  epsilon: 1.0    steps: 164  evaluation reward: 35589.0
Training network. lr: 0.000091. clip: 0.036566
Iteration 20689: Policy loss: 0.163217. Value loss: 0.563763. Entropy: 1.230746.
Iteration 20690: Policy loss: 0.118615. Value loss: 0.540919. Entropy: 1.221927.
Iteration 20691: Policy loss: 0.146408. Value loss: 0.477492. Entropy: 1.220669.
Training network. lr: 0.000091. clip: 0.036566
Iteration 20692: Policy loss: 0.049423. Value loss: 1.181448. Entropy: 1.231528.
Iteration 20693: Policy loss: -0.007142. Value loss: 0.770126. Entropy: 1.236595.
Iteration 20694: Policy loss: -0.021997. Value loss: 0.545249. Entropy: 1.230847.
Training network. lr: 0.000091. clip: 0.036566
Iteration 20695: Policy loss: 0.178140. Value loss: 0.708323. Entropy: 1.219703.
Iteration 20696: Policy loss: 0.197147. Value loss: 0.452162. Entropy: 1.221928.
Iteration 20697: Policy loss: 0.186663. Value loss: 0.367720. Entropy: 1.218835.
episode: 2795   score: 50500.0  epsilon: 

Iteration 20766: Policy loss: -0.146043. Value loss: 0.448871. Entropy: 1.217983.
Training network. lr: 0.000091. clip: 0.036262
Iteration 20767: Policy loss: -0.292355. Value loss: 0.988146. Entropy: 1.186469.
Iteration 20768: Policy loss: -0.324013. Value loss: 0.706984. Entropy: 1.200920.
Iteration 20769: Policy loss: -0.256816. Value loss: 0.640607. Entropy: 1.187535.
Training network. lr: 0.000091. clip: 0.036262
Iteration 20770: Policy loss: -0.090187. Value loss: 1.260475. Entropy: 1.182967.
Iteration 20771: Policy loss: -0.037161. Value loss: 0.840670. Entropy: 1.186876.
Iteration 20772: Policy loss: -0.061786. Value loss: 0.742946. Entropy: 1.187818.
episode: 2802   score: 53000.0  epsilon: 1.0    steps: 158  evaluation reward: 35499.0
episode: 2803   score: 44600.0  epsilon: 1.0    steps: 416  evaluation reward: 35637.0
Training network. lr: 0.000091. clip: 0.036262
Iteration 20773: Policy loss: -0.179422. Value loss: 1.076168. Entropy: 1.217175.
Iteration 20774: Policy loss:

Iteration 20841: Policy loss: 0.139905. Value loss: 0.303207. Entropy: 1.256207.
Training network. lr: 0.000090. clip: 0.036105
Iteration 20842: Policy loss: -0.361007. Value loss: 1.879065. Entropy: 1.216394.
Iteration 20843: Policy loss: -0.419884. Value loss: 1.469766. Entropy: 1.222469.
Iteration 20844: Policy loss: -0.317683. Value loss: 0.972452. Entropy: 1.222858.
episode: 2813   score: 37600.0  epsilon: 1.0    steps: 593  evaluation reward: 35765.0
Training network. lr: 0.000090. clip: 0.036105
Iteration 20845: Policy loss: 0.102681. Value loss: 0.731440. Entropy: 1.226707.
Iteration 20846: Policy loss: 0.114460. Value loss: 0.598306. Entropy: 1.230355.
Iteration 20847: Policy loss: 0.131727. Value loss: 0.396615. Entropy: 1.229791.
Training network. lr: 0.000090. clip: 0.036105
Iteration 20848: Policy loss: -0.095533. Value loss: 1.002203. Entropy: 1.220739.
Iteration 20849: Policy loss: -0.113709. Value loss: 0.806318. Entropy: 1.222600.
Iteration 20850: Policy loss: -0.11241

Training network. lr: 0.000090. clip: 0.035801
Iteration 20917: Policy loss: 0.288236. Value loss: 0.605817. Entropy: 1.251038.
Iteration 20918: Policy loss: 0.215974. Value loss: 0.455752. Entropy: 1.250569.
Iteration 20919: Policy loss: 0.277914. Value loss: 0.383251. Entropy: 1.245944.
Training network. lr: 0.000090. clip: 0.035801
Iteration 20920: Policy loss: 0.116612. Value loss: 1.038675. Entropy: 1.219128.
Iteration 20921: Policy loss: 0.093915. Value loss: 0.812327. Entropy: 1.217767.
Iteration 20922: Policy loss: 0.032604. Value loss: 0.754793. Entropy: 1.211107.
episode: 2823   score: 35600.0  epsilon: 1.0    steps: 381  evaluation reward: 36301.0
Training network. lr: 0.000090. clip: 0.035801
Iteration 20923: Policy loss: 0.390474. Value loss: 0.579193. Entropy: 1.222102.
Iteration 20924: Policy loss: 0.401344. Value loss: 0.356890. Entropy: 1.224524.
Iteration 20925: Policy loss: 0.397272. Value loss: 0.280647. Entropy: 1.227637.
Training network. lr: 0.000090. clip: 0.035

Training network. lr: 0.000089. clip: 0.035645
Iteration 20995: Policy loss: 0.287893. Value loss: 0.661381. Entropy: 1.207463.
Iteration 20996: Policy loss: 0.313633. Value loss: 0.317559. Entropy: 1.202793.
Iteration 20997: Policy loss: 0.294097. Value loss: 0.323038. Entropy: 1.198945.
episode: 2831   score: 31400.0  epsilon: 1.0    steps: 552  evaluation reward: 36005.0
Training network. lr: 0.000089. clip: 0.035645
Iteration 20998: Policy loss: 0.178992. Value loss: 0.712422. Entropy: 1.182719.
Iteration 20999: Policy loss: 0.153093. Value loss: 0.381285. Entropy: 1.182234.
Iteration 21000: Policy loss: 0.153423. Value loss: 0.288348. Entropy: 1.185237.
episode: 2832   score: 20200.0  epsilon: 1.0    steps: 221  evaluation reward: 35897.0
episode: 2833   score: 27400.0  epsilon: 1.0    steps: 815  evaluation reward: 35975.0
Training network. lr: 0.000089. clip: 0.035497
Iteration 21001: Policy loss: 0.167735. Value loss: 0.477380. Entropy: 1.199894.
Iteration 21002: Policy loss: 0

Training network. lr: 0.000088. clip: 0.035341
Iteration 21070: Policy loss: 0.130027. Value loss: 0.904303. Entropy: 1.222797.
Iteration 21071: Policy loss: 0.158259. Value loss: 0.502894. Entropy: 1.218606.
Iteration 21072: Policy loss: 0.154978. Value loss: 0.354846. Entropy: 1.223307.
Training network. lr: 0.000088. clip: 0.035341
Iteration 21073: Policy loss: -0.153586. Value loss: 1.130827. Entropy: 1.181633.
Iteration 21074: Policy loss: -0.215979. Value loss: 0.822583. Entropy: 1.186434.
Iteration 21075: Policy loss: -0.191276. Value loss: 0.688001. Entropy: 1.176739.
Training network. lr: 0.000088. clip: 0.035341
Iteration 21076: Policy loss: -0.020759. Value loss: 0.703587. Entropy: 1.206103.
Iteration 21077: Policy loss: -0.023517. Value loss: 0.459509. Entropy: 1.203341.
Iteration 21078: Policy loss: -0.017634. Value loss: 0.392951. Entropy: 1.207039.
episode: 2842   score: 24500.0  epsilon: 1.0    steps: 661  evaluation reward: 36221.0
Training network. lr: 0.000088. clip:

Iteration 21144: Policy loss: -0.013978. Value loss: 0.502723. Entropy: 1.253782.
episode: 2853   score: 26900.0  epsilon: 1.0    steps: 76  evaluation reward: 35590.0
Training network. lr: 0.000088. clip: 0.035184
Iteration 21145: Policy loss: 0.299961. Value loss: 0.398113. Entropy: 1.244664.
Iteration 21146: Policy loss: 0.309373. Value loss: 0.360440. Entropy: 1.249418.
Iteration 21147: Policy loss: 0.291367. Value loss: 0.268822. Entropy: 1.253493.
Training network. lr: 0.000088. clip: 0.035184
Iteration 21148: Policy loss: 0.003601. Value loss: 0.451644. Entropy: 1.267994.
Iteration 21149: Policy loss: 0.002748. Value loss: 0.343536. Entropy: 1.261088.
Iteration 21150: Policy loss: 0.006073. Value loss: 0.301426. Entropy: 1.265535.
Training network. lr: 0.000088. clip: 0.035036
Iteration 21151: Policy loss: 0.021023. Value loss: 0.785030. Entropy: 1.257445.
Iteration 21152: Policy loss: 0.017762. Value loss: 0.541083. Entropy: 1.249499.
Iteration 21153: Policy loss: 0.041999. Val

Iteration 21221: Policy loss: -0.062821. Value loss: 1.015636. Entropy: 1.190389.
Iteration 21222: Policy loss: -0.079037. Value loss: 1.037768. Entropy: 1.195929.
Training network. lr: 0.000087. clip: 0.034880
Iteration 21223: Policy loss: 0.261885. Value loss: 0.718706. Entropy: 1.186975.
Iteration 21224: Policy loss: 0.266224. Value loss: 0.510645. Entropy: 1.193286.
Iteration 21225: Policy loss: 0.265663. Value loss: 0.418813. Entropy: 1.196459.
episode: 2862   score: 48400.0  epsilon: 1.0    steps: 519  evaluation reward: 36122.0
Training network. lr: 0.000087. clip: 0.034880
Iteration 21226: Policy loss: 0.010419. Value loss: 2.351003. Entropy: 1.164751.
Iteration 21227: Policy loss: 0.006310. Value loss: 1.753567. Entropy: 1.165468.
Iteration 21228: Policy loss: -0.045064. Value loss: 1.485337. Entropy: 1.166612.
Training network. lr: 0.000087. clip: 0.034880
Iteration 21229: Policy loss: 0.326839. Value loss: 0.416916. Entropy: 1.197534.
Iteration 21230: Policy loss: 0.350993. 

Iteration 21296: Policy loss: 0.278260. Value loss: 0.285326. Entropy: 1.230887.
Iteration 21297: Policy loss: 0.289188. Value loss: 0.280682. Entropy: 1.234901.
Training network. lr: 0.000087. clip: 0.034723
Iteration 21298: Policy loss: 0.062152. Value loss: 0.590748. Entropy: 1.234051.
Iteration 21299: Policy loss: 0.101029. Value loss: 0.310614. Entropy: 1.238981.
Iteration 21300: Policy loss: 0.081122. Value loss: 0.249153. Entropy: 1.239683.
Training network. lr: 0.000086. clip: 0.034576
Iteration 21301: Policy loss: 0.156197. Value loss: 0.666963. Entropy: 1.233201.
Iteration 21302: Policy loss: 0.156578. Value loss: 0.539571. Entropy: 1.242668.
Iteration 21303: Policy loss: 0.140797. Value loss: 0.460156. Entropy: 1.243465.
Training network. lr: 0.000086. clip: 0.034576
Iteration 21304: Policy loss: 0.094697. Value loss: 1.245867. Entropy: 1.235072.
Iteration 21305: Policy loss: 0.131029. Value loss: 0.531573. Entropy: 1.244532.
Iteration 21306: Policy loss: 0.141659. Value los

Training network. lr: 0.000086. clip: 0.034419
Iteration 21373: Policy loss: 0.332945. Value loss: 0.695725. Entropy: 1.222175.
Iteration 21374: Policy loss: 0.342905. Value loss: 0.444567. Entropy: 1.233278.
Iteration 21375: Policy loss: 0.334231. Value loss: 0.342266. Entropy: 1.228094.
Training network. lr: 0.000086. clip: 0.034419
Iteration 21376: Policy loss: 0.210124. Value loss: 1.338543. Entropy: 1.222534.
Iteration 21377: Policy loss: 0.180721. Value loss: 0.892359. Entropy: 1.220355.
Iteration 21378: Policy loss: 0.232738. Value loss: 0.668959. Entropy: 1.215919.
Training network. lr: 0.000086. clip: 0.034419
Iteration 21379: Policy loss: -0.262851. Value loss: 1.470423. Entropy: 1.183164.
Iteration 21380: Policy loss: -0.249714. Value loss: 1.045224. Entropy: 1.174566.
Iteration 21381: Policy loss: -0.302145. Value loss: 0.909899. Entropy: 1.188276.
episode: 2882   score: 44500.0  epsilon: 1.0    steps: 980  evaluation reward: 36375.0
Training network. lr: 0.000086. clip: 0.

Training network. lr: 0.000085. clip: 0.034115
Iteration 21451: Policy loss: 0.224879. Value loss: 1.333584. Entropy: 1.172372.
Iteration 21452: Policy loss: 0.231824. Value loss: 0.884929. Entropy: 1.174788.
Iteration 21453: Policy loss: 0.242049. Value loss: 0.794537. Entropy: 1.176243.
Training network. lr: 0.000085. clip: 0.034115
Iteration 21454: Policy loss: 0.089338. Value loss: 0.598372. Entropy: 1.179381.
Iteration 21455: Policy loss: 0.073674. Value loss: 0.385408. Entropy: 1.173212.
Iteration 21456: Policy loss: 0.123977. Value loss: 0.260611. Entropy: 1.181694.
Training network. lr: 0.000085. clip: 0.034115
Iteration 21457: Policy loss: 0.098892. Value loss: 1.421703. Entropy: 1.159147.
Iteration 21458: Policy loss: 0.026877. Value loss: 1.065561. Entropy: 1.170782.
Iteration 21459: Policy loss: 0.049250. Value loss: 0.897986. Entropy: 1.163362.
episode: 2890   score: 27400.0  epsilon: 1.0    steps: 477  evaluation reward: 36164.0
Training network. lr: 0.000085. clip: 0.034

Training network. lr: 0.000085. clip: 0.033958
Iteration 21529: Policy loss: -0.091603. Value loss: 1.336959. Entropy: 1.179381.
Iteration 21530: Policy loss: -0.080944. Value loss: 0.860580. Entropy: 1.179522.
Iteration 21531: Policy loss: -0.067166. Value loss: 0.904245. Entropy: 1.183188.
Training network. lr: 0.000085. clip: 0.033958
Iteration 21532: Policy loss: 0.104367. Value loss: 0.610489. Entropy: 1.196103.
Iteration 21533: Policy loss: 0.083121. Value loss: 0.388028. Entropy: 1.190257.
Iteration 21534: Policy loss: 0.072557. Value loss: 0.308391. Entropy: 1.194909.
Training network. lr: 0.000085. clip: 0.033958
Iteration 21535: Policy loss: -0.248602. Value loss: 1.798558. Entropy: 1.206001.
Iteration 21536: Policy loss: -0.288780. Value loss: 1.364613. Entropy: 1.211657.
Iteration 21537: Policy loss: -0.171470. Value loss: 0.998055. Entropy: 1.208713.
Training network. lr: 0.000085. clip: 0.033958
Iteration 21538: Policy loss: -0.036680. Value loss: 1.919525. Entropy: 1.189

Iteration 21605: Policy loss: -0.117874. Value loss: 1.144248. Entropy: 1.182281.
Iteration 21606: Policy loss: -0.139167. Value loss: 0.985241. Entropy: 1.184433.
Training network. lr: 0.000084. clip: 0.033654
Iteration 21607: Policy loss: -0.159262. Value loss: 1.471942. Entropy: 1.187773.
Iteration 21608: Policy loss: -0.096075. Value loss: 1.177871. Entropy: 1.194613.
Iteration 21609: Policy loss: -0.133521. Value loss: 0.974760. Entropy: 1.192532.
Training network. lr: 0.000084. clip: 0.033654
Iteration 21610: Policy loss: -0.032881. Value loss: 0.397760. Entropy: 1.186297.
Iteration 21611: Policy loss: -0.021460. Value loss: 0.306464. Entropy: 1.189213.
Iteration 21612: Policy loss: -0.017695. Value loss: 0.267616. Entropy: 1.178591.
Training network. lr: 0.000084. clip: 0.033654
Iteration 21613: Policy loss: -0.219813. Value loss: 1.372176. Entropy: 1.187454.
Iteration 21614: Policy loss: -0.132586. Value loss: 0.904269. Entropy: 1.193334.
Iteration 21615: Policy loss: -0.189756

Iteration 21682: Policy loss: -0.082835. Value loss: 0.809484. Entropy: 1.212033.
Iteration 21683: Policy loss: -0.090061. Value loss: 0.589486. Entropy: 1.205798.
Iteration 21684: Policy loss: -0.086312. Value loss: 0.506704. Entropy: 1.213993.
Training network. lr: 0.000084. clip: 0.033497
Iteration 21685: Policy loss: -0.214874. Value loss: 0.796385. Entropy: 1.186008.
Iteration 21686: Policy loss: -0.245893. Value loss: 0.679070. Entropy: 1.166192.
Iteration 21687: Policy loss: -0.272078. Value loss: 0.518863. Entropy: 1.173162.
Training network. lr: 0.000084. clip: 0.033497
Iteration 21688: Policy loss: -0.215193. Value loss: 1.249479. Entropy: 1.173889.
Iteration 21689: Policy loss: -0.200465. Value loss: 1.002774. Entropy: 1.166421.
Iteration 21690: Policy loss: -0.191018. Value loss: 0.895866. Entropy: 1.169319.
episode: 2915   score: 28100.0  epsilon: 1.0    steps: 684  evaluation reward: 38061.0
Training network. lr: 0.000084. clip: 0.033497
Iteration 21691: Policy loss: 0.16

Training network. lr: 0.000083. clip: 0.033193
Iteration 21760: Policy loss: 0.195806. Value loss: 0.778238. Entropy: 1.178310.
Iteration 21761: Policy loss: 0.195202. Value loss: 0.557199. Entropy: 1.186397.
Iteration 21762: Policy loss: 0.204161. Value loss: 0.468902. Entropy: 1.192113.
episode: 2923   score: 70600.0  epsilon: 1.0    steps: 412  evaluation reward: 38304.0
episode: 2924   score: 62600.0  epsilon: 1.0    steps: 913  evaluation reward: 38439.0
Training network. lr: 0.000083. clip: 0.033193
Iteration 21763: Policy loss: 0.036698. Value loss: 1.105178. Entropy: 1.222936.
Iteration 21764: Policy loss: 0.065509. Value loss: 0.706561. Entropy: 1.219376.
Iteration 21765: Policy loss: 0.039448. Value loss: 0.582665. Entropy: 1.226074.
Training network. lr: 0.000083. clip: 0.033193
Iteration 21766: Policy loss: 0.185515. Value loss: 0.377353. Entropy: 1.238364.
Iteration 21767: Policy loss: 0.198907. Value loss: 0.262376. Entropy: 1.231947.
Iteration 21768: Policy loss: 0.19373

episode: 2931   score: 47400.0  epsilon: 1.0    steps: 149  evaluation reward: 38735.0
Training network. lr: 0.000083. clip: 0.033037
Iteration 21838: Policy loss: 0.278574. Value loss: 0.874611. Entropy: 1.208475.
Iteration 21839: Policy loss: 0.286107. Value loss: 0.701837. Entropy: 1.212176.
Iteration 21840: Policy loss: 0.286388. Value loss: 0.572451. Entropy: 1.209033.
episode: 2932   score: 54900.0  epsilon: 1.0    steps: 388  evaluation reward: 39082.0
Training network. lr: 0.000083. clip: 0.033037
Iteration 21841: Policy loss: 0.253871. Value loss: 0.504316. Entropy: 1.223204.
Iteration 21842: Policy loss: 0.265039. Value loss: 0.308962. Entropy: 1.225350.
Iteration 21843: Policy loss: 0.237049. Value loss: 0.266329. Entropy: 1.224222.
episode: 2933   score: 30500.0  epsilon: 1.0    steps: 6  evaluation reward: 39113.0
Training network. lr: 0.000083. clip: 0.033037
Iteration 21844: Policy loss: -0.001638. Value loss: 0.876773. Entropy: 1.214818.
Iteration 21845: Policy loss: 0.

Iteration 21913: Policy loss: 0.009108. Value loss: 0.735204. Entropy: 1.178448.
Iteration 21914: Policy loss: 0.007494. Value loss: 0.480014. Entropy: 1.189507.
Iteration 21915: Policy loss: -0.001768. Value loss: 0.422183. Entropy: 1.185948.
episode: 2941   score: 55500.0  epsilon: 1.0    steps: 198  evaluation reward: 40059.0
Training network. lr: 0.000082. clip: 0.032732
Iteration 21916: Policy loss: -0.110129. Value loss: 1.093780. Entropy: 1.198670.
Iteration 21917: Policy loss: -0.109594. Value loss: 0.653863. Entropy: 1.197801.
Iteration 21918: Policy loss: -0.093479. Value loss: 0.491979. Entropy: 1.208283.
Training network. lr: 0.000082. clip: 0.032732
Iteration 21919: Policy loss: 0.088293. Value loss: 0.685077. Entropy: 1.224273.
Iteration 21920: Policy loss: 0.124899. Value loss: 0.436181. Entropy: 1.229170.
Iteration 21921: Policy loss: 0.122046. Value loss: 0.374538. Entropy: 1.221126.
Training network. lr: 0.000082. clip: 0.032732
Iteration 21922: Policy loss: 0.354347.

episode: 2949   score: 22900.0  epsilon: 1.0    steps: 1014  evaluation reward: 40163.0
Training network. lr: 0.000081. clip: 0.032576
Iteration 21991: Policy loss: 0.003008. Value loss: 1.425809. Entropy: 1.182838.
Iteration 21992: Policy loss: 0.042922. Value loss: 0.896517. Entropy: 1.184057.
Iteration 21993: Policy loss: -0.014782. Value loss: 0.840243. Entropy: 1.179168.
Training network. lr: 0.000081. clip: 0.032576
Iteration 21994: Policy loss: 0.060737. Value loss: 1.031293. Entropy: 1.174175.
Iteration 21995: Policy loss: 0.074242. Value loss: 0.703063. Entropy: 1.179376.
Iteration 21996: Policy loss: 0.116583. Value loss: 0.593083. Entropy: 1.174304.
episode: 2950   score: 35400.0  epsilon: 1.0    steps: 26  evaluation reward: 40434.0
Training network. lr: 0.000081. clip: 0.032576
Iteration 21997: Policy loss: -0.022019. Value loss: 0.531651. Entropy: 1.232734.
Iteration 21998: Policy loss: -0.001565. Value loss: 0.398584. Entropy: 1.228876.
Iteration 21999: Policy loss: -0.0

Iteration 22066: Policy loss: -0.056207. Value loss: 1.044515. Entropy: 1.232758.
Iteration 22067: Policy loss: -0.065381. Value loss: 0.588767. Entropy: 1.235156.
Iteration 22068: Policy loss: -0.057159. Value loss: 0.467501. Entropy: 1.237304.
Training network. lr: 0.000081. clip: 0.032272
Iteration 22069: Policy loss: 0.120632. Value loss: 0.790375. Entropy: 1.254117.
Iteration 22070: Policy loss: 0.108836. Value loss: 0.593354. Entropy: 1.254413.
Iteration 22071: Policy loss: 0.095711. Value loss: 0.525258. Entropy: 1.253838.
Training network. lr: 0.000081. clip: 0.032272
Iteration 22072: Policy loss: -0.221264. Value loss: 0.888393. Entropy: 1.244164.
Iteration 22073: Policy loss: -0.217349. Value loss: 0.715581. Entropy: 1.239880.
Iteration 22074: Policy loss: -0.231330. Value loss: 0.478444. Entropy: 1.246061.
episode: 2959   score: 42300.0  epsilon: 1.0    steps: 348  evaluation reward: 40798.0
Training network. lr: 0.000081. clip: 0.032272
Iteration 22075: Policy loss: -0.1263

Iteration 22142: Policy loss: 0.152505. Value loss: 0.403528. Entropy: 1.250293.
Iteration 22143: Policy loss: 0.161409. Value loss: 0.341187. Entropy: 1.246250.
episode: 2969   score: 28900.0  epsilon: 1.0    steps: 898  evaluation reward: 40189.0
Training network. lr: 0.000080. clip: 0.032115
Iteration 22144: Policy loss: -0.105043. Value loss: 0.415058. Entropy: 1.238021.
Iteration 22145: Policy loss: -0.129151. Value loss: 0.377209. Entropy: 1.243309.
Iteration 22146: Policy loss: -0.135251. Value loss: 0.256853. Entropy: 1.236341.
Training network. lr: 0.000080. clip: 0.032115
Iteration 22147: Policy loss: 0.191142. Value loss: 0.978028. Entropy: 1.218157.
Iteration 22148: Policy loss: 0.218624. Value loss: 0.826550. Entropy: 1.210133.
Iteration 22149: Policy loss: 0.199297. Value loss: 0.683830. Entropy: 1.206520.
episode: 2970   score: 50100.0  epsilon: 1.0    steps: 824  evaluation reward: 40475.0
Training network. lr: 0.000080. clip: 0.032115
Iteration 22150: Policy loss: 0.16

Iteration 22218: Policy loss: 0.058424. Value loss: 0.287664. Entropy: 1.285834.
episode: 2979   score: 29800.0  epsilon: 1.0    steps: 284  evaluation reward: 40238.0
episode: 2980   score: 34800.0  epsilon: 1.0    steps: 828  evaluation reward: 40139.0
Training network. lr: 0.000080. clip: 0.031811
Iteration 22219: Policy loss: 0.243359. Value loss: 0.699621. Entropy: 1.275281.
Iteration 22220: Policy loss: 0.211363. Value loss: 0.402483. Entropy: 1.271844.
Iteration 22221: Policy loss: 0.220053. Value loss: 0.357344. Entropy: 1.273196.
Training network. lr: 0.000080. clip: 0.031811
Iteration 22222: Policy loss: 0.097232. Value loss: 0.284289. Entropy: 1.314225.
Iteration 22223: Policy loss: 0.112693. Value loss: 0.209335. Entropy: 1.307593.
Iteration 22224: Policy loss: 0.103626. Value loss: 0.180070. Entropy: 1.316140.
Training network. lr: 0.000080. clip: 0.031811
Iteration 22225: Policy loss: 0.250183. Value loss: 0.730549. Entropy: 1.303203.
Iteration 22226: Policy loss: 0.27156

Iteration 22293: Policy loss: 0.005818. Value loss: 0.357960. Entropy: 1.308009.
episode: 2990   score: 14400.0  epsilon: 1.0    steps: 356  evaluation reward: 38724.0
Training network. lr: 0.000079. clip: 0.031654
Iteration 22294: Policy loss: -0.089391. Value loss: 0.684907. Entropy: 1.276674.
Iteration 22295: Policy loss: -0.103677. Value loss: 0.465759. Entropy: 1.273531.
Iteration 22296: Policy loss: -0.068548. Value loss: 0.409341. Entropy: 1.269991.
Training network. lr: 0.000079. clip: 0.031654
Iteration 22297: Policy loss: -0.088557. Value loss: 0.811868. Entropy: 1.303020.
Iteration 22298: Policy loss: -0.073477. Value loss: 0.613632. Entropy: 1.301330.
Iteration 22299: Policy loss: -0.073319. Value loss: 0.526298. Entropy: 1.309110.
Training network. lr: 0.000079. clip: 0.031654
Iteration 22300: Policy loss: 0.323071. Value loss: 0.248650. Entropy: 1.329716.
Iteration 22301: Policy loss: 0.282419. Value loss: 0.149854. Entropy: 1.335302.
Iteration 22302: Policy loss: 0.31368

Iteration 22370: Policy loss: -0.025215. Value loss: 0.321139. Entropy: 1.314988.
Iteration 22371: Policy loss: -0.027375. Value loss: 0.307226. Entropy: 1.319055.
episode: 2999   score: 29800.0  epsilon: 1.0    steps: 763  evaluation reward: 37785.0
Training network. lr: 0.000078. clip: 0.031350
Iteration 22372: Policy loss: -0.075999. Value loss: 0.694705. Entropy: 1.290544.
Iteration 22373: Policy loss: -0.043515. Value loss: 0.488205. Entropy: 1.293905.
Iteration 22374: Policy loss: -0.080308. Value loss: 0.350155. Entropy: 1.289046.
Training network. lr: 0.000078. clip: 0.031350
Iteration 22375: Policy loss: -0.085443. Value loss: 0.731422. Entropy: 1.292330.
Iteration 22376: Policy loss: -0.068479. Value loss: 0.500714. Entropy: 1.289842.
Iteration 22377: Policy loss: -0.100508. Value loss: 0.413161. Entropy: 1.285186.
episode: 3000   score: 33200.0  epsilon: 1.0    steps: 21  evaluation reward: 37744.0
Training network. lr: 0.000078. clip: 0.031350
Iteration 22378: Policy loss: 

Iteration 22444: Policy loss: 0.030824. Value loss: 0.372658. Entropy: 1.320066.
Iteration 22445: Policy loss: 0.037014. Value loss: 0.265292. Entropy: 1.320779.
Iteration 22446: Policy loss: 0.031530. Value loss: 0.173347. Entropy: 1.318162.
episode: 3010   score: 13300.0  epsilon: 1.0    steps: 456  evaluation reward: 36656.0
episode: 3011   score: 23500.0  epsilon: 1.0    steps: 836  evaluation reward: 36383.0
Training network. lr: 0.000078. clip: 0.031193
Iteration 22447: Policy loss: 0.186384. Value loss: 0.411155. Entropy: 1.315720.
Iteration 22448: Policy loss: 0.159620. Value loss: 0.313806. Entropy: 1.319827.
Iteration 22449: Policy loss: 0.207880. Value loss: 0.217622. Entropy: 1.314330.
Training network. lr: 0.000078. clip: 0.031193
Iteration 22450: Policy loss: -0.086198. Value loss: 0.449949. Entropy: 1.330752.
Iteration 22451: Policy loss: -0.082152. Value loss: 0.292334. Entropy: 1.327564.
Iteration 22452: Policy loss: -0.065815. Value loss: 0.242323. Entropy: 1.330868.


Iteration 22521: Policy loss: -0.333568. Value loss: 0.782938. Entropy: 1.288971.
episode: 3019   score: 41800.0  epsilon: 1.0    steps: 8  evaluation reward: 35987.0
Training network. lr: 0.000077. clip: 0.030889
Iteration 22522: Policy loss: 0.162736. Value loss: 0.435938. Entropy: 1.296921.
Iteration 22523: Policy loss: 0.101929. Value loss: 0.385803. Entropy: 1.298965.
Iteration 22524: Policy loss: 0.146775. Value loss: 0.299447. Entropy: 1.294741.
Training network. lr: 0.000077. clip: 0.030889
Iteration 22525: Policy loss: -0.032024. Value loss: 0.991677. Entropy: 1.296810.
Iteration 22526: Policy loss: 0.020629. Value loss: 0.762504. Entropy: 1.286547.
Iteration 22527: Policy loss: -0.043229. Value loss: 0.563399. Entropy: 1.293864.
Training network. lr: 0.000077. clip: 0.030889
Iteration 22528: Policy loss: -0.049032. Value loss: 0.694561. Entropy: 1.295729.
Iteration 22529: Policy loss: -0.052952. Value loss: 0.452812. Entropy: 1.299709.
Iteration 22530: Policy loss: -0.056383.

Iteration 22598: Policy loss: -0.288042. Value loss: 1.075708. Entropy: 1.276191.
Iteration 22599: Policy loss: -0.298022. Value loss: 0.912308. Entropy: 1.274565.
Training network. lr: 0.000077. clip: 0.030733
Iteration 22600: Policy loss: 0.162021. Value loss: 1.567041. Entropy: 1.291886.
Iteration 22601: Policy loss: 0.072208. Value loss: 1.428114. Entropy: 1.297884.
Iteration 22602: Policy loss: 0.101748. Value loss: 1.064789. Entropy: 1.292793.
Training network. lr: 0.000076. clip: 0.030576
Iteration 22603: Policy loss: 0.127968. Value loss: 0.643403. Entropy: 1.324869.
Iteration 22604: Policy loss: 0.118900. Value loss: 0.375811. Entropy: 1.321149.
Iteration 22605: Policy loss: 0.104655. Value loss: 0.288922. Entropy: 1.322077.
episode: 3028   score: 30000.0  epsilon: 1.0    steps: 744  evaluation reward: 35441.0
Training network. lr: 0.000076. clip: 0.030576
Iteration 22606: Policy loss: 0.485390. Value loss: 0.664608. Entropy: 1.291132.
Iteration 22607: Policy loss: 0.453901. V

episode: 3037   score: 9000.0  epsilon: 1.0    steps: 860  evaluation reward: 34483.0
Training network. lr: 0.000076. clip: 0.030428
Iteration 22675: Policy loss: 0.027765. Value loss: 0.618164. Entropy: 1.284920.
Iteration 22676: Policy loss: 0.035512. Value loss: 0.466556. Entropy: 1.276040.
Iteration 22677: Policy loss: 0.032979. Value loss: 0.413898. Entropy: 1.278214.
episode: 3038   score: 27400.0  epsilon: 1.0    steps: 194  evaluation reward: 34269.0
Training network. lr: 0.000076. clip: 0.030428
Iteration 22678: Policy loss: -0.276581. Value loss: 0.855338. Entropy: 1.259104.
Iteration 22679: Policy loss: -0.307961. Value loss: 0.669202. Entropy: 1.267347.
Iteration 22680: Policy loss: -0.274979. Value loss: 0.551090. Entropy: 1.266972.
Training network. lr: 0.000076. clip: 0.030428
Iteration 22681: Policy loss: 0.082926. Value loss: 0.682173. Entropy: 1.307966.
Iteration 22682: Policy loss: 0.082220. Value loss: 0.524197. Entropy: 1.303630.
Iteration 22683: Policy loss: 0.065

Iteration 22751: Policy loss: 0.023285. Value loss: 0.849110. Entropy: 1.264202.
Iteration 22752: Policy loss: 0.035974. Value loss: 0.665862. Entropy: 1.271812.
Training network. lr: 0.000075. clip: 0.030115
Iteration 22753: Policy loss: 0.217227. Value loss: 0.678747. Entropy: 1.300689.
Iteration 22754: Policy loss: 0.243820. Value loss: 0.450794. Entropy: 1.303939.
Iteration 22755: Policy loss: 0.226871. Value loss: 0.380222. Entropy: 1.306152.
episode: 3046   score: 49100.0  epsilon: 1.0    steps: 715  evaluation reward: 34058.0
Training network. lr: 0.000075. clip: 0.030115
Iteration 22756: Policy loss: 0.051601. Value loss: 0.421836. Entropy: 1.292523.
Iteration 22757: Policy loss: 0.049220. Value loss: 0.289474. Entropy: 1.294424.
Iteration 22758: Policy loss: 0.043021. Value loss: 0.251959. Entropy: 1.292409.
episode: 3047   score: 33500.0  epsilon: 1.0    steps: 47  evaluation reward: 33966.0
Training network. lr: 0.000075. clip: 0.030115
Iteration 22759: Policy loss: 0.247941

Iteration 22827: Policy loss: 0.093927. Value loss: 0.241326. Entropy: 1.278351.
Training network. lr: 0.000075. clip: 0.029968
Iteration 22828: Policy loss: 0.183245. Value loss: 0.570620. Entropy: 1.303238.
Iteration 22829: Policy loss: 0.179611. Value loss: 0.349519. Entropy: 1.308294.
Iteration 22830: Policy loss: 0.179878. Value loss: 0.302696. Entropy: 1.307335.
episode: 3056   score: 26500.0  epsilon: 1.0    steps: 140  evaluation reward: 33136.0
episode: 3057   score: 28800.0  epsilon: 1.0    steps: 410  evaluation reward: 33056.0
episode: 3058   score: 34700.0  epsilon: 1.0    steps: 522  evaluation reward: 33057.0
Training network. lr: 0.000075. clip: 0.029968
Iteration 22831: Policy loss: 0.183199. Value loss: 0.536480. Entropy: 1.312221.
Iteration 22832: Policy loss: 0.201583. Value loss: 0.280620. Entropy: 1.314719.
Iteration 22833: Policy loss: 0.180302. Value loss: 0.227894. Entropy: 1.315805.
episode: 3059   score: 28600.0  epsilon: 1.0    steps: 947  evaluation reward:

Iteration 22901: Policy loss: 0.087169. Value loss: 0.686414. Entropy: 1.287182.
Iteration 22902: Policy loss: 0.071828. Value loss: 0.637489. Entropy: 1.287733.
Training network. lr: 0.000074. clip: 0.029654
Iteration 22903: Policy loss: 0.156816. Value loss: 0.642103. Entropy: 1.279590.
Iteration 22904: Policy loss: 0.142610. Value loss: 0.446861. Entropy: 1.285967.
Iteration 22905: Policy loss: 0.149433. Value loss: 0.362441. Entropy: 1.282094.
Training network. lr: 0.000074. clip: 0.029654
Iteration 22906: Policy loss: -0.200242. Value loss: 1.891248. Entropy: 1.279022.
Iteration 22907: Policy loss: -0.213682. Value loss: 1.103306. Entropy: 1.280019.
Iteration 22908: Policy loss: -0.245213. Value loss: 0.906362. Entropy: 1.276666.
Training network. lr: 0.000074. clip: 0.029654
Iteration 22909: Policy loss: 0.152876. Value loss: 0.627918. Entropy: 1.311020.
Iteration 22910: Policy loss: 0.136786. Value loss: 0.543946. Entropy: 1.313160.
Iteration 22911: Policy loss: 0.122021. Value 

Training network. lr: 0.000074. clip: 0.029507
Iteration 22978: Policy loss: 0.365792. Value loss: 0.348517. Entropy: 1.325145.
Iteration 22979: Policy loss: 0.361823. Value loss: 0.176219. Entropy: 1.326890.
Iteration 22980: Policy loss: 0.358038. Value loss: 0.146635. Entropy: 1.326597.
Training network. lr: 0.000074. clip: 0.029507
Iteration 22981: Policy loss: -0.293728. Value loss: 0.742207. Entropy: 1.295040.
Iteration 22982: Policy loss: -0.302155. Value loss: 0.566757. Entropy: 1.295416.
Iteration 22983: Policy loss: -0.302265. Value loss: 0.480247. Entropy: 1.294030.
Training network. lr: 0.000074. clip: 0.029507
Iteration 22984: Policy loss: 0.123275. Value loss: 0.596517. Entropy: 1.303207.
Iteration 22985: Policy loss: 0.079582. Value loss: 0.453630. Entropy: 1.308288.
Iteration 22986: Policy loss: 0.123322. Value loss: 0.331075. Entropy: 1.305254.
episode: 3077   score: 35000.0  epsilon: 1.0    steps: 773  evaluation reward: 31924.0
Training network. lr: 0.000074. clip: 0.

Iteration 23055: Policy loss: -0.109984. Value loss: 0.559064. Entropy: 1.273428.
Training network. lr: 0.000073. clip: 0.029193
Iteration 23056: Policy loss: 0.462599. Value loss: 0.999831. Entropy: 1.279392.
Iteration 23057: Policy loss: 0.444180. Value loss: 0.652610. Entropy: 1.280484.
Iteration 23058: Policy loss: 0.466914. Value loss: 0.566549. Entropy: 1.279709.
episode: 3086   score: 29100.0  epsilon: 1.0    steps: 566  evaluation reward: 32295.0
episode: 3087   score: 40400.0  epsilon: 1.0    steps: 643  evaluation reward: 32295.0
Training network. lr: 0.000073. clip: 0.029193
Iteration 23059: Policy loss: -0.073099. Value loss: 1.069641. Entropy: 1.302627.
Iteration 23060: Policy loss: -0.104006. Value loss: 0.637689. Entropy: 1.304095.
Iteration 23061: Policy loss: -0.112607. Value loss: 0.493633. Entropy: 1.304673.
episode: 3088   score: 37700.0  epsilon: 1.0    steps: 166  evaluation reward: 32433.0
Training network. lr: 0.000073. clip: 0.029193
Iteration 23062: Policy los

Iteration 23130: Policy loss: 0.057206. Value loss: 0.676866. Entropy: 1.286732.
Training network. lr: 0.000073. clip: 0.029046
Iteration 23131: Policy loss: 0.049808. Value loss: 0.689108. Entropy: 1.298475.
Iteration 23132: Policy loss: 0.053227. Value loss: 0.508928. Entropy: 1.296391.
Iteration 23133: Policy loss: 0.044740. Value loss: 0.472546. Entropy: 1.300731.
Training network. lr: 0.000073. clip: 0.029046
Iteration 23134: Policy loss: 0.007313. Value loss: 1.084558. Entropy: 1.304906.
Iteration 23135: Policy loss: 0.046871. Value loss: 0.779292. Entropy: 1.302785.
Iteration 23136: Policy loss: 0.007929. Value loss: 0.626250. Entropy: 1.304822.
Training network. lr: 0.000073. clip: 0.029046
Iteration 23137: Policy loss: -0.132448. Value loss: 1.687302. Entropy: 1.292332.
Iteration 23138: Policy loss: -0.130666. Value loss: 1.371359. Entropy: 1.294027.
Iteration 23139: Policy loss: -0.185890. Value loss: 1.264946. Entropy: 1.291884.
Training network. lr: 0.000073. clip: 0.029046

Iteration 23206: Policy loss: 0.194124. Value loss: 0.909263. Entropy: 1.279225.
Iteration 23207: Policy loss: 0.182230. Value loss: 0.656247. Entropy: 1.282690.
Iteration 23208: Policy loss: 0.185786. Value loss: 0.539283. Entropy: 1.277271.
Training network. lr: 0.000072. clip: 0.028733
Iteration 23209: Policy loss: -0.065902. Value loss: 0.738603. Entropy: 1.297687.
Iteration 23210: Policy loss: -0.103421. Value loss: 0.571308. Entropy: 1.295671.
Iteration 23211: Policy loss: -0.090613. Value loss: 0.536487. Entropy: 1.297429.
Training network. lr: 0.000072. clip: 0.028733
Iteration 23212: Policy loss: -0.076656. Value loss: 0.673987. Entropy: 1.268325.
Iteration 23213: Policy loss: -0.053847. Value loss: 0.571062. Entropy: 1.279374.
Iteration 23214: Policy loss: -0.079903. Value loss: 0.462079. Entropy: 1.275333.
Training network. lr: 0.000072. clip: 0.028733
Iteration 23215: Policy loss: 0.029730. Value loss: 1.517904. Entropy: 1.285607.
Iteration 23216: Policy loss: -0.010967. Va

Iteration 23283: Policy loss: 0.066301. Value loss: 0.519591. Entropy: 1.319776.
Training network. lr: 0.000071. clip: 0.028585
Iteration 23284: Policy loss: -0.062915. Value loss: 0.692180. Entropy: 1.314829.
Iteration 23285: Policy loss: -0.059822. Value loss: 0.492545. Entropy: 1.313515.
Iteration 23286: Policy loss: -0.014641. Value loss: 0.426458. Entropy: 1.310963.
Training network. lr: 0.000071. clip: 0.028585
Iteration 23287: Policy loss: 0.037075. Value loss: 0.534428. Entropy: 1.321834.
Iteration 23288: Policy loss: 0.028033. Value loss: 0.359869. Entropy: 1.319529.
Iteration 23289: Policy loss: 0.014487. Value loss: 0.303914. Entropy: 1.322626.
Training network. lr: 0.000071. clip: 0.028585
Iteration 23290: Policy loss: 0.084657. Value loss: 0.522623. Entropy: 1.307168.
Iteration 23291: Policy loss: 0.038998. Value loss: 0.401111. Entropy: 1.307524.
Iteration 23292: Policy loss: 0.049848. Value loss: 0.305960. Entropy: 1.305870.
Training network. lr: 0.000071. clip: 0.028585

Iteration 23360: Policy loss: 0.033309. Value loss: 0.619952. Entropy: 1.285427.
Iteration 23361: Policy loss: 0.045349. Value loss: 0.471051. Entropy: 1.286978.
episode: 3124   score: 33600.0  epsilon: 1.0    steps: 385  evaluation reward: 33135.0
episode: 3125   score: 29000.0  epsilon: 1.0    steps: 557  evaluation reward: 33134.0
Training network. lr: 0.000071. clip: 0.028272
Iteration 23362: Policy loss: -0.054518. Value loss: 0.341912. Entropy: 1.314885.
Iteration 23363: Policy loss: -0.065295. Value loss: 0.243131. Entropy: 1.312520.
Iteration 23364: Policy loss: -0.086598. Value loss: 0.212968. Entropy: 1.312478.
Training network. lr: 0.000071. clip: 0.028272
Iteration 23365: Policy loss: -0.066549. Value loss: 0.464692. Entropy: 1.311793.
Iteration 23366: Policy loss: -0.055348. Value loss: 0.410042. Entropy: 1.311861.
Iteration 23367: Policy loss: -0.048945. Value loss: 0.334560. Entropy: 1.312612.
Training network. lr: 0.000071. clip: 0.028272
Iteration 23368: Policy loss: 0

Iteration 23436: Policy loss: -0.179083. Value loss: 0.927071. Entropy: 1.268091.
Training network. lr: 0.000070. clip: 0.028124
Iteration 23437: Policy loss: -0.368290. Value loss: 1.335946. Entropy: 1.274300.
Iteration 23438: Policy loss: -0.421697. Value loss: 0.804022. Entropy: 1.267694.
Iteration 23439: Policy loss: -0.342985. Value loss: 0.607873. Entropy: 1.276159.
Training network. lr: 0.000070. clip: 0.028124
Iteration 23440: Policy loss: 0.214936. Value loss: 0.563805. Entropy: 1.290883.
Iteration 23441: Policy loss: 0.232951. Value loss: 0.418335. Entropy: 1.291667.
Iteration 23442: Policy loss: 0.207940. Value loss: 0.324554. Entropy: 1.292962.
Training network. lr: 0.000070. clip: 0.028124
Iteration 23443: Policy loss: 0.365494. Value loss: 1.230273. Entropy: 1.289367.
Iteration 23444: Policy loss: 0.363350. Value loss: 0.860380. Entropy: 1.284822.
Iteration 23445: Policy loss: 0.367688. Value loss: 0.723928. Entropy: 1.287016.
episode: 3134   score: 62800.0  epsilon: 1.0 

Training network. lr: 0.000070. clip: 0.027811
Iteration 23512: Policy loss: -0.137862. Value loss: 0.984954. Entropy: 1.313746.
Iteration 23513: Policy loss: -0.111065. Value loss: 0.616448. Entropy: 1.309876.
Iteration 23514: Policy loss: -0.084101. Value loss: 0.486210. Entropy: 1.316359.
Training network. lr: 0.000070. clip: 0.027811
Iteration 23515: Policy loss: -0.167119. Value loss: 1.497475. Entropy: 1.283712.
Iteration 23516: Policy loss: -0.204767. Value loss: 0.943023. Entropy: 1.281153.
Iteration 23517: Policy loss: -0.177732. Value loss: 0.688201. Entropy: 1.281741.
episode: 3144   score: 29000.0  epsilon: 1.0    steps: 91  evaluation reward: 33165.0
Training network. lr: 0.000070. clip: 0.027811
Iteration 23518: Policy loss: 0.151988. Value loss: 1.009229. Entropy: 1.282650.
Iteration 23519: Policy loss: 0.119375. Value loss: 0.684325. Entropy: 1.284982.
Iteration 23520: Policy loss: 0.114400. Value loss: 0.520049. Entropy: 1.284531.
Training network. lr: 0.000070. clip: 

Training network. lr: 0.000069. clip: 0.027664
Iteration 23590: Policy loss: -0.155775. Value loss: 1.185457. Entropy: 1.244995.
Iteration 23591: Policy loss: -0.105718. Value loss: 0.851150. Entropy: 1.238668.
Iteration 23592: Policy loss: -0.144659. Value loss: 0.678283. Entropy: 1.245728.
episode: 3152   score: 42600.0  epsilon: 1.0    steps: 120  evaluation reward: 32867.0
episode: 3153   score: 46600.0  epsilon: 1.0    steps: 597  evaluation reward: 32985.0
Training network. lr: 0.000069. clip: 0.027664
Iteration 23593: Policy loss: 0.263781. Value loss: 0.772839. Entropy: 1.279062.
Iteration 23594: Policy loss: 0.275587. Value loss: 0.437582. Entropy: 1.276166.
Iteration 23595: Policy loss: 0.284368. Value loss: 0.376331. Entropy: 1.273590.
Training network. lr: 0.000069. clip: 0.027664
Iteration 23596: Policy loss: 0.220185. Value loss: 0.335859. Entropy: 1.266507.
Iteration 23597: Policy loss: 0.237809. Value loss: 0.221665. Entropy: 1.269840.
Iteration 23598: Policy loss: 0.21

Iteration 23665: Policy loss: 0.352038. Value loss: 0.990095. Entropy: 1.269630.
Iteration 23666: Policy loss: 0.348269. Value loss: 0.556176. Entropy: 1.270486.
Iteration 23667: Policy loss: 0.289669. Value loss: 0.506415. Entropy: 1.270128.
Training network. lr: 0.000068. clip: 0.027350
Iteration 23668: Policy loss: -0.258971. Value loss: 1.136474. Entropy: 1.254622.
Iteration 23669: Policy loss: -0.249719. Value loss: 0.636893. Entropy: 1.258874.
Iteration 23670: Policy loss: -0.234703. Value loss: 0.542005. Entropy: 1.257589.
Training network. lr: 0.000068. clip: 0.027350
Iteration 23671: Policy loss: -0.206089. Value loss: 1.141757. Entropy: 1.271092.
Iteration 23672: Policy loss: -0.160939. Value loss: 0.701379. Entropy: 1.268904.
Iteration 23673: Policy loss: -0.218942. Value loss: 0.574098. Entropy: 1.272344.
episode: 3162   score: 19600.0  epsilon: 1.0    steps: 400  evaluation reward: 33801.0
episode: 3163   score: 37900.0  epsilon: 1.0    steps: 955  evaluation reward: 34087

Iteration 23742: Policy loss: 0.090987. Value loss: 0.937795. Entropy: 1.221136.
Training network. lr: 0.000068. clip: 0.027203
Iteration 23743: Policy loss: 0.150022. Value loss: 0.865681. Entropy: 1.229424.
Iteration 23744: Policy loss: 0.164746. Value loss: 0.573739. Entropy: 1.232881.
Iteration 23745: Policy loss: 0.147399. Value loss: 0.561300. Entropy: 1.228795.
episode: 3171   score: 52700.0  epsilon: 1.0    steps: 558  evaluation reward: 35124.0
Training network. lr: 0.000068. clip: 0.027203
Iteration 23746: Policy loss: 0.151172. Value loss: 0.597885. Entropy: 1.223317.
Iteration 23747: Policy loss: 0.175213. Value loss: 0.452153. Entropy: 1.226113.
Iteration 23748: Policy loss: 0.170648. Value loss: 0.313078. Entropy: 1.224575.
episode: 3172   score: 19600.0  epsilon: 1.0    steps: 746  evaluation reward: 35115.0
Training network. lr: 0.000068. clip: 0.027203
Iteration 23749: Policy loss: -0.004724. Value loss: 0.918748. Entropy: 1.259398.
Iteration 23750: Policy loss: 0.0375

Iteration 23819: Policy loss: 0.400464. Value loss: 0.746837. Entropy: 1.231150.
Iteration 23820: Policy loss: 0.406120. Value loss: 0.559611. Entropy: 1.229529.
episode: 3180   score: 40000.0  epsilon: 1.0    steps: 116  evaluation reward: 35386.0
episode: 3181   score: 37800.0  epsilon: 1.0    steps: 695  evaluation reward: 35242.0
Training network. lr: 0.000067. clip: 0.026889
Iteration 23821: Policy loss: 0.004405. Value loss: 0.473623. Entropy: 1.236613.
Iteration 23822: Policy loss: -0.009834. Value loss: 0.249666. Entropy: 1.236109.
Iteration 23823: Policy loss: -0.008408. Value loss: 0.211309. Entropy: 1.238562.
Training network. lr: 0.000067. clip: 0.026889
Iteration 23824: Policy loss: 0.171468. Value loss: 0.255632. Entropy: 1.236264.
Iteration 23825: Policy loss: 0.180248. Value loss: 0.191935. Entropy: 1.239719.
Iteration 23826: Policy loss: 0.165418. Value loss: 0.172920. Entropy: 1.238687.
episode: 3182   score: 70600.0  epsilon: 1.0    steps: 993  evaluation reward: 357

Iteration 23894: Policy loss: -0.047530. Value loss: 0.558222. Entropy: 1.233298.
Iteration 23895: Policy loss: -0.027159. Value loss: 0.475878. Entropy: 1.236944.
Training network. lr: 0.000067. clip: 0.026742
Iteration 23896: Policy loss: -0.002995. Value loss: 0.684634. Entropy: 1.244870.
Iteration 23897: Policy loss: -0.019025. Value loss: 0.505184. Entropy: 1.244201.
Iteration 23898: Policy loss: -0.003993. Value loss: 0.439631. Entropy: 1.245395.
Training network. lr: 0.000067. clip: 0.026742
Iteration 23899: Policy loss: 0.176155. Value loss: 0.924492. Entropy: 1.219608.
Iteration 23900: Policy loss: 0.247522. Value loss: 0.607356. Entropy: 1.226241.
Iteration 23901: Policy loss: 0.240979. Value loss: 0.499582. Entropy: 1.228898.
Training network. lr: 0.000066. clip: 0.026585
Iteration 23902: Policy loss: 0.070790. Value loss: 1.226670. Entropy: 1.216804.
Iteration 23903: Policy loss: 0.125290. Value loss: 0.596304. Entropy: 1.222671.
Iteration 23904: Policy loss: 0.014591. Valu

Iteration 23972: Policy loss: 0.404500. Value loss: 0.414053. Entropy: 1.213552.
Iteration 23973: Policy loss: 0.406836. Value loss: 0.345652. Entropy: 1.223056.
Training network. lr: 0.000066. clip: 0.026429
Iteration 23974: Policy loss: -0.051077. Value loss: 1.264160. Entropy: 1.195560.
Iteration 23975: Policy loss: -0.053242. Value loss: 0.812822. Entropy: 1.202834.
Iteration 23976: Policy loss: -0.009767. Value loss: 0.742781. Entropy: 1.200032.
Training network. lr: 0.000066. clip: 0.026429
Iteration 23977: Policy loss: -0.222949. Value loss: 1.412820. Entropy: 1.163350.
Iteration 23978: Policy loss: -0.200073. Value loss: 0.999479. Entropy: 1.168665.
Iteration 23979: Policy loss: -0.228630. Value loss: 0.872837. Entropy: 1.160275.
episode: 3199   score: 32300.0  epsilon: 1.0    steps: 841  evaluation reward: 35863.0
Training network. lr: 0.000066. clip: 0.026429
Iteration 23980: Policy loss: 0.002099. Value loss: 0.980786. Entropy: 1.183486.
Iteration 23981: Policy loss: 0.08516

Iteration 24049: Policy loss: -0.175318. Value loss: 1.310491. Entropy: 1.187963.
Iteration 24050: Policy loss: -0.184229. Value loss: 0.751805. Entropy: 1.198065.
Iteration 24051: Policy loss: -0.260442. Value loss: 0.613724. Entropy: 1.188332.
episode: 3207   score: 27100.0  epsilon: 1.0    steps: 533  evaluation reward: 35981.0
Training network. lr: 0.000065. clip: 0.026125
Iteration 24052: Policy loss: 0.111048. Value loss: 0.806079. Entropy: 1.193073.
Iteration 24053: Policy loss: 0.157660. Value loss: 0.603082. Entropy: 1.200511.
Iteration 24054: Policy loss: 0.120723. Value loss: 0.527081. Entropy: 1.195629.
Training network. lr: 0.000065. clip: 0.026125
Iteration 24055: Policy loss: 0.119701. Value loss: 1.006325. Entropy: 1.201198.
Iteration 24056: Policy loss: 0.086621. Value loss: 0.759489. Entropy: 1.207482.
Iteration 24057: Policy loss: 0.092868. Value loss: 0.604796. Entropy: 1.203087.
Training network. lr: 0.000065. clip: 0.026125
Iteration 24058: Policy loss: 0.138859. 

Training network. lr: 0.000065. clip: 0.025968
Iteration 24124: Policy loss: 0.068848. Value loss: 0.976817. Entropy: 1.202796.
Iteration 24125: Policy loss: 0.008019. Value loss: 0.674778. Entropy: 1.204581.
Iteration 24126: Policy loss: 0.046265. Value loss: 0.496225. Entropy: 1.205351.
Training network. lr: 0.000065. clip: 0.025968
Iteration 24127: Policy loss: -0.185934. Value loss: 0.927238. Entropy: 1.186758.
Iteration 24128: Policy loss: -0.188961. Value loss: 0.660681. Entropy: 1.190869.
Iteration 24129: Policy loss: -0.218179. Value loss: 0.540101. Entropy: 1.189749.
episode: 3218   score: 27300.0  epsilon: 1.0    steps: 35  evaluation reward: 36964.0
Training network. lr: 0.000065. clip: 0.025968
Iteration 24130: Policy loss: 0.173874. Value loss: 0.615340. Entropy: 1.216101.
Iteration 24131: Policy loss: 0.084747. Value loss: 0.373545. Entropy: 1.206455.
Iteration 24132: Policy loss: 0.158222. Value loss: 0.307597. Entropy: 1.210758.
Training network. lr: 0.000065. clip: 0.0

Iteration 24199: Policy loss: 0.187062. Value loss: 0.312039. Entropy: 1.239697.
Iteration 24200: Policy loss: 0.216876. Value loss: 0.205823. Entropy: 1.239701.
Iteration 24201: Policy loss: 0.203537. Value loss: 0.160252. Entropy: 1.239226.
Training network. lr: 0.000064. clip: 0.025664
Iteration 24202: Policy loss: -0.007037. Value loss: 0.423361. Entropy: 1.248113.
Iteration 24203: Policy loss: -0.037809. Value loss: 0.242726. Entropy: 1.246351.
Iteration 24204: Policy loss: -0.010987. Value loss: 0.161659. Entropy: 1.254736.
Training network. lr: 0.000064. clip: 0.025664
Iteration 24205: Policy loss: 0.004707. Value loss: 0.362452. Entropy: 1.226486.
Iteration 24206: Policy loss: -0.021812. Value loss: 0.252363. Entropy: 1.224938.
Iteration 24207: Policy loss: 0.009632. Value loss: 0.220738. Entropy: 1.224036.
Training network. lr: 0.000064. clip: 0.025664
Iteration 24208: Policy loss: -0.088755. Value loss: 0.906401. Entropy: 1.203984.
Iteration 24209: Policy loss: -0.051831. Val

Training network. lr: 0.000064. clip: 0.025507
Iteration 24277: Policy loss: 0.139448. Value loss: 0.575363. Entropy: 1.202490.
Iteration 24278: Policy loss: 0.190131. Value loss: 0.340399. Entropy: 1.201238.
Iteration 24279: Policy loss: 0.150415. Value loss: 0.303629. Entropy: 1.198892.
Training network. lr: 0.000064. clip: 0.025507
Iteration 24280: Policy loss: -0.368815. Value loss: 1.210093. Entropy: 1.202858.
Iteration 24281: Policy loss: -0.376371. Value loss: 0.992774. Entropy: 1.198558.
Iteration 24282: Policy loss: -0.319087. Value loss: 0.717864. Entropy: 1.196745.
Training network. lr: 0.000064. clip: 0.025507
Iteration 24283: Policy loss: 0.028805. Value loss: 0.881806. Entropy: 1.191447.
Iteration 24284: Policy loss: 0.016028. Value loss: 0.625213. Entropy: 1.193207.
Iteration 24285: Policy loss: -0.010790. Value loss: 0.505398. Entropy: 1.191659.
Training network. lr: 0.000064. clip: 0.025507
Iteration 24286: Policy loss: 0.176927. Value loss: 1.444032. Entropy: 1.165390

Iteration 24352: Policy loss: 0.204559. Value loss: 0.653278. Entropy: 1.269172.
Iteration 24353: Policy loss: 0.205017. Value loss: 0.447031. Entropy: 1.265527.
Iteration 24354: Policy loss: 0.232566. Value loss: 0.426583. Entropy: 1.267583.
Training network. lr: 0.000063. clip: 0.025203
Iteration 24355: Policy loss: -0.066002. Value loss: 1.178169. Entropy: 1.246101.
Iteration 24356: Policy loss: -0.046951. Value loss: 0.726640. Entropy: 1.252538.
Iteration 24357: Policy loss: -0.106951. Value loss: 0.737518. Entropy: 1.246461.
episode: 3246   score: 20900.0  epsilon: 1.0    steps: 301  evaluation reward: 37512.0
Training network. lr: 0.000063. clip: 0.025203
Iteration 24358: Policy loss: 0.156454. Value loss: 0.731894. Entropy: 1.264125.
Iteration 24359: Policy loss: 0.142282. Value loss: 0.571646. Entropy: 1.264810.
Iteration 24360: Policy loss: 0.150110. Value loss: 0.433708. Entropy: 1.264583.
Training network. lr: 0.000063. clip: 0.025203
Iteration 24361: Policy loss: -0.143864.

Iteration 24428: Policy loss: 0.099825. Value loss: 0.264243. Entropy: 1.251089.
Iteration 24429: Policy loss: 0.100680. Value loss: 0.206701. Entropy: 1.248672.
Training network. lr: 0.000063. clip: 0.025046
Iteration 24430: Policy loss: 0.018825. Value loss: 0.583763. Entropy: 1.251882.
Iteration 24431: Policy loss: 0.023109. Value loss: 0.392828. Entropy: 1.254122.
Iteration 24432: Policy loss: 0.017849. Value loss: 0.322149. Entropy: 1.259734.
episode: 3256   score: 45900.0  epsilon: 1.0    steps: 720  evaluation reward: 37830.0
Training network. lr: 0.000063. clip: 0.025046
Iteration 24433: Policy loss: 0.318151. Value loss: 0.608729. Entropy: 1.255049.
Iteration 24434: Policy loss: 0.381158. Value loss: 0.460439. Entropy: 1.254976.
Iteration 24435: Policy loss: 0.335394. Value loss: 0.354720. Entropy: 1.252332.
Training network. lr: 0.000063. clip: 0.025046
Iteration 24436: Policy loss: -0.245448. Value loss: 1.280817. Entropy: 1.267869.
Iteration 24437: Policy loss: -0.243073. V

Iteration 24504: Policy loss: -0.051051. Value loss: 0.470192. Entropy: 1.296146.
episode: 3266   score: 41200.0  epsilon: 1.0    steps: 818  evaluation reward: 37292.0
Training network. lr: 0.000062. clip: 0.024742
Iteration 24505: Policy loss: -0.102192. Value loss: 1.012969. Entropy: 1.260790.
Iteration 24506: Policy loss: -0.055090. Value loss: 0.748951. Entropy: 1.267785.
Iteration 24507: Policy loss: -0.080958. Value loss: 0.661432. Entropy: 1.259900.
episode: 3267   score: 10200.0  epsilon: 1.0    steps: 492  evaluation reward: 37126.0
Training network. lr: 0.000062. clip: 0.024742
Iteration 24508: Policy loss: -0.074930. Value loss: 0.664248. Entropy: 1.280878.
Iteration 24509: Policy loss: -0.092976. Value loss: 0.670160. Entropy: 1.284772.
Iteration 24510: Policy loss: -0.081145. Value loss: 0.487185. Entropy: 1.282783.
Training network. lr: 0.000062. clip: 0.024742
Iteration 24511: Policy loss: -0.119370. Value loss: 0.909148. Entropy: 1.262692.
Iteration 24512: Policy loss:

Training network. lr: 0.000061. clip: 0.024585
Iteration 24580: Policy loss: 0.062133. Value loss: 0.530870. Entropy: 1.255599.
Iteration 24581: Policy loss: 0.141411. Value loss: 0.356901. Entropy: 1.259407.
Iteration 24582: Policy loss: 0.070597. Value loss: 0.319527. Entropy: 1.261193.
episode: 3276   score: 48000.0  epsilon: 1.0    steps: 629  evaluation reward: 37086.0
Training network. lr: 0.000061. clip: 0.024585
Iteration 24583: Policy loss: 0.260453. Value loss: 0.244158. Entropy: 1.274049.
Iteration 24584: Policy loss: 0.261080. Value loss: 0.154827. Entropy: 1.275936.
Iteration 24585: Policy loss: 0.267144. Value loss: 0.149200. Entropy: 1.279745.
Training network. lr: 0.000061. clip: 0.024585
Iteration 24586: Policy loss: 0.059815. Value loss: 0.578710. Entropy: 1.285208.
Iteration 24587: Policy loss: 0.045954. Value loss: 0.377401. Entropy: 1.288914.
Iteration 24588: Policy loss: 0.051682. Value loss: 0.297105. Entropy: 1.285456.
Training network. lr: 0.000061. clip: 0.024

Iteration 24658: Policy loss: 0.069168. Value loss: 1.230053. Entropy: 1.257627.
Iteration 24659: Policy loss: 0.102078. Value loss: 0.934962. Entropy: 1.256906.
Iteration 24660: Policy loss: 0.080489. Value loss: 0.804110. Entropy: 1.258490.
Training network. lr: 0.000061. clip: 0.024281
Iteration 24661: Policy loss: -0.054495. Value loss: 1.072681. Entropy: 1.244275.
Iteration 24662: Policy loss: -0.021830. Value loss: 0.815323. Entropy: 1.238411.
Iteration 24663: Policy loss: -0.055163. Value loss: 0.676474. Entropy: 1.237414.
Training network. lr: 0.000061. clip: 0.024281
Iteration 24664: Policy loss: 0.407624. Value loss: 1.000992. Entropy: 1.268536.
Iteration 24665: Policy loss: 0.474700. Value loss: 0.684827. Entropy: 1.267613.
Iteration 24666: Policy loss: 0.412923. Value loss: 0.566312. Entropy: 1.268023.
episode: 3283   score: 62800.0  epsilon: 1.0    steps: 214  evaluation reward: 36891.0
Training network. lr: 0.000061. clip: 0.024281
Iteration 24667: Policy loss: 0.032004. 

Iteration 24734: Policy loss: -0.323780. Value loss: 1.173960. Entropy: 1.261643.
Iteration 24735: Policy loss: -0.301018. Value loss: 0.792316. Entropy: 1.263033.
Training network. lr: 0.000060. clip: 0.024125
Iteration 24736: Policy loss: 0.230870. Value loss: 1.150165. Entropy: 1.264332.
Iteration 24737: Policy loss: 0.241378. Value loss: 0.763325. Entropy: 1.266044.
Iteration 24738: Policy loss: 0.223203. Value loss: 0.774226. Entropy: 1.260396.
Training network. lr: 0.000060. clip: 0.024125
Iteration 24739: Policy loss: 0.159356. Value loss: 0.936994. Entropy: 1.282439.
Iteration 24740: Policy loss: 0.151321. Value loss: 0.759834. Entropy: 1.276078.
Iteration 24741: Policy loss: 0.145010. Value loss: 0.569954. Entropy: 1.271346.
Training network. lr: 0.000060. clip: 0.024125
Iteration 24742: Policy loss: -0.183575. Value loss: 1.730815. Entropy: 1.259395.
Iteration 24743: Policy loss: -0.218004. Value loss: 1.048906. Entropy: 1.259261.
Iteration 24744: Policy loss: -0.144644. Valu

Iteration 24813: Policy loss: 0.403563. Value loss: 0.642072. Entropy: 1.293112.
episode: 3300   score: 29200.0  epsilon: 1.0    steps: 422  evaluation reward: 37982.0
Training network. lr: 0.000060. clip: 0.023821
Iteration 24814: Policy loss: -0.064632. Value loss: 0.777073. Entropy: 1.264499.
Iteration 24815: Policy loss: -0.058428. Value loss: 0.533085. Entropy: 1.254699.
Iteration 24816: Policy loss: -0.080334. Value loss: 0.398574. Entropy: 1.259728.
Training network. lr: 0.000060. clip: 0.023821
Iteration 24817: Policy loss: -0.150807. Value loss: 0.844683. Entropy: 1.259547.
Iteration 24818: Policy loss: -0.106384. Value loss: 0.579343. Entropy: 1.262335.
Iteration 24819: Policy loss: -0.139183. Value loss: 0.518660. Entropy: 1.262000.
now time :  2019-02-27 02:08:31.806708
episode: 3301   score: 61300.0  epsilon: 1.0    steps: 637  evaluation reward: 38058.0
Training network. lr: 0.000060. clip: 0.023821
Iteration 24820: Policy loss: 0.230736. Value loss: 1.742716. Entropy: 1.

Iteration 24888: Policy loss: 0.218955. Value loss: 0.306902. Entropy: 1.281505.
Training network. lr: 0.000059. clip: 0.023664
Iteration 24889: Policy loss: 0.145424. Value loss: 1.241208. Entropy: 1.268585.
Iteration 24890: Policy loss: 0.090252. Value loss: 0.950855. Entropy: 1.273735.
Iteration 24891: Policy loss: 0.129180. Value loss: 0.624619. Entropy: 1.270657.
Training network. lr: 0.000059. clip: 0.023664
Iteration 24892: Policy loss: -0.038185. Value loss: 0.519727. Entropy: 1.265720.
Iteration 24893: Policy loss: -0.011668. Value loss: 0.302211. Entropy: 1.269260.
Iteration 24894: Policy loss: -0.025532. Value loss: 0.275185. Entropy: 1.270514.
Training network. lr: 0.000059. clip: 0.023664
Iteration 24895: Policy loss: -0.184242. Value loss: 0.809375. Entropy: 1.225196.
Iteration 24896: Policy loss: -0.175629. Value loss: 0.568168. Entropy: 1.232896.
Iteration 24897: Policy loss: -0.201433. Value loss: 0.596955. Entropy: 1.225779.
Training network. lr: 0.000059. clip: 0.023

Iteration 24965: Policy loss: 0.013617. Value loss: 1.145421. Entropy: 1.251499.
Iteration 24966: Policy loss: 0.028867. Value loss: 0.998639. Entropy: 1.248000.
Training network. lr: 0.000058. clip: 0.023360
Iteration 24967: Policy loss: -0.029876. Value loss: 1.513880. Entropy: 1.264643.
Iteration 24968: Policy loss: -0.070226. Value loss: 0.887164. Entropy: 1.265508.
Iteration 24969: Policy loss: -0.028974. Value loss: 0.764383. Entropy: 1.265848.
Training network. lr: 0.000058. clip: 0.023360
Iteration 24970: Policy loss: 0.078986. Value loss: 1.195079. Entropy: 1.248488.
Iteration 24971: Policy loss: 0.119676. Value loss: 0.819861. Entropy: 1.244793.
Iteration 24972: Policy loss: 0.070854. Value loss: 0.789139. Entropy: 1.251134.
Training network. lr: 0.000058. clip: 0.023360
Iteration 24973: Policy loss: -0.278165. Value loss: 1.290190. Entropy: 1.230531.
Iteration 24974: Policy loss: -0.231703. Value loss: 0.699368. Entropy: 1.226291.
Iteration 24975: Policy loss: -0.263498. Val

Training network. lr: 0.000058. clip: 0.023203
Iteration 25042: Policy loss: -0.037487. Value loss: 1.415754. Entropy: 1.259290.
Iteration 25043: Policy loss: 0.018825. Value loss: 0.918952. Entropy: 1.266160.
Iteration 25044: Policy loss: -0.051772. Value loss: 0.778021. Entropy: 1.262337.
Training network. lr: 0.000058. clip: 0.023203
Iteration 25045: Policy loss: -0.118894. Value loss: 1.123058. Entropy: 1.222940.
Iteration 25046: Policy loss: -0.062536. Value loss: 0.681014. Entropy: 1.222117.
Iteration 25047: Policy loss: -0.096787. Value loss: 0.597156. Entropy: 1.218173.
Training network. lr: 0.000058. clip: 0.023203
Iteration 25048: Policy loss: 0.176305. Value loss: 0.751102. Entropy: 1.270220.
Iteration 25049: Policy loss: 0.172597. Value loss: 0.549337. Entropy: 1.271237.
Iteration 25050: Policy loss: 0.160634. Value loss: 0.460411. Entropy: 1.271571.
episode: 3328   score: 33400.0  epsilon: 1.0    steps: 67  evaluation reward: 37669.0
Training network. lr: 0.000058. clip: 0

Iteration 25117: Policy loss: 0.186487. Value loss: 0.403626. Entropy: 1.258916.
Iteration 25118: Policy loss: 0.187901. Value loss: 0.208685. Entropy: 1.259210.
Iteration 25119: Policy loss: 0.174697. Value loss: 0.235195. Entropy: 1.263477.
Training network. lr: 0.000057. clip: 0.022899
Iteration 25120: Policy loss: 0.185529. Value loss: 0.234735. Entropy: 1.259864.
Iteration 25121: Policy loss: 0.179132. Value loss: 0.178568. Entropy: 1.263720.
Iteration 25122: Policy loss: 0.164975. Value loss: 0.167633. Entropy: 1.264787.
episode: 3338   score: 29300.0  epsilon: 1.0    steps: 600  evaluation reward: 37305.0
episode: 3339   score: 35700.0  epsilon: 1.0    steps: 1011  evaluation reward: 37419.0
Training network. lr: 0.000057. clip: 0.022899
Iteration 25123: Policy loss: -0.205818. Value loss: 1.772954. Entropy: 1.260159.
Iteration 25124: Policy loss: -0.185406. Value loss: 1.169757. Entropy: 1.260927.
Iteration 25125: Policy loss: -0.214445. Value loss: 0.985577. Entropy: 1.257394.

Iteration 25194: Policy loss: -0.122130. Value loss: 0.355881. Entropy: 1.246235.
episode: 3347   score: 23200.0  epsilon: 1.0    steps: 164  evaluation reward: 37550.0
Training network. lr: 0.000057. clip: 0.022742
Iteration 25195: Policy loss: 0.051125. Value loss: 0.715051. Entropy: 1.263634.
Iteration 25196: Policy loss: 0.092554. Value loss: 0.546255. Entropy: 1.257915.
Iteration 25197: Policy loss: 0.017302. Value loss: 0.509055. Entropy: 1.263380.
Training network. lr: 0.000057. clip: 0.022742
Iteration 25198: Policy loss: 0.117775. Value loss: 0.726144. Entropy: 1.252890.
Iteration 25199: Policy loss: 0.156492. Value loss: 0.470702. Entropy: 1.244979.
Iteration 25200: Policy loss: 0.127147. Value loss: 0.424102. Entropy: 1.249305.
episode: 3348   score: 30600.0  epsilon: 1.0    steps: 262  evaluation reward: 37477.0
episode: 3349   score: 48200.0  epsilon: 1.0    steps: 564  evaluation reward: 37584.0
Training network. lr: 0.000056. clip: 0.022595
Iteration 25201: Policy loss: 

Iteration 25267: Policy loss: -0.157683. Value loss: 0.919048. Entropy: 1.244872.
Iteration 25268: Policy loss: -0.138121. Value loss: 0.484025. Entropy: 1.242860.
Iteration 25269: Policy loss: -0.135394. Value loss: 0.406119. Entropy: 1.242874.
episode: 3359   score: 42200.0  epsilon: 1.0    steps: 244  evaluation reward: 36719.0
Training network. lr: 0.000056. clip: 0.022438
Iteration 25270: Policy loss: 0.226967. Value loss: 1.233477. Entropy: 1.199485.
Iteration 25271: Policy loss: 0.250804. Value loss: 1.177077. Entropy: 1.193114.
Iteration 25272: Policy loss: 0.299139. Value loss: 0.786240. Entropy: 1.199011.
Training network. lr: 0.000056. clip: 0.022438
Iteration 25273: Policy loss: -0.034995. Value loss: 0.886208. Entropy: 1.220888.
Iteration 25274: Policy loss: -0.050103. Value loss: 0.523173. Entropy: 1.215304.
Iteration 25275: Policy loss: -0.041113. Value loss: 0.469151. Entropy: 1.220100.
episode: 3360   score: 48100.0  epsilon: 1.0    steps: 560  evaluation reward: 36768

episode: 3367   score: 37600.0  epsilon: 1.0    steps: 633  evaluation reward: 37364.0
Training network. lr: 0.000056. clip: 0.022281
Iteration 25345: Policy loss: -0.075427. Value loss: 0.927221. Entropy: 1.172132.
Iteration 25346: Policy loss: -0.045767. Value loss: 0.693478. Entropy: 1.161661.
Iteration 25347: Policy loss: -0.032731. Value loss: 0.539532. Entropy: 1.158892.
Training network. lr: 0.000056. clip: 0.022281
Iteration 25348: Policy loss: 0.133142. Value loss: 0.955087. Entropy: 1.205514.
Iteration 25349: Policy loss: 0.094715. Value loss: 0.786102. Entropy: 1.208501.
Iteration 25350: Policy loss: 0.093366. Value loss: 0.650012. Entropy: 1.203205.
episode: 3368   score: 40700.0  epsilon: 1.0    steps: 224  evaluation reward: 37580.0
Training network. lr: 0.000055. clip: 0.022134
Iteration 25351: Policy loss: 0.477103. Value loss: 0.381451. Entropy: 1.235153.
Iteration 25352: Policy loss: 0.471320. Value loss: 0.293028. Entropy: 1.241039.
Iteration 25353: Policy loss: 0.46

episode: 3375   score: 38800.0  epsilon: 1.0    steps: 615  evaluation reward: 37670.0
Training network. lr: 0.000055. clip: 0.021977
Iteration 25423: Policy loss: -0.077034. Value loss: 1.216299. Entropy: 1.173136.
Iteration 25424: Policy loss: -0.041627. Value loss: 0.873674. Entropy: 1.176371.
Iteration 25425: Policy loss: -0.068834. Value loss: 0.800144. Entropy: 1.178083.
episode: 3376   score: 57100.0  epsilon: 1.0    steps: 242  evaluation reward: 37761.0
Training network. lr: 0.000055. clip: 0.021977
Iteration 25426: Policy loss: -0.000369. Value loss: 1.027539. Entropy: 1.157965.
Iteration 25427: Policy loss: 0.026354. Value loss: 0.666938. Entropy: 1.155617.
Iteration 25428: Policy loss: -0.004608. Value loss: 0.671437. Entropy: 1.162468.
episode: 3377   score: 68600.0  epsilon: 1.0    steps: 757  evaluation reward: 38111.0
Training network. lr: 0.000055. clip: 0.021977
Iteration 25429: Policy loss: 0.153533. Value loss: 0.889880. Entropy: 1.220144.
Iteration 25430: Policy lo

Iteration 25499: Policy loss: 0.142457. Value loss: 0.513899. Entropy: 1.194679.
Iteration 25500: Policy loss: 0.137016. Value loss: 0.482900. Entropy: 1.191285.
Training network. lr: 0.000054. clip: 0.021673
Iteration 25501: Policy loss: 0.137415. Value loss: 1.142643. Entropy: 1.168791.
Iteration 25502: Policy loss: 0.088664. Value loss: 0.868249. Entropy: 1.172106.
Iteration 25503: Policy loss: 0.122306. Value loss: 0.730072. Entropy: 1.166218.
episode: 3384   score: 48600.0  epsilon: 1.0    steps: 176  evaluation reward: 37932.0
Training network. lr: 0.000054. clip: 0.021673
Iteration 25504: Policy loss: -0.108793. Value loss: 1.440858. Entropy: 1.165432.
Iteration 25505: Policy loss: -0.163445. Value loss: 0.950749. Entropy: 1.167598.
Iteration 25506: Policy loss: -0.123668. Value loss: 0.806081. Entropy: 1.176675.
episode: 3385   score: 40900.0  epsilon: 1.0    steps: 60  evaluation reward: 37790.0
Training network. lr: 0.000054. clip: 0.021673
Iteration 25507: Policy loss: 0.383

Iteration 25574: Policy loss: 0.172833. Value loss: 0.881616. Entropy: 1.220537.
Iteration 25575: Policy loss: 0.199342. Value loss: 0.626121. Entropy: 1.222464.
Training network. lr: 0.000054. clip: 0.021517
Iteration 25576: Policy loss: 0.194223. Value loss: 0.659361. Entropy: 1.180309.
Iteration 25577: Policy loss: 0.219535. Value loss: 0.395227. Entropy: 1.181802.
Iteration 25578: Policy loss: 0.186085. Value loss: 0.351620. Entropy: 1.176928.
Training network. lr: 0.000054. clip: 0.021517
Iteration 25579: Policy loss: 0.011762. Value loss: 0.923500. Entropy: 1.194159.
Iteration 25580: Policy loss: 0.010780. Value loss: 0.758499. Entropy: 1.203466.
Iteration 25581: Policy loss: 0.027405. Value loss: 0.511760. Entropy: 1.196746.
episode: 3395   score: 42300.0  epsilon: 1.0    steps: 998  evaluation reward: 37666.0
Training network. lr: 0.000054. clip: 0.021517
Iteration 25582: Policy loss: -0.205597. Value loss: 1.190407. Entropy: 1.196698.
Iteration 25583: Policy loss: -0.232433. V

Iteration 25652: Policy loss: 0.035473. Value loss: 0.679652. Entropy: 1.172452.
Iteration 25653: Policy loss: 0.031819. Value loss: 0.602029. Entropy: 1.174989.
Training network. lr: 0.000053. clip: 0.021212
Iteration 25654: Policy loss: -0.226709. Value loss: 0.966206. Entropy: 1.127764.
Iteration 25655: Policy loss: -0.179101. Value loss: 0.763382. Entropy: 1.115390.
Iteration 25656: Policy loss: -0.187476. Value loss: 0.642624. Entropy: 1.128593.
episode: 3402   score: 46700.0  epsilon: 1.0    steps: 290  evaluation reward: 37536.0
Training network. lr: 0.000053. clip: 0.021212
Iteration 25657: Policy loss: 0.010701. Value loss: 0.802330. Entropy: 1.150360.
Iteration 25658: Policy loss: 0.057866. Value loss: 0.587252. Entropy: 1.146278.
Iteration 25659: Policy loss: -0.017778. Value loss: 0.531340. Entropy: 1.141715.
Training network. lr: 0.000053. clip: 0.021212
Iteration 25660: Policy loss: -0.274862. Value loss: 0.806815. Entropy: 1.118672.
Iteration 25661: Policy loss: -0.27504

Training network. lr: 0.000053. clip: 0.021056
Iteration 25729: Policy loss: -0.088517. Value loss: 0.818701. Entropy: 1.195605.
Iteration 25730: Policy loss: -0.111712. Value loss: 0.439612. Entropy: 1.201213.
Iteration 25731: Policy loss: -0.034777. Value loss: 0.325713. Entropy: 1.199781.
Training network. lr: 0.000053. clip: 0.021056
Iteration 25732: Policy loss: -0.369966. Value loss: 1.350487. Entropy: 1.139176.
Iteration 25733: Policy loss: -0.336116. Value loss: 0.807530. Entropy: 1.140154.
Iteration 25734: Policy loss: -0.346879. Value loss: 0.688926. Entropy: 1.154328.
Training network. lr: 0.000053. clip: 0.021056
Iteration 25735: Policy loss: 0.233662. Value loss: 0.597525. Entropy: 1.185496.
Iteration 25736: Policy loss: 0.216044. Value loss: 0.589563. Entropy: 1.176424.
Iteration 25737: Policy loss: 0.247021. Value loss: 0.400919. Entropy: 1.181629.
Training network. lr: 0.000053. clip: 0.021056
Iteration 25738: Policy loss: -0.213289. Value loss: 1.476116. Entropy: 1.151

Iteration 25807: Policy loss: -0.180649. Value loss: 1.439745. Entropy: 1.148272.
Iteration 25808: Policy loss: -0.170659. Value loss: 1.293350. Entropy: 1.147596.
Iteration 25809: Policy loss: -0.150608. Value loss: 0.941288. Entropy: 1.152331.
episode: 3418   score: 58600.0  epsilon: 1.0    steps: 662  evaluation reward: 39012.0
Training network. lr: 0.000052. clip: 0.020752
Iteration 25810: Policy loss: 0.333587. Value loss: 1.044958. Entropy: 1.187732.
Iteration 25811: Policy loss: 0.323216. Value loss: 0.740935. Entropy: 1.190436.
Iteration 25812: Policy loss: 0.300398. Value loss: 0.816114. Entropy: 1.181205.
Training network. lr: 0.000052. clip: 0.020752
Iteration 25813: Policy loss: 0.114012. Value loss: 1.706294. Entropy: 1.186976.
Iteration 25814: Policy loss: 0.083167. Value loss: 1.270976. Entropy: 1.183915.
Iteration 25815: Policy loss: 0.070391. Value loss: 1.142139. Entropy: 1.183071.
Training network. lr: 0.000052. clip: 0.020752
Iteration 25816: Policy loss: 0.193006. 

Iteration 25883: Policy loss: 0.158536. Value loss: 0.633965. Entropy: 1.172638.
Iteration 25884: Policy loss: 0.118651. Value loss: 0.490730. Entropy: 1.174187.
Training network. lr: 0.000051. clip: 0.020595
Iteration 25885: Policy loss: -0.022781. Value loss: 2.124765. Entropy: 1.163374.
Iteration 25886: Policy loss: -0.048959. Value loss: 1.472879. Entropy: 1.156497.
Iteration 25887: Policy loss: -0.086448. Value loss: 1.233456. Entropy: 1.160476.
Training network. lr: 0.000051. clip: 0.020595
Iteration 25888: Policy loss: -0.210017. Value loss: 1.837896. Entropy: 1.137321.
Iteration 25889: Policy loss: -0.195077. Value loss: 1.426437. Entropy: 1.130578.
Iteration 25890: Policy loss: -0.158107. Value loss: 1.133384. Entropy: 1.144742.
Training network. lr: 0.000051. clip: 0.020595
Iteration 25891: Policy loss: 0.209142. Value loss: 1.168413. Entropy: 1.135245.
Iteration 25892: Policy loss: 0.212260. Value loss: 0.855973. Entropy: 1.136692.
Iteration 25893: Policy loss: 0.217526. Val

Training network. lr: 0.000051. clip: 0.020291
Iteration 25960: Policy loss: -0.320431. Value loss: 1.759244. Entropy: 1.137565.
Iteration 25961: Policy loss: -0.365070. Value loss: 1.345499. Entropy: 1.146117.
Iteration 25962: Policy loss: -0.336052. Value loss: 1.188063. Entropy: 1.144290.
episode: 3437   score: 28700.0  epsilon: 1.0    steps: 223  evaluation reward: 39421.0
Training network. lr: 0.000051. clip: 0.020291
Iteration 25963: Policy loss: -0.167330. Value loss: 1.158623. Entropy: 1.168327.
Iteration 25964: Policy loss: -0.129127. Value loss: 0.779579. Entropy: 1.171929.
Iteration 25965: Policy loss: -0.156559. Value loss: 0.738693. Entropy: 1.164680.
Training network. lr: 0.000051. clip: 0.020291
Iteration 25966: Policy loss: 0.129016. Value loss: 0.829403. Entropy: 1.149462.
Iteration 25967: Policy loss: 0.159404. Value loss: 0.530032. Entropy: 1.149799.
Iteration 25968: Policy loss: 0.165354. Value loss: 0.408575. Entropy: 1.149225.
Training network. lr: 0.000051. clip:

Iteration 26037: Policy loss: -0.454716. Value loss: 1.233818. Entropy: 1.175295.
Training network. lr: 0.000050. clip: 0.020134
Iteration 26038: Policy loss: 0.152988. Value loss: 1.597352. Entropy: 1.150995.
Iteration 26039: Policy loss: 0.164434. Value loss: 1.084971. Entropy: 1.162024.
Iteration 26040: Policy loss: 0.179040. Value loss: 0.886234. Entropy: 1.157902.
Training network. lr: 0.000050. clip: 0.020134
Iteration 26041: Policy loss: 0.162148. Value loss: 0.726832. Entropy: 1.165068.
Iteration 26042: Policy loss: 0.154029. Value loss: 0.453307. Entropy: 1.161049.
Iteration 26043: Policy loss: 0.182375. Value loss: 0.367114. Entropy: 1.172984.
Training network. lr: 0.000050. clip: 0.020134
Iteration 26044: Policy loss: 0.461531. Value loss: 0.816026. Entropy: 1.179832.
Iteration 26045: Policy loss: 0.420459. Value loss: 0.512220. Entropy: 1.177096.
Iteration 26046: Policy loss: 0.443832. Value loss: 0.417432. Entropy: 1.187584.
episode: 3446   score: 30400.0  epsilon: 1.0    

Training network. lr: 0.000050. clip: 0.019830
Iteration 26113: Policy loss: -0.223888. Value loss: 0.991388. Entropy: 1.185122.
Iteration 26114: Policy loss: -0.200212. Value loss: 0.729262. Entropy: 1.177005.
Iteration 26115: Policy loss: -0.203609. Value loss: 0.677034. Entropy: 1.183700.
Training network. lr: 0.000050. clip: 0.019830
Iteration 26116: Policy loss: 0.056697. Value loss: 0.939088. Entropy: 1.209561.
Iteration 26117: Policy loss: 0.073415. Value loss: 0.621716. Entropy: 1.210077.
Iteration 26118: Policy loss: 0.036648. Value loss: 0.622582. Entropy: 1.208095.
episode: 3456   score: 38400.0  epsilon: 1.0    steps: 551  evaluation reward: 40401.0
Training network. lr: 0.000050. clip: 0.019830
Iteration 26119: Policy loss: -0.005365. Value loss: 1.162215. Entropy: 1.191379.
Iteration 26120: Policy loss: 0.035527. Value loss: 0.882410. Entropy: 1.184821.
Iteration 26121: Policy loss: 0.021934. Value loss: 0.741789. Entropy: 1.193915.
Training network. lr: 0.000050. clip: 0

Training network. lr: 0.000049. clip: 0.019673
Iteration 26191: Policy loss: -0.052027. Value loss: 0.999304. Entropy: 1.224057.
Iteration 26192: Policy loss: -0.043027. Value loss: 0.721844. Entropy: 1.219802.
Iteration 26193: Policy loss: -0.070135. Value loss: 0.586807. Entropy: 1.223231.
Training network. lr: 0.000049. clip: 0.019673
Iteration 26194: Policy loss: 0.269327. Value loss: 0.849704. Entropy: 1.213525.
Iteration 26195: Policy loss: 0.248634. Value loss: 0.544777. Entropy: 1.206227.
Iteration 26196: Policy loss: 0.211509. Value loss: 0.409116. Entropy: 1.212008.
Training network. lr: 0.000049. clip: 0.019673
Iteration 26197: Policy loss: 0.089705. Value loss: 1.307685. Entropy: 1.180171.
Iteration 26198: Policy loss: 0.082343. Value loss: 0.930345. Entropy: 1.171904.
Iteration 26199: Policy loss: 0.103378. Value loss: 0.669178. Entropy: 1.177298.
episode: 3464   score: 17000.0  epsilon: 1.0    steps: 979  evaluation reward: 40776.0
Training network. lr: 0.000049. clip: 0.

Iteration 26266: Policy loss: -0.105825. Value loss: 0.799978. Entropy: 1.195146.
Iteration 26267: Policy loss: -0.084282. Value loss: 0.623932. Entropy: 1.199171.
Iteration 26268: Policy loss: -0.079023. Value loss: 0.540273. Entropy: 1.199396.
Training network. lr: 0.000048. clip: 0.019369
Iteration 26269: Policy loss: -0.099342. Value loss: 1.021409. Entropy: 1.207941.
Iteration 26270: Policy loss: -0.065904. Value loss: 0.633110. Entropy: 1.194675.
Iteration 26271: Policy loss: -0.071750. Value loss: 0.554372. Entropy: 1.209901.
Training network. lr: 0.000048. clip: 0.019369
Iteration 26272: Policy loss: 0.076177. Value loss: 0.778259. Entropy: 1.226463.
Iteration 26273: Policy loss: 0.140902. Value loss: 0.558408. Entropy: 1.224460.
Iteration 26274: Policy loss: 0.107494. Value loss: 0.502476. Entropy: 1.228959.
Training network. lr: 0.000048. clip: 0.019369
Iteration 26275: Policy loss: 0.261435. Value loss: 0.846478. Entropy: 1.237012.
Iteration 26276: Policy loss: 0.285070. Val

Iteration 26343: Policy loss: -0.105815. Value loss: 0.735967. Entropy: 1.272106.
Training network. lr: 0.000048. clip: 0.019213
Iteration 26344: Policy loss: -0.195364. Value loss: 0.996129. Entropy: 1.255368.
Iteration 26345: Policy loss: -0.185642. Value loss: 0.762508. Entropy: 1.247574.
Iteration 26346: Policy loss: -0.209191. Value loss: 0.657140. Entropy: 1.259110.
Training network. lr: 0.000048. clip: 0.019213
Iteration 26347: Policy loss: 0.222104. Value loss: 1.301614. Entropy: 1.202331.
Iteration 26348: Policy loss: 0.243182. Value loss: 1.111649. Entropy: 1.212071.
Iteration 26349: Policy loss: 0.234751. Value loss: 0.925639. Entropy: 1.221120.
episode: 3483   score: 46500.0  epsilon: 1.0    steps: 888  evaluation reward: 40555.0
Training network. lr: 0.000048. clip: 0.019213
Iteration 26350: Policy loss: -0.159351. Value loss: 1.457051. Entropy: 1.214205.
Iteration 26351: Policy loss: -0.119530. Value loss: 1.115970. Entropy: 1.223140.
Iteration 26352: Policy loss: -0.1765

Iteration 26419: Policy loss: 0.280790. Value loss: 0.560798. Entropy: 1.203424.
Iteration 26420: Policy loss: 0.276500. Value loss: 0.405310. Entropy: 1.206436.
Iteration 26421: Policy loss: 0.299544. Value loss: 0.346726. Entropy: 1.201556.
Training network. lr: 0.000047. clip: 0.018908
Iteration 26422: Policy loss: 0.162250. Value loss: 0.609058. Entropy: 1.228872.
Iteration 26423: Policy loss: 0.175376. Value loss: 0.416374. Entropy: 1.238832.
Iteration 26424: Policy loss: 0.195122. Value loss: 0.325619. Entropy: 1.231968.
Training network. lr: 0.000047. clip: 0.018908
Iteration 26425: Policy loss: 0.100933. Value loss: 0.482587. Entropy: 1.241254.
Iteration 26426: Policy loss: 0.147153. Value loss: 0.322992. Entropy: 1.233576.
Iteration 26427: Policy loss: 0.135662. Value loss: 0.250427. Entropy: 1.238155.
episode: 3492   score: 14400.0  epsilon: 1.0    steps: 710  evaluation reward: 39864.0
Training network. lr: 0.000047. clip: 0.018908
Iteration 26428: Policy loss: -0.175337. Va

Training network. lr: 0.000047. clip: 0.018752
Iteration 26497: Policy loss: -0.205626. Value loss: 1.118276. Entropy: 1.222791.
Iteration 26498: Policy loss: -0.177288. Value loss: 0.626203. Entropy: 1.216793.
Iteration 26499: Policy loss: -0.169622. Value loss: 0.543447. Entropy: 1.214451.
episode: 3500   score: 33900.0  epsilon: 1.0    steps: 786  evaluation reward: 39915.0
Training network. lr: 0.000047. clip: 0.018752
Iteration 26500: Policy loss: 0.173760. Value loss: 0.791700. Entropy: 1.252236.
Iteration 26501: Policy loss: 0.141936. Value loss: 0.688258. Entropy: 1.250737.
Iteration 26502: Policy loss: 0.164454. Value loss: 0.603597. Entropy: 1.258084.
now time :  2019-02-27 02:39:37.727822
episode: 3501   score: 49900.0  epsilon: 1.0    steps: 641  evaluation reward: 39758.0
Training network. lr: 0.000046. clip: 0.018595
Iteration 26503: Policy loss: 0.102282. Value loss: 0.799636. Entropy: 1.286736.
Iteration 26504: Policy loss: 0.097321. Value loss: 0.591957. Entropy: 1.287

Iteration 26573: Policy loss: 0.284005. Value loss: 0.985313. Entropy: 1.271528.
Iteration 26574: Policy loss: 0.381636. Value loss: 0.730994. Entropy: 1.267132.
episode: 3509   score: 46500.0  epsilon: 1.0    steps: 788  evaluation reward: 39829.0
Training network. lr: 0.000046. clip: 0.018448
Iteration 26575: Policy loss: 0.066704. Value loss: 0.339717. Entropy: 1.294714.
Iteration 26576: Policy loss: 0.081900. Value loss: 0.216851. Entropy: 1.288600.
Iteration 26577: Policy loss: 0.064673. Value loss: 0.226510. Entropy: 1.295081.
episode: 3510   score: 41100.0  epsilon: 1.0    steps: 676  evaluation reward: 39844.0
Training network. lr: 0.000046. clip: 0.018448
Iteration 26578: Policy loss: 0.142568. Value loss: 0.317469. Entropy: 1.278646.
Iteration 26579: Policy loss: 0.136609. Value loss: 0.238979. Entropy: 1.274918.
Iteration 26580: Policy loss: 0.143482. Value loss: 0.200556. Entropy: 1.277333.
Training network. lr: 0.000046. clip: 0.018448
Iteration 26581: Policy loss: -0.3950

Iteration 26648: Policy loss: -0.103499. Value loss: 0.243073. Entropy: 1.291690.
Iteration 26649: Policy loss: -0.091498. Value loss: 0.233357. Entropy: 1.294680.
Training network. lr: 0.000046. clip: 0.018291
Iteration 26650: Policy loss: -0.033102. Value loss: 1.266331. Entropy: 1.272922.
Iteration 26651: Policy loss: -0.053607. Value loss: 0.937061. Entropy: 1.266898.
Iteration 26652: Policy loss: -0.016945. Value loss: 0.868696. Entropy: 1.272861.
episode: 3520   score: 20500.0  epsilon: 1.0    steps: 209  evaluation reward: 38192.0
Training network. lr: 0.000045. clip: 0.018134
Iteration 26653: Policy loss: -0.139752. Value loss: 1.409888. Entropy: 1.293285.
Iteration 26654: Policy loss: -0.107286. Value loss: 0.973698. Entropy: 1.295658.
Iteration 26655: Policy loss: -0.089407. Value loss: 0.940214. Entropy: 1.293910.
episode: 3521   score: 31100.0  epsilon: 1.0    steps: 549  evaluation reward: 38106.0
Training network. lr: 0.000045. clip: 0.018134
Iteration 26656: Policy loss:

Iteration 26726: Policy loss: -0.063610. Value loss: 0.582832. Entropy: 1.277592.
Iteration 26727: Policy loss: -0.124945. Value loss: 0.494343. Entropy: 1.280099.
Training network. lr: 0.000045. clip: 0.017987
Iteration 26728: Policy loss: -0.155023. Value loss: 1.122834. Entropy: 1.229772.
Iteration 26729: Policy loss: -0.134783. Value loss: 0.759058. Entropy: 1.227140.
Iteration 26730: Policy loss: -0.151135. Value loss: 0.652789. Entropy: 1.221103.
episode: 3528   score: 58400.0  epsilon: 1.0    steps: 861  evaluation reward: 38427.0
Training network. lr: 0.000045. clip: 0.017987
Iteration 26731: Policy loss: -0.017009. Value loss: 1.125734. Entropy: 1.248140.
Iteration 26732: Policy loss: -0.012079. Value loss: 0.724669. Entropy: 1.252623.
Iteration 26733: Policy loss: -0.018172. Value loss: 0.540850. Entropy: 1.251796.
episode: 3529   score: 51400.0  epsilon: 1.0    steps: 572  evaluation reward: 38727.0
Training network. lr: 0.000045. clip: 0.017987
Iteration 26734: Policy loss:

Iteration 26802: Policy loss: 0.074629. Value loss: 0.377037. Entropy: 1.229308.
Training network. lr: 0.000044. clip: 0.017673
Iteration 26803: Policy loss: 0.030209. Value loss: 0.653771. Entropy: 1.210331.
Iteration 26804: Policy loss: -0.000878. Value loss: 0.597731. Entropy: 1.206312.
Iteration 26805: Policy loss: 0.005681. Value loss: 0.468299. Entropy: 1.209871.
Training network. lr: 0.000044. clip: 0.017673
Iteration 26806: Policy loss: -0.396038. Value loss: 1.024735. Entropy: 1.212700.
Iteration 26807: Policy loss: -0.399414. Value loss: 0.584394. Entropy: 1.206070.
Iteration 26808: Policy loss: -0.386393. Value loss: 0.466175. Entropy: 1.213155.
Training network. lr: 0.000044. clip: 0.017673
Iteration 26809: Policy loss: 0.294602. Value loss: 0.625579. Entropy: 1.269570.
Iteration 26810: Policy loss: 0.324808. Value loss: 0.396777. Entropy: 1.261584.
Iteration 26811: Policy loss: 0.303882. Value loss: 0.315010. Entropy: 1.263926.
Training network. lr: 0.000044. clip: 0.01767

Iteration 26879: Policy loss: 0.355304. Value loss: 0.474586. Entropy: 1.244016.
Iteration 26880: Policy loss: 0.336406. Value loss: 0.360086. Entropy: 1.245437.
Training network. lr: 0.000044. clip: 0.017526
Iteration 26881: Policy loss: -0.344445. Value loss: 2.464842. Entropy: 1.217246.
Iteration 26882: Policy loss: -0.319189. Value loss: 1.826423. Entropy: 1.219598.
Iteration 26883: Policy loss: -0.331878. Value loss: 1.694127. Entropy: 1.218570.
Training network. lr: 0.000044. clip: 0.017526
Iteration 26884: Policy loss: -0.106067. Value loss: 0.965831. Entropy: 1.233546.
Iteration 26885: Policy loss: -0.133386. Value loss: 0.825722. Entropy: 1.227066.
Iteration 26886: Policy loss: -0.120010. Value loss: 0.744422. Entropy: 1.226252.
episode: 3547   score: 16200.0  epsilon: 1.0    steps: 154  evaluation reward: 37217.0
Training network. lr: 0.000044. clip: 0.017526
Iteration 26887: Policy loss: -0.029129. Value loss: 1.380125. Entropy: 1.215922.
Iteration 26888: Policy loss: 0.0077

Iteration 26954: Policy loss: 0.081889. Value loss: 0.966821. Entropy: 1.217463.
Iteration 26955: Policy loss: 0.080067. Value loss: 0.863496. Entropy: 1.212214.
Training network. lr: 0.000043. clip: 0.017213
Iteration 26956: Policy loss: 0.097690. Value loss: 0.570543. Entropy: 1.246213.
Iteration 26957: Policy loss: 0.064927. Value loss: 0.516223. Entropy: 1.243642.
Iteration 26958: Policy loss: 0.088823. Value loss: 0.428099. Entropy: 1.245142.
Training network. lr: 0.000043. clip: 0.017213
Iteration 26959: Policy loss: 0.061545. Value loss: 1.424474. Entropy: 1.159804.
Iteration 26960: Policy loss: 0.055113. Value loss: 1.119343. Entropy: 1.170979.
Iteration 26961: Policy loss: 0.038746. Value loss: 1.030147. Entropy: 1.160204.
episode: 3557   score: 45000.0  epsilon: 1.0    steps: 411  evaluation reward: 37796.0
Training network. lr: 0.000043. clip: 0.017213
Iteration 26962: Policy loss: -0.054961. Value loss: 0.762449. Entropy: 1.234431.
Iteration 26963: Policy loss: -0.040032. V

Training network. lr: 0.000043. clip: 0.017065
Iteration 27031: Policy loss: 0.110505. Value loss: 0.521125. Entropy: 1.232030.
Iteration 27032: Policy loss: 0.099279. Value loss: 0.372145. Entropy: 1.234965.
Iteration 27033: Policy loss: 0.087445. Value loss: 0.335657. Entropy: 1.237875.
episode: 3566   score: 33200.0  epsilon: 1.0    steps: 983  evaluation reward: 36941.0
Training network. lr: 0.000043. clip: 0.017065
Iteration 27034: Policy loss: 0.045955. Value loss: 0.902302. Entropy: 1.234814.
Iteration 27035: Policy loss: 0.018564. Value loss: 0.656843. Entropy: 1.239929.
Iteration 27036: Policy loss: 0.069905. Value loss: 0.529577. Entropy: 1.233267.
Training network. lr: 0.000043. clip: 0.017065
Iteration 27037: Policy loss: -0.098237. Value loss: 0.692218. Entropy: 1.241904.
Iteration 27038: Policy loss: -0.100740. Value loss: 0.551550. Entropy: 1.241849.
Iteration 27039: Policy loss: -0.091069. Value loss: 0.334208. Entropy: 1.242665.
Training network. lr: 0.000043. clip: 0.

Training network. lr: 0.000042. clip: 0.016752
Iteration 27106: Policy loss: 0.150892. Value loss: 0.915764. Entropy: 1.224906.
Iteration 27107: Policy loss: 0.114823. Value loss: 0.761350. Entropy: 1.222662.
Iteration 27108: Policy loss: 0.157098. Value loss: 0.563021. Entropy: 1.229161.
Training network. lr: 0.000042. clip: 0.016752
Iteration 27109: Policy loss: 0.176307. Value loss: 1.390492. Entropy: 1.269073.
Iteration 27110: Policy loss: 0.197735. Value loss: 0.899852. Entropy: 1.266790.
Iteration 27111: Policy loss: 0.182184. Value loss: 0.834130. Entropy: 1.270692.
Training network. lr: 0.000042. clip: 0.016752
Iteration 27112: Policy loss: 0.020253. Value loss: 1.056332. Entropy: 1.210618.
Iteration 27113: Policy loss: 0.026165. Value loss: 0.662245. Entropy: 1.224786.
Iteration 27114: Policy loss: 0.069199. Value loss: 0.590715. Entropy: 1.212983.
Training network. lr: 0.000042. clip: 0.016752
Iteration 27115: Policy loss: 0.104894. Value loss: 0.656407. Entropy: 1.232039.
It

Training network. lr: 0.000042. clip: 0.016604
Iteration 27184: Policy loss: -0.133956. Value loss: 1.352823. Entropy: 1.224405.
Iteration 27185: Policy loss: -0.129545. Value loss: 1.193255. Entropy: 1.227277.
Iteration 27186: Policy loss: -0.143575. Value loss: 0.978567. Entropy: 1.219644.
episode: 3585   score: 26600.0  epsilon: 1.0    steps: 476  evaluation reward: 36413.0
Training network. lr: 0.000042. clip: 0.016604
Iteration 27187: Policy loss: -0.279808. Value loss: 1.246311. Entropy: 1.173540.
Iteration 27188: Policy loss: -0.288516. Value loss: 0.989778. Entropy: 1.175616.
Iteration 27189: Policy loss: -0.316688. Value loss: 0.983095. Entropy: 1.176892.
Training network. lr: 0.000042. clip: 0.016604
Iteration 27190: Policy loss: -0.221931. Value loss: 1.066746. Entropy: 1.190940.
Iteration 27191: Policy loss: -0.253683. Value loss: 0.816961. Entropy: 1.181644.
Iteration 27192: Policy loss: -0.208986. Value loss: 0.642386. Entropy: 1.183868.
Training network. lr: 0.000042. cl

Training network. lr: 0.000041. clip: 0.016291
Iteration 27262: Policy loss: -0.068413. Value loss: 1.222971. Entropy: 1.236014.
Iteration 27263: Policy loss: -0.034960. Value loss: 1.044782. Entropy: 1.234399.
Iteration 27264: Policy loss: 0.010155. Value loss: 0.839383. Entropy: 1.239136.
Training network. lr: 0.000041. clip: 0.016291
Iteration 27265: Policy loss: 0.410402. Value loss: 0.499659. Entropy: 1.254180.
Iteration 27266: Policy loss: 0.370625. Value loss: 0.332055. Entropy: 1.254198.
Iteration 27267: Policy loss: 0.411287. Value loss: 0.291299. Entropy: 1.256776.
episode: 3593   score: 20100.0  epsilon: 1.0    steps: 25  evaluation reward: 37254.0
Training network. lr: 0.000041. clip: 0.016291
Iteration 27268: Policy loss: -0.068613. Value loss: 1.067209. Entropy: 1.243204.
Iteration 27269: Policy loss: -0.001067. Value loss: 0.745318. Entropy: 1.240355.
Iteration 27270: Policy loss: -0.040018. Value loss: 0.659748. Entropy: 1.240073.
episode: 3594   score: 23200.0  epsilon

episode: 3603   score: 25500.0  epsilon: 1.0    steps: 564  evaluation reward: 36817.0
Training network. lr: 0.000040. clip: 0.016144
Iteration 27337: Policy loss: -0.184358. Value loss: 1.384058. Entropy: 1.236083.
Iteration 27338: Policy loss: -0.083292. Value loss: 0.922397. Entropy: 1.235175.
Iteration 27339: Policy loss: -0.113018. Value loss: 0.674301. Entropy: 1.229906.
Training network. lr: 0.000040. clip: 0.016144
Iteration 27340: Policy loss: 0.048897. Value loss: 0.935257. Entropy: 1.223991.
Iteration 27341: Policy loss: -0.032840. Value loss: 0.778128. Entropy: 1.228335.
Iteration 27342: Policy loss: 0.054912. Value loss: 0.664100. Entropy: 1.231954.
Training network. lr: 0.000040. clip: 0.016144
Iteration 27343: Policy loss: -0.093202. Value loss: 1.049266. Entropy: 1.235230.
Iteration 27344: Policy loss: -0.051795. Value loss: 0.732366. Entropy: 1.232209.
Iteration 27345: Policy loss: -0.084190. Value loss: 0.657758. Entropy: 1.231806.
Training network. lr: 0.000040. clip

Iteration 27411: Policy loss: 0.144708. Value loss: 0.703260. Entropy: 1.294443.
Training network. lr: 0.000040. clip: 0.015830
Iteration 27412: Policy loss: 0.199925. Value loss: 0.749333. Entropy: 1.264140.
Iteration 27413: Policy loss: 0.180584. Value loss: 0.589619. Entropy: 1.272868.
Iteration 27414: Policy loss: 0.211696. Value loss: 0.451959. Entropy: 1.267503.
Training network. lr: 0.000040. clip: 0.015830
Iteration 27415: Policy loss: 0.080290. Value loss: 1.311388. Entropy: 1.240439.
Iteration 27416: Policy loss: 0.103773. Value loss: 1.028970. Entropy: 1.237481.
Iteration 27417: Policy loss: 0.144181. Value loss: 0.837474. Entropy: 1.236689.
episode: 3615   score: 38700.0  epsilon: 1.0    steps: 387  evaluation reward: 36175.0
Training network. lr: 0.000040. clip: 0.015830
Iteration 27418: Policy loss: 0.145132. Value loss: 0.484825. Entropy: 1.267864.
Iteration 27419: Policy loss: 0.147433. Value loss: 0.363636. Entropy: 1.271754.
Iteration 27420: Policy loss: 0.121186. Val

Iteration 27487: Policy loss: 0.087412. Value loss: 1.757320. Entropy: 1.243007.
Iteration 27488: Policy loss: 0.051152. Value loss: 1.260541. Entropy: 1.242678.
Iteration 27489: Policy loss: 0.034873. Value loss: 1.015628. Entropy: 1.245571.
Training network. lr: 0.000039. clip: 0.015683
Iteration 27490: Policy loss: 0.087696. Value loss: 0.296224. Entropy: 1.263087.
Iteration 27491: Policy loss: 0.081138. Value loss: 0.278077. Entropy: 1.252147.
Iteration 27492: Policy loss: 0.131205. Value loss: 0.208006. Entropy: 1.254400.
Training network. lr: 0.000039. clip: 0.015683
Iteration 27493: Policy loss: -0.108657. Value loss: 1.901691. Entropy: 1.197701.
Iteration 27494: Policy loss: -0.136686. Value loss: 1.301952. Entropy: 1.201368.
Iteration 27495: Policy loss: -0.127348. Value loss: 1.190709. Entropy: 1.201914.
Training network. lr: 0.000039. clip: 0.015683
Iteration 27496: Policy loss: 0.130295. Value loss: 0.737072. Entropy: 1.228493.
Iteration 27497: Policy loss: 0.129457. Value 

Training network. lr: 0.000038. clip: 0.015369
Iteration 27565: Policy loss: -0.081541. Value loss: 0.665146. Entropy: 1.218928.
Iteration 27566: Policy loss: -0.041814. Value loss: 0.520562. Entropy: 1.224214.
Iteration 27567: Policy loss: -0.057650. Value loss: 0.488254. Entropy: 1.227836.
Training network. lr: 0.000038. clip: 0.015369
Iteration 27568: Policy loss: 0.031189. Value loss: 1.347374. Entropy: 1.178848.
Iteration 27569: Policy loss: 0.023425. Value loss: 0.941839. Entropy: 1.185108.
Iteration 27570: Policy loss: -0.003366. Value loss: 0.914983. Entropy: 1.176218.
episode: 3632   score: 21500.0  epsilon: 1.0    steps: 784  evaluation reward: 36058.0
Training network. lr: 0.000038. clip: 0.015369
Iteration 27571: Policy loss: -0.014350. Value loss: 0.541709. Entropy: 1.247812.
Iteration 27572: Policy loss: -0.004899. Value loss: 0.462718. Entropy: 1.246175.
Iteration 27573: Policy loss: -0.014863. Value loss: 0.326450. Entropy: 1.244051.
Training network. lr: 0.000038. clip

Iteration 27640: Policy loss: 0.165387. Value loss: 0.658949. Entropy: 1.266991.
Iteration 27641: Policy loss: 0.167743. Value loss: 0.607413. Entropy: 1.259143.
Iteration 27642: Policy loss: 0.130013. Value loss: 0.686319. Entropy: 1.262913.
Training network. lr: 0.000038. clip: 0.015222
Iteration 27643: Policy loss: -0.151421. Value loss: 1.407221. Entropy: 1.219077.
Iteration 27644: Policy loss: -0.116939. Value loss: 0.925417. Entropy: 1.220660.
Iteration 27645: Policy loss: -0.100947. Value loss: 0.792393. Entropy: 1.218581.
Training network. lr: 0.000038. clip: 0.015222
Iteration 27646: Policy loss: -0.046529. Value loss: 0.759894. Entropy: 1.208070.
Iteration 27647: Policy loss: -0.021586. Value loss: 0.573031. Entropy: 1.212650.
Iteration 27648: Policy loss: -0.043059. Value loss: 0.570282. Entropy: 1.212302.
Training network. lr: 0.000038. clip: 0.015222
Iteration 27649: Policy loss: -0.052545. Value loss: 1.059623. Entropy: 1.185920.
Iteration 27650: Policy loss: -0.062665. V

Training network. lr: 0.000037. clip: 0.014909
Iteration 27718: Policy loss: 0.095738. Value loss: 0.555621. Entropy: 1.193631.
Iteration 27719: Policy loss: 0.125838. Value loss: 0.307748. Entropy: 1.194885.
Iteration 27720: Policy loss: 0.095637. Value loss: 0.253497. Entropy: 1.196821.
episode: 3650   score: 36600.0  epsilon: 1.0    steps: 495  evaluation reward: 36900.0
Training network. lr: 0.000037. clip: 0.014909
Iteration 27721: Policy loss: 0.319414. Value loss: 0.423194. Entropy: 1.196600.
Iteration 27722: Policy loss: 0.319563. Value loss: 0.357379. Entropy: 1.196114.
Iteration 27723: Policy loss: 0.323651. Value loss: 0.297672. Entropy: 1.200225.
Training network. lr: 0.000037. clip: 0.014909
Iteration 27724: Policy loss: 0.097495. Value loss: 1.236699. Entropy: 1.210659.
Iteration 27725: Policy loss: 0.046488. Value loss: 1.021755. Entropy: 1.215120.
Iteration 27726: Policy loss: 0.053964. Value loss: 0.691168. Entropy: 1.217831.
Training network. lr: 0.000037. clip: 0.014

Iteration 27795: Policy loss: -0.003876. Value loss: 1.101725. Entropy: 1.196876.
episode: 3658   score: 20200.0  epsilon: 1.0    steps: 67  evaluation reward: 37191.0
Training network. lr: 0.000037. clip: 0.014761
Iteration 27796: Policy loss: 0.166816. Value loss: 0.444684. Entropy: 1.212675.
Iteration 27797: Policy loss: 0.152681. Value loss: 0.344406. Entropy: 1.216475.
Iteration 27798: Policy loss: 0.176228. Value loss: 0.284698. Entropy: 1.212044.
Training network. lr: 0.000037. clip: 0.014761
Iteration 27799: Policy loss: 0.085151. Value loss: 1.228153. Entropy: 1.204954.
Iteration 27800: Policy loss: 0.019234. Value loss: 0.910256. Entropy: 1.211974.
Iteration 27801: Policy loss: 0.070185. Value loss: 0.876091. Entropy: 1.206541.
Training network. lr: 0.000037. clip: 0.014605
Iteration 27802: Policy loss: -0.470665. Value loss: 1.640174. Entropy: 1.183353.
Iteration 27803: Policy loss: -0.490809. Value loss: 1.346771. Entropy: 1.186668.
Iteration 27804: Policy loss: -0.428657. 

Iteration 27872: Policy loss: -0.115874. Value loss: 0.853623. Entropy: 1.176605.
Iteration 27873: Policy loss: -0.109914. Value loss: 0.613437. Entropy: 1.174543.
episode: 3667   score: 46000.0  epsilon: 1.0    steps: 466  evaluation reward: 37385.0
Training network. lr: 0.000036. clip: 0.014448
Iteration 27874: Policy loss: -0.149090. Value loss: 0.715894. Entropy: 1.184256.
Iteration 27875: Policy loss: -0.087626. Value loss: 0.577220. Entropy: 1.187335.
Iteration 27876: Policy loss: -0.164417. Value loss: 0.644988. Entropy: 1.185814.
episode: 3668   score: 34000.0  epsilon: 1.0    steps: 553  evaluation reward: 37598.0
Training network. lr: 0.000036. clip: 0.014448
Iteration 27877: Policy loss: 0.132984. Value loss: 0.629994. Entropy: 1.183412.
Iteration 27878: Policy loss: 0.131648. Value loss: 0.462777. Entropy: 1.189938.
Iteration 27879: Policy loss: 0.136787. Value loss: 0.425662. Entropy: 1.184452.
episode: 3669   score: 58600.0  epsilon: 1.0    steps: 67  evaluation reward: 3

Iteration 27948: Policy loss: -0.075054. Value loss: 0.768078. Entropy: 1.222439.
Training network. lr: 0.000036. clip: 0.014300
Iteration 27949: Policy loss: 0.180149. Value loss: 0.720510. Entropy: 1.216407.
Iteration 27950: Policy loss: 0.197701. Value loss: 0.611160. Entropy: 1.225024.
Iteration 27951: Policy loss: 0.180255. Value loss: 0.525873. Entropy: 1.224583.
Training network. lr: 0.000035. clip: 0.014144
Iteration 27952: Policy loss: 0.240711. Value loss: 0.846829. Entropy: 1.209313.
Iteration 27953: Policy loss: 0.253938. Value loss: 0.551850. Entropy: 1.218915.
Iteration 27954: Policy loss: 0.199038. Value loss: 0.621949. Entropy: 1.213388.
Training network. lr: 0.000035. clip: 0.014144
Iteration 27955: Policy loss: 0.053962. Value loss: 1.443471. Entropy: 1.224370.
Iteration 27956: Policy loss: 0.032190. Value loss: 1.111119. Entropy: 1.209507.
Iteration 27957: Policy loss: -0.004628. Value loss: 0.863804. Entropy: 1.222080.
Training network. lr: 0.000035. clip: 0.014144


Iteration 28025: Policy loss: -0.247212. Value loss: 1.096671. Entropy: 1.240587.
Iteration 28026: Policy loss: -0.268153. Value loss: 1.011834. Entropy: 1.243552.
Training network. lr: 0.000035. clip: 0.013987
Iteration 28027: Policy loss: -0.100130. Value loss: 1.803903. Entropy: 1.167531.
Iteration 28028: Policy loss: -0.073525. Value loss: 1.238501. Entropy: 1.152620.
Iteration 28029: Policy loss: -0.046108. Value loss: 1.291437. Entropy: 1.173336.
Training network. lr: 0.000035. clip: 0.013987
Iteration 28030: Policy loss: -0.051827. Value loss: 0.978433. Entropy: 1.198687.
Iteration 28031: Policy loss: -0.028338. Value loss: 0.855679. Entropy: 1.197528.
Iteration 28032: Policy loss: -0.048390. Value loss: 0.731926. Entropy: 1.188629.
Training network. lr: 0.000035. clip: 0.013987
Iteration 28033: Policy loss: 0.085722. Value loss: 0.810179. Entropy: 1.182394.
Iteration 28034: Policy loss: 0.048636. Value loss: 0.685950. Entropy: 1.187734.
Iteration 28035: Policy loss: 0.063253. V

Iteration 28102: Policy loss: -0.021466. Value loss: 1.480396. Entropy: 1.214172.
Iteration 28103: Policy loss: 0.008359. Value loss: 1.205930. Entropy: 1.224496.
Iteration 28104: Policy loss: -0.053382. Value loss: 1.114681. Entropy: 1.222025.
episode: 3694   score: 49600.0  epsilon: 1.0    steps: 125  evaluation reward: 38927.0
Training network. lr: 0.000034. clip: 0.013683
Iteration 28105: Policy loss: 0.276417. Value loss: 0.882630. Entropy: 1.186976.
Iteration 28106: Policy loss: 0.280283. Value loss: 0.587818. Entropy: 1.202127.
Iteration 28107: Policy loss: 0.266994. Value loss: 0.625363. Entropy: 1.194607.
Training network. lr: 0.000034. clip: 0.013683
Iteration 28108: Policy loss: 0.108090. Value loss: 0.470743. Entropy: 1.220389.
Iteration 28109: Policy loss: 0.111095. Value loss: 0.416896. Entropy: 1.218139.
Iteration 28110: Policy loss: 0.128816. Value loss: 0.349787. Entropy: 1.223206.
episode: 3695   score: 28500.0  epsilon: 1.0    steps: 943  evaluation reward: 38686.0
T

Iteration 28179: Policy loss: -0.190302. Value loss: 0.509420. Entropy: 1.236560.
Training network. lr: 0.000034. clip: 0.013526
Iteration 28180: Policy loss: -0.003905. Value loss: 1.106649. Entropy: 1.224995.
Iteration 28181: Policy loss: 0.006950. Value loss: 0.729188. Entropy: 1.219430.
Iteration 28182: Policy loss: -0.010902. Value loss: 0.676530. Entropy: 1.218550.
episode: 3703   score: 31300.0  epsilon: 1.0    steps: 929  evaluation reward: 38544.0
Training network. lr: 0.000034. clip: 0.013526
Iteration 28183: Policy loss: 0.027205. Value loss: 0.574875. Entropy: 1.228166.
Iteration 28184: Policy loss: 0.033087. Value loss: 0.389949. Entropy: 1.230519.
Iteration 28185: Policy loss: 0.019000. Value loss: 0.339823. Entropy: 1.228703.
Training network. lr: 0.000034. clip: 0.013526
Iteration 28186: Policy loss: 0.066034. Value loss: 0.998346. Entropy: 1.231059.
Iteration 28187: Policy loss: 0.059791. Value loss: 0.602764. Entropy: 1.220368.
Iteration 28188: Policy loss: 0.051238. 

Training network. lr: 0.000033. clip: 0.013222
Iteration 28258: Policy loss: 0.323900. Value loss: 0.792554. Entropy: 1.230979.
Iteration 28259: Policy loss: 0.327106. Value loss: 0.590732. Entropy: 1.234488.
Iteration 28260: Policy loss: 0.321913. Value loss: 0.546322. Entropy: 1.232245.
Training network. lr: 0.000033. clip: 0.013222
Iteration 28261: Policy loss: -0.070924. Value loss: 1.464962. Entropy: 1.223127.
Iteration 28262: Policy loss: -0.074402. Value loss: 1.338008. Entropy: 1.226025.
Iteration 28263: Policy loss: -0.111892. Value loss: 1.130015. Entropy: 1.222945.
episode: 3710   score: 32000.0  epsilon: 1.0    steps: 255  evaluation reward: 38909.0
Training network. lr: 0.000033. clip: 0.013222
Iteration 28264: Policy loss: -0.226477. Value loss: 1.304891. Entropy: 1.196743.
Iteration 28265: Policy loss: -0.241421. Value loss: 0.946050. Entropy: 1.202189.
Iteration 28266: Policy loss: -0.181704. Value loss: 0.841766. Entropy: 1.197781.
episode: 3711   score: 61400.0  epsil

Iteration 28335: Policy loss: -0.078010. Value loss: 0.916900. Entropy: 1.207800.
Training network. lr: 0.000033. clip: 0.013065
Iteration 28336: Policy loss: 0.136356. Value loss: 1.214057. Entropy: 1.231301.
Iteration 28337: Policy loss: 0.093339. Value loss: 1.047747. Entropy: 1.237391.
Iteration 28338: Policy loss: 0.132766. Value loss: 0.761867. Entropy: 1.232011.
Training network. lr: 0.000033. clip: 0.013065
Iteration 28339: Policy loss: -0.306284. Value loss: 1.637834. Entropy: 1.245283.
Iteration 28340: Policy loss: -0.220121. Value loss: 1.113879. Entropy: 1.248798.
Iteration 28341: Policy loss: -0.318552. Value loss: 1.346517. Entropy: 1.249102.
episode: 3719   score: 45600.0  epsilon: 1.0    steps: 629  evaluation reward: 40261.0
Training network. lr: 0.000033. clip: 0.013065
Iteration 28342: Policy loss: 0.164796. Value loss: 0.999430. Entropy: 1.226962.
Iteration 28343: Policy loss: 0.170872. Value loss: 0.666210. Entropy: 1.225972.
Iteration 28344: Policy loss: 0.137447.

Iteration 28412: Policy loss: 0.202368. Value loss: 0.314381. Entropy: 1.246622.
Iteration 28413: Policy loss: 0.198141. Value loss: 0.316067. Entropy: 1.242551.
episode: 3728   score: 27700.0  epsilon: 1.0    steps: 322  evaluation reward: 40518.0
episode: 3729   score: 37200.0  epsilon: 1.0    steps: 533  evaluation reward: 40523.0
Training network. lr: 0.000032. clip: 0.012761
Iteration 28414: Policy loss: 0.089228. Value loss: 0.495764. Entropy: 1.249332.
Iteration 28415: Policy loss: 0.096976. Value loss: 0.335044. Entropy: 1.249696.
Iteration 28416: Policy loss: 0.113171. Value loss: 0.351680. Entropy: 1.259149.
episode: 3730   score: 57300.0  epsilon: 1.0    steps: 411  evaluation reward: 40627.0
Training network. lr: 0.000032. clip: 0.012761
Iteration 28417: Policy loss: 0.196923. Value loss: 0.495909. Entropy: 1.267293.
Iteration 28418: Policy loss: 0.219391. Value loss: 0.384979. Entropy: 1.273943.
Iteration 28419: Policy loss: 0.212017. Value loss: 0.318124. Entropy: 1.26627

Training network. lr: 0.000032. clip: 0.012605
Iteration 28489: Policy loss: 0.020966. Value loss: 1.061759. Entropy: 1.249375.
Iteration 28490: Policy loss: 0.018860. Value loss: 0.828729. Entropy: 1.234854.
Iteration 28491: Policy loss: -0.026367. Value loss: 0.689754. Entropy: 1.246188.
Training network. lr: 0.000032. clip: 0.012605
Iteration 28492: Policy loss: 0.275105. Value loss: 1.049452. Entropy: 1.233874.
Iteration 28493: Policy loss: 0.231318. Value loss: 0.833436. Entropy: 1.243491.
Iteration 28494: Policy loss: 0.263334. Value loss: 0.668972. Entropy: 1.240421.
Training network. lr: 0.000032. clip: 0.012605
Iteration 28495: Policy loss: 0.244743. Value loss: 1.302017. Entropy: 1.274311.
Iteration 28496: Policy loss: 0.255059. Value loss: 0.881331. Entropy: 1.273253.
Iteration 28497: Policy loss: 0.271577. Value loss: 0.771010. Entropy: 1.282175.
Training network. lr: 0.000032. clip: 0.012605
Iteration 28498: Policy loss: -0.317944. Value loss: 0.914641. Entropy: 1.255310.


Iteration 28566: Policy loss: -0.041293. Value loss: 0.822435. Entropy: 1.198302.
Training network. lr: 0.000031. clip: 0.012301
Iteration 28567: Policy loss: 0.184876. Value loss: 0.740487. Entropy: 1.238109.
Iteration 28568: Policy loss: 0.184294. Value loss: 0.622748. Entropy: 1.237883.
Iteration 28569: Policy loss: 0.152059. Value loss: 0.590444. Entropy: 1.240499.
Training network. lr: 0.000031. clip: 0.012301
Iteration 28570: Policy loss: -0.033616. Value loss: 0.667206. Entropy: 1.235310.
Iteration 28571: Policy loss: -0.053230. Value loss: 0.670443. Entropy: 1.218547.
Iteration 28572: Policy loss: -0.064996. Value loss: 0.532529. Entropy: 1.228509.
Training network. lr: 0.000031. clip: 0.012301
Iteration 28573: Policy loss: -0.331225. Value loss: 2.200796. Entropy: 1.202263.
Iteration 28574: Policy loss: -0.310541. Value loss: 1.924016. Entropy: 1.202886.
Iteration 28575: Policy loss: -0.315414. Value loss: 1.658490. Entropy: 1.199471.
episode: 3746   score: 36500.0  epsilon: 1

Iteration 28642: Policy loss: 0.173628. Value loss: 0.891428. Entropy: 1.251416.
Iteration 28643: Policy loss: 0.110559. Value loss: 0.827626. Entropy: 1.246109.
Iteration 28644: Policy loss: 0.095949. Value loss: 0.693097. Entropy: 1.254557.
Training network. lr: 0.000030. clip: 0.012144
Iteration 28645: Policy loss: 0.050027. Value loss: 1.135223. Entropy: 1.242809.
Iteration 28646: Policy loss: 0.030401. Value loss: 0.984160. Entropy: 1.247738.
Iteration 28647: Policy loss: 0.035373. Value loss: 0.896329. Entropy: 1.245433.
Training network. lr: 0.000030. clip: 0.012144
Iteration 28648: Policy loss: 0.126693. Value loss: 0.999084. Entropy: 1.229291.
Iteration 28649: Policy loss: 0.144772. Value loss: 0.819600. Entropy: 1.223488.
Iteration 28650: Policy loss: 0.128137. Value loss: 0.631339. Entropy: 1.221146.
Training network. lr: 0.000030. clip: 0.011996
Iteration 28651: Policy loss: -0.010672. Value loss: 0.627439. Entropy: 1.233521.
Iteration 28652: Policy loss: 0.009464. Value lo

episode: 3766   score: 28300.0  epsilon: 1.0    steps: 74  evaluation reward: 39589.0
Training network. lr: 0.000030. clip: 0.011840
Iteration 28717: Policy loss: -0.272875. Value loss: 0.765998. Entropy: 1.256640.
Iteration 28718: Policy loss: -0.275868. Value loss: 0.560734. Entropy: 1.252892.
Iteration 28719: Policy loss: -0.295288. Value loss: 0.551850. Entropy: 1.256163.
episode: 3767   score: 32000.0  epsilon: 1.0    steps: 960  evaluation reward: 39449.0
Training network. lr: 0.000030. clip: 0.011840
Iteration 28720: Policy loss: 0.174410. Value loss: 0.265236. Entropy: 1.267345.
Iteration 28721: Policy loss: 0.168906. Value loss: 0.197673. Entropy: 1.265201.
Iteration 28722: Policy loss: 0.141203. Value loss: 0.216597. Entropy: 1.262658.
Training network. lr: 0.000030. clip: 0.011840
Iteration 28723: Policy loss: -0.108191. Value loss: 0.832363. Entropy: 1.294423.
Iteration 28724: Policy loss: -0.125099. Value loss: 0.648471. Entropy: 1.290985.
Iteration 28725: Policy loss: -0.

Iteration 28794: Policy loss: 0.121508. Value loss: 0.588605. Entropy: 1.233435.
episode: 3774   score: 23400.0  epsilon: 1.0    steps: 827  evaluation reward: 38934.0
Training network. lr: 0.000029. clip: 0.011683
Iteration 28795: Policy loss: -0.020211. Value loss: 1.428660. Entropy: 1.247353.
Iteration 28796: Policy loss: -0.003735. Value loss: 1.121886. Entropy: 1.245141.
Iteration 28797: Policy loss: -0.033324. Value loss: 1.032660. Entropy: 1.248485.
Training network. lr: 0.000029. clip: 0.011683
Iteration 28798: Policy loss: 0.170541. Value loss: 0.933991. Entropy: 1.225725.
Iteration 28799: Policy loss: 0.210640. Value loss: 0.706782. Entropy: 1.222596.
Iteration 28800: Policy loss: 0.181207. Value loss: 0.644130. Entropy: 1.221703.
episode: 3775   score: 46600.0  epsilon: 1.0    steps: 568  evaluation reward: 39065.0
Training network. lr: 0.000029. clip: 0.011536
Iteration 28801: Policy loss: 0.094943. Value loss: 1.194074. Entropy: 1.254239.
Iteration 28802: Policy loss: 0.07

episode: 3781   score: 48200.0  epsilon: 1.0    steps: 817  evaluation reward: 39427.0
Training network. lr: 0.000028. clip: 0.011379
Iteration 28873: Policy loss: -0.052694. Value loss: 1.087933. Entropy: 1.197918.
Iteration 28874: Policy loss: -0.011788. Value loss: 0.762829. Entropy: 1.197050.
Iteration 28875: Policy loss: -0.049471. Value loss: 0.638794. Entropy: 1.195477.
Training network. lr: 0.000028. clip: 0.011379
Iteration 28876: Policy loss: -0.056962. Value loss: 0.999039. Entropy: 1.213970.
Iteration 28877: Policy loss: -0.029269. Value loss: 0.625739. Entropy: 1.214517.
Iteration 28878: Policy loss: -0.040629. Value loss: 0.554098. Entropy: 1.206180.
episode: 3782   score: 72900.0  epsilon: 1.0    steps: 346  evaluation reward: 39792.0
Training network. lr: 0.000028. clip: 0.011379
Iteration 28879: Policy loss: -0.028253. Value loss: 1.566631. Entropy: 1.214695.
Iteration 28880: Policy loss: 0.027577. Value loss: 0.908762. Entropy: 1.227402.
Iteration 28881: Policy loss: 

Iteration 28948: Policy loss: -0.136252. Value loss: 0.916078. Entropy: 1.227936.
Iteration 28949: Policy loss: -0.145571. Value loss: 0.766927. Entropy: 1.227462.
Iteration 28950: Policy loss: -0.154382. Value loss: 0.681683. Entropy: 1.226138.
episode: 3791   score: 38500.0  epsilon: 1.0    steps: 308  evaluation reward: 39591.0
Training network. lr: 0.000028. clip: 0.011075
Iteration 28951: Policy loss: -0.088212. Value loss: 2.388649. Entropy: 1.235911.
Iteration 28952: Policy loss: -0.052392. Value loss: 2.123377. Entropy: 1.236880.
Iteration 28953: Policy loss: -0.084251. Value loss: 1.808301. Entropy: 1.231078.
Training network. lr: 0.000028. clip: 0.011075
Iteration 28954: Policy loss: 0.064647. Value loss: 0.968465. Entropy: 1.246371.
Iteration 28955: Policy loss: 0.099783. Value loss: 0.799611. Entropy: 1.240410.
Iteration 28956: Policy loss: 0.061118. Value loss: 0.663506. Entropy: 1.250535.
Training network. lr: 0.000028. clip: 0.011075
Iteration 28957: Policy loss: 0.00641

Iteration 29025: Policy loss: 0.019099. Value loss: 0.493927. Entropy: 1.225895.
Training network. lr: 0.000027. clip: 0.010918
Iteration 29026: Policy loss: 0.385491. Value loss: 1.043224. Entropy: 1.213027.
Iteration 29027: Policy loss: 0.393837. Value loss: 0.781573. Entropy: 1.212520.
Iteration 29028: Policy loss: 0.401074. Value loss: 0.641164. Entropy: 1.212633.
episode: 3800   score: 83600.0  epsilon: 1.0    steps: 135  evaluation reward: 40357.0
now time :  2019-02-27 03:26:23.265654
episode: 3801   score: 46700.0  epsilon: 1.0    steps: 274  evaluation reward: 40242.0
Training network. lr: 0.000027. clip: 0.010918
Iteration 29029: Policy loss: 0.055303. Value loss: 1.092151. Entropy: 1.232456.
Iteration 29030: Policy loss: 0.087311. Value loss: 0.648296. Entropy: 1.240468.
Iteration 29031: Policy loss: 0.038091. Value loss: 0.589307. Entropy: 1.239511.
Training network. lr: 0.000027. clip: 0.010918
Iteration 29032: Policy loss: -0.361109. Value loss: 1.097865. Entropy: 1.21922

Iteration 29101: Policy loss: 0.136170. Value loss: 1.101619. Entropy: 1.217799.
Iteration 29102: Policy loss: 0.104719. Value loss: 1.000907. Entropy: 1.214245.
Iteration 29103: Policy loss: 0.185150. Value loss: 0.652600. Entropy: 1.215322.
episode: 3809   score: 20000.0  epsilon: 1.0    steps: 594  evaluation reward: 39759.0
Training network. lr: 0.000027. clip: 0.010614
Iteration 29104: Policy loss: 0.074713. Value loss: 0.948535. Entropy: 1.224716.
Iteration 29105: Policy loss: 0.144008. Value loss: 0.567981. Entropy: 1.223622.
Iteration 29106: Policy loss: 0.097141. Value loss: 0.692855. Entropy: 1.220156.
episode: 3810   score: 46900.0  epsilon: 1.0    steps: 184  evaluation reward: 39908.0
episode: 3811   score: 25400.0  epsilon: 1.0    steps: 809  evaluation reward: 39548.0
Training network. lr: 0.000027. clip: 0.010614
Iteration 29107: Policy loss: 0.051424. Value loss: 0.555720. Entropy: 1.249698.
Iteration 29108: Policy loss: 0.055081. Value loss: 0.381596. Entropy: 1.24904

Training network. lr: 0.000026. clip: 0.010457
Iteration 29176: Policy loss: -0.052185. Value loss: 0.745236. Entropy: 1.237812.
Iteration 29177: Policy loss: -0.024377. Value loss: 0.494413. Entropy: 1.244092.
Iteration 29178: Policy loss: -0.018886. Value loss: 0.343042. Entropy: 1.244838.
Training network. lr: 0.000026. clip: 0.010457
Iteration 29179: Policy loss: 0.174999. Value loss: 1.013588. Entropy: 1.245859.
Iteration 29180: Policy loss: 0.237541. Value loss: 0.853916. Entropy: 1.240727.
Iteration 29181: Policy loss: 0.215023. Value loss: 0.723570. Entropy: 1.239503.
Training network. lr: 0.000026. clip: 0.010457
Iteration 29182: Policy loss: 0.150392. Value loss: 0.449901. Entropy: 1.229221.
Iteration 29183: Policy loss: 0.162786. Value loss: 0.329598. Entropy: 1.234394.
Iteration 29184: Policy loss: 0.156995. Value loss: 0.313860. Entropy: 1.222434.
Training network. lr: 0.000026. clip: 0.010457
Iteration 29185: Policy loss: -0.040208. Value loss: 1.304595. Entropy: 1.224496

Iteration 29253: Policy loss: 0.300507. Value loss: 0.890660. Entropy: 1.215628.
Training network. lr: 0.000025. clip: 0.010153
Iteration 29254: Policy loss: -0.164540. Value loss: 2.050794. Entropy: 1.242187.
Iteration 29255: Policy loss: -0.168686. Value loss: 1.364761. Entropy: 1.236348.
Iteration 29256: Policy loss: -0.134342. Value loss: 1.201138. Entropy: 1.248231.
Training network. lr: 0.000025. clip: 0.010153
Iteration 29257: Policy loss: 0.189310. Value loss: 0.885931. Entropy: 1.215213.
Iteration 29258: Policy loss: 0.132477. Value loss: 0.832305. Entropy: 1.208438.
Iteration 29259: Policy loss: 0.249055. Value loss: 0.518238. Entropy: 1.220562.
Training network. lr: 0.000025. clip: 0.010153
Iteration 29260: Policy loss: 0.020200. Value loss: 0.660527. Entropy: 1.212495.
Iteration 29261: Policy loss: 0.033367. Value loss: 0.481656. Entropy: 1.213475.
Iteration 29262: Policy loss: 0.007949. Value loss: 0.493666. Entropy: 1.211002.
Training network. lr: 0.000025. clip: 0.010153

Iteration 30: Policy loss: 0.021166. Value loss: 0.006463. Entropy: 2.875834.
Training network. lr: 0.000250. clip: 0.100000
Iteration 31: Policy loss: 0.006931. Value loss: 0.011376. Entropy: 2.877586.
Iteration 32: Policy loss: 0.004881. Value loss: 0.008264. Entropy: 2.877537.
Iteration 33: Policy loss: 0.002988. Value loss: 0.007036. Entropy: 2.874729.
Training network. lr: 0.000250. clip: 0.100000
Iteration 34: Policy loss: 0.027813. Value loss: 0.005716. Entropy: 2.877801.
Iteration 35: Policy loss: 0.024981. Value loss: 0.004482. Entropy: 2.872584.
Iteration 36: Policy loss: 0.027869. Value loss: 0.003601. Entropy: 2.867854.
episode: 9   score: 200.0  epsilon: 1.0    steps: 489  evaluation reward: 168.88888888888889
episode: 10   score: 210.0  epsilon: 1.0    steps: 708  evaluation reward: 173.0
episode: 11   score: 130.0  epsilon: 1.0    steps: 1005  evaluation reward: 169.0909090909091
Training network. lr: 0.000250. clip: 0.100000
Iteration 37: Policy loss: -0.019329. Value l

Iteration 94: Policy loss: 0.010313. Value loss: 0.012832. Entropy: 2.831367.
Iteration 95: Policy loss: 0.005556. Value loss: 0.009736. Entropy: 2.834796.
Iteration 96: Policy loss: 0.012566. Value loss: 0.007208. Entropy: 2.831837.
episode: 34   score: 180.0  epsilon: 1.0    steps: 461  evaluation reward: 195.58823529411765
episode: 35   score: 250.0  epsilon: 1.0    steps: 891  evaluation reward: 197.14285714285714
Training network. lr: 0.000250. clip: 0.099853
Iteration 97: Policy loss: -0.007514. Value loss: 0.011432. Entropy: 2.827548.
Iteration 98: Policy loss: -0.011010. Value loss: 0.008998. Entropy: 2.834219.
Iteration 99: Policy loss: -0.007851. Value loss: 0.007815. Entropy: 2.831923.
episode: 36   score: 130.0  epsilon: 1.0    steps: 299  evaluation reward: 195.27777777777777
Training network. lr: 0.000250. clip: 0.099853
Iteration 100: Policy loss: 0.033740. Value loss: 0.012654. Entropy: 2.829029.
Iteration 101: Policy loss: 0.035166. Value loss: 0.009300. Entropy: 2.828

Iteration 157: Policy loss: -0.018628. Value loss: 0.011654. Entropy: 2.798806.
Iteration 158: Policy loss: -0.016677. Value loss: 0.007066. Entropy: 2.803659.
Iteration 159: Policy loss: -0.021458. Value loss: 0.005198. Entropy: 2.804785.
Training network. lr: 0.000249. clip: 0.099548
Iteration 160: Policy loss: -0.001392. Value loss: 0.015263. Entropy: 2.820411.
Iteration 161: Policy loss: 0.003296. Value loss: 0.007872. Entropy: 2.821234.
Iteration 162: Policy loss: 0.000579. Value loss: 0.005593. Entropy: 2.819751.
episode: 58   score: 240.0  epsilon: 1.0    steps: 366  evaluation reward: 214.13793103448276
episode: 59   score: 240.0  epsilon: 1.0    steps: 389  evaluation reward: 214.57627118644066
episode: 60   score: 320.0  epsilon: 1.0    steps: 910  evaluation reward: 216.33333333333334
Training network. lr: 0.000249. clip: 0.099548
Iteration 163: Policy loss: 0.037452. Value loss: 0.010618. Entropy: 2.814254.
Iteration 164: Policy loss: 0.039310. Value loss: 0.008300. Entropy

Iteration 219: Policy loss: 0.000249. Value loss: 0.005321. Entropy: 2.826818.
Training network. lr: 0.000248. clip: 0.099392
Iteration 220: Policy loss: 0.035389. Value loss: 0.007157. Entropy: 2.847461.
Iteration 221: Policy loss: 0.032905. Value loss: 0.003229. Entropy: 2.849872.
Iteration 222: Policy loss: 0.033405. Value loss: 0.002021. Entropy: 2.850549.
Training network. lr: 0.000248. clip: 0.099392
Iteration 223: Policy loss: 0.009588. Value loss: 0.008293. Entropy: 2.863628.
Iteration 224: Policy loss: 0.013950. Value loss: 0.004953. Entropy: 2.864675.
Iteration 225: Policy loss: 0.011079. Value loss: 0.003816. Entropy: 2.862093.
episode: 84   score: 150.0  epsilon: 1.0    steps: 800  evaluation reward: 218.33333333333334
Training network. lr: 0.000248. clip: 0.099392
Iteration 226: Policy loss: 0.000123. Value loss: 0.007715. Entropy: 2.861798.
Iteration 227: Policy loss: -0.001784. Value loss: 0.006075. Entropy: 2.864558.
Iteration 228: Policy loss: -0.001443. Value loss: 0.

Iteration 282: Policy loss: 0.014356. Value loss: 0.004667. Entropy: 2.826619.
episode: 109   score: 200.0  epsilon: 1.0    steps: 928  evaluation reward: 220.6
Training network. lr: 0.000248. clip: 0.099235
Iteration 283: Policy loss: 0.025757. Value loss: 0.007496. Entropy: 2.825684.
Iteration 284: Policy loss: 0.024671. Value loss: 0.004560. Entropy: 2.826578.
Iteration 285: Policy loss: 0.026593. Value loss: 0.003698. Entropy: 2.829426.
episode: 110   score: 200.0  epsilon: 1.0    steps: 161  evaluation reward: 220.5
Training network. lr: 0.000248. clip: 0.099235
Iteration 286: Policy loss: 0.003494. Value loss: 0.008195. Entropy: 2.830608.
Iteration 287: Policy loss: -0.000572. Value loss: 0.005290. Entropy: 2.836297.
Iteration 288: Policy loss: 0.000857. Value loss: 0.004895. Entropy: 2.834637.
episode: 111   score: 190.0  epsilon: 1.0    steps: 27  evaluation reward: 221.1
Training network. lr: 0.000248. clip: 0.099235
Iteration 289: Policy loss: 0.045519. Value loss: 0.008870. 

Iteration 348: Policy loss: -0.009257. Value loss: 0.003085. Entropy: 2.858551.
episode: 133   score: 380.0  epsilon: 1.0    steps: 77  evaluation reward: 223.2
episode: 134   score: 200.0  epsilon: 1.0    steps: 140  evaluation reward: 223.4
episode: 135   score: 280.0  epsilon: 1.0    steps: 910  evaluation reward: 223.7
Training network. lr: 0.000248. clip: 0.099088
Iteration 349: Policy loss: 0.008813. Value loss: 0.011954. Entropy: 2.823435.
Iteration 350: Policy loss: 0.006397. Value loss: 0.006668. Entropy: 2.824287.
Iteration 351: Policy loss: 0.005561. Value loss: 0.008074. Entropy: 2.822094.
episode: 136   score: 230.0  epsilon: 1.0    steps: 346  evaluation reward: 224.7
episode: 137   score: 280.0  epsilon: 1.0    steps: 396  evaluation reward: 224.8
Training network. lr: 0.000247. clip: 0.098931
Iteration 352: Policy loss: -0.006791. Value loss: 0.010816. Entropy: 2.815106.
Iteration 353: Policy loss: -0.011665. Value loss: 0.008629. Entropy: 2.811288.
Iteration 354: Polic

Iteration 411: Policy loss: 0.043301. Value loss: 0.010061. Entropy: 2.806110.
Training network. lr: 0.000247. clip: 0.098774
Iteration 412: Policy loss: 0.033222. Value loss: 0.013018. Entropy: 2.811822.
Iteration 413: Policy loss: 0.030347. Value loss: 0.005617. Entropy: 2.804758.
Iteration 414: Policy loss: 0.025226. Value loss: 0.004623. Entropy: 2.799268.
Training network. lr: 0.000247. clip: 0.098774
Iteration 415: Policy loss: -0.168515. Value loss: 0.722404. Entropy: 2.821967.
Iteration 416: Policy loss: -0.176099. Value loss: 0.387108. Entropy: 2.808421.
Iteration 417: Policy loss: -0.141450. Value loss: 0.215817. Entropy: 2.795343.
episode: 160   score: 910.0  epsilon: 1.0    steps: 31  evaluation reward: 237.8
episode: 161   score: 220.0  epsilon: 1.0    steps: 252  evaluation reward: 237.6
episode: 162   score: 320.0  epsilon: 1.0    steps: 369  evaluation reward: 238.0
Training network. lr: 0.000247. clip: 0.098774
Iteration 418: Policy loss: 0.002293. Value loss: 0.058244

episode: 184   score: 210.0  epsilon: 1.0    steps: 706  evaluation reward: 254.5
Training network. lr: 0.000247. clip: 0.098627
Iteration 478: Policy loss: 0.046986. Value loss: 0.011531. Entropy: 2.797140.
Iteration 479: Policy loss: 0.044361. Value loss: 0.006777. Entropy: 2.793912.
Iteration 480: Policy loss: 0.047806. Value loss: 0.005648. Entropy: 2.794776.
episode: 185   score: 220.0  epsilon: 1.0    steps: 507  evaluation reward: 254.9
episode: 186   score: 230.0  epsilon: 1.0    steps: 836  evaluation reward: 255.5
Training network. lr: 0.000247. clip: 0.098627
Iteration 481: Policy loss: -0.001076. Value loss: 0.014229. Entropy: 2.794389.
Iteration 482: Policy loss: -0.009493. Value loss: 0.010684. Entropy: 2.794380.
Iteration 483: Policy loss: -0.003710. Value loss: 0.006584. Entropy: 2.794518.
episode: 187   score: 180.0  epsilon: 1.0    steps: 231  evaluation reward: 253.6
Training network. lr: 0.000247. clip: 0.098627
Iteration 484: Policy loss: 0.030727. Value loss: 0.01

Iteration 541: Policy loss: -0.008902. Value loss: 0.010516. Entropy: 2.809926.
Iteration 542: Policy loss: -0.010083. Value loss: 0.006511. Entropy: 2.808495.
Iteration 543: Policy loss: -0.013411. Value loss: 0.007361. Entropy: 2.817199.
Training network. lr: 0.000246. clip: 0.098470
Iteration 544: Policy loss: -0.010823. Value loss: 0.006673. Entropy: 2.853477.
Iteration 545: Policy loss: -0.009570. Value loss: 0.002810. Entropy: 2.853874.
Iteration 546: Policy loss: -0.016800. Value loss: 0.001937. Entropy: 2.853882.
episode: 210   score: 220.0  epsilon: 1.0    steps: 215  evaluation reward: 264.5
episode: 211   score: 290.0  epsilon: 1.0    steps: 863  evaluation reward: 265.5
Training network. lr: 0.000246. clip: 0.098470
Iteration 547: Policy loss: -0.014920. Value loss: 0.009821. Entropy: 2.829480.
Iteration 548: Policy loss: -0.008459. Value loss: 0.006204. Entropy: 2.826408.
Iteration 549: Policy loss: -0.019315. Value loss: 0.007618. Entropy: 2.823305.
episode: 212   score: 

Iteration 606: Policy loss: 0.007809. Value loss: 0.004436. Entropy: 2.836478.
episode: 235   score: 220.0  epsilon: 1.0    steps: 902  evaluation reward: 278.7
Training network. lr: 0.000245. clip: 0.098166
Iteration 607: Policy loss: 0.039562. Value loss: 0.005736. Entropy: 2.816800.
Iteration 608: Policy loss: 0.038755. Value loss: 0.003721. Entropy: 2.820501.
Iteration 609: Policy loss: 0.040744. Value loss: 0.005311. Entropy: 2.818301.
Training network. lr: 0.000245. clip: 0.098166
Iteration 610: Policy loss: -0.069521. Value loss: 0.012921. Entropy: 2.839781.
Iteration 611: Policy loss: -0.077597. Value loss: 0.007523. Entropy: 2.835339.
Iteration 612: Policy loss: -0.066326. Value loss: 0.004665. Entropy: 2.836760.
episode: 236   score: 370.0  epsilon: 1.0    steps: 56  evaluation reward: 280.1
episode: 237   score: 250.0  epsilon: 1.0    steps: 295  evaluation reward: 279.8
episode: 238   score: 240.0  epsilon: 1.0    steps: 666  evaluation reward: 278.8
Training network. lr: 0

episode: 261   score: 220.0  epsilon: 1.0    steps: 892  evaluation reward: 271.6
Training network. lr: 0.000245. clip: 0.098009
Iteration 670: Policy loss: -0.043328. Value loss: 0.013856. Entropy: 2.814384.
Iteration 671: Policy loss: -0.047524. Value loss: 0.007902. Entropy: 2.819118.
Iteration 672: Policy loss: -0.041968. Value loss: 0.004340. Entropy: 2.814941.
episode: 262   score: 330.0  epsilon: 1.0    steps: 226  evaluation reward: 271.7
episode: 263   score: 340.0  epsilon: 1.0    steps: 453  evaluation reward: 272.4
Training network. lr: 0.000245. clip: 0.098009
Iteration 673: Policy loss: 0.027105. Value loss: 0.013813. Entropy: 2.783579.
Iteration 674: Policy loss: 0.021250. Value loss: 0.008690. Entropy: 2.784425.
Iteration 675: Policy loss: 0.023010. Value loss: 0.007939. Entropy: 2.784668.
episode: 264   score: 260.0  epsilon: 1.0    steps: 532  evaluation reward: 273.9
episode: 265   score: 350.0  epsilon: 1.0    steps: 756  evaluation reward: 273.8
Training network. l

Training network. lr: 0.000245. clip: 0.097853
Iteration 733: Policy loss: -0.000447. Value loss: 0.013987. Entropy: 2.767500.
Iteration 734: Policy loss: -0.007365. Value loss: 0.010928. Entropy: 2.768721.
Iteration 735: Policy loss: -0.007563. Value loss: 0.008557. Entropy: 2.760756.
Training network. lr: 0.000245. clip: 0.097853
Iteration 736: Policy loss: 0.011563. Value loss: 0.008166. Entropy: 2.793243.
Iteration 737: Policy loss: 0.010379. Value loss: 0.004116. Entropy: 2.792753.
Iteration 738: Policy loss: 0.009640. Value loss: 0.002888. Entropy: 2.796619.
episode: 288   score: 390.0  epsilon: 1.0    steps: 33  evaluation reward: 274.1
episode: 289   score: 230.0  epsilon: 1.0    steps: 692  evaluation reward: 273.0
episode: 290   score: 390.0  epsilon: 1.0    steps: 943  evaluation reward: 274.9
Training network. lr: 0.000245. clip: 0.097853
Iteration 739: Policy loss: -0.001721. Value loss: 0.014286. Entropy: 2.761238.
Iteration 740: Policy loss: -0.004105. Value loss: 0.0087

Iteration 796: Policy loss: 0.024126. Value loss: 0.015522. Entropy: 2.761238.
Iteration 797: Policy loss: 0.019610. Value loss: 0.010121. Entropy: 2.757376.
Iteration 798: Policy loss: 0.023395. Value loss: 0.008176. Entropy: 2.758766.
Training network. lr: 0.000244. clip: 0.097705
Iteration 799: Policy loss: -0.018575. Value loss: 0.012293. Entropy: 2.811430.
Iteration 800: Policy loss: -0.014670. Value loss: 0.007122. Entropy: 2.808156.
Iteration 801: Policy loss: -0.021277. Value loss: 0.004815. Entropy: 2.808942.
episode: 314   score: 260.0  epsilon: 1.0    steps: 608  evaluation reward: 282.9
episode: 315   score: 320.0  epsilon: 1.0    steps: 925  evaluation reward: 282.7
Training network. lr: 0.000244. clip: 0.097549
Iteration 802: Policy loss: 0.017471. Value loss: 0.011409. Entropy: 2.784716.
Iteration 803: Policy loss: 0.013537. Value loss: 0.010427. Entropy: 2.781845.
Iteration 804: Policy loss: 0.012865. Value loss: 0.006936. Entropy: 2.781261.
episode: 316   score: 330.0 

Iteration 862: Policy loss: -0.004168. Value loss: 0.005959. Entropy: 2.748863.
Iteration 863: Policy loss: -0.008291. Value loss: 0.003399. Entropy: 2.751373.
Iteration 864: Policy loss: -0.012714. Value loss: 0.002774. Entropy: 2.751976.
episode: 338   score: 280.0  epsilon: 1.0    steps: 443  evaluation reward: 296.9
episode: 339   score: 220.0  epsilon: 1.0    steps: 808  evaluation reward: 296.2
Training network. lr: 0.000243. clip: 0.097392
Iteration 865: Policy loss: 0.038324. Value loss: 0.013996. Entropy: 2.738586.
Iteration 866: Policy loss: 0.032332. Value loss: 0.015907. Entropy: 2.736794.
Iteration 867: Policy loss: 0.029809. Value loss: 0.011430. Entropy: 2.736031.
Training network. lr: 0.000243. clip: 0.097392
Iteration 868: Policy loss: 0.059898. Value loss: 0.010293. Entropy: 2.794015.
Iteration 869: Policy loss: 0.056664. Value loss: 0.006275. Entropy: 2.790185.
Iteration 870: Policy loss: 0.053904. Value loss: 0.004391. Entropy: 2.786655.
Training network. lr: 0.0002

Iteration 926: Policy loss: -0.001046. Value loss: 0.006175. Entropy: 2.724224.
Iteration 927: Policy loss: -0.001361. Value loss: 0.010225. Entropy: 2.719075.
Training network. lr: 0.000243. clip: 0.097244
Iteration 928: Policy loss: 0.023323. Value loss: 0.008905. Entropy: 2.795546.
Iteration 929: Policy loss: 0.019200. Value loss: 0.005822. Entropy: 2.789300.
Iteration 930: Policy loss: 0.019415. Value loss: 0.005130. Entropy: 2.789751.
episode: 364   score: 310.0  epsilon: 1.0    steps: 216  evaluation reward: 302.5
episode: 365   score: 300.0  epsilon: 1.0    steps: 761  evaluation reward: 302.0
Training network. lr: 0.000243. clip: 0.097244
Iteration 931: Policy loss: 0.062385. Value loss: 0.010442. Entropy: 2.803228.
Iteration 932: Policy loss: 0.058835. Value loss: 0.006879. Entropy: 2.809321.
Iteration 933: Policy loss: 0.059537. Value loss: 0.006054. Entropy: 2.810890.
Training network. lr: 0.000243. clip: 0.097244
Iteration 934: Policy loss: 0.035378. Value loss: 0.004871. E

Training network. lr: 0.000243. clip: 0.097088
Iteration 991: Policy loss: 0.022841. Value loss: 0.010568. Entropy: 2.756463.
Iteration 992: Policy loss: 0.021773. Value loss: 0.007630. Entropy: 2.758466.
Iteration 993: Policy loss: 0.019905. Value loss: 0.007424. Entropy: 2.753191.
episode: 389   score: 330.0  epsilon: 1.0    steps: 108  evaluation reward: 303.9
Training network. lr: 0.000243. clip: 0.097088
Iteration 994: Policy loss: -0.010473. Value loss: 0.011310. Entropy: 2.793995.
Iteration 995: Policy loss: -0.013647. Value loss: 0.008358. Entropy: 2.789547.
Iteration 996: Policy loss: -0.013370. Value loss: 0.003785. Entropy: 2.787430.
episode: 390   score: 280.0  epsilon: 1.0    steps: 746  evaluation reward: 302.8
Training network. lr: 0.000243. clip: 0.097088
Iteration 997: Policy loss: 0.000570. Value loss: 0.011224. Entropy: 2.776662.
Iteration 998: Policy loss: 0.005480. Value loss: 0.007306. Entropy: 2.778564.
Iteration 999: Policy loss: 0.005883. Value loss: 0.006104. 

Iteration 1056: Policy loss: 0.030863. Value loss: 0.016006. Entropy: 2.753948.
episode: 413   score: 280.0  epsilon: 1.0    steps: 740  evaluation reward: 325.3
Training network. lr: 0.000242. clip: 0.096784
Iteration 1057: Policy loss: 0.053421. Value loss: 0.015327. Entropy: 2.758989.
Iteration 1058: Policy loss: 0.053977. Value loss: 0.006960. Entropy: 2.745270.
Iteration 1059: Policy loss: 0.045304. Value loss: 0.009425. Entropy: 2.736015.
episode: 414   score: 320.0  epsilon: 1.0    steps: 620  evaluation reward: 325.9
Training network. lr: 0.000242. clip: 0.096784
Iteration 1060: Policy loss: 0.023995. Value loss: 0.017751. Entropy: 2.734218.
Iteration 1061: Policy loss: 0.018259. Value loss: 0.013008. Entropy: 2.742083.
Iteration 1062: Policy loss: 0.016171. Value loss: 0.011183. Entropy: 2.743857.
Training network. lr: 0.000242. clip: 0.096784
Iteration 1063: Policy loss: 0.052373. Value loss: 0.017060. Entropy: 2.732965.
Iteration 1064: Policy loss: 0.051807. Value loss: 0.01

Iteration 1121: Policy loss: 0.010859. Value loss: 0.009042. Entropy: 2.730247.
Iteration 1122: Policy loss: 0.014606. Value loss: 0.004757. Entropy: 2.728490.
Training network. lr: 0.000242. clip: 0.096627
Iteration 1123: Policy loss: 0.017202. Value loss: 0.010790. Entropy: 2.786943.
Iteration 1124: Policy loss: 0.011944. Value loss: 0.006030. Entropy: 2.790022.
Iteration 1125: Policy loss: 0.012442. Value loss: 0.004082. Entropy: 2.793288.
Training network. lr: 0.000242. clip: 0.096627
Iteration 1126: Policy loss: 0.023706. Value loss: 0.008694. Entropy: 2.800018.
Iteration 1127: Policy loss: 0.024559. Value loss: 0.005160. Entropy: 2.804316.
Iteration 1128: Policy loss: 0.021316. Value loss: 0.004248. Entropy: 2.798558.
episode: 437   score: 430.0  epsilon: 1.0    steps: 256  evaluation reward: 325.9
episode: 438   score: 370.0  epsilon: 1.0    steps: 569  evaluation reward: 326.8
episode: 439   score: 300.0  epsilon: 1.0    steps: 733  evaluation reward: 327.6
Training network. lr

Training network. lr: 0.000241. clip: 0.096470
Iteration 1186: Policy loss: -0.009948. Value loss: 0.010994. Entropy: 2.737074.
Iteration 1187: Policy loss: -0.017378. Value loss: 0.007120. Entropy: 2.723258.
Iteration 1188: Policy loss: -0.013311. Value loss: 0.003518. Entropy: 2.718996.
episode: 461   score: 220.0  epsilon: 1.0    steps: 556  evaluation reward: 338.8
episode: 462   score: 320.0  epsilon: 1.0    steps: 686  evaluation reward: 339.3
Training network. lr: 0.000241. clip: 0.096470
Iteration 1189: Policy loss: 0.025429. Value loss: 0.009235. Entropy: 2.735630.
Iteration 1190: Policy loss: 0.020861. Value loss: 0.006444. Entropy: 2.730770.
Iteration 1191: Policy loss: 0.018887. Value loss: 0.008143. Entropy: 2.731987.
Training network. lr: 0.000241. clip: 0.096470
Iteration 1192: Policy loss: 0.021878. Value loss: 0.009297. Entropy: 2.776093.
Iteration 1193: Policy loss: 0.026429. Value loss: 0.004301. Entropy: 2.772407.
Iteration 1194: Policy loss: 0.024610. Value loss: 0

Iteration 1251: Policy loss: 0.025304. Value loss: 0.006526. Entropy: 2.749820.
episode: 485   score: 250.0  epsilon: 1.0    steps: 328  evaluation reward: 345.8
episode: 486   score: 300.0  epsilon: 1.0    steps: 916  evaluation reward: 345.7
Training network. lr: 0.000240. clip: 0.096166
Iteration 1252: Policy loss: 0.020963. Value loss: 0.010969. Entropy: 2.753643.
Iteration 1253: Policy loss: 0.023829. Value loss: 0.005079. Entropy: 2.751844.
Iteration 1254: Policy loss: 0.021957. Value loss: 0.004687. Entropy: 2.753561.
episode: 487   score: 320.0  epsilon: 1.0    steps: 670  evaluation reward: 345.3
Training network. lr: 0.000240. clip: 0.096166
Iteration 1255: Policy loss: 0.007901. Value loss: 0.015478. Entropy: 2.769493.
Iteration 1256: Policy loss: 0.006498. Value loss: 0.009944. Entropy: 2.764949.
Iteration 1257: Policy loss: -0.001374. Value loss: 0.006964. Entropy: 2.757045.
episode: 488   score: 320.0  epsilon: 1.0    steps: 127  evaluation reward: 346.3
Training network.

Training network. lr: 0.000240. clip: 0.096009
Iteration 1315: Policy loss: -0.022459. Value loss: 0.009777. Entropy: 2.745618.
Iteration 1316: Policy loss: -0.025891. Value loss: 0.005044. Entropy: 2.753018.
Iteration 1317: Policy loss: -0.033315. Value loss: 0.003849. Entropy: 2.750265.
episode: 510   score: 340.0  epsilon: 1.0    steps: 170  evaluation reward: 327.1
Training network. lr: 0.000240. clip: 0.096009
Iteration 1318: Policy loss: 0.005070. Value loss: 0.011116. Entropy: 2.770328.
Iteration 1319: Policy loss: 0.001091. Value loss: 0.007106. Entropy: 2.776622.
Iteration 1320: Policy loss: -0.000093. Value loss: 0.005688. Entropy: 2.777064.
episode: 511   score: 430.0  epsilon: 1.0    steps: 64  evaluation reward: 328.3
episode: 512   score: 250.0  epsilon: 1.0    steps: 588  evaluation reward: 327.5
Training network. lr: 0.000240. clip: 0.096009
Iteration 1321: Policy loss: 0.017748. Value loss: 0.012835. Entropy: 2.778207.
Iteration 1322: Policy loss: 0.010959. Value loss:

Iteration 1380: Policy loss: 0.063429. Value loss: 0.008474. Entropy: 2.745646.
episode: 534   score: 940.0  epsilon: 1.0    steps: 674  evaluation reward: 335.7
Training network. lr: 0.000240. clip: 0.095862
Iteration 1381: Policy loss: -0.007748. Value loss: 0.019576. Entropy: 2.726974.
Iteration 1382: Policy loss: -0.004326. Value loss: 0.011140. Entropy: 2.721327.
Iteration 1383: Policy loss: -0.008339. Value loss: 0.009155. Entropy: 2.718200.
episode: 535   score: 340.0  epsilon: 1.0    steps: 295  evaluation reward: 334.8
episode: 536   score: 390.0  epsilon: 1.0    steps: 588  evaluation reward: 336.8
Training network. lr: 0.000240. clip: 0.095862
Iteration 1384: Policy loss: 0.072098. Value loss: 0.015933. Entropy: 2.765458.
Iteration 1385: Policy loss: 0.059400. Value loss: 0.010955. Entropy: 2.755729.
Iteration 1386: Policy loss: 0.065622. Value loss: 0.008775. Entropy: 2.758088.
Training network. lr: 0.000240. clip: 0.095862
Iteration 1387: Policy loss: 0.038044. Value loss:

episode: 559   score: 290.0  epsilon: 1.0    steps: 272  evaluation reward: 328.0
Training network. lr: 0.000239. clip: 0.095705
Iteration 1444: Policy loss: 0.053748. Value loss: 0.010553. Entropy: 2.764011.
Iteration 1445: Policy loss: 0.055682. Value loss: 0.007054. Entropy: 2.754941.
Iteration 1446: Policy loss: 0.055474. Value loss: 0.006948. Entropy: 2.752554.
episode: 560   score: 320.0  epsilon: 1.0    steps: 155  evaluation reward: 327.7
Training network. lr: 0.000239. clip: 0.095705
Iteration 1447: Policy loss: 0.010184. Value loss: 0.014368. Entropy: 2.755965.
Iteration 1448: Policy loss: 0.012757. Value loss: 0.007175. Entropy: 2.751096.
Iteration 1449: Policy loss: 0.007078. Value loss: 0.008377. Entropy: 2.750638.
episode: 561   score: 320.0  epsilon: 1.0    steps: 766  evaluation reward: 328.7
episode: 562   score: 280.0  epsilon: 1.0    steps: 863  evaluation reward: 328.3
Training network. lr: 0.000239. clip: 0.095705
Iteration 1450: Policy loss: -0.012906. Value loss:

Iteration 1507: Policy loss: 0.017884. Value loss: 0.012067. Entropy: 2.781127.
Iteration 1508: Policy loss: 0.016343. Value loss: 0.007607. Entropy: 2.781698.
Iteration 1509: Policy loss: 0.012983. Value loss: 0.006687. Entropy: 2.781397.
episode: 585   score: 410.0  epsilon: 1.0    steps: 856  evaluation reward: 333.0
episode: 586   score: 190.0  epsilon: 1.0    steps: 980  evaluation reward: 331.9
Training network. lr: 0.000239. clip: 0.095401
Iteration 1510: Policy loss: 0.015250. Value loss: 0.011715. Entropy: 2.771406.
Iteration 1511: Policy loss: 0.016509. Value loss: 0.006995. Entropy: 2.778880.
Iteration 1512: Policy loss: 0.010725. Value loss: 0.005782. Entropy: 2.779200.
episode: 587   score: 350.0  epsilon: 1.0    steps: 502  evaluation reward: 332.2
Training network. lr: 0.000239. clip: 0.095401
Iteration 1513: Policy loss: -0.003293. Value loss: 0.011017. Entropy: 2.763953.
Iteration 1514: Policy loss: -0.007600. Value loss: 0.006541. Entropy: 2.763231.
Iteration 1515: Po

Iteration 1572: Policy loss: -0.024447. Value loss: 0.007891. Entropy: 2.791491.
episode: 609   score: 260.0  epsilon: 1.0    steps: 662  evaluation reward: 341.6
episode: 610   score: 330.0  epsilon: 1.0    steps: 805  evaluation reward: 341.5
Training network. lr: 0.000238. clip: 0.095245
Iteration 1573: Policy loss: 0.012002. Value loss: 0.015938. Entropy: 2.761023.
Iteration 1574: Policy loss: 0.007866. Value loss: 0.008909. Entropy: 2.770868.
Iteration 1575: Policy loss: 0.003791. Value loss: 0.011000. Entropy: 2.764390.
episode: 611   score: 360.0  epsilon: 1.0    steps: 102  evaluation reward: 340.8
episode: 612   score: 230.0  epsilon: 1.0    steps: 483  evaluation reward: 340.6
Training network. lr: 0.000238. clip: 0.095245
Iteration 1576: Policy loss: 0.042166. Value loss: 0.018138. Entropy: 2.778091.
Iteration 1577: Policy loss: 0.039960. Value loss: 0.009572. Entropy: 2.779716.
Iteration 1578: Policy loss: 0.036951. Value loss: 0.010291. Entropy: 2.775067.
episode: 613   sc

Training network. lr: 0.000238. clip: 0.095088
Iteration 1636: Policy loss: -0.024415. Value loss: 0.014603. Entropy: 2.729909.
Iteration 1637: Policy loss: -0.033364. Value loss: 0.007630. Entropy: 2.732162.
Iteration 1638: Policy loss: -0.031578. Value loss: 0.005983. Entropy: 2.727168.
Training network. lr: 0.000238. clip: 0.095088
Iteration 1639: Policy loss: 0.022014. Value loss: 0.009495. Entropy: 2.789398.
Iteration 1640: Policy loss: 0.024155. Value loss: 0.005593. Entropy: 2.792495.
Iteration 1641: Policy loss: 0.019546. Value loss: 0.004950. Entropy: 2.790998.
episode: 635   score: 430.0  epsilon: 1.0    steps: 70  evaluation reward: 333.6
episode: 636   score: 360.0  epsilon: 1.0    steps: 409  evaluation reward: 333.3
episode: 637   score: 310.0  epsilon: 1.0    steps: 1021  evaluation reward: 333.2
Training network. lr: 0.000238. clip: 0.095088
Iteration 1642: Policy loss: 0.033866. Value loss: 0.015946. Entropy: 2.739876.
Iteration 1643: Policy loss: 0.030713. Value loss:

Iteration 1701: Policy loss: 0.029728. Value loss: 0.006629. Entropy: 2.718744.
episode: 659   score: 330.0  epsilon: 1.0    steps: 188  evaluation reward: 340.3
episode: 660   score: 400.0  epsilon: 1.0    steps: 943  evaluation reward: 341.1
Training network. lr: 0.000237. clip: 0.094784
Iteration 1702: Policy loss: -0.025761. Value loss: 0.017575. Entropy: 2.705502.
Iteration 1703: Policy loss: -0.020369. Value loss: 0.011988. Entropy: 2.706231.
Iteration 1704: Policy loss: -0.023826. Value loss: 0.008657. Entropy: 2.704509.
Training network. lr: 0.000237. clip: 0.094784
Iteration 1705: Policy loss: 0.011464. Value loss: 0.011959. Entropy: 2.754212.
Iteration 1706: Policy loss: 0.007451. Value loss: 0.006037. Entropy: 2.762182.
Iteration 1707: Policy loss: 0.005108. Value loss: 0.004937. Entropy: 2.759309.
episode: 661   score: 330.0  epsilon: 1.0    steps: 84  evaluation reward: 341.2
Training network. lr: 0.000237. clip: 0.094784
Iteration 1708: Policy loss: 0.036710. Value loss: 

Training network. lr: 0.000237. clip: 0.094627
Iteration 1765: Policy loss: -0.026108. Value loss: 0.009444. Entropy: 2.793120.
Iteration 1766: Policy loss: -0.026074. Value loss: 0.006278. Entropy: 2.788192.
Iteration 1767: Policy loss: -0.034955. Value loss: 0.006520. Entropy: 2.786259.
Training network. lr: 0.000237. clip: 0.094627
Iteration 1768: Policy loss: -0.004687. Value loss: 0.008384. Entropy: 2.756332.
Iteration 1769: Policy loss: -0.006134. Value loss: 0.004075. Entropy: 2.756415.
Iteration 1770: Policy loss: -0.009274. Value loss: 0.003014. Entropy: 2.759533.
episode: 685   score: 370.0  epsilon: 1.0    steps: 69  evaluation reward: 341.6
episode: 686   score: 360.0  epsilon: 1.0    steps: 905  evaluation reward: 343.3
Training network. lr: 0.000237. clip: 0.094627
Iteration 1771: Policy loss: 0.006547. Value loss: 0.008837. Entropy: 2.769271.
Iteration 1772: Policy loss: 0.001329. Value loss: 0.007278. Entropy: 2.773604.
Iteration 1773: Policy loss: 0.004073. Value loss:

Training network. lr: 0.000236. clip: 0.094480
Iteration 1831: Policy loss: 0.023077. Value loss: 0.010092. Entropy: 2.752141.
Iteration 1832: Policy loss: 0.017506. Value loss: 0.005035. Entropy: 2.753311.
Iteration 1833: Policy loss: 0.019536. Value loss: 0.004013. Entropy: 2.752879.
episode: 708   score: 380.0  epsilon: 1.0    steps: 641  evaluation reward: 343.9
Training network. lr: 0.000236. clip: 0.094480
Iteration 1834: Policy loss: 0.020193. Value loss: 0.008320. Entropy: 2.753255.
Iteration 1835: Policy loss: 0.020601. Value loss: 0.004254. Entropy: 2.756748.
Iteration 1836: Policy loss: 0.017010. Value loss: 0.002817. Entropy: 2.752340.
episode: 709   score: 380.0  epsilon: 1.0    steps: 10  evaluation reward: 345.1
episode: 710   score: 320.0  epsilon: 1.0    steps: 284  evaluation reward: 345.0
episode: 711   score: 400.0  epsilon: 1.0    steps: 972  evaluation reward: 345.4
Training network. lr: 0.000236. clip: 0.094480
Iteration 1837: Policy loss: 0.027745. Value loss: 0

Iteration 1896: Policy loss: -0.026670. Value loss: 0.008674. Entropy: 2.783974.
episode: 732   score: 440.0  epsilon: 1.0    steps: 7  evaluation reward: 354.2
Training network. lr: 0.000236. clip: 0.094323
Iteration 1897: Policy loss: -0.019366. Value loss: 0.012876. Entropy: 2.772810.
Iteration 1898: Policy loss: -0.019981. Value loss: 0.008048. Entropy: 2.770762.
Iteration 1899: Policy loss: -0.021226. Value loss: 0.007388. Entropy: 2.766718.
episode: 733   score: 390.0  epsilon: 1.0    steps: 581  evaluation reward: 353.9
Training network. lr: 0.000236. clip: 0.094323
Iteration 1900: Policy loss: -0.144572. Value loss: 0.903736. Entropy: 2.799995.
Iteration 1901: Policy loss: -0.160356. Value loss: 0.704794. Entropy: 2.792878.
Iteration 1902: Policy loss: -0.159077. Value loss: 0.658753. Entropy: 2.779951.
Training network. lr: 0.000235. clip: 0.094166
Iteration 1903: Policy loss: 0.022705. Value loss: 0.018109. Entropy: 2.765860.
Iteration 1904: Policy loss: 0.015891. Value loss:

Iteration 1962: Policy loss: 0.050760. Value loss: 0.005327. Entropy: 2.739839.
episode: 755   score: 310.0  epsilon: 1.0    steps: 352  evaluation reward: 376.0
Training network. lr: 0.000235. clip: 0.094019
Iteration 1963: Policy loss: 0.024604. Value loss: 0.022156. Entropy: 2.761365.
Iteration 1964: Policy loss: 0.024024. Value loss: 0.011917. Entropy: 2.765777.
Iteration 1965: Policy loss: 0.022527. Value loss: 0.009420. Entropy: 2.758384.
Training network. lr: 0.000235. clip: 0.094019
Iteration 1966: Policy loss: 0.004031. Value loss: 0.017685. Entropy: 2.738253.
Iteration 1967: Policy loss: -0.004426. Value loss: 0.010257. Entropy: 2.736428.
Iteration 1968: Policy loss: -0.003770. Value loss: 0.007122. Entropy: 2.735841.
episode: 756   score: 490.0  epsilon: 1.0    steps: 880  evaluation reward: 376.2
Training network. lr: 0.000235. clip: 0.094019
Iteration 1969: Policy loss: 0.056927. Value loss: 0.021870. Entropy: 2.763938.
Iteration 1970: Policy loss: 0.046901. Value loss: 0.

Iteration 2027: Policy loss: 0.018030. Value loss: 0.006180. Entropy: 2.773460.
Iteration 2028: Policy loss: 0.015321. Value loss: 0.005144. Entropy: 2.769558.
episode: 779   score: 350.0  epsilon: 1.0    steps: 401  evaluation reward: 383.7
episode: 780   score: 310.0  epsilon: 1.0    steps: 865  evaluation reward: 383.3
Training network. lr: 0.000235. clip: 0.093862
Iteration 2029: Policy loss: 0.020704. Value loss: 0.012459. Entropy: 2.733147.
Iteration 2030: Policy loss: 0.017325. Value loss: 0.009371. Entropy: 2.734195.
Iteration 2031: Policy loss: 0.016933. Value loss: 0.009262. Entropy: 2.731855.
Training network. lr: 0.000235. clip: 0.093862
Iteration 2032: Policy loss: -0.010780. Value loss: 0.009279. Entropy: 2.758434.
Iteration 2033: Policy loss: -0.010673. Value loss: 0.004841. Entropy: 2.765506.
Iteration 2034: Policy loss: -0.012724. Value loss: 0.004220. Entropy: 2.764769.
Training network. lr: 0.000235. clip: 0.093862
Iteration 2035: Policy loss: 0.034473. Value loss: 0

Iteration 2093: Policy loss: -0.016803. Value loss: 0.008473. Entropy: 2.784586.
Iteration 2094: Policy loss: -0.020225. Value loss: 0.008161. Entropy: 2.786138.
episode: 802   score: 440.0  epsilon: 1.0    steps: 526  evaluation reward: 385.6
episode: 803   score: 320.0  epsilon: 1.0    steps: 1015  evaluation reward: 385.7
Training network. lr: 0.000234. clip: 0.093705
Iteration 2095: Policy loss: 0.022468. Value loss: 0.013742. Entropy: 2.719082.
Iteration 2096: Policy loss: 0.017803. Value loss: 0.007810. Entropy: 2.711794.
Iteration 2097: Policy loss: 0.014825. Value loss: 0.005012. Entropy: 2.712878.
Training network. lr: 0.000234. clip: 0.093705
Iteration 2098: Policy loss: -0.000078. Value loss: 0.011583. Entropy: 2.748845.
Iteration 2099: Policy loss: -0.006846. Value loss: 0.006677. Entropy: 2.740463.
Iteration 2100: Policy loss: -0.006565. Value loss: 0.004795. Entropy: 2.739119.
episode: 804   score: 440.0  epsilon: 1.0    steps: 4  evaluation reward: 386.6
Training network

Iteration 2157: Policy loss: -0.001192. Value loss: 0.008539. Entropy: 2.696851.
Training network. lr: 0.000234. clip: 0.093401
Iteration 2158: Policy loss: 0.047153. Value loss: 0.014316. Entropy: 2.791871.
Iteration 2159: Policy loss: 0.040918. Value loss: 0.008271. Entropy: 2.778436.
Iteration 2160: Policy loss: 0.042862. Value loss: 0.006131. Entropy: 2.781579.
episode: 828   score: 350.0  epsilon: 1.0    steps: 979  evaluation reward: 392.3
Training network. lr: 0.000234. clip: 0.093401
Iteration 2161: Policy loss: 0.026984. Value loss: 0.015684. Entropy: 2.784954.
Iteration 2162: Policy loss: 0.019028. Value loss: 0.009943. Entropy: 2.789409.
Iteration 2163: Policy loss: 0.022282. Value loss: 0.006960. Entropy: 2.784815.
Training network. lr: 0.000234. clip: 0.093401
Iteration 2164: Policy loss: 0.043892. Value loss: 0.014301. Entropy: 2.769249.
Iteration 2165: Policy loss: 0.046133. Value loss: 0.006799. Entropy: 2.768037.
Iteration 2166: Policy loss: 0.039996. Value loss: 0.005

episode: 850   score: 440.0  epsilon: 1.0    steps: 75  evaluation reward: 377.0
now time :  2019-02-27 04:15:58.464557
episode: 851   score: 310.0  epsilon: 1.0    steps: 948  evaluation reward: 376.2
Training network. lr: 0.000233. clip: 0.093245
Iteration 2224: Policy loss: 0.055391. Value loss: 0.016239. Entropy: 2.720527.
Iteration 2225: Policy loss: 0.051406. Value loss: 0.009908. Entropy: 2.719026.
Iteration 2226: Policy loss: 0.051129. Value loss: 0.007896. Entropy: 2.725644.
Training network. lr: 0.000233. clip: 0.093245
Iteration 2227: Policy loss: 0.028099. Value loss: 0.010659. Entropy: 2.771832.
Iteration 2228: Policy loss: 0.018382. Value loss: 0.004636. Entropy: 2.773733.
Iteration 2229: Policy loss: 0.019189. Value loss: 0.002686. Entropy: 2.771150.
Training network. lr: 0.000233. clip: 0.093245
Iteration 2230: Policy loss: 0.023865. Value loss: 0.012184. Entropy: 2.802321.
Iteration 2231: Policy loss: 0.025469. Value loss: 0.006011. Entropy: 2.804089.
Iteration 2232: P

Iteration 2288: Policy loss: 0.019896. Value loss: 0.005455. Entropy: 2.765574.
Iteration 2289: Policy loss: 0.006797. Value loss: 0.009158. Entropy: 2.763038.
episode: 875   score: 350.0  epsilon: 1.0    steps: 321  evaluation reward: 368.3
episode: 876   score: 370.0  epsilon: 1.0    steps: 386  evaluation reward: 367.8
episode: 877   score: 330.0  epsilon: 1.0    steps: 666  evaluation reward: 367.4
Training network. lr: 0.000233. clip: 0.093097
Iteration 2290: Policy loss: 0.006519. Value loss: 0.017273. Entropy: 2.724023.
Iteration 2291: Policy loss: 0.001117. Value loss: 0.009980. Entropy: 2.719784.
Iteration 2292: Policy loss: 0.004171. Value loss: 0.006485. Entropy: 2.719439.
Training network. lr: 0.000233. clip: 0.093097
Iteration 2293: Policy loss: 0.053461. Value loss: 0.013850. Entropy: 2.775877.
Iteration 2294: Policy loss: 0.054733. Value loss: 0.009445. Entropy: 2.766856.
Iteration 2295: Policy loss: 0.052722. Value loss: 0.007458. Entropy: 2.764270.
episode: 878   score

Iteration 2353: Policy loss: 0.004030. Value loss: 0.016704. Entropy: 2.747077.
Iteration 2354: Policy loss: -0.007864. Value loss: 0.008483. Entropy: 2.746254.
Iteration 2355: Policy loss: -0.005335. Value loss: 0.005821. Entropy: 2.748460.
episode: 899   score: 270.0  epsilon: 1.0    steps: 130  evaluation reward: 383.8
Training network. lr: 0.000232. clip: 0.092784
Iteration 2356: Policy loss: -0.420356. Value loss: 3.560781. Entropy: 2.697239.
Iteration 2357: Policy loss: -0.435360. Value loss: 3.101388. Entropy: 2.611300.
Iteration 2358: Policy loss: -0.441412. Value loss: 2.242341. Entropy: 2.562822.
episode: 900   score: 270.0  epsilon: 1.0    steps: 474  evaluation reward: 383.2
now time :  2019-02-27 04:18:36.014371
episode: 901   score: 360.0  epsilon: 1.0    steps: 573  evaluation reward: 383.2
episode: 902   score: 280.0  epsilon: 1.0    steps: 820  evaluation reward: 381.6
Training network. lr: 0.000232. clip: 0.092784
Iteration 2359: Policy loss: 0.232436. Value loss: 0.1

Iteration 2418: Policy loss: 0.085579. Value loss: 0.008915. Entropy: 2.783747.
episode: 923   score: 230.0  epsilon: 1.0    steps: 595  evaluation reward: 379.0
Training network. lr: 0.000232. clip: 0.092636
Iteration 2419: Policy loss: 0.047017. Value loss: 0.013844. Entropy: 2.794563.
Iteration 2420: Policy loss: 0.045580. Value loss: 0.007431. Entropy: 2.798115.
Iteration 2421: Policy loss: 0.045461. Value loss: 0.006110. Entropy: 2.795425.
episode: 924   score: 180.0  epsilon: 1.0    steps: 264  evaluation reward: 377.3
Training network. lr: 0.000232. clip: 0.092636
Iteration 2422: Policy loss: 0.048581. Value loss: 0.018866. Entropy: 2.801596.
Iteration 2423: Policy loss: 0.042498. Value loss: 0.011690. Entropy: 2.809775.
Iteration 2424: Policy loss: 0.046702. Value loss: 0.008504. Entropy: 2.805451.
episode: 925   score: 220.0  epsilon: 1.0    steps: 509  evaluation reward: 376.7
episode: 926   score: 200.0  epsilon: 1.0    steps: 884  evaluation reward: 374.0
Training network. 

episode: 945   score: 120.0  epsilon: 1.0    steps: 524  evaluation reward: 347.9
episode: 946   score: 210.0  epsilon: 1.0    steps: 791  evaluation reward: 347.3
Training network. lr: 0.000231. clip: 0.092480
Iteration 2485: Policy loss: 0.024600. Value loss: 0.013358. Entropy: 2.804897.
Iteration 2486: Policy loss: 0.025942. Value loss: 0.007884. Entropy: 2.812584.
Iteration 2487: Policy loss: 0.024302. Value loss: 0.006334. Entropy: 2.810188.
episode: 947   score: 280.0  epsilon: 1.0    steps: 308  evaluation reward: 345.8
episode: 948   score: 370.0  epsilon: 1.0    steps: 924  evaluation reward: 346.0
Training network. lr: 0.000231. clip: 0.092480
Iteration 2488: Policy loss: 0.008284. Value loss: 0.011852. Entropy: 2.823099.
Iteration 2489: Policy loss: 0.007577. Value loss: 0.005881. Entropy: 2.814924.
Iteration 2490: Policy loss: 0.010184. Value loss: 0.004567. Entropy: 2.816444.
Training network. lr: 0.000231. clip: 0.092480
Iteration 2491: Policy loss: 0.023817. Value loss: 

Iteration 2548: Policy loss: 0.024911. Value loss: 0.011471. Entropy: 2.765416.
Iteration 2549: Policy loss: 0.021286. Value loss: 0.006815. Entropy: 2.772717.
Iteration 2550: Policy loss: 0.023637. Value loss: 0.005692. Entropy: 2.768850.
Training network. lr: 0.000230. clip: 0.092176
Iteration 2551: Policy loss: -0.000498. Value loss: 0.009566. Entropy: 2.775105.
Iteration 2552: Policy loss: -0.004432. Value loss: 0.005931. Entropy: 2.776509.
Iteration 2553: Policy loss: -0.003542. Value loss: 0.005109. Entropy: 2.780280.
episode: 970   score: 280.0  epsilon: 1.0    steps: 321  evaluation reward: 333.0
Training network. lr: 0.000230. clip: 0.092176
Iteration 2554: Policy loss: 0.005642. Value loss: 0.011191. Entropy: 2.812353.
Iteration 2555: Policy loss: 0.006081. Value loss: 0.008145. Entropy: 2.807829.
Iteration 2556: Policy loss: 0.004798. Value loss: 0.006337. Entropy: 2.808411.
episode: 971   score: 290.0  epsilon: 1.0    steps: 1006  evaluation reward: 332.4
Training network. 

Training network. lr: 0.000230. clip: 0.092019
Iteration 2614: Policy loss: -0.017843. Value loss: 0.012638. Entropy: 2.779964.
Iteration 2615: Policy loss: -0.019707. Value loss: 0.006826. Entropy: 2.781743.
Iteration 2616: Policy loss: -0.022314. Value loss: 0.007170. Entropy: 2.777519.
episode: 994   score: 360.0  epsilon: 1.0    steps: 69  evaluation reward: 313.2
episode: 995   score: 350.0  epsilon: 1.0    steps: 547  evaluation reward: 313.1
Training network. lr: 0.000230. clip: 0.092019
Iteration 2617: Policy loss: -0.006938. Value loss: 0.015096. Entropy: 2.716055.
Iteration 2618: Policy loss: -0.013297. Value loss: 0.010103. Entropy: 2.719437.
Iteration 2619: Policy loss: -0.009099. Value loss: 0.007116. Entropy: 2.725460.
Training network. lr: 0.000230. clip: 0.092019
Iteration 2620: Policy loss: 0.036314. Value loss: 0.010287. Entropy: 2.780655.
Iteration 2621: Policy loss: 0.040831. Value loss: 0.005977. Entropy: 2.781910.
Iteration 2622: Policy loss: 0.032672. Value loss:

Iteration 2677: Policy loss: 0.043628. Value loss: 0.014663. Entropy: 2.706201.
Iteration 2678: Policy loss: 0.038977. Value loss: 0.008717. Entropy: 2.701758.
Iteration 2679: Policy loss: 0.032523. Value loss: 0.009375. Entropy: 2.694557.
Training network. lr: 0.000230. clip: 0.091862
Iteration 2680: Policy loss: 0.038627. Value loss: 0.010272. Entropy: 2.752837.
Iteration 2681: Policy loss: 0.033656. Value loss: 0.005448. Entropy: 2.752977.
Iteration 2682: Policy loss: 0.035827. Value loss: 0.003079. Entropy: 2.752205.
episode: 1019   score: 400.0  epsilon: 1.0    steps: 673  evaluation reward: 302.0
Training network. lr: 0.000230. clip: 0.091862
Iteration 2683: Policy loss: 0.028779. Value loss: 0.011959. Entropy: 2.777014.
Iteration 2684: Policy loss: 0.024557. Value loss: 0.008482. Entropy: 2.779980.
Iteration 2685: Policy loss: 0.021301. Value loss: 0.007367. Entropy: 2.779763.
Training network. lr: 0.000230. clip: 0.091862
Iteration 2686: Policy loss: 0.023413. Value loss: 0.009

Training network. lr: 0.000229. clip: 0.091715
Iteration 2743: Policy loss: 0.042882. Value loss: 0.030783. Entropy: 2.625715.
Iteration 2744: Policy loss: 0.035787. Value loss: 0.016514. Entropy: 2.626843.
Iteration 2745: Policy loss: 0.036520. Value loss: 0.011128. Entropy: 2.621367.
episode: 1042   score: 900.0  epsilon: 1.0    steps: 415  evaluation reward: 336.0
Training network. lr: 0.000229. clip: 0.091715
Iteration 2746: Policy loss: 0.083228. Value loss: 0.039379. Entropy: 2.678030.
Iteration 2747: Policy loss: 0.072140. Value loss: 0.018173. Entropy: 2.671129.
Iteration 2748: Policy loss: 0.072853. Value loss: 0.012925. Entropy: 2.669303.
episode: 1043   score: 460.0  epsilon: 1.0    steps: 326  evaluation reward: 338.3
episode: 1044   score: 270.0  epsilon: 1.0    steps: 971  evaluation reward: 338.1
Training network. lr: 0.000229. clip: 0.091715
Iteration 2749: Policy loss: 0.070640. Value loss: 0.023858. Entropy: 2.694270.
Iteration 2750: Policy loss: 0.067782. Value loss:

Iteration 2806: Policy loss: -0.007976. Value loss: 0.017536. Entropy: 2.708545.
Iteration 2807: Policy loss: -0.016637. Value loss: 0.010076. Entropy: 2.708809.
Iteration 2808: Policy loss: -0.013841. Value loss: 0.006507. Entropy: 2.708203.
episode: 1067   score: 360.0  epsilon: 1.0    steps: 338  evaluation reward: 354.6
Training network. lr: 0.000229. clip: 0.091401
Iteration 2809: Policy loss: 0.046778. Value loss: 0.009384. Entropy: 2.668350.
Iteration 2810: Policy loss: 0.046703. Value loss: 0.006906. Entropy: 2.666199.
Iteration 2811: Policy loss: 0.045268. Value loss: 0.005449. Entropy: 2.661089.
episode: 1068   score: 310.0  epsilon: 1.0    steps: 222  evaluation reward: 354.5
Training network. lr: 0.000229. clip: 0.091401
Iteration 2812: Policy loss: 0.035316. Value loss: 0.010353. Entropy: 2.686909.
Iteration 2813: Policy loss: 0.028547. Value loss: 0.007529. Entropy: 2.701392.
Iteration 2814: Policy loss: 0.030565. Value loss: 0.005515. Entropy: 2.697044.
episode: 1069   s

Iteration 2873: Policy loss: 0.067375. Value loss: 0.006819. Entropy: 2.732094.
Iteration 2874: Policy loss: 0.060985. Value loss: 0.004764. Entropy: 2.730267.
episode: 1089   score: 350.0  epsilon: 1.0    steps: 831  evaluation reward: 359.7
Training network. lr: 0.000228. clip: 0.091254
Iteration 2875: Policy loss: 0.020136. Value loss: 0.012353. Entropy: 2.756104.
Iteration 2876: Policy loss: 0.020453. Value loss: 0.007241. Entropy: 2.765377.
Iteration 2877: Policy loss: 0.021114. Value loss: 0.003897. Entropy: 2.762157.
episode: 1090   score: 320.0  epsilon: 1.0    steps: 193  evaluation reward: 360.4
episode: 1091   score: 360.0  epsilon: 1.0    steps: 501  evaluation reward: 360.9
episode: 1092   score: 400.0  epsilon: 1.0    steps: 980  evaluation reward: 361.4
Training network. lr: 0.000228. clip: 0.091254
Iteration 2878: Policy loss: 0.054912. Value loss: 0.018412. Entropy: 2.745341.
Iteration 2879: Policy loss: 0.051947. Value loss: 0.013505. Entropy: 2.744968.
Iteration 2880

Training network. lr: 0.000228. clip: 0.091097
Iteration 2938: Policy loss: 0.039692. Value loss: 0.053108. Entropy: 2.697325.
Iteration 2939: Policy loss: 0.032360. Value loss: 0.016622. Entropy: 2.695750.
Iteration 2940: Policy loss: 0.032787. Value loss: 0.011201. Entropy: 2.692351.
episode: 1113   score: 380.0  epsilon: 1.0    steps: 684  evaluation reward: 363.3
episode: 1114   score: 350.0  epsilon: 1.0    steps: 1022  evaluation reward: 363.0
Training network. lr: 0.000228. clip: 0.091097
Iteration 2941: Policy loss: -0.064609. Value loss: 0.038808. Entropy: 2.713015.
Iteration 2942: Policy loss: -0.079633. Value loss: 0.014385. Entropy: 2.713033.
Iteration 2943: Policy loss: -0.074989. Value loss: 0.010015. Entropy: 2.701291.
episode: 1115   score: 280.0  epsilon: 1.0    steps: 577  evaluation reward: 362.7
Training network. lr: 0.000228. clip: 0.091097
Iteration 2944: Policy loss: -0.001288. Value loss: 0.020884. Entropy: 2.725138.
Iteration 2945: Policy loss: -0.002215. Value

Iteration 3002: Policy loss: 0.039668. Value loss: 0.025989. Entropy: 2.656111.
Iteration 3003: Policy loss: 0.034406. Value loss: 0.021707. Entropy: 2.650086.
episode: 1138   score: 300.0  epsilon: 1.0    steps: 104  evaluation reward: 368.5
Training network. lr: 0.000227. clip: 0.090793
Iteration 3004: Policy loss: 0.051109. Value loss: 0.023553. Entropy: 2.684944.
Iteration 3005: Policy loss: 0.045593. Value loss: 0.013961. Entropy: 2.687582.
Iteration 3006: Policy loss: 0.042807. Value loss: 0.008614. Entropy: 2.680961.
episode: 1139   score: 370.0  epsilon: 1.0    steps: 509  evaluation reward: 367.8
Training network. lr: 0.000227. clip: 0.090793
Iteration 3007: Policy loss: 0.000595. Value loss: 0.020002. Entropy: 2.667476.
Iteration 3008: Policy loss: -0.000770. Value loss: 0.015089. Entropy: 2.670846.
Iteration 3009: Policy loss: 0.007236. Value loss: 0.010181. Entropy: 2.667291.
Training network. lr: 0.000227. clip: 0.090793
Iteration 3010: Policy loss: 0.049002. Value loss: 0

Training network. lr: 0.000227. clip: 0.090637
Iteration 3067: Policy loss: 0.053210. Value loss: 0.026639. Entropy: 2.778417.
Iteration 3068: Policy loss: 0.051854. Value loss: 0.010110. Entropy: 2.762350.
Iteration 3069: Policy loss: 0.037463. Value loss: 0.010095. Entropy: 2.763246.
episode: 1162   score: 260.0  epsilon: 1.0    steps: 420  evaluation reward: 354.8
Training network. lr: 0.000227. clip: 0.090637
Iteration 3070: Policy loss: 0.044084. Value loss: 0.011153. Entropy: 2.759091.
Iteration 3071: Policy loss: 0.043209. Value loss: 0.005473. Entropy: 2.757201.
Iteration 3072: Policy loss: 0.040969. Value loss: 0.008003. Entropy: 2.748229.
episode: 1163   score: 330.0  epsilon: 1.0    steps: 720  evaluation reward: 354.6
Training network. lr: 0.000227. clip: 0.090637
Iteration 3073: Policy loss: 0.008045. Value loss: 0.015008. Entropy: 2.766218.
Iteration 3074: Policy loss: 0.011236. Value loss: 0.007953. Entropy: 2.764366.
Iteration 3075: Policy loss: 0.005341. Value loss: 0.

Iteration 3134: Policy loss: 0.061549. Value loss: 0.016502. Entropy: 2.734308.
Iteration 3135: Policy loss: 0.065011. Value loss: 0.013775. Entropy: 2.728727.
Training network. lr: 0.000226. clip: 0.090480
Iteration 3136: Policy loss: -0.019100. Value loss: 0.035794. Entropy: 2.700917.
Iteration 3137: Policy loss: -0.019063. Value loss: 0.019392. Entropy: 2.701874.
Iteration 3138: Policy loss: -0.014849. Value loss: 0.014513. Entropy: 2.691903.
episode: 1183   score: 330.0  epsilon: 1.0    steps: 529  evaluation reward: 371.7
episode: 1184   score: 310.0  epsilon: 1.0    steps: 897  evaluation reward: 372.1
Training network. lr: 0.000226. clip: 0.090480
Iteration 3139: Policy loss: -0.006853. Value loss: 0.026469. Entropy: 2.681141.
Iteration 3140: Policy loss: -0.009183. Value loss: 0.015931. Entropy: 2.687149.
Iteration 3141: Policy loss: -0.014609. Value loss: 0.013052. Entropy: 2.681912.
episode: 1185   score: 410.0  epsilon: 1.0    steps: 506  evaluation reward: 372.8
episode: 11

Training network. lr: 0.000226. clip: 0.090332
Iteration 3199: Policy loss: 0.005292. Value loss: 0.016283. Entropy: 2.688707.
Iteration 3200: Policy loss: 0.007628. Value loss: 0.008969. Entropy: 2.687621.
Iteration 3201: Policy loss: 0.007717. Value loss: 0.006373. Entropy: 2.703210.
episode: 1207   score: 170.0  epsilon: 1.0    steps: 720  evaluation reward: 377.9
Training network. lr: 0.000225. clip: 0.090176
Iteration 3202: Policy loss: 0.025380. Value loss: 0.018584. Entropy: 2.703931.
Iteration 3203: Policy loss: 0.020032. Value loss: 0.012268. Entropy: 2.715670.
Iteration 3204: Policy loss: 0.019698. Value loss: 0.010818. Entropy: 2.721630.
Training network. lr: 0.000225. clip: 0.090176
Iteration 3205: Policy loss: 0.032903. Value loss: 0.019763. Entropy: 2.686216.
Iteration 3206: Policy loss: 0.028277. Value loss: 0.010173. Entropy: 2.698230.
Iteration 3207: Policy loss: 0.026146. Value loss: 0.007742. Entropy: 2.689580.
episode: 1208   score: 400.0  epsilon: 1.0    steps: 442

Training network. lr: 0.000225. clip: 0.090019
Iteration 3265: Policy loss: 0.051326. Value loss: 0.015481. Entropy: 2.756126.
Iteration 3266: Policy loss: 0.046948. Value loss: 0.008507. Entropy: 2.756081.
Iteration 3267: Policy loss: 0.045537. Value loss: 0.005871. Entropy: 2.752544.
Training network. lr: 0.000225. clip: 0.090019
Iteration 3268: Policy loss: 0.027565. Value loss: 0.019200. Entropy: 2.719075.
Iteration 3269: Policy loss: 0.025923. Value loss: 0.008224. Entropy: 2.718113.
Iteration 3270: Policy loss: 0.022889. Value loss: 0.005905. Entropy: 2.717458.
episode: 1230   score: 470.0  epsilon: 1.0    steps: 443  evaluation reward: 391.3
Training network. lr: 0.000225. clip: 0.090019
Iteration 3271: Policy loss: -0.107930. Value loss: 0.725417. Entropy: 2.754264.
Iteration 3272: Policy loss: -0.107631. Value loss: 0.559406. Entropy: 2.692858.
Iteration 3273: Policy loss: -0.111171. Value loss: 0.323937. Entropy: 2.667855.
episode: 1231   score: 290.0  epsilon: 1.0    steps: 

Training network. lr: 0.000225. clip: 0.089872
Iteration 3331: Policy loss: 0.065700. Value loss: 0.033378. Entropy: 2.610560.
Iteration 3332: Policy loss: 0.058606. Value loss: 0.013140. Entropy: 2.613423.
Iteration 3333: Policy loss: 0.058918. Value loss: 0.010695. Entropy: 2.618486.
Training network. lr: 0.000225. clip: 0.089872
Iteration 3334: Policy loss: -0.009013. Value loss: 0.023330. Entropy: 2.642901.
Iteration 3335: Policy loss: -0.003539. Value loss: 0.012765. Entropy: 2.643787.
Iteration 3336: Policy loss: -0.023365. Value loss: 0.010097. Entropy: 2.647115.
episode: 1252   score: 430.0  epsilon: 1.0    steps: 476  evaluation reward: 428.7
Training network. lr: 0.000225. clip: 0.089872
Iteration 3337: Policy loss: 0.017772. Value loss: 0.025429. Entropy: 2.670274.
Iteration 3338: Policy loss: 0.022738. Value loss: 0.015489. Entropy: 2.664072.
Iteration 3339: Policy loss: 0.023313. Value loss: 0.014172. Entropy: 2.664706.
Training network. lr: 0.000225. clip: 0.089872
Iterat

Training network. lr: 0.000224. clip: 0.089715
Iteration 3397: Policy loss: 0.009371. Value loss: 0.015216. Entropy: 2.710092.
Iteration 3398: Policy loss: 0.008637. Value loss: 0.009247. Entropy: 2.723546.
Iteration 3399: Policy loss: 0.006014. Value loss: 0.008722. Entropy: 2.713293.
episode: 1275   score: 310.0  epsilon: 1.0    steps: 402  evaluation reward: 433.5
Training network. lr: 0.000224. clip: 0.089715
Iteration 3400: Policy loss: 0.024321. Value loss: 0.015270. Entropy: 2.738872.
Iteration 3401: Policy loss: 0.016570. Value loss: 0.008834. Entropy: 2.746214.
Iteration 3402: Policy loss: 0.022150. Value loss: 0.008377. Entropy: 2.739149.
episode: 1276   score: 340.0  epsilon: 1.0    steps: 892  evaluation reward: 418.2
Training network. lr: 0.000224. clip: 0.089558
Iteration 3403: Policy loss: 0.028637. Value loss: 0.015502. Entropy: 2.734290.
Iteration 3404: Policy loss: 0.028201. Value loss: 0.010265. Entropy: 2.748362.
Iteration 3405: Policy loss: 0.021934. Value loss: 0.

Iteration 3462: Policy loss: -0.015259. Value loss: 0.011146. Entropy: 2.724058.
Training network. lr: 0.000224. clip: 0.089411
Iteration 3463: Policy loss: 0.016214. Value loss: 0.015776. Entropy: 2.730814.
Iteration 3464: Policy loss: 0.013282. Value loss: 0.009258. Entropy: 2.726071.
Iteration 3465: Policy loss: 0.009633. Value loss: 0.009316. Entropy: 2.723566.
episode: 1299   score: 410.0  epsilon: 1.0    steps: 115  evaluation reward: 411.5
Training network. lr: 0.000224. clip: 0.089411
Iteration 3466: Policy loss: 0.043353. Value loss: 0.014122. Entropy: 2.758886.
Iteration 3467: Policy loss: 0.036801. Value loss: 0.010568. Entropy: 2.753110.
Iteration 3468: Policy loss: 0.041598. Value loss: 0.007188. Entropy: 2.751504.
episode: 1300   score: 300.0  epsilon: 1.0    steps: 486  evaluation reward: 410.5
Training network. lr: 0.000224. clip: 0.089411
Iteration 3469: Policy loss: 0.049782. Value loss: 0.016484. Entropy: 2.687806.
Iteration 3470: Policy loss: 0.048037. Value loss: 0

Iteration 3527: Policy loss: -0.168223. Value loss: 0.322727. Entropy: 2.670285.
Iteration 3528: Policy loss: -0.149919. Value loss: 0.243867. Entropy: 2.670246.
Training network. lr: 0.000223. clip: 0.089254
Iteration 3529: Policy loss: -0.046483. Value loss: 0.039566. Entropy: 2.678569.
Iteration 3530: Policy loss: -0.053313. Value loss: 0.016268. Entropy: 2.679532.
Iteration 3531: Policy loss: -0.057518. Value loss: 0.010034. Entropy: 2.679384.
episode: 1322   score: 970.0  epsilon: 1.0    steps: 59  evaluation reward: 415.4
episode: 1323   score: 460.0  epsilon: 1.0    steps: 520  evaluation reward: 410.1
episode: 1324   score: 360.0  epsilon: 1.0    steps: 794  evaluation reward: 410.4
Training network. lr: 0.000223. clip: 0.089254
Iteration 3532: Policy loss: 0.037664. Value loss: 0.026792. Entropy: 2.682081.
Iteration 3533: Policy loss: 0.048454. Value loss: 0.012363. Entropy: 2.684786.
Iteration 3534: Policy loss: 0.033505. Value loss: 0.017748. Entropy: 2.687952.
episode: 1325

episode: 1346   score: 370.0  epsilon: 1.0    steps: 202  evaluation reward: 392.4
episode: 1347   score: 400.0  epsilon: 1.0    steps: 508  evaluation reward: 392.5
episode: 1348   score: 400.0  epsilon: 1.0    steps: 789  evaluation reward: 392.8
Training network. lr: 0.000223. clip: 0.089097
Iteration 3592: Policy loss: 0.069898. Value loss: 0.020661. Entropy: 2.683945.
Iteration 3593: Policy loss: 0.061551. Value loss: 0.015425. Entropy: 2.683023.
Iteration 3594: Policy loss: 0.071078. Value loss: 0.010277. Entropy: 2.682441.
episode: 1349   score: 360.0  epsilon: 1.0    steps: 762  evaluation reward: 392.2
Training network. lr: 0.000223. clip: 0.089097
Iteration 3595: Policy loss: 0.013649. Value loss: 0.011365. Entropy: 2.711419.
Iteration 3596: Policy loss: 0.009381. Value loss: 0.007485. Entropy: 2.702857.
Iteration 3597: Policy loss: 0.004988. Value loss: 0.007569. Entropy: 2.708190.
episode: 1350   score: 270.0  epsilon: 1.0    steps: 261  evaluation reward: 390.2
Training ne

Iteration 3655: Policy loss: 0.013305. Value loss: 0.015690. Entropy: 2.753209.
Iteration 3656: Policy loss: 0.015527. Value loss: 0.008270. Entropy: 2.748311.
Iteration 3657: Policy loss: 0.007984. Value loss: 0.008054. Entropy: 2.747561.
Training network. lr: 0.000222. clip: 0.088793
Iteration 3658: Policy loss: -0.011313. Value loss: 0.012562. Entropy: 2.751099.
Iteration 3659: Policy loss: -0.013955. Value loss: 0.007264. Entropy: 2.747112.
Iteration 3660: Policy loss: -0.015835. Value loss: 0.006000. Entropy: 2.745409.
episode: 1371   score: 310.0  epsilon: 1.0    steps: 100  evaluation reward: 391.7
episode: 1372   score: 410.0  epsilon: 1.0    steps: 586  evaluation reward: 391.0
episode: 1373   score: 340.0  epsilon: 1.0    steps: 709  evaluation reward: 390.6
Training network. lr: 0.000222. clip: 0.088793
Iteration 3661: Policy loss: 0.038572. Value loss: 0.020388. Entropy: 2.734581.
Iteration 3662: Policy loss: 0.036554. Value loss: 0.011784. Entropy: 2.729753.
Iteration 3663

Training network. lr: 0.000222. clip: 0.088637
Iteration 3721: Policy loss: -0.017488. Value loss: 0.014914. Entropy: 2.769849.
Iteration 3722: Policy loss: -0.023591. Value loss: 0.010197. Entropy: 2.769643.
Iteration 3723: Policy loss: -0.019550. Value loss: 0.006488. Entropy: 2.762015.
episode: 1395   score: 330.0  epsilon: 1.0    steps: 207  evaluation reward: 386.6
episode: 1396   score: 360.0  epsilon: 1.0    steps: 521  evaluation reward: 386.4
Training network. lr: 0.000222. clip: 0.088637
Iteration 3724: Policy loss: 0.007274. Value loss: 0.015655. Entropy: 2.733589.
Iteration 3725: Policy loss: 0.001440. Value loss: 0.007613. Entropy: 2.732854.
Iteration 3726: Policy loss: 0.007637. Value loss: 0.007740. Entropy: 2.731915.
episode: 1397   score: 310.0  epsilon: 1.0    steps: 672  evaluation reward: 386.2
episode: 1398   score: 420.0  epsilon: 1.0    steps: 968  evaluation reward: 385.7
Training network. lr: 0.000222. clip: 0.088637
Iteration 3727: Policy loss: 0.013613. Value

Iteration 3784: Policy loss: 0.071264. Value loss: 0.012101. Entropy: 2.731585.
Iteration 3785: Policy loss: 0.063638. Value loss: 0.008495. Entropy: 2.727510.
Iteration 3786: Policy loss: 0.069177. Value loss: 0.005995. Entropy: 2.735019.
Training network. lr: 0.000221. clip: 0.088489
Iteration 3787: Policy loss: 0.006825. Value loss: 0.011942. Entropy: 2.706259.
Iteration 3788: Policy loss: 0.000262. Value loss: 0.007349. Entropy: 2.711016.
Iteration 3789: Policy loss: 0.002773. Value loss: 0.006952. Entropy: 2.702091.
episode: 1420   score: 330.0  epsilon: 1.0    steps: 209  evaluation reward: 389.0
episode: 1421   score: 420.0  epsilon: 1.0    steps: 518  evaluation reward: 389.2
episode: 1422   score: 300.0  epsilon: 1.0    steps: 940  evaluation reward: 382.5
Training network. lr: 0.000221. clip: 0.088489
Iteration 3790: Policy loss: 0.008597. Value loss: 0.021197. Entropy: 2.678742.
Iteration 3791: Policy loss: 0.009456. Value loss: 0.016152. Entropy: 2.685842.
Iteration 3792: P

episode: 1443   score: 290.0  epsilon: 1.0    steps: 75  evaluation reward: 376.3
Training network. lr: 0.000221. clip: 0.088333
Iteration 3850: Policy loss: -0.001601. Value loss: 0.014384. Entropy: 2.751474.
Iteration 3851: Policy loss: -0.004428. Value loss: 0.006263. Entropy: 2.749792.
Iteration 3852: Policy loss: -0.002117. Value loss: 0.008178. Entropy: 2.750791.
Training network. lr: 0.000220. clip: 0.088176
Iteration 3853: Policy loss: 0.008543. Value loss: 0.013646. Entropy: 2.756432.
Iteration 3854: Policy loss: 0.004257. Value loss: 0.007987. Entropy: 2.750651.
Iteration 3855: Policy loss: 0.001791. Value loss: 0.005854. Entropy: 2.747893.
episode: 1444   score: 270.0  epsilon: 1.0    steps: 586  evaluation reward: 375.6
Training network. lr: 0.000220. clip: 0.088176
Iteration 3856: Policy loss: -0.000033. Value loss: 0.011916. Entropy: 2.767078.
Iteration 3857: Policy loss: -0.003448. Value loss: 0.007975. Entropy: 2.762816.
Iteration 3858: Policy loss: 0.000369. Value loss

episode: 1466   score: 390.0  epsilon: 1.0    steps: 845  evaluation reward: 370.3
Training network. lr: 0.000220. clip: 0.088028
Iteration 3916: Policy loss: -0.000234. Value loss: 0.019672. Entropy: 2.750474.
Iteration 3917: Policy loss: -0.000319. Value loss: 0.011968. Entropy: 2.748851.
Iteration 3918: Policy loss: -0.010373. Value loss: 0.008452. Entropy: 2.743227.
Training network. lr: 0.000220. clip: 0.088028
Iteration 3919: Policy loss: 0.027452. Value loss: 0.010950. Entropy: 2.754827.
Iteration 3920: Policy loss: 0.023808. Value loss: 0.005168. Entropy: 2.755609.
Iteration 3921: Policy loss: 0.023930. Value loss: 0.003577. Entropy: 2.749430.
episode: 1467   score: 330.0  epsilon: 1.0    steps: 64  evaluation reward: 368.2
episode: 1468   score: 430.0  epsilon: 1.0    steps: 966  evaluation reward: 369.2
Training network. lr: 0.000220. clip: 0.088028
Iteration 3922: Policy loss: 0.003323. Value loss: 0.019570. Entropy: 2.756145.
Iteration 3923: Policy loss: 0.000081. Value los

episode: 1489   score: 600.0  epsilon: 1.0    steps: 837  evaluation reward: 382.8
Training network. lr: 0.000220. clip: 0.087872
Iteration 3982: Policy loss: 0.014647. Value loss: 0.019403. Entropy: 2.659904.
Iteration 3983: Policy loss: 0.018504. Value loss: 0.008303. Entropy: 2.665131.
Iteration 3984: Policy loss: 0.015053. Value loss: 0.013274. Entropy: 2.661365.
episode: 1490   score: 510.0  epsilon: 1.0    steps: 445  evaluation reward: 385.1
Training network. lr: 0.000220. clip: 0.087872
Iteration 3985: Policy loss: 0.056951. Value loss: 0.026737. Entropy: 2.646657.
Iteration 3986: Policy loss: 0.053919. Value loss: 0.017389. Entropy: 2.630513.
Iteration 3987: Policy loss: 0.056704. Value loss: 0.014443. Entropy: 2.642195.
Training network. lr: 0.000220. clip: 0.087872
Iteration 3988: Policy loss: 0.007984. Value loss: 0.021278. Entropy: 2.677709.
Iteration 3989: Policy loss: 0.006489. Value loss: 0.013625. Entropy: 2.682927.
Iteration 3990: Policy loss: 0.003377. Value loss: 0.

Iteration 4047: Policy loss: -0.033206. Value loss: 0.011045. Entropy: 2.647376.
episode: 1512   score: 470.0  epsilon: 1.0    steps: 867  evaluation reward: 403.5
Training network. lr: 0.000219. clip: 0.087715
Iteration 4048: Policy loss: 0.040641. Value loss: 0.009619. Entropy: 2.725345.
Iteration 4049: Policy loss: 0.043206. Value loss: 0.008785. Entropy: 2.724376.
Iteration 4050: Policy loss: 0.040050. Value loss: 0.007994. Entropy: 2.724740.
episode: 1513   score: 410.0  epsilon: 1.0    steps: 132  evaluation reward: 398.7
Training network. lr: 0.000219. clip: 0.087568
Iteration 4051: Policy loss: -0.000570. Value loss: 0.012835. Entropy: 2.703091.
Iteration 4052: Policy loss: -0.002026. Value loss: 0.007056. Entropy: 2.694216.
Iteration 4053: Policy loss: -0.004109. Value loss: 0.005557. Entropy: 2.696994.
episode: 1514   score: 420.0  epsilon: 1.0    steps: 506  evaluation reward: 399.3
Training network. lr: 0.000219. clip: 0.087568
Iteration 4054: Policy loss: 0.058219. Value l

Iteration 4111: Policy loss: 0.025850. Value loss: 0.019885. Entropy: 2.745922.
Iteration 4112: Policy loss: 0.019855. Value loss: 0.011830. Entropy: 2.740446.
Iteration 4113: Policy loss: 0.020199. Value loss: 0.009257. Entropy: 2.745111.
episode: 1537   score: 370.0  epsilon: 1.0    steps: 162  evaluation reward: 397.8
Training network. lr: 0.000219. clip: 0.087411
Iteration 4114: Policy loss: 0.003022. Value loss: 0.010972. Entropy: 2.708725.
Iteration 4115: Policy loss: -0.000169. Value loss: 0.009287. Entropy: 2.717522.
Iteration 4116: Policy loss: -0.000216. Value loss: 0.006003. Entropy: 2.707524.
Training network. lr: 0.000219. clip: 0.087411
Iteration 4117: Policy loss: 0.044289. Value loss: 0.014131. Entropy: 2.783666.
Iteration 4118: Policy loss: 0.048696. Value loss: 0.007368. Entropy: 2.778450.
Iteration 4119: Policy loss: 0.037673. Value loss: 0.010122. Entropy: 2.773055.
Training network. lr: 0.000219. clip: 0.087411
Iteration 4120: Policy loss: 0.029247. Value loss: 0.0

Iteration 4177: Policy loss: 0.011457. Value loss: 0.053555. Entropy: 2.737811.
Iteration 4178: Policy loss: -0.006221. Value loss: 0.037928. Entropy: 2.736099.
Iteration 4179: Policy loss: -0.009998. Value loss: 0.026379. Entropy: 2.737941.
episode: 1559   score: 360.0  epsilon: 1.0    steps: 610  evaluation reward: 405.4
episode: 1560   score: 930.0  epsilon: 1.0    steps: 792  evaluation reward: 412.0
Training network. lr: 0.000218. clip: 0.087254
Iteration 4180: Policy loss: 0.152098. Value loss: 0.031428. Entropy: 2.744815.
Iteration 4181: Policy loss: 0.143119. Value loss: 0.013286. Entropy: 2.748293.
Iteration 4182: Policy loss: 0.125561. Value loss: 0.012180. Entropy: 2.748159.
episode: 1561   score: 320.0  epsilon: 1.0    steps: 183  evaluation reward: 411.3
Training network. lr: 0.000218. clip: 0.087254
Iteration 4183: Policy loss: 0.004395. Value loss: 0.026628. Entropy: 2.723149.
Iteration 4184: Policy loss: 0.005471. Value loss: 0.014056. Entropy: 2.717566.
Iteration 4185:

episode: 1583   score: 300.0  epsilon: 1.0    steps: 845  evaluation reward: 400.7
Training network. lr: 0.000218. clip: 0.087107
Iteration 4243: Policy loss: 0.027162. Value loss: 0.015529. Entropy: 2.774753.
Iteration 4244: Policy loss: 0.027017. Value loss: 0.007382. Entropy: 2.780132.
Iteration 4245: Policy loss: 0.019822. Value loss: 0.006348. Entropy: 2.777168.
episode: 1584   score: 380.0  epsilon: 1.0    steps: 600  evaluation reward: 401.7
Training network. lr: 0.000218. clip: 0.087107
Iteration 4246: Policy loss: 0.020539. Value loss: 0.016124. Entropy: 2.746794.
Iteration 4247: Policy loss: 0.020670. Value loss: 0.009222. Entropy: 2.746917.
Iteration 4248: Policy loss: 0.016218. Value loss: 0.008330. Entropy: 2.747134.
Training network. lr: 0.000218. clip: 0.087107
Iteration 4249: Policy loss: 0.026178. Value loss: 0.010516. Entropy: 2.762170.
Iteration 4250: Policy loss: 0.023710. Value loss: 0.005774. Entropy: 2.762364.
Iteration 4251: Policy loss: 0.018896. Value loss: 0.

Iteration 4308: Policy loss: 0.011422. Value loss: 0.006455. Entropy: 2.754143.
episode: 1606   score: 330.0  epsilon: 1.0    steps: 604  evaluation reward: 381.8
episode: 1607   score: 470.0  epsilon: 1.0    steps: 984  evaluation reward: 382.4
Training network. lr: 0.000217. clip: 0.086793
Iteration 4309: Policy loss: 0.045710. Value loss: 0.015699. Entropy: 2.771774.
Iteration 4310: Policy loss: 0.042975. Value loss: 0.011069. Entropy: 2.767660.
Iteration 4311: Policy loss: 0.040333. Value loss: 0.010825. Entropy: 2.772783.
episode: 1608   score: 420.0  epsilon: 1.0    steps: 378  evaluation reward: 383.1
Training network. lr: 0.000217. clip: 0.086793
Iteration 4312: Policy loss: 0.010091. Value loss: 0.015628. Entropy: 2.764697.
Iteration 4313: Policy loss: 0.013107. Value loss: 0.009505. Entropy: 2.763330.
Iteration 4314: Policy loss: 0.010801. Value loss: 0.007829. Entropy: 2.758262.
Training network. lr: 0.000217. clip: 0.086793
Iteration 4315: Policy loss: -0.006081. Value loss

episode: 1628   score: 370.0  epsilon: 1.0    steps: 840  evaluation reward: 377.5
Training network. lr: 0.000217. clip: 0.086646
Iteration 4375: Policy loss: 0.013050. Value loss: 0.012557. Entropy: 2.765596.
Iteration 4376: Policy loss: 0.014457. Value loss: 0.007409. Entropy: 2.761352.
Iteration 4377: Policy loss: 0.008582. Value loss: 0.005516. Entropy: 2.754438.
Training network. lr: 0.000217. clip: 0.086646
Iteration 4378: Policy loss: 0.037989. Value loss: 0.013909. Entropy: 2.770536.
Iteration 4379: Policy loss: 0.031283. Value loss: 0.007332. Entropy: 2.765617.
Iteration 4380: Policy loss: 0.029189. Value loss: 0.005403. Entropy: 2.762516.
Training network. lr: 0.000217. clip: 0.086646
Iteration 4381: Policy loss: -0.001331. Value loss: 0.012308. Entropy: 2.765696.
Iteration 4382: Policy loss: -0.004155. Value loss: 0.007540. Entropy: 2.765328.
Iteration 4383: Policy loss: -0.004485. Value loss: 0.006501. Entropy: 2.763451.
episode: 1629   score: 420.0  epsilon: 1.0    steps: 

Iteration 4439: Policy loss: 0.029233. Value loss: 0.010722. Entropy: 2.672504.
Iteration 4440: Policy loss: 0.028397. Value loss: 0.009160. Entropy: 2.680473.
Training network. lr: 0.000216. clip: 0.086489
Iteration 4441: Policy loss: 0.060661. Value loss: 0.018551. Entropy: 2.693592.
Iteration 4442: Policy loss: 0.052293. Value loss: 0.011951. Entropy: 2.702092.
Iteration 4443: Policy loss: 0.057144. Value loss: 0.009651. Entropy: 2.699560.
Training network. lr: 0.000216. clip: 0.086489
Iteration 4444: Policy loss: 0.014871. Value loss: 0.016887. Entropy: 2.727441.
Iteration 4445: Policy loss: 0.020283. Value loss: 0.010789. Entropy: 2.724494.
Iteration 4446: Policy loss: 0.012023. Value loss: 0.008626. Entropy: 2.722294.
episode: 1652   score: 350.0  epsilon: 1.0    steps: 856  evaluation reward: 385.7
Training network. lr: 0.000216. clip: 0.086489
Iteration 4447: Policy loss: 0.017455. Value loss: 0.017970. Entropy: 2.702798.
Iteration 4448: Policy loss: 0.023148. Value loss: 0.010

episode: 1673   score: 360.0  epsilon: 1.0    steps: 1011  evaluation reward: 391.7
Training network. lr: 0.000215. clip: 0.086185
Iteration 4507: Policy loss: 0.031850. Value loss: 0.011755. Entropy: 2.716689.
Iteration 4508: Policy loss: 0.034099. Value loss: 0.005520. Entropy: 2.727461.
Iteration 4509: Policy loss: 0.025915. Value loss: 0.007116. Entropy: 2.720567.
episode: 1674   score: 430.0  epsilon: 1.0    steps: 268  evaluation reward: 392.8
episode: 1675   score: 360.0  epsilon: 1.0    steps: 808  evaluation reward: 392.8
Training network. lr: 0.000215. clip: 0.086185
Iteration 4510: Policy loss: 0.024945. Value loss: 0.012259. Entropy: 2.723155.
Iteration 4511: Policy loss: 0.015850. Value loss: 0.007743. Entropy: 2.711482.
Iteration 4512: Policy loss: 0.019519. Value loss: 0.006747. Entropy: 2.714085.
Training network. lr: 0.000215. clip: 0.086185
Iteration 4513: Policy loss: -0.039662. Value loss: 0.016576. Entropy: 2.743058.
Iteration 4514: Policy loss: -0.038290. Value lo

Iteration 4572: Policy loss: 0.027184. Value loss: 0.007330. Entropy: 2.735883.
episode: 1697   score: 430.0  epsilon: 1.0    steps: 239  evaluation reward: 398.7
Training network. lr: 0.000215. clip: 0.086029
Iteration 4573: Policy loss: -0.011300. Value loss: 0.014902. Entropy: 2.728106.
Iteration 4574: Policy loss: -0.013330. Value loss: 0.007150. Entropy: 2.732512.
Iteration 4575: Policy loss: -0.018606. Value loss: 0.008397. Entropy: 2.727482.
episode: 1698   score: 420.0  epsilon: 1.0    steps: 108  evaluation reward: 399.9
episode: 1699   score: 330.0  epsilon: 1.0    steps: 297  evaluation reward: 398.2
Training network. lr: 0.000215. clip: 0.086029
Iteration 4576: Policy loss: 0.016621. Value loss: 0.010962. Entropy: 2.761568.
Iteration 4577: Policy loss: 0.015417. Value loss: 0.006279. Entropy: 2.762934.
Iteration 4578: Policy loss: 0.014765. Value loss: 0.005080. Entropy: 2.761907.
Training network. lr: 0.000215. clip: 0.086029
Iteration 4579: Policy loss: -0.005612. Value l

Iteration 4636: Policy loss: 0.047316. Value loss: 0.013693. Entropy: 2.752745.
Iteration 4637: Policy loss: 0.043825. Value loss: 0.011839. Entropy: 2.751991.
Iteration 4638: Policy loss: 0.045000. Value loss: 0.010309. Entropy: 2.754352.
episode: 1721   score: 300.0  epsilon: 1.0    steps: 341  evaluation reward: 397.8
Training network. lr: 0.000215. clip: 0.085872
Iteration 4639: Policy loss: -0.052032. Value loss: 0.015391. Entropy: 2.730428.
Iteration 4640: Policy loss: -0.051015. Value loss: 0.009941. Entropy: 2.734676.
Iteration 4641: Policy loss: -0.051566. Value loss: 0.008694. Entropy: 2.737429.
episode: 1722   score: 300.0  epsilon: 1.0    steps: 110  evaluation reward: 396.5
episode: 1723   score: 320.0  epsilon: 1.0    steps: 170  evaluation reward: 397.2
Training network. lr: 0.000215. clip: 0.085872
Iteration 4642: Policy loss: 0.019264. Value loss: 0.015081. Entropy: 2.755516.
Iteration 4643: Policy loss: 0.022124. Value loss: 0.007639. Entropy: 2.758950.
Iteration 4644

Training network. lr: 0.000214. clip: 0.085568
Iteration 4702: Policy loss: -0.044866. Value loss: 0.049262. Entropy: 2.463358.
Iteration 4703: Policy loss: -0.042336. Value loss: 0.026694. Entropy: 2.479138.
Iteration 4704: Policy loss: -0.032493. Value loss: 0.018794. Entropy: 2.450792.
episode: 1744   score: 390.0  epsilon: 1.0    steps: 336  evaluation reward: 405.1
Training network. lr: 0.000214. clip: 0.085568
Iteration 4705: Policy loss: 0.034105. Value loss: 0.049615. Entropy: 2.531425.
Iteration 4706: Policy loss: 0.014244. Value loss: 0.028686. Entropy: 2.526775.
Iteration 4707: Policy loss: 0.020743. Value loss: 0.022736. Entropy: 2.519673.
episode: 1745   score: 1080.0  epsilon: 1.0    steps: 538  evaluation reward: 406.3
Training network. lr: 0.000214. clip: 0.085568
Iteration 4708: Policy loss: 0.030323. Value loss: 0.030839. Entropy: 2.570767.
Iteration 4709: Policy loss: 0.035044. Value loss: 0.018272. Entropy: 2.550331.
Iteration 4710: Policy loss: 0.026711. Value loss

Training network. lr: 0.000214. clip: 0.085411
Iteration 4768: Policy loss: -0.165399. Value loss: 0.765909. Entropy: 2.561687.
Iteration 4769: Policy loss: -0.151368. Value loss: 0.343043. Entropy: 2.496870.
Iteration 4770: Policy loss: -0.143734. Value loss: 0.210790. Entropy: 2.449334.
episode: 1766   score: 210.0  epsilon: 1.0    steps: 268  evaluation reward: 408.2
Training network. lr: 0.000214. clip: 0.085411
Iteration 4771: Policy loss: -0.040820. Value loss: 0.074858. Entropy: 2.494950.
Iteration 4772: Policy loss: -0.046195. Value loss: 0.038263. Entropy: 2.513309.
Iteration 4773: Policy loss: -0.049718. Value loss: 0.027735. Entropy: 2.491713.
episode: 1767   score: 240.0  epsilon: 1.0    steps: 562  evaluation reward: 407.4
Training network. lr: 0.000214. clip: 0.085411
Iteration 4774: Policy loss: 0.111052. Value loss: 0.035291. Entropy: 2.557051.
Iteration 4775: Policy loss: 0.107801. Value loss: 0.018769. Entropy: 2.557214.
Iteration 4776: Policy loss: 0.093606. Value lo

Training network. lr: 0.000213. clip: 0.085264
Iteration 4834: Policy loss: 0.081527. Value loss: 0.047275. Entropy: 2.681674.
Iteration 4835: Policy loss: 0.065928. Value loss: 0.022952. Entropy: 2.692330.
Iteration 4836: Policy loss: 0.082348. Value loss: 0.017838. Entropy: 2.684822.
Training network. lr: 0.000213. clip: 0.085264
Iteration 4837: Policy loss: 0.070944. Value loss: 0.021360. Entropy: 2.689578.
Iteration 4838: Policy loss: 0.075356. Value loss: 0.015357. Entropy: 2.692992.
Iteration 4839: Policy loss: 0.069867. Value loss: 0.010914. Entropy: 2.692915.
Training network. lr: 0.000213. clip: 0.085264
Iteration 4840: Policy loss: 0.064228. Value loss: 0.024430. Entropy: 2.659561.
Iteration 4841: Policy loss: 0.069644. Value loss: 0.016648. Entropy: 2.666127.
Iteration 4842: Policy loss: 0.057784. Value loss: 0.012537. Entropy: 2.664837.
episode: 1789   score: 410.0  epsilon: 1.0    steps: 1  evaluation reward: 418.5
episode: 1790   score: 310.0  epsilon: 1.0    steps: 455  

Iteration 4901: Policy loss: -0.147324. Value loss: 0.578890. Entropy: 2.684383.
Iteration 4902: Policy loss: -0.154087. Value loss: 0.536870. Entropy: 2.642320.
Training network. lr: 0.000212. clip: 0.084950
Iteration 4903: Policy loss: 0.072586. Value loss: 0.048939. Entropy: 2.602692.
Iteration 4904: Policy loss: 0.062812. Value loss: 0.030024. Entropy: 2.605376.
Iteration 4905: Policy loss: 0.067254. Value loss: 0.022756. Entropy: 2.594476.
Training network. lr: 0.000212. clip: 0.084950
Iteration 4906: Policy loss: 0.105980. Value loss: 0.041106. Entropy: 2.644509.
Iteration 4907: Policy loss: 0.106850. Value loss: 0.022600. Entropy: 2.645407.
Iteration 4908: Policy loss: 0.093603. Value loss: 0.017304. Entropy: 2.634229.
episode: 1810   score: 320.0  epsilon: 1.0    steps: 434  evaluation reward: 433.8
episode: 1811   score: 430.0  epsilon: 1.0    steps: 642  evaluation reward: 428.5
Training network. lr: 0.000212. clip: 0.084950
Iteration 4909: Policy loss: -0.018855. Value loss:

episode: 1831   score: 260.0  epsilon: 1.0    steps: 225  evaluation reward: 421.7
Training network. lr: 0.000212. clip: 0.084803
Iteration 4969: Policy loss: 0.010253. Value loss: 0.012222. Entropy: 2.738159.
Iteration 4970: Policy loss: 0.013905. Value loss: 0.007988. Entropy: 2.742357.
Iteration 4971: Policy loss: 0.007286. Value loss: 0.008211. Entropy: 2.742593.
episode: 1832   score: 340.0  epsilon: 1.0    steps: 297  evaluation reward: 420.9
Training network. lr: 0.000212. clip: 0.084803
Iteration 4972: Policy loss: -0.008528. Value loss: 0.011411. Entropy: 2.734721.
Iteration 4973: Policy loss: -0.014145. Value loss: 0.007143. Entropy: 2.733275.
Iteration 4974: Policy loss: -0.009735. Value loss: 0.005801. Entropy: 2.735829.
Training network. lr: 0.000212. clip: 0.084803
Iteration 4975: Policy loss: 0.003491. Value loss: 0.009651. Entropy: 2.762334.
Iteration 4976: Policy loss: 0.007350. Value loss: 0.006817. Entropy: 2.767284.
Iteration 4977: Policy loss: 0.001043. Value loss:

Iteration 5034: Policy loss: 0.123455. Value loss: 0.026221. Entropy: 2.686437.
episode: 1854   score: 490.0  epsilon: 1.0    steps: 214  evaluation reward: 390.6
episode: 1855   score: 570.0  epsilon: 1.0    steps: 1006  evaluation reward: 394.4
Training network. lr: 0.000212. clip: 0.084646
Iteration 5035: Policy loss: 0.067332. Value loss: 0.031509. Entropy: 2.708687.
Iteration 5036: Policy loss: 0.059389. Value loss: 0.019620. Entropy: 2.705387.
Iteration 5037: Policy loss: 0.058321. Value loss: 0.013853. Entropy: 2.699444.
episode: 1856   score: 210.0  epsilon: 1.0    steps: 81  evaluation reward: 386.4
Training network. lr: 0.000212. clip: 0.084646
Iteration 5038: Policy loss: 0.019269. Value loss: 0.029477. Entropy: 2.692256.
Iteration 5039: Policy loss: 0.016674. Value loss: 0.016833. Entropy: 2.691288.
Iteration 5040: Policy loss: 0.017928. Value loss: 0.012738. Entropy: 2.695218.
Training network. lr: 0.000212. clip: 0.084646
Iteration 5041: Policy loss: 0.040949. Value loss:

Iteration 5099: Policy loss: 0.081291. Value loss: 0.022603. Entropy: 2.649516.
Iteration 5100: Policy loss: 0.068898. Value loss: 0.017243. Entropy: 2.641285.
episode: 1878   score: 420.0  epsilon: 1.0    steps: 863  evaluation reward: 383.1
Training network. lr: 0.000211. clip: 0.084342
Iteration 5101: Policy loss: 0.066814. Value loss: 0.032072. Entropy: 2.674478.
Iteration 5102: Policy loss: 0.066470. Value loss: 0.017075. Entropy: 2.666626.
Iteration 5103: Policy loss: 0.063163. Value loss: 0.011434. Entropy: 2.651197.
Training network. lr: 0.000211. clip: 0.084342
Iteration 5104: Policy loss: -0.025881. Value loss: 0.027181. Entropy: 2.674015.
Iteration 5105: Policy loss: -0.034500. Value loss: 0.017237. Entropy: 2.670580.
Iteration 5106: Policy loss: -0.031566. Value loss: 0.011968. Entropy: 2.665973.
episode: 1879   score: 950.0  epsilon: 1.0    steps: 309  evaluation reward: 389.2
episode: 1880   score: 490.0  epsilon: 1.0    steps: 737  evaluation reward: 390.2
Training netwo

Training network. lr: 0.000210. clip: 0.084185
Iteration 5164: Policy loss: 0.014769. Value loss: 0.017014. Entropy: 2.732439.
Iteration 5165: Policy loss: 0.014349. Value loss: 0.008490. Entropy: 2.722543.
Iteration 5166: Policy loss: 0.010706. Value loss: 0.008647. Entropy: 2.730965.
episode: 1902   score: 330.0  epsilon: 1.0    steps: 316  evaluation reward: 382.9
episode: 1903   score: 260.0  epsilon: 1.0    steps: 869  evaluation reward: 375.8
Training network. lr: 0.000210. clip: 0.084185
Iteration 5167: Policy loss: -0.038571. Value loss: 0.022731. Entropy: 2.728597.
Iteration 5168: Policy loss: -0.038631. Value loss: 0.015239. Entropy: 2.722961.
Iteration 5169: Policy loss: -0.037229. Value loss: 0.010092. Entropy: 2.728777.
episode: 1904   score: 190.0  epsilon: 1.0    steps: 174  evaluation reward: 375.2
Training network. lr: 0.000210. clip: 0.084185
Iteration 5170: Policy loss: 0.032175. Value loss: 0.015578. Entropy: 2.736795.
Iteration 5171: Policy loss: 0.028268. Value lo

Iteration 5228: Policy loss: 0.003202. Value loss: 0.010566. Entropy: 2.700258.
Iteration 5229: Policy loss: 0.006266. Value loss: 0.007498. Entropy: 2.703202.
Training network. lr: 0.000210. clip: 0.084029
Iteration 5230: Policy loss: 0.018307. Value loss: 0.013362. Entropy: 2.763566.
Iteration 5231: Policy loss: 0.015049. Value loss: 0.008149. Entropy: 2.757277.
Iteration 5232: Policy loss: 0.008952. Value loss: 0.007110. Entropy: 2.755434.
Training network. lr: 0.000210. clip: 0.084029
Iteration 5233: Policy loss: 0.003698. Value loss: 0.011295. Entropy: 2.704273.
Iteration 5234: Policy loss: 0.006042. Value loss: 0.007416. Entropy: 2.719643.
Iteration 5235: Policy loss: 0.003754. Value loss: 0.006254. Entropy: 2.709162.
episode: 1927   score: 300.0  epsilon: 1.0    steps: 257  evaluation reward: 364.8
episode: 1928   score: 250.0  epsilon: 1.0    steps: 518  evaluation reward: 364.2
Training network. lr: 0.000210. clip: 0.084029
Iteration 5236: Policy loss: 0.027859. Value loss: 0.

Iteration 5295: Policy loss: 0.008483. Value loss: 0.006179. Entropy: 2.774007.
Training network. lr: 0.000210. clip: 0.083881
Iteration 5296: Policy loss: 0.021995. Value loss: 0.019280. Entropy: 2.753750.
Iteration 5297: Policy loss: 0.013814. Value loss: 0.010619. Entropy: 2.751864.
Iteration 5298: Policy loss: 0.017328. Value loss: 0.007696. Entropy: 2.745945.
episode: 1949   score: 300.0  epsilon: 1.0    steps: 123  evaluation reward: 372.3
episode: 1950   score: 1010.0  epsilon: 1.0    steps: 159  evaluation reward: 379.4
now time :  2019-02-27 05:16:19.214321
episode: 1951   score: 480.0  epsilon: 1.0    steps: 758  evaluation reward: 380.3
Training network. lr: 0.000210. clip: 0.083881
Iteration 5299: Policy loss: 0.060859. Value loss: 0.029753. Entropy: 2.749860.
Iteration 5300: Policy loss: 0.062171. Value loss: 0.018039. Entropy: 2.755869.
Iteration 5301: Policy loss: 0.059081. Value loss: 0.010086. Entropy: 2.752099.
episode: 1952   score: 400.0  epsilon: 1.0    steps: 534 

Training network. lr: 0.000209. clip: 0.083568
Iteration 5359: Policy loss: -0.153394. Value loss: 0.864403. Entropy: 2.768559.
Iteration 5360: Policy loss: -0.114708. Value loss: 0.345261. Entropy: 2.679021.
Iteration 5361: Policy loss: -0.117282. Value loss: 0.309764. Entropy: 2.652723.
episode: 1974   score: 490.0  epsilon: 1.0    steps: 723  evaluation reward: 377.3
Training network. lr: 0.000209. clip: 0.083568
Iteration 5362: Policy loss: 0.061871. Value loss: 0.027788. Entropy: 2.614631.
Iteration 5363: Policy loss: 0.066762. Value loss: 0.014621. Entropy: 2.635870.
Iteration 5364: Policy loss: 0.068861. Value loss: 0.010995. Entropy: 2.648161.
episode: 1975   score: 300.0  epsilon: 1.0    steps: 254  evaluation reward: 377.6
episode: 1976   score: 330.0  epsilon: 1.0    steps: 548  evaluation reward: 378.1
episode: 1977   score: 270.0  epsilon: 1.0    steps: 880  evaluation reward: 377.8
Training network. lr: 0.000209. clip: 0.083568
Iteration 5365: Policy loss: 0.040183. Value

Iteration 5424: Policy loss: 0.010601. Value loss: 0.006770. Entropy: 2.738352.
episode: 1998   score: 400.0  epsilon: 1.0    steps: 547  evaluation reward: 363.1
episode: 1999   score: 220.0  epsilon: 1.0    steps: 816  evaluation reward: 361.9
Training network. lr: 0.000209. clip: 0.083420
Iteration 5425: Policy loss: 0.017138. Value loss: 0.022646. Entropy: 2.727089.
Iteration 5426: Policy loss: 0.019564. Value loss: 0.009721. Entropy: 2.703367.
Iteration 5427: Policy loss: 0.015830. Value loss: 0.007646. Entropy: 2.708258.
episode: 2000   score: 390.0  epsilon: 1.0    steps: 436  evaluation reward: 363.8
Training network. lr: 0.000209. clip: 0.083420
Iteration 5428: Policy loss: 0.004689. Value loss: 0.013911. Entropy: 2.746509.
Iteration 5429: Policy loss: 0.007523. Value loss: 0.007290. Entropy: 2.752829.
Iteration 5430: Policy loss: 0.002470. Value loss: 0.005388. Entropy: 2.741178.
now time :  2019-02-27 05:18:53.611318
episode: 2001   score: 340.0  epsilon: 1.0    steps: 80  e

Training network. lr: 0.000208. clip: 0.083264
Iteration 5488: Policy loss: 0.038829. Value loss: 0.020138. Entropy: 2.748850.
Iteration 5489: Policy loss: 0.034072. Value loss: 0.012070. Entropy: 2.741713.
Iteration 5490: Policy loss: 0.032136. Value loss: 0.011538. Entropy: 2.738477.
episode: 2023   score: 320.0  epsilon: 1.0    steps: 475  evaluation reward: 371.2
episode: 2024   score: 470.0  epsilon: 1.0    steps: 544  evaluation reward: 372.8
Training network. lr: 0.000208. clip: 0.083264
Iteration 5491: Policy loss: 0.045486. Value loss: 0.025458. Entropy: 2.712930.
Iteration 5492: Policy loss: 0.043450. Value loss: 0.013272. Entropy: 2.692347.
Iteration 5493: Policy loss: 0.040024. Value loss: 0.009087. Entropy: 2.708325.
Training network. lr: 0.000208. clip: 0.083264
Iteration 5494: Policy loss: 0.019458. Value loss: 0.015285. Entropy: 2.766474.
Iteration 5495: Policy loss: 0.014217. Value loss: 0.010976. Entropy: 2.764381.
Iteration 5496: Policy loss: 0.020238. Value loss: 0.

Iteration 5555: Policy loss: 0.059419. Value loss: 0.008835. Entropy: 2.772640.
Iteration 5556: Policy loss: 0.066077. Value loss: 0.006492. Entropy: 2.771630.
episode: 2044   score: 260.0  epsilon: 1.0    steps: 78  evaluation reward: 370.2
episode: 2045   score: 440.0  epsilon: 1.0    steps: 531  evaluation reward: 370.8
episode: 2046   score: 430.0  epsilon: 1.0    steps: 648  evaluation reward: 371.7
Training network. lr: 0.000207. clip: 0.082960
Iteration 5557: Policy loss: -0.019217. Value loss: 0.025646. Entropy: 2.767737.
Iteration 5558: Policy loss: -0.026171. Value loss: 0.014492. Entropy: 2.764097.
Iteration 5559: Policy loss: -0.023068. Value loss: 0.011284. Entropy: 2.763127.
episode: 2047   score: 350.0  epsilon: 1.0    steps: 435  evaluation reward: 371.9
episode: 2048   score: 850.0  epsilon: 1.0    steps: 926  evaluation reward: 375.1
Training network. lr: 0.000207. clip: 0.082960
Iteration 5560: Policy loss: 0.001426. Value loss: 0.028038. Entropy: 2.761981.
Iteration

Iteration 5620: Policy loss: 0.006111. Value loss: 0.028957. Entropy: 2.619847.
Iteration 5621: Policy loss: 0.010825. Value loss: 0.018145. Entropy: 2.611663.
Iteration 5622: Policy loss: -0.001529. Value loss: 0.016396. Entropy: 2.624096.
Training network. lr: 0.000207. clip: 0.082803
Iteration 5623: Policy loss: -0.136448. Value loss: 0.676003. Entropy: 2.602324.
Iteration 5624: Policy loss: -0.128464. Value loss: 0.468976. Entropy: 2.556789.
Iteration 5625: Policy loss: -0.133009. Value loss: 0.399228. Entropy: 2.530539.
episode: 2067   score: 260.0  epsilon: 1.0    steps: 43  evaluation reward: 398.6
episode: 2068   score: 440.0  epsilon: 1.0    steps: 145  evaluation reward: 399.3
episode: 2069   score: 150.0  epsilon: 1.0    steps: 541  evaluation reward: 396.7
Training network. lr: 0.000207. clip: 0.082803
Iteration 5626: Policy loss: 0.020519. Value loss: 0.056508. Entropy: 2.411307.
Iteration 5627: Policy loss: 0.018510. Value loss: 0.032160. Entropy: 2.432129.
Iteration 5628

Iteration 5685: Policy loss: 0.029719. Value loss: 0.010704. Entropy: 2.742678.
episode: 2091   score: 280.0  epsilon: 1.0    steps: 25  evaluation reward: 392.1
Training network. lr: 0.000207. clip: 0.082646
Iteration 5686: Policy loss: 0.011256. Value loss: 0.011719. Entropy: 2.753600.
Iteration 5687: Policy loss: 0.010422. Value loss: 0.007231. Entropy: 2.755388.
Iteration 5688: Policy loss: 0.007256. Value loss: 0.006635. Entropy: 2.756605.
episode: 2092   score: 310.0  epsilon: 1.0    steps: 489  evaluation reward: 392.3
episode: 2093   score: 220.0  epsilon: 1.0    steps: 589  evaluation reward: 391.5
Training network. lr: 0.000207. clip: 0.082646
Iteration 5689: Policy loss: 0.058309. Value loss: 0.021451. Entropy: 2.748666.
Iteration 5690: Policy loss: 0.063687. Value loss: 0.009566. Entropy: 2.749567.
Iteration 5691: Policy loss: 0.060925. Value loss: 0.008554. Entropy: 2.746415.
Training network. lr: 0.000207. clip: 0.082646
Iteration 5692: Policy loss: 0.032358. Value loss: 

Training network. lr: 0.000206. clip: 0.082499
Iteration 5749: Policy loss: 0.023517. Value loss: 0.016845. Entropy: 2.714955.
Iteration 5750: Policy loss: 0.027213. Value loss: 0.010134. Entropy: 2.719735.
Iteration 5751: Policy loss: 0.023189. Value loss: 0.009550. Entropy: 2.711884.
episode: 2116   score: 400.0  epsilon: 1.0    steps: 897  evaluation reward: 386.4
Training network. lr: 0.000206. clip: 0.082342
Iteration 5752: Policy loss: 0.007196. Value loss: 0.011315. Entropy: 2.751159.
Iteration 5753: Policy loss: 0.005741. Value loss: 0.006759. Entropy: 2.756698.
Iteration 5754: Policy loss: 0.007234. Value loss: 0.005371. Entropy: 2.750086.
episode: 2117   score: 400.0  epsilon: 1.0    steps: 267  evaluation reward: 387.7
Training network. lr: 0.000206. clip: 0.082342
Iteration 5755: Policy loss: 0.020669. Value loss: 0.013643. Entropy: 2.771000.
Iteration 5756: Policy loss: 0.024887. Value loss: 0.007837. Entropy: 2.771350.
Iteration 5757: Policy loss: 0.020290. Value loss: 0.

episode: 2139   score: 420.0  epsilon: 1.0    steps: 868  evaluation reward: 394.3
episode: 2140   score: 290.0  epsilon: 1.0    steps: 912  evaluation reward: 391.5
Training network. lr: 0.000205. clip: 0.082185
Iteration 5815: Policy loss: 0.018508. Value loss: 0.024600. Entropy: 2.759747.
Iteration 5816: Policy loss: 0.020092. Value loss: 0.014832. Entropy: 2.760977.
Iteration 5817: Policy loss: 0.016424. Value loss: 0.014163. Entropy: 2.753505.
Training network. lr: 0.000205. clip: 0.082185
Iteration 5818: Policy loss: 0.013562. Value loss: 0.020142. Entropy: 2.772375.
Iteration 5819: Policy loss: 0.009386. Value loss: 0.011995. Entropy: 2.769065.
Iteration 5820: Policy loss: 0.011517. Value loss: 0.009180. Entropy: 2.766713.
Training network. lr: 0.000205. clip: 0.082185
Iteration 5821: Policy loss: 0.029295. Value loss: 0.014000. Entropy: 2.774256.
Iteration 5822: Policy loss: 0.027295. Value loss: 0.008290. Entropy: 2.779786.
Iteration 5823: Policy loss: 0.026177. Value loss: 0.

Iteration 5881: Policy loss: -0.019977. Value loss: 0.012711. Entropy: 2.793407.
Iteration 5882: Policy loss: -0.018660. Value loss: 0.006227. Entropy: 2.788236.
Iteration 5883: Policy loss: -0.020265. Value loss: 0.004269. Entropy: 2.788755.
episode: 2161   score: 380.0  epsilon: 1.0    steps: 940  evaluation reward: 369.3
Training network. lr: 0.000205. clip: 0.082038
Iteration 5884: Policy loss: 0.026676. Value loss: 0.011053. Entropy: 2.775608.
Iteration 5885: Policy loss: 0.022148. Value loss: 0.006435. Entropy: 2.778544.
Iteration 5886: Policy loss: 0.024901. Value loss: 0.005570. Entropy: 2.779045.
episode: 2162   score: 360.0  epsilon: 1.0    steps: 144  evaluation reward: 369.9
Training network. lr: 0.000205. clip: 0.082038
Iteration 5887: Policy loss: -0.018980. Value loss: 0.015995. Entropy: 2.795450.
Iteration 5888: Policy loss: -0.021393. Value loss: 0.009252. Entropy: 2.786059.
Iteration 5889: Policy loss: -0.028017. Value loss: 0.007344. Entropy: 2.782128.
episode: 2163 

Training network. lr: 0.000205. clip: 0.081881
Iteration 5947: Policy loss: 0.081606. Value loss: 0.020178. Entropy: 2.778325.
Iteration 5948: Policy loss: 0.081187. Value loss: 0.011308. Entropy: 2.780285.
Iteration 5949: Policy loss: 0.077108. Value loss: 0.009795. Entropy: 2.773398.
episode: 2184   score: 240.0  epsilon: 1.0    steps: 939  evaluation reward: 368.1
Training network. lr: 0.000205. clip: 0.081881
Iteration 5950: Policy loss: 0.068770. Value loss: 0.020205. Entropy: 2.772889.
Iteration 5951: Policy loss: 0.057043. Value loss: 0.013966. Entropy: 2.767390.
Iteration 5952: Policy loss: 0.065762. Value loss: 0.009481. Entropy: 2.764796.
Training network. lr: 0.000204. clip: 0.081725
Iteration 5953: Policy loss: 0.070553. Value loss: 0.021719. Entropy: 2.780476.
Iteration 5954: Policy loss: 0.068836. Value loss: 0.012569. Entropy: 2.779032.
Iteration 5955: Policy loss: 0.070091. Value loss: 0.009993. Entropy: 2.777688.
episode: 2185   score: 370.0  epsilon: 1.0    steps: 10 

Iteration 6012: Policy loss: 0.048529. Value loss: 0.014068. Entropy: 2.421608.
Training network. lr: 0.000204. clip: 0.081577
Iteration 6013: Policy loss: 0.075313. Value loss: 0.035019. Entropy: 2.418099.
Iteration 6014: Policy loss: 0.068725. Value loss: 0.019816. Entropy: 2.403871.
Iteration 6015: Policy loss: 0.068106. Value loss: 0.013065. Entropy: 2.412461.
episode: 2207   score: 260.0  epsilon: 1.0    steps: 974  evaluation reward: 408.0
Training network. lr: 0.000204. clip: 0.081577
Iteration 6016: Policy loss: 0.024773. Value loss: 0.025684. Entropy: 2.487293.
Iteration 6017: Policy loss: 0.026041. Value loss: 0.013231. Entropy: 2.474636.
Iteration 6018: Policy loss: 0.023225. Value loss: 0.010483. Entropy: 2.476373.
episode: 2208   score: 250.0  epsilon: 1.0    steps: 705  evaluation reward: 407.6
Training network. lr: 0.000204. clip: 0.081577
Iteration 6019: Policy loss: 0.047085. Value loss: 0.024111. Entropy: 2.467114.
Iteration 6020: Policy loss: 0.036374. Value loss: 0.

Iteration 6077: Policy loss: 0.031329. Value loss: 0.018431. Entropy: 2.709374.
Iteration 6078: Policy loss: 0.021695. Value loss: 0.014696. Entropy: 2.707472.
episode: 2231   score: 210.0  epsilon: 1.0    steps: 832  evaluation reward: 390.4
Training network. lr: 0.000204. clip: 0.081421
Iteration 6079: Policy loss: 0.073564. Value loss: 0.030949. Entropy: 2.676017.
Iteration 6080: Policy loss: 0.080733. Value loss: 0.017240. Entropy: 2.684985.
Iteration 6081: Policy loss: 0.072352. Value loss: 0.012697. Entropy: 2.686466.
episode: 2232   score: 240.0  epsilon: 1.0    steps: 482  evaluation reward: 389.6
Training network. lr: 0.000204. clip: 0.081421
Iteration 6082: Policy loss: 0.050392. Value loss: 0.024583. Entropy: 2.728170.
Iteration 6083: Policy loss: 0.045244. Value loss: 0.014273. Entropy: 2.723720.
Iteration 6084: Policy loss: 0.044528. Value loss: 0.012494. Entropy: 2.721286.
Training network. lr: 0.000204. clip: 0.081421
Iteration 6085: Policy loss: 0.095936. Value loss: 0.

Iteration 6142: Policy loss: 0.018274. Value loss: 0.013303. Entropy: 2.750014.
Iteration 6143: Policy loss: 0.013890. Value loss: 0.009093. Entropy: 2.750953.
Iteration 6144: Policy loss: 0.018887. Value loss: 0.006792. Entropy: 2.747236.
Training network. lr: 0.000203. clip: 0.081264
Iteration 6145: Policy loss: -0.049904. Value loss: 0.014953. Entropy: 2.757426.
Iteration 6146: Policy loss: -0.047934. Value loss: 0.010161. Entropy: 2.758915.
Iteration 6147: Policy loss: -0.050679. Value loss: 0.007479. Entropy: 2.759406.
Training network. lr: 0.000203. clip: 0.081264
Iteration 6148: Policy loss: 0.026408. Value loss: 0.015697. Entropy: 2.742195.
Iteration 6149: Policy loss: 0.025702. Value loss: 0.007279. Entropy: 2.760827.
Iteration 6150: Policy loss: 0.020408. Value loss: 0.006744. Entropy: 2.762415.
episode: 2254   score: 330.0  epsilon: 1.0    steps: 197  evaluation reward: 388.8
Training network. lr: 0.000203. clip: 0.081116
Iteration 6151: Policy loss: -0.005709. Value loss: 0

Iteration 6210: Policy loss: 0.105474. Value loss: 0.009952. Entropy: 2.789004.
episode: 2275   score: 230.0  epsilon: 1.0    steps: 216  evaluation reward: 400.3
episode: 2276   score: 410.0  epsilon: 1.0    steps: 690  evaluation reward: 400.1
Training network. lr: 0.000202. clip: 0.080960
Iteration 6211: Policy loss: 0.022250. Value loss: 0.029787. Entropy: 2.735928.
Iteration 6212: Policy loss: 0.024415. Value loss: 0.017298. Entropy: 2.734524.
Iteration 6213: Policy loss: 0.013610. Value loss: 0.014220. Entropy: 2.736824.
episode: 2277   score: 340.0  epsilon: 1.0    steps: 892  evaluation reward: 400.2
Training network. lr: 0.000202. clip: 0.080960
Iteration 6214: Policy loss: 0.101909. Value loss: 0.031313. Entropy: 2.751175.
Iteration 6215: Policy loss: 0.097884. Value loss: 0.014868. Entropy: 2.746634.
Iteration 6216: Policy loss: 0.092045. Value loss: 0.010230. Entropy: 2.744058.
episode: 2278   score: 450.0  epsilon: 1.0    steps: 1005  evaluation reward: 402.1
Training netw

episode: 2297   score: 300.0  epsilon: 1.0    steps: 462  evaluation reward: 392.0
Training network. lr: 0.000202. clip: 0.080803
Iteration 6277: Policy loss: 0.085482. Value loss: 0.051786. Entropy: 2.689595.
Iteration 6278: Policy loss: 0.064042. Value loss: 0.027545. Entropy: 2.687438.
Iteration 6279: Policy loss: 0.089516. Value loss: 0.014645. Entropy: 2.681288.
episode: 2298   score: 330.0  epsilon: 1.0    steps: 360  evaluation reward: 390.9
Training network. lr: 0.000202. clip: 0.080803
Iteration 6280: Policy loss: -0.251042. Value loss: 1.500419. Entropy: 2.676781.
Iteration 6281: Policy loss: -0.283833. Value loss: 1.080598. Entropy: 2.573646.
Iteration 6282: Policy loss: -0.284318. Value loss: 0.653840. Entropy: 2.587748.
episode: 2299   score: 330.0  epsilon: 1.0    steps: 541  evaluation reward: 391.7
Training network. lr: 0.000202. clip: 0.080803
Iteration 6283: Policy loss: 0.026762. Value loss: 0.672344. Entropy: 2.623680.
Iteration 6284: Policy loss: 0.027569. Value lo

Iteration 6342: Policy loss: 0.034076. Value loss: 0.024268. Entropy: 2.720501.
Training network. lr: 0.000202. clip: 0.080656
Iteration 6343: Policy loss: 0.009029. Value loss: 0.030677. Entropy: 2.717491.
Iteration 6344: Policy loss: 0.011831. Value loss: 0.018110. Entropy: 2.723053.
Iteration 6345: Policy loss: 0.003059. Value loss: 0.016198. Entropy: 2.718816.
episode: 2320   score: 400.0  epsilon: 1.0    steps: 196  evaluation reward: 445.3
episode: 2321   score: 420.0  epsilon: 1.0    steps: 606  evaluation reward: 447.3
Training network. lr: 0.000202. clip: 0.080656
Iteration 6346: Policy loss: 0.159716. Value loss: 0.054287. Entropy: 2.738433.
Iteration 6347: Policy loss: 0.149945. Value loss: 0.028206. Entropy: 2.738738.
Iteration 6348: Policy loss: 0.161318. Value loss: 0.019460. Entropy: 2.741307.
Training network. lr: 0.000202. clip: 0.080656
Iteration 6349: Policy loss: -0.049711. Value loss: 0.645941. Entropy: 2.723705.
Iteration 6350: Policy loss: -0.048285. Value loss: 

episode: 2342   score: 360.0  epsilon: 1.0    steps: 111  evaluation reward: 466.6
Training network. lr: 0.000201. clip: 0.080342
Iteration 6409: Policy loss: 0.013486. Value loss: 0.027970. Entropy: 2.623279.
Iteration 6410: Policy loss: 0.011336. Value loss: 0.020024. Entropy: 2.630598.
Iteration 6411: Policy loss: 0.005502. Value loss: 0.015838. Entropy: 2.625249.
episode: 2343   score: 1010.0  epsilon: 1.0    steps: 168  evaluation reward: 473.6
episode: 2344   score: 390.0  epsilon: 1.0    steps: 536  evaluation reward: 474.0
episode: 2345   score: 360.0  epsilon: 1.0    steps: 719  evaluation reward: 473.8
Training network. lr: 0.000201. clip: 0.080342
Iteration 6412: Policy loss: 0.021941. Value loss: 0.034880. Entropy: 2.585008.
Iteration 6413: Policy loss: 0.016999. Value loss: 0.025011. Entropy: 2.586388.
Iteration 6414: Policy loss: 0.018576. Value loss: 0.017671. Entropy: 2.589621.
Training network. lr: 0.000201. clip: 0.080342
Iteration 6415: Policy loss: -0.000206. Value 

Iteration 6474: Policy loss: 0.022349. Value loss: 0.018306. Entropy: 2.673470.
episode: 2365   score: 370.0  epsilon: 1.0    steps: 965  evaluation reward: 493.3
Training network. lr: 0.000200. clip: 0.080195
Iteration 6475: Policy loss: 0.121029. Value loss: 0.032706. Entropy: 2.694769.
Iteration 6476: Policy loss: 0.121590. Value loss: 0.017378. Entropy: 2.696332.
Iteration 6477: Policy loss: 0.110812. Value loss: 0.014246. Entropy: 2.686764.
episode: 2366   score: 430.0  epsilon: 1.0    steps: 750  evaluation reward: 493.6
Training network. lr: 0.000200. clip: 0.080195
Iteration 6478: Policy loss: 0.040925. Value loss: 0.029853. Entropy: 2.672443.
Iteration 6479: Policy loss: 0.038267. Value loss: 0.018424. Entropy: 2.671077.
Iteration 6480: Policy loss: 0.033372. Value loss: 0.013923. Entropy: 2.674168.
episode: 2367   score: 380.0  epsilon: 1.0    steps: 269  evaluation reward: 493.2
Training network. lr: 0.000200. clip: 0.080195
Iteration 6481: Policy loss: 0.055223. Value loss:

Iteration 6540: Policy loss: 0.044972. Value loss: 0.024176. Entropy: 2.679743.
episode: 2388   score: 390.0  epsilon: 1.0    steps: 458  evaluation reward: 498.1
Training network. lr: 0.000200. clip: 0.080038
Iteration 6541: Policy loss: 0.164858. Value loss: 0.048648. Entropy: 2.683471.
Iteration 6542: Policy loss: 0.159441. Value loss: 0.026584. Entropy: 2.683841.
Iteration 6543: Policy loss: 0.161467. Value loss: 0.021605. Entropy: 2.683135.
Training network. lr: 0.000200. clip: 0.080038
Iteration 6544: Policy loss: 0.116370. Value loss: 0.036370. Entropy: 2.698131.
Iteration 6545: Policy loss: 0.112565. Value loss: 0.021436. Entropy: 2.693027.
Iteration 6546: Policy loss: 0.111176. Value loss: 0.015922. Entropy: 2.696743.
episode: 2389   score: 310.0  epsilon: 1.0    steps: 596  evaluation reward: 497.2
episode: 2390   score: 290.0  epsilon: 1.0    steps: 823  evaluation reward: 497.2
Training network. lr: 0.000200. clip: 0.080038
Iteration 6547: Policy loss: 0.049303. Value loss:

Iteration 6606: Policy loss: 0.192285. Value loss: 0.024720. Entropy: 2.666266.
episode: 2411   score: 530.0  epsilon: 1.0    steps: 121  evaluation reward: 469.9
episode: 2412   score: 200.0  epsilon: 1.0    steps: 1004  evaluation reward: 469.7
Training network. lr: 0.000199. clip: 0.079734
Iteration 6607: Policy loss: 0.120006. Value loss: 0.052033. Entropy: 2.677320.
Iteration 6608: Policy loss: 0.106352. Value loss: 0.017187. Entropy: 2.678092.
Iteration 6609: Policy loss: 0.113246. Value loss: 0.016806. Entropy: 2.671151.
episode: 2413   score: 280.0  epsilon: 1.0    steps: 530  evaluation reward: 467.8
episode: 2414   score: 410.0  epsilon: 1.0    steps: 876  evaluation reward: 467.3
Training network. lr: 0.000199. clip: 0.079734
Iteration 6610: Policy loss: 0.064169. Value loss: 0.017661. Entropy: 2.651414.
Iteration 6611: Policy loss: 0.063918. Value loss: 0.014702. Entropy: 2.665884.
Iteration 6612: Policy loss: 0.061982. Value loss: 0.010647. Entropy: 2.654287.
Training netw

Iteration 6671: Policy loss: 0.049144. Value loss: 0.009885. Entropy: 2.716634.
Iteration 6672: Policy loss: 0.044647. Value loss: 0.010577. Entropy: 2.712375.
Training network. lr: 0.000199. clip: 0.079577
Iteration 6673: Policy loss: -0.004554. Value loss: 0.013426. Entropy: 2.719115.
Iteration 6674: Policy loss: -0.003627. Value loss: 0.007728. Entropy: 2.719753.
Iteration 6675: Policy loss: -0.005201. Value loss: 0.006311. Entropy: 2.711410.
episode: 2435   score: 450.0  epsilon: 1.0    steps: 134  evaluation reward: 452.0
episode: 2436   score: 340.0  epsilon: 1.0    steps: 940  evaluation reward: 449.8
Training network. lr: 0.000199. clip: 0.079577
Iteration 6676: Policy loss: 0.047119. Value loss: 0.014586. Entropy: 2.731348.
Iteration 6677: Policy loss: 0.044485. Value loss: 0.008844. Entropy: 2.734345.
Iteration 6678: Policy loss: 0.047206. Value loss: 0.007560. Entropy: 2.734733.
episode: 2437   score: 410.0  epsilon: 1.0    steps: 630  evaluation reward: 444.7
Training netwo

Iteration 6736: Policy loss: 0.104324. Value loss: 0.043761. Entropy: 2.685041.
Iteration 6737: Policy loss: 0.100523. Value loss: 0.026638. Entropy: 2.690854.
Iteration 6738: Policy loss: 0.097248. Value loss: 0.018910. Entropy: 2.684947.
episode: 2458   score: 350.0  epsilon: 1.0    steps: 136  evaluation reward: 432.7
Training network. lr: 0.000199. clip: 0.079421
Iteration 6739: Policy loss: 0.062761. Value loss: 0.022378. Entropy: 2.669635.
Iteration 6740: Policy loss: 0.060456. Value loss: 0.013368. Entropy: 2.670734.
Iteration 6741: Policy loss: 0.065219. Value loss: 0.010832. Entropy: 2.665590.
episode: 2459   score: 470.0  epsilon: 1.0    steps: 341  evaluation reward: 427.2
Training network. lr: 0.000199. clip: 0.079421
Iteration 6742: Policy loss: 0.075176. Value loss: 0.031096. Entropy: 2.662829.
Iteration 6743: Policy loss: 0.071569. Value loss: 0.016402. Entropy: 2.674923.
Iteration 6744: Policy loss: 0.067707. Value loss: 0.013499. Entropy: 2.664165.
episode: 2460   scor

Iteration 6802: Policy loss: 0.084457. Value loss: 0.040221. Entropy: 2.700191.
Iteration 6803: Policy loss: 0.084041. Value loss: 0.019183. Entropy: 2.681586.
Iteration 6804: Policy loss: 0.082576. Value loss: 0.015259. Entropy: 2.693153.
episode: 2481   score: 390.0  epsilon: 1.0    steps: 457  evaluation reward: 443.5
Training network. lr: 0.000198. clip: 0.079117
Iteration 6805: Policy loss: -0.006530. Value loss: 0.036627. Entropy: 2.684252.
Iteration 6806: Policy loss: 0.003186. Value loss: 0.021918. Entropy: 2.684521.
Iteration 6807: Policy loss: -0.018207. Value loss: 0.016019. Entropy: 2.685397.
Training network. lr: 0.000198. clip: 0.079117
Iteration 6808: Policy loss: 0.035436. Value loss: 0.016010. Entropy: 2.728382.
Iteration 6809: Policy loss: 0.036068. Value loss: 0.009593. Entropy: 2.738331.
Iteration 6810: Policy loss: 0.035312. Value loss: 0.007553. Entropy: 2.736106.
Training network. lr: 0.000198. clip: 0.079117
Iteration 6811: Policy loss: 0.085386. Value loss: 0.0

Iteration 6868: Policy loss: -0.049258. Value loss: 0.030683. Entropy: 2.636771.
Iteration 6869: Policy loss: -0.041984. Value loss: 0.015024. Entropy: 2.631915.
Iteration 6870: Policy loss: -0.044245. Value loss: 0.010775. Entropy: 2.633191.
episode: 2503   score: 550.0  epsilon: 1.0    steps: 321  evaluation reward: 433.7
episode: 2504   score: 1000.0  epsilon: 1.0    steps: 762  evaluation reward: 440.6
Training network. lr: 0.000197. clip: 0.078960
Iteration 6871: Policy loss: -0.001455. Value loss: 0.035533. Entropy: 2.610553.
Iteration 6872: Policy loss: -0.008633. Value loss: 0.018616. Entropy: 2.625833.
Iteration 6873: Policy loss: -0.008822. Value loss: 0.014231. Entropy: 2.615668.
episode: 2505   score: 280.0  epsilon: 1.0    steps: 886  evaluation reward: 424.3
Training network. lr: 0.000197. clip: 0.078960
Iteration 6874: Policy loss: -0.015846. Value loss: 0.027316. Entropy: 2.630728.
Iteration 6875: Policy loss: -0.010104. Value loss: 0.017511. Entropy: 2.638392.
Iteratio

Iteration 6935: Policy loss: 0.034644. Value loss: 0.016485. Entropy: 2.731843.
Iteration 6936: Policy loss: 0.028670. Value loss: 0.014673. Entropy: 2.728544.
episode: 2525   score: 400.0  epsilon: 1.0    steps: 969  evaluation reward: 421.2
Training network. lr: 0.000197. clip: 0.078812
Iteration 6937: Policy loss: 0.019583. Value loss: 0.017061. Entropy: 2.728479.
Iteration 6938: Policy loss: 0.019936. Value loss: 0.008845. Entropy: 2.727126.
Iteration 6939: Policy loss: 0.016834. Value loss: 0.008054. Entropy: 2.728830.
episode: 2526   score: 250.0  epsilon: 1.0    steps: 172  evaluation reward: 419.4
episode: 2527   score: 390.0  epsilon: 1.0    steps: 571  evaluation reward: 420.2
episode: 2528   score: 400.0  epsilon: 1.0    steps: 739  evaluation reward: 420.3
Training network. lr: 0.000197. clip: 0.078812
Iteration 6940: Policy loss: -0.001111. Value loss: 0.023887. Entropy: 2.699640.
Iteration 6941: Policy loss: 0.002004. Value loss: 0.013564. Entropy: 2.703522.
Iteration 694

episode: 2546   score: 350.0  epsilon: 1.0    steps: 218  evaluation reward: 431.7
episode: 2547   score: 380.0  epsilon: 1.0    steps: 969  evaluation reward: 431.8
Training network. lr: 0.000196. clip: 0.078499
Iteration 7003: Policy loss: 0.062762. Value loss: 0.013477. Entropy: 2.753531.
Iteration 7004: Policy loss: 0.057069. Value loss: 0.008676. Entropy: 2.746585.
Iteration 7005: Policy loss: 0.058926. Value loss: 0.006960. Entropy: 2.747076.
Training network. lr: 0.000196. clip: 0.078499
Iteration 7006: Policy loss: -0.022146. Value loss: 0.017950. Entropy: 2.726366.
Iteration 7007: Policy loss: -0.024045. Value loss: 0.009441. Entropy: 2.729635.
Iteration 7008: Policy loss: -0.023735. Value loss: 0.007100. Entropy: 2.728565.
episode: 2548   score: 350.0  epsilon: 1.0    steps: 49  evaluation reward: 431.3
episode: 2549   score: 360.0  epsilon: 1.0    steps: 331  evaluation reward: 431.7
episode: 2550   score: 330.0  epsilon: 1.0    steps: 766  evaluation reward: 432.3
Training 

Iteration 7067: Policy loss: 0.023640. Value loss: 0.015494. Entropy: 2.764117.
Iteration 7068: Policy loss: 0.024841. Value loss: 0.012221. Entropy: 2.767374.
episode: 2570   score: 280.0  epsilon: 1.0    steps: 377  evaluation reward: 415.4
Training network. lr: 0.000196. clip: 0.078352
Iteration 7069: Policy loss: 0.024014. Value loss: 0.022379. Entropy: 2.729273.
Iteration 7070: Policy loss: 0.020502. Value loss: 0.012092. Entropy: 2.734509.
Iteration 7071: Policy loss: 0.021688. Value loss: 0.010221. Entropy: 2.732723.
episode: 2571   score: 430.0  epsilon: 1.0    steps: 2  evaluation reward: 409.8
episode: 2572   score: 350.0  epsilon: 1.0    steps: 946  evaluation reward: 409.1
Training network. lr: 0.000196. clip: 0.078352
Iteration 7072: Policy loss: -0.004688. Value loss: 0.018405. Entropy: 2.720025.
Iteration 7073: Policy loss: -0.001520. Value loss: 0.011937. Entropy: 2.724781.
Iteration 7074: Policy loss: -0.006945. Value loss: 0.009744. Entropy: 2.719357.
episode: 2573   

Iteration 7133: Policy loss: 0.025294. Value loss: 0.013965. Entropy: 2.767374.
Iteration 7134: Policy loss: 0.017248. Value loss: 0.011022. Entropy: 2.764057.
Training network. lr: 0.000195. clip: 0.078195
Iteration 7135: Policy loss: 0.012291. Value loss: 0.011557. Entropy: 2.782222.
Iteration 7136: Policy loss: 0.022605. Value loss: 0.007797. Entropy: 2.786542.
Iteration 7137: Policy loss: 0.008458. Value loss: 0.005770. Entropy: 2.782505.
episode: 2593   score: 310.0  epsilon: 1.0    steps: 132  evaluation reward: 384.0
episode: 2594   score: 450.0  epsilon: 1.0    steps: 730  evaluation reward: 379.4
episode: 2595   score: 420.0  epsilon: 1.0    steps: 985  evaluation reward: 379.5
Training network. lr: 0.000195. clip: 0.078195
Iteration 7138: Policy loss: 0.016711. Value loss: 0.019767. Entropy: 2.785547.
Iteration 7139: Policy loss: 0.019635. Value loss: 0.013988. Entropy: 2.781907.
Iteration 7140: Policy loss: 0.020557. Value loss: 0.012146. Entropy: 2.781091.
episode: 2596   s

Iteration 7198: Policy loss: 0.020626. Value loss: 0.012520. Entropy: 2.801933.
Iteration 7199: Policy loss: 0.023460. Value loss: 0.007309. Entropy: 2.799757.
Iteration 7200: Policy loss: 0.020357. Value loss: 0.005246. Entropy: 2.798372.
episode: 2616   score: 460.0  epsilon: 1.0    steps: 543  evaluation reward: 385.9
Training network. lr: 0.000195. clip: 0.077891
Iteration 7201: Policy loss: 0.029849. Value loss: 0.011009. Entropy: 2.789881.
Iteration 7202: Policy loss: 0.026804. Value loss: 0.006997. Entropy: 2.796670.
Iteration 7203: Policy loss: 0.026359. Value loss: 0.005501. Entropy: 2.793681.
episode: 2617   score: 250.0  epsilon: 1.0    steps: 989  evaluation reward: 384.4
Training network. lr: 0.000195. clip: 0.077891
Iteration 7204: Policy loss: 0.027445. Value loss: 0.015379. Entropy: 2.804610.
Iteration 7205: Policy loss: 0.024061. Value loss: 0.008710. Entropy: 2.804094.
Iteration 7206: Policy loss: 0.021913. Value loss: 0.007990. Entropy: 2.802631.
Training network. lr

episode: 2636   score: 410.0  epsilon: 1.0    steps: 292  evaluation reward: 375.5
episode: 2637   score: 300.0  epsilon: 1.0    steps: 804  evaluation reward: 375.9
Training network. lr: 0.000194. clip: 0.077734
Iteration 7267: Policy loss: 0.038249. Value loss: 0.013824. Entropy: 2.760026.
Iteration 7268: Policy loss: 0.034468. Value loss: 0.007397. Entropy: 2.751645.
Iteration 7269: Policy loss: 0.030839. Value loss: 0.008887. Entropy: 2.760945.
episode: 2638   score: 290.0  epsilon: 1.0    steps: 141  evaluation reward: 374.3
episode: 2639   score: 420.0  epsilon: 1.0    steps: 450  evaluation reward: 374.4
episode: 2640   score: 440.0  epsilon: 1.0    steps: 742  evaluation reward: 374.9
Training network. lr: 0.000194. clip: 0.077734
Iteration 7270: Policy loss: 0.013322. Value loss: 0.018212. Entropy: 2.727102.
Iteration 7271: Policy loss: 0.003410. Value loss: 0.016274. Entropy: 2.727044.
Iteration 7272: Policy loss: 0.007613. Value loss: 0.009175. Entropy: 2.733019.
episode: 26

Iteration 7330: Policy loss: 0.054304. Value loss: 0.013058. Entropy: 2.706740.
Iteration 7331: Policy loss: 0.049044. Value loss: 0.008944. Entropy: 2.708177.
Iteration 7332: Policy loss: 0.051818. Value loss: 0.006877. Entropy: 2.704792.
episode: 2661   score: 240.0  epsilon: 1.0    steps: 19  evaluation reward: 395.5
Training network. lr: 0.000194. clip: 0.077577
Iteration 7333: Policy loss: 0.075501. Value loss: 0.019567. Entropy: 2.703840.
Iteration 7334: Policy loss: 0.062494. Value loss: 0.017498. Entropy: 2.696403.
Iteration 7335: Policy loss: 0.065711. Value loss: 0.014169. Entropy: 2.693709.
episode: 2662   score: 270.0  epsilon: 1.0    steps: 417  evaluation reward: 393.7
Training network. lr: 0.000194. clip: 0.077577
Iteration 7336: Policy loss: 0.053752. Value loss: 0.016693. Entropy: 2.709760.
Iteration 7337: Policy loss: 0.056441. Value loss: 0.009890. Entropy: 2.702283.
Iteration 7338: Policy loss: 0.051455. Value loss: 0.009151. Entropy: 2.706663.
Training network. lr:

episode: 2684   score: 310.0  epsilon: 1.0    steps: 536  evaluation reward: 404.3
Training network. lr: 0.000194. clip: 0.077430
Iteration 7396: Policy loss: 0.009682. Value loss: 0.026161. Entropy: 2.713192.
Iteration 7397: Policy loss: 0.008201. Value loss: 0.016605. Entropy: 2.709270.
Iteration 7398: Policy loss: 0.007813. Value loss: 0.014071. Entropy: 2.719699.
episode: 2685   score: 310.0  epsilon: 1.0    steps: 695  evaluation reward: 403.4
Training network. lr: 0.000194. clip: 0.077430
Iteration 7399: Policy loss: 0.055367. Value loss: 0.019781. Entropy: 2.725775.
Iteration 7400: Policy loss: 0.045194. Value loss: 0.013815. Entropy: 2.735351.
Iteration 7401: Policy loss: 0.051426. Value loss: 0.008820. Entropy: 2.732326.
episode: 2686   score: 300.0  epsilon: 1.0    steps: 127  evaluation reward: 403.3
episode: 2687   score: 360.0  epsilon: 1.0    steps: 140  evaluation reward: 402.0
episode: 2688   score: 340.0  epsilon: 1.0    steps: 332  evaluation reward: 400.6
episode: 26

episode: 2710   score: 400.0  epsilon: 1.0    steps: 695  evaluation reward: 391.7
episode: 2711   score: 370.0  epsilon: 1.0    steps: 960  evaluation reward: 393.5
Training network. lr: 0.000193. clip: 0.077117
Iteration 7459: Policy loss: 0.031916. Value loss: 0.018337. Entropy: 2.744478.
Iteration 7460: Policy loss: 0.036589. Value loss: 0.011419. Entropy: 2.749346.
Iteration 7461: Policy loss: 0.036077. Value loss: 0.010487. Entropy: 2.748958.
episode: 2712   score: 290.0  epsilon: 1.0    steps: 313  evaluation reward: 393.9
Training network. lr: 0.000193. clip: 0.077117
Iteration 7462: Policy loss: 0.024196. Value loss: 0.013105. Entropy: 2.733002.
Iteration 7463: Policy loss: 0.021607. Value loss: 0.008201. Entropy: 2.732576.
Iteration 7464: Policy loss: 0.026823. Value loss: 0.007589. Entropy: 2.733822.
episode: 2713   score: 250.0  epsilon: 1.0    steps: 412  evaluation reward: 392.4
Training network. lr: 0.000193. clip: 0.077117
Iteration 7465: Policy loss: 0.006454. Value lo

Iteration 7523: Policy loss: 0.015507. Value loss: 0.009036. Entropy: 2.772349.
Iteration 7524: Policy loss: 0.014013. Value loss: 0.007156. Entropy: 2.770928.
episode: 2735   score: 330.0  epsilon: 1.0    steps: 145  evaluation reward: 376.3
Training network. lr: 0.000192. clip: 0.076969
Iteration 7525: Policy loss: -0.035629. Value loss: 0.011059. Entropy: 2.726869.
Iteration 7526: Policy loss: -0.038530. Value loss: 0.009540. Entropy: 2.731772.
Iteration 7527: Policy loss: -0.035180. Value loss: 0.006465. Entropy: 2.736428.
episode: 2736   score: 340.0  epsilon: 1.0    steps: 44  evaluation reward: 375.6
episode: 2737   score: 430.0  epsilon: 1.0    steps: 351  evaluation reward: 376.9
episode: 2738   score: 360.0  epsilon: 1.0    steps: 473  evaluation reward: 377.6
episode: 2739   score: 310.0  epsilon: 1.0    steps: 672  evaluation reward: 376.5
Training network. lr: 0.000192. clip: 0.076969
Iteration 7528: Policy loss: 0.041712. Value loss: 0.013533. Entropy: 2.733488.
Iteration

Iteration 7587: Policy loss: 0.026496. Value loss: 0.015112. Entropy: 2.726786.
episode: 2760   score: 430.0  epsilon: 1.0    steps: 238  evaluation reward: 367.3
Training network. lr: 0.000192. clip: 0.076813
Iteration 7588: Policy loss: 0.026933. Value loss: 0.017010. Entropy: 2.725105.
Iteration 7589: Policy loss: 0.021631. Value loss: 0.010014. Entropy: 2.717523.
Iteration 7590: Policy loss: 0.027901. Value loss: 0.007346. Entropy: 2.721200.
episode: 2761   score: 260.0  epsilon: 1.0    steps: 597  evaluation reward: 367.5
Training network. lr: 0.000192. clip: 0.076813
Iteration 7591: Policy loss: -0.002581. Value loss: 0.013751. Entropy: 2.723444.
Iteration 7592: Policy loss: -0.004740. Value loss: 0.009515. Entropy: 2.729380.
Iteration 7593: Policy loss: -0.003650. Value loss: 0.007586. Entropy: 2.728185.
episode: 2762   score: 420.0  epsilon: 1.0    steps: 97  evaluation reward: 369.0
Training network. lr: 0.000192. clip: 0.076813
Iteration 7594: Policy loss: 0.010059. Value los

Iteration 7653: Policy loss: 0.006775. Value loss: 0.009318. Entropy: 2.740208.
episode: 2783   score: 300.0  epsilon: 1.0    steps: 101  evaluation reward: 387.5
episode: 2784   score: 330.0  epsilon: 1.0    steps: 213  evaluation reward: 387.7
Training network. lr: 0.000191. clip: 0.076508
Iteration 7654: Policy loss: 0.042551. Value loss: 0.020763. Entropy: 2.712748.
Iteration 7655: Policy loss: 0.040029. Value loss: 0.010803. Entropy: 2.721031.
Iteration 7656: Policy loss: 0.034301. Value loss: 0.011475. Entropy: 2.718285.
Training network. lr: 0.000191. clip: 0.076508
Iteration 7657: Policy loss: 0.001469. Value loss: 0.017285. Entropy: 2.742884.
Iteration 7658: Policy loss: 0.000845. Value loss: 0.011295. Entropy: 2.752174.
Iteration 7659: Policy loss: -0.002995. Value loss: 0.008673. Entropy: 2.751461.
episode: 2785   score: 320.0  epsilon: 1.0    steps: 324  evaluation reward: 387.8
Training network. lr: 0.000191. clip: 0.076508
Iteration 7660: Policy loss: -0.016243. Value los

Training network. lr: 0.000191. clip: 0.076352
Iteration 7717: Policy loss: 0.002805. Value loss: 0.029149. Entropy: 2.683148.
Iteration 7718: Policy loss: 0.009193. Value loss: 0.019834. Entropy: 2.689623.
Iteration 7719: Policy loss: -0.002195. Value loss: 0.014518. Entropy: 2.688527.
episode: 2808   score: 930.0  epsilon: 1.0    steps: 663  evaluation reward: 398.9
Training network. lr: 0.000191. clip: 0.076352
Iteration 7720: Policy loss: 0.000031. Value loss: 0.023139. Entropy: 2.727742.
Iteration 7721: Policy loss: -0.007459. Value loss: 0.018949. Entropy: 2.725777.
Iteration 7722: Policy loss: -0.000719. Value loss: 0.015968. Entropy: 2.723147.
episode: 2809   score: 380.0  epsilon: 1.0    steps: 1019  evaluation reward: 399.5
Training network. lr: 0.000191. clip: 0.076352
Iteration 7723: Policy loss: 0.045563. Value loss: 0.029693. Entropy: 2.731863.
Iteration 7724: Policy loss: 0.037817. Value loss: 0.019129. Entropy: 2.726526.
Iteration 7725: Policy loss: 0.040959. Value loss

Iteration 7783: Policy loss: 0.016987. Value loss: 0.015428. Entropy: 2.761991.
Iteration 7784: Policy loss: 0.011715. Value loss: 0.012229. Entropy: 2.756528.
Iteration 7785: Policy loss: 0.012565. Value loss: 0.008702. Entropy: 2.757975.
episode: 2830   score: 410.0  epsilon: 1.0    steps: 336  evaluation reward: 401.1
episode: 2831   score: 320.0  epsilon: 1.0    steps: 635  evaluation reward: 400.0
episode: 2832   score: 250.0  epsilon: 1.0    steps: 927  evaluation reward: 399.0
Training network. lr: 0.000190. clip: 0.076195
Iteration 7786: Policy loss: 0.054624. Value loss: 0.019047. Entropy: 2.763241.
Iteration 7787: Policy loss: 0.051010. Value loss: 0.011769. Entropy: 2.765440.
Iteration 7788: Policy loss: 0.049951. Value loss: 0.011413. Entropy: 2.765240.
Training network. lr: 0.000190. clip: 0.076195
Iteration 7789: Policy loss: 0.017645. Value loss: 0.015451. Entropy: 2.746560.
Iteration 7790: Policy loss: 0.023053. Value loss: 0.009976. Entropy: 2.759720.
Iteration 7791: P

episode: 2853   score: 400.0  epsilon: 1.0    steps: 721  evaluation reward: 394.2
Training network. lr: 0.000190. clip: 0.076048
Iteration 7849: Policy loss: 0.011395. Value loss: 0.012667. Entropy: 2.766263.
Iteration 7850: Policy loss: 0.018947. Value loss: 0.009221. Entropy: 2.770329.
Iteration 7851: Policy loss: 0.010991. Value loss: 0.007542. Entropy: 2.765958.
Training network. lr: 0.000190. clip: 0.075891
Iteration 7852: Policy loss: 0.027769. Value loss: 0.010352. Entropy: 2.775744.
Iteration 7853: Policy loss: 0.023858. Value loss: 0.005744. Entropy: 2.782480.
Iteration 7854: Policy loss: 0.016272. Value loss: 0.004669. Entropy: 2.775778.
episode: 2854   score: 330.0  epsilon: 1.0    steps: 164  evaluation reward: 393.6
episode: 2855   score: 370.0  epsilon: 1.0    steps: 441  evaluation reward: 393.7
episode: 2856   score: 420.0  epsilon: 1.0    steps: 1008  evaluation reward: 393.5
Training network. lr: 0.000190. clip: 0.075891
Iteration 7855: Policy loss: -0.079758. Value 

Iteration 7914: Policy loss: -0.005463. Value loss: 0.010457. Entropy: 2.732780.
Training network. lr: 0.000189. clip: 0.075734
Iteration 7915: Policy loss: 0.033666. Value loss: 0.019741. Entropy: 2.755272.
Iteration 7916: Policy loss: 0.029695. Value loss: 0.013786. Entropy: 2.752430.
Iteration 7917: Policy loss: 0.038318. Value loss: 0.010474. Entropy: 2.755590.
episode: 2877   score: 290.0  epsilon: 1.0    steps: 921  evaluation reward: 392.7
Training network. lr: 0.000189. clip: 0.075734
Iteration 7918: Policy loss: 0.004196. Value loss: 0.026863. Entropy: 2.747405.
Iteration 7919: Policy loss: -0.003812. Value loss: 0.015821. Entropy: 2.740618.
Iteration 7920: Policy loss: -0.000651. Value loss: 0.012929. Entropy: 2.738214.
episode: 2878   score: 390.0  epsilon: 1.0    steps: 160  evaluation reward: 377.5
Training network. lr: 0.000189. clip: 0.075734
Iteration 7921: Policy loss: -0.016773. Value loss: 0.022447. Entropy: 2.731524.
Iteration 7922: Policy loss: -0.017722. Value los

episode: 2899   score: 410.0  epsilon: 1.0    steps: 703  evaluation reward: 393.8
episode: 2900   score: 310.0  epsilon: 1.0    steps: 939  evaluation reward: 393.4
Training network. lr: 0.000189. clip: 0.075587
Iteration 7981: Policy loss: 0.004524. Value loss: 0.017673. Entropy: 2.707153.
Iteration 7982: Policy loss: -0.004751. Value loss: 0.011884. Entropy: 2.710818.
Iteration 7983: Policy loss: 0.003088. Value loss: 0.011408. Entropy: 2.710819.
Training network. lr: 0.000189. clip: 0.075587
Iteration 7984: Policy loss: 0.048494. Value loss: 0.012707. Entropy: 2.704221.
Iteration 7985: Policy loss: 0.047700. Value loss: 0.007558. Entropy: 2.711901.
Iteration 7986: Policy loss: 0.044425. Value loss: 0.008897. Entropy: 2.706015.
now time :  2019-02-27 06:09:07.815554
episode: 2901   score: 370.0  epsilon: 1.0    steps: 325  evaluation reward: 393.7
Training network. lr: 0.000189. clip: 0.075587
Iteration 7987: Policy loss: 0.035075. Value loss: 0.013647. Entropy: 2.735771.
Iteration 

Training network. lr: 0.000189. clip: 0.075430
Iteration 8047: Policy loss: -0.182359. Value loss: 1.051949. Entropy: 2.622133.
Iteration 8048: Policy loss: -0.129986. Value loss: 0.597853. Entropy: 2.614382.
Iteration 8049: Policy loss: -0.177170. Value loss: 0.595543. Entropy: 2.599036.
Training network. lr: 0.000189. clip: 0.075430
Iteration 8050: Policy loss: -0.064481. Value loss: 0.877567. Entropy: 2.493460.
Iteration 8051: Policy loss: -0.052558. Value loss: 0.530140. Entropy: 2.466000.
Iteration 8052: Policy loss: -0.064164. Value loss: 0.483674. Entropy: 2.425622.
Training network. lr: 0.000188. clip: 0.075273
Iteration 8053: Policy loss: 0.052375. Value loss: 0.125132. Entropy: 2.354547.
Iteration 8054: Policy loss: 0.051236. Value loss: 0.062445. Entropy: 2.365061.
Iteration 8055: Policy loss: 0.043385. Value loss: 0.051465. Entropy: 2.367742.
episode: 2921   score: 450.0  epsilon: 1.0    steps: 131  evaluation reward: 426.4
episode: 2922   score: 1070.0  epsilon: 1.0    ste

Iteration 8114: Policy loss: 0.096822. Value loss: 0.030435. Entropy: 2.600052.
Iteration 8115: Policy loss: 0.083665. Value loss: 0.022818. Entropy: 2.597758.
episode: 2942   score: 380.0  epsilon: 1.0    steps: 209  evaluation reward: 469.6
episode: 2943   score: 1050.0  epsilon: 1.0    steps: 401  evaluation reward: 476.6
episode: 2944   score: 460.0  epsilon: 1.0    steps: 513  evaluation reward: 478.3
Training network. lr: 0.000188. clip: 0.075126
Iteration 8116: Policy loss: 0.048725. Value loss: 0.037168. Entropy: 2.641109.
Iteration 8117: Policy loss: 0.049243. Value loss: 0.021937. Entropy: 2.632528.
Iteration 8118: Policy loss: 0.046457. Value loss: 0.020415. Entropy: 2.629281.
episode: 2945   score: 360.0  epsilon: 1.0    steps: 352  evaluation reward: 477.9
episode: 2946   score: 290.0  epsilon: 1.0    steps: 709  evaluation reward: 476.6
Training network. lr: 0.000188. clip: 0.075126
Iteration 8119: Policy loss: 0.082283. Value loss: 0.055167. Entropy: 2.687011.
Iteration 

Iteration 8178: Policy loss: 0.032787. Value loss: 0.020932. Entropy: 2.516951.
episode: 2967   score: 450.0  epsilon: 1.0    steps: 902  evaluation reward: 483.3
Training network. lr: 0.000187. clip: 0.074969
Iteration 8179: Policy loss: -0.006091. Value loss: 0.022938. Entropy: 2.557661.
Iteration 8180: Policy loss: -0.000544. Value loss: 0.015260. Entropy: 2.570786.
Iteration 8181: Policy loss: -0.005461. Value loss: 0.012897. Entropy: 2.558888.
episode: 2968   score: 300.0  epsilon: 1.0    steps: 284  evaluation reward: 483.2
Training network. lr: 0.000187. clip: 0.074969
Iteration 8182: Policy loss: 0.029143. Value loss: 0.025611. Entropy: 2.666264.
Iteration 8183: Policy loss: 0.027959. Value loss: 0.018772. Entropy: 2.661876.
Iteration 8184: Policy loss: 0.021694. Value loss: 0.018078. Entropy: 2.661615.
episode: 2969   score: 940.0  epsilon: 1.0    steps: 93  evaluation reward: 489.8
Training network. lr: 0.000187. clip: 0.074969
Iteration 8185: Policy loss: 0.021780. Value los

episode: 2989   score: 290.0  epsilon: 1.0    steps: 743  evaluation reward: 488.6
Training network. lr: 0.000187. clip: 0.074813
Iteration 8245: Policy loss: 0.040674. Value loss: 0.049124. Entropy: 2.606614.
Iteration 8246: Policy loss: 0.041092. Value loss: 0.022761. Entropy: 2.594766.
Iteration 8247: Policy loss: 0.038162. Value loss: 0.018845. Entropy: 2.601745.
Training network. lr: 0.000187. clip: 0.074813
Iteration 8248: Policy loss: -0.164208. Value loss: 0.426759. Entropy: 2.553369.
Iteration 8249: Policy loss: -0.157879. Value loss: 0.186375. Entropy: 2.513340.
Iteration 8250: Policy loss: -0.160763. Value loss: 0.113402. Entropy: 2.515839.
episode: 2990   score: 470.0  epsilon: 1.0    steps: 873  evaluation reward: 487.0
Training network. lr: 0.000187. clip: 0.074665
Iteration 8251: Policy loss: 0.065091. Value loss: 0.087343. Entropy: 2.549397.
Iteration 8252: Policy loss: 0.062232. Value loss: 0.043774. Entropy: 2.566076.
Iteration 8253: Policy loss: 0.059747. Value loss:

Iteration 8308: Policy loss: 0.053592. Value loss: 0.025045. Entropy: 2.724995.
Iteration 8309: Policy loss: 0.052698. Value loss: 0.017875. Entropy: 2.723493.
Iteration 8310: Policy loss: 0.047973. Value loss: 0.013607. Entropy: 2.726826.
Training network. lr: 0.000186. clip: 0.074509
Iteration 8311: Policy loss: -0.027669. Value loss: 0.015541. Entropy: 2.697871.
Iteration 8312: Policy loss: -0.028847. Value loss: 0.009721. Entropy: 2.701941.
Iteration 8313: Policy loss: -0.027190. Value loss: 0.008531. Entropy: 2.702932.
episode: 3014   score: 370.0  epsilon: 1.0    steps: 482  evaluation reward: 468.1
episode: 3015   score: 430.0  epsilon: 1.0    steps: 885  evaluation reward: 462.4
Training network. lr: 0.000186. clip: 0.074509
Iteration 8314: Policy loss: 0.080125. Value loss: 0.016957. Entropy: 2.719338.
Iteration 8315: Policy loss: 0.079277. Value loss: 0.013401. Entropy: 2.724798.
Iteration 8316: Policy loss: 0.070371. Value loss: 0.010221. Entropy: 2.723336.
episode: 3016   s

Iteration 8376: Policy loss: 0.018376. Value loss: 0.008592. Entropy: 2.736568.
episode: 3035   score: 390.0  epsilon: 1.0    steps: 34  evaluation reward: 434.9
episode: 3036   score: 350.0  epsilon: 1.0    steps: 353  evaluation reward: 434.8
episode: 3037   score: 280.0  epsilon: 1.0    steps: 426  evaluation reward: 428.5
Training network. lr: 0.000186. clip: 0.074352
Iteration 8377: Policy loss: 0.032094. Value loss: 0.020822. Entropy: 2.719410.
Iteration 8378: Policy loss: 0.031264. Value loss: 0.014163. Entropy: 2.721032.
Iteration 8379: Policy loss: 0.029313. Value loss: 0.011982. Entropy: 2.718295.
episode: 3038   score: 280.0  epsilon: 1.0    steps: 186  evaluation reward: 426.9
Training network. lr: 0.000186. clip: 0.074352
Iteration 8380: Policy loss: -0.024713. Value loss: 0.026679. Entropy: 2.713914.
Iteration 8381: Policy loss: -0.018607. Value loss: 0.017925. Entropy: 2.708932.
Iteration 8382: Policy loss: -0.022132. Value loss: 0.013986. Entropy: 2.713214.
episode: 303

Training network. lr: 0.000186. clip: 0.074204
Iteration 8440: Policy loss: 0.125906. Value loss: 0.049962. Entropy: 2.609760.
Iteration 8441: Policy loss: 0.119974. Value loss: 0.021128. Entropy: 2.624821.
Iteration 8442: Policy loss: 0.116845. Value loss: 0.019323. Entropy: 2.628213.
episode: 3060   score: 340.0  epsilon: 1.0    steps: 268  evaluation reward: 425.6
Training network. lr: 0.000186. clip: 0.074204
Iteration 8443: Policy loss: 0.109834. Value loss: 0.036967. Entropy: 2.602890.
Iteration 8444: Policy loss: 0.112183. Value loss: 0.023237. Entropy: 2.611407.
Iteration 8445: Policy loss: 0.107904. Value loss: 0.019806. Entropy: 2.612587.
episode: 3061   score: 370.0  epsilon: 1.0    steps: 737  evaluation reward: 425.5
Training network. lr: 0.000186. clip: 0.074204
Iteration 8446: Policy loss: 0.028357. Value loss: 0.034166. Entropy: 2.650006.
Iteration 8447: Policy loss: 0.026546. Value loss: 0.022945. Entropy: 2.655788.
Iteration 8448: Policy loss: 0.028703. Value loss: 0.

Iteration 8505: Policy loss: 0.008645. Value loss: 0.010807. Entropy: 2.688268.
episode: 3084   score: 390.0  epsilon: 1.0    steps: 939  evaluation reward: 416.7
Training network. lr: 0.000185. clip: 0.073891
Iteration 8506: Policy loss: 0.016969. Value loss: 0.021919. Entropy: 2.711502.
Iteration 8507: Policy loss: 0.014995. Value loss: 0.013848. Entropy: 2.711042.
Iteration 8508: Policy loss: 0.004094. Value loss: 0.013910. Entropy: 2.711412.
episode: 3085   score: 460.0  epsilon: 1.0    steps: 493  evaluation reward: 416.7
episode: 3086   score: 430.0  epsilon: 1.0    steps: 707  evaluation reward: 411.3
Training network. lr: 0.000185. clip: 0.073891
Iteration 8509: Policy loss: -0.079353. Value loss: 0.570338. Entropy: 2.667314.
Iteration 8510: Policy loss: -0.143065. Value loss: 0.342692. Entropy: 2.620677.
Iteration 8511: Policy loss: -0.095563. Value loss: 0.204094. Entropy: 2.594672.
episode: 3087   score: 430.0  epsilon: 1.0    steps: 137  evaluation reward: 411.4
Training ne

Iteration 8570: Policy loss: -0.019766. Value loss: 0.010668. Entropy: 2.770964.
Iteration 8571: Policy loss: -0.015437. Value loss: 0.006601. Entropy: 2.771388.
Training network. lr: 0.000184. clip: 0.073744
Iteration 8572: Policy loss: -0.010642. Value loss: 0.015944. Entropy: 2.770277.
Iteration 8573: Policy loss: -0.003144. Value loss: 0.008532. Entropy: 2.769251.
Iteration 8574: Policy loss: -0.010005. Value loss: 0.008302. Entropy: 2.764533.
episode: 3108   score: 430.0  epsilon: 1.0    steps: 299  evaluation reward: 407.7
Training network. lr: 0.000184. clip: 0.073744
Iteration 8575: Policy loss: 0.006782. Value loss: 0.011910. Entropy: 2.770684.
Iteration 8576: Policy loss: 0.007266. Value loss: 0.010505. Entropy: 2.768081.
Iteration 8577: Policy loss: 0.007877. Value loss: 0.007904. Entropy: 2.763473.
episode: 3109   score: 410.0  epsilon: 1.0    steps: 43  evaluation reward: 407.9
episode: 3110   score: 470.0  epsilon: 1.0    steps: 209  evaluation reward: 408.2
episode: 3111

Iteration 8637: Policy loss: 0.046816. Value loss: 0.011278. Entropy: 2.758458.
episode: 3130   score: 430.0  epsilon: 1.0    steps: 369  evaluation reward: 405.8
Training network. lr: 0.000184. clip: 0.073587
Iteration 8638: Policy loss: 0.090333. Value loss: 0.023402. Entropy: 2.765190.
Iteration 8639: Policy loss: 0.084665. Value loss: 0.014367. Entropy: 2.761692.
Iteration 8640: Policy loss: 0.081632. Value loss: 0.011579. Entropy: 2.760035.
episode: 3131   score: 340.0  epsilon: 1.0    steps: 590  evaluation reward: 406.0
episode: 3132   score: 360.0  epsilon: 1.0    steps: 676  evaluation reward: 405.8
episode: 3133   score: 340.0  epsilon: 1.0    steps: 990  evaluation reward: 404.7
Training network. lr: 0.000184. clip: 0.073587
Iteration 8641: Policy loss: 0.071140. Value loss: 0.025363. Entropy: 2.722762.
Iteration 8642: Policy loss: 0.068209. Value loss: 0.018668. Entropy: 2.727923.
Iteration 8643: Policy loss: 0.068780. Value loss: 0.016387. Entropy: 2.730624.
Training netwo

Iteration 8702: Policy loss: 0.023043. Value loss: 0.013241. Entropy: 2.745252.
Iteration 8703: Policy loss: 0.024843. Value loss: 0.010317. Entropy: 2.743191.
episode: 3154   score: 420.0  epsilon: 1.0    steps: 1011  evaluation reward: 393.1
Training network. lr: 0.000183. clip: 0.073283
Iteration 8704: Policy loss: -0.011440. Value loss: 0.022544. Entropy: 2.767143.
Iteration 8705: Policy loss: -0.019545. Value loss: 0.011636. Entropy: 2.755746.
Iteration 8706: Policy loss: -0.017099. Value loss: 0.011217. Entropy: 2.752700.
episode: 3155   score: 320.0  epsilon: 1.0    steps: 678  evaluation reward: 393.2
Training network. lr: 0.000183. clip: 0.073283
Iteration 8707: Policy loss: 0.024269. Value loss: 0.023222. Entropy: 2.723869.
Iteration 8708: Policy loss: 0.017040. Value loss: 0.015462. Entropy: 2.718211.
Iteration 8709: Policy loss: 0.017320. Value loss: 0.011978. Entropy: 2.718670.
episode: 3156   score: 510.0  epsilon: 1.0    steps: 336  evaluation reward: 395.5
episode: 3157

Iteration 8766: Policy loss: -0.006016. Value loss: 0.013867. Entropy: 2.748035.
Training network. lr: 0.000183. clip: 0.073126
Iteration 8767: Policy loss: 0.023277. Value loss: 0.011401. Entropy: 2.747154.
Iteration 8768: Policy loss: 0.028997. Value loss: 0.006074. Entropy: 2.752381.
Iteration 8769: Policy loss: 0.027275. Value loss: 0.006463. Entropy: 2.755324.
episode: 3179   score: 260.0  epsilon: 1.0    steps: 122  evaluation reward: 395.8
episode: 3180   score: 250.0  epsilon: 1.0    steps: 139  evaluation reward: 394.7
Training network. lr: 0.000183. clip: 0.073126
Iteration 8770: Policy loss: 0.031420. Value loss: 0.017317. Entropy: 2.736381.
Iteration 8771: Policy loss: 0.034424. Value loss: 0.013463. Entropy: 2.737544.
Iteration 8772: Policy loss: 0.034578. Value loss: 0.010902. Entropy: 2.741521.
episode: 3181   score: 300.0  epsilon: 1.0    steps: 953  evaluation reward: 393.9
Training network. lr: 0.000183. clip: 0.073126
Iteration 8773: Policy loss: 0.048386. Value loss

Iteration 8833: Policy loss: 0.085691. Value loss: 0.026525. Entropy: 2.685960.
Iteration 8834: Policy loss: 0.083334. Value loss: 0.015450. Entropy: 2.665507.
Iteration 8835: Policy loss: 0.082307. Value loss: 0.012272. Entropy: 2.672865.
episode: 3200   score: 370.0  epsilon: 1.0    steps: 205  evaluation reward: 408.7
Training network. lr: 0.000182. clip: 0.072969
Iteration 8836: Policy loss: 0.046139. Value loss: 0.018693. Entropy: 2.686856.
Iteration 8837: Policy loss: 0.039076. Value loss: 0.011056. Entropy: 2.688790.
Iteration 8838: Policy loss: 0.039760. Value loss: 0.009276. Entropy: 2.688798.
now time :  2019-02-27 06:25:54.575206
episode: 3201   score: 410.0  epsilon: 1.0    steps: 579  evaluation reward: 409.2
Training network. lr: 0.000182. clip: 0.072969
Iteration 8839: Policy loss: 0.032343. Value loss: 0.016496. Entropy: 2.699541.
Iteration 8840: Policy loss: 0.038951. Value loss: 0.009652. Entropy: 2.701764.
Iteration 8841: Policy loss: 0.034440. Value loss: 0.008908. 

Iteration 8900: Policy loss: 0.094298. Value loss: 0.023018. Entropy: 2.606679.
Iteration 8901: Policy loss: 0.097788. Value loss: 0.016745. Entropy: 2.603877.
episode: 3221   score: 990.0  epsilon: 1.0    steps: 687  evaluation reward: 436.6
Training network. lr: 0.000182. clip: 0.072665
Iteration 8902: Policy loss: 0.090872. Value loss: 0.028434. Entropy: 2.623598.
Iteration 8903: Policy loss: 0.096458. Value loss: 0.015619. Entropy: 2.636601.
Iteration 8904: Policy loss: 0.091185. Value loss: 0.011570. Entropy: 2.634710.
episode: 3222   score: 320.0  epsilon: 1.0    steps: 19  evaluation reward: 435.9
episode: 3223   score: 300.0  epsilon: 1.0    steps: 380  evaluation reward: 434.1
Training network. lr: 0.000182. clip: 0.072665
Iteration 8905: Policy loss: 0.060870. Value loss: 0.035971. Entropy: 2.643394.
Iteration 8906: Policy loss: 0.048176. Value loss: 0.017783. Entropy: 2.648767.
Iteration 8907: Policy loss: 0.058023. Value loss: 0.013738. Entropy: 2.648462.
episode: 3224   sc

Iteration 8965: Policy loss: 0.057485. Value loss: 0.033104. Entropy: 2.608838.
Iteration 8966: Policy loss: 0.043649. Value loss: 0.020243. Entropy: 2.598529.
Iteration 8967: Policy loss: 0.052856. Value loss: 0.015108. Entropy: 2.597755.
Training network. lr: 0.000181. clip: 0.072509
Iteration 8968: Policy loss: -0.218062. Value loss: 0.654854. Entropy: 2.506695.
Iteration 8969: Policy loss: -0.208722. Value loss: 0.385444. Entropy: 2.428979.
Iteration 8970: Policy loss: -0.226441. Value loss: 0.227173. Entropy: 2.403846.
episode: 3245   score: 390.0  epsilon: 1.0    steps: 248  evaluation reward: 446.8
episode: 3246   score: 390.0  epsilon: 1.0    steps: 334  evaluation reward: 446.3
Training network. lr: 0.000181. clip: 0.072509
Iteration 8971: Policy loss: 0.078926. Value loss: 0.140927. Entropy: 2.285202.
Iteration 8972: Policy loss: 0.063074. Value loss: 0.073166. Entropy: 2.262980.
Iteration 8973: Policy loss: 0.065872. Value loss: 0.053830. Entropy: 2.264822.
episode: 3247   s

Iteration 9031: Policy loss: 0.101465. Value loss: 0.048321. Entropy: 2.335298.
Iteration 9032: Policy loss: 0.094376. Value loss: 0.026932. Entropy: 2.376529.
Iteration 9033: Policy loss: 0.091850. Value loss: 0.022184. Entropy: 2.363178.
episode: 3267   score: 390.0  epsilon: 1.0    steps: 741  evaluation reward: 459.5
Training network. lr: 0.000181. clip: 0.072361
Iteration 9034: Policy loss: -0.127459. Value loss: 0.498252. Entropy: 2.344101.
Iteration 9035: Policy loss: -0.112520. Value loss: 0.250701. Entropy: 2.257266.
Iteration 9036: Policy loss: -0.124480. Value loss: 0.131071. Entropy: 2.229570.
episode: 3268   score: 420.0  epsilon: 1.0    steps: 211  evaluation reward: 459.4
episode: 3269   score: 320.0  epsilon: 1.0    steps: 376  evaluation reward: 457.8
Training network. lr: 0.000181. clip: 0.072361
Iteration 9037: Policy loss: -0.029834. Value loss: 0.048669. Entropy: 2.254619.
Iteration 9038: Policy loss: -0.025857. Value loss: 0.028262. Entropy: 2.257887.
Iteration 90

Training network. lr: 0.000181. clip: 0.072205
Iteration 9097: Policy loss: 0.037484. Value loss: 0.023515. Entropy: 2.554102.
Iteration 9098: Policy loss: 0.044696. Value loss: 0.013054. Entropy: 2.569281.
Iteration 9099: Policy loss: 0.038584. Value loss: 0.010856. Entropy: 2.558161.
episode: 3290   score: 340.0  epsilon: 1.0    steps: 551  evaluation reward: 470.1
episode: 3291   score: 290.0  epsilon: 1.0    steps: 670  evaluation reward: 470.2
Training network. lr: 0.000181. clip: 0.072205
Iteration 9100: Policy loss: -0.004276. Value loss: 0.020711. Entropy: 2.579676.
Iteration 9101: Policy loss: 0.001040. Value loss: 0.012041. Entropy: 2.571593.
Iteration 9102: Policy loss: -0.007733. Value loss: 0.011585. Entropy: 2.574890.
episode: 3292   score: 300.0  epsilon: 1.0    steps: 487  evaluation reward: 469.7
Training network. lr: 0.000180. clip: 0.072048
Iteration 9103: Policy loss: 0.034030. Value loss: 0.013666. Entropy: 2.571800.
Iteration 9104: Policy loss: 0.030150. Value los

Training network. lr: 0.000180. clip: 0.071900
Iteration 9163: Policy loss: 0.002254. Value loss: 0.033887. Entropy: 2.547304.
Iteration 9164: Policy loss: -0.003535. Value loss: 0.020700. Entropy: 2.540622.
Iteration 9165: Policy loss: -0.001895. Value loss: 0.018135. Entropy: 2.535139.
episode: 3312   score: 350.0  epsilon: 1.0    steps: 587  evaluation reward: 460.6
episode: 3313   score: 920.0  epsilon: 1.0    steps: 704  evaluation reward: 459.9
episode: 3314   score: 290.0  epsilon: 1.0    steps: 909  evaluation reward: 459.1
Training network. lr: 0.000180. clip: 0.071900
Iteration 9166: Policy loss: 0.062274. Value loss: 0.024575. Entropy: 2.510478.
Iteration 9167: Policy loss: 0.062004. Value loss: 0.015462. Entropy: 2.514891.
Iteration 9168: Policy loss: 0.059339. Value loss: 0.013798. Entropy: 2.517002.
Training network. lr: 0.000180. clip: 0.071900
Iteration 9169: Policy loss: 0.006049. Value loss: 0.018632. Entropy: 2.498276.
Iteration 9170: Policy loss: 0.010406. Value los

Training network. lr: 0.000179. clip: 0.071744
Iteration 9229: Policy loss: -0.022855. Value loss: 0.023153. Entropy: 2.537705.
Iteration 9230: Policy loss: -0.027120. Value loss: 0.012586. Entropy: 2.523852.
Iteration 9231: Policy loss: -0.034072. Value loss: 0.011997. Entropy: 2.527221.
episode: 3335   score: 470.0  epsilon: 1.0    steps: 113  evaluation reward: 459.8
Training network. lr: 0.000179. clip: 0.071744
Iteration 9232: Policy loss: 0.081405. Value loss: 0.032948. Entropy: 2.624110.
Iteration 9233: Policy loss: 0.059234. Value loss: 0.014972. Entropy: 2.611735.
Iteration 9234: Policy loss: 0.073279. Value loss: 0.011409. Entropy: 2.602643.
episode: 3336   score: 390.0  epsilon: 1.0    steps: 261  evaluation reward: 460.2
Training network. lr: 0.000179. clip: 0.071744
Iteration 9235: Policy loss: 0.031399. Value loss: 0.024671. Entropy: 2.595434.
Iteration 9236: Policy loss: 0.026865. Value loss: 0.011752. Entropy: 2.595645.
Iteration 9237: Policy loss: 0.025663. Value loss:

episode: 3357   score: 340.0  epsilon: 1.0    steps: 545  evaluation reward: 454.2
episode: 3358   score: 510.0  epsilon: 1.0    steps: 956  evaluation reward: 451.0
Training network. lr: 0.000179. clip: 0.071587
Iteration 9295: Policy loss: 0.049624. Value loss: 0.038200. Entropy: 2.608451.
Iteration 9296: Policy loss: 0.042942. Value loss: 0.030129. Entropy: 2.604213.
Iteration 9297: Policy loss: 0.048308. Value loss: 0.021330. Entropy: 2.608143.
Training network. lr: 0.000179. clip: 0.071587
Iteration 9298: Policy loss: 0.051285. Value loss: 0.025173. Entropy: 2.544264.
Iteration 9299: Policy loss: 0.040599. Value loss: 0.016608. Entropy: 2.544942.
Iteration 9300: Policy loss: 0.048135. Value loss: 0.014127. Entropy: 2.551880.
episode: 3359   score: 310.0  epsilon: 1.0    steps: 765  evaluation reward: 449.8
Training network. lr: 0.000179. clip: 0.071440
Iteration 9301: Policy loss: -0.081330. Value loss: 0.631533. Entropy: 2.511393.
Iteration 9302: Policy loss: -0.097119. Value los

Iteration 9361: Policy loss: 0.020271. Value loss: 0.025213. Entropy: 2.602222.
Iteration 9362: Policy loss: 0.019344. Value loss: 0.012915. Entropy: 2.596545.
Iteration 9363: Policy loss: 0.013517. Value loss: 0.010140. Entropy: 2.597873.
episode: 3379   score: 420.0  epsilon: 1.0    steps: 80  evaluation reward: 459.4
episode: 3380   score: 320.0  epsilon: 1.0    steps: 499  evaluation reward: 452.5
Training network. lr: 0.000178. clip: 0.071283
Iteration 9364: Policy loss: 0.074535. Value loss: 0.023865. Entropy: 2.656845.
Iteration 9365: Policy loss: 0.074664. Value loss: 0.014280. Entropy: 2.665724.
Iteration 9366: Policy loss: 0.079401. Value loss: 0.012896. Entropy: 2.663580.
Training network. lr: 0.000178. clip: 0.071283
Iteration 9367: Policy loss: -0.011776. Value loss: 0.016971. Entropy: 2.683808.
Iteration 9368: Policy loss: -0.019249. Value loss: 0.009412. Entropy: 2.686995.
Iteration 9369: Policy loss: -0.016410. Value loss: 0.007059. Entropy: 2.680961.
episode: 3381   sc

episode: 3402   score: 430.0  epsilon: 1.0    steps: 95  evaluation reward: 435.8
episode: 3403   score: 430.0  epsilon: 1.0    steps: 885  evaluation reward: 436.4
Training network. lr: 0.000178. clip: 0.071126
Iteration 9427: Policy loss: 0.135760. Value loss: 0.030336. Entropy: 2.720201.
Iteration 9428: Policy loss: 0.120942. Value loss: 0.016873. Entropy: 2.716578.
Iteration 9429: Policy loss: 0.128053. Value loss: 0.014157. Entropy: 2.719807.
episode: 3404   score: 350.0  epsilon: 1.0    steps: 379  evaluation reward: 435.6
episode: 3405   score: 280.0  epsilon: 1.0    steps: 589  evaluation reward: 434.8
episode: 3406   score: 430.0  epsilon: 1.0    steps: 1015  evaluation reward: 436.4
Training network. lr: 0.000178. clip: 0.071126
Iteration 9430: Policy loss: 0.021876. Value loss: 0.207661. Entropy: 2.556541.
Iteration 9431: Policy loss: -0.026808. Value loss: 0.478759. Entropy: 2.530304.
Iteration 9432: Policy loss: -0.007328. Value loss: 0.263019. Entropy: 2.506433.
Training 

Training network. lr: 0.000177. clip: 0.070979
Iteration 9490: Policy loss: 0.056853. Value loss: 0.020932. Entropy: 2.602916.
Iteration 9491: Policy loss: 0.052066. Value loss: 0.015088. Entropy: 2.604485.
Iteration 9492: Policy loss: 0.057500. Value loss: 0.013469. Entropy: 2.604345.
Training network. lr: 0.000177. clip: 0.070979
Iteration 9493: Policy loss: 0.004483. Value loss: 0.019925. Entropy: 2.622609.
Iteration 9494: Policy loss: 0.005921. Value loss: 0.013616. Entropy: 2.619278.
Iteration 9495: Policy loss: 0.000580. Value loss: 0.011813. Entropy: 2.629778.
episode: 3428   score: 450.0  epsilon: 1.0    steps: 84  evaluation reward: 442.1
Training network. lr: 0.000177. clip: 0.070979
Iteration 9496: Policy loss: 0.015339. Value loss: 0.029904. Entropy: 2.607979.
Iteration 9497: Policy loss: 0.020824. Value loss: 0.019959. Entropy: 2.619951.
Iteration 9498: Policy loss: 0.010802. Value loss: 0.017634. Entropy: 2.617438.
episode: 3429   score: 280.0  epsilon: 1.0    steps: 818 

Iteration 9556: Policy loss: 0.092387. Value loss: 0.029925. Entropy: 2.513630.
Iteration 9557: Policy loss: 0.094773. Value loss: 0.020758. Entropy: 2.511931.
Iteration 9558: Policy loss: 0.096044. Value loss: 0.016252. Entropy: 2.507378.
episode: 3450   score: 260.0  epsilon: 1.0    steps: 22  evaluation reward: 421.6
now time :  2019-02-27 06:40:00.518693
episode: 3451   score: 300.0  epsilon: 1.0    steps: 184  evaluation reward: 421.6
episode: 3452   score: 400.0  epsilon: 1.0    steps: 314  evaluation reward: 421.1
episode: 3453   score: 430.0  epsilon: 1.0    steps: 1005  evaluation reward: 422.3
Training network. lr: 0.000177. clip: 0.070665
Iteration 9559: Policy loss: -0.000059. Value loss: 0.038901. Entropy: 2.526848.
Iteration 9560: Policy loss: 0.000052. Value loss: 0.029670. Entropy: 2.532687.
Iteration 9561: Policy loss: 0.005167. Value loss: 0.023831. Entropy: 2.529982.
episode: 3454   score: 350.0  epsilon: 1.0    steps: 782  evaluation reward: 422.2
Training network. 

Iteration 9621: Policy loss: 0.085020. Value loss: 0.016024. Entropy: 2.519800.
episode: 3474   score: 800.0  epsilon: 1.0    steps: 510  evaluation reward: 437.7
Training network. lr: 0.000176. clip: 0.070518
Iteration 9622: Policy loss: 0.001886. Value loss: 0.090226. Entropy: 2.457181.
Iteration 9623: Policy loss: -0.000286. Value loss: 0.057117. Entropy: 2.458673.
Iteration 9624: Policy loss: -0.011773. Value loss: 0.043464. Entropy: 2.462909.
Training network. lr: 0.000176. clip: 0.070518
Iteration 9625: Policy loss: 0.192548. Value loss: 0.033457. Entropy: 2.574775.
Iteration 9626: Policy loss: 0.210194. Value loss: 0.019742. Entropy: 2.575412.
Iteration 9627: Policy loss: 0.193445. Value loss: 0.015432. Entropy: 2.574738.
Training network. lr: 0.000176. clip: 0.070518
Iteration 9628: Policy loss: 0.095058. Value loss: 0.045590. Entropy: 2.601691.
Iteration 9629: Policy loss: 0.085474. Value loss: 0.026679. Entropy: 2.596174.
Iteration 9630: Policy loss: 0.095059. Value loss: 0.0

Iteration 9687: Policy loss: 0.014808. Value loss: 0.039820. Entropy: 2.232943.
Training network. lr: 0.000176. clip: 0.070361
Iteration 9688: Policy loss: 0.062321. Value loss: 0.049521. Entropy: 2.322152.
Iteration 9689: Policy loss: 0.065184. Value loss: 0.024737. Entropy: 2.340902.
Iteration 9690: Policy loss: 0.059758. Value loss: 0.020883. Entropy: 2.339726.
episode: 3497   score: 370.0  epsilon: 1.0    steps: 775  evaluation reward: 447.1
Training network. lr: 0.000176. clip: 0.070361
Iteration 9691: Policy loss: 0.059411. Value loss: 0.049660. Entropy: 2.333091.
Iteration 9692: Policy loss: 0.056336. Value loss: 0.029400. Entropy: 2.328755.
Iteration 9693: Policy loss: 0.053545. Value loss: 0.023306. Entropy: 2.337579.
episode: 3498   score: 900.0  epsilon: 1.0    steps: 153  evaluation reward: 452.5
episode: 3499   score: 830.0  epsilon: 1.0    steps: 939  evaluation reward: 451.3
Training network. lr: 0.000176. clip: 0.070361
Iteration 9694: Policy loss: 0.076900. Value loss:

Iteration 9753: Policy loss: 0.104394. Value loss: 0.029158. Entropy: 2.392829.
episode: 3519   score: 1830.0  epsilon: 1.0    steps: 121  evaluation reward: 469.1
episode: 3520   score: 350.0  epsilon: 1.0    steps: 240  evaluation reward: 468.2
episode: 3521   score: 410.0  epsilon: 1.0    steps: 880  evaluation reward: 467.7
Training network. lr: 0.000175. clip: 0.070057
Iteration 9754: Policy loss: 0.024853. Value loss: 0.050808. Entropy: 2.382810.
Iteration 9755: Policy loss: 0.016590. Value loss: 0.033479. Entropy: 2.371018.
Iteration 9756: Policy loss: 0.012037. Value loss: 0.030016. Entropy: 2.365477.
episode: 3522   score: 440.0  epsilon: 1.0    steps: 602  evaluation reward: 462.1
Training network. lr: 0.000175. clip: 0.070057
Iteration 9757: Policy loss: 0.066258. Value loss: 0.041313. Entropy: 2.544610.
Iteration 9758: Policy loss: 0.073828. Value loss: 0.027445. Entropy: 2.549039.
Iteration 9759: Policy loss: 0.058622. Value loss: 0.021183. Entropy: 2.547224.
episode: 3523

Training network. lr: 0.000175. clip: 0.069901
Iteration 9817: Policy loss: 0.031268. Value loss: 0.068848. Entropy: 2.312234.
Iteration 9818: Policy loss: 0.021314. Value loss: 0.039393. Entropy: 2.298188.
Iteration 9819: Policy loss: 0.028554. Value loss: 0.030206. Entropy: 2.298851.
Training network. lr: 0.000175. clip: 0.069901
Iteration 9820: Policy loss: 0.116895. Value loss: 0.102576. Entropy: 2.381201.
Iteration 9821: Policy loss: 0.105549. Value loss: 0.056036. Entropy: 2.371637.
Iteration 9822: Policy loss: 0.092554. Value loss: 0.041175. Entropy: 2.351376.
Training network. lr: 0.000175. clip: 0.069901
Iteration 9823: Policy loss: 0.079583. Value loss: 0.052689. Entropy: 2.429633.
Iteration 9824: Policy loss: 0.086084. Value loss: 0.031995. Entropy: 2.423300.
Iteration 9825: Policy loss: 0.084568. Value loss: 0.026635. Entropy: 2.417766.
episode: 3544   score: 380.0  epsilon: 1.0    steps: 685  evaluation reward: 487.3
Training network. lr: 0.000175. clip: 0.069901
Iteration

Iteration 9884: Policy loss: 0.059849. Value loss: 0.021180. Entropy: 2.618508.
Iteration 9885: Policy loss: 0.058894. Value loss: 0.017491. Entropy: 2.611351.
episode: 3565   score: 370.0  epsilon: 1.0    steps: 86  evaluation reward: 478.4
episode: 3566   score: 210.0  epsilon: 1.0    steps: 484  evaluation reward: 477.2
episode: 3567   score: 480.0  epsilon: 1.0    steps: 906  evaluation reward: 478.7
Training network. lr: 0.000174. clip: 0.069744
Iteration 9886: Policy loss: 0.056473. Value loss: 0.024150. Entropy: 2.673749.
Iteration 9887: Policy loss: 0.054518. Value loss: 0.016691. Entropy: 2.681824.
Iteration 9888: Policy loss: 0.053505. Value loss: 0.014987. Entropy: 2.684356.
Training network. lr: 0.000174. clip: 0.069744
Iteration 9889: Policy loss: 0.062752. Value loss: 0.025314. Entropy: 2.650468.
Iteration 9890: Policy loss: 0.060425. Value loss: 0.013860. Entropy: 2.652712.
Iteration 9891: Policy loss: 0.064262. Value loss: 0.012418. Entropy: 2.645514.
episode: 3568   sc

episode: 3586   score: 440.0  epsilon: 1.0    steps: 676  evaluation reward: 474.0
episode: 3587   score: 380.0  epsilon: 1.0    steps: 1013  evaluation reward: 473.9
Training network. lr: 0.000174. clip: 0.069440
Iteration 9952: Policy loss: 0.034243. Value loss: 0.037244. Entropy: 2.345394.
Iteration 9953: Policy loss: 0.036479. Value loss: 0.020490. Entropy: 2.365228.
Iteration 9954: Policy loss: 0.032260. Value loss: 0.021867. Entropy: 2.367265.
episode: 3588   score: 350.0  epsilon: 1.0    steps: 331  evaluation reward: 474.1
Training network. lr: 0.000174. clip: 0.069440
Iteration 9955: Policy loss: -0.004767. Value loss: 0.043275. Entropy: 2.387583.
Iteration 9956: Policy loss: 0.006007. Value loss: 0.022996. Entropy: 2.395926.
Iteration 9957: Policy loss: -0.014630. Value loss: 0.016850. Entropy: 2.368161.
episode: 3589   score: 360.0  epsilon: 1.0    steps: 485  evaluation reward: 469.0
Training network. lr: 0.000174. clip: 0.069440
Iteration 9958: Policy loss: 0.035519. Value

Iteration 10015: Policy loss: -0.092686. Value loss: 0.513254. Entropy: 2.330511.
Iteration 10016: Policy loss: -0.099590. Value loss: 0.368436. Entropy: 2.293881.
Iteration 10017: Policy loss: -0.093566. Value loss: 0.130726. Entropy: 2.320143.
episode: 3611   score: 850.0  epsilon: 1.0    steps: 987  evaluation reward: 478.0
Training network. lr: 0.000173. clip: 0.069283
Iteration 10018: Policy loss: -0.019173. Value loss: 0.217936. Entropy: 2.228066.
Iteration 10019: Policy loss: -0.004485. Value loss: 0.118633. Entropy: 2.193000.
Iteration 10020: Policy loss: -0.012394. Value loss: 0.082436. Entropy: 2.220716.
Training network. lr: 0.000173. clip: 0.069283
Iteration 10021: Policy loss: 0.008078. Value loss: 0.282100. Entropy: 2.212845.
Iteration 10022: Policy loss: 0.017165. Value loss: 0.113828. Entropy: 2.212341.
Iteration 10023: Policy loss: -0.018833. Value loss: 0.106456. Entropy: 2.189853.
episode: 3612   score: 450.0  epsilon: 1.0    steps: 453  evaluation reward: 478.9
Trai

Iteration 10081: Policy loss: 0.021618. Value loss: 0.025976. Entropy: 2.693240.
Iteration 10082: Policy loss: 0.020226. Value loss: 0.015456. Entropy: 2.681105.
Iteration 10083: Policy loss: 0.016526. Value loss: 0.012005. Entropy: 2.685519.
Training network. lr: 0.000173. clip: 0.069136
Iteration 10084: Policy loss: -0.019124. Value loss: 0.026056. Entropy: 2.667504.
Iteration 10085: Policy loss: -0.022790. Value loss: 0.014616. Entropy: 2.666039.
Iteration 10086: Policy loss: -0.024465. Value loss: 0.010819. Entropy: 2.654716.
Training network. lr: 0.000173. clip: 0.069136
Iteration 10087: Policy loss: 0.115695. Value loss: 0.018761. Entropy: 2.687748.
Iteration 10088: Policy loss: 0.119132. Value loss: 0.008527. Entropy: 2.684006.
Iteration 10089: Policy loss: 0.114486. Value loss: 0.006484. Entropy: 2.680058.
episode: 3633   score: 410.0  epsilon: 1.0    steps: 675  evaluation reward: 447.3
Training network. lr: 0.000173. clip: 0.069136
Iteration 10090: Policy loss: 0.073114. Valu

Training network. lr: 0.000172. clip: 0.068979
Iteration 10147: Policy loss: 0.005235. Value loss: 0.027961. Entropy: 2.664076.
Iteration 10148: Policy loss: 0.007045. Value loss: 0.031077. Entropy: 2.671471.
Iteration 10149: Policy loss: 0.004387. Value loss: 0.016716. Entropy: 2.658973.
episode: 3655   score: 460.0  epsilon: 1.0    steps: 48  evaluation reward: 427.1
episode: 3656   score: 440.0  epsilon: 1.0    steps: 319  evaluation reward: 427.9
Training network. lr: 0.000172. clip: 0.068979
Iteration 10150: Policy loss: -0.033182. Value loss: 0.020523. Entropy: 2.698259.
Iteration 10151: Policy loss: -0.039096. Value loss: 0.016461. Entropy: 2.699291.
Iteration 10152: Policy loss: -0.041669. Value loss: 0.012335. Entropy: 2.685231.
Training network. lr: 0.000172. clip: 0.068822
Iteration 10153: Policy loss: -0.055953. Value loss: 0.016937. Entropy: 2.668985.
Iteration 10154: Policy loss: -0.053477. Value loss: 0.010365. Entropy: 2.670247.
Iteration 10155: Policy loss: -0.054611. 

Iteration 10214: Policy loss: 0.051162. Value loss: 0.012166. Entropy: 2.715976.
Iteration 10215: Policy loss: 0.051868. Value loss: 0.009691. Entropy: 2.715704.
episode: 3676   score: 490.0  epsilon: 1.0    steps: 580  evaluation reward: 446.2
episode: 3677   score: 400.0  epsilon: 1.0    steps: 792  evaluation reward: 446.2
Training network. lr: 0.000172. clip: 0.068675
Iteration 10216: Policy loss: 0.052886. Value loss: 0.020110. Entropy: 2.699729.
Iteration 10217: Policy loss: 0.055360. Value loss: 0.013888. Entropy: 2.705278.
Iteration 10218: Policy loss: 0.047530. Value loss: 0.011971. Entropy: 2.701932.
Training network. lr: 0.000172. clip: 0.068675
Iteration 10219: Policy loss: 0.069140. Value loss: 0.032653. Entropy: 2.704680.
Iteration 10220: Policy loss: 0.063242. Value loss: 0.018944. Entropy: 2.695007.
Iteration 10221: Policy loss: 0.067026. Value loss: 0.016873. Entropy: 2.700097.
Training network. lr: 0.000172. clip: 0.068675
Iteration 10222: Policy loss: 0.041554. Value

Iteration 10280: Policy loss: 0.050402. Value loss: 0.019247. Entropy: 2.678819.
Iteration 10281: Policy loss: 0.055269. Value loss: 0.015999. Entropy: 2.677584.
Training network. lr: 0.000171. clip: 0.068518
Iteration 10282: Policy loss: 0.033495. Value loss: 0.033098. Entropy: 2.726131.
Iteration 10283: Policy loss: 0.030866. Value loss: 0.019228. Entropy: 2.721284.
Iteration 10284: Policy loss: 0.036905. Value loss: 0.016429. Entropy: 2.718648.
episode: 3698   score: 430.0  epsilon: 1.0    steps: 144  evaluation reward: 435.9
episode: 3699   score: 420.0  epsilon: 1.0    steps: 281  evaluation reward: 437.1
episode: 3700   score: 390.0  epsilon: 1.0    steps: 552  evaluation reward: 436.9
Training network. lr: 0.000171. clip: 0.068518
Iteration 10285: Policy loss: 0.051293. Value loss: 0.021409. Entropy: 2.712024.
Iteration 10286: Policy loss: 0.049748. Value loss: 0.015666. Entropy: 2.711132.
Iteration 10287: Policy loss: 0.053909. Value loss: 0.014368. Entropy: 2.713810.
Training 

Iteration 10343: Policy loss: 0.055480. Value loss: 0.013655. Entropy: 2.747865.
Iteration 10344: Policy loss: 0.051783. Value loss: 0.011456. Entropy: 2.749246.
episode: 3723   score: 330.0  epsilon: 1.0    steps: 230  evaluation reward: 408.4
Training network. lr: 0.000171. clip: 0.068361
Iteration 10345: Policy loss: 0.033078. Value loss: 0.013272. Entropy: 2.745938.
Iteration 10346: Policy loss: 0.034284. Value loss: 0.009359. Entropy: 2.738423.
Iteration 10347: Policy loss: 0.031476. Value loss: 0.008359. Entropy: 2.739918.
episode: 3724   score: 230.0  epsilon: 1.0    steps: 799  evaluation reward: 407.8
Training network. lr: 0.000171. clip: 0.068361
Iteration 10348: Policy loss: 0.034567. Value loss: 0.014389. Entropy: 2.743738.
Iteration 10349: Policy loss: 0.033788. Value loss: 0.008924. Entropy: 2.745856.
Iteration 10350: Policy loss: 0.034239. Value loss: 0.007876. Entropy: 2.747902.
Training network. lr: 0.000171. clip: 0.068214
Iteration 10351: Policy loss: 0.022123. Value

Iteration 10408: Policy loss: 0.022389. Value loss: 0.032843. Entropy: 2.679558.
Iteration 10409: Policy loss: 0.024473. Value loss: 0.019017. Entropy: 2.688651.
Iteration 10410: Policy loss: 0.026166. Value loss: 0.015424. Entropy: 2.687102.
episode: 3746   score: 480.0  epsilon: 1.0    steps: 109  evaluation reward: 427.2
episode: 3747   score: 360.0  epsilon: 1.0    steps: 213  evaluation reward: 426.7
episode: 3748   score: 270.0  epsilon: 1.0    steps: 831  evaluation reward: 425.4
Training network. lr: 0.000170. clip: 0.068057
Iteration 10411: Policy loss: 0.073718. Value loss: 0.033500. Entropy: 2.689845.
Iteration 10412: Policy loss: 0.077170. Value loss: 0.021262. Entropy: 2.694055.
Iteration 10413: Policy loss: 0.080043. Value loss: 0.018718. Entropy: 2.693593.
Training network. lr: 0.000170. clip: 0.068057
Iteration 10414: Policy loss: -0.045084. Value loss: 0.519475. Entropy: 2.608823.
Iteration 10415: Policy loss: -0.068733. Value loss: 0.207660. Entropy: 2.587272.
Iterati

Iteration 10473: Policy loss: 0.031364. Value loss: 0.010921. Entropy: 2.743739.
episode: 3769   score: 300.0  epsilon: 1.0    steps: 652  evaluation reward: 412.6
Training network. lr: 0.000170. clip: 0.067901
Iteration 10474: Policy loss: 0.048329. Value loss: 0.016247. Entropy: 2.782873.
Iteration 10475: Policy loss: 0.040570. Value loss: 0.010444. Entropy: 2.781997.
Iteration 10476: Policy loss: 0.044695. Value loss: 0.008479. Entropy: 2.783668.
episode: 3770   score: 360.0  epsilon: 1.0    steps: 239  evaluation reward: 412.9
Training network. lr: 0.000170. clip: 0.067901
Iteration 10477: Policy loss: 0.023779. Value loss: 0.016994. Entropy: 2.767887.
Iteration 10478: Policy loss: 0.016872. Value loss: 0.013001. Entropy: 2.766177.
Iteration 10479: Policy loss: 0.017188. Value loss: 0.008307. Entropy: 2.765397.
episode: 3771   score: 280.0  epsilon: 1.0    steps: 77  evaluation reward: 407.1
episode: 3772   score: 300.0  epsilon: 1.0    steps: 965  evaluation reward: 406.3
Training

Iteration 10537: Policy loss: -0.040178. Value loss: 0.033493. Entropy: 2.686626.
Iteration 10538: Policy loss: -0.061797. Value loss: 0.024115. Entropy: 2.680696.
Iteration 10539: Policy loss: -0.039239. Value loss: 0.016425. Entropy: 2.682780.
Training network. lr: 0.000169. clip: 0.067753
Iteration 10540: Policy loss: -0.020840. Value loss: 0.027654. Entropy: 2.639421.
Iteration 10541: Policy loss: -0.023076. Value loss: 0.021714. Entropy: 2.642854.
Iteration 10542: Policy loss: -0.019667. Value loss: 0.014294. Entropy: 2.636600.
episode: 3793   score: 380.0  epsilon: 1.0    steps: 130  evaluation reward: 391.9
episode: 3794   score: 320.0  epsilon: 1.0    steps: 337  evaluation reward: 390.4
episode: 3795   score: 350.0  epsilon: 1.0    steps: 887  evaluation reward: 389.9
episode: 3796   score: 290.0  epsilon: 1.0    steps: 994  evaluation reward: 389.6
Training network. lr: 0.000169. clip: 0.067753
Iteration 10543: Policy loss: 0.015099. Value loss: 0.024968. Entropy: 2.650540.
I

episode: 3815   score: 460.0  epsilon: 1.0    steps: 353  evaluation reward: 388.7
episode: 3816   score: 300.0  epsilon: 1.0    steps: 796  evaluation reward: 388.5
Training network. lr: 0.000169. clip: 0.067440
Iteration 10603: Policy loss: -0.006614. Value loss: 0.019067. Entropy: 2.774426.
Iteration 10604: Policy loss: -0.004304. Value loss: 0.010748. Entropy: 2.772700.
Iteration 10605: Policy loss: -0.005165. Value loss: 0.008601. Entropy: 2.774267.
episode: 3817   score: 370.0  epsilon: 1.0    steps: 412  evaluation reward: 389.4
episode: 3818   score: 350.0  epsilon: 1.0    steps: 701  evaluation reward: 389.2
Training network. lr: 0.000169. clip: 0.067440
Iteration 10606: Policy loss: 0.034686. Value loss: 0.010629. Entropy: 2.750154.
Iteration 10607: Policy loss: 0.036868. Value loss: 0.008571. Entropy: 2.755769.
Iteration 10608: Policy loss: 0.032189. Value loss: 0.008478. Entropy: 2.760713.
Training network. lr: 0.000169. clip: 0.067440
Iteration 10609: Policy loss: -0.01103

Training network. lr: 0.000168. clip: 0.067292
Iteration 10666: Policy loss: 0.006744. Value loss: 0.016713. Entropy: 2.755938.
Iteration 10667: Policy loss: 0.004487. Value loss: 0.011015. Entropy: 2.760220.
Iteration 10668: Policy loss: 0.002339. Value loss: 0.009790. Entropy: 2.757659.
Training network. lr: 0.000168. clip: 0.067292
Iteration 10669: Policy loss: -0.002375. Value loss: 0.008226. Entropy: 2.793983.
Iteration 10670: Policy loss: -0.003793. Value loss: 0.004440. Entropy: 2.792064.
Iteration 10671: Policy loss: -0.004252. Value loss: 0.003655. Entropy: 2.791554.
episode: 3841   score: 470.0  epsilon: 1.0    steps: 279  evaluation reward: 394.1
Training network. lr: 0.000168. clip: 0.067292
Iteration 10672: Policy loss: 0.054930. Value loss: 0.010755. Entropy: 2.792443.
Iteration 10673: Policy loss: 0.055764. Value loss: 0.009058. Entropy: 2.794475.
Iteration 10674: Policy loss: 0.045399. Value loss: 0.006153. Entropy: 2.796127.
Training network. lr: 0.000168. clip: 0.0672

Iteration 10732: Policy loss: 0.031538. Value loss: 0.024662. Entropy: 2.752212.
Iteration 10733: Policy loss: 0.031440. Value loss: 0.016757. Entropy: 2.755561.
Iteration 10734: Policy loss: 0.031799. Value loss: 0.014627. Entropy: 2.750382.
episode: 3863   score: 300.0  epsilon: 1.0    steps: 788  evaluation reward: 366.5
Training network. lr: 0.000168. clip: 0.067136
Iteration 10735: Policy loss: 0.032975. Value loss: 0.012974. Entropy: 2.755088.
Iteration 10736: Policy loss: 0.025453. Value loss: 0.009190. Entropy: 2.758324.
Iteration 10737: Policy loss: 0.029672. Value loss: 0.005892. Entropy: 2.761347.
episode: 3864   score: 470.0  epsilon: 1.0    steps: 43  evaluation reward: 368.5
episode: 3865   score: 370.0  epsilon: 1.0    steps: 466  evaluation reward: 369.0
Training network. lr: 0.000168. clip: 0.067136
Iteration 10738: Policy loss: -0.003852. Value loss: 0.024099. Entropy: 2.763942.
Iteration 10739: Policy loss: -0.005960. Value loss: 0.017222. Entropy: 2.762147.
Iteratio

Iteration 10798: Policy loss: 0.038392. Value loss: 0.029419. Entropy: 2.728399.
Iteration 10799: Policy loss: 0.038570. Value loss: 0.018768. Entropy: 2.716071.
Iteration 10800: Policy loss: 0.040674. Value loss: 0.015308. Entropy: 2.711602.
episode: 3886   score: 390.0  epsilon: 1.0    steps: 768  evaluation reward: 382.1
Training network. lr: 0.000167. clip: 0.066832
Iteration 10801: Policy loss: 0.007659. Value loss: 0.016646. Entropy: 2.682773.
Iteration 10802: Policy loss: 0.005828. Value loss: 0.010390. Entropy: 2.686052.
Iteration 10803: Policy loss: 0.004171. Value loss: 0.008323. Entropy: 2.681854.
episode: 3887   score: 420.0  epsilon: 1.0    steps: 75  evaluation reward: 377.7
episode: 3888   score: 420.0  epsilon: 1.0    steps: 252  evaluation reward: 378.5
episode: 3889   score: 930.0  epsilon: 1.0    steps: 367  evaluation reward: 384.3
Training network. lr: 0.000167. clip: 0.066832
Iteration 10804: Policy loss: -0.121329. Value loss: 0.529798. Entropy: 2.636528.
Iterati

Iteration 10862: Policy loss: 0.000131. Value loss: 0.017465. Entropy: 2.621834.
Iteration 10863: Policy loss: -0.000965. Value loss: 0.015302. Entropy: 2.617820.
episode: 3910   score: 370.0  epsilon: 1.0    steps: 15  evaluation reward: 395.9
Training network. lr: 0.000167. clip: 0.066675
Iteration 10864: Policy loss: 0.039317. Value loss: 0.037695. Entropy: 2.638276.
Iteration 10865: Policy loss: 0.042540. Value loss: 0.017690. Entropy: 2.638977.
Iteration 10866: Policy loss: 0.046419. Value loss: 0.013131. Entropy: 2.636771.
episode: 3911   score: 400.0  epsilon: 1.0    steps: 142  evaluation reward: 396.6
Training network. lr: 0.000167. clip: 0.066675
Iteration 10867: Policy loss: 0.071283. Value loss: 0.032315. Entropy: 2.677649.
Iteration 10868: Policy loss: 0.080737. Value loss: 0.021996. Entropy: 2.686199.
Iteration 10869: Policy loss: 0.064754. Value loss: 0.015326. Entropy: 2.680956.
episode: 3912   score: 370.0  epsilon: 1.0    steps: 339  evaluation reward: 397.4
episode: 

Iteration 10928: Policy loss: 0.013421. Value loss: 0.013069. Entropy: 2.728078.
Iteration 10929: Policy loss: 0.016862. Value loss: 0.012291. Entropy: 2.724166.
episode: 3932   score: 310.0  epsilon: 1.0    steps: 195  evaluation reward: 399.9
Training network. lr: 0.000166. clip: 0.066518
Iteration 10930: Policy loss: -0.047736. Value loss: 0.021388. Entropy: 2.721670.
Iteration 10931: Policy loss: -0.053340. Value loss: 0.015630. Entropy: 2.723810.
Iteration 10932: Policy loss: -0.056329. Value loss: 0.011921. Entropy: 2.719884.
episode: 3933   score: 400.0  epsilon: 1.0    steps: 459  evaluation reward: 400.6
Training network. lr: 0.000166. clip: 0.066518
Iteration 10933: Policy loss: -0.003035. Value loss: 0.026822. Entropy: 2.711082.
Iteration 10934: Policy loss: -0.009000. Value loss: 0.015427. Entropy: 2.707671.
Iteration 10935: Policy loss: -0.011613. Value loss: 0.010749. Entropy: 2.704832.
Training network. lr: 0.000166. clip: 0.066518
Iteration 10936: Policy loss: 0.042238.

Training network. lr: 0.000166. clip: 0.066371
Iteration 10993: Policy loss: 0.006548. Value loss: 0.022345. Entropy: 2.742054.
Iteration 10994: Policy loss: -0.000113. Value loss: 0.016820. Entropy: 2.744715.
Iteration 10995: Policy loss: 0.001646. Value loss: 0.014650. Entropy: 2.736678.
episode: 3955   score: 310.0  epsilon: 1.0    steps: 109  evaluation reward: 400.7
episode: 3956   score: 400.0  epsilon: 1.0    steps: 202  evaluation reward: 400.9
episode: 3957   score: 400.0  epsilon: 1.0    steps: 287  evaluation reward: 400.7
Training network. lr: 0.000166. clip: 0.066371
Iteration 10996: Policy loss: 0.043844. Value loss: 0.020862. Entropy: 2.727481.
Iteration 10997: Policy loss: 0.044688. Value loss: 0.018201. Entropy: 2.727837.
Iteration 10998: Policy loss: 0.043391. Value loss: 0.010675. Entropy: 2.721118.
Training network. lr: 0.000166. clip: 0.066371
Iteration 10999: Policy loss: 0.016308. Value loss: 0.015772. Entropy: 2.726327.
Iteration 11000: Policy loss: 0.009030. Va

episode: 3977   score: 430.0  epsilon: 1.0    steps: 801  evaluation reward: 414.1
Training network. lr: 0.000165. clip: 0.066057
Iteration 11059: Policy loss: 0.066368. Value loss: 0.022078. Entropy: 2.740269.
Iteration 11060: Policy loss: 0.072953. Value loss: 0.016576. Entropy: 2.746720.
Iteration 11061: Policy loss: 0.071117. Value loss: 0.012935. Entropy: 2.739679.
episode: 3978   score: 340.0  epsilon: 1.0    steps: 241  evaluation reward: 413.6
episode: 3979   score: 280.0  epsilon: 1.0    steps: 258  evaluation reward: 407.3
episode: 3980   score: 410.0  epsilon: 1.0    steps: 454  evaluation reward: 407.0
episode: 3981   score: 320.0  epsilon: 1.0    steps: 767  evaluation reward: 405.7
Training network. lr: 0.000165. clip: 0.066057
Iteration 11062: Policy loss: 0.049080. Value loss: 0.022943. Entropy: 2.730904.
Iteration 11063: Policy loss: 0.061896. Value loss: 0.011993. Entropy: 2.735997.
Iteration 11064: Policy loss: 0.062442. Value loss: 0.008590. Entropy: 2.731818.
episo

episode: 4002   score: 260.0  epsilon: 1.0    steps: 996  evaluation reward: 395.9
Training network. lr: 0.000165. clip: 0.065910
Iteration 11122: Policy loss: 0.032380. Value loss: 0.031877. Entropy: 2.745958.
Iteration 11123: Policy loss: 0.035231. Value loss: 0.018806. Entropy: 2.737387.
Iteration 11124: Policy loss: 0.034806. Value loss: 0.014476. Entropy: 2.735901.
Training network. lr: 0.000165. clip: 0.065910
Iteration 11125: Policy loss: 0.020780. Value loss: 0.013477. Entropy: 2.742131.
Iteration 11126: Policy loss: 0.026750. Value loss: 0.008169. Entropy: 2.745335.
Iteration 11127: Policy loss: 0.020768. Value loss: 0.006038. Entropy: 2.743548.
episode: 4003   score: 370.0  epsilon: 1.0    steps: 121  evaluation reward: 396.5
Training network. lr: 0.000165. clip: 0.065910
Iteration 11128: Policy loss: 0.066350. Value loss: 0.019009. Entropy: 2.779430.
Iteration 11129: Policy loss: 0.063535. Value loss: 0.014091. Entropy: 2.780806.
Iteration 11130: Policy loss: 0.063492. Value

Iteration 11187: Policy loss: 0.006257. Value loss: 0.013995. Entropy: 2.740290.
episode: 4025   score: 350.0  epsilon: 1.0    steps: 441  evaluation reward: 395.6
episode: 4026   score: 440.0  epsilon: 1.0    steps: 917  evaluation reward: 396.0
Training network. lr: 0.000164. clip: 0.065753
Iteration 11188: Policy loss: 0.029852. Value loss: 0.023529. Entropy: 2.738241.
Iteration 11189: Policy loss: 0.033105. Value loss: 0.015654. Entropy: 2.735931.
Iteration 11190: Policy loss: 0.030744. Value loss: 0.012425. Entropy: 2.740980.
episode: 4027   score: 400.0  epsilon: 1.0    steps: 679  evaluation reward: 396.8
Training network. lr: 0.000164. clip: 0.065753
Iteration 11191: Policy loss: 0.034385. Value loss: 0.014615. Entropy: 2.772291.
Iteration 11192: Policy loss: 0.031871. Value loss: 0.010042. Entropy: 2.766831.
Iteration 11193: Policy loss: 0.032332. Value loss: 0.008958. Entropy: 2.768902.
Training network. lr: 0.000164. clip: 0.065753
Iteration 11194: Policy loss: 0.023826. Val

episode: 4050   score: 900.0  epsilon: 1.0    steps: 949  evaluation reward: 419.2
Training network. lr: 0.000164. clip: 0.065449
Iteration 11251: Policy loss: -0.007011. Value loss: 0.037981. Entropy: 2.607950.
Iteration 11252: Policy loss: -0.013175. Value loss: 0.025158. Entropy: 2.608758.
Iteration 11253: Policy loss: -0.010809. Value loss: 0.018457. Entropy: 2.606553.
Training network. lr: 0.000164. clip: 0.065449
Iteration 11254: Policy loss: -0.010824. Value loss: 0.022346. Entropy: 2.702718.
Iteration 11255: Policy loss: -0.004372. Value loss: 0.017113. Entropy: 2.706465.
Iteration 11256: Policy loss: -0.017801. Value loss: 0.014121. Entropy: 2.704898.
now time :  2019-02-27 07:13:23.072665
episode: 4051   score: 340.0  epsilon: 1.0    steps: 716  evaluation reward: 419.3
Training network. lr: 0.000164. clip: 0.065449
Iteration 11257: Policy loss: 0.017514. Value loss: 0.026317. Entropy: 2.726070.
Iteration 11258: Policy loss: 0.015157. Value loss: 0.022260. Entropy: 2.732099.


Training network. lr: 0.000163. clip: 0.065293
Iteration 11317: Policy loss: 0.020682. Value loss: 0.016091. Entropy: 2.703955.
Iteration 11318: Policy loss: 0.030543. Value loss: 0.012002. Entropy: 2.697438.
Iteration 11319: Policy loss: 0.018563. Value loss: 0.009057. Entropy: 2.704721.
episode: 4072   score: 390.0  epsilon: 1.0    steps: 653  evaluation reward: 416.4
episode: 4073   score: 340.0  epsilon: 1.0    steps: 964  evaluation reward: 415.5
Training network. lr: 0.000163. clip: 0.065293
Iteration 11320: Policy loss: 0.002136. Value loss: 0.025651. Entropy: 2.680813.
Iteration 11321: Policy loss: 0.010854. Value loss: 0.020040. Entropy: 2.697098.
Iteration 11322: Policy loss: 0.006070. Value loss: 0.014464. Entropy: 2.691744.
episode: 4074   score: 390.0  epsilon: 1.0    steps: 68  evaluation reward: 415.6
episode: 4075   score: 330.0  epsilon: 1.0    steps: 331  evaluation reward: 416.1
Training network. lr: 0.000163. clip: 0.065293
Iteration 11323: Policy loss: -0.317013. V

episode: 4094   score: 430.0  epsilon: 1.0    steps: 783  evaluation reward: 455.8
Training network. lr: 0.000163. clip: 0.065136
Iteration 11383: Policy loss: -0.086680. Value loss: 0.569072. Entropy: 2.563201.
Iteration 11384: Policy loss: -0.091641. Value loss: 0.395113. Entropy: 2.568782.
Iteration 11385: Policy loss: -0.089933. Value loss: 0.300482. Entropy: 2.571692.
episode: 4095   score: 400.0  epsilon: 1.0    steps: 569  evaluation reward: 456.4
Training network. lr: 0.000163. clip: 0.065136
Iteration 11386: Policy loss: 0.004961. Value loss: 0.038462. Entropy: 2.570548.
Iteration 11387: Policy loss: 0.006127. Value loss: 0.025066. Entropy: 2.582842.
Iteration 11388: Policy loss: -0.004249. Value loss: 0.018253. Entropy: 2.568854.
episode: 4096   score: 270.0  epsilon: 1.0    steps: 303  evaluation reward: 455.1
episode: 4097   score: 1020.0  epsilon: 1.0    steps: 450  evaluation reward: 460.2
Training network. lr: 0.000163. clip: 0.065136
Iteration 11389: Policy loss: 0.0075

Iteration 11448: Policy loss: 0.102575. Value loss: 0.026057. Entropy: 2.618440.
Training network. lr: 0.000162. clip: 0.064988
Iteration 11449: Policy loss: 0.073300. Value loss: 0.028474. Entropy: 2.664592.
Iteration 11450: Policy loss: 0.072034. Value loss: 0.017991. Entropy: 2.662245.
Iteration 11451: Policy loss: 0.069899. Value loss: 0.015691. Entropy: 2.676237.
episode: 4117   score: 290.0  epsilon: 1.0    steps: 421  evaluation reward: 475.1
Training network. lr: 0.000162. clip: 0.064832
Iteration 11452: Policy loss: 0.055515. Value loss: 0.039223. Entropy: 2.703026.
Iteration 11453: Policy loss: 0.049257. Value loss: 0.023989. Entropy: 2.707947.
Iteration 11454: Policy loss: 0.052266. Value loss: 0.025019. Entropy: 2.713572.
episode: 4118   score: 440.0  epsilon: 1.0    steps: 714  evaluation reward: 475.1
episode: 4119   score: 370.0  epsilon: 1.0    steps: 784  evaluation reward: 475.0
Training network. lr: 0.000162. clip: 0.064832
Iteration 11455: Policy loss: 0.094752. Val

Iteration 11512: Policy loss: 0.001071. Value loss: 0.011755. Entropy: 2.715565.
Iteration 11513: Policy loss: -0.006881. Value loss: 0.018864. Entropy: 2.706445.
Iteration 11514: Policy loss: -0.003852. Value loss: 0.007395. Entropy: 2.705711.
episode: 4141   score: 370.0  epsilon: 1.0    steps: 414  evaluation reward: 459.4
Training network. lr: 0.000162. clip: 0.064675
Iteration 11515: Policy loss: 0.038714. Value loss: 0.020190. Entropy: 2.744360.
Iteration 11516: Policy loss: 0.035763. Value loss: 0.011276. Entropy: 2.734910.
Iteration 11517: Policy loss: 0.040273. Value loss: 0.014825. Entropy: 2.747649.
episode: 4142   score: 350.0  epsilon: 1.0    steps: 676  evaluation reward: 453.9
Training network. lr: 0.000162. clip: 0.064675
Iteration 11518: Policy loss: -0.008982. Value loss: 0.010355. Entropy: 2.743644.
Iteration 11519: Policy loss: -0.008251. Value loss: 0.007737. Entropy: 2.740165.
Iteration 11520: Policy loss: -0.009528. Value loss: 0.004237. Entropy: 2.741778.
episod

episode: 4163   score: 370.0  epsilon: 1.0    steps: 746  evaluation reward: 446.0
episode: 4164   score: 370.0  epsilon: 1.0    steps: 1019  evaluation reward: 446.4
Training network. lr: 0.000161. clip: 0.064528
Iteration 11578: Policy loss: -0.096868. Value loss: 0.363681. Entropy: 2.736577.
Iteration 11579: Policy loss: -0.123295. Value loss: 0.470710. Entropy: 2.682122.
Iteration 11580: Policy loss: -0.134007. Value loss: 0.316538. Entropy: 2.635868.
Training network. lr: 0.000161. clip: 0.064528
Iteration 11581: Policy loss: 0.030242. Value loss: 0.027220. Entropy: 2.671557.
Iteration 11582: Policy loss: 0.033263. Value loss: 0.015040. Entropy: 2.670871.
Iteration 11583: Policy loss: 0.027630. Value loss: 0.011283. Entropy: 2.670157.
episode: 4165   score: 980.0  epsilon: 1.0    steps: 609  evaluation reward: 451.8
Training network. lr: 0.000161. clip: 0.064528
Iteration 11584: Policy loss: -0.002948. Value loss: 0.023622. Entropy: 2.695424.
Iteration 11585: Policy loss: -0.00772

Training network. lr: 0.000161. clip: 0.064371
Iteration 11644: Policy loss: 0.029430. Value loss: 0.029374. Entropy: 2.546812.
Iteration 11645: Policy loss: 0.035645. Value loss: 0.016390. Entropy: 2.572963.
Iteration 11646: Policy loss: 0.024797. Value loss: 0.014425. Entropy: 2.560977.
episode: 4185   score: 1090.0  epsilon: 1.0    steps: 185  evaluation reward: 432.8
Training network. lr: 0.000161. clip: 0.064371
Iteration 11647: Policy loss: -0.034060. Value loss: 0.035482. Entropy: 2.646699.
Iteration 11648: Policy loss: -0.033637. Value loss: 0.018493. Entropy: 2.654537.
Iteration 11649: Policy loss: -0.035326. Value loss: 0.016751. Entropy: 2.655349.
episode: 4186   score: 380.0  epsilon: 1.0    steps: 96  evaluation reward: 432.7
episode: 4187   score: 380.0  epsilon: 1.0    steps: 569  evaluation reward: 427.1
episode: 4188   score: 480.0  epsilon: 1.0    steps: 677  evaluation reward: 427.6
Training network. lr: 0.000161. clip: 0.064371
Iteration 11650: Policy loss: 0.044086

episode: 4207   score: 330.0  epsilon: 1.0    steps: 939  evaluation reward: 415.7
Training network. lr: 0.000160. clip: 0.064067
Iteration 11710: Policy loss: 0.020134. Value loss: 0.016124. Entropy: 2.684079.
Iteration 11711: Policy loss: 0.027089. Value loss: 0.009894. Entropy: 2.673559.
Iteration 11712: Policy loss: 0.023644. Value loss: 0.008813. Entropy: 2.680155.
episode: 4208   score: 240.0  epsilon: 1.0    steps: 464  evaluation reward: 412.9
Training network. lr: 0.000160. clip: 0.064067
Iteration 11713: Policy loss: 0.026057. Value loss: 0.022014. Entropy: 2.717136.
Iteration 11714: Policy loss: 0.026723. Value loss: 0.013500. Entropy: 2.713170.
Iteration 11715: Policy loss: 0.029925. Value loss: 0.010493. Entropy: 2.715487.
Training network. lr: 0.000160. clip: 0.064067
Iteration 11716: Policy loss: -0.058410. Value loss: 0.572712. Entropy: 2.635150.
Iteration 11717: Policy loss: -0.057178. Value loss: 0.212943. Entropy: 2.557101.
Iteration 11718: Policy loss: -0.056383. Va

Iteration 11774: Policy loss: 0.055890. Value loss: 0.016207. Entropy: 2.652057.
Iteration 11775: Policy loss: 0.053623. Value loss: 0.014243. Entropy: 2.652747.
Training network. lr: 0.000160. clip: 0.063910
Iteration 11776: Policy loss: 0.099916. Value loss: 0.016756. Entropy: 2.679476.
Iteration 11777: Policy loss: 0.102186. Value loss: 0.011017. Entropy: 2.667977.
Iteration 11778: Policy loss: 0.096183. Value loss: 0.008676. Entropy: 2.673608.
episode: 4231   score: 310.0  epsilon: 1.0    steps: 448  evaluation reward: 399.8
Training network. lr: 0.000160. clip: 0.063910
Iteration 11779: Policy loss: 0.023808. Value loss: 0.017654. Entropy: 2.675755.
Iteration 11780: Policy loss: 0.024022. Value loss: 0.011565. Entropy: 2.669877.
Iteration 11781: Policy loss: 0.023059. Value loss: 0.010580. Entropy: 2.669071.
episode: 4232   score: 250.0  epsilon: 1.0    steps: 1008  evaluation reward: 398.5
Training network. lr: 0.000160. clip: 0.063910
Iteration 11782: Policy loss: 0.027074. Valu

Iteration 11838: Policy loss: 0.021089. Value loss: 0.009210. Entropy: 2.720687.
Training network. lr: 0.000159. clip: 0.063753
Iteration 11839: Policy loss: 0.018257. Value loss: 0.010593. Entropy: 2.774966.
Iteration 11840: Policy loss: 0.013917. Value loss: 0.007411. Entropy: 2.778812.
Iteration 11841: Policy loss: 0.014766. Value loss: 0.005325. Entropy: 2.771199.
episode: 4255   score: 340.0  epsilon: 1.0    steps: 165  evaluation reward: 392.5
Training network. lr: 0.000159. clip: 0.063753
Iteration 11842: Policy loss: 0.024281. Value loss: 0.021615. Entropy: 2.769798.
Iteration 11843: Policy loss: 0.023518. Value loss: 0.012464. Entropy: 2.771581.
Iteration 11844: Policy loss: 0.019632. Value loss: 0.010271. Entropy: 2.771971.
episode: 4256   score: 450.0  epsilon: 1.0    steps: 627  evaluation reward: 394.2
Training network. lr: 0.000159. clip: 0.063753
Iteration 11845: Policy loss: -0.019726. Value loss: 0.019102. Entropy: 2.764183.
Iteration 11846: Policy loss: -0.017869. Val

Iteration 11903: Policy loss: -0.010698. Value loss: 0.008956. Entropy: 2.771795.
Iteration 11904: Policy loss: -0.005998. Value loss: 0.006353. Entropy: 2.771424.
episode: 4278   score: 340.0  epsilon: 1.0    steps: 469  evaluation reward: 377.3
episode: 4279   score: 310.0  epsilon: 1.0    steps: 617  evaluation reward: 376.4
Training network. lr: 0.000159. clip: 0.063449
Iteration 11905: Policy loss: 0.017054. Value loss: 0.013710. Entropy: 2.772921.
Iteration 11906: Policy loss: 0.023235. Value loss: 0.006496. Entropy: 2.777878.
Iteration 11907: Policy loss: 0.018966. Value loss: 0.008917. Entropy: 2.776588.
episode: 4280   score: 380.0  epsilon: 1.0    steps: 211  evaluation reward: 375.5
Training network. lr: 0.000159. clip: 0.063449
Iteration 11908: Policy loss: 0.015978. Value loss: 0.012276. Entropy: 2.740582.
Iteration 11909: Policy loss: 0.017462. Value loss: 0.009284. Entropy: 2.746651.
Iteration 11910: Policy loss: 0.017731. Value loss: 0.008406. Entropy: 2.747850.
Trainin

Iteration 11966: Policy loss: 0.008010. Value loss: 0.021400. Entropy: 2.635906.
Iteration 11967: Policy loss: 0.010584. Value loss: 0.019261. Entropy: 2.655629.
Training network. lr: 0.000158. clip: 0.063293
Iteration 11968: Policy loss: 0.002049. Value loss: 0.021786. Entropy: 2.680003.
Iteration 11969: Policy loss: -0.000156. Value loss: 0.014482. Entropy: 2.674361.
Iteration 11970: Policy loss: 0.001697. Value loss: 0.012716. Entropy: 2.679681.
episode: 4303   score: 960.0  epsilon: 1.0    steps: 603  evaluation reward: 357.6
Training network. lr: 0.000158. clip: 0.063293
Iteration 11971: Policy loss: 0.032913. Value loss: 0.027930. Entropy: 2.733282.
Iteration 11972: Policy loss: 0.028296. Value loss: 0.022267. Entropy: 2.722806.
Iteration 11973: Policy loss: 0.033527. Value loss: 0.012560. Entropy: 2.729128.
Training network. lr: 0.000158. clip: 0.063293
Iteration 11974: Policy loss: 0.075957. Value loss: 0.025175. Entropy: 2.770365.
Iteration 11975: Policy loss: 0.083412. Value 

Iteration 12031: Policy loss: 0.041944. Value loss: 0.020457. Entropy: 2.776916.
Iteration 12032: Policy loss: 0.043167. Value loss: 0.010159. Entropy: 2.772164.
Iteration 12033: Policy loss: 0.037297. Value loss: 0.009206. Entropy: 2.771006.
episode: 4326   score: 300.0  epsilon: 1.0    steps: 240  evaluation reward: 352.2
Training network. lr: 0.000158. clip: 0.063145
Iteration 12034: Policy loss: 0.060947. Value loss: 0.022743. Entropy: 2.785638.
Iteration 12035: Policy loss: 0.064071. Value loss: 0.013201. Entropy: 2.789312.
Iteration 12036: Policy loss: 0.064079. Value loss: 0.008989. Entropy: 2.799410.
episode: 4327   score: 230.0  epsilon: 1.0    steps: 309  evaluation reward: 350.8
episode: 4328   score: 500.0  epsilon: 1.0    steps: 488  evaluation reward: 352.8
episode: 4329   score: 440.0  epsilon: 1.0    steps: 1007  evaluation reward: 354.5
Training network. lr: 0.000158. clip: 0.063145
Iteration 12037: Policy loss: 0.015453. Value loss: 0.025088. Entropy: 2.771044.
Iterat

Iteration 12097: Policy loss: 0.076678. Value loss: 0.015220. Entropy: 2.790587.
Iteration 12098: Policy loss: 0.070491. Value loss: 0.012665. Entropy: 2.787420.
Iteration 12099: Policy loss: 0.074069. Value loss: 0.008144. Entropy: 2.792531.
episode: 4348   score: 290.0  epsilon: 1.0    steps: 316  evaluation reward: 374.3
episode: 4349   score: 390.0  epsilon: 1.0    steps: 398  evaluation reward: 375.6
episode: 4350   score: 300.0  epsilon: 1.0    steps: 656  evaluation reward: 375.9
now time :  2019-02-27 07:29:58.655700
episode: 4351   score: 340.0  epsilon: 1.0    steps: 915  evaluation reward: 376.5
Training network. lr: 0.000157. clip: 0.062989
Iteration 12100: Policy loss: 0.040370. Value loss: 0.017150. Entropy: 2.723245.
Iteration 12101: Policy loss: 0.040783. Value loss: 0.012964. Entropy: 2.724451.
Iteration 12102: Policy loss: 0.041721. Value loss: 0.007571. Entropy: 2.723111.
Training network. lr: 0.000157. clip: 0.062832
Iteration 12103: Policy loss: 0.011526. Value los

Iteration 12160: Policy loss: 0.014174. Value loss: 0.014508. Entropy: 2.794065.
Iteration 12161: Policy loss: 0.008704. Value loss: 0.010491. Entropy: 2.795721.
Iteration 12162: Policy loss: 0.012113. Value loss: 0.008788. Entropy: 2.797421.
episode: 4373   score: 340.0  epsilon: 1.0    steps: 123  evaluation reward: 369.6
episode: 4374   score: 360.0  epsilon: 1.0    steps: 278  evaluation reward: 370.2
Training network. lr: 0.000157. clip: 0.062684
Iteration 12163: Policy loss: 0.025790. Value loss: 0.016831. Entropy: 2.731662.
Iteration 12164: Policy loss: 0.025568. Value loss: 0.007269. Entropy: 2.728844.
Iteration 12165: Policy loss: 0.023317. Value loss: 0.008967. Entropy: 2.729048.
Training network. lr: 0.000157. clip: 0.062684
Iteration 12166: Policy loss: -0.001502. Value loss: 0.012672. Entropy: 2.764838.
Iteration 12167: Policy loss: 0.000068. Value loss: 0.007931. Entropy: 2.765747.
Iteration 12168: Policy loss: -0.003601. Value loss: 0.006461. Entropy: 2.766966.
episode: 

episode: 4396   score: 300.0  epsilon: 1.0    steps: 732  evaluation reward: 380.8
Training network. lr: 0.000156. clip: 0.062528
Iteration 12226: Policy loss: -0.015626. Value loss: 0.026443. Entropy: 2.686640.
Iteration 12227: Policy loss: -0.020930. Value loss: 0.017899. Entropy: 2.683596.
Iteration 12228: Policy loss: -0.010097. Value loss: 0.015273. Entropy: 2.686721.
Training network. lr: 0.000156. clip: 0.062528
Iteration 12229: Policy loss: 0.055402. Value loss: 0.014255. Entropy: 2.698804.
Iteration 12230: Policy loss: 0.053695. Value loss: 0.010696. Entropy: 2.693936.
Iteration 12231: Policy loss: 0.055911. Value loss: 0.007013. Entropy: 2.704344.
episode: 4397   score: 320.0  epsilon: 1.0    steps: 797  evaluation reward: 375.4
episode: 4398   score: 1010.0  epsilon: 1.0    steps: 926  evaluation reward: 382.2
Training network. lr: 0.000156. clip: 0.062528
Iteration 12232: Policy loss: 0.024352. Value loss: 0.013509. Entropy: 2.750273.
Iteration 12233: Policy loss: 0.025261.

episode: 4418   score: 480.0  epsilon: 1.0    steps: 688  evaluation reward: 380.5
Training network. lr: 0.000156. clip: 0.062371
Iteration 12292: Policy loss: -0.097232. Value loss: 0.434359. Entropy: 2.751315.
Iteration 12293: Policy loss: -0.146033. Value loss: 0.187909. Entropy: 2.730985.
Iteration 12294: Policy loss: -0.121587. Value loss: 0.090156. Entropy: 2.721313.
episode: 4419   score: 410.0  epsilon: 1.0    steps: 39  evaluation reward: 380.7
episode: 4420   score: 470.0  epsilon: 1.0    steps: 409  evaluation reward: 379.9
Training network. lr: 0.000156. clip: 0.062371
Iteration 12295: Policy loss: 0.008132. Value loss: 0.011779. Entropy: 2.714730.
Iteration 12296: Policy loss: 0.004000. Value loss: 0.011934. Entropy: 2.696873.
Iteration 12297: Policy loss: 0.006737. Value loss: 0.007654. Entropy: 2.701951.
episode: 4421   score: 330.0  epsilon: 1.0    steps: 173  evaluation reward: 379.2
episode: 4422   score: 270.0  epsilon: 1.0    steps: 827  evaluation reward: 378.3
epi

Iteration 12356: Policy loss: 0.025427. Value loss: 0.017550. Entropy: 2.652860.
Iteration 12357: Policy loss: 0.020595. Value loss: 0.014595. Entropy: 2.645717.
episode: 4442   score: 360.0  epsilon: 1.0    steps: 230  evaluation reward: 387.5
episode: 4443   score: 440.0  epsilon: 1.0    steps: 628  evaluation reward: 387.6
Training network. lr: 0.000155. clip: 0.062067
Iteration 12358: Policy loss: 0.035622. Value loss: 0.035382. Entropy: 2.645775.
Iteration 12359: Policy loss: 0.035891. Value loss: 0.022871. Entropy: 2.641676.
Iteration 12360: Policy loss: 0.035681. Value loss: 0.018806. Entropy: 2.644166.
episode: 4444   score: 390.0  epsilon: 1.0    steps: 51  evaluation reward: 387.0
episode: 4445   score: 570.0  epsilon: 1.0    steps: 258  evaluation reward: 388.7
Training network. lr: 0.000155. clip: 0.062067
Iteration 12361: Policy loss: 0.059490. Value loss: 0.016927. Entropy: 2.675109.
Iteration 12362: Policy loss: 0.060876. Value loss: 0.010965. Entropy: 2.676519.
Iteratio

Iteration 12421: Policy loss: -0.006334. Value loss: 0.034684. Entropy: 2.597111.
Iteration 12422: Policy loss: -0.014997. Value loss: 0.020686. Entropy: 2.605602.
Iteration 12423: Policy loss: -0.010814. Value loss: 0.016832. Entropy: 2.604712.
episode: 4465   score: 870.0  epsilon: 1.0    steps: 497  evaluation reward: 403.2
Training network. lr: 0.000155. clip: 0.061910
Iteration 12424: Policy loss: 0.038899. Value loss: 0.020538. Entropy: 2.677368.
Iteration 12425: Policy loss: 0.038645. Value loss: 0.009830. Entropy: 2.683554.
Iteration 12426: Policy loss: 0.031603. Value loss: 0.012506. Entropy: 2.689156.
episode: 4466   score: 460.0  epsilon: 1.0    steps: 169  evaluation reward: 404.8
episode: 4467   score: 460.0  epsilon: 1.0    steps: 334  evaluation reward: 406.0
episode: 4468   score: 370.0  epsilon: 1.0    steps: 565  evaluation reward: 406.0
Training network. lr: 0.000155. clip: 0.061910
Iteration 12427: Policy loss: -0.111348. Value loss: 0.621635. Entropy: 2.680647.
Ite

Training network. lr: 0.000154. clip: 0.061763
Iteration 12487: Policy loss: 0.115130. Value loss: 0.044890. Entropy: 2.686607.
Iteration 12488: Policy loss: 0.114028. Value loss: 0.024931. Entropy: 2.694131.
Iteration 12489: Policy loss: 0.098641. Value loss: 0.019444. Entropy: 2.693088.
episode: 4488   score: 320.0  epsilon: 1.0    steps: 497  evaluation reward: 432.3
episode: 4489   score: 320.0  epsilon: 1.0    steps: 938  evaluation reward: 432.4
Training network. lr: 0.000154. clip: 0.061763
Iteration 12490: Policy loss: 0.013262. Value loss: 0.023313. Entropy: 2.642649.
Iteration 12491: Policy loss: 0.014910. Value loss: 0.015696. Entropy: 2.641187.
Iteration 12492: Policy loss: 0.005327. Value loss: 0.016491. Entropy: 2.645525.
episode: 4490   score: 480.0  epsilon: 1.0    steps: 14  evaluation reward: 434.0
episode: 4491   score: 430.0  epsilon: 1.0    steps: 314  evaluation reward: 435.6
Training network. lr: 0.000154. clip: 0.061763
Iteration 12493: Policy loss: 0.023111. Va

Iteration 12552: Policy loss: -0.023462. Value loss: 0.011088. Entropy: 2.700240.
Training network. lr: 0.000154. clip: 0.061449
Iteration 12553: Policy loss: -0.065575. Value loss: 0.021942. Entropy: 2.689221.
Iteration 12554: Policy loss: -0.071114. Value loss: 0.011749. Entropy: 2.689716.
Iteration 12555: Policy loss: -0.069915. Value loss: 0.008835. Entropy: 2.681700.
Training network. lr: 0.000154. clip: 0.061449
Iteration 12556: Policy loss: -0.078568. Value loss: 0.019210. Entropy: 2.688106.
Iteration 12557: Policy loss: -0.081121. Value loss: 0.011194. Entropy: 2.680881.
Iteration 12558: Policy loss: -0.075612. Value loss: 0.008600. Entropy: 2.691071.
episode: 4511   score: 240.0  epsilon: 1.0    steps: 485  evaluation reward: 431.0
Training network. lr: 0.000154. clip: 0.061449
Iteration 12559: Policy loss: 0.030730. Value loss: 0.040393. Entropy: 2.641668.
Iteration 12560: Policy loss: 0.032944. Value loss: 0.019494. Entropy: 2.634956.
Iteration 12561: Policy loss: 0.029823. 

Iteration 12616: Policy loss: 0.014101. Value loss: 0.029483. Entropy: 2.673031.
Iteration 12617: Policy loss: 0.010130. Value loss: 0.018153. Entropy: 2.668713.
Iteration 12618: Policy loss: 0.010882. Value loss: 0.015235. Entropy: 2.675064.
Training network. lr: 0.000153. clip: 0.061302
Iteration 12619: Policy loss: 0.030540. Value loss: 0.010671. Entropy: 2.687908.
Iteration 12620: Policy loss: 0.025669. Value loss: 0.006961. Entropy: 2.693994.
Iteration 12621: Policy loss: 0.026811. Value loss: 0.005933. Entropy: 2.694500.
episode: 4535   score: 290.0  epsilon: 1.0    steps: 496  evaluation reward: 416.2
Training network. lr: 0.000153. clip: 0.061302
Iteration 12622: Policy loss: 0.059192. Value loss: 0.013255. Entropy: 2.726702.
Iteration 12623: Policy loss: 0.047599. Value loss: 0.010727. Entropy: 2.727512.
Iteration 12624: Policy loss: 0.047735. Value loss: 0.008466. Entropy: 2.718604.
Training network. lr: 0.000153. clip: 0.061302
Iteration 12625: Policy loss: -0.016607. Value 

Iteration 12680: Policy loss: 0.039380. Value loss: 0.012085. Entropy: 2.686568.
Iteration 12681: Policy loss: 0.038649. Value loss: 0.011695. Entropy: 2.690698.
Training network. lr: 0.000153. clip: 0.061145
Iteration 12682: Policy loss: -0.017575. Value loss: 0.018539. Entropy: 2.683908.
Iteration 12683: Policy loss: -0.022499. Value loss: 0.015049. Entropy: 2.678089.
Iteration 12684: Policy loss: -0.013698. Value loss: 0.010939. Entropy: 2.678243.
Training network. lr: 0.000153. clip: 0.061145
Iteration 12685: Policy loss: -0.102179. Value loss: 0.352243. Entropy: 2.712177.
Iteration 12686: Policy loss: -0.103311. Value loss: 0.234361. Entropy: 2.690326.
Iteration 12687: Policy loss: -0.114827. Value loss: 0.124169. Entropy: 2.681037.
episode: 4559   score: 350.0  epsilon: 1.0    steps: 472  evaluation reward: 410.1
Training network. lr: 0.000153. clip: 0.061145
Iteration 12688: Policy loss: 0.062881. Value loss: 0.041728. Entropy: 2.624063.
Iteration 12689: Policy loss: 0.059731. V

Iteration 12746: Policy loss: -0.082862. Value loss: 0.088770. Entropy: 2.621654.
Iteration 12747: Policy loss: -0.081291. Value loss: 0.206219. Entropy: 2.611055.
episode: 4581   score: 370.0  epsilon: 1.0    steps: 679  evaluation reward: 385.5
episode: 4582   score: 380.0  epsilon: 1.0    steps: 809  evaluation reward: 385.6
Training network. lr: 0.000152. clip: 0.060989
Iteration 12748: Policy loss: 0.072246. Value loss: 0.030383. Entropy: 2.573182.
Iteration 12749: Policy loss: 0.082523. Value loss: 0.016463. Entropy: 2.550125.
Iteration 12750: Policy loss: 0.071797. Value loss: 0.014569. Entropy: 2.546967.
episode: 4583   score: 500.0  epsilon: 1.0    steps: 177  evaluation reward: 387.3
episode: 4584   score: 370.0  epsilon: 1.0    steps: 396  evaluation reward: 385.9
episode: 4585   score: 390.0  epsilon: 1.0    steps: 640  evaluation reward: 381.3
Training network. lr: 0.000152. clip: 0.060841
Iteration 12751: Policy loss: 0.000255. Value loss: 0.022409. Entropy: 2.606443.
Ite

Iteration 12810: Policy loss: 0.018153. Value loss: 0.016911. Entropy: 2.710441.
Training network. lr: 0.000152. clip: 0.060685
Iteration 12811: Policy loss: -0.026489. Value loss: 0.046445. Entropy: 2.719019.
Iteration 12812: Policy loss: -0.029544. Value loss: 0.029876. Entropy: 2.715769.
Iteration 12813: Policy loss: -0.024527. Value loss: 0.021379. Entropy: 2.715403.
episode: 4605   score: 400.0  epsilon: 1.0    steps: 609  evaluation reward: 394.4
Training network. lr: 0.000152. clip: 0.060685
Iteration 12814: Policy loss: 0.025403. Value loss: 0.460005. Entropy: 2.619729.
Iteration 12815: Policy loss: 0.005503. Value loss: 0.104311. Entropy: 2.562099.
Iteration 12816: Policy loss: 0.016160. Value loss: 0.062302. Entropy: 2.589633.
episode: 4606   score: 290.0  epsilon: 1.0    steps: 191  evaluation reward: 394.8
episode: 4607   score: 410.0  epsilon: 1.0    steps: 662  evaluation reward: 395.1
Training network. lr: 0.000152. clip: 0.060685
Iteration 12817: Policy loss: 0.045436. 

Iteration 12876: Policy loss: 0.081239. Value loss: 0.010137. Entropy: 2.732440.
episode: 4627   score: 950.0  epsilon: 1.0    steps: 892  evaluation reward: 424.5
Training network. lr: 0.000151. clip: 0.060528
Iteration 12877: Policy loss: -0.112959. Value loss: 0.416777. Entropy: 2.685029.
Iteration 12878: Policy loss: -0.178117. Value loss: 0.327935. Entropy: 2.661569.
Iteration 12879: Policy loss: -0.115319. Value loss: 0.203204. Entropy: 2.677463.
episode: 4628   score: 290.0  epsilon: 1.0    steps: 158  evaluation reward: 425.1
episode: 4629   score: 330.0  epsilon: 1.0    steps: 950  evaluation reward: 424.7
Training network. lr: 0.000151. clip: 0.060528
Iteration 12880: Policy loss: 0.089142. Value loss: 0.081869. Entropy: 2.674537.
Iteration 12881: Policy loss: 0.103159. Value loss: 0.034235. Entropy: 2.671003.
Iteration 12882: Policy loss: 0.071510. Value loss: 0.032635. Entropy: 2.666853.
episode: 4630   score: 410.0  epsilon: 1.0    steps: 490  evaluation reward: 425.0
Trai

Iteration 12939: Policy loss: 0.013683. Value loss: 0.006995. Entropy: 2.735314.
episode: 4652   score: 310.0  epsilon: 1.0    steps: 579  evaluation reward: 419.4
Training network. lr: 0.000151. clip: 0.060380
Iteration 12940: Policy loss: 0.009856. Value loss: 0.012319. Entropy: 2.773391.
Iteration 12941: Policy loss: -0.004878. Value loss: 0.009479. Entropy: 2.776999.
Iteration 12942: Policy loss: 0.005199. Value loss: 0.006631. Entropy: 2.771451.
episode: 4653   score: 290.0  epsilon: 1.0    steps: 59  evaluation reward: 419.1
Training network. lr: 0.000151. clip: 0.060380
Iteration 12943: Policy loss: 0.011586. Value loss: 0.008471. Entropy: 2.791317.
Iteration 12944: Policy loss: 0.010648. Value loss: 0.008170. Entropy: 2.792114.
Iteration 12945: Policy loss: 0.009076. Value loss: 0.006121. Entropy: 2.791110.
Training network. lr: 0.000151. clip: 0.060380
Iteration 12946: Policy loss: 0.030662. Value loss: 0.008040. Entropy: 2.798354.
Iteration 12947: Policy loss: 0.029898. Value

episode: 4673   score: 360.0  epsilon: 1.0    steps: 384  evaluation reward: 423.0
Training network. lr: 0.000150. clip: 0.060067
Iteration 13006: Policy loss: 0.047835. Value loss: 0.030627. Entropy: 2.767003.
Iteration 13007: Policy loss: 0.047168. Value loss: 0.015192. Entropy: 2.767154.
Iteration 13008: Policy loss: 0.045182. Value loss: 0.011011. Entropy: 2.763041.
episode: 4674   score: 440.0  epsilon: 1.0    steps: 210  evaluation reward: 417.6
episode: 4675   score: 300.0  epsilon: 1.0    steps: 405  evaluation reward: 417.9
Training network. lr: 0.000150. clip: 0.060067
Iteration 13009: Policy loss: 0.052939. Value loss: 0.018935. Entropy: 2.718877.
Iteration 13010: Policy loss: 0.061108. Value loss: 0.013536. Entropy: 2.714914.
Iteration 13011: Policy loss: 0.052206. Value loss: 0.010928. Entropy: 2.715384.
Training network. lr: 0.000150. clip: 0.060067
Iteration 13012: Policy loss: -0.017080. Value loss: 0.015929. Entropy: 2.733540.
Iteration 13013: Policy loss: -0.011399. V

Iteration 13070: Policy loss: 0.022410. Value loss: 0.009200. Entropy: 2.740350.
Iteration 13071: Policy loss: 0.014805. Value loss: 0.009935. Entropy: 2.741998.
episode: 4697   score: 410.0  epsilon: 1.0    steps: 252  evaluation reward: 406.7
episode: 4698   score: 400.0  epsilon: 1.0    steps: 854  evaluation reward: 407.9
Training network. lr: 0.000150. clip: 0.059920
Iteration 13072: Policy loss: 0.014243. Value loss: 0.025257. Entropy: 2.747050.
Iteration 13073: Policy loss: 0.017455. Value loss: 0.012898. Entropy: 2.746300.
Iteration 13074: Policy loss: 0.018926. Value loss: 0.009973. Entropy: 2.750896.
episode: 4699   score: 360.0  epsilon: 1.0    steps: 77  evaluation reward: 408.3
Training network. lr: 0.000150. clip: 0.059920
Iteration 13075: Policy loss: 0.055474. Value loss: 0.016653. Entropy: 2.715720.
Iteration 13076: Policy loss: 0.055403. Value loss: 0.011028. Entropy: 2.711304.
Iteration 13077: Policy loss: 0.056789. Value loss: 0.013956. Entropy: 2.713606.
episode: 4

Iteration 13134: Policy loss: 0.007758. Value loss: 0.009070. Entropy: 2.753250.
Training network. lr: 0.000149. clip: 0.059763
Iteration 13135: Policy loss: -0.123760. Value loss: 0.370923. Entropy: 2.613378.
Iteration 13136: Policy loss: -0.130973. Value loss: 0.218866. Entropy: 2.594870.
Iteration 13137: Policy loss: -0.137170. Value loss: 0.128270. Entropy: 2.616733.
episode: 4721   score: 360.0  epsilon: 1.0    steps: 68  evaluation reward: 391.1
Training network. lr: 0.000149. clip: 0.059763
Iteration 13138: Policy loss: 0.016094. Value loss: 0.020231. Entropy: 2.613471.
Iteration 13139: Policy loss: 0.012942. Value loss: 0.014497. Entropy: 2.605236.
Iteration 13140: Policy loss: 0.012905. Value loss: 0.011519. Entropy: 2.607469.
episode: 4722   score: 350.0  epsilon: 1.0    steps: 221  evaluation reward: 390.8
episode: 4723   score: 240.0  epsilon: 1.0    steps: 631  evaluation reward: 389.2
Training network. lr: 0.000149. clip: 0.059763
Iteration 13141: Policy loss: -0.029167. 

Training network. lr: 0.000149. clip: 0.059606
Iteration 13198: Policy loss: 0.020187. Value loss: 0.040408. Entropy: 2.676246.
Iteration 13199: Policy loss: 0.025575. Value loss: 0.019831. Entropy: 2.689915.
Iteration 13200: Policy loss: 0.024469. Value loss: 0.016416. Entropy: 2.696446.
episode: 4745   score: 310.0  epsilon: 1.0    steps: 115  evaluation reward: 421.2
episode: 4746   score: 380.0  epsilon: 1.0    steps: 599  evaluation reward: 421.8
Training network. lr: 0.000149. clip: 0.059459
Iteration 13201: Policy loss: 0.076622. Value loss: 0.040149. Entropy: 2.686635.
Iteration 13202: Policy loss: 0.072975. Value loss: 0.022565. Entropy: 2.689615.
Iteration 13203: Policy loss: 0.066593. Value loss: 0.016768. Entropy: 2.695117.
Training network. lr: 0.000149. clip: 0.059459
Iteration 13204: Policy loss: 0.076638. Value loss: 0.031094. Entropy: 2.658524.
Iteration 13205: Policy loss: 0.083984. Value loss: 0.024650. Entropy: 2.650597.
Iteration 13206: Policy loss: 0.066080. Value

episode: 4767   score: 440.0  epsilon: 1.0    steps: 856  evaluation reward: 436.2
Training network. lr: 0.000148. clip: 0.059302
Iteration 13264: Policy loss: 0.074459. Value loss: 0.035565. Entropy: 2.707075.
Iteration 13265: Policy loss: 0.079197. Value loss: 0.022760. Entropy: 2.711612.
Iteration 13266: Policy loss: 0.078947. Value loss: 0.016255. Entropy: 2.708188.
Training network. lr: 0.000148. clip: 0.059302
Iteration 13267: Policy loss: 0.043545. Value loss: 0.027656. Entropy: 2.711227.
Iteration 13268: Policy loss: 0.047600. Value loss: 0.018028. Entropy: 2.725097.
Iteration 13269: Policy loss: 0.038269. Value loss: 0.013523. Entropy: 2.724854.
episode: 4768   score: 350.0  epsilon: 1.0    steps: 411  evaluation reward: 435.8
Training network. lr: 0.000148. clip: 0.059302
Iteration 13270: Policy loss: -0.113668. Value loss: 0.597809. Entropy: 2.658522.
Iteration 13271: Policy loss: -0.135495. Value loss: 0.251784. Entropy: 2.674769.
Iteration 13272: Policy loss: -0.137208. Va

episode: 4789   score: 270.0  epsilon: 1.0    steps: 634  evaluation reward: 437.5
Training network. lr: 0.000148. clip: 0.059145
Iteration 13330: Policy loss: -0.081725. Value loss: 0.663988. Entropy: 2.680886.
Iteration 13331: Policy loss: -0.047788. Value loss: 0.246779. Entropy: 2.619511.
Iteration 13332: Policy loss: -0.098087. Value loss: 0.136747. Entropy: 2.611128.
Training network. lr: 0.000148. clip: 0.059145
Iteration 13333: Policy loss: 0.068757. Value loss: 0.043719. Entropy: 2.589400.
Iteration 13334: Policy loss: 0.063477. Value loss: 0.028862. Entropy: 2.603392.
Iteration 13335: Policy loss: 0.061485. Value loss: 0.024605. Entropy: 2.602498.
episode: 4790   score: 450.0  epsilon: 1.0    steps: 266  evaluation reward: 438.7
episode: 4791   score: 930.0  epsilon: 1.0    steps: 417  evaluation reward: 443.6
Training network. lr: 0.000148. clip: 0.059145
Iteration 13336: Policy loss: 0.052648. Value loss: 0.051926. Entropy: 2.662029.
Iteration 13337: Policy loss: 0.048496. 

Training network. lr: 0.000147. clip: 0.058998
Iteration 13393: Policy loss: -0.068501. Value loss: 0.466595. Entropy: 2.703687.
Iteration 13394: Policy loss: -0.075449. Value loss: 0.344943. Entropy: 2.687025.
Iteration 13395: Policy loss: -0.075072. Value loss: 0.289203. Entropy: 2.677023.
Training network. lr: 0.000147. clip: 0.058998
Iteration 13396: Policy loss: 0.058538. Value loss: 0.035107. Entropy: 2.561859.
Iteration 13397: Policy loss: 0.060193. Value loss: 0.017942. Entropy: 2.552265.
Iteration 13398: Policy loss: 0.058891. Value loss: 0.014956. Entropy: 2.564477.
episode: 4814   score: 1060.0  epsilon: 1.0    steps: 108  evaluation reward: 436.3
episode: 4815   score: 330.0  epsilon: 1.0    steps: 496  evaluation reward: 436.3
episode: 4816   score: 290.0  epsilon: 1.0    steps: 648  evaluation reward: 436.2
Training network. lr: 0.000147. clip: 0.058998
Iteration 13399: Policy loss: 0.009363. Value loss: 0.034629. Entropy: 2.608259.
Iteration 13400: Policy loss: 0.002065.

episode: 4836   score: 940.0  epsilon: 1.0    steps: 74  evaluation reward: 436.4
episode: 4837   score: 290.0  epsilon: 1.0    steps: 527  evaluation reward: 435.4
episode: 4838   score: 300.0  epsilon: 1.0    steps: 988  evaluation reward: 435.2
Training network. lr: 0.000147. clip: 0.058685
Iteration 13459: Policy loss: 0.007996. Value loss: 0.022413. Entropy: 2.708394.
Iteration 13460: Policy loss: 0.004330. Value loss: 0.021102. Entropy: 2.692329.
Iteration 13461: Policy loss: 0.001433. Value loss: 0.014163. Entropy: 2.707586.
episode: 4839   score: 380.0  epsilon: 1.0    steps: 698  evaluation reward: 434.9
Training network. lr: 0.000147. clip: 0.058685
Iteration 13462: Policy loss: 0.023118. Value loss: 0.013690. Entropy: 2.753919.
Iteration 13463: Policy loss: 0.027967. Value loss: 0.009448. Entropy: 2.749636.
Iteration 13464: Policy loss: 0.028103. Value loss: 0.008324. Entropy: 2.749998.
Training network. lr: 0.000147. clip: 0.058685
Iteration 13465: Policy loss: -0.006630. V

episode: 4861   score: 250.0  epsilon: 1.0    steps: 837  evaluation reward: 422.4
Training network. lr: 0.000146. clip: 0.058537
Iteration 13522: Policy loss: 0.043986. Value loss: 0.012473. Entropy: 2.766999.
Iteration 13523: Policy loss: 0.042986. Value loss: 0.010419. Entropy: 2.765890.
Iteration 13524: Policy loss: 0.048091. Value loss: 0.008110. Entropy: 2.765209.
Training network. lr: 0.000146. clip: 0.058537
Iteration 13525: Policy loss: 0.010686. Value loss: 0.016064. Entropy: 2.761220.
Iteration 13526: Policy loss: 0.006403. Value loss: 0.012972. Entropy: 2.767478.
Iteration 13527: Policy loss: 0.010952. Value loss: 0.009190. Entropy: 2.764475.
Training network. lr: 0.000146. clip: 0.058537
Iteration 13528: Policy loss: 0.047237. Value loss: 0.032894. Entropy: 2.752316.
Iteration 13529: Policy loss: 0.050645. Value loss: 0.014022. Entropy: 2.754437.
Iteration 13530: Policy loss: 0.047186. Value loss: 0.009530. Entropy: 2.755933.
episode: 4862   score: 290.0  epsilon: 1.0    s

Training network. lr: 0.000146. clip: 0.058381
Iteration 13588: Policy loss: 0.027103. Value loss: 0.010620. Entropy: 2.751794.
Iteration 13589: Policy loss: 0.029974. Value loss: 0.006959. Entropy: 2.753902.
Iteration 13590: Policy loss: 0.025045. Value loss: 0.006215. Entropy: 2.754479.
episode: 4883   score: 420.0  epsilon: 1.0    steps: 74  evaluation reward: 407.5
Training network. lr: 0.000146. clip: 0.058381
Iteration 13591: Policy loss: 0.005328. Value loss: 0.015956. Entropy: 2.771106.
Iteration 13592: Policy loss: 0.001073. Value loss: 0.012410. Entropy: 2.776245.
Iteration 13593: Policy loss: 0.003092. Value loss: 0.011289. Entropy: 2.778848.
episode: 4884   score: 260.0  epsilon: 1.0    steps: 345  evaluation reward: 407.0
episode: 4885   score: 400.0  epsilon: 1.0    steps: 985  evaluation reward: 407.1
Training network. lr: 0.000146. clip: 0.058381
Iteration 13594: Policy loss: 0.031400. Value loss: 0.030315. Entropy: 2.738748.
Iteration 13595: Policy loss: 0.026707. Valu

Iteration 13653: Policy loss: -0.050259. Value loss: 0.023446. Entropy: 2.640490.
episode: 4906   score: 890.0  epsilon: 1.0    steps: 338  evaluation reward: 401.9
Training network. lr: 0.000145. clip: 0.058076
Iteration 13654: Policy loss: 0.159290. Value loss: 0.054309. Entropy: 2.709035.
Iteration 13655: Policy loss: 0.151251. Value loss: 0.026002. Entropy: 2.711418.
Iteration 13656: Policy loss: 0.164348. Value loss: 0.023107. Entropy: 2.720326.
episode: 4907   score: 390.0  epsilon: 1.0    steps: 117  evaluation reward: 401.2
episode: 4908   score: 280.0  epsilon: 1.0    steps: 702  evaluation reward: 399.8
Training network. lr: 0.000145. clip: 0.058076
Iteration 13657: Policy loss: 0.135140. Value loss: 0.076055. Entropy: 2.775872.
Iteration 13658: Policy loss: 0.145630. Value loss: 0.048056. Entropy: 2.774331.
Iteration 13659: Policy loss: 0.123449. Value loss: 0.032673. Entropy: 2.765159.
episode: 4909   score: 360.0  epsilon: 1.0    steps: 793  evaluation reward: 401.0
episod

Iteration 13718: Policy loss: -0.141464. Value loss: 0.205449. Entropy: 2.686516.
Iteration 13719: Policy loss: -0.105459. Value loss: 0.126107. Entropy: 2.671463.
episode: 4929   score: 390.0  epsilon: 1.0    steps: 295  evaluation reward: 389.6
Training network. lr: 0.000145. clip: 0.057920
Iteration 13720: Policy loss: -0.001618. Value loss: 0.039535. Entropy: 2.642456.
Iteration 13721: Policy loss: 0.013567. Value loss: 0.019517. Entropy: 2.636452.
Iteration 13722: Policy loss: -0.000553. Value loss: 0.019820. Entropy: 2.634108.
episode: 4930   score: 470.0  epsilon: 1.0    steps: 575  evaluation reward: 390.2
Training network. lr: 0.000145. clip: 0.057920
Iteration 13723: Policy loss: -0.024555. Value loss: 0.056123. Entropy: 2.682068.
Iteration 13724: Policy loss: -0.028456. Value loss: 0.030696. Entropy: 2.686934.
Iteration 13725: Policy loss: -0.026568. Value loss: 0.027685. Entropy: 2.684141.
episode: 4931   score: 270.0  epsilon: 1.0    steps: 72  evaluation reward: 390.0
Tra

Iteration 13784: Policy loss: -0.204866. Value loss: 0.430641. Entropy: 2.450197.
Iteration 13785: Policy loss: -0.236181. Value loss: 0.288788. Entropy: 2.450326.
now time :  2019-02-27 08:03:04.507226
episode: 4951   score: 340.0  epsilon: 1.0    steps: 369  evaluation reward: 390.7
episode: 4952   score: 470.0  epsilon: 1.0    steps: 476  evaluation reward: 391.5
Training network. lr: 0.000144. clip: 0.057763
Iteration 13786: Policy loss: 0.121223. Value loss: 0.040645. Entropy: 2.460577.
Iteration 13787: Policy loss: 0.121828. Value loss: 0.020754. Entropy: 2.465948.
Iteration 13788: Policy loss: 0.118256. Value loss: 0.018913. Entropy: 2.443029.
episode: 4953   score: 450.0  epsilon: 1.0    steps: 1001  evaluation reward: 393.4
Training network. lr: 0.000144. clip: 0.057763
Iteration 13789: Policy loss: 0.047143. Value loss: 0.036766. Entropy: 2.468934.
Iteration 13790: Policy loss: 0.051448. Value loss: 0.026436. Entropy: 2.461991.
Iteration 13791: Policy loss: 0.051982. Value lo

Iteration 13848: Policy loss: 0.035029. Value loss: 0.020273. Entropy: 2.650618.
episode: 4975   score: 420.0  epsilon: 1.0    steps: 470  evaluation reward: 424.7
Training network. lr: 0.000144. clip: 0.057616
Iteration 13849: Policy loss: 0.028447. Value loss: 0.022955. Entropy: 2.658128.
Iteration 13850: Policy loss: 0.036516. Value loss: 0.018109. Entropy: 2.661644.
Iteration 13851: Policy loss: 0.026208. Value loss: 0.015041. Entropy: 2.656039.
Training network. lr: 0.000144. clip: 0.057459
Iteration 13852: Policy loss: -0.030825. Value loss: 0.030903. Entropy: 2.672220.
Iteration 13853: Policy loss: -0.034602. Value loss: 0.018380. Entropy: 2.671054.
Iteration 13854: Policy loss: -0.041463. Value loss: 0.013748. Entropy: 2.669056.
Training network. lr: 0.000144. clip: 0.057459
Iteration 13855: Policy loss: 0.033575. Value loss: 0.041235. Entropy: 2.649944.
Iteration 13856: Policy loss: 0.048481. Value loss: 0.026816. Entropy: 2.647754.
Iteration 13857: Policy loss: 0.035456. Valu

Iteration 13914: Policy loss: 0.010707. Value loss: 0.014406. Entropy: 2.683219.
episode: 4997   score: 450.0  epsilon: 1.0    steps: 541  evaluation reward: 444.8
Training network. lr: 0.000143. clip: 0.057302
Iteration 13915: Policy loss: 0.052441. Value loss: 0.030224. Entropy: 2.622784.
Iteration 13916: Policy loss: 0.043607. Value loss: 0.018259. Entropy: 2.630893.
Iteration 13917: Policy loss: 0.050689. Value loss: 0.013096. Entropy: 2.640830.
episode: 4998   score: 380.0  epsilon: 1.0    steps: 338  evaluation reward: 444.4
episode: 4999   score: 440.0  epsilon: 1.0    steps: 931  evaluation reward: 445.1
Training network. lr: 0.000143. clip: 0.057302
Iteration 13918: Policy loss: 0.065547. Value loss: 0.030176. Entropy: 2.696038.
Iteration 13919: Policy loss: 0.068464. Value loss: 0.016348. Entropy: 2.693750.
Iteration 13920: Policy loss: 0.060198. Value loss: 0.020370. Entropy: 2.696380.
episode: 5000   score: 420.0  epsilon: 1.0    steps: 225  evaluation reward: 445.8
Trainin

Iteration 13977: Policy loss: 0.025436. Value loss: 0.011015. Entropy: 2.757232.
episode: 5022   score: 360.0  epsilon: 1.0    steps: 263  evaluation reward: 443.2
Training network. lr: 0.000143. clip: 0.057155
Iteration 13978: Policy loss: 0.011083. Value loss: 0.016657. Entropy: 2.746960.
Iteration 13979: Policy loss: 0.015815. Value loss: 0.010725. Entropy: 2.747548.
Iteration 13980: Policy loss: 0.011123. Value loss: 0.008696. Entropy: 2.751677.
episode: 5023   score: 410.0  epsilon: 1.0    steps: 155  evaluation reward: 443.7
Training network. lr: 0.000143. clip: 0.057155
Iteration 13981: Policy loss: -0.000572. Value loss: 0.017764. Entropy: 2.772650.
Iteration 13982: Policy loss: 0.000667. Value loss: 0.010525. Entropy: 2.769603.
Iteration 13983: Policy loss: -0.002356. Value loss: 0.008594. Entropy: 2.769113.
episode: 5024   score: 380.0  epsilon: 1.0    steps: 685  evaluation reward: 443.5
Training network. lr: 0.000143. clip: 0.057155
Iteration 13984: Policy loss: 0.011454. V

Iteration 14043: Policy loss: -0.223681. Value loss: 2.053497. Entropy: 2.582219.
episode: 5044   score: 390.0  epsilon: 1.0    steps: 687  evaluation reward: 438.2
episode: 5045   score: 350.0  epsilon: 1.0    steps: 859  evaluation reward: 437.6
Training network. lr: 0.000142. clip: 0.056998
Iteration 14044: Policy loss: 0.082984. Value loss: 0.068562. Entropy: 2.594421.
Iteration 14045: Policy loss: 0.083968. Value loss: 0.038241. Entropy: 2.602770.
Iteration 14046: Policy loss: 0.074687. Value loss: 0.032632. Entropy: 2.605084.
Training network. lr: 0.000142. clip: 0.056998
Iteration 14047: Policy loss: 0.061343. Value loss: 0.042508. Entropy: 2.582397.
Iteration 14048: Policy loss: 0.059464. Value loss: 0.023696. Entropy: 2.580162.
Iteration 14049: Policy loss: 0.067054. Value loss: 0.017417. Entropy: 2.581775.
episode: 5046   score: 500.0  epsilon: 1.0    steps: 366  evaluation reward: 439.0
Training network. lr: 0.000142. clip: 0.056998
Iteration 14050: Policy loss: 0.045642. Va

episode: 5068   score: 300.0  epsilon: 1.0    steps: 102  evaluation reward: 417.6
Training network. lr: 0.000142. clip: 0.056694
Iteration 14107: Policy loss: 0.024543. Value loss: 0.018295. Entropy: 2.725440.
Iteration 14108: Policy loss: 0.024485. Value loss: 0.011411. Entropy: 2.736967.
Iteration 14109: Policy loss: 0.023797. Value loss: 0.011775. Entropy: 2.738570.
Training network. lr: 0.000142. clip: 0.056694
Iteration 14110: Policy loss: 0.042126. Value loss: 0.011926. Entropy: 2.731963.
Iteration 14111: Policy loss: 0.041504. Value loss: 0.007570. Entropy: 2.726758.
Iteration 14112: Policy loss: 0.042158. Value loss: 0.005654. Entropy: 2.732976.
episode: 5069   score: 460.0  epsilon: 1.0    steps: 941  evaluation reward: 419.1
Training network. lr: 0.000142. clip: 0.056694
Iteration 14113: Policy loss: 0.062887. Value loss: 0.022065. Entropy: 2.732048.
Iteration 14114: Policy loss: 0.057520. Value loss: 0.012817. Entropy: 2.728564.
Iteration 14115: Policy loss: 0.056032. Value

episode: 5090   score: 340.0  epsilon: 1.0    steps: 518  evaluation reward: 398.0
episode: 5091   score: 1890.0  epsilon: 1.0    steps: 925  evaluation reward: 411.4
Training network. lr: 0.000141. clip: 0.056537
Iteration 14173: Policy loss: 0.014531. Value loss: 0.038751. Entropy: 2.708663.
Iteration 14174: Policy loss: 0.010675. Value loss: 0.033155. Entropy: 2.709727.
Iteration 14175: Policy loss: 0.009465. Value loss: 0.027287. Entropy: 2.707801.
episode: 5092   score: 500.0  epsilon: 1.0    steps: 214  evaluation reward: 411.9
episode: 5093   score: 440.0  epsilon: 1.0    steps: 338  evaluation reward: 412.2
Training network. lr: 0.000141. clip: 0.056537
Iteration 14176: Policy loss: 0.058822. Value loss: 0.032552. Entropy: 2.723727.
Iteration 14177: Policy loss: 0.056176. Value loss: 0.028495. Entropy: 2.722880.
Iteration 14178: Policy loss: 0.054555. Value loss: 0.023794. Entropy: 2.720422.
episode: 5094   score: 450.0  epsilon: 1.0    steps: 37  evaluation reward: 413.2
Train

Training network. lr: 0.000141. clip: 0.056381
Iteration 14236: Policy loss: 0.016602. Value loss: 0.023565. Entropy: 2.737073.
Iteration 14237: Policy loss: 0.015763. Value loss: 0.015724. Entropy: 2.736029.
Iteration 14238: Policy loss: 0.010148. Value loss: 0.013796. Entropy: 2.734607.
episode: 5115   score: 410.0  epsilon: 1.0    steps: 248  evaluation reward: 412.9
episode: 5116   score: 180.0  epsilon: 1.0    steps: 278  evaluation reward: 411.9
Training network. lr: 0.000141. clip: 0.056381
Iteration 14239: Policy loss: -0.003930. Value loss: 0.026996. Entropy: 2.724291.
Iteration 14240: Policy loss: -0.005331. Value loss: 0.018577. Entropy: 2.717644.
Iteration 14241: Policy loss: -0.006597. Value loss: 0.012732. Entropy: 2.716125.
Training network. lr: 0.000141. clip: 0.056381
Iteration 14242: Policy loss: 0.033441. Value loss: 0.017527. Entropy: 2.721729.
Iteration 14243: Policy loss: 0.033124. Value loss: 0.010455. Entropy: 2.724070.
Iteration 14244: Policy loss: 0.035640. Va

Iteration 14300: Policy loss: 0.176836. Value loss: 0.046829. Entropy: 2.673461.
Iteration 14301: Policy loss: 0.160047. Value loss: 0.036059. Entropy: 2.672471.
Training network. lr: 0.000140. clip: 0.056077
Iteration 14302: Policy loss: 0.015816. Value loss: 0.015701. Entropy: 2.656993.
Iteration 14303: Policy loss: 0.016256. Value loss: 0.011556. Entropy: 2.655058.
Iteration 14304: Policy loss: 0.014563. Value loss: 0.009132. Entropy: 2.654426.
Training network. lr: 0.000140. clip: 0.056077
Iteration 14305: Policy loss: -0.002076. Value loss: 0.020908. Entropy: 2.735344.
Iteration 14306: Policy loss: -0.003682. Value loss: 0.014608. Entropy: 2.738473.
Iteration 14307: Policy loss: -0.005636. Value loss: 0.010773. Entropy: 2.739471.
Training network. lr: 0.000140. clip: 0.056077
Iteration 14308: Policy loss: -0.032697. Value loss: 0.055156. Entropy: 2.748128.
Iteration 14309: Policy loss: -0.027562. Value loss: 0.029816. Entropy: 2.747086.
Iteration 14310: Policy loss: -0.030525. Val

Iteration 14367: Policy loss: -0.023294. Value loss: 0.007253. Entropy: 2.742418.
episode: 5160   score: 300.0  epsilon: 1.0    steps: 696  evaluation reward: 404.4
Training network. lr: 0.000140. clip: 0.055920
Iteration 14368: Policy loss: -0.003456. Value loss: 0.021886. Entropy: 2.760653.
Iteration 14369: Policy loss: 0.001823. Value loss: 0.014052. Entropy: 2.755187.
Iteration 14370: Policy loss: -0.002884. Value loss: 0.012067. Entropy: 2.756380.
episode: 5161   score: 310.0  epsilon: 1.0    steps: 506  evaluation reward: 404.6
Training network. lr: 0.000140. clip: 0.055920
Iteration 14371: Policy loss: 0.047531. Value loss: 0.016523. Entropy: 2.779837.
Iteration 14372: Policy loss: 0.049879. Value loss: 0.010159. Entropy: 2.783825.
Iteration 14373: Policy loss: 0.050348. Value loss: 0.007057. Entropy: 2.784873.
episode: 5162   score: 390.0  epsilon: 1.0    steps: 314  evaluation reward: 404.0
episode: 5163   score: 350.0  epsilon: 1.0    steps: 850  evaluation reward: 404.1
epis

episode: 5181   score: 460.0  epsilon: 1.0    steps: 464  evaluation reward: 444.2
episode: 5182   score: 440.0  epsilon: 1.0    steps: 660  evaluation reward: 444.3
Training network. lr: 0.000139. clip: 0.055772
Iteration 14434: Policy loss: 0.060296. Value loss: 0.022566. Entropy: 2.667090.
Iteration 14435: Policy loss: 0.057813. Value loss: 0.013147. Entropy: 2.671957.
Iteration 14436: Policy loss: 0.054486. Value loss: 0.012108. Entropy: 2.669600.
Training network. lr: 0.000139. clip: 0.055772
Iteration 14437: Policy loss: 0.022186. Value loss: 0.019039. Entropy: 2.702525.
Iteration 14438: Policy loss: 0.030268. Value loss: 0.013658. Entropy: 2.698632.
Iteration 14439: Policy loss: 0.027315. Value loss: 0.009235. Entropy: 2.698469.
episode: 5183   score: 410.0  epsilon: 1.0    steps: 556  evaluation reward: 445.0
episode: 5184   score: 250.0  epsilon: 1.0    steps: 851  evaluation reward: 444.2
Training network. lr: 0.000139. clip: 0.055772
Iteration 14440: Policy loss: 0.044632. V

episode: 5204   score: 410.0  epsilon: 1.0    steps: 502  evaluation reward: 435.1
Training network. lr: 0.000139. clip: 0.055616
Iteration 14500: Policy loss: 0.063707. Value loss: 0.032611. Entropy: 2.744051.
Iteration 14501: Policy loss: 0.072398. Value loss: 0.020555. Entropy: 2.737420.
Iteration 14502: Policy loss: 0.060378. Value loss: 0.015730. Entropy: 2.737891.
episode: 5205   score: 380.0  epsilon: 1.0    steps: 308  evaluation reward: 435.5
episode: 5206   score: 240.0  epsilon: 1.0    steps: 703  evaluation reward: 434.9
Training network. lr: 0.000139. clip: 0.055459
Iteration 14503: Policy loss: -0.009414. Value loss: 0.020990. Entropy: 2.697569.
Iteration 14504: Policy loss: -0.009397. Value loss: 0.011716. Entropy: 2.694659.
Iteration 14505: Policy loss: -0.008417. Value loss: 0.010750. Entropy: 2.698568.
Training network. lr: 0.000139. clip: 0.055459
Iteration 14506: Policy loss: 0.040890. Value loss: 0.012714. Entropy: 2.754020.
Iteration 14507: Policy loss: 0.042344. 

Iteration 14563: Policy loss: -0.014077. Value loss: 0.034140. Entropy: 2.756272.
Iteration 14564: Policy loss: -0.017692. Value loss: 0.019742. Entropy: 2.758484.
Iteration 14565: Policy loss: -0.020692. Value loss: 0.016742. Entropy: 2.756439.
episode: 5229   score: 230.0  epsilon: 1.0    steps: 382  evaluation reward: 422.2
Training network. lr: 0.000138. clip: 0.055312
Iteration 14566: Policy loss: 0.055379. Value loss: 0.019510. Entropy: 2.766339.
Iteration 14567: Policy loss: 0.053637. Value loss: 0.015581. Entropy: 2.763301.
Iteration 14568: Policy loss: 0.052150. Value loss: 0.011822. Entropy: 2.768266.
episode: 5230   score: 360.0  epsilon: 1.0    steps: 596  evaluation reward: 422.0
episode: 5231   score: 340.0  epsilon: 1.0    steps: 864  evaluation reward: 421.7
episode: 5232   score: 340.0  epsilon: 1.0    steps: 1020  evaluation reward: 421.8
Training network. lr: 0.000138. clip: 0.055312
Iteration 14569: Policy loss: 0.079461. Value loss: 0.027059. Entropy: 2.738555.
Ite

Iteration 14628: Policy loss: 0.023099. Value loss: 0.010054. Entropy: 2.785126.
episode: 5252   score: 270.0  epsilon: 1.0    steps: 84  evaluation reward: 416.9
episode: 5253   score: 380.0  epsilon: 1.0    steps: 804  evaluation reward: 416.6
Training network. lr: 0.000138. clip: 0.055155
Iteration 14629: Policy loss: 0.002452. Value loss: 0.016124. Entropy: 2.737254.
Iteration 14630: Policy loss: -0.002447. Value loss: 0.011734. Entropy: 2.735264.
Iteration 14631: Policy loss: -0.000579. Value loss: 0.011194. Entropy: 2.739301.
Training network. lr: 0.000138. clip: 0.055155
Iteration 14632: Policy loss: 0.003608. Value loss: 0.011041. Entropy: 2.782322.
Iteration 14633: Policy loss: 0.000546. Value loss: 0.005946. Entropy: 2.784187.
Iteration 14634: Policy loss: 0.001835. Value loss: 0.005134. Entropy: 2.781646.
episode: 5254   score: 350.0  epsilon: 1.0    steps: 279  evaluation reward: 416.8
Training network. lr: 0.000138. clip: 0.055155
Iteration 14635: Policy loss: 0.047379. Va

Training network. lr: 0.000137. clip: 0.054998
Iteration 14692: Policy loss: 0.029623. Value loss: 0.010711. Entropy: 2.771330.
Iteration 14693: Policy loss: 0.032356. Value loss: 0.011491. Entropy: 2.771267.
Iteration 14694: Policy loss: 0.033925. Value loss: 0.008315. Entropy: 2.770897.
episode: 5277   score: 370.0  epsilon: 1.0    steps: 187  evaluation reward: 377.4
episode: 5278   score: 310.0  epsilon: 1.0    steps: 297  evaluation reward: 376.4
Training network. lr: 0.000137. clip: 0.054998
Iteration 14695: Policy loss: -0.001057. Value loss: 0.020343. Entropy: 2.745327.
Iteration 14696: Policy loss: 0.001488. Value loss: 0.015952. Entropy: 2.747034.
Iteration 14697: Policy loss: -0.001508. Value loss: 0.013268. Entropy: 2.747877.
Training network. lr: 0.000137. clip: 0.054998
Iteration 14698: Policy loss: 0.003103. Value loss: 0.013682. Entropy: 2.779479.
Iteration 14699: Policy loss: 0.001584. Value loss: 0.008880. Entropy: 2.778454.
Iteration 14700: Policy loss: 0.002421. Val

Iteration 14757: Policy loss: -0.058917. Value loss: 0.010307. Entropy: 2.810208.
episode: 5300   score: 380.0  epsilon: 1.0    steps: 480  evaluation reward: 360.7
now time :  2019-02-27 08:22:10.622352
episode: 5301   score: 340.0  epsilon: 1.0    steps: 658  evaluation reward: 360.1
episode: 5302   score: 310.0  epsilon: 1.0    steps: 1013  evaluation reward: 353.9
Training network. lr: 0.000137. clip: 0.054694
Iteration 14758: Policy loss: 0.012287. Value loss: 0.021506. Entropy: 2.794756.
Iteration 14759: Policy loss: 0.009364. Value loss: 0.019400. Entropy: 2.793016.
Iteration 14760: Policy loss: 0.009119. Value loss: 0.015727. Entropy: 2.791887.
Training network. lr: 0.000137. clip: 0.054694
Iteration 14761: Policy loss: 0.045930. Value loss: 0.020882. Entropy: 2.753968.
Iteration 14762: Policy loss: 0.047476. Value loss: 0.008420. Entropy: 2.750015.
Iteration 14763: Policy loss: 0.041064. Value loss: 0.006562. Entropy: 2.751532.
episode: 5303   score: 420.0  epsilon: 1.0    ste

Training network. lr: 0.000136. clip: 0.054537
Iteration 14821: Policy loss: 0.008665. Value loss: 0.023758. Entropy: 2.758757.
Iteration 14822: Policy loss: 0.011470. Value loss: 0.014360. Entropy: 2.764035.
Iteration 14823: Policy loss: 0.011462. Value loss: 0.012084. Entropy: 2.758852.
episode: 5324   score: 310.0  epsilon: 1.0    steps: 284  evaluation reward: 367.4
episode: 5325   score: 350.0  epsilon: 1.0    steps: 548  evaluation reward: 367.4
episode: 5326   score: 270.0  epsilon: 1.0    steps: 649  evaluation reward: 366.0
Training network. lr: 0.000136. clip: 0.054537
Iteration 14824: Policy loss: 0.014909. Value loss: 0.025308. Entropy: 2.705082.
Iteration 14825: Policy loss: 0.029405. Value loss: 0.014567. Entropy: 2.704523.
Iteration 14826: Policy loss: 0.022433. Value loss: 0.012614. Entropy: 2.709857.
episode: 5327   score: 400.0  epsilon: 1.0    steps: 432  evaluation reward: 367.2
episode: 5328   score: 370.0  epsilon: 1.0    steps: 923  evaluation reward: 366.9
Train

Training network. lr: 0.000136. clip: 0.054390
Iteration 14884: Policy loss: -0.021797. Value loss: 0.021072. Entropy: 2.708544.
Iteration 14885: Policy loss: -0.024220. Value loss: 0.015392. Entropy: 2.699004.
Iteration 14886: Policy loss: -0.022070. Value loss: 0.014030. Entropy: 2.701288.
episode: 5350   score: 380.0  epsilon: 1.0    steps: 871  evaluation reward: 367.5
now time :  2019-02-27 08:24:42.797382
episode: 5351   score: 370.0  epsilon: 1.0    steps: 956  evaluation reward: 367.9
Training network. lr: 0.000136. clip: 0.054390
Iteration 14887: Policy loss: 0.057766. Value loss: 0.018464. Entropy: 2.769887.
Iteration 14888: Policy loss: 0.053054. Value loss: 0.011786. Entropy: 2.771736.
Iteration 14889: Policy loss: 0.053214. Value loss: 0.009486. Entropy: 2.768411.
episode: 5352   score: 330.0  epsilon: 1.0    steps: 603  evaluation reward: 368.5
Training network. lr: 0.000136. clip: 0.054390
Iteration 14890: Policy loss: 0.004012. Value loss: 0.015139. Entropy: 2.723447.
I

Iteration 14948: Policy loss: -0.011032. Value loss: 0.012955. Entropy: 2.762030.
Iteration 14949: Policy loss: -0.010819. Value loss: 0.010776. Entropy: 2.764437.
Training network. lr: 0.000136. clip: 0.054233
Iteration 14950: Policy loss: 0.073115. Value loss: 0.025133. Entropy: 2.759562.
Iteration 14951: Policy loss: 0.060726. Value loss: 0.017996. Entropy: 2.761918.
Iteration 14952: Policy loss: 0.075771. Value loss: 0.010423. Entropy: 2.759635.
episode: 5374   score: 290.0  epsilon: 1.0    steps: 142  evaluation reward: 367.6
Training network. lr: 0.000135. clip: 0.054077
Iteration 14953: Policy loss: -0.044123. Value loss: 0.023807. Entropy: 2.758747.
Iteration 14954: Policy loss: -0.039510. Value loss: 0.015405. Entropy: 2.753606.
Iteration 14955: Policy loss: -0.042203. Value loss: 0.014100. Entropy: 2.753086.
episode: 5375   score: 350.0  epsilon: 1.0    steps: 99  evaluation reward: 368.2
episode: 5376   score: 560.0  epsilon: 1.0    steps: 697  evaluation reward: 370.3
Train

Training network. lr: 0.000135. clip: 0.053929
Iteration 15013: Policy loss: 0.096427. Value loss: 0.024416. Entropy: 2.703219.
Iteration 15014: Policy loss: 0.089144. Value loss: 0.014977. Entropy: 2.709695.
Iteration 15015: Policy loss: 0.091231. Value loss: 0.012689. Entropy: 2.707223.
episode: 5399   score: 460.0  epsilon: 1.0    steps: 219  evaluation reward: 379.9
Training network. lr: 0.000135. clip: 0.053929
Iteration 15016: Policy loss: 0.027876. Value loss: 0.021227. Entropy: 2.744574.
Iteration 15017: Policy loss: 0.023287. Value loss: 0.012825. Entropy: 2.745748.
Iteration 15018: Policy loss: 0.028052. Value loss: 0.011309. Entropy: 2.748932.
episode: 5400   score: 440.0  epsilon: 1.0    steps: 761  evaluation reward: 380.5
now time :  2019-02-27 08:27:18.272190
episode: 5401   score: 270.0  epsilon: 1.0    steps: 977  evaluation reward: 379.8
Training network. lr: 0.000135. clip: 0.053929
Iteration 15019: Policy loss: 0.026162. Value loss: 0.025280. Entropy: 2.737195.
Iter

episode: 5421   score: 410.0  epsilon: 1.0    steps: 97  evaluation reward: 374.2
episode: 5422   score: 390.0  epsilon: 1.0    steps: 872  evaluation reward: 374.2
Training network. lr: 0.000134. clip: 0.053773
Iteration 15079: Policy loss: 0.075876. Value loss: 0.025987. Entropy: 2.735925.
Iteration 15080: Policy loss: 0.075941. Value loss: 0.017682. Entropy: 2.739504.
Iteration 15081: Policy loss: 0.069753. Value loss: 0.015151. Entropy: 2.744097.
episode: 5423   score: 290.0  epsilon: 1.0    steps: 297  evaluation reward: 374.1
Training network. lr: 0.000134. clip: 0.053773
Iteration 15082: Policy loss: 0.001983. Value loss: 0.019830. Entropy: 2.696712.
Iteration 15083: Policy loss: -0.001758. Value loss: 0.013364. Entropy: 2.706872.
Iteration 15084: Policy loss: 0.006619. Value loss: 0.013186. Entropy: 2.704406.
episode: 5424   score: 450.0  epsilon: 1.0    steps: 971  evaluation reward: 375.5
Training network. lr: 0.000134. clip: 0.053773
Iteration 15085: Policy loss: -0.002370. 

Training network. lr: 0.000134. clip: 0.053616
Iteration 15142: Policy loss: -0.007529. Value loss: 0.011037. Entropy: 2.728847.
Iteration 15143: Policy loss: -0.013427. Value loss: 0.010720. Entropy: 2.727155.
Iteration 15144: Policy loss: -0.008760. Value loss: 0.008099. Entropy: 2.722042.
episode: 5447   score: 340.0  epsilon: 1.0    steps: 511  evaluation reward: 378.4
Training network. lr: 0.000134. clip: 0.053616
Iteration 15145: Policy loss: -0.249489. Value loss: 1.290972. Entropy: 2.737808.
Iteration 15146: Policy loss: -0.281614. Value loss: 0.849213. Entropy: 2.714357.
Iteration 15147: Policy loss: -0.262107. Value loss: 0.463109. Entropy: 2.673705.
episode: 5448   score: 1450.0  epsilon: 1.0    steps: 289  evaluation reward: 389.2
Training network. lr: 0.000134. clip: 0.053616
Iteration 15148: Policy loss: 0.041012. Value loss: 0.034371. Entropy: 2.654703.
Iteration 15149: Policy loss: 0.045407. Value loss: 0.021146. Entropy: 2.666326.
Iteration 15150: Policy loss: 0.038017

episode: 5469   score: 400.0  epsilon: 1.0    steps: 905  evaluation reward: 382.1
Training network. lr: 0.000133. clip: 0.053312
Iteration 15208: Policy loss: 0.110723. Value loss: 0.064740. Entropy: 2.561702.
Iteration 15209: Policy loss: 0.108907. Value loss: 0.033962. Entropy: 2.555106.
Iteration 15210: Policy loss: 0.100484. Value loss: 0.023839. Entropy: 2.560292.
episode: 5470   score: 410.0  epsilon: 1.0    steps: 272  evaluation reward: 381.9
episode: 5471   score: 420.0  epsilon: 1.0    steps: 523  evaluation reward: 383.2
Training network. lr: 0.000133. clip: 0.053312
Iteration 15211: Policy loss: 0.046668. Value loss: 0.061376. Entropy: 2.592094.
Iteration 15212: Policy loss: 0.043269. Value loss: 0.036654. Entropy: 2.604556.
Iteration 15213: Policy loss: 0.042814. Value loss: 0.027038. Entropy: 2.586282.
Training network. lr: 0.000133. clip: 0.053312
Iteration 15214: Policy loss: 0.145614. Value loss: 0.037703. Entropy: 2.626093.
Iteration 15215: Policy loss: 0.138093. Val

Iteration 15273: Policy loss: 0.022512. Value loss: 0.013320. Entropy: 2.646670.
episode: 5492   score: 300.0  epsilon: 1.0    steps: 535  evaluation reward: 414.6
Training network. lr: 0.000133. clip: 0.053155
Iteration 15274: Policy loss: -0.189859. Value loss: 0.723009. Entropy: 2.619576.
Iteration 15275: Policy loss: -0.194405. Value loss: 0.471843. Entropy: 2.601540.
Iteration 15276: Policy loss: -0.157047. Value loss: 0.201764. Entropy: 2.567472.
episode: 5493   score: 300.0  epsilon: 1.0    steps: 962  evaluation reward: 414.2
Training network. lr: 0.000133. clip: 0.053155
Iteration 15277: Policy loss: -0.045557. Value loss: 0.558575. Entropy: 2.442042.
Iteration 15278: Policy loss: 0.013036. Value loss: 0.237585. Entropy: 2.411731.
Iteration 15279: Policy loss: 0.000774. Value loss: 0.247704. Entropy: 2.415894.
Training network. lr: 0.000133. clip: 0.053155
Iteration 15280: Policy loss: -0.024629. Value loss: 0.932682. Entropy: 2.365406.
Iteration 15281: Policy loss: -0.016492.

Training network. lr: 0.000133. clip: 0.053008
Iteration 15340: Policy loss: -0.021110. Value loss: 0.553536. Entropy: 2.511435.
Iteration 15341: Policy loss: -0.037068. Value loss: 0.527279. Entropy: 2.486212.
Iteration 15342: Policy loss: -0.060701. Value loss: 0.456131. Entropy: 2.472057.
episode: 5513   score: 350.0  epsilon: 1.0    steps: 358  evaluation reward: 423.7
episode: 5514   score: 430.0  epsilon: 1.0    steps: 420  evaluation reward: 423.9
episode: 5515   score: 360.0  epsilon: 1.0    steps: 646  evaluation reward: 423.6
Training network. lr: 0.000133. clip: 0.053008
Iteration 15343: Policy loss: 0.034773. Value loss: 0.031421. Entropy: 2.437102.
Iteration 15344: Policy loss: 0.037925. Value loss: 0.020086. Entropy: 2.446620.
Iteration 15345: Policy loss: 0.037332. Value loss: 0.019775. Entropy: 2.448318.
episode: 5516   score: 1000.0  epsilon: 1.0    steps: 238  evaluation reward: 429.5
episode: 5517   score: 560.0  epsilon: 1.0    steps: 876  evaluation reward: 430.7
e

Iteration 15405: Policy loss: 0.046079. Value loss: 0.020517. Entropy: 2.332475.
episode: 5536   score: 1010.0  epsilon: 1.0    steps: 536  evaluation reward: 456.5
episode: 5537   score: 300.0  epsilon: 1.0    steps: 918  evaluation reward: 456.4
Training network. lr: 0.000132. clip: 0.052694
Iteration 15406: Policy loss: 0.030574. Value loss: 0.055264. Entropy: 2.558611.
Iteration 15407: Policy loss: 0.025445. Value loss: 0.030946. Entropy: 2.562390.
Iteration 15408: Policy loss: 0.022747. Value loss: 0.025950. Entropy: 2.546115.
episode: 5538   score: 420.0  epsilon: 1.0    steps: 715  evaluation reward: 457.7
Training network. lr: 0.000132. clip: 0.052694
Iteration 15409: Policy loss: 0.011127. Value loss: 0.037545. Entropy: 2.634483.
Iteration 15410: Policy loss: 0.010784. Value loss: 0.023007. Entropy: 2.638954.
Iteration 15411: Policy loss: 0.001831. Value loss: 0.019954. Entropy: 2.640024.
episode: 5539   score: 270.0  epsilon: 1.0    steps: 228  evaluation reward: 457.3
Traini

episode: 5560   score: 260.0  epsilon: 1.0    steps: 1013  evaluation reward: 465.7
Training network. lr: 0.000131. clip: 0.052547
Iteration 15469: Policy loss: 0.035717. Value loss: 0.032266. Entropy: 2.603930.
Iteration 15470: Policy loss: 0.029314. Value loss: 0.022116. Entropy: 2.611440.
Iteration 15471: Policy loss: 0.030550. Value loss: 0.018579. Entropy: 2.608190.
Training network. lr: 0.000131. clip: 0.052547
Iteration 15472: Policy loss: -0.003027. Value loss: 0.026450. Entropy: 2.575197.
Iteration 15473: Policy loss: -0.011766. Value loss: 0.018999. Entropy: 2.572613.
Iteration 15474: Policy loss: -0.008226. Value loss: 0.015167. Entropy: 2.576770.
Training network. lr: 0.000131. clip: 0.052547
Iteration 15475: Policy loss: -0.022701. Value loss: 0.039661. Entropy: 2.531845.
Iteration 15476: Policy loss: -0.025870. Value loss: 0.022891. Entropy: 2.531478.
Iteration 15477: Policy loss: -0.028951. Value loss: 0.016881. Entropy: 2.533417.
episode: 5561   score: 310.0  epsilon: 1

episode: 5582   score: 450.0  epsilon: 1.0    steps: 902  evaluation reward: 470.4
Training network. lr: 0.000131. clip: 0.052390
Iteration 15535: Policy loss: 0.003188. Value loss: 0.021407. Entropy: 2.589241.
Iteration 15536: Policy loss: 0.007719. Value loss: 0.014033. Entropy: 2.611179.
Iteration 15537: Policy loss: 0.003630. Value loss: 0.012540. Entropy: 2.607540.
episode: 5583   score: 310.0  epsilon: 1.0    steps: 587  evaluation reward: 470.1
Training network. lr: 0.000131. clip: 0.052390
Iteration 15538: Policy loss: 0.050581. Value loss: 0.016114. Entropy: 2.704586.
Iteration 15539: Policy loss: 0.045660. Value loss: 0.010611. Entropy: 2.706385.
Iteration 15540: Policy loss: 0.044669. Value loss: 0.008575. Entropy: 2.701138.
episode: 5584   score: 410.0  epsilon: 1.0    steps: 384  evaluation reward: 471.0
Training network. lr: 0.000131. clip: 0.052390
Iteration 15541: Policy loss: -0.005526. Value loss: 0.012111. Entropy: 2.723553.
Iteration 15542: Policy loss: -0.007411. V

episode: 5604   score: 350.0  epsilon: 1.0    steps: 127  evaluation reward: 455.2
episode: 5605   score: 390.0  epsilon: 1.0    steps: 816  evaluation reward: 456.9
Training network. lr: 0.000130. clip: 0.052086
Iteration 15601: Policy loss: 0.042827. Value loss: 0.025881. Entropy: 2.645465.
Iteration 15602: Policy loss: 0.046044. Value loss: 0.018205. Entropy: 2.646889.
Iteration 15603: Policy loss: 0.037210. Value loss: 0.015642. Entropy: 2.647058.
episode: 5606   score: 250.0  epsilon: 1.0    steps: 172  evaluation reward: 453.9
episode: 5607   score: 450.0  epsilon: 1.0    steps: 545  evaluation reward: 454.3
Training network. lr: 0.000130. clip: 0.052086
Iteration 15604: Policy loss: 0.012379. Value loss: 0.018616. Entropy: 2.607237.
Iteration 15605: Policy loss: 0.010821. Value loss: 0.014102. Entropy: 2.607919.
Iteration 15606: Policy loss: 0.007695. Value loss: 0.013476. Entropy: 2.614065.
episode: 5608   score: 330.0  epsilon: 1.0    steps: 694  evaluation reward: 453.6
Train

Training network. lr: 0.000130. clip: 0.051929
Iteration 15664: Policy loss: 0.042037. Value loss: 0.064725. Entropy: 2.556727.
Iteration 15665: Policy loss: 0.050837. Value loss: 0.039810. Entropy: 2.562625.
Iteration 15666: Policy loss: 0.041368. Value loss: 0.027348. Entropy: 2.575138.
episode: 5629   score: 460.0  epsilon: 1.0    steps: 170  evaluation reward: 453.7
Training network. lr: 0.000130. clip: 0.051929
Iteration 15667: Policy loss: -0.000199. Value loss: 0.135426. Entropy: 2.607820.
Iteration 15668: Policy loss: -0.005324. Value loss: 0.078889. Entropy: 2.588434.
Iteration 15669: Policy loss: -0.008914. Value loss: 0.045672. Entropy: 2.595115.
Training network. lr: 0.000130. clip: 0.051929
Iteration 15670: Policy loss: -0.088247. Value loss: 0.974021. Entropy: 2.571907.
Iteration 15671: Policy loss: -0.024877. Value loss: 0.600268. Entropy: 2.561869.
Iteration 15672: Policy loss: -0.076954. Value loss: 0.568567. Entropy: 2.545031.
Training network. lr: 0.000130. clip: 0.0

Iteration 15729: Policy loss: 0.002428. Value loss: 0.022695. Entropy: 2.559465.
episode: 5652   score: 910.0  epsilon: 1.0    steps: 957  evaluation reward: 439.3
Training network. lr: 0.000129. clip: 0.051773
Iteration 15730: Policy loss: 0.026358. Value loss: 0.023574. Entropy: 2.591187.
Iteration 15731: Policy loss: 0.029737. Value loss: 0.019051. Entropy: 2.591513.
Iteration 15732: Policy loss: 0.021602. Value loss: 0.017333. Entropy: 2.589740.
Training network. lr: 0.000129. clip: 0.051773
Iteration 15733: Policy loss: 0.011858. Value loss: 0.049050. Entropy: 2.660038.
Iteration 15734: Policy loss: 0.001880. Value loss: 0.028007. Entropy: 2.661869.
Iteration 15735: Policy loss: 0.004344. Value loss: 0.021229. Entropy: 2.661203.
episode: 5653   score: 400.0  epsilon: 1.0    steps: 103  evaluation reward: 438.9
Training network. lr: 0.000129. clip: 0.051773
Iteration 15736: Policy loss: -0.118522. Value loss: 0.409401. Entropy: 2.568772.
Iteration 15737: Policy loss: -0.124618. Val

episode: 5673   score: 860.0  epsilon: 1.0    steps: 587  evaluation reward: 458.4
episode: 5674   score: 290.0  epsilon: 1.0    steps: 778  evaluation reward: 457.7
Training network. lr: 0.000129. clip: 0.051625
Iteration 15796: Policy loss: 0.033838. Value loss: 0.034808. Entropy: 2.496778.
Iteration 15797: Policy loss: 0.031565. Value loss: 0.025540. Entropy: 2.505569.
Iteration 15798: Policy loss: 0.028948. Value loss: 0.021691. Entropy: 2.509909.
episode: 5675   score: 370.0  epsilon: 1.0    steps: 986  evaluation reward: 457.3
Training network. lr: 0.000129. clip: 0.051625
Iteration 15799: Policy loss: -0.021176. Value loss: 0.046263. Entropy: 2.529888.
Iteration 15800: Policy loss: -0.033989. Value loss: 0.027777. Entropy: 2.521082.
Iteration 15801: Policy loss: -0.021490. Value loss: 0.021579. Entropy: 2.517633.
episode: 5676   score: 950.0  epsilon: 1.0    steps: 285  evaluation reward: 463.6
episode: 5677   score: 260.0  epsilon: 1.0    steps: 671  evaluation reward: 463.3
Tr

episode: 5695   score: 950.0  epsilon: 1.0    steps: 216  evaluation reward: 475.8
episode: 5696   score: 350.0  epsilon: 1.0    steps: 449  evaluation reward: 475.4
Training network. lr: 0.000128. clip: 0.051312
Iteration 15862: Policy loss: -0.133922. Value loss: 0.417124. Entropy: 2.609012.
Iteration 15863: Policy loss: -0.123822. Value loss: 0.243640. Entropy: 2.604223.
Iteration 15864: Policy loss: -0.107260. Value loss: 0.117095. Entropy: 2.580729.
episode: 5697   score: 270.0  epsilon: 1.0    steps: 59  evaluation reward: 474.8
Training network. lr: 0.000128. clip: 0.051312
Iteration 15865: Policy loss: -0.068524. Value loss: 0.375996. Entropy: 2.473128.
Iteration 15866: Policy loss: -0.054795. Value loss: 0.209510. Entropy: 2.451298.
Iteration 15867: Policy loss: -0.027095. Value loss: 0.139831. Entropy: 2.435903.
episode: 5698   score: 480.0  epsilon: 1.0    steps: 301  evaluation reward: 475.4
Training network. lr: 0.000128. clip: 0.051312
Iteration 15868: Policy loss: 0.0604

Training network. lr: 0.000128. clip: 0.051164
Iteration 15925: Policy loss: 0.128064. Value loss: 0.114762. Entropy: 2.588755.
Iteration 15926: Policy loss: 0.121937. Value loss: 0.075099. Entropy: 2.595033.
Iteration 15927: Policy loss: 0.109417. Value loss: 0.060040. Entropy: 2.602306.
Training network. lr: 0.000128. clip: 0.051164
Iteration 15928: Policy loss: 0.187280. Value loss: 0.087395. Entropy: 2.592049.
Iteration 15929: Policy loss: 0.186381. Value loss: 0.052831. Entropy: 2.599862.
Iteration 15930: Policy loss: 0.186628. Value loss: 0.040799. Entropy: 2.601458.
episode: 5720   score: 280.0  epsilon: 1.0    steps: 528  evaluation reward: 510.4
Training network. lr: 0.000128. clip: 0.051164
Iteration 15931: Policy loss: 0.164208. Value loss: 0.600094. Entropy: 2.599785.
Iteration 15932: Policy loss: 0.173190. Value loss: 0.085441. Entropy: 2.606436.
Iteration 15933: Policy loss: 0.154875. Value loss: 0.063618. Entropy: 2.594846.
episode: 5721   score: 330.0  epsilon: 1.0    s

episode: 5739   score: 250.0  epsilon: 1.0    steps: 188  evaluation reward: 480.6
episode: 5740   score: 240.0  epsilon: 1.0    steps: 735  evaluation reward: 479.4
episode: 5741   score: 410.0  epsilon: 1.0    steps: 841  evaluation reward: 480.2
Training network. lr: 0.000128. clip: 0.051008
Iteration 15994: Policy loss: 0.038027. Value loss: 0.076529. Entropy: 2.276052.
Iteration 15995: Policy loss: 0.036023. Value loss: 0.043275. Entropy: 2.260879.
Iteration 15996: Policy loss: 0.025714. Value loss: 0.030656. Entropy: 2.296184.
episode: 5742   score: 1880.0  epsilon: 1.0    steps: 83  evaluation reward: 494.8
episode: 5743   score: 100.0  epsilon: 1.0    steps: 512  evaluation reward: 492.5
episode: 5744   score: 300.0  epsilon: 1.0    steps: 568  evaluation reward: 492.4
Training network. lr: 0.000128. clip: 0.051008
Iteration 15997: Policy loss: -0.068695. Value loss: 0.042108. Entropy: 2.409120.
Iteration 15998: Policy loss: -0.070621. Value loss: 0.031496. Entropy: 2.371965.
I

Training network. lr: 0.000127. clip: 0.050704
Iteration 16057: Policy loss: 0.036704. Value loss: 0.069986. Entropy: 2.213151.
Iteration 16058: Policy loss: 0.043676. Value loss: 0.039806. Entropy: 2.247421.
Iteration 16059: Policy loss: 0.039106. Value loss: 0.038407. Entropy: 2.229055.
episode: 5764   score: 260.0  epsilon: 1.0    steps: 40  evaluation reward: 470.8
Training network. lr: 0.000127. clip: 0.050704
Iteration 16060: Policy loss: 0.060349. Value loss: 0.103242. Entropy: 2.262153.
Iteration 16061: Policy loss: 0.066519. Value loss: 0.058724. Entropy: 2.249748.
Iteration 16062: Policy loss: 0.065829. Value loss: 0.049918. Entropy: 2.271343.
episode: 5765   score: 280.0  epsilon: 1.0    steps: 297  evaluation reward: 469.9
episode: 5766   score: 520.0  epsilon: 1.0    steps: 601  evaluation reward: 471.7
Training network. lr: 0.000127. clip: 0.050704
Iteration 16063: Policy loss: 0.047348. Value loss: 0.083851. Entropy: 2.217562.
Iteration 16064: Policy loss: 0.058706. Valu

Training network. lr: 0.000126. clip: 0.050547
Iteration 16123: Policy loss: 0.053708. Value loss: 0.021834. Entropy: 2.607940.
Iteration 16124: Policy loss: 0.049909. Value loss: 0.016705. Entropy: 2.611394.
Iteration 16125: Policy loss: 0.047696. Value loss: 0.013626. Entropy: 2.611185.
episode: 5786   score: 350.0  epsilon: 1.0    steps: 692  evaluation reward: 471.2
Training network. lr: 0.000126. clip: 0.050547
Iteration 16126: Policy loss: -0.006816. Value loss: 0.025118. Entropy: 2.630680.
Iteration 16127: Policy loss: -0.013926. Value loss: 0.018527. Entropy: 2.622646.
Iteration 16128: Policy loss: -0.007694. Value loss: 0.016723. Entropy: 2.623779.
episode: 5787   score: 330.0  epsilon: 1.0    steps: 636  evaluation reward: 471.5
Training network. lr: 0.000126. clip: 0.050547
Iteration 16129: Policy loss: -0.207083. Value loss: 0.490861. Entropy: 2.565802.
Iteration 16130: Policy loss: -0.169257. Value loss: 0.346674. Entropy: 2.549547.
Iteration 16131: Policy loss: -0.196497.

Training network. lr: 0.000126. clip: 0.050390
Iteration 16189: Policy loss: 0.093837. Value loss: 0.036093. Entropy: 2.651925.
Iteration 16190: Policy loss: 0.092175. Value loss: 0.023586. Entropy: 2.656353.
Iteration 16191: Policy loss: 0.106296. Value loss: 0.017335. Entropy: 2.662823.
episode: 5808   score: 270.0  epsilon: 1.0    steps: 526  evaluation reward: 463.7
Training network. lr: 0.000126. clip: 0.050390
Iteration 16192: Policy loss: 0.095375. Value loss: 0.027457. Entropy: 2.629280.
Iteration 16193: Policy loss: 0.085754. Value loss: 0.019268. Entropy: 2.628013.
Iteration 16194: Policy loss: 0.088704. Value loss: 0.015708. Entropy: 2.635767.
episode: 5809   score: 410.0  epsilon: 1.0    steps: 97  evaluation reward: 463.2
episode: 5810   score: 420.0  epsilon: 1.0    steps: 821  evaluation reward: 464.1
episode: 5811   score: 510.0  epsilon: 1.0    steps: 901  evaluation reward: 466.5
Training network. lr: 0.000126. clip: 0.050390
Iteration 16195: Policy loss: 0.094226. Va

Training network. lr: 0.000125. clip: 0.050086
Iteration 16255: Policy loss: -0.110468. Value loss: 0.729823. Entropy: 2.175732.
Iteration 16256: Policy loss: -0.059952. Value loss: 0.295910. Entropy: 2.097333.
Iteration 16257: Policy loss: -0.069240. Value loss: 0.287326. Entropy: 2.111887.
episode: 5830   score: 1810.0  epsilon: 1.0    steps: 780  evaluation reward: 458.3
Training network. lr: 0.000125. clip: 0.050086
Iteration 16258: Policy loss: 0.065471. Value loss: 0.063520. Entropy: 2.385582.
Iteration 16259: Policy loss: 0.066266. Value loss: 0.035937. Entropy: 2.397547.
Iteration 16260: Policy loss: 0.060743. Value loss: 0.034212. Entropy: 2.399860.
episode: 5831   score: 980.0  epsilon: 1.0    steps: 686  evaluation reward: 464.4
Training network. lr: 0.000125. clip: 0.050086
Iteration 16261: Policy loss: 0.144023. Value loss: 0.118268. Entropy: 2.598177.
Iteration 16262: Policy loss: 0.140792. Value loss: 0.068517. Entropy: 2.597972.
Iteration 16263: Policy loss: 0.146297. V

Iteration 16319: Policy loss: 0.107968. Value loss: 0.032907. Entropy: 2.446727.
Iteration 16320: Policy loss: 0.103378. Value loss: 0.025623. Entropy: 2.429544.
episode: 5854   score: 400.0  epsilon: 1.0    steps: 450  evaluation reward: 458.7
episode: 5855   score: 410.0  epsilon: 1.0    steps: 810  evaluation reward: 460.4
Training network. lr: 0.000125. clip: 0.049929
Iteration 16321: Policy loss: 0.022346. Value loss: 0.037110. Entropy: 2.629254.
Iteration 16322: Policy loss: 0.015802. Value loss: 0.026710. Entropy: 2.629570.
Iteration 16323: Policy loss: 0.025649. Value loss: 0.019307. Entropy: 2.631992.
Training network. lr: 0.000125. clip: 0.049929
Iteration 16324: Policy loss: 0.005875. Value loss: 0.039934. Entropy: 2.615077.
Iteration 16325: Policy loss: 0.001054. Value loss: 0.022820. Entropy: 2.614892.
Iteration 16326: Policy loss: 0.006070. Value loss: 0.018473. Entropy: 2.613095.
Training network. lr: 0.000125. clip: 0.049929
Iteration 16327: Policy loss: 0.010729. Value

Iteration 16385: Policy loss: -0.003710. Value loss: 0.012475. Entropy: 2.735018.
Iteration 16386: Policy loss: 0.003326. Value loss: 0.010289. Entropy: 2.735857.
Training network. lr: 0.000124. clip: 0.049782
Iteration 16387: Policy loss: -0.529957. Value loss: 2.956445. Entropy: 2.451516.
Iteration 16388: Policy loss: -0.531568. Value loss: 2.225396. Entropy: 2.487846.
Iteration 16389: Policy loss: -0.457334. Value loss: 1.323700. Entropy: 2.475776.
episode: 5876   score: 360.0  epsilon: 1.0    steps: 283  evaluation reward: 442.9
episode: 5877   score: 1850.0  epsilon: 1.0    steps: 510  evaluation reward: 458.4
Training network. lr: 0.000124. clip: 0.049782
Iteration 16390: Policy loss: 0.194148. Value loss: 0.074371. Entropy: 2.355380.
Iteration 16391: Policy loss: 0.193740. Value loss: 0.042029. Entropy: 2.344628.
Iteration 16392: Policy loss: 0.186596. Value loss: 0.037012. Entropy: 2.361672.
episode: 5878   score: 430.0  epsilon: 1.0    steps: 1023  evaluation reward: 459.3
Tra

Iteration 16450: Policy loss: 0.040521. Value loss: 0.033188. Entropy: 2.533074.
Iteration 16451: Policy loss: 0.041713. Value loss: 0.023790. Entropy: 2.519063.
Iteration 16452: Policy loss: 0.042040. Value loss: 0.017499. Entropy: 2.532162.
Training network. lr: 0.000124. clip: 0.049469
Iteration 16453: Policy loss: 0.073345. Value loss: 0.040987. Entropy: 2.671962.
Iteration 16454: Policy loss: 0.074096. Value loss: 0.029635. Entropy: 2.676654.
Iteration 16455: Policy loss: 0.072404. Value loss: 0.021778. Entropy: 2.673784.
Training network. lr: 0.000124. clip: 0.049469
Iteration 16456: Policy loss: 0.144114. Value loss: 0.035347. Entropy: 2.666553.
Iteration 16457: Policy loss: 0.146752. Value loss: 0.022304. Entropy: 2.671810.
Iteration 16458: Policy loss: 0.139241. Value loss: 0.019726. Entropy: 2.667549.
episode: 5899   score: 270.0  epsilon: 1.0    steps: 744  evaluation reward: 465.9
Training network. lr: 0.000124. clip: 0.049469
Iteration 16459: Policy loss: 0.088962. Value l

episode: 5921   score: 350.0  epsilon: 1.0    steps: 105  evaluation reward: 456.8
episode: 5922   score: 330.0  epsilon: 1.0    steps: 439  evaluation reward: 456.3
Training network. lr: 0.000123. clip: 0.049321
Iteration 16516: Policy loss: 0.043065. Value loss: 0.025896. Entropy: 2.623869.
Iteration 16517: Policy loss: 0.044176. Value loss: 0.016805. Entropy: 2.629822.
Iteration 16518: Policy loss: 0.042597. Value loss: 0.013213. Entropy: 2.636353.
Training network. lr: 0.000123. clip: 0.049321
Iteration 16519: Policy loss: 0.016265. Value loss: 0.021471. Entropy: 2.651390.
Iteration 16520: Policy loss: 0.013644. Value loss: 0.015700. Entropy: 2.650533.
Iteration 16521: Policy loss: 0.013739. Value loss: 0.013484. Entropy: 2.658682.
Training network. lr: 0.000123. clip: 0.049321
Iteration 16522: Policy loss: -0.306541. Value loss: 1.194097. Entropy: 2.529969.
Iteration 16523: Policy loss: -0.337758. Value loss: 0.925298. Entropy: 2.529127.
Iteration 16524: Policy loss: -0.288448. Va

Iteration 16581: Policy loss: 0.085358. Value loss: 0.036292. Entropy: 2.644010.
episode: 5944   score: 400.0  epsilon: 1.0    steps: 963  evaluation reward: 469.8
Training network. lr: 0.000123. clip: 0.049165
Iteration 16582: Policy loss: 0.095322. Value loss: 0.047035. Entropy: 2.632492.
Iteration 16583: Policy loss: 0.086353. Value loss: 0.028995. Entropy: 2.644317.
Iteration 16584: Policy loss: 0.090520. Value loss: 0.024885. Entropy: 2.637286.
episode: 5945   score: 370.0  epsilon: 1.0    steps: 448  evaluation reward: 469.4
Training network. lr: 0.000123. clip: 0.049165
Iteration 16585: Policy loss: 0.096531. Value loss: 0.058596. Entropy: 2.607644.
Iteration 16586: Policy loss: 0.101059. Value loss: 0.033781. Entropy: 2.608113.
Iteration 16587: Policy loss: 0.090662. Value loss: 0.026086. Entropy: 2.607170.
Training network. lr: 0.000123. clip: 0.049165
Iteration 16588: Policy loss: 0.053184. Value loss: 0.037809. Entropy: 2.643126.
Iteration 16589: Policy loss: 0.051431. Value

Iteration 16646: Policy loss: 0.069708. Value loss: 0.017983. Entropy: 2.661184.
Iteration 16647: Policy loss: 0.065971. Value loss: 0.013139. Entropy: 2.664721.
episode: 5967   score: 400.0  epsilon: 1.0    steps: 313  evaluation reward: 475.4
Training network. lr: 0.000123. clip: 0.049008
Iteration 16648: Policy loss: 0.014861. Value loss: 0.522021. Entropy: 2.436617.
Iteration 16649: Policy loss: 0.034432. Value loss: 0.326251. Entropy: 2.420823.
Iteration 16650: Policy loss: 0.022944. Value loss: 0.222826. Entropy: 2.448489.
Training network. lr: 0.000122. clip: 0.048860
Iteration 16651: Policy loss: -0.016835. Value loss: 0.056097. Entropy: 2.431474.
Iteration 16652: Policy loss: -0.014172. Value loss: 0.037111. Entropy: 2.439708.
Iteration 16653: Policy loss: -0.029405. Value loss: 0.031498. Entropy: 2.451174.
episode: 5968   score: 330.0  epsilon: 1.0    steps: 100  evaluation reward: 475.3
episode: 5969   score: 290.0  epsilon: 1.0    steps: 692  evaluation reward: 474.2
episod

Iteration 16711: Policy loss: 0.036387. Value loss: 0.059653. Entropy: 2.374085.
Iteration 16712: Policy loss: 0.041858. Value loss: 0.035003. Entropy: 2.381472.
Iteration 16713: Policy loss: 0.030830. Value loss: 0.024139. Entropy: 2.363490.
Training network. lr: 0.000122. clip: 0.048704
Iteration 16714: Policy loss: 0.063626. Value loss: 0.040530. Entropy: 2.406285.
Iteration 16715: Policy loss: 0.063423. Value loss: 0.024874. Entropy: 2.401937.
Iteration 16716: Policy loss: 0.070659. Value loss: 0.021197. Entropy: 2.391552.
episode: 5990   score: 370.0  epsilon: 1.0    steps: 775  evaluation reward: 467.3
Training network. lr: 0.000122. clip: 0.048704
Iteration 16717: Policy loss: 0.157775. Value loss: 0.048929. Entropy: 2.399108.
Iteration 16718: Policy loss: 0.148981. Value loss: 0.026054. Entropy: 2.382532.
Iteration 16719: Policy loss: 0.142806. Value loss: 0.022473. Entropy: 2.424940.
Training network. lr: 0.000122. clip: 0.048704
Iteration 16720: Policy loss: -0.050677. Value 

Iteration 16778: Policy loss: -0.016526. Value loss: 0.011838. Entropy: 2.746520.
Iteration 16779: Policy loss: -0.013884. Value loss: 0.010017. Entropy: 2.746603.
episode: 6011   score: 360.0  epsilon: 1.0    steps: 338  evaluation reward: 442.5
episode: 6012   score: 310.0  epsilon: 1.0    steps: 450  evaluation reward: 440.4
episode: 6013   score: 380.0  epsilon: 1.0    steps: 876  evaluation reward: 440.2
episode: 6014   score: 430.0  epsilon: 1.0    steps: 925  evaluation reward: 439.8
Training network. lr: 0.000121. clip: 0.048547
Iteration 16780: Policy loss: 0.041507. Value loss: 0.019786. Entropy: 2.733031.
Iteration 16781: Policy loss: 0.038703. Value loss: 0.021223. Entropy: 2.737138.
Iteration 16782: Policy loss: 0.042650. Value loss: 0.017334. Entropy: 2.739482.
Training network. lr: 0.000121. clip: 0.048547
Iteration 16783: Policy loss: -0.016288. Value loss: 0.016720. Entropy: 2.735531.
Iteration 16784: Policy loss: -0.008047. Value loss: 0.010932. Entropy: 2.733757.
Ite

Training network. lr: 0.000121. clip: 0.048400
Iteration 16843: Policy loss: 0.063449. Value loss: 0.024523. Entropy: 2.749210.
Iteration 16844: Policy loss: 0.066891. Value loss: 0.017098. Entropy: 2.751573.
Iteration 16845: Policy loss: 0.061839. Value loss: 0.015254. Entropy: 2.746852.
episode: 6035   score: 390.0  epsilon: 1.0    steps: 422  evaluation reward: 444.7
Training network. lr: 0.000121. clip: 0.048400
Iteration 16846: Policy loss: 0.019130. Value loss: 0.016805. Entropy: 2.748176.
Iteration 16847: Policy loss: 0.020835. Value loss: 0.014619. Entropy: 2.745132.
Iteration 16848: Policy loss: 0.016420. Value loss: 0.011164. Entropy: 2.744978.
episode: 6036   score: 350.0  epsilon: 1.0    steps: 330  evaluation reward: 439.9
Training network. lr: 0.000121. clip: 0.048400
Iteration 16849: Policy loss: 0.004682. Value loss: 0.029420. Entropy: 2.753998.
Iteration 16850: Policy loss: 0.003329. Value loss: 0.021246. Entropy: 2.756526.
Iteration 16851: Policy loss: 0.003922. Value

Training network. lr: 0.000120. clip: 0.048086
Iteration 16906: Policy loss: 0.003003. Value loss: 0.023280. Entropy: 2.746294.
Iteration 16907: Policy loss: -0.000327. Value loss: 0.014795. Entropy: 2.748720.
Iteration 16908: Policy loss: 0.005932. Value loss: 0.010724. Entropy: 2.745456.
Training network. lr: 0.000120. clip: 0.048086
Iteration 16909: Policy loss: 0.049757. Value loss: 0.016763. Entropy: 2.767396.
Iteration 16910: Policy loss: 0.052433. Value loss: 0.011524. Entropy: 2.761323.
Iteration 16911: Policy loss: 0.049183. Value loss: 0.008447. Entropy: 2.763336.
Training network. lr: 0.000120. clip: 0.048086
Iteration 16912: Policy loss: 0.024039. Value loss: 0.022514. Entropy: 2.770932.
Iteration 16913: Policy loss: 0.022952. Value loss: 0.014052. Entropy: 2.772234.
Iteration 16914: Policy loss: 0.015526. Value loss: 0.011498. Entropy: 2.769352.
episode: 6060   score: 390.0  epsilon: 1.0    steps: 300  evaluation reward: 416.1
Training network. lr: 0.000120. clip: 0.048086

episode: 6082   score: 320.0  epsilon: 1.0    steps: 440  evaluation reward: 404.4
Training network. lr: 0.000120. clip: 0.047939
Iteration 16972: Policy loss: -0.035638. Value loss: 0.039339. Entropy: 2.406566.
Iteration 16973: Policy loss: -0.036109. Value loss: 0.026014. Entropy: 2.429763.
Iteration 16974: Policy loss: -0.039833. Value loss: 0.017920. Entropy: 2.378409.
episode: 6083   score: 290.0  epsilon: 1.0    steps: 992  evaluation reward: 403.2
Training network. lr: 0.000120. clip: 0.047939
Iteration 16975: Policy loss: -0.473513. Value loss: 3.719634. Entropy: 2.137729.
Iteration 16976: Policy loss: -0.444149. Value loss: 2.171139. Entropy: 2.128342.
Iteration 16977: Policy loss: -0.434944. Value loss: 1.971993. Entropy: 2.135381.
episode: 6084   score: 1890.0  epsilon: 1.0    steps: 118  evaluation reward: 413.7
episode: 6085   score: 380.0  epsilon: 1.0    steps: 226  evaluation reward: 413.4
episode: 6086   score: 870.0  epsilon: 1.0    steps: 608  evaluation reward: 418.

Iteration 17035: Policy loss: -0.032058. Value loss: 0.033505. Entropy: 2.478523.
Iteration 17036: Policy loss: -0.022260. Value loss: 0.024843. Entropy: 2.504175.
Iteration 17037: Policy loss: -0.024093. Value loss: 0.019403. Entropy: 2.507560.
episode: 6106   score: 370.0  epsilon: 1.0    steps: 895  evaluation reward: 416.8
Training network. lr: 0.000119. clip: 0.047782
Iteration 17038: Policy loss: -0.231452. Value loss: 1.106357. Entropy: 2.432315.
Iteration 17039: Policy loss: -0.214466. Value loss: 0.678908. Entropy: 2.421473.
Iteration 17040: Policy loss: -0.235203. Value loss: 0.448560. Entropy: 2.389897.
Training network. lr: 0.000119. clip: 0.047782
Iteration 17041: Policy loss: -0.380970. Value loss: 3.131636. Entropy: 1.787485.
Iteration 17042: Policy loss: -0.384507. Value loss: 2.483159. Entropy: 1.758620.
Iteration 17043: Policy loss: -0.375718. Value loss: 2.044552. Entropy: 1.740493.
Training network. lr: 0.000119. clip: 0.047782
Iteration 17044: Policy loss: -0.06709

Iteration 17102: Policy loss: 0.006279. Value loss: 0.016122. Entropy: 2.729896.
Iteration 17103: Policy loss: 0.006320. Value loss: 0.013756. Entropy: 2.732150.
episode: 6127   score: 410.0  epsilon: 1.0    steps: 359  evaluation reward: 445.7
episode: 6128   score: 330.0  epsilon: 1.0    steps: 961  evaluation reward: 444.9
Training network. lr: 0.000119. clip: 0.047478
Iteration 17104: Policy loss: 0.072076. Value loss: 0.040141. Entropy: 2.716694.
Iteration 17105: Policy loss: 0.075561. Value loss: 0.023327. Entropy: 2.714912.
Iteration 17106: Policy loss: 0.073758. Value loss: 0.021483. Entropy: 2.712983.
Training network. lr: 0.000119. clip: 0.047478
Iteration 17107: Policy loss: -0.022796. Value loss: 0.022214. Entropy: 2.718827.
Iteration 17108: Policy loss: -0.020416. Value loss: 0.013770. Entropy: 2.712739.
Iteration 17109: Policy loss: -0.026011. Value loss: 0.011754. Entropy: 2.714420.
Training network. lr: 0.000119. clip: 0.047478
Iteration 17110: Policy loss: -0.090558. V

Iteration 17167: Policy loss: 0.039151. Value loss: 0.015079. Entropy: 2.743121.
Iteration 17168: Policy loss: 0.044673. Value loss: 0.008326. Entropy: 2.745960.
Iteration 17169: Policy loss: 0.041642. Value loss: 0.010542. Entropy: 2.741349.
Training network. lr: 0.000118. clip: 0.047321
Iteration 17170: Policy loss: -0.023035. Value loss: 0.014213. Entropy: 2.714314.
Iteration 17171: Policy loss: -0.028617. Value loss: 0.010036. Entropy: 2.707734.
Iteration 17172: Policy loss: -0.027434. Value loss: 0.008735. Entropy: 2.711429.
now time :  2019-02-27 09:09:28.940081
episode: 6151   score: 410.0  epsilon: 1.0    steps: 74  evaluation reward: 455.9
episode: 6152   score: 340.0  epsilon: 1.0    steps: 731  evaluation reward: 455.2
Training network. lr: 0.000118. clip: 0.047321
Iteration 17173: Policy loss: 0.059437. Value loss: 0.021339. Entropy: 2.738421.
Iteration 17174: Policy loss: 0.066303. Value loss: 0.012839. Entropy: 2.725353.
Iteration 17175: Policy loss: 0.060135. Value loss:

Iteration 17232: Policy loss: -0.103508. Value loss: 0.072595. Entropy: 2.033108.
episode: 6174   score: 910.0  epsilon: 1.0    steps: 487  evaluation reward: 450.0
episode: 6175   score: 320.0  epsilon: 1.0    steps: 976  evaluation reward: 450.0
Training network. lr: 0.000118. clip: 0.047165
Iteration 17233: Policy loss: -0.152263. Value loss: 0.569771. Entropy: 1.925513.
Iteration 17234: Policy loss: -0.138738. Value loss: 0.374751. Entropy: 1.939516.
Iteration 17235: Policy loss: -0.134794. Value loss: 0.305466. Entropy: 1.934985.
episode: 6176   score: 870.0  epsilon: 1.0    steps: 113  evaluation reward: 455.1
episode: 6177   score: 1840.0  epsilon: 1.0    steps: 697  evaluation reward: 469.5
Training network. lr: 0.000118. clip: 0.047165
Iteration 17236: Policy loss: -0.009056. Value loss: 0.094994. Entropy: 2.239449.
Iteration 17237: Policy loss: -0.004688. Value loss: 0.052675. Entropy: 2.236355.
Iteration 17238: Policy loss: 0.001926. Value loss: 0.041184. Entropy: 2.199659.


Iteration 17296: Policy loss: 0.037862. Value loss: 0.011458. Entropy: 2.726647.
Iteration 17297: Policy loss: 0.036861. Value loss: 0.008529. Entropy: 2.721322.
Iteration 17298: Policy loss: 0.037204. Value loss: 0.007306. Entropy: 2.720368.
episode: 6198   score: 240.0  epsilon: 1.0    steps: 297  evaluation reward: 452.6
episode: 6199   score: 280.0  epsilon: 1.0    steps: 955  evaluation reward: 451.0
Training network. lr: 0.000118. clip: 0.047017
Iteration 17299: Policy loss: 0.032019. Value loss: 0.015020. Entropy: 2.681376.
Iteration 17300: Policy loss: 0.032388. Value loss: 0.014291. Entropy: 2.680216.
Iteration 17301: Policy loss: 0.033079. Value loss: 0.007621. Entropy: 2.684333.
episode: 6200   score: 260.0  epsilon: 1.0    steps: 636  evaluation reward: 449.9
now time :  2019-02-27 09:12:02.145377
episode: 6201   score: 270.0  epsilon: 1.0    steps: 725  evaluation reward: 449.8
episode: 6202   score: 350.0  epsilon: 1.0    steps: 845  evaluation reward: 449.2
Training netw

Training network. lr: 0.000117. clip: 0.046704
Iteration 17359: Policy loss: 0.010945. Value loss: 0.018197. Entropy: 2.747386.
Iteration 17360: Policy loss: 0.013202. Value loss: 0.013775. Entropy: 2.755255.
Iteration 17361: Policy loss: 0.011885. Value loss: 0.013410. Entropy: 2.753280.
episode: 6224   score: 250.0  epsilon: 1.0    steps: 576  evaluation reward: 396.2
episode: 6225   score: 420.0  epsilon: 1.0    steps: 726  evaluation reward: 396.1
Training network. lr: 0.000117. clip: 0.046704
Iteration 17362: Policy loss: 0.033354. Value loss: 0.019037. Entropy: 2.757992.
Iteration 17363: Policy loss: 0.033698. Value loss: 0.018954. Entropy: 2.758822.
Iteration 17364: Policy loss: 0.041554. Value loss: 0.009612. Entropy: 2.761073.
Training network. lr: 0.000117. clip: 0.046704
Iteration 17365: Policy loss: -0.001468. Value loss: 0.014180. Entropy: 2.743562.
Iteration 17366: Policy loss: 0.004264. Value loss: 0.010293. Entropy: 2.731521.
Iteration 17367: Policy loss: -0.002448. Val

Iteration 17424: Policy loss: 0.105506. Value loss: 0.015294. Entropy: 2.715470.
episode: 6247   score: 280.0  epsilon: 1.0    steps: 313  evaluation reward: 393.4
Training network. lr: 0.000116. clip: 0.046556
Iteration 17425: Policy loss: 0.041215. Value loss: 0.030188. Entropy: 2.728247.
Iteration 17426: Policy loss: 0.040223. Value loss: 0.018685. Entropy: 2.722687.
Iteration 17427: Policy loss: 0.036099. Value loss: 0.015440. Entropy: 2.730276.
episode: 6248   score: 350.0  epsilon: 1.0    steps: 60  evaluation reward: 394.3
episode: 6249   score: 510.0  epsilon: 1.0    steps: 781  evaluation reward: 395.4
episode: 6250   score: 370.0  epsilon: 1.0    steps: 926  evaluation reward: 395.6
Training network. lr: 0.000116. clip: 0.046556
Iteration 17428: Policy loss: -0.004429. Value loss: 0.024654. Entropy: 2.663727.
Iteration 17429: Policy loss: -0.003286. Value loss: 0.018034. Entropy: 2.668739.
Iteration 17430: Policy loss: -0.006960. Value loss: 0.018979. Entropy: 2.673018.
Train

Training network. lr: 0.000116. clip: 0.046400
Iteration 17488: Policy loss: 0.038487. Value loss: 0.024424. Entropy: 2.452789.
Iteration 17489: Policy loss: 0.038277. Value loss: 0.015645. Entropy: 2.406919.
Iteration 17490: Policy loss: 0.039793. Value loss: 0.012561. Entropy: 2.427709.
episode: 6271   score: 980.0  epsilon: 1.0    steps: 692  evaluation reward: 416.0
Training network. lr: 0.000116. clip: 0.046400
Iteration 17491: Policy loss: 0.006197. Value loss: 0.032011. Entropy: 2.629198.
Iteration 17492: Policy loss: 0.001339. Value loss: 0.024636. Entropy: 2.619520.
Iteration 17493: Policy loss: 0.002291. Value loss: 0.020235. Entropy: 2.627463.
episode: 6272   score: 410.0  epsilon: 1.0    steps: 899  evaluation reward: 410.6
Training network. lr: 0.000116. clip: 0.046400
Iteration 17494: Policy loss: 0.024210. Value loss: 0.029657. Entropy: 2.746738.
Iteration 17495: Policy loss: 0.011645. Value loss: 0.023466. Entropy: 2.745677.
Iteration 17496: Policy loss: 0.016634. Value

episode: 6293   score: 360.0  epsilon: 1.0    steps: 309  evaluation reward: 392.7
episode: 6294   score: 390.0  epsilon: 1.0    steps: 408  evaluation reward: 392.7
Training network. lr: 0.000115. clip: 0.046096
Iteration 17554: Policy loss: 0.034615. Value loss: 0.024435. Entropy: 2.727660.
Iteration 17555: Policy loss: 0.034410. Value loss: 0.018809. Entropy: 2.729842.
Iteration 17556: Policy loss: 0.038292. Value loss: 0.011258. Entropy: 2.728419.
episode: 6295   score: 340.0  epsilon: 1.0    steps: 973  evaluation reward: 393.2
Training network. lr: 0.000115. clip: 0.046096
Iteration 17557: Policy loss: -0.001116. Value loss: 0.020327. Entropy: 2.772475.
Iteration 17558: Policy loss: 0.000403. Value loss: 0.014689. Entropy: 2.771687.
Iteration 17559: Policy loss: 0.004150. Value loss: 0.011169. Entropy: 2.769744.
episode: 6296   score: 460.0  epsilon: 1.0    steps: 798  evaluation reward: 394.5
Training network. lr: 0.000115. clip: 0.046096
Iteration 17560: Policy loss: -0.011896.

Training network. lr: 0.000115. clip: 0.045939
Iteration 17617: Policy loss: 0.044575. Value loss: 0.013408. Entropy: 2.738232.
Iteration 17618: Policy loss: 0.040081. Value loss: 0.008759. Entropy: 2.740511.
Iteration 17619: Policy loss: 0.038332. Value loss: 0.006447. Entropy: 2.737786.
Training network. lr: 0.000115. clip: 0.045939
Iteration 17620: Policy loss: 0.067782. Value loss: 0.015420. Entropy: 2.765733.
Iteration 17621: Policy loss: 0.074590. Value loss: 0.007781. Entropy: 2.767396.
Iteration 17622: Policy loss: 0.069909. Value loss: 0.008039. Entropy: 2.763206.
episode: 6318   score: 330.0  epsilon: 1.0    steps: 387  evaluation reward: 403.3
episode: 6319   score: 360.0  epsilon: 1.0    steps: 948  evaluation reward: 403.9
Training network. lr: 0.000115. clip: 0.045939
Iteration 17623: Policy loss: 0.041033. Value loss: 0.014551. Entropy: 2.740482.
Iteration 17624: Policy loss: 0.041313. Value loss: 0.009962. Entropy: 2.735387.
Iteration 17625: Policy loss: 0.039484. Value

Iteration 17682: Policy loss: 0.045913. Value loss: 0.032181. Entropy: 2.339538.
episode: 6341   score: 410.0  epsilon: 1.0    steps: 915  evaluation reward: 411.4
Training network. lr: 0.000114. clip: 0.045782
Iteration 17683: Policy loss: 0.049988. Value loss: 0.042203. Entropy: 2.332014.
Iteration 17684: Policy loss: 0.043568. Value loss: 0.026451. Entropy: 2.305030.
Iteration 17685: Policy loss: 0.051795. Value loss: 0.021835. Entropy: 2.318419.
episode: 6342   score: 410.0  epsilon: 1.0    steps: 106  evaluation reward: 412.1
episode: 6343   score: 880.0  epsilon: 1.0    steps: 221  evaluation reward: 416.9
episode: 6344   score: 350.0  epsilon: 1.0    steps: 457  evaluation reward: 416.0
episode: 6345   score: 480.0  epsilon: 1.0    steps: 653  evaluation reward: 411.1
Training network. lr: 0.000114. clip: 0.045782
Iteration 17686: Policy loss: 0.023429. Value loss: 0.053353. Entropy: 2.417411.
Iteration 17687: Policy loss: 0.025744. Value loss: 0.040265. Entropy: 2.430749.
Itera

episode: 6365   score: 300.0  epsilon: 1.0    steps: 213  evaluation reward: 412.4
episode: 6366   score: 320.0  epsilon: 1.0    steps: 335  evaluation reward: 411.2
Training network. lr: 0.000114. clip: 0.045635
Iteration 17746: Policy loss: 0.072926. Value loss: 0.020619. Entropy: 2.731786.
Iteration 17747: Policy loss: 0.067986. Value loss: 0.016305. Entropy: 2.731620.
Iteration 17748: Policy loss: 0.067833. Value loss: 0.013444. Entropy: 2.732114.
Training network. lr: 0.000114. clip: 0.045635
Iteration 17749: Policy loss: 0.028711. Value loss: 0.010778. Entropy: 2.710326.
Iteration 17750: Policy loss: 0.034977. Value loss: 0.009117. Entropy: 2.710881.
Iteration 17751: Policy loss: 0.028450. Value loss: 0.007450. Entropy: 2.709002.
episode: 6367   score: 330.0  epsilon: 1.0    steps: 950  evaluation reward: 410.2
Training network. lr: 0.000114. clip: 0.045478
Iteration 17752: Policy loss: 0.007027. Value loss: 0.025342. Entropy: 2.711978.
Iteration 17753: Policy loss: 0.001292. Val

Iteration 17810: Policy loss: 0.015090. Value loss: 0.017473. Entropy: 2.717703.
Iteration 17811: Policy loss: 0.008432. Value loss: 0.015460. Entropy: 2.712061.
Training network. lr: 0.000113. clip: 0.045321
Iteration 17812: Policy loss: 0.021205. Value loss: 0.025821. Entropy: 2.762352.
Iteration 17813: Policy loss: 0.023589. Value loss: 0.015820. Entropy: 2.759804.
Iteration 17814: Policy loss: 0.010452. Value loss: 0.014275. Entropy: 2.763913.
episode: 6389   score: 330.0  epsilon: 1.0    steps: 665  evaluation reward: 409.4
episode: 6390   score: 320.0  epsilon: 1.0    steps: 889  evaluation reward: 408.9
Training network. lr: 0.000113. clip: 0.045321
Iteration 17815: Policy loss: 0.005819. Value loss: 0.030527. Entropy: 2.699816.
Iteration 17816: Policy loss: 0.005987. Value loss: 0.020703. Entropy: 2.709187.
Iteration 17817: Policy loss: 0.007067. Value loss: 0.016235. Entropy: 2.702072.
Training network. lr: 0.000113. clip: 0.045321
Iteration 17818: Policy loss: -0.006293. Valu

Training network. lr: 0.000113. clip: 0.045174
Iteration 17875: Policy loss: 0.034307. Value loss: 0.018912. Entropy: 2.604689.
Iteration 17876: Policy loss: 0.038183. Value loss: 0.012542. Entropy: 2.611120.
Iteration 17877: Policy loss: 0.040889. Value loss: 0.011227. Entropy: 2.617699.
episode: 6412   score: 290.0  epsilon: 1.0    steps: 827  evaluation reward: 387.7
Training network. lr: 0.000113. clip: 0.045174
Iteration 17878: Policy loss: 0.057955. Value loss: 0.021433. Entropy: 2.589403.
Iteration 17879: Policy loss: 0.062535. Value loss: 0.013419. Entropy: 2.595110.
Iteration 17880: Policy loss: 0.059756. Value loss: 0.010750. Entropy: 2.597753.
episode: 6413   score: 240.0  epsilon: 1.0    steps: 172  evaluation reward: 386.3
episode: 6414   score: 340.0  epsilon: 1.0    steps: 375  evaluation reward: 386.0
Training network. lr: 0.000113. clip: 0.045174
Iteration 17881: Policy loss: 0.028412. Value loss: 0.022000. Entropy: 2.601257.
Iteration 17882: Policy loss: 0.023518. Val

episode: 6434   score: 300.0  epsilon: 1.0    steps: 459  evaluation reward: 391.7
Training network. lr: 0.000113. clip: 0.045017
Iteration 17941: Policy loss: -0.027698. Value loss: 0.053227. Entropy: 2.315584.
Iteration 17942: Policy loss: -0.021280. Value loss: 0.032919. Entropy: 2.323045.
Iteration 17943: Policy loss: -0.020215. Value loss: 0.033000. Entropy: 2.356549.
Training network. lr: 0.000113. clip: 0.045017
Iteration 17944: Policy loss: 0.007513. Value loss: 0.031511. Entropy: 2.333291.
Iteration 17945: Policy loss: 0.011216. Value loss: 0.019958. Entropy: 2.340176.
Iteration 17946: Policy loss: 0.012071. Value loss: 0.015279. Entropy: 2.363449.
episode: 6435   score: 890.0  epsilon: 1.0    steps: 690  evaluation reward: 396.8
Training network. lr: 0.000113. clip: 0.045017
Iteration 17947: Policy loss: -0.010894. Value loss: 0.037980. Entropy: 2.563126.
Iteration 17948: Policy loss: -0.011169. Value loss: 0.019862. Entropy: 2.566206.
Iteration 17949: Policy loss: -0.011767.

Iteration 18008: Policy loss: 0.040092. Value loss: 0.014307. Entropy: 2.755578.
Iteration 18009: Policy loss: 0.033453. Value loss: 0.012285. Entropy: 2.752728.
episode: 6454   score: 410.0  epsilon: 1.0    steps: 94  evaluation reward: 396.5
episode: 6455   score: 400.0  epsilon: 1.0    steps: 197  evaluation reward: 397.1
episode: 6456   score: 410.0  epsilon: 1.0    steps: 341  evaluation reward: 397.3
Training network. lr: 0.000112. clip: 0.044713
Iteration 18010: Policy loss: 0.106682. Value loss: 0.026795. Entropy: 2.715244.
Iteration 18011: Policy loss: 0.110665. Value loss: 0.016913. Entropy: 2.721204.
Iteration 18012: Policy loss: 0.104397. Value loss: 0.015896. Entropy: 2.722500.
Training network. lr: 0.000112. clip: 0.044713
Iteration 18013: Policy loss: 0.016726. Value loss: 0.017344. Entropy: 2.675832.
Iteration 18014: Policy loss: 0.015582. Value loss: 0.011343. Entropy: 2.680711.
Iteration 18015: Policy loss: 0.017176. Value loss: 0.008874. Entropy: 2.678458.
episode: 6

episode: 6478   score: 300.0  epsilon: 1.0    steps: 364  evaluation reward: 410.0
Training network. lr: 0.000111. clip: 0.044557
Iteration 18073: Policy loss: 0.143519. Value loss: 0.040910. Entropy: 2.703949.
Iteration 18074: Policy loss: 0.140257. Value loss: 0.029251. Entropy: 2.701769.
Iteration 18075: Policy loss: 0.147252. Value loss: 0.022147. Entropy: 2.697176.
Training network. lr: 0.000111. clip: 0.044557
Iteration 18076: Policy loss: 0.020858. Value loss: 0.027565. Entropy: 2.723642.
Iteration 18077: Policy loss: 0.022840. Value loss: 0.019571. Entropy: 2.718823.
Iteration 18078: Policy loss: 0.021163. Value loss: 0.016494. Entropy: 2.722088.
episode: 6479   score: 390.0  epsilon: 1.0    steps: 79  evaluation reward: 410.6
Training network. lr: 0.000111. clip: 0.044557
Iteration 18079: Policy loss: -0.297668. Value loss: 1.211208. Entropy: 2.363060.
Iteration 18080: Policy loss: -0.208002. Value loss: 0.698018. Entropy: 2.371224.
Iteration 18081: Policy loss: -0.227682. Val

episode: 6500   score: 310.0  epsilon: 1.0    steps: 117  evaluation reward: 413.7
Training network. lr: 0.000111. clip: 0.044400
Iteration 18139: Policy loss: 0.046588. Value loss: 0.051630. Entropy: 2.353744.
Iteration 18140: Policy loss: 0.028549. Value loss: 0.026934. Entropy: 2.373231.
Iteration 18141: Policy loss: 0.035474. Value loss: 0.017875. Entropy: 2.367407.
now time :  2019-02-27 09:28:30.610860
episode: 6501   score: 340.0  epsilon: 1.0    steps: 150  evaluation reward: 414.6
Training network. lr: 0.000111. clip: 0.044400
Iteration 18142: Policy loss: -0.063261. Value loss: 0.027217. Entropy: 2.338335.
Iteration 18143: Policy loss: -0.081384. Value loss: 0.020778. Entropy: 2.338035.
Iteration 18144: Policy loss: -0.070899. Value loss: 0.015018. Entropy: 2.364056.
episode: 6502   score: 860.0  epsilon: 1.0    steps: 529  evaluation reward: 420.3
Training network. lr: 0.000111. clip: 0.044400
Iteration 18145: Policy loss: -0.043825. Value loss: 0.029520. Entropy: 2.648037.


episode: 6522   score: 380.0  epsilon: 1.0    steps: 269  evaluation reward: 442.1
Training network. lr: 0.000110. clip: 0.044096
Iteration 18205: Policy loss: -0.024636. Value loss: 0.018059. Entropy: 2.735639.
Iteration 18206: Policy loss: -0.022471. Value loss: 0.011296. Entropy: 2.730605.
Iteration 18207: Policy loss: -0.020016. Value loss: 0.008555. Entropy: 2.729510.
episode: 6523   score: 400.0  epsilon: 1.0    steps: 13  evaluation reward: 441.9
Training network. lr: 0.000110. clip: 0.044096
Iteration 18208: Policy loss: 0.054797. Value loss: 0.026819. Entropy: 2.707030.
Iteration 18209: Policy loss: 0.043329. Value loss: 0.018026. Entropy: 2.707726.
Iteration 18210: Policy loss: 0.045207. Value loss: 0.013378. Entropy: 2.705396.
Training network. lr: 0.000110. clip: 0.044096
Iteration 18211: Policy loss: -0.129237. Value loss: 0.591187. Entropy: 2.583066.
Iteration 18212: Policy loss: -0.103650. Value loss: 0.286444. Entropy: 2.533433.
Iteration 18213: Policy loss: -0.106190. 

Iteration 18270: Policy loss: 0.021194. Value loss: 0.024407. Entropy: 2.313895.
episode: 6545   score: 450.0  epsilon: 1.0    steps: 869  evaluation reward: 450.0
Training network. lr: 0.000110. clip: 0.043939
Iteration 18271: Policy loss: 0.036706. Value loss: 0.036271. Entropy: 2.592044.
Iteration 18272: Policy loss: 0.036959. Value loss: 0.024922. Entropy: 2.579501.
Iteration 18273: Policy loss: 0.033651. Value loss: 0.021724. Entropy: 2.585568.
Training network. lr: 0.000110. clip: 0.043939
Iteration 18274: Policy loss: 0.133207. Value loss: 0.034184. Entropy: 2.625222.
Iteration 18275: Policy loss: 0.121215. Value loss: 0.018305. Entropy: 2.623300.
Iteration 18276: Policy loss: 0.130670. Value loss: 0.017113. Entropy: 2.615543.
episode: 6546   score: 410.0  epsilon: 1.0    steps: 302  evaluation reward: 444.9
episode: 6547   score: 360.0  epsilon: 1.0    steps: 1018  evaluation reward: 439.8
Training network. lr: 0.000110. clip: 0.043939
Iteration 18277: Policy loss: 0.066111. Va

Iteration 18336: Policy loss: -0.042944. Value loss: 0.016335. Entropy: 2.209428.
Training network. lr: 0.000109. clip: 0.043792
Iteration 18337: Policy loss: -0.045327. Value loss: 0.036525. Entropy: 2.280255.
Iteration 18338: Policy loss: -0.052512. Value loss: 0.019656. Entropy: 2.307652.
Iteration 18339: Policy loss: -0.054673. Value loss: 0.015691. Entropy: 2.310611.
Training network. lr: 0.000109. clip: 0.043792
Iteration 18340: Policy loss: -0.003291. Value loss: 0.033358. Entropy: 2.326544.
Iteration 18341: Policy loss: 0.000116. Value loss: 0.020536. Entropy: 2.324062.
Iteration 18342: Policy loss: -0.004343. Value loss: 0.016779. Entropy: 2.314694.
episode: 6567   score: 430.0  epsilon: 1.0    steps: 167  evaluation reward: 429.2
Training network. lr: 0.000109. clip: 0.043792
Iteration 18343: Policy loss: -0.075465. Value loss: 0.695816. Entropy: 2.010166.
Iteration 18344: Policy loss: -0.070908. Value loss: 0.394087. Entropy: 1.972936.
Iteration 18345: Policy loss: -0.079924

Training network. lr: 0.000109. clip: 0.043478
Iteration 18403: Policy loss: 0.063959. Value loss: 0.030101. Entropy: 2.344567.
Iteration 18404: Policy loss: 0.064911. Value loss: 0.017708. Entropy: 2.345544.
Iteration 18405: Policy loss: 0.061985. Value loss: 0.013839. Entropy: 2.352161.
episode: 6588   score: 440.0  epsilon: 1.0    steps: 200  evaluation reward: 438.4
episode: 6589   score: 410.0  epsilon: 1.0    steps: 260  evaluation reward: 439.2
episode: 6590   score: 880.0  epsilon: 1.0    steps: 385  evaluation reward: 444.0
Training network. lr: 0.000109. clip: 0.043478
Iteration 18406: Policy loss: 0.023445. Value loss: 0.026424. Entropy: 2.592360.
Iteration 18407: Policy loss: 0.026724. Value loss: 0.017926. Entropy: 2.597248.
Iteration 18408: Policy loss: 0.028206. Value loss: 0.015582. Entropy: 2.593701.
Training network. lr: 0.000109. clip: 0.043478
Iteration 18409: Policy loss: 0.064422. Value loss: 0.032611. Entropy: 2.590241.
Iteration 18410: Policy loss: 0.068481. Val

Iteration 18467: Policy loss: -0.000233. Value loss: 0.026152. Entropy: 2.593284.
Iteration 18468: Policy loss: 0.005548. Value loss: 0.018831. Entropy: 2.595498.
Training network. lr: 0.000108. clip: 0.043331
Iteration 18469: Policy loss: -0.020477. Value loss: 0.582124. Entropy: 2.535678.
Iteration 18470: Policy loss: 0.007736. Value loss: 0.294694. Entropy: 2.534104.
Iteration 18471: Policy loss: -0.006886. Value loss: 0.285093. Entropy: 2.525408.
Training network. lr: 0.000108. clip: 0.043331
Iteration 18472: Policy loss: 0.033650. Value loss: 0.042407. Entropy: 2.350241.
Iteration 18473: Policy loss: 0.025261. Value loss: 0.028493. Entropy: 2.286631.
Iteration 18474: Policy loss: 0.033784. Value loss: 0.022239. Entropy: 2.357970.
Training network. lr: 0.000108. clip: 0.043331
Iteration 18475: Policy loss: -0.027338. Value loss: 0.043734. Entropy: 2.312193.
Iteration 18476: Policy loss: -0.031731. Value loss: 0.030973. Entropy: 2.330158.
Iteration 18477: Policy loss: -0.033466. Val

episode: 6632   score: 350.0  epsilon: 1.0    steps: 702  evaluation reward: 457.3
Training network. lr: 0.000108. clip: 0.043174
Iteration 18535: Policy loss: 0.038365. Value loss: 0.019095. Entropy: 2.745561.
Iteration 18536: Policy loss: 0.046770. Value loss: 0.012524. Entropy: 2.744948.
Iteration 18537: Policy loss: 0.045068. Value loss: 0.011970. Entropy: 2.742335.
episode: 6633   score: 280.0  epsilon: 1.0    steps: 87  evaluation reward: 451.6
Training network. lr: 0.000108. clip: 0.043174
Iteration 18538: Policy loss: -0.119578. Value loss: 0.730348. Entropy: 2.695083.
Iteration 18539: Policy loss: -0.108703. Value loss: 0.314901. Entropy: 2.674226.
Iteration 18540: Policy loss: -0.104508. Value loss: 0.156834. Entropy: 2.691624.
episode: 6634   score: 380.0  epsilon: 1.0    steps: 771  evaluation reward: 452.8
Training network. lr: 0.000108. clip: 0.043174
Iteration 18541: Policy loss: -0.029377. Value loss: 0.052150. Entropy: 2.388289.
Iteration 18542: Policy loss: -0.029645.

Iteration 18599: Policy loss: -0.008583. Value loss: 0.015318. Entropy: 2.772261.
Iteration 18600: Policy loss: -0.006340. Value loss: 0.012956. Entropy: 2.766381.
episode: 6656   score: 260.0  epsilon: 1.0    steps: 639  evaluation reward: 438.7
Training network. lr: 0.000107. clip: 0.042870
Iteration 18601: Policy loss: 0.032991. Value loss: 0.010704. Entropy: 2.782031.
Iteration 18602: Policy loss: 0.030729. Value loss: 0.007349. Entropy: 2.779908.
Iteration 18603: Policy loss: 0.030460. Value loss: 0.005881. Entropy: 2.783694.
episode: 6657   score: 400.0  epsilon: 1.0    steps: 505  evaluation reward: 438.7
Training network. lr: 0.000107. clip: 0.042870
Iteration 18604: Policy loss: 0.042868. Value loss: 0.017388. Entropy: 2.782530.
Iteration 18605: Policy loss: 0.029503. Value loss: 0.015039. Entropy: 2.782633.
Iteration 18606: Policy loss: 0.042056. Value loss: 0.010203. Entropy: 2.784064.
Training network. lr: 0.000107. clip: 0.042870
Iteration 18607: Policy loss: 0.014578. Val

Training network. lr: 0.000107. clip: 0.042713
Iteration 18664: Policy loss: 0.009820. Value loss: 0.016578. Entropy: 2.761378.
Iteration 18665: Policy loss: 0.012333. Value loss: 0.012889. Entropy: 2.764324.
Iteration 18666: Policy loss: 0.006228. Value loss: 0.009937. Entropy: 2.761248.
episode: 6680   score: 310.0  epsilon: 1.0    steps: 34  evaluation reward: 406.1
episode: 6681   score: 370.0  epsilon: 1.0    steps: 551  evaluation reward: 405.8
Training network. lr: 0.000107. clip: 0.042713
Iteration 18667: Policy loss: 0.035695. Value loss: 0.013632. Entropy: 2.756993.
Iteration 18668: Policy loss: 0.029267. Value loss: 0.012719. Entropy: 2.762226.
Iteration 18669: Policy loss: 0.031091. Value loss: 0.010825. Entropy: 2.756428.
Training network. lr: 0.000107. clip: 0.042713
Iteration 18670: Policy loss: -0.004318. Value loss: 0.019379. Entropy: 2.802611.
Iteration 18671: Policy loss: -0.002859. Value loss: 0.013054. Entropy: 2.803483.
Iteration 18672: Policy loss: -0.002392. Val

Iteration 18729: Policy loss: 0.049735. Value loss: 0.018316. Entropy: 2.459216.
episode: 6703   score: 320.0  epsilon: 1.0    steps: 647  evaluation reward: 385.9
Training network. lr: 0.000106. clip: 0.042557
Iteration 18730: Policy loss: 0.036016. Value loss: 0.032393. Entropy: 2.616760.
Iteration 18731: Policy loss: 0.036547. Value loss: 0.022755. Entropy: 2.620233.
Iteration 18732: Policy loss: 0.031231. Value loss: 0.019690. Entropy: 2.617762.
episode: 6704   score: 280.0  epsilon: 1.0    steps: 234  evaluation reward: 380.9
Training network. lr: 0.000106. clip: 0.042557
Iteration 18733: Policy loss: 0.002642. Value loss: 0.023414. Entropy: 2.622054.
Iteration 18734: Policy loss: 0.002566. Value loss: 0.019913. Entropy: 2.636898.
Iteration 18735: Policy loss: 0.001257. Value loss: 0.015404. Entropy: 2.628110.
episode: 6705   score: 420.0  epsilon: 1.0    steps: 593  evaluation reward: 381.5
Training network. lr: 0.000106. clip: 0.042557
Iteration 18736: Policy loss: 0.023928. Val

Iteration 18795: Policy loss: 0.105013. Value loss: 0.027625. Entropy: 2.685873.
Training network. lr: 0.000106. clip: 0.042409
Iteration 18796: Policy loss: 0.033038. Value loss: 0.023660. Entropy: 2.700654.
Iteration 18797: Policy loss: 0.029206. Value loss: 0.020120. Entropy: 2.695378.
Iteration 18798: Policy loss: 0.026959. Value loss: 0.016735. Entropy: 2.695599.
episode: 6725   score: 370.0  epsilon: 1.0    steps: 800  evaluation reward: 389.7
Training network. lr: 0.000106. clip: 0.042409
Iteration 18799: Policy loss: -0.005181. Value loss: 0.031686. Entropy: 2.709297.
Iteration 18800: Policy loss: -0.009833. Value loss: 0.020496. Entropy: 2.711182.
Iteration 18801: Policy loss: -0.009989. Value loss: 0.017137. Entropy: 2.713339.
episode: 6726   score: 340.0  epsilon: 1.0    steps: 557  evaluation reward: 389.1
Training network. lr: 0.000106. clip: 0.042253
Iteration 18802: Policy loss: 0.005661. Value loss: 0.038888. Entropy: 2.677098.
Iteration 18803: Policy loss: 0.012593. Va

episode: 6746   score: 410.0  epsilon: 1.0    steps: 436  evaluation reward: 403.9
Training network. lr: 0.000105. clip: 0.042096
Iteration 18862: Policy loss: 0.026963. Value loss: 0.028400. Entropy: 2.702650.
Iteration 18863: Policy loss: 0.021212. Value loss: 0.019287. Entropy: 2.712115.
Iteration 18864: Policy loss: 0.023939. Value loss: 0.017581. Entropy: 2.706524.
episode: 6747   score: 260.0  epsilon: 1.0    steps: 856  evaluation reward: 403.7
Training network. lr: 0.000105. clip: 0.042096
Iteration 18865: Policy loss: -0.135255. Value loss: 0.716744. Entropy: 2.680296.
Iteration 18866: Policy loss: -0.099358. Value loss: 0.333740. Entropy: 2.690642.
Iteration 18867: Policy loss: -0.115656. Value loss: 0.395436. Entropy: 2.685685.
Training network. lr: 0.000105. clip: 0.042096
Iteration 18868: Policy loss: 0.062659. Value loss: 0.026884. Entropy: 2.356723.
Iteration 18869: Policy loss: 0.055417. Value loss: 0.017654. Entropy: 2.383683.
Iteration 18870: Policy loss: 0.060762. Va

Iteration 18929: Policy loss: 0.012795. Value loss: 0.018408. Entropy: 2.643163.
Iteration 18930: Policy loss: 0.014926. Value loss: 0.015060. Entropy: 2.636411.
Training network. lr: 0.000105. clip: 0.041948
Iteration 18931: Policy loss: 0.011719. Value loss: 0.025813. Entropy: 2.620059.
Iteration 18932: Policy loss: 0.008988. Value loss: 0.020095. Entropy: 2.621374.
Iteration 18933: Policy loss: 0.010435. Value loss: 0.015953. Entropy: 2.620380.
episode: 6766   score: 390.0  epsilon: 1.0    steps: 355  evaluation reward: 453.1
Training network. lr: 0.000105. clip: 0.041948
Iteration 18934: Policy loss: -0.165014. Value loss: 1.106041. Entropy: 2.455945.
Iteration 18935: Policy loss: -0.131673. Value loss: 0.570473. Entropy: 2.491478.
Iteration 18936: Policy loss: -0.128179. Value loss: 0.442576. Entropy: 2.447331.
episode: 6767   score: 490.0  epsilon: 1.0    steps: 460  evaluation reward: 454.7
episode: 6768   score: 980.0  epsilon: 1.0    steps: 585  evaluation reward: 459.8
Traini

Iteration 18996: Policy loss: 0.025486. Value loss: 0.020527. Entropy: 2.664300.
episode: 6787   score: 380.0  epsilon: 1.0    steps: 813  evaluation reward: 485.4
episode: 6788   score: 370.0  epsilon: 1.0    steps: 1018  evaluation reward: 485.9
Training network. lr: 0.000104. clip: 0.041792
Iteration 18997: Policy loss: 0.001426. Value loss: 0.051228. Entropy: 2.685220.
Iteration 18998: Policy loss: -0.005662. Value loss: 0.033920. Entropy: 2.691989.
Iteration 18999: Policy loss: 0.012079. Value loss: 0.026113. Entropy: 2.690034.
Training network. lr: 0.000104. clip: 0.041792
Iteration 19000: Policy loss: 0.033443. Value loss: 0.018824. Entropy: 2.703261.
Iteration 19001: Policy loss: 0.036705. Value loss: 0.011658. Entropy: 2.701748.
Iteration 19002: Policy loss: 0.030054. Value loss: 0.009946. Entropy: 2.702643.
Training network. lr: 0.000104. clip: 0.041635
Iteration 19003: Policy loss: 0.075412. Value loss: 0.046117. Entropy: 2.690220.
Iteration 19004: Policy loss: 0.087101. Val

episode: 6808   score: 390.0  epsilon: 1.0    steps: 597  evaluation reward: 479.5
Training network. lr: 0.000104. clip: 0.041488
Iteration 19063: Policy loss: 0.039484. Value loss: 0.025522. Entropy: 2.676211.
Iteration 19064: Policy loss: 0.043870. Value loss: 0.014052. Entropy: 2.672423.
Iteration 19065: Policy loss: 0.036512. Value loss: 0.010686. Entropy: 2.670335.
Training network. lr: 0.000104. clip: 0.041488
Iteration 19066: Policy loss: -0.001743. Value loss: 0.017567. Entropy: 2.700303.
Iteration 19067: Policy loss: -0.000992. Value loss: 0.011747. Entropy: 2.694954.
Iteration 19068: Policy loss: -0.004992. Value loss: 0.009780. Entropy: 2.698231.
episode: 6809   score: 410.0  epsilon: 1.0    steps: 220  evaluation reward: 480.8
Training network. lr: 0.000104. clip: 0.041488
Iteration 19069: Policy loss: -0.000354. Value loss: 0.016335. Entropy: 2.717566.
Iteration 19070: Policy loss: -0.003339. Value loss: 0.011236. Entropy: 2.718439.
Iteration 19071: Policy loss: 0.002920. 

Iteration 19130: Policy loss: -0.002802. Value loss: 0.021908. Entropy: 2.736236.
Iteration 19131: Policy loss: -0.006966. Value loss: 0.016118. Entropy: 2.736233.
episode: 6829   score: 320.0  epsilon: 1.0    steps: 376  evaluation reward: 469.6
Training network. lr: 0.000103. clip: 0.041331
Iteration 19132: Policy loss: 0.120031. Value loss: 0.038890. Entropy: 2.747196.
Iteration 19133: Policy loss: 0.116410. Value loss: 0.024485. Entropy: 2.752725.
Iteration 19134: Policy loss: 0.101068. Value loss: 0.018242. Entropy: 2.749097.
episode: 6830   score: 430.0  epsilon: 1.0    steps: 589  evaluation reward: 464.6
episode: 6831   score: 450.0  epsilon: 1.0    steps: 678  evaluation reward: 465.7
Training network. lr: 0.000103. clip: 0.041331
Iteration 19135: Policy loss: 0.085744. Value loss: 0.026405. Entropy: 2.715225.
Iteration 19136: Policy loss: 0.087858. Value loss: 0.018205. Entropy: 2.715337.
Iteration 19137: Policy loss: 0.087771. Value loss: 0.013288. Entropy: 2.715543.
episode

Iteration 19193: Policy loss: 0.050600. Value loss: 0.015445. Entropy: 2.697550.
Iteration 19194: Policy loss: 0.049924. Value loss: 0.014976. Entropy: 2.707214.
Training network. lr: 0.000103. clip: 0.041174
Iteration 19195: Policy loss: 0.035991. Value loss: 0.026139. Entropy: 2.734352.
Iteration 19196: Policy loss: 0.043276. Value loss: 0.015996. Entropy: 2.737999.
Iteration 19197: Policy loss: 0.040508. Value loss: 0.011884. Entropy: 2.738032.
episode: 6854   score: 400.0  epsilon: 1.0    steps: 386  evaluation reward: 450.0
episode: 6855   score: 440.0  epsilon: 1.0    steps: 909  evaluation reward: 435.3
Training network. lr: 0.000103. clip: 0.041174
Iteration 19198: Policy loss: -0.163965. Value loss: 0.531481. Entropy: 2.633547.
Iteration 19199: Policy loss: -0.173072. Value loss: 0.153186. Entropy: 2.590167.
Iteration 19200: Policy loss: -0.161111. Value loss: 0.099295. Entropy: 2.602510.
Training network. lr: 0.000103. clip: 0.041027
Iteration 19201: Policy loss: 0.061467. Va

Iteration 19260: Policy loss: 0.046051. Value loss: 0.021478. Entropy: 2.589510.
episode: 6875   score: 1860.0  epsilon: 1.0    steps: 327  evaluation reward: 430.6
episode: 6876   score: 430.0  epsilon: 1.0    steps: 833  evaluation reward: 431.2
Training network. lr: 0.000102. clip: 0.040870
Iteration 19261: Policy loss: 0.046579. Value loss: 0.049651. Entropy: 2.640240.
Iteration 19262: Policy loss: 0.040076. Value loss: 0.036316. Entropy: 2.635164.
Iteration 19263: Policy loss: 0.038138. Value loss: 0.033399. Entropy: 2.639867.
episode: 6877   score: 460.0  epsilon: 1.0    steps: 443  evaluation reward: 431.5
Training network. lr: 0.000102. clip: 0.040870
Iteration 19264: Policy loss: 0.017184. Value loss: 0.075598. Entropy: 2.656761.
Iteration 19265: Policy loss: 0.023175. Value loss: 0.058199. Entropy: 2.659715.
Iteration 19266: Policy loss: 0.013288. Value loss: 0.045485. Entropy: 2.654335.
Training network. lr: 0.000102. clip: 0.040870
Iteration 19267: Policy loss: 0.031582. Va

episode: 6896   score: 430.0  epsilon: 1.0    steps: 521  evaluation reward: 435.4
Training network. lr: 0.000102. clip: 0.040713
Iteration 19327: Policy loss: -0.019440. Value loss: 0.030781. Entropy: 2.685478.
Iteration 19328: Policy loss: -0.014510. Value loss: 0.016216. Entropy: 2.691503.
Iteration 19329: Policy loss: -0.017442. Value loss: 0.014495. Entropy: 2.695205.
episode: 6897   score: 440.0  epsilon: 1.0    steps: 183  evaluation reward: 436.2
Training network. lr: 0.000102. clip: 0.040713
Iteration 19330: Policy loss: 0.048598. Value loss: 0.020418. Entropy: 2.686926.
Iteration 19331: Policy loss: 0.050360. Value loss: 0.015786. Entropy: 2.691554.
Iteration 19332: Policy loss: 0.046825. Value loss: 0.015463. Entropy: 2.691249.
Training network. lr: 0.000102. clip: 0.040713
Iteration 19333: Policy loss: 0.094649. Value loss: 0.025464. Entropy: 2.718697.
Iteration 19334: Policy loss: 0.095530. Value loss: 0.016293. Entropy: 2.711522.
Iteration 19335: Policy loss: 0.094182. Va

Iteration 19394: Policy loss: 0.061223. Value loss: 0.014292. Entropy: 2.735574.
Iteration 19395: Policy loss: 0.057869. Value loss: 0.012286. Entropy: 2.734450.
episode: 6916   score: 350.0  epsilon: 1.0    steps: 396  evaluation reward: 443.6
episode: 6917   score: 410.0  epsilon: 1.0    steps: 570  evaluation reward: 444.6
episode: 6918   score: 360.0  epsilon: 1.0    steps: 732  evaluation reward: 443.9
Training network. lr: 0.000101. clip: 0.040566
Iteration 19396: Policy loss: 0.059578. Value loss: 0.031412. Entropy: 2.727205.
Iteration 19397: Policy loss: 0.056633. Value loss: 0.022588. Entropy: 2.718465.
Iteration 19398: Policy loss: 0.054509. Value loss: 0.020929. Entropy: 2.718506.
Training network. lr: 0.000101. clip: 0.040566
Iteration 19399: Policy loss: 0.058412. Value loss: 0.024807. Entropy: 2.721362.
Iteration 19400: Policy loss: 0.054702. Value loss: 0.019953. Entropy: 2.719489.
Iteration 19401: Policy loss: 0.055640. Value loss: 0.013996. Entropy: 2.723854.
episode: 

Iteration 19459: Policy loss: 0.063293. Value loss: 0.025202. Entropy: 2.687689.
Iteration 19460: Policy loss: 0.072857. Value loss: 0.015096. Entropy: 2.686357.
Iteration 19461: Policy loss: 0.063048. Value loss: 0.011093. Entropy: 2.682626.
Training network. lr: 0.000101. clip: 0.040253
Iteration 19462: Policy loss: 0.049451. Value loss: 0.019614. Entropy: 2.688149.
Iteration 19463: Policy loss: 0.048528. Value loss: 0.014518. Entropy: 2.681479.
Iteration 19464: Policy loss: 0.042184. Value loss: 0.010683. Entropy: 2.687719.
episode: 6939   score: 380.0  epsilon: 1.0    steps: 144  evaluation reward: 455.2
Training network. lr: 0.000101. clip: 0.040253
Iteration 19465: Policy loss: -0.098694. Value loss: 0.461701. Entropy: 2.699854.
Iteration 19466: Policy loss: -0.099654. Value loss: 0.533758. Entropy: 2.684736.
Iteration 19467: Policy loss: -0.077814. Value loss: 0.345243. Entropy: 2.668101.
episode: 6940   score: 940.0  epsilon: 1.0    steps: 723  evaluation reward: 460.8
Training

Iteration 19526: Policy loss: -0.000376. Value loss: 0.008634. Entropy: 2.747331.
Iteration 19527: Policy loss: 0.000210. Value loss: 0.007605. Entropy: 2.747658.
episode: 6960   score: 410.0  epsilon: 1.0    steps: 925  evaluation reward: 447.2
Training network. lr: 0.000100. clip: 0.040105
Iteration 19528: Policy loss: 0.017743. Value loss: 0.019385. Entropy: 2.771859.
Iteration 19529: Policy loss: 0.019828. Value loss: 0.011228. Entropy: 2.768120.
Iteration 19530: Policy loss: 0.019084. Value loss: 0.008857. Entropy: 2.767966.
episode: 6961   score: 430.0  epsilon: 1.0    steps: 455  evaluation reward: 448.6
Training network. lr: 0.000100. clip: 0.040105
Iteration 19531: Policy loss: 0.028981. Value loss: 0.012139. Entropy: 2.786672.
Iteration 19532: Policy loss: 0.027138. Value loss: 0.009603. Entropy: 2.789460.
Iteration 19533: Policy loss: 0.025878. Value loss: 0.008371. Entropy: 2.787711.
episode: 6962   score: 240.0  epsilon: 1.0    steps: 201  evaluation reward: 446.2
episode:

Iteration 19592: Policy loss: 0.058940. Value loss: 0.011879. Entropy: 2.797616.
Iteration 19593: Policy loss: 0.056339. Value loss: 0.009675. Entropy: 2.799191.
episode: 6982   score: 450.0  epsilon: 1.0    steps: 917  evaluation reward: 418.2
Training network. lr: 0.000100. clip: 0.039949
Iteration 19594: Policy loss: -0.038987. Value loss: 0.019247. Entropy: 2.765777.
Iteration 19595: Policy loss: -0.042894. Value loss: 0.012690. Entropy: 2.766421.
Iteration 19596: Policy loss: -0.035663. Value loss: 0.011156. Entropy: 2.766535.
episode: 6983   score: 330.0  epsilon: 1.0    steps: 29  evaluation reward: 412.3
Training network. lr: 0.000100. clip: 0.039949
Iteration 19597: Policy loss: -0.001225. Value loss: 0.014714. Entropy: 2.790957.
Iteration 19598: Policy loss: -0.004411. Value loss: 0.011715. Entropy: 2.790993.
Iteration 19599: Policy loss: -0.003121. Value loss: 0.011040. Entropy: 2.790483.
episode: 6984   score: 420.0  epsilon: 1.0    steps: 874  evaluation reward: 413.1
Trai