# Deep Q-Learning 

For this assignment we will implement the Deep Q-Learning algorithm with Experience Replay as described in breakthrough paper __"Playing Atari with Deep Reinforcement Learning"__. We will train an agent to play the famous game of __Breakout__.

In [1]:
import sys
import gym
import torch
import pylab
import random
import numpy as np
import time
from collections import deque
from datetime import datetime
from copy import deepcopy
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from utils import *
from agent import *
from model import *
from config import *
from env import GameEnv
%matplotlib inline
%load_ext autoreload
%autoreload 2

## Understanding the environment

In the following cell, we initialise our game of __Breakout__ and you can see how the environment looks like. For further documentation of the of the environment refer to https://gym.openai.com/envs. 

In [2]:
envs = []
for i in range(num_envs):
    envs.append(GameEnv('SpaceInvadersDeterministic-v4'))
#env.render()

  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "


In [3]:
number_lives = envs[0].life
state_size = envs[0].observation_space.shape
action_size = envs[0].action_space.n
rewards, episodes = [], []

## Creating a DQN Agent

Here we create a DQN Agent. This agent is defined in the __agent.py__. The corresponding neural network is defined in the __model.py__. 

__Evaluation Reward__ : The average reward received in the past 100 episodes/games.

__Frame__ : Number of frames processed in total.

__Memory Size__ : The current size of the replay memory.

In [4]:
agent = Agent(action_size)
torch.save(agent.policy_net.state_dict(), "./save_model/spaceinvaders_ppo_best")
evaluation_reward = deque(maxlen=evaluation_reward_length)
frame = 0
memory_size = 0
reset_max = 10

### Main Training Loop

In [2]:
vis_env_idx = 0
vis_env = envs[vis_env_idx]
e = 0
frame = 0
max_eval = -np.inf
reset_count = 0

while (frame < 10000000):
    step = 0
    assert(num_envs * env_mem_size == train_frame)
    frame_next_vals = []
    for i in range(num_envs):
        env = envs[i]
        #history = env.history
        #life = env.life
        #state, reward, done, info = [env.state, env.reward, env.done, env.info]
        for j in range(env_mem_size):
            step += 1
            frame += 1
            
            curr_state = env.history[HISTORY_SIZE-1,:,:]
            action, value = agent.get_action(np.float32(env.history[:HISTORY_SIZE,:,:]) / 255.)
            
            next_state, env.reward, env.done, env.info = env.step(action)
            
            if (i == vis_env_idx):
                vis_env._env.render()
            
            frame_next_state = get_frame(next_state)
            env.history[HISTORY_SIZE,:,:] = frame_next_state
            terminal_state = check_live(env.life, env.info['ale.lives'])
            
            env.life = env.info['ale.lives']
            r = env.reward
            
            agent.memory.push(i, deepcopy(curr_state), action, r, terminal_state, value, 0, 0)
            if (j == env_mem_size-1):
                _, frame_next_val = agent.get_action(np.float32(env.history[1:,:,:]) / 255.)
                frame_next_vals.append(frame_next_val)
            env.score += r
            env.history[:HISTORY_SIZE, :, :] = env.history[1:,:,:]
            
            if (env.done):
                if (e % 50 == 0):
                    print('now time : ', datetime.now())
                    rewards.append(np.mean(evaluation_reward))
                    episodes.append(e)
                    pylab.plot(episodes, rewards, 'b')
                    pylab.savefig("./save_graph/spaceinvaders_ppo.png")
                    torch.save(agent.policy_net, "./save_model/spaceinvaders_ppo")
                    
                    if np.mean(evaluation_reward) > max_eval:
                        torch.save(agent.policy_net.state_dict(), "./save_model/spaceinvaders_ppo_best")
                        max_eval = float(np.mean(evaluation_reward))
                        reset_count = 0
                    elif e > 5000:
                        reset_count += 1
                        """
                        if (reset_count == reset_max):
                            print("Training went nowhere, starting again at best model")
                            agent.policy_net.load_state_dict(torch.load("./save_model/spaceinvaders_ppo_best"))
                            agent.update_target_net()
                            reset_count = 0
                        """
                e += 1
                evaluation_reward.append(env.score)
                print("episode:", e, "  score:", env.score,  " epsilon:", agent.epsilon, "   steps:", step,
                  " evaluation reward:", np.mean(evaluation_reward))
                
                env.done = False
                env.score = 0
                env.history = np.zeros([HISTORY_SIZE+1,84,84], dtype=np.uint8)
                env.state = env.reset()
                env.life = number_lives
                get_init_state(env.history, env.state)
                
                
                
    agent.train_policy_net(frame, frame_next_vals)
    agent.update_target_net()


NameError: name 'envs' is not defined

In [None]:
torch.save(agent.policy_net, "./save_model/spaceinvaders_ppo")

In [None]:
### Loop through all environments and run PPO on them

env_names = ['Breakout-v0', 'Phoenix-v0', 'Asteroids-v0', 'SpaceInvaders-v0', 'MsPacman-v0', 'Asterix-v0', 'Atlantis-v0', 'Alien-v0', 'Amidar-v0', 'Assault-v0', 'BankHeist-v0']
for a in range(len(env_names)):
    name = env_names[a]
    print("\n\n\n ------- STARTING TRAINING FOR %s ------- \n\n\n" % (name))
    
    envs = []
    for i in range(num_envs):
        envs.append(GameEnv(name))
    #env.render()
    

    number_lives = envs[0].life
    state_size = envs[0].observation_space.shape
    if (name == 'SpaceInvaders-v0' or name == 'Breakout-v0'):
        action_size = 4
    else:
        action_size = envs[0].action_space.n
    rewards, episodes = [], []

    vis_env_idx = 0
    vis_env = envs[vis_env_idx]
    e = 0
    frame = 0
    max_eval = -np.inf
    reset_count = 0


    agent = Agent(action_size)
    torch.save(agent.policy_net.state_dict(), "./save_model/" + name + "_best")
    evaluation_reward = deque(maxlen=evaluation_reward_length)
    frame = 0
    memory_size = 0
    reset_max = 10
    
    print("Determing min/max rewards of environment")
    [low, high] = score_range = get_score_range(name)
    print("Min: %d. Max: %d." % (low, high))

    while (frame < 10000000):
        step = 0
        assert(num_envs * env_mem_size == train_frame)
        frame_next_vals = []
        
        for j in range(env_mem_size):
            
            curr_states = np.stack([envs[i].history[HISTORY_SIZE-1,:,:] for i in range(num_envs)])
            next_states = []
            net_in = np.stack([envs[i].history[:HISTORY_SIZE,:,:] for i in range(num_envs)])
            step += num_envs
            frame += num_envs
            actions, values = agent.get_action(np.float32(net_in) / 255.)
            
            for i in range(num_envs):
                env = envs[i]
                next_state, env.reward, env.done, env.info = env.step(actions[i])
                next_states.append(next_state)
                if (i == vis_env_idx):
                    vis_env._env.render()
            
            for i in range(num_envs):
                env = envs[i]
                """
                next_state, env.reward, env.done, env.info = env.step(actions[i])
                if (i == vis_env_idx):
                    vis_env._env.render()
                """
                
                frame_next_state = get_frame(next_states[i])
                env.history[HISTORY_SIZE,:,:] = frame_next_state
                terminal_state = check_live(env.life, env.info['ale.lives'])
                env.life = env.info['ale.lives']
                r = (env.reward / high) * 20.0 #np.log(max(env.reward+1, 1))#((env.reward - low) / (high - low)) * 30
                agent.memory.push(i, deepcopy(curr_states[i]), actions[i], r, terminal_state, values[i], 0, 0)
                
                if (j == env_mem_size-1):
                    net_in = np.stack([envs[k].history[1:,:,:] for k in range(num_envs)])
                    _, frame_next_vals = agent.get_action(np.float32(net_in) / 255.)
                
                env.score += env.reward
                env.history[:HISTORY_SIZE, :, :] = env.history[1:,:,:]
        
                if (env.done):
                    if (e % 50 == 0):
                        print('now time : ', datetime.now())
                        rewards.append(np.mean(evaluation_reward))
                        episodes.append(e)
                        pylab.plot(episodes, rewards, 'b')
                        pylab.savefig("./save_graph/" + name + "_ppo.png")
                        torch.save(agent.policy_net, "./save_model/" + name + "_ppo")

                        if np.mean(evaluation_reward) > max_eval:
                            torch.save(agent.policy_net.state_dict(), "./save_model/"  + name + "_ppo_best")
                            max_eval = float(np.mean(evaluation_reward))
                            reset_count = 0
                        elif e > 5000:
                            reset_count += 1
                            """
                            if (reset_count == reset_max):
                                print("Training went nowhere, starting again at best model")
                                agent.policy_net.load_state_dict(torch.load("./save_model/spaceinvaders_ppo_best"))
                                agent.update_target_net()
                                reset_count = 0
                            """
                    e += 1
                    evaluation_reward.append(env.score)
                    print("episode:", e, "  score:", env.score,  " epsilon:", agent.epsilon, "   steps:", step,
                      " evaluation reward:", np.mean(evaluation_reward))

                    env.done = False
                    env.score = 0
                    env.history = np.zeros([HISTORY_SIZE+1,84,84], dtype=np.uint8)
                    env.state = env.reset()
                    env.life = number_lives
                    get_init_state(env.history, env.state)
            
        agent.train_policy_net(frame, frame_next_vals)
        agent.update_target_net()
    print("FINISHED TRAINING FOR %s" % (name))
    pylab.figure()
    
    for i in range(len(envs)):
        envs[i]._env.close()
    del envs




 ------- STARTING TRAINING FOR Breakout-v0 ------- 





  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "


Determing min/max rewards of environment
Min: 0. Max: 4.


  probs = F.softmax(x[:,:self.action_size] - torch.max(x[:,:self.action_size],1)[0].unsqueeze(1))


Training network. lr: 0.000250. clip: 0.100000


  pol_loss += pol_avg.detach().cpu()[0]
  vf_loss += value_loss.detach().cpu()[0]
  ent_total += ent.detach().cpu()[0]


Iteration 1: Policy loss: 0.002390. Value loss: 0.227342. Entropy: 1.384487.
Iteration 2: Policy loss: 0.001047. Value loss: 0.233750. Entropy: 1.385055.
Iteration 3: Policy loss: 0.000125. Value loss: 0.223898. Entropy: 1.384596.
now time :  2019-03-06 12:27:39.413569
episode: 1   score: 0.0  epsilon: 1.0    steps: 304  evaluation reward: 0.0


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


episode: 2   score: 0.0  epsilon: 1.0    steps: 448  evaluation reward: 0.0
episode: 3   score: 0.0  epsilon: 1.0    steps: 464  evaluation reward: 0.0
episode: 4   score: 1.0  epsilon: 1.0    steps: 704  evaluation reward: 0.25
episode: 5   score: 1.0  epsilon: 1.0    steps: 936  evaluation reward: 0.4
Training network. lr: 0.000250. clip: 0.100000
Iteration 4: Policy loss: 0.000030. Value loss: 0.191240. Entropy: 1.384423.
Iteration 5: Policy loss: -0.001439. Value loss: 0.187264. Entropy: 1.383807.
Iteration 6: Policy loss: 0.000962. Value loss: 0.168607. Entropy: 1.384462.
episode: 6   score: 2.0  epsilon: 1.0    steps: 32  evaluation reward: 0.6666666666666666
episode: 7   score: 2.0  epsilon: 1.0    steps: 56  evaluation reward: 0.8571428571428571
episode: 8   score: 2.0  epsilon: 1.0    steps: 144  evaluation reward: 1.0
Training network. lr: 0.000250. clip: 0.100000
Iteration 7: Policy loss: -0.001907. Value loss: 0.206907. Entropy: 1.383122.
Iteration 8: Policy loss: -0.004782

Iteration 41: Policy loss: 0.001151. Value loss: 0.080172. Entropy: 1.379997.
Iteration 42: Policy loss: -0.000961. Value loss: 0.077325. Entropy: 1.377936.
episode: 56   score: 1.0  epsilon: 1.0    steps: 400  evaluation reward: 1.1964285714285714
episode: 57   score: 0.0  epsilon: 1.0    steps: 408  evaluation reward: 1.1754385964912282
episode: 58   score: 2.0  epsilon: 1.0    steps: 704  evaluation reward: 1.1896551724137931
Training network. lr: 0.000250. clip: 0.100000
Iteration 43: Policy loss: 0.000578. Value loss: 0.341171. Entropy: 1.375225.
Iteration 44: Policy loss: -0.001975. Value loss: 0.312889. Entropy: 1.378906.
Iteration 45: Policy loss: -0.003862. Value loss: 0.288536. Entropy: 1.378144.
episode: 59   score: 0.0  epsilon: 1.0    steps: 16  evaluation reward: 1.1694915254237288
episode: 60   score: 2.0  epsilon: 1.0    steps: 440  evaluation reward: 1.1833333333333333
episode: 61   score: 5.0  epsilon: 1.0    steps: 464  evaluation reward: 1.2459016393442623
episode: 

Iteration 79: Policy loss: -0.000197. Value loss: 0.154334. Entropy: 1.368435.
Iteration 80: Policy loss: -0.007038. Value loss: 0.111079. Entropy: 1.366695.
Iteration 81: Policy loss: -0.013366. Value loss: 0.083446. Entropy: 1.367448.
episode: 108   score: 2.0  epsilon: 1.0    steps: 16  evaluation reward: 1.4
episode: 109   score: 0.0  epsilon: 1.0    steps: 216  evaluation reward: 1.4
episode: 110   score: 0.0  epsilon: 1.0    steps: 696  evaluation reward: 1.4
Training network. lr: 0.000250. clip: 0.099853
Iteration 82: Policy loss: 0.000120. Value loss: 0.161663. Entropy: 1.375745.
Iteration 83: Policy loss: -0.006593. Value loss: 0.121681. Entropy: 1.376585.
Iteration 84: Policy loss: -0.007413. Value loss: 0.105201. Entropy: 1.376545.
episode: 111   score: 1.0  epsilon: 1.0    steps: 168  evaluation reward: 1.4
episode: 112   score: 0.0  epsilon: 1.0    steps: 176  evaluation reward: 1.38
episode: 113   score: 0.0  epsilon: 1.0    steps: 424  evaluation reward: 1.35
episode: 11

episode: 162   score: 2.0  epsilon: 1.0    steps: 528  evaluation reward: 1.44
episode: 163   score: 3.0  epsilon: 1.0    steps: 624  evaluation reward: 1.46
Training network. lr: 0.000249. clip: 0.099696
Iteration 121: Policy loss: -0.000204. Value loss: 0.212706. Entropy: 1.366190.
Iteration 122: Policy loss: -0.008553. Value loss: 0.120244. Entropy: 1.364973.
Iteration 123: Policy loss: -0.013215. Value loss: 0.072746. Entropy: 1.362384.
episode: 164   score: 0.0  epsilon: 1.0    steps: 88  evaluation reward: 1.46
episode: 165   score: 1.0  epsilon: 1.0    steps: 272  evaluation reward: 1.46
episode: 166   score: 2.0  epsilon: 1.0    steps: 696  evaluation reward: 1.46
episode: 167   score: 0.0  epsilon: 1.0    steps: 704  evaluation reward: 1.46
Training network. lr: 0.000249. clip: 0.099696
Iteration 124: Policy loss: 0.004117. Value loss: 0.192623. Entropy: 1.370597.
Iteration 125: Policy loss: -0.001583. Value loss: 0.106195. Entropy: 1.373031.
Iteration 126: Policy loss: -0.007

episode: 215   score: 4.0  epsilon: 1.0    steps: 648  evaluation reward: 1.54
episode: 216   score: 3.0  epsilon: 1.0    steps: 712  evaluation reward: 1.54
Training network. lr: 0.000249. clip: 0.099548
Iteration 163: Policy loss: 0.000111. Value loss: 0.209568. Entropy: 1.347807.
Iteration 164: Policy loss: -0.009831. Value loss: 0.114119. Entropy: 1.347252.
Iteration 165: Policy loss: -0.017544. Value loss: 0.078958. Entropy: 1.337964.
episode: 217   score: 2.0  epsilon: 1.0    steps: 104  evaluation reward: 1.56
episode: 218   score: 1.0  epsilon: 1.0    steps: 136  evaluation reward: 1.56
episode: 219   score: 3.0  epsilon: 1.0    steps: 200  evaluation reward: 1.59
episode: 220   score: 5.0  epsilon: 1.0    steps: 624  evaluation reward: 1.62
episode: 221   score: 3.0  epsilon: 1.0    steps: 736  evaluation reward: 1.63
Training network. lr: 0.000249. clip: 0.099548
Iteration 166: Policy loss: 0.001925. Value loss: 0.225465. Entropy: 1.323175.
Iteration 167: Policy loss: -0.0037

Iteration 209: Policy loss: -0.005182. Value loss: 0.149935. Entropy: 1.272631.
Iteration 210: Policy loss: -0.009526. Value loss: 0.103117. Entropy: 1.270735.
episode: 263   score: 3.0  epsilon: 1.0    steps: 112  evaluation reward: 2.19
episode: 264   score: 5.0  epsilon: 1.0    steps: 152  evaluation reward: 2.24
episode: 265   score: 2.0  epsilon: 1.0    steps: 792  evaluation reward: 2.25
episode: 266   score: 3.0  epsilon: 1.0    steps: 816  evaluation reward: 2.26
episode: 267   score: 5.0  epsilon: 1.0    steps: 856  evaluation reward: 2.31
Training network. lr: 0.000248. clip: 0.099392
Iteration 211: Policy loss: 0.001765. Value loss: 0.230060. Entropy: 1.280344.
Iteration 212: Policy loss: -0.012747. Value loss: 0.123466. Entropy: 1.274100.
Iteration 213: Policy loss: -0.017580. Value loss: 0.093540. Entropy: 1.274479.
episode: 268   score: 2.0  epsilon: 1.0    steps: 560  evaluation reward: 2.32
episode: 269   score: 3.0  epsilon: 1.0    steps: 856  evaluation reward: 2.35
e

Iteration 258: Policy loss: -0.015665. Value loss: 0.169533. Entropy: 1.231574.
episode: 308   score: 5.0  epsilon: 1.0    steps: 728  evaluation reward: 3.26
episode: 309   score: 9.0  epsilon: 1.0    steps: 904  evaluation reward: 3.35
Training network. lr: 0.000248. clip: 0.099235
Iteration 259: Policy loss: 0.003702. Value loss: 0.370378. Entropy: 1.238711.
Iteration 260: Policy loss: -0.003704. Value loss: 0.221941. Entropy: 1.244420.
Iteration 261: Policy loss: -0.017160. Value loss: 0.159266. Entropy: 1.242695.
episode: 310   score: 6.0  epsilon: 1.0    steps: 920  evaluation reward: 3.39
Training network. lr: 0.000248. clip: 0.099235
Iteration 262: Policy loss: 0.002642. Value loss: 0.270457. Entropy: 1.235396.
Iteration 263: Policy loss: -0.008673. Value loss: 0.135339. Entropy: 1.233327.
Iteration 264: Policy loss: -0.021226. Value loss: 0.097609. Entropy: 1.232366.
episode: 311   score: 5.0  epsilon: 1.0    steps: 320  evaluation reward: 3.4
episode: 312   score: 4.0  epsilo

episode: 350   score: 4.0  epsilon: 1.0    steps: 392  evaluation reward: 3.99
Training network. lr: 0.000248. clip: 0.099088
Iteration 310: Policy loss: 0.004105. Value loss: 0.328511. Entropy: 1.187344.
Iteration 311: Policy loss: -0.011275. Value loss: 0.135428. Entropy: 1.188210.
Iteration 312: Policy loss: -0.019156. Value loss: 0.087905. Entropy: 1.190449.
now time :  2019-03-06 12:34:21.633487
episode: 351   score: 5.0  epsilon: 1.0    steps: 280  evaluation reward: 4.04
episode: 352   score: 8.0  epsilon: 1.0    steps: 328  evaluation reward: 4.09
episode: 353   score: 3.0  epsilon: 1.0    steps: 360  evaluation reward: 4.11
episode: 354   score: 3.0  epsilon: 1.0    steps: 800  evaluation reward: 4.12
Training network. lr: 0.000248. clip: 0.099088
Iteration 313: Policy loss: 0.004304. Value loss: 0.237943. Entropy: 1.215490.
Iteration 314: Policy loss: -0.009863. Value loss: 0.104690. Entropy: 1.208585.
Iteration 315: Policy loss: -0.022156. Value loss: 0.074907. Entropy: 1.20

episode: 390   score: 17.0  epsilon: 1.0    steps: 1008  evaluation reward: 5.29
Training network. lr: 0.000247. clip: 0.098931
Iteration 364: Policy loss: 0.003036. Value loss: 0.589540. Entropy: 1.197549.
Iteration 365: Policy loss: -0.007362. Value loss: 0.325766. Entropy: 1.191405.
Iteration 366: Policy loss: -0.014861. Value loss: 0.214987. Entropy: 1.186694.
episode: 391   score: 10.0  epsilon: 1.0    steps: 352  evaluation reward: 5.33
Training network. lr: 0.000247. clip: 0.098931
Iteration 367: Policy loss: 0.014604. Value loss: 0.525025. Entropy: 1.129842.
Iteration 368: Policy loss: -0.008341. Value loss: 0.279076. Entropy: 1.130459.
Iteration 369: Policy loss: -0.009523. Value loss: 0.176983. Entropy: 1.124755.
episode: 392   score: 6.0  epsilon: 1.0    steps: 40  evaluation reward: 5.36
episode: 393   score: 10.0  epsilon: 1.0    steps: 624  evaluation reward: 5.42
Training network. lr: 0.000247. clip: 0.098931
Iteration 370: Policy loss: 0.002637. Value loss: 0.445558. En

Iteration 419: Policy loss: -0.014698. Value loss: 0.119089. Entropy: 1.125947.
Iteration 420: Policy loss: -0.026087. Value loss: 0.078811. Entropy: 1.123764.
episode: 427   score: 8.0  epsilon: 1.0    steps: 520  evaluation reward: 5.94
episode: 428   score: 5.0  epsilon: 1.0    steps: 632  evaluation reward: 5.96
episode: 429   score: 5.0  epsilon: 1.0    steps: 768  evaluation reward: 5.97
Training network. lr: 0.000247. clip: 0.098774
Iteration 421: Policy loss: 0.004599. Value loss: 0.257992. Entropy: 1.184000.
Iteration 422: Policy loss: -0.015314. Value loss: 0.122958. Entropy: 1.177583.
Iteration 423: Policy loss: -0.025110. Value loss: 0.071340. Entropy: 1.176385.
episode: 430   score: 5.0  epsilon: 1.0    steps: 272  evaluation reward: 5.97
episode: 431   score: 8.0  epsilon: 1.0    steps: 880  evaluation reward: 6.0
episode: 432   score: 5.0  epsilon: 1.0    steps: 960  evaluation reward: 6.03
Training network. lr: 0.000247. clip: 0.098774
Iteration 424: Policy loss: 0.0053

episode: 468   score: 8.0  epsilon: 1.0    steps: 952  evaluation reward: 6.58
Training network. lr: 0.000247. clip: 0.098627
Iteration 472: Policy loss: 0.003594. Value loss: 0.220788. Entropy: 1.052667.
Iteration 473: Policy loss: -0.013559. Value loss: 0.123360. Entropy: 1.058410.
Iteration 474: Policy loss: -0.020842. Value loss: 0.085157. Entropy: 1.058885.
episode: 469   score: 5.0  epsilon: 1.0    steps: 488  evaluation reward: 6.56
Training network. lr: 0.000247. clip: 0.098627
Iteration 475: Policy loss: 0.005394. Value loss: 0.320724. Entropy: 0.932983.
Iteration 476: Policy loss: -0.014013. Value loss: 0.144004. Entropy: 0.951383.
Iteration 477: Policy loss: -0.026345. Value loss: 0.101709. Entropy: 0.946220.
episode: 470   score: 7.0  epsilon: 1.0    steps: 96  evaluation reward: 6.59
Training network. lr: 0.000247. clip: 0.098627
Iteration 478: Policy loss: 0.004355. Value loss: 0.294110. Entropy: 1.043341.
Iteration 479: Policy loss: -0.010199. Value loss: 0.161768. Entro

Iteration 527: Policy loss: -0.010588. Value loss: 0.104489. Entropy: 1.020597.
Iteration 528: Policy loss: -0.024173. Value loss: 0.077180. Entropy: 1.025509.
episode: 505   score: 3.0  epsilon: 1.0    steps: 288  evaluation reward: 6.26
episode: 506   score: 4.0  epsilon: 1.0    steps: 344  evaluation reward: 6.26
episode: 507   score: 7.0  epsilon: 1.0    steps: 432  evaluation reward: 6.29
Training network. lr: 0.000246. clip: 0.098470
Iteration 529: Policy loss: 0.005713. Value loss: 0.237102. Entropy: 0.954925.
Iteration 530: Policy loss: -0.014112. Value loss: 0.131813. Entropy: 0.956520.
Iteration 531: Policy loss: -0.023617. Value loss: 0.091946. Entropy: 0.958781.
episode: 508   score: 11.0  epsilon: 1.0    steps: 64  evaluation reward: 6.31
episode: 509   score: 9.0  epsilon: 1.0    steps: 488  evaluation reward: 6.32
episode: 510   score: 12.0  epsilon: 1.0    steps: 944  evaluation reward: 6.38
episode: 511   score: 7.0  epsilon: 1.0    steps: 960  evaluation reward: 6.41


Training network. lr: 0.000246. clip: 0.098313
Iteration 583: Policy loss: 0.004868. Value loss: 0.232219. Entropy: 0.994816.
Iteration 584: Policy loss: -0.008912. Value loss: 0.111045. Entropy: 0.988831.
Iteration 585: Policy loss: -0.018923. Value loss: 0.072158. Entropy: 0.990723.
episode: 542   score: 15.0  epsilon: 1.0    steps: 184  evaluation reward: 6.84
episode: 543   score: 7.0  epsilon: 1.0    steps: 272  evaluation reward: 6.84
episode: 544   score: 8.0  epsilon: 1.0    steps: 472  evaluation reward: 6.84
Training network. lr: 0.000246. clip: 0.098313
Iteration 586: Policy loss: 0.004587. Value loss: 0.207427. Entropy: 0.971878.
Iteration 587: Policy loss: -0.012034. Value loss: 0.083219. Entropy: 0.964421.
Iteration 588: Policy loss: -0.028973. Value loss: 0.057785. Entropy: 0.955684.
episode: 545   score: 4.0  epsilon: 1.0    steps: 120  evaluation reward: 6.84
Training network. lr: 0.000246. clip: 0.098313
Iteration 589: Policy loss: 0.013430. Value loss: 0.419853. Entr

Iteration 637: Policy loss: 0.005602. Value loss: 0.374420. Entropy: 0.916068.
Iteration 638: Policy loss: 0.005869. Value loss: 0.196664. Entropy: 0.918266.
Iteration 639: Policy loss: -0.003681. Value loss: 0.145378. Entropy: 0.906557.
episode: 580   score: 7.0  epsilon: 1.0    steps: 472  evaluation reward: 7.09
episode: 581   score: 8.0  epsilon: 1.0    steps: 688  evaluation reward: 7.1
episode: 582   score: 6.0  epsilon: 1.0    steps: 1024  evaluation reward: 7.12
Training network. lr: 0.000245. clip: 0.098166
Iteration 640: Policy loss: 0.002203. Value loss: 0.213241. Entropy: 0.981409.
Iteration 641: Policy loss: -0.014982. Value loss: 0.105491. Entropy: 0.972633.
Iteration 642: Policy loss: -0.021469. Value loss: 0.066577. Entropy: 0.970861.
Training network. lr: 0.000245. clip: 0.098166
Iteration 643: Policy loss: -0.000285. Value loss: 0.220339. Entropy: 0.985958.
Iteration 644: Policy loss: -0.014611. Value loss: 0.111689. Entropy: 0.980875.
Iteration 645: Policy loss: -0.0

episode: 616   score: 6.0  epsilon: 1.0    steps: 200  evaluation reward: 7.27
Training network. lr: 0.000245. clip: 0.098009
Iteration 694: Policy loss: 0.004269. Value loss: 0.174792. Entropy: 0.951645.
Iteration 695: Policy loss: -0.013438. Value loss: 0.085032. Entropy: 0.957506.
Iteration 696: Policy loss: -0.025275. Value loss: 0.060568. Entropy: 0.961180.
episode: 617   score: 4.0  epsilon: 1.0    steps: 416  evaluation reward: 7.24
Training network. lr: 0.000245. clip: 0.098009
Iteration 697: Policy loss: 0.005816. Value loss: 0.252816. Entropy: 0.926916.
Iteration 698: Policy loss: -0.014084. Value loss: 0.108484. Entropy: 0.920964.
Iteration 699: Policy loss: -0.024784. Value loss: 0.058554. Entropy: 0.917857.
episode: 618   score: 8.0  epsilon: 1.0    steps: 8  evaluation reward: 7.25
episode: 619   score: 10.0  epsilon: 1.0    steps: 280  evaluation reward: 7.27
episode: 620   score: 10.0  epsilon: 1.0    steps: 840  evaluation reward: 7.31
Training network. lr: 0.000245. c

episode: 652   score: 8.0  epsilon: 1.0    steps: 360  evaluation reward: 7.41
episode: 653   score: 5.0  epsilon: 1.0    steps: 480  evaluation reward: 7.37
Training network. lr: 0.000244. clip: 0.097705
Iteration 751: Policy loss: 0.009177. Value loss: 0.274535. Entropy: 0.883856.
Iteration 752: Policy loss: -0.014975. Value loss: 0.140607. Entropy: 0.889370.
Iteration 753: Policy loss: -0.024476. Value loss: 0.101394. Entropy: 0.883403.
Training network. lr: 0.000244. clip: 0.097705
Iteration 754: Policy loss: 0.007065. Value loss: 0.209752. Entropy: 0.899666.
Iteration 755: Policy loss: -0.007187. Value loss: 0.083595. Entropy: 0.896861.
Iteration 756: Policy loss: -0.025177. Value loss: 0.058675. Entropy: 0.893979.
episode: 654   score: 14.0  epsilon: 1.0    steps: 296  evaluation reward: 7.45
Training network. lr: 0.000244. clip: 0.097705
Iteration 757: Policy loss: 0.012556. Value loss: 0.260536. Entropy: 0.916277.
Iteration 758: Policy loss: -0.008790. Value loss: 0.117691. Ent

Training network. lr: 0.000244. clip: 0.097549
Iteration 808: Policy loss: 0.004424. Value loss: 0.314796. Entropy: 0.895955.
Iteration 809: Policy loss: -0.019663. Value loss: 0.126500. Entropy: 0.880473.
Iteration 810: Policy loss: -0.025777. Value loss: 0.080488. Entropy: 0.886988.
episode: 687   score: 11.0  epsilon: 1.0    steps: 440  evaluation reward: 8.1
Training network. lr: 0.000244. clip: 0.097549
Iteration 811: Policy loss: 0.008154. Value loss: 0.382092. Entropy: 0.979760.
Iteration 812: Policy loss: -0.012580. Value loss: 0.158239. Entropy: 0.968898.
Iteration 813: Policy loss: -0.023038. Value loss: 0.094272. Entropy: 0.972876.
episode: 688   score: 3.0  epsilon: 1.0    steps: 112  evaluation reward: 8.05
episode: 689   score: 8.0  epsilon: 1.0    steps: 760  evaluation reward: 8.04
episode: 690   score: 5.0  epsilon: 1.0    steps: 816  evaluation reward: 8.01
Training network. lr: 0.000244. clip: 0.097549
Iteration 814: Policy loss: 0.008319. Value loss: 0.424509. Entro

episode: 723   score: 15.0  epsilon: 1.0    steps: 280  evaluation reward: 8.06
episode: 724   score: 9.0  epsilon: 1.0    steps: 576  evaluation reward: 8.12
Training network. lr: 0.000243. clip: 0.097392
Iteration 865: Policy loss: 0.010822. Value loss: 0.408522. Entropy: 0.946239.
Iteration 866: Policy loss: -0.009088. Value loss: 0.248609. Entropy: 0.962762.
Iteration 867: Policy loss: -0.021831. Value loss: 0.181634. Entropy: 0.956745.
episode: 725   score: 9.0  epsilon: 1.0    steps: 432  evaluation reward: 8.08
Training network. lr: 0.000243. clip: 0.097392
Iteration 868: Policy loss: 0.010106. Value loss: 0.355103. Entropy: 0.911203.
Iteration 869: Policy loss: -0.006777. Value loss: 0.139914. Entropy: 0.930746.
Iteration 870: Policy loss: -0.020767. Value loss: 0.091713. Entropy: 0.914166.
episode: 726   score: 4.0  epsilon: 1.0    steps: 184  evaluation reward: 8.04
Training network. lr: 0.000243. clip: 0.097392
Iteration 871: Policy loss: 0.008415. Value loss: 0.312662. Entr

Iteration 924: Policy loss: -0.026774. Value loss: 0.102142. Entropy: 0.857549.
episode: 755   score: 12.0  epsilon: 1.0    steps: 456  evaluation reward: 8.53
episode: 756   score: 7.0  epsilon: 1.0    steps: 696  evaluation reward: 8.55
Training network. lr: 0.000243. clip: 0.097244
Iteration 925: Policy loss: 0.006055. Value loss: 0.233318. Entropy: 0.851122.
Iteration 926: Policy loss: -0.017909. Value loss: 0.108400. Entropy: 0.858987.
Iteration 927: Policy loss: -0.027862. Value loss: 0.057239. Entropy: 0.858833.
episode: 757   score: 8.0  epsilon: 1.0    steps: 488  evaluation reward: 8.55
episode: 758   score: 11.0  epsilon: 1.0    steps: 544  evaluation reward: 8.58
episode: 759   score: 7.0  epsilon: 1.0    steps: 1000  evaluation reward: 8.54
Training network. lr: 0.000243. clip: 0.097244
Iteration 928: Policy loss: 0.013225. Value loss: 0.290612. Entropy: 0.869187.
Iteration 929: Policy loss: -0.011103. Value loss: 0.124348. Entropy: 0.880308.
Iteration 930: Policy loss: -0

Training network. lr: 0.000243. clip: 0.097088
Iteration 982: Policy loss: 0.004105. Value loss: 0.324327. Entropy: 0.978900.
Iteration 983: Policy loss: -0.011882. Value loss: 0.123388. Entropy: 0.975132.
Iteration 984: Policy loss: -0.027810. Value loss: 0.071406. Entropy: 0.980858.
episode: 789   score: 8.0  epsilon: 1.0    steps: 896  evaluation reward: 8.6
Training network. lr: 0.000243. clip: 0.097088
Iteration 985: Policy loss: 0.006139. Value loss: 0.365846. Entropy: 1.005429.
Iteration 986: Policy loss: -0.011320. Value loss: 0.188579. Entropy: 1.005322.
Iteration 987: Policy loss: -0.022325. Value loss: 0.119578. Entropy: 1.008986.
episode: 790   score: 15.0  epsilon: 1.0    steps: 88  evaluation reward: 8.7
episode: 791   score: 17.0  epsilon: 1.0    steps: 648  evaluation reward: 8.8
episode: 792   score: 10.0  epsilon: 1.0    steps: 872  evaluation reward: 8.85
Training network. lr: 0.000243. clip: 0.097088
Iteration 988: Policy loss: 0.009538. Value loss: 0.343763. Entrop

Iteration 1040: Policy loss: -0.013032. Value loss: 0.075764. Entropy: 0.875184.
Iteration 1041: Policy loss: -0.024277. Value loss: 0.049007. Entropy: 0.884646.
Training network. lr: 0.000242. clip: 0.096931
Iteration 1042: Policy loss: 0.008416. Value loss: 0.671047. Entropy: 0.931283.
Iteration 1043: Policy loss: 0.000787. Value loss: 0.322875. Entropy: 0.921815.
Iteration 1044: Policy loss: -0.012139. Value loss: 0.228717. Entropy: 0.929457.
episode: 822   score: 14.0  epsilon: 1.0    steps: 304  evaluation reward: 9.44
Training network. lr: 0.000242. clip: 0.096931
Iteration 1045: Policy loss: 0.006323. Value loss: 0.498190. Entropy: 0.938171.
Iteration 1046: Policy loss: -0.007857. Value loss: 0.201767. Entropy: 0.936110.
Iteration 1047: Policy loss: -0.019002. Value loss: 0.113329. Entropy: 0.920014.
episode: 823   score: 16.0  epsilon: 1.0    steps: 424  evaluation reward: 9.45
episode: 824   score: 10.0  epsilon: 1.0    steps: 576  evaluation reward: 9.46
Training network. lr:

Iteration 1099: Policy loss: 0.005590. Value loss: 0.215687. Entropy: 0.898543.
Iteration 1100: Policy loss: -0.009642. Value loss: 0.113661. Entropy: 0.897153.
Iteration 1101: Policy loss: -0.026488. Value loss: 0.073727. Entropy: 0.891273.
episode: 853   score: 5.0  epsilon: 1.0    steps: 80  evaluation reward: 9.73
episode: 854   score: 4.0  epsilon: 1.0    steps: 512  evaluation reward: 9.65
episode: 855   score: 10.0  epsilon: 1.0    steps: 616  evaluation reward: 9.63
Training network. lr: 0.000242. clip: 0.096627
Iteration 1102: Policy loss: 0.005229. Value loss: 0.163519. Entropy: 0.831308.
Iteration 1103: Policy loss: -0.013441. Value loss: 0.059883. Entropy: 0.847743.
Iteration 1104: Policy loss: -0.024745. Value loss: 0.039509. Entropy: 0.837201.
episode: 856   score: 9.0  epsilon: 1.0    steps: 48  evaluation reward: 9.65
Training network. lr: 0.000242. clip: 0.096627
Iteration 1105: Policy loss: 0.006416. Value loss: 0.268750. Entropy: 0.880405.
Iteration 1106: Policy loss

Training network. lr: 0.000241. clip: 0.096470
Iteration 1159: Policy loss: 0.004377. Value loss: 0.192739. Entropy: 0.865663.
Iteration 1160: Policy loss: -0.011746. Value loss: 0.074778. Entropy: 0.877056.
Iteration 1161: Policy loss: -0.022311. Value loss: 0.048618. Entropy: 0.871350.
episode: 884   score: 9.0  epsilon: 1.0    steps: 312  evaluation reward: 10.24
episode: 885   score: 9.0  epsilon: 1.0    steps: 752  evaluation reward: 10.25
Training network. lr: 0.000241. clip: 0.096470
Iteration 1162: Policy loss: 0.004321. Value loss: 0.291606. Entropy: 0.895706.
Iteration 1163: Policy loss: -0.011254. Value loss: 0.134868. Entropy: 0.906427.
Iteration 1164: Policy loss: -0.025813. Value loss: 0.099976. Entropy: 0.898935.
Training network. lr: 0.000241. clip: 0.096470
Iteration 1165: Policy loss: 0.001872. Value loss: 0.408787. Entropy: 0.854197.
Iteration 1166: Policy loss: -0.015736. Value loss: 0.170107. Entropy: 0.838178.
Iteration 1167: Policy loss: -0.024692. Value loss: 0.

episode: 914   score: 12.0  epsilon: 1.0    steps: 200  evaluation reward: 10.36
Training network. lr: 0.000241. clip: 0.096323
Iteration 1219: Policy loss: 0.005353. Value loss: 0.281860. Entropy: 0.964594.
Iteration 1220: Policy loss: -0.006029. Value loss: 0.102638. Entropy: 0.943313.
Iteration 1221: Policy loss: -0.020596. Value loss: 0.068427. Entropy: 0.949444.
Training network. lr: 0.000241. clip: 0.096323
Iteration 1222: Policy loss: 0.012749. Value loss: 0.198580. Entropy: 0.842208.
Iteration 1223: Policy loss: -0.010227. Value loss: 0.087983. Entropy: 0.864899.
Iteration 1224: Policy loss: -0.027452. Value loss: 0.056504. Entropy: 0.857709.
episode: 915   score: 10.0  epsilon: 1.0    steps: 16  evaluation reward: 10.37
episode: 916   score: 6.0  epsilon: 1.0    steps: 112  evaluation reward: 10.36
episode: 917   score: 9.0  epsilon: 1.0    steps: 168  evaluation reward: 10.38
episode: 918   score: 8.0  epsilon: 1.0    steps: 1008  evaluation reward: 10.27
Training network. lr

Training network. lr: 0.000240. clip: 0.096166
Iteration 1279: Policy loss: 0.020509. Value loss: 0.415443. Entropy: 0.824099.
Iteration 1280: Policy loss: -0.007576. Value loss: 0.222829. Entropy: 0.839329.
Iteration 1281: Policy loss: -0.017271. Value loss: 0.150809. Entropy: 0.828868.
episode: 945   score: 14.0  epsilon: 1.0    steps: 400  evaluation reward: 10.49
Training network. lr: 0.000240. clip: 0.096166
Iteration 1282: Policy loss: 0.011866. Value loss: 0.275953. Entropy: 0.892047.
Iteration 1283: Policy loss: -0.006528. Value loss: 0.128170. Entropy: 0.884848.
Iteration 1284: Policy loss: -0.021205. Value loss: 0.072632. Entropy: 0.890934.
episode: 946   score: 9.0  epsilon: 1.0    steps: 448  evaluation reward: 10.48
episode: 947   score: 10.0  epsilon: 1.0    steps: 504  evaluation reward: 10.42
Training network. lr: 0.000240. clip: 0.096166
Iteration 1285: Policy loss: 0.007530. Value loss: 0.457742. Entropy: 0.970403.
Iteration 1286: Policy loss: -0.011525. Value loss: 0

Training network. lr: 0.000240. clip: 0.096009
Iteration 1339: Policy loss: 0.007004. Value loss: 0.391934. Entropy: 0.881328.
Iteration 1340: Policy loss: -0.009637. Value loss: 0.164788. Entropy: 0.875725.
Iteration 1341: Policy loss: -0.019040. Value loss: 0.100977. Entropy: 0.867941.
episode: 976   score: 7.0  epsilon: 1.0    steps: 120  evaluation reward: 10.27
episode: 977   score: 13.0  epsilon: 1.0    steps: 136  evaluation reward: 10.29
episode: 978   score: 12.0  epsilon: 1.0    steps: 536  evaluation reward: 10.32
episode: 979   score: 8.0  epsilon: 1.0    steps: 840  evaluation reward: 10.29
Training network. lr: 0.000240. clip: 0.096009
Iteration 1342: Policy loss: 0.004121. Value loss: 0.214954. Entropy: 0.924905.
Iteration 1343: Policy loss: -0.007326. Value loss: 0.096164. Entropy: 0.924296.
Iteration 1344: Policy loss: -0.021045. Value loss: 0.066329. Entropy: 0.933173.
episode: 980   score: 13.0  epsilon: 1.0    steps: 320  evaluation reward: 10.33
Training network. l

Training network. lr: 0.000240. clip: 0.095862
Iteration 1399: Policy loss: 0.005807. Value loss: 0.453549. Entropy: 0.946991.
Iteration 1400: Policy loss: -0.008924. Value loss: 0.254986. Entropy: 0.953626.
Iteration 1401: Policy loss: -0.023060. Value loss: 0.165376. Entropy: 0.936317.
episode: 1006   score: 16.0  epsilon: 1.0    steps: 88  evaluation reward: 10.61
episode: 1007   score: 12.0  epsilon: 1.0    steps: 264  evaluation reward: 10.65
episode: 1008   score: 9.0  epsilon: 1.0    steps: 856  evaluation reward: 10.69
Training network. lr: 0.000239. clip: 0.095705
Iteration 1402: Policy loss: 0.006443. Value loss: 0.301993. Entropy: 0.848523.
Iteration 1403: Policy loss: -0.012082. Value loss: 0.109209. Entropy: 0.848968.
Iteration 1404: Policy loss: -0.024073. Value loss: 0.067482. Entropy: 0.833815.
Training network. lr: 0.000239. clip: 0.095705
Iteration 1405: Policy loss: 0.005300. Value loss: 0.206262. Entropy: 1.004983.
Iteration 1406: Policy loss: -0.016521. Value loss:

Iteration 1458: Policy loss: -0.023776. Value loss: 0.058087. Entropy: 0.856623.
episode: 1037   score: 10.0  epsilon: 1.0    steps: 760  evaluation reward: 10.73
episode: 1038   score: 10.0  epsilon: 1.0    steps: 768  evaluation reward: 10.73
Training network. lr: 0.000239. clip: 0.095549
Iteration 1459: Policy loss: 0.003428. Value loss: 0.194650. Entropy: 0.887814.
Iteration 1460: Policy loss: -0.016426. Value loss: 0.080036. Entropy: 0.901654.
Iteration 1461: Policy loss: -0.030160. Value loss: 0.046743. Entropy: 0.892899.
Training network. lr: 0.000239. clip: 0.095549
Iteration 1462: Policy loss: 0.008200. Value loss: 0.193633. Entropy: 0.973219.
Iteration 1463: Policy loss: -0.005269. Value loss: 0.079252. Entropy: 0.969751.
Iteration 1464: Policy loss: -0.022365. Value loss: 0.054617. Entropy: 0.961433.
episode: 1039   score: 9.0  epsilon: 1.0    steps: 160  evaluation reward: 10.67
episode: 1040   score: 13.0  epsilon: 1.0    steps: 288  evaluation reward: 10.63
Training netwo

Iteration 1517: Policy loss: -0.015074. Value loss: 0.094223. Entropy: 0.991179.
Iteration 1518: Policy loss: -0.025978. Value loss: 0.061228. Entropy: 0.982353.
episode: 1068   score: 10.0  epsilon: 1.0    steps: 416  evaluation reward: 10.66
episode: 1069   score: 19.0  epsilon: 1.0    steps: 968  evaluation reward: 10.77
Training network. lr: 0.000239. clip: 0.095401
Iteration 1519: Policy loss: 0.005788. Value loss: 0.354369. Entropy: 1.001547.
Iteration 1520: Policy loss: -0.008555. Value loss: 0.131320. Entropy: 0.991732.
Iteration 1521: Policy loss: -0.019634. Value loss: 0.072902. Entropy: 0.985028.
episode: 1070   score: 14.0  epsilon: 1.0    steps: 880  evaluation reward: 10.78
Training network. lr: 0.000239. clip: 0.095401
Iteration 1522: Policy loss: 0.015720. Value loss: 0.209139. Entropy: 0.910844.
Iteration 1523: Policy loss: -0.010920. Value loss: 0.090670. Entropy: 0.901037.
Iteration 1524: Policy loss: -0.026482. Value loss: 0.060028. Entropy: 0.905201.
episode: 1071 

episode: 1096   score: 14.0  epsilon: 1.0    steps: 440  evaluation reward: 11.38
episode: 1097   score: 7.0  epsilon: 1.0    steps: 648  evaluation reward: 11.3
episode: 1098   score: 11.0  epsilon: 1.0    steps: 744  evaluation reward: 11.23
episode: 1099   score: 9.0  epsilon: 1.0    steps: 912  evaluation reward: 11.24
Training network. lr: 0.000238. clip: 0.095245
Iteration 1579: Policy loss: 0.009604. Value loss: 0.529378. Entropy: 0.926107.
Iteration 1580: Policy loss: -0.012790. Value loss: 0.265163. Entropy: 0.915911.
Iteration 1581: Policy loss: -0.015097. Value loss: 0.186672. Entropy: 0.918625.
episode: 1100   score: 21.0  epsilon: 1.0    steps: 312  evaluation reward: 11.33
Training network. lr: 0.000238. clip: 0.095245
Iteration 1582: Policy loss: 0.007395. Value loss: 0.321938. Entropy: 0.871949.
Iteration 1583: Policy loss: -0.007522. Value loss: 0.152280. Entropy: 0.866510.
Iteration 1584: Policy loss: -0.022405. Value loss: 0.107793. Entropy: 0.867844.
now time :  201

episode: 1126   score: 10.0  epsilon: 1.0    steps: 416  evaluation reward: 11.75
Training network. lr: 0.000238. clip: 0.095088
Iteration 1639: Policy loss: 0.007203. Value loss: 0.399707. Entropy: 0.892922.
Iteration 1640: Policy loss: -0.012740. Value loss: 0.184333. Entropy: 0.901279.
Iteration 1641: Policy loss: -0.024325. Value loss: 0.104307. Entropy: 0.899323.
episode: 1127   score: 13.0  epsilon: 1.0    steps: 584  evaluation reward: 11.82
Training network. lr: 0.000238. clip: 0.095088
Iteration 1642: Policy loss: 0.009294. Value loss: 0.240882. Entropy: 0.878542.
Iteration 1643: Policy loss: -0.011169. Value loss: 0.090883. Entropy: 0.879468.
Iteration 1644: Policy loss: -0.021998. Value loss: 0.051691. Entropy: 0.871585.
episode: 1128   score: 10.0  epsilon: 1.0    steps: 704  evaluation reward: 11.78
Training network. lr: 0.000238. clip: 0.095088
Iteration 1645: Policy loss: 0.002869. Value loss: 0.328186. Entropy: 0.972070.
Iteration 1646: Policy loss: -0.017073. Value los

Iteration 1699: Policy loss: 0.014094. Value loss: 0.313079. Entropy: 0.880387.
Iteration 1700: Policy loss: -0.002959. Value loss: 0.131767. Entropy: 0.883337.
Iteration 1701: Policy loss: -0.018538. Value loss: 0.081687. Entropy: 0.892912.
episode: 1155   score: 10.0  epsilon: 1.0    steps: 776  evaluation reward: 12.14
Training network. lr: 0.000237. clip: 0.094784
Iteration 1702: Policy loss: 0.002118. Value loss: 0.219735. Entropy: 0.959242.
Iteration 1703: Policy loss: -0.019113. Value loss: 0.090237. Entropy: 0.958543.
Iteration 1704: Policy loss: -0.027015. Value loss: 0.058220. Entropy: 0.945415.
episode: 1156   score: 11.0  epsilon: 1.0    steps: 344  evaluation reward: 12.15
Training network. lr: 0.000237. clip: 0.094784
Iteration 1705: Policy loss: 0.007133. Value loss: 0.228436. Entropy: 0.935890.
Iteration 1706: Policy loss: -0.012821. Value loss: 0.101094. Entropy: 0.939311.
Iteration 1707: Policy loss: -0.026900. Value loss: 0.069929. Entropy: 0.939920.
episode: 1157   

Iteration 1760: Policy loss: -0.019488. Value loss: 0.074172. Entropy: 0.984953.
Iteration 1761: Policy loss: -0.028290. Value loss: 0.043308. Entropy: 0.974683.
Training network. lr: 0.000237. clip: 0.094627
Iteration 1762: Policy loss: 0.004548. Value loss: 0.164005. Entropy: 0.995727.
Iteration 1763: Policy loss: -0.018252. Value loss: 0.068095. Entropy: 0.992514.
Iteration 1764: Policy loss: -0.030754. Value loss: 0.052994. Entropy: 0.993463.
episode: 1184   score: 10.0  epsilon: 1.0    steps: 632  evaluation reward: 12.06
episode: 1185   score: 12.0  epsilon: 1.0    steps: 792  evaluation reward: 12.09
Training network. lr: 0.000237. clip: 0.094627
Iteration 1765: Policy loss: 0.007321. Value loss: 0.236581. Entropy: 1.031030.
Iteration 1766: Policy loss: -0.007123. Value loss: 0.089358. Entropy: 1.012167.
Iteration 1767: Policy loss: -0.026401. Value loss: 0.055079. Entropy: 1.033027.
episode: 1186   score: 10.0  epsilon: 1.0    steps: 320  evaluation reward: 12.08
episode: 1187 

Iteration 1820: Policy loss: -0.016341. Value loss: 0.144837. Entropy: 0.923727.
Iteration 1821: Policy loss: -0.027876. Value loss: 0.099031. Entropy: 0.932524.
episode: 1214   score: 10.0  epsilon: 1.0    steps: 776  evaluation reward: 11.86
Training network. lr: 0.000236. clip: 0.094480
Iteration 1822: Policy loss: 0.010275. Value loss: 0.321320. Entropy: 0.937263.
Iteration 1823: Policy loss: -0.019473. Value loss: 0.123685. Entropy: 0.929962.
Iteration 1824: Policy loss: -0.031566. Value loss: 0.084031. Entropy: 0.930928.
episode: 1215   score: 7.0  epsilon: 1.0    steps: 376  evaluation reward: 11.8
episode: 1216   score: 10.0  epsilon: 1.0    steps: 440  evaluation reward: 11.77
Training network. lr: 0.000236. clip: 0.094480
Iteration 1825: Policy loss: 0.006564. Value loss: 0.253332. Entropy: 0.921863.
Iteration 1826: Policy loss: -0.013615. Value loss: 0.096205. Entropy: 0.928733.
Iteration 1827: Policy loss: -0.027212. Value loss: 0.055144. Entropy: 0.914977.
episode: 1217   

Iteration 1881: Policy loss: -0.019183. Value loss: 0.084036. Entropy: 0.974525.
episode: 1243   score: 12.0  epsilon: 1.0    steps: 512  evaluation reward: 11.79
episode: 1244   score: 18.0  epsilon: 1.0    steps: 688  evaluation reward: 11.83
Training network. lr: 0.000236. clip: 0.094323
Iteration 1882: Policy loss: 0.009944. Value loss: 0.749979. Entropy: 0.938971.
Iteration 1883: Policy loss: -0.007536. Value loss: 0.378135. Entropy: 0.942299.
Iteration 1884: Policy loss: -0.022273. Value loss: 0.243996. Entropy: 0.931246.
episode: 1245   score: 18.0  epsilon: 1.0    steps: 960  evaluation reward: 11.84
Training network. lr: 0.000236. clip: 0.094323
Iteration 1885: Policy loss: 0.007818. Value loss: 0.448368. Entropy: 0.900951.
Iteration 1886: Policy loss: -0.013128. Value loss: 0.119803. Entropy: 0.915607.
Iteration 1887: Policy loss: -0.021217. Value loss: 0.050723. Entropy: 0.899789.
episode: 1246   score: 15.0  epsilon: 1.0    steps: 304  evaluation reward: 11.85
episode: 1247

Iteration 1941: Policy loss: -0.019117. Value loss: 0.060674. Entropy: 0.864118.
Training network. lr: 0.000235. clip: 0.094166
Iteration 1942: Policy loss: 0.011448. Value loss: 0.242038. Entropy: 0.855808.
Iteration 1943: Policy loss: -0.008880. Value loss: 0.098930. Entropy: 0.861727.
Iteration 1944: Policy loss: -0.021079. Value loss: 0.065529. Entropy: 0.854136.
episode: 1273   score: 16.0  epsilon: 1.0    steps: 176  evaluation reward: 11.57
Training network. lr: 0.000235. clip: 0.094166
Iteration 1945: Policy loss: 0.008053. Value loss: 0.289323. Entropy: 0.938878.
Iteration 1946: Policy loss: -0.016510. Value loss: 0.113620. Entropy: 0.941346.
Iteration 1947: Policy loss: -0.026387. Value loss: 0.079315. Entropy: 0.939838.
episode: 1274   score: 9.0  epsilon: 1.0    steps: 240  evaluation reward: 11.54
Training network. lr: 0.000235. clip: 0.094166
Iteration 1948: Policy loss: 0.006570. Value loss: 0.193970. Entropy: 0.858706.
Iteration 1949: Policy loss: -0.014888. Value loss:

Iteration 2004: Policy loss: -0.024890. Value loss: 0.063668. Entropy: 0.900285.
episode: 1299   score: 14.0  epsilon: 1.0    steps: 440  evaluation reward: 12.2
episode: 1300   score: 11.0  epsilon: 1.0    steps: 848  evaluation reward: 12.24
Training network. lr: 0.000235. clip: 0.093862
Iteration 2005: Policy loss: 0.016901. Value loss: 0.579897. Entropy: 0.957166.
Iteration 2006: Policy loss: -0.002704. Value loss: 0.184676. Entropy: 0.951098.
Iteration 2007: Policy loss: -0.014932. Value loss: 0.110005. Entropy: 0.950915.
now time :  2019-03-06 13:10:50.636281
episode: 1301   score: 10.0  epsilon: 1.0    steps: 416  evaluation reward: 12.23
episode: 1302   score: 15.0  epsilon: 1.0    steps: 480  evaluation reward: 12.21
Training network. lr: 0.000235. clip: 0.093862
Iteration 2008: Policy loss: 0.008262. Value loss: 0.267966. Entropy: 0.971428.
Iteration 2009: Policy loss: -0.012867. Value loss: 0.102500. Entropy: 0.959227.
Iteration 2010: Policy loss: -0.021217. Value loss: 0.05

Iteration 2065: Policy loss: 0.011297. Value loss: 0.288050. Entropy: 0.990432.
Iteration 2066: Policy loss: -0.009187. Value loss: 0.128058. Entropy: 0.970316.
Iteration 2067: Policy loss: -0.028128. Value loss: 0.083383. Entropy: 0.991571.
episode: 1328   score: 12.0  epsilon: 1.0    steps: 984  evaluation reward: 12.65
Training network. lr: 0.000234. clip: 0.093705
Iteration 2068: Policy loss: 0.012309. Value loss: 0.430921. Entropy: 1.003952.
Iteration 2069: Policy loss: -0.006226. Value loss: 0.162561. Entropy: 0.994184.
Iteration 2070: Policy loss: -0.017702. Value loss: 0.092242. Entropy: 0.989265.
episode: 1329   score: 17.0  epsilon: 1.0    steps: 56  evaluation reward: 12.68
episode: 1330   score: 12.0  epsilon: 1.0    steps: 536  evaluation reward: 12.67
Training network. lr: 0.000234. clip: 0.093705
Iteration 2071: Policy loss: 0.001322. Value loss: 0.144513. Entropy: 0.950807.
Iteration 2072: Policy loss: -0.017407. Value loss: 0.063505. Entropy: 0.946189.
Iteration 2073: 

Iteration 2127: Policy loss: -0.027780. Value loss: 0.057506. Entropy: 0.973666.
Training network. lr: 0.000234. clip: 0.093558
Iteration 2128: Policy loss: 0.006338. Value loss: 0.380209. Entropy: 0.956306.
Iteration 2129: Policy loss: -0.013518. Value loss: 0.128799. Entropy: 0.954074.
Iteration 2130: Policy loss: -0.017475. Value loss: 0.064597. Entropy: 0.951406.
episode: 1356   score: 11.0  epsilon: 1.0    steps: 408  evaluation reward: 12.93
episode: 1357   score: 11.0  epsilon: 1.0    steps: 424  evaluation reward: 12.97
Training network. lr: 0.000234. clip: 0.093558
Iteration 2131: Policy loss: 0.015744. Value loss: 0.300083. Entropy: 0.954482.
Iteration 2132: Policy loss: -0.013426. Value loss: 0.155588. Entropy: 0.933046.
Iteration 2133: Policy loss: -0.026384. Value loss: 0.116340. Entropy: 0.934705.
episode: 1358   score: 17.0  epsilon: 1.0    steps: 648  evaluation reward: 13.0
episode: 1359   score: 10.0  epsilon: 1.0    steps: 712  evaluation reward: 13.0
Training networ

Iteration 2190: Policy loss: -0.028783. Value loss: 0.069443. Entropy: 0.940223.
episode: 1383   score: 19.0  epsilon: 1.0    steps: 440  evaluation reward: 13.36
Training network. lr: 0.000234. clip: 0.093401
Iteration 2191: Policy loss: 0.005421. Value loss: 0.493050. Entropy: 0.964396.
Iteration 2192: Policy loss: -0.006249. Value loss: 0.182818. Entropy: 0.960808.
Iteration 2193: Policy loss: -0.018462. Value loss: 0.111066. Entropy: 0.968387.
episode: 1384   score: 13.0  epsilon: 1.0    steps: 264  evaluation reward: 13.41
Training network. lr: 0.000234. clip: 0.093401
Iteration 2194: Policy loss: 0.003230. Value loss: 0.201443. Entropy: 0.955197.
Iteration 2195: Policy loss: -0.016879. Value loss: 0.076741. Entropy: 0.954670.
Iteration 2196: Policy loss: -0.026546. Value loss: 0.044621. Entropy: 0.953244.
episode: 1385   score: 11.0  epsilon: 1.0    steps: 448  evaluation reward: 13.42
episode: 1386   score: 16.0  epsilon: 1.0    steps: 616  evaluation reward: 13.45
episode: 1387

Iteration 2251: Policy loss: 0.008536. Value loss: 0.404933. Entropy: 0.999645.
Iteration 2252: Policy loss: -0.007419. Value loss: 0.157093. Entropy: 0.995257.
Iteration 2253: Policy loss: -0.023836. Value loss: 0.087133. Entropy: 0.991634.
episode: 1411   score: 11.0  epsilon: 1.0    steps: 920  evaluation reward: 13.42
Training network. lr: 0.000233. clip: 0.093097
Iteration 2254: Policy loss: 0.005789. Value loss: 0.459524. Entropy: 1.004807.
Iteration 2255: Policy loss: -0.008276. Value loss: 0.189733. Entropy: 0.999203.
Iteration 2256: Policy loss: -0.022728. Value loss: 0.105492. Entropy: 0.993856.
episode: 1412   score: 12.0  epsilon: 1.0    steps: 248  evaluation reward: 13.35
episode: 1413   score: 14.0  epsilon: 1.0    steps: 360  evaluation reward: 13.37
episode: 1414   score: 16.0  epsilon: 1.0    steps: 840  evaluation reward: 13.36
episode: 1415   score: 15.0  epsilon: 1.0    steps: 888  evaluation reward: 13.37
Training network. lr: 0.000233. clip: 0.093097
Iteration 22

Iteration 2311: Policy loss: 0.003030. Value loss: 0.324171. Entropy: 0.993034.
Iteration 2312: Policy loss: -0.020024. Value loss: 0.151287. Entropy: 0.980640.
Iteration 2313: Policy loss: -0.030158. Value loss: 0.098911. Entropy: 0.978059.
episode: 1441   score: 7.0  epsilon: 1.0    steps: 528  evaluation reward: 13.13
Training network. lr: 0.000232. clip: 0.092941
Iteration 2314: Policy loss: 0.001358. Value loss: 0.210203. Entropy: 0.984172.
Iteration 2315: Policy loss: -0.010619. Value loss: 0.068954. Entropy: 0.983608.
Iteration 2316: Policy loss: -0.027515. Value loss: 0.040975. Entropy: 0.982932.
episode: 1442   score: 14.0  epsilon: 1.0    steps: 520  evaluation reward: 13.16
Training network. lr: 0.000232. clip: 0.092941
Iteration 2317: Policy loss: 0.004017. Value loss: 0.476132. Entropy: 1.015882.
Iteration 2318: Policy loss: -0.006678. Value loss: 0.204829. Entropy: 1.013546.
Iteration 2319: Policy loss: -0.015965. Value loss: 0.134337. Entropy: 1.014585.
episode: 1443   s

episode: 1468   score: 11.0  epsilon: 1.0    steps: 256  evaluation reward: 13.32
Training network. lr: 0.000232. clip: 0.092784
Iteration 2374: Policy loss: 0.006704. Value loss: 0.378905. Entropy: 0.936321.
Iteration 2375: Policy loss: -0.018326. Value loss: 0.152550. Entropy: 0.927119.
Iteration 2376: Policy loss: -0.026819. Value loss: 0.091939. Entropy: 0.934751.
episode: 1469   score: 18.0  epsilon: 1.0    steps: 272  evaluation reward: 13.38
episode: 1470   score: 12.0  epsilon: 1.0    steps: 536  evaluation reward: 13.4
episode: 1471   score: 10.0  epsilon: 1.0    steps: 864  evaluation reward: 13.37
Training network. lr: 0.000232. clip: 0.092784
Iteration 2377: Policy loss: 0.020842. Value loss: 0.417945. Entropy: 0.979231.
Iteration 2378: Policy loss: -0.008545. Value loss: 0.147617. Entropy: 0.952598.
Iteration 2379: Policy loss: -0.017333. Value loss: 0.081430. Entropy: 0.954991.
Training network. lr: 0.000232. clip: 0.092784
Iteration 2380: Policy loss: 0.007890. Value los

Iteration 2434: Policy loss: 0.007802. Value loss: 0.315769. Entropy: 0.961735.
Iteration 2435: Policy loss: -0.018396. Value loss: 0.126520. Entropy: 0.948296.
Iteration 2436: Policy loss: -0.025661. Value loss: 0.082653. Entropy: 0.940174.
Training network. lr: 0.000232. clip: 0.092636
Iteration 2437: Policy loss: 0.005981. Value loss: 0.266731. Entropy: 1.007636.
Iteration 2438: Policy loss: -0.015305. Value loss: 0.108564. Entropy: 0.998221.
Iteration 2439: Policy loss: -0.030490. Value loss: 0.068534. Entropy: 1.003260.
episode: 1497   score: 10.0  epsilon: 1.0    steps: 112  evaluation reward: 13.42
Training network. lr: 0.000232. clip: 0.092636
Iteration 2440: Policy loss: 0.009011. Value loss: 0.324503. Entropy: 1.003050.
Iteration 2441: Policy loss: -0.018119. Value loss: 0.134891. Entropy: 0.992400.
Iteration 2442: Policy loss: -0.028587. Value loss: 0.078033. Entropy: 0.981605.
Training network. lr: 0.000232. clip: 0.092636
Iteration 2443: Policy loss: 0.015900. Value loss: 

Iteration 2498: Policy loss: -0.009785. Value loss: 0.270903. Entropy: 0.982279.
Iteration 2499: Policy loss: -0.017802. Value loss: 0.177477. Entropy: 0.973957.
episode: 1522   score: 19.0  epsilon: 1.0    steps: 704  evaluation reward: 13.96
episode: 1523   score: 12.0  epsilon: 1.0    steps: 720  evaluation reward: 13.99
episode: 1524   score: 14.0  epsilon: 1.0    steps: 1016  evaluation reward: 13.99
Training network. lr: 0.000231. clip: 0.092480
Iteration 2500: Policy loss: 0.009664. Value loss: 0.325823. Entropy: 0.968622.
Iteration 2501: Policy loss: -0.014209. Value loss: 0.123984. Entropy: 0.975544.
Iteration 2502: Policy loss: -0.029978. Value loss: 0.065356. Entropy: 0.967041.
episode: 1525   score: 15.0  epsilon: 1.0    steps: 216  evaluation reward: 14.03
episode: 1526   score: 14.0  epsilon: 1.0    steps: 248  evaluation reward: 14.04
episode: 1527   score: 6.0  epsilon: 1.0    steps: 720  evaluation reward: 13.93
Training network. lr: 0.000231. clip: 0.092323
Iteration 

episode: 1550   score: 13.0  epsilon: 1.0    steps: 624  evaluation reward: 14.31
Training network. lr: 0.000230. clip: 0.092176
Iteration 2560: Policy loss: 0.002833. Value loss: 0.303095. Entropy: 1.024130.
Iteration 2561: Policy loss: -0.014050. Value loss: 0.127674. Entropy: 1.007955.
Iteration 2562: Policy loss: -0.026187. Value loss: 0.081466. Entropy: 1.012031.
now time :  2019-03-06 13:22:46.956943
episode: 1551   score: 22.0  epsilon: 1.0    steps: 432  evaluation reward: 14.37
Training network. lr: 0.000230. clip: 0.092176
Iteration 2563: Policy loss: 0.008660. Value loss: 0.376915. Entropy: 0.983708.
Iteration 2564: Policy loss: -0.007050. Value loss: 0.188357. Entropy: 0.976421.
Iteration 2565: Policy loss: -0.024596. Value loss: 0.116538. Entropy: 0.974557.
episode: 1552   score: 17.0  epsilon: 1.0    steps: 896  evaluation reward: 14.43
Training network. lr: 0.000230. clip: 0.092176
Iteration 2566: Policy loss: 0.006787. Value loss: 0.578530. Entropy: 0.981319.
Iteration 

Training network. lr: 0.000230. clip: 0.092019
Iteration 2623: Policy loss: 0.002520. Value loss: 0.434515. Entropy: 0.943364.
Iteration 2624: Policy loss: -0.015520. Value loss: 0.200766. Entropy: 0.941901.
Iteration 2625: Policy loss: -0.025825. Value loss: 0.125201. Entropy: 0.932680.
episode: 1576   score: 23.0  epsilon: 1.0    steps: 688  evaluation reward: 14.95
episode: 1577   score: 8.0  epsilon: 1.0    steps: 920  evaluation reward: 14.92
Training network. lr: 0.000230. clip: 0.092019
Iteration 2626: Policy loss: 0.014530. Value loss: 0.647370. Entropy: 1.005454.
Iteration 2627: Policy loss: -0.003901. Value loss: 0.279019. Entropy: 1.008839.
Iteration 2628: Policy loss: -0.019330. Value loss: 0.149249. Entropy: 0.995072.
episode: 1578   score: 12.0  epsilon: 1.0    steps: 296  evaluation reward: 14.9
Training network. lr: 0.000230. clip: 0.092019
Iteration 2629: Policy loss: 0.011149. Value loss: 0.392473. Entropy: 0.956267.
Iteration 2630: Policy loss: -0.017153. Value loss:

Training network. lr: 0.000230. clip: 0.091862
Iteration 2686: Policy loss: 0.005594. Value loss: 0.315412. Entropy: 0.960151.
Iteration 2687: Policy loss: -0.004424. Value loss: 0.091881. Entropy: 0.962539.
Iteration 2688: Policy loss: -0.017757. Value loss: 0.049319. Entropy: 0.963804.
episode: 1602   score: 10.0  epsilon: 1.0    steps: 272  evaluation reward: 15.04
episode: 1603   score: 12.0  epsilon: 1.0    steps: 384  evaluation reward: 14.99
episode: 1604   score: 13.0  epsilon: 1.0    steps: 576  evaluation reward: 14.93
Training network. lr: 0.000230. clip: 0.091862
Iteration 2689: Policy loss: 0.009702. Value loss: 0.245472. Entropy: 0.963877.
Iteration 2690: Policy loss: -0.013791. Value loss: 0.105485. Entropy: 0.962312.
Iteration 2691: Policy loss: -0.026143. Value loss: 0.073039. Entropy: 0.958318.
episode: 1605   score: 18.0  epsilon: 1.0    steps: 208  evaluation reward: 14.88
Training network. lr: 0.000230. clip: 0.091862
Iteration 2692: Policy loss: 0.003943. Value lo

Iteration 2750: Policy loss: -0.009208. Value loss: 0.081969. Entropy: 1.003793.
Iteration 2751: Policy loss: -0.025351. Value loss: 0.043235. Entropy: 1.005085.
episode: 1627   score: 16.0  epsilon: 1.0    steps: 240  evaluation reward: 15.29
episode: 1628   score: 13.0  epsilon: 1.0    steps: 520  evaluation reward: 15.22
Training network. lr: 0.000229. clip: 0.091558
Iteration 2752: Policy loss: 0.010016. Value loss: 0.207708. Entropy: 0.984071.
Iteration 2753: Policy loss: -0.014470. Value loss: 0.086537. Entropy: 0.995929.
Iteration 2754: Policy loss: -0.028461. Value loss: 0.063618. Entropy: 0.999630.
episode: 1629   score: 16.0  epsilon: 1.0    steps: 464  evaluation reward: 15.27
episode: 1630   score: 19.0  epsilon: 1.0    steps: 936  evaluation reward: 15.33
Training network. lr: 0.000229. clip: 0.091558
Iteration 2755: Policy loss: 0.008493. Value loss: 0.321952. Entropy: 0.999318.
Iteration 2756: Policy loss: -0.010182. Value loss: 0.117193. Entropy: 0.995548.
Iteration 275

Training network. lr: 0.000229. clip: 0.091401
Iteration 2815: Policy loss: 0.006575. Value loss: 0.404810. Entropy: 0.920530.
Iteration 2816: Policy loss: -0.005249. Value loss: 0.165931. Entropy: 0.931600.
Iteration 2817: Policy loss: -0.022544. Value loss: 0.089741. Entropy: 0.931034.
episode: 1652   score: 14.0  epsilon: 1.0    steps: 344  evaluation reward: 15.96
episode: 1653   score: 15.0  epsilon: 1.0    steps: 720  evaluation reward: 15.98
Training network. lr: 0.000229. clip: 0.091401
Iteration 2818: Policy loss: 0.007616. Value loss: 0.525238. Entropy: 0.976810.
Iteration 2819: Policy loss: -0.001002. Value loss: 0.232646. Entropy: 0.979263.
Iteration 2820: Policy loss: -0.014816. Value loss: 0.118037. Entropy: 0.981658.
Training network. lr: 0.000229. clip: 0.091401
Iteration 2821: Policy loss: 0.006997. Value loss: 0.463742. Entropy: 0.920814.
Iteration 2822: Policy loss: -0.003525. Value loss: 0.181436. Entropy: 0.900965.
Iteration 2823: Policy loss: -0.018273. Value loss

Iteration 2877: Policy loss: -0.027395. Value loss: 0.051987. Entropy: 0.967799.
Training network. lr: 0.000228. clip: 0.091254
Iteration 2878: Policy loss: 0.002675. Value loss: 0.237810. Entropy: 0.947212.
Iteration 2879: Policy loss: -0.011990. Value loss: 0.089712. Entropy: 0.951279.
Iteration 2880: Policy loss: -0.025306. Value loss: 0.043594. Entropy: 0.946636.
episode: 1679   score: 11.0  epsilon: 1.0    steps: 480  evaluation reward: 15.81
episode: 1680   score: 11.0  epsilon: 1.0    steps: 1008  evaluation reward: 15.78
Training network. lr: 0.000228. clip: 0.091254
Iteration 2881: Policy loss: 0.007846. Value loss: 0.538031. Entropy: 0.983177.
Iteration 2882: Policy loss: -0.006832. Value loss: 0.224617. Entropy: 0.996897.
Iteration 2883: Policy loss: -0.015650. Value loss: 0.101724. Entropy: 0.999264.
episode: 1681   score: 16.0  epsilon: 1.0    steps: 784  evaluation reward: 15.78
episode: 1682   score: 17.0  epsilon: 1.0    steps: 888  evaluation reward: 15.84
Training net

Iteration 2940: Policy loss: -0.010410. Value loss: 0.074226. Entropy: 0.958802.
episode: 1705   score: 17.0  epsilon: 1.0    steps: 528  evaluation reward: 15.62
episode: 1706   score: 20.0  epsilon: 1.0    steps: 776  evaluation reward: 15.73
Training network. lr: 0.000228. clip: 0.091097
Iteration 2941: Policy loss: 0.006354. Value loss: 0.431884. Entropy: 0.951040.
Iteration 2942: Policy loss: -0.012808. Value loss: 0.138571. Entropy: 0.934056.
Iteration 2943: Policy loss: -0.021711. Value loss: 0.071569. Entropy: 0.926040.
episode: 1707   score: 11.0  epsilon: 1.0    steps: 168  evaluation reward: 15.74
Training network. lr: 0.000228. clip: 0.091097
Iteration 2944: Policy loss: 0.012244. Value loss: 0.200730. Entropy: 1.000986.
Iteration 2945: Policy loss: -0.015354. Value loss: 0.065380. Entropy: 0.988558.
Iteration 2946: Policy loss: -0.029380. Value loss: 0.038649. Entropy: 0.988390.
episode: 1708   score: 20.0  epsilon: 1.0    steps: 904  evaluation reward: 15.81
Training netw

Training network. lr: 0.000227. clip: 0.090793
Iteration 3004: Policy loss: 0.005679. Value loss: 0.366318. Entropy: 1.011024.
Iteration 3005: Policy loss: -0.011233. Value loss: 0.140017. Entropy: 1.007031.
Iteration 3006: Policy loss: -0.023599. Value loss: 0.081643. Entropy: 1.018013.
episode: 1731   score: 25.0  epsilon: 1.0    steps: 144  evaluation reward: 15.95
episode: 1732   score: 17.0  epsilon: 1.0    steps: 560  evaluation reward: 16.0
Training network. lr: 0.000227. clip: 0.090793
Iteration 3007: Policy loss: 0.008029. Value loss: 0.542695. Entropy: 0.921114.
Iteration 3008: Policy loss: -0.011444. Value loss: 0.191048. Entropy: 0.922637.
Iteration 3009: Policy loss: -0.017682. Value loss: 0.088894. Entropy: 0.914391.
Training network. lr: 0.000227. clip: 0.090793
Iteration 3010: Policy loss: 0.011244. Value loss: 0.448879. Entropy: 0.992171.
Iteration 3011: Policy loss: -0.012237. Value loss: 0.224424. Entropy: 0.983006.
Iteration 3012: Policy loss: -0.024276. Value loss:

Iteration 3067: Policy loss: 0.013318. Value loss: 0.302992. Entropy: 0.937832.
Iteration 3068: Policy loss: -0.016776. Value loss: 0.115078. Entropy: 0.931359.
Iteration 3069: Policy loss: -0.027425. Value loss: 0.061821. Entropy: 0.937585.
episode: 1756   score: 15.0  epsilon: 1.0    steps: 928  evaluation reward: 15.82
Training network. lr: 0.000227. clip: 0.090637
Iteration 3070: Policy loss: 0.011804. Value loss: 0.328916. Entropy: 0.996334.
Iteration 3071: Policy loss: -0.014153. Value loss: 0.133599. Entropy: 1.000337.
Iteration 3072: Policy loss: -0.023729. Value loss: 0.080072. Entropy: 0.997341.
episode: 1757   score: 16.0  epsilon: 1.0    steps: 48  evaluation reward: 15.72
episode: 1758   score: 21.0  epsilon: 1.0    steps: 440  evaluation reward: 15.78
Training network. lr: 0.000227. clip: 0.090637
Iteration 3073: Policy loss: 0.009119. Value loss: 0.506576. Entropy: 0.979036.
Iteration 3074: Policy loss: -0.010379. Value loss: 0.233189. Entropy: 0.983949.
Iteration 3075: 

Iteration 3130: Policy loss: 0.008522. Value loss: 0.279534. Entropy: 0.962633.
Iteration 3131: Policy loss: -0.016590. Value loss: 0.113188. Entropy: 0.957533.
Iteration 3132: Policy loss: -0.027277. Value loss: 0.064233. Entropy: 0.951689.
Training network. lr: 0.000226. clip: 0.090480
Iteration 3133: Policy loss: 0.004894. Value loss: 0.660686. Entropy: 0.917975.
Iteration 3134: Policy loss: -0.008839. Value loss: 0.268759. Entropy: 0.916193.
Iteration 3135: Policy loss: -0.015835. Value loss: 0.121830. Entropy: 0.927228.
episode: 1782   score: 13.0  epsilon: 1.0    steps: 152  evaluation reward: 16.28
episode: 1783   score: 16.0  epsilon: 1.0    steps: 704  evaluation reward: 16.31
Training network. lr: 0.000226. clip: 0.090480
Iteration 3136: Policy loss: 0.005742. Value loss: 0.301512. Entropy: 0.889556.
Iteration 3137: Policy loss: -0.013402. Value loss: 0.111083. Entropy: 0.907054.
Iteration 3138: Policy loss: -0.027208. Value loss: 0.071651. Entropy: 0.911626.
Training network

Iteration 3195: Policy loss: -0.024154. Value loss: 0.068238. Entropy: 0.881082.
episode: 1806   score: 6.0  epsilon: 1.0    steps: 968  evaluation reward: 16.61
episode: 1807   score: 18.0  epsilon: 1.0    steps: 1024  evaluation reward: 16.68
Training network. lr: 0.000226. clip: 0.090332
Iteration 3196: Policy loss: 0.010503. Value loss: 0.482039. Entropy: 0.939395.
Iteration 3197: Policy loss: -0.001027. Value loss: 0.179233. Entropy: 0.925121.
Iteration 3198: Policy loss: -0.020756. Value loss: 0.106850. Entropy: 0.926515.
episode: 1808   score: 14.0  epsilon: 1.0    steps: 768  evaluation reward: 16.62
Training network. lr: 0.000226. clip: 0.090332
Iteration 3199: Policy loss: 0.011011. Value loss: 0.409375. Entropy: 0.902911.
Iteration 3200: Policy loss: -0.002642. Value loss: 0.164576. Entropy: 0.908808.
Iteration 3201: Policy loss: -0.024487. Value loss: 0.087513. Entropy: 0.905635.
episode: 1809   score: 17.0  epsilon: 1.0    steps: 440  evaluation reward: 16.6
Training netwo

Iteration 3258: Policy loss: -0.028048. Value loss: 0.054315. Entropy: 0.890603.
episode: 1833   score: 20.0  epsilon: 1.0    steps: 72  evaluation reward: 16.28
Training network. lr: 0.000225. clip: 0.090019
Iteration 3259: Policy loss: 0.011364. Value loss: 0.264771. Entropy: 0.816774.
Iteration 3260: Policy loss: -0.009133. Value loss: 0.115046. Entropy: 0.818856.
Iteration 3261: Policy loss: -0.023851. Value loss: 0.079698. Entropy: 0.815225.
episode: 1834   score: 18.0  epsilon: 1.0    steps: 80  evaluation reward: 16.2
Training network. lr: 0.000225. clip: 0.090019
Iteration 3262: Policy loss: 0.013898. Value loss: 0.306786. Entropy: 0.899282.
Iteration 3263: Policy loss: -0.010358. Value loss: 0.136033. Entropy: 0.889879.
Iteration 3264: Policy loss: -0.022720. Value loss: 0.088593. Entropy: 0.886395.
episode: 1835   score: 12.0  epsilon: 1.0    steps: 288  evaluation reward: 16.08
Training network. lr: 0.000225. clip: 0.090019
Iteration 3265: Policy loss: 0.014928. Value loss: 

episode: 1858   score: 24.0  epsilon: 1.0    steps: 152  evaluation reward: 15.54
Training network. lr: 0.000225. clip: 0.089872
Iteration 3322: Policy loss: 0.004273. Value loss: 0.672447. Entropy: 0.885250.
Iteration 3323: Policy loss: -0.006428. Value loss: 0.306428. Entropy: 0.870292.
Iteration 3324: Policy loss: -0.019787. Value loss: 0.179931. Entropy: 0.864391.
episode: 1859   score: 7.0  epsilon: 1.0    steps: 152  evaluation reward: 15.5
episode: 1860   score: 19.0  epsilon: 1.0    steps: 296  evaluation reward: 15.51
episode: 1861   score: 10.0  epsilon: 1.0    steps: 568  evaluation reward: 15.47
Training network. lr: 0.000225. clip: 0.089872
Iteration 3325: Policy loss: 0.003395. Value loss: 0.256394. Entropy: 0.858848.
Iteration 3326: Policy loss: -0.013185. Value loss: 0.091088. Entropy: 0.859863.
Iteration 3327: Policy loss: -0.024380. Value loss: 0.056772. Entropy: 0.853177.
episode: 1862   score: 16.0  epsilon: 1.0    steps: 392  evaluation reward: 15.48
episode: 1863 

episode: 1884   score: 22.0  epsilon: 1.0    steps: 928  evaluation reward: 15.65
Training network. lr: 0.000224. clip: 0.089715
Iteration 3385: Policy loss: 0.012684. Value loss: 0.580624. Entropy: 0.818608.
Iteration 3386: Policy loss: -0.010780. Value loss: 0.286041. Entropy: 0.820897.
Iteration 3387: Policy loss: -0.018204. Value loss: 0.177500. Entropy: 0.819639.
episode: 1885   score: 12.0  epsilon: 1.0    steps: 80  evaluation reward: 15.5
Training network. lr: 0.000224. clip: 0.089715
Iteration 3388: Policy loss: 0.004565. Value loss: 0.301538. Entropy: 0.839974.
Iteration 3389: Policy loss: -0.010758. Value loss: 0.125932. Entropy: 0.835253.
Iteration 3390: Policy loss: -0.022759. Value loss: 0.082180. Entropy: 0.831830.
episode: 1886   score: 18.0  epsilon: 1.0    steps: 640  evaluation reward: 15.54
episode: 1887   score: 18.0  epsilon: 1.0    steps: 856  evaluation reward: 15.6
Training network. lr: 0.000224. clip: 0.089715
Iteration 3391: Policy loss: 0.012242. Value loss:

Iteration 3449: Policy loss: 0.000021. Value loss: 0.232204. Entropy: 0.759768.
Iteration 3450: Policy loss: -0.015944. Value loss: 0.147373. Entropy: 0.768426.
episode: 1908   score: 28.0  epsilon: 1.0    steps: 184  evaluation reward: 16.14
episode: 1909   score: 16.0  epsilon: 1.0    steps: 640  evaluation reward: 16.13
episode: 1910   score: 12.0  epsilon: 1.0    steps: 832  evaluation reward: 16.11
Training network. lr: 0.000224. clip: 0.089411
Iteration 3451: Policy loss: 0.008998. Value loss: 0.335356. Entropy: 0.810232.
Iteration 3452: Policy loss: -0.013450. Value loss: 0.144948. Entropy: 0.804763.
Iteration 3453: Policy loss: -0.026613. Value loss: 0.084982. Entropy: 0.794584.
Training network. lr: 0.000224. clip: 0.089411
Iteration 3454: Policy loss: 0.009853. Value loss: 0.416856. Entropy: 0.829799.
Iteration 3455: Policy loss: -0.009236. Value loss: 0.170147. Entropy: 0.826939.
Iteration 3456: Policy loss: -0.017917. Value loss: 0.097769. Entropy: 0.832024.
episode: 1911  

Iteration 3513: Policy loss: -0.022602. Value loss: 0.121665. Entropy: 0.928949.
Training network. lr: 0.000223. clip: 0.089254
Iteration 3514: Policy loss: 0.003150. Value loss: 0.353288. Entropy: 0.923616.
Iteration 3515: Policy loss: -0.015113. Value loss: 0.125434. Entropy: 0.908834.
Iteration 3516: Policy loss: -0.027544. Value loss: 0.080917. Entropy: 0.913477.
episode: 1933   score: 7.0  epsilon: 1.0    steps: 568  evaluation reward: 16.6
Training network. lr: 0.000223. clip: 0.089254
Iteration 3517: Policy loss: 0.008045. Value loss: 0.398451. Entropy: 0.916299.
Iteration 3518: Policy loss: -0.013635. Value loss: 0.142317. Entropy: 0.899250.
Iteration 3519: Policy loss: -0.024113. Value loss: 0.103454. Entropy: 0.912974.
episode: 1934   score: 14.0  epsilon: 1.0    steps: 200  evaluation reward: 16.56
episode: 1935   score: 23.0  epsilon: 1.0    steps: 592  evaluation reward: 16.67
episode: 1936   score: 11.0  epsilon: 1.0    steps: 936  evaluation reward: 16.68
Training networ

Iteration 3578: Policy loss: -0.005793. Value loss: 0.216353. Entropy: 0.869292.
Iteration 3579: Policy loss: -0.018776. Value loss: 0.126944. Entropy: 0.868219.
episode: 1956   score: 23.0  epsilon: 1.0    steps: 264  evaluation reward: 17.79
episode: 1957   score: 11.0  epsilon: 1.0    steps: 584  evaluation reward: 17.82
Training network. lr: 0.000223. clip: 0.089097
Iteration 3580: Policy loss: 0.011581. Value loss: 0.604469. Entropy: 0.854712.
Iteration 3581: Policy loss: -0.008425. Value loss: 0.238278. Entropy: 0.848099.
Iteration 3582: Policy loss: -0.018759. Value loss: 0.123264. Entropy: 0.841752.
episode: 1958   score: 24.0  epsilon: 1.0    steps: 896  evaluation reward: 17.82
Training network. lr: 0.000223. clip: 0.089097
Iteration 3583: Policy loss: 0.008650. Value loss: 0.614708. Entropy: 0.847016.
Iteration 3584: Policy loss: -0.008148. Value loss: 0.195859. Entropy: 0.833183.
Iteration 3585: Policy loss: -0.015981. Value loss: 0.112544. Entropy: 0.852746.
Training netwo

episode: 1980   score: 33.0  epsilon: 1.0    steps: 672  evaluation reward: 18.72
Training network. lr: 0.000222. clip: 0.088950
Iteration 3643: Policy loss: 0.013788. Value loss: 0.637524. Entropy: 0.797390.
Iteration 3644: Policy loss: -0.000328. Value loss: 0.253536. Entropy: 0.780453.
Iteration 3645: Policy loss: -0.018046. Value loss: 0.132073. Entropy: 0.779478.
Training network. lr: 0.000222. clip: 0.088950
Iteration 3646: Policy loss: 0.010101. Value loss: 0.502147. Entropy: 0.829994.
Iteration 3647: Policy loss: -0.010323. Value loss: 0.203587. Entropy: 0.835126.
Iteration 3648: Policy loss: -0.020494. Value loss: 0.116537. Entropy: 0.835797.
episode: 1981   score: 22.0  epsilon: 1.0    steps: 896  evaluation reward: 18.84
episode: 1982   score: 15.0  epsilon: 1.0    steps: 1000  evaluation reward: 18.76
Training network. lr: 0.000222. clip: 0.088950
Iteration 3649: Policy loss: 0.006406. Value loss: 0.843912. Entropy: 0.796881.
Iteration 3650: Policy loss: 0.002437. Value los

Iteration 3707: Policy loss: -0.010944. Value loss: 0.173525. Entropy: 0.837394.
Iteration 3708: Policy loss: -0.016686. Value loss: 0.096341. Entropy: 0.842201.
episode: 2004   score: 13.0  epsilon: 1.0    steps: 720  evaluation reward: 18.5
Training network. lr: 0.000222. clip: 0.088637
Iteration 3709: Policy loss: 0.010488. Value loss: 0.598637. Entropy: 0.850326.
Iteration 3710: Policy loss: -0.007601. Value loss: 0.257452. Entropy: 0.847586.
Iteration 3711: Policy loss: -0.019954. Value loss: 0.143889. Entropy: 0.846201.
episode: 2005   score: 15.0  epsilon: 1.0    steps: 152  evaluation reward: 18.55
episode: 2006   score: 20.0  epsilon: 1.0    steps: 440  evaluation reward: 18.57
episode: 2007   score: 18.0  epsilon: 1.0    steps: 664  evaluation reward: 18.66
episode: 2008   score: 11.0  epsilon: 1.0    steps: 936  evaluation reward: 18.49
Training network. lr: 0.000222. clip: 0.088637
Iteration 3712: Policy loss: 0.004013. Value loss: 0.613218. Entropy: 0.891493.
Iteration 371

Training network. lr: 0.000221. clip: 0.088489
Iteration 3769: Policy loss: 0.008580. Value loss: 0.507958. Entropy: 0.870177.
Iteration 3770: Policy loss: -0.007906. Value loss: 0.179094. Entropy: 0.865520.
Iteration 3771: Policy loss: -0.020374. Value loss: 0.103945. Entropy: 0.848276.
Training network. lr: 0.000221. clip: 0.088489
Iteration 3772: Policy loss: 0.011577. Value loss: 0.369504. Entropy: 0.857267.
Iteration 3773: Policy loss: -0.012361. Value loss: 0.143934. Entropy: 0.852940.
Iteration 3774: Policy loss: -0.021045. Value loss: 0.073881. Entropy: 0.848005.
episode: 2032   score: 14.0  epsilon: 1.0    steps: 96  evaluation reward: 18.09
Training network. lr: 0.000221. clip: 0.088489
Iteration 3775: Policy loss: 0.002051. Value loss: 0.277320. Entropy: 0.844962.
Iteration 3776: Policy loss: -0.012425. Value loss: 0.114810. Entropy: 0.848659.
Iteration 3777: Policy loss: -0.027489. Value loss: 0.065414. Entropy: 0.843800.
Training network. lr: 0.000221. clip: 0.088489
Itera

Iteration 3836: Policy loss: -0.004713. Value loss: 0.170513. Entropy: 0.813807.
Iteration 3837: Policy loss: -0.015398. Value loss: 0.083417. Entropy: 0.805668.
episode: 2053   score: 19.0  epsilon: 1.0    steps: 192  evaluation reward: 18.06
Training network. lr: 0.000221. clip: 0.088333
Iteration 3838: Policy loss: 0.013536. Value loss: 0.783276. Entropy: 0.822039.
Iteration 3839: Policy loss: -0.000184. Value loss: 0.288500. Entropy: 0.826431.
Iteration 3840: Policy loss: -0.012430. Value loss: 0.142704. Entropy: 0.832382.
episode: 2054   score: 37.0  epsilon: 1.0    steps: 336  evaluation reward: 18.27
Training network. lr: 0.000221. clip: 0.088333
Iteration 3841: Policy loss: 0.010863. Value loss: 0.592626. Entropy: 0.814467.
Iteration 3842: Policy loss: -0.006975. Value loss: 0.187672. Entropy: 0.811462.
Iteration 3843: Policy loss: -0.018270. Value loss: 0.095494. Entropy: 0.827151.
episode: 2055   score: 23.0  epsilon: 1.0    steps: 456  evaluation reward: 18.27
Training netwo

Iteration 3901: Policy loss: 0.003602. Value loss: 0.335957. Entropy: 0.926542.
Iteration 3902: Policy loss: -0.013245. Value loss: 0.148012. Entropy: 0.923112.
Iteration 3903: Policy loss: -0.029523. Value loss: 0.102310. Entropy: 0.915652.
Training network. lr: 0.000220. clip: 0.088028
Iteration 3904: Policy loss: 0.011417. Value loss: 0.760512. Entropy: 0.878490.
Iteration 3905: Policy loss: 0.002115. Value loss: 0.332130. Entropy: 0.858413.
Iteration 3906: Policy loss: -0.013852. Value loss: 0.177241. Entropy: 0.872460.
episode: 2077   score: 31.0  epsilon: 1.0    steps: 64  evaluation reward: 18.31
episode: 2078   score: 19.0  epsilon: 1.0    steps: 632  evaluation reward: 18.28
episode: 2079   score: 15.0  epsilon: 1.0    steps: 648  evaluation reward: 18.31
Training network. lr: 0.000220. clip: 0.088028
Iteration 3907: Policy loss: 0.012530. Value loss: 0.382505. Entropy: 0.928106.
Iteration 3908: Policy loss: -0.011385. Value loss: 0.113941. Entropy: 0.911995.
Iteration 3909: P

episode: 2101   score: 25.0  epsilon: 1.0    steps: 72  evaluation reward: 18.62
Training network. lr: 0.000220. clip: 0.087872
Iteration 3967: Policy loss: 0.018311. Value loss: 0.390480. Entropy: 0.898930.
Iteration 3968: Policy loss: -0.005855. Value loss: 0.153841. Entropy: 0.908001.
Iteration 3969: Policy loss: -0.019912. Value loss: 0.078982. Entropy: 0.906487.
episode: 2102   score: 18.0  epsilon: 1.0    steps: 144  evaluation reward: 18.6
Training network. lr: 0.000220. clip: 0.087872
Iteration 3970: Policy loss: 0.012260. Value loss: 0.422439. Entropy: 0.926624.
Iteration 3971: Policy loss: -0.011698. Value loss: 0.166865. Entropy: 0.908721.
Iteration 3972: Policy loss: -0.026482. Value loss: 0.102312. Entropy: 0.909436.
episode: 2103   score: 12.0  epsilon: 1.0    steps: 192  evaluation reward: 18.58
episode: 2104   score: 7.0  epsilon: 1.0    steps: 720  evaluation reward: 18.52
Training network. lr: 0.000220. clip: 0.087872
Iteration 3973: Policy loss: 0.012910. Value loss:

episode: 2124   score: 15.0  epsilon: 1.0    steps: 928  evaluation reward: 19.22
Training network. lr: 0.000219. clip: 0.087715
Iteration 4033: Policy loss: 0.007731. Value loss: 0.423205. Entropy: 0.914553.
Iteration 4034: Policy loss: -0.013617. Value loss: 0.162490. Entropy: 0.905874.
Iteration 4035: Policy loss: -0.024701. Value loss: 0.092203. Entropy: 0.912957.
Training network. lr: 0.000219. clip: 0.087715
Iteration 4036: Policy loss: 0.009303. Value loss: 0.530136. Entropy: 0.911894.
Iteration 4037: Policy loss: -0.008916. Value loss: 0.235066. Entropy: 0.911325.
Iteration 4038: Policy loss: -0.022055. Value loss: 0.156655. Entropy: 0.912048.
episode: 2125   score: 25.0  epsilon: 1.0    steps: 96  evaluation reward: 19.24
episode: 2126   score: 3.0  epsilon: 1.0    steps: 160  evaluation reward: 19.06
episode: 2127   score: 25.0  epsilon: 1.0    steps: 760  evaluation reward: 19.09
episode: 2128   score: 19.0  epsilon: 1.0    steps: 760  evaluation reward: 19.19
Training netwo

episode: 2150   score: 13.0  epsilon: 1.0    steps: 504  evaluation reward: 18.5
Training network. lr: 0.000219. clip: 0.087568
Iteration 4096: Policy loss: 0.007344. Value loss: 0.382421. Entropy: 0.836449.
Iteration 4097: Policy loss: -0.007943. Value loss: 0.147614. Entropy: 0.839181.
Iteration 4098: Policy loss: -0.016544. Value loss: 0.082266. Entropy: 0.837705.
now time :  2019-03-06 13:55:32.344455
episode: 2151   score: 15.0  epsilon: 1.0    steps: 56  evaluation reward: 18.39
Training network. lr: 0.000219. clip: 0.087568
Iteration 4099: Policy loss: 0.005754. Value loss: 0.312264. Entropy: 0.730186.
Iteration 4100: Policy loss: -0.004992. Value loss: 0.127430. Entropy: 0.754450.
Iteration 4101: Policy loss: -0.020282. Value loss: 0.060879. Entropy: 0.762127.
episode: 2152   score: 17.0  epsilon: 1.0    steps: 920  evaluation reward: 18.42
Training network. lr: 0.000219. clip: 0.087411
Iteration 4102: Policy loss: 0.007328. Value loss: 0.227062. Entropy: 0.833885.
Iteration 41

Iteration 4158: Policy loss: -0.018877. Value loss: 0.080706. Entropy: 0.884586.
episode: 2177   score: 17.0  epsilon: 1.0    steps: 656  evaluation reward: 17.42
Training network. lr: 0.000218. clip: 0.087254
Iteration 4159: Policy loss: 0.006797. Value loss: 0.258489. Entropy: 0.884616.
Iteration 4160: Policy loss: -0.013113. Value loss: 0.103541. Entropy: 0.891073.
Iteration 4161: Policy loss: -0.024082. Value loss: 0.053762. Entropy: 0.877926.
Training network. lr: 0.000218. clip: 0.087254
Iteration 4162: Policy loss: 0.012696. Value loss: 0.245497. Entropy: 0.885307.
Iteration 4163: Policy loss: -0.012301. Value loss: 0.108517. Entropy: 0.890806.
Iteration 4164: Policy loss: -0.026918. Value loss: 0.061764. Entropy: 0.879811.
Training network. lr: 0.000218. clip: 0.087254
Iteration 4165: Policy loss: 0.009502. Value loss: 0.238903. Entropy: 0.876310.
Iteration 4166: Policy loss: -0.013437. Value loss: 0.110971. Entropy: 0.866215.
Iteration 4167: Policy loss: -0.025788. Value loss:

Iteration 4224: Policy loss: -0.021640. Value loss: 0.165465. Entropy: 0.899705.
episode: 2200   score: 30.0  epsilon: 1.0    steps: 408  evaluation reward: 16.87
Training network. lr: 0.000218. clip: 0.087107
Iteration 4225: Policy loss: 0.011920. Value loss: 0.413327. Entropy: 0.888190.
Iteration 4226: Policy loss: -0.006106. Value loss: 0.149023. Entropy: 0.897965.
Iteration 4227: Policy loss: -0.015860. Value loss: 0.086599. Entropy: 0.905756.
now time :  2019-03-06 13:58:17.977568
episode: 2201   score: 10.0  epsilon: 1.0    steps: 240  evaluation reward: 16.72
Training network. lr: 0.000218. clip: 0.087107
Iteration 4228: Policy loss: 0.005777. Value loss: 0.683861. Entropy: 0.862730.
Iteration 4229: Policy loss: -0.013322. Value loss: 0.211381. Entropy: 0.878692.
Iteration 4230: Policy loss: -0.017892. Value loss: 0.112787. Entropy: 0.866792.
episode: 2202   score: 14.0  epsilon: 1.0    steps: 136  evaluation reward: 16.68
Training network. lr: 0.000218. clip: 0.087107
Iteration

episode: 2221   score: 16.0  epsilon: 1.0    steps: 280  evaluation reward: 17.39
episode: 2222   score: 19.0  epsilon: 1.0    steps: 320  evaluation reward: 17.45
episode: 2223   score: 14.0  epsilon: 1.0    steps: 480  evaluation reward: 17.34
episode: 2224   score: 16.0  epsilon: 1.0    steps: 904  evaluation reward: 17.35
Training network. lr: 0.000217. clip: 0.086950
Iteration 4291: Policy loss: 0.011930. Value loss: 0.699116. Entropy: 0.937307.
Iteration 4292: Policy loss: -0.005668. Value loss: 0.336390. Entropy: 0.930890.
Iteration 4293: Policy loss: -0.017655. Value loss: 0.208395. Entropy: 0.919652.
episode: 2225   score: 17.0  epsilon: 1.0    steps: 920  evaluation reward: 17.27
episode: 2226   score: 20.0  epsilon: 1.0    steps: 944  evaluation reward: 17.44
Training network. lr: 0.000217. clip: 0.086950
Iteration 4294: Policy loss: 0.011717. Value loss: 0.913229. Entropy: 0.891985.
Iteration 4295: Policy loss: -0.002331. Value loss: 0.432983. Entropy: 0.886555.
Iteration 4

Iteration 4355: Policy loss: -0.007114. Value loss: 0.215219. Entropy: 0.865584.
Iteration 4356: Policy loss: -0.016540. Value loss: 0.113842. Entropy: 0.874118.
episode: 2246   score: 22.0  epsilon: 1.0    steps: 856  evaluation reward: 18.14
Training network. lr: 0.000217. clip: 0.086646
Iteration 4357: Policy loss: 0.012799. Value loss: 0.594737. Entropy: 0.898306.
Iteration 4358: Policy loss: -0.006543. Value loss: 0.199964. Entropy: 0.886851.
Iteration 4359: Policy loss: -0.020259. Value loss: 0.105738. Entropy: 0.899941.
Training network. lr: 0.000217. clip: 0.086646
Iteration 4360: Policy loss: 0.008949. Value loss: 0.601198. Entropy: 0.923141.
Iteration 4361: Policy loss: -0.008630. Value loss: 0.247834. Entropy: 0.914181.
Iteration 4362: Policy loss: -0.023603. Value loss: 0.127907. Entropy: 0.909804.
episode: 2247   score: 26.0  epsilon: 1.0    steps: 352  evaluation reward: 18.29
episode: 2248   score: 23.0  epsilon: 1.0    steps: 408  evaluation reward: 18.4
episode: 2249  

Iteration 4422: Policy loss: -0.023852. Value loss: 0.113864. Entropy: 0.916619.
episode: 2268   score: 12.0  epsilon: 1.0    steps: 192  evaluation reward: 19.92
episode: 2269   score: 15.0  epsilon: 1.0    steps: 360  evaluation reward: 19.97
Training network. lr: 0.000216. clip: 0.086489
Iteration 4423: Policy loss: 0.004509. Value loss: 0.648676. Entropy: 1.018124.
Iteration 4424: Policy loss: -0.017291. Value loss: 0.377885. Entropy: 1.031083.
Iteration 4425: Policy loss: -0.023483. Value loss: 0.265003. Entropy: 1.011627.
Training network. lr: 0.000216. clip: 0.086489
Iteration 4426: Policy loss: 0.007404. Value loss: 0.540783. Entropy: 0.928126.
Iteration 4427: Policy loss: -0.012699. Value loss: 0.218992. Entropy: 0.924330.
Iteration 4428: Policy loss: -0.019608. Value loss: 0.121997. Entropy: 0.925551.
episode: 2270   score: 19.0  epsilon: 1.0    steps: 576  evaluation reward: 20.02
Training network. lr: 0.000216. clip: 0.086489
Iteration 4429: Policy loss: 0.010190. Value los

episode: 2293   score: 25.0  epsilon: 1.0    steps: 648  evaluation reward: 19.93
episode: 2294   score: 16.0  epsilon: 1.0    steps: 728  evaluation reward: 19.93
Training network. lr: 0.000216. clip: 0.086333
Iteration 4486: Policy loss: 0.007398. Value loss: 0.398114. Entropy: 0.982060.
Iteration 4487: Policy loss: -0.008758. Value loss: 0.187851. Entropy: 0.977148.
Iteration 4488: Policy loss: -0.021589. Value loss: 0.117515. Entropy: 0.977461.
episode: 2295   score: 20.0  epsilon: 1.0    steps: 608  evaluation reward: 19.99
Training network. lr: 0.000216. clip: 0.086333
Iteration 4489: Policy loss: 0.007265. Value loss: 0.343190. Entropy: 0.966626.
Iteration 4490: Policy loss: -0.017093. Value loss: 0.121656. Entropy: 0.978008.
Iteration 4491: Policy loss: -0.029327. Value loss: 0.069829. Entropy: 0.975393.
Training network. lr: 0.000216. clip: 0.086333
Iteration 4492: Policy loss: 0.012173. Value loss: 0.429650. Entropy: 0.924448.
Iteration 4493: Policy loss: -0.008010. Value los

Training network. lr: 0.000215. clip: 0.086185
Iteration 4549: Policy loss: 0.011027. Value loss: 0.311238. Entropy: 0.891405.
Iteration 4550: Policy loss: -0.009952. Value loss: 0.121542. Entropy: 0.878006.
Iteration 4551: Policy loss: -0.027379. Value loss: 0.069686. Entropy: 0.878808.
episode: 2319   score: 16.0  epsilon: 1.0    steps: 400  evaluation reward: 19.07
Training network. lr: 0.000215. clip: 0.086029
Iteration 4552: Policy loss: 0.003490. Value loss: 0.308421. Entropy: 0.863743.
Iteration 4553: Policy loss: -0.015194. Value loss: 0.095449. Entropy: 0.855518.
Iteration 4554: Policy loss: -0.025123. Value loss: 0.055555. Entropy: 0.868816.
episode: 2320   score: 13.0  epsilon: 1.0    steps: 88  evaluation reward: 18.98
Training network. lr: 0.000215. clip: 0.086029
Iteration 4555: Policy loss: 0.010748. Value loss: 0.282675. Entropy: 0.917488.
Iteration 4556: Policy loss: -0.012646. Value loss: 0.130011. Entropy: 0.925974.
Iteration 4557: Policy loss: -0.028384. Value loss:

Iteration 4615: Policy loss: 0.012111. Value loss: 0.622189. Entropy: 0.923378.
Iteration 4616: Policy loss: -0.000887. Value loss: 0.237237. Entropy: 0.936227.
Iteration 4617: Policy loss: -0.013641. Value loss: 0.118766. Entropy: 0.941806.
episode: 2341   score: 29.0  epsilon: 1.0    steps: 208  evaluation reward: 19.55
episode: 2342   score: 24.0  epsilon: 1.0    steps: 928  evaluation reward: 19.57
Training network. lr: 0.000215. clip: 0.085872
Iteration 4618: Policy loss: 0.009318. Value loss: 0.633787. Entropy: 0.959962.
Iteration 4619: Policy loss: -0.012157. Value loss: 0.254341. Entropy: 0.969355.
Iteration 4620: Policy loss: -0.019986. Value loss: 0.127782. Entropy: 0.962595.
episode: 2343   score: 25.0  epsilon: 1.0    steps: 848  evaluation reward: 19.7
Training network. lr: 0.000215. clip: 0.085872
Iteration 4621: Policy loss: 0.011693. Value loss: 0.462601. Entropy: 0.984547.
Iteration 4622: Policy loss: -0.011467. Value loss: 0.168562. Entropy: 0.990382.
Iteration 4623: 

episode: 2364   score: 15.0  epsilon: 1.0    steps: 312  evaluation reward: 19.09
Training network. lr: 0.000214. clip: 0.085724
Iteration 4681: Policy loss: 0.006044. Value loss: 0.707151. Entropy: 0.910909.
Iteration 4682: Policy loss: -0.012456. Value loss: 0.248986. Entropy: 0.875358.
Iteration 4683: Policy loss: -0.016078. Value loss: 0.116738. Entropy: 0.875807.
episode: 2365   score: 27.0  epsilon: 1.0    steps: 96  evaluation reward: 19.14
episode: 2366   score: 10.0  epsilon: 1.0    steps: 776  evaluation reward: 18.86
episode: 2367   score: 19.0  epsilon: 1.0    steps: 944  evaluation reward: 18.87
Training network. lr: 0.000214. clip: 0.085724
Iteration 4684: Policy loss: 0.015344. Value loss: 0.670428. Entropy: 0.961017.
Iteration 4685: Policy loss: -0.006680. Value loss: 0.288393. Entropy: 0.958806.
Iteration 4686: Policy loss: -0.018669. Value loss: 0.174078. Entropy: 0.961398.
Training network. lr: 0.000214. clip: 0.085724
Iteration 4687: Policy loss: 0.008817. Value los

episode: 2387   score: 29.0  epsilon: 1.0    steps: 832  evaluation reward: 19.8
episode: 2388   score: 28.0  epsilon: 1.0    steps: 896  evaluation reward: 19.98
Training network. lr: 0.000214. clip: 0.085568
Iteration 4747: Policy loss: 0.014525. Value loss: 0.740077. Entropy: 0.948974.
Iteration 4748: Policy loss: 0.001986. Value loss: 0.252045. Entropy: 0.940068.
Iteration 4749: Policy loss: -0.016321. Value loss: 0.160018. Entropy: 0.934889.
episode: 2389   score: 21.0  epsilon: 1.0    steps: 288  evaluation reward: 20.11
Training network. lr: 0.000214. clip: 0.085568
Iteration 4750: Policy loss: 0.011480. Value loss: 0.310633. Entropy: 0.890378.
Iteration 4751: Policy loss: -0.011137. Value loss: 0.148513. Entropy: 0.898951.
Iteration 4752: Policy loss: -0.024287. Value loss: 0.094054. Entropy: 0.881421.
Training network. lr: 0.000214. clip: 0.085411
Iteration 4753: Policy loss: 0.013418. Value loss: 0.510108. Entropy: 0.868202.
Iteration 4754: Policy loss: -0.004395. Value loss:

Training network. lr: 0.000213. clip: 0.085264
Iteration 4813: Policy loss: 0.012455. Value loss: 0.536594. Entropy: 0.886547.
Iteration 4814: Policy loss: -0.012988. Value loss: 0.237828. Entropy: 0.874760.
Iteration 4815: Policy loss: -0.015027. Value loss: 0.165916. Entropy: 0.871585.
Training network. lr: 0.000213. clip: 0.085264
Iteration 4816: Policy loss: 0.018296. Value loss: 0.674858. Entropy: 0.932936.
Iteration 4817: Policy loss: -0.005751. Value loss: 0.362423. Entropy: 0.945775.
Iteration 4818: Policy loss: -0.013202. Value loss: 0.220926. Entropy: 0.950215.
episode: 2409   score: 24.0  epsilon: 1.0    steps: 848  evaluation reward: 21.24
Training network. lr: 0.000213. clip: 0.085264
Iteration 4819: Policy loss: 0.011009. Value loss: 1.331087. Entropy: 0.900307.
Iteration 4820: Policy loss: -0.005470. Value loss: 0.554562. Entropy: 0.885604.
Iteration 4821: Policy loss: -0.016556. Value loss: 0.335889. Entropy: 0.901334.
episode: 2410   score: 36.0  epsilon: 1.0    steps:

Iteration 4880: Policy loss: -0.009774. Value loss: 0.132913. Entropy: 0.904496.
Iteration 4881: Policy loss: -0.020747. Value loss: 0.079762. Entropy: 0.915468.
Training network. lr: 0.000213. clip: 0.085107
Iteration 4882: Policy loss: 0.013687. Value loss: 0.493176. Entropy: 0.938467.
Iteration 4883: Policy loss: -0.001302. Value loss: 0.202909. Entropy: 0.945216.
Iteration 4884: Policy loss: -0.010397. Value loss: 0.097970. Entropy: 0.950008.
episode: 2430   score: 37.0  epsilon: 1.0    steps: 80  evaluation reward: 22.53
episode: 2431   score: 28.0  epsilon: 1.0    steps: 600  evaluation reward: 22.68
episode: 2432   score: 21.0  epsilon: 1.0    steps: 952  evaluation reward: 22.75
Training network. lr: 0.000213. clip: 0.085107
Iteration 4885: Policy loss: 0.010883. Value loss: 0.369148. Entropy: 0.836403.
Iteration 4886: Policy loss: -0.003938. Value loss: 0.132451. Entropy: 0.844248.
Iteration 4887: Policy loss: -0.015702. Value loss: 0.085298. Entropy: 0.845090.
episode: 2433  

Iteration 4945: Policy loss: 0.004801. Value loss: 0.477018. Entropy: 0.927451.
Iteration 4946: Policy loss: -0.010319. Value loss: 0.203131. Entropy: 0.927259.
Iteration 4947: Policy loss: -0.020195. Value loss: 0.103866. Entropy: 0.918910.
Training network. lr: 0.000212. clip: 0.084950
Iteration 4948: Policy loss: 0.007434. Value loss: 0.402222. Entropy: 0.948068.
Iteration 4949: Policy loss: -0.005260. Value loss: 0.134700. Entropy: 0.949468.
Iteration 4950: Policy loss: -0.017570. Value loss: 0.091647. Entropy: 0.958982.
Training network. lr: 0.000212. clip: 0.084803
Iteration 4951: Policy loss: 0.000360. Value loss: 0.683384. Entropy: 0.965307.
Iteration 4952: Policy loss: -0.013070. Value loss: 0.231339. Entropy: 0.955259.
Iteration 4953: Policy loss: -0.022617. Value loss: 0.102558. Entropy: 0.960890.
episode: 2453   score: 22.0  epsilon: 1.0    steps: 16  evaluation reward: 22.72
episode: 2454   score: 22.0  epsilon: 1.0    steps: 320  evaluation reward: 22.81
episode: 2455   s

episode: 2473   score: 22.0  epsilon: 1.0    steps: 992  evaluation reward: 24.01
Training network. lr: 0.000212. clip: 0.084646
Iteration 5014: Policy loss: 0.013622. Value loss: 0.578495. Entropy: 0.950253.
Iteration 5015: Policy loss: -0.011383. Value loss: 0.238840. Entropy: 0.945951.
Iteration 5016: Policy loss: -0.023669. Value loss: 0.136202. Entropy: 0.958443.
episode: 2474   score: 27.0  epsilon: 1.0    steps: 792  evaluation reward: 24.04
Training network. lr: 0.000212. clip: 0.084646
Iteration 5017: Policy loss: 0.012934. Value loss: 0.778497. Entropy: 1.020627.
Iteration 5018: Policy loss: -0.004132. Value loss: 0.383792. Entropy: 1.013403.
Iteration 5019: Policy loss: -0.016288. Value loss: 0.237718. Entropy: 1.017810.
Training network. lr: 0.000212. clip: 0.084646
Iteration 5020: Policy loss: 0.012203. Value loss: 0.448582. Entropy: 0.953318.
Iteration 5021: Policy loss: -0.003933. Value loss: 0.178040. Entropy: 0.932919.
Iteration 5022: Policy loss: -0.019553. Value loss

Training network. lr: 0.000211. clip: 0.084489
Iteration 5080: Policy loss: 0.010568. Value loss: 0.276320. Entropy: 0.901581.
Iteration 5081: Policy loss: -0.009292. Value loss: 0.126499. Entropy: 0.903291.
Iteration 5082: Policy loss: -0.021821. Value loss: 0.071114. Entropy: 0.903928.
Training network. lr: 0.000211. clip: 0.084489
Iteration 5083: Policy loss: 0.012204. Value loss: 0.735614. Entropy: 0.870684.
Iteration 5084: Policy loss: 0.001816. Value loss: 0.266888. Entropy: 0.857053.
Iteration 5085: Policy loss: -0.014442. Value loss: 0.134683. Entropy: 0.858270.
episode: 2496   score: 26.0  epsilon: 1.0    steps: 32  evaluation reward: 24.46
Training network. lr: 0.000211. clip: 0.084489
Iteration 5086: Policy loss: 0.008438. Value loss: 0.511289. Entropy: 0.982765.
Iteration 5087: Policy loss: -0.008395. Value loss: 0.201408. Entropy: 0.977367.
Iteration 5088: Policy loss: -0.014564. Value loss: 0.112327. Entropy: 0.978575.
Training network. lr: 0.000211. clip: 0.084489
Iterat

episode: 2515   score: 38.0  epsilon: 1.0    steps: 504  evaluation reward: 24.14
Training network. lr: 0.000211. clip: 0.084342
Iteration 5149: Policy loss: 0.014005. Value loss: 0.514133. Entropy: 0.998105.
Iteration 5150: Policy loss: -0.008332. Value loss: 0.206620. Entropy: 0.998261.
Iteration 5151: Policy loss: -0.021769. Value loss: 0.114699. Entropy: 0.995289.
episode: 2516   score: 13.0  epsilon: 1.0    steps: 184  evaluation reward: 24.08
Training network. lr: 0.000210. clip: 0.084185
Iteration 5152: Policy loss: 0.013442. Value loss: 0.977485. Entropy: 0.942061.
Iteration 5153: Policy loss: -0.000415. Value loss: 0.369187. Entropy: 0.932627.
Iteration 5154: Policy loss: -0.008708. Value loss: 0.202557. Entropy: 0.933440.
Training network. lr: 0.000210. clip: 0.084185
Iteration 5155: Policy loss: 0.011656. Value loss: 0.715536. Entropy: 0.916498.
Iteration 5156: Policy loss: 0.000893. Value loss: 0.320552. Entropy: 0.909932.
Iteration 5157: Policy loss: -0.006414. Value loss:

Training network. lr: 0.000210. clip: 0.084029
Iteration 5215: Policy loss: 0.004687. Value loss: 0.422507. Entropy: 0.935053.
Iteration 5216: Policy loss: -0.010942. Value loss: 0.167893. Entropy: 0.938646.
Iteration 5217: Policy loss: -0.020960. Value loss: 0.088893. Entropy: 0.925876.
Training network. lr: 0.000210. clip: 0.084029
Iteration 5218: Policy loss: 0.010414. Value loss: 0.424536. Entropy: 0.882848.
Iteration 5219: Policy loss: -0.006049. Value loss: 0.135555. Entropy: 0.875699.
Iteration 5220: Policy loss: -0.016138. Value loss: 0.061133. Entropy: 0.873320.
episode: 2538   score: 39.0  epsilon: 1.0    steps: 784  evaluation reward: 24.54
Training network. lr: 0.000210. clip: 0.084029
Iteration 5221: Policy loss: 0.010913. Value loss: 0.433284. Entropy: 0.897020.
Iteration 5222: Policy loss: -0.009394. Value loss: 0.177795. Entropy: 0.894206.
Iteration 5223: Policy loss: -0.024572. Value loss: 0.099998. Entropy: 0.891431.
episode: 2539   score: 15.0  epsilon: 1.0    steps:

Iteration 5282: Policy loss: -0.005884. Value loss: 0.146389. Entropy: 0.857954.
Iteration 5283: Policy loss: -0.018261. Value loss: 0.089185. Entropy: 0.835902.
Training network. lr: 0.000210. clip: 0.083881
Iteration 5284: Policy loss: 0.006631. Value loss: 0.527846. Entropy: 0.949829.
Iteration 5285: Policy loss: -0.006454. Value loss: 0.187713. Entropy: 0.947789.
Iteration 5286: Policy loss: -0.013116. Value loss: 0.105339. Entropy: 0.955283.
episode: 2559   score: 12.0  epsilon: 1.0    steps: 912  evaluation reward: 24.74
Training network. lr: 0.000210. clip: 0.083881
Iteration 5287: Policy loss: 0.010975. Value loss: 0.704713. Entropy: 0.910279.
Iteration 5288: Policy loss: -0.003416. Value loss: 0.183597. Entropy: 0.916834.
Iteration 5289: Policy loss: -0.015558. Value loss: 0.103632. Entropy: 0.910689.
episode: 2560   score: 27.0  epsilon: 1.0    steps: 344  evaluation reward: 24.74
Training network. lr: 0.000210. clip: 0.083881
Iteration 5290: Policy loss: 0.010544. Value loss

Iteration 5350: Policy loss: 0.013604. Value loss: 0.677520. Entropy: 0.918343.
Iteration 5351: Policy loss: -0.005143. Value loss: 0.330050. Entropy: 0.937192.
Iteration 5352: Policy loss: -0.020380. Value loss: 0.228887. Entropy: 0.930002.
episode: 2579   score: 16.0  epsilon: 1.0    steps: 448  evaluation reward: 24.4
episode: 2580   score: 14.0  epsilon: 1.0    steps: 456  evaluation reward: 24.41
episode: 2581   score: 47.0  epsilon: 1.0    steps: 904  evaluation reward: 24.68
Training network. lr: 0.000209. clip: 0.083568
Iteration 5353: Policy loss: 0.025779. Value loss: 1.340302. Entropy: 0.951573.
Iteration 5354: Policy loss: 0.011379. Value loss: 0.541788. Entropy: 0.949979.
Iteration 5355: Policy loss: -0.003845. Value loss: 0.318727. Entropy: 0.936287.
episode: 2582   score: 21.0  epsilon: 1.0    steps: 456  evaluation reward: 24.62
Training network. lr: 0.000209. clip: 0.083568
Iteration 5356: Policy loss: 0.006075. Value loss: 0.566622. Entropy: 0.840777.
Iteration 5357: 

Training network. lr: 0.000209. clip: 0.083420
Iteration 5416: Policy loss: 0.011112. Value loss: 0.649178. Entropy: 0.910448.
Iteration 5417: Policy loss: -0.004765. Value loss: 0.311443. Entropy: 0.892641.
Iteration 5418: Policy loss: -0.019587. Value loss: 0.180123. Entropy: 0.914656.
episode: 2602   score: 15.0  epsilon: 1.0    steps: 864  evaluation reward: 24.24
Training network. lr: 0.000209. clip: 0.083420
Iteration 5419: Policy loss: 0.006492. Value loss: 0.611225. Entropy: 0.938740.
Iteration 5420: Policy loss: -0.004595. Value loss: 0.213020. Entropy: 0.945541.
Iteration 5421: Policy loss: -0.020553. Value loss: 0.119153. Entropy: 0.950697.
episode: 2603   score: 17.0  epsilon: 1.0    steps: 8  evaluation reward: 24.08
episode: 2604   score: 25.0  epsilon: 1.0    steps: 280  evaluation reward: 24.0
Training network. lr: 0.000209. clip: 0.083420
Iteration 5422: Policy loss: 0.011193. Value loss: 0.266456. Entropy: 0.963243.
Iteration 5423: Policy loss: -0.003407. Value loss: 

Iteration 5484: Policy loss: -0.011222. Value loss: 0.177099. Entropy: 0.988786.
episode: 2622   score: 13.0  epsilon: 1.0    steps: 824  evaluation reward: 24.29
Training network. lr: 0.000208. clip: 0.083264
Iteration 5485: Policy loss: 0.014423. Value loss: 0.789376. Entropy: 0.955460.
Iteration 5486: Policy loss: -0.001861. Value loss: 0.294848. Entropy: 0.948404.
Iteration 5487: Policy loss: -0.015204. Value loss: 0.142317. Entropy: 0.941985.
episode: 2623   score: 22.0  epsilon: 1.0    steps: 248  evaluation reward: 24.23
episode: 2624   score: 27.0  epsilon: 1.0    steps: 904  evaluation reward: 24.26
Training network. lr: 0.000208. clip: 0.083264
Iteration 5488: Policy loss: 0.006940. Value loss: 1.102019. Entropy: 0.970111.
Iteration 5489: Policy loss: -0.005268. Value loss: 0.450778. Entropy: 0.968286.
Iteration 5490: Policy loss: -0.012580. Value loss: 0.248029. Entropy: 0.961506.
episode: 2625   score: 18.0  epsilon: 1.0    steps: 800  evaluation reward: 24.3
episode: 2626 

Iteration 5551: Policy loss: 0.007049. Value loss: 0.281697. Entropy: 0.952197.
Iteration 5552: Policy loss: -0.006692. Value loss: 0.082719. Entropy: 0.952417.
Iteration 5553: Policy loss: -0.022677. Value loss: 0.043540. Entropy: 0.937701.
Training network. lr: 0.000207. clip: 0.082960
Iteration 5554: Policy loss: 0.010062. Value loss: 0.569516. Entropy: 1.036177.
Iteration 5555: Policy loss: -0.008975. Value loss: 0.241825. Entropy: 1.029056.
Iteration 5556: Policy loss: -0.015786. Value loss: 0.133587. Entropy: 1.030980.
episode: 2643   score: 11.0  epsilon: 1.0    steps: 120  evaluation reward: 24.82
episode: 2644   score: 13.0  epsilon: 1.0    steps: 952  evaluation reward: 24.69
Training network. lr: 0.000207. clip: 0.082960
Iteration 5557: Policy loss: 0.000896. Value loss: 0.556331. Entropy: 1.037791.
Iteration 5558: Policy loss: -0.003582. Value loss: 0.211380. Entropy: 1.043259.
Iteration 5559: Policy loss: -0.016003. Value loss: 0.132839. Entropy: 1.036868.
episode: 2645   

Iteration 5616: Policy loss: -0.023301. Value loss: 0.116357. Entropy: 0.940134.
Training network. lr: 0.000207. clip: 0.082803
Iteration 5617: Policy loss: 0.013162. Value loss: 0.302088. Entropy: 0.938868.
Iteration 5618: Policy loss: -0.009167. Value loss: 0.120565. Entropy: 0.933231.
Iteration 5619: Policy loss: -0.019275. Value loss: 0.064609. Entropy: 0.918931.
episode: 2667   score: 15.0  epsilon: 1.0    steps: 24  evaluation reward: 23.18
episode: 2668   score: 19.0  epsilon: 1.0    steps: 40  evaluation reward: 23.03
Training network. lr: 0.000207. clip: 0.082803
Iteration 5620: Policy loss: 0.011730. Value loss: 0.230811. Entropy: 0.885577.
Iteration 5621: Policy loss: -0.011999. Value loss: 0.107760. Entropy: 0.886866.
Iteration 5622: Policy loss: -0.021122. Value loss: 0.067428. Entropy: 0.882493.
Training network. lr: 0.000207. clip: 0.082803
Iteration 5623: Policy loss: 0.011174. Value loss: 0.679705. Entropy: 1.006182.
Iteration 5624: Policy loss: -0.004489. Value loss: 

episode: 2689   score: 21.0  epsilon: 1.0    steps: 1016  evaluation reward: 23.09
Training network. lr: 0.000207. clip: 0.082646
Iteration 5683: Policy loss: 0.008068. Value loss: 0.732481. Entropy: 1.045250.
Iteration 5684: Policy loss: -0.003662. Value loss: 0.287250. Entropy: 1.027311.
Iteration 5685: Policy loss: -0.023566. Value loss: 0.139819. Entropy: 1.036740.
episode: 2690   score: 19.0  epsilon: 1.0    steps: 456  evaluation reward: 22.98
episode: 2691   score: 24.0  epsilon: 1.0    steps: 592  evaluation reward: 22.91
Training network. lr: 0.000207. clip: 0.082646
Iteration 5686: Policy loss: 0.014072. Value loss: 0.784855. Entropy: 0.981632.
Iteration 5687: Policy loss: -0.000316. Value loss: 0.351455. Entropy: 0.958261.
Iteration 5688: Policy loss: -0.011065. Value loss: 0.231054. Entropy: 0.969749.
Training network. lr: 0.000207. clip: 0.082646
Iteration 5689: Policy loss: 0.015657. Value loss: 0.376996. Entropy: 0.862228.
Iteration 5690: Policy loss: -0.007850. Value lo

Training network. lr: 0.000206. clip: 0.082499
Iteration 5749: Policy loss: 0.008352. Value loss: 0.428316. Entropy: 0.935090.
Iteration 5750: Policy loss: -0.005511. Value loss: 0.193479. Entropy: 0.932868.
Iteration 5751: Policy loss: -0.019097. Value loss: 0.106502. Entropy: 0.923642.
episode: 2711   score: 15.0  epsilon: 1.0    steps: 248  evaluation reward: 23.65
Training network. lr: 0.000206. clip: 0.082342
Iteration 5752: Policy loss: 0.007226. Value loss: 0.998216. Entropy: 0.898218.
Iteration 5753: Policy loss: -0.004161. Value loss: 0.314265. Entropy: 0.903094.
Iteration 5754: Policy loss: -0.015209. Value loss: 0.132831. Entropy: 0.914861.
episode: 2712   score: 26.0  epsilon: 1.0    steps: 384  evaluation reward: 23.71
Training network. lr: 0.000206. clip: 0.082342
Iteration 5755: Policy loss: 0.014963. Value loss: 0.765442. Entropy: 0.927341.
Iteration 5756: Policy loss: 0.001256. Value loss: 0.328476. Entropy: 0.932045.
Iteration 5757: Policy loss: -0.017420. Value loss:

episode: 2730   score: 22.0  epsilon: 1.0    steps: 968  evaluation reward: 24.15
Training network. lr: 0.000205. clip: 0.082185
Iteration 5818: Policy loss: 0.018762. Value loss: 0.767001. Entropy: 0.938084.
Iteration 5819: Policy loss: 0.000582. Value loss: 0.361371. Entropy: 0.952686.
Iteration 5820: Policy loss: -0.019246. Value loss: 0.218678. Entropy: 0.935936.
Training network. lr: 0.000205. clip: 0.082185
Iteration 5821: Policy loss: 0.007929. Value loss: 0.717999. Entropy: 0.905859.
Iteration 5822: Policy loss: -0.007731. Value loss: 0.302599. Entropy: 0.912348.
Iteration 5823: Policy loss: -0.016245. Value loss: 0.171096. Entropy: 0.918392.
episode: 2731   score: 23.0  epsilon: 1.0    steps: 136  evaluation reward: 24.19
Training network. lr: 0.000205. clip: 0.082185
Iteration 5824: Policy loss: 0.009615. Value loss: 0.671597. Entropy: 0.908704.
Iteration 5825: Policy loss: -0.004207. Value loss: 0.285005. Entropy: 0.902251.
Iteration 5826: Policy loss: -0.014659. Value loss:

Training network. lr: 0.000205. clip: 0.082038
Iteration 5884: Policy loss: 0.009313. Value loss: 0.695065. Entropy: 0.931210.
Iteration 5885: Policy loss: -0.000753. Value loss: 0.296006. Entropy: 0.941197.
Iteration 5886: Policy loss: -0.013142. Value loss: 0.207896. Entropy: 0.931031.
Training network. lr: 0.000205. clip: 0.082038
Iteration 5887: Policy loss: 0.006923. Value loss: 0.360644. Entropy: 0.903348.
Iteration 5888: Policy loss: -0.014649. Value loss: 0.183015. Entropy: 0.907450.
Iteration 5889: Policy loss: -0.021845. Value loss: 0.107670. Entropy: 0.901606.
Training network. lr: 0.000205. clip: 0.082038
Iteration 5890: Policy loss: 0.013889. Value loss: 0.578570. Entropy: 0.907335.
Iteration 5891: Policy loss: -0.010879. Value loss: 0.260236. Entropy: 0.924907.
Iteration 5892: Policy loss: -0.021105. Value loss: 0.170100. Entropy: 0.909412.
episode: 2753   score: 14.0  epsilon: 1.0    steps: 784  evaluation reward: 24.61
Training network. lr: 0.000205. clip: 0.082038
Iter

Iteration 5952: Policy loss: -0.014720. Value loss: 0.220884. Entropy: 0.858049.
episode: 2773   score: 39.0  epsilon: 1.0    steps: 664  evaluation reward: 25.59
Training network. lr: 0.000204. clip: 0.081725
Iteration 5953: Policy loss: 0.009088. Value loss: 0.918803. Entropy: 0.875249.
Iteration 5954: Policy loss: -0.002769. Value loss: 0.388700. Entropy: 0.870289.
Iteration 5955: Policy loss: -0.013630. Value loss: 0.185295. Entropy: 0.893543.
episode: 2774   score: 33.0  epsilon: 1.0    steps: 312  evaluation reward: 25.67
Training network. lr: 0.000204. clip: 0.081725
Iteration 5956: Policy loss: 0.006388. Value loss: 0.728494. Entropy: 0.879322.
Iteration 5957: Policy loss: -0.003646. Value loss: 0.351877. Entropy: 0.909075.
Iteration 5958: Policy loss: -0.015010. Value loss: 0.213084. Entropy: 0.906478.
episode: 2775   score: 29.0  epsilon: 1.0    steps: 160  evaluation reward: 25.72
Training network. lr: 0.000204. clip: 0.081725
Iteration 5959: Policy loss: 0.020407. Value los

Iteration 6020: Policy loss: 0.000563. Value loss: 0.274567. Entropy: 0.950024.
Iteration 6021: Policy loss: -0.011985. Value loss: 0.151770. Entropy: 0.935090.
episode: 2793   score: 27.0  epsilon: 1.0    steps: 48  evaluation reward: 26.41
episode: 2794   score: 27.0  epsilon: 1.0    steps: 232  evaluation reward: 26.39
Training network. lr: 0.000204. clip: 0.081577
Iteration 6022: Policy loss: 0.007480. Value loss: 0.465365. Entropy: 0.988815.
Iteration 6023: Policy loss: -0.010461. Value loss: 0.216958. Entropy: 0.966286.
Iteration 6024: Policy loss: -0.024308. Value loss: 0.139524. Entropy: 0.974710.
episode: 2795   score: 26.0  epsilon: 1.0    steps: 384  evaluation reward: 26.33
episode: 2796   score: 13.0  epsilon: 1.0    steps: 400  evaluation reward: 26.27
Training network. lr: 0.000204. clip: 0.081577
Iteration 6025: Policy loss: 0.010778. Value loss: 0.635878. Entropy: 0.990306.
Iteration 6026: Policy loss: -0.001156. Value loss: 0.284577. Entropy: 0.985531.
Iteration 6027:

Iteration 6085: Policy loss: 0.017388. Value loss: 0.660889. Entropy: 0.895499.
Iteration 6086: Policy loss: -0.001955. Value loss: 0.250764. Entropy: 0.895722.
Iteration 6087: Policy loss: -0.017460. Value loss: 0.140320. Entropy: 0.897449.
Training network. lr: 0.000204. clip: 0.081421
Iteration 6088: Policy loss: 0.013068. Value loss: 0.730401. Entropy: 0.917966.
Iteration 6089: Policy loss: -0.003962. Value loss: 0.341696. Entropy: 0.929607.
Iteration 6090: Policy loss: -0.012297. Value loss: 0.214097. Entropy: 0.928712.
episode: 2816   score: 18.0  epsilon: 1.0    steps: 824  evaluation reward: 25.43
Training network. lr: 0.000204. clip: 0.081421
Iteration 6091: Policy loss: 0.004494. Value loss: 0.468623. Entropy: 1.021340.
Iteration 6092: Policy loss: -0.007539. Value loss: 0.200340. Entropy: 1.027399.
Iteration 6093: Policy loss: -0.021117. Value loss: 0.119998. Entropy: 1.013253.
episode: 2817   score: 28.0  epsilon: 1.0    steps: 224  evaluation reward: 25.28
episode: 2818   

Iteration 6153: Policy loss: -0.012759. Value loss: 0.085436. Entropy: 0.935581.
episode: 2837   score: 40.0  epsilon: 1.0    steps: 272  evaluation reward: 23.85
Training network. lr: 0.000203. clip: 0.081116
Iteration 6154: Policy loss: 0.006900. Value loss: 0.572348. Entropy: 0.939859.
Iteration 6155: Policy loss: -0.008272. Value loss: 0.172522. Entropy: 0.941424.
Iteration 6156: Policy loss: -0.018163. Value loss: 0.107900. Entropy: 0.944831.
episode: 2838   score: 26.0  epsilon: 1.0    steps: 1000  evaluation reward: 23.92
Training network. lr: 0.000203. clip: 0.081116
Iteration 6157: Policy loss: 0.010468. Value loss: 0.792735. Entropy: 0.961453.
Iteration 6158: Policy loss: -0.008110. Value loss: 0.252738. Entropy: 0.956605.
Iteration 6159: Policy loss: -0.022536. Value loss: 0.148574. Entropy: 0.966424.
episode: 2839   score: 15.0  epsilon: 1.0    steps: 696  evaluation reward: 23.69
episode: 2840   score: 26.0  epsilon: 1.0    steps: 880  evaluation reward: 23.44
Training net

Iteration 6217: Policy loss: 0.011846. Value loss: 0.516371. Entropy: 0.968523.
Iteration 6218: Policy loss: -0.008649. Value loss: 0.227630. Entropy: 0.968957.
Iteration 6219: Policy loss: -0.022593. Value loss: 0.118404. Entropy: 0.957074.
Training network. lr: 0.000202. clip: 0.080960
Iteration 6220: Policy loss: 0.009112. Value loss: 0.464366. Entropy: 1.006196.
Iteration 6221: Policy loss: -0.006795. Value loss: 0.169038. Entropy: 1.003655.
Iteration 6222: Policy loss: -0.017246. Value loss: 0.095362. Entropy: 1.002374.
episode: 2861   score: 21.0  epsilon: 1.0    steps: 160  evaluation reward: 22.73
episode: 2862   score: 18.0  epsilon: 1.0    steps: 856  evaluation reward: 22.77
Training network. lr: 0.000202. clip: 0.080960
Iteration 6223: Policy loss: 0.004580. Value loss: 0.371939. Entropy: 0.911884.
Iteration 6224: Policy loss: 0.000898. Value loss: 0.126556. Entropy: 0.926654.
Iteration 6225: Policy loss: -0.018990. Value loss: 0.054271. Entropy: 0.929276.
episode: 2863   s

Iteration 6283: Policy loss: 0.004814. Value loss: 0.656987. Entropy: 0.828506.
Iteration 6284: Policy loss: -0.001389. Value loss: 0.250767. Entropy: 0.816660.
Iteration 6285: Policy loss: -0.011878. Value loss: 0.127116. Entropy: 0.817243.
Training network. lr: 0.000202. clip: 0.080803
Iteration 6286: Policy loss: 0.011290. Value loss: 0.344046. Entropy: 0.896372.
Iteration 6287: Policy loss: -0.002612. Value loss: 0.132854. Entropy: 0.883171.
Iteration 6288: Policy loss: -0.016701. Value loss: 0.073705. Entropy: 0.880920.
Training network. lr: 0.000202. clip: 0.080803
Iteration 6289: Policy loss: 0.007919. Value loss: 0.584459. Entropy: 0.804776.
Iteration 6290: Policy loss: -0.004375. Value loss: 0.213415. Entropy: 0.791206.
Iteration 6291: Policy loss: -0.015271. Value loss: 0.095258. Entropy: 0.797714.
episode: 2884   score: 20.0  epsilon: 1.0    steps: 80  evaluation reward: 22.07
Training network. lr: 0.000202. clip: 0.080803
Iteration 6292: Policy loss: 0.010764. Value loss: 0

Training network. lr: 0.000201. clip: 0.080499
Iteration 6352: Policy loss: 0.010438. Value loss: 0.317839. Entropy: 0.863497.
Iteration 6353: Policy loss: -0.008533. Value loss: 0.170227. Entropy: 0.858777.
Iteration 6354: Policy loss: -0.018583. Value loss: 0.114295. Entropy: 0.866396.
episode: 2904   score: 21.0  epsilon: 1.0    steps: 944  evaluation reward: 22.12
Training network. lr: 0.000201. clip: 0.080499
Iteration 6355: Policy loss: 0.014693. Value loss: 0.714068. Entropy: 0.874369.
Iteration 6356: Policy loss: -0.003444. Value loss: 0.328436. Entropy: 0.864915.
Iteration 6357: Policy loss: -0.014663. Value loss: 0.192497. Entropy: 0.869155.
episode: 2905   score: 22.0  epsilon: 1.0    steps: 968  evaluation reward: 22.12
Training network. lr: 0.000201. clip: 0.080499
Iteration 6358: Policy loss: 0.008488. Value loss: 0.337012. Entropy: 0.898089.
Iteration 6359: Policy loss: -0.011153. Value loss: 0.138442. Entropy: 0.886360.
Iteration 6360: Policy loss: -0.020912. Value loss

Iteration 6420: Policy loss: -0.017078. Value loss: 0.121447. Entropy: 0.879445.
Training network. lr: 0.000201. clip: 0.080342
Iteration 6421: Policy loss: 0.010021. Value loss: 0.362794. Entropy: 0.883033.
Iteration 6422: Policy loss: -0.007472. Value loss: 0.134360. Entropy: 0.896973.
Iteration 6423: Policy loss: -0.021579. Value loss: 0.063794. Entropy: 0.898353.
Training network. lr: 0.000201. clip: 0.080342
Iteration 6424: Policy loss: 0.008458. Value loss: 0.833477. Entropy: 0.925866.
Iteration 6425: Policy loss: -0.009415. Value loss: 0.399890. Entropy: 0.922857.
Iteration 6426: Policy loss: -0.019639. Value loss: 0.218893. Entropy: 0.908247.
episode: 2924   score: 22.0  epsilon: 1.0    steps: 8  evaluation reward: 22.85
episode: 2925   score: 34.0  epsilon: 1.0    steps: 1008  evaluation reward: 22.98
Training network. lr: 0.000201. clip: 0.080342
Iteration 6427: Policy loss: 0.008290. Value loss: 0.538130. Entropy: 0.923107.
Iteration 6428: Policy loss: -0.003415. Value loss:

episode: 2946   score: 39.0  epsilon: 1.0    steps: 384  evaluation reward: 23.14
Training network. lr: 0.000200. clip: 0.080195
Iteration 6487: Policy loss: 0.012311. Value loss: 0.513643. Entropy: 0.844339.
Iteration 6488: Policy loss: -0.008490. Value loss: 0.211040. Entropy: 0.826217.
Iteration 6489: Policy loss: -0.010666. Value loss: 0.104392. Entropy: 0.825013.
Training network. lr: 0.000200. clip: 0.080195
Iteration 6490: Policy loss: 0.019673. Value loss: 0.757324. Entropy: 0.966123.
Iteration 6491: Policy loss: -0.002262. Value loss: 0.369918. Entropy: 0.949337.
Iteration 6492: Policy loss: -0.008094. Value loss: 0.217859. Entropy: 0.955326.
episode: 2947   score: 11.0  epsilon: 1.0    steps: 360  evaluation reward: 23.08
Training network. lr: 0.000200. clip: 0.080195
Iteration 6493: Policy loss: 0.012698. Value loss: 0.761781. Entropy: 0.927631.
Iteration 6494: Policy loss: -0.008481. Value loss: 0.310974. Entropy: 0.925326.
Iteration 6495: Policy loss: -0.014799. Value loss

Iteration 6554: Policy loss: -0.002393. Value loss: 0.208477. Entropy: 0.832810.
Iteration 6555: Policy loss: -0.017923. Value loss: 0.138330. Entropy: 0.842905.
Training network. lr: 0.000200. clip: 0.079881
Iteration 6556: Policy loss: 0.016754. Value loss: 0.657665. Entropy: 0.993879.
Iteration 6557: Policy loss: 0.002169. Value loss: 0.280109. Entropy: 0.960451.
Iteration 6558: Policy loss: -0.014083. Value loss: 0.153007. Entropy: 0.975780.
episode: 2967   score: 22.0  epsilon: 1.0    steps: 112  evaluation reward: 24.36
Training network. lr: 0.000200. clip: 0.079881
Iteration 6559: Policy loss: 0.007791. Value loss: 0.547061. Entropy: 0.755201.
Iteration 6560: Policy loss: -0.005950. Value loss: 0.196501. Entropy: 0.747280.
Iteration 6561: Policy loss: -0.013684. Value loss: 0.107973. Entropy: 0.750250.
Training network. lr: 0.000200. clip: 0.079881
Iteration 6562: Policy loss: 0.017245. Value loss: 0.476932. Entropy: 0.891415.
Iteration 6563: Policy loss: -0.006423. Value loss: 

episode: 2988   score: 24.0  epsilon: 1.0    steps: 376  evaluation reward: 24.55
episode: 2989   score: 16.0  epsilon: 1.0    steps: 824  evaluation reward: 24.44
Training network. lr: 0.000199. clip: 0.079734
Iteration 6622: Policy loss: 0.010735. Value loss: 0.648275. Entropy: 0.881469.
Iteration 6623: Policy loss: -0.008063. Value loss: 0.250970. Entropy: 0.895128.
Iteration 6624: Policy loss: -0.019817. Value loss: 0.168788. Entropy: 0.914116.
Training network. lr: 0.000199. clip: 0.079734
Iteration 6625: Policy loss: 0.001844. Value loss: 0.274492. Entropy: 0.828902.
Iteration 6626: Policy loss: -0.012365. Value loss: 0.104029. Entropy: 0.837602.
Iteration 6627: Policy loss: -0.018328. Value loss: 0.056438. Entropy: 0.832913.
episode: 2990   score: 20.0  epsilon: 1.0    steps: 336  evaluation reward: 24.38
episode: 2991   score: 28.0  epsilon: 1.0    steps: 920  evaluation reward: 24.37
Training network. lr: 0.000199. clip: 0.079734
Iteration 6628: Policy loss: 0.012277. Value lo

Training network. lr: 0.000199. clip: 0.079577
Iteration 6688: Policy loss: 0.017158. Value loss: 0.742422. Entropy: 0.856960.
Iteration 6689: Policy loss: 0.002939. Value loss: 0.396338. Entropy: 0.850033.
Iteration 6690: Policy loss: -0.008994. Value loss: 0.278711. Entropy: 0.827178.
Training network. lr: 0.000199. clip: 0.079577
Iteration 6691: Policy loss: 0.008808. Value loss: 0.458111. Entropy: 0.910683.
Iteration 6692: Policy loss: -0.003339. Value loss: 0.166387. Entropy: 0.903445.
Iteration 6693: Policy loss: -0.017501. Value loss: 0.076169. Entropy: 0.894357.
episode: 3011   score: 22.0  epsilon: 1.0    steps: 312  evaluation reward: 24.8
Training network. lr: 0.000199. clip: 0.079577
Iteration 6694: Policy loss: 0.006578. Value loss: 0.521107. Entropy: 0.796867.
Iteration 6695: Policy loss: -0.003371. Value loss: 0.226183. Entropy: 0.785361.
Iteration 6696: Policy loss: -0.014393. Value loss: 0.131117. Entropy: 0.775535.
Training network. lr: 0.000199. clip: 0.079577
Iterat

Training network. lr: 0.000198. clip: 0.079273
Iteration 6757: Policy loss: 0.014622. Value loss: 0.781504. Entropy: 0.904315.
Iteration 6758: Policy loss: -0.004861. Value loss: 0.386178. Entropy: 0.901394.
Iteration 6759: Policy loss: 0.005096. Value loss: 0.197665. Entropy: 0.896588.
episode: 3031   score: 19.0  epsilon: 1.0    steps: 224  evaluation reward: 23.65
Training network. lr: 0.000198. clip: 0.079273
Iteration 6760: Policy loss: 0.012740. Value loss: 0.668736. Entropy: 0.929418.
Iteration 6761: Policy loss: 0.005385. Value loss: 0.264787. Entropy: 0.916439.
Iteration 6762: Policy loss: -0.012836. Value loss: 0.165621. Entropy: 0.924935.
episode: 3032   score: 17.0  epsilon: 1.0    steps: 88  evaluation reward: 23.65
Training network. lr: 0.000198. clip: 0.079273
Iteration 6763: Policy loss: 0.012717. Value loss: 0.555367. Entropy: 0.867447.
Iteration 6764: Policy loss: -0.003049. Value loss: 0.197554. Entropy: 0.877598.
Iteration 6765: Policy loss: -0.015314. Value loss: 0

Iteration 6824: Policy loss: -0.006845. Value loss: 0.319595. Entropy: 0.915033.
Iteration 6825: Policy loss: -0.016140. Value loss: 0.203784. Entropy: 0.907417.
Training network. lr: 0.000198. clip: 0.079117
Iteration 6826: Policy loss: 0.016255. Value loss: 0.587945. Entropy: 0.828110.
Iteration 6827: Policy loss: 0.008004. Value loss: 0.207711. Entropy: 0.834549.
Iteration 6828: Policy loss: -0.011588. Value loss: 0.103250. Entropy: 0.842347.
episode: 3052   score: 21.0  epsilon: 1.0    steps: 496  evaluation reward: 24.85
episode: 3053   score: 22.0  epsilon: 1.0    steps: 608  evaluation reward: 24.65
episode: 3054   score: 25.0  epsilon: 1.0    steps: 912  evaluation reward: 24.72
Training network. lr: 0.000198. clip: 0.079117
Iteration 6829: Policy loss: 0.014078. Value loss: 0.665770. Entropy: 0.926082.
Iteration 6830: Policy loss: -0.010885. Value loss: 0.271720. Entropy: 0.918732.
Iteration 6831: Policy loss: -0.015420. Value loss: 0.153600. Entropy: 0.918313.
episode: 3055  

Training network. lr: 0.000197. clip: 0.078960
Iteration 6892: Policy loss: 0.003724. Value loss: 0.558454. Entropy: 0.920172.
Iteration 6893: Policy loss: -0.010690. Value loss: 0.244960. Entropy: 0.917757.
Iteration 6894: Policy loss: -0.020174. Value loss: 0.171922. Entropy: 0.911170.
Training network. lr: 0.000197. clip: 0.078960
Iteration 6895: Policy loss: 0.011985. Value loss: 0.439979. Entropy: 0.864415.
Iteration 6896: Policy loss: -0.003596. Value loss: 0.142481. Entropy: 0.864281.
Iteration 6897: Policy loss: -0.015691. Value loss: 0.068553. Entropy: 0.867901.
Training network. lr: 0.000197. clip: 0.078960
Iteration 6898: Policy loss: 0.005849. Value loss: 0.622428. Entropy: 0.953556.
Iteration 6899: Policy loss: -0.007877. Value loss: 0.304724. Entropy: 0.948101.
Iteration 6900: Policy loss: -0.017530. Value loss: 0.200508. Entropy: 0.952566.
episode: 3073   score: 29.0  epsilon: 1.0    steps: 480  evaluation reward: 24.33
Training network. lr: 0.000197. clip: 0.078812
Iter

Iteration 6962: Policy loss: -0.009085. Value loss: 0.166773. Entropy: 0.868457.
Iteration 6963: Policy loss: -0.019520. Value loss: 0.091293. Entropy: 0.857394.
Training network. lr: 0.000197. clip: 0.078656
Iteration 6964: Policy loss: 0.014382. Value loss: 1.086417. Entropy: 0.842945.
Iteration 6965: Policy loss: -0.009270. Value loss: 0.520578. Entropy: 0.864477.
Iteration 6966: Policy loss: -0.020752. Value loss: 0.294742. Entropy: 0.872622.
Training network. lr: 0.000197. clip: 0.078656
Iteration 6967: Policy loss: 0.012663. Value loss: 1.039458. Entropy: 0.950874.
Iteration 6968: Policy loss: -0.001536. Value loss: 0.589190. Entropy: 0.955787.
Iteration 6969: Policy loss: -0.011987. Value loss: 0.376131. Entropy: 0.951052.
episode: 3091   score: 37.0  epsilon: 1.0    steps: 152  evaluation reward: 26.09
Training network. lr: 0.000197. clip: 0.078656
Iteration 6970: Policy loss: 0.013064. Value loss: 0.907229. Entropy: 0.995605.
Iteration 6971: Policy loss: 0.002191. Value loss: 

Iteration 7032: Policy loss: -0.012661. Value loss: 0.171076. Entropy: 0.896026.
episode: 3109   score: 33.0  epsilon: 1.0    steps: 184  evaluation reward: 27.13
episode: 3110   score: 44.0  epsilon: 1.0    steps: 632  evaluation reward: 27.43
episode: 3111   score: 42.0  epsilon: 1.0    steps: 888  evaluation reward: 27.63
Training network. lr: 0.000196. clip: 0.078499
Iteration 7033: Policy loss: 0.009887. Value loss: 0.764182. Entropy: 0.897060.
Iteration 7034: Policy loss: -0.001039. Value loss: 0.239731. Entropy: 0.907877.
Iteration 7035: Policy loss: -0.010566. Value loss: 0.120823. Entropy: 0.920003.
Training network. lr: 0.000196. clip: 0.078499
Iteration 7036: Policy loss: 0.009816. Value loss: 0.656654. Entropy: 0.895453.
Iteration 7037: Policy loss: -0.001278. Value loss: 0.413015. Entropy: 0.908703.
Iteration 7038: Policy loss: -0.014471. Value loss: 0.264502. Entropy: 0.884075.
Training network. lr: 0.000196. clip: 0.078499
Iteration 7039: Policy loss: 0.016982. Value los

Iteration 7100: Policy loss: 0.005633. Value loss: 0.396798. Entropy: 0.970138.
Iteration 7101: Policy loss: -0.010405. Value loss: 0.228726. Entropy: 0.964569.
Training network. lr: 0.000195. clip: 0.078195
Iteration 7102: Policy loss: 0.006545. Value loss: 0.607028. Entropy: 0.939854.
Iteration 7103: Policy loss: -0.006640. Value loss: 0.212700. Entropy: 0.936513.
Iteration 7104: Policy loss: -0.015098. Value loss: 0.108154. Entropy: 0.943845.
episode: 3129   score: 37.0  epsilon: 1.0    steps: 584  evaluation reward: 28.87
episode: 3130   score: 32.0  epsilon: 1.0    steps: 648  evaluation reward: 28.94
episode: 3131   score: 36.0  epsilon: 1.0    steps: 1024  evaluation reward: 29.11
Training network. lr: 0.000195. clip: 0.078195
Iteration 7105: Policy loss: 0.016895. Value loss: 0.685470. Entropy: 0.953878.
Iteration 7106: Policy loss: 0.000042. Value loss: 0.304462. Entropy: 0.968775.
Iteration 7107: Policy loss: -0.014519. Value loss: 0.159824. Entropy: 0.957124.
Training networ

Iteration 7169: Policy loss: -0.001466. Value loss: 0.292002. Entropy: 0.908707.
Iteration 7170: Policy loss: -0.013882. Value loss: 0.176456. Entropy: 0.894294.
episode: 3148   score: 36.0  epsilon: 1.0    steps: 816  evaluation reward: 29.79
Training network. lr: 0.000195. clip: 0.078038
Iteration 7171: Policy loss: 0.011232. Value loss: 0.729188. Entropy: 0.946836.
Iteration 7172: Policy loss: -0.002001. Value loss: 0.346613. Entropy: 0.928700.
Iteration 7173: Policy loss: -0.014830. Value loss: 0.191247. Entropy: 0.937409.
Training network. lr: 0.000195. clip: 0.078038
Iteration 7174: Policy loss: 0.007030. Value loss: 0.515577. Entropy: 0.921941.
Iteration 7175: Policy loss: -0.005336. Value loss: 0.214370. Entropy: 0.923529.
Iteration 7176: Policy loss: -0.016379. Value loss: 0.144424. Entropy: 0.917487.
episode: 3149   score: 34.0  epsilon: 1.0    steps: 64  evaluation reward: 29.83
Training network. lr: 0.000195. clip: 0.078038
Iteration 7177: Policy loss: 0.016111. Value loss:

Iteration 7239: Policy loss: -0.011969. Value loss: 0.097859. Entropy: 0.904338.
episode: 3166   score: 14.0  epsilon: 1.0    steps: 192  evaluation reward: 30.17
episode: 3167   score: 29.0  epsilon: 1.0    steps: 456  evaluation reward: 30.25
Training network. lr: 0.000195. clip: 0.077891
Iteration 7240: Policy loss: 0.022226. Value loss: 0.948722. Entropy: 0.908495.
Iteration 7241: Policy loss: 0.004577. Value loss: 0.368859. Entropy: 0.891346.
Iteration 7242: Policy loss: -0.007050. Value loss: 0.185325. Entropy: 0.903707.
episode: 3168   score: 28.0  epsilon: 1.0    steps: 96  evaluation reward: 30.31
episode: 3169   score: 32.0  epsilon: 1.0    steps: 496  evaluation reward: 30.26
Training network. lr: 0.000195. clip: 0.077891
Iteration 7243: Policy loss: 0.010461. Value loss: 0.686268. Entropy: 0.865820.
Iteration 7244: Policy loss: -0.002077. Value loss: 0.303666. Entropy: 0.865974.
Iteration 7245: Policy loss: -0.011564. Value loss: 0.211655. Entropy: 0.874390.
episode: 3170  

episode: 3188   score: 30.0  epsilon: 1.0    steps: 224  evaluation reward: 30.21
Training network. lr: 0.000194. clip: 0.077577
Iteration 7306: Policy loss: 0.002489. Value loss: 0.679436. Entropy: 1.021734.
Iteration 7307: Policy loss: -0.010930. Value loss: 0.353565. Entropy: 1.032019.
Iteration 7308: Policy loss: -0.014892. Value loss: 0.232037. Entropy: 1.028337.
episode: 3189   score: 18.0  epsilon: 1.0    steps: 120  evaluation reward: 29.85
Training network. lr: 0.000194. clip: 0.077577
Iteration 7309: Policy loss: 0.023192. Value loss: 0.798487. Entropy: 0.958822.
Iteration 7310: Policy loss: 0.002646. Value loss: 0.332251. Entropy: 0.975390.
Iteration 7311: Policy loss: -0.013557. Value loss: 0.191065. Entropy: 0.969121.
episode: 3190   score: 23.0  epsilon: 1.0    steps: 736  evaluation reward: 29.53
episode: 3191   score: 12.0  epsilon: 1.0    steps: 1024  evaluation reward: 29.28
Training network. lr: 0.000194. clip: 0.077577
Iteration 7312: Policy loss: 0.015434. Value lo

Iteration 7373: Policy loss: 0.000035. Value loss: 0.239292. Entropy: 0.937023.
Iteration 7374: Policy loss: -0.011911. Value loss: 0.121249. Entropy: 0.932542.
episode: 3209   score: 42.0  epsilon: 1.0    steps: 600  evaluation reward: 28.86
Training network. lr: 0.000194. clip: 0.077430
Iteration 7375: Policy loss: 0.014460. Value loss: 0.727331. Entropy: 0.956383.
Iteration 7376: Policy loss: 0.001507. Value loss: 0.400222. Entropy: 0.957211.
Iteration 7377: Policy loss: -0.011819. Value loss: 0.252997. Entropy: 0.937792.
Training network. lr: 0.000194. clip: 0.077430
Iteration 7378: Policy loss: 0.007897. Value loss: 0.430988. Entropy: 1.003431.
Iteration 7379: Policy loss: -0.000635. Value loss: 0.177841. Entropy: 1.008103.
Iteration 7380: Policy loss: -0.012985. Value loss: 0.121915. Entropy: 1.020665.
episode: 3210   score: 45.0  epsilon: 1.0    steps: 304  evaluation reward: 28.87
Training network. lr: 0.000194. clip: 0.077430
Iteration 7381: Policy loss: 0.012473. Value loss: 

episode: 3230   score: 29.0  epsilon: 1.0    steps: 296  evaluation reward: 27.74
Training network. lr: 0.000193. clip: 0.077273
Iteration 7441: Policy loss: 0.007736. Value loss: 0.552568. Entropy: 0.920428.
Iteration 7442: Policy loss: -0.006220. Value loss: 0.266743. Entropy: 0.892762.
Iteration 7443: Policy loss: -0.015178. Value loss: 0.152051. Entropy: 0.903703.
episode: 3231   score: 29.0  epsilon: 1.0    steps: 616  evaluation reward: 27.67
Training network. lr: 0.000193. clip: 0.077273
Iteration 7444: Policy loss: 0.008480. Value loss: 0.359242. Entropy: 0.990540.
Iteration 7445: Policy loss: -0.008110. Value loss: 0.135650. Entropy: 0.986546.
Iteration 7446: Policy loss: -0.020515. Value loss: 0.085365. Entropy: 0.983183.
Training network. lr: 0.000193. clip: 0.077273
Iteration 7447: Policy loss: 0.018987. Value loss: 0.429234. Entropy: 0.940992.
Iteration 7448: Policy loss: -0.007813. Value loss: 0.178426. Entropy: 0.961069.
Iteration 7449: Policy loss: -0.020545. Value loss

Iteration 7509: Policy loss: -0.012205. Value loss: 0.174832. Entropy: 0.965493.
Training network. lr: 0.000192. clip: 0.076969
Iteration 7510: Policy loss: 0.018282. Value loss: 0.575343. Entropy: 0.932017.
Iteration 7511: Policy loss: 0.006116. Value loss: 0.271955. Entropy: 0.923638.
Iteration 7512: Policy loss: -0.009403. Value loss: 0.167319. Entropy: 0.923837.
Training network. lr: 0.000192. clip: 0.076969
Iteration 7513: Policy loss: 0.011331. Value loss: 0.845574. Entropy: 0.901700.
Iteration 7514: Policy loss: -0.004001. Value loss: 0.393121. Entropy: 0.904894.
Iteration 7515: Policy loss: -0.015350. Value loss: 0.209440. Entropy: 0.902714.
Training network. lr: 0.000192. clip: 0.076969
Iteration 7516: Policy loss: 0.004503. Value loss: 0.792525. Entropy: 0.961320.
Iteration 7517: Policy loss: -0.004794. Value loss: 0.334391. Entropy: 0.964616.
Iteration 7518: Policy loss: -0.013607. Value loss: 0.169819. Entropy: 0.939816.
episode: 3250   score: 33.0  epsilon: 1.0    steps: 5

episode: 3269   score: 20.0  epsilon: 1.0    steps: 136  evaluation reward: 27.06
Training network. lr: 0.000192. clip: 0.076813
Iteration 7579: Policy loss: 0.021222. Value loss: 0.878411. Entropy: 0.904873.
Iteration 7580: Policy loss: 0.002831. Value loss: 0.366575. Entropy: 0.889865.
Iteration 7581: Policy loss: -0.010597. Value loss: 0.235258. Entropy: 0.873999.
episode: 3270   score: 23.0  epsilon: 1.0    steps: 96  evaluation reward: 26.86
Training network. lr: 0.000192. clip: 0.076813
Iteration 7582: Policy loss: 0.016038. Value loss: 0.405134. Entropy: 0.890160.
Iteration 7583: Policy loss: -0.001059. Value loss: 0.171254. Entropy: 0.882052.
Iteration 7584: Policy loss: -0.017292. Value loss: 0.086151. Entropy: 0.877582.
episode: 3271   score: 30.0  epsilon: 1.0    steps: 368  evaluation reward: 26.91
Training network. lr: 0.000192. clip: 0.076813
Iteration 7585: Policy loss: 0.013415. Value loss: 1.045883. Entropy: 0.881531.
Iteration 7586: Policy loss: 0.005123. Value loss: 

Iteration 7645: Policy loss: 0.012658. Value loss: 0.651407. Entropy: 0.974486.
Iteration 7646: Policy loss: -0.006723. Value loss: 0.283878. Entropy: 0.957563.
Iteration 7647: Policy loss: -0.014349. Value loss: 0.172496. Entropy: 0.963912.
episode: 3291   score: 38.0  epsilon: 1.0    steps: 224  evaluation reward: 27.18
Training network. lr: 0.000192. clip: 0.076656
Iteration 7648: Policy loss: 0.014843. Value loss: 0.709767. Entropy: 0.941790.
Iteration 7649: Policy loss: -0.002079. Value loss: 0.267318. Entropy: 0.921286.
Iteration 7650: Policy loss: -0.008686. Value loss: 0.181214. Entropy: 0.930807.
Training network. lr: 0.000191. clip: 0.076508
Iteration 7651: Policy loss: 0.011083. Value loss: 0.592891. Entropy: 0.961962.
Iteration 7652: Policy loss: -0.002949. Value loss: 0.198843. Entropy: 0.955007.
Iteration 7653: Policy loss: -0.015518. Value loss: 0.100399. Entropy: 0.976261.
episode: 3292   score: 45.0  epsilon: 1.0    steps: 432  evaluation reward: 27.38
Training network

Training network. lr: 0.000191. clip: 0.076352
Iteration 7714: Policy loss: 0.005834. Value loss: 0.653591. Entropy: 0.915352.
Iteration 7715: Policy loss: -0.008386. Value loss: 0.314990. Entropy: 0.905564.
Iteration 7716: Policy loss: -0.019305. Value loss: 0.141505. Entropy: 0.911882.
Training network. lr: 0.000191. clip: 0.076352
Iteration 7717: Policy loss: 0.009291. Value loss: 0.617164. Entropy: 0.966169.
Iteration 7718: Policy loss: -0.008040. Value loss: 0.209528. Entropy: 0.963104.
Iteration 7719: Policy loss: -0.021334. Value loss: 0.126992. Entropy: 0.952910.
episode: 3311   score: 34.0  epsilon: 1.0    steps: 152  evaluation reward: 26.3
Training network. lr: 0.000191. clip: 0.076352
Iteration 7720: Policy loss: 0.011758. Value loss: 0.695973. Entropy: 0.920116.
Iteration 7721: Policy loss: 0.004843. Value loss: 0.253831. Entropy: 0.916440.
Iteration 7722: Policy loss: -0.006705. Value loss: 0.131657. Entropy: 0.930965.
Training network. lr: 0.000191. clip: 0.076352
Iterat

Iteration 7784: Policy loss: -0.004126. Value loss: 0.261124. Entropy: 0.957848.
Iteration 7785: Policy loss: -0.012336. Value loss: 0.138541. Entropy: 0.958963.
episode: 3329   score: 28.0  epsilon: 1.0    steps: 336  evaluation reward: 27.93
Training network. lr: 0.000190. clip: 0.076195
Iteration 7786: Policy loss: 0.009472. Value loss: 0.511941. Entropy: 1.036773.
Iteration 7787: Policy loss: -0.000491. Value loss: 0.272462. Entropy: 1.037987.
Iteration 7788: Policy loss: -0.007970. Value loss: 0.152914. Entropy: 1.038887.
episode: 3330   score: 25.0  epsilon: 1.0    steps: 472  evaluation reward: 27.89
episode: 3331   score: 23.0  epsilon: 1.0    steps: 608  evaluation reward: 27.83
Training network. lr: 0.000190. clip: 0.076195
Iteration 7789: Policy loss: 0.012508. Value loss: 0.849396. Entropy: 0.912385.
Iteration 7790: Policy loss: -0.007067. Value loss: 0.397186. Entropy: 0.912765.
Iteration 7791: Policy loss: -0.012235. Value loss: 0.244565. Entropy: 0.920697.
episode: 3332 

Iteration 7854: Policy loss: -0.014881. Value loss: 0.207505. Entropy: 0.949221.
episode: 3348   score: 36.0  epsilon: 1.0    steps: 424  evaluation reward: 29.08
episode: 3349   score: 35.0  epsilon: 1.0    steps: 496  evaluation reward: 29.0
Training network. lr: 0.000190. clip: 0.075891
Iteration 7855: Policy loss: 0.011527. Value loss: 0.398700. Entropy: 0.935689.
Iteration 7856: Policy loss: -0.013375. Value loss: 0.181524. Entropy: 0.938246.
Iteration 7857: Policy loss: -0.022886. Value loss: 0.115040. Entropy: 0.943730.
episode: 3350   score: 28.0  epsilon: 1.0    steps: 912  evaluation reward: 28.95
Training network. lr: 0.000190. clip: 0.075891
Iteration 7858: Policy loss: 0.016460. Value loss: 0.608081. Entropy: 0.911187.
Iteration 7859: Policy loss: -0.006985. Value loss: 0.246726. Entropy: 0.913334.
Iteration 7860: Policy loss: -0.011463. Value loss: 0.134620. Entropy: 0.909447.
Training network. lr: 0.000190. clip: 0.075891
Iteration 7861: Policy loss: 0.010241. Value loss

Training network. lr: 0.000189. clip: 0.075734
Iteration 7921: Policy loss: 0.007274. Value loss: 0.436000. Entropy: 1.022740.
Iteration 7922: Policy loss: -0.001369. Value loss: 0.215669. Entropy: 1.026970.
Iteration 7923: Policy loss: -0.014624. Value loss: 0.145318. Entropy: 1.014653.
episode: 3369   score: 24.0  epsilon: 1.0    steps: 176  evaluation reward: 28.95
Training network. lr: 0.000189. clip: 0.075734
Iteration 7924: Policy loss: 0.007842. Value loss: 0.465519. Entropy: 1.046203.
Iteration 7925: Policy loss: -0.002320. Value loss: 0.202933. Entropy: 1.027998.
Iteration 7926: Policy loss: -0.017605. Value loss: 0.114495. Entropy: 1.031116.
Training network. lr: 0.000189. clip: 0.075734
Iteration 7927: Policy loss: 0.012472. Value loss: 0.307844. Entropy: 1.046779.
Iteration 7928: Policy loss: -0.009990. Value loss: 0.134538. Entropy: 1.022893.
Iteration 7929: Policy loss: -0.022046. Value loss: 0.079944. Entropy: 1.028968.
episode: 3370   score: 13.0  epsilon: 1.0    steps:

Iteration 7991: Policy loss: 0.007661. Value loss: 0.241958. Entropy: 1.019266.
Iteration 7992: Policy loss: -0.008653. Value loss: 0.135760. Entropy: 1.012883.
episode: 3387   score: 35.0  epsilon: 1.0    steps: 736  evaluation reward: 29.67
episode: 3388   score: 35.0  epsilon: 1.0    steps: 1024  evaluation reward: 29.85
Training network. lr: 0.000189. clip: 0.075587
Iteration 7993: Policy loss: 0.006670. Value loss: 0.841583. Entropy: 1.046165.
Iteration 7994: Policy loss: -0.010461. Value loss: 0.407623. Entropy: 1.038211.
Iteration 7995: Policy loss: -0.015527. Value loss: 0.217726. Entropy: 1.028567.
Training network. lr: 0.000189. clip: 0.075587
Iteration 7996: Policy loss: 0.009753. Value loss: 0.241557. Entropy: 1.007398.
Iteration 7997: Policy loss: -0.006369. Value loss: 0.074470. Entropy: 1.010808.
Iteration 7998: Policy loss: -0.016772. Value loss: 0.045961. Entropy: 0.999958.
episode: 3389   score: 34.0  epsilon: 1.0    steps: 672  evaluation reward: 29.93
Training netwo

episode: 3404   score: 30.0  epsilon: 1.0    steps: 944  evaluation reward: 30.6
Training network. lr: 0.000188. clip: 0.075273
Iteration 8062: Policy loss: 0.006196. Value loss: 0.871602. Entropy: 1.063262.
Iteration 8063: Policy loss: 0.007970. Value loss: 0.378143. Entropy: 1.060254.
Iteration 8064: Policy loss: 0.000954. Value loss: 0.212713. Entropy: 1.051662.
episode: 3405   score: 18.0  epsilon: 1.0    steps: 144  evaluation reward: 30.47
episode: 3406   score: 44.0  epsilon: 1.0    steps: 328  evaluation reward: 30.71
episode: 3407   score: 38.0  epsilon: 1.0    steps: 800  evaluation reward: 30.99
Training network. lr: 0.000188. clip: 0.075273
Iteration 8065: Policy loss: 0.009731. Value loss: 0.841720. Entropy: 1.002589.
Iteration 8066: Policy loss: -0.001246. Value loss: 0.396376. Entropy: 0.976248.
Iteration 8067: Policy loss: -0.011881. Value loss: 0.231411. Entropy: 0.987715.
episode: 3408   score: 31.0  epsilon: 1.0    steps: 432  evaluation reward: 31.05
Training networ

Iteration 8130: Policy loss: -0.009120. Value loss: 0.065270. Entropy: 0.987090.
episode: 3424   score: 30.0  epsilon: 1.0    steps: 128  evaluation reward: 30.19
episode: 3425   score: 33.0  epsilon: 1.0    steps: 224  evaluation reward: 30.15
Training network. lr: 0.000188. clip: 0.075126
Iteration 8131: Policy loss: 0.015481. Value loss: 0.601280. Entropy: 1.003700.
Iteration 8132: Policy loss: -0.006962. Value loss: 0.299638. Entropy: 1.014573.
Iteration 8133: Policy loss: -0.012561. Value loss: 0.170932. Entropy: 1.001551.
Training network. lr: 0.000188. clip: 0.075126
Iteration 8134: Policy loss: 0.010081. Value loss: 0.536940. Entropy: 0.987543.
Iteration 8135: Policy loss: -0.004480. Value loss: 0.231817. Entropy: 0.975824.
Iteration 8136: Policy loss: -0.013546. Value loss: 0.148521. Entropy: 0.994135.
Training network. lr: 0.000188. clip: 0.075126
Iteration 8137: Policy loss: 0.020964. Value loss: 0.785935. Entropy: 0.956966.
Iteration 8138: Policy loss: -0.005394. Value loss

Training network. lr: 0.000187. clip: 0.074969
Iteration 8197: Policy loss: 0.014876. Value loss: 0.329553. Entropy: 0.960322.
Iteration 8198: Policy loss: -0.000139. Value loss: 0.138839. Entropy: 0.955621.
Iteration 8199: Policy loss: -0.010312. Value loss: 0.084678. Entropy: 0.945230.
Training network. lr: 0.000187. clip: 0.074969
Iteration 8200: Policy loss: 0.016817. Value loss: 0.455107. Entropy: 0.935245.
Iteration 8201: Policy loss: -0.005589. Value loss: 0.223958. Entropy: 0.944091.
Iteration 8202: Policy loss: -0.014981. Value loss: 0.139593. Entropy: 0.943265.
Training network. lr: 0.000187. clip: 0.074813
Iteration 8203: Policy loss: 0.012114. Value loss: 0.666780. Entropy: 0.992323.
Iteration 8204: Policy loss: -0.001242. Value loss: 0.238409. Entropy: 1.018242.
Iteration 8205: Policy loss: -0.010351. Value loss: 0.149737. Entropy: 1.012655.
episode: 3446   score: 18.0  epsilon: 1.0    steps: 112  evaluation reward: 28.99
episode: 3447   score: 39.0  epsilon: 1.0    steps:

Iteration 8266: Policy loss: 0.013624. Value loss: 0.410086. Entropy: 1.077446.
Iteration 8267: Policy loss: -0.003364. Value loss: 0.187264. Entropy: 1.065703.
Iteration 8268: Policy loss: -0.016656. Value loss: 0.130774. Entropy: 1.063404.
episode: 3464   score: 32.0  epsilon: 1.0    steps: 424  evaluation reward: 29.22
episode: 3465   score: 41.0  epsilon: 1.0    steps: 728  evaluation reward: 29.33
Training network. lr: 0.000187. clip: 0.074665
Iteration 8269: Policy loss: 0.009552. Value loss: 0.767536. Entropy: 0.973046.
Iteration 8270: Policy loss: 0.001672. Value loss: 0.318030. Entropy: 0.950796.
Iteration 8271: Policy loss: -0.011343. Value loss: 0.162677. Entropy: 0.959853.
episode: 3466   score: 21.0  epsilon: 1.0    steps: 112  evaluation reward: 29.41
Training network. lr: 0.000187. clip: 0.074665
Iteration 8272: Policy loss: 0.004557. Value loss: 0.850645. Entropy: 1.039345.
Iteration 8273: Policy loss: -0.001356. Value loss: 0.465674. Entropy: 1.024329.
Iteration 8274: 

episode: 3484   score: 21.0  epsilon: 1.0    steps: 200  evaluation reward: 29.67
Training network. lr: 0.000186. clip: 0.074509
Iteration 8335: Policy loss: 0.012291. Value loss: 0.640888. Entropy: 0.934179.
Iteration 8336: Policy loss: 0.008568. Value loss: 0.240375. Entropy: 0.909999.
Iteration 8337: Policy loss: -0.011375. Value loss: 0.136783. Entropy: 0.899817.
Training network. lr: 0.000186. clip: 0.074509
Iteration 8338: Policy loss: 0.011425. Value loss: 0.453160. Entropy: 0.879519.
Iteration 8339: Policy loss: -0.000447. Value loss: 0.202471. Entropy: 0.893413.
Iteration 8340: Policy loss: -0.014083. Value loss: 0.116469. Entropy: 0.874090.
episode: 3485   score: 33.0  epsilon: 1.0    steps: 104  evaluation reward: 29.59
episode: 3486   score: 21.0  epsilon: 1.0    steps: 920  evaluation reward: 29.37
Training network. lr: 0.000186. clip: 0.074509
Iteration 8341: Policy loss: 0.016046. Value loss: 0.407710. Entropy: 0.986596.
Iteration 8342: Policy loss: -0.005267. Value loss

Training network. lr: 0.000186. clip: 0.074204
Iteration 8404: Policy loss: 0.016443. Value loss: 0.432737. Entropy: 0.933240.
Iteration 8405: Policy loss: -0.002189. Value loss: 0.212895. Entropy: 0.914560.
Iteration 8406: Policy loss: -0.014468. Value loss: 0.146306. Entropy: 0.915281.
episode: 3504   score: 11.0  epsilon: 1.0    steps: 672  evaluation reward: 28.51
episode: 3505   score: 17.0  epsilon: 1.0    steps: 952  evaluation reward: 28.5
Training network. lr: 0.000186. clip: 0.074204
Iteration 8407: Policy loss: 0.011755. Value loss: 0.783575. Entropy: 0.889920.
Iteration 8408: Policy loss: 0.003929. Value loss: 0.312236. Entropy: 0.873117.
Iteration 8409: Policy loss: -0.007202. Value loss: 0.144580. Entropy: 0.877787.
Training network. lr: 0.000186. clip: 0.074204
Iteration 8410: Policy loss: 0.009306. Value loss: 0.619939. Entropy: 0.854009.
Iteration 8411: Policy loss: -0.001829. Value loss: 0.299818. Entropy: 0.867574.
Iteration 8412: Policy loss: -0.013375. Value loss: 

Iteration 8472: Policy loss: -0.017946. Value loss: 0.171166. Entropy: 0.986867.
episode: 3524   score: 12.0  epsilon: 1.0    steps: 448  evaluation reward: 28.67
Training network. lr: 0.000185. clip: 0.074048
Iteration 8473: Policy loss: 0.011788. Value loss: 0.724650. Entropy: 0.804689.
Iteration 8474: Policy loss: 0.002653. Value loss: 0.256597. Entropy: 0.784120.
Iteration 8475: Policy loss: -0.014581. Value loss: 0.151433. Entropy: 0.787788.
Training network. lr: 0.000185. clip: 0.074048
Iteration 8476: Policy loss: 0.008300. Value loss: 0.653821. Entropy: 0.924864.
Iteration 8477: Policy loss: -0.002220. Value loss: 0.346122. Entropy: 0.913114.
Iteration 8478: Policy loss: -0.010467. Value loss: 0.222136. Entropy: 0.925982.
episode: 3525   score: 57.0  epsilon: 1.0    steps: 152  evaluation reward: 28.91
Training network. lr: 0.000185. clip: 0.074048
Iteration 8479: Policy loss: 0.014582. Value loss: 0.690597. Entropy: 0.955083.
Iteration 8480: Policy loss: -0.010050. Value loss:

Iteration 8539: Policy loss: 0.012817. Value loss: 0.784166. Entropy: 1.016189.
Iteration 8540: Policy loss: -0.004077. Value loss: 0.399666. Entropy: 1.003637.
Iteration 8541: Policy loss: -0.008356. Value loss: 0.259945. Entropy: 1.008276.
Training network. lr: 0.000185. clip: 0.073891
Iteration 8542: Policy loss: 0.013371. Value loss: 0.644930. Entropy: 1.008636.
Iteration 8543: Policy loss: 0.003548. Value loss: 0.245097. Entropy: 1.021612.
Iteration 8544: Policy loss: -0.006762. Value loss: 0.140394. Entropy: 1.033472.
episode: 3545   score: 20.0  epsilon: 1.0    steps: 736  evaluation reward: 28.91
Training network. lr: 0.000185. clip: 0.073891
Iteration 8545: Policy loss: 0.017371. Value loss: 0.727172. Entropy: 1.034252.
Iteration 8546: Policy loss: 0.002578. Value loss: 0.259668. Entropy: 1.028399.
Iteration 8547: Policy loss: -0.011925. Value loss: 0.138231. Entropy: 1.025430.
episode: 3546   score: 24.0  epsilon: 1.0    steps: 368  evaluation reward: 28.97
Training network. 

episode: 3565   score: 16.0  epsilon: 1.0    steps: 352  evaluation reward: 28.01
Training network. lr: 0.000184. clip: 0.073587
Iteration 8608: Policy loss: 0.011134. Value loss: 0.523998. Entropy: 1.000699.
Iteration 8609: Policy loss: -0.002975. Value loss: 0.278724. Entropy: 1.008239.
Iteration 8610: Policy loss: -0.013475. Value loss: 0.172735. Entropy: 0.990937.
Training network. lr: 0.000184. clip: 0.073587
Iteration 8611: Policy loss: 0.004137. Value loss: 0.480991. Entropy: 1.001356.
Iteration 8612: Policy loss: -0.007644. Value loss: 0.190317. Entropy: 0.975862.
Iteration 8613: Policy loss: -0.017753. Value loss: 0.106675. Entropy: 0.967996.
Training network. lr: 0.000184. clip: 0.073587
Iteration 8614: Policy loss: 0.011568. Value loss: 0.570336. Entropy: 1.026339.
Iteration 8615: Policy loss: -0.005282. Value loss: 0.255239. Entropy: 1.026691.
Iteration 8616: Policy loss: -0.010741. Value loss: 0.145734. Entropy: 1.012229.
Training network. lr: 0.000184. clip: 0.073587
Iter

Iteration 8676: Policy loss: -0.011540. Value loss: 0.168831. Entropy: 1.045890.
episode: 3585   score: 21.0  epsilon: 1.0    steps: 144  evaluation reward: 27.43
Training network. lr: 0.000184. clip: 0.073430
Iteration 8677: Policy loss: 0.015178. Value loss: 0.710279. Entropy: 0.948318.
Iteration 8678: Policy loss: 0.004866. Value loss: 0.342215. Entropy: 0.929603.
Iteration 8679: Policy loss: -0.001827. Value loss: 0.185320. Entropy: 0.933204.
episode: 3586   score: 26.0  epsilon: 1.0    steps: 664  evaluation reward: 27.48
Training network. lr: 0.000184. clip: 0.073430
Iteration 8680: Policy loss: 0.013582. Value loss: 0.902888. Entropy: 0.976765.
Iteration 8681: Policy loss: -0.003353. Value loss: 0.329133. Entropy: 0.973292.
Iteration 8682: Policy loss: -0.011684. Value loss: 0.193953. Entropy: 0.981995.
episode: 3587   score: 29.0  epsilon: 1.0    steps: 96  evaluation reward: 27.43
episode: 3588   score: 26.0  epsilon: 1.0    steps: 920  evaluation reward: 27.43
Training networ

episode: 3607   score: 13.0  epsilon: 1.0    steps: 656  evaluation reward: 26.87
Training network. lr: 0.000183. clip: 0.073283
Iteration 8743: Policy loss: 0.008007. Value loss: 0.613813. Entropy: 0.928632.
Iteration 8744: Policy loss: -0.004064. Value loss: 0.311022. Entropy: 0.927463.
Iteration 8745: Policy loss: -0.015966. Value loss: 0.195098. Entropy: 0.924256.
episode: 3608   score: 29.0  epsilon: 1.0    steps: 312  evaluation reward: 26.83
Training network. lr: 0.000183. clip: 0.073283
Iteration 8746: Policy loss: 0.015871. Value loss: 0.553333. Entropy: 0.947710.
Iteration 8747: Policy loss: 0.008473. Value loss: 0.328924. Entropy: 0.916902.
Iteration 8748: Policy loss: -0.012013. Value loss: 0.220891. Entropy: 0.930480.
Training network. lr: 0.000183. clip: 0.073283
Iteration 8749: Policy loss: 0.011146. Value loss: 0.524387. Entropy: 0.869187.
Iteration 8750: Policy loss: -0.000748. Value loss: 0.229895. Entropy: 0.885970.
Iteration 8751: Policy loss: -0.017760. Value loss:

episode: 3627   score: 22.0  epsilon: 1.0    steps: 552  evaluation reward: 25.53
episode: 3628   score: 21.0  epsilon: 1.0    steps: 840  evaluation reward: 25.54
Training network. lr: 0.000182. clip: 0.072969
Iteration 8812: Policy loss: 0.010446. Value loss: 0.637891. Entropy: 0.895227.
Iteration 8813: Policy loss: -0.008887. Value loss: 0.218936. Entropy: 0.906111.
Iteration 8814: Policy loss: -0.019798. Value loss: 0.113997. Entropy: 0.915033.
Training network. lr: 0.000182. clip: 0.072969
Iteration 8815: Policy loss: 0.014002. Value loss: 0.691196. Entropy: 0.987832.
Iteration 8816: Policy loss: 0.005564. Value loss: 0.305767. Entropy: 0.965021.
Iteration 8817: Policy loss: 0.000186. Value loss: 0.184600. Entropy: 0.996178.
episode: 3629   score: 36.0  epsilon: 1.0    steps: 864  evaluation reward: 25.59
Training network. lr: 0.000182. clip: 0.072969
Iteration 8818: Policy loss: 0.014687. Value loss: 0.462819. Entropy: 0.973454.
Iteration 8819: Policy loss: -0.001973. Value loss:

Iteration 8879: Policy loss: -0.006096. Value loss: 0.170109. Entropy: 0.920041.
Iteration 8880: Policy loss: -0.014348. Value loss: 0.084872. Entropy: 0.918893.
episode: 3648   score: 17.0  epsilon: 1.0    steps: 232  evaluation reward: 24.85
episode: 3649   score: 36.0  epsilon: 1.0    steps: 480  evaluation reward: 24.89
Training network. lr: 0.000182. clip: 0.072822
Iteration 8881: Policy loss: 0.009405. Value loss: 0.523180. Entropy: 0.957089.
Iteration 8882: Policy loss: -0.006802. Value loss: 0.234952. Entropy: 0.944518.
Iteration 8883: Policy loss: -0.013762. Value loss: 0.142685. Entropy: 0.946801.
episode: 3650   score: 24.0  epsilon: 1.0    steps: 336  evaluation reward: 24.75
now time :  2019-03-06 15:37:25.893472
episode: 3651   score: 20.0  epsilon: 1.0    steps: 616  evaluation reward: 24.81
Training network. lr: 0.000182. clip: 0.072822
Iteration 8884: Policy loss: 0.008227. Value loss: 0.384271. Entropy: 0.993419.
Iteration 8885: Policy loss: -0.008042. Value loss: 0.1

Iteration 8945: Policy loss: 0.003785. Value loss: 0.218854. Entropy: 0.992675.
Iteration 8946: Policy loss: -0.009197. Value loss: 0.117425. Entropy: 0.994418.
episode: 3670   score: 33.0  epsilon: 1.0    steps: 408  evaluation reward: 24.12
Training network. lr: 0.000182. clip: 0.072665
Iteration 8947: Policy loss: 0.016479. Value loss: 0.514644. Entropy: 0.982996.
Iteration 8948: Policy loss: 0.006332. Value loss: 0.163292. Entropy: 0.985094.
Iteration 8949: Policy loss: -0.009676. Value loss: 0.079193. Entropy: 0.991076.
episode: 3671   score: 30.0  epsilon: 1.0    steps: 1024  evaluation reward: 24.0
Training network. lr: 0.000182. clip: 0.072665
Iteration 8950: Policy loss: 0.015254. Value loss: 0.762794. Entropy: 0.971774.
Iteration 8951: Policy loss: 0.002450. Value loss: 0.343281. Entropy: 0.970975.
Iteration 8952: Policy loss: -0.006596. Value loss: 0.175743. Entropy: 0.957067.
episode: 3672   score: 15.0  epsilon: 1.0    steps: 168  evaluation reward: 23.83
Training network.

Training network. lr: 0.000181. clip: 0.072361
Iteration 9013: Policy loss: 0.013350. Value loss: 0.589880. Entropy: 0.983152.
Iteration 9014: Policy loss: -0.006532. Value loss: 0.212098. Entropy: 1.000438.
Iteration 9015: Policy loss: -0.018807. Value loss: 0.107384. Entropy: 0.987745.
episode: 3691   score: 7.0  epsilon: 1.0    steps: 32  evaluation reward: 24.05
Training network. lr: 0.000181. clip: 0.072361
Iteration 9016: Policy loss: 0.013640. Value loss: 0.537896. Entropy: 1.001923.
Iteration 9017: Policy loss: -0.005636. Value loss: 0.213197. Entropy: 1.008900.
Iteration 9018: Policy loss: -0.017296. Value loss: 0.129533. Entropy: 0.996678.
episode: 3692   score: 19.0  epsilon: 1.0    steps: 808  evaluation reward: 23.97
Training network. lr: 0.000181. clip: 0.072361
Iteration 9019: Policy loss: 0.010515. Value loss: 0.614579. Entropy: 1.024168.
Iteration 9020: Policy loss: -0.001048. Value loss: 0.215622. Entropy: 1.024131.
Iteration 9021: Policy loss: -0.011643. Value loss: 

Training network. lr: 0.000181. clip: 0.072205
Iteration 9079: Policy loss: 0.007104. Value loss: 0.531750. Entropy: 0.880035.
Iteration 9080: Policy loss: 0.001732. Value loss: 0.224499. Entropy: 0.881405.
Iteration 9081: Policy loss: -0.007074. Value loss: 0.097695. Entropy: 0.866267.
Training network. lr: 0.000181. clip: 0.072205
Iteration 9082: Policy loss: 0.008463. Value loss: 0.408642. Entropy: 0.966816.
Iteration 9083: Policy loss: -0.009030. Value loss: 0.216005. Entropy: 0.956696.
Iteration 9084: Policy loss: -0.014702. Value loss: 0.143734. Entropy: 0.947704.
episode: 3714   score: 18.0  epsilon: 1.0    steps: 896  evaluation reward: 23.84
Training network. lr: 0.000181. clip: 0.072205
Iteration 9085: Policy loss: 0.013143. Value loss: 0.688702. Entropy: 0.960586.
Iteration 9086: Policy loss: 0.004678. Value loss: 0.343670. Entropy: 0.936682.
Iteration 9087: Policy loss: -0.005277. Value loss: 0.184216. Entropy: 0.924611.
episode: 3715   score: 28.0  epsilon: 1.0    steps: 2

Iteration 9148: Policy loss: 0.015440. Value loss: 0.653621. Entropy: 0.976096.
Iteration 9149: Policy loss: 0.005638. Value loss: 0.333117. Entropy: 0.941772.
Iteration 9150: Policy loss: -0.004808. Value loss: 0.220090. Entropy: 0.950506.
Training network. lr: 0.000180. clip: 0.071900
Iteration 9151: Policy loss: 0.009968. Value loss: 0.495907. Entropy: 0.914529.
Iteration 9152: Policy loss: -0.003077. Value loss: 0.247498. Entropy: 0.881938.
Iteration 9153: Policy loss: -0.014066. Value loss: 0.157520. Entropy: 0.895407.
episode: 3733   score: 38.0  epsilon: 1.0    steps: 984  evaluation reward: 24.95
Training network. lr: 0.000180. clip: 0.071900
Iteration 9154: Policy loss: 0.015628. Value loss: 0.609226. Entropy: 0.941573.
Iteration 9155: Policy loss: -0.006042. Value loss: 0.232884. Entropy: 0.908744.
Iteration 9156: Policy loss: -0.009802. Value loss: 0.113023. Entropy: 0.912757.
episode: 3734   score: 33.0  epsilon: 1.0    steps: 48  evaluation reward: 25.09
Training network. 

Iteration 9217: Policy loss: 0.020797. Value loss: 0.708341. Entropy: 0.901199.
Iteration 9218: Policy loss: 0.008367. Value loss: 0.301321. Entropy: 0.881976.
Iteration 9219: Policy loss: -0.006166. Value loss: 0.158563. Entropy: 0.906270.
episode: 3752   score: 20.0  epsilon: 1.0    steps: 504  evaluation reward: 25.55
Training network. lr: 0.000179. clip: 0.071744
Iteration 9220: Policy loss: 0.020154. Value loss: 1.052219. Entropy: 0.880938.
Iteration 9221: Policy loss: 0.006980. Value loss: 0.351531. Entropy: 0.885071.
Iteration 9222: Policy loss: -0.008236. Value loss: 0.186022. Entropy: 0.890897.
episode: 3753   score: 46.0  epsilon: 1.0    steps: 176  evaluation reward: 25.71
episode: 3754   score: 16.0  epsilon: 1.0    steps: 184  evaluation reward: 25.53
Training network. lr: 0.000179. clip: 0.071744
Iteration 9223: Policy loss: 0.020531. Value loss: 0.625617. Entropy: 0.807609.
Iteration 9224: Policy loss: 0.003422. Value loss: 0.277763. Entropy: 0.815939.
Iteration 9225: Po

Iteration 9284: Policy loss: 0.000593. Value loss: 0.321943. Entropy: 0.826326.
Iteration 9285: Policy loss: -0.004180. Value loss: 0.223631. Entropy: 0.814487.
Training network. lr: 0.000179. clip: 0.071587
Iteration 9286: Policy loss: 0.019775. Value loss: 0.691700. Entropy: 0.864542.
Iteration 9287: Policy loss: 0.002682. Value loss: 0.283014. Entropy: 0.852793.
Iteration 9288: Policy loss: -0.012438. Value loss: 0.137451. Entropy: 0.862979.
episode: 3774   score: 37.0  epsilon: 1.0    steps: 760  evaluation reward: 26.06
Training network. lr: 0.000179. clip: 0.071587
Iteration 9289: Policy loss: 0.008940. Value loss: 0.397307. Entropy: 0.903358.
Iteration 9290: Policy loss: -0.008911. Value loss: 0.154674. Entropy: 0.905209.
Iteration 9291: Policy loss: -0.015222. Value loss: 0.088323. Entropy: 0.910098.
Training network. lr: 0.000179. clip: 0.071587
Iteration 9292: Policy loss: 0.009914. Value loss: 0.891661. Entropy: 0.873244.
Iteration 9293: Policy loss: -0.002304. Value loss: 0

episode: 3792   score: 18.0  epsilon: 1.0    steps: 384  evaluation reward: 26.82
Training network. lr: 0.000178. clip: 0.071283
Iteration 9355: Policy loss: 0.003439. Value loss: 0.566411. Entropy: 0.966028.
Iteration 9356: Policy loss: -0.004150. Value loss: 0.274021. Entropy: 0.973757.
Iteration 9357: Policy loss: -0.012554. Value loss: 0.170054. Entropy: 0.968309.
episode: 3793   score: 30.0  epsilon: 1.0    steps: 424  evaluation reward: 26.91
Training network. lr: 0.000178. clip: 0.071283
Iteration 9358: Policy loss: 0.009030. Value loss: 0.804761. Entropy: 0.920980.
Iteration 9359: Policy loss: -0.001625. Value loss: 0.326817. Entropy: 0.916075.
Iteration 9360: Policy loss: -0.008229. Value loss: 0.145830. Entropy: 0.908789.
episode: 3794   score: 23.0  epsilon: 1.0    steps: 408  evaluation reward: 26.99
Training network. lr: 0.000178. clip: 0.071283
Iteration 9361: Policy loss: 0.010843. Value loss: 0.693245. Entropy: 0.854609.
Iteration 9362: Policy loss: -0.002840. Value los

Iteration 9422: Policy loss: -0.000544. Value loss: 0.323217. Entropy: 0.946247.
Iteration 9423: Policy loss: -0.001883. Value loss: 0.193364. Entropy: 0.951093.
episode: 3813   score: 31.0  epsilon: 1.0    steps: 952  evaluation reward: 27.52
Training network. lr: 0.000178. clip: 0.071126
Iteration 9424: Policy loss: 0.017955. Value loss: 0.637136. Entropy: 0.894556.
Iteration 9425: Policy loss: 0.009436. Value loss: 0.293663. Entropy: 0.904422.
Iteration 9426: Policy loss: -0.006884. Value loss: 0.172581. Entropy: 0.900164.
Training network. lr: 0.000178. clip: 0.071126
Iteration 9427: Policy loss: 0.005201. Value loss: 0.347610. Entropy: 0.945189.
Iteration 9428: Policy loss: -0.006746. Value loss: 0.185642. Entropy: 0.940770.
Iteration 9429: Policy loss: -0.016055. Value loss: 0.127150. Entropy: 0.937647.
Training network. lr: 0.000178. clip: 0.071126
Iteration 9430: Policy loss: 0.011788. Value loss: 0.276701. Entropy: 0.913673.
Iteration 9431: Policy loss: 0.001249. Value loss: 0

In [2]:
def test_best(name):
    env = GameEnv(name)
    print("\n\n\n ------- TESTING BEST MODEL FOR %s ------- \n\n\n" % (name))
    number_lives = env.life
    
    if (name == 'SpaceInvaders-v0'):
        action_size = 4
    else:
        action_size = env.action_space.n
    rewards, episodes = [], []
    
    e = 0
    frame = 0

    agent = Agent(action_size)
    agent.policy_net.load_state_dict(torch.load("./save_model/" + name + "_ppo_best"))
    agent.update_target_net()
    agent.policy_net.eval()
    evaluation_reward = deque(maxlen=evaluation_reward_length)

    for i in range(100):
        env.done = False
        env.score = 0
        env.history = np.zeros([HISTORY_SIZE+1,84,84], dtype=np.uint8)
        env.state = env.reset()
        env.life = number_lives
        get_init_state(env.history, env.state)
        step = 0
        while not env.done:
            curr_state = env.history[HISTORY_SIZE-1,:,:]
            net_in = env.history[:HISTORY_SIZE,:,:]
            action, value = agent.get_action(np.float32(net_in) / 255.)
            
            next_state, env.reward, env.done, env.info = env.step(action)
            env._env.render()
            
            frame_next_state = get_frame(next_state)
            
            env.history[HISTORY_SIZE,:,:] = frame_next_state
            terminal_state = check_live(env.life, env.info['ale.lives'])
            env.life = env.info['ale.lives']
            
            
            env.score += env.reward
            env.history[:HISTORY_SIZE, :, :] = env.history[1:,:,:]
            step += 1
        

        evaluation_reward.append(env.score)
        print("episode:", e, "  score:", env.score,  " epsilon:", agent.epsilon, "   steps:", step,
                      " evaluation reward:", np.mean(evaluation_reward))
            

In [None]:
test_best('MsPacman-v0')

  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "
  probs = F.softmax(x[:,:self.action_size] - torch.max(x[:,:self.action_size],1)[0].unsqueeze(1))





 ------- TESTING BEST MODEL FOR MsPacman-v0 ------- 



episode: 0   score: 1270.0  epsilon: 1.0    steps: 965  evaluation reward: 1270.0
episode: 0   score: 1450.0  epsilon: 1.0    steps: 1162  evaluation reward: 1360.0
episode: 0   score: 1270.0  epsilon: 1.0    steps: 746  evaluation reward: 1330.0
episode: 0   score: 1790.0  epsilon: 1.0    steps: 1058  evaluation reward: 1445.0
episode: 0   score: 1160.0  epsilon: 1.0    steps: 1112  evaluation reward: 1388.0
episode: 0   score: 1350.0  epsilon: 1.0    steps: 1073  evaluation reward: 1381.6666666666667
episode: 0   score: 1500.0  epsilon: 1.0    steps: 1265  evaluation reward: 1398.5714285714287
episode: 0   score: 1270.0  epsilon: 1.0    steps: 960  evaluation reward: 1382.5
episode: 0   score: 1670.0  epsilon: 1.0    steps: 1405  evaluation reward: 1414.4444444444443
episode: 0   score: 1230.0  epsilon: 1.0    steps: 1457  evaluation reward: 1396.0
episode: 0   score: 1600.0  epsilon: 1.0    steps: 1223  evaluation reward: 14