# Deep Q-Learning 

For this assignment we will implement the Deep Q-Learning algorithm with Experience Replay as described in breakthrough paper __"Playing Atari with Deep Reinforcement Learning"__. We will train an agent to play the famous game of __Breakout__.

In [1]:
import sys
import gym
import torch
import pylab
import random
import numpy as np
import time
from collections import deque
from datetime import datetime
from copy import deepcopy
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from utils import *
from agent import *
from model import *
from config import *
from env import GameEnv
%matplotlib inline
%load_ext autoreload
%autoreload 2

## Understanding the environment

In the following cell, we initialise our game of __Breakout__ and you can see how the environment looks like. For further documentation of the of the environment refer to https://gym.openai.com/envs. 

In [2]:
envs = []
for i in range(num_envs):
    envs.append(GameEnv('SpaceInvadersDeterministic-v4'))
#env.render()

  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "


In [3]:
number_lives = envs[0].life
state_size = envs[0].observation_space.shape
action_size = envs[0].action_space.n
rewards, episodes = [], []

## Creating a DQN Agent

Here we create a DQN Agent. This agent is defined in the __agent.py__. The corresponding neural network is defined in the __model.py__. 

__Evaluation Reward__ : The average reward received in the past 100 episodes/games.

__Frame__ : Number of frames processed in total.

__Memory Size__ : The current size of the replay memory.

In [4]:
agent = Agent(action_size)
torch.save(agent.policy_net.state_dict(), "./save_model/spaceinvaders_ppo_best")
evaluation_reward = deque(maxlen=evaluation_reward_length)
frame = 0
memory_size = 0
reset_max = 10

### Main Training Loop

In [5]:
vis_env_idx = 0
vis_env = envs[vis_env_idx]
e = 0
frame = 0
max_eval = -np.inf
reset_count = 0

while (frame < 10000000):
    step = 0
    assert(num_envs * env_mem_size == train_frame)
    frame_next_vals = []
    for i in range(num_envs):
        env = envs[i]
        #history = env.history
        #life = env.life
        #state, reward, done, info = [env.state, env.reward, env.done, env.info]
        for j in range(env_mem_size):
            step += 1
            frame += 1
            
            curr_state = env.history[HISTORY_SIZE-1,:,:]
            action, value = agent.get_action(np.float32(env.history[:HISTORY_SIZE,:,:]) / 255.)
            
            next_state, env.reward, env.done, env.info = env.step(action)
            
            if (i == vis_env_idx):
                vis_env._env.render()
            
            frame_next_state = get_frame(next_state)
            env.history[HISTORY_SIZE,:,:] = frame_next_state
            terminal_state = check_live(env.life, env.info['ale.lives'])
            
            env.life = env.info['ale.lives']
            r = env.reward
            
            agent.memory.push(i, deepcopy(curr_state), action, r, terminal_state, value, 0, 0)
            if (j == env_mem_size-1):
                _, frame_next_val = agent.get_action(np.float32(env.history[1:,:,:]) / 255.)
                frame_next_vals.append(frame_next_val)
            env.score += r
            env.history[:HISTORY_SIZE, :, :] = env.history[1:,:,:]
            
            if (env.done):
                if (e % 50 == 0):
                    print('now time : ', datetime.now())
                    rewards.append(np.mean(evaluation_reward))
                    episodes.append(e)
                    pylab.plot(episodes, rewards, 'b')
                    pylab.savefig("./save_graph/spaceinvaders_ppo.png")
                    torch.save(agent.policy_net, "./save_model/spaceinvaders_ppo")
                    
                    if np.mean(evaluation_reward) > max_eval:
                        torch.save(agent.policy_net.state_dict(), "./save_model/spaceinvaders_ppo_best")
                        max_eval = float(np.mean(evaluation_reward))
                        reset_count = 0
                    elif e > 5000:
                        reset_count += 1
                        """
                        if (reset_count == reset_max):
                            print("Training went nowhere, starting again at best model")
                            agent.policy_net.load_state_dict(torch.load("./save_model/spaceinvaders_ppo_best"))
                            agent.update_target_net()
                            reset_count = 0
                        """
                e += 1
                evaluation_reward.append(env.score)
                print("episode:", e, "  score:", env.score,  " epsilon:", agent.epsilon, "   steps:", step,
                  " evaluation reward:", np.mean(evaluation_reward))
                
                env.done = False
                env.score = 0
                env.history = np.zeros([HISTORY_SIZE+1,84,84], dtype=np.uint8)
                env.state = env.reset()
                env.life = number_lives
                get_init_state(env.history, env.state)
                
                
                
    agent.train_policy_net(frame, frame_next_vals)
    agent.update_target_net()


RuntimeError: Expected 4-dimensional input for 4-dimensional weight [16, 4, 8, 8], but got input of size [4, 84, 84] instead

In [None]:
torch.save(agent.policy_net, "./save_model/spaceinvaders_ppo")

In [None]:
### Loop through all environments and run PPO on them

env_names = ['SpaceInvaders-v0', 'MsPacman-v0', 'Asterix-v0', 'Asteroids-v0', 'Atlantis-v0', 'Alien-v0', 'Amidar-v0', 'Assault-v0', 'BankHeist-v0']

for a in range(len(env_names)):
    name = env_names[a]
    print("\n\n\n ------- STARTING TRAINING FOR %s ------- \n\n\n" % (name))
    
    envs = []
    for i in range(num_envs):
        envs.append(GameEnv(name))
    #env.render()
    

    number_lives = envs[0].life
    state_size = envs[0].observation_space.shape
    if (name == 'SpaceInvaders-v0'):
        action_size = 4
    else:
        action_size = envs[0].action_space.n
    rewards, episodes = [], []

    vis_env_idx = 0
    vis_env = envs[vis_env_idx]
    e = 0
    frame = 0
    max_eval = -np.inf
    reset_count = 0


    agent = Agent(action_size)
    torch.save(agent.policy_net.state_dict(), "./save_model/" + name + "_best")
    evaluation_reward = deque(maxlen=evaluation_reward_length)
    frame = 0
    memory_size = 0
    reset_max = 10
    
    print("Determing min/max rewards of environment")
    [low, high] = score_range = get_score_range(name)
    print("Min: %d. Max: %d." % (low, high))

    while (frame < 10000000):
        step = 0
        assert(num_envs * env_mem_size == train_frame)
        frame_next_vals = []
        
        for j in range(env_mem_size):
            
            curr_states = np.stack([envs[i].history[HISTORY_SIZE-1,:,:] for i in range(num_envs)])
            next_states = []
            net_in = np.stack([envs[i].history[:HISTORY_SIZE,:,:] for i in range(num_envs)])
            step += num_envs
            frame += num_envs
            actions, values = agent.get_action(np.float32(net_in) / 255.)
            
            for i in range(num_envs):
                env = envs[i]
                next_state, env.reward, env.done, env.info = env.step(actions[i])
                next_states.append(next_state)
                if (i == vis_env_idx):
                    vis_env._env.render()
            
            for i in range(num_envs):
                env = envs[i]
                """
                next_state, env.reward, env.done, env.info = env.step(actions[i])
                if (i == vis_env_idx):
                    vis_env._env.render()
                """
                
                frame_next_state = get_frame(next_states[i])
                env.history[HISTORY_SIZE,:,:] = frame_next_state
                terminal_state = check_live(env.life, env.info['ale.lives'])
                env.life = env.info['ale.lives']
                r = env.reward #np.log(max(env.reward+1, 1))#((env.reward - low) / (high - low)) * 30
                agent.memory.push(i, deepcopy(curr_states[i]), actions[i], r, terminal_state, values[i], 0, 0)
                
                if (j == env_mem_size-1):
                    net_in = np.stack([envs[k].history[1:,:,:] for k in range(num_envs)])
                    _, frame_next_vals = agent.get_action(np.float32(net_in) / 255.)
                
                env.score += env.reward
                env.history[:HISTORY_SIZE, :, :] = env.history[1:,:,:]
        
                if (env.done):
                    if (e % 50 == 0):
                        print('now time : ', datetime.now())
                        rewards.append(np.mean(evaluation_reward))
                        episodes.append(e)
                        pylab.plot(episodes, rewards, 'b')
                        pylab.savefig("./save_graph/" + name + "_ppo.png")
                        torch.save(agent.policy_net, "./save_model/" + name + "_ppo")

                        if np.mean(evaluation_reward) > max_eval:
                            torch.save(agent.policy_net.state_dict(), "./save_model/"  + name + "_ppo_best")
                            max_eval = float(np.mean(evaluation_reward))
                            reset_count = 0
                        elif e > 5000:
                            reset_count += 1
                            """
                            if (reset_count == reset_max):
                                print("Training went nowhere, starting again at best model")
                                agent.policy_net.load_state_dict(torch.load("./save_model/spaceinvaders_ppo_best"))
                                agent.update_target_net()
                                reset_count = 0
                            """
                    e += 1
                    evaluation_reward.append(env.score)
                    print("episode:", e, "  score:", env.score,  " epsilon:", agent.epsilon, "   steps:", step,
                      " evaluation reward:", np.mean(evaluation_reward))

                    env.done = False
                    env.score = 0
                    env.history = np.zeros([HISTORY_SIZE+1,84,84], dtype=np.uint8)
                    env.state = env.reset()
                    env.life = number_lives
                    get_init_state(env.history, env.state)
            
        agent.train_policy_net(frame, frame_next_vals)
        agent.update_target_net()
    print("FINISHED TRAINING FOR %s" % (name))
    pylab.figure()




 ------- STARTING TRAINING FOR SpaceInvaders-v0 ------- 





  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "


Determing min/max rewards of environment
Min: 0. Max: 200.


  probs = F.softmax(x[:,:self.action_size] - torch.max(x[:,:self.action_size],1)[0].unsqueeze(1))


Training network. lr: 0.000250. clip: 0.100000


  pol_loss += pol_avg.detach().cpu()[0]
  vf_loss += value_loss.detach().cpu()[0]
  ent_total += ent.detach().cpu()[0]


Iteration 1: Policy loss: 0.000286. Value loss: 1.222049. Entropy: 1.385591.
Iteration 2: Policy loss: 0.000138. Value loss: 0.763694. Entropy: 1.384290.
Iteration 3: Policy loss: -0.001526. Value loss: 0.645708. Entropy: 1.384588.
Training network. lr: 0.000250. clip: 0.100000
Iteration 4: Policy loss: 0.000514. Value loss: 1.511997. Entropy: 1.381829.
Iteration 5: Policy loss: -0.000044. Value loss: 0.797335. Entropy: 1.382019.
Iteration 6: Policy loss: -0.002469. Value loss: 0.498602. Entropy: 1.382235.
Training network. lr: 0.000250. clip: 0.100000
Iteration 7: Policy loss: -0.000772. Value loss: 1.050353. Entropy: 1.378190.
Iteration 8: Policy loss: -0.000993. Value loss: 0.600291. Entropy: 1.373755.
Iteration 9: Policy loss: -0.002539. Value loss: 0.334053. Entropy: 1.372794.
now time :  2019-02-28 10:54:34.430414
episode: 1   score: 110.0  epsilon: 1.0    steps: 912  evaluation reward: 110.0


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Training network. lr: 0.000250. clip: 0.100000
Iteration 10: Policy loss: -0.001303. Value loss: 0.652697. Entropy: 1.373592.
Iteration 11: Policy loss: -0.001422. Value loss: 0.266825. Entropy: 1.369634.
Iteration 12: Policy loss: -0.001310. Value loss: 0.164508. Entropy: 1.368400.
episode: 2   score: 75.0  epsilon: 1.0    steps: 440  evaluation reward: 92.5
episode: 3   score: 55.0  epsilon: 1.0    steps: 664  evaluation reward: 80.0
episode: 4   score: 80.0  epsilon: 1.0    steps: 952  evaluation reward: 80.0
Training network. lr: 0.000250. clip: 0.100000
Iteration 13: Policy loss: 0.000085. Value loss: 0.988260. Entropy: 1.367490.
Iteration 14: Policy loss: -0.000047. Value loss: 0.443936. Entropy: 1.370726.
Iteration 15: Policy loss: -0.001396. Value loss: 0.342720. Entropy: 1.373903.
episode: 5   score: 65.0  epsilon: 1.0    steps: 192  evaluation reward: 77.0
Training network. lr: 0.000250. clip: 0.100000
Iteration 16: Policy loss: 0.000355. Value loss: 0.829658. Entropy: 1.3737

Iteration 71: Policy loss: -0.001968. Value loss: 0.307152. Entropy: 1.366166.
Iteration 72: Policy loss: -0.001882. Value loss: 0.190961. Entropy: 1.366615.
episode: 30   score: 165.0  epsilon: 1.0    steps: 240  evaluation reward: 140.83333333333334
episode: 31   score: 55.0  epsilon: 1.0    steps: 832  evaluation reward: 138.06451612903226
episode: 32   score: 135.0  epsilon: 1.0    steps: 1016  evaluation reward: 137.96875
Training network. lr: 0.000250. clip: 0.099853
Iteration 73: Policy loss: -0.001251. Value loss: 0.681423. Entropy: 1.357835.
Iteration 74: Policy loss: -0.000527. Value loss: 0.405220. Entropy: 1.358865.
Iteration 75: Policy loss: -0.003212. Value loss: 0.345408. Entropy: 1.352594.
episode: 33   score: 75.0  epsilon: 1.0    steps: 80  evaluation reward: 136.06060606060606
Training network. lr: 0.000250. clip: 0.099853
Iteration 76: Policy loss: 0.000558. Value loss: 0.782845. Entropy: 1.347416.
Iteration 77: Policy loss: -0.000054. Value loss: 0.286762. Entropy:

Training network. lr: 0.000249. clip: 0.099696
Iteration 130: Policy loss: 0.000529. Value loss: 0.518195. Entropy: 1.337850.
Iteration 131: Policy loss: -0.000903. Value loss: 0.113338. Entropy: 1.337968.
Iteration 132: Policy loss: -0.000621. Value loss: 0.106030. Entropy: 1.341355.
episode: 59   score: 5.0  epsilon: 1.0    steps: 456  evaluation reward: 144.83050847457628
episode: 60   score: 385.0  epsilon: 1.0    steps: 680  evaluation reward: 148.83333333333334
Training network. lr: 0.000249. clip: 0.099696
Iteration 133: Policy loss: 0.001146. Value loss: 1.080609. Entropy: 1.341153.
Iteration 134: Policy loss: -0.000639. Value loss: 0.417108. Entropy: 1.335413.
Iteration 135: Policy loss: -0.000100. Value loss: 0.344107. Entropy: 1.336276.
Training network. lr: 0.000249. clip: 0.099696
Iteration 136: Policy loss: 0.002177. Value loss: 0.949127. Entropy: 1.342453.
Iteration 137: Policy loss: 0.001035. Value loss: 0.397984. Entropy: 1.331897.
Iteration 138: Policy loss: -0.001671

Iteration 192: Policy loss: -0.003038. Value loss: 0.246436. Entropy: 1.252399.
episode: 84   score: 80.0  epsilon: 1.0    steps: 120  evaluation reward: 150.29761904761904
episode: 85   score: 300.0  epsilon: 1.0    steps: 552  evaluation reward: 152.05882352941177
Training network. lr: 0.000249. clip: 0.099548
Iteration 193: Policy loss: 0.000030. Value loss: 0.831781. Entropy: 1.269186.
Iteration 194: Policy loss: -0.004146. Value loss: 0.348322. Entropy: 1.281192.
Iteration 195: Policy loss: -0.004356. Value loss: 0.261501. Entropy: 1.273729.
episode: 86   score: 250.0  epsilon: 1.0    steps: 384  evaluation reward: 153.19767441860466
episode: 87   score: 125.0  epsilon: 1.0    steps: 456  evaluation reward: 152.8735632183908
episode: 88   score: 415.0  epsilon: 1.0    steps: 744  evaluation reward: 155.85227272727272
Training network. lr: 0.000249. clip: 0.099548
Iteration 196: Policy loss: 0.000418. Value loss: 0.587817. Entropy: 1.259564.
Iteration 197: Policy loss: -0.000474. V

episode: 111   score: 530.0  epsilon: 1.0    steps: 984  evaluation reward: 163.4
Training network. lr: 0.000248. clip: 0.099235
Iteration 253: Policy loss: 0.000473. Value loss: 1.006435. Entropy: 1.192834.
Iteration 254: Policy loss: -0.000979. Value loss: 0.314719. Entropy: 1.171571.
Iteration 255: Policy loss: 0.002728. Value loss: 0.217527. Entropy: 1.192246.
episode: 112   score: 235.0  epsilon: 1.0    steps: 776  evaluation reward: 163.65
Training network. lr: 0.000248. clip: 0.099235
Iteration 256: Policy loss: 0.001200. Value loss: 0.764650. Entropy: 1.172814.
Iteration 257: Policy loss: -0.000485. Value loss: 0.265355. Entropy: 1.150887.
Iteration 258: Policy loss: 0.000393. Value loss: 0.165204. Entropy: 1.162391.
episode: 113   score: 210.0  epsilon: 1.0    steps: 336  evaluation reward: 165.4
episode: 114   score: 455.0  epsilon: 1.0    steps: 872  evaluation reward: 169.4
Training network. lr: 0.000248. clip: 0.099235
Iteration 259: Policy loss: -0.001972. Value loss: 0.8

Iteration 313: Policy loss: 0.001698. Value loss: 1.116754. Entropy: 1.136402.
Iteration 314: Policy loss: 0.002262. Value loss: 0.393227. Entropy: 1.128906.
Iteration 315: Policy loss: 0.003017. Value loss: 0.268646. Entropy: 1.132143.
episode: 141   score: 95.0  epsilon: 1.0    steps: 104  evaluation reward: 166.8
episode: 142   score: 235.0  epsilon: 1.0    steps: 608  evaluation reward: 168.1
Training network. lr: 0.000248. clip: 0.099088
Iteration 316: Policy loss: 0.000509. Value loss: 0.799417. Entropy: 1.155182.
Iteration 317: Policy loss: -0.001385. Value loss: 0.430270. Entropy: 1.139876.
Iteration 318: Policy loss: 0.000419. Value loss: 0.266887. Entropy: 1.143153.
Training network. lr: 0.000248. clip: 0.099088
Iteration 319: Policy loss: 0.003320. Value loss: 1.071587. Entropy: 1.121362.
Iteration 320: Policy loss: -0.004040. Value loss: 0.474875. Entropy: 1.164177.
Iteration 321: Policy loss: -0.001975. Value loss: 0.300314. Entropy: 1.161351.
episode: 143   score: 105.0  

Iteration 374: Policy loss: -0.002126. Value loss: 0.323854. Entropy: 1.252575.
Iteration 375: Policy loss: 0.000572. Value loss: 0.157204. Entropy: 1.248767.
episode: 170   score: 95.0  epsilon: 1.0    steps: 88  evaluation reward: 170.0
episode: 171   score: 60.0  epsilon: 1.0    steps: 336  evaluation reward: 169.7
Training network. lr: 0.000247. clip: 0.098931
Iteration 376: Policy loss: 0.000467. Value loss: 0.669806. Entropy: 1.288777.
Iteration 377: Policy loss: -0.004846. Value loss: 0.311671. Entropy: 1.297116.
Iteration 378: Policy loss: -0.003149. Value loss: 0.292329. Entropy: 1.281451.
Training network. lr: 0.000247. clip: 0.098931
Iteration 379: Policy loss: 0.004133. Value loss: 0.864706. Entropy: 1.306637.
Iteration 380: Policy loss: 0.000414. Value loss: 0.420613. Entropy: 1.322720.
Iteration 381: Policy loss: 0.000654. Value loss: 0.299733. Entropy: 1.319386.
episode: 172   score: 665.0  epsilon: 1.0    steps: 72  evaluation reward: 175.0
Training network. lr: 0.00024

Iteration 437: Policy loss: -0.001550. Value loss: 0.346096. Entropy: 1.382690.
Iteration 438: Policy loss: -0.001846. Value loss: 0.272740. Entropy: 1.380611.
Training network. lr: 0.000247. clip: 0.098774
Iteration 439: Policy loss: -0.001222. Value loss: 0.813801. Entropy: 1.381625.
Iteration 440: Policy loss: 0.001537. Value loss: 0.400102. Entropy: 1.381539.
Iteration 441: Policy loss: -0.002603. Value loss: 0.319707. Entropy: 1.383786.
Training network. lr: 0.000247. clip: 0.098774
Iteration 442: Policy loss: 0.000838. Value loss: 1.048621. Entropy: 1.383499.
Iteration 443: Policy loss: 0.001516. Value loss: 0.484455. Entropy: 1.382848.
Iteration 444: Policy loss: -0.000511. Value loss: 0.285935. Entropy: 1.383954.
episode: 197   score: 80.0  epsilon: 1.0    steps: 16  evaluation reward: 178.55
episode: 198   score: 155.0  epsilon: 1.0    steps: 720  evaluation reward: 179.8
episode: 199   score: 25.0  epsilon: 1.0    steps: 928  evaluation reward: 178.65
episode: 200   score: 14

episode: 225   score: 180.0  epsilon: 1.0    steps: 400  evaluation reward: 163.7
episode: 226   score: 155.0  epsilon: 1.0    steps: 904  evaluation reward: 164.05
Training network. lr: 0.000247. clip: 0.098627
Iteration 499: Policy loss: -0.003920. Value loss: 0.658022. Entropy: 1.377845.
Iteration 500: Policy loss: -0.004209. Value loss: 0.342358. Entropy: 1.375747.
Iteration 501: Policy loss: -0.004654. Value loss: 0.295927. Entropy: 1.374444.
Training network. lr: 0.000246. clip: 0.098470
Iteration 502: Policy loss: -0.003378. Value loss: 1.224157. Entropy: 1.371393.
Iteration 503: Policy loss: -0.002440. Value loss: 0.465626. Entropy: 1.374895.
Iteration 504: Policy loss: -0.003081. Value loss: 0.318167. Entropy: 1.374768.
episode: 227   score: 215.0  epsilon: 1.0    steps: 896  evaluation reward: 165.3
Training network. lr: 0.000246. clip: 0.098470
Iteration 505: Policy loss: -0.000556. Value loss: 0.948508. Entropy: 1.370398.
Iteration 506: Policy loss: 0.000848. Value loss: 0.

Training network. lr: 0.000246. clip: 0.098313
Iteration 559: Policy loss: 0.001046. Value loss: 0.789505. Entropy: 1.374625.
Iteration 560: Policy loss: 0.003708. Value loss: 0.348092. Entropy: 1.373293.
Iteration 561: Policy loss: 0.000541. Value loss: 0.302961. Entropy: 1.370317.
episode: 255   score: 180.0  epsilon: 1.0    steps: 400  evaluation reward: 162.5
Training network. lr: 0.000246. clip: 0.098313
Iteration 562: Policy loss: -0.002642. Value loss: 0.646548. Entropy: 1.374341.
Iteration 563: Policy loss: -0.005103. Value loss: 0.306671. Entropy: 1.375334.
Iteration 564: Policy loss: -0.003396. Value loss: 0.198019. Entropy: 1.371187.
episode: 256   score: 135.0  epsilon: 1.0    steps: 96  evaluation reward: 163.75
Training network. lr: 0.000246. clip: 0.098313
Iteration 565: Policy loss: -0.003891. Value loss: 0.904185. Entropy: 1.369268.
Iteration 566: Policy loss: -0.003303. Value loss: 0.511294. Entropy: 1.368344.
Iteration 567: Policy loss: -0.005577. Value loss: 0.35371

Iteration 623: Policy loss: 0.000794. Value loss: 0.548118. Entropy: 1.334084.
Iteration 624: Policy loss: -0.001581. Value loss: 0.356963. Entropy: 1.334742.
episode: 281   score: 330.0  epsilon: 1.0    steps: 824  evaluation reward: 153.45
Training network. lr: 0.000245. clip: 0.098166
Iteration 625: Policy loss: -0.000850. Value loss: 0.560053. Entropy: 1.332955.
Iteration 626: Policy loss: -0.000344. Value loss: 0.320324. Entropy: 1.323729.
Iteration 627: Policy loss: -0.004009. Value loss: 0.206762. Entropy: 1.318530.
episode: 282   score: 180.0  epsilon: 1.0    steps: 592  evaluation reward: 154.95
Training network. lr: 0.000245. clip: 0.098166
Iteration 628: Policy loss: 0.001298. Value loss: 0.645914. Entropy: 1.315797.
Iteration 629: Policy loss: -0.000518. Value loss: 0.284427. Entropy: 1.310604.
Iteration 630: Policy loss: -0.000169. Value loss: 0.200117. Entropy: 1.311901.
episode: 283   score: 105.0  epsilon: 1.0    steps: 248  evaluation reward: 154.2
Training network. lr

Training network. lr: 0.000245. clip: 0.098009
Iteration 685: Policy loss: -0.000775. Value loss: 0.626401. Entropy: 1.348105.
Iteration 686: Policy loss: -0.001441. Value loss: 0.289914. Entropy: 1.345137.
Iteration 687: Policy loss: -0.001302. Value loss: 0.250950. Entropy: 1.342915.
Training network. lr: 0.000245. clip: 0.098009
Iteration 688: Policy loss: -0.002560. Value loss: 0.665067. Entropy: 1.345379.
Iteration 689: Policy loss: -0.003046. Value loss: 0.237568. Entropy: 1.349173.
Iteration 690: Policy loss: -0.004434. Value loss: 0.170077. Entropy: 1.344705.
episode: 309   score: 210.0  epsilon: 1.0    steps: 232  evaluation reward: 161.9
Training network. lr: 0.000245. clip: 0.098009
Iteration 691: Policy loss: 0.001160. Value loss: 0.421941. Entropy: 1.350475.
Iteration 692: Policy loss: -0.002144. Value loss: 0.162802. Entropy: 1.340665.
Iteration 693: Policy loss: 0.000390. Value loss: 0.121355. Entropy: 1.345542.
episode: 310   score: 50.0  epsilon: 1.0    steps: 632  eva

Iteration 747: Policy loss: -0.000359. Value loss: 0.363945. Entropy: 1.383755.
episode: 337   score: 0.0  epsilon: 1.0    steps: 48  evaluation reward: 165.35
episode: 338   score: 240.0  epsilon: 1.0    steps: 264  evaluation reward: 166.45
episode: 339   score: 65.0  epsilon: 1.0    steps: 536  evaluation reward: 165.45
Training network. lr: 0.000245. clip: 0.097853
Iteration 748: Policy loss: 0.001340. Value loss: 0.872315. Entropy: 1.383852.
Iteration 749: Policy loss: 0.000688. Value loss: 0.397908. Entropy: 1.381920.
Iteration 750: Policy loss: -0.001594. Value loss: 0.276667. Entropy: 1.382097.
Training network. lr: 0.000244. clip: 0.097705
Iteration 751: Policy loss: 0.001424. Value loss: 0.765044. Entropy: 1.379500.
Iteration 752: Policy loss: -0.001809. Value loss: 0.311080. Entropy: 1.377003.
Iteration 753: Policy loss: -0.001099. Value loss: 0.233982. Entropy: 1.372923.
episode: 340   score: 105.0  epsilon: 1.0    steps: 624  evaluation reward: 164.95
Training network. lr:

Training network. lr: 0.000244. clip: 0.097549
Iteration 808: Policy loss: -0.001260. Value loss: 0.951850. Entropy: 1.367219.
Iteration 809: Policy loss: -0.000461. Value loss: 0.394570. Entropy: 1.366755.
Iteration 810: Policy loss: -0.002247. Value loss: 0.239307. Entropy: 1.369269.
episode: 366   score: 95.0  epsilon: 1.0    steps: 736  evaluation reward: 157.7
Training network. lr: 0.000244. clip: 0.097549
Iteration 811: Policy loss: 0.000049. Value loss: 0.710114. Entropy: 1.367905.
Iteration 812: Policy loss: -0.002033. Value loss: 0.277859. Entropy: 1.366133.
Iteration 813: Policy loss: -0.003441. Value loss: 0.177242. Entropy: 1.372905.
episode: 367   score: 50.0  epsilon: 1.0    steps: 600  evaluation reward: 155.25
episode: 368   score: 95.0  epsilon: 1.0    steps: 672  evaluation reward: 155.15
episode: 369   score: 310.0  epsilon: 1.0    steps: 760  evaluation reward: 157.5
episode: 370   score: 585.0  epsilon: 1.0    steps: 984  evaluation reward: 162.0
Training network. 

episode: 397   score: 45.0  epsilon: 1.0    steps: 784  evaluation reward: 154.4
Training network. lr: 0.000243. clip: 0.097392
Iteration 868: Policy loss: -0.003115. Value loss: 0.542910. Entropy: 1.362395.
Iteration 869: Policy loss: -0.005983. Value loss: 0.276367. Entropy: 1.364335.
Iteration 870: Policy loss: -0.005644. Value loss: 0.193613. Entropy: 1.357212.
Training network. lr: 0.000243. clip: 0.097392
Iteration 871: Policy loss: -0.000528. Value loss: 0.689646. Entropy: 1.363378.
Iteration 872: Policy loss: -0.002021. Value loss: 0.306134. Entropy: 1.370276.
Iteration 873: Policy loss: -0.004426. Value loss: 0.192729. Entropy: 1.368963.
episode: 398   score: 50.0  epsilon: 1.0    steps: 344  evaluation reward: 153.35
Training network. lr: 0.000243. clip: 0.097392
Iteration 874: Policy loss: -0.002911. Value loss: 0.855241. Entropy: 1.376957.
Iteration 875: Policy loss: -0.002506. Value loss: 0.320400. Entropy: 1.374833.
Iteration 876: Policy loss: -0.003843. Value loss: 0.201

Iteration 928: Policy loss: -0.000200. Value loss: 1.088633. Entropy: 1.355265.
Iteration 929: Policy loss: -0.001267. Value loss: 0.457591. Entropy: 1.340577.
Iteration 930: Policy loss: -0.003059. Value loss: 0.328105. Entropy: 1.346573.
episode: 426   score: 190.0  epsilon: 1.0    steps: 704  evaluation reward: 145.7
episode: 427   score: 95.0  epsilon: 1.0    steps: 928  evaluation reward: 145.1
episode: 428   score: 65.0  epsilon: 1.0    steps: 1024  evaluation reward: 143.65
Training network. lr: 0.000243. clip: 0.097244
Iteration 931: Policy loss: 0.002905. Value loss: 0.904168. Entropy: 1.347106.
Iteration 932: Policy loss: -0.000389. Value loss: 0.373455. Entropy: 1.349141.
Iteration 933: Policy loss: -0.003406. Value loss: 0.255681. Entropy: 1.348151.
Training network. lr: 0.000243. clip: 0.097244
Iteration 934: Policy loss: -0.000778. Value loss: 1.099221. Entropy: 1.353529.
Iteration 935: Policy loss: -0.001188. Value loss: 0.476076. Entropy: 1.355538.
Iteration 936: Policy

Iteration 991: Policy loss: 0.002702. Value loss: 0.720495. Entropy: 1.289119.
Iteration 992: Policy loss: -0.003955. Value loss: 0.259900. Entropy: 1.297074.
Iteration 993: Policy loss: -0.003166. Value loss: 0.181295. Entropy: 1.291531.
episode: 453   score: 140.0  epsilon: 1.0    steps: 280  evaluation reward: 157.2
Training network. lr: 0.000243. clip: 0.097088
Iteration 994: Policy loss: 0.001022. Value loss: 0.774993. Entropy: 1.287341.
Iteration 995: Policy loss: -0.001926. Value loss: 0.370072. Entropy: 1.291892.
Iteration 996: Policy loss: -0.005284. Value loss: 0.205804. Entropy: 1.291524.
episode: 454   score: 15.0  epsilon: 1.0    steps: 352  evaluation reward: 156.45
episode: 455   score: 200.0  epsilon: 1.0    steps: 672  evaluation reward: 157.45
Training network. lr: 0.000243. clip: 0.097088
Iteration 997: Policy loss: 0.003618. Value loss: 1.248585. Entropy: 1.300207.
Iteration 998: Policy loss: 0.002590. Value loss: 0.352318. Entropy: 1.295792.
Iteration 999: Policy l

Iteration 1052: Policy loss: -0.001468. Value loss: 0.272619. Entropy: 1.356586.
Iteration 1053: Policy loss: -0.002085. Value loss: 0.196204. Entropy: 1.364029.
episode: 482   score: 50.0  epsilon: 1.0    steps: 736  evaluation reward: 177.15
Training network. lr: 0.000242. clip: 0.096784
Iteration 1054: Policy loss: -0.003457. Value loss: 1.121577. Entropy: 1.361055.
Iteration 1055: Policy loss: -0.000862. Value loss: 0.505578. Entropy: 1.367444.
Iteration 1056: Policy loss: -0.001633. Value loss: 0.363580. Entropy: 1.359291.
Training network. lr: 0.000242. clip: 0.096784
Iteration 1057: Policy loss: -0.001230. Value loss: 0.740962. Entropy: 1.369733.
Iteration 1058: Policy loss: -0.002336. Value loss: 0.301921. Entropy: 1.365581.
Iteration 1059: Policy loss: -0.003117. Value loss: 0.234890. Entropy: 1.365055.
episode: 483   score: 210.0  epsilon: 1.0    steps: 200  evaluation reward: 178.6
Training network. lr: 0.000242. clip: 0.096784
Iteration 1060: Policy loss: -0.002752. Value l

Iteration 1113: Policy loss: -0.006946. Value loss: 0.231835. Entropy: 1.316263.
Training network. lr: 0.000242. clip: 0.096627
Iteration 1114: Policy loss: 0.000168. Value loss: 0.712648. Entropy: 1.294327.
Iteration 1115: Policy loss: -0.002924. Value loss: 0.292921. Entropy: 1.291376.
Iteration 1116: Policy loss: -0.007690. Value loss: 0.209032. Entropy: 1.284816.
episode: 510   score: 215.0  epsilon: 1.0    steps: 592  evaluation reward: 168.3
episode: 511   score: 30.0  epsilon: 1.0    steps: 680  evaluation reward: 167.1
Training network. lr: 0.000242. clip: 0.096627
Iteration 1117: Policy loss: -0.003313. Value loss: 0.984109. Entropy: 1.276011.
Iteration 1118: Policy loss: -0.007915. Value loss: 0.452388. Entropy: 1.268295.
Iteration 1119: Policy loss: -0.008796. Value loss: 0.222017. Entropy: 1.276121.
episode: 512   score: 150.0  epsilon: 1.0    steps: 80  evaluation reward: 167.95
episode: 513   score: 110.0  epsilon: 1.0    steps: 808  evaluation reward: 167.6
Training netw

Training network. lr: 0.000241. clip: 0.096470
Iteration 1174: Policy loss: -0.000729. Value loss: 0.583836. Entropy: 1.361619.
Iteration 1175: Policy loss: -0.001497. Value loss: 0.380203. Entropy: 1.362444.
Iteration 1176: Policy loss: -0.001489. Value loss: 0.160695. Entropy: 1.357457.
episode: 541   score: 105.0  epsilon: 1.0    steps: 912  evaluation reward: 174.3
episode: 542   score: 110.0  epsilon: 1.0    steps: 928  evaluation reward: 169.5
Training network. lr: 0.000241. clip: 0.096470
Iteration 1177: Policy loss: 0.002407. Value loss: 1.094679. Entropy: 1.357050.
Iteration 1178: Policy loss: 0.000418. Value loss: 0.419157. Entropy: 1.353254.
Iteration 1179: Policy loss: -0.000747. Value loss: 0.303961. Entropy: 1.348730.
episode: 543   score: 110.0  epsilon: 1.0    steps: 224  evaluation reward: 167.75
Training network. lr: 0.000241. clip: 0.096470
Iteration 1180: Policy loss: -0.001556. Value loss: 0.594263. Entropy: 1.349389.
Iteration 1181: Policy loss: -0.004123. Value l

Training network. lr: 0.000241. clip: 0.096323
Iteration 1234: Policy loss: 0.004302. Value loss: 0.804820. Entropy: 1.361942.
Iteration 1235: Policy loss: -0.000481. Value loss: 0.277509. Entropy: 1.366618.
Iteration 1236: Policy loss: -0.000315. Value loss: 0.211536. Entropy: 1.367739.
Training network. lr: 0.000241. clip: 0.096323
Iteration 1237: Policy loss: 0.001526. Value loss: 0.943721. Entropy: 1.362615.
Iteration 1238: Policy loss: -0.001619. Value loss: 0.448170. Entropy: 1.364224.
Iteration 1239: Policy loss: -0.003443. Value loss: 0.231017. Entropy: 1.365386.
Training network. lr: 0.000241. clip: 0.096323
Iteration 1240: Policy loss: -0.003334. Value loss: 0.868149. Entropy: 1.360299.
Iteration 1241: Policy loss: -0.003202. Value loss: 0.360511. Entropy: 1.362844.
Iteration 1242: Policy loss: -0.005978. Value loss: 0.250494. Entropy: 1.362018.
episode: 570   score: 430.0  epsilon: 1.0    steps: 176  evaluation reward: 149.0
episode: 571   score: 105.0  epsilon: 1.0    steps

Iteration 1294: Policy loss: -0.000931. Value loss: 0.677435. Entropy: 1.314547.
Iteration 1295: Policy loss: -0.009079. Value loss: 0.221457. Entropy: 1.316043.
Iteration 1296: Policy loss: -0.009257. Value loss: 0.148009. Entropy: 1.317224.
episode: 599   score: 45.0  epsilon: 1.0    steps: 680  evaluation reward: 154.4
Training network. lr: 0.000240. clip: 0.096166
Iteration 1297: Policy loss: -0.002923. Value loss: 0.810211. Entropy: 1.326381.
Iteration 1298: Policy loss: -0.002526. Value loss: 0.373924. Entropy: 1.318807.
Iteration 1299: Policy loss: -0.003613. Value loss: 0.245648. Entropy: 1.318836.
Training network. lr: 0.000240. clip: 0.096166
Iteration 1300: Policy loss: 0.000909. Value loss: 0.772111. Entropy: 1.320058.
Iteration 1301: Policy loss: -0.002435. Value loss: 0.330619. Entropy: 1.311717.
Iteration 1302: Policy loss: -0.006932. Value loss: 0.227337. Entropy: 1.308321.
episode: 600   score: 275.0  epsilon: 1.0    steps: 328  evaluation reward: 156.05
Training netwo

episode: 629   score: 80.0  epsilon: 1.0    steps: 248  evaluation reward: 142.0
Training network. lr: 0.000240. clip: 0.095862
Iteration 1354: Policy loss: -0.000396. Value loss: 1.252330. Entropy: 1.356126.
Iteration 1355: Policy loss: -0.002690. Value loss: 0.413567. Entropy: 1.353830.
Iteration 1356: Policy loss: -0.007207. Value loss: 0.253690. Entropy: 1.353251.
episode: 630   score: 85.0  epsilon: 1.0    steps: 600  evaluation reward: 140.75
episode: 631   score: 245.0  epsilon: 1.0    steps: 968  evaluation reward: 141.3
Training network. lr: 0.000240. clip: 0.095862
Iteration 1357: Policy loss: -0.001759. Value loss: 0.937263. Entropy: 1.358657.
Iteration 1358: Policy loss: -0.006117. Value loss: 0.298684. Entropy: 1.355706.
Iteration 1359: Policy loss: -0.009658. Value loss: 0.198668. Entropy: 1.359261.
episode: 632   score: 130.0  epsilon: 1.0    steps: 696  evaluation reward: 140.25
episode: 633   score: 90.0  epsilon: 1.0    steps: 864  evaluation reward: 139.6
Training ne

episode: 655   score: 245.0  epsilon: 1.0    steps: 832  evaluation reward: 144.9
Training network. lr: 0.000239. clip: 0.095705
Iteration 1417: Policy loss: -0.002997. Value loss: 0.633337. Entropy: 1.372467.
Iteration 1418: Policy loss: -0.003857. Value loss: 0.253013. Entropy: 1.374032.
Iteration 1419: Policy loss: -0.006299. Value loss: 0.162075. Entropy: 1.373029.
episode: 656   score: 155.0  epsilon: 1.0    steps: 64  evaluation reward: 145.8
episode: 657   score: 150.0  epsilon: 1.0    steps: 104  evaluation reward: 146.85
episode: 658   score: 155.0  epsilon: 1.0    steps: 544  evaluation reward: 147.55
episode: 659   score: 155.0  epsilon: 1.0    steps: 568  evaluation reward: 148.0
Training network. lr: 0.000239. clip: 0.095705
Iteration 1420: Policy loss: 0.002744. Value loss: 0.526709. Entropy: 1.369523.
Iteration 1421: Policy loss: -0.002052. Value loss: 0.287119. Entropy: 1.368962.
Iteration 1422: Policy loss: -0.005941. Value loss: 0.182950. Entropy: 1.374452.
episode: 6

Training network. lr: 0.000239. clip: 0.095549
Iteration 1474: Policy loss: 0.002204. Value loss: 0.682544. Entropy: 1.359805.
Iteration 1475: Policy loss: -0.000407. Value loss: 0.304604. Entropy: 1.357856.
Iteration 1476: Policy loss: -0.005331. Value loss: 0.229893. Entropy: 1.358539.
episode: 688   score: 90.0  epsilon: 1.0    steps: 208  evaluation reward: 129.6
episode: 689   score: 65.0  epsilon: 1.0    steps: 944  evaluation reward: 129.65
Training network. lr: 0.000239. clip: 0.095549
Iteration 1477: Policy loss: 0.000690. Value loss: 0.626612. Entropy: 1.358550.
Iteration 1478: Policy loss: -0.000540. Value loss: 0.290469. Entropy: 1.354664.
Iteration 1479: Policy loss: -0.007977. Value loss: 0.229008. Entropy: 1.360906.
episode: 690   score: 150.0  epsilon: 1.0    steps: 128  evaluation reward: 130.55
episode: 691   score: 50.0  epsilon: 1.0    steps: 448  evaluation reward: 130.15
episode: 692   score: 55.0  epsilon: 1.0    steps: 624  evaluation reward: 129.9
Training netw

Training network. lr: 0.000239. clip: 0.095401
Iteration 1534: Policy loss: -0.000023. Value loss: 0.818349. Entropy: 1.324992.
Iteration 1535: Policy loss: 0.002523. Value loss: 0.382264. Entropy: 1.326054.
Iteration 1536: Policy loss: -0.003798. Value loss: 0.241669. Entropy: 1.329968.
episode: 717   score: 190.0  epsilon: 1.0    steps: 144  evaluation reward: 139.7
episode: 718   score: 15.0  epsilon: 1.0    steps: 480  evaluation reward: 138.65
episode: 719   score: 140.0  epsilon: 1.0    steps: 680  evaluation reward: 139.3
Training network. lr: 0.000239. clip: 0.095401
Iteration 1537: Policy loss: 0.005005. Value loss: 0.587623. Entropy: 1.321669.
Iteration 1538: Policy loss: -0.000920. Value loss: 0.188714. Entropy: 1.349552.
Iteration 1539: Policy loss: -0.002284. Value loss: 0.133977. Entropy: 1.346380.
Training network. lr: 0.000239. clip: 0.095401
Iteration 1540: Policy loss: -0.001463. Value loss: 1.171591. Entropy: 1.338230.
Iteration 1541: Policy loss: 0.000397. Value los

Iteration 1595: Policy loss: -0.003129. Value loss: 0.388333. Entropy: 1.298643.
Iteration 1596: Policy loss: -0.012512. Value loss: 0.216197. Entropy: 1.299206.
episode: 745   score: 210.0  epsilon: 1.0    steps: 40  evaluation reward: 140.75
episode: 746   score: 125.0  epsilon: 1.0    steps: 568  evaluation reward: 139.6
Training network. lr: 0.000238. clip: 0.095245
Iteration 1597: Policy loss: -0.001233. Value loss: 0.674232. Entropy: 1.295094.
Iteration 1598: Policy loss: -0.002748. Value loss: 0.320214. Entropy: 1.304870.
Iteration 1599: Policy loss: -0.006904. Value loss: 0.344363. Entropy: 1.304366.
episode: 747   score: 210.0  epsilon: 1.0    steps: 720  evaluation reward: 138.0
episode: 748   score: 155.0  epsilon: 1.0    steps: 832  evaluation reward: 139.15
Training network. lr: 0.000238. clip: 0.095245
Iteration 1600: Policy loss: 0.000904. Value loss: 0.678397. Entropy: 1.312959.
Iteration 1601: Policy loss: -0.000688. Value loss: 0.246891. Entropy: 1.319874.
Iteration 1

Iteration 1656: Policy loss: -0.003938. Value loss: 0.223098. Entropy: 1.340202.
episode: 773   score: 150.0  epsilon: 1.0    steps: 48  evaluation reward: 149.6
episode: 774   score: 155.0  epsilon: 1.0    steps: 480  evaluation reward: 150.8
Training network. lr: 0.000237. clip: 0.094940
Iteration 1657: Policy loss: -0.000390. Value loss: 0.550783. Entropy: 1.360564.
Iteration 1658: Policy loss: -0.003378. Value loss: 0.216169. Entropy: 1.361124.
Iteration 1659: Policy loss: -0.007884. Value loss: 0.136875. Entropy: 1.362440.
Training network. lr: 0.000237. clip: 0.094940
Iteration 1660: Policy loss: 0.001378. Value loss: 0.585791. Entropy: 1.343158.
Iteration 1661: Policy loss: -0.002169. Value loss: 0.280802. Entropy: 1.340505.
Iteration 1662: Policy loss: -0.006526. Value loss: 0.192432. Entropy: 1.338882.
episode: 775   score: 180.0  epsilon: 1.0    steps: 272  evaluation reward: 151.05
episode: 776   score: 185.0  epsilon: 1.0    steps: 392  evaluation reward: 152.3
Training net

Iteration 1715: Policy loss: -0.002507. Value loss: 0.234068. Entropy: 1.302372.
Iteration 1716: Policy loss: -0.006419. Value loss: 0.158776. Entropy: 1.298220.
episode: 803   score: 80.0  epsilon: 1.0    steps: 96  evaluation reward: 154.15
Training network. lr: 0.000237. clip: 0.094784
Iteration 1717: Policy loss: -0.001366. Value loss: 0.850625. Entropy: 1.303010.
Iteration 1718: Policy loss: 0.001285. Value loss: 0.322301. Entropy: 1.295494.
Iteration 1719: Policy loss: -0.002303. Value loss: 0.193806. Entropy: 1.294569.
episode: 804   score: 170.0  epsilon: 1.0    steps: 1024  evaluation reward: 151.75
Training network. lr: 0.000237. clip: 0.094784
Iteration 1720: Policy loss: 0.000653. Value loss: 0.919566. Entropy: 1.319631.
Iteration 1721: Policy loss: -0.000355. Value loss: 0.355458. Entropy: 1.320275.
Iteration 1722: Policy loss: -0.008320. Value loss: 0.251089. Entropy: 1.318902.
episode: 805   score: 85.0  epsilon: 1.0    steps: 288  evaluation reward: 151.7
episode: 806  

Iteration 1778: Policy loss: 0.001480. Value loss: 0.284956. Entropy: 1.338204.
Iteration 1779: Policy loss: -0.001647. Value loss: 0.184551. Entropy: 1.327842.
episode: 829   score: 260.0  epsilon: 1.0    steps: 472  evaluation reward: 161.25
episode: 830   score: 425.0  epsilon: 1.0    steps: 592  evaluation reward: 165.15
episode: 831   score: 30.0  epsilon: 1.0    steps: 904  evaluation reward: 163.3
Training network. lr: 0.000237. clip: 0.094627
Iteration 1780: Policy loss: 0.001810. Value loss: 0.634982. Entropy: 1.338495.
Iteration 1781: Policy loss: -0.001966. Value loss: 0.224945. Entropy: 1.337461.
Iteration 1782: Policy loss: -0.005415. Value loss: 0.148659. Entropy: 1.335186.
Training network. lr: 0.000237. clip: 0.094627
Iteration 1783: Policy loss: 0.000821. Value loss: 0.965967. Entropy: 1.327319.
Iteration 1784: Policy loss: -0.001085. Value loss: 0.391883. Entropy: 1.326980.
Iteration 1785: Policy loss: -0.005657. Value loss: 0.247772. Entropy: 1.323154.
episode: 832  

Training network. lr: 0.000236. clip: 0.094480
Iteration 1837: Policy loss: -0.001806. Value loss: 0.854738. Entropy: 1.320468.
Iteration 1838: Policy loss: -0.006289. Value loss: 0.439933. Entropy: 1.316164.
Iteration 1839: Policy loss: -0.005270. Value loss: 0.258902. Entropy: 1.316316.
episode: 860   score: 145.0  epsilon: 1.0    steps: 288  evaluation reward: 156.3
episode: 861   score: 485.0  epsilon: 1.0    steps: 808  evaluation reward: 160.05
Training network. lr: 0.000236. clip: 0.094480
Iteration 1840: Policy loss: 0.000900. Value loss: 0.615209. Entropy: 1.314622.
Iteration 1841: Policy loss: -0.004730. Value loss: 0.242442. Entropy: 1.313397.
Iteration 1842: Policy loss: -0.003452. Value loss: 0.216473. Entropy: 1.319900.
episode: 862   score: 35.0  epsilon: 1.0    steps: 280  evaluation reward: 159.3
Training network. lr: 0.000236. clip: 0.094480
Iteration 1843: Policy loss: 0.000255. Value loss: 0.693370. Entropy: 1.310022.
Iteration 1844: Policy loss: -0.005561. Value lo

Iteration 1897: Policy loss: 0.004667. Value loss: 0.824210. Entropy: 1.313796.
Iteration 1898: Policy loss: -0.000555. Value loss: 0.401758. Entropy: 1.321953.
Iteration 1899: Policy loss: -0.001906. Value loss: 0.304641. Entropy: 1.318436.
episode: 889   score: 160.0  epsilon: 1.0    steps: 176  evaluation reward: 175.0
episode: 890   score: 50.0  epsilon: 1.0    steps: 312  evaluation reward: 175.2
Training network. lr: 0.000236. clip: 0.094323
Iteration 1900: Policy loss: 0.001016. Value loss: 0.512544. Entropy: 1.310236.
Iteration 1901: Policy loss: -0.001912. Value loss: 0.285804. Entropy: 1.295277.
Iteration 1902: Policy loss: -0.007892. Value loss: 0.186546. Entropy: 1.289031.
episode: 891   score: 125.0  epsilon: 1.0    steps: 648  evaluation reward: 175.65
Training network. lr: 0.000235. clip: 0.094166
Iteration 1903: Policy loss: -0.000818. Value loss: 0.828988. Entropy: 1.280023.
Iteration 1904: Policy loss: -0.001954. Value loss: 0.350869. Entropy: 1.288988.
Iteration 1905

Iteration 1956: Policy loss: -0.005234. Value loss: 0.189724. Entropy: 1.310729.
Training network. lr: 0.000235. clip: 0.094019
Iteration 1957: Policy loss: -0.001584. Value loss: 0.462215. Entropy: 1.310633.
Iteration 1958: Policy loss: -0.002091. Value loss: 0.247389. Entropy: 1.304306.
Iteration 1959: Policy loss: -0.012918. Value loss: 0.168764. Entropy: 1.309525.
episode: 920   score: 130.0  epsilon: 1.0    steps: 360  evaluation reward: 174.25
Training network. lr: 0.000235. clip: 0.094019
Iteration 1960: Policy loss: 0.000752. Value loss: 0.651202. Entropy: 1.284698.
Iteration 1961: Policy loss: -0.001591. Value loss: 0.364336. Entropy: 1.280014.
Iteration 1962: Policy loss: -0.004456. Value loss: 0.253100. Entropy: 1.285594.
episode: 921   score: 55.0  epsilon: 1.0    steps: 120  evaluation reward: 172.95
episode: 922   score: 105.0  epsilon: 1.0    steps: 800  evaluation reward: 173.2
Training network. lr: 0.000235. clip: 0.094019
Iteration 1963: Policy loss: -0.001868. Value 

episode: 949   score: 210.0  epsilon: 1.0    steps: 336  evaluation reward: 176.15
episode: 950   score: 100.0  epsilon: 1.0    steps: 952  evaluation reward: 176.15
Training network. lr: 0.000235. clip: 0.093862
Iteration 2017: Policy loss: 0.002718. Value loss: 0.452572. Entropy: 1.289137.
Iteration 2018: Policy loss: -0.006222. Value loss: 0.241229. Entropy: 1.289296.
Iteration 2019: Policy loss: -0.001783. Value loss: 0.151413. Entropy: 1.285908.
now time :  2019-02-28 11:18:53.792028
episode: 951   score: 110.0  epsilon: 1.0    steps: 744  evaluation reward: 176.7
episode: 952   score: 100.0  epsilon: 1.0    steps: 776  evaluation reward: 175.9
Training network. lr: 0.000235. clip: 0.093862
Iteration 2020: Policy loss: 0.001319. Value loss: 0.542458. Entropy: 1.263084.
Iteration 2021: Policy loss: -0.003041. Value loss: 0.241717. Entropy: 1.253908.
Iteration 2022: Policy loss: -0.007907. Value loss: 0.125287. Entropy: 1.258021.
episode: 953   score: 155.0  epsilon: 1.0    steps: 7

Training network. lr: 0.000234. clip: 0.093705
Iteration 2074: Policy loss: -0.000560. Value loss: 0.583572. Entropy: 1.306960.
Iteration 2075: Policy loss: -0.011164. Value loss: 0.206945. Entropy: 1.300143.
Iteration 2076: Policy loss: -0.005572. Value loss: 0.189575. Entropy: 1.311672.
episode: 982   score: 225.0  epsilon: 1.0    steps: 264  evaluation reward: 159.05
episode: 983   score: 110.0  epsilon: 1.0    steps: 496  evaluation reward: 158.4
episode: 984   score: 75.0  epsilon: 1.0    steps: 648  evaluation reward: 157.1
episode: 985   score: 50.0  epsilon: 1.0    steps: 976  evaluation reward: 155.95
Training network. lr: 0.000234. clip: 0.093705
Iteration 2077: Policy loss: 0.003525. Value loss: 0.484385. Entropy: 1.321927.
Iteration 2078: Policy loss: 0.001715. Value loss: 0.281212. Entropy: 1.320262.
Iteration 2079: Policy loss: -0.005887. Value loss: 0.178483. Entropy: 1.320707.
Training network. lr: 0.000234. clip: 0.093705
Iteration 2080: Policy loss: 0.001698. Value lo

Iteration 2133: Policy loss: -0.005778. Value loss: 0.219109. Entropy: 1.318961.
episode: 1012   score: 180.0  epsilon: 1.0    steps: 864  evaluation reward: 138.6
Training network. lr: 0.000234. clip: 0.093558
Iteration 2134: Policy loss: 0.002448. Value loss: 0.403256. Entropy: 1.282125.
Iteration 2135: Policy loss: -0.001496. Value loss: 0.183495. Entropy: 1.279609.
Iteration 2136: Policy loss: -0.009051. Value loss: 0.149384. Entropy: 1.282229.
episode: 1013   score: 135.0  epsilon: 1.0    steps: 464  evaluation reward: 138.25
episode: 1014   score: 195.0  epsilon: 1.0    steps: 576  evaluation reward: 138.8
Training network. lr: 0.000234. clip: 0.093558
Iteration 2137: Policy loss: 0.001314. Value loss: 0.442820. Entropy: 1.302869.
Iteration 2138: Policy loss: -0.004115. Value loss: 0.195727. Entropy: 1.302696.
Iteration 2139: Policy loss: -0.009280. Value loss: 0.152301. Entropy: 1.304235.
episode: 1015   score: 135.0  epsilon: 1.0    steps: 8  evaluation reward: 138.9
episode: 1

Training network. lr: 0.000234. clip: 0.093401
Iteration 2191: Policy loss: 0.000009. Value loss: 0.624387. Entropy: 1.268852.
Iteration 2192: Policy loss: -0.006446. Value loss: 0.223863. Entropy: 1.260405.
Iteration 2193: Policy loss: -0.003156. Value loss: 0.112947. Entropy: 1.267585.
episode: 1044   score: 115.0  epsilon: 1.0    steps: 144  evaluation reward: 139.95
episode: 1045   score: 60.0  epsilon: 1.0    steps: 392  evaluation reward: 137.0
episode: 1046   score: 75.0  epsilon: 1.0    steps: 752  evaluation reward: 136.95
episode: 1047   score: 235.0  epsilon: 1.0    steps: 880  evaluation reward: 138.3
Training network. lr: 0.000234. clip: 0.093401
Iteration 2194: Policy loss: -0.002336. Value loss: 0.745587. Entropy: 1.288085.
Iteration 2195: Policy loss: -0.003255. Value loss: 0.267918. Entropy: 1.294272.
Iteration 2196: Policy loss: -0.005605. Value loss: 0.196001. Entropy: 1.290133.
Training network. lr: 0.000234. clip: 0.093401
Iteration 2197: Policy loss: -0.001842. Va

Iteration 2250: Policy loss: -0.012383. Value loss: 0.159802. Entropy: 1.265557.
episode: 1074   score: 185.0  epsilon: 1.0    steps: 112  evaluation reward: 144.45
episode: 1075   score: 75.0  epsilon: 1.0    steps: 272  evaluation reward: 144.9
episode: 1076   score: 305.0  epsilon: 1.0    steps: 320  evaluation reward: 147.45
Training network. lr: 0.000233. clip: 0.093097
Iteration 2251: Policy loss: 0.000604. Value loss: 0.487497. Entropy: 1.279967.
Iteration 2252: Policy loss: 0.003074. Value loss: 0.186318. Entropy: 1.275977.
Iteration 2253: Policy loss: -0.007464. Value loss: 0.130735. Entropy: 1.275476.
episode: 1077   score: 110.0  epsilon: 1.0    steps: 232  evaluation reward: 148.05
episode: 1078   score: 160.0  epsilon: 1.0    steps: 272  evaluation reward: 148.6
episode: 1079   score: 195.0  epsilon: 1.0    steps: 880  evaluation reward: 150.25
Training network. lr: 0.000233. clip: 0.093097
Iteration 2254: Policy loss: 0.003903. Value loss: 0.470680. Entropy: 1.229092.
Ite

episode: 1102   score: 180.0  epsilon: 1.0    steps: 616  evaluation reward: 165.1
episode: 1103   score: 50.0  epsilon: 1.0    steps: 928  evaluation reward: 164.5
Training network. lr: 0.000232. clip: 0.092941
Iteration 2311: Policy loss: -0.000164. Value loss: 0.475623. Entropy: 1.295171.
Iteration 2312: Policy loss: -0.003246. Value loss: 0.196589. Entropy: 1.295372.
Iteration 2313: Policy loss: -0.007347. Value loss: 0.133277. Entropy: 1.291646.
episode: 1104   score: 165.0  epsilon: 1.0    steps: 152  evaluation reward: 164.95
episode: 1105   score: 105.0  epsilon: 1.0    steps: 576  evaluation reward: 164.9
Training network. lr: 0.000232. clip: 0.092941
Iteration 2314: Policy loss: 0.000976. Value loss: 0.529897. Entropy: 1.248711.
Iteration 2315: Policy loss: -0.004861. Value loss: 0.218406. Entropy: 1.244027.
Iteration 2316: Policy loss: -0.008667. Value loss: 0.154751. Entropy: 1.242978.
episode: 1106   score: 235.0  epsilon: 1.0    steps: 752  evaluation reward: 166.65
Train

Iteration 2369: Policy loss: -0.001047. Value loss: 0.341808. Entropy: 1.250067.
Iteration 2370: Policy loss: -0.007383. Value loss: 0.227436. Entropy: 1.255263.
episode: 1133   score: 215.0  epsilon: 1.0    steps: 272  evaluation reward: 171.3
episode: 1134   score: 80.0  epsilon: 1.0    steps: 272  evaluation reward: 167.95
Training network. lr: 0.000232. clip: 0.092784
Iteration 2371: Policy loss: 0.002183. Value loss: 1.030706. Entropy: 1.282869.
Iteration 2372: Policy loss: 0.005611. Value loss: 0.367807. Entropy: 1.286081.
Iteration 2373: Policy loss: -0.003383. Value loss: 0.217569. Entropy: 1.293463.
episode: 1135   score: 135.0  epsilon: 1.0    steps: 240  evaluation reward: 166.8
episode: 1136   score: 355.0  epsilon: 1.0    steps: 344  evaluation reward: 169.7
Training network. lr: 0.000232. clip: 0.092784
Iteration 2374: Policy loss: -0.000906. Value loss: 0.477510. Entropy: 1.266204.
Iteration 2375: Policy loss: -0.002735. Value loss: 0.274481. Entropy: 1.262409.
Iteration

episode: 1164   score: 105.0  epsilon: 1.0    steps: 784  evaluation reward: 174.1
Training network. lr: 0.000232. clip: 0.092636
Iteration 2428: Policy loss: 0.001041. Value loss: 0.459163. Entropy: 1.139717.
Iteration 2429: Policy loss: -0.005870. Value loss: 0.225984. Entropy: 1.152168.
Iteration 2430: Policy loss: -0.008758. Value loss: 0.154809. Entropy: 1.147865.
episode: 1165   score: 180.0  epsilon: 1.0    steps: 480  evaluation reward: 174.05
episode: 1166   score: 315.0  epsilon: 1.0    steps: 832  evaluation reward: 175.9
episode: 1167   score: 125.0  epsilon: 1.0    steps: 1016  evaluation reward: 175.05
Training network. lr: 0.000232. clip: 0.092636
Iteration 2431: Policy loss: 0.000325. Value loss: 0.770926. Entropy: 1.242528.
Iteration 2432: Policy loss: -0.002768. Value loss: 0.249712. Entropy: 1.237276.
Iteration 2433: Policy loss: -0.006589. Value loss: 0.173792. Entropy: 1.250345.
Training network. lr: 0.000232. clip: 0.092636
Iteration 2434: Policy loss: 0.000038. V

Iteration 2487: Policy loss: -0.006041. Value loss: 0.204342. Entropy: 1.186713.
episode: 1194   score: 180.0  epsilon: 1.0    steps: 320  evaluation reward: 174.7
Training network. lr: 0.000231. clip: 0.092480
Iteration 2488: Policy loss: 0.000701. Value loss: 0.558322. Entropy: 1.127030.
Iteration 2489: Policy loss: -0.004058. Value loss: 0.251644. Entropy: 1.125635.
Iteration 2490: Policy loss: -0.008949. Value loss: 0.149678. Entropy: 1.124546.
episode: 1195   score: 180.0  epsilon: 1.0    steps: 48  evaluation reward: 174.4
episode: 1196   score: 210.0  epsilon: 1.0    steps: 160  evaluation reward: 174.7
episode: 1197   score: 225.0  epsilon: 1.0    steps: 520  evaluation reward: 176.2
episode: 1198   score: 75.0  epsilon: 1.0    steps: 952  evaluation reward: 176.15
Training network. lr: 0.000231. clip: 0.092480
Iteration 2491: Policy loss: 0.004505. Value loss: 0.497994. Entropy: 0.965617.
Iteration 2492: Policy loss: -0.003042. Value loss: 0.204847. Entropy: 0.965706.
Iteratio

Iteration 2543: Policy loss: -0.001941. Value loss: 0.321496. Entropy: 1.170330.
Iteration 2544: Policy loss: -0.009177. Value loss: 0.246094. Entropy: 1.177321.
episode: 1227   score: 105.0  epsilon: 1.0    steps: 504  evaluation reward: 167.45
episode: 1228   score: 320.0  epsilon: 1.0    steps: 864  evaluation reward: 167.95
Training network. lr: 0.000231. clip: 0.092323
Iteration 2545: Policy loss: 0.002650. Value loss: 0.425830. Entropy: 1.043717.
Iteration 2546: Policy loss: -0.002223. Value loss: 0.170986. Entropy: 1.049377.
Iteration 2547: Policy loss: -0.008015. Value loss: 0.119544. Entropy: 1.035729.
Training network. lr: 0.000231. clip: 0.092323
Iteration 2548: Policy loss: 0.001697. Value loss: 0.515254. Entropy: 1.119382.
Iteration 2549: Policy loss: -0.006086. Value loss: 0.174971. Entropy: 1.142637.
Iteration 2550: Policy loss: -0.009060. Value loss: 0.163501. Entropy: 1.131822.
episode: 1229   score: 210.0  epsilon: 1.0    steps: 208  evaluation reward: 165.85
Training

Iteration 2603: Policy loss: -0.001392. Value loss: 0.324416. Entropy: 1.201594.
Iteration 2604: Policy loss: -0.013039. Value loss: 0.168627. Entropy: 1.216808.
Training network. lr: 0.000230. clip: 0.092019
Iteration 2605: Policy loss: 0.000144. Value loss: 0.254324. Entropy: 1.210647.
Iteration 2606: Policy loss: -0.004454. Value loss: 0.113295. Entropy: 1.218742.
Iteration 2607: Policy loss: -0.013054. Value loss: 0.101484. Entropy: 1.223998.
Training network. lr: 0.000230. clip: 0.092019
Iteration 2608: Policy loss: -0.001693. Value loss: 0.419016. Entropy: 1.233606.
Iteration 2609: Policy loss: -0.007205. Value loss: 0.249823. Entropy: 1.217410.
Iteration 2610: Policy loss: -0.008863. Value loss: 0.141290. Entropy: 1.219609.
episode: 1256   score: 210.0  epsilon: 1.0    steps: 40  evaluation reward: 173.0
episode: 1257   score: 120.0  epsilon: 1.0    steps: 520  evaluation reward: 170.4
episode: 1258   score: 205.0  epsilon: 1.0    steps: 696  evaluation reward: 170.35
Training n

episode: 1283   score: 215.0  epsilon: 1.0    steps: 208  evaluation reward: 172.5
Training network. lr: 0.000230. clip: 0.091862
Iteration 2665: Policy loss: -0.001740. Value loss: 0.400436. Entropy: 1.260533.
Iteration 2666: Policy loss: -0.004091. Value loss: 0.185406. Entropy: 1.263634.
Iteration 2667: Policy loss: -0.009314. Value loss: 0.107271. Entropy: 1.264649.
episode: 1284   score: 90.0  epsilon: 1.0    steps: 72  evaluation reward: 172.9
episode: 1285   score: 270.0  epsilon: 1.0    steps: 864  evaluation reward: 173.85
Training network. lr: 0.000230. clip: 0.091862
Iteration 2668: Policy loss: 0.002489. Value loss: 0.293182. Entropy: 1.031769.
Iteration 2669: Policy loss: -0.000012. Value loss: 0.125691. Entropy: 1.048906.
Iteration 2670: Policy loss: -0.006235. Value loss: 0.106757. Entropy: 1.050905.
Training network. lr: 0.000230. clip: 0.091862
Iteration 2671: Policy loss: 0.002361. Value loss: 0.263088. Entropy: 1.146973.
Iteration 2672: Policy loss: -0.004291. Value 

episode: 1312   score: 210.0  epsilon: 1.0    steps: 648  evaluation reward: 181.85
Training network. lr: 0.000229. clip: 0.091715
Iteration 2725: Policy loss: -0.001785. Value loss: 0.522143. Entropy: 0.987743.
Iteration 2726: Policy loss: -0.000821. Value loss: 0.261085. Entropy: 0.996644.
Iteration 2727: Policy loss: -0.004758. Value loss: 0.178424. Entropy: 0.991958.
episode: 1313   score: 300.0  epsilon: 1.0    steps: 48  evaluation reward: 183.3
episode: 1314   score: 75.0  epsilon: 1.0    steps: 224  evaluation reward: 183.0
episode: 1315   score: 210.0  epsilon: 1.0    steps: 648  evaluation reward: 182.85
episode: 1316   score: 240.0  epsilon: 1.0    steps: 880  evaluation reward: 183.4
Training network. lr: 0.000229. clip: 0.091715
Iteration 2728: Policy loss: 0.002012. Value loss: 0.427513. Entropy: 0.980130.
Iteration 2729: Policy loss: -0.002068. Value loss: 0.224996. Entropy: 0.976731.
Iteration 2730: Policy loss: 0.002252. Value loss: 0.182339. Entropy: 0.981935.
episode

Iteration 2782: Policy loss: 0.002243. Value loss: 0.858271. Entropy: 1.014449.
Iteration 2783: Policy loss: -0.001711. Value loss: 0.302477. Entropy: 1.034239.
Iteration 2784: Policy loss: -0.001665. Value loss: 0.214917. Entropy: 1.041187.
episode: 1344   score: 245.0  epsilon: 1.0    steps: 1024  evaluation reward: 185.95
Training network. lr: 0.000229. clip: 0.091558
Iteration 2785: Policy loss: -0.000023. Value loss: 0.506636. Entropy: 1.170026.
Iteration 2786: Policy loss: -0.002854. Value loss: 0.208397. Entropy: 1.186180.
Iteration 2787: Policy loss: -0.006462. Value loss: 0.167804. Entropy: 1.174232.
episode: 1345   score: 180.0  epsilon: 1.0    steps: 104  evaluation reward: 186.4
episode: 1346   score: 90.0  epsilon: 1.0    steps: 520  evaluation reward: 186.2
episode: 1347   score: 150.0  epsilon: 1.0    steps: 624  evaluation reward: 186.65
Training network. lr: 0.000229. clip: 0.091558
Iteration 2788: Policy loss: 0.002551. Value loss: 0.604595. Entropy: 1.225888.
Iterati

Training network. lr: 0.000229. clip: 0.091401
Iteration 2842: Policy loss: 0.002673. Value loss: 0.325480. Entropy: 1.114906.
Iteration 2843: Policy loss: -0.005267. Value loss: 0.172732. Entropy: 1.110585.
Iteration 2844: Policy loss: -0.011240. Value loss: 0.109496. Entropy: 1.097995.
Training network. lr: 0.000229. clip: 0.091401
Iteration 2845: Policy loss: 0.000080. Value loss: 0.535498. Entropy: 1.188345.
Iteration 2846: Policy loss: -0.010671. Value loss: 0.272179. Entropy: 1.202419.
Iteration 2847: Policy loss: -0.008407. Value loss: 0.175218. Entropy: 1.186834.
episode: 1374   score: 185.0  epsilon: 1.0    steps: 216  evaluation reward: 195.0
episode: 1375   score: 180.0  epsilon: 1.0    steps: 240  evaluation reward: 195.0
episode: 1376   score: 215.0  epsilon: 1.0    steps: 368  evaluation reward: 195.6
episode: 1377   score: 240.0  epsilon: 1.0    steps: 736  evaluation reward: 193.4
Training network. lr: 0.000229. clip: 0.091401
Iteration 2848: Policy loss: 0.001712. Valu

Iteration 2904: Policy loss: -0.010891. Value loss: 0.135064. Entropy: 1.112640.
now time :  2019-02-28 11:29:35.301455
episode: 1401   score: 210.0  epsilon: 1.0    steps: 88  evaluation reward: 197.95
episode: 1402   score: 425.0  epsilon: 1.0    steps: 136  evaluation reward: 200.65
episode: 1403   score: 180.0  epsilon: 1.0    steps: 416  evaluation reward: 200.35
episode: 1404   score: 150.0  epsilon: 1.0    steps: 808  evaluation reward: 199.7
Training network. lr: 0.000228. clip: 0.091097
Iteration 2905: Policy loss: -0.001320. Value loss: 0.469207. Entropy: 1.163862.
Iteration 2906: Policy loss: -0.002812. Value loss: 0.193867. Entropy: 1.182479.
Iteration 2907: Policy loss: -0.009989. Value loss: 0.158448. Entropy: 1.170676.
Training network. lr: 0.000228. clip: 0.091097
Iteration 2908: Policy loss: 0.004531. Value loss: 0.668384. Entropy: 0.974724.
Iteration 2909: Policy loss: -0.005416. Value loss: 0.361790. Entropy: 0.987171.
Iteration 2910: Policy loss: -0.010389. Value lo

episode: 1429   score: 260.0  epsilon: 1.0    steps: 984  evaluation reward: 207.3
Training network. lr: 0.000227. clip: 0.090941
Iteration 2965: Policy loss: 0.004218. Value loss: 0.946223. Entropy: 1.243033.
Iteration 2966: Policy loss: -0.000439. Value loss: 0.459771. Entropy: 1.235768.
Iteration 2967: Policy loss: 0.000861. Value loss: 0.267526. Entropy: 1.248371.
episode: 1430   score: 155.0  epsilon: 1.0    steps: 352  evaluation reward: 206.7
episode: 1431   score: 135.0  epsilon: 1.0    steps: 488  evaluation reward: 206.7
episode: 1432   score: 215.0  epsilon: 1.0    steps: 920  evaluation reward: 207.8
Training network. lr: 0.000227. clip: 0.090941
Iteration 2968: Policy loss: 0.006963. Value loss: 0.648414. Entropy: 1.186883.
Iteration 2969: Policy loss: 0.003255. Value loss: 0.267639. Entropy: 1.189155.
Iteration 2970: Policy loss: -0.002481. Value loss: 0.177370. Entropy: 1.180650.
episode: 1433   score: 285.0  epsilon: 1.0    steps: 80  evaluation reward: 206.25
episode: 

Training network. lr: 0.000227. clip: 0.090793
Iteration 3022: Policy loss: -0.000181. Value loss: 0.186226. Entropy: 0.980697.
Iteration 3023: Policy loss: -0.003854. Value loss: 0.090089. Entropy: 0.988187.
Iteration 3024: Policy loss: -0.004239. Value loss: 0.065857. Entropy: 0.987363.
Training network. lr: 0.000227. clip: 0.090793
Iteration 3025: Policy loss: -0.001395. Value loss: 0.151987. Entropy: 1.039802.
Iteration 3026: Policy loss: -0.002938. Value loss: 0.094859. Entropy: 1.038723.
Iteration 3027: Policy loss: -0.009791. Value loss: 0.085926. Entropy: 1.040756.
episode: 1461   score: 150.0  epsilon: 1.0    steps: 80  evaluation reward: 204.05
episode: 1462   score: 210.0  epsilon: 1.0    steps: 152  evaluation reward: 204.6
episode: 1463   score: 210.0  epsilon: 1.0    steps: 400  evaluation reward: 205.9
episode: 1464   score: 180.0  epsilon: 1.0    steps: 896  evaluation reward: 205.6
Training network. lr: 0.000227. clip: 0.090793
Iteration 3028: Policy loss: 0.000516. Va

Training network. lr: 0.000227. clip: 0.090637
Iteration 3082: Policy loss: 0.000761. Value loss: 0.214959. Entropy: 0.904182.
Iteration 3083: Policy loss: -0.003918. Value loss: 0.095962. Entropy: 0.909448.
Iteration 3084: Policy loss: -0.006430. Value loss: 0.074259. Entropy: 0.895967.
episode: 1490   score: 255.0  epsilon: 1.0    steps: 688  evaluation reward: 206.3
Training network. lr: 0.000227. clip: 0.090637
Iteration 3085: Policy loss: -0.000201. Value loss: 0.444147. Entropy: 1.228241.
Iteration 3086: Policy loss: -0.003033. Value loss: 0.148844. Entropy: 1.230170.
Iteration 3087: Policy loss: -0.011460. Value loss: 0.123830. Entropy: 1.227246.
episode: 1491   score: 225.0  epsilon: 1.0    steps: 120  evaluation reward: 207.5
episode: 1492   score: 155.0  epsilon: 1.0    steps: 888  evaluation reward: 205.95
Training network. lr: 0.000227. clip: 0.090637
Iteration 3088: Policy loss: 0.005724. Value loss: 0.278818. Entropy: 1.097336.
Iteration 3089: Policy loss: -0.000367. Valu

episode: 1519   score: 100.0  epsilon: 1.0    steps: 272  evaluation reward: 196.95
Training network. lr: 0.000226. clip: 0.090480
Iteration 3142: Policy loss: 0.000753. Value loss: 0.627931. Entropy: 1.168783.
Iteration 3143: Policy loss: 0.003303. Value loss: 0.276884. Entropy: 1.194111.
Iteration 3144: Policy loss: -0.009842. Value loss: 0.162611. Entropy: 1.166379.
episode: 1520   score: 215.0  epsilon: 1.0    steps: 408  evaluation reward: 194.75
episode: 1521   score: 285.0  epsilon: 1.0    steps: 480  evaluation reward: 196.05
episode: 1522   score: 125.0  epsilon: 1.0    steps: 712  evaluation reward: 195.0
Training network. lr: 0.000226. clip: 0.090480
Iteration 3145: Policy loss: -0.003949. Value loss: 0.238251. Entropy: 1.060853.
Iteration 3146: Policy loss: -0.005736. Value loss: 0.109323. Entropy: 1.059068.
Iteration 3147: Policy loss: -0.005815. Value loss: 0.093135. Entropy: 1.071560.
Training network. lr: 0.000226. clip: 0.090480
Iteration 3148: Policy loss: -0.001314. 

Iteration 3199: Policy loss: 0.006730. Value loss: 0.505161. Entropy: 1.264863.
Iteration 3200: Policy loss: -0.004413. Value loss: 0.197122. Entropy: 1.256385.
Iteration 3201: Policy loss: -0.003537. Value loss: 0.114934. Entropy: 1.260308.
now time :  2019-02-28 11:33:11.366713
episode: 1551   score: 210.0  epsilon: 1.0    steps: 480  evaluation reward: 194.9
Training network. lr: 0.000225. clip: 0.090176
Iteration 3202: Policy loss: 0.000425. Value loss: 0.135641. Entropy: 0.968035.
Iteration 3203: Policy loss: -0.011308. Value loss: 0.071903. Entropy: 0.976823.
Iteration 3204: Policy loss: -0.008964. Value loss: 0.060888. Entropy: 0.957264.
Training network. lr: 0.000225. clip: 0.090176
Iteration 3205: Policy loss: 0.001829. Value loss: 0.764505. Entropy: 1.210417.
Iteration 3206: Policy loss: -0.000855. Value loss: 0.276710. Entropy: 1.205506.
Iteration 3207: Policy loss: -0.004153. Value loss: 0.136273. Entropy: 1.223545.
episode: 1552   score: 235.0  epsilon: 1.0    steps: 48  e

Iteration 3260: Policy loss: -0.006640. Value loss: 0.130144. Entropy: 1.000992.
Iteration 3261: Policy loss: -0.007484. Value loss: 0.078056. Entropy: 0.980741.
episode: 1579   score: 90.0  epsilon: 1.0    steps: 624  evaluation reward: 187.85
episode: 1580   score: 240.0  epsilon: 1.0    steps: 904  evaluation reward: 188.45
Training network. lr: 0.000225. clip: 0.090019
Iteration 3262: Policy loss: 0.000018. Value loss: 0.329305. Entropy: 1.133596.
Iteration 3263: Policy loss: 0.001967. Value loss: 0.147876. Entropy: 1.145369.
Iteration 3264: Policy loss: -0.005819. Value loss: 0.107389. Entropy: 1.147061.
episode: 1581   score: 210.0  epsilon: 1.0    steps: 448  evaluation reward: 188.1
Training network. lr: 0.000225. clip: 0.090019
Iteration 3265: Policy loss: 0.002863. Value loss: 0.264493. Entropy: 1.010847.
Iteration 3266: Policy loss: -0.001691. Value loss: 0.117035. Entropy: 1.037885.
Iteration 3267: Policy loss: -0.002488. Value loss: 0.082094. Entropy: 1.034982.
episode: 15

Iteration 3317: Policy loss: -0.002949. Value loss: 0.116649. Entropy: 1.060730.
Iteration 3318: Policy loss: -0.007105. Value loss: 0.089353. Entropy: 1.056739.
episode: 1611   score: 210.0  epsilon: 1.0    steps: 24  evaluation reward: 180.55
Training network. lr: 0.000225. clip: 0.089872
Iteration 3319: Policy loss: 0.002242. Value loss: 0.183096. Entropy: 1.132808.
Iteration 3320: Policy loss: -0.003518. Value loss: 0.096212. Entropy: 1.151816.
Iteration 3321: Policy loss: -0.004602. Value loss: 0.062122. Entropy: 1.140330.
episode: 1612   score: 80.0  epsilon: 1.0    steps: 616  evaluation reward: 180.05
Training network. lr: 0.000225. clip: 0.089872
Iteration 3322: Policy loss: 0.002821. Value loss: 0.409015. Entropy: 1.097686.
Iteration 3323: Policy loss: -0.001472. Value loss: 0.170649. Entropy: 1.102223.
Iteration 3324: Policy loss: -0.009705. Value loss: 0.141543. Entropy: 1.105790.
Training network. lr: 0.000225. clip: 0.089872
Iteration 3325: Policy loss: -0.000903. Value l

episode: 1640   score: 340.0  epsilon: 1.0    steps: 184  evaluation reward: 170.8
episode: 1641   score: 75.0  epsilon: 1.0    steps: 384  evaluation reward: 169.75
episode: 1642   score: 195.0  epsilon: 1.0    steps: 656  evaluation reward: 168.85
episode: 1643   score: 140.0  epsilon: 1.0    steps: 976  evaluation reward: 168.15
Training network. lr: 0.000224. clip: 0.089715
Iteration 3379: Policy loss: 0.005797. Value loss: 0.580047. Entropy: 1.143886.
Iteration 3380: Policy loss: -0.001575. Value loss: 0.449080. Entropy: 1.174261.
Iteration 3381: Policy loss: -0.004633. Value loss: 0.298541. Entropy: 1.145336.
episode: 1644   score: 235.0  epsilon: 1.0    steps: 176  evaluation reward: 169.7
episode: 1645   score: 430.0  epsilon: 1.0    steps: 288  evaluation reward: 171.9
Training network. lr: 0.000224. clip: 0.089715
Iteration 3382: Policy loss: 0.003936. Value loss: 0.335098. Entropy: 1.069512.
Iteration 3383: Policy loss: 0.000534. Value loss: 0.227004. Entropy: 1.062955.
Iter

Iteration 3438: Policy loss: -0.012190. Value loss: 0.116836. Entropy: 1.103320.
episode: 1670   score: 210.0  epsilon: 1.0    steps: 144  evaluation reward: 171.15
episode: 1671   score: 155.0  epsilon: 1.0    steps: 912  evaluation reward: 171.95
Training network. lr: 0.000224. clip: 0.089558
Iteration 3439: Policy loss: -0.002930. Value loss: 0.384771. Entropy: 1.231881.
Iteration 3440: Policy loss: -0.005144. Value loss: 0.120660. Entropy: 1.244924.
Iteration 3441: Policy loss: -0.007744. Value loss: 0.072340. Entropy: 1.233932.
episode: 1672   score: 120.0  epsilon: 1.0    steps: 384  evaluation reward: 171.5
episode: 1673   score: 155.0  epsilon: 1.0    steps: 632  evaluation reward: 168.9
Training network. lr: 0.000224. clip: 0.089558
Iteration 3442: Policy loss: 0.001044. Value loss: 0.250109. Entropy: 1.174413.
Iteration 3443: Policy loss: -0.003751. Value loss: 0.138943. Entropy: 1.186875.
Iteration 3444: Policy loss: -0.008598. Value loss: 0.088729. Entropy: 1.174120.
episod

Iteration 3495: Policy loss: -0.005079. Value loss: 0.137035. Entropy: 1.197548.
episode: 1702   score: 125.0  epsilon: 1.0    steps: 768  evaluation reward: 172.3
Training network. lr: 0.000224. clip: 0.089411
Iteration 3496: Policy loss: 0.000631. Value loss: 0.391856. Entropy: 1.175730.
Iteration 3497: Policy loss: -0.005180. Value loss: 0.188867. Entropy: 1.192509.
Iteration 3498: Policy loss: -0.011210. Value loss: 0.103833. Entropy: 1.186079.
episode: 1703   score: 210.0  epsilon: 1.0    steps: 424  evaluation reward: 173.6
episode: 1704   score: 155.0  epsilon: 1.0    steps: 944  evaluation reward: 174.1
Training network. lr: 0.000224. clip: 0.089411
Iteration 3499: Policy loss: 0.001387. Value loss: 0.428607. Entropy: 1.149793.
Iteration 3500: Policy loss: -0.001778. Value loss: 0.190704. Entropy: 1.170422.
Iteration 3501: Policy loss: -0.005241. Value loss: 0.091081. Entropy: 1.150065.
episode: 1705   score: 185.0  epsilon: 1.0    steps: 208  evaluation reward: 174.15
episode:

Iteration 3555: Policy loss: -0.007262. Value loss: 0.171582. Entropy: 1.249376.
episode: 1731   score: 210.0  epsilon: 1.0    steps: 528  evaluation reward: 178.65
episode: 1732   score: 165.0  epsilon: 1.0    steps: 824  evaluation reward: 179.05
episode: 1733   score: 410.0  epsilon: 1.0    steps: 1008  evaluation reward: 182.0
Training network. lr: 0.000223. clip: 0.089097
Iteration 3556: Policy loss: 0.000870. Value loss: 0.595472. Entropy: 1.232334.
Iteration 3557: Policy loss: -0.007530. Value loss: 0.230360. Entropy: 1.237109.
Iteration 3558: Policy loss: -0.009260. Value loss: 0.160344. Entropy: 1.246895.
episode: 1734   score: 225.0  epsilon: 1.0    steps: 288  evaluation reward: 181.65
Training network. lr: 0.000223. clip: 0.089097
Iteration 3559: Policy loss: 0.002522. Value loss: 0.604139. Entropy: 1.170772.
Iteration 3560: Policy loss: -0.004524. Value loss: 0.301450. Entropy: 1.165966.
Iteration 3561: Policy loss: -0.004975. Value loss: 0.227379. Entropy: 1.162426.
episo

Iteration 3614: Policy loss: -0.005441. Value loss: 0.173728. Entropy: 1.253505.
Iteration 3615: Policy loss: -0.011186. Value loss: 0.122017. Entropy: 1.250794.
episode: 1761   score: 150.0  epsilon: 1.0    steps: 224  evaluation reward: 174.75
episode: 1762   score: 225.0  epsilon: 1.0    steps: 456  evaluation reward: 175.5
Training network. lr: 0.000222. clip: 0.088950
Iteration 3616: Policy loss: 0.003682. Value loss: 0.387901. Entropy: 1.196748.
Iteration 3617: Policy loss: -0.005420. Value loss: 0.184445. Entropy: 1.204869.
Iteration 3618: Policy loss: -0.011145. Value loss: 0.114163. Entropy: 1.214288.
episode: 1763   score: 240.0  epsilon: 1.0    steps: 88  evaluation reward: 177.3
episode: 1764   score: 120.0  epsilon: 1.0    steps: 376  evaluation reward: 176.1
episode: 1765   score: 220.0  epsilon: 1.0    steps: 728  evaluation reward: 174.0
Training network. lr: 0.000222. clip: 0.088950
Iteration 3619: Policy loss: -0.002003. Value loss: 0.329598. Entropy: 1.010892.
Iterat

Iteration 3673: Policy loss: 0.000276. Value loss: 0.459375. Entropy: 1.039181.
Iteration 3674: Policy loss: -0.004438. Value loss: 0.179339. Entropy: 1.051869.
Iteration 3675: Policy loss: -0.011163. Value loss: 0.121242. Entropy: 1.041191.
Training network. lr: 0.000222. clip: 0.088793
Iteration 3676: Policy loss: 0.001226. Value loss: 0.978065. Entropy: 1.197445.
Iteration 3677: Policy loss: -0.003815. Value loss: 0.392159. Entropy: 1.202302.
Iteration 3678: Policy loss: -0.006941. Value loss: 0.218294. Entropy: 1.196681.
episode: 1791   score: 130.0  epsilon: 1.0    steps: 336  evaluation reward: 176.5
episode: 1792   score: 210.0  epsilon: 1.0    steps: 880  evaluation reward: 176.8
Training network. lr: 0.000222. clip: 0.088793
Iteration 3679: Policy loss: 0.001728. Value loss: 0.586342. Entropy: 1.244392.
Iteration 3680: Policy loss: -0.003255. Value loss: 0.331352. Entropy: 1.240584.
Iteration 3681: Policy loss: -0.009042. Value loss: 0.321018. Entropy: 1.237737.
episode: 1793 

Training network. lr: 0.000222. clip: 0.088637
Iteration 3733: Policy loss: 0.002418. Value loss: 0.239138. Entropy: 1.184422.
Iteration 3734: Policy loss: -0.007297. Value loss: 0.092812. Entropy: 1.192322.
Iteration 3735: Policy loss: -0.009495. Value loss: 0.059722. Entropy: 1.194072.
episode: 1820   score: 105.0  epsilon: 1.0    steps: 64  evaluation reward: 185.2
episode: 1821   score: 235.0  epsilon: 1.0    steps: 800  evaluation reward: 181.9
episode: 1822   score: 125.0  epsilon: 1.0    steps: 944  evaluation reward: 182.1
Training network. lr: 0.000222. clip: 0.088637
Iteration 3736: Policy loss: 0.002876. Value loss: 0.382786. Entropy: 1.210971.
Iteration 3737: Policy loss: 0.000515. Value loss: 0.117699. Entropy: 1.207609.
Iteration 3738: Policy loss: -0.006644. Value loss: 0.089553. Entropy: 1.213346.
Training network. lr: 0.000222. clip: 0.088637
Iteration 3739: Policy loss: -0.001535. Value loss: 0.377500. Entropy: 1.147359.
Iteration 3740: Policy loss: 0.000577. Value lo

Iteration 3795: Policy loss: -0.001244. Value loss: 0.188201. Entropy: 1.253059.
episode: 1847   score: 105.0  epsilon: 1.0    steps: 512  evaluation reward: 187.4
Training network. lr: 0.000221. clip: 0.088489
Iteration 3796: Policy loss: 0.004084. Value loss: 0.884965. Entropy: 1.269662.
Iteration 3797: Policy loss: -0.003662. Value loss: 0.333934. Entropy: 1.270410.
Iteration 3798: Policy loss: -0.008881. Value loss: 0.201821. Entropy: 1.266114.
episode: 1848   score: 145.0  epsilon: 1.0    steps: 576  evaluation reward: 187.8
episode: 1849   score: 255.0  epsilon: 1.0    steps: 688  evaluation reward: 188.25
Training network. lr: 0.000221. clip: 0.088489
Iteration 3799: Policy loss: 0.001538. Value loss: 0.440977. Entropy: 1.226239.
Iteration 3800: Policy loss: -0.004714. Value loss: 0.238382. Entropy: 1.215071.
Iteration 3801: Policy loss: -0.008197. Value loss: 0.157620. Entropy: 1.213645.
episode: 1850   score: 185.0  epsilon: 1.0    steps: 448  evaluation reward: 188.55
now tim

episode: 1875   score: 170.0  epsilon: 1.0    steps: 240  evaluation reward: 188.9
Training network. lr: 0.000220. clip: 0.088176
Iteration 3856: Policy loss: 0.004750. Value loss: 0.411492. Entropy: 1.242201.
Iteration 3857: Policy loss: -0.007791. Value loss: 0.179661. Entropy: 1.233829.
Iteration 3858: Policy loss: -0.010251. Value loss: 0.120403. Entropy: 1.237140.
episode: 1876   score: 155.0  epsilon: 1.0    steps: 64  evaluation reward: 189.05
episode: 1877   score: 155.0  epsilon: 1.0    steps: 488  evaluation reward: 189.05
Training network. lr: 0.000220. clip: 0.088176
Iteration 3859: Policy loss: -0.002844. Value loss: 0.502824. Entropy: 1.165790.
Iteration 3860: Policy loss: -0.010626. Value loss: 0.205618. Entropy: 1.170525.
Iteration 3861: Policy loss: -0.011585. Value loss: 0.147830. Entropy: 1.181963.
episode: 1878   score: 180.0  epsilon: 1.0    steps: 160  evaluation reward: 186.9
episode: 1879   score: 185.0  epsilon: 1.0    steps: 864  evaluation reward: 187.2
Train

Iteration 3916: Policy loss: 0.002186. Value loss: 1.160649. Entropy: 1.258114.
Iteration 3917: Policy loss: 0.002723. Value loss: 0.519727. Entropy: 1.256294.
Iteration 3918: Policy loss: -0.006899. Value loss: 0.346512. Entropy: 1.257146.
episode: 1904   score: 90.0  epsilon: 1.0    steps: 176  evaluation reward: 189.35
Training network. lr: 0.000220. clip: 0.088028
Iteration 3919: Policy loss: 0.004795. Value loss: 0.442489. Entropy: 1.218318.
Iteration 3920: Policy loss: -0.002904. Value loss: 0.211697. Entropy: 1.217895.
Iteration 3921: Policy loss: -0.008074. Value loss: 0.134013. Entropy: 1.224160.
episode: 1905   score: 410.0  epsilon: 1.0    steps: 432  evaluation reward: 192.2
Training network. lr: 0.000220. clip: 0.088028
Iteration 3922: Policy loss: -0.001148. Value loss: 0.370151. Entropy: 1.159202.
Iteration 3923: Policy loss: -0.006955. Value loss: 0.218539. Entropy: 1.177886.
Iteration 3924: Policy loss: -0.012219. Value loss: 0.140137. Entropy: 1.165099.
episode: 1906 

episode: 1931   score: 180.0  epsilon: 1.0    steps: 992  evaluation reward: 197.8
Training network. lr: 0.000220. clip: 0.087872
Iteration 3979: Policy loss: 0.004592. Value loss: 0.388608. Entropy: 1.125445.
Iteration 3980: Policy loss: -0.003295. Value loss: 0.254880. Entropy: 1.126695.
Iteration 3981: Policy loss: -0.010845. Value loss: 0.166936. Entropy: 1.141471.
episode: 1932   score: 290.0  epsilon: 1.0    steps: 440  evaluation reward: 199.7
episode: 1933   score: 210.0  epsilon: 1.0    steps: 928  evaluation reward: 199.65
Training network. lr: 0.000220. clip: 0.087872
Iteration 3982: Policy loss: -0.001629. Value loss: 0.431528. Entropy: 1.225965.
Iteration 3983: Policy loss: 0.000386. Value loss: 0.188512. Entropy: 1.222856.
Iteration 3984: Policy loss: -0.008391. Value loss: 0.155959. Entropy: 1.216903.
episode: 1934   score: 210.0  epsilon: 1.0    steps: 912  evaluation reward: 199.8
episode: 1935   score: 215.0  epsilon: 1.0    steps: 952  evaluation reward: 200.7
Traini

Iteration 4040: Policy loss: -0.002780. Value loss: 0.280594. Entropy: 1.240640.
Iteration 4041: Policy loss: -0.006241. Value loss: 0.204831. Entropy: 1.241088.
episode: 1958   score: 230.0  epsilon: 1.0    steps: 256  evaluation reward: 202.55
episode: 1959   score: 225.0  epsilon: 1.0    steps: 408  evaluation reward: 203.2
episode: 1960   score: 240.0  epsilon: 1.0    steps: 976  evaluation reward: 203.6
Training network. lr: 0.000219. clip: 0.087715
Iteration 4042: Policy loss: -0.000309. Value loss: 0.398325. Entropy: 1.187351.
Iteration 4043: Policy loss: -0.003976. Value loss: 0.173269. Entropy: 1.189807.
Iteration 4044: Policy loss: -0.014186. Value loss: 0.121435. Entropy: 1.185416.
episode: 1961   score: 165.0  epsilon: 1.0    steps: 400  evaluation reward: 204.0
episode: 1962   score: 430.0  epsilon: 1.0    steps: 944  evaluation reward: 205.75
Training network. lr: 0.000219. clip: 0.087715
Iteration 4045: Policy loss: 0.000168. Value loss: 0.675083. Entropy: 1.064833.
Iter

Iteration 4101: Policy loss: -0.004497. Value loss: 0.120975. Entropy: 1.217266.
Training network. lr: 0.000219. clip: 0.087411
Iteration 4102: Policy loss: -0.001465. Value loss: 0.301849. Entropy: 1.149184.
Iteration 4103: Policy loss: -0.001267. Value loss: 0.146936. Entropy: 1.160625.
Iteration 4104: Policy loss: -0.011104. Value loss: 0.087083. Entropy: 1.152055.
episode: 1986   score: 215.0  epsilon: 1.0    steps: 256  evaluation reward: 218.2
episode: 1987   score: 210.0  epsilon: 1.0    steps: 560  evaluation reward: 218.75
episode: 1988   score: 185.0  epsilon: 1.0    steps: 848  evaluation reward: 218.75
Training network. lr: 0.000219. clip: 0.087411
Iteration 4105: Policy loss: -0.000835. Value loss: 0.647624. Entropy: 1.267611.
Iteration 4106: Policy loss: -0.000891. Value loss: 0.220839. Entropy: 1.257390.
Iteration 4107: Policy loss: -0.007037. Value loss: 0.181673. Entropy: 1.257969.
episode: 1989   score: 160.0  epsilon: 1.0    steps: 104  evaluation reward: 218.85
epis

Iteration 4159: Policy loss: -0.000414. Value loss: 0.265203. Entropy: 1.145553.
Iteration 4160: Policy loss: -0.004165. Value loss: 0.131320. Entropy: 1.154055.
Iteration 4161: Policy loss: -0.008977. Value loss: 0.091495. Entropy: 1.164698.
episode: 2017   score: 105.0  epsilon: 1.0    steps: 16  evaluation reward: 205.1
episode: 2018   score: 320.0  epsilon: 1.0    steps: 520  evaluation reward: 205.65
Training network. lr: 0.000218. clip: 0.087254
Iteration 4162: Policy loss: 0.000880. Value loss: 0.223426. Entropy: 1.074050.
Iteration 4163: Policy loss: 0.002731. Value loss: 0.106002. Entropy: 1.061554.
Iteration 4164: Policy loss: -0.006381. Value loss: 0.088201. Entropy: 1.077466.
episode: 2019   score: 125.0  epsilon: 1.0    steps: 656  evaluation reward: 205.75
Training network. lr: 0.000218. clip: 0.087254
Iteration 4165: Policy loss: 0.004479. Value loss: 0.391821. Entropy: 1.025317.
Iteration 4166: Policy loss: -0.000614. Value loss: 0.176060. Entropy: 1.059421.
Iteration 4

episode: 2043   score: 210.0  epsilon: 1.0    steps: 600  evaluation reward: 206.15
episode: 2044   score: 150.0  epsilon: 1.0    steps: 960  evaluation reward: 206.3
Training network. lr: 0.000218. clip: 0.087107
Iteration 4222: Policy loss: 0.003156. Value loss: 0.478317. Entropy: 1.252066.
Iteration 4223: Policy loss: 0.000841. Value loss: 0.211136. Entropy: 1.250282.
Iteration 4224: Policy loss: -0.010254. Value loss: 0.176464. Entropy: 1.251277.
episode: 2045   score: 535.0  epsilon: 1.0    steps: 208  evaluation reward: 209.35
Training network. lr: 0.000218. clip: 0.087107
Iteration 4225: Policy loss: 0.002783. Value loss: 0.509731. Entropy: 1.110202.
Iteration 4226: Policy loss: 0.003637. Value loss: 0.215188. Entropy: 1.120080.
Iteration 4227: Policy loss: -0.008045. Value loss: 0.098103. Entropy: 1.144962.
episode: 2046   score: 80.0  epsilon: 1.0    steps: 64  evaluation reward: 208.35
episode: 2047   score: 400.0  epsilon: 1.0    steps: 544  evaluation reward: 209.2
episode:

episode: 2068   score: 275.0  epsilon: 1.0    steps: 592  evaluation reward: 216.85
episode: 2069   score: 110.0  epsilon: 1.0    steps: 760  evaluation reward: 213.45
episode: 2070   score: 200.0  epsilon: 1.0    steps: 896  evaluation reward: 214.15
Training network. lr: 0.000217. clip: 0.086950
Iteration 4285: Policy loss: 0.001327. Value loss: 0.388712. Entropy: 1.239011.
Iteration 4286: Policy loss: -0.003044. Value loss: 0.141756. Entropy: 1.253736.
Iteration 4287: Policy loss: -0.007224. Value loss: 0.104643. Entropy: 1.247569.
episode: 2071   score: 210.0  epsilon: 1.0    steps: 496  evaluation reward: 215.8
episode: 2072   score: 265.0  epsilon: 1.0    steps: 944  evaluation reward: 215.75
Training network. lr: 0.000217. clip: 0.086950
Iteration 4288: Policy loss: 0.000856. Value loss: 0.471737. Entropy: 1.070927.
Iteration 4289: Policy loss: -0.002105. Value loss: 0.239450. Entropy: 1.061016.
Iteration 4290: Policy loss: -0.004999. Value loss: 0.144419. Entropy: 1.060395.
epi

episode: 2097   score: 180.0  epsilon: 1.0    steps: 800  evaluation reward: 218.35
Training network. lr: 0.000217. clip: 0.086793
Iteration 4345: Policy loss: -0.001771. Value loss: 0.319339. Entropy: 1.082403.
Iteration 4346: Policy loss: -0.004887. Value loss: 0.167570. Entropy: 1.081522.
Iteration 4347: Policy loss: -0.011997. Value loss: 0.118184. Entropy: 1.080193.
Training network. lr: 0.000217. clip: 0.086793
Iteration 4348: Policy loss: 0.003691. Value loss: 0.293848. Entropy: 1.058756.
Iteration 4349: Policy loss: -0.000736. Value loss: 0.168564. Entropy: 1.054152.
Iteration 4350: Policy loss: -0.003523. Value loss: 0.118875. Entropy: 1.054814.
episode: 2098   score: 185.0  epsilon: 1.0    steps: 952  evaluation reward: 219.4
Training network. lr: 0.000217. clip: 0.086646
Iteration 4351: Policy loss: 0.003274. Value loss: 0.578425. Entropy: 1.168058.
Iteration 4352: Policy loss: -0.004703. Value loss: 0.237596. Entropy: 1.181554.
Iteration 4353: Policy loss: -0.013821. Value 

episode: 2123   score: 180.0  epsilon: 1.0    steps: 296  evaluation reward: 232.75
episode: 2124   score: 125.0  epsilon: 1.0    steps: 744  evaluation reward: 232.95
Training network. lr: 0.000216. clip: 0.086489
Iteration 4408: Policy loss: 0.004326. Value loss: 0.156788. Entropy: 1.083164.
Iteration 4409: Policy loss: -0.003209. Value loss: 0.097910. Entropy: 1.105498.
Iteration 4410: Policy loss: -0.005288. Value loss: 0.084441. Entropy: 1.089868.
episode: 2125   score: 225.0  epsilon: 1.0    steps: 544  evaluation reward: 233.55
Training network. lr: 0.000216. clip: 0.086489
Iteration 4411: Policy loss: 0.004305. Value loss: 0.316119. Entropy: 1.080277.
Iteration 4412: Policy loss: -0.004743. Value loss: 0.133884. Entropy: 1.079658.
Iteration 4413: Policy loss: -0.010063. Value loss: 0.085613. Entropy: 1.083554.
Training network. lr: 0.000216. clip: 0.086489
Iteration 4414: Policy loss: -0.002098. Value loss: 0.234125. Entropy: 1.092173.
Iteration 4415: Policy loss: -0.008443. Va

episode: 2155   score: 245.0  epsilon: 1.0    steps: 992  evaluation reward: 231.95
Training network. lr: 0.000216. clip: 0.086333
Iteration 4465: Policy loss: 0.000612. Value loss: 0.186224. Entropy: 1.110901.
Iteration 4466: Policy loss: -0.008043. Value loss: 0.103401. Entropy: 1.109765.
Iteration 4467: Policy loss: -0.011621. Value loss: 0.069353. Entropy: 1.090835.
Training network. lr: 0.000216. clip: 0.086333
Iteration 4468: Policy loss: -0.000607. Value loss: 0.164695. Entropy: 1.025967.
Iteration 4469: Policy loss: -0.000473. Value loss: 0.084917. Entropy: 1.038958.
Iteration 4470: Policy loss: -0.008883. Value loss: 0.072015. Entropy: 1.031981.
episode: 2156   score: 155.0  epsilon: 1.0    steps: 560  evaluation reward: 229.3
Training network. lr: 0.000216. clip: 0.086333
Iteration 4471: Policy loss: 0.004162. Value loss: 0.279813. Entropy: 1.129211.
Iteration 4472: Policy loss: -0.000855. Value loss: 0.131872. Entropy: 1.142217.
Iteration 4473: Policy loss: -0.009488. Value 

Training network. lr: 0.000215. clip: 0.086185
Iteration 4525: Policy loss: -0.002933. Value loss: 0.147156. Entropy: 1.183154.
Iteration 4526: Policy loss: -0.005039. Value loss: 0.091292. Entropy: 1.161746.
Iteration 4527: Policy loss: -0.009570. Value loss: 0.058340. Entropy: 1.181210.
episode: 2184   score: 290.0  epsilon: 1.0    steps: 176  evaluation reward: 213.55
episode: 2185   score: 110.0  epsilon: 1.0    steps: 200  evaluation reward: 212.0
episode: 2186   score: 210.0  epsilon: 1.0    steps: 352  evaluation reward: 211.35
episode: 2187   score: 215.0  epsilon: 1.0    steps: 352  evaluation reward: 211.85
episode: 2188   score: 180.0  epsilon: 1.0    steps: 904  evaluation reward: 211.35
Training network. lr: 0.000215. clip: 0.086185
Iteration 4528: Policy loss: -0.000894. Value loss: 0.127638. Entropy: 1.117127.
Iteration 4529: Policy loss: -0.003683. Value loss: 0.085545. Entropy: 1.132381.
Iteration 4530: Policy loss: -0.002870. Value loss: 0.065947. Entropy: 1.117768.
T

episode: 2213   score: 120.0  epsilon: 1.0    steps: 384  evaluation reward: 207.25
Training network. lr: 0.000215. clip: 0.086029
Iteration 4585: Policy loss: -0.000411. Value loss: 0.136483. Entropy: 1.132328.
Iteration 4586: Policy loss: -0.007915. Value loss: 0.073003. Entropy: 1.132553.
Iteration 4587: Policy loss: -0.011044. Value loss: 0.055949. Entropy: 1.120632.
Training network. lr: 0.000215. clip: 0.086029
Iteration 4588: Policy loss: 0.000099. Value loss: 0.157968. Entropy: 1.086442.
Iteration 4589: Policy loss: -0.004116. Value loss: 0.067870. Entropy: 1.069150.
Iteration 4590: Policy loss: -0.011293. Value loss: 0.052887. Entropy: 1.081526.
episode: 2214   score: 210.0  epsilon: 1.0    steps: 608  evaluation reward: 203.85
Training network. lr: 0.000215. clip: 0.086029
Iteration 4591: Policy loss: 0.002150. Value loss: 0.473272. Entropy: 1.245147.
Iteration 4592: Policy loss: -0.003300. Value loss: 0.235633. Entropy: 1.249520.
Iteration 4593: Policy loss: -0.009327. Value

Training network. lr: 0.000215. clip: 0.085872
Iteration 4648: Policy loss: 0.004011. Value loss: 0.484751. Entropy: 1.051421.
Iteration 4649: Policy loss: 0.000892. Value loss: 0.215547. Entropy: 1.060176.
Iteration 4650: Policy loss: -0.002643. Value loss: 0.156665. Entropy: 1.058939.
episode: 2239   score: 335.0  epsilon: 1.0    steps: 80  evaluation reward: 217.3
Training network. lr: 0.000214. clip: 0.085724
Iteration 4651: Policy loss: 0.002925. Value loss: 0.489848. Entropy: 1.144253.
Iteration 4652: Policy loss: -0.000291. Value loss: 0.277133. Entropy: 1.131928.
Iteration 4653: Policy loss: -0.005496. Value loss: 0.177928. Entropy: 1.134448.
episode: 2240   score: 540.0  epsilon: 1.0    steps: 440  evaluation reward: 221.7
episode: 2241   score: 470.0  epsilon: 1.0    steps: 816  evaluation reward: 224.3
episode: 2242   score: 155.0  epsilon: 1.0    steps: 968  evaluation reward: 223.0
Training network. lr: 0.000214. clip: 0.085724
Iteration 4654: Policy loss: 0.001322. Value 

episode: 2268   score: 180.0  epsilon: 1.0    steps: 744  evaluation reward: 225.2
Training network. lr: 0.000214. clip: 0.085568
Iteration 4708: Policy loss: 0.001949. Value loss: 0.227561. Entropy: 0.952493.
Iteration 4709: Policy loss: -0.005782. Value loss: 0.099842. Entropy: 0.951780.
Iteration 4710: Policy loss: -0.009060. Value loss: 0.085038. Entropy: 0.967907.
episode: 2269   score: 490.0  epsilon: 1.0    steps: 112  evaluation reward: 228.0
episode: 2270   score: 215.0  epsilon: 1.0    steps: 144  evaluation reward: 229.7
episode: 2271   score: 180.0  epsilon: 1.0    steps: 560  evaluation reward: 229.1
episode: 2272   score: 180.0  epsilon: 1.0    steps: 568  evaluation reward: 229.8
Training network. lr: 0.000214. clip: 0.085568
Iteration 4711: Policy loss: 0.001535. Value loss: 0.097554. Entropy: 0.958496.
Iteration 4712: Policy loss: -0.004322. Value loss: 0.064363. Entropy: 0.946010.
Iteration 4713: Policy loss: -0.007895. Value loss: 0.047035. Entropy: 0.958882.
episode

episode: 2298   score: 210.0  epsilon: 1.0    steps: 456  evaluation reward: 241.35
episode: 2299   score: 125.0  epsilon: 1.0    steps: 752  evaluation reward: 238.95
episode: 2300   score: 285.0  epsilon: 1.0    steps: 848  evaluation reward: 240.25
Training network. lr: 0.000214. clip: 0.085411
Iteration 4768: Policy loss: 0.001481. Value loss: 0.228065. Entropy: 0.953028.
Iteration 4769: Policy loss: 0.006435. Value loss: 0.140593. Entropy: 0.929651.
Iteration 4770: Policy loss: -0.006685. Value loss: 0.111304. Entropy: 0.939685.
Training network. lr: 0.000214. clip: 0.085411
Iteration 4771: Policy loss: 0.006473. Value loss: 0.227518. Entropy: 0.966597.
Iteration 4772: Policy loss: -0.001326. Value loss: 0.135456. Entropy: 0.947103.
Iteration 4773: Policy loss: 0.000641. Value loss: 0.110332. Entropy: 0.968463.
Training network. lr: 0.000214. clip: 0.085411
Iteration 4774: Policy loss: 0.002582. Value loss: 0.319791. Entropy: 0.932696.
Iteration 4775: Policy loss: -0.004378. Value

Training network. lr: 0.000213. clip: 0.085264
Iteration 4828: Policy loss: -0.001441. Value loss: 0.242515. Entropy: 1.169582.
Iteration 4829: Policy loss: -0.004070. Value loss: 0.137590. Entropy: 1.176031.
Iteration 4830: Policy loss: -0.007015. Value loss: 0.107048. Entropy: 1.174515.
Training network. lr: 0.000213. clip: 0.085264
Iteration 4831: Policy loss: 0.002672. Value loss: 0.280542. Entropy: 0.958003.
Iteration 4832: Policy loss: -0.005973. Value loss: 0.139358. Entropy: 0.957092.
Iteration 4833: Policy loss: -0.011499. Value loss: 0.103756. Entropy: 0.952482.
episode: 2327   score: 250.0  epsilon: 1.0    steps: 240  evaluation reward: 235.95
Training network. lr: 0.000213. clip: 0.085264
Iteration 4834: Policy loss: 0.001070. Value loss: 0.298442. Entropy: 1.226607.
Iteration 4835: Policy loss: -0.005065. Value loss: 0.123276. Entropy: 1.231098.
Iteration 4836: Policy loss: -0.011977. Value loss: 0.103050. Entropy: 1.227541.
episode: 2328   score: 285.0  epsilon: 1.0    st

Iteration 4889: Policy loss: -0.006096. Value loss: 0.173250. Entropy: 1.159200.
Iteration 4890: Policy loss: -0.008434. Value loss: 0.110718. Entropy: 1.147243.
episode: 2354   score: 285.0  epsilon: 1.0    steps: 824  evaluation reward: 227.1
Training network. lr: 0.000213. clip: 0.085107
Iteration 4891: Policy loss: 0.003717. Value loss: 0.627423. Entropy: 1.107727.
Iteration 4892: Policy loss: 0.005244. Value loss: 0.246718. Entropy: 1.114467.
Iteration 4893: Policy loss: -0.001690. Value loss: 0.146916. Entropy: 1.121567.
episode: 2355   score: 135.0  epsilon: 1.0    steps: 424  evaluation reward: 226.3
episode: 2356   score: 290.0  epsilon: 1.0    steps: 944  evaluation reward: 227.65
Training network. lr: 0.000213. clip: 0.085107
Iteration 4894: Policy loss: 0.000781. Value loss: 0.328296. Entropy: 1.052584.
Iteration 4895: Policy loss: -0.006797. Value loss: 0.136041. Entropy: 1.017880.
Iteration 4896: Policy loss: -0.008499. Value loss: 0.092080. Entropy: 1.032302.
episode: 23

Iteration 4948: Policy loss: -0.000397. Value loss: 0.204386. Entropy: 1.006564.
Iteration 4949: Policy loss: -0.003879. Value loss: 0.108573. Entropy: 1.003976.
Iteration 4950: Policy loss: -0.004572. Value loss: 0.090634. Entropy: 1.009753.
Training network. lr: 0.000212. clip: 0.084803
Iteration 4951: Policy loss: 0.003086. Value loss: 0.285920. Entropy: 0.936759.
Iteration 4952: Policy loss: -0.002070. Value loss: 0.106549. Entropy: 0.946663.
Iteration 4953: Policy loss: -0.006374. Value loss: 0.080601. Entropy: 0.945268.
episode: 2384   score: 105.0  epsilon: 1.0    steps: 88  evaluation reward: 224.8
episode: 2385   score: 210.0  epsilon: 1.0    steps: 432  evaluation reward: 225.7
Training network. lr: 0.000212. clip: 0.084803
Iteration 4954: Policy loss: 0.006236. Value loss: 0.310371. Entropy: 1.143196.
Iteration 4955: Policy loss: -0.001063. Value loss: 0.134028. Entropy: 1.123294.
Iteration 4956: Policy loss: -0.007448. Value loss: 0.098190. Entropy: 1.141445.
episode: 2386 

Iteration 5010: Policy loss: -0.009834. Value loss: 0.139024. Entropy: 1.155841.
episode: 2411   score: 475.0  epsilon: 1.0    steps: 488  evaluation reward: 223.5
episode: 2412   score: 215.0  epsilon: 1.0    steps: 528  evaluation reward: 220.5
episode: 2413   score: 65.0  epsilon: 1.0    steps: 640  evaluation reward: 219.05
episode: 2414   score: 365.0  epsilon: 1.0    steps: 656  evaluation reward: 220.3
Training network. lr: 0.000212. clip: 0.084646
Iteration 5011: Policy loss: -0.001926. Value loss: 0.798971. Entropy: 1.111579.
Iteration 5012: Policy loss: 0.006663. Value loss: 0.396555. Entropy: 1.102578.
Iteration 5013: Policy loss: -0.002001. Value loss: 0.300258. Entropy: 1.104289.
episode: 2415   score: 210.0  epsilon: 1.0    steps: 264  evaluation reward: 218.3
Training network. lr: 0.000212. clip: 0.084646
Iteration 5014: Policy loss: 0.001153. Value loss: 0.394182. Entropy: 0.959859.
Iteration 5015: Policy loss: -0.002688. Value loss: 0.212522. Entropy: 0.976735.
Iterati

Training network. lr: 0.000211. clip: 0.084489
Iteration 5068: Policy loss: 0.005155. Value loss: 0.216336. Entropy: 1.048087.
Iteration 5069: Policy loss: 0.000107. Value loss: 0.127403. Entropy: 1.055177.
Iteration 5070: Policy loss: -0.007636. Value loss: 0.113916. Entropy: 1.050346.
episode: 2443   score: 125.0  epsilon: 1.0    steps: 512  evaluation reward: 220.4
Training network. lr: 0.000211. clip: 0.084489
Iteration 5071: Policy loss: 0.003516. Value loss: 0.172053. Entropy: 1.021794.
Iteration 5072: Policy loss: -0.003937. Value loss: 0.098719. Entropy: 1.039135.
Iteration 5073: Policy loss: -0.005514. Value loss: 0.079200. Entropy: 1.017911.
Training network. lr: 0.000211. clip: 0.084489
Iteration 5074: Policy loss: -0.000075. Value loss: 0.170141. Entropy: 1.075956.
Iteration 5075: Policy loss: -0.012022. Value loss: 0.104278. Entropy: 1.087101.
Iteration 5076: Policy loss: -0.014965. Value loss: 0.081608. Entropy: 1.067583.
episode: 2444   score: 285.0  epsilon: 1.0    step

Iteration 5129: Policy loss: -0.006368. Value loss: 0.108818. Entropy: 0.929003.
Iteration 5130: Policy loss: -0.012226. Value loss: 0.078963. Entropy: 0.955459.
episode: 2470   score: 555.0  epsilon: 1.0    steps: 832  evaluation reward: 228.45
Training network. lr: 0.000211. clip: 0.084342
Iteration 5131: Policy loss: 0.005272. Value loss: 0.389115. Entropy: 1.273751.
Iteration 5132: Policy loss: -0.000241. Value loss: 0.191859. Entropy: 1.266476.
Iteration 5133: Policy loss: -0.007315. Value loss: 0.125458. Entropy: 1.268777.
episode: 2471   score: 135.0  epsilon: 1.0    steps: 224  evaluation reward: 227.35
episode: 2472   score: 215.0  epsilon: 1.0    steps: 536  evaluation reward: 227.55
Training network. lr: 0.000211. clip: 0.084342
Iteration 5134: Policy loss: 0.001845. Value loss: 0.207579. Entropy: 0.996889.
Iteration 5135: Policy loss: -0.002184. Value loss: 0.119898. Entropy: 1.000967.
Iteration 5136: Policy loss: -0.006789. Value loss: 0.080777. Entropy: 1.006735.
episode:

episode: 2497   score: 260.0  epsilon: 1.0    steps: 1008  evaluation reward: 239.75
Training network. lr: 0.000210. clip: 0.084185
Iteration 5191: Policy loss: 0.004655. Value loss: 0.310764. Entropy: 1.142440.
Iteration 5192: Policy loss: 0.001168. Value loss: 0.142649. Entropy: 1.141174.
Iteration 5193: Policy loss: -0.010287. Value loss: 0.106750. Entropy: 1.143909.
episode: 2498   score: 440.0  epsilon: 1.0    steps: 152  evaluation reward: 243.5
episode: 2499   score: 160.0  epsilon: 1.0    steps: 152  evaluation reward: 239.5
Training network. lr: 0.000210. clip: 0.084185
Iteration 5194: Policy loss: 0.001669. Value loss: 0.767335. Entropy: 1.167140.
Iteration 5195: Policy loss: 0.007241. Value loss: 0.359048. Entropy: 1.182105.
Iteration 5196: Policy loss: -0.002034. Value loss: 0.321369. Entropy: 1.155654.
episode: 2500   score: 410.0  epsilon: 1.0    steps: 608  evaluation reward: 241.45
Training network. lr: 0.000210. clip: 0.084185
Iteration 5197: Policy loss: -0.001075. Va

episode: 2526   score: 210.0  epsilon: 1.0    steps: 880  evaluation reward: 234.85
episode: 2527   score: 60.0  epsilon: 1.0    steps: 1016  evaluation reward: 233.25
Training network. lr: 0.000210. clip: 0.083881
Iteration 5251: Policy loss: 0.008371. Value loss: 0.566654. Entropy: 1.077175.
Iteration 5252: Policy loss: 0.002120. Value loss: 0.242826. Entropy: 1.082974.
Iteration 5253: Policy loss: -0.008013. Value loss: 0.174058. Entropy: 1.071349.
episode: 2528   score: 280.0  epsilon: 1.0    steps: 520  evaluation reward: 233.8
Training network. lr: 0.000210. clip: 0.083881
Iteration 5254: Policy loss: -0.002721. Value loss: 0.297507. Entropy: 1.172815.
Iteration 5255: Policy loss: -0.001332. Value loss: 0.199302. Entropy: 1.163072.
Iteration 5256: Policy loss: -0.008685. Value loss: 0.114042. Entropy: 1.173658.
Training network. lr: 0.000210. clip: 0.083881
Iteration 5257: Policy loss: 0.002142. Value loss: 0.201231. Entropy: 1.109357.
Iteration 5258: Policy loss: -0.006282. Valu

episode: 2555   score: 240.0  epsilon: 1.0    steps: 200  evaluation reward: 230.5
episode: 2556   score: 210.0  epsilon: 1.0    steps: 584  evaluation reward: 230.5
episode: 2557   score: 80.0  epsilon: 1.0    steps: 680  evaluation reward: 229.0
episode: 2558   score: 210.0  epsilon: 1.0    steps: 824  evaluation reward: 229.0
episode: 2559   score: 215.0  epsilon: 1.0    steps: 912  evaluation reward: 229.05
Training network. lr: 0.000209. clip: 0.083725
Iteration 5311: Policy loss: 0.001877. Value loss: 0.166179. Entropy: 1.226921.
Iteration 5312: Policy loss: -0.005874. Value loss: 0.103841. Entropy: 1.232088.
Iteration 5313: Policy loss: -0.014658. Value loss: 0.101684. Entropy: 1.226968.
Training network. lr: 0.000209. clip: 0.083725
Iteration 5314: Policy loss: 0.002428. Value loss: 0.259768. Entropy: 0.914609.
Iteration 5315: Policy loss: -0.003615. Value loss: 0.154687. Entropy: 0.932195.
Iteration 5316: Policy loss: -0.007159. Value loss: 0.103951. Entropy: 0.936207.
episode

Iteration 5368: Policy loss: 0.001338. Value loss: 0.266879. Entropy: 1.218491.
Iteration 5369: Policy loss: -0.004451. Value loss: 0.131698. Entropy: 1.214640.
Iteration 5370: Policy loss: -0.010557. Value loss: 0.097440. Entropy: 1.208637.
episode: 2587   score: 240.0  epsilon: 1.0    steps: 272  evaluation reward: 223.15
episode: 2588   score: 320.0  epsilon: 1.0    steps: 416  evaluation reward: 224.25
Training network. lr: 0.000209. clip: 0.083568
Iteration 5371: Policy loss: 0.003546. Value loss: 0.128818. Entropy: 1.068481.
Iteration 5372: Policy loss: -0.006878. Value loss: 0.083964. Entropy: 1.089044.
Iteration 5373: Policy loss: -0.010154. Value loss: 0.068159. Entropy: 1.096841.
Training network. lr: 0.000209. clip: 0.083568
Iteration 5374: Policy loss: -0.000031. Value loss: 0.159398. Entropy: 1.069943.
Iteration 5375: Policy loss: -0.005476. Value loss: 0.090234. Entropy: 1.070771.
Iteration 5376: Policy loss: -0.012637. Value loss: 0.064876. Entropy: 1.068917.
episode: 25

Iteration 5428: Policy loss: -0.001347. Value loss: 0.150638. Entropy: 1.045026.
Iteration 5429: Policy loss: -0.002843. Value loss: 0.078809. Entropy: 1.028677.
Iteration 5430: Policy loss: -0.009282. Value loss: 0.072893. Entropy: 1.038443.
episode: 2616   score: 120.0  epsilon: 1.0    steps: 32  evaluation reward: 219.3
episode: 2617   score: 210.0  epsilon: 1.0    steps: 680  evaluation reward: 218.9
Training network. lr: 0.000209. clip: 0.083420
Iteration 5431: Policy loss: 0.001958. Value loss: 0.148015. Entropy: 0.970703.
Iteration 5432: Policy loss: -0.004153. Value loss: 0.085482. Entropy: 0.979679.
Iteration 5433: Policy loss: -0.010692. Value loss: 0.060229. Entropy: 0.957435.
episode: 2618   score: 150.0  epsilon: 1.0    steps: 328  evaluation reward: 217.25
Training network. lr: 0.000209. clip: 0.083420
Iteration 5434: Policy loss: 0.002195. Value loss: 0.350356. Entropy: 1.083582.
Iteration 5435: Policy loss: -0.002838. Value loss: 0.170484. Entropy: 1.085251.
Iteration 5

Iteration 5492: Policy loss: -0.002178. Value loss: 0.151831. Entropy: 1.125117.
Iteration 5493: Policy loss: -0.009754. Value loss: 0.096005. Entropy: 1.110528.
episode: 2641   score: 405.0  epsilon: 1.0    steps: 200  evaluation reward: 224.6
Training network. lr: 0.000208. clip: 0.083264
Iteration 5494: Policy loss: 0.001989. Value loss: 0.661824. Entropy: 1.021438.
Iteration 5495: Policy loss: -0.001633. Value loss: 0.357019. Entropy: 1.032638.
Iteration 5496: Policy loss: -0.005961. Value loss: 0.285728. Entropy: 1.045770.
episode: 2642   score: 180.0  epsilon: 1.0    steps: 544  evaluation reward: 225.35
Training network. lr: 0.000208. clip: 0.083264
Iteration 5497: Policy loss: -0.000980. Value loss: 0.484533. Entropy: 1.137987.
Iteration 5498: Policy loss: -0.006931. Value loss: 0.289217. Entropy: 1.123938.
Iteration 5499: Policy loss: -0.011614. Value loss: 0.225796. Entropy: 1.142329.
episode: 2643   score: 260.0  epsilon: 1.0    steps: 448  evaluation reward: 225.85
episode:

episode: 2668   score: 475.0  epsilon: 1.0    steps: 424  evaluation reward: 243.8
episode: 2669   score: 235.0  epsilon: 1.0    steps: 424  evaluation reward: 244.05
episode: 2670   score: 195.0  epsilon: 1.0    steps: 872  evaluation reward: 244.75
Training network. lr: 0.000207. clip: 0.082960
Iteration 5554: Policy loss: 0.003433. Value loss: 0.228128. Entropy: 0.984967.
Iteration 5555: Policy loss: -0.004832. Value loss: 0.113707. Entropy: 0.961515.
Iteration 5556: Policy loss: -0.011981. Value loss: 0.105950. Entropy: 0.960430.
Training network. lr: 0.000207. clip: 0.082960
Iteration 5557: Policy loss: 0.002340. Value loss: 0.241329. Entropy: 0.863714.
Iteration 5558: Policy loss: -0.003660. Value loss: 0.121355. Entropy: 0.877637.
Iteration 5559: Policy loss: -0.010955. Value loss: 0.089426. Entropy: 0.860076.
Training network. lr: 0.000207. clip: 0.082960
Iteration 5560: Policy loss: 0.005525. Value loss: 0.823739. Entropy: 1.151801.
Iteration 5561: Policy loss: 0.001239. Value

Iteration 5616: Policy loss: -0.003828. Value loss: 0.211160. Entropy: 1.076724.
episode: 2695   score: 155.0  epsilon: 1.0    steps: 216  evaluation reward: 253.5
episode: 2696   score: 560.0  epsilon: 1.0    steps: 344  evaluation reward: 257.0
episode: 2697   score: 230.0  epsilon: 1.0    steps: 720  evaluation reward: 256.6
Training network. lr: 0.000207. clip: 0.082803
Iteration 5617: Policy loss: 0.006465. Value loss: 0.510989. Entropy: 0.942785.
Iteration 5618: Policy loss: 0.003391. Value loss: 0.247873. Entropy: 0.959760.
Iteration 5619: Policy loss: -0.006116. Value loss: 0.199333. Entropy: 0.957349.
episode: 2698   score: 415.0  epsilon: 1.0    steps: 816  evaluation reward: 258.65
Training network. lr: 0.000207. clip: 0.082803
Iteration 5620: Policy loss: 0.000430. Value loss: 0.342280. Entropy: 0.964649.
Iteration 5621: Policy loss: 0.000732. Value loss: 0.184751. Entropy: 0.978143.
Iteration 5622: Policy loss: -0.006383. Value loss: 0.152424. Entropy: 0.966108.
episode: 2

Iteration 5676: Policy loss: -0.006852. Value loss: 0.120772. Entropy: 1.138310.
Training network. lr: 0.000207. clip: 0.082646
Iteration 5677: Policy loss: 0.001882. Value loss: 0.194483. Entropy: 0.864522.
Iteration 5678: Policy loss: -0.002346. Value loss: 0.086024. Entropy: 0.863822.
Iteration 5679: Policy loss: -0.008856. Value loss: 0.063235. Entropy: 0.871391.
episode: 2724   score: 210.0  epsilon: 1.0    steps: 816  evaluation reward: 258.8
Training network. lr: 0.000207. clip: 0.082646
Iteration 5680: Policy loss: 0.000797. Value loss: 0.276330. Entropy: 1.044468.
Iteration 5681: Policy loss: -0.005454. Value loss: 0.117076. Entropy: 1.023476.
Iteration 5682: Policy loss: -0.016348. Value loss: 0.100280. Entropy: 1.069551.
episode: 2725   score: 215.0  epsilon: 1.0    steps: 560  evaluation reward: 258.85
episode: 2726   score: 305.0  epsilon: 1.0    steps: 856  evaluation reward: 260.1
Training network. lr: 0.000207. clip: 0.082646
Iteration 5683: Policy loss: 0.006313. Value

Iteration 5739: Policy loss: -0.017732. Value loss: 0.080312. Entropy: 1.142750.
Training network. lr: 0.000206. clip: 0.082499
Iteration 5740: Policy loss: 0.000462. Value loss: 0.283313. Entropy: 1.118068.
Iteration 5741: Policy loss: -0.002525. Value loss: 0.120279. Entropy: 1.082295.
Iteration 5742: Policy loss: -0.011542. Value loss: 0.097398. Entropy: 1.090813.
Training network. lr: 0.000206. clip: 0.082499
Iteration 5743: Policy loss: 0.007972. Value loss: 0.937901. Entropy: 1.175616.
Iteration 5744: Policy loss: 0.005941. Value loss: 0.464076. Entropy: 1.173471.
Iteration 5745: Policy loss: 0.004398. Value loss: 0.265517. Entropy: 1.187358.
episode: 2750   score: 370.0  epsilon: 1.0    steps: 200  evaluation reward: 250.75
now time :  2019-02-28 12:03:54.207323
episode: 2751   score: 460.0  epsilon: 1.0    steps: 624  evaluation reward: 253.8
episode: 2752   score: 395.0  epsilon: 1.0    steps: 928  evaluation reward: 255.35
Training network. lr: 0.000206. clip: 0.082499
Iterat

Iteration 5802: Policy loss: -0.010755. Value loss: 0.108426. Entropy: 1.161700.
episode: 2775   score: 240.0  epsilon: 1.0    steps: 160  evaluation reward: 264.45
episode: 2776   score: 215.0  epsilon: 1.0    steps: 168  evaluation reward: 264.05
episode: 2777   score: 220.0  epsilon: 1.0    steps: 496  evaluation reward: 261.3
episode: 2778   score: 290.0  epsilon: 1.0    steps: 536  evaluation reward: 262.05
episode: 2779   score: 345.0  epsilon: 1.0    steps: 584  evaluation reward: 263.4
Training network. lr: 0.000205. clip: 0.082185
Iteration 5803: Policy loss: 0.001716. Value loss: 0.240913. Entropy: 0.973510.
Iteration 5804: Policy loss: -0.004053. Value loss: 0.127286. Entropy: 0.981254.
Iteration 5805: Policy loss: -0.008371. Value loss: 0.095484. Entropy: 0.970890.
Training network. lr: 0.000205. clip: 0.082185
Iteration 5806: Policy loss: 0.003066. Value loss: 0.118472. Entropy: 0.851631.
Iteration 5807: Policy loss: -0.005664. Value loss: 0.083441. Entropy: 0.851036.
Iter

Iteration 5862: Policy loss: 0.002405. Value loss: 0.227855. Entropy: 0.953943.
episode: 2804   score: 225.0  epsilon: 1.0    steps: 872  evaluation reward: 258.85
episode: 2805   score: 120.0  epsilon: 1.0    steps: 880  evaluation reward: 257.95
Training network. lr: 0.000205. clip: 0.082038
Iteration 5863: Policy loss: -0.000114. Value loss: 0.162628. Entropy: 1.013819.
Iteration 5864: Policy loss: -0.004847. Value loss: 0.072654. Entropy: 1.026659.
Iteration 5865: Policy loss: -0.008556. Value loss: 0.055937. Entropy: 1.030482.
episode: 2806   score: 415.0  epsilon: 1.0    steps: 960  evaluation reward: 257.6
Training network. lr: 0.000205. clip: 0.082038
Iteration 5866: Policy loss: -0.001030. Value loss: 0.293687. Entropy: 0.945384.
Iteration 5867: Policy loss: -0.006950. Value loss: 0.138323. Entropy: 0.942663.
Iteration 5868: Policy loss: -0.010731. Value loss: 0.095319. Entropy: 0.946417.
episode: 2807   score: 125.0  epsilon: 1.0    steps: 920  evaluation reward: 257.0
Traini

Iteration 5925: Policy loss: -0.009737. Value loss: 0.094563. Entropy: 1.194897.
episode: 2830   score: 210.0  epsilon: 1.0    steps: 344  evaluation reward: 261.85
Training network. lr: 0.000205. clip: 0.081881
Iteration 5926: Policy loss: 0.007060. Value loss: 0.707462. Entropy: 1.195698.
Iteration 5927: Policy loss: 0.001168. Value loss: 0.272476. Entropy: 1.202683.
Iteration 5928: Policy loss: -0.000272. Value loss: 0.217801. Entropy: 1.200408.
episode: 2831   score: 260.0  epsilon: 1.0    steps: 32  evaluation reward: 260.25
episode: 2832   score: 250.0  epsilon: 1.0    steps: 184  evaluation reward: 261.65
episode: 2833   score: 265.0  epsilon: 1.0    steps: 232  evaluation reward: 261.6
Training network. lr: 0.000205. clip: 0.081881
Iteration 5929: Policy loss: 0.001185. Value loss: 0.219822. Entropy: 1.051885.
Iteration 5930: Policy loss: -0.008738. Value loss: 0.119909. Entropy: 1.048384.
Iteration 5931: Policy loss: -0.010422. Value loss: 0.106625. Entropy: 1.031251.
episode:

Training network. lr: 0.000204. clip: 0.081725
Iteration 5986: Policy loss: 0.001504. Value loss: 0.324335. Entropy: 1.167762.
Iteration 5987: Policy loss: 0.003182. Value loss: 0.132005. Entropy: 1.173423.
Iteration 5988: Policy loss: -0.007470. Value loss: 0.094723. Entropy: 1.172518.
Training network. lr: 0.000204. clip: 0.081725
Iteration 5989: Policy loss: 0.003929. Value loss: 0.154410. Entropy: 0.900909.
Iteration 5990: Policy loss: -0.006917. Value loss: 0.067720. Entropy: 0.876932.
Iteration 5991: Policy loss: -0.009116. Value loss: 0.062392. Entropy: 0.905743.
episode: 2859   score: 185.0  epsilon: 1.0    steps: 144  evaluation reward: 257.35
Training network. lr: 0.000204. clip: 0.081725
Iteration 5992: Policy loss: 0.002105. Value loss: 0.280570. Entropy: 1.134702.
Iteration 5993: Policy loss: -0.003963. Value loss: 0.112884. Entropy: 1.101055.
Iteration 5994: Policy loss: -0.008649. Value loss: 0.084064. Entropy: 1.144722.
episode: 2860   score: 270.0  epsilon: 1.0    step

episode: 2885   score: 155.0  epsilon: 1.0    steps: 448  evaluation reward: 250.2
episode: 2886   score: 320.0  epsilon: 1.0    steps: 504  evaluation reward: 250.35
episode: 2887   score: 565.0  epsilon: 1.0    steps: 848  evaluation reward: 253.7
Training network. lr: 0.000204. clip: 0.081577
Iteration 6049: Policy loss: 0.000801. Value loss: 0.445579. Entropy: 1.061063.
Iteration 6050: Policy loss: -0.001308. Value loss: 0.214091. Entropy: 1.042053.
Iteration 6051: Policy loss: -0.006349. Value loss: 0.196517. Entropy: 1.064472.
episode: 2888   score: 320.0  epsilon: 1.0    steps: 80  evaluation reward: 253.9
Training network. lr: 0.000204. clip: 0.081421
Iteration 6052: Policy loss: -0.001839. Value loss: 0.461887. Entropy: 0.752401.
Iteration 6053: Policy loss: -0.003181. Value loss: 0.267827. Entropy: 0.799476.
Iteration 6054: Policy loss: -0.006178. Value loss: 0.186457. Entropy: 0.774344.
Training network. lr: 0.000204. clip: 0.081421
Iteration 6055: Policy loss: 0.000500. Val

Iteration 6109: Policy loss: 0.001114. Value loss: 0.392898. Entropy: 1.138191.
Iteration 6110: Policy loss: -0.002667. Value loss: 0.246530. Entropy: 1.155622.
Iteration 6111: Policy loss: -0.008592. Value loss: 0.196690. Entropy: 1.123855.
episode: 2913   score: 390.0  epsilon: 1.0    steps: 680  evaluation reward: 260.85
Training network. lr: 0.000203. clip: 0.081264
Iteration 6112: Policy loss: 0.000206. Value loss: 0.399757. Entropy: 0.912848.
Iteration 6113: Policy loss: -0.001975. Value loss: 0.308146. Entropy: 0.892383.
Iteration 6114: Policy loss: -0.006218. Value loss: 0.193886. Entropy: 0.921977.
Training network. lr: 0.000203. clip: 0.081264
Iteration 6115: Policy loss: 0.000600. Value loss: 0.416927. Entropy: 1.063373.
Iteration 6116: Policy loss: -0.003550. Value loss: 0.201875. Entropy: 1.052203.
Iteration 6117: Policy loss: -0.007018. Value loss: 0.170670. Entropy: 1.071709.
episode: 2914   score: 170.0  epsilon: 1.0    steps: 304  evaluation reward: 258.95
episode: 291

Training network. lr: 0.000203. clip: 0.081116
Iteration 6172: Policy loss: -0.002626. Value loss: 0.267325. Entropy: 0.951444.
Iteration 6173: Policy loss: -0.006080. Value loss: 0.154569. Entropy: 0.948364.
Iteration 6174: Policy loss: -0.014201. Value loss: 0.103724. Entropy: 0.965461.
episode: 2940   score: 215.0  epsilon: 1.0    steps: 376  evaluation reward: 258.15
episode: 2941   score: 185.0  epsilon: 1.0    steps: 656  evaluation reward: 257.5
Training network. lr: 0.000203. clip: 0.081116
Iteration 6175: Policy loss: -0.001587. Value loss: 0.288166. Entropy: 1.229182.
Iteration 6176: Policy loss: -0.006777. Value loss: 0.167310. Entropy: 1.227046.
Iteration 6177: Policy loss: -0.014707. Value loss: 0.131466. Entropy: 1.225855.
episode: 2942   score: 65.0  epsilon: 1.0    steps: 1024  evaluation reward: 255.0
Training network. lr: 0.000203. clip: 0.081116
Iteration 6178: Policy loss: -0.000934. Value loss: 0.228034. Entropy: 1.072194.
Iteration 6179: Policy loss: -0.006305. Va

Iteration 6234: Policy loss: -0.006403. Value loss: 0.060390. Entropy: 1.006398.
episode: 2966   score: 215.0  epsilon: 1.0    steps: 56  evaluation reward: 264.15
episode: 2967   score: 245.0  epsilon: 1.0    steps: 816  evaluation reward: 263.4
Training network. lr: 0.000202. clip: 0.080960
Iteration 6235: Policy loss: 0.001384. Value loss: 0.236475. Entropy: 1.085348.
Iteration 6236: Policy loss: -0.004909. Value loss: 0.147898. Entropy: 1.103213.
Iteration 6237: Policy loss: -0.005721. Value loss: 0.111503. Entropy: 1.091823.
episode: 2968   score: 175.0  epsilon: 1.0    steps: 680  evaluation reward: 263.05
Training network. lr: 0.000202. clip: 0.080960
Iteration 6238: Policy loss: 0.002129. Value loss: 0.264652. Entropy: 1.075840.
Iteration 6239: Policy loss: -0.007425. Value loss: 0.132978. Entropy: 1.066363.
Iteration 6240: Policy loss: -0.013570. Value loss: 0.098673. Entropy: 1.065206.
episode: 2969   score: 330.0  epsilon: 1.0    steps: 432  evaluation reward: 265.1
Training

Iteration 6293: Policy loss: -0.005465. Value loss: 0.090738. Entropy: 1.142429.
Iteration 6294: Policy loss: -0.014008. Value loss: 0.065921. Entropy: 1.163326.
episode: 2996   score: 180.0  epsilon: 1.0    steps: 128  evaluation reward: 255.35
episode: 2997   score: 120.0  epsilon: 1.0    steps: 968  evaluation reward: 254.3
Training network. lr: 0.000202. clip: 0.080803
Iteration 6295: Policy loss: 0.000540. Value loss: 0.155833. Entropy: 1.075570.
Iteration 6296: Policy loss: -0.003791. Value loss: 0.084458. Entropy: 1.083399.
Iteration 6297: Policy loss: -0.007273. Value loss: 0.086448. Entropy: 1.090438.
episode: 2998   score: 230.0  epsilon: 1.0    steps: 160  evaluation reward: 254.8
episode: 2999   score: 210.0  epsilon: 1.0    steps: 976  evaluation reward: 253.9
Training network. lr: 0.000202. clip: 0.080803
Iteration 6298: Policy loss: 0.001527. Value loss: 0.163053. Entropy: 1.081183.
Iteration 6299: Policy loss: -0.006104. Value loss: 0.100620. Entropy: 1.072776.
Iteratio

episode: 3023   score: 225.0  epsilon: 1.0    steps: 424  evaluation reward: 248.85
episode: 3024   score: 185.0  epsilon: 1.0    steps: 720  evaluation reward: 245.3
episode: 3025   score: 210.0  epsilon: 1.0    steps: 1000  evaluation reward: 245.2
Training network. lr: 0.000201. clip: 0.080499
Iteration 6355: Policy loss: 0.002580. Value loss: 0.409629. Entropy: 1.160843.
Iteration 6356: Policy loss: 0.000880. Value loss: 0.228968. Entropy: 1.168916.
Iteration 6357: Policy loss: -0.006335. Value loss: 0.191618. Entropy: 1.160034.
episode: 3026   score: 210.0  epsilon: 1.0    steps: 552  evaluation reward: 245.6
episode: 3027   score: 155.0  epsilon: 1.0    steps: 616  evaluation reward: 245.0
episode: 3028   score: 410.0  epsilon: 1.0    steps: 880  evaluation reward: 246.5
Training network. lr: 0.000201. clip: 0.080499
Iteration 6358: Policy loss: 0.004088. Value loss: 0.260267. Entropy: 0.943807.
Iteration 6359: Policy loss: -0.000388. Value loss: 0.145929. Entropy: 0.957558.
Iter

Iteration 6416: Policy loss: -0.005962. Value loss: 0.236250. Entropy: 1.234530.
Iteration 6417: Policy loss: -0.008325. Value loss: 0.186248. Entropy: 1.243917.
now time :  2019-02-28 12:12:00.159851
episode: 3051   score: 225.0  epsilon: 1.0    steps: 8  evaluation reward: 240.55
episode: 3052   score: 395.0  epsilon: 1.0    steps: 440  evaluation reward: 237.4
Training network. lr: 0.000201. clip: 0.080342
Iteration 6418: Policy loss: 0.000246. Value loss: 0.532409. Entropy: 1.169160.
Iteration 6419: Policy loss: -0.002150. Value loss: 0.200451. Entropy: 1.149727.
Iteration 6420: Policy loss: -0.005536. Value loss: 0.138876. Entropy: 1.160567.
episode: 3053   score: 160.0  epsilon: 1.0    steps: 72  evaluation reward: 234.1
episode: 3054   score: 270.0  epsilon: 1.0    steps: 216  evaluation reward: 233.35
episode: 3055   score: 315.0  epsilon: 1.0    steps: 312  evaluation reward: 232.1
episode: 3056   score: 385.0  epsilon: 1.0    steps: 984  evaluation reward: 233.05
Training net

Iteration 6475: Policy loss: -0.000148. Value loss: 0.255485. Entropy: 1.142878.
Iteration 6476: Policy loss: -0.002976. Value loss: 0.171227. Entropy: 1.137406.
Iteration 6477: Policy loss: -0.005536. Value loss: 0.166708. Entropy: 1.139427.
episode: 3081   score: 180.0  epsilon: 1.0    steps: 464  evaluation reward: 238.1
Training network. lr: 0.000200. clip: 0.080195
Iteration 6478: Policy loss: 0.000412. Value loss: 0.203649. Entropy: 0.839065.
Iteration 6479: Policy loss: -0.003868. Value loss: 0.135926. Entropy: 0.841166.
Iteration 6480: Policy loss: -0.005534. Value loss: 0.099012. Entropy: 0.855067.
episode: 3082   score: 360.0  epsilon: 1.0    steps: 584  evaluation reward: 239.6
Training network. lr: 0.000200. clip: 0.080195
Iteration 6481: Policy loss: 0.000620. Value loss: 0.199428. Entropy: 0.972800.
Iteration 6482: Policy loss: -0.003941. Value loss: 0.118860. Entropy: 0.951275.
Iteration 6483: Policy loss: -0.011275. Value loss: 0.100735. Entropy: 0.976536.
Training netw

episode: 3107   score: 235.0  epsilon: 1.0    steps: 240  evaluation reward: 247.25
episode: 3108   score: 330.0  epsilon: 1.0    steps: 792  evaluation reward: 247.7
Training network. lr: 0.000200. clip: 0.080038
Iteration 6538: Policy loss: 0.000296. Value loss: 0.236012. Entropy: 1.122324.
Iteration 6539: Policy loss: -0.004719. Value loss: 0.121380. Entropy: 1.108409.
Iteration 6540: Policy loss: -0.006950. Value loss: 0.101019. Entropy: 1.117257.
episode: 3109   score: 215.0  epsilon: 1.0    steps: 488  evaluation reward: 245.6
Training network. lr: 0.000200. clip: 0.080038
Iteration 6541: Policy loss: 0.001647. Value loss: 0.186782. Entropy: 1.066058.
Iteration 6542: Policy loss: -0.003244. Value loss: 0.103579. Entropy: 1.045113.
Iteration 6543: Policy loss: -0.011429. Value loss: 0.079797. Entropy: 1.061569.
episode: 3110   score: 265.0  epsilon: 1.0    steps: 408  evaluation reward: 245.7
Training network. lr: 0.000200. clip: 0.080038
Iteration 6544: Policy loss: 0.001853. Val

Iteration 6595: Policy loss: -0.002923. Value loss: 0.107544. Entropy: 0.883350.
Iteration 6596: Policy loss: -0.008312. Value loss: 0.070418. Entropy: 0.880668.
Iteration 6597: Policy loss: -0.012928. Value loss: 0.049538. Entropy: 0.878861.
episode: 3139   score: 210.0  epsilon: 1.0    steps: 592  evaluation reward: 240.7
episode: 3140   score: 250.0  epsilon: 1.0    steps: 648  evaluation reward: 241.4
Training network. lr: 0.000200. clip: 0.079881
Iteration 6598: Policy loss: 0.002715. Value loss: 0.132721. Entropy: 1.057269.
Iteration 6599: Policy loss: -0.001777. Value loss: 0.067534. Entropy: 1.053133.
Iteration 6600: Policy loss: -0.002685. Value loss: 0.067158. Entropy: 1.067686.
episode: 3141   score: 210.0  epsilon: 1.0    steps: 416  evaluation reward: 242.0
Training network. lr: 0.000199. clip: 0.079734
Iteration 6601: Policy loss: 0.002288. Value loss: 0.092244. Entropy: 0.859925.
Iteration 6602: Policy loss: -0.003227. Value loss: 0.062916. Entropy: 0.881289.
Iteration 6

episode: 3168   score: 135.0  epsilon: 1.0    steps: 576  evaluation reward: 238.05
episode: 3169   score: 315.0  epsilon: 1.0    steps: 608  evaluation reward: 239.1
Training network. lr: 0.000199. clip: 0.079577
Iteration 6655: Policy loss: 0.000777. Value loss: 0.173507. Entropy: 1.079863.
Iteration 6656: Policy loss: -0.004971. Value loss: 0.110098. Entropy: 1.088675.
Iteration 6657: Policy loss: -0.008208. Value loss: 0.084512. Entropy: 1.097745.
episode: 3170   score: 210.0  epsilon: 1.0    steps: 288  evaluation reward: 239.4
episode: 3171   score: 325.0  epsilon: 1.0    steps: 400  evaluation reward: 236.45
Training network. lr: 0.000199. clip: 0.079577
Iteration 6658: Policy loss: 0.000550. Value loss: 0.322772. Entropy: 0.976481.
Iteration 6659: Policy loss: -0.002113. Value loss: 0.194896. Entropy: 0.959239.
Iteration 6660: Policy loss: -0.006518. Value loss: 0.161574. Entropy: 0.962025.
episode: 3172   score: 275.0  epsilon: 1.0    steps: 72  evaluation reward: 237.05
Train

Iteration 6712: Policy loss: -0.000986. Value loss: 0.076793. Entropy: 0.930854.
Iteration 6713: Policy loss: -0.008322. Value loss: 0.046272. Entropy: 0.937302.
Iteration 6714: Policy loss: -0.009178. Value loss: 0.039014. Entropy: 0.913733.
episode: 3200   score: 240.0  epsilon: 1.0    steps: 144  evaluation reward: 230.3
now time :  2019-02-28 12:15:38.629922
episode: 3201   score: 215.0  epsilon: 1.0    steps: 568  evaluation reward: 229.5
Training network. lr: 0.000199. clip: 0.079421
Iteration 6715: Policy loss: 0.001381. Value loss: 0.129346. Entropy: 1.102507.
Iteration 6716: Policy loss: -0.003836. Value loss: 0.077815. Entropy: 1.100967.
Iteration 6717: Policy loss: -0.007944. Value loss: 0.062357. Entropy: 1.112944.
episode: 3202   score: 210.0  epsilon: 1.0    steps: 464  evaluation reward: 229.8
Training network. lr: 0.000199. clip: 0.079421
Iteration 6718: Policy loss: -0.001438. Value loss: 0.152248. Entropy: 0.873818.
Iteration 6719: Policy loss: -0.010012. Value loss: 

Iteration 6770: Policy loss: 0.003239. Value loss: 0.240761. Entropy: 1.190555.
Iteration 6771: Policy loss: -0.001017. Value loss: 0.196664. Entropy: 1.195369.
episode: 3231   score: 410.0  epsilon: 1.0    steps: 712  evaluation reward: 229.25
Training network. lr: 0.000198. clip: 0.079273
Iteration 6772: Policy loss: 0.000874. Value loss: 0.150582. Entropy: 0.812562.
Iteration 6773: Policy loss: -0.005375. Value loss: 0.078401. Entropy: 0.804930.
Iteration 6774: Policy loss: -0.005683. Value loss: 0.059888. Entropy: 0.805376.
episode: 3232   score: 210.0  epsilon: 1.0    steps: 176  evaluation reward: 229.25
Training network. lr: 0.000198. clip: 0.079273
Iteration 6775: Policy loss: -0.001253. Value loss: 0.162700. Entropy: 1.088054.
Iteration 6776: Policy loss: -0.006257. Value loss: 0.099457. Entropy: 1.072156.
Iteration 6777: Policy loss: -0.011905. Value loss: 0.102922. Entropy: 1.088286.
episode: 3233   score: 180.0  epsilon: 1.0    steps: 248  evaluation reward: 229.7
episode: 

Iteration 6830: Policy loss: -0.004599. Value loss: 0.076520. Entropy: 1.164457.
Iteration 6831: Policy loss: -0.009284. Value loss: 0.065921. Entropy: 1.150126.
episode: 3260   score: 235.0  epsilon: 1.0    steps: 952  evaluation reward: 231.1
Training network. lr: 0.000198. clip: 0.079117
Iteration 6832: Policy loss: -0.001764. Value loss: 0.103252. Entropy: 0.838621.
Iteration 6833: Policy loss: -0.005017. Value loss: 0.075917. Entropy: 0.841432.
Iteration 6834: Policy loss: -0.009494. Value loss: 0.057222. Entropy: 0.853134.
episode: 3261   score: 210.0  epsilon: 1.0    steps: 80  evaluation reward: 232.0
episode: 3262   score: 240.0  epsilon: 1.0    steps: 864  evaluation reward: 232.3
Training network. lr: 0.000198. clip: 0.079117
Iteration 6835: Policy loss: -0.000067. Value loss: 0.147189. Entropy: 1.047356.
Iteration 6836: Policy loss: -0.003323. Value loss: 0.092009. Entropy: 1.060730.
Iteration 6837: Policy loss: -0.005114. Value loss: 0.098826. Entropy: 1.064236.
episode: 3