# Deep Q-Learning 

For this assignment we will implement the Deep Q-Learning algorithm with Experience Replay as described in breakthrough paper __"Playing Atari with Deep Reinforcement Learning"__. We will train an agent to play the famous game of __Breakout__.

In [1]:
import sys
import gym
import torch
import pylab
import random
import numpy as np
import time
from collections import deque
from datetime import datetime
from copy import deepcopy
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from utils import *
from agent import *
from model import *
from config import *
from env import GameEnv
%matplotlib inline
%load_ext autoreload
%autoreload 2

## Understanding the environment

In the following cell, we initialise our game of __Breakout__ and you can see how the environment looks like. For further documentation of the of the environment refer to https://gym.openai.com/envs. 

In [2]:
envs = []
for i in range(num_envs):
    envs.append(GameEnv('SpaceInvadersDeterministic-v4'))
#env.render()

  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "


In [3]:
number_lives = envs[0].life
state_size = envs[0].observation_space.shape
action_size = envs[0].action_space.n
rewards, episodes = [], []

## Creating a DQN Agent

Here we create a DQN Agent. This agent is defined in the __agent.py__. The corresponding neural network is defined in the __model.py__. 

__Evaluation Reward__ : The average reward received in the past 100 episodes/games.

__Frame__ : Number of frames processed in total.

__Memory Size__ : The current size of the replay memory.

In [4]:
agent = Agent(action_size)
torch.save(agent.policy_net.state_dict(), "./save_model/spaceinvaders_ppo_best")
evaluation_reward = deque(maxlen=evaluation_reward_length)
frame = 0
memory_size = 0
reset_max = 10

### Main Training Loop

In [None]:
vis_env_idx = 0
vis_env = envs[vis_env_idx]
e = 0
frame = 0
max_eval = -np.inf
reset_count = 0

while (frame < 10000000):
    step = 0
    assert(num_envs * env_mem_size == train_frame)
    frame_next_vals = []
    for i in range(num_envs):
        env = envs[i]
        #history = env.history
        #life = env.life
        #state, reward, done, info = [env.state, env.reward, env.done, env.info]
        for j in range(env_mem_size):
            step += 1
            frame += 1
            
            curr_state = env.history[HISTORY_SIZE-1,:,:]
            action, value = agent.get_action(np.float32(env.history[:HISTORY_SIZE,:,:]) / 255.)
            
            next_state, env.reward, env.done, env.info = env.step(action)
            
            if (i == vis_env_idx):
                vis_env._env.render()
            
            frame_next_state = get_frame(next_state)
            env.history[HISTORY_SIZE,:,:] = frame_next_state
            terminal_state = check_live(env.life, env.info['ale.lives'])
            
            env.life = env.info['ale.lives']
            r = env.reward
            
            agent.memory.push(i, deepcopy(curr_state), action, r, terminal_state, value, 0, 0)
            if (j == env_mem_size-1):
                _, frame_next_val = agent.get_action(np.float32(env.history[1:,:,:]) / 255.)
                frame_next_vals.append(frame_next_val)
            env.score += r
            env.history[:HISTORY_SIZE, :, :] = env.history[1:,:,:]
            
            if (env.done):
                if (e % 50 == 0):
                    print('now time : ', datetime.now())
                    rewards.append(np.mean(evaluation_reward))
                    episodes.append(e)
                    pylab.plot(episodes, rewards, 'b')
                    pylab.savefig("./save_graph/spaceinvaders_ppo.png")
                    torch.save(agent.policy_net, "./save_model/spaceinvaders_ppo")
                    
                    if np.mean(evaluation_reward) > max_eval:
                        torch.save(agent.policy_net.state_dict(), "./save_model/spaceinvaders_ppo_best")
                        max_eval = float(np.mean(evaluation_reward))
                        reset_count = 0
                    elif e > 5000:
                        reset_count += 1
                        """
                        if (reset_count == reset_max):
                            print("Training went nowhere, starting again at best model")
                            agent.policy_net.load_state_dict(torch.load("./save_model/spaceinvaders_ppo_best"))
                            agent.update_target_net()
                            reset_count = 0
                        """
                e += 1
                evaluation_reward.append(env.score)
                print("episode:", e, "  score:", env.score,  " epsilon:", agent.epsilon, "   steps:", step,
                  " evaluation reward:", np.mean(evaluation_reward))
                
                env.done = False
                env.score = 0
                env.history = np.zeros([HISTORY_SIZE+1,84,84], dtype=np.uint8)
                env.state = env.reset()
                env.life = number_lives
                get_init_state(env.history, env.state)
                
                
                
    agent.train_policy_net(frame, frame_next_vals)
    agent.update_target_net()

'''

  probs = F.softmax(x[:,:self.action_size] - torch.max(x[:,:self.action_size],1)[0].unsqueeze(1))
  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "


Training network. lr: 0.000250. clip: 0.100000


  pol_loss += pol_avg.detach().cpu()[0]
  vf_loss += value_loss.detach().cpu()[0]
  ent_total += ent.detach().cpu()[0]


Iteration 1: Policy loss: -1.094032. Value loss: 6.568430. Entropy: 1.787648.
Iteration 2: Policy loss: -1.059619. Value loss: 5.662950. Entropy: 1.787355.
Iteration 3: Policy loss: -1.087144. Value loss: 5.936832. Entropy: 1.787368.
Training network. lr: 0.000250. clip: 0.100000
Iteration 4: Policy loss: -3.762961. Value loss: 23.942623. Entropy: 1.753100.
Iteration 5: Policy loss: -3.626564. Value loss: 21.021532. Entropy: 1.765004.
Iteration 6: Policy loss: -3.676214. Value loss: 18.459532. Entropy: 1.769020.
now time :  2019-02-26 12:27:11.380270
episode: 1   score: 105.0  epsilon: 1.0    steps: 255  evaluation reward: 105.0


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


episode: 2   score: 35.0  epsilon: 1.0    steps: 896  evaluation reward: 70.0
Training network. lr: 0.000250. clip: 0.100000
Iteration 7: Policy loss: -0.501706. Value loss: 21.333206. Entropy: 1.764121.
Iteration 8: Policy loss: -0.357049. Value loss: 19.093555. Entropy: 1.760776.
Iteration 9: Policy loss: -0.408432. Value loss: 18.803070. Entropy: 1.770087.
episode: 3   score: 65.0  epsilon: 1.0    steps: 2  evaluation reward: 68.33333333333333
episode: 4   score: 80.0  epsilon: 1.0    steps: 910  evaluation reward: 71.25
Training network. lr: 0.000250. clip: 0.100000
Iteration 10: Policy loss: -1.910898. Value loss: 37.708187. Entropy: 1.745175.
Iteration 11: Policy loss: -2.176072. Value loss: 38.048820. Entropy: 1.751735.
Iteration 12: Policy loss: -1.972138. Value loss: 35.902405. Entropy: 1.746001.
episode: 5   score: 125.0  epsilon: 1.0    steps: 380  evaluation reward: 82.0
episode: 6   score: 120.0  epsilon: 1.0    steps: 646  evaluation reward: 88.33333333333333
Training net

Iteration 67: Policy loss: -0.374049. Value loss: 57.317833. Entropy: 1.153395.
Iteration 68: Policy loss: -0.356640. Value loss: 46.394497. Entropy: 1.207495.
Iteration 69: Policy loss: 0.085022. Value loss: 50.242630. Entropy: 1.173969.
episode: 30   score: 165.0  epsilon: 1.0    steps: 272  evaluation reward: 169.83333333333334
episode: 31   score: 110.0  epsilon: 1.0    steps: 409  evaluation reward: 167.90322580645162
Training network. lr: 0.000250. clip: 0.099853
Iteration 70: Policy loss: 1.645714. Value loss: 238.341461. Entropy: 1.154018.
Iteration 71: Policy loss: 1.606470. Value loss: 180.611511. Entropy: 1.136458.
Iteration 72: Policy loss: 0.898455. Value loss: 228.929352. Entropy: 1.173779.
episode: 32   score: 210.0  epsilon: 1.0    steps: 76  evaluation reward: 169.21875
episode: 33   score: 460.0  epsilon: 1.0    steps: 733  evaluation reward: 178.03030303030303
Training network. lr: 0.000250. clip: 0.099853
Iteration 73: Policy loss: 0.793839. Value loss: 38.408730. E

Iteration 128: Policy loss: -0.100772. Value loss: 20.411129. Entropy: 0.454290.
Iteration 129: Policy loss: 0.035658. Value loss: 20.497225. Entropy: 0.464881.
episode: 56   score: 185.0  epsilon: 1.0    steps: 55  evaluation reward: 184.64285714285714
Training network. lr: 0.000249. clip: 0.099696
Iteration 130: Policy loss: 1.379640. Value loss: 28.818233. Entropy: 0.481112.
Iteration 131: Policy loss: 1.105922. Value loss: 21.068638. Entropy: 0.445000.
Iteration 132: Policy loss: 1.059253. Value loss: 19.317085. Entropy: 0.456872.
Training network. lr: 0.000249. clip: 0.099696
Iteration 133: Policy loss: -0.343825. Value loss: 17.710852. Entropy: 0.468456.
Iteration 134: Policy loss: -0.481948. Value loss: 11.700258. Entropy: 0.470345.
Iteration 135: Policy loss: -0.431312. Value loss: 10.790524. Entropy: 0.458852.
episode: 57   score: 180.0  epsilon: 1.0    steps: 891  evaluation reward: 184.56140350877192
Training network. lr: 0.000249. clip: 0.099696
Iteration 136: Policy loss: 

Iteration 191: Policy loss: 1.321886. Value loss: 8.135585. Entropy: 0.691685.
Iteration 192: Policy loss: 1.208769. Value loss: 8.007512. Entropy: 0.714133.
episode: 79   score: 520.0  epsilon: 1.0    steps: 889  evaluation reward: 194.55696202531647
episode: 80   score: 105.0  epsilon: 1.0    steps: 915  evaluation reward: 193.4375
Training network. lr: 0.000249. clip: 0.099548
Iteration 193: Policy loss: -0.114598. Value loss: 22.619146. Entropy: 0.620992.
Iteration 194: Policy loss: 0.149672. Value loss: 16.985306. Entropy: 0.664565.
Iteration 195: Policy loss: 0.031528. Value loss: 15.849419. Entropy: 0.657354.
episode: 81   score: 210.0  epsilon: 1.0    steps: 493  evaluation reward: 193.64197530864197
episode: 82   score: 160.0  epsilon: 1.0    steps: 637  evaluation reward: 193.23170731707316
Training network. lr: 0.000249. clip: 0.099548
Iteration 196: Policy loss: 1.487036. Value loss: 40.822403. Entropy: 0.649981.
Iteration 197: Policy loss: 1.482673. Value loss: 29.011583. 

Iteration 252: Policy loss: 1.911385. Value loss: 17.369148. Entropy: 0.971937.
episode: 105   score: 75.0  epsilon: 1.0    steps: 166  evaluation reward: 202.15
Training network. lr: 0.000248. clip: 0.099235
Iteration 253: Policy loss: 2.334609. Value loss: 29.768791. Entropy: 0.871741.
Iteration 254: Policy loss: 2.279338. Value loss: 18.676300. Entropy: 0.856928.
Iteration 255: Policy loss: 2.320926. Value loss: 17.512388. Entropy: 0.871839.
episode: 106   score: 155.0  epsilon: 1.0    steps: 82  evaluation reward: 202.5
episode: 107   score: 110.0  epsilon: 1.0    steps: 433  evaluation reward: 202.2
Training network. lr: 0.000248. clip: 0.099235
Iteration 256: Policy loss: -2.251868. Value loss: 48.598141. Entropy: 0.901539.
Iteration 257: Policy loss: -2.409133. Value loss: 34.160103. Entropy: 0.918745.
Iteration 258: Policy loss: -1.999194. Value loss: 28.673079. Entropy: 0.913760.
episode: 108   score: 250.0  epsilon: 1.0    steps: 307  evaluation reward: 203.5
episode: 109   s

episode: 131   score: 80.0  epsilon: 1.0    steps: 148  evaluation reward: 199.3
Training network. lr: 0.000248. clip: 0.099088
Iteration 316: Policy loss: 1.190094. Value loss: 16.201731. Entropy: 0.603407.
Iteration 317: Policy loss: 1.182451. Value loss: 7.632030. Entropy: 0.693849.
Iteration 318: Policy loss: 1.187920. Value loss: 6.448758. Entropy: 0.669992.
episode: 132   score: 545.0  epsilon: 1.0    steps: 65  evaluation reward: 202.65
episode: 133   score: 260.0  epsilon: 1.0    steps: 936  evaluation reward: 200.65
Training network. lr: 0.000248. clip: 0.099088
Iteration 319: Policy loss: 0.369271. Value loss: 27.780775. Entropy: 0.641300.
Iteration 320: Policy loss: 0.592883. Value loss: 24.362944. Entropy: 0.633898.
Iteration 321: Policy loss: 0.504433. Value loss: 20.884628. Entropy: 0.643890.
episode: 134   score: 210.0  epsilon: 1.0    steps: 497  evaluation reward: 201.7
episode: 135   score: 280.0  epsilon: 1.0    steps: 674  evaluation reward: 203.45
Training network.

episode: 161   score: 230.0  epsilon: 1.0    steps: 858  evaluation reward: 196.1
episode: 162   score: 180.0  epsilon: 1.0    steps: 897  evaluation reward: 195.8
Training network. lr: 0.000247. clip: 0.098931
Iteration 376: Policy loss: 1.100583. Value loss: 23.121008. Entropy: 0.935528.
Iteration 377: Policy loss: 0.981534. Value loss: 18.602787. Entropy: 0.955684.
Iteration 378: Policy loss: 1.151357. Value loss: 14.238213. Entropy: 0.968996.
episode: 163   score: 110.0  epsilon: 1.0    steps: 572  evaluation reward: 194.8
episode: 164   score: 55.0  epsilon: 1.0    steps: 732  evaluation reward: 193.1
Training network. lr: 0.000247. clip: 0.098931
Iteration 379: Policy loss: 2.178570. Value loss: 46.976063. Entropy: 0.798756.
Iteration 380: Policy loss: 2.289909. Value loss: 35.869354. Entropy: 0.790697.
Iteration 381: Policy loss: 2.096302. Value loss: 27.059549. Entropy: 0.832144.
Training network. lr: 0.000247. clip: 0.098931
Iteration 382: Policy loss: 1.527689. Value loss: 23

Iteration 434: Policy loss: 0.685018. Value loss: 23.438894. Entropy: 0.940887.
Iteration 435: Policy loss: 0.601201. Value loss: 21.345741. Entropy: 0.909577.
Training network. lr: 0.000247. clip: 0.098774
Iteration 436: Policy loss: 0.156238. Value loss: 20.034294. Entropy: 0.844988.
Iteration 437: Policy loss: 0.137889. Value loss: 16.058397. Entropy: 0.834918.
Iteration 438: Policy loss: 0.110493. Value loss: 14.556019. Entropy: 0.853298.
episode: 193   score: 50.0  epsilon: 1.0    steps: 357  evaluation reward: 174.75
Training network. lr: 0.000247. clip: 0.098774
Iteration 439: Policy loss: -0.947008. Value loss: 27.475838. Entropy: 0.907396.
Iteration 440: Policy loss: -0.862115. Value loss: 19.089130. Entropy: 0.930528.
Iteration 441: Policy loss: -0.882099. Value loss: 16.341553. Entropy: 0.938067.
episode: 194   score: 150.0  epsilon: 1.0    steps: 569  evaluation reward: 174.45
episode: 195   score: 50.0  epsilon: 1.0    steps: 965  evaluation reward: 173.15
Training network

Iteration 497: Policy loss: -2.869219. Value loss: 144.810547. Entropy: 0.780263.
Iteration 498: Policy loss: -2.581054. Value loss: 123.001953. Entropy: 0.749960.
episode: 219   score: 355.0  epsilon: 1.0    steps: 956  evaluation reward: 183.85
Training network. lr: 0.000247. clip: 0.098627
Iteration 499: Policy loss: -0.033314. Value loss: 13.755231. Entropy: 0.872719.
Iteration 500: Policy loss: 0.007322. Value loss: 11.209599. Entropy: 0.871224.
Iteration 501: Policy loss: -0.093113. Value loss: 10.645125. Entropy: 0.862429.
Training network. lr: 0.000246. clip: 0.098470
Iteration 502: Policy loss: -0.025862. Value loss: 10.893949. Entropy: 0.884854.
Iteration 503: Policy loss: -0.101424. Value loss: 7.984505. Entropy: 0.889052.
Iteration 504: Policy loss: -0.064026. Value loss: 6.341643. Entropy: 0.904426.
Training network. lr: 0.000246. clip: 0.098470
Iteration 505: Policy loss: 0.824595. Value loss: 15.914540. Entropy: 0.707347.
Iteration 506: Policy loss: 0.990628. Value loss:

Iteration 559: Policy loss: -0.247459. Value loss: 48.653835. Entropy: 1.163217.
Iteration 560: Policy loss: -0.154402. Value loss: 40.285690. Entropy: 1.200026.
Iteration 561: Policy loss: -0.066101. Value loss: 35.820118. Entropy: 1.183124.
episode: 246   score: 90.0  epsilon: 1.0    steps: 181  evaluation reward: 178.85
episode: 247   score: 60.0  epsilon: 1.0    steps: 821  evaluation reward: 174.85
Training network. lr: 0.000246. clip: 0.098313
Iteration 562: Policy loss: 0.940456. Value loss: 46.449745. Entropy: 1.126326.
Iteration 563: Policy loss: 0.849038. Value loss: 31.073532. Entropy: 1.126516.
Iteration 564: Policy loss: 0.441706. Value loss: 28.903900. Entropy: 1.143933.
episode: 248   score: 405.0  epsilon: 1.0    steps: 113  evaluation reward: 177.85
episode: 249   score: 160.0  epsilon: 1.0    steps: 580  evaluation reward: 177.9
Training network. lr: 0.000246. clip: 0.098313
Iteration 565: Policy loss: 1.728884. Value loss: 90.193153. Entropy: 1.126497.
Iteration 566:

Iteration 620: Policy loss: 1.279294. Value loss: 17.873552. Entropy: 1.151036.
Iteration 621: Policy loss: 0.920216. Value loss: 14.364750. Entropy: 1.161243.
Training network. lr: 0.000245. clip: 0.098166
Iteration 622: Policy loss: -6.356101. Value loss: 471.963959. Entropy: 1.090229.
Iteration 623: Policy loss: -5.312871. Value loss: 291.055511. Entropy: 1.017005.
Iteration 624: Policy loss: -5.780021. Value loss: 268.739410. Entropy: 1.116791.
episode: 275   score: 415.0  epsilon: 1.0    steps: 172  evaluation reward: 176.05
episode: 276   score: 235.0  epsilon: 1.0    steps: 372  evaluation reward: 176.3
Training network. lr: 0.000245. clip: 0.098166
Iteration 625: Policy loss: 0.230960. Value loss: 116.726196. Entropy: 1.197583.
Iteration 626: Policy loss: 0.838236. Value loss: 73.674355. Entropy: 1.183872.
Iteration 627: Policy loss: 0.313165. Value loss: 56.506966. Entropy: 1.190513.
episode: 277   score: 140.0  epsilon: 1.0    steps: 476  evaluation reward: 176.6
Training net

episode: 303   score: 185.0  epsilon: 1.0    steps: 818  evaluation reward: 187.15
Training network. lr: 0.000245. clip: 0.098009
Iteration 682: Policy loss: 0.199533. Value loss: 38.971970. Entropy: 0.875023.
Iteration 683: Policy loss: 0.358227. Value loss: 23.479279. Entropy: 0.895578.
Iteration 684: Policy loss: 0.539025. Value loss: 20.305292. Entropy: 0.913318.
episode: 304   score: 185.0  epsilon: 1.0    steps: 222  evaluation reward: 187.2
Training network. lr: 0.000245. clip: 0.098009
Iteration 685: Policy loss: -1.131722. Value loss: 45.407246. Entropy: 0.995499.
Iteration 686: Policy loss: -1.151071. Value loss: 31.908087. Entropy: 1.016736.
Iteration 687: Policy loss: -1.187189. Value loss: 31.037338. Entropy: 1.003963.
episode: 305   score: 75.0  epsilon: 1.0    steps: 919  evaluation reward: 185.1
Training network. lr: 0.000245. clip: 0.098009
Iteration 688: Policy loss: 0.790027. Value loss: 42.425541. Entropy: 1.001015.
Iteration 689: Policy loss: 0.717023. Value loss: 

Iteration 744: Policy loss: 0.245105. Value loss: 21.039413. Entropy: 0.730159.
episode: 331   score: 115.0  epsilon: 1.0    steps: 586  evaluation reward: 175.65
Training network. lr: 0.000245. clip: 0.097853
Iteration 745: Policy loss: 1.653847. Value loss: 20.628984. Entropy: 0.951563.
Iteration 746: Policy loss: 1.426474. Value loss: 16.929083. Entropy: 0.960754.
Iteration 747: Policy loss: 1.570952. Value loss: 15.804911. Entropy: 0.971307.
episode: 332   score: 255.0  epsilon: 1.0    steps: 35  evaluation reward: 175.3
Training network. lr: 0.000245. clip: 0.097853
Iteration 748: Policy loss: 0.030440. Value loss: 9.031096. Entropy: 1.124690.
Iteration 749: Policy loss: -0.024782. Value loss: 7.545158. Entropy: 1.139948.
Iteration 750: Policy loss: -0.019261. Value loss: 7.025337. Entropy: 1.139237.
episode: 333   score: 75.0  epsilon: 1.0    steps: 428  evaluation reward: 173.5
Training network. lr: 0.000244. clip: 0.097705
Iteration 751: Policy loss: 1.099890. Value loss: 7.073

Training network. lr: 0.000244. clip: 0.097549
Iteration 808: Policy loss: 0.137146. Value loss: 16.915133. Entropy: 1.036311.
Iteration 809: Policy loss: 0.116427. Value loss: 12.730200. Entropy: 1.048073.
Iteration 810: Policy loss: 0.161240. Value loss: 11.768647. Entropy: 1.028202.
episode: 357   score: 180.0  epsilon: 1.0    steps: 141  evaluation reward: 172.65
episode: 358   score: 210.0  epsilon: 1.0    steps: 272  evaluation reward: 172.85
episode: 359   score: 180.0  epsilon: 1.0    steps: 706  evaluation reward: 172.0
Training network. lr: 0.000244. clip: 0.097549
Iteration 811: Policy loss: 0.361530. Value loss: 16.650620. Entropy: 0.963433.
Iteration 812: Policy loss: 0.449559. Value loss: 12.920420. Entropy: 0.919009.
Iteration 813: Policy loss: 0.337339. Value loss: 11.455174. Entropy: 0.948294.
episode: 360   score: 495.0  epsilon: 1.0    steps: 425  evaluation reward: 176.15
episode: 361   score: 180.0  epsilon: 1.0    steps: 919  evaluation reward: 175.85
Training net

episode: 384   score: 210.0  epsilon: 1.0    steps: 431  evaluation reward: 169.6
episode: 385   score: 210.0  epsilon: 1.0    steps: 931  evaluation reward: 169.8
Training network. lr: 0.000243. clip: 0.097392
Iteration 871: Policy loss: -1.025433. Value loss: 16.493488. Entropy: 0.737045.
Iteration 872: Policy loss: -1.038777. Value loss: 13.030171. Entropy: 0.726725.
Iteration 873: Policy loss: -1.063892. Value loss: 13.269789. Entropy: 0.757343.
episode: 386   score: 210.0  epsilon: 1.0    steps: 521  evaluation reward: 171.35
Training network. lr: 0.000243. clip: 0.097392
Iteration 874: Policy loss: 0.042608. Value loss: 10.231205. Entropy: 0.778889.
Iteration 875: Policy loss: 0.126452. Value loss: 7.422169. Entropy: 0.780808.
Iteration 876: Policy loss: 0.103560. Value loss: 6.176163. Entropy: 0.793503.
episode: 387   score: 180.0  epsilon: 1.0    steps: 798  evaluation reward: 171.7
Training network. lr: 0.000243. clip: 0.097392
Iteration 877: Policy loss: 0.431102. Value loss:

Training network. lr: 0.000243. clip: 0.097244
Iteration 934: Policy loss: -0.033432. Value loss: 11.683496. Entropy: 1.020974.
Iteration 935: Policy loss: 0.086071. Value loss: 10.623855. Entropy: 1.025875.
Iteration 936: Policy loss: -0.132412. Value loss: 12.130360. Entropy: 1.028537.
Training network. lr: 0.000243. clip: 0.097244
Iteration 937: Policy loss: 0.005007. Value loss: 9.300933. Entropy: 1.070382.
Iteration 938: Policy loss: -0.081080. Value loss: 8.099711. Entropy: 1.080618.
Iteration 939: Policy loss: -0.051248. Value loss: 5.914804. Entropy: 1.074592.
episode: 410   score: 210.0  epsilon: 1.0    steps: 20  evaluation reward: 185.3
Training network. lr: 0.000243. clip: 0.097244
Iteration 940: Policy loss: 0.314940. Value loss: 5.438137. Entropy: 0.989615.
Iteration 941: Policy loss: 0.314456. Value loss: 4.625469. Entropy: 0.983555.
Iteration 942: Policy loss: 0.534730. Value loss: 4.194338. Entropy: 0.994154.
episode: 411   score: 210.0  epsilon: 1.0    steps: 342  eva

Iteration 998: Policy loss: -0.614410. Value loss: 11.697846. Entropy: 1.176036.
Iteration 999: Policy loss: -0.629233. Value loss: 11.170350. Entropy: 1.198759.
episode: 435   score: 210.0  epsilon: 1.0    steps: 387  evaluation reward: 195.9
episode: 436   score: 210.0  epsilon: 1.0    steps: 609  evaluation reward: 197.25
Training network. lr: 0.000243. clip: 0.097088
Iteration 1000: Policy loss: -0.163238. Value loss: 15.001289. Entropy: 1.122367.
Iteration 1001: Policy loss: -0.230630. Value loss: 9.560289. Entropy: 1.131785.
Iteration 1002: Policy loss: -0.263302. Value loss: 8.473472. Entropy: 1.133854.
Training network. lr: 0.000242. clip: 0.096931
Iteration 1003: Policy loss: 1.808013. Value loss: 13.695273. Entropy: 1.191929.
Iteration 1004: Policy loss: 1.770518. Value loss: 9.736426. Entropy: 1.175255.
Iteration 1005: Policy loss: 1.933730. Value loss: 8.373353. Entropy: 1.203704.
episode: 437   score: 120.0  epsilon: 1.0    steps: 173  evaluation reward: 197.35
episode: 43

Iteration 1060: Policy loss: 1.032392. Value loss: 10.998137. Entropy: 0.915581.
Iteration 1061: Policy loss: 0.888903. Value loss: 7.338737. Entropy: 0.919114.
Iteration 1062: Policy loss: 0.916811. Value loss: 6.500465. Entropy: 0.910907.
episode: 461   score: 155.0  epsilon: 1.0    steps: 142  evaluation reward: 191.85
Training network. lr: 0.000242. clip: 0.096784
Iteration 1063: Policy loss: 0.751853. Value loss: 20.129177. Entropy: 0.778445.
Iteration 1064: Policy loss: 0.590345. Value loss: 15.078423. Entropy: 0.776416.
Iteration 1065: Policy loss: 0.569882. Value loss: 12.831829. Entropy: 0.793290.
Training network. lr: 0.000242. clip: 0.096784
Iteration 1066: Policy loss: 1.174552. Value loss: 9.694540. Entropy: 0.925325.
Iteration 1067: Policy loss: 1.158807. Value loss: 6.870113. Entropy: 0.970062.
Iteration 1068: Policy loss: 1.277643. Value loss: 6.561716. Entropy: 0.946955.
episode: 462   score: 180.0  epsilon: 1.0    steps: 364  evaluation reward: 191.85
episode: 463   s

episode: 487   score: 160.0  epsilon: 1.0    steps: 928  evaluation reward: 191.05
Training network. lr: 0.000242. clip: 0.096627
Iteration 1123: Policy loss: 1.247193. Value loss: 10.147388. Entropy: 1.057870.
Iteration 1124: Policy loss: 1.257364. Value loss: 7.021204. Entropy: 1.088825.
Iteration 1125: Policy loss: 1.203314. Value loss: 6.411729. Entropy: 1.068564.
Training network. lr: 0.000242. clip: 0.096627
Iteration 1126: Policy loss: -2.449863. Value loss: 272.327332. Entropy: 0.852639.
Iteration 1127: Policy loss: -2.762070. Value loss: 185.786774. Entropy: 0.830648.
Iteration 1128: Policy loss: -2.386631. Value loss: 132.509277. Entropy: 0.868059.
episode: 488   score: 210.0  epsilon: 1.0    steps: 424  evaluation reward: 191.35
episode: 489   score: 410.0  epsilon: 1.0    steps: 735  evaluation reward: 193.65
episode: 490   score: 135.0  epsilon: 1.0    steps: 815  evaluation reward: 193.2
Training network. lr: 0.000242. clip: 0.096627
Iteration 1129: Policy loss: -0.116010

Training network. lr: 0.000241. clip: 0.096470
Iteration 1186: Policy loss: -0.870614. Value loss: 209.306335. Entropy: 0.865506.
Iteration 1187: Policy loss: -1.149569. Value loss: 76.695930. Entropy: 0.726525.
Iteration 1188: Policy loss: -1.337688. Value loss: 52.358418. Entropy: 0.761310.
episode: 512   score: 500.0  epsilon: 1.0    steps: 249  evaluation reward: 182.35
episode: 513   score: 180.0  epsilon: 1.0    steps: 769  evaluation reward: 182.35
Training network. lr: 0.000241. clip: 0.096470
Iteration 1189: Policy loss: 1.735553. Value loss: 12.139707. Entropy: 0.741844.
Iteration 1190: Policy loss: 1.866455. Value loss: 6.632883. Entropy: 0.769863.
Iteration 1191: Policy loss: 2.051028. Value loss: 4.911047. Entropy: 0.780433.
episode: 514   score: 155.0  epsilon: 1.0    steps: 588  evaluation reward: 182.1
Training network. lr: 0.000241. clip: 0.096470
Iteration 1192: Policy loss: -0.009432. Value loss: 133.017197. Entropy: 0.798083.
Iteration 1193: Policy loss: 0.509619. V

Iteration 1250: Policy loss: -2.292176. Value loss: 231.789093. Entropy: 0.753788.
Iteration 1251: Policy loss: -1.080633. Value loss: 144.329163. Entropy: 0.717520.
episode: 536   score: 355.0  epsilon: 1.0    steps: 280  evaluation reward: 197.05
episode: 537   score: 390.0  epsilon: 1.0    steps: 738  evaluation reward: 199.75
episode: 538   score: 210.0  epsilon: 1.0    steps: 868  evaluation reward: 200.0
Training network. lr: 0.000240. clip: 0.096166
Iteration 1252: Policy loss: -0.174668. Value loss: 25.745150. Entropy: 0.776736.
Iteration 1253: Policy loss: -0.205782. Value loss: 15.185195. Entropy: 0.776732.
Iteration 1254: Policy loss: -0.086241. Value loss: 10.956415. Entropy: 0.792446.
episode: 539   score: 105.0  epsilon: 1.0    steps: 69  evaluation reward: 199.5
episode: 540   score: 160.0  epsilon: 1.0    steps: 989  evaluation reward: 199.55
Training network. lr: 0.000240. clip: 0.096166
Iteration 1255: Policy loss: 2.068122. Value loss: 16.927010. Entropy: 0.757598.
I

episode: 563   score: 155.0  epsilon: 1.0    steps: 842  evaluation reward: 203.15
Training network. lr: 0.000240. clip: 0.096009
Iteration 1312: Policy loss: 0.254200. Value loss: 25.018066. Entropy: 1.165870.
Iteration 1313: Policy loss: 0.251259. Value loss: 18.922510. Entropy: 1.134781.
Iteration 1314: Policy loss: 0.445401. Value loss: 14.481759. Entropy: 1.177143.
Training network. lr: 0.000240. clip: 0.096009
Iteration 1315: Policy loss: -2.238944. Value loss: 271.003845. Entropy: 0.768658.
Iteration 1316: Policy loss: -1.543033. Value loss: 125.113991. Entropy: 0.900708.
Iteration 1317: Policy loss: -1.931308. Value loss: 84.826813. Entropy: 0.831455.
episode: 564   score: 105.0  epsilon: 1.0    steps: 83  evaluation reward: 202.1
Training network. lr: 0.000240. clip: 0.096009
Iteration 1318: Policy loss: 1.037854. Value loss: 29.631046. Entropy: 1.131828.
Iteration 1319: Policy loss: 1.131469. Value loss: 22.313091. Entropy: 1.149633.
Iteration 1320: Policy loss: 1.183512. Val

episode: 590   score: 210.0  epsilon: 1.0    steps: 594  evaluation reward: 207.45
Training network. lr: 0.000240. clip: 0.095862
Iteration 1375: Policy loss: 0.180647. Value loss: 20.427515. Entropy: 0.845950.
Iteration 1376: Policy loss: -0.066163. Value loss: 14.359523. Entropy: 0.827542.
Iteration 1377: Policy loss: 0.074366. Value loss: 12.817825. Entropy: 0.875112.
episode: 591   score: 210.0  epsilon: 1.0    steps: 933  evaluation reward: 207.45
Training network. lr: 0.000240. clip: 0.095862
Iteration 1378: Policy loss: -0.402633. Value loss: 26.980284. Entropy: 0.908135.
Iteration 1379: Policy loss: -0.630051. Value loss: 18.911873. Entropy: 0.898230.
Iteration 1380: Policy loss: -0.453342. Value loss: 15.259083. Entropy: 0.926020.
episode: 592   score: 105.0  epsilon: 1.0    steps: 394  evaluation reward: 206.4
Training network. lr: 0.000240. clip: 0.095862
Iteration 1381: Policy loss: -1.333618. Value loss: 35.361946. Entropy: 1.126266.
Iteration 1382: Policy loss: -1.145853.

episode: 615   score: 300.0  epsilon: 1.0    steps: 523  evaluation reward: 224.6
Training network. lr: 0.000239. clip: 0.095705
Iteration 1438: Policy loss: -2.041078. Value loss: 240.566681. Entropy: 0.867251.
Iteration 1439: Policy loss: -1.563635. Value loss: 131.532227. Entropy: 0.829863.
Iteration 1440: Policy loss: -1.343656. Value loss: 88.775070. Entropy: 0.930272.
episode: 616   score: 210.0  epsilon: 1.0    steps: 58  evaluation reward: 224.6
episode: 617   score: 135.0  epsilon: 1.0    steps: 423  evaluation reward: 223.85
Training network. lr: 0.000239. clip: 0.095705
Iteration 1441: Policy loss: -0.002983. Value loss: 19.080410. Entropy: 0.851455.
Iteration 1442: Policy loss: 0.102643. Value loss: 14.715319. Entropy: 0.855634.
Iteration 1443: Policy loss: 0.055114. Value loss: 14.254767. Entropy: 0.865817.
Training network. lr: 0.000239. clip: 0.095705
Iteration 1444: Policy loss: -0.341877. Value loss: 26.474606. Entropy: 0.965547.
Iteration 1445: Policy loss: -0.358871.

Iteration 1499: Policy loss: 0.562985. Value loss: 37.888386. Entropy: 0.960751.
Iteration 1500: Policy loss: 0.531327. Value loss: 28.042667. Entropy: 0.958089.
Training network. lr: 0.000239. clip: 0.095401
Iteration 1501: Policy loss: 1.470808. Value loss: 50.234436. Entropy: 0.845951.
Iteration 1502: Policy loss: 1.499507. Value loss: 30.314114. Entropy: 0.864336.
Iteration 1503: Policy loss: 1.695602. Value loss: 23.645315. Entropy: 0.876364.
episode: 643   score: 240.0  epsilon: 1.0    steps: 119  evaluation reward: 222.45
episode: 644   score: 150.0  epsilon: 1.0    steps: 424  evaluation reward: 221.85
Training network. lr: 0.000239. clip: 0.095401
Iteration 1504: Policy loss: 2.683555. Value loss: 40.944687. Entropy: 0.993898.
Iteration 1505: Policy loss: 2.976480. Value loss: 26.296568. Entropy: 1.057872.
Iteration 1506: Policy loss: 2.812706. Value loss: 18.046619. Entropy: 1.034498.
episode: 645   score: 80.0  epsilon: 1.0    steps: 273  evaluation reward: 220.55
episode: 6

episode: 670   score: 305.0  epsilon: 1.0    steps: 471  evaluation reward: 225.8
episode: 671   score: 105.0  epsilon: 1.0    steps: 674  evaluation reward: 225.5
Training network. lr: 0.000238. clip: 0.095245
Iteration 1561: Policy loss: -0.321187. Value loss: 33.412102. Entropy: 1.034179.
Iteration 1562: Policy loss: -0.406290. Value loss: 26.312763. Entropy: 1.032219.
Iteration 1563: Policy loss: -0.545655. Value loss: 24.035101. Entropy: 1.021787.
episode: 672   score: 135.0  epsilon: 1.0    steps: 900  evaluation reward: 225.5
Training network. lr: 0.000238. clip: 0.095245
Iteration 1564: Policy loss: -0.518783. Value loss: 26.940498. Entropy: 1.009220.
Iteration 1565: Policy loss: -0.267651. Value loss: 18.300024. Entropy: 1.000599.
Iteration 1566: Policy loss: -0.364980. Value loss: 15.861204. Entropy: 0.995887.
episode: 673   score: 185.0  epsilon: 1.0    steps: 156  evaluation reward: 226.05
Training network. lr: 0.000238. clip: 0.095245
Iteration 1567: Policy loss: 0.049835.

Iteration 1625: Policy loss: -0.875435. Value loss: 21.031429. Entropy: 1.107937.
Iteration 1626: Policy loss: -0.613873. Value loss: 17.285608. Entropy: 1.116277.
episode: 694   score: 275.0  epsilon: 1.0    steps: 343  evaluation reward: 232.1
episode: 695   score: 200.0  epsilon: 1.0    steps: 660  evaluation reward: 231.75
episode: 696   score: 140.0  epsilon: 1.0    steps: 790  evaluation reward: 232.1
episode: 697   score: 105.0  epsilon: 1.0    steps: 965  evaluation reward: 231.8
Training network. lr: 0.000238. clip: 0.095088
Iteration 1627: Policy loss: 2.267570. Value loss: 24.258530. Entropy: 1.239579.
Iteration 1628: Policy loss: 2.009513. Value loss: 14.485106. Entropy: 1.222010.
Iteration 1629: Policy loss: 2.011754. Value loss: 10.495629. Entropy: 1.216516.
Training network. lr: 0.000238. clip: 0.095088
Iteration 1630: Policy loss: -0.839688. Value loss: 29.400782. Entropy: 1.018559.
Iteration 1631: Policy loss: -0.761229. Value loss: 20.015207. Entropy: 0.957410.
Iterat

Training network. lr: 0.000237. clip: 0.094940
Iteration 1687: Policy loss: -0.673835. Value loss: 12.046342. Entropy: 0.731099.
Iteration 1688: Policy loss: -0.712121. Value loss: 8.774471. Entropy: 0.725841.
Iteration 1689: Policy loss: -0.719627. Value loss: 9.431079. Entropy: 0.721567.
episode: 721   score: 135.0  epsilon: 1.0    steps: 226  evaluation reward: 220.05
Training network. lr: 0.000237. clip: 0.094940
Iteration 1690: Policy loss: -3.925382. Value loss: 254.696808. Entropy: 0.410820.
Iteration 1691: Policy loss: -3.533398. Value loss: 169.711502. Entropy: 0.300053.
Iteration 1692: Policy loss: -4.905250. Value loss: 197.679352. Entropy: 0.309886.
episode: 722   score: 270.0  epsilon: 1.0    steps: 498  evaluation reward: 221.4
Training network. lr: 0.000237. clip: 0.094940
Iteration 1693: Policy loss: 3.494462. Value loss: 42.655220. Entropy: 0.199075.
Iteration 1694: Policy loss: 3.897582. Value loss: 27.585079. Entropy: 0.209461.
Iteration 1695: Policy loss: 3.658527. 

Iteration 1749: Policy loss: -4.066023. Value loss: 116.825409. Entropy: 0.630893.
episode: 748   score: 460.0  epsilon: 1.0    steps: 50  evaluation reward: 219.75
episode: 749   score: 125.0  epsilon: 1.0    steps: 320  evaluation reward: 218.85
Training network. lr: 0.000237. clip: 0.094784
Iteration 1750: Policy loss: 2.775236. Value loss: 62.867756. Entropy: 0.595703.
Iteration 1751: Policy loss: 2.696617. Value loss: 36.527176. Entropy: 0.685970.
Iteration 1752: Policy loss: 2.646499. Value loss: 30.790228. Entropy: 0.840031.
episode: 750   score: 220.0  epsilon: 1.0    steps: 906  evaluation reward: 219.2
Training network. lr: 0.000237. clip: 0.094627
Iteration 1753: Policy loss: 2.367167. Value loss: 44.803761. Entropy: 0.923444.
Iteration 1754: Policy loss: 2.425138. Value loss: 30.788416. Entropy: 0.960375.
Iteration 1755: Policy loss: 2.291775. Value loss: 24.452574. Entropy: 0.948977.
Training network. lr: 0.000237. clip: 0.094627
Iteration 1756: Policy loss: 3.374216. Valu

Iteration 1810: Policy loss: -1.941224. Value loss: 241.812622. Entropy: 0.253425.
Iteration 1811: Policy loss: -1.283008. Value loss: 159.414886. Entropy: 0.229384.
Iteration 1812: Policy loss: -1.756278. Value loss: 162.342911. Entropy: 0.191980.
episode: 775   score: 80.0  epsilon: 1.0    steps: 297  evaluation reward: 226.6
episode: 776   score: 210.0  epsilon: 1.0    steps: 926  evaluation reward: 226.85
Training network. lr: 0.000236. clip: 0.094480
Iteration 1813: Policy loss: 1.087275. Value loss: 54.720272. Entropy: 0.407839.
Iteration 1814: Policy loss: 0.920385. Value loss: 25.449926. Entropy: 0.384704.
Iteration 1815: Policy loss: 0.983053. Value loss: 20.108599. Entropy: 0.425955.
episode: 777   score: 240.0  epsilon: 1.0    steps: 32  evaluation reward: 229.05
episode: 778   score: 155.0  epsilon: 1.0    steps: 216  evaluation reward: 229.85
episode: 779   score: 210.0  epsilon: 1.0    steps: 618  evaluation reward: 230.4
episode: 780   score: 135.0  epsilon: 1.0    steps

Iteration 1872: Policy loss: -0.496429. Value loss: 43.581467. Entropy: 0.721432.
episode: 802   score: 145.0  epsilon: 1.0    steps: 126  evaluation reward: 217.8
Training network. lr: 0.000236. clip: 0.094323
Iteration 1873: Policy loss: -1.435789. Value loss: 61.940807. Entropy: 1.022902.
Iteration 1874: Policy loss: -1.115221. Value loss: 40.345119. Entropy: 1.047972.
Iteration 1875: Policy loss: -1.118605. Value loss: 31.423044. Entropy: 1.025760.
episode: 803   score: 300.0  epsilon: 1.0    steps: 253  evaluation reward: 218.7
Training network. lr: 0.000236. clip: 0.094323
Iteration 1876: Policy loss: -3.714795. Value loss: 60.878227. Entropy: 0.991863.
Iteration 1877: Policy loss: -3.457717. Value loss: 39.995323. Entropy: 0.974983.
Iteration 1878: Policy loss: -3.709589. Value loss: 31.510607. Entropy: 0.925961.
episode: 804   score: 180.0  epsilon: 1.0    steps: 366  evaluation reward: 216.95
episode: 805   score: 350.0  epsilon: 1.0    steps: 809  evaluation reward: 218.9
Tra

episode: 830   score: 155.0  epsilon: 1.0    steps: 1013  evaluation reward: 234.7
Training network. lr: 0.000235. clip: 0.094166
Iteration 1933: Policy loss: 3.493282. Value loss: 55.385120. Entropy: 0.577141.
Iteration 1934: Policy loss: 3.693843. Value loss: 29.503822. Entropy: 0.553455.
Iteration 1935: Policy loss: 3.545416. Value loss: 23.241152. Entropy: 0.602404.
episode: 831   score: 440.0  epsilon: 1.0    steps: 150  evaluation reward: 235.75
Training network. lr: 0.000235. clip: 0.094166
Iteration 1936: Policy loss: 2.955927. Value loss: 44.844292. Entropy: 0.697165.
Iteration 1937: Policy loss: 3.016835. Value loss: 30.753458. Entropy: 0.627177.
Iteration 1938: Policy loss: 3.162304. Value loss: 26.364586. Entropy: 0.671633.
episode: 832   score: 105.0  epsilon: 1.0    steps: 489  evaluation reward: 236.05
episode: 833   score: 215.0  epsilon: 1.0    steps: 755  evaluation reward: 234.4
Training network. lr: 0.000235. clip: 0.094166
Iteration 1939: Policy loss: 2.294163. Val

Training network. lr: 0.000235. clip: 0.094019
Iteration 1996: Policy loss: 2.059394. Value loss: 59.245068. Entropy: 0.748718.
Iteration 1997: Policy loss: 2.273056. Value loss: 35.222099. Entropy: 0.803460.
Iteration 1998: Policy loss: 2.296624. Value loss: 30.952427. Entropy: 0.830147.
episode: 855   score: 105.0  epsilon: 1.0    steps: 116  evaluation reward: 238.0
episode: 856   score: 180.0  epsilon: 1.0    steps: 597  evaluation reward: 239.35
episode: 857   score: 185.0  epsilon: 1.0    steps: 818  evaluation reward: 238.55
episode: 858   score: 660.0  epsilon: 1.0    steps: 942  evaluation reward: 242.0
Training network. lr: 0.000235. clip: 0.094019
Iteration 1999: Policy loss: 2.138669. Value loss: 52.366707. Entropy: 0.863845.
Iteration 2000: Policy loss: 2.494306. Value loss: 38.720352. Entropy: 0.885465.
Iteration 2001: Policy loss: 2.308179. Value loss: 29.487810. Entropy: 0.887828.
episode: 859   score: 120.0  epsilon: 1.0    steps: 352  evaluation reward: 240.4
episode:

Iteration 2059: Policy loss: 1.160418. Value loss: 45.023956. Entropy: 0.681939.
Iteration 2060: Policy loss: 0.716431. Value loss: 35.549557. Entropy: 0.683526.
Iteration 2061: Policy loss: 1.114015. Value loss: 27.411850. Entropy: 0.698455.
Training network. lr: 0.000234. clip: 0.093705
Iteration 2062: Policy loss: 2.207444. Value loss: 36.869732. Entropy: 0.663849.
Iteration 2063: Policy loss: 2.530267. Value loss: 22.720463. Entropy: 0.672217.
Iteration 2064: Policy loss: 2.516617. Value loss: 16.771582. Entropy: 0.619695.
episode: 880   score: 320.0  epsilon: 1.0    steps: 32  evaluation reward: 247.65
episode: 881   score: 265.0  epsilon: 1.0    steps: 989  evaluation reward: 247.9
Training network. lr: 0.000234. clip: 0.093705
Iteration 2065: Policy loss: 1.326175. Value loss: 32.149227. Entropy: 0.598219.
Iteration 2066: Policy loss: 1.337972. Value loss: 22.005419. Entropy: 0.627605.
Iteration 2067: Policy loss: 1.583149. Value loss: 21.979647. Entropy: 0.624625.
episode: 882 

Iteration 2122: Policy loss: 0.144983. Value loss: 15.641424. Entropy: 0.629536.
Iteration 2123: Policy loss: 0.371753. Value loss: 11.478615. Entropy: 0.544508.
Iteration 2124: Policy loss: 0.457594. Value loss: 10.472921. Entropy: 0.574925.
episode: 905   score: 210.0  epsilon: 1.0    steps: 777  evaluation reward: 240.55
Training network. lr: 0.000234. clip: 0.093558
Iteration 2125: Policy loss: -0.419458. Value loss: 12.145477. Entropy: 0.302038.
Iteration 2126: Policy loss: -0.535957. Value loss: 10.433533. Entropy: 0.295201.
Iteration 2127: Policy loss: -0.406082. Value loss: 9.531811. Entropy: 0.307350.
episode: 906   score: 180.0  epsilon: 1.0    steps: 133  evaluation reward: 237.25
episode: 907   score: 210.0  epsilon: 1.0    steps: 402  evaluation reward: 235.75
Training network. lr: 0.000234. clip: 0.093558
Iteration 2128: Policy loss: -0.463126. Value loss: 5.199169. Entropy: 0.266860.
Iteration 2129: Policy loss: -0.468141. Value loss: 4.698680. Entropy: 0.260660.
Iterati

episode: 931   score: 620.0  epsilon: 1.0    steps: 922  evaluation reward: 227.65
Training network. lr: 0.000234. clip: 0.093401
Iteration 2185: Policy loss: -0.003084. Value loss: 28.389328. Entropy: 0.136751.
Iteration 2186: Policy loss: 0.021397. Value loss: 15.925108. Entropy: 0.127457.
Iteration 2187: Policy loss: -0.060650. Value loss: 15.704276. Entropy: 0.139121.
Training network. lr: 0.000234. clip: 0.093401
Iteration 2188: Policy loss: 0.731687. Value loss: 10.546285. Entropy: 0.062879.
Iteration 2189: Policy loss: 0.613728. Value loss: 9.416876. Entropy: 0.066704.
Iteration 2190: Policy loss: 0.656514. Value loss: 7.542037. Entropy: 0.076876.
Training network. lr: 0.000234. clip: 0.093401
Iteration 2191: Policy loss: 0.022201. Value loss: 5.267183. Entropy: 0.056365.
Iteration 2192: Policy loss: 0.133983. Value loss: 3.991392. Entropy: 0.047958.
Iteration 2193: Policy loss: -0.013594. Value loss: 3.903472. Entropy: 0.045504.
Training network. lr: 0.000234. clip: 0.093401
It

Iteration 2249: Policy loss: 0.265741. Value loss: 8.027597. Entropy: 0.158766.
Iteration 2250: Policy loss: 0.266925. Value loss: 8.168193. Entropy: 0.148996.
episode: 955   score: 365.0  epsilon: 1.0    steps: 587  evaluation reward: 222.2
Training network. lr: 0.000233. clip: 0.093097
Iteration 2251: Policy loss: 0.371944. Value loss: 20.353098. Entropy: 0.327018.
Iteration 2252: Policy loss: 0.218327. Value loss: 14.113194. Entropy: 0.348334.
Iteration 2253: Policy loss: 0.282345. Value loss: 12.123341. Entropy: 0.366254.
episode: 956   score: 135.0  epsilon: 1.0    steps: 283  evaluation reward: 221.75
Training network. lr: 0.000233. clip: 0.093097
Iteration 2254: Policy loss: -3.152433. Value loss: 253.016190. Entropy: 0.368955.
Iteration 2255: Policy loss: -2.846454. Value loss: 219.682465. Entropy: 0.384032.
Iteration 2256: Policy loss: -3.739532. Value loss: 199.592056. Entropy: 0.385241.
episode: 957   score: 185.0  epsilon: 1.0    steps: 121  evaluation reward: 221.75
episod

Iteration 2310: Policy loss: 1.245649. Value loss: 18.490248. Entropy: 0.911175.
episode: 983   score: 275.0  epsilon: 1.0    steps: 280  evaluation reward: 203.05
Training network. lr: 0.000232. clip: 0.092941
Iteration 2311: Policy loss: 2.952392. Value loss: 28.026291. Entropy: 0.693844.
Iteration 2312: Policy loss: 3.078306. Value loss: 17.082567. Entropy: 0.689212.
Iteration 2313: Policy loss: 3.072373. Value loss: 15.625852. Entropy: 0.682251.
episode: 984   score: 240.0  epsilon: 1.0    steps: 206  evaluation reward: 202.55
episode: 985   score: 170.0  epsilon: 1.0    steps: 606  evaluation reward: 203.05
Training network. lr: 0.000232. clip: 0.092941
Iteration 2314: Policy loss: 1.652220. Value loss: 31.761036. Entropy: 0.455542.
Iteration 2315: Policy loss: 1.294827. Value loss: 19.405153. Entropy: 0.477048.
Iteration 2316: Policy loss: 1.614235. Value loss: 16.601763. Entropy: 0.477437.
Training network. lr: 0.000232. clip: 0.092941
Iteration 2317: Policy loss: 1.125315. Valu

episode: 1007   score: 140.0  epsilon: 1.0    steps: 796  evaluation reward: 218.75
Training network. lr: 0.000232. clip: 0.092784
Iteration 2374: Policy loss: 1.308556. Value loss: 37.495998. Entropy: 0.815529.
Iteration 2375: Policy loss: 1.353734. Value loss: 26.910820. Entropy: 0.849331.
Iteration 2376: Policy loss: 1.305030. Value loss: 21.709814. Entropy: 0.827069.
episode: 1008   score: 135.0  epsilon: 1.0    steps: 315  evaluation reward: 217.5
Training network. lr: 0.000232. clip: 0.092784
Iteration 2377: Policy loss: 3.599595. Value loss: 52.866333. Entropy: 0.826645.
Iteration 2378: Policy loss: 3.215151. Value loss: 35.240326. Entropy: 0.801955.
Iteration 2379: Policy loss: 3.281137. Value loss: 31.176695. Entropy: 0.840964.
episode: 1009   score: 495.0  epsilon: 1.0    steps: 197  evaluation reward: 220.35
episode: 1010   score: 135.0  epsilon: 1.0    steps: 623  evaluation reward: 219.9
episode: 1011   score: 175.0  epsilon: 1.0    steps: 676  evaluation reward: 219.85
Tr

Iteration 2438: Policy loss: 0.335836. Value loss: 14.213782. Entropy: 0.493087.
Iteration 2439: Policy loss: 0.376142. Value loss: 11.066486. Entropy: 0.471368.
episode: 1031   score: 545.0  epsilon: 1.0    steps: 351  evaluation reward: 229.25
episode: 1032   score: 655.0  epsilon: 1.0    steps: 589  evaluation reward: 233.7
Training network. lr: 0.000232. clip: 0.092636
Iteration 2440: Policy loss: 0.723191. Value loss: 187.087860. Entropy: 0.420463.
Iteration 2441: Policy loss: 0.581151. Value loss: 100.023987. Entropy: 0.422657.
Iteration 2442: Policy loss: -0.241384. Value loss: 117.017998. Entropy: 0.422389.
episode: 1033   score: 530.0  epsilon: 1.0    steps: 815  evaluation reward: 237.2
Training network. lr: 0.000232. clip: 0.092636
Iteration 2443: Policy loss: 1.164596. Value loss: 48.488770. Entropy: 0.472288.
Iteration 2444: Policy loss: 1.365269. Value loss: 26.865570. Entropy: 0.499868.
Iteration 2445: Policy loss: 1.238146. Value loss: 20.791712. Entropy: 0.510744.
epis

Iteration 2502: Policy loss: 0.196643. Value loss: 16.159082. Entropy: 0.675585.
episode: 1055   score: 70.0  epsilon: 1.0    steps: 650  evaluation reward: 245.1
episode: 1056   score: 250.0  epsilon: 1.0    steps: 831  evaluation reward: 246.25
Training network. lr: 0.000231. clip: 0.092323
Iteration 2503: Policy loss: 0.212027. Value loss: 45.640972. Entropy: 0.520016.
Iteration 2504: Policy loss: 0.217784. Value loss: 22.862068. Entropy: 0.488754.
Iteration 2505: Policy loss: 0.210003. Value loss: 18.937296. Entropy: 0.500271.
Training network. lr: 0.000231. clip: 0.092323
Iteration 2506: Policy loss: 1.282356. Value loss: 55.050133. Entropy: 0.454208.
Iteration 2507: Policy loss: 1.387764. Value loss: 32.111546. Entropy: 0.460017.
Iteration 2508: Policy loss: 1.178184. Value loss: 24.353481. Entropy: 0.471908.
Training network. lr: 0.000231. clip: 0.092323
Iteration 2509: Policy loss: 2.211497. Value loss: 44.860241. Entropy: 0.671053.
Iteration 2510: Policy loss: 1.330361. Value 

Iteration 2567: Policy loss: -1.422301. Value loss: 44.463882. Entropy: 0.499967.
Iteration 2568: Policy loss: -1.418653. Value loss: 33.290657. Entropy: 0.453538.
episode: 1078   score: 325.0  epsilon: 1.0    steps: 348  evaluation reward: 272.75
episode: 1079   score: 220.0  epsilon: 1.0    steps: 427  evaluation reward: 272.8
Training network. lr: 0.000230. clip: 0.092176
Iteration 2569: Policy loss: 1.949825. Value loss: 34.974438. Entropy: 0.506779.
Iteration 2570: Policy loss: 1.936129. Value loss: 19.009113. Entropy: 0.428042.
Iteration 2571: Policy loss: 1.775492. Value loss: 15.003953. Entropy: 0.447247.
episode: 1080   score: 190.0  epsilon: 1.0    steps: 514  evaluation reward: 271.9
Training network. lr: 0.000230. clip: 0.092176
Iteration 2572: Policy loss: 1.362781. Value loss: 26.705202. Entropy: 0.444312.
Iteration 2573: Policy loss: 1.601865. Value loss: 16.238417. Entropy: 0.448368.
Iteration 2574: Policy loss: 1.502978. Value loss: 14.197544. Entropy: 0.448937.
episod

Iteration 2628: Policy loss: -1.882595. Value loss: 29.553719. Entropy: 0.437971.
episode: 1105   score: 345.0  epsilon: 1.0    steps: 321  evaluation reward: 272.7
Training network. lr: 0.000230. clip: 0.092019
Iteration 2629: Policy loss: 1.251305. Value loss: 30.265930. Entropy: 0.525082.
Iteration 2630: Policy loss: 1.733644. Value loss: 18.092102. Entropy: 0.530175.
Iteration 2631: Policy loss: 1.533573. Value loss: 15.163594. Entropy: 0.508286.
episode: 1106   score: 460.0  epsilon: 1.0    steps: 447  evaluation reward: 275.2
episode: 1107   score: 435.0  epsilon: 1.0    steps: 603  evaluation reward: 278.15
episode: 1108   score: 245.0  epsilon: 1.0    steps: 687  evaluation reward: 279.25
Training network. lr: 0.000230. clip: 0.092019
Iteration 2632: Policy loss: 0.171768. Value loss: 30.270229. Entropy: 0.447448.
Iteration 2633: Policy loss: -0.030293. Value loss: 15.754935. Entropy: 0.453132.
Iteration 2634: Policy loss: 0.264177. Value loss: 13.187986. Entropy: 0.435547.
epi

Training network. lr: 0.000230. clip: 0.091862
Iteration 2689: Policy loss: -0.109741. Value loss: 22.062237. Entropy: 0.337546.
Iteration 2690: Policy loss: 0.019269. Value loss: 16.610226. Entropy: 0.357432.
Iteration 2691: Policy loss: -0.168237. Value loss: 16.206587. Entropy: 0.372025.
Training network. lr: 0.000230. clip: 0.091862
Iteration 2692: Policy loss: -1.037794. Value loss: 28.457348. Entropy: 0.345785.
Iteration 2693: Policy loss: -1.042592. Value loss: 17.973328. Entropy: 0.301555.
Iteration 2694: Policy loss: -1.049241. Value loss: 15.879011. Entropy: 0.332941.
episode: 1133   score: 210.0  epsilon: 1.0    steps: 735  evaluation reward: 265.05
episode: 1134   score: 225.0  epsilon: 1.0    steps: 798  evaluation reward: 265.2
Training network. lr: 0.000230. clip: 0.091862
Iteration 2695: Policy loss: -1.346569. Value loss: 254.995300. Entropy: 0.261764.
Iteration 2696: Policy loss: -0.099847. Value loss: 136.716721. Entropy: 0.289440.
Iteration 2697: Policy loss: -0.897

Iteration 2751: Policy loss: 2.906044. Value loss: 19.169172. Entropy: 0.538533.
episode: 1159   score: 145.0  epsilon: 1.0    steps: 169  evaluation reward: 253.75
Training network. lr: 0.000229. clip: 0.091558
Iteration 2752: Policy loss: 0.335826. Value loss: 22.001699. Entropy: 0.786891.
Iteration 2753: Policy loss: 0.247425. Value loss: 14.579674. Entropy: 0.755266.
Iteration 2754: Policy loss: 0.348698. Value loss: 12.102354. Entropy: 0.756864.
episode: 1160   score: 195.0  epsilon: 1.0    steps: 351  evaluation reward: 254.6
Training network. lr: 0.000229. clip: 0.091558
Iteration 2755: Policy loss: -1.720709. Value loss: 51.092899. Entropy: 0.656760.
Iteration 2756: Policy loss: -2.402290. Value loss: 29.690002. Entropy: 0.591446.
Iteration 2757: Policy loss: -1.971863. Value loss: 24.360291. Entropy: 0.605505.
episode: 1161   score: 290.0  epsilon: 1.0    steps: 601  evaluation reward: 256.2
episode: 1162   score: 195.0  epsilon: 1.0    steps: 863  evaluation reward: 256.15
Tr

Training network. lr: 0.000229. clip: 0.091401
Iteration 2815: Policy loss: 0.592289. Value loss: 35.006107. Entropy: 0.388277.
Iteration 2816: Policy loss: 0.863799. Value loss: 20.904135. Entropy: 0.432565.
Iteration 2817: Policy loss: 0.878595. Value loss: 14.070838. Entropy: 0.455264.
episode: 1183   score: 460.0  epsilon: 1.0    steps: 564  evaluation reward: 250.2
episode: 1184   score: 215.0  epsilon: 1.0    steps: 853  evaluation reward: 246.15
Training network. lr: 0.000229. clip: 0.091401
Iteration 2818: Policy loss: 0.907194. Value loss: 40.084076. Entropy: 0.654945.
Iteration 2819: Policy loss: 0.634081. Value loss: 29.769474. Entropy: 0.612965.
Iteration 2820: Policy loss: 0.812464. Value loss: 23.095398. Entropy: 0.658658.
episode: 1185   score: 315.0  epsilon: 1.0    steps: 159  evaluation reward: 248.0
episode: 1186   score: 455.0  epsilon: 1.0    steps: 926  evaluation reward: 250.5
Training network. lr: 0.000229. clip: 0.091401
Iteration 2821: Policy loss: -0.665778. 

episode: 1204   score: 280.0  epsilon: 1.0    steps: 222  evaluation reward: 271.2
Training network. lr: 0.000228. clip: 0.091254
Iteration 2881: Policy loss: 0.203833. Value loss: 27.823561. Entropy: 0.440726.
Iteration 2882: Policy loss: 0.555640. Value loss: 18.656727. Entropy: 0.419970.
Iteration 2883: Policy loss: 0.446668. Value loss: 14.666582. Entropy: 0.422256.
episode: 1205   score: 220.0  epsilon: 1.0    steps: 18  evaluation reward: 269.95
Training network. lr: 0.000228. clip: 0.091254
Iteration 2884: Policy loss: -0.903610. Value loss: 305.173401. Entropy: 0.400092.
Iteration 2885: Policy loss: -0.564637. Value loss: 164.144058. Entropy: 0.316441.
Iteration 2886: Policy loss: -0.190821. Value loss: 96.429207. Entropy: 0.296261.
Training network. lr: 0.000228. clip: 0.091254
Iteration 2887: Policy loss: -0.647410. Value loss: 67.382118. Entropy: 0.330669.
Iteration 2888: Policy loss: -0.696636. Value loss: 47.239082. Entropy: 0.369963.
Iteration 2889: Policy loss: -0.710221

Iteration 2945: Policy loss: 0.066068. Value loss: 19.298328. Entropy: 0.219111.
Iteration 2946: Policy loss: 0.298025. Value loss: 18.638599. Entropy: 0.232104.
episode: 1228   score: 190.0  epsilon: 1.0    steps: 158  evaluation reward: 288.3
episode: 1229   score: 370.0  epsilon: 1.0    steps: 466  evaluation reward: 291.7
Training network. lr: 0.000228. clip: 0.091097
Iteration 2947: Policy loss: -1.213589. Value loss: 34.982826. Entropy: 0.252010.
Iteration 2948: Policy loss: -1.161341. Value loss: 23.716103. Entropy: 0.225231.
Iteration 2949: Policy loss: -1.284958. Value loss: 20.323792. Entropy: 0.212977.
episode: 1230   score: 325.0  epsilon: 1.0    steps: 77  evaluation reward: 291.5
Training network. lr: 0.000228. clip: 0.091097
Iteration 2950: Policy loss: 1.228065. Value loss: 30.115664. Entropy: 0.202295.
Iteration 2951: Policy loss: 1.326921. Value loss: 18.029827. Entropy: 0.207661.
Iteration 2952: Policy loss: 1.101540. Value loss: 15.772848. Entropy: 0.286997.
Trainin

episode: 1255   score: 300.0  epsilon: 1.0    steps: 285  evaluation reward: 291.2
episode: 1256   score: 195.0  epsilon: 1.0    steps: 735  evaluation reward: 290.2
episode: 1257   score: 315.0  epsilon: 1.0    steps: 981  evaluation reward: 291.1
Training network. lr: 0.000227. clip: 0.090793
Iteration 3007: Policy loss: -0.861262. Value loss: 28.108791. Entropy: 0.280731.
Iteration 3008: Policy loss: -1.006473. Value loss: 17.896538. Entropy: 0.292160.
Iteration 3009: Policy loss: -0.801275. Value loss: 14.612124. Entropy: 0.304765.
episode: 1258   score: 180.0  epsilon: 1.0    steps: 212  evaluation reward: 291.15
episode: 1259   score: 135.0  epsilon: 1.0    steps: 386  evaluation reward: 291.05
Training network. lr: 0.000227. clip: 0.090793
Iteration 3010: Policy loss: 0.447557. Value loss: 20.673946. Entropy: 0.244339.
Iteration 3011: Policy loss: 0.128267. Value loss: 13.799458. Entropy: 0.209047.
Iteration 3012: Policy loss: 0.307414. Value loss: 10.154296. Entropy: 0.238947.


episode: 1280   score: 100.0  epsilon: 1.0    steps: 676  evaluation reward: 292.4
episode: 1281   score: 390.0  epsilon: 1.0    steps: 857  evaluation reward: 291.5
episode: 1282   score: 230.0  epsilon: 1.0    steps: 902  evaluation reward: 291.5
Training network. lr: 0.000227. clip: 0.090637
Iteration 3070: Policy loss: 1.098296. Value loss: 36.025547. Entropy: 0.716109.
Iteration 3071: Policy loss: 1.087731. Value loss: 21.875546. Entropy: 0.682727.
Iteration 3072: Policy loss: 0.983153. Value loss: 17.975864. Entropy: 0.662271.
episode: 1283   score: 105.0  epsilon: 1.0    steps: 397  evaluation reward: 287.95
Training network. lr: 0.000227. clip: 0.090637
Iteration 3073: Policy loss: -2.845424. Value loss: 55.091675. Entropy: 0.784409.
Iteration 3074: Policy loss: -2.630213. Value loss: 32.241028. Entropy: 0.753630.
Iteration 3075: Policy loss: -2.677572. Value loss: 26.387604. Entropy: 0.751730.
episode: 1284   score: 75.0  epsilon: 1.0    steps: 767  evaluation reward: 286.55
T

Iteration 3131: Policy loss: 3.727169. Value loss: 12.011443. Entropy: 0.902255.
Iteration 3132: Policy loss: 3.608222. Value loss: 11.574549. Entropy: 0.939129.
Training network. lr: 0.000226. clip: 0.090480
Iteration 3133: Policy loss: 1.810600. Value loss: 28.256300. Entropy: 0.827180.
Iteration 3134: Policy loss: 2.077802. Value loss: 16.660896. Entropy: 0.801288.
Iteration 3135: Policy loss: 1.846941. Value loss: 13.066233. Entropy: 0.808594.
episode: 1307   score: 345.0  epsilon: 1.0    steps: 80  evaluation reward: 259.85
Training network. lr: 0.000226. clip: 0.090480
Iteration 3136: Policy loss: 1.998812. Value loss: 26.725208. Entropy: 0.702071.
Iteration 3137: Policy loss: 1.892511. Value loss: 15.096488. Entropy: 0.700095.
Iteration 3138: Policy loss: 1.997638. Value loss: 12.049831. Entropy: 0.728160.
episode: 1308   score: 285.0  epsilon: 1.0    steps: 418  evaluation reward: 259.05
Training network. lr: 0.000226. clip: 0.090480
Iteration 3139: Policy loss: 0.345649. Value

Training network. lr: 0.000226. clip: 0.090332
Iteration 3196: Policy loss: 1.760134. Value loss: 29.363588. Entropy: 0.565931.
Iteration 3197: Policy loss: 1.663014. Value loss: 14.385849. Entropy: 0.591730.
Iteration 3198: Policy loss: 1.811888. Value loss: 10.725096. Entropy: 0.581039.
Training network. lr: 0.000226. clip: 0.090332
Iteration 3199: Policy loss: -8.852673. Value loss: 497.616821. Entropy: 0.300922.
Iteration 3200: Policy loss: -8.847976. Value loss: 308.920013. Entropy: 0.181237.
Iteration 3201: Policy loss: -8.510406. Value loss: 281.545502. Entropy: 0.166322.
episode: 1330   score: 290.0  epsilon: 1.0    steps: 574  evaluation reward: 254.55
Training network. lr: 0.000225. clip: 0.090176
Iteration 3202: Policy loss: 2.114472. Value loss: 48.107994. Entropy: 0.280033.
Iteration 3203: Policy loss: 2.264899. Value loss: 33.516045. Entropy: 0.331449.
Iteration 3204: Policy loss: 2.236218. Value loss: 22.245909. Entropy: 0.420650.
episode: 1331   score: 285.0  epsilon: 1

Iteration 3259: Policy loss: -1.449932. Value loss: 11.339739. Entropy: 0.420132.
Iteration 3260: Policy loss: -1.524654. Value loss: 7.105929. Entropy: 0.447225.
Iteration 3261: Policy loss: -1.393548. Value loss: 5.992872. Entropy: 0.427707.
Training network. lr: 0.000225. clip: 0.090019
Iteration 3262: Policy loss: -0.051116. Value loss: 36.693169. Entropy: 0.433469.
Iteration 3263: Policy loss: -0.768900. Value loss: 17.556616. Entropy: 0.431394.
Iteration 3264: Policy loss: -0.087717. Value loss: 17.859558. Entropy: 0.464030.
Training network. lr: 0.000225. clip: 0.090019
Iteration 3265: Policy loss: -4.608569. Value loss: 383.028046. Entropy: 0.449020.
Iteration 3266: Policy loss: -5.016473. Value loss: 270.599640. Entropy: 0.403398.
Iteration 3267: Policy loss: -4.589665. Value loss: 160.225021. Entropy: 0.380828.
episode: 1354   score: 415.0  epsilon: 1.0    steps: 686  evaluation reward: 265.9
Training network. lr: 0.000225. clip: 0.090019
Iteration 3268: Policy loss: 0.352853

Iteration 3324: Policy loss: 2.274737. Value loss: 9.546555. Entropy: 0.217118.
Training network. lr: 0.000225. clip: 0.089872
Iteration 3325: Policy loss: -1.217020. Value loss: 150.940704. Entropy: 0.110854.
Iteration 3326: Policy loss: -0.791852. Value loss: 41.023605. Entropy: 0.114710.
Iteration 3327: Policy loss: -1.158013. Value loss: 79.255379. Entropy: 0.110095.
episode: 1377   score: 420.0  epsilon: 1.0    steps: 14  evaluation reward: 277.65
episode: 1378   score: 205.0  epsilon: 1.0    steps: 237  evaluation reward: 277.75
episode: 1379   score: 155.0  epsilon: 1.0    steps: 402  evaluation reward: 278.0
episode: 1380   score: 205.0  epsilon: 1.0    steps: 713  evaluation reward: 279.05
Training network. lr: 0.000225. clip: 0.089872
Iteration 3328: Policy loss: -1.706344. Value loss: 29.825468. Entropy: 0.191351.
Iteration 3329: Policy loss: -1.920383. Value loss: 22.525742. Entropy: 0.197335.
Iteration 3330: Policy loss: -1.645586. Value loss: 19.781557. Entropy: 0.185690.

Iteration 3386: Policy loss: 0.875120. Value loss: 16.358236. Entropy: 0.140428.
Iteration 3387: Policy loss: 0.743713. Value loss: 15.157722. Entropy: 0.140387.
episode: 1403   score: 545.0  epsilon: 1.0    steps: 906  evaluation reward: 292.7
Training network. lr: 0.000224. clip: 0.089715
Iteration 3388: Policy loss: 0.081230. Value loss: 15.967778. Entropy: 0.182493.
Iteration 3389: Policy loss: 0.092133. Value loss: 11.955717. Entropy: 0.179618.
Iteration 3390: Policy loss: 0.113569. Value loss: 10.101035. Entropy: 0.181249.
episode: 1404   score: 105.0  epsilon: 1.0    steps: 193  evaluation reward: 291.65
episode: 1405   score: 205.0  epsilon: 1.0    steps: 860  evaluation reward: 292.15
Training network. lr: 0.000224. clip: 0.089715
Iteration 3391: Policy loss: 0.423895. Value loss: 21.732441. Entropy: 0.242106.
Iteration 3392: Policy loss: 0.388050. Value loss: 14.485948. Entropy: 0.270382.
Iteration 3393: Policy loss: 0.550825. Value loss: 11.183221. Entropy: 0.282846.
episode

Training network. lr: 0.000224. clip: 0.089411
Iteration 3451: Policy loss: 0.190491. Value loss: 13.293644. Entropy: 0.061757.
Iteration 3452: Policy loss: 0.107907. Value loss: 9.159985. Entropy: 0.062285.
Iteration 3453: Policy loss: 0.135803. Value loss: 8.442396. Entropy: 0.064865.
episode: 1426   score: 195.0  epsilon: 1.0    steps: 229  evaluation reward: 303.6
episode: 1427   score: 195.0  epsilon: 1.0    steps: 737  evaluation reward: 301.45
Training network. lr: 0.000224. clip: 0.089411
Iteration 3454: Policy loss: 2.570655. Value loss: 29.517149. Entropy: 0.105443.
Iteration 3455: Policy loss: 2.427852. Value loss: 16.635798. Entropy: 0.061650.
Iteration 3456: Policy loss: 2.369313. Value loss: 14.325904. Entropy: 0.055723.
Training network. lr: 0.000224. clip: 0.089411
Iteration 3457: Policy loss: -0.678087. Value loss: 34.922802. Entropy: 0.081854.
Iteration 3458: Policy loss: -0.221538. Value loss: 18.129570. Entropy: 0.107730.
Iteration 3459: Policy loss: -0.531828. Valu

episode: 1452   score: 220.0  epsilon: 1.0    steps: 730  evaluation reward: 288.65
Training network. lr: 0.000223. clip: 0.089254
Iteration 3514: Policy loss: 1.977071. Value loss: 22.055330. Entropy: 0.264695.
Iteration 3515: Policy loss: 2.209653. Value loss: 12.639461. Entropy: 0.252650.
Iteration 3516: Policy loss: 2.047094. Value loss: 11.415161. Entropy: 0.254501.
episode: 1453   score: 70.0  epsilon: 1.0    steps: 820  evaluation reward: 288.6
Training network. lr: 0.000223. clip: 0.089254
Iteration 3517: Policy loss: -2.425020. Value loss: 28.678041. Entropy: 0.179121.
Iteration 3518: Policy loss: -2.830363. Value loss: 18.558033. Entropy: 0.189830.
Iteration 3519: Policy loss: -2.420207. Value loss: 18.093582. Entropy: 0.163668.
episode: 1454   score: 215.0  epsilon: 1.0    steps: 54  evaluation reward: 286.6
episode: 1455   score: 140.0  epsilon: 1.0    steps: 496  evaluation reward: 285.7
Training network. lr: 0.000223. clip: 0.089254
Iteration 3520: Policy loss: 0.711108. 

Iteration 3578: Policy loss: -1.823323. Value loss: 23.259022. Entropy: 0.195227.
Iteration 3579: Policy loss: -1.780225. Value loss: 19.466089. Entropy: 0.200529.
episode: 1476   score: 325.0  epsilon: 1.0    steps: 479  evaluation reward: 265.3
episode: 1477   score: 280.0  epsilon: 1.0    steps: 653  evaluation reward: 263.9
Training network. lr: 0.000223. clip: 0.089097
Iteration 3580: Policy loss: -2.415237. Value loss: 41.198334. Entropy: 0.251721.
Iteration 3581: Policy loss: -2.317340. Value loss: 22.789652. Entropy: 0.271008.
Iteration 3582: Policy loss: -2.638889. Value loss: 19.050220. Entropy: 0.264275.
episode: 1478   score: 250.0  epsilon: 1.0    steps: 331  evaluation reward: 264.35
episode: 1479   score: 435.0  epsilon: 1.0    steps: 573  evaluation reward: 267.15
episode: 1480   score: 365.0  epsilon: 1.0    steps: 984  evaluation reward: 268.75
Training network. lr: 0.000223. clip: 0.089097
Iteration 3583: Policy loss: -2.808780. Value loss: 303.420624. Entropy: 0.335

Iteration 3642: Policy loss: -0.242172. Value loss: 9.621587. Entropy: 0.227654.
Training network. lr: 0.000222. clip: 0.088950
Iteration 3643: Policy loss: -0.247327. Value loss: 226.803787. Entropy: 0.134414.
Iteration 3644: Policy loss: 0.346189. Value loss: 200.520416. Entropy: 0.061200.
Iteration 3645: Policy loss: 0.348343. Value loss: 132.500259. Entropy: 0.058940.
Training network. lr: 0.000222. clip: 0.088950
Iteration 3646: Policy loss: -0.046340. Value loss: 32.129341. Entropy: 0.103603.
Iteration 3647: Policy loss: 0.319090. Value loss: 21.490744. Entropy: 0.110812.
Iteration 3648: Policy loss: -0.074305. Value loss: 16.807722. Entropy: 0.108793.
Training network. lr: 0.000222. clip: 0.088950
Iteration 3649: Policy loss: 2.268788. Value loss: 85.124260. Entropy: 0.058471.
Iteration 3650: Policy loss: 2.677365. Value loss: 38.775421. Entropy: 0.048935.
Iteration 3651: Policy loss: 2.369014. Value loss: 31.547062. Entropy: 0.137572.
Training network. lr: 0.000222. clip: 0.088

Iteration 3708: Policy loss: -0.045451. Value loss: 15.980776. Entropy: 0.195034.
episode: 1522   score: 400.0  epsilon: 1.0    steps: 770  evaluation reward: 274.1
Training network. lr: 0.000222. clip: 0.088637
Iteration 3709: Policy loss: 0.611244. Value loss: 15.338656. Entropy: 0.220860.
Iteration 3710: Policy loss: 0.485223. Value loss: 9.261387. Entropy: 0.221617.
Iteration 3711: Policy loss: 0.341080. Value loss: 9.892632. Entropy: 0.229776.
episode: 1523   score: 215.0  epsilon: 1.0    steps: 578  evaluation reward: 274.3
Training network. lr: 0.000222. clip: 0.088637
Iteration 3712: Policy loss: -0.198284. Value loss: 44.474934. Entropy: 0.134253.
Iteration 3713: Policy loss: -0.583486. Value loss: 24.136442. Entropy: 0.127039.
Iteration 3714: Policy loss: -0.484641. Value loss: 19.954475. Entropy: 0.137250.
episode: 1524   score: 275.0  epsilon: 1.0    steps: 927  evaluation reward: 270.45
Training network. lr: 0.000222. clip: 0.088637
Iteration 3715: Policy loss: -1.328747. 

episode: 1550   score: 285.0  epsilon: 1.0    steps: 264  evaluation reward: 261.95
now time :  2019-02-26 13:38:45.956803
episode: 1551   score: 135.0  epsilon: 1.0    steps: 815  evaluation reward: 261.85
Training network. lr: 0.000221. clip: 0.088489
Iteration 3769: Policy loss: 0.135395. Value loss: 32.172546. Entropy: 0.316797.
Iteration 3770: Policy loss: 0.308135. Value loss: 19.165638. Entropy: 0.307874.
Iteration 3771: Policy loss: 0.384717. Value loss: 16.618204. Entropy: 0.307040.
episode: 1552   score: 100.0  epsilon: 1.0    steps: 62  evaluation reward: 260.65
episode: 1553   score: 155.0  epsilon: 1.0    steps: 518  evaluation reward: 261.5
Training network. lr: 0.000221. clip: 0.088489
Iteration 3772: Policy loss: 1.046761. Value loss: 36.352909. Entropy: 0.229490.
Iteration 3773: Policy loss: 0.767591. Value loss: 23.206211. Entropy: 0.226823.
Iteration 3774: Policy loss: 0.787717. Value loss: 20.053158. Entropy: 0.229384.
episode: 1554   score: 80.0  epsilon: 1.0    st

Training network. lr: 0.000221. clip: 0.088333
Iteration 3829: Policy loss: 0.305742. Value loss: 16.793684. Entropy: 0.337647.
Iteration 3830: Policy loss: 0.465713. Value loss: 9.843289. Entropy: 0.316702.
Iteration 3831: Policy loss: 0.267997. Value loss: 7.564712. Entropy: 0.322111.
Training network. lr: 0.000221. clip: 0.088333
Iteration 3832: Policy loss: -1.599959. Value loss: 20.155142. Entropy: 0.205276.
Iteration 3833: Policy loss: -1.626586. Value loss: 12.687953. Entropy: 0.229731.
Iteration 3834: Policy loss: -1.524165. Value loss: 11.063786. Entropy: 0.217740.
episode: 1579   score: 205.0  epsilon: 1.0    steps: 247  evaluation reward: 251.65
episode: 1580   score: 195.0  epsilon: 1.0    steps: 620  evaluation reward: 249.95
Training network. lr: 0.000221. clip: 0.088333
Iteration 3835: Policy loss: -1.563531. Value loss: 28.770050. Entropy: 0.270088.
Iteration 3836: Policy loss: -1.218867. Value loss: 18.720926. Entropy: 0.271002.
Iteration 3837: Policy loss: -1.343923. 

Iteration 3891: Policy loss: -0.928824. Value loss: 13.075006. Entropy: 0.199052.
episode: 1605   score: 75.0  epsilon: 1.0    steps: 83  evaluation reward: 213.75
episode: 1606   score: 210.0  epsilon: 1.0    steps: 558  evaluation reward: 209.2
Training network. lr: 0.000220. clip: 0.088176
Iteration 3892: Policy loss: -0.051510. Value loss: 20.957813. Entropy: 0.284149.
Iteration 3893: Policy loss: -0.409697. Value loss: 14.015271. Entropy: 0.264501.
Iteration 3894: Policy loss: -0.217371. Value loss: 11.035592. Entropy: 0.307459.
episode: 1607   score: 215.0  epsilon: 1.0    steps: 355  evaluation reward: 210.45
Training network. lr: 0.000220. clip: 0.088176
Iteration 3895: Policy loss: -1.987573. Value loss: 31.814426. Entropy: 0.288151.
Iteration 3896: Policy loss: -2.272815. Value loss: 22.876772. Entropy: 0.293534.
Iteration 3897: Policy loss: -2.160172. Value loss: 15.561296. Entropy: 0.309155.
episode: 1608   score: 315.0  epsilon: 1.0    steps: 844  evaluation reward: 208.15

Training network. lr: 0.000220. clip: 0.087872
Iteration 3952: Policy loss: -0.887297. Value loss: 40.823978. Entropy: 0.115070.
Iteration 3953: Policy loss: -0.883693. Value loss: 27.594193. Entropy: 0.074280.
Iteration 3954: Policy loss: -0.971265. Value loss: 22.497408. Entropy: 0.090220.
episode: 1633   score: 105.0  epsilon: 1.0    steps: 101  evaluation reward: 208.6
episode: 1634   score: 80.0  epsilon: 1.0    steps: 496  evaluation reward: 206.65
episode: 1635   score: 530.0  epsilon: 1.0    steps: 725  evaluation reward: 209.3
Training network. lr: 0.000220. clip: 0.087872
Iteration 3955: Policy loss: 2.697236. Value loss: 30.571947. Entropy: 0.109397.
Iteration 3956: Policy loss: 2.847620. Value loss: 18.051216. Entropy: 0.118830.
Iteration 3957: Policy loss: 2.492115. Value loss: 13.098247. Entropy: 0.116770.
episode: 1636   score: 130.0  epsilon: 1.0    steps: 536  evaluation reward: 209.45
episode: 1637   score: 235.0  epsilon: 1.0    steps: 788  evaluation reward: 210.75


Iteration 4009: Policy loss: -0.434667. Value loss: 19.148506. Entropy: 0.125453.
Iteration 4010: Policy loss: -0.777641. Value loss: 14.365764. Entropy: 0.112538.
Iteration 4011: Policy loss: -0.660092. Value loss: 11.808970. Entropy: 0.126076.
episode: 1664   score: 165.0  epsilon: 1.0    steps: 470  evaluation reward: 193.0
episode: 1665   score: 105.0  epsilon: 1.0    steps: 599  evaluation reward: 192.25
episode: 1666   score: 65.0  epsilon: 1.0    steps: 1010  evaluation reward: 189.95
Training network. lr: 0.000219. clip: 0.087715
Iteration 4012: Policy loss: 1.600743. Value loss: 17.493425. Entropy: 0.378607.
Iteration 4013: Policy loss: 1.545107. Value loss: 11.380888. Entropy: 0.486691.
Iteration 4014: Policy loss: 1.607961. Value loss: 9.041997. Entropy: 0.476556.
Training network. lr: 0.000219. clip: 0.087715
Iteration 4015: Policy loss: 0.772930. Value loss: 26.121996. Entropy: 0.483861.
Iteration 4016: Policy loss: 0.925399. Value loss: 13.924390. Entropy: 0.488540.
Itera

Training network. lr: 0.000219. clip: 0.087568
Iteration 4069: Policy loss: -0.693861. Value loss: 19.899605. Entropy: 0.327034.
Iteration 4070: Policy loss: -0.633094. Value loss: 12.154847. Entropy: 0.330613.
Iteration 4071: Policy loss: -0.638956. Value loss: 8.152069. Entropy: 0.322681.
episode: 1693   score: 210.0  epsilon: 1.0    steps: 56  evaluation reward: 197.05
Training network. lr: 0.000219. clip: 0.087568
Iteration 4072: Policy loss: 2.242862. Value loss: 25.694790. Entropy: 0.291980.
Iteration 4073: Policy loss: 2.373662. Value loss: 18.971207. Entropy: 0.273437.
Iteration 4074: Policy loss: 2.320723. Value loss: 15.911024. Entropy: 0.293576.
episode: 1694   score: 285.0  epsilon: 1.0    steps: 395  evaluation reward: 196.2
Training network. lr: 0.000219. clip: 0.087568
Iteration 4075: Policy loss: -0.045041. Value loss: 26.658865. Entropy: 0.430292.
Iteration 4076: Policy loss: 0.064225. Value loss: 21.375977. Entropy: 0.445840.
Iteration 4077: Policy loss: -0.022159. Va

In [None]:
torch.save(agent.policy_net, "./save_model/spaceinvaders_ppo")

In [None]:
### Loop through all environments and run PPO on them

env_names = ['Atlantis-v0', 'Alien-v0', 'Amidar-v0', 'Assault-v0', 'Asterix-v0', 'Asteroids-v0', 'BankHeist-v0', 'MsPacman-v0']

for a in range(len(env_names)):
    
    name = env_names[a]
    
    envs = []
    for i in range(num_envs):
        envs.append(GameEnv(name))
    #env.render()

    number_lives = envs[0].life
    state_size = envs[0].observation_space.shape
    action_size = envs[0].action_space.n
    rewards, episodes = [], []

    vis_env_idx = 0
    vis_env = envs[vis_env_idx]
    e = 0
    frame = 0
    max_eval = -np.inf
    reset_count = 0


    agent = Agent(action_size)
    torch.save(agent.policy_net.state_dict(), "./save_model/" + name + "_best")
    evaluation_reward = deque(maxlen=evaluation_reward_length)
    frame = 0
    memory_size = 0
    reset_max = 10
    
    print("Determing min/max rewards of environment")
    [low, high] = score_range = get_score_range(name)
    print("Min: %d. Max: %d." % (low, high))

    while (frame < 10000000):
        step = 0
        assert(num_envs * env_mem_size == train_frame)
        frame_next_vals = []
        for i in range(num_envs):
            env = envs[i]
            #history = env.history
            #life = env.life
            #state, reward, done, info = [env.state, env.reward, env.done, env.info]
            for j in range(env_mem_size):
                step += 1
                frame += 1

                curr_state = env.history[HISTORY_SIZE-1,:,:]
                action, value = agent.get_action(np.float32(env.history[:HISTORY_SIZE,:,:]) / 255.)

                next_state, env.reward, env.done, env.info = env.step(action)

                if (i == vis_env_idx):
                    vis_env._env.render()

                frame_next_state = get_frame(next_state)
                env.history[HISTORY_SIZE,:,:] = frame_next_state
                terminal_state = check_live(env.life, env.info['ale.lives'])

                env.life = env.info['ale.lives']
                r = ((env.reward - low) / (high - low)) * 10

                agent.memory.push(i, deepcopy(curr_state), action, r, terminal_state, value, 0, 0)
                if (j == env_mem_size-1):
                    _, frame_next_val = agent.get_action(np.float32(env.history[1:,:,:]) / 255.)
                    frame_next_vals.append(frame_next_val)
                env.score += env.reward
                env.history[:HISTORY_SIZE, :, :] = env.history[1:,:,:]

                if (env.done):
                    if (e % 50 == 0):
                        print('now time : ', datetime.now())
                        rewards.append(np.mean(evaluation_reward))
                        episodes.append(e)
                        pylab.plot(episodes, rewards, 'b')
                        pylab.savefig("./save_graph/" + name + "_ppo.png")
                        torch.save(agent.policy_net, "./save_model/" + name + "_ppo")

                        if np.mean(evaluation_reward) > max_eval:
                            torch.save(agent.policy_net.state_dict(), "./save_model/"  + name + "_ppo_best")
                            max_eval = float(np.mean(evaluation_reward))
                            reset_count = 0
                        elif e > 5000:
                            reset_count += 1
                            """
                            if (reset_count == reset_max):
                                print("Training went nowhere, starting again at best model")
                                agent.policy_net.load_state_dict(torch.load("./save_model/spaceinvaders_ppo_best"))
                                agent.update_target_net()
                                reset_count = 0
                            """
                    e += 1
                    evaluation_reward.append(env.score)
                    print("episode:", e, "  score:", env.score,  " epsilon:", agent.epsilon, "   steps:", step,
                      " evaluation reward:", np.mean(evaluation_reward))

                    env.done = False
                    env.score = 0
                    env.history = np.zeros([HISTORY_SIZE+1,84,84], dtype=np.uint8)
                    env.state = env.reset()
                    env.life = number_lives
                    get_init_state(env.history, env.state)



        agent.train_policy_net(frame, frame_next_vals)
        agent.update_target_net()

  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "
  probs = F.softmax(x[:,:self.action_size] - torch.max(x[:,:self.action_size],1)[0].unsqueeze(1))


Training network. lr: 0.000250. clip: 0.100000


  pol_loss += pol_avg.detach().cpu()[0]
  vf_loss += value_loss.detach().cpu()[0]
  ent_total += ent.detach().cpu()[0]


Iteration 1: Policy loss: -0.134242. Value loss: 0.025672. Entropy: 1.384760.
Iteration 2: Policy loss: -0.134622. Value loss: 0.024417. Entropy: 1.382654.
Iteration 3: Policy loss: -0.138663. Value loss: 0.025462. Entropy: 1.380519.
Training network. lr: 0.000250. clip: 0.100000
Iteration 4: Policy loss: -0.166682. Value loss: 0.051994. Entropy: 1.374561.
Iteration 5: Policy loss: -0.153367. Value loss: 0.041330. Entropy: 1.375694.
Iteration 6: Policy loss: -0.167738. Value loss: 0.047337. Entropy: 1.376785.
Training network. lr: 0.000250. clip: 0.100000
Iteration 7: Policy loss: -0.256462. Value loss: 0.727763. Entropy: 1.370962.
Iteration 8: Policy loss: -0.249125. Value loss: 0.479165. Entropy: 1.374830.
Iteration 9: Policy loss: -0.237247. Value loss: 0.508010. Entropy: 1.382150.
Training network. lr: 0.000250. clip: 0.100000
Iteration 10: Policy loss: -0.563716. Value loss: 2.374101. Entropy: 1.376569.
Iteration 11: Policy loss: -0.543814. Value loss: 2.030069. Entropy: 1.379307.

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Training network. lr: 0.000250. clip: 0.100000
Iteration 28: Policy loss: -0.035380. Value loss: 1.883276. Entropy: 1.367360.
Iteration 29: Policy loss: -0.036834. Value loss: 1.759964. Entropy: 1.370840.
Iteration 30: Policy loss: -0.036598. Value loss: 1.746900. Entropy: 1.371950.
Training network. lr: 0.000250. clip: 0.100000
Iteration 31: Policy loss: -0.058636. Value loss: 1.006419. Entropy: 1.369001.
Iteration 32: Policy loss: -0.033320. Value loss: 0.862457. Entropy: 1.364149.
Iteration 33: Policy loss: -0.065974. Value loss: 0.884775. Entropy: 1.361571.
Training network. lr: 0.000250. clip: 0.100000
Iteration 34: Policy loss: 0.000143. Value loss: 1.397101. Entropy: 1.370030.
Iteration 35: Policy loss: -0.000328. Value loss: 1.279787. Entropy: 1.359337.
Iteration 36: Policy loss: -0.010333. Value loss: 1.356722. Entropy: 1.366242.
Training network. lr: 0.000250. clip: 0.100000
Iteration 37: Policy loss: 0.151668. Value loss: 0.326707. Entropy: 1.362033.
Iteration 38: Policy los

Iteration 103: Policy loss: 0.104650. Value loss: 0.487069. Entropy: 1.310694.
Iteration 104: Policy loss: 0.121857. Value loss: 0.390418. Entropy: 1.308963.
Iteration 105: Policy loss: 0.107936. Value loss: 0.398923. Entropy: 1.303240.
Training network. lr: 0.000249. clip: 0.099696
Iteration 106: Policy loss: -0.019600. Value loss: 0.810460. Entropy: 1.291222.
Iteration 107: Policy loss: -0.043799. Value loss: 0.813314. Entropy: 1.305367.
Iteration 108: Policy loss: -0.041227. Value loss: 0.788016. Entropy: 1.295246.
Training network. lr: 0.000249. clip: 0.099696
Iteration 109: Policy loss: -0.053537. Value loss: 1.527595. Entropy: 1.300061.
Iteration 110: Policy loss: -0.172127. Value loss: 1.685091. Entropy: 1.297742.
Iteration 111: Policy loss: -0.111701. Value loss: 1.285365. Entropy: 1.288221.
Training network. lr: 0.000249. clip: 0.099696
Iteration 112: Policy loss: 0.182973. Value loss: 0.846592. Entropy: 1.276863.
Iteration 113: Policy loss: 0.123608. Value loss: 0.738449. Ent

episode: 27   score: 15100.0  epsilon: 1.0    steps: 652  evaluation reward: 20807.40740740741
Training network. lr: 0.000249. clip: 0.099548
Iteration 178: Policy loss: 0.024522. Value loss: 1.066570. Entropy: 1.251285.
Iteration 179: Policy loss: 0.002568. Value loss: 0.774038. Entropy: 1.244492.
Iteration 180: Policy loss: 0.035311. Value loss: 0.542031. Entropy: 1.251379.
Training network. lr: 0.000249. clip: 0.099548
Iteration 181: Policy loss: 0.348849. Value loss: 0.860963. Entropy: 1.250753.
Iteration 182: Policy loss: 0.360043. Value loss: 0.543085. Entropy: 1.267107.
Iteration 183: Policy loss: 0.340807. Value loss: 0.445778. Entropy: 1.261358.
Training network. lr: 0.000249. clip: 0.099548
Iteration 184: Policy loss: -0.182871. Value loss: 1.296346. Entropy: 1.253929.
Iteration 185: Policy loss: -0.199198. Value loss: 1.305865. Entropy: 1.246943.
Iteration 186: Policy loss: -0.209038. Value loss: 1.193934. Entropy: 1.243795.
Training network. lr: 0.000249. clip: 0.099548
Ite