# Deep Q-Learning 

For this assignment we will implement the Deep Q-Learning algorithm with Experience Replay as described in breakthrough paper __"Playing Atari with Deep Reinforcement Learning"__. We will train an agent to play the famous game of __Breakout__.

In [1]:
import sys
import gym
import torch
import pylab
import random
import numpy as np
import time
from collections import deque
from datetime import datetime
from copy import deepcopy
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from utils import *
from agent import *
from model import *
from config import *
from env import GameEnv
%matplotlib inline
%load_ext autoreload
%autoreload 2

## Understanding the environment

In the following cell, we initialise our game of __Breakout__ and you can see how the environment looks like. For further documentation of the of the environment refer to https://gym.openai.com/envs. 

In [2]:
envs = []
for i in range(num_envs):
    envs.append(GameEnv('SpaceInvadersDeterministic-v4'))
#env.render()

  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "


In [3]:
number_lives = envs[0].life
state_size = envs[0].observation_space.shape
action_size = envs[0].action_space.n
rewards, episodes = [], []

## Creating a DQN Agent

Here we create a DQN Agent. This agent is defined in the __agent.py__. The corresponding neural network is defined in the __model.py__. 

__Evaluation Reward__ : The average reward received in the past 100 episodes/games.

__Frame__ : Number of frames processed in total.

__Memory Size__ : The current size of the replay memory.

In [4]:
agent = Agent(action_size)
torch.save(agent.policy_net.state_dict(), "./save_model/spaceinvaders_ppo_best")
evaluation_reward = deque(maxlen=evaluation_reward_length)
frame = 0
memory_size = 0
reset_max = 10

### Main Training Loop

In [2]:
vis_env_idx = 0
vis_env = envs[vis_env_idx]
e = 0
frame = 0
max_eval = -np.inf
reset_count = 0

while (frame < 10000000):
    step = 0
    assert(num_envs * env_mem_size == train_frame)
    frame_next_vals = []
    for i in range(num_envs):
        env = envs[i]
        #history = env.history
        #life = env.life
        #state, reward, done, info = [env.state, env.reward, env.done, env.info]
        for j in range(env_mem_size):
            step += 1
            frame += 1
            
            curr_state = env.history[HISTORY_SIZE-1,:,:]
            action, value = agent.get_action(np.float32(env.history[:HISTORY_SIZE,:,:]) / 255.)
            
            next_state, env.reward, env.done, env.info = env.step(action)
            
            if (i == vis_env_idx):
                vis_env._env.render()
            
            frame_next_state = get_frame(next_state)
            env.history[HISTORY_SIZE,:,:] = frame_next_state
            terminal_state = check_live(env.life, env.info['ale.lives'])
            
            env.life = env.info['ale.lives']
            r = env.reward
            
            agent.memory.push(i, deepcopy(curr_state), action, r, terminal_state, value, 0, 0)
            if (j == env_mem_size-1):
                _, frame_next_val = agent.get_action(np.float32(env.history[1:,:,:]) / 255.)
                frame_next_vals.append(frame_next_val)
            env.score += r
            env.history[:HISTORY_SIZE, :, :] = env.history[1:,:,:]
            
            if (env.done):
                if (e % 50 == 0):
                    print('now time : ', datetime.now())
                    rewards.append(np.mean(evaluation_reward))
                    episodes.append(e)
                    pylab.plot(episodes, rewards, 'b')
                    pylab.savefig("./save_graph/spaceinvaders_ppo.png")
                    torch.save(agent.policy_net, "./save_model/spaceinvaders_ppo")
                    
                    if np.mean(evaluation_reward) > max_eval:
                        torch.save(agent.policy_net.state_dict(), "./save_model/spaceinvaders_ppo_best")
                        max_eval = float(np.mean(evaluation_reward))
                        reset_count = 0
                    elif e > 5000:
                        reset_count += 1
                        """
                        if (reset_count == reset_max):
                            print("Training went nowhere, starting again at best model")
                            agent.policy_net.load_state_dict(torch.load("./save_model/spaceinvaders_ppo_best"))
                            agent.update_target_net()
                            reset_count = 0
                        """
                e += 1
                evaluation_reward.append(env.score)
                print("episode:", e, "  score:", env.score,  " epsilon:", agent.epsilon, "   steps:", step,
                  " evaluation reward:", np.mean(evaluation_reward))
                
                env.done = False
                env.score = 0
                env.history = np.zeros([HISTORY_SIZE+1,84,84], dtype=np.uint8)
                env.state = env.reset()
                env.life = number_lives
                get_init_state(env.history, env.state)
                
                
                
    agent.train_policy_net(frame, frame_next_vals)
    agent.update_target_net()


NameError: name 'envs' is not defined

In [None]:
torch.save(agent.policy_net, "./save_model/spaceinvaders_ppo")

In [None]:
### Loop through all environments and run PPO on them

env_names = ['SpaceInvaders-v0', 'MsPacman-v0', 'Asteroids-v0', 'Asterix-v0', 'Atlantis-v0', 'Alien-v0', 'Amidar-v0', 'Assault-v0', 'BankHeist-v0']

for a in range(len(env_names)):
    name = env_names[a]
    print("\n\n\n ------- STARTING TRAINING FOR %s ------- \n\n\n" % (name))
    
    envs = []
    for i in range(num_envs):
        envs.append(GameEnv(name))
    #env.render()
    

    number_lives = envs[0].life
    state_size = envs[0].observation_space.shape
    if (name == 'SpaceInvaders-v0'):
        action_size = 4
    else:
        action_size = envs[0].action_space.n
    rewards, episodes = [], []

    vis_env_idx = 0
    vis_env = envs[vis_env_idx]
    e = 0
    frame = 0
    max_eval = -np.inf
    reset_count = 0


    agent = Agent(action_size)
    torch.save(agent.policy_net.state_dict(), "./save_model/" + name + "_best")
    evaluation_reward = deque(maxlen=evaluation_reward_length)
    frame = 0
    memory_size = 0
    reset_max = 10
    
    print("Determing min/max rewards of environment")
    [low, high] = score_range = get_score_range(name)
    print("Min: %d. Max: %d." % (low, high))

    while (frame < 10000000):
        step = 0
        assert(num_envs * env_mem_size == train_frame)
        frame_next_vals = []
        
        for j in range(env_mem_size):
            
            curr_states = np.stack([envs[i].history[HISTORY_SIZE-1,:,:] for i in range(num_envs)])
            next_states = []
            net_in = np.stack([envs[i].history[:HISTORY_SIZE,:,:] for i in range(num_envs)])
            step += num_envs
            frame += num_envs
            actions, values = agent.get_action(np.float32(net_in) / 255.)
            
            for i in range(num_envs):
                env = envs[i]
                next_state, env.reward, env.done, env.info = env.step(actions[i])
                next_states.append(next_state)
                if (i == vis_env_idx):
                    vis_env._env.render()
            
            for i in range(num_envs):
                env = envs[i]
                """
                next_state, env.reward, env.done, env.info = env.step(actions[i])
                if (i == vis_env_idx):
                    vis_env._env.render()
                """
                
                frame_next_state = get_frame(next_states[i])
                env.history[HISTORY_SIZE,:,:] = frame_next_state
                terminal_state = check_live(env.life, env.info['ale.lives'])
                env.life = env.info['ale.lives']
                r = (env.reward / high) * 20.0 #np.log(max(env.reward+1, 1))#((env.reward - low) / (high - low)) * 30
                agent.memory.push(i, deepcopy(curr_states[i]), actions[i], r, terminal_state, values[i], 0, 0)
                
                if (j == env_mem_size-1):
                    net_in = np.stack([envs[k].history[1:,:,:] for k in range(num_envs)])
                    _, frame_next_vals = agent.get_action(np.float32(net_in) / 255.)
                
                env.score += env.reward
                env.history[:HISTORY_SIZE, :, :] = env.history[1:,:,:]
        
                if (env.done):
                    if (e % 50 == 0):
                        print('now time : ', datetime.now())
                        rewards.append(np.mean(evaluation_reward))
                        episodes.append(e)
                        pylab.plot(episodes, rewards, 'b')
                        pylab.savefig("./save_graph/" + name + "_ppo.png")
                        torch.save(agent.policy_net, "./save_model/" + name + "_ppo")

                        if np.mean(evaluation_reward) > max_eval:
                            torch.save(agent.policy_net.state_dict(), "./save_model/"  + name + "_ppo_best")
                            max_eval = float(np.mean(evaluation_reward))
                            reset_count = 0
                        elif e > 5000:
                            reset_count += 1
                            """
                            if (reset_count == reset_max):
                                print("Training went nowhere, starting again at best model")
                                agent.policy_net.load_state_dict(torch.load("./save_model/spaceinvaders_ppo_best"))
                                agent.update_target_net()
                                reset_count = 0
                            """
                    e += 1
                    evaluation_reward.append(env.score)
                    print("episode:", e, "  score:", env.score,  " epsilon:", agent.epsilon, "   steps:", step,
                      " evaluation reward:", np.mean(evaluation_reward))

                    env.done = False
                    env.score = 0
                    env.history = np.zeros([HISTORY_SIZE+1,84,84], dtype=np.uint8)
                    env.state = env.reset()
                    env.life = number_lives
                    get_init_state(env.history, env.state)
            
        agent.train_policy_net(frame, frame_next_vals)
        agent.update_target_net()
    print("FINISHED TRAINING FOR %s" % (name))
    pylab.figure()




 ------- STARTING TRAINING FOR SpaceInvaders-v0 ------- 





  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "


Determing min/max rewards of environment
Min: 0. Max: 200.


  probs = F.softmax(x[:,:self.action_size] - torch.max(x[:,:self.action_size],1)[0].unsqueeze(1))


Training network. lr: 0.000250. clip: 0.100000


  pol_loss += pol_avg.detach().cpu()[0]
  vf_loss += value_loss.detach().cpu()[0]
  ent_total += ent.detach().cpu()[0]


Iteration 1: Policy loss: -0.001224. Value loss: 0.402734. Entropy: 1.385494.
Iteration 2: Policy loss: -0.002272. Value loss: 0.416963. Entropy: 1.384887.
Iteration 3: Policy loss: -0.004752. Value loss: 0.384776. Entropy: 1.383403.
Training network. lr: 0.000250. clip: 0.100000
Iteration 4: Policy loss: 0.010419. Value loss: 31.506674. Entropy: 1.372683.
Iteration 5: Policy loss: 0.001003. Value loss: 26.171925. Entropy: 1.372297.
Iteration 6: Policy loss: 0.004994. Value loss: 25.923634. Entropy: 1.376680.
Training network. lr: 0.000250. clip: 0.100000
Iteration 7: Policy loss: 0.006099. Value loss: 14.835062. Entropy: 1.367709.
Iteration 8: Policy loss: -0.003362. Value loss: 13.744712. Entropy: 1.363533.
Iteration 9: Policy loss: -0.002212. Value loss: 15.134050. Entropy: 1.367512.
Training network. lr: 0.000250. clip: 0.100000
Iteration 10: Policy loss: 0.001002. Value loss: 21.589115. Entropy: 1.360367.
Iteration 11: Policy loss: 0.012161. Value loss: 12.698215. Entropy: 1.35124

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


episode: 3   score: 120.0  epsilon: 1.0    steps: 1008  evaluation reward: 101.66666666666667
Training network. lr: 0.000250. clip: 0.100000
Iteration 13: Policy loss: 0.020485. Value loss: 37.421394. Entropy: 1.311320.
Iteration 14: Policy loss: 0.015823. Value loss: 31.105167. Entropy: 1.307104.
Iteration 15: Policy loss: 0.005699. Value loss: 32.345131. Entropy: 1.322819.
episode: 4   score: 45.0  epsilon: 1.0    steps: 104  evaluation reward: 87.5
episode: 5   score: 60.0  epsilon: 1.0    steps: 120  evaluation reward: 82.0
episode: 6   score: 135.0  epsilon: 1.0    steps: 864  evaluation reward: 90.83333333333333
Training network. lr: 0.000250. clip: 0.100000
Iteration 16: Policy loss: -0.001415. Value loss: 9.513955. Entropy: 1.346044.
Iteration 17: Policy loss: -0.003953. Value loss: 8.311934. Entropy: 1.356533.
Iteration 18: Policy loss: -0.001240. Value loss: 6.599928. Entropy: 1.348485.
Training network. lr: 0.000250. clip: 0.100000
Iteration 19: Policy loss: 0.005755. Value 

Iteration 75: Policy loss: 0.013419. Value loss: 16.157473. Entropy: 1.152214.
episode: 29   score: 105.0  epsilon: 1.0    steps: 192  evaluation reward: 138.6206896551724
episode: 30   score: 410.0  epsilon: 1.0    steps: 920  evaluation reward: 147.66666666666666
Training network. lr: 0.000250. clip: 0.099853
Iteration 76: Policy loss: 0.056166. Value loss: 84.095627. Entropy: 0.995897.
Iteration 77: Policy loss: 0.039778. Value loss: 54.644123. Entropy: 0.770303.
Iteration 78: Policy loss: 0.007443. Value loss: 47.117374. Entropy: 0.856941.
episode: 31   score: 75.0  epsilon: 1.0    steps: 496  evaluation reward: 145.32258064516128
Training network. lr: 0.000250. clip: 0.099853
Iteration 79: Policy loss: 0.012798. Value loss: 58.713314. Entropy: 0.897678.
Iteration 80: Policy loss: 0.013950. Value loss: 36.487633. Entropy: 0.855115.
Iteration 81: Policy loss: 0.014879. Value loss: 31.301176. Entropy: 0.873855.
Training network. lr: 0.000250. clip: 0.099853
Iteration 82: Policy loss:

Iteration 134: Policy loss: 0.010400. Value loss: 23.425772. Entropy: 0.504878.
Iteration 135: Policy loss: 0.001137. Value loss: 19.813118. Entropy: 0.565055.
episode: 57   score: 120.0  epsilon: 1.0    steps: 584  evaluation reward: 138.68421052631578
episode: 58   score: 15.0  epsilon: 1.0    steps: 760  evaluation reward: 136.55172413793105
episode: 59   score: 50.0  epsilon: 1.0    steps: 848  evaluation reward: 135.08474576271186
Training network. lr: 0.000249. clip: 0.099696
Iteration 136: Policy loss: 0.020367. Value loss: 33.883915. Entropy: 0.598921.
Iteration 137: Policy loss: 0.006268. Value loss: 22.282089. Entropy: 0.653336.
Iteration 138: Policy loss: 0.009452. Value loss: 18.004515. Entropy: 0.632629.
episode: 60   score: 60.0  epsilon: 1.0    steps: 184  evaluation reward: 133.83333333333334
Training network. lr: 0.000249. clip: 0.099696
Iteration 139: Policy loss: 0.002899. Value loss: 21.420612. Entropy: 0.532107.
Iteration 140: Policy loss: 0.005199. Value loss: 17.

episode: 85   score: 235.0  epsilon: 1.0    steps: 120  evaluation reward: 127.58823529411765
Training network. lr: 0.000249. clip: 0.099548
Iteration 193: Policy loss: 0.001916. Value loss: 25.115910. Entropy: 0.701388.
Iteration 194: Policy loss: 0.006421. Value loss: 21.141359. Entropy: 0.731251.
Iteration 195: Policy loss: 0.009058. Value loss: 17.569384. Entropy: 0.756616.
episode: 86   score: 30.0  epsilon: 1.0    steps: 992  evaluation reward: 126.45348837209302
Training network. lr: 0.000249. clip: 0.099548
Iteration 196: Policy loss: 0.010633. Value loss: 17.629299. Entropy: 0.679078.
Iteration 197: Policy loss: 0.013444. Value loss: 10.772072. Entropy: 0.659926.
Iteration 198: Policy loss: 0.002481. Value loss: 11.096463. Entropy: 0.650153.
episode: 87   score: 170.0  epsilon: 1.0    steps: 960  evaluation reward: 126.95402298850574
Training network. lr: 0.000249. clip: 0.099548
Iteration 199: Policy loss: -0.001349. Value loss: 30.163816. Entropy: 0.590754.
Iteration 200: Po

Iteration 254: Policy loss: 0.003083. Value loss: 2.591224. Entropy: 0.926070.
Iteration 255: Policy loss: 0.002077. Value loss: 2.309840. Entropy: 0.934065.
episode: 111   score: 110.0  epsilon: 1.0    steps: 152  evaluation reward: 137.3
episode: 112   score: 120.0  epsilon: 1.0    steps: 792  evaluation reward: 137.1
Training network. lr: 0.000248. clip: 0.099235
Iteration 256: Policy loss: 0.032546. Value loss: 333.466278. Entropy: 0.911208.
Iteration 257: Policy loss: 0.020345. Value loss: 288.979950. Entropy: 0.944506.
Iteration 258: Policy loss: 0.024607. Value loss: 165.856506. Entropy: 0.925961.
episode: 113   score: 105.0  epsilon: 1.0    steps: 40  evaluation reward: 136.05
episode: 114   score: 335.0  epsilon: 1.0    steps: 880  evaluation reward: 137.55
Training network. lr: 0.000248. clip: 0.099235
Iteration 259: Policy loss: 0.011300. Value loss: 13.260080. Entropy: 0.886065.
Iteration 260: Policy loss: 0.006510. Value loss: 11.480099. Entropy: 0.909054.
Iteration 261: P

episode: 136   score: 245.0  epsilon: 1.0    steps: 776  evaluation reward: 150.55
Training network. lr: 0.000248. clip: 0.099088
Iteration 319: Policy loss: 0.009518. Value loss: 32.841061. Entropy: 0.975189.
Iteration 320: Policy loss: 0.013296. Value loss: 19.430138. Entropy: 0.946360.
Iteration 321: Policy loss: 0.013718. Value loss: 16.459082. Entropy: 0.956427.
episode: 137   score: 155.0  epsilon: 1.0    steps: 520  evaluation reward: 151.05
episode: 138   score: 310.0  epsilon: 1.0    steps: 888  evaluation reward: 153.65
Training network. lr: 0.000248. clip: 0.099088
Iteration 322: Policy loss: 0.011390. Value loss: 46.014256. Entropy: 0.894510.
Iteration 323: Policy loss: 0.012625. Value loss: 30.372059. Entropy: 0.923241.
Iteration 324: Policy loss: 0.000984. Value loss: 28.126980. Entropy: 0.915155.
episode: 139   score: 155.0  epsilon: 1.0    steps: 136  evaluation reward: 151.65
episode: 140   score: 160.0  epsilon: 1.0    steps: 296  evaluation reward: 149.15
Training ne

Iteration 380: Policy loss: 0.036368. Value loss: 31.933233. Entropy: 1.012541.
Iteration 381: Policy loss: 0.025358. Value loss: 26.797184. Entropy: 1.041469.
episode: 164   score: 210.0  epsilon: 1.0    steps: 856  evaluation reward: 162.85
Training network. lr: 0.000247. clip: 0.098931
Iteration 382: Policy loss: 0.017981. Value loss: 31.856136. Entropy: 1.053175.
Iteration 383: Policy loss: 0.005816. Value loss: 23.286999. Entropy: 1.057239.
Iteration 384: Policy loss: 0.013613. Value loss: 19.705433. Entropy: 1.057767.
episode: 165   score: 150.0  epsilon: 1.0    steps: 664  evaluation reward: 163.6
episode: 166   score: 460.0  epsilon: 1.0    steps: 960  evaluation reward: 167.7
Training network. lr: 0.000247. clip: 0.098931
Iteration 385: Policy loss: 0.045537. Value loss: 368.855194. Entropy: 0.935411.
Iteration 386: Policy loss: 0.097075. Value loss: 219.701660. Entropy: 0.808452.
Iteration 387: Policy loss: 0.050678. Value loss: 144.606110. Entropy: 0.875359.
episode: 167   s

episode: 193   score: 165.0  epsilon: 1.0    steps: 440  evaluation reward: 175.8
Training network. lr: 0.000247. clip: 0.098774
Iteration 442: Policy loss: 0.009917. Value loss: 11.353271. Entropy: 0.831434.
Iteration 443: Policy loss: 0.000232. Value loss: 8.113584. Entropy: 0.830068.
Iteration 444: Policy loss: -0.001230. Value loss: 7.541282. Entropy: 0.825591.
Training network. lr: 0.000247. clip: 0.098774
Iteration 445: Policy loss: 0.004351. Value loss: 13.405587. Entropy: 0.914526.
Iteration 446: Policy loss: 0.009998. Value loss: 11.487077. Entropy: 0.916398.
Iteration 447: Policy loss: 0.002092. Value loss: 9.831747. Entropy: 0.915728.
episode: 194   score: 105.0  epsilon: 1.0    steps: 128  evaluation reward: 175.8
Training network. lr: 0.000247. clip: 0.098774
Iteration 448: Policy loss: -0.001152. Value loss: 10.384228. Entropy: 0.961905.
Iteration 449: Policy loss: 0.000528. Value loss: 7.580659. Entropy: 0.954619.
Iteration 450: Policy loss: 0.001585. Value loss: 5.62978

Iteration 505: Policy loss: 0.010547. Value loss: 9.122557. Entropy: 0.969891.
Iteration 506: Policy loss: 0.006652. Value loss: 5.493219. Entropy: 0.956614.
Iteration 507: Policy loss: 0.003087. Value loss: 4.431351. Entropy: 0.952705.
episode: 219   score: 105.0  epsilon: 1.0    steps: 496  evaluation reward: 182.3
episode: 220   score: 105.0  epsilon: 1.0    steps: 744  evaluation reward: 181.8
Training network. lr: 0.000246. clip: 0.098470
Iteration 508: Policy loss: 0.003962. Value loss: 8.271399. Entropy: 0.921920.
Iteration 509: Policy loss: 0.003386. Value loss: 6.229940. Entropy: 0.929527.
Iteration 510: Policy loss: 0.003773. Value loss: 5.385435. Entropy: 0.919668.
episode: 221   score: 230.0  epsilon: 1.0    steps: 104  evaluation reward: 182.9
episode: 222   score: 105.0  epsilon: 1.0    steps: 688  evaluation reward: 180.4
Training network. lr: 0.000246. clip: 0.098470
Iteration 511: Policy loss: 0.001414. Value loss: 13.622535. Entropy: 0.844119.
Iteration 512: Policy lo

episode: 248   score: 105.0  epsilon: 1.0    steps: 752  evaluation reward: 158.65
Training network. lr: 0.000246. clip: 0.098313
Iteration 568: Policy loss: 0.002210. Value loss: 7.892743. Entropy: 0.910323.
Iteration 569: Policy loss: -0.002105. Value loss: 6.689863. Entropy: 0.880863.
Iteration 570: Policy loss: 0.001770. Value loss: 6.416273. Entropy: 0.884119.
Training network. lr: 0.000246. clip: 0.098313
Iteration 571: Policy loss: 0.020494. Value loss: 33.714447. Entropy: 0.807545.
Iteration 572: Policy loss: 0.035256. Value loss: 24.447515. Entropy: 0.856096.
Iteration 573: Policy loss: 0.031569. Value loss: 19.758591. Entropy: 0.879890.
episode: 249   score: 120.0  epsilon: 1.0    steps: 296  evaluation reward: 158.95
Training network. lr: 0.000246. clip: 0.098313
Iteration 574: Policy loss: 0.009788. Value loss: 18.521393. Entropy: 1.031056.
Iteration 575: Policy loss: 0.011386. Value loss: 11.206679. Entropy: 1.023563.
Iteration 576: Policy loss: 0.008832. Value loss: 11.79

Iteration 633: Policy loss: 0.018825. Value loss: 15.387452. Entropy: 0.827489.
Training network. lr: 0.000245. clip: 0.098166
Iteration 634: Policy loss: 0.019513. Value loss: 25.465408. Entropy: 0.840675.
Iteration 635: Policy loss: 0.020609. Value loss: 15.938982. Entropy: 0.838745.
Iteration 636: Policy loss: 0.024434. Value loss: 14.802807. Entropy: 0.850093.
episode: 272   score: 640.0  epsilon: 1.0    steps: 64  evaluation reward: 167.1
episode: 273   score: 110.0  epsilon: 1.0    steps: 672  evaluation reward: 167.15
Training network. lr: 0.000245. clip: 0.098166
Iteration 637: Policy loss: 0.008965. Value loss: 14.186105. Entropy: 0.908284.
Iteration 638: Policy loss: 0.015568. Value loss: 7.404120. Entropy: 0.921935.
Iteration 639: Policy loss: 0.009336. Value loss: 6.250698. Entropy: 0.923434.
episode: 274   score: 230.0  epsilon: 1.0    steps: 976  evaluation reward: 168.25
Training network. lr: 0.000245. clip: 0.098166
Iteration 640: Policy loss: 0.012522. Value loss: 17.0

Iteration 699: Policy loss: 0.003671. Value loss: 7.094713. Entropy: 0.883266.
episode: 296   score: 185.0  epsilon: 1.0    steps: 1024  evaluation reward: 170.45
Training network. lr: 0.000245. clip: 0.098009
Iteration 700: Policy loss: 0.005047. Value loss: 21.787312. Entropy: 0.699120.
Iteration 701: Policy loss: 0.002267. Value loss: 14.143950. Entropy: 0.725942.
Iteration 702: Policy loss: 0.006253. Value loss: 12.851406. Entropy: 0.724204.
episode: 297   score: 180.0  epsilon: 1.0    steps: 320  evaluation reward: 170.45
episode: 298   score: 105.0  epsilon: 1.0    steps: 456  evaluation reward: 169.95
episode: 299   score: 120.0  epsilon: 1.0    steps: 688  evaluation reward: 170.1
Training network. lr: 0.000245. clip: 0.097853
Iteration 703: Policy loss: 0.009461. Value loss: 8.566672. Entropy: 0.820961.
Iteration 704: Policy loss: 0.009368. Value loss: 5.685986. Entropy: 0.833686.
Iteration 705: Policy loss: 0.008367. Value loss: 5.073797. Entropy: 0.837035.
Training network. 

Training network. lr: 0.000244. clip: 0.097705
Iteration 763: Policy loss: 0.028554. Value loss: 263.595032. Entropy: 0.655357.
Iteration 764: Policy loss: 0.038907. Value loss: 116.328506. Entropy: 0.598619.
Iteration 765: Policy loss: 0.023802. Value loss: 76.868469. Entropy: 0.621494.
episode: 321   score: 305.0  epsilon: 1.0    steps: 96  evaluation reward: 167.4
episode: 322   score: 105.0  epsilon: 1.0    steps: 784  evaluation reward: 167.4
episode: 323   score: 235.0  epsilon: 1.0    steps: 976  evaluation reward: 168.7
Training network. lr: 0.000244. clip: 0.097705
Iteration 766: Policy loss: 0.006408. Value loss: 27.823837. Entropy: 0.705808.
Iteration 767: Policy loss: 0.001982. Value loss: 17.522120. Entropy: 0.724412.
Iteration 768: Policy loss: 0.007106. Value loss: 15.333076. Entropy: 0.720672.
Training network. lr: 0.000244. clip: 0.097705
Iteration 769: Policy loss: 0.002006. Value loss: 10.721659. Entropy: 0.726298.
Iteration 770: Policy loss: 0.002600. Value loss: 7.

episode: 344   score: 490.0  epsilon: 1.0    steps: 208  evaluation reward: 190.5
Training network. lr: 0.000244. clip: 0.097549
Iteration 829: Policy loss: 0.000685. Value loss: 18.869003. Entropy: 0.743540.
Iteration 830: Policy loss: 0.007519. Value loss: 11.662848. Entropy: 0.753858.
Iteration 831: Policy loss: 0.009492. Value loss: 9.392092. Entropy: 0.751440.
episode: 345   score: 180.0  epsilon: 1.0    steps: 800  evaluation reward: 191.25
Training network. lr: 0.000244. clip: 0.097549
Iteration 832: Policy loss: 0.000296. Value loss: 35.688347. Entropy: 0.610841.
Iteration 833: Policy loss: 0.005578. Value loss: 25.128162. Entropy: 0.596622.
Iteration 834: Policy loss: 0.001868. Value loss: 20.504234. Entropy: 0.598680.
episode: 346   score: 180.0  epsilon: 1.0    steps: 72  evaluation reward: 192.0
episode: 347   score: 155.0  epsilon: 1.0    steps: 160  evaluation reward: 192.8
episode: 348   score: 210.0  epsilon: 1.0    steps: 952  evaluation reward: 193.85
Training network

episode: 370   score: 210.0  epsilon: 1.0    steps: 824  evaluation reward: 200.1
Training network. lr: 0.000243. clip: 0.097392
Iteration 892: Policy loss: 0.005237. Value loss: 15.850064. Entropy: 0.656283.
Iteration 893: Policy loss: 0.005858. Value loss: 13.868116. Entropy: 0.647288.
Iteration 894: Policy loss: 0.005596. Value loss: 11.934665. Entropy: 0.658233.
Training network. lr: 0.000243. clip: 0.097392
Iteration 895: Policy loss: 0.007658. Value loss: 19.289324. Entropy: 0.761513.
Iteration 896: Policy loss: 0.007834. Value loss: 17.459211. Entropy: 0.759682.
Iteration 897: Policy loss: 0.004058. Value loss: 15.807039. Entropy: 0.755581.
Training network. lr: 0.000243. clip: 0.097392
Iteration 898: Policy loss: 0.012831. Value loss: 14.578941. Entropy: 0.698027.
Iteration 899: Policy loss: 0.017978. Value loss: 11.851262. Entropy: 0.688455.
Iteration 900: Policy loss: 0.010766. Value loss: 11.161279. Entropy: 0.697559.
episode: 371   score: 210.0  epsilon: 1.0    steps: 952  

Iteration 956: Policy loss: 0.015347. Value loss: 10.802694. Entropy: 0.843596.
Iteration 957: Policy loss: 0.010047. Value loss: 11.181715. Entropy: 0.809457.
episode: 395   score: 155.0  epsilon: 1.0    steps: 920  evaluation reward: 205.45
Training network. lr: 0.000243. clip: 0.097088
Iteration 958: Policy loss: 0.005698. Value loss: 7.272008. Entropy: 0.674476.
Iteration 959: Policy loss: -0.001404. Value loss: 3.147821. Entropy: 0.686359.
Iteration 960: Policy loss: -0.000874. Value loss: 2.927004. Entropy: 0.662631.
episode: 396   score: 180.0  epsilon: 1.0    steps: 968  evaluation reward: 205.4
Training network. lr: 0.000243. clip: 0.097088
Iteration 961: Policy loss: 0.009192. Value loss: 11.056853. Entropy: 0.441508.
Iteration 962: Policy loss: 0.003878. Value loss: 7.909068. Entropy: 0.411175.
Iteration 963: Policy loss: 0.006188. Value loss: 8.353244. Entropy: 0.441147.
episode: 397   score: 210.0  epsilon: 1.0    steps: 392  evaluation reward: 205.7
Training network. lr: 

Iteration 1019: Policy loss: 0.008281. Value loss: 24.641596. Entropy: 0.529762.
Iteration 1020: Policy loss: 0.004963. Value loss: 21.500324. Entropy: 0.541751.
episode: 421   score: 120.0  epsilon: 1.0    steps: 88  evaluation reward: 210.7
episode: 422   score: 75.0  epsilon: 1.0    steps: 456  evaluation reward: 210.4
episode: 423   score: 210.0  epsilon: 1.0    steps: 608  evaluation reward: 210.15
Training network. lr: 0.000242. clip: 0.096931
Iteration 1021: Policy loss: 0.011460. Value loss: 38.850407. Entropy: 0.657752.
Iteration 1022: Policy loss: 0.027963. Value loss: 26.038754. Entropy: 0.715623.
Iteration 1023: Policy loss: 0.004744. Value loss: 20.234985. Entropy: 0.717486.
episode: 424   score: 225.0  epsilon: 1.0    steps: 312  evaluation reward: 211.35
Training network. lr: 0.000242. clip: 0.096931
Iteration 1024: Policy loss: 0.011166. Value loss: 19.514853. Entropy: 0.709197.
Iteration 1025: Policy loss: 0.017362. Value loss: 14.028433. Entropy: 0.739572.
Iteration 1

Iteration 1082: Policy loss: 0.004600. Value loss: 8.894964. Entropy: 0.247328.
Iteration 1083: Policy loss: 0.000718. Value loss: 7.543555. Entropy: 0.253873.
episode: 447   score: 180.0  epsilon: 1.0    steps: 528  evaluation reward: 209.5
episode: 448   score: 210.0  epsilon: 1.0    steps: 912  evaluation reward: 209.5
Training network. lr: 0.000242. clip: 0.096784
Iteration 1084: Policy loss: -0.000800. Value loss: 27.603811. Entropy: 0.262453.
Iteration 1085: Policy loss: -0.000532. Value loss: 19.389933. Entropy: 0.276810.
Iteration 1086: Policy loss: -0.000231. Value loss: 18.553757. Entropy: 0.264414.
episode: 449   score: 210.0  epsilon: 1.0    steps: 112  evaluation reward: 210.05
Training network. lr: 0.000242. clip: 0.096784
Iteration 1087: Policy loss: 0.006306. Value loss: 4.664412. Entropy: 0.735198.
Iteration 1088: Policy loss: 0.016978. Value loss: 3.250221. Entropy: 0.717981.
Iteration 1089: Policy loss: 0.017444. Value loss: 3.046385. Entropy: 0.741153.
Training netw

Iteration 1144: Policy loss: 0.000031. Value loss: 9.724627. Entropy: 0.266056.
Iteration 1145: Policy loss: 0.001835. Value loss: 9.138377. Entropy: 0.293921.
Iteration 1146: Policy loss: 0.000192. Value loss: 7.508763. Entropy: 0.302103.
Training network. lr: 0.000242. clip: 0.096627
Iteration 1147: Policy loss: 0.001783. Value loss: 27.587374. Entropy: 0.270442.
Iteration 1148: Policy loss: 0.001938. Value loss: 16.241133. Entropy: 0.247648.
Iteration 1149: Policy loss: -0.001321. Value loss: 16.412550. Entropy: 0.258554.
episode: 474   score: 155.0  epsilon: 1.0    steps: 392  evaluation reward: 203.9
Training network. lr: 0.000242. clip: 0.096627
Iteration 1150: Policy loss: 0.004844. Value loss: 18.136032. Entropy: 0.221628.
Iteration 1151: Policy loss: -0.003109. Value loss: 12.088400. Entropy: 0.303418.
Iteration 1152: Policy loss: 0.001605. Value loss: 9.968740. Entropy: 0.266591.
episode: 475   score: 210.0  epsilon: 1.0    steps: 680  evaluation reward: 203.9
episode: 476   

Iteration 1208: Policy loss: 0.002520. Value loss: 7.138885. Entropy: 0.058099.
Iteration 1209: Policy loss: -0.000629. Value loss: 6.534562. Entropy: 0.075460.
Training network. lr: 0.000241. clip: 0.096323
Iteration 1210: Policy loss: -0.000390. Value loss: 6.153535. Entropy: 0.146082.
Iteration 1211: Policy loss: 0.001189. Value loss: 4.262347. Entropy: 0.128552.
Iteration 1212: Policy loss: 0.000785. Value loss: 3.076364. Entropy: 0.111006.
episode: 499   score: 310.0  epsilon: 1.0    steps: 736  evaluation reward: 209.8
episode: 500   score: 180.0  epsilon: 1.0    steps: 1008  evaluation reward: 209.5
Training network. lr: 0.000241. clip: 0.096323
Iteration 1213: Policy loss: 0.006707. Value loss: 7.613293. Entropy: 0.238812.
Iteration 1214: Policy loss: 0.006582. Value loss: 6.734486. Entropy: 0.235535.
Iteration 1215: Policy loss: 0.005733. Value loss: 6.183196. Entropy: 0.204226.
now time :  2019-03-05 16:49:35.334526
episode: 501   score: 180.0  epsilon: 1.0    steps: 40  eval

Training network. lr: 0.000240. clip: 0.096166
Iteration 1270: Policy loss: 0.031942. Value loss: 32.520691. Entropy: 0.515231.
Iteration 1271: Policy loss: 0.051952. Value loss: 20.840885. Entropy: 0.509933.
Iteration 1272: Policy loss: 0.059112. Value loss: 16.513941. Entropy: 0.560158.
Training network. lr: 0.000240. clip: 0.096166
Iteration 1273: Policy loss: 0.006697. Value loss: 14.932837. Entropy: 0.417729.
Iteration 1274: Policy loss: 0.016607. Value loss: 10.323881. Entropy: 0.427199.
Iteration 1275: Policy loss: 0.012288. Value loss: 8.155095. Entropy: 0.445590.
Training network. lr: 0.000240. clip: 0.096166
Iteration 1276: Policy loss: 0.013505. Value loss: 28.463621. Entropy: 0.444217.
Iteration 1277: Policy loss: 0.021417. Value loss: 18.786221. Entropy: 0.459084.
Iteration 1278: Policy loss: 0.030267. Value loss: 16.643408. Entropy: 0.468520.
episode: 526   score: 380.0  epsilon: 1.0    steps: 360  evaluation reward: 213.2
Training network. lr: 0.000240. clip: 0.096166
It

Iteration 1335: Policy loss: 0.007622. Value loss: 4.456735. Entropy: 0.408282.
episode: 550   score: 180.0  epsilon: 1.0    steps: 424  evaluation reward: 206.25
Training network. lr: 0.000240. clip: 0.096009
Iteration 1336: Policy loss: 0.007625. Value loss: 5.191298. Entropy: 0.356188.
Iteration 1337: Policy loss: -0.001100. Value loss: 4.368897. Entropy: 0.351298.
Iteration 1338: Policy loss: 0.001329. Value loss: 4.378137. Entropy: 0.341249.
now time :  2019-03-05 16:52:06.419203
episode: 551   score: 210.0  epsilon: 1.0    steps: 984  evaluation reward: 206.25
episode: 552   score: 180.0  epsilon: 1.0    steps: 1008  evaluation reward: 206.25
Training network. lr: 0.000240. clip: 0.096009
Iteration 1339: Policy loss: 0.003849. Value loss: 11.631086. Entropy: 0.384774.
Iteration 1340: Policy loss: 0.004064. Value loss: 8.707512. Entropy: 0.381872.
Iteration 1341: Policy loss: 0.019512. Value loss: 8.767014. Entropy: 0.409127.
episode: 553   score: 180.0  epsilon: 1.0    steps: 160

episode: 576   score: 180.0  epsilon: 1.0    steps: 368  evaluation reward: 203.95
episode: 577   score: 155.0  epsilon: 1.0    steps: 808  evaluation reward: 203.7
Training network. lr: 0.000240. clip: 0.095862
Iteration 1399: Policy loss: 0.006111. Value loss: 13.227863. Entropy: 0.619390.
Iteration 1400: Policy loss: 0.017048. Value loss: 7.813485. Entropy: 0.608069.
Iteration 1401: Policy loss: 0.019759. Value loss: 7.014798. Entropy: 0.619438.
episode: 578   score: 210.0  epsilon: 1.0    steps: 192  evaluation reward: 203.2
Training network. lr: 0.000239. clip: 0.095705
Iteration 1402: Policy loss: 0.004041. Value loss: 6.194780. Entropy: 0.459858.
Iteration 1403: Policy loss: 0.008147. Value loss: 4.102281. Entropy: 0.450624.
Iteration 1404: Policy loss: 0.006496. Value loss: 4.058338. Entropy: 0.468030.
Training network. lr: 0.000239. clip: 0.095705
Iteration 1405: Policy loss: 0.004336. Value loss: 11.262110. Entropy: 0.415684.
Iteration 1406: Policy loss: 0.009344. Value loss:

Iteration 1461: Policy loss: 0.023801. Value loss: 7.872222. Entropy: 0.643977.
Training network. lr: 0.000239. clip: 0.095549
Iteration 1462: Policy loss: 0.007948. Value loss: 6.885470. Entropy: 0.470411.
Iteration 1463: Policy loss: 0.009963. Value loss: 3.687673. Entropy: 0.436145.
Iteration 1464: Policy loss: 0.008817. Value loss: 3.067862. Entropy: 0.427217.
episode: 603   score: 180.0  epsilon: 1.0    steps: 584  evaluation reward: 193.9
episode: 604   score: 155.0  epsilon: 1.0    steps: 816  evaluation reward: 193.65
Training network. lr: 0.000239. clip: 0.095549
Iteration 1465: Policy loss: 0.011358. Value loss: 15.361852. Entropy: 0.555090.
Iteration 1466: Policy loss: 0.005274. Value loss: 8.272793. Entropy: 0.544823.
Iteration 1467: Policy loss: 0.006060. Value loss: 7.026684. Entropy: 0.538951.
episode: 605   score: 105.0  epsilon: 1.0    steps: 360  evaluation reward: 192.9
Training network. lr: 0.000239. clip: 0.095549
Iteration 1468: Policy loss: 0.012736. Value loss: 

Iteration 1523: Policy loss: 0.000512. Value loss: 5.604450. Entropy: 0.479644.
Iteration 1524: Policy loss: 0.006678. Value loss: 4.641637. Entropy: 0.482540.
episode: 630   score: 105.0  epsilon: 1.0    steps: 832  evaluation reward: 184.6
Training network. lr: 0.000239. clip: 0.095401
Iteration 1525: Policy loss: 0.000615. Value loss: 10.101652. Entropy: 0.416528.
Iteration 1526: Policy loss: -0.001112. Value loss: 6.873206. Entropy: 0.444082.
Iteration 1527: Policy loss: -0.000096. Value loss: 5.802977. Entropy: 0.448738.
episode: 631   score: 180.0  epsilon: 1.0    steps: 680  evaluation reward: 184.6
episode: 632   score: 210.0  epsilon: 1.0    steps: 968  evaluation reward: 182.45
Training network. lr: 0.000239. clip: 0.095401
Iteration 1528: Policy loss: 0.005733. Value loss: 8.595915. Entropy: 0.363086.
Iteration 1529: Policy loss: 0.002390. Value loss: 6.275974. Entropy: 0.376746.
Iteration 1530: Policy loss: 0.001315. Value loss: 5.236938. Entropy: 0.377746.
episode: 633   s

Iteration 1587: Policy loss: 0.001876. Value loss: 17.029335. Entropy: 0.314574.
episode: 655   score: 210.0  epsilon: 1.0    steps: 272  evaluation reward: 187.75
Training network. lr: 0.000238. clip: 0.095245
Iteration 1588: Policy loss: 0.002437. Value loss: 18.724773. Entropy: 0.311004.
Iteration 1589: Policy loss: 0.002236. Value loss: 9.408863. Entropy: 0.323500.
Iteration 1590: Policy loss: -0.002275. Value loss: 6.129225. Entropy: 0.308942.
episode: 656   score: 105.0  epsilon: 1.0    steps: 368  evaluation reward: 186.7
episode: 657   score: 155.0  epsilon: 1.0    steps: 400  evaluation reward: 186.45
episode: 658   score: 485.0  epsilon: 1.0    steps: 984  evaluation reward: 189.2
Training network. lr: 0.000238. clip: 0.095245
Iteration 1591: Policy loss: 0.000933. Value loss: 31.185831. Entropy: 0.429799.
Iteration 1592: Policy loss: -0.006547. Value loss: 19.621401. Entropy: 0.475382.
Iteration 1593: Policy loss: -0.002770. Value loss: 18.094185. Entropy: 0.449148.
episode: