# Deep Q-Learning 

For this assignment we will implement the Deep Q-Learning algorithm with Experience Replay, as described in the breakthrough paper __"Playing Atari with Deep Reinforcement Learning"__. We will train an agent to play the famous game of __Breakout__.

In [1]:
import sys
import gym
import torch
import pylab
import random
import numpy as np
from collections import deque
from datetime import datetime
from copy import deepcopy
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from utils import *
from agent import *
from model import *
from config import *
%matplotlib inline
%load_ext autoreload
%autoreload 2

## Understanding the environment

In the following cell, we initialise our game of __Breakout__ and you can see what the environment looks like. For further documentation of the environment, refer to https://gym.openai.com/envs.

In [2]:
# Build the Gym environment the agent will interact with.
# Uncomment the render call below to open a window showing the game.
env = gym.make(id='SpaceInvaders-v0')
#env.render()

In [3]:
# Life count used to initialise each episode's `life` tracker in the training
# loop (helper from utils; presumably the number of lives at episode start).
number_lives = find_max_lifes(env)
# Shape of a raw observation, as reported by the environment itself.
state_size = env.observation_space.shape
# Number of discrete actions. Read from the environment instead of hard-coding
# it, so the value stays correct if the environment id is changed
# (SpaceInvaders-v0 exposes 6 actions, matching the previous literal).
action_size = env.action_space.n
# Parallel lists backing the learning curve: mean evaluation reward and the
# episode index at which each point was recorded.
rewards, episodes = [], []

## Creating a DQN Agent

Here we create a DQN Agent. This agent is defined in __agent.py__. The corresponding neural network is defined in __model.py__.

__Evaluation Reward__ : The average reward received in the past 100 episodes/games.

__Frame__ : Number of frames processed in total.

__Memory Size__ : The current size of the replay memory.

In [4]:
# The learning agent (implemented in agent.py), sized for the action space.
agent = Agent(action_size)

# Bounded window of the most recent episode scores; older scores fall off once
# `evaluation_reward_length` entries have been collected.
evaluation_reward = deque(maxlen=evaluation_reward_length)

# Running totals shared across cells: frames processed so far and the current
# replay-memory size.
frame = memory_size = 0


### Main Training Loop

In [None]:
# Main training loop: play up to EPISODES games, pushing every transition into
# the agent's memory and training the networks every `train_frame` frames.
stop_training = False  # flipped once the early-stopping criterion is met
for e in range(EPISODES):
    done = False
    score = 0

    # Five-slot buffer of 84x84 uint8 frames: slots 0-3 are what the agent
    # sees, slot 4 receives the newest frame before the buffer is shifted.
    history = np.zeros([5, 84, 84], dtype=np.uint8)
    step = 0
    state = env.reset()
    life = number_lives

    # Helper from utils; presumably seeds `history` from the first observation.
    get_init_state(history, state)

    while not done:
        step += 1
        frame += 1
        if render_breakout:
            env.render()

        # Select and perform an action based on the four most recent frames,
        # scaled from uint8 to [0, 1].
        action, value = agent.get_action(np.float32(history[:4, :, :]) / 255.)

        next_state, reward, done, info = env.step(action)

        frame_next_state = get_frame(next_state)
        history[4, :, :] = frame_next_state
        # Compare the life count before and after the step (helper from utils;
        # presumably true when a life was lost -- confirm against utils.py).
        terminal_state = check_live(life, info['ale.lives'])

        life = info['ale.lives']
        # The raw reward is used; the clipped variant is kept for reference.
        #r = np.clip(reward, -1, 1)
        r = reward

        # Store the transition in memory
        agent.memory.push(deepcopy(frame_next_state), action, r, terminal_state, value, 0, 0)
        # Train once every `train_frame` frames, then sync the target network
        if(frame % train_frame == 0):
            agent.train_policy_net(frame)
            # Update the target network
            agent.update_target_net()
        score += r
        # Slide the window: drop the oldest frame so slot 4 becomes slot 3.
        history[:4, :, :] = history[1:, :, :]

        # Every 50000 frames, record the current mean evaluation reward and
        # refresh the learning-curve image on disk.
        if frame % 50000 == 0:
            print('now time : ', datetime.now())
            rewards.append(np.mean(evaluation_reward))
            episodes.append(e)
            pylab.plot(episodes, rewards, 'b')
            pylab.savefig("./save_graph/breakout_dqn.png")

        if done:
            evaluation_reward.append(score)
            # Per-episode progress report
            print("episode:", e, "  score:", score, "  memory length:",
                  len(agent.memory), "  epsilon:", agent.epsilon, "   steps:", step,
                  "    evaluation reward:", np.mean(evaluation_reward))

            # Early stopping: once the mean score over the evaluation window
            # exceeds 40 and the window holds more than 350 episodes, save the
            # model and stop training.
            # NOTE(review): `evaluation_reward` is a deque bounded by
            # `evaluation_reward_length`; if that bound is <= 350 this
            # condition can never be true -- confirm the intended thresholds.
            if np.mean(evaluation_reward) > 40 and len(evaluation_reward) > 350:
                torch.save(agent.policy_net, "./save_model/breakout_dqn")
                # Leave the loops cleanly instead of calling sys.exit(), which
                # raises SystemExit and is reported as an error when run
                # inside a notebook cell.
                stop_training = True

    if stop_training:
        break

  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "
  probs = F.softmax(x[:,:self.action_size] - torch.max(x[:,:self.action_size],0)[0])


episode: 0   score: 170.0   memory length: 1037   epsilon: 1.0    steps: 1037     evaluation reward: 170.0
episode: 1   score: 655.0   memory length: 2608   epsilon: 1.0    steps: 1571     evaluation reward: 412.5
episode: 2   score: 135.0   memory length: 3174   epsilon: 1.0    steps: 566     evaluation reward: 320.0
episode: 3   score: 410.0   memory length: 4466   epsilon: 1.0    steps: 1292     evaluation reward: 342.5
episode: 4   score: 85.0   memory length: 5120   epsilon: 1.0    steps: 654     evaluation reward: 291.0
episode: 5   score: 155.0   memory length: 5820   epsilon: 1.0    steps: 700     evaluation reward: 268.3333333333333
episode: 6   score: 120.0   memory length: 6430   epsilon: 1.0    steps: 610     evaluation reward: 247.14285714285714
episode: 7   score: 140.0   memory length: 7206   epsilon: 1.0    steps: 776     evaluation reward: 233.75
episode: 8   score: 210.0   memory length: 8002   epsilon: 1.0    steps: 796     evaluation reward: 231.11111111111111
episo

  pol_loss += pol_avg.detach().cpu()[0]
  vf_loss += value_loss.detach().cpu()[0]
  ent_total += ent.detach().cpu()[0]


Policy loss: 0.000150. Value loss: 11.088337. Entropy: 1.771215.
Iteration 2
Policy loss: 0.000616. Value loss: 7.356457. Entropy: 1.765690.
Iteration 3
Policy loss: -0.002851. Value loss: 5.608305. Entropy: 1.764209.
Iteration 4
Policy loss: -0.008718. Value loss: 4.618650. Entropy: 1.765876.
Iteration 5
Policy loss: -0.013365. Value loss: 3.936110. Entropy: 1.763928.
episode: 12   score: 50.0   memory length: 10240   epsilon: 1.0    steps: 380     evaluation reward: 197.69230769230768
episode: 13   score: 135.0   memory length: 10240   epsilon: 1.0    steps: 644     evaluation reward: 193.21428571428572
episode: 14   score: 135.0   memory length: 10240   epsilon: 1.0    steps: 666     evaluation reward: 189.33333333333334
episode: 15   score: 210.0   memory length: 10240   epsilon: 1.0    steps: 768     evaluation reward: 190.625
episode: 16   score: 55.0   memory length: 10240   epsilon: 1.0    steps: 524     evaluation reward: 182.64705882352942
episode: 17   score: 190.0   memory 

episode: 68   score: 390.0   memory length: 10240   epsilon: 1.0    steps: 1364     evaluation reward: 149.63768115942028
episode: 69   score: 50.0   memory length: 10240   epsilon: 1.0    steps: 385     evaluation reward: 148.21428571428572
episode: 70   score: 155.0   memory length: 10240   epsilon: 1.0    steps: 787     evaluation reward: 148.30985915492957
now time :  2018-12-18 16:43:37.071820
episode: 71   score: 245.0   memory length: 10240   epsilon: 1.0    steps: 1000     evaluation reward: 149.65277777777777
episode: 72   score: 210.0   memory length: 10240   epsilon: 1.0    steps: 636     evaluation reward: 150.4794520547945
Training network
Iteration 1
Policy loss: -0.002079. Value loss: 6.519347. Entropy: 1.731731.
Iteration 2
Policy loss: -0.006203. Value loss: 4.463940. Entropy: 1.729289.
Iteration 3
Policy loss: -0.009409. Value loss: 3.497622. Entropy: 1.726244.
Iteration 4
Policy loss: -0.014321. Value loss: 3.060592. Entropy: 1.723430.
Iteration 5
Policy loss: -0.016

episode: 126   score: 210.0   memory length: 10240   epsilon: 1.0    steps: 922     evaluation reward: 152.0
episode: 127   score: 155.0   memory length: 10240   epsilon: 1.0    steps: 691     evaluation reward: 153.25
episode: 128   score: 60.0   memory length: 10240   epsilon: 1.0    steps: 389     evaluation reward: 152.35
episode: 129   score: 225.0   memory length: 10240   epsilon: 1.0    steps: 806     evaluation reward: 154.25
episode: 130   score: 80.0   memory length: 10240   epsilon: 1.0    steps: 394     evaluation reward: 153.95
episode: 131   score: 220.0   memory length: 10240   epsilon: 1.0    steps: 824     evaluation reward: 155.1
episode: 132   score: 125.0   memory length: 10240   epsilon: 1.0    steps: 609     evaluation reward: 155.45
episode: 133   score: 105.0   memory length: 10240   epsilon: 1.0    steps: 650     evaluation reward: 155.7
Training network
Iteration 1
Policy loss: -0.002538. Value loss: 6.374971. Entropy: 1.685458.
Iteration 2
Policy loss: -0.002

episode: 186   score: 35.0   memory length: 10240   epsilon: 1.0    steps: 405     evaluation reward: 139.6
episode: 187   score: 50.0   memory length: 10240   epsilon: 1.0    steps: 459     evaluation reward: 139.3
episode: 188   score: 105.0   memory length: 10240   epsilon: 1.0    steps: 463     evaluation reward: 139.35
episode: 189   score: 110.0   memory length: 10240   epsilon: 1.0    steps: 610     evaluation reward: 137.8
episode: 190   score: 110.0   memory length: 10240   epsilon: 1.0    steps: 557     evaluation reward: 137.55
episode: 191   score: 55.0   memory length: 10240   epsilon: 1.0    steps: 535     evaluation reward: 135.8
episode: 192   score: 110.0   memory length: 10240   epsilon: 1.0    steps: 875     evaluation reward: 136.4
episode: 193   score: 135.0   memory length: 10240   epsilon: 1.0    steps: 702     evaluation reward: 135.65
episode: 194   score: 105.0   memory length: 10240   epsilon: 1.0    steps: 856     evaluation reward: 136.15
episode: 195   sco

episode: 246   score: 30.0   memory length: 10240   epsilon: 1.0    steps: 578     evaluation reward: 133.25
episode: 247   score: 30.0   memory length: 10240   epsilon: 1.0    steps: 585     evaluation reward: 132.2
episode: 248   score: 135.0   memory length: 10240   epsilon: 1.0    steps: 712     evaluation reward: 132.85
episode: 249   score: 135.0   memory length: 10240   epsilon: 1.0    steps: 592     evaluation reward: 133.55
episode: 250   score: 80.0   memory length: 10240   epsilon: 1.0    steps: 644     evaluation reward: 133.25
episode: 251   score: 155.0   memory length: 10240   epsilon: 1.0    steps: 767     evaluation reward: 134.35
episode: 252   score: 510.0   memory length: 10240   epsilon: 1.0    steps: 1132     evaluation reward: 138.4
episode: 253   score: 105.0   memory length: 10240   epsilon: 1.0    steps: 587     evaluation reward: 134.7
Training network
Iteration 1
Policy loss: -0.003712. Value loss: 5.891301. Entropy: 1.617343.
Iteration 2
Policy loss: -0.011

episode: 306   score: 55.0   memory length: 10240   epsilon: 1.0    steps: 493     evaluation reward: 142.95
episode: 307   score: 75.0   memory length: 10240   epsilon: 1.0    steps: 610     evaluation reward: 139.9
episode: 308   score: 125.0   memory length: 10240   epsilon: 1.0    steps: 846     evaluation reward: 139.9
episode: 309   score: 360.0   memory length: 10240   epsilon: 1.0    steps: 1110     evaluation reward: 141.7
episode: 310   score: 135.0   memory length: 10240   epsilon: 1.0    steps: 639     evaluation reward: 141.65
episode: 311   score: 80.0   memory length: 10240   epsilon: 1.0    steps: 672     evaluation reward: 141.65
episode: 312   score: 425.0   memory length: 10240   epsilon: 1.0    steps: 865     evaluation reward: 144.7
episode: 313   score: 210.0   memory length: 10240   epsilon: 1.0    steps: 858     evaluation reward: 144.95
episode: 314   score: 120.0   memory length: 10240   epsilon: 1.0    steps: 637     evaluation reward: 144.45
Training network

episode: 366   score: 160.0   memory length: 10240   epsilon: 1.0    steps: 780     evaluation reward: 150.55
episode: 367   score: 45.0   memory length: 10240   epsilon: 1.0    steps: 563     evaluation reward: 147.9
episode: 368   score: 115.0   memory length: 10240   epsilon: 1.0    steps: 660     evaluation reward: 147.8
episode: 369   score: 485.0   memory length: 10240   epsilon: 1.0    steps: 942     evaluation reward: 151.1
episode: 370   score: 285.0   memory length: 10240   epsilon: 1.0    steps: 1038     evaluation reward: 152.75
episode: 371   score: 130.0   memory length: 10240   epsilon: 1.0    steps: 589     evaluation reward: 152.6
episode: 372   score: 235.0   memory length: 10240   epsilon: 1.0    steps: 1055     evaluation reward: 153.75
Training network
Iteration 1
Policy loss: -0.014632. Value loss: 6.500848. Entropy: 1.572691.
Iteration 2
Policy loss: -0.024054. Value loss: 3.576632. Entropy: 1.565891.
Iteration 3
Policy loss: -0.030016. Value loss: 2.788805. Entr

episode: 427   score: 70.0   memory length: 10240   epsilon: 1.0    steps: 440     evaluation reward: 147.95
episode: 428   score: 125.0   memory length: 10240   epsilon: 1.0    steps: 638     evaluation reward: 147.7
episode: 429   score: 155.0   memory length: 10240   epsilon: 1.0    steps: 718     evaluation reward: 148.75
episode: 430   score: 210.0   memory length: 10240   epsilon: 1.0    steps: 827     evaluation reward: 150.4
episode: 431   score: 90.0   memory length: 10240   epsilon: 1.0    steps: 642     evaluation reward: 149.8
Training network
Iteration 1
Policy loss: -0.018600. Value loss: 6.125489. Entropy: 1.519716.
Iteration 2
Policy loss: -0.030241. Value loss: 3.214061. Entropy: 1.503461.
Iteration 3
Policy loss: -0.034973. Value loss: 2.429712. Entropy: 1.514540.
Iteration 4
Policy loss: -0.033711. Value loss: 2.112190. Entropy: 1.509143.
Iteration 5
Policy loss: -0.041093. Value loss: 1.896403. Entropy: 1.508600.
episode: 432   score: 585.0   memory length: 10240   

episode: 487   score: 5.0   memory length: 10240   epsilon: 1.0    steps: 585     evaluation reward: 149.6
episode: 488   score: 335.0   memory length: 10240   epsilon: 1.0    steps: 759     evaluation reward: 152.3
episode: 489   score: 255.0   memory length: 10240   epsilon: 1.0    steps: 833     evaluation reward: 153.75
episode: 490   score: 105.0   memory length: 10240   epsilon: 1.0    steps: 504     evaluation reward: 153.15
episode: 491   score: 45.0   memory length: 10240   epsilon: 1.0    steps: 498     evaluation reward: 151.25
episode: 492   score: 155.0   memory length: 10240   epsilon: 1.0    steps: 641     evaluation reward: 150.4
Training network
Iteration 1
Policy loss: -0.017089. Value loss: 5.967039. Entropy: 1.494846.
Iteration 2
Policy loss: -0.029316. Value loss: 3.386766. Entropy: 1.490639.
Iteration 3
Policy loss: -0.040266. Value loss: 2.575833. Entropy: 1.490611.
Iteration 4
Policy loss: -0.038157. Value loss: 2.299448. Entropy: 1.499021.
Iteration 5
Policy lo

episode: 547   score: 145.0   memory length: 10240   epsilon: 1.0    steps: 739     evaluation reward: 152.95
episode: 548   score: 110.0   memory length: 10240   epsilon: 1.0    steps: 897     evaluation reward: 152.7
episode: 549   score: 170.0   memory length: 10240   epsilon: 1.0    steps: 497     evaluation reward: 152.15
episode: 550   score: 35.0   memory length: 10240   epsilon: 1.0    steps: 553     evaluation reward: 152.45
Training network
Iteration 1
Policy loss: -0.017754. Value loss: 5.579874. Entropy: 1.424652.
Iteration 2
Policy loss: -0.028042. Value loss: 2.726170. Entropy: 1.415855.
Iteration 3
Policy loss: -0.033957. Value loss: 2.060862. Entropy: 1.425745.
Iteration 4
Policy loss: -0.038750. Value loss: 1.862630. Entropy: 1.423745.
Iteration 5
Policy loss: -0.042534. Value loss: 1.710075. Entropy: 1.431518.
episode: 551   score: 305.0   memory length: 10240   epsilon: 1.0    steps: 763     evaluation reward: 150.2
episode: 552   score: 155.0   memory length: 10240 

Training network
Iteration 1
Policy loss: -0.034095. Value loss: 5.050344. Entropy: 1.308748.
Iteration 2
Policy loss: -0.041481. Value loss: 2.739177. Entropy: 1.286111.
Iteration 3
Policy loss: -0.047750. Value loss: 2.009619. Entropy: 1.289421.
Iteration 4
Policy loss: -0.047737. Value loss: 1.776197. Entropy: 1.295384.
Iteration 5
Policy loss: -0.050093. Value loss: 1.645231. Entropy: 1.290545.
episode: 607   score: 365.0   memory length: 10240   epsilon: 1.0    steps: 1323     evaluation reward: 163.15
episode: 608   score: 160.0   memory length: 10240   epsilon: 1.0    steps: 687     evaluation reward: 163.4
episode: 609   score: 155.0   memory length: 10240   epsilon: 1.0    steps: 677     evaluation reward: 163.6
episode: 610   score: 185.0   memory length: 10240   epsilon: 1.0    steps: 796     evaluation reward: 164.2
episode: 611   score: 105.0   memory length: 10240   epsilon: 1.0    steps: 593     evaluation reward: 163.45
episode: 612   score: 610.0   memory length: 10240

Policy loss: -0.050714. Value loss: 1.850624. Entropy: 1.224182.
Iteration 5
Policy loss: -0.047835. Value loss: 1.678307. Entropy: 1.229036.
episode: 665   score: 515.0   memory length: 10240   epsilon: 1.0    steps: 1166     evaluation reward: 166.65
episode: 666   score: 75.0   memory length: 10240   epsilon: 1.0    steps: 532     evaluation reward: 166.5
episode: 667   score: 80.0   memory length: 10240   epsilon: 1.0    steps: 421     evaluation reward: 165.7
episode: 668   score: 105.0   memory length: 10240   epsilon: 1.0    steps: 380     evaluation reward: 165.7
episode: 669   score: 140.0   memory length: 10240   epsilon: 1.0    steps: 795     evaluation reward: 165.4
episode: 670   score: 45.0   memory length: 10240   epsilon: 1.0    steps: 591     evaluation reward: 164.8
episode: 671   score: 90.0   memory length: 10240   epsilon: 1.0    steps: 576     evaluation reward: 163.4
episode: 672   score: 175.0   memory length: 10240   epsilon: 1.0    steps: 704     evaluation re

Policy loss: -0.052476. Value loss: 1.955612. Entropy: 1.232232.
Iteration 5
Policy loss: -0.054092. Value loss: 1.782946. Entropy: 1.219886.
episode: 725   score: 515.0   memory length: 10240   epsilon: 1.0    steps: 1505     evaluation reward: 148.35
episode: 726   score: 220.0   memory length: 10240   epsilon: 1.0    steps: 1030     evaluation reward: 149.2
episode: 727   score: 135.0   memory length: 10240   epsilon: 1.0    steps: 805     evaluation reward: 147.9
episode: 728   score: 120.0   memory length: 10240   epsilon: 1.0    steps: 745     evaluation reward: 148.55
episode: 729   score: 155.0   memory length: 10240   epsilon: 1.0    steps: 792     evaluation reward: 148.9
episode: 730   score: 210.0   memory length: 10240   epsilon: 1.0    steps: 820     evaluation reward: 149.75
episode: 731   score: 120.0   memory length: 10240   epsilon: 1.0    steps: 506     evaluation reward: 150.4
episode: 732   score: 300.0   memory length: 10240   epsilon: 1.0    steps: 1062     evalu

Policy loss: -0.052724. Value loss: 1.595304. Entropy: 1.228303.
episode: 785   score: 210.0   memory length: 10240   epsilon: 1.0    steps: 747     evaluation reward: 144.3
episode: 786   score: 210.0   memory length: 10240   epsilon: 1.0    steps: 787     evaluation reward: 145.55
episode: 787   score: 155.0   memory length: 10240   epsilon: 1.0    steps: 734     evaluation reward: 145.25
episode: 788   score: 325.0   memory length: 10240   epsilon: 1.0    steps: 1008     evaluation reward: 147.3
episode: 789   score: 210.0   memory length: 10240   epsilon: 1.0    steps: 1127     evaluation reward: 147.5
episode: 790   score: 50.0   memory length: 10240   epsilon: 1.0    steps: 353     evaluation reward: 146.8
episode: 791   score: 105.0   memory length: 10240   epsilon: 1.0    steps: 550     evaluation reward: 145.45
episode: 792   score: 90.0   memory length: 10240   epsilon: 1.0    steps: 554     evaluation reward: 144.0
episode: 793   score: 210.0   memory length: 10240   epsilon

episode: 845   score: 125.0   memory length: 10240   epsilon: 1.0    steps: 659     evaluation reward: 151.4
episode: 846   score: 140.0   memory length: 10240   epsilon: 1.0    steps: 557     evaluation reward: 152.0
episode: 847   score: 125.0   memory length: 10240   epsilon: 1.0    steps: 708     evaluation reward: 151.45
episode: 848   score: 515.0   memory length: 10240   epsilon: 1.0    steps: 1307     evaluation reward: 155.55
episode: 849   score: 105.0   memory length: 10240   epsilon: 1.0    steps: 590     evaluation reward: 155.5
episode: 850   score: 125.0   memory length: 10240   epsilon: 1.0    steps: 523     evaluation reward: 155.5
episode: 851   score: 120.0   memory length: 10240   epsilon: 1.0    steps: 662     evaluation reward: 154.6
episode: 852   score: 135.0   memory length: 10240   epsilon: 1.0    steps: 622     evaluation reward: 155.25
episode: 853   score: 115.0   memory length: 10240   epsilon: 1.0    steps: 644     evaluation reward: 154.9
episode: 854   

episode: 905   score: 30.0   memory length: 10240   epsilon: 1.0    steps: 401     evaluation reward: 150.2
episode: 906   score: 30.0   memory length: 10240   epsilon: 1.0    steps: 487     evaluation reward: 146.7
episode: 907   score: 80.0   memory length: 10240   epsilon: 1.0    steps: 384     evaluation reward: 146.0
episode: 908   score: 170.0   memory length: 10240   epsilon: 1.0    steps: 681     evaluation reward: 146.65
episode: 909   score: 105.0   memory length: 10240   epsilon: 1.0    steps: 594     evaluation reward: 147.35
episode: 910   score: 90.0   memory length: 10240   epsilon: 1.0    steps: 567     evaluation reward: 147.6
episode: 911   score: 105.0   memory length: 10240   epsilon: 1.0    steps: 614     evaluation reward: 147.45
episode: 912   score: 120.0   memory length: 10240   epsilon: 1.0    steps: 650     evaluation reward: 147.1
episode: 913   score: 155.0   memory length: 10240   epsilon: 1.0    steps: 748     evaluation reward: 145.95
episode: 914   scor

episode: 965   score: 150.0   memory length: 10240   epsilon: 1.0    steps: 724     evaluation reward: 159.95
episode: 966   score: 105.0   memory length: 10240   epsilon: 1.0    steps: 672     evaluation reward: 160.35
episode: 967   score: 260.0   memory length: 10240   epsilon: 1.0    steps: 922     evaluation reward: 162.1
episode: 968   score: 125.0   memory length: 10240   epsilon: 1.0    steps: 632     evaluation reward: 160.75
episode: 969   score: 60.0   memory length: 10240   epsilon: 1.0    steps: 406     evaluation reward: 160.2
episode: 970   score: 35.0   memory length: 10240   epsilon: 1.0    steps: 725     evaluation reward: 158.6
episode: 971   score: 135.0   memory length: 10240   epsilon: 1.0    steps: 594     evaluation reward: 159.45
episode: 972   score: 105.0   memory length: 10240   epsilon: 1.0    steps: 604     evaluation reward: 159.3
episode: 973   score: 80.0   memory length: 10240   epsilon: 1.0    steps: 408     evaluation reward: 159.55
Training network


episode: 1025   score: 135.0   memory length: 10240   epsilon: 1.0    steps: 508     evaluation reward: 156.15
episode: 1026   score: 95.0   memory length: 10240   epsilon: 1.0    steps: 602     evaluation reward: 155.55
episode: 1027   score: 125.0   memory length: 10240   epsilon: 1.0    steps: 646     evaluation reward: 155.7
episode: 1028   score: 140.0   memory length: 10240   epsilon: 1.0    steps: 675     evaluation reward: 156.45
episode: 1029   score: 65.0   memory length: 10240   epsilon: 1.0    steps: 507     evaluation reward: 155.0
episode: 1030   score: 110.0   memory length: 10240   epsilon: 1.0    steps: 530     evaluation reward: 154.55
episode: 1031   score: 125.0   memory length: 10240   epsilon: 1.0    steps: 651     evaluation reward: 155.2
episode: 1032   score: 290.0   memory length: 10240   epsilon: 1.0    steps: 964     evaluation reward: 156.75
episode: 1033   score: 345.0   memory length: 10240   epsilon: 1.0    steps: 783     evaluation reward: 156.85
Traini

now time :  2018-12-18 17:58:28.815095
episode: 1085   score: 60.0   memory length: 10240   epsilon: 1.0    steps: 510     evaluation reward: 139.15
episode: 1086   score: 120.0   memory length: 10240   epsilon: 1.0    steps: 403     evaluation reward: 138.8
episode: 1087   score: 135.0   memory length: 10240   epsilon: 1.0    steps: 762     evaluation reward: 138.3
episode: 1088   score: 120.0   memory length: 10240   epsilon: 1.0    steps: 635     evaluation reward: 137.95
episode: 1089   score: 520.0   memory length: 10240   epsilon: 1.0    steps: 1074     evaluation reward: 142.8
episode: 1090   score: 250.0   memory length: 10240   epsilon: 1.0    steps: 971     evaluation reward: 142.7
episode: 1091   score: 100.0   memory length: 10240   epsilon: 1.0    steps: 384     evaluation reward: 142.65
episode: 1092   score: 60.0   memory length: 10240   epsilon: 1.0    steps: 408     evaluation reward: 142.15
episode: 1093   score: 155.0   memory length: 10240   epsilon: 1.0    steps: 6

episode: 1145   score: 140.0   memory length: 10240   epsilon: 1.0    steps: 644     evaluation reward: 151.5
episode: 1146   score: 45.0   memory length: 10240   epsilon: 1.0    steps: 563     evaluation reward: 151.2
episode: 1147   score: 105.0   memory length: 10240   epsilon: 1.0    steps: 630     evaluation reward: 150.85
episode: 1148   score: 35.0   memory length: 10240   epsilon: 1.0    steps: 436     evaluation reward: 150.9
episode: 1149   score: 135.0   memory length: 10240   epsilon: 1.0    steps: 588     evaluation reward: 150.9
episode: 1150   score: 35.0   memory length: 10240   epsilon: 1.0    steps: 384     evaluation reward: 150.4
episode: 1151   score: 110.0   memory length: 10240   epsilon: 1.0    steps: 722     evaluation reward: 150.1
episode: 1152   score: 135.0   memory length: 10240   epsilon: 1.0    steps: 585     evaluation reward: 149.35
episode: 1153   score: 240.0   memory length: 10240   epsilon: 1.0    steps: 978     evaluation reward: 149.65
Training n

In [None]:
# Persist the whole policy network object so it can be reloaded later.
torch.save(obj=agent.policy_net, f="./save_model/breakout_dqn")