# Deep Q-Learning 

For this assignment we will implement the Deep Q-Learning algorithm with Experience Replay, as described in the breakthrough paper __"Playing Atari with Deep Reinforcement Learning"__. We will train an agent to play the famous game of __Breakout__.

In [4]:
import sys
import gym
import torch
import pylab
import random
import numpy as np
from collections import deque
from datetime import datetime
from copy import deepcopy
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from utils import *
from agent import *
from model import *
from config import *
%matplotlib inline
%load_ext autoreload
%autoreload 2

## Understanding the environment

In the following cell, we initialise our game of __Breakout__ and you can see what the environment looks like. For further documentation of the environment, refer to https://gym.openai.com/envs. 

In [5]:
# Create the 'BreakoutDeterministic-v4' Gym environment used by every cell below.
env = gym.make('BreakoutDeterministic-v4')
# Render once so the game screen can be inspected; as the last expression in
# the cell, the return value of render() is what the notebook displays.
env.render()

True

In [6]:
# find_max_lifes is provided by one of the star imports (utils/agent/model/config).
# Presumably it returns the number of lives a fresh game starts with — TODO confirm.
number_lives = find_max_lifes(env)
# Shape of a raw observation as reported by the environment.
state_size = env.observation_space.shape
# Number of actions the agent chooses between. The training loop passes
# `action + 1` to env.step, so the agent's actions are offset by one from the
# environment's action ids.
action_size = 3
# Filled in during training: the mean evaluation reward and the episode index
# recorded at each periodic logging point (used for the saved plot).
rewards, episodes = [], []

## Creating a DQN Agent

Here we create a DQN Agent. This agent is defined in __agent.py__. The corresponding neural network is defined in __model.py__. 

__Evaluation Reward__ : The average reward received in the past 100 episodes/games.

__Frame__ : Number of frames processed in total.

__Memory Size__ : The current size of the replay memory.

In [7]:
# Build the DQN agent for `action_size` discrete actions.
agent = Agent(action_size)
# Rolling window of the most recent episode scores; once it holds
# `evaluation_reward_length` entries the oldest score is dropped on each append.
evaluation_reward = deque(maxlen=evaluation_reward_length)
# Total number of environment steps taken so far, across all episodes.
frame = 0
# NOTE(review): not referenced by the training loop in this notebook (which
# reports len(agent.memory) instead) — confirm whether this is still needed.
memory_size = 0


### Main Training Loop

In [None]:
# Main DQN training loop.
#
# Plays up to EPISODES games. Every environment step is pushed to the agent's
# replay memory; once `frame` reaches `train_frame` the policy network is
# trained on every step and the target network is refreshed every
# `Update_target_network_frequency` frames. Training stops early, after saving
# the model, as soon as the mean score over the `evaluation_reward` window
# exceeds 10.
#
# NOTE: the "./save_graph" and "./save_model" directories are assumed to exist
# already — this cell does not create them.

# Set once the stop criterion is met so the episode loop can end cleanly.
# (The previous sys.exit() raised SystemExit inside the notebook cell, which
# is reported as an error instead of a normal end of training.)
stop_training = False

for e in range(EPISODES):
    done = False
    score = 0

    # Five preprocessed 84x84 frames: slots 0-3 are the state fed to the agent,
    # slot 4 receives the newest frame before the window is shifted.
    history = np.zeros([5, 84, 84], dtype=np.uint8)
    step = 0
    state = env.reset()
    life = number_lives

    # Presumably fills `history` from the first observation — TODO confirm in utils.
    get_init_state(history, state)

    while not done:
        step += 1
        frame += 1
        if render_breakout:
            env.render()

        # Select an action from the current 4-frame state, scaled to [0, 1].
        action = agent.get_action(np.float32(history[:4, :, :]) / 255.)

        # The agent's action index is shifted by one before being sent to the
        # environment (the agent works with `action_size` = 3 actions).
        next_state, reward, done, info = env.step(action + 1)

        frame_next_state = get_frame(next_state)
        history[4, :, :] = frame_next_state
        # Presumably True when a life was lost on this step — TODO confirm in utils.
        terminal_state = check_live(life, info['ale.lives'])

        life = info['ale.lives']
        # The stored reward is clipped to [-1, 1]; `score` below keeps the raw reward.
        r = np.clip(reward, -1, 1)

        # Store the transition in memory
        agent.memory.push(deepcopy(frame_next_state), action, r, terminal_state)

        # Start training only after `train_frame` steps have been collected.
        if frame >= train_frame:
            agent.train_policy_net(frame)
            # Periodically update the target network.
            if frame % Update_target_network_frequency == 0:
                agent.update_target_net()

        score += reward
        # Slide the window: drop the oldest frame, keep the newest four.
        history[:4, :, :] = history[1:, :, :]

        # Every 50000 frames, record the current mean evaluation reward and
        # refresh the saved learning curve.
        if frame % 50000 == 0:
            print('now time : ', datetime.now())
            rewards.append(np.mean(evaluation_reward))
            episodes.append(e)
            pylab.plot(episodes, rewards, 'b')
            pylab.savefig("./save_graph/breakout_dqn.png")

        if done:
            evaluation_reward.append(score)
            mean_evaluation_reward = np.mean(evaluation_reward)
            # Per-episode progress report.
            print("episode:", e, "  score:", score, "  memory length:",
                  len(agent.memory), "  epsilon:", agent.epsilon, "   steps:", step,
                  "    evaluation reward:", mean_evaluation_reward)

            # Stop training once the mean score over the evaluation window
            # (the last `evaluation_reward_length` episodes) exceeds 10.
            if mean_evaluation_reward > 10:
                torch.save(agent.model, "./save_model/breakout_dqn")
                stop_training = True

    if stop_training:
        break

  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "


episode: 0   score: 0.0   memory length: 135   epsilon: 1.0    steps: 135     evaluation reward: 0.0
episode: 1   score: 2.0   memory length: 336   epsilon: 1.0    steps: 201     evaluation reward: 1.0
episode: 2   score: 3.0   memory length: 590   epsilon: 1.0    steps: 254     evaluation reward: 1.6666666666666667
episode: 3   score: 1.0   memory length: 770   epsilon: 1.0    steps: 180     evaluation reward: 1.5
episode: 4   score: 0.0   memory length: 909   epsilon: 1.0    steps: 139     evaluation reward: 1.2
episode: 5   score: 1.0   memory length: 1082   epsilon: 1.0    steps: 173     evaluation reward: 1.1666666666666667
episode: 6   score: 0.0   memory length: 1222   epsilon: 1.0    steps: 140     evaluation reward: 1.0
episode: 7   score: 1.0   memory length: 1378   epsilon: 1.0    steps: 156     evaluation reward: 1.0
episode: 8   score: 0.0   memory length: 1507   epsilon: 1.0    steps: 129     evaluation reward: 0.8888888888888888
episode: 9   score: 1.0   memory length: 1

episode: 72   score: 0.0   memory length: 13346   epsilon: 1.0    steps: 135     evaluation reward: 1.3424657534246576
episode: 73   score: 2.0   memory length: 13554   epsilon: 1.0    steps: 208     evaluation reward: 1.3513513513513513
episode: 74   score: 1.0   memory length: 13727   epsilon: 1.0    steps: 173     evaluation reward: 1.3466666666666667
episode: 75   score: 1.0   memory length: 13880   epsilon: 1.0    steps: 153     evaluation reward: 1.3421052631578947
episode: 76   score: 0.0   memory length: 14005   epsilon: 1.0    steps: 125     evaluation reward: 1.3246753246753247
episode: 77   score: 2.0   memory length: 14212   epsilon: 1.0    steps: 207     evaluation reward: 1.3333333333333333
episode: 78   score: 1.0   memory length: 14382   epsilon: 1.0    steps: 170     evaluation reward: 1.3291139240506329
episode: 79   score: 0.0   memory length: 14511   epsilon: 1.0    steps: 129     evaluation reward: 1.3125
episode: 80   score: 2.0   memory length: 14714   epsilon: 1

episode: 147   score: 1.0   memory length: 26315   epsilon: 1.0    steps: 156     evaluation reward: 1.24
episode: 148   score: 2.0   memory length: 26518   epsilon: 1.0    steps: 203     evaluation reward: 1.22
episode: 149   score: 1.0   memory length: 26695   epsilon: 1.0    steps: 177     evaluation reward: 1.19
episode: 150   score: 0.0   memory length: 26825   epsilon: 1.0    steps: 130     evaluation reward: 1.18
episode: 151   score: 1.0   memory length: 26985   epsilon: 1.0    steps: 160     evaluation reward: 1.18
episode: 152   score: 1.0   memory length: 27139   epsilon: 1.0    steps: 154     evaluation reward: 1.19
episode: 153   score: 1.0   memory length: 27322   epsilon: 1.0    steps: 183     evaluation reward: 1.18
episode: 154   score: 1.0   memory length: 27499   epsilon: 1.0    steps: 177     evaluation reward: 1.16
episode: 155   score: 2.0   memory length: 27686   epsilon: 1.0    steps: 187     evaluation reward: 1.17
episode: 156   score: 1.0   memory length: 278

episode: 225   score: 2.0   memory length: 39250   epsilon: 1.0    steps: 185     evaluation reward: 0.87
episode: 226   score: 0.0   memory length: 39376   epsilon: 1.0    steps: 126     evaluation reward: 0.87
episode: 227   score: 1.0   memory length: 39558   epsilon: 1.0    steps: 182     evaluation reward: 0.88
episode: 228   score: 0.0   memory length: 39680   epsilon: 1.0    steps: 122     evaluation reward: 0.86
episode: 229   score: 0.0   memory length: 39822   epsilon: 1.0    steps: 142     evaluation reward: 0.86
episode: 230   score: 1.0   memory length: 39995   epsilon: 1.0    steps: 173     evaluation reward: 0.86
episode: 231   score: 1.0   memory length: 40152   epsilon: 1.0    steps: 157     evaluation reward: 0.87
episode: 232   score: 3.0   memory length: 40382   epsilon: 1.0    steps: 230     evaluation reward: 0.89
episode: 233   score: 2.0   memory length: 40570   epsilon: 1.0    steps: 188     evaluation reward: 0.9
episode: 234   score: 3.0   memory length: 4081

episode: 303   score: 4.0   memory length: 52873   epsilon: 1.0    steps: 309     evaluation reward: 1.08
episode: 304   score: 3.0   memory length: 53127   epsilon: 1.0    steps: 254     evaluation reward: 1.11
episode: 305   score: 2.0   memory length: 53329   epsilon: 1.0    steps: 202     evaluation reward: 1.12
episode: 306   score: 0.0   memory length: 53468   epsilon: 1.0    steps: 139     evaluation reward: 1.12
episode: 307   score: 4.0   memory length: 53762   epsilon: 1.0    steps: 294     evaluation reward: 1.16
episode: 308   score: 0.0   memory length: 53900   epsilon: 1.0    steps: 138     evaluation reward: 1.15
episode: 309   score: 1.0   memory length: 54069   epsilon: 1.0    steps: 169     evaluation reward: 1.12
episode: 310   score: 1.0   memory length: 54256   epsilon: 1.0    steps: 187     evaluation reward: 1.13
episode: 311   score: 1.0   memory length: 54434   epsilon: 1.0    steps: 178     evaluation reward: 1.11
episode: 312   score: 1.0   memory length: 545

episode: 381   score: 1.0   memory length: 67648   epsilon: 1.0    steps: 170     evaluation reward: 1.42
episode: 382   score: 2.0   memory length: 67857   epsilon: 1.0    steps: 209     evaluation reward: 1.42
episode: 383   score: 1.0   memory length: 68027   epsilon: 1.0    steps: 170     evaluation reward: 1.41
episode: 384   score: 1.0   memory length: 68179   epsilon: 1.0    steps: 152     evaluation reward: 1.42
episode: 385   score: 3.0   memory length: 68434   epsilon: 1.0    steps: 255     evaluation reward: 1.45
episode: 386   score: 0.0   memory length: 68567   epsilon: 1.0    steps: 133     evaluation reward: 1.45
episode: 387   score: 0.0   memory length: 68700   epsilon: 1.0    steps: 133     evaluation reward: 1.44
episode: 388   score: 3.0   memory length: 68933   epsilon: 1.0    steps: 233     evaluation reward: 1.46
episode: 389   score: 1.0   memory length: 69107   epsilon: 1.0    steps: 174     evaluation reward: 1.46
episode: 390   score: 3.0   memory length: 693

episode: 459   score: 0.0   memory length: 81676   epsilon: 1.0    steps: 128     evaluation reward: 1.32
episode: 460   score: 0.0   memory length: 81804   epsilon: 1.0    steps: 128     evaluation reward: 1.31
episode: 461   score: 0.0   memory length: 81946   epsilon: 1.0    steps: 142     evaluation reward: 1.31
episode: 462   score: 1.0   memory length: 82125   epsilon: 1.0    steps: 179     evaluation reward: 1.29
episode: 463   score: 4.0   memory length: 82419   epsilon: 1.0    steps: 294     evaluation reward: 1.33
episode: 464   score: 0.0   memory length: 82553   epsilon: 1.0    steps: 134     evaluation reward: 1.3
episode: 465   score: 0.0   memory length: 82694   epsilon: 1.0    steps: 141     evaluation reward: 1.3
episode: 466   score: 1.0   memory length: 82846   epsilon: 1.0    steps: 152     evaluation reward: 1.29
episode: 467   score: 1.0   memory length: 83022   epsilon: 1.0    steps: 176     evaluation reward: 1.27
episode: 468   score: 4.0   memory length: 83329

episode: 537   score: 1.0   memory length: 95743   epsilon: 1.0    steps: 171     evaluation reward: 1.25
episode: 538   score: 1.0   memory length: 95907   epsilon: 1.0    steps: 164     evaluation reward: 1.25
episode: 539   score: 3.0   memory length: 96136   epsilon: 1.0    steps: 229     evaluation reward: 1.28
episode: 540   score: 0.0   memory length: 96276   epsilon: 1.0    steps: 140     evaluation reward: 1.26
episode: 541   score: 1.0   memory length: 96442   epsilon: 1.0    steps: 166     evaluation reward: 1.27
episode: 542   score: 2.0   memory length: 96649   epsilon: 1.0    steps: 207     evaluation reward: 1.29
episode: 543   score: 3.0   memory length: 96886   epsilon: 1.0    steps: 237     evaluation reward: 1.29
episode: 544   score: 2.0   memory length: 97109   epsilon: 1.0    steps: 223     evaluation reward: 1.27
episode: 545   score: 0.0   memory length: 97244   epsilon: 1.0    steps: 135     evaluation reward: 1.25
episode: 546   score: 2.0   memory length: 974

episode: 608   score: 0.0   memory length: 107986   epsilon: 0.9920928700001717    steps: 127     evaluation reward: 1.11
episode: 609   score: 0.0   memory length: 108116   epsilon: 0.9919641700001744    steps: 130     evaluation reward: 1.08
episode: 610   score: 1.0   memory length: 108285   epsilon: 0.9917968600001781    steps: 169     evaluation reward: 1.09
episode: 611   score: 0.0   memory length: 108415   epsilon: 0.9916681600001809    steps: 130     evaluation reward: 1.07
episode: 612   score: 0.0   memory length: 108547   epsilon: 0.9915374800001837    steps: 132     evaluation reward: 1.07
episode: 613   score: 0.0   memory length: 108680   epsilon: 0.9914058100001866    steps: 133     evaluation reward: 1.07
episode: 614   score: 1.0   memory length: 108841   epsilon: 0.99124642000019    steps: 161     evaluation reward: 1.08
episode: 615   score: 2.0   memory length: 109055   epsilon: 0.9910345600001946    steps: 214     evaluation reward: 1.09
episode: 616   score: 3.0 

episode: 676   score: 1.0   memory length: 120454   epsilon: 0.9797495500004396    steps: 155     evaluation reward: 1.23
episode: 677   score: 2.0   memory length: 120641   epsilon: 0.9795644200004436    steps: 187     evaluation reward: 1.24
episode: 678   score: 0.0   memory length: 120779   epsilon: 0.9794278000004466    steps: 138     evaluation reward: 1.23
episode: 679   score: 2.0   memory length: 120993   epsilon: 0.9792159400004512    steps: 214     evaluation reward: 1.24
episode: 680   score: 4.0   memory length: 121270   epsilon: 0.9789417100004572    steps: 277     evaluation reward: 1.26
episode: 681   score: 2.0   memory length: 121458   epsilon: 0.9787555900004612    steps: 188     evaluation reward: 1.27
episode: 682   score: 0.0   memory length: 121590   epsilon: 0.978624910000464    steps: 132     evaluation reward: 1.26
episode: 683   score: 0.0   memory length: 121724   epsilon: 0.9784922500004669    steps: 134     evaluation reward: 1.26
episode: 684   score: 2.0

episode: 744   score: 1.0   memory length: 132232   epsilon: 0.9680893300006927    steps: 155     evaluation reward: 1.25
episode: 745   score: 0.0   memory length: 132374   epsilon: 0.9679487500006958    steps: 142     evaluation reward: 1.25
episode: 746   score: 3.0   memory length: 132617   epsilon: 0.967708180000701    steps: 243     evaluation reward: 1.26
episode: 747   score: 0.0   memory length: 132751   epsilon: 0.9675755200007039    steps: 134     evaluation reward: 1.25
episode: 748   score: 0.0   memory length: 132878   epsilon: 0.9674497900007066    steps: 127     evaluation reward: 1.24
episode: 749   score: 3.0   memory length: 133155   epsilon: 0.9671755600007126    steps: 277     evaluation reward: 1.25
episode: 750   score: 1.0   memory length: 133331   epsilon: 0.9670013200007164    steps: 176     evaluation reward: 1.24
episode: 751   score: 5.0   memory length: 133646   epsilon: 0.9666894700007231    steps: 315     evaluation reward: 1.26
episode: 752   score: 1.0

episode: 812   score: 2.0   memory length: 144839   epsilon: 0.9556084000009637    steps: 200     evaluation reward: 1.36
episode: 813   score: 3.0   memory length: 145083   epsilon: 0.9553668400009689    steps: 244     evaluation reward: 1.38
episode: 814   score: 3.0   memory length: 145320   epsilon: 0.955132210000974    steps: 237     evaluation reward: 1.41
episode: 815   score: 0.0   memory length: 145454   epsilon: 0.9549995500009769    steps: 134     evaluation reward: 1.41
episode: 816   score: 0.0   memory length: 145583   epsilon: 0.9548718400009797    steps: 129     evaluation reward: 1.37
episode: 817   score: 0.0   memory length: 145716   epsilon: 0.9547401700009825    steps: 133     evaluation reward: 1.37
episode: 818   score: 2.0   memory length: 145913   epsilon: 0.9545451400009868    steps: 197     evaluation reward: 1.39
episode: 819   score: 3.0   memory length: 146167   epsilon: 0.9542936800009922    steps: 254     evaluation reward: 1.41
episode: 820   score: 2.0

episode: 880   score: 1.0   memory length: 157471   epsilon: 0.9431027200012352    steps: 183     evaluation reward: 1.46
episode: 881   score: 1.0   memory length: 157627   epsilon: 0.9429482800012385    steps: 156     evaluation reward: 1.47
episode: 882   score: 4.0   memory length: 157921   epsilon: 0.9426572200012449    steps: 294     evaluation reward: 1.51
episode: 883   score: 2.0   memory length: 158125   epsilon: 0.9424552600012492    steps: 204     evaluation reward: 1.53
episode: 884   score: 2.0   memory length: 158348   epsilon: 0.942234490001254    steps: 223     evaluation reward: 1.52
episode: 885   score: 1.0   memory length: 158524   epsilon: 0.9420602500012578    steps: 176     evaluation reward: 1.53
episode: 886   score: 0.0   memory length: 158652   epsilon: 0.9419335300012606    steps: 128     evaluation reward: 1.53
episode: 887   score: 3.0   memory length: 158871   epsilon: 0.9417167200012653    steps: 219     evaluation reward: 1.55
episode: 888   score: 2.0

episode: 948   score: 0.0   memory length: 170806   epsilon: 0.9299010700015218    steps: 130     evaluation reward: 1.66
episode: 949   score: 2.0   memory length: 171012   epsilon: 0.9296971300015262    steps: 206     evaluation reward: 1.67
episode: 950   score: 3.0   memory length: 171269   epsilon: 0.9294427000015317    steps: 257     evaluation reward: 1.69
episode: 951   score: 6.0   memory length: 171605   epsilon: 0.929110060001539    steps: 336     evaluation reward: 1.74
episode: 952   score: 4.0   memory length: 171887   epsilon: 0.928830880001545    steps: 282     evaluation reward: 1.76
episode: 953   score: 1.0   memory length: 172042   epsilon: 0.9286774300015483    steps: 155     evaluation reward: 1.77
episode: 954   score: 4.0   memory length: 172341   epsilon: 0.9283814200015548    steps: 299     evaluation reward: 1.8
episode: 955   score: 0.0   memory length: 172470   epsilon: 0.9282537100015575    steps: 129     evaluation reward: 1.77
episode: 956   score: 3.0  

episode: 1016   score: 1.0   memory length: 184413   epsilon: 0.9164301400018142    steps: 161     evaluation reward: 1.8
episode: 1017   score: 3.0   memory length: 184667   epsilon: 0.9161786800018197    steps: 254     evaluation reward: 1.81
episode: 1018   score: 3.0   memory length: 184925   epsilon: 0.9159232600018252    steps: 258     evaluation reward: 1.84
episode: 1019   score: 0.0   memory length: 185056   epsilon: 0.915793570001828    steps: 131     evaluation reward: 1.8
episode: 1020   score: 4.0   memory length: 185332   epsilon: 0.915520330001834    steps: 276     evaluation reward: 1.79
episode: 1021   score: 1.0   memory length: 185489   epsilon: 0.9153649000018373    steps: 157     evaluation reward: 1.79
episode: 1022   score: 2.0   memory length: 185696   epsilon: 0.9151599700018418    steps: 207     evaluation reward: 1.79
episode: 1023   score: 1.0   memory length: 185870   epsilon: 0.9149877100018455    steps: 174     evaluation reward: 1.8
episode: 1024   score

episode: 1083   score: 2.0   memory length: 198094   epsilon: 0.9028859500021083    steps: 201     evaluation reward: 1.88
episode: 1084   score: 2.0   memory length: 198282   epsilon: 0.9026998300021123    steps: 188     evaluation reward: 1.89
episode: 1085   score: 1.0   memory length: 198435   epsilon: 0.9025483600021156    steps: 153     evaluation reward: 1.88
episode: 1086   score: 1.0   memory length: 198603   epsilon: 0.9023820400021192    steps: 168     evaluation reward: 1.86
episode: 1087   score: 2.0   memory length: 198813   epsilon: 0.9021741400021237    steps: 210     evaluation reward: 1.86
episode: 1088   score: 0.0   memory length: 198943   epsilon: 0.9020454400021265    steps: 130     evaluation reward: 1.83
episode: 1089   score: 0.0   memory length: 199081   epsilon: 0.9019088200021295    steps: 138     evaluation reward: 1.82
episode: 1090   score: 4.0   memory length: 199350   epsilon: 0.9016425100021352    steps: 269     evaluation reward: 1.83
episode: 1091   

episode: 1150   score: 1.0   memory length: 211402   epsilon: 0.8897110300023943    steps: 168     evaluation reward: 1.89
episode: 1151   score: 5.0   memory length: 211739   epsilon: 0.8893774000024015    steps: 337     evaluation reward: 1.92
episode: 1152   score: 0.0   memory length: 211874   epsilon: 0.8892437500024044    steps: 135     evaluation reward: 1.89
episode: 1153   score: 1.0   memory length: 212030   epsilon: 0.8890893100024078    steps: 156     evaluation reward: 1.87
episode: 1154   score: 6.0   memory length: 212345   epsilon: 0.8887774600024145    steps: 315     evaluation reward: 1.91
episode: 1155   score: 1.0   memory length: 212504   epsilon: 0.888620050002418    steps: 159     evaluation reward: 1.9
episode: 1156   score: 1.0   memory length: 212666   epsilon: 0.8884596700024214    steps: 162     evaluation reward: 1.89
episode: 1157   score: 1.0   memory length: 212825   epsilon: 0.8883022600024248    steps: 159     evaluation reward: 1.85
episode: 1158   sc

episode: 1217   score: 2.0   memory length: 226557   epsilon: 0.87470758000272    steps: 214     evaluation reward: 2.27
episode: 1218   score: 2.0   memory length: 226768   epsilon: 0.8744986900027245    steps: 211     evaluation reward: 2.25
episode: 1219   score: 3.0   memory length: 226984   epsilon: 0.8742848500027292    steps: 216     evaluation reward: 2.25
episode: 1220   score: 1.0   memory length: 227144   epsilon: 0.8741264500027326    steps: 160     evaluation reward: 2.25
episode: 1221   score: 0.0   memory length: 227272   epsilon: 0.8739997300027353    steps: 128     evaluation reward: 2.2
episode: 1222   score: 0.0   memory length: 227402   epsilon: 0.8738710300027381    steps: 130     evaluation reward: 2.18
episode: 1223   score: 2.0   memory length: 227596   epsilon: 0.8736789700027423    steps: 194     evaluation reward: 2.18
episode: 1224   score: 3.0   memory length: 227838   epsilon: 0.8734393900027475    steps: 242     evaluation reward: 2.21
episode: 1225   sco

episode: 1284   score: 1.0   memory length: 240406   epsilon: 0.8609970700030176    steps: 157     evaluation reward: 2.23
episode: 1285   score: 3.0   memory length: 240680   epsilon: 0.8607258100030235    steps: 274     evaluation reward: 2.25
episode: 1286   score: 2.0   memory length: 240880   epsilon: 0.8605278100030278    steps: 200     evaluation reward: 2.22
episode: 1287   score: 2.0   memory length: 241081   epsilon: 0.8603288200030321    steps: 201     evaluation reward: 2.23
episode: 1288   score: 3.0   memory length: 241349   epsilon: 0.8600635000030379    steps: 268     evaluation reward: 2.22
episode: 1289   score: 3.0   memory length: 241582   epsilon: 0.8598328300030429    steps: 233     evaluation reward: 2.22
episode: 1290   score: 1.0   memory length: 241742   epsilon: 0.8596744300030463    steps: 160     evaluation reward: 2.2
episode: 1291   score: 0.0   memory length: 241874   epsilon: 0.8595437500030492    steps: 132     evaluation reward: 2.2
episode: 1292   sc

episode: 1351   score: 4.0   memory length: 255152   epsilon: 0.8463985300033345    steps: 305     evaluation reward: 2.3
episode: 1352   score: 5.0   memory length: 255510   epsilon: 0.8460441100033422    steps: 358     evaluation reward: 2.33
episode: 1353   score: 3.0   memory length: 255748   epsilon: 0.8458084900033473    steps: 238     evaluation reward: 2.34
episode: 1354   score: 0.0   memory length: 255888   epsilon: 0.8456698900033504    steps: 140     evaluation reward: 2.31
episode: 1355   score: 4.0   memory length: 256196   epsilon: 0.845364970003357    steps: 308     evaluation reward: 2.33
episode: 1356   score: 1.0   memory length: 256367   epsilon: 0.8451956800033606    steps: 171     evaluation reward: 2.33
episode: 1357   score: 2.0   memory length: 256577   epsilon: 0.8449877800033652    steps: 210     evaluation reward: 2.34
episode: 1358   score: 3.0   memory length: 256824   epsilon: 0.8447432500033705    steps: 247     evaluation reward: 2.36
episode: 1359   sc

episode: 1418   score: 1.0   memory length: 270151   epsilon: 0.8315495200036569    steps: 181     evaluation reward: 2.37
episode: 1419   score: 5.0   memory length: 270523   epsilon: 0.8311812400036649    steps: 372     evaluation reward: 2.4
episode: 1420   score: 4.0   memory length: 270811   epsilon: 0.8308961200036711    steps: 288     evaluation reward: 2.42
episode: 1421   score: 3.0   memory length: 271044   epsilon: 0.8306654500036761    steps: 233     evaluation reward: 2.43
episode: 1422   score: 1.0   memory length: 271201   epsilon: 0.8305100200036795    steps: 157     evaluation reward: 2.42
episode: 1423   score: 3.0   memory length: 271432   epsilon: 0.8302813300036844    steps: 231     evaluation reward: 2.45
episode: 1424   score: 1.0   memory length: 271588   epsilon: 0.8301268900036878    steps: 156     evaluation reward: 2.43
episode: 1425   score: 2.0   memory length: 271778   epsilon: 0.8299387900036919    steps: 190     evaluation reward: 2.43
episode: 1426   s

episode: 1485   score: 3.0   memory length: 285821   epsilon: 0.8160362200039937    steps: 242     evaluation reward: 2.67
episode: 1486   score: 7.0   memory length: 286239   epsilon: 0.8156224000040027    steps: 418     evaluation reward: 2.7
episode: 1487   score: 4.0   memory length: 286529   epsilon: 0.8153353000040089    steps: 290     evaluation reward: 2.67
episode: 1488   score: 4.0   memory length: 286793   epsilon: 0.8150739400040146    steps: 264     evaluation reward: 2.7
episode: 1489   score: 1.0   memory length: 286949   epsilon: 0.8149195000040179    steps: 156     evaluation reward: 2.68
episode: 1490   score: 7.0   memory length: 287339   epsilon: 0.8145334000040263    steps: 390     evaluation reward: 2.68
episode: 1491   score: 1.0   memory length: 287508   epsilon: 0.8143660900040299    steps: 169     evaluation reward: 2.67
episode: 1492   score: 2.0   memory length: 287714   epsilon: 0.8141621500040344    steps: 206     evaluation reward: 2.67
episode: 1493   sc

episode: 1552   score: 1.0   memory length: 302744   epsilon: 0.7992824500043574    steps: 175     evaluation reward: 3.2
episode: 1553   score: 2.0   memory length: 302930   epsilon: 0.7990983100043614    steps: 186     evaluation reward: 3.21
episode: 1554   score: 2.0   memory length: 303137   epsilon: 0.7988933800043658    steps: 207     evaluation reward: 3.2
episode: 1555   score: 2.0   memory length: 303364   epsilon: 0.7986686500043707    steps: 227     evaluation reward: 3.19
episode: 1556   score: 5.0   memory length: 303664   epsilon: 0.7983716500043772    steps: 300     evaluation reward: 3.22
episode: 1557   score: 5.0   memory length: 303978   epsilon: 0.7980607900043839    steps: 314     evaluation reward: 3.24
episode: 1558   score: 3.0   memory length: 304194   epsilon: 0.7978469500043885    steps: 216     evaluation reward: 3.23
episode: 1559   score: 3.0   memory length: 304415   epsilon: 0.7976281600043933    steps: 221     evaluation reward: 3.2
episode: 1560   sco

episode: 1619   score: 7.0   memory length: 319858   epsilon: 0.7823395900047252    steps: 420     evaluation reward: 3.28
episode: 1620   score: 2.0   memory length: 320061   epsilon: 0.7821386200047296    steps: 203     evaluation reward: 3.26
episode: 1621   score: 3.0   memory length: 320287   epsilon: 0.7819148800047344    steps: 226     evaluation reward: 3.24
episode: 1622   score: 1.0   memory length: 320440   epsilon: 0.7817634100047377    steps: 153     evaluation reward: 3.22
episode: 1623   score: 6.0   memory length: 320748   epsilon: 0.7814584900047443    steps: 308     evaluation reward: 3.21
episode: 1624   score: 5.0   memory length: 321039   epsilon: 0.7811704000047506    steps: 291     evaluation reward: 3.23
episode: 1625   score: 5.0   memory length: 321351   epsilon: 0.7808615200047573    steps: 312     evaluation reward: 3.25
episode: 1626   score: 1.0   memory length: 321525   epsilon: 0.780689260004761    steps: 174     evaluation reward: 3.24
episode: 1627   s

episode: 1686   score: 4.0   memory length: 335459   epsilon: 0.7668946000050605    steps: 299     evaluation reward: 3.13
episode: 1687   score: 1.0   memory length: 335633   epsilon: 0.7667223400050642    steps: 174     evaluation reward: 3.08
episode: 1688   score: 3.0   memory length: 335896   epsilon: 0.7664619700050699    steps: 263     evaluation reward: 3.06
episode: 1689   score: 3.0   memory length: 336142   epsilon: 0.7662184300050752    steps: 246     evaluation reward: 3.05
episode: 1690   score: 0.0   memory length: 336270   epsilon: 0.7660917100050779    steps: 128     evaluation reward: 2.96
episode: 1691   score: 4.0   memory length: 336545   epsilon: 0.7658194600050838    steps: 275     evaluation reward: 2.97
episode: 1692   score: 6.0   memory length: 336890   epsilon: 0.7654779100050912    steps: 345     evaluation reward: 2.98
episode: 1693   score: 5.0   memory length: 337193   epsilon: 0.7651779400050978    steps: 303     evaluation reward: 3.0
episode: 1694   s

episode: 1753   score: 3.0   memory length: 352315   epsilon: 0.7502071600054228    steps: 238     evaluation reward: 3.12
episode: 1754   score: 2.0   memory length: 352510   epsilon: 0.750014110005427    steps: 195     evaluation reward: 3.14
episode: 1755   score: 4.0   memory length: 352799   epsilon: 0.7497280000054332    steps: 289     evaluation reward: 3.16
episode: 1756   score: 5.0   memory length: 353122   epsilon: 0.7494082300054401    steps: 323     evaluation reward: 3.19
episode: 1757   score: 4.0   memory length: 353391   epsilon: 0.7491419200054459    steps: 269     evaluation reward: 3.22
episode: 1758   score: 3.0   memory length: 353643   epsilon: 0.7488924400054513    steps: 252     evaluation reward: 3.2
episode: 1759   score: 5.0   memory length: 354010   epsilon: 0.7485291100054592    steps: 367     evaluation reward: 3.22
episode: 1760   score: 4.0   memory length: 354275   epsilon: 0.7482667600054649    steps: 265     evaluation reward: 3.24
episode: 1761   sc

episode: 1820   score: 9.0   memory length: 369899   epsilon: 0.7327990000058007    steps: 333     evaluation reward: 3.51
episode: 1821   score: 6.0   memory length: 370265   epsilon: 0.7324366600058085    steps: 366     evaluation reward: 3.49
episode: 1822   score: 2.0   memory length: 370446   epsilon: 0.7322574700058124    steps: 181     evaluation reward: 3.51
episode: 1823   score: 3.0   memory length: 370672   epsilon: 0.7320337300058173    steps: 226     evaluation reward: 3.51
episode: 1824   score: 4.0   memory length: 370951   epsilon: 0.7317575200058233    steps: 279     evaluation reward: 3.53
episode: 1825   score: 6.0   memory length: 371283   epsilon: 0.7314288400058304    steps: 332     evaluation reward: 3.55
episode: 1826   score: 4.0   memory length: 371544   epsilon: 0.731170450005836    steps: 261     evaluation reward: 3.55
episode: 1827   score: 5.0   memory length: 371860   epsilon: 0.7308576100058428    steps: 316     evaluation reward: 3.58
episode: 1828   s