# Deep Q-Learning 

For this assignment we will implement the Deep Q-Learning algorithm with Experience Replay, as described in the breakthrough paper __"Playing Atari with Deep Reinforcement Learning"__. We will train an agent to play the famous game of __Space Invaders__.

In [1]:
import sys
import gym
import torch
import pylab
import random
import numpy as np
from collections import deque
from datetime import datetime
from copy import deepcopy
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from utils import *
from agent import *
from model import *
from config import *
%matplotlib inline
%load_ext autoreload
%autoreload 2

  return f(*args, **kwds)


## Understanding the environment

In the following cell, we initialise our game of __Space Invaders__ and you can see what the environment looks like. For further documentation of the environment, refer to https://gym.openai.com/envs.

In [2]:
# Create the Atari environment registered under the Gym id 'SpaceInvaders-v0'.
env = gym.make('SpaceInvaders-v0')
# Render the environment once; as the last expression in the cell, the return
# value of render() is what the notebook echoes as the cell output.
env.render()

  result = entry_point.load(False)


True

In [3]:
# Starting life count used to detect life loss during training.
# NOTE(review): find_max_lifes comes from a star import; presumably it returns
# the number of lives at the start of a game -- confirm against its definition.
number_lives = find_max_lifes(env)
state_size = env.observation_space.shape
# Read the number of discrete actions from the environment instead of
# hard-coding it. The agent's actions are passed straight to env.step(), so the
# two must agree; SpaceInvaders-v0 exposes 6 actions, the previous literal value.
action_size = env.action_space.n
# Learning-curve samples: mean evaluation reward and the episode index at which
# each sample was taken (both appended to by the training loop).
rewards, episodes = [], []

## Creating a DQN Agent

Here we create a DQN Agent. This agent is defined in __agent.py__. The corresponding neural network is defined in __model.py__.

__Evaluation Reward__ : The average reward received in the past 100 episodes/games.

__Frame__ : Number of frames processed in total.

__Memory Size__ : The current size of the replay memory.

In [None]:
# Global counters shared with the training loop: total frames processed so far
# and the replay-memory size placeholder.
frame, memory_size = 0, 0

# Sliding window of the most recent episode scores; its mean is reported as the
# "evaluation reward".
evaluation_reward = deque(maxlen=evaluation_reward_length)

# The learner itself, sized to the number of available actions.
agent = Agent(action_size)


### Main Training Loop

In [None]:
# Main training loop: play up to EPISODES games, storing every transition in the
# agent's replay memory and periodically training the policy network.
for e in range(EPISODES):
    done = False
    score = 0
    step = 0

    # Rolling frame buffer: slots 0-3 are fed to the agent as the current state,
    # slot 4 receives the newest frame before the buffer is shifted by one.
    history = np.zeros([5, 84, 84], dtype=np.uint8)
    state = env.reset()
    life = number_lives

    get_init_state(history, state)

    while not done:
        step += 1
        frame += 1
        if render_breakout:
            env.render()

        # Select and perform an action (frames are scaled to [0, 1] floats).
        action = agent.get_action(np.float32(history[:4, :, :]) / 255.)
        next_state, reward, done, info = env.step(action)

        frame_next_state = get_frame(next_state)
        history[4, :, :] = frame_next_state
        # NOTE(review): check_live presumably flags a lost life as a terminal
        # transition -- confirm against its definition.
        terminal_state = check_live(life, info['ale.lives'])
        life = info['ale.lives']

        # The raw reward is stored; clipping is intentionally left disabled.
        # r = np.clip(reward, -1, 1)
        r = reward

        # Store the transition in memory
        agent.memory.push(deepcopy(frame_next_state), action, r, terminal_state)

        # Start training after random sample generation
        if frame >= train_frame and frame % train_freq == 0:
            agent.train_policy_net(frame)
            # Update the target network
            if frame % Update_target_network_frequency == 0:
                agent.update_target_net()

        score += reward
        # Slide the window: drop the oldest frame, keep the newest four.
        history[:4, :, :] = history[1:, :, :]

        # Every 50000 frames, record the current evaluation reward and refresh
        # the saved learning-curve image.
        if frame % 50000 == 0:
            print('now time : ', datetime.now())
            rewards.append(np.mean(evaluation_reward))
            episodes.append(e)
            pylab.plot(episodes, rewards, 'b')
            pylab.savefig("./save_graph/breakout_dqn.png")

    # The step loop only exits once `done` is True, so the end-of-episode
    # bookkeeping runs exactly once per episode here.
    evaluation_reward.append(score)
    print("episode:", e, "  score:", score, "  memory length:",
          len(agent.memory), "  epsilon:", agent.epsilon, "   steps:", step,
          "    evaluation reward:", np.mean(evaluation_reward))

    # Stop training once the mean score over the evaluation window exceeds 700.
    # `break` is used instead of sys.exit() so the cell finishes normally
    # rather than raising SystemExit inside the notebook kernel.
    if np.mean(evaluation_reward) > 700:
        torch.save(agent.policy_net, "./save_model/breakout_dqn")
        break

episode: 0   score: 120.0   memory length: 623   epsilon: 1.0    steps: 623     evaluation reward: 120.0
episode: 1   score: 80.0   memory length: 1197   epsilon: 1.0    steps: 574     evaluation reward: 100.0
episode: 2   score: 110.0   memory length: 1884   epsilon: 1.0    steps: 687     evaluation reward: 103.33333333333333
episode: 3   score: 335.0   memory length: 2655   epsilon: 1.0    steps: 771     evaluation reward: 161.25
episode: 4   score: 365.0   memory length: 3619   epsilon: 1.0    steps: 964     evaluation reward: 202.0
episode: 5   score: 105.0   memory length: 4320   epsilon: 1.0    steps: 701     evaluation reward: 185.83333333333334
episode: 6   score: 380.0   memory length: 5504   epsilon: 1.0    steps: 1184     evaluation reward: 213.57142857142858
episode: 7   score: 180.0   memory length: 6140   epsilon: 1.0    steps: 636     evaluation reward: 209.375
episode: 8   score: 110.0   memory length: 6750   epsilon: 1.0    steps: 610     evaluation reward: 198.3333333

episode: 70   score: 65.0   memory length: 48680   epsilon: 1.0    steps: 396     evaluation reward: 144.22535211267606
episode: 71   score: 100.0   memory length: 49265   epsilon: 1.0    steps: 585     evaluation reward: 143.61111111111111
episode: 72   score: 90.0   memory length: 49786   epsilon: 1.0    steps: 521     evaluation reward: 142.87671232876713
now time :  2020-02-29 12:48:41.237882
episode: 73   score: 285.0   memory length: 50968   epsilon: 0.9990794500000004    steps: 1182     evaluation reward: 144.7972972972973
episode: 74   score: 210.0   memory length: 51736   epsilon: 0.9983498500000008    steps: 768     evaluation reward: 145.66666666666666
episode: 75   score: 125.0   memory length: 52382   epsilon: 0.997736150000001    steps: 646     evaluation reward: 145.39473684210526
episode: 76   score: 55.0   memory length: 52990   epsilon: 0.9971585500000013    steps: 608     evaluation reward: 144.2207792207792
episode: 77   score: 120.0   memory length: 53655   epsilon

episode: 134   score: 155.0   memory length: 94248   epsilon: 0.9579634500000194    steps: 827     evaluation reward: 146.25
episode: 135   score: 70.0   memory length: 94858   epsilon: 0.9573839500000196    steps: 610     evaluation reward: 146.05
episode: 136   score: 125.0   memory length: 95577   epsilon: 0.95670090000002    steps: 719     evaluation reward: 146.5
episode: 137   score: 50.0   memory length: 95962   epsilon: 0.9563351500000201    steps: 385     evaluation reward: 145.9
episode: 138   score: 140.0   memory length: 96511   epsilon: 0.9558136000000204    steps: 549     evaluation reward: 145.2
episode: 139   score: 135.0   memory length: 97214   epsilon: 0.9551457500000207    steps: 703     evaluation reward: 144.0
episode: 140   score: 180.0   memory length: 97998   epsilon: 0.954400950000021    steps: 784     evaluation reward: 144.7
episode: 141   score: 20.0   memory length: 98447   epsilon: 0.9539744000000212    steps: 449     evaluation reward: 143.55
episode: 14

episode: 200   score: 65.0   memory length: 139396   epsilon: 0.9150728500000391    steps: 496     evaluation reward: 161.45
episode: 201   score: 135.0   memory length: 140184   epsilon: 0.9143242500000395    steps: 788     evaluation reward: 161.75
episode: 202   score: 115.0   memory length: 140693   epsilon: 0.9138407000000397    steps: 509     evaluation reward: 161.55
episode: 203   score: 45.0   memory length: 141101   epsilon: 0.9134531000000399    steps: 408     evaluation reward: 159.4
episode: 204   score: 430.0   memory length: 142051   epsilon: 0.9125506000000403    steps: 950     evaluation reward: 163.5
episode: 205   score: 15.0   memory length: 142548   epsilon: 0.9120784500000405    steps: 497     evaluation reward: 163.0
episode: 206   score: 120.0   memory length: 143159   epsilon: 0.9114980000000408    steps: 611     evaluation reward: 163.85
episode: 207   score: 50.0   memory length: 143571   epsilon: 0.911106600000041    steps: 412     evaluation reward: 163.0
e

episode: 266   score: 280.0   memory length: 187376   epsilon: 0.8694918500000601    steps: 941     evaluation reward: 162.0
episode: 267   score: 105.0   memory length: 187991   epsilon: 0.8689076000000604    steps: 615     evaluation reward: 161.5
episode: 268   score: 75.0   memory length: 188430   epsilon: 0.8684905500000606    steps: 439     evaluation reward: 160.7
episode: 269   score: 300.0   memory length: 189412   epsilon: 0.867557650000061    steps: 982     evaluation reward: 161.55
episode: 270   score: 55.0   memory length: 189992   epsilon: 0.8670066500000613    steps: 580     evaluation reward: 160.75
episode: 271   score: 115.0   memory length: 190771   epsilon: 0.8662666000000616    steps: 779     evaluation reward: 161.4
episode: 272   score: 185.0   memory length: 191583   epsilon: 0.865495200000062    steps: 812     evaluation reward: 161.7
episode: 273   score: 180.0   memory length: 192276   epsilon: 0.8648368500000623    steps: 693     evaluation reward: 159.65
e

episode: 332   score: 385.0   memory length: 235971   epsilon: 0.8233266000000814    steps: 1057     evaluation reward: 154.8
episode: 333   score: 75.0   memory length: 236395   epsilon: 0.8229238000000816    steps: 424     evaluation reward: 154.8
episode: 334   score: 110.0   memory length: 237063   epsilon: 0.8222892000000819    steps: 668     evaluation reward: 154.1
episode: 335   score: 240.0   memory length: 237865   epsilon: 0.8215273000000822    steps: 802     evaluation reward: 155.3
episode: 336   score: 135.0   memory length: 238638   epsilon: 0.8207929500000826    steps: 773     evaluation reward: 155.45
episode: 337   score: 110.0   memory length: 239277   epsilon: 0.8201859000000828    steps: 639     evaluation reward: 155.5
episode: 338   score: 35.0   memory length: 239824   epsilon: 0.8196662500000831    steps: 547     evaluation reward: 155.35
episode: 339   score: 225.0   memory length: 240693   epsilon: 0.8188407000000835    steps: 869     evaluation reward: 152.5

episode: 398   score: 380.0   memory length: 282879   epsilon: 0.7787640000001019    steps: 862     evaluation reward: 161.85
episode: 399   score: 210.0   memory length: 283679   epsilon: 0.7780040000001023    steps: 800     evaluation reward: 162.75
episode: 400   score: 110.0   memory length: 284376   epsilon: 0.7773418500001026    steps: 697     evaluation reward: 163.55
episode: 401   score: 375.0   memory length: 285170   epsilon: 0.7765875500001029    steps: 794     evaluation reward: 166.65
episode: 402   score: 120.0   memory length: 285808   epsilon: 0.7759814500001032    steps: 638     evaluation reward: 166.3
episode: 403   score: 110.0   memory length: 286497   epsilon: 0.7753269000001035    steps: 689     evaluation reward: 165.55
episode: 404   score: 125.0   memory length: 287276   epsilon: 0.7745868500001039    steps: 779     evaluation reward: 166.0
episode: 405   score: 155.0   memory length: 288127   epsilon: 0.7737784000001042    steps: 851     evaluation reward: 1

In [None]:
# Persist the trained policy network to disk. The whole module object is
# serialized (not just a state_dict), so loading it back requires the model
# class definition to be importable.
torch.save(agent.policy_net, "./save_model/breakout_dqn")