# Deep Q-Learning 

For this assignment we will implement the Deep Q-Learning algorithm with Experience Replay as described in the breakthrough paper __"Playing Atari with Deep Reinforcement Learning"__. We will train an agent to play the famous game of __Breakout__.

In [1]:
import os
import random
import sys
from collections import deque
from copy import deepcopy
from datetime import datetime

import gym
import numpy as np
import pylab
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

from utils import *
from agent import *
from model import *
from config import *

%matplotlib inline
%load_ext autoreload
%autoreload 2

## Understanding the environment

In the following cell, we initialise our game environment and you can see what the environment looks like. (The text refers to __Breakout__, but the cell below creates `SpaceInvadersDeterministic-v4`.) For further documentation of the environment, refer to https://gym.openai.com/envs.

In [2]:
# Create the Atari environment from its Gym ID. Note that this ID selects
# Space Invaders, even though the surrounding text talks about Breakout.
env = gym.make('SpaceInvadersDeterministic-v4')
# Rendering is left disabled here; the training loop renders on its own when
# the `render_breakout` flag is set.
#env.render()

In [3]:
# Number of lives the agent starts each game with, as reported by the
# `find_max_lifes` helper (presumably from utils.py -- it arrives via a star import).
number_lives = find_max_lifes(env)

# Shape of a raw observation as reported by the environment.
state_size = env.observation_space.shape

# Take the number of discrete actions from the environment instead of
# hard-coding 6, so the agent's output layer always matches the environment
# that was actually created (for SpaceInvadersDeterministic-v4 this is still 6).
action_size = int(env.action_space.n)

# Histories used for the learning-curve plot: mean evaluation reward and the
# episode index at which it was recorded.
rewards, episodes = [], []

## Creating a DQN Agent

Here we create a DQN Agent. This agent is defined in __agent.py__. The corresponding neural network is defined in __model.py__.

__Evaluation Reward__ : The average reward received in the past 100 episodes/games.

__Frame__ : Number of frames processed in total.

__Memory Size__ : The current size of the replay memory.

In [None]:
# Global counters: total frames processed so far, and the replay-memory size.
frame, memory_size = 0, 0

# Bounded window of the most recent episode scores; its mean is reported as
# the "evaluation reward". The window length comes from the config.
evaluation_reward = deque(maxlen=evaluation_reward_length)

# The learning agent, sized to the number of available actions.
agent = Agent(action_size)


### Main Training Loop

In [None]:
# Make sure the output directories exist before training starts, so a long run
# is not lost to a FileNotFoundError on the first savefig / torch.save call.
os.makedirs("./save_graph", exist_ok=True)
os.makedirs("./save_model", exist_ok=True)

for e in range(EPISODES):
    done = False
    score = 0
    step = 0

    # Rolling buffer of five 84x84 uint8 frames: slots 0-3 are what the agent
    # sees, slot 4 receives the newest frame before the buffer is shifted.
    history = np.zeros([5, 84, 84], dtype=np.uint8)
    state = env.reset()
    life = number_lives

    # Presumably fills `history` in place from the first observation -- see utils.
    get_init_state(history, state)

    while not done:
        step += 1
        frame += 1
        if render_breakout:
            env.render()

        # Select an action from the four most recent frames, scaled to [0, 1],
        # and perform it in the environment.
        action, value = agent.get_action(np.float32(history[:4, :, :]) / 255.)
        next_state, reward, done, info = env.step(action)

        frame_next_state = get_frame(next_state)
        history[4, :, :] = frame_next_state

        # Presumably flags a lost life as a terminal transition -- confirm in utils.
        terminal_state = check_live(life, info['ale.lives'])
        life = info['ale.lives']

        # Rewards are stored unclipped (no np.clip(reward, -1, 1) is applied).
        r = reward

        # Store the transition in memory.
        agent.memory.push(deepcopy(frame_next_state), action, r, terminal_state, value, 0, 0)

        # Every `train_frame` frames: train the policy network, then sync the
        # target network.
        if frame % train_frame == 0:
            agent.train_policy_net(frame)
            agent.update_target_net()

        score += r
        # Shift the buffer by one frame so slots 0-3 hold the latest four frames.
        history[:4, :, :] = history[1:, :, :]

        # Every 50000 frames: log a timestamp and refresh the learning curve.
        if frame % 50000 == 0:
            print('now time : ', datetime.now())
            # Skip the data point while no episode has finished yet; the mean
            # of an empty window would be NaN.
            if evaluation_reward:
                rewards.append(np.mean(evaluation_reward))
                episodes.append(e)
                pylab.plot(episodes, rewards, 'b')
                pylab.savefig("./save_graph/breakout_dqn.png")

    # The while loop only exits once `done` is True, so everything below runs
    # exactly once per finished episode.
    evaluation_reward.append(score)
    print("episode:", e, "  score:", score, "  memory length:",
          len(agent.memory), "  epsilon:", agent.epsilon, "   steps:", step,
          "    evaluation reward:", np.mean(evaluation_reward))

    # Stop training once the mean score over the evaluation window exceeds 40
    # and more than 350 episode scores have been recorded.
    # NOTE(review): `evaluation_reward` is a deque bounded by
    # `evaluation_reward_length`; if that length is <= 350 this condition can
    # never be satisfied -- confirm against config.py.
    if np.mean(evaluation_reward) > 40 and len(evaluation_reward) > 350:
        torch.save(agent.policy_net, "./save_model/breakout_dqn")
        # Leave the loop instead of calling sys.exit(), which would raise
        # SystemExit inside the notebook kernel.
        break

  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "
  probs = F.softmax(x[:,:self.action_size] - torch.max(x[:,:self.action_size],0)[0])


episode: 0   score: 210.0   memory length: 1069   epsilon: 1.0    steps: 1069     evaluation reward: 210.0
episode: 1   score: 65.0   memory length: 1700   epsilon: 1.0    steps: 631     evaluation reward: 137.5
episode: 2   score: 190.0   memory length: 2674   epsilon: 1.0    steps: 974     evaluation reward: 155.0
episode: 3   score: 155.0   memory length: 3397   epsilon: 1.0    steps: 723     evaluation reward: 155.0
episode: 4   score: 440.0   memory length: 4786   epsilon: 1.0    steps: 1389     evaluation reward: 212.0
episode: 5   score: 210.0   memory length: 5555   epsilon: 1.0    steps: 769     evaluation reward: 211.66666666666666
episode: 6   score: 230.0   memory length: 6366   epsilon: 1.0    steps: 811     evaluation reward: 214.28571428571428
episode: 7   score: 215.0   memory length: 7243   epsilon: 1.0    steps: 877     evaluation reward: 214.375
episode: 8   score: 140.0   memory length: 7878   epsilon: 1.0    steps: 635     evaluation reward: 206.11111111111111
epis

  pol_loss += pol_avg.detach().cpu()[0]
  vf_loss += value_loss.detach().cpu()[0]
  ent_total += ent.detach().cpu()[0]


Policy loss: 0.002120. Value loss: 9.616186. Entropy: 1.774370.
Iteration 2
Policy loss: 0.001829. Value loss: 7.188891. Entropy: 1.768062.
Iteration 3
Policy loss: -0.000299. Value loss: 5.595581. Entropy: 1.767132.
episode: 12   score: 65.0   memory length: 10467   epsilon: 1.0    steps: 544     evaluation reward: 179.23076923076923
episode: 13   score: 305.0   memory length: 11214   epsilon: 1.0    steps: 747     evaluation reward: 188.21428571428572
episode: 14   score: 5.0   memory length: 11670   epsilon: 1.0    steps: 456     evaluation reward: 176.0
episode: 15   score: 420.0   memory length: 12985   epsilon: 1.0    steps: 1315     evaluation reward: 191.25
episode: 16   score: 210.0   memory length: 13756   epsilon: 1.0    steps: 771     evaluation reward: 192.35294117647058
episode: 17   score: 155.0   memory length: 14601   epsilon: 1.0    steps: 845     evaluation reward: 190.27777777777777
episode: 18   score: 135.0   memory length: 15372   epsilon: 1.0    steps: 771     e

episode: 71   score: 70.0   memory length: 51444   epsilon: 1.0    steps: 571     evaluation reward: 153.40277777777777
episode: 72   score: 90.0   memory length: 51983   epsilon: 1.0    steps: 539     evaluation reward: 152.53424657534248
episode: 73   score: 100.0   memory length: 52556   epsilon: 1.0    steps: 573     evaluation reward: 151.82432432432432
episode: 74   score: 185.0   memory length: 53273   epsilon: 1.0    steps: 717     evaluation reward: 152.26666666666668
episode: 75   score: 140.0   memory length: 54062   epsilon: 1.0    steps: 789     evaluation reward: 152.10526315789474
episode: 76   score: 390.0   memory length: 55224   epsilon: 1.0    steps: 1162     evaluation reward: 155.19480519480518
episode: 77   score: 135.0   memory length: 55850   epsilon: 1.0    steps: 626     evaluation reward: 154.93589743589743
episode: 78   score: 140.0   memory length: 56542   epsilon: 1.0    steps: 692     evaluation reward: 154.74683544303798
episode: 79   score: 90.0   memor

In [None]:
# Persist the whole policy network object (not just its state_dict).
# Create the target directory first so the save cannot fail on a missing folder.
os.makedirs("./save_model", exist_ok=True)
torch.save(agent.policy_net, "./save_model/breakout_dqn")