# Deep Q-Learning 

For this assignment we will implement the Deep Q-Learning algorithm with Experience Replay, as described in the breakthrough paper __"Playing Atari with Deep Reinforcement Learning"__. We will train an agent to play the famous game of __Breakout__.

In [1]:
import os
import random
import sys
from collections import deque
from copy import deepcopy
from datetime import datetime

import gym
import numpy as np
import pylab
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

from utils import *
from agent import *
from model import *
from config import *
%matplotlib inline
%load_ext autoreload
%autoreload 2

## Understanding the environment

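In the following cell, we initialise our game of __Breakout__ and you can see what the environment looks like. For further documentation of the environment, refer to https://gym.openai.com/envs. (Note: the cell below currently creates `SpaceInvadersDeterministic-v4` rather than a Breakout environment.)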

In [2]:
# Create the Atari environment the agent interacts with for the rest of the
# notebook. Note that the id below is Space Invaders, not Breakout.
env = gym.make('SpaceInvadersDeterministic-v4')
# Uncomment to render the freshly created environment for a quick visual check.
#env.render()

In [3]:
# Starting number of lives, as reported by the project helper `find_max_lifes`
# (defined in utils; presumably it queries the emulator -- confirm there).
number_lives = find_max_lifes(env)
# Shape of a raw observation returned by the environment.
state_size = env.observation_space.shape
# Derive the number of discrete actions from the environment instead of
# hard-coding it: the training loop passes the agent's action straight to
# env.step(), so this must match the environment's action space exactly.
# For 'SpaceInvadersDeterministic-v4' this evaluates to the previous value, 6.
action_size = int(env.action_space.n)
# Points plotted by the training loop: mean evaluation reward vs. episode index.
rewards, episodes = [], []

## Creating a DQN Agent

Here we create a DQN Agent. The agent is defined in __agent.py__, and the corresponding neural network is defined in __model.py__.

__Evaluation Reward__ : The average reward received in the past 100 episodes/games.

__Frame__ : Number of frames processed in total.

__Memory Size__ : The current size of the replay memory.

In [4]:
# Global counters: total number of frames processed so far, and the current
# size of the replay memory.
frame, memory_size = 0, 0
# Sliding window of recent episode scores; its length comes from the config
# value `evaluation_reward_length`.
evaluation_reward = deque(maxlen=evaluation_reward_length)
# The learning agent (see agent.py), sized for the environment's action count.
agent = Agent(action_size)


### Main Training Loop

In [None]:
# Main training loop.
#
# Runs up to EPISODES episodes. Every environment step is pushed into the
# agent's memory; every `train_frame` frames the agent is trained and its
# target network updated. Every 50000 frames the mean evaluation reward is
# plotted and saved. Training stops early once the stopping criterion near the
# bottom of the loop is met.

# Create the output directories up front so that a missing folder cannot abort
# a long run when the first graph or checkpoint is written.
os.makedirs("./save_graph", exist_ok=True)
os.makedirs("./save_model", exist_ok=True)

# Set once the stopping criterion is met so the episode loop can end cleanly
# (instead of raising SystemExit from inside a notebook cell).
solved = False

for e in range(EPISODES):
    done = False
    score = 0

    # Rolling buffer of five 84x84 uint8 frames: slots 0-3 are the stack fed to
    # the agent, slot 4 receives the newest frame before the buffer is shifted.
    history = np.zeros([5, 84, 84], dtype=np.uint8)
    step = 0
    state = env.reset()
    life = number_lives

    # Project helper from utils; presumably seeds `history` from the first
    # observation -- confirm in utils.
    get_init_state(history, state)

    while not done:
        step += 1
        frame += 1
        if render_breakout:
            env.render()

        # Select and perform an action on the current 4-frame stack, scaled
        # from uint8 to floats in [0, 1].
        action, value = agent.get_action(np.float32(history[:4, :, :]) / 255.)

        next_state, reward, done, info = env.step(action)

        frame_next_state = get_frame(next_state)
        history[4, :, :] = frame_next_state
        # Project helper from utils; presumably flags that a life was lost on
        # this step -- confirm in utils.
        terminal_state = check_live(life, info['ale.lives'])

        life = info['ale.lives']
        # Reward clipping is intentionally disabled; the raw reward is used.
        # (Clipped alternative: r = np.clip(reward, -1, 1))
        r = reward

        # Store the transition in memory. The meaning of the two trailing
        # zeros is defined by the memory class -- see its push() signature.
        agent.memory.push(deepcopy(frame_next_state), action, r, terminal_state, value, 0, 0)
        # Train every `train_frame` frames, then update the target network.
        if frame % train_frame == 0:
            agent.train_policy_net(frame)
            agent.update_target_net()
        score += r
        # Slide the window: drop the oldest frame, keep the newest four.
        history[:4, :, :] = history[1:, :, :]

        # Periodic progress snapshot: log the time and save the reward curve.
        if frame % 50000 == 0:
            print('now time : ', datetime.now())
            rewards.append(np.mean(evaluation_reward))
            episodes.append(e)
            pylab.plot(episodes, rewards, 'b')
            pylab.savefig("./save_graph/breakout_dqn.png")

        if done:
            evaluation_reward.append(score)
            # Per-episode summary line.
            print("episode:", e, "  score:", score, "  memory length:",
                  len(agent.memory), "  epsilon:", agent.epsilon, "   steps:", step,
                  "    evaluation reward:", np.mean(evaluation_reward))

            # Stop training once the mean score over the evaluation window
            # exceeds 40 and the window holds more than 350 episodes.
            # NOTE(review): `evaluation_reward` is a deque bounded by
            # `evaluation_reward_length`; if that config value is <= 350 this
            # condition can never be true -- verify against config.py.
            if np.mean(evaluation_reward) > 40 and len(evaluation_reward) > 350:
                torch.save(agent.policy_net, "./save_model/breakout_dqn")
                solved = True

    # `done` is already True here, so the inner loop has ended on its own;
    # leave the episode loop as well once the criterion has been met.
    if solved:
        break

  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "
  probs = F.softmax(x[:,:self.action_size] - torch.max(x[:,:self.action_size],0)[0])


episode: 0   score: 90.0   memory length: 566   epsilon: 1.0    steps: 566     evaluation reward: 90.0
Training network
Iteration 1


  pol_loss += pol_avg.detach().cpu()[0]
  vf_loss += value_loss.detach().cpu()[0]
  ent_total += ent.detach().cpu()[0]


Policy loss: -0.007590. Value loss: 9.183960. Entropy: 1.782009.

load: 0.125391
forward: 0.060244
loss: 0.016833
clip: 0.010742
backward: 0.061074
step: 0.041012
total: 0.315297

Iteration 2
Policy loss: -0.016150. Value loss: 6.841040. Entropy: 1.771372.

load: 0.121090
forward: 0.056096
loss: 0.016732
clip: 0.010488
backward: 0.060100
step: 0.038951
total: 0.303455

Iteration 3
Policy loss: -0.016073. Value loss: 4.636711. Entropy: 1.761170.

load: 0.120648
forward: 0.056216
loss: 0.017094
clip: 0.010195
backward: 0.059795
step: 0.038394
total: 0.302341

Iteration 4
Policy loss: -0.014230. Value loss: 4.763137. Entropy: 1.745392.

load: 0.120423
forward: 0.055136
loss: 0.016663
clip: 0.010163
backward: 0.058842
step: 0.037626
total: 0.298854

Iteration 5
Policy loss: -0.036654. Value loss: 3.803099. Entropy: 1.753364.

load: 0.121626
forward: 0.055560
loss: 0.017443
clip: 0.010147
backward: 0.059712
step: 0.037637
total: 0.302124

episode: 1   score: 135.0   memory length: 1024   ep

In [None]:
# Persist the agent's policy network. The target directory is created first so
# the save cannot fail with a missing-folder error after a long training run.
os.makedirs("./save_model", exist_ok=True)
torch.save(agent.policy_net, "./save_model/breakout_dqn")