# Deep Q-Learning 

For this assignment we will implement the Deep Q-Learning algorithm with Experience Replay, as described in the breakthrough paper __"Playing Atari with Deep Reinforcement Learning"__. We will train an agent to play the famous game of __Breakout__.

In [1]:
import sys
import gym
import torch
import pylab
import random
import numpy as np
from collections import deque
from datetime import datetime
from copy import deepcopy
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from utils import *
from agent import *
from model import *
from config import *
%matplotlib inline
%load_ext autoreload
%autoreload 2

## Understanding the environment

In the following cell, we initialise our game of __Breakout__ and you can see what the environment looks like. For further documentation of the environment, refer to https://gym.openai.com/envs.

In [2]:
# Create the Atari environment the agent is trained on.
# NOTE(review): the environment id is Space Invaders, while the surrounding
# text and the save paths used later refer to Breakout — confirm which game
# is intended.
env = gym.make('SpaceInvadersDeterministic-v4')
# Uncomment to render the environment once after creation.
#env.render()

In [3]:
# Starting number of lives, as reported by the project helper `find_max_lifes`
# (presumably the maximum life count of the game — confirm in utils.py).
number_lives = find_max_lifes(env)

# Shape of a raw observation returned by the environment.
state_size = env.observation_space.shape

# Read the number of discrete actions from the environment instead of
# hard-coding it, so it stays correct if the environment id is changed.
action_size = int(env.action_space.n)

# Periodic training log: mean evaluation reward, and the episode index at
# which each value was recorded (both are filled in by the training loop).
rewards, episodes = [], []

## Creating a DQN Agent

Here we create a DQN Agent. This agent is defined in __agent.py__. The corresponding neural network is defined in __model.py__.

__Evaluation Reward__ : The average reward received in the past 100 episodes/games.

__Frame__ : Number of frames processed in total.

__Memory Size__ : The current size of the replay memory.

In [4]:
# Learner: built from the project's Agent class for `action_size` actions.
agent = Agent(action_size)

# Bounded window of the most recent episode scores; its mean is the
# "evaluation reward" reported during training.
evaluation_reward = deque(maxlen=evaluation_reward_length)

# Global counters, both starting at zero: total frames processed so far and
# the replay-memory size.
frame = memory_size = 0


### Main Training Loop

In [None]:
# Main training loop.
#
# Plays up to EPISODES games. Every step the chosen action is executed, the
# resulting transition is pushed into the agent's memory, and every
# `train_frame` frames the agent is trained and its target network updated.
# Training stops early once the stopping criterion near the bottom is met.
solved = False  # becomes True when the stopping criterion is reached

for e in range(EPISODES):
    done = False
    score = 0

    # Rolling frame buffer: slots 0-3 form the stack fed to the agent, slot 4
    # receives the newest frame before the buffer is shifted by one.
    history = np.zeros([5, 84, 84], dtype=np.uint8)
    step = 0
    state = env.reset()
    life = number_lives

    # Presumably seeds `history` from the first observation — see utils.py.
    get_init_state(history, state)

    while not done:
        step += 1
        frame += 1
        if render_breakout:
            env.render()

        # Select and perform an action. The agent sees the four stacked
        # frames as float32 scaled from [0, 255] to [0, 1].
        action, value = agent.get_action(np.float32(history[:4, :, :]) / 255.)

        next_state, reward, done, info = env.step(action)

        frame_next_state = get_frame(next_state)
        history[4, :, :] = frame_next_state
        # Presumably flags a lost life by comparing the previous and current
        # life counts — confirm against check_live in utils.py.
        terminal_state = check_live(life, info['ale.lives'])

        life = info['ale.lives']
        # Clip the reward into [-1, 1].
        r = np.clip(reward, -1, 1)

        # Store the transition in memory.
        # NOTE(review): the meaning of the two trailing zeros is defined by
        # agent.memory.push and is not visible here.
        agent.memory.push(deepcopy(frame_next_state), action, r, terminal_state, value, 0, 0)

        # Every `train_frame` frames: train the policy network, then update
        # the target network.
        if frame % train_frame == 0:
            agent.train_policy_net(frame)
            agent.update_target_net()

        score += r
        # Shift the buffer: drop the oldest frame so slots 0-3 hold the
        # latest four frames for the next action selection.
        history[:4, :, :] = history[1:, :, :]

        # Every 50000 frames: log the time and update the learning curve.
        if frame % 50000 == 0:
            print('now time : ', datetime.now())
            rewards.append(np.mean(evaluation_reward))
            episodes.append(e)
            pylab.plot(episodes, rewards, 'b')
            pylab.savefig("./save_graph/breakout_dqn.png")

        if done:
            evaluation_reward.append(score)
            # At the end of every episode, print its statistics.
            print("episode:", e, "  score:", score, "  memory length:",
                  len(agent.memory), "  epsilon:", agent.epsilon, "   steps:", step,
                  "    evaluation reward:", np.mean(evaluation_reward))

            # Stop training once the mean evaluation reward exceeds 40 and
            # more than 5 episode scores have been recorded.
            if np.mean(evaluation_reward) > 40 and len(evaluation_reward) > 5:
                torch.save(agent.policy_net, "./save_model/breakout_dqn")
                # Leave the loops cleanly instead of calling sys.exit(),
                # which raises SystemExit and is reported as a cell error in
                # a notebook. `done` is already True here, so the inner loop
                # ends on its own.
                solved = True

    if solved:
        break

  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "
  probs = F.softmax(x[:,:self.action_size] - torch.max(x[:,:self.action_size],0)[0])


episode: 0   score: 6.0   memory length: 519   epsilon: 1.0    steps: 519     evaluation reward: 6.0
episode: 1   score: 6.0   memory length: 1050   epsilon: 1.0    steps: 531     evaluation reward: 6.0
episode: 2   score: 6.0   memory length: 1725   epsilon: 1.0    steps: 675     evaluation reward: 6.0
episode: 3   score: 9.0   memory length: 2407   epsilon: 1.0    steps: 682     evaluation reward: 6.75
episode: 4   score: 9.0   memory length: 2954   epsilon: 1.0    steps: 547     evaluation reward: 7.2
episode: 5   score: 7.0   memory length: 3839   epsilon: 1.0    steps: 885     evaluation reward: 7.166666666666667
episode: 6   score: 18.0   memory length: 4857   epsilon: 1.0    steps: 1018     evaluation reward: 8.714285714285714
episode: 7   score: 6.0   memory length: 5494   epsilon: 1.0    steps: 637     evaluation reward: 8.375
episode: 8   score: 13.0   memory length: 6325   epsilon: 1.0    steps: 831     evaluation reward: 8.88888888888889
episode: 9   score: 5.0   memory len

  pol_loss += pol_avg.detach().cpu()[0]
  vf_loss += value_loss.detach().cpu()[0]
  ent_total += ent.detach().cpu()[0]


Policy loss: -0.004894. Value loss: 0.181557. Entropy: 1.781206.
Iteration 2
Policy loss: -0.012812. Value loss: 0.137354. Entropy: 1.778384.
Iteration 3
Policy loss: -0.020791. Value loss: 0.104406. Entropy: 1.776633.
Iteration 4
Policy loss: -0.025393. Value loss: 0.081929. Entropy: 1.774028.
Iteration 5
Policy loss: -0.026135. Value loss: 0.064560. Entropy: 1.774901.
episode: 13   score: 14.0   memory length: 10826   epsilon: 1.0    steps: 1610     evaluation reward: 9.5
episode: 14   score: 6.0   memory length: 11573   epsilon: 1.0    steps: 747     evaluation reward: 9.266666666666667
episode: 15   score: 10.0   memory length: 12198   epsilon: 1.0    steps: 625     evaluation reward: 9.3125
episode: 16   score: 13.0   memory length: 13031   epsilon: 1.0    steps: 833     evaluation reward: 9.529411764705882
episode: 17   score: 9.0   memory length: 13709   epsilon: 1.0    steps: 678     evaluation reward: 9.5
episode: 18   score: 7.0   memory length: 14101   epsilon: 1.0    steps:

episode: 71   score: 11.0   memory length: 49070   epsilon: 1.0    steps: 805     evaluation reward: 8.902777777777779
episode: 72   score: 7.0   memory length: 49593   epsilon: 1.0    steps: 523     evaluation reward: 8.876712328767123
episode: 73   score: 6.0   memory length: 49987   epsilon: 1.0    steps: 394     evaluation reward: 8.837837837837839
now time :  2018-12-17 23:52:01.998568
episode: 74   score: 12.0   memory length: 50640   epsilon: 1.0    steps: 653     evaluation reward: 8.88
Training network
Iteration 1
Policy loss: -0.006815. Value loss: 0.068802. Entropy: 1.635168.
Iteration 2
Policy loss: -0.014771. Value loss: 0.058445. Entropy: 1.629181.
Iteration 3
Policy loss: -0.021800. Value loss: 0.053694. Entropy: 1.622944.
Iteration 4
Policy loss: -0.026540. Value loss: 0.049355. Entropy: 1.622539.
Iteration 5
Policy loss: -0.031320. Value loss: 0.047998. Entropy: 1.619112.
episode: 75   score: 8.0   memory length: 51373   epsilon: 1.0    steps: 733     evaluation reward

episode: 130   score: 6.0   memory length: 90184   epsilon: 1.0    steps: 514     evaluation reward: 9.57
episode: 131   score: 15.0   memory length: 90986   epsilon: 1.0    steps: 802     evaluation reward: 9.62
episode: 132   score: 7.0   memory length: 91722   epsilon: 1.0    steps: 736     evaluation reward: 9.59
Training network
Iteration 1
Policy loss: -0.006453. Value loss: 0.059832. Entropy: 1.265498.
Iteration 2
Policy loss: -0.023314. Value loss: 0.053430. Entropy: 1.253711.
Iteration 3
Policy loss: -0.032596. Value loss: 0.050855. Entropy: 1.243698.
Iteration 4
Policy loss: -0.040911. Value loss: 0.047739. Entropy: 1.237917.
Iteration 5
Policy loss: -0.043652. Value loss: 0.045978. Entropy: 1.234080.
episode: 133   score: 13.0   memory length: 92746   epsilon: 1.0    steps: 1024     evaluation reward: 9.7
episode: 134   score: 7.0   memory length: 93135   epsilon: 1.0    steps: 389     evaluation reward: 9.7
episode: 135   score: 7.0   memory length: 93810   epsilon: 1.0    

Iteration 1
Policy loss: 0.077270. Value loss: 0.061027. Entropy: 0.731165.
Iteration 2
Policy loss: 0.039028. Value loss: 0.055285. Entropy: 0.736255.
Iteration 3
Policy loss: 0.020652. Value loss: 0.054018. Entropy: 0.731674.
Iteration 4
Policy loss: 0.014550. Value loss: 0.052156. Entropy: 0.728283.
Iteration 5
Policy loss: 0.018015. Value loss: 0.051309. Entropy: 0.716410.
episode: 192   score: 7.0   memory length: 133510   epsilon: 1.0    steps: 407     evaluation reward: 9.83
episode: 193   score: 17.0   memory length: 134883   epsilon: 1.0    steps: 1373     evaluation reward: 9.92
episode: 194   score: 9.0   memory length: 135519   epsilon: 1.0    steps: 636     evaluation reward: 9.89
episode: 195   score: 8.0   memory length: 136316   epsilon: 1.0    steps: 797     evaluation reward: 9.89
episode: 196   score: 9.0   memory length: 136929   epsilon: 1.0    steps: 613     evaluation reward: 9.9
episode: 197   score: 9.0   memory length: 137556   epsilon: 1.0    steps: 627     e

episode: 250   score: 7.0   memory length: 174675   epsilon: 1.0    steps: 723     evaluation reward: 9.67
episode: 251   score: 12.0   memory length: 175408   epsilon: 1.0    steps: 733     evaluation reward: 9.68
episode: 252   score: 12.0   memory length: 176246   epsilon: 1.0    steps: 838     evaluation reward: 9.74
episode: 253   score: 4.0   memory length: 176960   epsilon: 1.0    steps: 714     evaluation reward: 9.75
episode: 254   score: 5.0   memory length: 177419   epsilon: 1.0    steps: 459     evaluation reward: 9.74
episode: 255   score: 9.0   memory length: 178199   epsilon: 1.0    steps: 780     evaluation reward: 9.78
episode: 256   score: 9.0   memory length: 178858   epsilon: 1.0    steps: 659     evaluation reward: 9.72
episode: 257   score: 6.0   memory length: 179653   epsilon: 1.0    steps: 795     evaluation reward: 9.72
episode: 258   score: 9.0   memory length: 180320   epsilon: 1.0    steps: 667     evaluation reward: 9.66
episode: 259   score: 7.0   memory 

In [None]:
# Save the agent's policy network (the whole module object, not just a
# state_dict) to disk. The ./save_model directory must already exist.
torch.save(agent.policy_net, "./save_model/breakout_dqn")