# Deep Q-Learning 

For this assignment we will implement the Deep Q-Learning algorithm with Experience Replay as described in the breakthrough paper __"Playing Atari with Deep Reinforcement Learning"__. We will train an agent to play the famous game of __Breakout__.

In [1]:
import sys
import gym
import torch
import pylab
import random
import numpy as np
from collections import deque
from datetime import datetime
from copy import deepcopy
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from utils import *
from agent import *
from model import *
from config import *
%matplotlib inline
%load_ext autoreload
%autoreload 2

## Understanding the environment

In the following cell, we initialise our game of __Breakout__ and you can see what the environment looks like. For further documentation of the environment refer to https://gym.openai.com/envs.

In [None]:
# Create the Atari environment through Gym's registry.
# NOTE(review): this instantiates Space Invaders, while the notebook text and
# the save paths used later refer to Breakout — confirm which game is intended.
env = gym.make('SpaceInvaders-v0')
# Rendering is left disabled here.
#env.render()

In [None]:
# Life count at the start of an episode; the training loop resets `life` to
# this value every episode. (Presumably find_max_lifes probes the env for its
# initial number of lives — verify against utils.py.)
number_lives = find_max_lifes(env)
# Shape of a raw observation as reported by the environment.
state_size = env.observation_space.shape
# Read the number of discrete actions from the environment instead of
# hard-coding it, so the agent stays correct if the environment id changes.
# For 'SpaceInvaders-v0' this evaluates to 6, the value used previously.
action_size = env.action_space.n
# Series for the learning curve: mean evaluation reward and the episode index
# at which each point was recorded.
rewards, episodes = [], []

## Creating a DQN Agent

Here we create a DQN Agent. This agent is defined in __agent.py__. The corresponding neural network is defined in __model.py__.

__Evaluation Reward__ : The average reward received in the past 100 episodes/games.

__Frame__ : Number of frames processed in total.

__Memory Size__ : The current size of the replay memory.

In [None]:
# Bounded window of the most recent episode scores; its mean is reported as
# the "evaluation reward" during training.
evaluation_reward = deque(maxlen=evaluation_reward_length)

# The learning agent, sized for the environment's action set.
agent = Agent(action_size)

# Global counters: total frames processed so far and replay-memory size.
frame = memory_size = 0


### Main Training Loop

In [None]:
# Main training loop: play up to EPISODES games, storing every transition in
# the agent's memory and training every `train_frame` frames.
#
# Early stopping is signalled through `stop_training` rather than sys.exit():
# inside a notebook, sys.exit() raises SystemExit, which shows up as an
# exception and aborts "Run All" instead of ending training cleanly.
stop_training = False
for e in range(EPISODES):
    done = False
    score = 0

    # Buffer of five 84x84 uint8 frames: slots 0-3 are what the agent sees,
    # slot 4 receives the newest frame before the window is shifted.
    history = np.zeros([5, 84, 84], dtype=np.uint8)
    step = 0
    state = env.reset()
    life = number_lives

    # Fill `history` from the first observation (helper defined in utils).
    get_init_state(history, state)

    while not done:
        step += 1
        frame += 1
        if render_breakout:
            env.render()

        # Select and perform an action from the four oldest slots of the
        # buffer, converted to float32 and scaled from [0, 255] to [0, 1].
        action, value = agent.get_action(np.float32(history[:4, :, :]) / 255.)

        next_state, reward, done, info = env.step(action)

        frame_next_state = get_frame(next_state)
        history[4, :, :] = frame_next_state
        # Terminal flag computed by check_live from the previous and the
        # current life counts.
        terminal_state = check_live(life, info['ale.lives'])

        life = info['ale.lives']
        # The raw reward is used as-is; clipping to [-1, 1] is disabled.
        r = reward

        # Store the transition in memory (a copy of the frame, so later
        # writes to the buffer cannot alias the stored data).
        agent.memory.push(deepcopy(frame_next_state), action, r, terminal_state, value, 0, 0)
        # Every `train_frame` frames: train the policy network, then sync
        # the target network.
        if frame % train_frame == 0:
            agent.train_policy_net(frame)
            agent.update_target_net()
        score += r
        # Slide the window one frame forward.
        history[:4, :, :] = history[1:, :, :]

        # Periodic progress report: timestamp plus an updated learning curve.
        if frame % 50000 == 0:
            print('now time : ', datetime.now())
            rewards.append(np.mean(evaluation_reward))
            episodes.append(e)
            pylab.plot(episodes, rewards, 'b')
            pylab.savefig("./save_graph/breakout_dqn.png")

        if done:
            evaluation_reward.append(score)
            # Per-episode summary line.
            print("episode:", e, "  score:", score, "  memory length:",
                  len(agent.memory), "  epsilon:", agent.epsilon, "   steps:", step,
                  "    evaluation reward:", np.mean(evaluation_reward))

            # Early stopping: once the mean score over the evaluation window
            # exceeds 40 and the window holds more than 350 scores, save the
            # policy network and end training.
            # NOTE(review): `evaluation_reward` is a deque capped at
            # `evaluation_reward_length`; if that cap is <= 350 this
            # condition can never be satisfied — confirm the thresholds.
            if np.mean(evaluation_reward) > 40 and len(evaluation_reward) > 350:
                torch.save(agent.policy_net, "./save_model/breakout_dqn")
                stop_training = True

    # `done` is True whenever the flag is set, so the inner loop has already
    # exited; leave the episode loop as well.
    if stop_training:
        break

  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "
  probs = F.softmax(x[:,:self.action_size] - torch.max(x[:,:self.action_size],0)[0])


episode: 0   score: 80.0   memory length: 503   epsilon: 1.0    steps: 503     evaluation reward: 80.0
episode: 1   score: 160.0   memory length: 1164   epsilon: 1.0    steps: 661     evaluation reward: 120.0
episode: 2   score: 40.0   memory length: 1591   epsilon: 1.0    steps: 427     evaluation reward: 93.33333333333333
episode: 3   score: 125.0   memory length: 2208   epsilon: 1.0    steps: 617     evaluation reward: 101.25
episode: 4   score: 100.0   memory length: 2595   epsilon: 1.0    steps: 387     evaluation reward: 101.0
episode: 5   score: 105.0   memory length: 3112   epsilon: 1.0    steps: 517     evaluation reward: 101.66666666666667
episode: 6   score: 155.0   memory length: 3792   epsilon: 1.0    steps: 680     evaluation reward: 109.28571428571429
episode: 7   score: 120.0   memory length: 4491   epsilon: 1.0    steps: 699     evaluation reward: 110.625
episode: 8   score: 105.0   memory length: 4967   epsilon: 1.0    steps: 476     evaluation reward: 110.0
episode: 

  pol_loss += pol_avg.detach().cpu()[0]
  vf_loss += value_loss.detach().cpu()[0]
  ent_total += ent.detach().cpu()[0]


Policy loss: 0.004986. Value loss: 9.844961. Entropy: 1.778321.
Iteration 2
Policy loss: 0.005097. Value loss: 7.800111. Entropy: 1.770913.
Iteration 3
Policy loss: -0.003542. Value loss: 6.126852. Entropy: 1.772685.
episode: 17   score: 105.0   memory length: 10240   epsilon: 1.0    steps: 594     evaluation reward: 135.27777777777777
episode: 18   score: 220.0   memory length: 10240   epsilon: 1.0    steps: 828     evaluation reward: 139.73684210526315
episode: 19   score: 180.0   memory length: 10240   epsilon: 1.0    steps: 808     evaluation reward: 141.75
episode: 20   score: 180.0   memory length: 10240   epsilon: 1.0    steps: 591     evaluation reward: 143.57142857142858
episode: 21   score: 170.0   memory length: 10240   epsilon: 1.0    steps: 1136     evaluation reward: 144.77272727272728
episode: 22   score: 225.0   memory length: 10240   epsilon: 1.0    steps: 1143     evaluation reward: 148.2608695652174
episode: 23   score: 45.0   memory length: 10240   epsilon: 1.0    s

episode: 76   score: 120.0   memory length: 10240   epsilon: 1.0    steps: 629     evaluation reward: 146.88311688311688
episode: 77   score: 150.0   memory length: 10240   epsilon: 1.0    steps: 634     evaluation reward: 146.92307692307693
episode: 78   score: 320.0   memory length: 10240   epsilon: 1.0    steps: 697     evaluation reward: 149.1139240506329
episode: 79   score: 25.0   memory length: 10240   epsilon: 1.0    steps: 401     evaluation reward: 147.5625
episode: 80   score: 120.0   memory length: 10240   epsilon: 1.0    steps: 640     evaluation reward: 147.22222222222223
episode: 81   score: 35.0   memory length: 10240   epsilon: 1.0    steps: 567     evaluation reward: 145.85365853658536
episode: 82   score: 40.0   memory length: 10240   epsilon: 1.0    steps: 349     evaluation reward: 144.57831325301206
episode: 83   score: 305.0   memory length: 10240   epsilon: 1.0    steps: 974     evaluation reward: 146.48809523809524
episode: 84   score: 105.0   memory length: 10

episode: 140   score: 35.0   memory length: 10240   epsilon: 1.0    steps: 373     evaluation reward: 143.85
episode: 141   score: 105.0   memory length: 10240   epsilon: 1.0    steps: 454     evaluation reward: 143.7
episode: 142   score: 255.0   memory length: 10240   epsilon: 1.0    steps: 967     evaluation reward: 145.2
episode: 143   score: 180.0   memory length: 10240   epsilon: 1.0    steps: 782     evaluation reward: 146.65
episode: 144   score: 120.0   memory length: 10240   epsilon: 1.0    steps: 723     evaluation reward: 146.8
episode: 145   score: 15.0   memory length: 10240   epsilon: 1.0    steps: 575     evaluation reward: 144.8
episode: 146   score: 115.0   memory length: 10240   epsilon: 1.0    steps: 739     evaluation reward: 144.9
episode: 147   score: 60.0   memory length: 10240   epsilon: 1.0    steps: 611     evaluation reward: 144.05
now time :  2018-12-18 18:53:08.529188
episode: 148   score: 150.0   memory length: 10240   epsilon: 1.0    steps: 633     evalu

episode: 206   score: 80.0   memory length: 10240   epsilon: 1.0    steps: 507     evaluation reward: 159.8
Training network
Iteration 1
Policy loss: -0.010070. Value loss: 7.779859. Entropy: 1.660876.
Iteration 2
Policy loss: -0.024947. Value loss: 5.270581. Entropy: 1.650039.
Iteration 3
Policy loss: -0.031708. Value loss: 4.283082. Entropy: 1.642962.
episode: 207   score: 225.0   memory length: 10240   epsilon: 1.0    steps: 908     evaluation reward: 160.95
episode: 208   score: 210.0   memory length: 10240   epsilon: 1.0    steps: 813     evaluation reward: 162.5
episode: 209   score: 240.0   memory length: 10240   epsilon: 1.0    steps: 925     evaluation reward: 163.55
episode: 210   score: 35.0   memory length: 10240   epsilon: 1.0    steps: 638     evaluation reward: 161.5
episode: 211   score: 115.0   memory length: 10240   epsilon: 1.0    steps: 727     evaluation reward: 161.45
episode: 212   score: 45.0   memory length: 10240   epsilon: 1.0    steps: 623     evaluation rew

In [None]:
# Persist the trained policy network. This saves the `agent.policy_net`
# object itself rather than a state_dict.
# NOTE(review): assumes the ./save_model directory already exists — confirm.
torch.save(agent.policy_net, "./save_model/breakout_dqn")