# Deep Q-Learning 

For this assignment we will implement the Deep Q-Learning algorithm with Experience Replay, as described in the breakthrough paper __"Playing Atari with Deep Reinforcement Learning"__. We will train an agent to play the famous game of __Breakout__.

In [1]:
import sys
import gym
import torch
import pylab
import random
import numpy as np
from collections import deque
from datetime import datetime
from copy import deepcopy
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from utils import *
from agent import *
from model import *
from config import *
%matplotlib inline
%load_ext autoreload
%autoreload 2

## Understanding the environment

In the following cell, we initialise our game of __Breakout__ so you can see what the environment looks like. For further documentation of the environment, refer to https://gym.openai.com/envs.

In [2]:
# Create the Gym environment the agent is trained on.
# NOTE(review): the surrounding text talks about Breakout, but the id below
# selects Space Invaders -- confirm which game is intended.
env = gym.make('SpaceInvadersDeterministic-v4')
# Uncomment the next line to render the environment once right after creation.
#env.render()

In [3]:
# Life count used as the starting value of `life` at the beginning of every
# episode in the training loop (helper presumably comes from utils -- verify).
number_lives = find_max_lifes(env)
# Shape of a raw observation as reported by the environment.
state_size = env.observation_space.shape
# Number of discrete actions. Read from the environment instead of being
# hard-coded, so it stays correct if the environment id above is changed
# (it evaluates to 6 for SpaceInvadersDeterministic-v4).
action_size = int(env.action_space.n)
# Series plotted during training: mean evaluation reward and the episode
# index at which each value was recorded.
rewards, episodes = [], []

## Creating a DQN Agent

Here we create a DQN Agent. This agent is defined in __agent.py__. The corresponding neural network is defined in __model.py__.

__Evaluation Reward__ : The average reward received in the past 100 episodes/games.

__Frame__ : Number of frames processed in total.

__Memory Size__ : The current size of the replay memory.

In [None]:
# Bounded window of the most recent episode scores; its mean is reported as
# the "evaluation reward" during training.
evaluation_reward = deque(maxlen=evaluation_reward_length)
# Running totals: frames processed so far and replay-memory size.
frame, memory_size = 0, 0
# The learning agent, built for the number of actions chosen above.
agent = Agent(action_size)


### Main Training Loop

In [None]:
# Main training loop: play EPISODES episodes, store every transition in the
# agent's memory and train periodically.
# NOTE(review): EPISODES, render_breakout, train_frame, get_init_state,
# get_frame and check_live are not defined in this notebook; they presumably
# come from the star-imports of config / utils -- confirm.
for e in range(EPISODES):
    done = False
    score = 0

    # Rolling buffer of five 84x84 uint8 frames: slots 0-3 form the stack that
    # is fed to the agent, slot 4 receives the newest frame before the window
    # is shifted by one.
    history = np.zeros([5, 84, 84], dtype=np.uint8)
    step = 0
    state = env.reset()
    life = number_lives

    get_init_state(history, state)

    while not done:
        step += 1
        frame += 1
        if render_breakout:
            env.render()

        # Select and perform an action on the current 4-frame stack,
        # converted to float32 and scaled from [0, 255] to [0, 1].
        action, value = agent.get_action(np.float32(history[:4, :, :]) / 255.)

        next_state, reward, done, info = env.step(action)

        frame_next_state = get_frame(next_state)
        history[4, :, :] = frame_next_state
        # Flag computed from the previous and the current life counters.
        terminal_state = check_live(life, info['ale.lives'])
        life = info['ale.lives']

        # Store the transition in memory (deepcopy so the stored frame does
        # not alias an array that is reused later).
        agent.memory.push(deepcopy(frame_next_state), action, reward, terminal_state, value, 0, 0)

        # Train once every `train_frame` frames, then refresh the target network.
        if frame % train_frame == 0:
            agent.train_policy_net(frame)
            agent.update_target_net()

        score += reward
        # Slide the window: drop the oldest frame so slots 0-3 hold the next state.
        history[:4, :, :] = history[1:, :, :]

        # Periodic progress plot of the mean evaluation reward.
        if frame % 50000 == 0:
            print('now time : ', datetime.now())
            rewards.append(np.mean(evaluation_reward))
            episodes.append(e)
            pylab.plot(episodes, rewards, 'b')
            pylab.savefig("./save_graph/breakout_dqn.png")

    # The while loop above only exits once `done` is True, so everything below
    # is the end-of-episode bookkeeping. Keeping it at this level lets a plain
    # `break` stop training instead of raising SystemExit inside the notebook.
    evaluation_reward.append(score)
    mean_reward = np.mean(evaluation_reward)
    print("episode:", e, "  score:", score, "  memory length:",
          len(agent.memory), "  epsilon:", agent.epsilon, "   steps:", step,
          "    evaluation reward:", mean_reward)

    # Stop training once more than 5 episodes have been recorded and the mean
    # evaluation reward exceeds 400; save the policy network first.
    if mean_reward > 400 and len(evaluation_reward) > 5:
        torch.save(agent.policy_net, "./save_model/breakout_dqn")
        break

  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "
  probs = F.softmax(x[:,:self.action_size])


episode: 0   score: 120.0   memory length: 566   epsilon: 1.0    steps: 566     evaluation reward: 120.0
episode: 1   score: 75.0   memory length: 984   epsilon: 1.0    steps: 418     evaluation reward: 97.5
episode: 2   score: 135.0   memory length: 1763   epsilon: 1.0    steps: 779     evaluation reward: 110.0
episode: 3   score: 130.0   memory length: 2370   epsilon: 1.0    steps: 607     evaluation reward: 115.0
episode: 4   score: 85.0   memory length: 3049   epsilon: 1.0    steps: 679     evaluation reward: 109.0
episode: 5   score: 190.0   memory length: 3970   epsilon: 1.0    steps: 921     evaluation reward: 122.5
episode: 6   score: 65.0   memory length: 4547   epsilon: 1.0    steps: 577     evaluation reward: 114.28571428571429
episode: 7   score: 125.0   memory length: 5202   epsilon: 1.0    steps: 655     evaluation reward: 115.625
episode: 8   score: 90.0   memory length: 5713   epsilon: 1.0    steps: 511     evaluation reward: 112.77777777777777
episode: 9   score: 155.0

  pol_loss += pol_avg.detach().cpu()[0]
  vf_loss += value_loss.detach().cpu()[0]


Policy loss: -0.781280. Value loss: 0.283367.
Iteration 2
Policy loss: -0.793153. Value loss: 0.147693.
Iteration 3
Policy loss: -0.792216. Value loss: 0.103779.
episode: 14   score: 100.0   memory length: 10510   epsilon: 1.0    steps: 487     evaluation reward: 132.33333333333334
episode: 15   score: 225.0   memory length: 11485   epsilon: 1.0    steps: 975     evaluation reward: 138.125
episode: 16   score: 110.0   memory length: 12238   epsilon: 1.0    steps: 753     evaluation reward: 136.47058823529412
episode: 17   score: 70.0   memory length: 12772   epsilon: 1.0    steps: 534     evaluation reward: 132.77777777777777
episode: 18   score: 120.0   memory length: 13507   epsilon: 1.0    steps: 735     evaluation reward: 132.10526315789474
episode: 19   score: 160.0   memory length: 14329   epsilon: 1.0    steps: 822     evaluation reward: 133.5
episode: 20   score: 110.0   memory length: 14854   epsilon: 1.0    steps: 525     evaluation reward: 132.38095238095238
episode: 21   sc

episode: 75   score: 130.0   memory length: 53328   epsilon: 1.0    steps: 607     evaluation reward: 141.18421052631578
episode: 76   score: 290.0   memory length: 54293   epsilon: 1.0    steps: 965     evaluation reward: 143.11688311688312
episode: 77   score: 315.0   memory length: 55103   epsilon: 1.0    steps: 810     evaluation reward: 145.32051282051282
episode: 78   score: 130.0   memory length: 55943   epsilon: 1.0    steps: 840     evaluation reward: 145.126582278481
episode: 79   score: 135.0   memory length: 56818   epsilon: 1.0    steps: 875     evaluation reward: 145.0
episode: 80   score: 10.0   memory length: 57275   epsilon: 1.0    steps: 457     evaluation reward: 143.33333333333334
episode: 81   score: 55.0   memory length: 57862   epsilon: 1.0    steps: 587     evaluation reward: 142.2560975609756
episode: 82   score: 235.0   memory length: 58947   epsilon: 1.0    steps: 1085     evaluation reward: 143.3734939759036
episode: 83   score: 325.0   memory length: 59805 

In [None]:
# Save the agent's policy network object to disk so it can be reloaded later.
# NOTE(review): assumes the ./save_model directory already exists -- confirm.
torch.save(agent.policy_net, "./save_model/breakout_dqn")