# Deep Q-Learning 

For this assignment we will implement the Deep Q-Learning algorithm with Experience Replay as described in breakthrough paper __"Playing Atari with Deep Reinforcement Learning"__. We will train an agent to play the famous game of __Breakout__.

In [1]:
import sys
import gym
import torch
import pylab
import random
import numpy as np
import time
from collections import deque
from datetime import datetime
from copy import deepcopy
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from utils import *
from agent import *
from model import *
from config import *
from env import GameEnv
%matplotlib inline
%load_ext autoreload
%autoreload 2

## Understanding the environment

In the following cell, we initialise our game of __Breakout__ and you can see how the environment looks like. For further documentation of the of the environment refer to https://gym.openai.com/envs. 

In [2]:
envs = []
for i in range(num_envs):
    envs.append(GameEnv('SpaceInvadersDeterministic-v4'))
#env.render()

  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "


In [3]:
number_lives = envs[0].life
state_size = envs[0].observation_space.shape
action_size = envs[0].action_space.n
rewards, episodes = [], []

## Creating a DQN Agent

Here we create a DQN Agent. This agent is defined in the __agent.py__. The corresponding neural network is defined in the __model.py__. 

__Evaluation Reward__ : The average reward received in the past 100 episodes/games.

__Frame__ : Number of frames processed in total.

__Memory Size__ : The current size of the replay memory.

In [None]:
agent = Agent(action_size)
evaluation_reward = deque(maxlen=evaluation_reward_length)
frame = 0
memory_size = 0


### Main Training Loop

In [None]:
vis_env_idx = 0
vis_env = envs[vis_env_idx]
e = 0
frame = 0
while (e < EPISODES):
    step = 0
    assert(num_envs * env_mem_size == train_frame)
    frame_next_vals = []
    for i in range(num_envs):
        env = envs[i]
        #history = env.history
        #life = env.life
        #state, reward, done, info = [env.state, env.reward, env.done, env.info]
        for j in range(env_mem_size):
            step += 1
            frame += 1
            
            curr_state = env.history[HISTORY_SIZE-1,:,:]
            action, value = agent.get_action(np.float32(env.history[:HISTORY_SIZE,:,:]) / 255.)
            
            next_state, env.reward, env.done, env.info = env.step(action)
            
            if (i == vis_env_idx):
                vis_env._env.render()
            
            frame_next_state = get_frame(next_state)
            env.history[HISTORY_SIZE,:,:] = frame_next_state
            terminal_state = check_live(env.life, env.info['ale.lives'])
            
            env.life = env.info['ale.lives']
            r = env.reward
            
            agent.memory.push(i, deepcopy(curr_state), action, r, terminal_state, value, 0, 0)
            if (j == env_mem_size-1):
                _, frame_next_val = agent.get_action(np.float32(env.history[1:,:,:]) / 255.)
                frame_next_vals.append(frame_next_val)
            env.score += r
            env.history[:HISTORY_SIZE, :, :] = env.history[1:,:,:]
            
            if (env.done):
                if (e % 50 == 0):
                    print('now time : ', datetime.now())
                    rewards.append(np.mean(evaluation_reward))
                    episodes.append(e)
                    pylab.plot(episodes, rewards, 'b')
                    pylab.savefig("./save_graph/spaceinvaders_ppo.png")
                    torch.save(agent.policy_net, "./save_model/spaceinvaders_ppo")
                e += 1
                print("episode:", e, "  score:", env.score,  " epsilon:", agent.epsilon, "   steps:", step,
                  " evaluation reward:", np.mean(evaluation_reward))
                env.done = False
                evaluation_reward.append(env.score)
                env.score = 0
                env.history = np.zeros([HISTORY_SIZE+1,84,84], dtype=np.uint8)
                env.state = env.reset()
                env.life = number_lives
                get_init_state(env.history, env.state)
                
                
                
    agent.train_policy_net(frame, frame_next_vals)
    agent.update_target_net()

'''
for e in range(EPISODES):
    done = False
    score = 0

    history = np.zeros([5, 84, 84], dtype=np.uint8)
    step = 0
    d = False
    state = env.reset()
    life = number_lives

    get_init_state(history, state)

    while not done:
        step += 1
        frame += 1
        if render_breakout:
            env.render()

        # Select and perform an action
        curr_state = history[3,:,:]
        action, value = agent.get_action(np.float32(history[:4, :, :]) / 255.)

        
        next_state, reward, done, info = env.step(action)

        frame_next_state = get_frame(next_state)
        history[4, :, :] = frame_next_state
        terminal_state = check_live(life, info['ale.lives'])

        life = info['ale.lives']
        #r = np.clip(reward, -1, 1)
        r = reward
        """
        if terminal_state:
            r -= 20
        """
        # Store the transition in memory 
        
        agent.memory.push(deepcopy(curr_state), action, r, terminal_state, value, 0, 0)
        # Start training after random sample generation
        if(frame % train_frame == 0):
            _, frame_next_val = agent.get_action(np.float32(history[1:, :, :]) / 255.)
            agent.train_policy_net(frame, frame_next_val)
            # Update the target network
            agent.update_target_net()
        score += r
        history[:4, :, :] = history[1:, :, :]

        if frame % 50000 == 0:
            print('now time : ', datetime.now())
            rewards.append(np.mean(evaluation_reward))
            episodes.append(e)
            pylab.plot(episodes, rewards, 'b')
            pylab.savefig("./save_graph/spaceinvaders_ppo.png")
            torch.save(agent.policy_net, "./save_model/spaceinvaders_ppo")

        if done:
            evaluation_reward.append(score)
            # every episode, plot the play time
            print("episode:", e, "  score:", score, "  memory length:",
                  len(agent.memory), "  epsilon:", agent.epsilon, "   steps:", step,
                  "    evaluation reward:", np.mean(evaluation_reward))

            # if the mean of scores of last 10 episode is bigger than 400
            # stop training
            if np.mean(evaluation_reward) > 700 and len(evaluation_reward) > 40:
                torch.save(agent.policy_net, "./save_model/spaceinvaders_ppo")
                sys.exit()
'''

  probs = F.softmax(x[:,:self.action_size] - torch.max(x[:,:self.action_size],1)[0].unsqueeze(1))
  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "


Training network. lr: 0.000250. clip: 0.100000


  pol_loss += pol_avg.detach().cpu()[0]
  vf_loss += value_loss.detach().cpu()[0]
  ent_total += ent.detach().cpu()[0]


Iteration 1: Policy loss: -0.933571. Value loss: 8.549062. Entropy: 1.784475.
Iteration 2: Policy loss: -0.937304. Value loss: 7.421401. Entropy: 1.785210.
Iteration 3: Policy loss: -0.880273. Value loss: 7.842966. Entropy: 1.786776.
Training network. lr: 0.000250. clip: 0.100000
Iteration 4: Policy loss: -6.228714. Value loss: 57.180992. Entropy: 1.768686.
Iteration 5: Policy loss: -6.499210. Value loss: 41.566692. Entropy: 1.783181.
Iteration 6: Policy loss: -6.286735. Value loss: 38.287395. Entropy: 1.783888.
Training network. lr: 0.000250. clip: 0.100000
Iteration 7: Policy loss: -4.955023. Value loss: 394.677856. Entropy: 1.747445.
Iteration 8: Policy loss: -4.575887. Value loss: 362.984131. Entropy: 1.758649.
Iteration 9: Policy loss: -3.514594. Value loss: 238.865570. Entropy: 1.752203.
Training network. lr: 0.000250. clip: 0.100000
Iteration 10: Policy loss: -0.103457. Value loss: 48.649307. Entropy: 1.751076.
Iteration 11: Policy loss: -0.223800. Value loss: 40.866898. Entropy

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


episode: 2   score: 180.0  epsilon: 1.0    steps: 392  evaluation reward: 65.0
episode: 3   score: 85.0  epsilon: 1.0    steps: 708  evaluation reward: 122.5
episode: 4   score: 140.0  epsilon: 1.0    steps: 872  evaluation reward: 110.0
episode: 5   score: 210.0  epsilon: 1.0    steps: 1020  evaluation reward: 117.5
Training network. lr: 0.000250. clip: 0.100000
Iteration 16: Policy loss: -0.483179. Value loss: 36.736061. Entropy: 1.774945.
Iteration 17: Policy loss: -0.488345. Value loss: 32.372993. Entropy: 1.774215.
Iteration 18: Policy loss: -0.526784. Value loss: 33.224861. Entropy: 1.771197.
episode: 6   score: 135.0  epsilon: 1.0    steps: 14  evaluation reward: 136.0
episode: 7   score: 185.0  epsilon: 1.0    steps: 599  evaluation reward: 135.83333333333334
Training network. lr: 0.000250. clip: 0.100000
Iteration 19: Policy loss: -0.984435. Value loss: 22.549641. Entropy: 1.762368.
Iteration 20: Policy loss: -0.683435. Value loss: 18.380424. Entropy: 1.760056.
Iteration 21: P

episode: 31   score: 135.0  epsilon: 1.0    steps: 341  evaluation reward: 165.83333333333334
episode: 32   score: 225.0  epsilon: 1.0    steps: 682  evaluation reward: 164.83870967741936
Training network. lr: 0.000250. clip: 0.099853
Iteration 76: Policy loss: 2.063422. Value loss: 59.601566. Entropy: 1.223090.
Iteration 77: Policy loss: 2.214382. Value loss: 43.739197. Entropy: 1.316381.
Iteration 78: Policy loss: 1.914910. Value loss: 33.723267. Entropy: 1.338841.
episode: 33   score: 125.0  epsilon: 1.0    steps: 398  evaluation reward: 166.71875
episode: 34   score: 500.0  epsilon: 1.0    steps: 579  evaluation reward: 165.45454545454547
Training network. lr: 0.000250. clip: 0.099853
Iteration 79: Policy loss: -0.890890. Value loss: 42.453869. Entropy: 1.343741.
Iteration 80: Policy loss: -0.634966. Value loss: 35.943394. Entropy: 1.341413.
Iteration 81: Policy loss: -0.839585. Value loss: 30.101677. Entropy: 1.356119.
Training network. lr: 0.000250. clip: 0.099853
Iteration 82: P

Iteration 136: Policy loss: 0.279774. Value loss: 57.736980. Entropy: 1.180546.
Iteration 137: Policy loss: 0.093887. Value loss: 44.966930. Entropy: 1.146017.
Iteration 138: Policy loss: 0.452482. Value loss: 36.964764. Entropy: 1.147530.
episode: 57   score: 165.0  epsilon: 1.0    steps: 538  evaluation reward: 219.10714285714286
Training network. lr: 0.000249. clip: 0.099696
Iteration 139: Policy loss: -0.680494. Value loss: 89.562531. Entropy: 0.845747.
Iteration 140: Policy loss: -0.973763. Value loss: 65.459816. Entropy: 0.879256.
Iteration 141: Policy loss: -1.026413. Value loss: 51.685360. Entropy: 0.843506.
Training network. lr: 0.000249. clip: 0.099696
Iteration 142: Policy loss: 8.764532. Value loss: 116.229263. Entropy: 0.902161.
Iteration 143: Policy loss: 8.085527. Value loss: 84.246696. Entropy: 0.814995.
Iteration 144: Policy loss: 8.233542. Value loss: 69.685928. Entropy: 0.877447.
Training network. lr: 0.000249. clip: 0.099696
Iteration 145: Policy loss: 4.298084. Val

Iteration 199: Policy loss: -0.208454. Value loss: 16.540390. Entropy: 1.143352.
Iteration 200: Policy loss: -0.234761. Value loss: 15.044557. Entropy: 1.144075.
Iteration 201: Policy loss: -0.198929. Value loss: 12.358613. Entropy: 1.126790.
episode: 81   score: 300.0  epsilon: 1.0    steps: 384  evaluation reward: 223.9375
episode: 82   score: 75.0  epsilon: 1.0    steps: 800  evaluation reward: 224.87654320987653
episode: 83   score: 180.0  epsilon: 1.0    steps: 915  evaluation reward: 223.0487804878049
Training network. lr: 0.000248. clip: 0.099392
Iteration 202: Policy loss: 0.381033. Value loss: 16.715450. Entropy: 1.271847.
Iteration 203: Policy loss: 0.302751. Value loss: 14.012074. Entropy: 1.257766.
Iteration 204: Policy loss: 0.414481. Value loss: 12.156748. Entropy: 1.271204.
episode: 84   score: 185.0  epsilon: 1.0    steps: 504  evaluation reward: 222.53012048192772
Training network. lr: 0.000248. clip: 0.099392
Iteration 205: Policy loss: 0.082901. Value loss: 17.175900

Iteration 259: Policy loss: -0.268778. Value loss: 21.454350. Entropy: 0.935555.
Iteration 260: Policy loss: -0.621317. Value loss: 18.199528. Entropy: 0.919267.
Iteration 261: Policy loss: -0.388750. Value loss: 18.179459. Entropy: 0.929936.
Training network. lr: 0.000248. clip: 0.099235
Iteration 262: Policy loss: -0.230138. Value loss: 11.210797. Entropy: 0.955687.
Iteration 263: Policy loss: -0.081947. Value loss: 10.507551. Entropy: 0.969486.
Iteration 264: Policy loss: -0.118327. Value loss: 8.451283. Entropy: 0.987419.
episode: 108   score: 180.0  epsilon: 1.0    steps: 207  evaluation reward: 219.0
episode: 109   score: 180.0  epsilon: 1.0    steps: 407  evaluation reward: 220.0
Training network. lr: 0.000248. clip: 0.099235
Iteration 265: Policy loss: 0.104848. Value loss: 6.911222. Entropy: 1.076585.
Iteration 266: Policy loss: 0.060653. Value loss: 5.315980. Entropy: 1.106098.
Iteration 267: Policy loss: 0.222223. Value loss: 6.139189. Entropy: 1.071675.
Training network. lr

Iteration 323: Policy loss: 1.591235. Value loss: 10.879167. Entropy: 0.822671.
Iteration 324: Policy loss: 1.251508. Value loss: 11.841836. Entropy: 0.844086.
Training network. lr: 0.000248. clip: 0.099088
Iteration 325: Policy loss: 0.307136. Value loss: 5.592653. Entropy: 0.941788.
Iteration 326: Policy loss: 0.240742. Value loss: 4.978774. Entropy: 0.968712.
Iteration 327: Policy loss: 0.284613. Value loss: 4.001860. Entropy: 1.012941.
episode: 134   score: 180.0  epsilon: 1.0    steps: 342  evaluation reward: 222.2
episode: 135   score: 155.0  epsilon: 1.0    steps: 586  evaluation reward: 219.0
episode: 136   score: 180.0  epsilon: 1.0    steps: 740  evaluation reward: 219.5
Training network. lr: 0.000248. clip: 0.099088
Iteration 328: Policy loss: 0.714695. Value loss: 13.052287. Entropy: 0.942582.
Iteration 329: Policy loss: 0.841820. Value loss: 11.983703. Entropy: 0.951411.
Iteration 330: Policy loss: 0.921293. Value loss: 10.564483. Entropy: 0.892324.
episode: 137   score: 1

Iteration 386: Policy loss: -0.330903. Value loss: 5.959739. Entropy: 0.835224.
Iteration 387: Policy loss: -0.222774. Value loss: 5.280733. Entropy: 0.868079.
episode: 160   score: 210.0  epsilon: 1.0    steps: 93  evaluation reward: 191.45
episode: 161   score: 180.0  epsilon: 1.0    steps: 544  evaluation reward: 190.95
episode: 162   score: 210.0  epsilon: 1.0    steps: 995  evaluation reward: 190.95
Training network. lr: 0.000247. clip: 0.098931
Iteration 388: Policy loss: -0.425623. Value loss: 9.147566. Entropy: 1.102752.
Iteration 389: Policy loss: -0.465475. Value loss: 7.375258. Entropy: 1.108688.
Iteration 390: Policy loss: -0.470706. Value loss: 6.205242. Entropy: 1.102273.
Training network. lr: 0.000247. clip: 0.098931
Iteration 391: Policy loss: -0.884775. Value loss: 10.732188. Entropy: 0.927561.
Iteration 392: Policy loss: -0.904393. Value loss: 9.218196. Entropy: 0.881861.
Iteration 393: Policy loss: -0.982777. Value loss: 10.154572. Entropy: 0.902705.
episode: 163   s

Iteration 448: Policy loss: 0.584292. Value loss: 12.220794. Entropy: 1.273147.
Iteration 449: Policy loss: 0.480980. Value loss: 7.398629. Entropy: 1.290711.
Iteration 450: Policy loss: 0.641890. Value loss: 7.367024. Entropy: 1.273775.
Training network. lr: 0.000247. clip: 0.098627
Iteration 451: Policy loss: -0.172685. Value loss: 8.852274. Entropy: 1.124786.
Iteration 452: Policy loss: -0.155134. Value loss: 6.475684. Entropy: 1.133280.
Iteration 453: Policy loss: -0.154775. Value loss: 5.889314. Entropy: 1.152002.
episode: 188   score: 180.0  epsilon: 1.0    steps: 145  evaluation reward: 180.7
episode: 189   score: 180.0  epsilon: 1.0    steps: 481  evaluation reward: 180.7
Training network. lr: 0.000247. clip: 0.098627
Iteration 454: Policy loss: 0.121574. Value loss: 8.582655. Entropy: 1.126641.
Iteration 455: Policy loss: 0.187911. Value loss: 7.360710. Entropy: 1.119944.
Iteration 456: Policy loss: 0.334509. Value loss: 6.597432. Entropy: 1.114893.
episode: 190   score: 180.0

Iteration 512: Policy loss: 4.647836. Value loss: 46.026466. Entropy: 0.903280.
Iteration 513: Policy loss: 4.651721. Value loss: 37.197052. Entropy: 0.943941.
episode: 213   score: 210.0  epsilon: 1.0    steps: 244  evaluation reward: 186.25
episode: 214   score: 410.0  epsilon: 1.0    steps: 328  evaluation reward: 186.55
Training network. lr: 0.000246. clip: 0.098470
Iteration 514: Policy loss: -1.138902. Value loss: 24.859459. Entropy: 0.949088.
Iteration 515: Policy loss: -1.157449. Value loss: 16.479446. Entropy: 0.977673.
Iteration 516: Policy loss: -1.319549. Value loss: 13.339704. Entropy: 0.966704.
Training network. lr: 0.000246. clip: 0.098470
Iteration 517: Policy loss: 0.432747. Value loss: 18.779242. Entropy: 1.000293.
Iteration 518: Policy loss: 0.346731. Value loss: 13.435173. Entropy: 0.979491.
Iteration 519: Policy loss: 0.346564. Value loss: 9.938462. Entropy: 0.985932.
episode: 215   score: 210.0  epsilon: 1.0    steps: 576  evaluation reward: 188.55
Training networ

Iteration 576: Policy loss: 0.154621. Value loss: 15.819524. Entropy: 0.894547.
episode: 239   score: 180.0  epsilon: 1.0    steps: 588  evaluation reward: 190.55
Training network. lr: 0.000246. clip: 0.098313
Iteration 577: Policy loss: 0.151186. Value loss: 7.887047. Entropy: 1.150429.
Iteration 578: Policy loss: 0.295190. Value loss: 5.869577. Entropy: 1.132430.
Iteration 579: Policy loss: 0.085342. Value loss: 4.558584. Entropy: 1.169071.
episode: 240   score: 105.0  epsilon: 1.0    steps: 887  evaluation reward: 190.55
Training network. lr: 0.000246. clip: 0.098313
Iteration 580: Policy loss: 0.632348. Value loss: 6.596669. Entropy: 0.979960.
Iteration 581: Policy loss: 0.409425. Value loss: 4.676695. Entropy: 0.986844.
Iteration 582: Policy loss: 0.606759. Value loss: 4.237300. Entropy: 1.036243.
episode: 241   score: 180.0  epsilon: 1.0    steps: 935  evaluation reward: 190.0
Training network. lr: 0.000246. clip: 0.098313
Iteration 583: Policy loss: -0.025698. Value loss: 10.846

episode: 265   score: 105.0  epsilon: 1.0    steps: 453  evaluation reward: 188.85
episode: 266   score: 210.0  epsilon: 1.0    steps: 779  evaluation reward: 188.1
Training network. lr: 0.000245. clip: 0.098166
Iteration 640: Policy loss: 0.930046. Value loss: 10.087639. Entropy: 0.940375.
Iteration 641: Policy loss: 0.885050. Value loss: 5.509145. Entropy: 0.981000.
Iteration 642: Policy loss: 0.816088. Value loss: 5.266035. Entropy: 0.982919.


In [None]:
torch.save(agent.policy_net, "./save_model/spaceinvaders_ppo")