# Deep Q-Learning 

For this assignment we will implement the Deep Q-Learning algorithm with Experience Replay as described in breakthrough paper __"Playing Atari with Deep Reinforcement Learning"__. We will train an agent to play the famous game of __Breakout__.

In [1]:
import sys
import gym
import torch
import pylab
import random
import numpy as np
import time
from collections import deque
from datetime import datetime
from copy import deepcopy
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from utils import *
from agent import *
from model import *
from config import *
from env import GameEnv
%matplotlib inline
%load_ext autoreload
%autoreload 2

## Understanding the environment

In the following cell, we initialise our game of __Breakout__ and you can see how the environment looks like. For further documentation of the of the environment refer to https://gym.openai.com/envs. 

In [2]:
envs = []
for i in range(num_envs):
    envs.append(GameEnv('SpaceInvadersDeterministic-v4'))
#env.render()

  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "


In [3]:
number_lives = envs[0].life
state_size = envs[0].observation_space.shape
action_size = envs[0].action_space.n
rewards, episodes = [], []

## Creating a DQN Agent

Here we create a DQN Agent. This agent is defined in the __agent.py__. The corresponding neural network is defined in the __model.py__. 

__Evaluation Reward__ : The average reward received in the past 100 episodes/games.

__Frame__ : Number of frames processed in total.

__Memory Size__ : The current size of the replay memory.

### Main Training Loop

In [None]:
agent = Agent(action_size)
torch.save(agent.policy_net.state_dict(), "./save_model/spaceinvaders_ppo_best")
evaluation_reward = deque(maxlen=evaluation_reward_length)
frame = 0
memory_size = 0
reset_max = 10


### Loop through all environments and run PPO on them

#env_names = ['Breakout-v0', 'Phoenix-v0', 'Asteroids-v0', 'SpaceInvaders-v0', 'MsPacman-v0', 'Asterix-v0', 'Atlantis-v0', 'Alien-v0', 'Amidar-v0', 'Assault-v0', 'BankHeist-v0']
env_names = ['SpaceInvaders-v4']
for a in range(len(env_names)):
    name = env_names[a]
    print("\n\n\n ------- STARTING TRAINING FOR %s ------- \n\n\n" % (name))
    
    envs = []
    for i in range(num_envs):
        envs.append(GameEnv(name))
    #env.render()
    

    number_lives = envs[0].life
    state_size = envs[0].observation_space.shape
    if (name == 'SpaceInvaders-v0' or name == 'Breakout-v0'):
        action_size = 4
    else:
        action_size = envs[0].action_space.n
    rewards, episodes = [], []

    vis_env_idx = 0
    vis_env = envs[vis_env_idx]
    e = 0
    frame = 0
    max_eval = -np.inf
    reset_count = 0


    agent = Agent(action_size)
    torch.save(agent.policy_net.state_dict(), "./save_model/" + name + "_best")
    evaluation_reward = deque(maxlen=evaluation_reward_length)
    frame = 0
    memory_size = 0
    reset_max = 10
    
    print("Determing min/max rewards of environment")
    [low, high] = score_range = get_score_range(name)
    print("Min: %d. Max: %d." % (low, high))

    while (frame < 10000000):
        step = 0
        assert(num_envs * env_mem_size == train_frame)
        frame_next_vals = []
        
        for j in range(env_mem_size):
            
            curr_states = np.stack([envs[i].history[HISTORY_SIZE-1,:,:] for i in range(num_envs)])
            next_states = []
            net_in = np.stack([envs[i].history[:HISTORY_SIZE,:,:] for i in range(num_envs)])
            step += num_envs
            frame += num_envs
            actions, values, _ = agent.get_action(np.float32(net_in) / 255.)
            
            for i in range(num_envs):
                env = envs[i]
                next_state, env.reward, env.done, env.info = env.step(actions[i])
                next_states.append(next_state)
                if (i == vis_env_idx):
                    vis_env._env.render()
            
            for i in range(num_envs):
                env = envs[i]
                """
                next_state, env.reward, env.done, env.info = env.step(actions[i])
                if (i == vis_env_idx):
                    vis_env._env.render()
                """
                
                frame_next_state = get_frame(next_states[i])
                env.history[HISTORY_SIZE,:,:] = frame_next_state
                terminal_state = check_live(env.life, env.info['ale.lives'])
                env.life = env.info['ale.lives']
                r = (env.reward / high) * 20.0 #np.log(max(env.reward+1, 1))#((env.reward - low) / (high - low)) * 30
                agent.memory.push(i, deepcopy(curr_states[i]), actions[i], r, terminal_state, values[i], 0, 0)
                
                if (j == env_mem_size-1):
                    net_in = np.stack([envs[k].history[1:,:,:] for k in range(num_envs)])
                    _, frame_next_vals, _ = agent.get_action(np.float32(net_in) / 255.)
                
                env.score += env.reward
                env.history[:HISTORY_SIZE, :, :] = env.history[1:,:,:]
        
                if (env.done):
                    if (e % 50 == 0):
                        print('now time : ', datetime.now())
                        rewards.append(np.mean(evaluation_reward))
                        episodes.append(e)
                        pylab.plot(episodes, rewards, 'b')
                        pylab.savefig("./save_graph/" + name + "_ppo.png")
                        torch.save(agent.policy_net, "./save_model/" + name + "_ppo")

                        if np.mean(evaluation_reward) > max_eval:
                            torch.save(agent.policy_net.state_dict(), "./save_model/"  + name + "_ppo_best")
                            max_eval = float(np.mean(evaluation_reward))
                            reset_count = 0
                        elif e > 5000:
                            reset_count += 1
                            """
                            if (reset_count == reset_max):
                                print("Training went nowhere, starting again at best model")
                                agent.policy_net.load_state_dict(torch.load("./save_model/spaceinvaders_ppo_best"))
                                agent.update_target_net()
                                reset_count = 0
                            """
                    e += 1
                    evaluation_reward.append(env.score)
                    print("episode:", e, "  score:", env.score,  " epsilon:", agent.epsilon, "   steps:", step,
                      " evaluation reward:", np.mean(evaluation_reward))

                    env.done = False
                    env.score = 0
                    env.history = np.zeros([HISTORY_SIZE+1,84,84], dtype=np.uint8)
                    env.state = env.reset()
                    env.life = number_lives
                    get_init_state(env.history, env.state)
            
        agent.train_policy_net(frame, frame_next_vals)
        agent.update_target_net()
    print("FINISHED TRAINING FOR %s" % (name))
    pylab.figure()
    
    for i in range(len(envs)):
        envs[i]._env.close()
    del envs




 ------- STARTING TRAINING FOR SpaceInvaders-v4 ------- 



Determing min/max rewards of environment
Min: 0. Max: 200.


  probs = F.softmax(x[:,:self.action_size] - torch.max(x[:,:self.action_size],1)[0].unsqueeze(1))


Training network. lr: 0.000250. clip: 0.100000


  pol_loss += pol_avg.detach().cpu()[0]
  vf_loss += value_loss.detach().cpu()[0]
  ent_total += ent.detach().cpu()[0]


Iteration 1: Policy loss: 0.000542. Value loss: 0.009386. Entropy: 1.788949.
load: 0.222877
forward: 0.028469
loss: 0.078945
clip: 0.000004
backward: 0.029757
step: 0.013512
total: 0.373564

Iteration 2: Policy loss: -0.002922. Value loss: 0.009325. Entropy: 1.789178.
load: 0.217304
forward: 0.029601
loss: 0.075127
clip: 0.000005
backward: 0.030560
step: 0.014437
total: 0.367034

Iteration 3: Policy loss: -0.001228. Value loss: 0.009557. Entropy: 1.788826.
load: 0.218923
forward: 0.027762
loss: 0.076977
clip: 0.000011
backward: 0.031800
step: 0.013444
total: 0.368917

Training network. lr: 0.000250. clip: 0.100000
Iteration 4: Policy loss: 0.001917. Value loss: 0.194006. Entropy: 1.785831.
load: 0.206221
forward: 0.028512
loss: 0.076011
clip: 0.000003
backward: 0.028849
step: 0.013279
total: 0.352875

Iteration 5: Policy loss: -0.002426. Value loss: 0.155647. Entropy: 1.788443.
load: 0.203776
forward: 0.028186
loss: 0.075319
clip: 0.000004
backward: 0.029050
step: 0.013371
total: 0.349

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Training network. lr: 0.000250. clip: 0.100000
Iteration 10: Policy loss: 0.007812. Value loss: 0.090573. Entropy: 1.788031.
load: 0.209718
forward: 0.028206
loss: 0.074531
clip: 0.000003
backward: 0.029182
step: 0.013379
total: 0.355019

Iteration 11: Policy loss: -0.000473. Value loss: 0.058757. Entropy: 1.790620.
load: 0.213780
forward: 0.028509
loss: 0.073828
clip: 0.000005
backward: 0.029465
step: 0.013089
total: 0.358677

Iteration 12: Policy loss: -0.003563. Value loss: 0.050626. Entropy: 1.790224.
load: 0.218368
forward: 0.029473
loss: 0.073363
clip: 0.000004
backward: 0.029905
step: 0.014127
total: 0.365238

episode: 2   score: 85.0  epsilon: 1.0    steps: 1024  evaluation reward: 75.0
Training network. lr: 0.000250. clip: 0.100000
Iteration 13: Policy loss: 0.004388. Value loss: 0.113065. Entropy: 1.788931.
load: 0.207942
forward: 0.027733
loss: 0.073529
clip: 0.000003
backward: 0.029806
step: 0.013783
total: 0.352796

Iteration 14: Policy loss: -0.000368. Value loss: 0.08125

In [None]:
def test_best(name):
    env = GameEnv(name)
    print("\n\n\n ------- TESTING BEST MODEL FOR %s ------- \n\n\n" % (name))
    number_lives = env.life
    
    if (name == 'SpaceInvaders-v0'):
        action_size = 4
    else:
        action_size = env.action_space.n
    rewards, episodes = [], []
    
    e = 0
    frame = 0

    agent = Agent(action_size)
    agent.policy_net.load_state_dict(torch.load("./save_model/" + name + "_ppo_best"))
    agent.update_target_net()
    agent.policy_net.eval()
    evaluation_reward = deque(maxlen=evaluation_reward_length)

    for i in range(100):
        env.done = False
        env.score = 0
        env.history = np.zeros([HISTORY_SIZE+1,84,84], dtype=np.uint8)
        env.state = env.reset()
        env.life = number_lives
        get_init_state(env.history, env.state)
        step = 0
        while not env.done:
            curr_state = env.history[HISTORY_SIZE-1,:,:]
            net_in = env.history[:HISTORY_SIZE,:,:]
            action, value, _ = agent.get_action(np.float32(net_in) / 255.)
            
            next_state, env.reward, env.done, env.info = env.step(action)
            env._env.render()
            
            frame_next_state = get_frame(next_state)
            
            env.history[HISTORY_SIZE,:,:] = frame_next_state
            terminal_state = check_live(env.life, env.info['ale.lives'])
            env.life = env.info['ale.lives']
            
            
            env.score += env.reward
            env.history[:HISTORY_SIZE, :, :] = env.history[1:,:,:]
            step += 1
        

        evaluation_reward.append(env.score)
        print("episode:", e, "  score:", env.score,  " epsilon:", agent.epsilon, "   steps:", step,
                      " evaluation reward:", np.mean(evaluation_reward))
            

In [None]:
test_best('MsPacman-v0')

### Convolutional LSTM agent

In [None]:
agent = Agent(action_size, mode='PPO_LSTM')
torch.save(agent.policy_net.state_dict(), "./save_model/spaceinvaders_ppo_best")
evaluation_reward = deque(maxlen=evaluation_reward_length)
frame = 0
memory_size = 0
reset_max = 10


### Loop through all environments and run PPO on them

#env_names = ['Breakout-v0', 'Phoenix-v0', 'Asteroids-v0', 'SpaceInvaders-v0', 'MsPacman-v0', 'Asterix-v0', 'Atlantis-v0', 'Alien-v0', 'Amidar-v0', 'Assault-v0', 'BankHeist-v0']
env_names = ['SpaceInvaders-v4']
for a in range(len(env_names)):
    name = env_names[a]
    print("\n\n\n ------- STARTING TRAINING FOR %s ------- \n\n\n" % (name))
    
    envs = []
    for i in range(num_envs):
        envs.append(GameEnv(name))
        envs[i].reset_memory(agent.init_hidden())
    #env.render()
    

    number_lives = envs[0].life
    state_size = envs[0].observation_space.shape
    if (name == 'SpaceInvaders-v0' or name == 'Breakout-v0'):
        action_size = 4
    else:
        action_size = envs[0].action_space.n
    rewards, episodes = [], []

    vis_env_idx = 0
    vis_env = envs[vis_env_idx]
    e = 0
    frame = 0
    max_eval = -np.inf
    reset_count = 0


    agent = Agent(action_size, mode='PPO_LSTM')
    torch.save(agent.policy_net.state_dict(), "./save_model/" + name + "_best")
    evaluation_reward = deque(maxlen=evaluation_reward_length)
    frame = 0
    memory_size = 0
    reset_max = 10
    
    print("Determing min/max rewards of environment")
    [low, high] = score_range = get_score_range(name)
    print("Min: %d. Max: %d." % (low, high))

    while (frame < 10000000):
        step = 0
        assert(num_envs * env_mem_size == train_frame)
        frame_next_vals = []
        
        for j in range(env_mem_size):
            
            curr_states = np.stack([envs[i].history[[HISTORY_SIZE-1],:,:] for i in range(num_envs)])
            hiddens = torch.cat([envs[i].memory for i in range(num_envs)])
            next_states = []
            step += num_envs
            frame += num_envs
            actions, values, hiddens = agent.get_action(np.float32(curr_states) / 255., hiddens)
            hiddens = hiddens.detach()
            
            for i in range(num_envs):
                env = envs[i]
                next_state, env.reward, env.done, env.info = env.step(actions[i])
                next_states.append(next_state)
                if (i == vis_env_idx):
                    vis_env._env.render()
            
            for i in range(num_envs):
                env = envs[i]
                """
                next_state, env.reward, env.done, env.info = env.step(actions[i])
                if (i == vis_env_idx):
                    vis_env._env.render()
                """
                
                frame_next_state = get_frame(next_states[i])
                env.history[HISTORY_SIZE,:,:] = frame_next_state
                env.memory = hiddens[[i]]
                terminal_state = check_live(env.life, env.info['ale.lives'])
                env.life = env.info['ale.lives']
                r = (env.reward / high) * 20.0 #np.log(max(env.reward+1, 1))#((env.reward - low) / (high - low)) * 30
                agent.memory.push(i, [deepcopy(curr_states[i]), hiddens[i].detach().cpu().data.numpy()], actions[i], r, terminal_state, values[i], 0, 0)
                
                if (j == env_mem_size-1):
                    #net_in = np.stack([envs[k].history[1:,:,:] for k in range(num_envs)])
                    net_in = np.stack([envs[k].history[[-1],:,:] for k in range(num_envs)])
                    _, frame_next_vals, _ = agent.get_action(np.float32(net_in) / 255., hiddens)
                
                env.score += env.reward
                env.history[:HISTORY_SIZE, :, :] = env.history[1:,:,:]
        
                if (env.done):
                    if (e % 50 == 0):
                        print('now time : ', datetime.now())
                        rewards.append(np.mean(evaluation_reward))
                        episodes.append(e)
                        pylab.plot(episodes, rewards, 'b')
                        pylab.savefig("./save_graph/" + name + "_ppo.png")
                        torch.save(agent.policy_net, "./save_model/" + name + "_ppo")

                        if np.mean(evaluation_reward) > max_eval:
                            torch.save(agent.policy_net.state_dict(), "./save_model/"  + name + "_ppo_best")
                            max_eval = float(np.mean(evaluation_reward))
                            reset_count = 0
                        elif e > 5000:
                            reset_count += 1
                            """
                            if (reset_count == reset_max):
                                print("Training went nowhere, starting again at best model")
                                agent.policy_net.load_state_dict(torch.load("./save_model/spaceinvaders_ppo_best"))
                                agent.update_target_net()
                                reset_count = 0
                            """
                    e += 1
                    evaluation_reward.append(env.score)
                    print("episode:", e, "  score:", env.score,  " epsilon:", agent.epsilon, "   steps:", step,
                      " evaluation reward:", np.mean(evaluation_reward))

                    env.done = False
                    env.score = 0
                    env.history = np.zeros([HISTORY_SIZE+1,84,84], dtype=np.uint8)
                    env.state = env.reset()
                    env.life = number_lives
                    get_init_state(env.history, env.state)
                    env.reset_memory(agent.init_hidden())
            
        agent.train_policy_net(frame, frame_next_vals)
        agent.update_target_net()
    print("FINISHED TRAINING FOR %s" % (name))
    pylab.figure()
    
    for i in range(len(envs)):
        envs[i]._env.close()
    del envs




 ------- STARTING TRAINING FOR SpaceInvaders-v4 ------- 



Determing min/max rewards of environment
Min: 0. Max: 30.


  probs = F.softmax(x[:,:self.action_size] - torch.max(x[:,:self.action_size],1)[0].unsqueeze(1))


Training network. lr: 0.000250. clip: 0.100000


  probs = F.softmax(x[:,:self.action_size] - torch.max(x[:,:self.action_size],1)[0].unsqueeze(1))
  pol_loss += pol_avg.detach().cpu()[0]
  vf_loss += value_loss.detach().cpu()[0]
  ent_total += ent.detach().cpu()[0]


Iteration 1: Policy loss: 0.018577. Value loss: 0.865079. Entropy: 1.764202.
Iteration 2: Policy loss: -0.000534. Value loss: 0.863796. Entropy: 1.787912.
Iteration 3: Policy loss: 0.000039. Value loss: 0.859918. Entropy: 1.789157.
Training network. lr: 0.000250. clip: 0.100000
Iteration 4: Policy loss: 0.016562. Value loss: 2.537752. Entropy: 1.774985.
Iteration 5: Policy loss: 0.003056. Value loss: 2.486270. Entropy: 1.782428.
Iteration 6: Policy loss: 0.007223. Value loss: 2.419378. Entropy: 1.786161.
Training network. lr: 0.000250. clip: 0.100000
Iteration 7: Policy loss: 0.006031. Value loss: 1.130184. Entropy: 1.780825.
Iteration 8: Policy loss: 0.003517. Value loss: 1.098652. Entropy: 1.784235.
Iteration 9: Policy loss: -0.002592. Value loss: 1.061716. Entropy: 1.784130.
now time :  2019-08-30 14:19:20.523799
episode: 1   score: 80.0  epsilon: 1.0    steps: 896  evaluation reward: 80.0


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Training network. lr: 0.000250. clip: 0.100000
Iteration 10: Policy loss: 0.000855. Value loss: 2.279566. Entropy: 1.783752.
Iteration 11: Policy loss: -0.002447. Value loss: 2.219868. Entropy: 1.785918.
Iteration 12: Policy loss: 0.001141. Value loss: 2.147106. Entropy: 1.782934.
episode: 2   score: 100.0  epsilon: 1.0    steps: 120  evaluation reward: 90.0
episode: 3   score: 185.0  epsilon: 1.0    steps: 1016  evaluation reward: 121.66666666666667
Training network. lr: 0.000250. clip: 0.100000
Iteration 13: Policy loss: -0.000154. Value loss: 2.336051. Entropy: 1.780688.
Iteration 14: Policy loss: 0.000784. Value loss: 2.339136. Entropy: 1.781157.
Iteration 15: Policy loss: -0.005321. Value loss: 2.311987. Entropy: 1.781651.
episode: 4   score: 110.0  epsilon: 1.0    steps: 16  evaluation reward: 118.75
episode: 5   score: 105.0  epsilon: 1.0    steps: 128  evaluation reward: 116.0
episode: 6   score: 155.0  epsilon: 1.0    steps: 784  evaluation reward: 122.5
Training network. lr: 

Iteration 69: Policy loss: 0.014637. Value loss: 1.419362. Entropy: 1.735388.
episode: 32   score: 110.0  epsilon: 1.0    steps: 920  evaluation reward: 185.0
Training network. lr: 0.000250. clip: 0.099853
Iteration 70: Policy loss: 0.007490. Value loss: 2.640646. Entropy: 1.711904.
Iteration 71: Policy loss: 0.012676. Value loss: 1.896738. Entropy: 1.710703.
Iteration 72: Policy loss: 0.009383. Value loss: 1.587145. Entropy: 1.708450.
episode: 33   score: 160.0  epsilon: 1.0    steps: 200  evaluation reward: 184.24242424242425
Training network. lr: 0.000250. clip: 0.099853
Iteration 73: Policy loss: 0.021450. Value loss: 4.197939. Entropy: 1.703299.
Iteration 74: Policy loss: 0.038168. Value loss: 3.080142. Entropy: 1.677595.
Iteration 75: Policy loss: 0.030708. Value loss: 2.412647. Entropy: 1.690186.
episode: 34   score: 105.0  epsilon: 1.0    steps: 424  evaluation reward: 181.91176470588235
episode: 35   score: 65.0  epsilon: 1.0    steps: 696  evaluation reward: 178.5714285714285