# Deep Q-Learning 

For this assignment we will implement the Deep Q-Learning algorithm with Experience Replay, as described in the breakthrough paper __"Playing Atari with Deep Reinforcement Learning"__. We will train an agent to play the famous game of __Breakout__.

In [None]:
import sys
import gym
import torch
import pylab
import random
import numpy as np
import time
from collections import deque
from datetime import datetime
from copy import deepcopy
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from utils import *
from agent import *
from model import *
from config import *
from env import GameEnv
%matplotlib inline
%load_ext autoreload
%autoreload 2

## Understanding the environment

In the following cell, we initialise our game of __Breakout__ and you can see what the environment looks like. For further documentation of the environment, refer to https://gym.openai.com/envs. (Note: the cell below currently instantiates `SpaceInvadersDeterministic-v4` rather than Breakout.)

In [2]:
# Build the pool of game environments in one expression; every entry wraps
# the same game id, and `num_envs` of them are created.
envs = [GameEnv('SpaceInvadersDeterministic-v4') for _ in range(num_envs)]

  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "


In [3]:
# Every entry of `envs` is constructed with the same game id, so the first
# one is used as the reference for lives, observation shape and action count.
reference_env = envs[0]
number_lives = reference_env.life
state_size = reference_env.observation_space.shape
action_size = reference_env.action_space.n

# Plot series filled by the training loop: the mean evaluation reward at each
# checkpoint and the episode index at which that checkpoint was taken.
rewards = []
episodes = []

## Creating a DQN Agent

Here we create a DQN Agent. This agent is defined in __agent.py__. The corresponding neural network is defined in __model.py__.

__Evaluation Reward__ : The average reward received in the past 100 episodes/games.

__Frame__ : Number of frames processed in total.

__Memory Size__ : The current size of the replay memory.

In [4]:
# Instantiate the agent for the environment's action count.
agent = Agent(action_size)

# Write the initial weights to the "best" checkpoint path so that the file
# exists before the training loop first overwrites it.
torch.save(
    agent.policy_net.state_dict(),
    "./save_model/spaceinvaders_ppo_best",
)

# Rolling window of recent episode scores; its length comes from the config.
evaluation_reward = deque(maxlen=evaluation_reward_length)

# Bookkeeping values. `frame` is re-initialised by the training cell; in the
# visible code `reset_max` is only referenced from disabled code.
frame, memory_size = 0, 0
reset_max = 10

### Main Training Loop

In [2]:
# Main training loop.
#
# Each outer iteration lets every environment in turn collect `env_mem_size`
# transitions into the agent's memory, then trains the policy network on the
# collected rollout and refreshes the target network.
vis_env_idx = 0                # index of the environment that is rendered
vis_env = envs[vis_env_idx]
e = 0                          # number of finished episodes
frame = 0                      # total number of environment steps taken
max_eval = -np.inf             # best mean evaluation reward seen at a checkpoint
reset_count = 0                # checkpoints without improvement (counted after episode 5000)

# The rollout size is fixed by the configuration, so it is validated once
# instead of on every iteration of the training loop.
assert(num_envs * env_mem_size == train_frame)

while (frame < 10000000):
    step = 0
    # One bootstrap value per environment, taken after its last rollout step.
    frame_next_vals = []
    for i in range(num_envs):
        env = envs[i]
        for j in range(env_mem_size):
            step += 1
            frame += 1

            # The newest frame of the current stack is what gets stored in memory.
            curr_state = env.history[HISTORY_SIZE-1,:,:]
            action, value = agent.get_action(np.float32(env.history[:HISTORY_SIZE,:,:]) / 255.)

            next_state, env.reward, env.done, env.info = env.step(action)

            if (i == vis_env_idx):
                vis_env._env.render()

            # Write the new frame into the spare slot at the end of the history.
            frame_next_state = get_frame(next_state)
            env.history[HISTORY_SIZE,:,:] = frame_next_state
            # NOTE(review): check_live presumably reports a lost life - confirm in utils.
            terminal_state = check_live(env.life, env.info['ale.lives'])

            env.life = env.info['ale.lives']
            r = env.reward

            agent.memory.push(i, deepcopy(curr_state), action, r, terminal_state, value, 0, 0)
            if (j == env_mem_size-1):
                # Value estimate for the state that follows the last stored
                # transition; it must be taken before the history is shifted
                # or reset below.
                _, frame_next_val = agent.get_action(np.float32(env.history[1:,:,:]) / 255.)
                frame_next_vals.append(frame_next_val)
            env.score += r
            # Slide the window so that the newest frames form the next input stack.
            env.history[:HISTORY_SIZE, :, :] = env.history[1:,:,:]

            if (env.done):
                # Record the finished episode BEFORE computing statistics.
                # Previously the mean was taken first, so at the very first
                # checkpoint (e == 0) it was the mean of an empty deque: NaN
                # plus a RuntimeWarning, and a NaN point in the reward plot.
                evaluation_reward.append(env.score)
                mean_eval = np.mean(evaluation_reward)

                if (e % 50 == 0):
                    # Periodic checkpoint: log the time, extend the plot and
                    # save the current model.
                    print('now time : ', datetime.now())
                    rewards.append(mean_eval)
                    episodes.append(e)
                    pylab.plot(episodes, rewards, 'b')
                    pylab.savefig("./save_graph/spaceinvaders_ppo.png")
                    torch.save(agent.policy_net, "./save_model/spaceinvaders_ppo")

                    if mean_eval > max_eval:
                        # New best mean reward: keep these weights separately.
                        torch.save(agent.policy_net.state_dict(), "./save_model/spaceinvaders_ppo_best")
                        max_eval = float(mean_eval)
                        reset_count = 0
                    elif e > 5000:
                        reset_count += 1

                e += 1
                print("episode:", e, "  score:", env.score,  " epsilon:", agent.epsilon, "   steps:", step,
                  " evaluation reward:", mean_eval)

                # Start a fresh episode in this environment.
                env.done = False
                env.score = 0
                env.history = np.zeros([HISTORY_SIZE+1,84,84], dtype=np.uint8)
                env.state = env.reset()
                env.life = number_lives
                # NOTE(review): presumably fills env.history from the initial state - confirm in utils.
                get_init_state(env.history, env.state)

    agent.train_policy_net(frame, frame_next_vals)
    agent.update_target_net()


NameError: name 'envs' is not defined

In [None]:
# Persist the whole policy network object (not just its state dict) to the
# fixed Space Invaders model path.
torch.save(obj=agent.policy_net, f="./save_model/spaceinvaders_ppo")

In [None]:
### Loop through all environments and run PPO on them
#
# For every game in `env_names` a fresh set of environments and a fresh agent
# are created. All environments are stepped in lock-step: one batched forward
# pass selects the actions for every environment, then each environment is
# advanced and its transition is pushed into the agent's memory.

env_names = ['SpaceInvaders-v0', 'MsPacman-v0', 'Asteroids-v0', 'Asterix-v0', 'Atlantis-v0', 'Alien-v0', 'Amidar-v0', 'Assault-v0', 'BankHeist-v0']

for name in env_names:
    print("\n\n\n ------- STARTING TRAINING FOR %s ------- \n\n\n" % (name))

    envs = [GameEnv(name) for _ in range(num_envs)]

    number_lives = envs[0].life
    state_size = envs[0].observation_space.shape
    if (name == 'SpaceInvaders-v0'):
        # Space Invaders is trained with a fixed count of 4 actions instead of
        # the count reported by the environment.
        action_size = 4
    else:
        action_size = envs[0].action_space.n
    rewards, episodes = [], []

    vis_env_idx = 0                # index of the environment that is rendered
    vis_env = envs[vis_env_idx]
    e = 0                          # number of finished episodes
    frame = 0                      # total number of environment steps taken
    max_eval = -np.inf             # best mean evaluation reward seen at a checkpoint
    reset_count = 0                # checkpoints without improvement (counted after episode 5000)

    agent = Agent(action_size)
    # Seed the "best" checkpoint with the initial weights. The path now matches
    # the one used for later best-model updates; it used to end in "_best",
    # which left an orphan file that was never overwritten.
    torch.save(agent.policy_net.state_dict(), "./save_model/" + name + "_ppo_best")
    evaluation_reward = deque(maxlen=evaluation_reward_length)
    memory_size = 0
    reset_max = 10

    print("Determing min/max rewards of environment")
    score_range = get_score_range(name)
    low, high = score_range
    print("Min: %d. Max: %d." % (low, high))

    # The rollout size is fixed by the configuration, so it is validated once
    # per game instead of on every iteration of the training loop.
    assert(num_envs * env_mem_size == train_frame)

    while (frame < 10000000):
        step = 0
        frame_next_vals = []

        for j in range(env_mem_size):
            is_last_step = (j == env_mem_size-1)

            # Batched inputs: the newest frame of every environment (stored in
            # memory) and the full frame stack (fed to the network).
            curr_states = np.stack([envs[i].history[HISTORY_SIZE-1,:,:] for i in range(num_envs)])
            net_in = np.stack([envs[i].history[:HISTORY_SIZE,:,:] for i in range(num_envs)])
            step += num_envs
            frame += num_envs
            actions, values = agent.get_action(np.float32(net_in) / 255.)

            # Advance every environment with its selected action.
            next_states = []
            for i in range(num_envs):
                env = envs[i]
                next_state, env.reward, env.done, env.info = env.step(actions[i])
                next_states.append(next_state)
                if (i == vis_env_idx):
                    vis_env._env.render()

            # Per-environment frame stacks that follow the final rollout step.
            bootstrap_inputs = []

            for i in range(num_envs):
                env = envs[i]

                # Write the new frame into the spare slot at the end of the history.
                frame_next_state = get_frame(next_states[i])
                env.history[HISTORY_SIZE,:,:] = frame_next_state
                # NOTE(review): check_live presumably reports a lost life - confirm in utils.
                terminal_state = check_live(env.life, env.info['ale.lives'])
                env.life = env.info['ale.lives']
                # Rescale the raw reward by the game's maximum reward.
                r = (env.reward / high) * 20.0 #np.log(max(env.reward+1, 1))#((env.reward - low) / (high - low)) * 30
                agent.memory.push(i, deepcopy(curr_states[i]), actions[i], r, terminal_state, values[i], 0, 0)

                if is_last_step:
                    # Snapshot this environment's next-state stack BEFORE its
                    # history is shifted or reset below. The old code rebuilt
                    # the batch from every environment inside this loop, so
                    # all but the last environment contributed an already
                    # shifted (or reset) history, and the forward pass was
                    # repeated once per environment.
                    bootstrap_inputs.append(env.history[1:,:,:].copy())

                env.score += env.reward
                # Slide the window so that the newest frames form the next input stack.
                env.history[:HISTORY_SIZE, :, :] = env.history[1:,:,:]

                if (env.done):
                    # Record the finished episode BEFORE computing statistics.
                    # Previously the mean was taken first, so at the very first
                    # checkpoint (e == 0) it was the mean of an empty deque:
                    # NaN plus a RuntimeWarning, and a NaN point in the plot.
                    evaluation_reward.append(env.score)
                    mean_eval = np.mean(evaluation_reward)

                    if (e % 50 == 0):
                        # Periodic checkpoint: log the time, extend the plot
                        # and save the current model.
                        print('now time : ', datetime.now())
                        rewards.append(mean_eval)
                        episodes.append(e)
                        pylab.plot(episodes, rewards, 'b')
                        pylab.savefig("./save_graph/" + name + "_ppo.png")
                        torch.save(agent.policy_net, "./save_model/" + name + "_ppo")

                        if mean_eval > max_eval:
                            # New best mean reward: keep these weights separately.
                            torch.save(agent.policy_net.state_dict(), "./save_model/"  + name + "_ppo_best")
                            max_eval = float(mean_eval)
                            reset_count = 0
                        elif e > 5000:
                            reset_count += 1

                    e += 1
                    print("episode:", e, "  score:", env.score,  " epsilon:", agent.epsilon, "   steps:", step,
                      " evaluation reward:", mean_eval)

                    # Start a fresh episode in this environment.
                    env.done = False
                    env.score = 0
                    env.history = np.zeros([HISTORY_SIZE+1,84,84], dtype=np.uint8)
                    env.state = env.reset()
                    env.life = number_lives
                    # NOTE(review): presumably fills env.history from the initial state - confirm in utils.
                    get_init_state(env.history, env.state)

            if is_last_step:
                # One batched forward pass yields the bootstrap value of every
                # environment's post-rollout state.
                net_in = np.stack(bootstrap_inputs)
                _, frame_next_vals = agent.get_action(np.float32(net_in) / 255.)

        agent.train_policy_net(frame, frame_next_vals)
        agent.update_target_net()
    print("FINISHED TRAINING FOR %s" % (name))
    # Open a new figure so the next game's curve is not drawn over this one.
    pylab.figure()




 ------- STARTING TRAINING FOR SpaceInvaders-v0 ------- 





  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "


Determing min/max rewards of environment
Min: 0. Max: 200.


  probs = F.softmax(x[:,:self.action_size] - torch.max(x[:,:self.action_size],1)[0].unsqueeze(1))


Training network. lr: 0.000250. clip: 0.100000


  pol_loss += pol_avg.detach().cpu()[0]
  vf_loss += value_loss.detach().cpu()[0]
  ent_total += ent.detach().cpu()[0]


Iteration 1: Policy loss: -0.003272. Value loss: 0.026312. Entropy: 1.384223.
Iteration 2: Policy loss: -0.004937. Value loss: 0.027108. Entropy: 1.382908.
Iteration 3: Policy loss: -0.003928. Value loss: 0.026711. Entropy: 1.383878.
Training network. lr: 0.000250. clip: 0.100000
Iteration 4: Policy loss: 0.002093. Value loss: 0.144672. Entropy: 1.380014.
Iteration 5: Policy loss: -0.002442. Value loss: 0.133692. Entropy: 1.383995.
Iteration 6: Policy loss: -0.001728. Value loss: 0.123989. Entropy: 1.384504.
Training network. lr: 0.000250. clip: 0.100000
Iteration 7: Policy loss: 0.000935. Value loss: 0.086329. Entropy: 1.384142.
Iteration 8: Policy loss: 0.000656. Value loss: 0.068617. Entropy: 1.384951.
Iteration 9: Policy loss: -0.002952. Value loss: 0.057422. Entropy: 1.382775.
Training network. lr: 0.000250. clip: 0.100000
Iteration 10: Policy loss: 0.001547. Value loss: 0.448342. Entropy: 1.382997.
Iteration 11: Policy loss: 0.001572. Value loss: 0.375268. Entropy: 1.382617.
Iter

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


episode: 3   score: 80.0  epsilon: 1.0    steps: 400  evaluation reward: 88.33333333333333
Training network. lr: 0.000250. clip: 0.100000
Iteration 13: Policy loss: 0.000025. Value loss: 0.116523. Entropy: 1.378933.
Iteration 14: Policy loss: -0.001092. Value loss: 0.064021. Entropy: 1.380551.
Iteration 15: Policy loss: -0.007435. Value loss: 0.051885. Entropy: 1.379807.
episode: 4   score: 80.0  epsilon: 1.0    steps: 160  evaluation reward: 86.25
episode: 5   score: 190.0  epsilon: 1.0    steps: 360  evaluation reward: 107.0
Training network. lr: 0.000250. clip: 0.100000
Iteration 16: Policy loss: -0.000568. Value loss: 0.107463. Entropy: 1.377571.
Iteration 17: Policy loss: -0.005650. Value loss: 0.074627. Entropy: 1.381477.
Iteration 18: Policy loss: -0.006085. Value loss: 0.065890. Entropy: 1.380943.
episode: 6   score: 105.0  epsilon: 1.0    steps: 1024  evaluation reward: 106.66666666666667
Training network. lr: 0.000250. clip: 0.100000
Iteration 19: Policy loss: 0.000387. Value

Iteration 72: Policy loss: -0.006011. Value loss: 0.029801. Entropy: 1.324212.
episode: 32   score: 80.0  epsilon: 1.0    steps: 656  evaluation reward: 126.875
Training network. lr: 0.000250. clip: 0.099853
Iteration 73: Policy loss: -0.001540. Value loss: 0.178080. Entropy: 1.324938.
Iteration 74: Policy loss: -0.005811. Value loss: 0.105113. Entropy: 1.315156.
Iteration 75: Policy loss: -0.006423. Value loss: 0.084247. Entropy: 1.324882.
episode: 33   score: 180.0  epsilon: 1.0    steps: 128  evaluation reward: 128.4848484848485
episode: 34   score: 425.0  epsilon: 1.0    steps: 840  evaluation reward: 137.2058823529412
Training network. lr: 0.000250. clip: 0.099853
Iteration 76: Policy loss: -0.000492. Value loss: 0.077550. Entropy: 1.339006.
Iteration 77: Policy loss: -0.002154. Value loss: 0.045066. Entropy: 1.336865.
Iteration 78: Policy loss: -0.002409. Value loss: 0.034077. Entropy: 1.334661.
episode: 35   score: 120.0  epsilon: 1.0    steps: 328  evaluation reward: 136.714285

Iteration 130: Policy loss: 0.000958. Value loss: 0.139184. Entropy: 1.327122.
Iteration 131: Policy loss: -0.003079. Value loss: 0.077704. Entropy: 1.333090.
Iteration 132: Policy loss: -0.004087. Value loss: 0.063457. Entropy: 1.323229.
episode: 61   score: 240.0  epsilon: 1.0    steps: 704  evaluation reward: 162.13114754098362
episode: 62   score: 230.0  epsilon: 1.0    steps: 1000  evaluation reward: 163.2258064516129
Training network. lr: 0.000249. clip: 0.099696
Iteration 133: Policy loss: 0.001340. Value loss: 0.209012. Entropy: 1.335206.
Iteration 134: Policy loss: -0.006557. Value loss: 0.127120. Entropy: 1.313879.
Iteration 135: Policy loss: -0.009323. Value loss: 0.101249. Entropy: 1.314960.
episode: 63   score: 70.0  epsilon: 1.0    steps: 400  evaluation reward: 161.74603174603175
episode: 64   score: 165.0  epsilon: 1.0    steps: 464  evaluation reward: 161.796875
Training network. lr: 0.000249. clip: 0.099696
Iteration 136: Policy loss: 0.000553. Value loss: 0.113796. E