# Deep Q-Learning 

For this assignment we will implement the Deep Q-Learning algorithm with Experience Replay as described in breakthrough paper __"Playing Atari with Deep Reinforcement Learning"__. We will train an agent to play the famous game of __Breakout__.

In [1]:
import sys
import gym
import torch
import pylab
import random
import numpy as np
import time
from collections import deque
from datetime import datetime
from copy import deepcopy
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
torch.backends.cudnn.benchmarks = True
from torch.autograd import Variable
from utils import *
from agent import *
from model import *
from config import *
from env import GameEnv
%matplotlib inline
%load_ext autoreload
%autoreload 2

## Understanding the environment

In the following cell, we initialise our game of __Breakout__ and you can see how the environment looks like. For further documentation of the of the environment refer to https://gym.openai.com/envs. 

In [2]:
envs = []
for i in range(num_envs):
    envs.append(GameEnv('SpaceInvadersDeterministic-v4'))
#env.render()

In [3]:
number_lives = envs[0].life
state_size = envs[0].observation_space.shape
action_size = envs[0].action_space.n
rewards, episodes = [], []

## Creating a DQN Agent

Here we create a DQN Agent. This agent is defined in the __agent.py__. The corresponding neural network is defined in the __model.py__. 

__Evaluation Reward__ : The average reward received in the past 100 episodes/games.

__Frame__ : Number of frames processed in total.

__Memory Size__ : The current size of the replay memory.

### Main Training Loop

In [None]:
def test_best(name):
    env = GameEnv(name)
    print("\n\n\n ------- TESTING BEST MODEL FOR %s ------- \n\n\n" % (name))
    number_lives = env.life
    
    if (name == 'SpaceInvaders-v0'):
        action_size = 4
    else:
        action_size = env.action_space.n
    rewards, episodes = [], []
    
    e = 0
    frame = 0

    agent = Agent(action_size)
    agent.policy_net.load_state_dict(torch.load("./save_model/" + name + "_ppo_best"))
    agent.update_target_net()
    agent.policy_net.eval()
    evaluation_reward = deque(maxlen=evaluation_reward_length)

    for i in range(100):
        env.done = False
        env.score = 0
        env.history = np.zeros([HISTORY_SIZE+1,84,84], dtype=np.uint8)
        env.state = env.reset()
        env.life = number_lives
        get_init_state(env.history, env.state)
        step = 0
        while not env.done:
            curr_state = env.history[HISTORY_SIZE-1,:,:]
            net_in = env.history[:HISTORY_SIZE,:,:]
            action, value, _ = agent.get_action(np.float32(net_in) / 255.)
            
            next_state, env.reward, env.done, env.info = env.step(action)
            env._env.render()
            
            frame_next_state = get_frame(next_state)
            
            env.history[HISTORY_SIZE,:,:] = frame_next_state
            terminal_state = check_live(env.life, env.info['ale.lives'])
            env.life = env.info['ale.lives']
            
            
            env.score += env.reward
            env.history[:HISTORY_SIZE, :, :] = env.history[1:,:,:]
            step += 1
        

        evaluation_reward.append(env.score)
        print("episode:", e, "  score:", env.score,  " epsilon:", agent.epsilon, "   steps:", step,
                      " evaluation reward:", np.mean(evaluation_reward))
            

In [None]:
for i in env_names:
    test_best[i]

### Convolutional LSTM agent

In [None]:
agent = Agent(action_size, mode='PPO_MHDPA')
torch.save(agent.policy_net.state_dict(), "./save_model/spaceinvaders_ppo_best")
evaluation_reward = deque(maxlen=evaluation_reward_length)
frame = 0
memory_size = 0
reset_max = 10


### Loop through all environments and run PPO on them

env_names = ['SpaceInvaders-v0', 'Boxing-v0', 'DoubleDunk-v0', 'IceHockey-v0', 'Breakout-v0', 'Phoenix-v0', 'Asteroids-v0', 'MsPacman-v0', 'Asterix-v0', 'Atlantis-v0', 'Alien-v0', 'Amidar-v0', 'Assault-v0', 'BankHeist-v0']
#env_names = ['SpaceInvaders-v4']
for a in range(len(env_names)):
    name = env_names[a]
    print("\n\n\n ------- STARTING TRAINING FOR %s ------- \n\n\n" % (name))
    
    envs = []
    for i in range(num_envs):
        envs.append(GameEnv(name))
    #env.render()
    

    number_lives = envs[0].life
    state_size = envs[0].observation_space.shape
    if (name == 'SpaceInvaders-v0' or name == 'Breakout-v0'):
        action_size = 4
    else:
        action_size = envs[0].action_space.n
    rewards, episodes = [], []

    vis_env_idx = 0
    vis_env = envs[vis_env_idx]
    e = 0
    frame = 0
    max_eval = -np.inf
    reset_count = 0


    agent = Agent(action_size, mode='PPO')
    torch.save(agent.policy_net.state_dict(), "./save_model/" + name + "_best")
    evaluation_reward = deque(maxlen=evaluation_reward_length)
    frame = 0
    memory_size = 0
    reset_max = 10
    
    print("Determing min/max rewards of environment")
    [low, high] = score_range = get_score_range(name)
    print("Min: %d. Max: %d." % (low, high))

    while (frame < 20000000):
        step = 0
        assert(num_envs * env_mem_size == train_frame)
        frame_next_vals = []
        
        for j in range(env_mem_size):
            
            curr_states = np.stack([envs[i].history[HISTORY_SIZE-1,:,:] for i in range(num_envs)])
            next_states = []
            net_in = np.stack([envs[i].history[:HISTORY_SIZE,:,:] for i in range(num_envs)])
            step += num_envs
            frame += num_envs
            actions, values, _ = agent.get_action(np.float32(net_in) / 255.)
            
            for i in range(num_envs):
                env = envs[i]
                next_state, env.reward, env.done, env.info = env.step(actions[i])
                next_states.append(next_state)
                if (i == vis_env_idx):
                    vis_env._env.render()
            
            for i in range(num_envs):
                env = envs[i]
                """
                next_state, env.reward, env.done, env.info = env.step(actions[i])
                if (i == vis_env_idx):
                    vis_env._env.render()
                """
                
                frame_next_state = get_frame(next_states[i])
                env.history[HISTORY_SIZE,:,:] = frame_next_state
                terminal_state = check_live(env.life, env.info['ale.lives'])
                env.life = env.info['ale.lives']
                r = (env.reward / high) * 20.0 #np.log(max(env.reward+1, 1))#((env.reward - low) / (high - low)) * 30
                agent.memory.push(i, deepcopy(curr_states[i]), actions[i], r, terminal_state, values[i], 0, 0)
                
                if (j == env_mem_size-1):
                    net_in = np.stack([envs[k].history[1:,:,:] for k in range(num_envs)])
                    _, frame_next_vals, _ = agent.get_action(np.float32(net_in) / 255.)
                
                env.score += env.reward
                env.history[:HISTORY_SIZE, :, :] = env.history[1:,:,:]
        
                if (env.done):
                    if (e % 50 == 0):
                        print('now time : ', datetime.now())
                        rewards.append(np.mean(evaluation_reward))
                        episodes.append(e)
                        pylab.plot(episodes, rewards, 'b')
                        pylab.savefig("./save_graph/" + name + "_ppo.png")
                        torch.save(agent.policy_net, "./save_model/" + name + "_ppo")

                        if np.mean(evaluation_reward) > max_eval:
                            torch.save(agent.policy_net.state_dict(), "./save_model/"  + name + "_ppo_best")
                            max_eval = float(np.mean(evaluation_reward))
                            reset_count = 0
                        elif e > 5000:
                            reset_count += 1
                            """
                            if (reset_count == reset_max):
                                print("Training went nowhere, starting again at best model")
                                agent.policy_net.load_state_dict(torch.load("./save_model/spaceinvaders_ppo_best"))
                                agent.update_target_net()
                                reset_count = 0
                            """
                    e += 1
                    evaluation_reward.append(env.score)
                    print("episode:", e, "  score:", env.score,  " epsilon:", agent.epsilon, "   steps:", step,
                      " evaluation reward:", np.mean(evaluation_reward))

                    env.done = False
                    env.score = 0
                    env.history = np.zeros([HISTORY_SIZE+1,84,84], dtype=np.uint8)
                    env.state = env.reset()
                    env.life = number_lives
                    get_init_state(env.history, env.state)
            
        agent.train_policy_net(frame, frame_next_vals)
        agent.update_target_net()
    print("FINISHED TRAINING FOR %s" % (name))
    pylab.figure()
    
    for i in range(len(envs)):
        envs[i]._env.close()
    del envs




 ------- STARTING TRAINING FOR SpaceInvaders-v0 ------- 



Determing min/max rewards of environment
Min: 0. Max: 200.


  probs = F.softmax(x[:,:self.action_size] - torch.max(x[:,:self.action_size],1)[0].unsqueeze(1))


Training network. lr: 0.000250. clip: 0.100000
Iteration 1: Policy loss: -0.087297. Value loss: 0.027573. Entropy: 0.346327.
Iteration 2: Policy loss: -0.089115. Value loss: 0.025702. Entropy: 0.344373.
Iteration 3: Policy loss: -0.089712. Value loss: 0.021268. Entropy: 0.344549.
Training network. lr: 0.000250. clip: 0.100000
Iteration 4: Policy loss: -0.375369. Value loss: 0.161101. Entropy: 0.345617.
Iteration 5: Policy loss: -0.377800. Value loss: 0.134698. Entropy: 0.346057.
Iteration 6: Policy loss: -0.375845. Value loss: 0.099258. Entropy: 0.346682.
Training network. lr: 0.000250. clip: 0.100000
Iteration 7: Policy loss: -0.173769. Value loss: 0.115798. Entropy: 0.345349.
Iteration 8: Policy loss: -0.172274. Value loss: 0.092829. Entropy: 0.346002.
Iteration 9: Policy loss: -0.166380. Value loss: 0.071452. Entropy: 0.346318.
now time :  2019-09-28 10:16:25.975083
episode: 1   score: 65.0  epsilon: 1.0    steps: 952  evaluation reward: 65.0


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Training network. lr: 0.000250. clip: 0.100000
Iteration 10: Policy loss: -0.224217. Value loss: 0.116623. Entropy: 0.345608.
Iteration 11: Policy loss: -0.233139. Value loss: 0.069058. Entropy: 0.345300.
Iteration 12: Policy loss: -0.231558. Value loss: 0.053121. Entropy: 0.345733.
episode: 2   score: 45.0  epsilon: 1.0    steps: 152  evaluation reward: 55.0
episode: 3   score: 135.0  epsilon: 1.0    steps: 376  evaluation reward: 81.66666666666667
Training network. lr: 0.000250. clip: 0.100000
Iteration 13: Policy loss: -0.376040. Value loss: 0.381022. Entropy: 0.345861.
Iteration 14: Policy loss: -0.377707. Value loss: 0.331064. Entropy: 0.344518.
Iteration 15: Policy loss: -0.385020. Value loss: 0.304760. Entropy: 0.344705.
episode: 4   score: 120.0  epsilon: 1.0    steps: 200  evaluation reward: 91.25
episode: 5   score: 110.0  epsilon: 1.0    steps: 472  evaluation reward: 95.0
Training network. lr: 0.000250. clip: 0.100000
Iteration 16: Policy loss: -0.297327. Value loss: 0.1220

Iteration 73: Policy loss: -0.266638. Value loss: 0.365303. Entropy: 0.344974.
Iteration 74: Policy loss: -0.265152. Value loss: 0.223754. Entropy: 0.342675.
Iteration 75: Policy loss: -0.279644. Value loss: 0.175931. Entropy: 0.344162.
episode: 28   score: 460.0  epsilon: 1.0    steps: 312  evaluation reward: 209.64285714285714
episode: 29   score: 240.0  epsilon: 1.0    steps: 320  evaluation reward: 210.68965517241378
Training network. lr: 0.000250. clip: 0.099853
Iteration 76: Policy loss: 0.119494. Value loss: 0.055369. Entropy: 0.344947.
Iteration 77: Policy loss: 0.115497. Value loss: 0.029697. Entropy: 0.344809.
Iteration 78: Policy loss: 0.110617. Value loss: 0.020872. Entropy: 0.344038.
episode: 30   score: 105.0  epsilon: 1.0    steps: 872  evaluation reward: 207.16666666666666
Training network. lr: 0.000250. clip: 0.099853
Iteration 79: Policy loss: -0.281162. Value loss: 0.307344. Entropy: 0.341971.
Iteration 80: Policy loss: -0.283545. Value loss: 0.246363. Entropy: 0.343

Training network. lr: 0.000249. clip: 0.099696
Iteration 136: Policy loss: 0.011509. Value loss: 0.087159. Entropy: 0.346596.
Iteration 137: Policy loss: 0.010617. Value loss: 0.047063. Entropy: 0.344803.
Iteration 138: Policy loss: 0.010809. Value loss: 0.039403. Entropy: 0.345393.
episode: 54   score: 355.0  epsilon: 1.0    steps: 760  evaluation reward: 201.38888888888889
episode: 55   score: 185.0  epsilon: 1.0    steps: 1016  evaluation reward: 201.0909090909091
Training network. lr: 0.000249. clip: 0.099696
Iteration 139: Policy loss: 0.128009. Value loss: 0.104885. Entropy: 0.345599.
Iteration 140: Policy loss: 0.117074. Value loss: 0.048867. Entropy: 0.344989.
Iteration 141: Policy loss: 0.119619. Value loss: 0.041926. Entropy: 0.344437.
episode: 56   score: 210.0  epsilon: 1.0    steps: 144  evaluation reward: 201.25
episode: 57   score: 135.0  epsilon: 1.0    steps: 312  evaluation reward: 200.08771929824562
Training network. lr: 0.000249. clip: 0.099696
Iteration 142: Policy

Iteration 198: Policy loss: 0.048500. Value loss: 0.032288. Entropy: 0.345406.
Training network. lr: 0.000249. clip: 0.099548
Iteration 199: Policy loss: 0.117361. Value loss: 0.101837. Entropy: 0.342117.
Iteration 200: Policy loss: 0.124306. Value loss: 0.056419. Entropy: 0.341371.
Iteration 201: Policy loss: 0.109202. Value loss: 0.050456. Entropy: 0.341588.
episode: 79   score: 155.0  epsilon: 1.0    steps: 528  evaluation reward: 196.26582278481013
Training network. lr: 0.000248. clip: 0.099392
Iteration 202: Policy loss: -0.117868. Value loss: 0.069015. Entropy: 0.343690.
Iteration 203: Policy loss: -0.126265. Value loss: 0.039931. Entropy: 0.344370.
Iteration 204: Policy loss: -0.120766. Value loss: 0.033849. Entropy: 0.344203.
episode: 80   score: 210.0  epsilon: 1.0    steps: 64  evaluation reward: 196.4375
episode: 81   score: 75.0  epsilon: 1.0    steps: 304  evaluation reward: 194.93827160493828
episode: 82   score: 260.0  epsilon: 1.0    steps: 496  evaluation reward: 195.7

Iteration 259: Policy loss: 0.003642. Value loss: 0.089019. Entropy: 0.347699.
Iteration 260: Policy loss: 0.005599. Value loss: 0.045933. Entropy: 0.347633.
Iteration 261: Policy loss: 0.008168. Value loss: 0.037941. Entropy: 0.346802.
episode: 105   score: 180.0  epsilon: 1.0    steps: 232  evaluation reward: 203.95
episode: 106   score: 360.0  epsilon: 1.0    steps: 864  evaluation reward: 203.45
Training network. lr: 0.000248. clip: 0.099235
Iteration 262: Policy loss: -0.057051. Value loss: 0.173330. Entropy: 0.346794.
Iteration 263: Policy loss: -0.075035. Value loss: 0.136113. Entropy: 0.348440.
Iteration 264: Policy loss: -0.079130. Value loss: 0.100282. Entropy: 0.347378.
Training network. lr: 0.000248. clip: 0.099235
Iteration 265: Policy loss: -0.037533. Value loss: 0.051897. Entropy: 0.346597.
Iteration 266: Policy loss: -0.039830. Value loss: 0.027704. Entropy: 0.346949.
Iteration 267: Policy loss: -0.039413. Value loss: 0.024909. Entropy: 0.346945.
episode: 107   score: 1

episode: 129   score: 360.0  epsilon: 1.0    steps: 776  evaluation reward: 202.4
Training network. lr: 0.000248. clip: 0.099088
Iteration 325: Policy loss: 0.019790. Value loss: 0.078088. Entropy: 0.345850.
Iteration 326: Policy loss: 0.019424. Value loss: 0.049421. Entropy: 0.345735.
Iteration 327: Policy loss: 0.024393. Value loss: 0.039805. Entropy: 0.344593.
Training network. lr: 0.000248. clip: 0.099088
Iteration 328: Policy loss: 0.011825. Value loss: 0.096937. Entropy: 0.343039.
Iteration 329: Policy loss: 0.004697. Value loss: 0.049206. Entropy: 0.342225.
Iteration 330: Policy loss: 0.006841. Value loss: 0.042685. Entropy: 0.341256.
Training network. lr: 0.000248. clip: 0.099088
Iteration 331: Policy loss: 0.032389. Value loss: 0.022956. Entropy: 0.344992.
Iteration 332: Policy loss: 0.025535. Value loss: 0.009893. Entropy: 0.344150.
Iteration 333: Policy loss: 0.027717. Value loss: 0.008240. Entropy: 0.343644.
Training network. lr: 0.000248. clip: 0.099088
Iteration 334: Poli

Iteration 390: Policy loss: 0.172374. Value loss: 0.018726. Entropy: 0.341855.
episode: 153   score: 260.0  epsilon: 1.0    steps: 416  evaluation reward: 215.1
Training network. lr: 0.000247. clip: 0.098931
Iteration 391: Policy loss: 0.091544. Value loss: 0.096255. Entropy: 0.343960.
Iteration 392: Policy loss: 0.084675. Value loss: 0.035960. Entropy: 0.342808.
Iteration 393: Policy loss: 0.085688. Value loss: 0.022250. Entropy: 0.343732.
episode: 154   score: 180.0  epsilon: 1.0    steps: 128  evaluation reward: 213.35
Training network. lr: 0.000247. clip: 0.098931
Iteration 394: Policy loss: -0.121226. Value loss: 0.354577. Entropy: 0.341397.
Iteration 395: Policy loss: -0.126736. Value loss: 0.262716. Entropy: 0.341228.
Iteration 396: Policy loss: -0.126209. Value loss: 0.205415. Entropy: 0.340648.
episode: 155   score: 180.0  epsilon: 1.0    steps: 408  evaluation reward: 213.3
episode: 156   score: 180.0  epsilon: 1.0    steps: 664  evaluation reward: 213.0
episode: 157   score:

Training network. lr: 0.000247. clip: 0.098627
Iteration 454: Policy loss: 0.012910. Value loss: 0.058570. Entropy: 0.346831.
Iteration 455: Policy loss: 0.006990. Value loss: 0.026835. Entropy: 0.347496.
Iteration 456: Policy loss: 0.007701. Value loss: 0.021190. Entropy: 0.348011.
episode: 179   score: 180.0  epsilon: 1.0    steps: 408  evaluation reward: 225.9
Training network. lr: 0.000247. clip: 0.098627
Iteration 457: Policy loss: -0.001848. Value loss: 0.076415. Entropy: 0.344711.
Iteration 458: Policy loss: -0.010345. Value loss: 0.044428. Entropy: 0.342820.
Iteration 459: Policy loss: -0.011946. Value loss: 0.034062. Entropy: 0.342836.
episode: 180   score: 255.0  epsilon: 1.0    steps: 680  evaluation reward: 226.35
Training network. lr: 0.000247. clip: 0.098627
Iteration 460: Policy loss: 0.011668. Value loss: 0.043594. Entropy: 0.343571.
Iteration 461: Policy loss: 0.009020. Value loss: 0.025539. Entropy: 0.343418.
Iteration 462: Policy loss: 0.010595. Value loss: 0.020370.

episode: 199   score: 270.0  epsilon: 1.0    steps: 248  evaluation reward: 245.65
episode: 200   score: 240.0  epsilon: 1.0    steps: 624  evaluation reward: 245.95
Training network. lr: 0.000246. clip: 0.098470
Iteration 523: Policy loss: -0.016730. Value loss: 0.039528. Entropy: 0.344132.
Iteration 524: Policy loss: -0.019712. Value loss: 0.015304. Entropy: 0.344122.
Iteration 525: Policy loss: -0.020364. Value loss: 0.011282. Entropy: 0.344045.
Training network. lr: 0.000246. clip: 0.098470
Iteration 526: Policy loss: 0.013520. Value loss: 0.079606. Entropy: 0.339996.
Iteration 527: Policy loss: 0.017754. Value loss: 0.037295. Entropy: 0.340285.
Iteration 528: Policy loss: 0.009127. Value loss: 0.027224. Entropy: 0.337733.
now time :  2019-09-28 10:27:05.452005
episode: 201   score: 205.0  epsilon: 1.0    steps: 944  evaluation reward: 243.9
Training network. lr: 0.000246. clip: 0.098470
Iteration 529: Policy loss: -0.061711. Value loss: 0.244656. Entropy: 0.345182.
Iteration 530: 

Iteration 588: Policy loss: -0.111398. Value loss: 0.028188. Entropy: 0.340499.
Training network. lr: 0.000246. clip: 0.098313
Iteration 589: Policy loss: -0.163624. Value loss: 0.246637. Entropy: 0.341675.
Iteration 590: Policy loss: -0.204632. Value loss: 0.105078. Entropy: 0.343372.
Iteration 591: Policy loss: -0.197396. Value loss: 0.070763. Entropy: 0.342029.
episode: 223   score: 275.0  epsilon: 1.0    steps: 136  evaluation reward: 252.55
episode: 224   score: 475.0  epsilon: 1.0    steps: 152  evaluation reward: 255.8
episode: 225   score: 210.0  epsilon: 1.0    steps: 856  evaluation reward: 256.35
Training network. lr: 0.000246. clip: 0.098313
Iteration 592: Policy loss: 0.125052. Value loss: 0.100530. Entropy: 0.346043.
Iteration 593: Policy loss: 0.113857. Value loss: 0.034632. Entropy: 0.346211.
Iteration 594: Policy loss: 0.115562. Value loss: 0.024891. Entropy: 0.345471.
episode: 226   score: 210.0  epsilon: 1.0    steps: 912  evaluation reward: 256.35
Training network. 

Iteration 653: Policy loss: -0.209976. Value loss: 0.244345. Entropy: 0.343887.
Iteration 654: Policy loss: -0.201735. Value loss: 0.168159. Entropy: 0.342542.
Training network. lr: 0.000245. clip: 0.098009
Iteration 655: Policy loss: -0.053433. Value loss: 0.217412. Entropy: 0.345275.
Iteration 656: Policy loss: -0.057901. Value loss: 0.068231. Entropy: 0.344378.
Iteration 657: Policy loss: -0.069220. Value loss: 0.042026. Entropy: 0.344212.
episode: 248   score: 270.0  epsilon: 1.0    steps: 392  evaluation reward: 265.4
Training network. lr: 0.000245. clip: 0.098009
Iteration 658: Policy loss: 0.132682. Value loss: 0.089981. Entropy: 0.341429.
Iteration 659: Policy loss: 0.129864. Value loss: 0.040596. Entropy: 0.339834.
Iteration 660: Policy loss: 0.126448. Value loss: 0.029311. Entropy: 0.338891.
episode: 249   score: 135.0  epsilon: 1.0    steps: 416  evaluation reward: 265.7
episode: 250   score: 225.0  epsilon: 1.0    steps: 776  evaluation reward: 263.25
Training network. lr: 

In [None]:
agent = Agent(action_size, mode='PPO_LSTM')
torch.save(agent.policy_net.state_dict(), "./save_model/spaceinvaders_ppo_best")
evaluation_reward = deque(maxlen=evaluation_reward_length)
frame = 0
memory_size = 0
reset_max = 10


### Loop through all environments and run PPO on them

#env_names = ['Breakout-v0', 'Phoenix-v0', 'Asteroids-v0', 'SpaceInvaders-v0', 'MsPacman-v0', 'Asterix-v0', 'Atlantis-v0', 'Alien-v0', 'Amidar-v0', 'Assault-v0', 'BankHeist-v0']
env_names = ['SpaceInvaders-v0']
for a in range(len(env_names)):
    name = env_names[a]
    print("\n\n\n ------- STARTING TRAINING FOR %s ------- \n\n\n" % (name))
    
    envs = []
    for i in range(num_envs):
        envs.append(GameEnv(name))
        envs[i].reset_memory(agent.init_hidden())
    #env.render()
    

    number_lives = envs[0].life
    state_size = envs[0].observation_space.shape
    if (name == 'SpaceInvaders-v0' or name == 'Breakout-v0'):
        action_size = 4
    else:
        action_size = envs[0].action_space.n
    rewards, episodes = [], []

    vis_env_idx = 0
    vis_env = envs[vis_env_idx]
    e = 0
    frame = 0
    max_eval = -np.inf
    reset_count = 0


    agent = Agent(action_size, mode='PPO_LSTM')
    torch.save(agent.policy_net.state_dict(), "./save_model/" + name + "_best")
    evaluation_reward = deque(maxlen=evaluation_reward_length)
    frame = 0
    memory_size = 0
    reset_max = 10
    
    print("Determing min/max rewards of environment")
    [low, high] = score_range = get_score_range(name)
    print("Min: %d. Max: %d." % (low, high))

    while (frame < 50000000):
        step = 0
        assert(num_envs * env_mem_size == train_frame)
        frame_next_vals = []
        
        for j in range(env_mem_size):
            
            curr_states = np.stack([envs[i].history[[HISTORY_SIZE-1],:,:] for i in range(num_envs)])
            hiddens = torch.cat([envs[i].memory for i in range(num_envs)])
            next_states = []
            step += num_envs
            frame += num_envs
            actions, values, hiddens = agent.get_action(np.float32(curr_states) / 255., hiddens)
            hiddens = hiddens.detach()
            
            for i in range(num_envs):
                env = envs[i]
                next_state, env.reward, env.done, env.info = env.step(actions[i])
                next_states.append(next_state)
                if (i == vis_env_idx):
                    vis_env._env.render()
            
            for i in range(num_envs):
                env = envs[i]
                """
                next_state, env.reward, env.done, env.info = env.step(actions[i])
                if (i == vis_env_idx):
                    vis_env._env.render()
                """
                
                frame_next_state = get_frame(next_states[i])
                env.history[HISTORY_SIZE,:,:] = frame_next_state
                env.memory = hiddens[[i]]
                terminal_state = check_live(env.life, env.info['ale.lives'])
                env.life = env.info['ale.lives']
                r = (env.reward / high) * 20.0 #np.log(max(env.reward+1, 1))#((env.reward - low) / (high - low)) * 30
                agent.memory.push(i, [deepcopy(curr_states[i]), hiddens[i].detach().cpu().data.numpy()], actions[i], r, terminal_state, values[i], 0, 0)
                
                if (j == env_mem_size-1):
                    #net_in = np.stack([envs[k].history[1:,:,:] for k in range(num_envs)])
                    net_in = np.stack([envs[k].history[[-1],:,:] for k in range(num_envs)])
                    _, frame_next_vals, _ = agent.get_action(np.float32(net_in) / 255., hiddens)
                
                env.score += env.reward
                env.history[:HISTORY_SIZE, :, :] = env.history[1:,:,:]
        
                if (env.done):
                    if (e % 50 == 0):
                        print('now time : ', datetime.now())
                        rewards.append(np.mean(evaluation_reward))
                        episodes.append(e)
                        pylab.plot(episodes, rewards, 'b')
                        pylab.savefig("./save_graph/" + name + "_ppo.png")
                        torch.save(agent.policy_net, "./save_model/" + name + "_ppo")

                        if np.mean(evaluation_reward) > max_eval:
                            torch.save(agent.policy_net.state_dict(), "./save_model/"  + name + "_ppo_best")
                            max_eval = float(np.mean(evaluation_reward))
                            reset_count = 0
                        elif e > 5000:
                            reset_count += 1
                            """
                            if (reset_count == reset_max):
                                print("Training went nowhere, starting again at best model")
                                agent.policy_net.load_state_dict(torch.load("./save_model/spaceinvaders_ppo_best"))
                                agent.update_target_net()
                                reset_count = 0
                            """
                    e += 1
                    evaluation_reward.append(env.score)
                    print("episode:", e, "  score:", env.score,  " epsilon:", agent.epsilon, "   steps:", step,
                      " evaluation reward:", np.mean(evaluation_reward))

                    env.done = False
                    env.score = 0
                    env.history = np.zeros([HISTORY_SIZE+1,84,84], dtype=np.uint8)
                    env.state = env.reset()
                    env.life = number_lives
                    get_init_state(env.history, env.state)
                    env.reset_memory(agent.init_hidden())
            
        agent.train_policy_net(frame, frame_next_vals)
        agent.update_target_net()
    print("FINISHED TRAINING FOR %s" % (name))
    pylab.figure()
    
    for i in range(len(envs)):
        envs[i]._env.close()
    del envs