# Modular Atari CNN
<br><strong>
Michael Ruggiero<br>
michael@mcruggiero.com<br>
Tuesday, August 13th, 2019<br></strong>

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Libraries" data-toc-modified-id="Libraries-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Libraries</a></span></li><li><span><a href="#Environment" data-toc-modified-id="Environment-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Environment</a></span></li><li><span><a href="#Parameters" data-toc-modified-id="Parameters-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Parameters</a></span></li><li><span><a href="#Functions-and-Classes" data-toc-modified-id="Functions-and-Classes-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Functions and Classes</a></span><ul class="toc-item"><li><span><a href="#Report-Functions" data-toc-modified-id="Report-Functions-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Report Functions</a></span></li><li><span><a href="#Image-Preprocessing-Functions" data-toc-modified-id="Image-Preprocessing-Functions-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Image Preprocessing Functions</a></span></li><li><span><a href="#Exploration-vs-Exploitation" data-toc-modified-id="Exploration-vs-Exploitation-4.3"><span class="toc-item-num">4.3&nbsp;&nbsp;</span>Exploration vs Exploitation</a></span></li><li><span><a href="#Experience-Replay" data-toc-modified-id="Experience-Replay-4.4"><span class="toc-item-num">4.4&nbsp;&nbsp;</span>Experience Replay</a></span></li></ul></li><li><span><a href="#Convolutional-Neural-Network" data-toc-modified-id="Convolutional-Neural-Network-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Convolutional Neural Network</a></span></li><li><span><a href="#Q-Learning-Agent" data-toc-modified-id="Q-Learning-Agent-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Q Learning Agent</a></span></li><li><span><a href="#Episode-Loop" data-toc-modified-id="Episode-Loop-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Episode Loop</a></span></li></ul></div>

## Libraries

In [5]:
import pandas as pd
import numpy as np
import random

import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

from time import time
from time import sleep
import gym
import math

import os.path

from matplotlib import animation
import matplotlib.pyplot as plt
from IPython.display import display

from atari_wrappers import make_atari, wrap_deepmind

import seaborn as sns

import itertools as it

%matplotlib inline
plt.style.use('ggplot')

## Environment

In [6]:
#Check if gpu can be used
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#Set tensor
Tensor = torch.Tensor

#Set index tensor
LongTensor = torch.LongTensor

#Check out other Environments at:
#https://github.com/openai/gym/wiki/Table-of-environments

env_id = "PongNoFrameskip-v4"
env = make_atari(env_id)
#env = wrap_deepmind(env)

#This makes a directory with videos where progress can be monitored
directory = './CNN_Videos/'
env = gym.wrappers.Monitor(env,
                           directory,
                           force = True, #To prevent overwrite remove
                           video_callable=lambda episode_id: episode_id%20==0)

number_of_inputs = env.observation_space.shape[0]
number_of_outputs = env.action_space.n

## Parameters 

In [7]:
learning_rate = 0.0001
num_episodes = 25
gamma = 0.99

hidden_layer = 512

replay_mem_size = 100000
batch_size = 32

update_target_frequency = 2000

egreedy = 0.9
egreedy_final = 0.01
egreedy_decay = 10000

report_interval = 10
score_to_solve = 18

clip_error = True
normalize_image = True

file2save = 'cnn_{}.pth'.format(env_id)
save_model_frequency = 10000

resume_previous_training = False

## Functions and Classes

### Report Functions

In [8]:
#This allows us to load a pretrained model
def load_model():
    return torch.load(file2save)

#This allows us to save the model while training
def save_model(model):
    torch.save(model.state_dict(), file2save)

def plot_results():
    plt.figure(figsize=(12,5))
    plt.title("Rewards")
    plt.plot(rewards_total, alpha=0.6, color='red')
    plt.savefig("Pong-results.png")
    plt.close()

### Image Preprocessing Functions

In [9]:
def preprocess_frame(frame):
    
    #This reworks the order of the states to make sure that pytorch can handle it
    frame = frame.transpose((2,0,1))

    #Change the array to pytorch tensor
    frame = torch.from_numpy(frame)

    #Feed the action to the device
    frame = frame.to(device, dtype=torch.float32) #Calculations use float

    #Need to feed into our network a 4d tensor, not 3d.  
    frame = frame.unsqueeze(1)
    
    return frame

### Exploration vs Exploitation

In [10]:
def calculate_epsilon(steps_done):
    epsilon = egreedy_final + (egreedy - egreedy_final) * \
              math.exp(-1. * steps_done / egreedy_decay )
    return epsilon

### Experience Replay

In [11]:
class ExperienceReplay(object):
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0
 
    def push(self, state, action, new_state, reward, done):
        transition = (state, action, new_state, reward, done)
        
        if self.position >= len(self.memory):
            self.memory.append(transition)
        else:
            self.memory[self.position] = transition
        
        self.position = ( self.position + 1 ) % self.capacity
        
        
    def sample(self, batch_size):
        return zip(*random.sample(self.memory, batch_size))
        
        
    def __len__(self):
        return len(self.memory)

## Convolutional Neural Network

In [12]:
class CNN(nn.Module):

    #This is the architecture used by deepmind
    def __init__(self):
        super(CNN, self).__init__()

        #out_channels is the number of filters
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1)
        
        self.advantage1 = nn.Linear(7*7*64,hidden_layer)
        self.advantage2 = nn.Linear(hidden_layer, number_of_outputs)
        
        self.value1 = nn.Linear(7*7*64,hidden_layer)
        self.value2 = nn.Linear(hidden_layer,1)

        self.activation = nn.ReLU()
        
    def forward(self, x):
        
        if normalize_image:
            x = x / 255
        
        output_conv = self.conv1(x)
        output_conv = self.activation(output_conv)
        output_conv = self.conv2(output_conv)
        output_conv = self.activation(output_conv)
        output_conv = self.conv3(output_conv)
        output_conv = self.activation(output_conv)

        #http://cs231n.github.io/convolutional-networks/
        #View flattens out the network into a 7X7X64 layer tensor. 
        output_conv = output_conv.view(output_conv.size(0), -1)

        output_advantage = self.advantage1(output_conv)
        output_advantage = self.activation(output_advantage)
        output_advantage = self.advantage2(output_advantage)
        
        output_value = self.value1(output_conv)
        output_value = self.activation(output_value)
        output_value = self.value2(output_value)
        
        output_final = output_value+ output_advantage - output_advantage.mean()

        return output_final

## Q Learning Agent

In [13]:
class Agent(object):
    def __init__(self):
        self.nn = CNN().to(device)
        self.target_nn = CNN().to(device)

        self.loss_func = nn.MSELoss()
        #self.loss_func = nn.SmoothL1Loss()
        
        self.optimizer = optim.Adam(params=self.nn.parameters(), lr=learning_rate)
        #self.optimizer = optim.RMSprop(params=mynn.parameters(), lr=learning_rate)

        #This is a frame counter for the class
        self.frame_number = 0
        
        #Allows us to load the prior model 
        if resume_previous_training and os.path.exists(file2save):
            print("Loading saved model {}... ".format(file2save))
            self.nn.load_state_dict(load_model())
        
    def select_action(self,state,epsilon):
        
        random_for_egreedy = torch.rand(1)[0]
        
        if random_for_egreedy > epsilon:      
            
            with torch.no_grad():
                
                state = preprocess_frame(state)
                action_from_nn = self.nn(state)
                
                action = torch.max(action_from_nn,1)[1]
                action = action.item()        
        else:
            action = env.action_space.sample()
        
        return action
    
    def optimize(self):
        
        if (len(memory) < batch_size):
            return
        
        state, action, new_state, reward, done = memory.sample(batch_size)

        #This loops through all of the frames into a list
        state = [ preprocess_frame(frame) for frame in state ] 
        state = torch.cat(state)

        #Preprocess new_state
        new_state = [preprocess_frame(frame) for frame in new_state] 
        new_state = torch.cat(new_state)

        reward = Tensor(reward).to(device)
        action = LongTensor(action).to(device)
        done = Tensor(done).to(device)

        new_state_indexes = self.nn(new_state).detach()
        max_new_state_indexes = torch.max(new_state_indexes, 1)[1]  
        
        new_state_values = self.target_nn(new_state).detach()
        max_new_state_values = new_state_values.gather(1, max_new_state_indexes.unsqueeze(1)).squeeze(1)
        
        target_value = reward + (1 - done) * gamma * max_new_state_values
  
        predicted_value = self.nn(state).gather(1, action.unsqueeze(1)).squeeze(1)
        
        loss = self.loss_func(predicted_value, target_value)
    
        self.optimizer.zero_grad()
        loss.backward()
        
        if clip_error:
            for param in self.nn.parameters():
                param.grad.data.clamp_(-1,1)
        
        self.optimizer.step()

        #Updates the nn
        if self.frae_number % update_target_frequency == 0:
            self.target_nn.load_state_dict(self.nn.state_dict())

        #Saves current model
        if self.frame_number % save_model_frequency == 0:
            save_model(self.nn)
        
        self.frame_number += 1

## Episode Loop

In [14]:
memory = ExperienceReplay(replay_mem_size)
agent = Agent()

rewards_total = []

frames_total = 0 
solved_after = 0
solved = False

start_time = time

for i_episode in range(num_episodes):
    
    state = env.reset()
    
    score = 0
    #for step in range(100):
    while True:
        
        frames_total += 1
        
        epsilon = calculate_epsilon(frames_total)
        
        #action = env.action_space.sample()
        action = agent.select_action(state, epsilon)
        
        new_state, reward, done, info = env.step(action)
        
        score += reward

        memory.push(state, action, new_state, reward, done)
        agent.optimize()
        
        state = new_state

        # print("Image")
        # print(state)
        # print(type(state))
        # print(state.dtype)
        # print(state.shape)
         
        if done:
            rewards_total.append(score)
            
            mean_reward_100 = sum(rewards_total[-100:])/100
            
            if (mean_reward_100 > score_to_solve and solved == False):
                print("SOLVED! After %i episodes " % i_episode)
                solved_after = i_episode
                solved = True
            
            if (i_episode % report_interval == 0 and i_episode > 0):
                
                plot_results()
                
                print("\n*** Episode %i *** \
                      \nAv.reward: [last %i]: %.2f, [last 100]: %.2f, [all]: %.2f \
                      \nepsilon: %.2f, frames_total: %i" 
                  % 
                  ( i_episode,
                    report_interval,
                    sum(rewards_total[-report_interval:])/report_interval,
                    mean_reward_100,
                    sum(rewards_total)/len(rewards_total),
                    epsilon,
                    frames_total
                          ) 
                  )
                  
                elapsed_time = time - start_time
                print("Elapsed time: ", time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))



            break
        

print("\n\n\n\nAverage reward: %.2f" % (sum(rewards_total)/num_episodes))
print("Average reward (last 100 episodes): %.2f" % (sum(rewards_total[-100:])/100))
if solved:
    print("Solved after %i episodes" % solved_after)


env.close()
env.env.close()

RuntimeError: size mismatch, m1: [3 x 22528], m2: [3136 x 512] at /tmp/pip-req-build-pb3z3zl3/aten/src/THC/generic/THCTensorMathBlas.cu:268

In [15]:
memory = ExperienceReplay(replay_mem_size)
qnet_agent = QNet_Agent()

rewards_total = []

frames_total = 0 
solved_after = 0
solved = False

start_time = time.time()

for i_episode in range(num_episodes):
    
    state = env.reset()
    
    score = 0
    #for step in range(100):
    while True:
        
        frames_total += 1
        
        epsilon = calculate_epsilon(frames_total)
        
        #action = env.action_space.sample()
        action = qnet_agent.select_action(state, epsilon)
        
        new_state, reward, done, info = env.step(action)
        
        score += reward

        memory.push(state, action, new_state, reward, done)
        qnet_agent.optimize()
        
        state = new_state
        
        if done:
            rewards_total.append(score)
            
            mean_reward_100 = sum(rewards_total[-100:])/100
            
            if (mean_reward_100 > score_to_solve and solved == False):
                print("SOLVED! After %i episodes " % i_episode)
                solved_after = i_episode
                solved = True
            
            if (i_episode % report_interval == 0 and i_episode > 0):
                
                plot_results()
                
                print("\n*** Episode %i *** \
                      \nAv.reward: [last %i]: %.2f, [last 100]: %.2f, [all]: %.2f \
                      \nepsilon: %.2f, frames_total: %i" 
                  % 
                  ( i_episode,
                    report_interval,
                    sum(rewards_total[-report_interval:])/report_interval,
                    mean_reward_100,
                    sum(rewards_total)/len(rewards_total),
                    epsilon,
                    frames_total
                          ) 
                  )
                  
                elapsed_time = time.time() - start_time
                print("Elapsed time: ", time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))



            break
        

print("\n\n\n\nAverage reward: %.2f" % (sum(rewards_total)/num_episodes))
print("Average reward (last 100 episodes): %.2f" % (sum(rewards_total[-100:])/100))
if solved:
    print("Solved after %i episodes" % solved_after)


env.close()
env.env.close()


NameError: name 'QNet_Agent' is not defined