<a href="https://colab.research.google.com/github/madelinefuentes/reinforcement-learning-pacman/blob/master/reinforcement_learning_pacman.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-only setup: mount Google Drive at /gdrive so that the later ROM import
# cell can read the game files from '/gdrive/My Drive/Pacman'.
from google.colab import drive
drive.mount('/gdrive')

In [None]:
!pip install gym-retro

In [None]:
!python -m retro.import '/gdrive/My Drive/Pacman'

In [None]:
import retro
import numpy as np
import matplotlib.pyplot as plt
from skimage.color import rgb2gray
from skimage import transform
from collections import deque
import random
import time

import tensorflow as tf
from tensorflow.keras import datasets, layers, models
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import History


def train():
    """Train a Double-DQN agent on the "PacManNamco-Nes" gym-retro environment.

    The replay memory is first seeded by pretrain(); then, for every
    environment step, the agent picks an epsilon-greedy action, stores the
    transition, and fits the online model on one sampled minibatch. The
    target model is re-synced with the online model every ``max_tau`` steps.
    After each episode the online model is saved to
    ``'after <episode> episodes'`` (``<episode>`` is the zero-based index).

    Returns:
        None. Progress is printed and models are written to disk.
    """
    env = retro.make(game="PacManNamco-Nes")

    # One one-hot button vector per entry of the action space.
    possible_actions = np.array(np.identity(env.action_space.n, dtype=int).tolist())

    # model hyperparameters
    state_size = [125, 80, 4]
    learning_rate = 0.00025
    action_size = env.action_space.n

    # training hyperparameters
    total_episodes = 1
    max_steps = 50000
    batch_size = 32
    memory_size = 1000000

    max_tau = 10000  # environment steps between target-network syncs
    max_exploration = 1.0
    min_exploration = .01
    decay_rate = 0.00001
    discount = .9

    memory = pretrain(memory_size, batch_size, env, possible_actions)
    model = get_model(state_size, action_size, learning_rate)
    target_model = get_model(state_size, action_size, learning_rate)
    target_model = update_target_model(model, target_model)

    history = History()
    decay_step = 0
    tau = 0

    for episode in range(total_episodes):
        # pretrain() and any previous episode leave the emulator mid-game, so
        # every episode has to start from a fresh reset.
        obs = env.reset()
        stacked_frames = add_frame(None, obs, True)
        state = get_stacked_state(stacked_frames)
        episode_rewards = []

        for _ in range(max_steps):
            decay_step += 1
            tau += 1  # without this the target network would never be re-synced

            action, _ = predict_action(model, min_exploration, max_exploration,
                decay_rate, decay_step, state, possible_actions)
            obs, reward, done, info = env.step(action)
            episode_rewards.append(reward)

            if done:
                # Same terminal convention as pretrain(): an all-zero state.
                # It is never bootstrapped from, it only keeps shapes uniform.
                next_state = np.zeros((1, *state_size))
            else:
                stacked_frames = add_frame(stacked_frames, obs, False)
                next_state = get_stacked_state(stacked_frames)

            memory.add((state, action, reward, next_state, done))
            state = next_state

            _fit_on_replay_batch(model, target_model, memory.sample(batch_size),
                state_size, discount, history)

            if tau > max_tau:
                update_target_model(model, target_model)
                tau = 0

            if done:
                total_reward = np.sum(episode_rewards)
                # Value of the 'accuracy' metric from the most recent fit, if any.
                accuracy_log = history.history.get('accuracy', [])
                accuracy = accuracy_log[-1] if accuracy_log else None
                print('Episode: ' + str(episode + 1) + ' Total Reward: ' + str(total_reward) + ' Accuracy: ' + str(accuracy))
                break

        model.save('after ' + str(episode) + ' episodes')
        print('model saved')


def _fit_on_replay_batch(model, target_model, batch, state_size, discount, history):
    """Run one Double-DQN gradient step of ``model`` on a replay minibatch.

    Args:
        model: online Keras model being trained.
        target_model: periodically synced copy used to evaluate next states.
        batch: list of (state, action, reward, next_state, done) tuples where
            state/next_state have shape (1, *state_size) and action is a
            one-hot vector.
        state_size: [height, width, channels] of a single state.
        discount: reward discount factor.
        history: Keras History callback that accumulates fit metrics.
    """
    # Stored states already carry a leading batch axis of 1; collapse it so the
    # minibatch is (batch, height, width, channels) as the network expects.
    states_mb = np.array([each[0] for each in batch]).reshape(-1, *state_size)
    actions_mb = np.array([each[1] for each in batch])
    rewards_mb = np.array([each[2] for each in batch], dtype=np.float32)
    next_states_mb = np.array([each[3] for each in batch]).reshape(-1, *state_size)
    dones_mb = np.array([each[4] for each in batch], dtype=bool)

    rows = np.arange(len(batch))
    taken = np.argmax(actions_mb, axis=1)  # index of the button that was pressed

    # Double DQN: the online network selects the next action, the target
    # network supplies the value of that action.
    best_next = np.argmax(model.predict(next_states_mb, verbose=0), axis=1)
    bootstrap = target_model.predict(next_states_mb, verbose=0)[rows, best_next]
    td_targets = np.where(dones_mb, rewards_mb, rewards_mb + discount * bootstrap)

    # Start from the current predictions so only the Q-value of the action
    # actually taken contributes to the loss.
    fit_targets = np.array(model.predict(states_mb, verbose=0))
    fit_targets[rows, taken] = td_targets

    model.fit(states_mb, fit_targets, verbose=0, callbacks=[history])


# update target model's weights with prediction model's weight
def update_target_model(model, target_model):
    """Overwrite target_model's weights with those of model.

    Args:
        model: network whose weights are read via get_weights().
        target_model: network that receives them via set_weights().

    Returns:
        The same target_model object, now holding model's weights.
    """
    target_model.set_weights(model.get_weights())
    return target_model


# use exploration vs exploitation tradeoff to decide how agent acts
def predict_action(model, min_exploration, max_exploration,
    decay_rate, decay_step, state, possible_actions):
    """Choose an action with an exponentially decaying epsilon-greedy policy.

    Args:
        model: callable network; invoked as ``model(batch, training=False)``
            and expected to return one row of action values.
        min_exploration: floor of the exploration probability.
        max_exploration: exploration probability at ``decay_step == 0``.
        decay_rate: exponential decay constant.
        decay_step: number of steps taken so far.
        state: current state, either already batched ``(1, H, W, C)`` or a
            single unbatched ``(H, W, C)`` array.
        possible_actions: indexable collection of action vectors.

    Returns:
        Tuple ``(action, explore_probability)`` where ``action`` is an entry
        of ``possible_actions``.
    """
    explore_probability = min_exploration + (max_exploration - min_exploration) * np.exp(-decay_rate * decay_step)

    if explore_probability > np.random.rand():
        # explore: uniformly random action
        choice = np.random.choice(len(possible_actions))
    else:
        # exploit: action with the highest predicted value
        batch = np.asarray(state)
        # States built by get_stacked_state already have a leading batch axis;
        # add one only for a bare 3-D state so the model never sees a 5-D input.
        if batch.ndim == 3:
            batch = batch[np.newaxis, ...]
        action_values = model(batch, training=False)
        choice = np.argmax(action_values)

    return possible_actions[choice], explore_probability


# initially fill memory with experience
def pretrain(memory_size, pretrain_length, env, possible_actions):
    """Seed a replay memory with transitions gathered by acting at random.

    Args:
        memory_size: capacity passed to ReplayMemory.
        pretrain_length: number of environment steps (and stored transitions).
        env: environment exposing reset() and step(action).
        possible_actions: indexable collection of action vectors.

    Returns:
        A ReplayMemory holding ``pretrain_length`` tuples of
        (state, action, reward, next_state, done).
    """
    memory = ReplayMemory(memory_size)
    stacked_frames = add_frame(None, env.reset(), True)

    for _ in range(pretrain_length):
        # Snapshot the state *before* stepping: add_frame mutates the frame
        # stack in place, so taking it afterwards would make state identical
        # to next_state.
        state = get_stacked_state(stacked_frames)

        choice = np.random.choice(len(possible_actions))
        action = possible_actions[choice]
        obs, rew, done, info = env.step(action)

        if done:
            # Terminal transitions get an all-zero next state, then a new game starts.
            next_state = np.zeros_like(state)
            memory.add((state, action, rew, next_state, done))
            stacked_frames = add_frame(None, env.reset(), True)
        else:
            stacked_frames = add_frame(stacked_frames, obs, False)
            next_state = get_stacked_state(stacked_frames)
            memory.add((state, action, rew, next_state, done))

    return memory


# format stacked frames for input to model
def get_stacked_state(frames):
    """Turn a sequence of 2-D frames into a single batched model input.

    Args:
        frames: sequence (list, deque, ...) of equally shaped 2-D arrays.

    Returns:
        A new array of shape ``(1, height, width, len(frames))``; for four
        125x80 frames this is ``(1, 125, 80, 4)``.
    """
    # Frames become the channel axis; the leading axis is the batch dimension.
    stacked = np.stack(frames, axis=2)
    return stacked[np.newaxis, ...]


# convert frame to grayscale, crop, and normalize pixel values
def preprocess_frame(frame):
    """Convert a raw RGB frame into a 125x80 grayscale image.

    Args:
        frame: RGB image array.
            NOTE(review): assumed to be an integer (uint8) image as produced
            by the emulator - confirm against the environment.

    Returns:
        2-D float array of shape (125, 80).
    """
    # rgb2gray rescales integer-typed images to floats in [0, 1], so dividing
    # by 255 afterwards would shrink every pixel to roughly [0, 0.004].
    gray_frame = rgb2gray(frame)
    # Drop 8 rows at the top, 4 at the bottom and 75 columns on the right
    # (presumably non-playfield area - verify against an actual frame).
    cropped = gray_frame[8:-4, 0:-75]
    return transform.resize(cropped, [125, 80])


# add new state to stacked frames or initialize
def add_frame(stacked_frames, new_state, is_new_episode):
    """Preprocess a raw frame and push it onto the 4-frame stack.

    Args:
        stacked_frames: deque of preprocessed frames; ignored (may be None)
            when ``is_new_episode`` is true, otherwise mutated in place.
        new_state: raw frame, passed through preprocess_frame.
        is_new_episode: when true, start a fresh stack for a new episode.

    Returns:
        A deque with ``maxlen=4``: a new one filled with four references to
        the first frame on a new episode, otherwise ``stacked_frames`` itself
        with the new frame appended (the oldest frame is evicted).
    """
    frame = preprocess_frame(new_state)

    if is_new_episode:
        # No history exists yet, so the first frame fills every slot.
        return deque([frame] * 4, maxlen=4)

    stacked_frames.append(frame)
    return stacked_frames


# build convolutional neural network
def get_model(state_size, action_size, learning_rate):
    """Build and compile the convolutional Q-network.

    Args:
        state_size: input shape of one state, passed as ``input_shape``.
        action_size: number of units in the final (linear) Dense layer.
        learning_rate: learning rate for the Adam optimizer.

    Returns:
        A compiled Sequential model using mean squared error as loss and
        reporting the 'accuracy' metric.
    """
    network = models.Sequential([
        # three convolutional stages
        layers.Conv2D(32, 8, strides=4, activation='relu', input_shape=state_size),
        layers.Conv2D(64, 4, strides=2, activation='relu'),
        layers.Conv2D(64, 4, strides=1, activation='relu'),
        # fully connected head, one output per action
        layers.Flatten(),
        layers.Dense(256, activation='relu'),
        layers.Dense(action_size),
    ])

    network.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.mean_squared_error,
        metrics=['accuracy'],
    )
    return network


# memory class for storing and sampling experiences
class ReplayMemory:
    """Bounded store of experience tuples with uniform random sampling."""

    def __init__(self, max_size):
        # A deque with maxlen discards its oldest entry once it is full.
        self.buffer = deque(maxlen=max_size)

    def add(self, experience_tuple):
        """Append one experience at the newest end of the buffer."""
        self.buffer.append(experience_tuple)

    def sample(self, batch_size):
        """Return ``batch_size`` stored experiences drawn without replacement.

        Raises:
            ValueError: if ``batch_size`` exceeds the number of stored items.
        """
        stored = self.buffer
        picked = random.sample(range(len(stored)), batch_size)
        return [stored[slot] for slot in picked]

# Start training only when this file is executed directly, not when imported.
if __name__ == "__main__":
    train()




