This project was made for the Reasoning Agents part, where the fundamental idea is to apply Reinforcement Learning to the Mario game.

## Libraries
The project needs several libraries, including Gym, pandas, NumPy, TensorFlow and others.

In [None]:
import tkinter as tk
import pandas as pd
import numpy as np
import tensorflow as tf
import random
import gym
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.optimizers import Adam
import gym_super_mario_bros
from gym_super_mario_bros.actions import RIGHT_ONLY
from nes_py.wrappers import JoypadSpace
from IPython.display import clear_output

from keras.models import save_model, load_model
import time


## Environment

In [None]:
# Create the Super Mario Bros environment and restrict its controls to the
# RIGHT_ONLY action list.
env = JoypadSpace(gym_super_mario_bros.make('SuperMarioBros-v0'), RIGHT_ONLY)

## Simple test
This cell is used to test the game and see exactly how it works.

In [None]:
# total_reward = 0
# done = True
# for step in range(100000):
#     env.render()
#     if done:
#         state = env.reset()
#     state, reward, done, info = env.step(env.action_space.sample())
#     preprocess_state(state)
#     print(info)
#     total_reward += reward
#     clear_output(wait=True)
# env.close()

## Class for the Mario agent

In [None]:
# Reset the environment once and keep the frame it returns.
state = env.reset()
# Disabled call: preprocess_state is only defined in a later cell.
#preprocess_state(state)

In [None]:
class MarioAgent:
    """Deep Q-learning agent with a main Q-network and a target Q-network.

    The agent keeps a bounded replay memory of transitions, selects actions
    epsilon-greedily while Mario is on the ground (and repeats the previous
    action otherwise), and fits the main network towards targets computed
    with the target network.
    """

    # ``act`` treats Mario as standing on the ground while the y position it
    # is given is strictly below this value.
    GROUND_Y_LIMIT = 83

    def __init__(self, state_size, action_size):
        """Set up hyper-parameters, the replay memory and both networks.

        Args:
            state_size: Input shape passed to the first convolution layer.
            action_size: Number of discrete actions (size of the output layer).
        """
        self.state_space = state_size
        self.action_space = action_size
        # Replay memory; the oldest transitions are dropped once it is full.
        self.memory = deque(maxlen=5000)
        # Discount factor applied to the bootstrapped next-state value.
        self.gamma = 0.8
        # Last action returned by ``act``; repeated while Mario is in the air.
        self.chosenAction = 0
        # Epsilon schedule for exploration versus exploitation.
        self.epsilon = 1
        self.max_epsilon = 1
        self.min_epsilon = 0.01
        self.decay_epsilon = 0.0001

        # Two identically built networks; the target starts as a copy of main.
        self.main_network = self.build_network()
        self.target_network = self.build_network()
        self.update_target_network()

    def build_network(self):
        """Build and compile the convolutional Q-network.

        Returns:
            A compiled Keras ``Sequential`` model with one linear output per
            action, trained with mean squared error and the Adam optimizer.
        """
        model = Sequential()
        model.add(Conv2D(64, (4, 4), strides=4, padding='same', input_shape=self.state_space))
        model.add(Activation('relu'))

        model.add(Conv2D(64, (4, 4), strides=2, padding='same'))
        model.add(Activation('relu'))

        model.add(Conv2D(64, (3, 3), strides=1, padding='same'))
        model.add(Activation('relu'))
        model.add(Flatten())

        model.add(Dense(512, activation='relu'))
        model.add(Dense(256, activation='relu'))
        model.add(Dense(self.action_space, activation='linear'))

        model.compile(loss='mse', optimizer=Adam())
        return model

    def update_target_network(self):
        """Copy the main network's weights into the target network."""
        self.target_network.set_weights(self.main_network.get_weights())

    def act(self, state, onGround):
        """Choose the next action.

        Args:
            state: Batched observation accepted by the main network.
            onGround: Mario's current y position. Values below
                ``GROUND_Y_LIMIT`` are treated as "on the ground".

        Returns:
            The chosen action index. On the ground a new action is picked
            epsilon-greedily; otherwise the previously chosen action is
            returned unchanged.
        """
        if onGround < self.GROUND_Y_LIMIT:
            print('Mario is on Ground')
            if random.uniform(0, 1) < self.epsilon:
                # Explore: uniformly random action, no network call needed.
                self.chosenAction = np.random.randint(self.action_space)
            else:
                # Exploit: greedy action from the main network.
                self.chosenAction = self.get_pred_act(state)
            return self.chosenAction

        print('Mario is not on Ground')
        return self.chosenAction

    def update_epsilon(self, episode):
        """Decay epsilon exponentially with the episode index.

        Args:
            episode: Episode number; 0 yields ``max_epsilon`` and large values
                approach ``min_epsilon``.
        """
        self.epsilon = self.min_epsilon + (self.max_epsilon - self.min_epsilon) * np.exp(-self.decay_epsilon * episode)

    def train(self, batch_size):
        """Fit the main network on one minibatch sampled from replay memory.

        The minibatch is evaluated with one batched prediction per network
        and fitted with a single ``fit`` call, instead of predicting and
        fitting each transition separately.

        Args:
            batch_size: Number of transitions to sample. If the memory holds
                fewer transitions than this (or ``batch_size`` is below 1),
                the call does nothing.
        """
        if batch_size < 1 or len(self.memory) < batch_size:
            # random.sample would raise ValueError on an undersized memory.
            return

        minibatch = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)

        # Stored states already carry a leading batch axis of size 1, so
        # joining along axis 0 produces one (batch_size, ...) array.
        states = np.concatenate(states, axis=0)
        next_states = np.concatenate(next_states, axis=0)
        actions = np.asarray(actions)
        rewards = np.asarray(rewards, dtype=np.float32)
        dones = np.asarray(dones, dtype=bool)

        # Start from the current predictions so only the taken action's
        # Q-value contributes to the loss.
        targets = self.main_network.predict(states, verbose=0)
        next_q = np.amax(self.target_network.predict(next_states, verbose=0), axis=1)

        # Terminal transitions use the reward alone; others bootstrap from
        # the target network.
        targets[np.arange(batch_size), actions] = np.where(
            dones, rewards, rewards + self.gamma * next_q
        )

        self.main_network.fit(states, targets, batch_size=batch_size, epochs=1, verbose=0)

    def store_transition(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` tuple to memory."""
        self.memory.append((state, action, reward, next_state, done))

    def get_pred_act(self, state):
        """Return the greedy action for ``state`` according to the main network."""
        Q_values = self.main_network.predict(state, verbose=0)
        return np.argmax(Q_values[0])

    def load(self, name):
        """Replace both networks with the model stored at ``name``."""
        self.main_network = load_model(name)
        self.target_network = load_model(name)

    def save(self, name):
        """Save the main network to ``name``."""
        save_model(self.main_network, name)

In [None]:
# Number of discrete actions exposed by the wrapped environment.
action_space = env.action_space.n
# Input shape handed to the agent's network; matches the (-1, 80, 88, 1)
# reshape applied to preprocessed frames.
state_space = (80,88,1)
# NOTE(review): bare expression with no assignment; presumably left over from
# inspecting the observation space interactively.
env.observation_space
from PIL import Image

def preprocess_state(state):
    """Turn a raw frame into a resized grayscale NumPy array.

    The frame is wrapped in a PIL image, resized with ``resize((88, 80))``,
    converted to mode ``'L'`` and returned as an array. Callers reshape the
    result to ``(-1, 80, 88, 1)``.
    """
    resized = Image.fromarray(state).resize((88,80))
    return np.array(resized.convert('L'))

In [None]:
# Upper bound on the number of training episodes (outer training loop).
num_episodes = 1000000
# Upper bound on the number of environment steps within one episode.
num_timesteps = 400000
# Minibatch size passed to the agent's train(); training only starts once the
# replay memory holds more transitions than this.
batch_size = 64
# Length of the window of recent x positions used for stuck detection.
DEBUG_LEN = 300

In [None]:
# Create the agent; its constructor builds both the main and target networks.
mario = MarioAgent(state_space, action_space)

In [None]:
# Window of the most recent x positions, used to notice that Mario has stopped
# making horizontal progress.
stuck_buffer = deque(maxlen=DEBUG_LEN)
# Mario counts as stuck once the latest x position fills at least this many
# slots of the window.
stuck_threshold = DEBUG_LEN - 50
# y position used at episode start and to force a fresh action choice: it is
# below the limit under which mario.act treats Mario as being on the ground.
ground_y = 79

for i in range(num_episodes):
    Return = 0
    done = False
    onGround = ground_y
    # Positions recorded in the previous episode must not count towards this one.
    stuck_buffer.clear()

    state = preprocess_state(env.reset())
    state = state.reshape(-1, 80, 88, 1)

    for time_step in range(1, num_timesteps + 1):
        env.render()

        # When stuck, pass the ground value so the agent selects a new action
        # instead of repeating its previous one. The action is chosen exactly
        # once per step so this override is not overwritten.
        is_stuck = (
            len(stuck_buffer) > 0
            and stuck_buffer.count(stuck_buffer[-1]) >= stuck_threshold
        )
        action = mario.act(state, ground_y if is_stuck else onGround)

        next_state, reward, done, info = env.step(action)
        onGround = info['y_pos']
        stuck_buffer.append(info['x_pos'])

        next_state = preprocess_state(next_state)
        next_state = next_state.reshape(-1, 80, 88, 1)

        mario.store_transition(state, action, reward, next_state, done)
        state = next_state
        Return += reward
        print("Episode is: {}\nTotal Time Step: {}\nCurrent Reward: {}\nEpsilon is: {}".format(str(i), str(time_step), str(Return), str(mario.epsilon)))

        clear_output(wait=True)

        if done:
            break
        # Train only after a few warm-up episodes and once the memory holds
        # more than one minibatch.
        if len(mario.memory) > batch_size and i > 5:
            mario.train(batch_size)

    # Per-episode bookkeeping: decay exploration and sync the target network.
    mario.update_epsilon(i)
    clear_output(wait=True)
    mario.update_target_network()
env.close()

## Save model 

In [None]:
# Write the agent's main network to an HDF5 file in the working directory.
mario.save('mario-v0.h5')

## Load model

In [None]:
# Replace both of the agent's networks with the model stored in the file.
mario.load('mario-v0.h5')

In [None]:
# Play episodes with the current networks until interrupted.
#
# Exploration is switched off for the duration: with the agent's training
# epsilon (1 for a freshly constructed agent) mario.act would return random
# actions and never consult the loaded network. The previous value is restored
# afterwards.
saved_epsilon = mario.epsilon
mario.epsilon = 0
try:
    while True:
        done = False
        state = preprocess_state(env.reset())
        state = state.reshape(-1, 80, 88, 1)
        total_reward = 0
        onGround = 79
        while not done:
            env.render()
            action = mario.act(state, onGround)
            next_state, reward, done, info = env.step(action)
            total_reward += reward

            onGround = info['y_pos']
            next_state = preprocess_state(next_state)
            next_state = next_state.reshape(-1, 80, 88, 1)
            state = next_state
        print("Episode reward: {}".format(total_reward))
finally:
    # The loop only ends through an interrupt or an error, so clean up here.
    mario.epsilon = saved_epsilon
    env.close()
