In [5]:
#imports
from tqdm import tqdm
from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv
import gym_super_mario_bros
from gym_super_mario_bros.actions import COMPLEX_MOVEMENT
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten, Convolution2D, Conv2D
from tensorflow.keras import optimizers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import layers

import tensorflow as tf

import random
import numpy as np

from tqdm import tqdm

import cv2
from matplotlib import pyplot as plt

from collections import deque


    


In [3]:
def process_image(img):
    #downsample and convert to b&w
    return cv2.resize(cv2.cvtColor(img, cv2.COLOR_RGB2GRAY),(img_width,img_height))

def preprocess_states(state):
    
    x_t = process_image(state) #convert to b&w and downsample
    
    #create first state stack made of repeated first image
    s_t = np.stack((x_t,)*frame_stack, axis=2)
    #reshape to (1, height, width, frame_stack) for Keras
    s_t = s_t.reshape(1,s_t.shape[0],s_t.shape[1],s_t.shape[2])
    
    return(s_t)

In [77]:
class Q_CNN:
    
    def __init__(self, 
                 NUM_ACTIONS, 
                 LEARNING_RATE,
                 FRAMES_PER_ACTION,
                 INITIAL_EPSILON,EPSILON_TAPER_LENGTH,FINAL_EPSILON,
                 EXPERIENCE_MEMORY_LIMIT,
                 BATCH_SIZE,
                 GAMMA,
                 RENDER, PROCESSED_INITIAL_STATE):
        
        self.NUM_ACTIONS = NUM_ACTIONS
        self.LEARNING_RATE = LEARNING_RATE
        self.FRAMES_PER_ACTION = FRAMES_PER_ACTION
        self.INITIAL_EPSILON = INITIAL_EPSILON
        self.EPSILON_TAPER_LENGTH = EPSILON_TAPER_LENGTH
        self.FINAL_EPSILON = FINAL_EPSILON
        self.EXPERIENCE_MEMORY_LIMIT = EXPERIENCE_MEMORY_LIMIT
        self.BATCH_SIZE = BATCH_SIZE
        self.PROCESSED_INITIAL_STATE = PROCESSED_INITIAL_STATE
        self.RENDER = RENDER
        self.GAMMA = GAMMA
    
    def construct_CNN(self):
#  Description of how model was constructed : 
# Two hidden layers - 
# 
        model = Sequential()
        model.add(layers.Conv2D(filters=32, kernel_size=8, strides=4, input_shape=(120, 128, 4)))
        model.add(layers.Activation('relu'))
        model.add(layers.Conv2D(filters=64, kernel_size=2, strides=2))
        model.add(layers.Activation('relu'))
        model.add(layers.Conv2D(filters=64, kernel_size=3, strides=1))
        model.add(layers.Convolution2D(64, 3, 3))
        model.add(layers.Activation('relu'))
        model.add(layers.Flatten())
        model.add(layers.Dense(512))
        model.add(layers.Activation('relu'))
        model.add(layers.Dense(self.NUM_ACTIONS))

        adam = Adam(lr=self.LEARNING_RATE)
        model.compile(loss='mse', optimizer=adam)
        
        self.model = model

    
    def train_model(self):
        
        epsilon = self.INITIAL_EPSILON
        step = 0
        wait_steps = 100
        s_t = self.PROCESSED_INITIAL_STATE
        
        while (True):
            loss = 0
            Q_sa = 0
            r_t = 0
            a_t = 0
            if step % self.FRAMES_PER_ACTION ==0:

                if random.random() <= epsilon : 
                    a_t = env.action_space.sample()
                else:
                    q = self.model.predict(s_t)
                    a_t = np.argmax(q)

                #Reduce epsilon: 
                if epsilon > self.FINAL_EPSILON and step > wait_steps:
                    epsilon -= (self.INITIAL_EPSILON - self.FINAL_EPSILON)/self.EPSILON_TAPER_LENGTH

                #perform selected action and get reward and next state
                x_t1, r_t, done, info = env.step(a_t)
                x_t1 =  process_image(x_t1) #convert to b&w and downsample
                #add to state stack
                x_t1 = x_t1.reshape(1, x_t1.shape[0],x_t1.shape[1], 1) #1 x height x width x 1
                s_t1 = np.append(x_t1, s_t[:,:,:,:3], axis=3)

                #store the transition in experience_memory
                experience_memory.append((s_t, a_t, r_t, s_t1, done))
                #clear old memories if larger than limit
                if len(experience_memory) > self.EXPERIENCE_MEMORY_LIMIT: 
                    experience_memory.popleft()

                #train if waiting period over
                if step > wait_steps:
                    #train on a random minibatch
                    minibatch = random.sample(experience_memory, self.BATCH_SIZE)

                    #EXPERIENCE REPLAY:
                    state_t, action_t, reward_t, state_t1, done = zip(*minibatch)
                    state_t = np.concatenate(state_t)
                    state_t1 = np.concatenate(state_t1)
                    targets = self.model.predict(state_t)
                    Q_sa = self.model.predict(state_t1)
                    targets[range(BATCH_SIZE), action_t] = reward_t + self.GAMMA*np.max(Q_sa, axis=1)*np.invert(done)

                    loss += self.model.train_on_batch(state_t, targets)

                s_t = s_t1
                step += 1
                if self.RENDER == True:
                    env.render()

                #save progress every 10000 iterations
                if step % 10000 == 0: #change to 10000 after TODO
                    model.save_weights(path+"model.h5", overwrite=True)

                if step <= wait_steps:
                    state = "observe"
                elif step > wait_steps and step <= wait_steps + self.EPSILON_TAPER_LENGTH:
                    state = "exploring"
                else:
                    state = "training"

                if step % 100 == 0: 
                    print (f'TIMESTEP {step} / STATE {state} = / EPSILON {epsilon} / ACTION {a_t}' \
                          f'/ REWARD {r_t} / Q_MAX {np.max(Q_sa)} / LOSS {loss}')

                #print("EPISODE FINISHED")

                #print('****************')


In [78]:
img_height, img_width = 120, 128
frame_stack = 4 #pass four frames in


In [79]:
env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = BinarySpaceToDiscreteSpaceEnv(env, COMPLEX_MOVEMENT)

env.reset()
experience_memory = deque()
x_t, reward, done, info = env.step(env.action_space.sample())
s_t = preprocess_states(x_t)

CNN = Q_CNN(NUM_ACTIONS = len(COMPLEX_MOVEMENT),
              LEARNING_RATE = 1e-4,
              FRAMES_PER_ACTION = 1,
              INITIAL_EPSILON = 0.5, EPSILON_TAPER_LENGTH = 30000,FINAL_EPSILON = 0.001,
              EXPERIENCE_MEMORY_LIMIT = 50000,
              BATCH_SIZE = 32,
              GAMMA = 0.99,
              RENDER = True,
              PROCESSED_INITIAL_STATE = s_t )

CNN.construct_CNN()
CNN = CNN.train_model()

TIMESTEP 100 / STATE observe = / EPSILON 0.5 / ACTION 8/ REWARD 0 / Q_MAX 0 / LOSS 0
TIMESTEP 200 / STATE exploring = / EPSILON 0.49835329999999844 / ACTION 4/ REWARD 0 / Q_MAX 82361.3359375 / LOSS 35750.98828125
TIMESTEP 300 / STATE exploring = / EPSILON 0.49668996666666354 / ACTION 5/ REWARD 1 / Q_MAX 278496.03125 / LOSS 125640.453125
TIMESTEP 400 / STATE exploring = / EPSILON 0.49502663333332864 / ACTION 5/ REWARD 1 / Q_MAX 102978.671875 / LOSS 43318.17578125
TIMESTEP 500 / STATE exploring = / EPSILON 0.49336329999999373 / ACTION 1/ REWARD 1 / Q_MAX 17050.4140625 / LOSS 406.8846130371094
TIMESTEP 600 / STATE exploring = / EPSILON 0.4916999666666588 / ACTION 9/ REWARD -2 / Q_MAX 4498.79345703125 / LOSS 101.24817657470703
TIMESTEP 700 / STATE exploring = / EPSILON 0.4900366333333239 / ACTION 0/ REWARD 2 / Q_MAX 1528.9083251953125 / LOSS 19.75580596923828
TIMESTEP 800 / STATE exploring = / EPSILON 0.488373299999989 / ACTION 9/ REWARD 2 / Q_MAX 3439.60595703125 / LOSS 33.900856018066406

KeyboardInterrupt: 