Hash State

In [1]:
def encode(state):
    result = []
    for i in range(len(state)):
        for j in range(len(state[i])):
            if state[i][j] == "w":
                result.append(0)
            elif state[i][j] == "d":
                result.append(1)
            elif state[i][j] == "e":
                result.append(2)

    power = 0
    hash_code = 0
    for i in range(7, -1, -1):
        hash_code += (3**power) * result[i]
        power += 1

    return hash_code


Game functions

In [3]:
import numpy as np
import random
import pygame
from pygame.locals import *

class QLearnAgent:
    """Tabular Q-learning agent that collects dots in a fixed grid maze.

    The maze is stored in ``self.env`` as a list of rows whose cells are
    'w' (wall), 'd' (dot), 'e' (empty) or 'a' (the agent).  The agent
    observes only the 3x3 window centred on itself (``self.state``); that
    window is turned into a table index by the module-level ``encode``
    function, and ``self.qvalues[code][action]`` holds the learned value.

    Args:
        alpha: learning rate used in the Q-value update.
        epsilon: initial exploration probability.
        gamma: discount factor applied to the next state's best Q-value.
        qvalues: table indexable as ``qvalues[code][action]``.  It is
            updated in place, so the caller's object accumulates learning
            across agents that share it.
    """

    # (row delta, column delta) for each action id: up, right, down, left.
    _MOVES = {0: (-1, 0), 1: (0, 1), 2: (1, 0), 3: (0, -1)}

    # Rewards handed out by get_next_state.
    _REWARD_WALL = -2    # tried to walk into a wall
    _REWARD_DOT = 10     # stepped onto a dot
    _REWARD_EMPTY = -1   # stepped onto an already-empty cell

    # After more than this many consecutive dot-less steps the agent acts randomly.
    _STALL_LIMIT = 20

    # Epsilon is reduced by _EPSILON_STEP per step while it is >= _EPSILON_FLOOR.
    _EPSILON_FLOOR = 0.1
    _EPSILON_STEP = 0.001

    def __init__(self, alpha, epsilon, gamma, qvalues):
        self.alpha = alpha        # learning rate
        self.epsilon = epsilon    # exploration rate
        self.gamma = gamma        # discount factor
        self.qvalues = qvalues    # Q-values for each state-action pair
        self.env = [
            ['w', 'w', 'w', 'w', 'w', 'w', 'w', 'w', 'w', 'w', 'w'],
            ['w', 'a', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'w'],
            ['w', 'd', 'w', 'w', 'w', 'd', 'w', 'w', 'w', 'd', 'w'],
            ['w', 'd', 'w', 'd', 'd', 'd', 'd', 'd', 'w', 'd', 'w'],
            ['w', 'd', 'd', 'd', 'w', 'e', 'w', 'd', 'd', 'd', 'w'],
            ['w', 'd', 'w', 'd', 'w', 'e', 'w', 'd', 'w', 'd', 'w'],
            ['w', 'd', 'w', 'd', 'd', 'w', 'd', 'd', 'w', 'd', 'w'],
            ['w', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'w'],
            ['w', 'w', 'w', 'w', 'w', 'w', 'w', 'w', 'w', 'w', 'w'],
            ]
        # Count the dots from the grid itself so the two can never disagree.
        self.num_dot = sum(row.count('d') for row in self.env)
        self.actions = [0, 1, 2, 3]   # up, right, down, left
        # Position of the 'a' cell in self.env.
        self.row_ind = 1
        self.col_ind = 1
        self.state = self._window(self.row_ind, self.col_ind)
        self.score = 0      # running sum of rewards
        self.epochs = 0     # number of steps taken by update_qvalue
        self.see_dot = 0    # consecutive steps taken without eating a dot

    def _window(self, row, col):
        """Return the 3x3 block of ``self.env`` centred on (row, col)."""
        return self.get_state(row - 1, row + 2, col - 1, col + 2)

    def get_state(self, start_row, end_row, start_col, end_col):
        """Copy a rectangular region of the grid.

        Args:
            start_row, end_row: row range, end exclusive.
            start_col, end_col: column range, end exclusive.

        Returns:
            list[list[str]]: a new nested list with the selected cells.
        """
        return [
            [self.env[i][j] for j in range(start_col, end_col)]
            for i in range(start_row, end_row)
        ]

    def getAction(self):
        """Return the action to take in the current state.

        A random action is chosen with probability ``self.epsilon``, and
        always once the agent has gone more than ``_STALL_LIMIT`` steps
        without eating a dot; otherwise the greedy action is returned.
        """
        # random.random() is evaluated first so one draw is consumed per call.
        if random.random() < self.epsilon or self.see_dot > self._STALL_LIMIT:
            return random.choice(self.actions)   # explore
        return self.getPolicy()                  # exploit

    def getPolicy(self):
        """Return the best action to take in the current state.

        Ties are resolved in favour of the action listed first in
        ``self.actions``.
        """
        code = encode(self.state)
        qvalues = [self.qvalues[code][action] for action in self.actions]
        # Map the winning position back through self.actions so the result is
        # an action id even if the action list is ever reordered.
        return self.actions[qvalues.index(max(qvalues))]

    def get_next_state(self, action):
        """Apply ``action`` to the maze and return the resulting reward.

        Walking into a wall leaves the agent in place.  Otherwise the agent
        moves, its previous cell becomes empty, and ``self.state`` is
        refreshed to the 3x3 window around the new position.

        Args:
            action: 0 (up), 1 (right), 2 (down) or 3 (left).

        Returns:
            int: -2 for a wall, 10 for a dot, -1 for an empty cell.

        Raises:
            ValueError: if ``action`` is not one of the four known ids.
        """
        if action not in self._MOVES:
            raise ValueError(
                f"unknown action {action!r}; expected one of {sorted(self._MOVES)}"
            )

        d_row, d_col = self._MOVES[action]
        new_row = self.row_ind + d_row
        new_col = self.col_ind + d_col
        target = self.env[new_row][new_col]

        if target == 'w':
            # Blocked: position and observed state stay exactly as they were.
            self.see_dot += 1
            return self._REWARD_WALL

        reward = 0
        if target == 'd':
            reward = self._REWARD_DOT
            self.num_dot -= 1
            self.see_dot = 0
        elif target == 'e':
            reward = self._REWARD_EMPTY
            self.see_dot += 1

        # Move the agent marker and clear the cell it left behind.
        self.env[new_row][new_col] = 'a'
        self.env[self.row_ind][self.col_ind] = 'e'
        self.row_ind, self.col_ind = new_row, new_col
        self.state = self._window(new_row, new_col)

        return reward

    def get_value(self):
        """Return the maximum Q-value for the current state."""
        code = encode(self.state)
        return max(self.qvalues[code][action] for action in self.actions)

    def update_qvalue(self):
        """Take one step in the maze and update the Q-table for it.

        Returns:
            int: 1 if no dots remained (nothing is done apart from calling
            ``terminal()``), otherwise 0 after one action has been taken.
        """
        if self.num_dot == 0:
            self.terminal()
            return 1

        action = self.getAction()
        # Remember the table cell for the state we are leaving before moving.
        code = encode(self.state)
        current_qvalue = self.qvalues[code][action]

        reward = self.get_next_state(action)
        self.epochs += 1
        self.score += reward
        self.update_epsilon()
        max_next_qvalue = self.get_value()   # best value of the state just entered

        # Compute the new Q-value
        td_target = reward + self.gamma * max_next_qvalue
        self.qvalues[code][action] = current_qvalue + self.alpha * (td_target - current_qvalue)
        return 0

    def update_epsilon(self):
        """Reduce epsilon by one decay step while it is at or above the floor."""
        if self.epsilon >= self._EPSILON_FLOOR:
            self.epsilon -= self._EPSILON_STEP

    def display(self):
        """Print the maze to stdout, one row per line, cells in upper case."""
        for row in self.env:
            for cell in row:
                # Cells other than these four symbols are not printed.
                if cell in ('a', 'd', 'w', 'e'):
                    print(cell.upper(), end=" ")
            print()

    def terminal(self):
        """Called by update_qvalue when no dots remain; currently does nothing."""
        return

    def play_game(self):
        """Run the agent in a pygame window until the maze is cleared or the window is closed.

        Each frame performs one ``update_qvalue`` step (so the agent keeps
        exploring and learning while it is displayed), then redraws the grid.
        Loads ``pacman.png`` and ``pellet.png`` from the current working
        directory.  ``pygame.quit()`` is always called on the way out, even
        if loading or drawing raises.
        """
        pygame.init()
        try:
            cell_size = 100
            map_height = len(self.env)      # number of grid rows
            map_width = len(self.env[0])    # number of grid columns
            screen_width = cell_size * map_width
            screen_height = cell_size * map_height
            screen = pygame.display.set_mode((screen_width, screen_height))
            pygame.display.set_caption("PacMan")
            font = pygame.font.Font(None, 34)
            WHITE = (255, 255, 255)
            BLACK = (0, 0, 0)

            pacman_image = pygame.image.load("pacman.png")
            pacman_image = pygame.transform.scale(pacman_image, (cell_size, cell_size))
            dot_image = pygame.image.load("pellet.png")
            dot_image = pygame.transform.scale(dot_image, (cell_size, cell_size))

            is_end = False
            agent_pos = (0, 0)   # pixel position (x, y) where the agent sprite is drawn
            while not is_end:
                is_end = self.update_qvalue()
                screen.fill(BLACK)

                # Stop immediately if the user closed the window.
                if any(event.type == pygame.QUIT for event in pygame.event.get()):
                    break

                dot_positions = []   # pixel positions of the dots in this frame
                for row in range(map_height):
                    for col in range(map_width):
                        # Screen x grows with the column, y with the row.
                        top_left = (col * cell_size, row * cell_size)
                        cell = self.env[row][col]
                        cell_color = BLACK if cell == "w" else WHITE
                        if cell == "d":
                            dot_positions.append(top_left)
                        if cell == "a":
                            agent_pos = top_left

                        rectangle = pygame.Rect(top_left[0], top_left[1], cell_size, cell_size)
                        pygame.draw.rect(screen, cell_color, rectangle)

                score_text = font.render(f"Score: {self.score}", True, WHITE)
                screen.blit(score_text, (10, 10))
                screen.blit(pacman_image, agent_pos)
                for dot in dot_positions:
                    screen.blit(dot_image, dot)
                pygame.display.flip()
                pygame.time.delay(100)
        finally:
            pygame.quit()



Train the game

In [4]:
# Q-table: one row per encoded observation (3**8 possible codes), one column per action.
qvalues = np.zeros((3**8, 4))

NUM_EPISODES = 100

for episode in range(NUM_EPISODES):
    # Each episode starts from a fresh maze but keeps learning into the same table.
    game = QLearnAgent(alpha=0.1, epsilon=0.9, gamma=0.5, qvalues=qvalues)
    while True:
        is_end = game.update_qvalue()
        if is_end:   # update_qvalue reports 1 once every dot has been eaten
            break
    qvalues = game.qvalues



Test and visualize the game

In [5]:
# Replay one episode in a pygame window, starting from the Q-table trained above.
# NOTE(review): epsilon starts at 0.9 here as well, and play_game steps the agent
# through update_qvalue, so this run still explores and keeps updating `qvalues`
# -- confirm that is intended for a "test" run.
game = QLearnAgent(alpha=0.1, epsilon=0.9, gamma=0.5, qvalues=qvalues)
game.play_game()