In [5]:
import numpy as np
import random
from collections import deque
import time

# Grid-world layout
# Cell codes: 0 = path, 1 = origin, 2 = hole, 3 = goal
board1 = np.array([
    [0, 0, 0, 0, 0],
    [0, 0, 0, 0, 3],
    [0, 0, 2, 2, 2],
    [0, 2, 0, 0, 0],
    [0, 0, 0, 0, 1],
])

selected_board = board1


def _find_cells(code):
    """Return the (row, col) coordinates of every cell of selected_board equal to `code`."""
    return np.argwhere(selected_board == code)


# Coordinates of the special cells; origin and goal use the first match only
hole_pos = _find_cells(2)
end_pos = _find_cells(3)[0]
start_pos = _find_cells(1)[0]

# Initial state: the origin position as a plain [row, col] list
state = start_pos.tolist()

# (row, col) offset for each action id
action_map = np.array([
    [-1, 0],   # 0 - up
    [0, 1],    # 1 - right
    [1, 0],    # 2 - down
    [0, -1],   # 3 - left
])

def do_action(state, action):
    """Apply `action` to `state` and report the outcome of the move.

    Parameters
    ----------
    state : sequence of two ints
        Current (row, col) position.
    action : int
        Index into `action_map` (up-0, right-1, down-2, left-3).

    Returns
    -------
    next_state
        Position after the move. When the move would leave the board the
        original `state` object is returned unchanged.
    reward : int
        -1 when the move lands on a hole, 1 when it lands on the goal,
        0 otherwise.
    running : bool
        False once a hole or the goal has been reached, True otherwise.
    """
    next_state = state + action_map[action]
    reward = 0
    running = True

    # Read the bounds from the board itself rather than hard-coding 5, so the
    # check stays correct if `selected_board` is swapped for another size.
    n_rows, n_cols = selected_board.shape

    if not (0 <= next_state[0] < n_rows and 0 <= next_state[1] < n_cols):
        # Off-board move: the agent stays put, no reward, episode continues.
        next_state = state
    elif list(next_state) in hole_pos.tolist():
        reward = -1
        running = False
    elif list(next_state) == end_pos.tolist():
        reward = 1
        running = False

    return next_state, reward, running

def state_no(state, n_cols=5):
    """Flatten a (row, col) position into a single row-major state index.

    Parameters
    ----------
    state : sequence of two ints
        Position as (row, col).
    n_cols : int, optional
        Number of columns in the grid. Defaults to 5, the width of the
        board used in this notebook, so existing one-argument calls are
        unaffected.

    Returns
    -------
    int
        ``row * n_cols + col``.
    """
    return state[0] * n_cols + state[1]

def reset():
    """Begin a new episode.

    Returns
    -------
    tuple
        ``(state, running, step_count)``: a fresh copy of `start_pos`,
        the flag True, and the counter value 1.
    """
    # Copy so the caller's state is a separate array from start_pos.
    fresh_state = start_pos.copy()
    return (fresh_state, True, 1)

# Reproducibility: the training loop draws from both `random` and `np.random`,
# so seed both generators before any sampling happens.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Q-table initialization: one row per board cell (5 * 5), one column per action
Q = np.zeros([25, 4])

# Learning parameters
alpha = 0.8             # step size of each Q update
gamma = 0.99            # discount applied to the best next-state value
epsilon = 1.0           # initial probability of taking a random action
epsilon_decay = 0.9995  # epsilon is multiplied by this after every epoch
min_epsilon = 0.01      # lower bound for epsilon
epochs = 10000          # number of training episodes
batch_size = 64         # transitions sampled from the buffer per update

# Replay buffer; once 20000 transitions are stored the oldest are dropped
replay_buffer = deque(maxlen=20000)

start_time = time.time()

# Training: one episode per epoch, epsilon-greedy action selection, and a
# minibatch Q update from the replay buffer after every step.
for x in range(epochs):
    state, game_running, no_of_actions = reset()
    # An episode ends when do_action reports a terminal cell or the step
    # counter reaches 200.
    while game_running and no_of_actions < 200:
        rand_number = np.random.uniform(0, 1)
        if rand_number < epsilon:
            # Explore: pick uniformly among the actions that keep the agent on the 5x5 board
            valid_actions = [i for i in range(4) if (0 <= state[0] + action_map[i][0] < 5 and 0 <= state[1] + action_map[i][1] < 5)]
            action = random.choice(valid_actions)
        else:
            # Exploit: greedy action for the current state (np.argmax returns
            # the first index on ties). Unlike the explore branch, this is not
            # filtered for off-board moves.
            action = np.argmax(Q[state_no(state), :])

        next_state, action_reward, game_running = do_action(state, action)
        next_state_number = state_no(next_state)

        # Store experience in the replay buffer as
        # (state index, action, reward, next state index, still-running flag)
        replay_buffer.append((state_no(state), action, action_reward, next_state_number, game_running))

        # Update Q-table from a random minibatch once enough transitions exist
        if len(replay_buffer) >= batch_size:
            minibatch = random.sample(replay_buffer, batch_size)
            # Transpose the list of transitions into one array per field
            state_nos, actions, rewards, next_state_nos, runnings = zip(*minibatch)
            state_nos = np.array(state_nos)
            actions = np.array(actions)
            rewards = np.array(rewards)
            next_state_nos = np.array(next_state_nos)
            # 1 while the episode continued, 0 for terminal transitions
            runnings = np.array(runnings, dtype=int)

            # TD target: reward plus the discounted best next-state value;
            # multiplying by `runnings` zeroes the bootstrap term for terminal transitions.
            target_Qs = rewards + gamma * np.max(Q[next_state_nos, :], axis=1) * runnings
            # NOTE(review): when the same (state, action) pair appears more than
            # once in the minibatch, NumPy's fancy-indexed `+=` does not
            # accumulate; only one of the duplicate updates is kept. Confirm
            # this is intended (np.add.at would accumulate instead).
            Q[state_nos, actions] += alpha * (target_Qs - Q[state_nos, actions])

        state = next_state
        no_of_actions += 1

    # Decay exploration once per epoch, never going below min_epsilon
    epsilon = max(min_epsilon, epsilon * epsilon_decay)

# Report the wall-clock duration measured since start_time
end_time = time.time()
print(f"Time taken: {end_time - start_time} seconds")

# Accuracy calculation
# Greedy action (argmax of the Q-row) learned for every cell of the 5x5 board
q_action_values = np.zeros((5, 5), dtype=int)
for m in range(5):
    for n in range(5):
        q_action_values[m, n] = np.argmax(Q[state_no([m, n]), :])

# Reference policy: for every cell, the action(s) that start a shortest
# hole-free path to the goal.
# Actions: up-0, right-1, down-2, left-3; 4 marks terminal cells (holes and
# the goal), which have no meaningful action and are excluded from scoring.
# Cells (3, 3) and (3, 4) list both down and left: either move starts a
# shortest safe path, so both must count as correct.
best_action_values = np.array([
    [[2,1], [2,1], [2,1], [2, 1], [2]],
    [[1], [1], [1], [1], [4]],
    [[0,1], [0], [4], [4], [4]],
    [[0], [4], [2], [2, 3], [2, 3]],
    [[0], [3], [3], [3], [3]]
], dtype=object)

accuracy = 0
scored_cells = 0  # non-terminal cells; replaces the hard-coded 20
for m in range(5):
    for n in range(5):
        if 4 in best_action_values[m, n]:
            # Terminal cell: not part of the score
            continue
        scored_cells += 1
        if q_action_values[m, n] in best_action_values[m, n]:
            accuracy += 1

accuracy_percentage = (accuracy / scored_cells) * 100
print("Accuracy =", accuracy_percentage, "%")


Time taken: 15.420482158660889 seconds
Accuracy = 100.0 %


Displaying predicted directions

In [2]:
# Print the greedy direction for each of the 25 states, five per line,
# so the output mirrors the board layout.
direction_labels = ('up\t', 'right\t', 'down\t', 'left\t')  # indexed by action id
for i in range(25):
    q_row = list(Q[i, :])
    # First column holding the row maximum
    best_index = q_row.index(max(q_row))
    if best_index < len(direction_labels):
        print(direction_labels[best_index], end="")
    # Line break after the last column of each board row
    if (i + 1) % 5 == 0:
        print()

right	right	right	right	down	
right	right	right	right	up	
up	up	up	up	up	
up	up	down	down	down	
up	left	left	left	left	
