In [1]:
import numpy as np
import random
import time

In [2]:
GRID_ROWS = 5
GRID_COLS = 6
ACTION_COUNT = 4  # 0: Up, 1: Down, 2: Left, 3: Right
actionmap = {0: "UP", 1: "DOWN", 2: "LEFT", 3: "RIGHT"}
STATE_COUNT = GRID_ROWS * GRID_COLS  # This will correctly be 30
statemap = {0: "START", 1: "BLANK", 2: "BOMB", 3: "CHARGE", 4: "GOAL"}

# Define special states
START_STATE = (0, 0)
GOAL_STATE = (4, 5)
CHARGE_STATES = [(0, 2), (4, 0), (2,2), (2,5), (4,1)]
BOMB_STATES = [(1, 1), (1, 4), (3, 0), (3, 3)]

# Define Rewards and Penalties
REWARDS = np.full([GRID_ROWS, GRID_COLS], -1)  # -1 for each step 
REWARDS[GOAL_STATE] = 100          
for bomb in BOMB_STATES:
    REWARDS[bomb] = -100

# define matrix for state mapping
state_matrix = np.zeros((GRID_ROWS, GRID_COLS), dtype=int)
for r in range(GRID_ROWS):
    for c in range(GRID_COLS):
        if (r, c) == START_STATE:
            state_matrix[r, c] = 0  # START
        elif (r, c) == GOAL_STATE:
            state_matrix[r, c] = 4  # GOAL
        elif (r, c) in CHARGE_STATES:
            state_matrix[r, c] = 3  # CHARGE
        elif (r, c) in BOMB_STATES:
            state_matrix[r, c] = 2  # BOMB
        else:
            state_matrix[r, c] = 1  # BLANK

LEARNING_RATE = 0.1  # (alpha) How much we update Q-values from new info
DISCOUNT_FACTOR = 0.9  # (gamma) How much we value future rewards
EPISODES = 1000        # How many times the agent plays the game

# Exploration vs. Exploitation
epsilon = 1.0           # 100% exploration at the start
EPSILON_DECAY = 0.999   # How much epsilon decays after each episode
MIN_EPSILON = 0.1       # Minimum exploration rate

# Q-Table 
q_table = np.zeros((STATE_COUNT, ACTION_COUNT)) 

In [3]:
# print initial Q-Table
print("Initial Q-Table:")
print(q_table)


Initial Q-Table:
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [4]:
# print Rewards Table   
print("Rewards Table:")
print(REWARDS)

Rewards Table:
[[  -1   -1   -1   -1   -1   -1]
 [  -1 -100   -1   -1 -100   -1]
 [  -1   -1   -1   -1   -1   -1]
 [-100   -1   -1 -100   -1   -1]
 [  -1   -1   -1   -1   -1  100]]


In [5]:
def get_next_position(current_position, action):

    row = current_position[0]
    col = current_position[1]

    if action == 0:  # Up
        row = max(0, row - 1)
    elif action == 1:  # Down
        row = min(GRID_ROWS - 1, row + 1)
    elif action == 2:  # Left
        col = max(0, col - 1)
    elif action == 3:  # Right
        col = min(GRID_COLS - 1, col + 1)

    return (row, col)

def get_state(position):
    return state_matrix[position[0], position[1]]

def position_to_state_index(position):
    """ Converts a (row, col) tuple into a unique state index (0 to 29). """
    return position[0] * GRID_COLS + position[1]

def choose_action(state, epsilon):
    """
    Epsilon-Greedy Strategy:
    - With probability (epsilon), choose a random action (Explore)
    - With probability (1 - epsilon), choose the best action from Q-table (Exploit)
    """
    if random.uniform(0, 1) < epsilon:
        # Explore: choose a random action
        return random.randint(0, ACTION_COUNT - 1)
    else:
        # Exploit: choose the best action (highest Q-value) for this state
        return np.argmax(q_table[state, :])
    

In [6]:
verbose = False
print("Training the agent...")
for episode in range(EPISODES):
    current_position = START_STATE  # (0, 0)
    # Convert that position to its unique Q-table index (0-29)
    current_state_index = position_to_state_index(current_position) 
    
    done = False
    
    if episode % 10 == 0:
        print(f"Starting Episode {episode}")

    howmany = 0
    while not done:
        action = choose_action(current_state_index, epsilon)
        next_position = get_next_position(current_position, action)
        reward = REWARDS[next_position] 
        
        if next_position == GOAL_STATE or (next_position in BOMB_STATES):
            done = True

        if verbose:
            print(f"Episode {episode} | Pos: {current_position} | Action: {actionmap[action]} | "
                  f"Next Pos: {next_position} | Reward: {reward} | Epsilon: {epsilon:.3f}")
            
        # Get the old Q-value using the CURRENT STATE INDEX
        old_q_value = q_table[current_state_index, action]
        # Convert the new position to its unique NEXT STATE INDEX
        next_state_index = position_to_state_index(next_position)
        # Get the maximum Q-value for the NEXT STATE INDEX
        max_future_q = np.max(q_table[next_state_index, :])
        # Calculate the new Q-value
        new_q_value = old_q_value + LEARNING_RATE * \
            (reward + DISCOUNT_FACTOR * max_future_q - old_q_value)
        # Update the table using the CURRENT STATE INDEX
        q_table[current_state_index, action] = new_q_value
        
        # 5. Move to the next state
        current_position = next_position
        current_state_index = next_state_index # Use the unique index
        howmany += 1    

    if verbose:
        print(f"Episode {episode} finished in {howmany} steps.\n")
    # Decay epsilon after each episode
    epsilon = max(MIN_EPSILON, epsilon * EPSILON_DECAY)

print("Training finished!\n")

Training the agent...
Starting Episode 0
Starting Episode 10
Starting Episode 20
Starting Episode 30
Starting Episode 40
Starting Episode 50
Starting Episode 60
Starting Episode 70
Starting Episode 80
Starting Episode 90
Starting Episode 100
Starting Episode 110
Starting Episode 120
Starting Episode 130
Starting Episode 140
Starting Episode 150
Starting Episode 160
Starting Episode 170
Starting Episode 180
Starting Episode 190
Starting Episode 200
Starting Episode 210
Starting Episode 220
Starting Episode 230
Starting Episode 240
Starting Episode 250
Starting Episode 260
Starting Episode 270
Starting Episode 280
Starting Episode 290
Starting Episode 300
Starting Episode 310
Starting Episode 320
Starting Episode 330
Starting Episode 340
Starting Episode 350
Starting Episode 360
Starting Episode 370
Starting Episode 380
Starting Episode 390
Starting Episode 400
Starting Episode 410
Starting Episode 420
Starting Episode 430
Starting Episode 440
Starting Episode 450
Starting Episode 460
St

In [7]:
# Print the final Q-Table (rounded for clarity)
print("--- Final Q-Table (Rounded) ---")
print("Actions: 0=Up, 1=Down, 2=Left, 3=Right")
print(np.round(q_table, 1))
print("\n")

--- Final Q-Table (Rounded) ---
Actions: 0=Up, 1=Down, 2=Left, 3=Right
[[  32.4   32.6   32.2   37.4]
 [  37.3 -100.    32.2   42.6]
 [  42.1   48.5   36.9   43. ]
 [  13.2   53.4   27.5    2.7]
 [  -1.5  -92.    -1.7   14.4]
 [   2.8   45.8   -1.     1.3]
 [  21.9   39.8   21.3 -100. ]
 [   0.     0.     0.     0. ]
 [  39.1   55.   -99.9   52.2]
 [  32.8   62.1   29.   -98.2]
 [   0.     0.     0.     0. ]
 [  11.7   78.2  -65.1   40. ]
 [  13.2  -99.9   10.5   47.6]
 [ -99.4   11.6   21.    54.9]
 [  46.2   39.6   46.9   62.2]
 [  53.1  -99.9   54.4   70.2]
 [ -97.2   77.    60.2   79.1]
 [  65.3   89.    69.5   76.5]
 [   0.     0.     0.     0. ]
 [  31.3   -0.7  -90.2    5.5]
 [  51.2   31.8    6.8  -86.5]
 [   0.     0.     0.     0. ]
 [  23.2   65.5  -74.6   88.9]
 [  75.1  100.    76.8   87.9]
 [ -46.9   -1.3   -1.3   -0.2]
 [  -1.6   -1.4   -1.5   11.4]
 [   5.2   10.3    0.2   50.5]
 [ -71.8   28.1   15.    72.4]
 [  38.7   51.9   14.    97.2]
 [   0.     0.     0.     0. ]

In [8]:
print("--- Final Learned Path ---")

# A grid to draw our path on
# We can use your state_matrix to pre-fill it!
display_grid = [["." for _ in range(GRID_COLS)] for _ in range(GRID_ROWS)]

for r in range(GRID_ROWS):
    for c in range(GRID_COLS):
        state_type = state_matrix[r, c] # Your 0-4 tile type
        if state_type == 0:   # START
            display_grid[r][c] = "S"
        elif state_type == 2: # BOMB
            display_grid[r][c] = "B"
        elif state_type == 3: # CHARGE
            display_grid[r][c] = "C"
        elif state_type == 4: # GOAL
            display_grid[r][c] = "G"
        else:                 # BLANK
            display_grid[r][c] = "."

# Define symbols for the path
action_symbols = ['↑', '↓', '←', '→']

# --- Simulate the path ---
current_position = START_STATE
steps = 0
path = [current_position] # To detect loops

while current_position != GOAL_STATE and steps < (GRID_ROWS * GRID_COLS):
    # 1. Convert position to the unique state index (0-29)
    current_state_index = position_to_state_index(current_position)
    
    # 2. Get the BEST action from the Q-table (no more epsilon)
    best_action = np.argmax(q_table[current_state_index, :])
    
    # 3. Mark the path on our display grid
    # (Don't overwrite the 'S' at the start)
    if current_position != START_STATE:
        display_grid[current_position[0]][current_position[1]] = action_symbols[best_action]
        
    # 4. Move to the next position
    current_position = get_next_position(current_position, best_action)
    
    # 5. Check if we hit a bomb
    if current_position in BOMB_STATES:
        print("Agent learned to run into a BOMB! ☠️")
        display_grid[current_position[0]][current_position[1]] = "☠️"
        break
        
    # 6. Check if we are stuck in a loop
    if current_position in path:
        print("Agent got stuck in a loop. Training failed.")
        display_grid[current_position[0]][current_position[1]] = "O"
        break
    
    path.append(current_position)
    steps += 1

if current_position == GOAL_STATE:
    print("Agent reached the GOAL!")

# Print the final grid
for row in display_grid:
    # Join with a space for better formatting
    print(" ".join(row))

--- Final Learned Path ---
Agent reached the GOAL!
S → ↓ . . .
. B ↓ . B .
. . → → → ↓
B . . B . ↓
C C . . . G
