In [29]:
import numpy as np

In [38]:
# 1. Environment
class GridWorld:
    def __init__(self, grid_size=4, start=(0, 0), goal=(3, 3)):
        self.grid_size = grid_size
        self.start = start
        self.goal = goal
        self.state = self.reset()

    def render(self):
        """Display the grid with agent (A), start (S), and goal (G)"""
        grid = np.full((self.grid_size, self.grid_size), '.')
        # Mark agent's current position
        x, y = self.state
        grid[y, x] = 'A'  # Rows = y, columns = x
        # Mark start and goal (if not covered by the agent)
        if self.start != self.state:
            grid[self.start[1], self.start[0]] = 'S'
        if self.goal != self.state:
            grid[self.goal[1], self.goal[0]] = 'G'
        # Print the grid
        for row in grid:
            print(' '.join(row))
        print()

    def step(self, action):
        # Update state based on action
        # Return (next_state, reward, done)
        # add conditions for out of bound later on or add it to the AI run
        if action == 0:
            next_state = (self.state[0], self.state[1] - 1)
        elif action == 1:
            next_state = (self.state[0], self.state[1] + 1)
        elif action == 2:
            next_state = (self.state[0] - 1, self.state[1])
        elif action == 3:
            next_state = (self.state[0] + 1, self.state[1])
        else:
            print('invalid input')
            next_state = self.state

        # Assume the AI is never wrong
        self.state = next_state

        if next_state == self.goal:
            return next_state, +10, True
        else:
            return next_state, -1, False
        


    def reset(self):
        self.state = (0, 0)
        return self.state

In [65]:

class QLearningAgent:
    def __init__(self, num_states, num_actions, epsilon=0.2, alpha=0.8, gamma=0.99):
        self.q_table = np.zeros((num_states, num_actions))  # Q-table
        self.epsilon = epsilon  # Exploration rate
        self.alpha = alpha      # Learning rate
        self.gamma = gamma      # Discount factor
        self.num_actions = num_actions

    def choose_action(self, state, env):
        # Convert state to (x, y) coordinates
        x = state[0]
        y = state[1]
        state_index = state[1] * self.num_actions + state[0]

        valid_actions = []
        # Check valid actions based on current position
        if y > 0:               # Can move Up
            valid_actions.append(0)
        if y < self.num_actions - 1:  # Can move Down
            valid_actions.append(1)
        if x > 0:               # Can move Left
            valid_actions.append(2)
        if x < self.num_actions - 1:  # Can move Right
            valid_actions.append(3)

        # print(valid_actions)

        # print(f'VALID ACTIONS: {valid_actions}')
        
        if np.random.rand() < self.epsilon:
            # Explore: choose a random valid action
            action = np.random.choice(valid_actions)
        else:
            q_values = self.q_table[state_index]
            masked_q = np.where(np.isin(range(4), valid_actions), q_values, -np.inf)
            action = np.argmax(masked_q)

        next_state, reward, done = env.step(action)

        
        return next_state, reward, done, action
                        
    def learn(self, state, action, reward, next_state):
        # Q-learning formula: Q(s,a) = Q(s,a) + α [R + γ * max(Q(s',a')) - Q(s,a)]
        # TODO: Update Q-values
        state_index = state[1] * self.num_actions + state[0]
        next_state_index = next_state[1] * self.num_actions + next_state[0]

        current_q = self.q_table[state_index][action]
        next_q = self.q_table[next_state_index]

        max_future_q = np.max(self.q_table[next_state_index])  # ✅        
        new_q = current_q + self.alpha*(reward + self.gamma * max_future_q - current_q)
        # print(f'NEW Q {new_q}')
        self.q_table[state_index][action] = new_q

In [66]:
env = GridWorld()
agent = QLearningAgent(num_states=16, num_actions=4)
for episode in range(50):
    state = env.reset()
    done = False
    print(f'EPISODE: {episode}')

    # input()
    while not done:
        next_state, reward, done, action = agent.choose_action(state, env)
        agent.learn(state, action, reward, next_state)

        state = next_state
        if state == env.goal:
            break

    # Decay epsilon
    agent.epsilon = max(0.01, agent.epsilon * 0.995)

EPISODE: 0
EPISODE: 1
EPISODE: 2
EPISODE: 3
EPISODE: 4
EPISODE: 5
EPISODE: 6
EPISODE: 7
EPISODE: 8
EPISODE: 9
EPISODE: 10
EPISODE: 11
EPISODE: 12
EPISODE: 13
EPISODE: 14
EPISODE: 15
EPISODE: 16
EPISODE: 17
EPISODE: 18
EPISODE: 19
EPISODE: 20
EPISODE: 21
EPISODE: 22
EPISODE: 23
EPISODE: 24
EPISODE: 25
EPISODE: 26
EPISODE: 27
EPISODE: 28
EPISODE: 29
EPISODE: 30
EPISODE: 31
EPISODE: 32
EPISODE: 33
EPISODE: 34
EPISODE: 35
EPISODE: 36
EPISODE: 37
EPISODE: 38
EPISODE: 39
EPISODE: 40
EPISODE: 41
EPISODE: 42
EPISODE: 43
EPISODE: 44
EPISODE: 45
EPISODE: 46
EPISODE: 47
EPISODE: 48
EPISODE: 49


In [67]:
# To stop exploring
agent.epsilon = 0

In [68]:
state = env.reset()
done = False

# input()
while not done:
        next_state, reward, done, action = agent.choose_action(state, env)
        agent.learn(state, action, reward, next_state)

        state = next_state
        if state == env.goal:
            break
        # print(state)
        print(env.render())
        # input()

S . . .
A . . .
. . . .
. . . G

None
S . . .
. . . .
A . . .
. . . G

None
S . . .
. . . .
. . . .
A . . G

None
S . . .
. . . .
. . . .
. A . G

None
S . . .
. . . .
. . . .
. . A G

None
