In [1]:
import numpy as np
import random

###############################################################################
# 1) Maze Definition and Padding
###############################################################################
world = [
    "wwwwwwwwwwwwwwwwwwwww",
    "wa         o      o w",
    "w www www wwwww www w",
    "w o                w",
    "w o www ooo ooo www w",
    "w o   o ooo ooo o   w",
    "w www wwwww wwwww www",
    "w      o     o      w",
    "www www ooo ooo wwwww",
    "w   o ooooooo o o   w",
    "w www ooooooo o www w",
    "w    o   ooo   o    w",
    "w wwwww www wwwww www",
    "w o     o o     o   w",
    "w o wwwww wwwww o www",
    "w o               o w",
    "w www www www www www",
    "w      o   g   o    w",
    "wwwwwwwwwwwwwwwwwwwww"
]


###############################################################################
# 2) Rewards Definition
###############################################################################
rewards = {
    'w': -5,  
    'o': -20, 
    'g': 80,
    ' ': -1,
    'a': -1
}

max_len = max(len(row) for row in world)
maze = [list(row.ljust(max_len)) for row in world]

###############################################################################
# 3) GridWorld Environment (Dictionary-based approach)
###############################################################################
class GridWorld:
    def __init__(self, grid, rewards):
        self.grid = np.array(grid)
        self.rewards = rewards
        self.actions = ['up', 'down', 'left', 'right']
        self.done = False
        
        # Find 'g' (goal) and 'a' (agent start)
        self.goal = None
        self.start = None
        for i in range(self.grid.shape[0]):
            for j in range(self.grid.shape[1]):
                if self.grid[i, j] == 'g':
                    self.goal = (i, j)
                if self.grid[i, j] == 'a':
                    self.start = (i, j)
        if self.start is None:
            self.start = (0, 0)
        
        self.state = self.start

    def reset(self):
        self.state = self.start
        self.done = False
        return self.state

    def step(self, action):
        """
        Actions: 'up', 'down', 'left', 'right'
        Returns (next_state, reward, done).
        """
        if self.done:
            return self.state, 0, True
        
        i, j = self.state
        if action == 'up':
            i -= 1
        elif action == 'down':
            i += 1
        elif action == 'left':
            j -= 1
        elif action == 'right':
            j += 1
        
        # Out of bounds => treat as wall
        if i < 0 or i >= self.grid.shape[0] or j < 0 or j >= self.grid.shape[1]:
            return self.state, self.rewards['w'], True
        
        # If cell is a wall => penalty, episode ends
        if self.grid[i, j] == 'w':
            return self.state, self.rewards['w'], True
        
        self.state = (i, j)
        
        # If reached the goal => big reward, done
        if self.state == self.goal:
            self.done = True
            return self.state, self.rewards['g'], True
        
        reward = self.rewards.get(self.grid[i, j], -1)
        return self.state, reward, False

    def get_all_states(self):
        """All valid states except walls."""
        states = []
        for i in range(self.grid.shape[0]):
            for j in range(self.grid.shape[1]):
                if self.grid[i, j] != 'w':
                    states.append((i, j))
        return states
    
    def get_possible_actions(self, state):
        """No actions if it's the goal; otherwise up/down/left/right."""
        if state == self.goal:
            return []
        return self.actions

###############################################################################
# 4) Value Iteration (Dictionary-based V)
###############################################################################
def value_iteration(env, gamma=0.9, theta=1e-6, max_iter=1000):
    states = env.get_all_states()
    V = {s: 0.0 for s in states}
    
    for _ in range(max_iter):
        delta = 0
        for s in states:
            old_val = V[s]
            best_val = float('-inf')
            acts = env.get_possible_actions(s)
            if not acts:
                continue
            for a in acts:
                saved_state = env.state
                env.state = s
                s_next, reward, done = env.step(a)
                env.state = saved_state
                
                candidate = reward + (0 if done else gamma * V[s_next])
                best_val = max(best_val, candidate)
            V[s] = best_val
            delta = max(delta, abs(old_val - best_val))
        if delta < theta:
            break
    
    policy = {}
    for s in states:
        actions = env.get_possible_actions(s)
        if not actions:
            policy[s] = None
            continue
        best_a = None
        best_val = float('-inf')
        for a in actions:
            saved_state = env.state
            env.state = s
            s_next, reward, done = env.step(a)
            env.state = saved_state
            
            candidate = reward + (0 if done else gamma * V[s_next])
            if candidate > best_val:
                best_val = candidate
                best_a = a
        policy[s] = best_a
    
    return V, policy

###############################################################################
# 5) Q-Learning (Dictionary-based)
###############################################################################
def q_learning(env, episodes=1000, alpha=0.1, gamma=0.9, epsilon=0.1, max_steps=1000):
    states = env.get_all_states()
    Q = {s: {a: 0.0 for a in env.get_possible_actions(s)} for s in states}
    
    for _ in range(episodes):
        s = env.reset()
        for _ in range(max_steps):
            actions = env.get_possible_actions(s)
            if not actions:
                break
            if random.random() < epsilon:
                a = random.choice(actions)
            else:
                a = max(Q[s], key=Q[s].get)
            
            s_next, reward, done = env.step(a)
            
            if s_next in Q and env.get_possible_actions(s_next):
                max_q_next = max(Q[s_next].values())
            else:
                max_q_next = 0.0
            
            old_q = Q[s][a]
            Q[s][a] = old_q + alpha * (reward + gamma * max_q_next - old_q)
            
            s = s_next
            if done:
                break
    
    policy = {}
    for s, actions_dict in Q.items():
        if len(actions_dict) == 0:
            policy[s] = None
        else:
            policy[s] = max(actions_dict, key=actions_dict.get)
    return Q, policy

###############################################################################
# 6) Example Usage
###############################################################################
if __name__ == "__main__":
    env = GridWorld(maze, rewards)
    
    V_vi, policy_vi = value_iteration(env, gamma=0.9, theta=1e-6, max_iter=1000)
    print("=== Value Iteration ===")
    print("Value at (1,1):", V_vi.get((1,1), None))
    print("Policy at (1,1):", policy_vi.get((1,1), None))
    
    Q, policy_q = q_learning(env, episodes=1000, alpha=0.1, gamma=0.9, epsilon=0.1, max_steps=1000)
    print("\n=== Q-Learning ===")
    print("Q for (1,1):", Q.get((1,1), None))
    print("Policy for (1,1):", policy_q.get((1,1), None))


=== Value Iteration ===
Value at (1,1): 0
Policy at (1,1): up

=== Q-Learning ===
Q for (1,1): {'up': -10.157689560370985, 'down': -7.24905528249796, 'left': -10.011283110739981, 'right': -7.250514297359016}
Policy for (1,1): down
