# In [None]:
import numpy as np

# Environment parameters
gamma = 0.9        # discount factor applied to the successor state's value
noise = 0.2        # total probability of slipping; split evenly over the two perpendicular moves
living_reward = 0  # reward for a step that does not end in a terminal cell

# Grid dimensions
rows, cols = 3, 4

# Define rewards grid
rewards = np.zeros((rows, cols))
rewards[0, 3] = +1
rewards[1, 3] = -1
# Wall: use an explicit NaN marker. Assigning None to a float array only works
# through NumPy's implicit None -> NaN coercion, which hides the intent.
rewards[1, 1] = np.nan

# Terminal and wall states, as (row, col) pairs.
# Kept as lists because the policy initialisation concatenates them with `+`.
terminal_states = [(0, 3), (1, 3)]
wall_states = [(1, 1)]

# Actions and their (row delta, col delta) vectors; row 0 is the top row,
# so 'U' decreases the row index.
actions = ['U', 'D', 'L', 'R']
action_vectors = {
    'U': (-1, 0),
    'D': (1, 0),
    'L': (0, -1),
    'R': (0, 1),
}

# Utility function to check whether a cell can be occupied
def in_bounds(state):
    """Return True when `state` is on the grid and is not a wall cell.

    `state` is a (row, col) pair; the grid size comes from the module-level
    `rows` / `cols`, and blocked cells from `wall_states`.
    """
    row, col = state
    if not (0 <= row < rows):
        return False
    if not (0 <= col < cols):
        return False
    return (row, col) not in wall_states

# Expected one-step return of an action, given the current value estimates
def compute_action_value(state, action, V):
    """Return the expected value of taking `action` in `state` under `V`.

    The intended move happens with probability ``1 - noise``; each of the two
    perpendicular moves happens with probability ``noise / 2``. A move that
    would leave the grid or enter a wall leaves the agent where it is.
    Landing on a terminal cell pays that cell's entry in `rewards`; any other
    landing pays `living_reward`. Each outcome contributes
    ``prob * (reward + gamma * V[landing cell])``.
    """
    row, col = state
    intended_step = action_vectors[action]

    # Slips are perpendicular to the intended direction.
    slip_actions = ('L', 'R') if action in ('U', 'D') else ('U', 'D')

    outcomes = [(intended_step, 1 - noise)]
    for slip in slip_actions:
        outcomes.append((action_vectors[slip], noise / 2))

    expected = 0
    for (d_row, d_col), probability in outcomes:
        target = (row + d_row, col + d_col)
        if not in_bounds(target):
            # Blocked move: the agent stays in its current cell.
            target = (row, col)

        if target in terminal_states:
            step_reward = rewards[target]
        else:
            step_reward = living_reward

        expected += probability * (step_reward + gamma * V[target])
    return expected

# Initialize the policy deterministically to "always move up".
# Terminal and wall cells carry no action.
policy = np.full((rows, cols), 'U', dtype=object)
for r, c in terminal_states + wall_states:
    policy[r, c] = None

# Initialize value function
V = np.zeros((rows, cols))

# Convergence threshold for policy evaluation. It is also the minimum gain an
# alternative action must offer before the policy switches to it.
tolerance = 1e-4

# Policy Iteration
is_policy_stable = False
iteration = 0  # number of evaluation + improvement rounds performed

while not is_policy_stable:
    iteration += 1

    # Policy Evaluation: sweep all non-terminal, non-wall cells, computing the
    # new values from the previous sweep's V, until the largest change in a
    # sweep drops below the tolerance. V carries over between rounds.
    while True:
        delta = 0
        new_V = np.copy(V)
        for r in range(rows):
            for c in range(cols):
                if (r, c) in terminal_states or (r, c) in wall_states:
                    continue
                v = compute_action_value((r, c), policy[r, c], V)
                new_V[r, c] = v
                delta = max(delta, abs(v - V[r, c]))
        V = new_V
        if delta < tolerance:
            break

    # Policy Improvement: move to the greedy action only where it beats the
    # current action by more than the tolerance. Comparing action labels alone
    # would let actions that are tied up to numerical noise flip the policy
    # back and forth, so the outer loop might never terminate.
    is_policy_stable = True
    for r in range(rows):
        for c in range(cols):
            if (r, c) in terminal_states or (r, c) in wall_states:
                continue
            old_action = policy[r, c]
            action_values = [compute_action_value((r, c), a, V) for a in actions]
            best_index = int(np.argmax(action_values))
            current_value = action_values[actions.index(old_action)]
            if action_values[best_index] - current_value > tolerance:
                policy[r, c] = actions[best_index]
                is_policy_stable = False

import pandas as pd
import ace_tools as tools

# Show the final value table through the ace_tools display helper.
tools.display_dataframe_to_user(
    name="Policy Iteration - Value Function",
    dataframe=pd.DataFrame(V),
)

# Show the final policy table (None marks terminal and wall cells).
policy_display = pd.DataFrame(policy)
tools.display_dataframe_to_user(
    name="Policy Iteration - Policy",
    dataframe=policy_display,
)
