In [None]:
import numpy as np

# Gridworld environment configuration
GRID_SIZE = 4  # The grid has GRID_SIZE x GRID_SIZE cells
GAMMA = 0.9    # Discount factor applied to future values

# Rewards (selected by the current cell, see the functions below)
REWARD = -1       # Used for every cell other than GOAL_STATE
GOAL_REWARD = 10  # Used when the current cell is GOAL_STATE

# Actions and goal cell
ACTIONS = [0, 1, 2, 3]  # Up, Right, Down, Left
GOAL_STATE = (0, 3)     # (row, col) of the goal cell

# Transition table: action -> {(row, col): probability}.
# The (row, col) keys are absolute cells used to index the value function;
# the table is looked up by action only, never by the current cell.
#
# NOTE(review): each key is listed exactly once here, with the value the
# mapping really holds at runtime. An earlier spelling repeated keys inside
# the literals (e.g. Up had "(0, 0): 0.9" followed later by "(0, 0): 0.0");
# Python keeps only the last value for a repeated key, so those earlier
# entries never took effect. As a result the Up and Right rows sum to 0.1
# and 0.9 rather than 1.0 -- confirm the intended probabilities.
TRANS_PROB = {
    0: {(0, 0): 0.0, (0, 1): 0.1, (1, 0): 0.0},  # Up
    1: {(0, 1): 0.1, (0, 2): 0.8, (1, 1): 0.0},  # Right
    2: {(1, 0): 0.0, (2, 0): 0.1, (1, 1): 0.9},  # Down
    3: {(0, 0): 0.0, (0, 1): 0.1, (1, 0): 0.9},  # Left
}

# Helper: build the starting value table
def initialize_value_function():
    """Return a new GRID_SIZE x GRID_SIZE array of zeros (one value per cell)."""
    grid_shape = (GRID_SIZE, GRID_SIZE)
    return np.zeros(grid_shape)

# Helper: one-step expected return of a cell under a policy
def calculate_expected_reward(state, value_function, policy):
    """Return the policy-weighted one-step backup for ``state``.

    For every action the immediate reward plus the GAMMA-discounted,
    probability-weighted value of the cells listed in TRANS_PROB[action] is
    computed; the per-action results are then weighted by
    ``policy[row, col, action]`` and summed.

    Args:
        state: ``(row, col)`` pair identifying the current cell.
        value_function: 2-D array indexed ``[row, col]`` with current values.
        policy: 3-D array indexed ``[row, col, action]`` with action weights.

    Returns:
        The accumulated expected value for ``state``.
    """
    row, col = state
    # The immediate reward depends only on the current cell, not the action.
    immediate = GOAL_REWARD if (row, col) == GOAL_STATE else REWARD

    total = 0
    for action in ACTIONS:
        # TRANS_PROB is keyed by action only; its entries name absolute cells.
        discounted_future = GAMMA * sum(
            prob * value_function[next_row, next_col]
            for (next_row, next_col), prob in TRANS_PROB[action].items()
        )
        total += policy[row, col, action] * (immediate + discounted_future)
    return total

# Policy Evaluation
def policy_evaluation(policy, value_function, num_iterations=1000, theta=1e-8):
    """Iteratively evaluate ``policy``, updating ``value_function`` in place.

    Each sweep visits every cell in row-major order and overwrites its value
    with ``calculate_expected_reward``; because the array is updated in place,
    cells later in a sweep already see the new values of earlier cells.

    Args:
        policy: 3-D array indexed ``[row, col, action]``.
        value_function: 2-D array indexed ``[row, col]``; modified in place.
        num_iterations: Maximum number of sweeps.
        theta: Stop once the largest absolute change in a sweep is below this.

    Returns:
        The same ``value_function`` array that was passed in.
    """
    for _ in range(num_iterations):
        largest_change = 0
        # np.ndindex walks (row, col) pairs in the same order as nested loops.
        for cell in np.ndindex(GRID_SIZE, GRID_SIZE):
            previous = value_function[cell]
            value_function[cell] = calculate_expected_reward(cell, value_function, policy)
            largest_change = max(largest_change, abs(previous - value_function[cell]))
        if largest_change < theta:
            break
    return value_function

# Policy Improvement
def policy_improvement(value_function):
    """Build the greedy one-hot policy with respect to ``value_function``.

    For each cell, every action is scored as the immediate reward plus the
    GAMMA-discounted, probability-weighted value of the cells listed in
    TRANS_PROB[action]. The position of the highest score (``np.argmax``,
    which returns the first maximum on ties) gets weight 1.0; all other
    entries for that cell stay 0.

    Args:
        value_function: 2-D array indexed ``[row, col]``.

    Returns:
        A new array of shape ``(GRID_SIZE, GRID_SIZE, len(ACTIONS))``.
    """
    greedy_policy = np.zeros((GRID_SIZE, GRID_SIZE, len(ACTIONS)))
    for row in range(GRID_SIZE):
        for col in range(GRID_SIZE):
            # The immediate reward is chosen by the current cell only.
            immediate = GOAL_REWARD if (row, col) == GOAL_STATE else REWARD
            scores = [
                immediate + GAMMA * sum(
                    prob * value_function[next_row, next_col]
                    for (next_row, next_col), prob in TRANS_PROB[action].items()
                )
                for action in ACTIONS
            ]
            greedy_policy[row, col, np.argmax(scores)] = 1.0
    return greedy_policy

# Policy Iteration
def policy_iteration(num_iterations=1000):
    """Alternate policy evaluation and greedy improvement until stable.

    Starts from an all-zero value table and a uniform policy (equal weight on
    every action in every cell). Stops as soon as an improvement step returns
    a policy equal to the current one, or after ``num_iterations`` rounds.

    Args:
        num_iterations: Maximum number of evaluate/improve rounds.

    Returns:
        A ``(value_function, policy)`` tuple from the last round.
    """
    values = initialize_value_function()
    n_actions = len(ACTIONS)
    # Uniform starting policy: every action equally weighted in every cell.
    current_policy = np.ones((GRID_SIZE, GRID_SIZE, n_actions)) / n_actions

    for _ in range(num_iterations):
        values = policy_evaluation(current_policy, values)
        candidate_policy = policy_improvement(values)

        policy_is_stable = np.array_equal(current_policy, candidate_policy)
        if policy_is_stable:
            break

        current_policy = candidate_policy

    return values, current_policy

# Example usage: run policy iteration and show the resulting tables
value_function, optimal_policy = policy_iteration()
# sep="\n" puts each heading on its own line directly above its array.
print("Value Function:", value_function, sep="\n")
print("\nOptimal Policy:", optimal_policy, sep="\n")

Value Function:
[[-1.0989011 -1.0989011 -1.0989011  9.9010989]
 [-1.0989011 -1.0989011 -1.0989011 -1.0989011]
 [-1.0989011 -1.0989011 -1.0989011 -1.0989011]
 [-1.0989011 -1.0989011 -1.0989011 -1.0989011]]

Optimal Policy:
[[[1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]]

 [[1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]]

 [[1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]]

 [[1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]]]
