# Experiment 6

## Problem Statement:

Implement the Policy Iteration algorithm.

## Code

In [1]:
# importing required libraries
import numpy as np

In [7]:
# Size of the problem: 11 reachable cells and 4 possible moves.
num_states = 11
num_actions = 4

# Every (row, col) cell of the 3x4 grid except the blocked cell (1, 1).
states = [
    (row, col)
    for row in range(3)
    for col in range(4)
    if (row, col) != (1, 1)
]

# Initial deterministic policy, one action per state that has an entry.
# The two reward states (0, 3) and (1, 3) deliberately have none.
policy = {
    (0, 0): 'Right', (0, 1): 'Right', (0, 2): 'Right',
    (1, 0): 'Up', (1, 2): 'Up',
    (2, 0): 'Up', (2, 1): 'Right', (2, 2): 'Up', (2, 3): 'Left',
}

# Reward collected when a transition lands on the given state.
rewards = {(0, 3): 1, (1, 3): -1}

def transition_probabilities(state, action):
    """Return the outcomes of taking `action` in `state`.

    The dynamics are deterministic, so the result is always a one-element
    list holding a ``(next_state, 1.0)`` pair.

    Reads the module-level ``rewards`` and ``states``.

    Raises:
        ValueError: if `state` is not a reward state and `action` is not one
            of 'Up', 'Down', 'Left', 'Right'.
    """
    row, col = state

    # Reward states are absorbing: any action leaves the agent where it is.
    if state in rewards:
        return [(state, 1.0)]

    # Each move is clamped to rows 0..2 and columns 0..3, so stepping off the
    # edge of the grid yields the current cell again.
    moves = (
        ('Up', lambda: (max(row - 1, 0), col)),
        ('Down', lambda: (min(row + 1, 2), col)),
        ('Left', lambda: (row, max(col - 1, 0))),
        ('Right', lambda: (row, min(col + 1, 3))),
    )
    for name, step in moves:
        if action == name:
            target = step()
            break
    else:
        raise ValueError("Invalid action")

    # A target that is not a known state (the blocked cell) bounces the agent
    # back to where it started.
    landing = target if target in states else state
    return [(landing, 1.0)]

# Discount factor applied to the value of the successor state.
gamma = 0.9

def _value_matrix(V):
    """Lay the state values out on the smallest grid that holds every state."""
    n_rows = max(row for row, _ in V) + 1
    n_cols = max(col for _, col in V) + 1
    grid = np.zeros((n_rows, n_cols))
    for (row, col), value in V.items():
        grid[row][col] = value
    return grid


def policy_evaluation(policy, states, rewards, gamma, theta):
    """Iteratively evaluate `policy` and return its state-value estimates.

    Values start at 0 and are updated in place during each sweep over
    `states`, so later states in a sweep already see the new values of
    earlier ones. States without a (truthy) entry in `policy` keep their
    value. After every sweep that has not yet converged, the current values
    are printed as a matrix.

    Args:
        policy: mapping from state to the action taken in that state.
        states: iterable of ``(row, col)`` states to evaluate.
        rewards: mapping from state to the reward for landing on it;
            states that are missing count as 0.
        gamma: discount factor applied to the successor state's value.
        theta: convergence threshold; sweeping stops once the largest
            change in a sweep is strictly below it.

    Returns:
        A dict mapping every state to its estimated value.

    Raises:
        ValueError: if `theta` is not strictly positive. The stopping test is
            ``delta < theta`` with ``delta >= 0``, so such a threshold could
            never be met and the loop would not terminate.
    """
    if not theta > 0:
        raise ValueError("theta must be positive")

    V = {state: 0 for state in states}
    iteration = 0
    while True:
        delta = 0
        for state in states:
            v = V[state]
            action = policy.get(state)
            if action:
                transitions = transition_probabilities(state, action)
                V[state] = sum(
                    prob * (rewards.get(next_state, 0) + gamma * V.get(next_state, 0))
                    for next_state, prob in transitions
                )
            delta = max(delta, abs(v - V[state]))
        if delta < theta:
            break
        print(f"Iteration {iteration} - Value Matrix:")
        # The matrix shape follows the states actually present instead of a
        # fixed 3x4 grid, so other grid sizes print correctly.
        print(_value_matrix(V))
        iteration += 1
    return V

def policy_improvement(policy, V, states, rewards, gamma):
    """Make `policy` greedy with respect to `V`, in place.

    For every state that has a (truthy) entry in `policy`, the one-step
    lookahead value of each of the four moves is computed and the policy is
    set to a best move. When the current action is tied for best it is kept,
    so an equally good policy is not rewritten and not reported as changed.

    Args:
        policy: mapping from state to action; updated in place.
        V: mapping from state to its current value estimate; missing states
            count as 0.
        states: iterable of states to improve.
        rewards: mapping from state to the reward for landing on it;
            missing states count as 0.
        gamma: discount factor applied to the successor state's value.

    Returns:
        True if no state's action changed, False otherwise.
    """
    policy_stable = True
    for state in states:
        old_action = policy.get(state)
        if not old_action:
            # States without a policy entry are left untouched.
            continue

        action_values = {}
        for action in ('Up', 'Down', 'Left', 'Right'):
            transitions = transition_probabilities(state, action)
            action_values[action] = sum(
                prob * (rewards.get(next_state, 0) + gamma * V.get(next_state, 0))
                for next_state, prob in transitions
            )

        best_action = max(action_values, key=action_values.get)
        # max() returns the first maximal key, which ignores the current
        # action on ties. Prefer the current action when it is just as good,
        # otherwise an unchanged-quality policy would be flagged as unstable
        # and trigger another full evaluation round.
        if action_values.get(old_action) == action_values[best_action]:
            best_action = old_action

        policy[state] = best_action
        if old_action != best_action:
            policy_stable = False
    return policy_stable

# Convergence threshold handed to policy evaluation.
theta = 1e-4

# Policy iteration: evaluate the current policy, then improve it greedily,
# and repeat until an improvement pass leaves every action unchanged.
converged = False
while not converged:
    V = policy_evaluation(policy, states, rewards, gamma, theta)
    converged = policy_improvement(policy, V, states, rewards, gamma)

# Report the final policy and the value function it was evaluated to.
print("\nOptimal Policy:")
for state in policy:
    action = policy[state]
    print(f"State {state}: {action}")
print("\nOptimal Value Function:")
for state in V:
    value = V[state]
    print(f"State {state}: {value:.2f}")


Iteration 0 - Value Matrix:
[[0.    0.    1.    0.   ]
 [0.    0.    0.9   0.   ]
 [0.    0.    0.81  0.729]]
Iteration 1 - Value Matrix:
[[0.    0.9   1.    0.   ]
 [0.    0.    0.9   0.   ]
 [0.    0.729 0.81  0.729]]
Iteration 2 - Value Matrix:
[[0.81   0.9    1.     0.    ]
 [0.729  0.     0.9    0.    ]
 [0.6561 0.729  0.81   0.729 ]]

Optimal Policy:
State (0, 0): Right
State (0, 1): Right
State (0, 2): Right
State (1, 0): Up
State (1, 2): Up
State (2, 0): Up
State (2, 1): Right
State (2, 2): Up
State (2, 3): Left

Optimal Value Function:
State (0, 0): 0.81
State (0, 1): 0.90
State (0, 2): 1.00
State (0, 3): 0.00
State (1, 0): 0.73
State (1, 2): 0.90
State (1, 3): 0.00
State (2, 0): 0.66
State (2, 1): 0.73
State (2, 2): 0.81
State (2, 3): 0.73
