# Experiment 7

## Problem Statement:

Write a python program to implement value iteration in dynamic programming.

## Code

In [1]:
# importing required libraries
import numpy as np

In [2]:

def compute_state_value(state, policy, V, gamma, transition_model):
    new_v = 0
    for next_state, reward, probability in transition_model.get((state, policy[state]), []):
        new_v += probability * (reward + gamma * V[next_state])
    return new_v

In [3]:
def policy_evaluation(V, policy, gamma, transition_model):
    while True:
        delta = 0
        for state in V:
            v = V[state]
            new_v = compute_state_value(state, policy, V, gamma, transition_model)
            V[state] = new_v
            delta = max(delta, abs(v - new_v))
        if delta < 1e-6:
            break


In [4]:

def policy_improvement(V, policy, gamma, transition_model):
    policy_stable = True
    for state in V:
        old_action = policy[state]
        action_values = {}
        for action in transition_model.keys():
            action_values[action] = compute_state_value(state, {state: action}, V, gamma, transition_model)
        best_action = max(action_values, key=action_values.get)
        policy[state] = best_action
        if old_action != best_action:
            policy_stable = False
    return policy_stable

In [5]:
if __name__ == "__main__":
    transition_model = {
        (0, 'A'): [(0, 10, 0.8), (1, -10, 0.2)],
        (0, 'B'): [(1, 0, 1.0)],
        (1, 'A'): [(0, 0, 1.0)],
        (1, 'B'): [(1, 0, 1.0)]
    }

    policy = {0: 'A', 1: 'A'}
    V = {0: 0, 1: 0}
    gamma = 0.9

    while True:
        policy_evaluation(V, policy, gamma, transition_model)
        policy_stable = policy_improvement(V, policy, gamma, transition_model)
        if policy_stable:
            break

    print("Optimal Policy:")
    for state, action in policy.items():
        print(f"State {state}: Action {action}")



Optimal Policy:
State 0: Action (0, 'A')
State 1: Action (0, 'A')
