In [25]:
states = ['sI', 'sN', 'sE']
actions = ['aT', 'aA', 'aE']
gamma = 0.9  # discount applied to the value of the next state

# Transition probabilities P(s' | s, a).
# NOTE: keys are ordered (next_state, action, state) -- the destination state
# comes FIRST. This is the order used by every `.get((s_next, a, s), 0)`
# lookup in this notebook; triples that are not listed count as probability 0.
transition_probabilities = {
    ('sI', 'aT', 'sI'): 0.7,
    ('sN', 'aT', 'sI'): 0.3,
    ('sI', 'aA', 'sI'): 1.0,
    ('sE', 'aE', 'sI'): 1.0,
    ('sN', 'aT', 'sN'): 1.0,
    ('sI', 'aA', 'sN'): 0.5,
    ('sN', 'aA', 'sN'): 0.5,
    ('sE', 'aE', 'sN'): 1.0,
    ('sI', 'aT', 'sN'): 0.0,
    ('sE', 'aE', 'sE'): 1.0,
    ('sE', 'aT', 'sE'): 1.0,
    ('sE', 'aA', 'sE'): 1.0,
    ('sI', 'aT', 'sE'): 0.0,
    ('sN', 'aT', 'sE'): 0.0,
    ('sE', 'aT', 'sI'): 0.0
}

# Rewards R(s, a), keyed by (state, action).
rewards = {
    ('sI', 'aA'): 1.1,
    ('sI', 'aE'): 10.0,
    ('sI', 'aT'): 0.0,
    ('sN', 'aA'): 0.0,
    ('sN', 'aE'): 0.0,
    ('sN', 'aT'): -1.0,
    ('sE', 'aA'): 0.0,
    ('sE', 'aE'): 0.0,
    ('sE', 'aT'): 0.0,
}

# Sanity check: for every (state, action) pair the probabilities over all next
# states must sum to 1. This catches a key typed in the wrong order or a
# forgotten entry before it silently skews the computed utilities.
for _state in states:
    for _action in actions:
        _row_total = sum(
            transition_probabilities.get((_next_state, _action, _state), 0.0)
            for _next_state in states
        )
        assert abs(_row_total - 1.0) < 1e-9, (
            f"P(. | {_state}, {_action}) sums to {_row_total}, expected 1"
        )

In [36]:
import numpy as np

# Calculate utility for given policy
def calculate_utility(states, transition_probabilities, rewards, gamma, policy, tolerance=1e-6):
    """Estimate the utility of every state under a fixed policy.

    Repeats synchronous sweeps of the backup
    ``U(s) = sum_{s'} P(s' | s, pi(s)) * (R(s, pi(s)) + gamma * U(s'))``
    starting from all-zero utilities, until the largest per-state change in a
    sweep is below ``tolerance``.

    Args:
        states: Sequence of state labels (iterated several times).
        transition_probabilities: Mapping keyed by ``(next_state, action, state)``;
            missing keys are treated as probability 0.
        rewards: Mapping keyed by ``(state, action)``; missing keys are treated
            as reward 0.
        gamma: Discount factor multiplying the utility of the next state.
        policy: Mapping from each state to the action taken in it.
        tolerance: Convergence threshold on the largest change in one sweep.

    Returns:
        dict mapping each state to its utility from the most recent sweep.

    Raises:
        KeyError: If ``policy`` has no entry for one of ``states``.
    """
    value_function = {state: 0.0 for state in states}

    while True:
        delta = 0
        new_value_function = {}

        for state in states:
            action = policy[state]
            # The reward depends only on (state, action), so look it up once
            # rather than once per successor state.
            reward = rewards.get((state, action), 0)
            new_value = 0

            for next_state in states:
                transition_prob = transition_probabilities.get((next_state, action, state), 0)
                # Synchronous update: read only the previous sweep's values.
                new_value += transition_prob * (reward + gamma * value_function[next_state])

            new_value_function[state] = new_value

            # Track the maximum change in value to check for convergence
            delta = max(delta, abs(new_value - value_function[state]))

        # Adopt the sweep before testing for convergence so the caller gets the
        # freshest estimate rather than the one from the sweep before it.
        value_function = new_value_function

        if delta < tolerance:
            return value_function

# Calculate maximum utility
def calculate_max_utility(states, actions, transition_probabilities, rewards, gamma, tolerance=1e-6):
    """Run value iteration and return the resulting utility of each state.

    Utilities start at 0.0 and are overwritten in place, one state at a time,
    with the best one-step backup over ``actions``; a state updated earlier in
    a sweep is therefore already visible to states updated later in the same
    sweep. Sweeps repeat until the largest change in a sweep is below
    ``tolerance``.

    Args:
        states: Sequence of state labels.
        actions: Sequence of action labels considered in every state.
        transition_probabilities: Mapping keyed by ``(next_state, action, state)``;
            missing keys are treated as probability 0.
        rewards: Mapping keyed by ``(state, action)``; missing keys are treated
            as reward 0.
        gamma: Discount factor multiplying the utility of the next state.
        tolerance: Convergence threshold on the largest change in one sweep.

    Returns:
        dict mapping each state to its utility after the final sweep.
    """
    utilities = dict.fromkeys(states, 0.0)

    def backup(state, action):
        # Expected discounted return of taking `action` in `state`, reading the
        # live (partially updated) utilities table.
        immediate = rewards.get((state, action), 0)
        return sum(
            transition_probabilities.get((successor, action, state), 0)
            * (immediate + gamma * utilities[successor])
            for successor in states
        )

    converged = False
    while not converged:
        largest_change = 0
        for state in states:
            previous = utilities[state]
            utilities[state] = max(backup(state, action) for action in actions)
            largest_change = max(largest_change, abs(previous - utilities[state]))

        # Stop once no state moved by at least `tolerance` in this sweep.
        converged = largest_change < tolerance

    return utilities



In [None]:
# Fixed policy that plays aE in every state.
policyQ2 = dict(sI='aE', sN='aE', sE='aE')

utility = calculate_utility(
    states,
    transition_probabilities,
    rewards,
    gamma,
    policyQ2,
)

print("Utility of each state under the policy aE:")
for state, value in utility.items():
    print(f"U({state}) = {value:.2f}")

{('sI', 'aT', 'sI'): 0.7, ('sN', 'aT', 'sI'): 0.3, ('sI', 'aA', 'sI'): 1.0, ('sE', 'aE', 'sI'): 1.0, ('sN', 'aT', 'sN'): 1.0, ('sI', 'aA', 'sN'): 0.5, ('sN', 'aA', 'sN'): 0.5, ('sE', 'aE', 'sN'): 1.0, ('sI', 'aT', 'sN'): 0.0, ('sE', 'aE', 'sE'): 1.0, ('sE', 'aT', 'sE'): 1.0, ('sE', 'aA', 'sE'): 1.0, ('sI', 'aT', 'sE'): 0.0, ('sN', 'aT', 'sE'): 0.0, ('sE', 'aT', 'sI'): 0.0}
Utility of each state under the policy:
U(sI) = -7.30
U(sN) = -10.00
U(sE) = 0.00


In [33]:
# Fixed policy that plays aT in every state.
policyQ3 = dict(sI='aT', sN='aT', sE='aT')

utility = calculate_utility(
    states,
    transition_probabilities,
    rewards,
    gamma,
    policyQ3,
)

print("Utility of each state under the policy aT:")
for state, value in utility.items():
    print(f"U({state}) = {value:.2f}")

Utility of each state under the policy aT:
U(sI) = -7.30
U(sN) = -10.00
U(sE) = 0.00


In [34]:
# Fixed policy that plays aA in every state.
policyQ4 = dict(sI='aA', sN='aA', sE='aA')

utility = calculate_utility(
    states,
    transition_probabilities,
    rewards,
    gamma,
    policyQ4,
)

print("Utility of each state under the policy aA:")
for state, value in utility.items():
    print(f"U({state}) = {value:.2f}")

Utility of each state under the policy aA:
U(sI) = 11.00
U(sN) = 9.00
U(sE) = 0.00


In [39]:
# Q5
# Optimal state utilities obtained by value iteration over all actions.
max_utility = calculate_max_utility(states, actions, transition_probabilities, rewards, gamma)
print("Utility of each state (optimal):")
# Print max_utility itself; iterating `utility` here would show whatever
# fixed-policy result an earlier cell left behind instead of the optimum.
for state, value in max_utility.items():
    print(f"U({state}) = {value:.2f}")

Utility of each state (optimal):
U(sI) = 11.00
U(sN) = 9.00
U(sE) = 0.00


In [42]:
# Q6 / Q7
# Greedy policy extraction: in each state pick the action whose one-step
# lookahead (reward plus discounted max_utility of the successor) is largest.
policy = {}
for s in states:
    # Lookahead value of every action available from state s.
    action_values = {
        a: sum(
            transition_probabilities.get((s_next, a, s), 0)
            * (rewards.get((s, a), 0) + gamma * max_utility[s_next])
            for s_next in states
        )
        for a in actions
    }
    # On ties max() keeps the action that appears first in `actions`.
    policy[s] = max(actions, key=action_values.get)

print("Optimal Policy:")
for state, value in policy.items():
    print(f"pi({state}) = {value}")

Optimal Policy:
pi(sI) = aA
pi(sN) = aA
pi(sE) = aT
