In [1]:
import numpy as np

# States and actions
STATES = ["a", "b"]
ACTIONS = ["a", "b"]

# Reward table: REWARDS[state][next_state]
REWARDS = {
    "a": {"a": 0, "b": 7},
    "b": {"a": -5, "b": 0},
}

# Transition probabilities: TRANSITIONS[state][next_state]
TRANSITIONS = {
    "a": {"a": 0.1, "b": 0.9},
    "b": {"a": 0.9, "b": 0.1},
}

# Hyperparameters
gamma = 0.9     # discount factor
theta = 1e-6    # stop once the largest value change in a sweep is below this


def _q_value(state, action, values):
    """Return the expected one-step return of *action* in *state*.

    The return is the probability-weighted sum, over all successor states,
    of the immediate reward plus the discounted value of the successor as
    given by *values*.

    NOTE(review): TRANSITIONS and REWARDS are indexed by
    (state, next_state) only, so *action* does not influence the result.
    Every action therefore gets the same value in a given state, and the
    greedy policy below always resolves the tie to the first entry of
    ACTIONS. To obtain a meaningful policy the model tables need an action
    dimension -- confirm the intended MDP before relying on `policy`.

    Args:
        state: Current state; a key of TRANSITIONS and REWARDS.
        action: Candidate action (currently unused by the model, see note).
        values: Mapping from state to its current value estimate.

    Returns:
        The expected discounted return as a number.
    """
    # Explicit accumulation (rather than sum()) keeps plain left-to-right
    # float addition, so results do not depend on the interpreter's sum().
    expected = 0
    for next_state, prob in TRANSITIONS[state].items():
        expected += prob * (REWARDS[state][next_state] + gamma * values[next_state])
    return expected


# Initialise the value function
V = {s: 0.0 for s in STATES}

# Value iteration. V is updated in place, so states visited later in a sweep
# already see the new values of states visited earlier in the same sweep.
while True:
    delta = 0
    for state in STATES:
        previous = V[state]
        # Bellman optimality backup: best expected return over all actions
        V[state] = max(_q_value(state, action, V) for action in ACTIONS)
        delta = max(delta, abs(previous - V[state]))
    if delta < theta:
        break

# Extract the greedy policy with respect to the converged values.
# max() returns the first maximal action, so ties go to the earliest action.
policy = {
    state: max(ACTIONS, key=lambda action: _q_value(state, action, V))
    for state in STATES
}

# Show the results
print("Optimale Wertefunktion V(s):")
for state in STATES:
    print(f"V({state}) = {V[state]:.4f}")

print("\nOptimale Policy π(s):")
for state in STATES:
    print(f"Im Zustand {state} → beste Aktion: {policy[state]}")


Optimale Wertefunktion V(s):
V(a) = 12.1395
V(b) = 5.8605

Optimale Policy π(s):
Im Zustand a → beste Aktion: a
Im Zustand b → beste Aktion: a
