# In [None]:
import numpy as np
import pandas as pd

# 1. Environment Setup (Simplified 3x3 GridWorld)
# State indices run 0-8; action indices are 0(N), 1(S), 2(E), 3(W).
num_states, num_actions = 9, 4
gamma = 0.9  # Discount factor

# States in which an episode ends.
terminals = [3, 6]

# Rewards from data file: every state costs -0.04 except the two terminal
# states, which pay +1.0 (state 3) and -1.0 (state 6).
rewards = np.full(num_states, -0.04)
rewards[terminals] = [1.0, -1.0]

# Transition model.
# A stochastic GridWorld would use 80% intended move / 10% slip left /
# 10% slip right; this implementation is a deterministic stand-in instead.
def get_transition_probs(s, a):
    """Return the successors of state ``s`` as ``(next_state, prob)`` pairs.

    Terminal states have no successors, so an empty list is returned for
    them. Any other state moves to the next state index (wrapping around
    after the last one) with probability 1.0.

    NOTE(review): the action ``a`` is accepted but does not influence the
    result, so every action currently leads to the same successor.
    """
    if s not in terminals:
        successor = (s + 1) % num_states
        return [(successor, 1.0)]
    return []

# 2. Value Iteration Algorithm
def value_iteration(states, actions, gamma, theta=1e-4, *, state_rewards=None,
                    terminal_states=None, transition_fn=None):
    """Compute state values by in-place value iteration.

    Bellman backup applied to every non-terminal state:

        V(s) = R(s) + gamma * max_a sum_{s'} P(s' | s, a) * V(s')

    Terminal states have no successors, so their value is fixed at R(s).

    Args:
        states: Iterable of integer state indices to sweep over.
        actions: Iterable of actions available in every state.
        gamma: Discount factor.
        theta: Convergence threshold; iteration stops once the largest
            change in a sweep is below it.
        state_rewards: Reward per state, indexable by state index.
            Defaults to the module-level ``rewards``.
        terminal_states: Collection of terminal state indices.
            Defaults to the module-level ``terminals``.
        transition_fn: Callable ``(s, a) -> iterable of (next_state, prob)``.
            Defaults to the module-level ``get_transition_probs``.

    Returns:
        ``np.ndarray`` of state values, indexed by state.
    """
    R = rewards if state_rewards is None else state_rewards
    end_states = set(terminals if terminal_states is None else terminal_states)
    step = get_transition_probs if transition_fn is None else transition_fn

    # Materialise once so one-shot iterators survive repeated sweeps.
    sweep = list(states)
    moves = list(actions)

    V = np.zeros(len(R))
    # A terminal state's value is its own reward; leaving it at zero would
    # keep the goal/penalty rewards from ever reaching the other states.
    for s in end_states:
        V[s] = R[s]

    while True:
        delta = 0.0
        for s in sweep:
            if s in end_states:
                continue
            # Expected value of the successor for each action; 0.0 if the
            # state has no actions at all.
            best_future = max(
                (sum(p * V[s2] for s2, p in step(s, a)) for a in moves),
                default=0.0,
            )
            updated = R[s] + gamma * best_future
            delta = max(delta, abs(updated - V[s]))
            V[s] = updated

        if delta < theta:
            break
    return V

# 3. Execution
# Solve over every state/action index, then show the values to 3 decimals.
optimal_values = value_iteration(
    states=range(num_states),
    actions=range(num_actions),
    gamma=gamma,
)
print(f"Optimal State Values:\n{optimal_values.round(3)}")

# 4. Deriving Policy
def get_optimal_policy(V, gamma, *, actions=None, state_rewards=None,
                       terminal_states=None, transition_fn=None):
    """Return the greedy policy with respect to the state values ``V``.

    For each non-terminal state the chosen action maximises

        R(s) + gamma * sum_{s'} P(s' | s, a) * V(s')

    Ties go to the earliest action in ``actions``. Terminal states keep the
    placeholder action 0.

    Args:
        V: State values, indexable by state index.
        gamma: Discount factor.
        actions: Iterable of integer actions to choose from.
            Defaults to ``range(num_actions)``.
        state_rewards: Reward per state, indexable by state index.
            Defaults to the module-level ``rewards``.
        terminal_states: Collection of terminal state indices.
            Defaults to the module-level ``terminals``.
        transition_fn: Callable ``(s, a) -> iterable of (next_state, prob)``.
            Defaults to the module-level ``get_transition_probs``.

    Returns:
        Integer ``np.ndarray`` holding one action per state.
    """
    moves = list(range(num_actions) if actions is None else actions)
    R = rewards if state_rewards is None else state_rewards
    end_states = set(terminals if terminal_states is None else terminal_states)
    step = get_transition_probs if transition_fn is None else transition_fn

    policy = np.zeros(len(V), dtype=int)
    if not moves:
        # Nothing to choose from; argmax over an empty list would raise.
        return policy

    for s in range(len(V)):
        if s in end_states:
            continue
        # Score each action by its own expected successor value so the
        # argmax actually discriminates between actions.
        q_values = [
            R[s] + gamma * sum(p * V[s2] for s2, p in step(s, a))
            for a in moves
        ]
        policy[s] = moves[int(np.argmax(q_values))]
    return policy

# Read the greedy policy off the converged values and display it.
optimal_policy = get_optimal_policy(V=optimal_values, gamma=gamma)
print(f"Optimal Policy (Actions per State):\n{optimal_policy}")