In [None]:
import numpy as np
import gymnasium as gym
from minigrid.wrappers import FullyObsWrapper
from copy import deepcopy
from collections import deque
import hashlib
import json


def build_minigrid_model(env):
    """
    Enumerate all reachable fully-observable states in MiniGrid,
    then build P[s,a,s'] and R[s,a].

    The environment is wrapped in ``FullyObsWrapper``, reset with ``seed=0``
    and explored breadth-first: every action is simulated once from a
    deep-copied snapshot of ``env.unwrapped.__dict__``.  Each transition is
    therefore recorded as deterministic.

    Terminal handling: a step that reports ``terminated`` leads to its real
    successor state, and that successor becomes an absorbing state (every
    action loops back to it with zero reward).  Terminal states are never
    expanded.  ``truncated`` is deliberately ignored, because a time limit is
    not part of the observed state and must not turn a state into a terminal
    one.

    Args:
        env: a MiniGrid (gymnasium) environment; it is wrapped here, so the
            caller does not need to apply ``FullyObsWrapper`` first.

    Returns:
        P: float32 array of shape (nS, nA, nS) with P[s, a, s'] in {0, 1}.
        R: float32 array of shape (nS, nA), the reward observed for (s, a).
        state_dicts: list mapping state index -> snapshot of
            ``env.unwrapped.__dict__`` taken when the state was first seen.
    """
    # wrap to get full grid observation (no partial obs)
    env = FullyObsWrapper(env)

    # helper to turn an observation into a hashable key
    def obs_key(o):
        if isinstance(o, dict):
            # For dictionary observations, the 'image' entry is the grid
            if 'image' in o:
                return o['image'].tobytes()
            # No 'image' key: hash every entry in key order.  Arrays are
            # hashed by their raw bytes because str()/repr() of a large
            # array is truncated and could make distinct states collide.
            digest = hashlib.md5()
            for name in sorted(o, key=str):
                value = o[name]
                digest.update(str(name).encode())
                if isinstance(value, np.ndarray):
                    digest.update(value.tobytes())
                else:
                    digest.update(repr(value).encode())
            return digest.hexdigest()
        # For array observations (older versions)
        return o.tobytes()

    # BFS bookkeeping
    state_dicts = []           # index -> env.__dict__ snapshot
    obs_to_idx = {}            # obs_key -> index
    transitions = {}           # (s, a) -> (s', r)
    terminal = set()           # indices of states entered by a terminating step

    # init
    obs, _ = env.reset(seed=0)
    obs_to_idx[obs_key(obs)] = 0
    state_dicts.append(deepcopy(env.unwrapped.__dict__))
    queue = deque([0])

    nA = env.action_space.n

    while queue:
        s = queue.popleft()
        if s in terminal:
            # the episode is over here; there is nothing to simulate
            continue

        for a in range(nA):
            # restore the snapshot so every action starts from state s
            env.unwrapped.__dict__.update(deepcopy(state_dicts[s]))
            # NOTE(review): if the env's reward depends on elapsed steps, r
            # reflects the step counter stored in the snapshot - confirm
            # that this is acceptable for the intended use.
            obs2, r, terminated, truncated, _ = env.step(a)

            key2 = obs_key(obs2)
            if key2 not in obs_to_idx:
                obs_to_idx[key2] = len(state_dicts)
                state_dicts.append(deepcopy(env.unwrapped.__dict__))
                queue.append(obs_to_idx[key2])

            s2 = obs_to_idx[key2]
            if terminated:
                terminal.add(s2)
            transitions[(s, a)] = (s2, r)

    nS = len(state_dicts)

    # build P and R arrays
    P = np.zeros((nS, nA, nS), dtype=np.float32)
    R = np.zeros((nS, nA), dtype=np.float32)
    for (s, a), (s2, r) in transitions.items():
        if s in terminal:
            # s was expanded before it was found to be terminal; discard
            continue
        P[s, a, s2] = 1.0
        R[s, a] = r

    # terminal states are absorbing with zero reward, so the reward for
    # reaching them is collected exactly once
    for t in terminal:
        P[t, :, t] = 1.0

    return P, R, state_dicts



In [None]:
def solve_policy_linear_minigrid(env, gamma=0.9):
    """
    Exact policy evaluation for the uniform random policy.

    Builds the tabular model with ``build_minigrid_model`` and solves the
    Bellman system (I - gamma * P^pi) v = R^pi with a direct linear solve.

    Args:
        env: MiniGrid environment handed to ``build_minigrid_model``.
        gamma: discount factor.

    Returns:
        (v, state_dicts): the value vector of shape (nS,) and the state
        snapshots returned by ``build_minigrid_model``.
    """
    P, R, state_dicts = build_minigrid_model(env)
    n_states, n_actions = R.shape

    # every action is equally likely in every state
    uniform = np.full((n_states, n_actions), 1.0 / n_actions)

    # marginalise the action out of the transition tensor and the rewards
    transition_pi = np.einsum('sa,sat->st', uniform, P)   # (nS, nS)
    reward_pi = np.sum(uniform * R, axis=1)               # (nS,)

    # solve the linear Bellman system directly
    system = np.eye(n_states) - gamma * transition_pi
    return np.linalg.solve(system, reward_pi), state_dicts

if __name__ == "__main__":
    # Exact (linear-solve) evaluation of the random policy on the empty
    # 5x5 room.  gamma is kept below 1 (0.99 rather than 1.0).
    env = gym.make("MiniGrid-Empty-5x5-v0")
    v, states = solve_policy_linear_minigrid(env, gamma=0.99)

    # One line per enumerated state, labelled by agent position / direction
    print("State-value function for each (x,y,dir):\n")
    for idx, sdict in enumerate(states):
        pos, d = sdict['agent_pos'], sdict['agent_dir']
        print(f"  s={idx:2d}, pos={pos}, dir={d}:  V = {v[idx]:6.3f}")


In [None]:
import numpy as np
import gymnasium as gym
from minigrid.wrappers import FullyObsWrapper

def policy_eval_iterative(env, policy, discount_factor=0.99, epsilon=0.00001):
    """
    Iterative policy evaluation with in-place sweeps over all states.

    Args:
        env: MiniGrid environment; its tabular model is built internally
            with ``build_minigrid_model``.
        policy: [S, A] shaped matrix, policy[s][a] = probability of a in s.
        discount_factor: Gamma discount factor.
        epsilon: A sweep whose largest value change is below this threshold
            ends the evaluation.

    Returns:
        Vector of length S representing the value function.
    """
    P, R, _ = build_minigrid_model(env)
    n_states, _ = R.shape

    # value estimates start at zero and are overwritten in place, so later
    # states in a sweep already see the refreshed values of earlier ones
    V = np.zeros(n_states)

    converged = False
    while not converged:
        largest_change = 0
        for s in range(n_states):
            backup = 0
            for a, action_prob in enumerate(policy[s]):
                # only successors with positive probability contribute
                for s_next in np.flatnonzero(P[s, a] > 0):
                    backup += action_prob * P[s, a, s_next] * (R[s, a] + discount_factor * V[s_next])
            largest_change = max(largest_change, np.abs(backup - V[s]))
            V[s] = backup
        # stop once no state moved by epsilon or more during the sweep
        converged = largest_change < epsilon

    return V

# Fully observable empty 5x5 room
env = FullyObsWrapper(gym.make("MiniGrid-Empty-5x5-v0"))

# The model is built here only to size the policy and to label the
# printout; policy_eval_iterative builds its own model internally.
P, R, state_dicts = build_minigrid_model(env)
nS, nA = R.shape

# Uniform random policy: each action has probability 1/nA in every state
random_policy = np.full((nS, nA), 1.0 / nA)

# Evaluate the policy
v = policy_eval_iterative(env, random_policy, discount_factor=0.99)

# Report one line per enumerated state
print("State-value function using iterative policy evaluation:\n")
for idx, sdict in enumerate(state_dicts):
    pos, d = sdict['agent_pos'], sdict['agent_dir']
    print(f"  s={idx:2d}, pos={pos}, dir={d}:  V = {v[idx]:6.3f}")


In [None]:
import torch
import ray
import numpy as np
import gymnasium as gym
from minigrid.wrappers import FullyObsWrapper

# Start Ray (cores are auto-detected).  ignore_reinit_error keeps this cell
# re-runnable: without it a second execution in the same kernel raises
# because Ray is already initialised.
ray.init(ignore_reinit_error=True)

@ray.remote
def eval_state(s, V_old, policy_s, P, R, gamma, epsilon):
    """
    One Bellman expectation backup for a single state, run as a Ray task.

    s: index of the state being backed up
    V_old: torch tensor (nS,) holding the previous sweep's values
    policy_s: torch tensor (nA,) of action probabilities for state s
    P: Transition probability array of shape (nS, nA, nS)
    R: Reward array of shape (nS, nA)
    gamma: discount factor
    epsilon: accepted for signature compatibility; not used in the backup

    Returns the new value of state s as a Python float.
    """
    total = 0.0
    for a, action_prob in enumerate(policy_s):
        for s_next in range(len(V_old)):
            p = P[s, a, s_next]
            # skip successors that cannot be reached
            if not p > 0:
                continue
            total += action_prob * p * (R[s, a] + gamma * V_old[s_next])
    return float(total)

def policy_eval_ray_minigrid(env, policy, gamma=0.99, epsilon=1e-5):
    """
    Policy evaluation using Ray for parallel computation in MiniGrid.

    Every sweep launches one ``eval_state`` task per state; all tasks read
    the previous sweep's values, so each sweep is a synchronous backup.

    Args:
        env: MiniGrid environment; its tabular model is built internally
            with ``build_minigrid_model``.
        policy: [S, A] array of action probabilities.
        gamma: discount factor.
        epsilon: stop once the largest per-state change in a sweep is below
            this threshold.

    Returns:
        torch float32 tensor of shape (nS,) with the state values.
    """
    # Build model to get transitions and rewards
    P, R, _ = build_minigrid_model(env)
    nS = R.shape[0]

    V_old = torch.zeros(nS, dtype=torch.float32)
    policy_t = torch.tensor(policy, dtype=torch.float32)

    # Store the model in the object store once and hand out references,
    # instead of passing the full arrays by value to every task.
    P_ref = ray.put(P)
    R_ref = ray.put(R)

    while True:
        # the value vector changes each sweep, so it is shared once per sweep
        V_ref = ray.put(V_old)

        # launch one task per state
        futures = [
            eval_state.remote(s, V_ref, policy_t[s], P_ref, R_ref, gamma, epsilon)
            for s in range(nS)
        ]
        # gather all new V's (ray.get preserves the order of `futures`)
        V_new = torch.tensor(ray.get(futures), dtype=torch.float32)

        # check convergence
        if torch.max(torch.abs(V_new - V_old)) < epsilon:
            break
        V_old = V_new

    return V_new

if __name__ == "__main__":
    # Fully observable empty 5x5 room
    env = FullyObsWrapper(gym.make("MiniGrid-Empty-5x5-v0"))

    # The model is built here only to size the policy and to label the
    # printout; policy_eval_ray_minigrid builds its own model internally.
    P, R, state_dicts = build_minigrid_model(env)
    nS, nA = R.shape

    # uniform random policy: probability 1/nA for every action
    random_policy = np.full((nS, nA), 1.0 / nA)

    # evaluate policy using Ray parallel computation
    v = policy_eval_ray_minigrid(env, random_policy, gamma=0.99)

    # Print results in the same format as cell 1
    print("State-value function using Ray parallel evaluation:\n")
    for idx, sdict in enumerate(state_dicts):
        pos, d = sdict['agent_pos'], sdict['agent_dir']
        print(f"  s={idx:2d}, pos={pos}, dir={d}:  V = {v[idx]:6.3f}")
