In [None]:
import Sokoban_env
from Sokoban_env import Sokoban_v2
import numpy as np
import matplotlib.pyplot as plt
from IPython import display
import rl_algorithms as rl

In [None]:
# Create the Sokoban environment for the "level_1" map and call reset() on it
# before rendering.
env = Sokoban_v2(map_name="level_1")
env.reset()
# presumably render() returns an image array that imshow can draw — TODO confirm
plt.imshow(env.render())

In [None]:
# Display whatever available_states returns as this cell's output.
# NOTE(review): env is passed explicitly to its own bound method, so the method
# receives it in addition to self — confirm the signature expects that argument.
env.available_states(env)

In [None]:
# Keep the result of create_transition_pos() in a notebook-level name and print
# it for inspection.
# NOTE(review): the solver functions in this notebook read env.P and env.R, not
# this local P — confirm that this call (or the constructor) populates those
# attributes on env.
P = env.create_transition_pos()
print(P)

In [None]:
def value_iteration(env, num_iterations=100, threshold=1e-20, gamma=1.0):
    """Estimate optimal state values with synchronous value iteration.

    The model is read from ``env``:

    * ``env.observation_space.n`` / ``env.action_space.n`` give the number of
      states and actions.
    * ``env.P`` maps ``(state, action)`` to an iterable of
      ``(next_state, prob)`` pairs; a missing key means "no transitions".
    * ``env.R`` maps ``(state, action, next_state)`` to a reward; a missing
      key means a reward of ``0.0``.

    Args:
        env: Object exposing the attributes described above.
        num_iterations: Maximum number of full sweeps over the state space.
        threshold: Stop early once the summed absolute change of the value
            table over one sweep is at most this number.
        gamma: Discount factor applied to the successor state's value.

    Returns:
        A 1-D NumPy array with one value per state.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    value_table = np.zeros(n_states)

    def backup(state, action, values):
        # Expected one-step return of taking `action` in `state`, bootstrapping
        # from `values`.
        total = 0.0
        for next_state, prob in env.P.get((state, action), []):
            reward = env.R.get((state, action, next_state), 0.0)
            total += prob * (reward + gamma * values[next_state])
        return total

    for _ in range(num_iterations):
        # Every backup in this sweep reads the snapshot, never the values
        # already overwritten during the same sweep.
        previous = value_table.copy()

        for state in range(n_states):
            value_table[state] = max(
                [backup(state, action, previous) for action in range(n_actions)]
            )

        total_change = np.fabs(previous - value_table).sum()
        if total_change <= threshold:
            break

    return value_table


def extract_policy(value_table, env, gamma=1.0):
    """Derive the greedy policy implied by a state-value table.

    For every state, the one-step lookahead value of each action is computed
    from ``env.P`` / ``env.R`` and the action with the largest value is chosen
    (``np.argmax`` picks the lowest-numbered action on ties).

    Args:
        value_table: Indexable of state values, one entry per state.
        env: Object exposing ``observation_space.n``, ``action_space.n``,
            ``P`` (dict mapping ``(state, action)`` to an iterable of
            ``(next_state, prob)`` pairs) and ``R`` (dict mapping
            ``(state, action, next_state)`` to a reward). Missing keys are
            treated as "no transitions" and a reward of ``0.0`` respectively.
        gamma: Discount factor applied to the successor state's value. It
            should match the discount used to compute ``value_table``;
            defaults to ``1.0``, the value that was previously hard-coded.

    Returns:
        An integer NumPy array holding the greedy action index for each state.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    policy = np.zeros(n_states, dtype=int)

    for state in range(n_states):
        Q_values = []
        for action in range(n_actions):
            # Unknown (state, action) pairs have no transitions, so their
            # lookahead value stays 0.0.
            q_value = 0.0
            for next_state, prob in env.P.get((state, action), []):
                reward = env.R.get((state, action, next_state), 0.0)
                q_value += prob * (reward + gamma * value_table[next_state])
            Q_values.append(q_value)

        policy[state] = np.argmax(Q_values)

    return policy


In [None]:
# Run value iteration with its default settings, then derive the greedy policy
# from the resulting value table and print it.
optimal_value_function = value_iteration(env)
optimal_policy = extract_policy(optimal_value_function,env)
print(optimal_policy)

In [None]:
def policy_iteration(env, num_iterations=100, gamma=1.0, threshold=1e-20,
                     max_eval_iterations=10000):
    """Find a policy by alternating policy evaluation and greedy improvement.

    The model is read from ``env``: ``env.P`` maps ``(state, action)`` to an
    iterable of ``(next_state, prob)`` pairs (missing key: no transitions) and
    ``env.R`` maps ``(state, action, next_state)`` to a reward (missing key:
    ``0.0``). The starting policy is drawn with ``np.random.choice``, so seed
    NumPy's global generator beforehand if the run must be reproducible.

    Args:
        env: Object exposing ``observation_space.n``, ``action_space.n``,
            ``P`` and ``R`` as described above.
        num_iterations: Maximum number of evaluate/improve rounds.
        gamma: Discount factor applied to the successor state's value.
        threshold: Policy evaluation stops once the summed absolute change of
            the value table over one sweep is at most this number.
        max_eval_iterations: Upper bound on evaluation sweeps per round. With
            ``gamma == 1.0`` a policy that cycles through a non-zero reward has
            no finite value, so the evaluation would otherwise never reach
            ``threshold`` and loop forever; the bound cuts it off while still
            leaving such a policy with a clearly bad (or good) value for the
            improvement step to act on.

    Returns:
        A NumPy integer array holding the chosen action index for each state.
    """
    state_space = env.observation_space.n
    action_space = env.action_space.n

    def lookahead(state, action, values):
        # Expected one-step return of taking `action` in `state`, bootstrapping
        # from `values`.
        total = 0.0
        for next_state, prob in env.P.get((state, action), []):
            reward = env.R.get((state, action, next_state), 0.0)
            total += prob * (reward + gamma * values[next_state])
        return total

    policy = np.random.choice(action_space, size=state_space)

    for _round in range(num_iterations):
        # Policy evaluation: synchronous sweeps starting from all-zero values,
        # bounded so that a non-converging policy cannot hang the call.
        value_table = np.zeros(state_space)
        for _sweep in range(max_eval_iterations):
            previous = np.copy(value_table)
            for state in range(state_space):
                value_table[state] = lookahead(state, policy[state], previous)

            if np.sum(np.fabs(previous - value_table)) <= threshold:
                break

        # Policy improvement: act greedily with respect to the evaluated values.
        policy_stable = True
        for state in range(state_space):
            old_action = policy[state]
            Q_values = [
                lookahead(state, action, value_table)
                for action in range(action_space)
            ]
            best_action = np.argmax(Q_values)
            policy[state] = best_action

            if old_action != best_action:
                policy_stable = False

        # A round that changes no action means the policy is greedy with
        # respect to its own values, so further rounds would repeat it.
        if policy_stable:
            break

    return policy


In [None]:
# Solve the same environment with policy iteration and show the result as the
# cell output.
# NOTE: this rebinds optimal_policy, replacing the policy obtained from value
# iteration in the earlier cell.
optimal_policy = policy_iteration(env)
optimal_policy