## Frozen Lake

In [1]:
# 載入相關套件
import numpy as np
import gymnasium as gym

## Random walk

In [None]:
# Random-walk demo: drive the rendered 4x4 FrozenLake with sampled actions.
env = gym.make('FrozenLake-v1', desc=None, map_name="4x4", is_slippery=False, render_mode="human")
try:
    env.reset()

    # Take 30 sampled actions, starting a new episode whenever one ends.
    for _ in range(30):
        action = env.action_space.sample()  # this is where you would insert your policy
        observation, reward, terminated, truncated, info = env.step(action)

        if terminated or truncated:
            observation, info = env.reset()
finally:
    # Release the render window even if a step raises or the cell is interrupted.
    env.close()

## Training

In [2]:
# 值循環函數
def value_iteration(env, theta=0.0001, discount_factor=1.0):
    """Compute state values and a greedy policy with value iteration.

    Args:
        env: A discrete environment exposing ``observation_space.n``,
            ``action_space.n`` and a transition table ``P`` in which
            ``P[state][action]`` is an iterable of
            ``(prob, next_state, reward, done)`` tuples. The table is read
            from ``env.unwrapped`` when that attribute exists, because the
            wrappers added by ``gym.make`` do not reliably forward ``P``.
        theta: Convergence threshold; sweeping stops once the largest change
            of any state value within one sweep is below this value.
        discount_factor: Discount applied to the value of successor states.

    Returns:
        A ``(policy, V)`` tuple. ``policy`` is an ``(nS, nA)`` array with a
        single 1.0 per row marking the greedy action of that state, and ``V``
        is a length-``nS`` array of state values.
    """
    # Take the sizes from the environment itself rather than from notebook
    # globals, so the function works no matter which cells have been run.
    nS = env.observation_space.n
    nA = env.action_space.n
    transitions = getattr(env, "unwrapped", env).P

    def one_step_lookahead(state, V):
        """Return the expected return of each action taken from ``state``."""
        A = np.zeros(nA)
        for a in range(nA):
            for prob, next_state, reward, _done in transitions[state][a]:
                A[a] += prob * (reward + discount_factor * V[next_state])
        return A

    # All state values start at zero.
    V = np.zeros(nS)
    while True:
        delta = 0
        # Sweep over the states, updating V in place so later states in the
        # same sweep already see the refreshed values.
        for s in range(nS):
            best_action_value = np.max(one_step_lookahead(s, V))
            # Track the largest change seen during this sweep.
            delta = max(delta, np.abs(best_action_value - V[s]))
            V[s] = best_action_value
        # Converged once no state moved by theta or more.
        if delta < theta:
            break

    # Build a deterministic policy: every row starts at zero and only the
    # greedy action (first one on ties, per argmax) is set to probability 1.
    policy = np.zeros([nS, nA])
    for s in range(nS):
        best_action = np.argmax(one_step_lookahead(s, V))
        policy[s, best_action] = 1.0

    return policy, V

In [5]:
# Build the non-slippery 4x4 map; no render mode is requested for this cell.
env = gym.make('FrozenLake-v1', desc=None, map_name="4x4", is_slippery=False)
env.reset()

# Number of discrete states and actions reported by the environment's spaces.
nS, nA = env.observation_space.n, env.action_space.n

# Run value iteration with a 0.9 discount, then release the environment.
policy, v = value_iteration(env, discount_factor=0.9)
env.close()

# Show the state values laid out as the 4x4 board.
print("4x4 狀態值函數:")
print(v.reshape(4, 4))

4x4 狀態值函數:
[[0.59049 0.6561  0.729   0.6561 ]
 [0.6561  0.      0.81    0.     ]
 [0.729   0.81    0.9     0.     ]
 [0.      0.9     1.      0.     ]]


In [20]:
# Action encoding:
#   0 = move left, 1 = move down, 2 = move right, 3 = move up
#
# View the value vector as the 4x4 board, then overwrite the bottom-right
# cell (presumably the goal, whose computed value is 0) with a large sentinel
# so that a highest-valued-neighbour lookup is drawn into it.
v = np.reshape(v, (4, 4))
v[-1, -1] = 999
def get_action_with_max_value(row, column, values=None):
    """Pick the move whose destination cell has the highest value.

    Args:
        row: Row index of the current cell.
        column: Column index of the current cell.
        values: Optional 2-D grid of cell values (array or nested lists).
            When omitted, the notebook-level ``v`` array is used.

    Returns:
        The action index (0: left, 1: down, 2: right, 3: up) leading to the
        in-bounds neighbour with the largest value. Ties keep the earliest
        action in that order, and 0 is returned when no neighbour lies
        inside the grid.
    """
    grid = np.asarray(v if values is None else values)
    # Bounds come from the grid itself so any rectangular board works.
    n_rows, n_cols = grid.shape
    # -inf rather than a finite sentinel, so arbitrarily low values still compare.
    best_value = float("-inf")
    selected_action = 0
    # Offsets are listed in action order: left, down, right, up.
    for action, (d_row, d_col) in enumerate([(0, -1), (1, 0), (0, 1), (-1, 0)]):
        r, c = row + d_row, column + d_col
        if 0 <= r < n_rows and 0 <= c < n_cols and grid[r, c] > best_value:
            best_value = grid[r, c]
            selected_action = action
    return selected_action

In [22]:
# Greedy rollout: at each step move toward the neighbouring cell with the
# highest value and render the result.
env = gym.make('FrozenLake-v1', desc=None, map_name="4x4", is_slippery=False, render_mode="human")
try:
    observation, info = env.reset()

    for _ in range(30):
        # Split the observation into (row, column) on a 4-wide grid.
        row, column = divmod(observation, 4)
        action = get_action_with_max_value(row, column)
        observation, reward, terminated, truncated, info = env.step(action)

        # Start a fresh episode whenever the current one ends.
        if terminated or truncated:
            observation, info = env.reset()
finally:
    # Release the render window even if a step raises or the cell is interrupted.
    env.close()