# FrozenLake之策略循環(Policy Iteration)

## 載入套件

In [1]:
import numpy as np
import gymnasium as gym

## 載入遊戲

In [2]:
# Fixed seed so the random-action baseline later in the notebook is repeatable
# across "Restart Kernel -> Run All".
SEED = 42
# Deterministic (is_slippery=False) 4x4 map, given row by row. Per the Gymnasium
# FrozenLake documentation: S = start, F = frozen, H = hole, G = goal.
env = gym.make('FrozenLake-v1', is_slippery=False, desc=["SFFF", "FFFH", "FFFF", "FFFG"])
# Seed the sampler behind `env.action_space.sample()`.
env.action_space.seed(SEED)
env.reset(seed=SEED)

(0, {'prob': 1})

In [3]:
# Number of discrete states and actions, read once from the environment's spaces
nS, nA = env.observation_space.n, env.action_space.n
nS, nA

(16, 4)

## 顯示狀態轉移機率（各狀態採取各行動後的結果）

In [4]:
# Transition table: P[state][action] is a list of
# (probability, next_state, reward, done) tuples.
env.unwrapped.P

{0: {0: [(1.0, 0, 0.0, False)],
  1: [(1.0, 4, 0.0, False)],
  2: [(1.0, 1, 0.0, False)],
  3: [(1.0, 0, 0.0, False)]},
 1: {0: [(1.0, 0, 0.0, False)],
  1: [(1.0, 5, 0.0, False)],
  2: [(1.0, 2, 0.0, False)],
  3: [(1.0, 1, 0.0, False)]},
 2: {0: [(1.0, 1, 0.0, False)],
  1: [(1.0, 6, 0.0, False)],
  2: [(1.0, 3, 0.0, False)],
  3: [(1.0, 2, 0.0, False)]},
 3: {0: [(1.0, 2, 0.0, False)],
  1: [(1.0, 7, 0.0, True)],
  2: [(1.0, 3, 0.0, False)],
  3: [(1.0, 3, 0.0, False)]},
 4: {0: [(1.0, 4, 0.0, False)],
  1: [(1.0, 8, 0.0, False)],
  2: [(1.0, 5, 0.0, False)],
  3: [(1.0, 0, 0.0, False)]},
 5: {0: [(1.0, 4, 0.0, False)],
  1: [(1.0, 9, 0.0, False)],
  2: [(1.0, 6, 0.0, False)],
  3: [(1.0, 1, 0.0, False)]},
 6: {0: [(1.0, 5, 0.0, False)],
  1: [(1.0, 10, 0.0, False)],
  2: [(1.0, 7, 0.0, True)],
  3: [(1.0, 2, 0.0, False)]},
 7: {0: [(1.0, 7, 0, True)],
  1: [(1.0, 7, 0, True)],
  2: [(1.0, 7, 0, True)],
  3: [(1.0, 7, 0, True)]},
 8: {0: [(1.0, 8, 0.0, False)],
  1: [(1.0, 12, 0.0, 

## 策略評估函數

In [5]:
# Policy evaluation
def policy_eval(policy, env, discount_factor=1.0, theta=0.00001):
    """Iteratively estimate the state-value function of a fixed policy.

    Each sweep is synchronous: every state's new value is computed from the
    values of the previous sweep. Sweeps repeat until the largest change in
    any state's value during a sweep is below ``theta``.

    Args:
        policy: Indexed as ``policy[s][a]``; the probability of taking action
            ``a`` in state ``s``.
        env: Environment exposing ``observation_space.n`` and a transition
            table ``env.unwrapped.P[s][a]`` holding
            ``(prob, next_state, reward, done)`` tuples.
        discount_factor: Discount applied to the value of the next state.
        theta: Convergence threshold on the largest per-sweep value change.

    Returns:
        1-D ``np.ndarray`` with the estimated value of every state.
    """
    # Size the value table from the environment that was passed in instead of
    # a notebook-level global, so the function works for any compatible env.
    n_states = env.observation_space.n
    transitions = env.unwrapped.P

    V = np.zeros(n_states)
    while True:
        delta = 0
        # Fresh buffer so this sweep only reads the previous sweep's values.
        V_next = np.zeros(n_states)
        for s in range(n_states):
            v = 0
            # Expectation over the policy's actions and each action's outcomes.
            for a, action_prob in enumerate(policy[s]):
                for prob, next_state, reward, _done in transitions[s][a]:
                    v += action_prob * prob * (reward +
                                   discount_factor * V[next_state])
            # Track the largest change seen in this sweep.
            delta = max(delta, np.abs(v - V[s]))
            V_next[s] = v
        V = V_next
        # Stop once no state's value moved by theta or more.
        if delta < theta:
            break
    return V

In [6]:
# Uniform random policy: every action is equally likely in every state
random_policy = np.ones((nS, nA)) / nA
# Evaluate it and show the values laid out on the square grid
v = policy_eval(random_policy, env)
side = int(nS ** 0.5)  # side length of the square grid
print("狀態值函數:")
print(v.reshape((side, side)))

狀態值函數:
[[0.33057206 0.29638556 0.22039882 0.11019328]
 [0.36479661 0.33821814 0.25464004 0.        ]
 [0.42563702 0.4370814  0.45996217 0.4866518 ]
 [0.47506976 0.52453871 0.66149455 0.        ]]


## 策略改善函數（策略循環：反覆執行策略評估與策略改善）

In [7]:
def policy_improvement(env, policy_eval_fn=policy_eval, discount_factor=1.0):
    """Run policy iteration: alternate policy evaluation and greedy improvement.

    Starts from the uniform random policy. Each round evaluates the current
    policy with ``policy_eval_fn`` and then makes every state greedy with
    respect to a one-step lookahead on those values. The loop ends after the
    first round in which no state's greedy action changed.

    Args:
        env: Environment exposing ``observation_space.n``, ``action_space.n``
            and a transition table ``env.unwrapped.P[s][a]`` holding
            ``(prob, next_state, reward, done)`` tuples.
        policy_eval_fn: Callable invoked as
            ``policy_eval_fn(policy, env, discount_factor)`` that returns the
            state values of ``policy``.
        discount_factor: Discount applied to the value of the next state.

    Returns:
        Tuple ``(policy, V)``: a one-hot ``[n_states, n_actions]`` array and
        the state values obtained from the last evaluation.
    """
    # Read the problem size from the env that was passed in instead of from
    # notebook-level globals, so the function works for any compatible env.
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    transitions = env.unwrapped.P

    def one_step_lookahead(state, V):
        """Return the expected value of each action taken from ``state``."""
        A = np.zeros(n_actions)
        for a in range(n_actions):
            for prob, next_state, reward, _done in transitions[state][a]:
                A[a] += prob * (reward + discount_factor * V[next_state])
        return A

    # Start from the uniform random policy: every action equally likely.
    policy = np.ones([n_states, n_actions]) / n_actions
    # Row i is the one-hot distribution that always picks action i.
    one_hot = np.eye(n_actions)

    while True:
        # Policy evaluation
        V = policy_eval_fn(policy, env, discount_factor)

        # Flipped to False as soon as any state's greedy action changes.
        policy_stable = True

        for s in range(n_states):
            # Action the current policy prefers in this state
            chosen_a = np.argmax(policy[s])
            # Action that looks best after a one-step lookahead
            best_a = np.argmax(one_step_lookahead(s, V))

            # Greedy update: act on the best action from now on
            if chosen_a != best_a:
                policy_stable = False
            policy[s] = one_hot[best_a]

        # No state changed its action: return the policy and its values
        if policy_stable:
            return policy, V

In [9]:
# Run policy iteration with a discount factor of 0.9; keeps the resulting
# policy and the state values returned alongside it.
policy, v = policy_improvement(env, discount_factor=0.9)

In [10]:
# Show the results
side = int(nS ** 0.5)  # side length of the square grid
grid_shape = (side, side)

print("策略機率分配:")
print(policy)
print("")

# Highest-probability action of each state, laid out on the grid
print("4x4 策略機率分配 (0~3：左/下/右/上):")
print(np.argmax(policy, axis=1).reshape(grid_shape))
print("")

# State values laid out on the same grid
print("4x4 狀態值函數:")
print(v.reshape(grid_shape))

策略機率分配:
[[0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]]

4x4 策略機率分配 (0~3：左/下/右/上):
[[1 1 1 0]
 [1 1 1 0]
 [1 1 1 1]
 [2 2 2 0]]

4x4 狀態值函數:
[[0.59049 0.6561  0.729   0.6561 ]
 [0.6561  0.729   0.81    0.     ]
 [0.729   0.81    0.9     1.     ]
 [0.81    0.9     1.      0.     ]]


## 測試行動策略

In [11]:
def play_episodes(environment, n_episodes, policy):
    """Play episodes, always taking the policy's highest-probability action.

    Args:
        environment: Object whose ``reset()`` returns ``(state, info)`` and
            whose ``step(action)`` returns
            ``(next_state, reward, terminated, truncated, info)``.
        n_episodes: Number of episodes to play.
        policy: Indexable by state; ``policy[state]`` holds the per-action
            probabilities and its argmax is the action taken.

    Returns:
        Tuple ``(wins, total_reward, average_reward)``. An episode counts as a
        win when its final step has reward 1.0. ``average_reward`` is 0.0 when
        ``n_episodes`` is 0 (instead of raising ZeroDivisionError).
    """
    wins = 0
    total_reward = 0
    for _ in range(n_episodes):
        state, _info = environment.reset()
        done = False
        while not done:
            # Take the action the policy rates highest in the current state
            action = np.argmax(policy[state])
            # Apply it; the returned state becomes the current state
            state, reward, terminated, truncated, _info = environment.step(action)
            done = terminated or truncated
            total_reward += reward
            # A win is an episode that ends on a reward of exactly 1.0
            if done and reward == 1.0:
                wins += 1
    # Guard the empty run so zero episodes does not divide by zero
    average_reward = total_reward / n_episodes if n_episodes else 0.0
    return wins, total_reward, average_reward

# Evaluate the learned policy over 10,000 episodes
n_episodes = 10_000
wins, total_reward, average_reward = play_episodes(
    environment=env, n_episodes=n_episodes, policy=policy
)
print(f'number of wins over {n_episodes} episodes = {wins}')
print(f'average reward over {n_episodes} episodes = {average_reward} \n\n')

number of wins over 10000 episodes = 10000
average reward over 10000 episodes = 1.0 




## 隨機行動策略

In [12]:
def random_play(environment, n_episodes):
    """Play episodes with actions drawn from ``environment.action_space.sample()``.

    Args:
        environment: Object with an ``action_space.sample()`` method, whose
            ``reset()`` starts an episode and whose ``step(action)`` returns
            ``(next_state, reward, terminated, truncated, info)``.
        n_episodes: Number of episodes to play.

    Returns:
        Tuple ``(wins, total_reward, average_reward)``. An episode counts as a
        win when its final step has reward 1.0. ``average_reward`` is 0.0 when
        ``n_episodes`` is 0 (instead of raising ZeroDivisionError).
    """
    wins = 0
    total_reward = 0
    for _ in range(n_episodes):
        # The observation is not needed: actions do not depend on the state
        environment.reset()
        done = False
        while not done:
            # Random action
            action = environment.action_space.sample()
            _next_state, reward, terminated, truncated, _info = environment.step(action)
            done = terminated or truncated
            total_reward += reward
            # A win is an episode that ends on a reward of exactly 1.0
            if done and reward == 1.0:
                wins += 1
    # Guard the empty run so zero episodes does not divide by zero
    average_reward = total_reward / n_episodes if n_episodes else 0.0
    return wins, total_reward, average_reward

# Baseline: random actions over 10,000 episodes
n_episodes = 10_000
wins, total_reward, average_reward = random_play(
    environment=env, n_episodes=n_episodes
)
print(f'number of wins over {n_episodes} episodes = {wins}')
print(f'average reward over {n_episodes} episodes = {average_reward} \n\n')

number of wins over 10000 episodes = 3233
average reward over 10000 episodes = 0.3233 


