グリッドワールドの設定  
状態: 4×4のグリッド（合計16状態）。  
行動: 上、下、左、右。  
報酬:ゴール状態以外はすべて -1。  
     ゴール状態（右下）は報酬0で終了。  
割引率: γ=0.9（将来の報酬を割引）。

In [1]:
import numpy as np
import matplotlib.pyplot as plt

GRID_SIZE = 4
DISCOUNT = 0.9
THETA = 1e-4
ACTIONS = ["up", "down", "left", "right"]
ACTION_EFFECTs = {
    "up": (-1, 0),
    "down": (1, 0),
    "left": (0, -1),
    "right": (0, 1)
}

In [2]:
def step(state, action):
    x,y = state
    dx,dy = ACTION_EFFECTs[action]
    nx = x+dx
    ny = y+dy
    if nx < 0 or nx >= GRID_SIZE or ny < 0 or ny >= GRID_SIZE:
        next_state = x,y
    else:
        next_state = nx,ny
    reward = -1 if next_state != (GRID_SIZE-1,GRID_SIZE-1) else 0
    return next_state, reward


Value Iteration

In [14]:
def value_iteration():
    value = np.zeros((GRID_SIZE,GRID_SIZE))
    policy = np.zeros((GRID_SIZE,GRID_SIZE),dtype=int)
    value_history = []
    policy_history = []

    while True:
        delta = 0
        for x in range(GRID_SIZE):
            for y in range(GRID_SIZE):
                if (x,y) == (GRID_SIZE-1,GRID_SIZE-1):
                    continue
                old_value = value[x,y]
                values = []
                for a, action in enumerate(ACTIONS):
                    (nx,ny),reward = step((x,y),action)
                    values.append(reward + DISCOUNT * value[nx,ny])
                value[x,y] = max(values)
                policy[x,y] = np.argmax(values)
                delta = max(delta,abs(old_value-value[x,y]))
        value_history.append(value.copy()) 
        policy_history.append(policy.copy())
        if delta < THETA:
            break
    return value,policy,value_history,policy_history

Policy Iteration

In [15]:
# ポリシー反復
def policy_iteration():
    value = np.zeros((GRID_SIZE, GRID_SIZE))
    policy = np.zeros((GRID_SIZE, GRID_SIZE), dtype=int)  # 初期方策: すべて "up"
    value_history = []
    policy_history = []
    while True:
        # 方策評価
        while True:
            delta = 0
            for x in range(GRID_SIZE):
                for y in range(GRID_SIZE):
                    if (x, y) == (GRID_SIZE-1, GRID_SIZE-1):  # ゴール状態
                        continue
                    old_value = value[x, y]
                    action = ACTIONS[policy[x, y]]
                    (nx, ny), reward = step((x, y), action)
                    value[x, y] = reward + DISCOUNT * value[nx, ny]
                    delta = max(delta, abs(old_value - value[x, y]))
            value_history.append(value.copy())
            if delta < THETA:
                break

        # 方策改善
        policy_stable = True
        for x in range(GRID_SIZE):
            for y in range(GRID_SIZE):
                if (x, y) == (GRID_SIZE-1, GRID_SIZE-1):  # ゴール状態
                    continue
                old_action = policy[x, y]
                values = []
                for a, action in enumerate(ACTIONS):
                    (nx, ny), reward = step((x, y), action)
                    values.append(reward + DISCOUNT * value[nx, ny])
                policy[x, y] = np.argmax(values)
                if old_action != policy[x, y]:
                    policy_stable = False
        policy_history.append(policy.copy())
        if policy_stable:
            break
    return value, policy,value_history,policy_history


In [22]:
# 実行
print("値反復 (Value Iteration):")
vi_value, vi_policy,_,v_p_history = value_iteration()
print("価値関数:")
print(vi_value)
print("方策 (0:up, 1:down, 2:left, 3:right):")
print(vi_policy)
print((len(v_p_history)))
print("価値関数の過程")
for i in range(len(v_p_history)):
    print(v_p_history[i])
    print("")
print("方策変化の過程")
for i in range(len(v_p_history)):
    print(v_p_history[i])
    print("")   



値反復 (Value Iteration):
価値関数:
[[-4.0951 -3.439  -2.71   -1.9   ]
 [-3.439  -2.71   -1.9    -1.    ]
 [-2.71   -1.9    -1.      0.    ]
 [-1.9    -1.      0.      0.    ]]
方策 (0:up, 1:down, 2:left, 3:right):
[[1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [3 3 3 0]]
6
価値関数の過程
[[0 0 0 0]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 3 0]]

[[0 0 0 0]
 [1 1 1 1]
 [1 1 1 1]
 [1 3 3 0]]

[[0 0 0 1]
 [1 1 1 1]
 [1 1 1 1]
 [3 3 3 0]]

[[0 0 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [3 3 3 0]]

[[0 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [3 3 3 0]]

[[1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [3 3 3 0]]

方策変化の過程
[[0 0 0 0]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 3 0]]

[[0 0 0 0]
 [1 1 1 1]
 [1 1 1 1]
 [1 3 3 0]]

[[0 0 0 1]
 [1 1 1 1]
 [1 1 1 1]
 [3 3 3 0]]

[[0 0 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [3 3 3 0]]

[[0 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [3 3 3 0]]

[[1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [3 3 3 0]]



In [19]:
print("\nポリシー反復 (Policy Iteration):")
pi_value, pi_policy,p_v_history,p_p_history= policy_iteration()
print("価値関数:")
print(pi_value)
print("方策 (0:up, 1:down, 2:left, 3:right):")
print(pi_policy)
print(len(p_v_history))
print("価値関数の過程")
for i in range(len(p_v_history)):
    print(p_v_history[i])
    print("")

print("方策変化の過程")
for i in range(len(p_p_history)):
    print(p_p_history[i])
    print("")


ポリシー反復 (Policy Iteration):
価値関数:
[[-4.0951 -3.439  -2.71   -1.9   ]
 [-3.439  -2.71   -1.9    -1.    ]
 [-2.71   -1.9    -1.      0.    ]
 [-1.9    -1.      0.      0.    ]]
方策 (0:up, 1:down, 2:left, 3:right):
[[1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [3 3 3 0]]
101
価値関数の過程
[[-1.    -1.    -1.    -1.   ]
 [-1.9   -1.9   -1.9   -1.9  ]
 [-2.71  -2.71  -2.71  -2.71 ]
 [-3.439 -3.439 -3.439  0.   ]]

[[-1.9    -1.9    -1.9    -1.9   ]
 [-2.71   -2.71   -2.71   -2.71  ]
 [-3.439  -3.439  -3.439  -3.439 ]
 [-4.0951 -4.0951 -4.0951  0.    ]]

[[-2.71    -2.71    -2.71    -2.71   ]
 [-3.439   -3.439   -3.439   -3.439  ]
 [-4.0951  -4.0951  -4.0951  -4.0951 ]
 [-4.68559 -4.68559 -4.68559  0.     ]]

[[-3.439    -3.439    -3.439    -3.439   ]
 [-4.0951   -4.0951   -4.0951   -4.0951  ]
 [-4.68559  -4.68559  -4.68559  -4.68559 ]
 [-5.217031 -5.217031 -5.217031  0.      ]]

[[-4.0951    -4.0951    -4.0951    -4.0951   ]
 [-4.68559   -4.68559   -4.68559   -4.68559  ]
 [-5.217031  -5.217031  -5.217031  -5.