In [5]:
import numpy as np
import random

In [10]:
# Environment: a square grid world with one goal cell and one penalised "obstacle" cell.
grid_size = 5
goal_state = (4, 4)
obstacle_state = (2, 2)
# Action id -> (row delta, column delta).
actions = {0: (-1, 0), 1: (1, 0), 2: (0, -1), 3: (0, 1)}  # up, down, left, right
action_names = ["↑", "↓", "←", "→"]  # display glyph for each action id

# Hyperparameters
epsilon = 0.1  # exploration rate of the ε-soft policy
gamma = 0.9    # discount factor
episodes = 5000  # number of training episodes
random_seed = 42  # seed shared by both random number generators

# Episode generation draws start states from `random` and actions from
# `np.random`; seed both so a fresh-kernel re-run reproduces the same result.
random.seed(random_seed)
np.random.seed(random_seed)

# Initialisation
Q = np.zeros((grid_size, grid_size, len(actions)))  # Q(s, a), indexed [row, col, action]
# Observed returns per state-action pair, indexed [flat state index][action].
Returns = [[[] for _ in range(len(actions))] for _ in range(grid_size * grid_size)]
# ε-soft policy, indexed [row, col, action]; starts uniform over the actions.
policy = np.full((grid_size, grid_size, len(actions)), 1 / len(actions))

# Convert a grid cell into its flat index.
def state_to_index(state):
    """Return the row-major index of ``state``.

    ``state[0]`` is used as the row and ``state[1]`` as the column; the result
    is ``row * grid_size + col`` with the module-level ``grid_size``.
    """
    row, col = state[0], state[1]
    flat_index = row * grid_size + col
    return flat_index

# Apply an action to a state and return the resulting state.
def apply_action(state, action):
    """Move one cell from ``state`` in the direction given by ``action``.

    ``action`` is a key of the module-level ``actions`` dict, whose value is a
    (row delta, column delta) pair. If the move would leave the
    ``grid_size`` x ``grid_size`` grid, ``state`` itself is returned unchanged.
    """
    delta = actions[action]
    row = state[0] + delta[0]
    col = state[1] + delta[1]
    inside_grid = 0 <= row < grid_size and 0 <= col < grid_size
    # Moves that would leave the grid keep the agent where it is.
    return (row, col) if inside_grid else state

# Generate one episode by following the current policy.
def generate_episode():
    """Roll out the current ``policy`` from a random start until the goal.

    Returns a list of ``(state, action, reward)`` tuples, one per step taken.
    The reward is 1 when the step enters ``goal_state``, -1 when it enters
    ``obstacle_state`` and 0 otherwise. The list is empty when the randomly
    drawn start cell is already the goal. There is no step limit.
    """
    action_ids = list(actions.keys())
    # Row is drawn before column, one ``random.randint`` call each.
    start_row = random.randint(0, grid_size - 1)
    start_col = random.randint(0, grid_size - 1)
    current = (start_row, start_col)
    trajectory = []
    while current != goal_state:
        action_probs = policy[current[0], current[1]]
        chosen = np.random.choice(action_ids, p=action_probs)
        successor = apply_action(current, chosen)
        if successor == goal_state:
            reward = 1
        elif successor == obstacle_state:
            reward = -1
        else:
            reward = 0
        trajectory.append((current, chosen, reward))
        current = successor
    return trajectory

# Training loop: on-policy first-visit Monte Carlo control with an ε-soft policy.
n_actions = len(actions)
for _ in range(episodes):
    episode = generate_episode()

    # Earliest time step at which each (state, action) pair occurs in this
    # episode. The return is accumulated walking backwards, so a "seen" set
    # filled during that walk would select the LAST occurrence of a pair;
    # first-visit MC must use the return that follows the FIRST occurrence.
    first_visit_step = {}
    for t, (state, action, reward) in enumerate(episode):
        first_visit_step.setdefault((state, action), t)

    G = 0
    for t in range(len(episode) - 1, -1, -1):
        state, action, reward = episode[t]
        G = gamma * G + reward  # discounted return from step t onwards
        if first_visit_step[(state, action)] != t:
            continue  # the pair already occurred earlier in the episode

        # Policy evaluation: Q is the average of all first-visit returns.
        idx = state_to_index(state)
        Returns[idx][action].append(G)
        Q[state[0], state[1], action] = np.mean(Returns[idx][action])

        # Policy improvement: ε-greedy with respect to the updated Q.
        # np.argmax resolves ties in favour of the lowest action id.
        best_action = np.argmax(Q[state[0], state[1]])
        policy[state[0], state[1], :] = epsilon / n_actions
        policy[state[0], state[1], best_action] = 1 - epsilon + epsilon / n_actions

In [11]:
# Visualise the learned result: the greedy action for every cell.
print("最適ポリシー:")
optimal_policy = np.full((grid_size, grid_size), "", dtype=object)
for i, j in np.ndindex(grid_size, grid_size):
    cell = (i, j)
    if cell == goal_state:
        optimal_policy[i, j] = "G"  # goal
        continue
    if cell == obstacle_state:
        optimal_policy[i, j] = "X"  # obstacle
        continue
    # Every other cell shows the arrow of its highest-valued action.
    best_action = np.argmax(Q[i, j])
    optimal_policy[i, j] = action_names[best_action]

# One output line per grid row, symbols separated by single spaces.
for row in optimal_policy:
    print(*row)

最適ポリシー:
↓ ↓ → → ↓
↓ ↓ → ↓ ↓
↓ ↓ X → ↓
↓ ↓ ↓ ↓ ↓
→ → → → G
