In [2]:
import os
import random
import numpy as np
from collections import defaultdict
from typing import List, Optional

## 環境

In [3]:
X, O, E = 1, -1, 0  # X, O, 空
WIN_LINES = [(0,1,2),(3,4,5),(6,7,8),(0,3,6),(1,4,7),(2,5,8),(0,4,8),(2,4,6)]

def check_winner(board: List[int]) -> Optional[int]:
    """勝者を返す。引き分けや続行中はNone"""
    for a,b,c in WIN_LINES:
        s = board[a] + board[b] + board[c]
        if s == 3:  return X
        if s == -3: return O
    return None

def available_actions(board: List[int]) -> List[int]:
    """可能な手を返す"""
    return [i for i, v in enumerate(board) if v == E]


def is_done(board: List[int]) -> bool:
    """doneフラグを返す"""
    return check_winner(board) is not None or E not in board

def step(board: List[int], player: int, action: int):
    """
    1手進める
    Args:
        board: 盤面
        player: X or O
        action: 0-8
    Returns:
        新盤面, 報酬, done, winner
    """
    new_board = board[:]
    new_board[action] = player
    winner = check_winner(new_board)
    if winner == player:
        return new_board, 1.0, True, winner
    elif winner == -player:
        return new_board, -1.0, True, winner
    elif E not in new_board:
        return new_board, 0.0, True, None
    else:
        return new_board, 0.0, False, None

## Q学習エージェント

In [20]:
class QAgent:
    def __init__(self, episode=0.1, alpha=0.5, gamma=0.9):
        self.episode = episode
        self.alpha = alpha
        self.gamma = gamma
        self.Q = defaultdict(lambda: defaultdict(int))

    def state_key(self, board: List[int]):
        """ボードをキーとして扱えるようにする (tupleに変換)"""
        return tuple(board)

    def select_action(self, board: List[int]) -> int:
        """ε-greedy法に基づいて行動を選択する"""
        acts = available_actions(board)
        if random.random() < self.episode:
            return random.choice(acts)
        qvals = self.Q[self.state_key(board)]
        return max(acts, key=lambda a: qvals[a])  # 最も価値の高い行動を選択

    def update(self, s, a, r, s_next, done):
        """Q値を更新する"""
        q_sa = self.Q[s][a]
        max_next = 0.0 if done else max(self.Q[s_next].values() or [0.0])
        target = r + self.gamma * max_next
        self.Q[s][a] = q_sa + self.alpha * (target - q_sa)

## 自己対局で強化学習を進める

In [22]:
def train(agent: QAgent, episodes=50000):
    for ep in range(episodes):
        board = [E]*9
        state = agent.state_key(board)
        current = X
        done = False
        total_reward = 0
        # X=agent, O=agent（自己対局）
        while not done:
            action = agent.select_action(board)
            next_board, reward, done, winner = step(board, current, action)
            total_reward += reward
            next_state = agent.state_key(next_board)
            # 報酬は視点に応じて符号反転
            agent.update(state, action, reward if current==X else -reward, next_state, done)
            board = next_board
            state = next_state
            current = -current

        if ep % 5 == 0:
            print(f"Episode {ep} 報酬: {total_reward}")

agent = QAgent()
train(agent, episodes=1000)
print("学習完了。Qテーブル状態数:", len(agent.Q))

Episode 0 報酬: 1.0
Episode 5 報酬: 1.0
Episode 10 報酬: 1.0
Episode 15 報酬: 1.0
Episode 20 報酬: 1.0
Episode 25 報酬: 1.0
Episode 30 報酬: 0.0
Episode 35 報酬: 1.0
Episode 40 報酬: 1.0
Episode 45 報酬: 1.0
Episode 50 報酬: 1.0
Episode 55 報酬: 1.0
Episode 60 報酬: 1.0
Episode 65 報酬: 1.0
Episode 70 報酬: 1.0
Episode 75 報酬: 1.0
Episode 80 報酬: 1.0
Episode 85 報酬: 1.0
Episode 90 報酬: 1.0
Episode 95 報酬: 1.0
Episode 100 報酬: 1.0
Episode 105 報酬: 1.0
Episode 110 報酬: 1.0
Episode 115 報酬: 1.0
Episode 120 報酬: 1.0
Episode 125 報酬: 1.0
Episode 130 報酬: 1.0
Episode 135 報酬: 1.0
Episode 140 報酬: 1.0
Episode 145 報酬: 1.0
Episode 150 報酬: 1.0
Episode 155 報酬: 1.0
Episode 160 報酬: 1.0
Episode 165 報酬: 0.0
Episode 170 報酬: 1.0
Episode 175 報酬: 1.0
Episode 180 報酬: 1.0
Episode 185 報酬: 1.0
Episode 190 報酬: 1.0
Episode 195 報酬: 1.0
Episode 200 報酬: 0.0
Episode 205 報酬: 1.0
Episode 210 報酬: 0.0
Episode 215 報酬: 1.0
Episode 220 報酬: 1.0
Episode 225 報酬: 1.0
Episode 230 報酬: 1.0
Episode 235 報酬: 1.0
Episode 240 報酬: 1.0
Episode 245 報酬: 1.0
Episode 250 報酬: 1.0
Ep

In [23]:
def greedy_action(agent: QAgent, board):
    acts = available_actions(board)
    q = agent.Q[agent.state_key(board)]
    return max(acts, key=lambda a: q[a])

def play_once(agent: QAgent, opponent="random", human_as=X):
    board = [E]*9
    current = X
    done = False
    while not done:
        if current == X:
            a = greedy_action(agent, board)  # 評価時は貪欲
        else:
            acts = available_actions(board)
            a = random.choice(acts) if opponent == "random" else greedy_action(agent, board)
        board, _, done, winner = step(board, current, a)
        current = -current
    if winner == X: return 1
    if winner == O: return -1
    return 0

def evaluate(agent: QAgent, n=500, opponent="random"):
    wins = draws = losses = 0
    for _ in range(n):
        r = play_once(agent, opponent=opponent)
        if r == 1: wins += 1
        elif r == -1: losses += 1
        else: draws += 1
    print(f"[vs {opponent}] W/D/L = {wins}/{draws}/{losses}  (win%={(wins/n)*100:.1f})")

In [32]:
evaluate(agent, n=500, opponent="random")
evaluate(agent, n=500, opponent="greedy")

[vs random] W/D/L = 423/8/69  (win%=84.6)
[vs greedy] W/D/L = 500/0/0  (win%=100.0)
