# 方針
- このノートブックでは、学習済みSLポリシーネットワークの重みを初期値としてREINFORCEにてRLポリシーネットワークを学習させる。


In [None]:
%%capture
!pip install creversi

In [None]:
# リバーシ用ライブラリ
from creversi import Board,move_to_str,move_from_str,move_rotate90,move_rotate180,move_rotate270
import creversi
# 基礎ライブラリ
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from copy import copy,deepcopy
import gc
import os
# 学習用ライブラリ
import torch
import torch.nn as nn
from torch.autograd import Variable
from sklearn.model_selection import train_test_split

In [None]:
class PolicyNetwork(nn.Module):
    """Fully convolutional policy network (80 filters, no normalisation).

    Takes a batch of 8-channel feature planes and returns one flattened
    output vector per sample. Every convolution preserves the spatial size
    (5x5 with padding 2, 3x3 with padding 1, then 1x1), so an 8x8 input
    produces 64 raw scores; no softmax is applied here.
    """

    def __init__(self):
        super().__init__()
        width = 80   # channels carried through the trunk
        depth = 11   # number of 3x3 conv + ReLU pairs in the trunk

        self.input_layer = nn.Sequential(
            nn.Conv2d(8, width, kernel_size=5, padding=2),
            nn.ReLU(),
        )

        # Build the trunk in order so the Sequential indices (0..21) stay the
        # same as when the layers are listed out one by one.
        trunk = []
        for _ in range(depth):
            trunk.append(nn.Conv2d(width, width, kernel_size=3, padding=1))
            trunk.append(nn.ReLU())
        self.hidden_layer = nn.Sequential(*trunk)

        # 1x1 convolution down to a single plane, flattened per sample.
        self.output_layer = nn.Sequential(
            nn.Conv2d(width, 1, kernel_size=1),
            nn.Flatten(),
        )

    def forward(self, x):
        """Run input stem -> trunk -> head and return the flattened scores."""
        features = self.hidden_layer(self.input_layer(x))
        return self.output_layer(features)

In [None]:
# NOTE: this notebook defines PolicyNetwork twice; whichever definition is
# executed last is the one in effect.
class PolicyNetwork(nn.Module):
    """Fully convolutional policy network (100 filters, BatchNorm trunk).

    Takes a batch of 8-channel feature planes and returns one flattened
    output vector per sample. Every convolution preserves the spatial size
    (5x5 with padding 2, 3x3 with padding 1, then 1x1), so an 8x8 input
    produces 64 raw scores; no softmax is applied here.
    """

    def __init__(self):
        super().__init__()
        width = 100  # channels carried through the trunk
        depth = 11   # number of conv + BatchNorm + ReLU triples in the trunk

        self.input_layer = nn.Sequential(
            nn.Conv2d(8, width, kernel_size=5, padding=2),
            nn.ReLU(),
        )

        # Build the trunk in order so the Sequential indices (0..32) stay the
        # same as when the layers are listed out one by one.
        trunk = []
        for _ in range(depth):
            trunk.extend([
                nn.Conv2d(width, width, kernel_size=3, padding=1),
                nn.BatchNorm2d(width),
                nn.ReLU(),
            ])
        self.hidden_layer = nn.Sequential(*trunk)

        # 1x1 convolution down to a single plane, flattened per sample.
        self.output_layer = nn.Sequential(
            nn.Conv2d(width, 1, kernel_size=1),
            nn.Flatten(),
        )

    def forward(self, x):
        """Run input stem -> trunk -> head and return the flattened scores."""
        features = self.hidden_layer(self.input_layer(x))
        return self.output_layer(features)

In [None]:
def board_to_array(board):
    """Encode a board object as an (8, 8, 8) float32 feature array.

    Planes, as described by the original author's note:
      0: black stones, 1: white stones, 2: empty squares,
      3: legal-move squares, 4: number of stones each legal move would flip,
      5: corner mask, 6: all ones, 7: all zeros.

    Planes 0 and 1 are filled by ``board.piece_planes`` and swapped when
    ``board.turn`` is falsy. NOTE(review): which colour ``piece_planes``
    writes first is not visible here -- confirm against creversi.
    Planes 3 and 4 stay zero when the only legal move is the pass marker 64.
    """
    planes = np.zeros((8, 8, 8), dtype=np.float32)
    board.piece_planes(planes)
    if not board.turn:
        # Exchange the two stone planes; the remaining planes keep their order.
        planes = planes[[1, 0, 2, 3, 4, 5, 6, 7], :, :]

    # A square is marked empty unless exactly one stone plane is set there.
    occupied = (planes[0] + planes[1]) == 1
    planes[2] = np.where(occupied, 0, 1)

    candidates = list(board.legal_moves)
    if candidates != [64]:
        flip_counts = []
        for candidate in candidates:
            trial = copy(board)
            before = trial.opponent_piece_num()
            trial.move(candidate)
            # Difference between opponent_piece_num() before the move and
            # piece_num() after it; used as the flipped-stone count.
            flip_counts.append(before - trial.piece_num())
        flips = np.zeros(64)
        flips[candidates] = flip_counts
        flips = flips.reshape(8, 8)
        planes[3] = np.where(flips > 0, 1, 0)
        planes[4] = flips

    # 2x2 block of ones in each of the four board corners.
    edge = [0, 1, 6, 7]
    corner_mask = np.zeros((8, 8))
    corner_mask[np.ix_(edge, edge)] = 1.
    planes[5] = corner_mask

    planes[6] = 1
    return planes

In [None]:
def predict(board, model):
    """Return the model's move probabilities restricted to the legal moves.

    The model is switched to eval mode and run without gradient tracking on
    the encoded board; the outputs at the legal-move indices are passed
    through a softmax.

    Args:
        board: board object exposing ``legal_moves`` and accepted by
            ``board_to_array``.
        model: policy network returning one flattened score vector per sample.

    Returns:
        (probs, legal_moves): ``probs`` is a NumPy array aligned with the
        list ``legal_moves``.

    NOTE(review): ``legal_moves`` is used directly as an index into the model
    output, so a pass position (legal move 64) presumably has to be handled
    by the caller -- confirm.
    """
    model.eval()
    # Run on whatever device the model's parameters live on.
    device = next(model.parameters()).device
    board_arr = torch.from_numpy(board_to_array(board)).unsqueeze(0).to(device)
    with torch.no_grad():
        output = model(board_arr)[0]
    legal_moves = list(board.legal_moves)
    # Tensor.numpy() only works for CPU tensors, so copy back to host first.
    probs = output[legal_moves].softmax(dim=0).cpu().numpy()
    return probs, legal_moves

In [None]:
def test_play(model, N, greedy=False):
    """Play ``N`` games of the model against a uniformly random opponent.

    The model moves whenever ``board.turn`` is truthy; the opponent picks
    uniformly from ``board.legal_moves`` otherwise. The model is moved to the
    notebook-global ``device`` and put in eval mode as a side effect.

    Args:
        model: policy network returning one flattened score vector per sample.
        N: number of games to play; must be positive.
        greedy: if True the model plays its highest-probability legal move,
            otherwise it samples from the softmax over legal moves.

    Returns:
        Tuple ``(win_ratio, mean, std, min, max)`` of the final
        ``diff_num()`` values, sign-flipped when the game ends with
        ``board.turn`` falsy. NOTE(review): this presumably expresses the
        margin from the model's side -- confirm ``diff_num`` semantics.

    Raises:
        ValueError: if ``N`` is not positive (statistics would be undefined).
    """
    if N <= 0:
        raise ValueError(f'N must be a positive number of games, got {N}')

    # Mode and device do not change during evaluation, so set them once.
    model.eval()
    model.to(device)

    Z = []
    # Evaluation never backpropagates; skip building autograd graphs.
    with torch.no_grad():
        for _ in range(N):
            board = Board()
            while not board.is_game_over():
                legal_moves = list(board.legal_moves)
                if not board.turn:
                    # Opponent: uniformly random choice among legal moves.
                    board.move(np.random.choice(legal_moves))
                    continue
                if 64 in legal_moves:
                    # 64 marks "no legal move": the model has to pass.
                    board.move_pass()
                    continue
                board_arr = torch.from_numpy(board_to_array(board)).unsqueeze(0).to(device)
                output = model(board_arr)[0].cpu()
                prob = output[legal_moves].softmax(dim=0).numpy()
                if greedy:
                    move = legal_moves[prob.argmax()]
                else:
                    move = np.random.choice(legal_moves, p=prob)
                board.move(move)

            # Record the final stone difference for this game.
            z = board.diff_num() if board.turn else -board.diff_num()
            Z.append(z)

    Z = np.array(Z)
    return (Z>0).sum()/N, Z.mean(), Z.std(), Z.min(), Z.max()

# 学習

$$
\nabla_\theta J(\theta)=E\left[\sum_{t=0}^T G_t\nabla_\theta\log\pi_\theta(A_t|S_t)\right]\\
G_t \equiv R_t + \gamma R_{t+1} + \cdots + \gamma^{T-t}R_T
$$

つまり、期待収益を最大化するために勾配降下法で最小化する損失関数は、符号を反転した

$$
\mathcal{L}=-E\left[\sum_{t=0}^T G_t\log\pi_\theta(A_t|S_t)\right]
$$


In [None]:
# for file in sorted(os.listdir('/kaggle/input/reversi-datasets/results')):
#     if '.pth' in file:
#         # モデル読み込み
#         print(file)
#         model = torch.load(f'/kaggle/input/reversi-datasets/results/{file}')
#         device = 'cuda' if torch.cuda.is_available() else 'cpu'
#         model = model.to(device)
#         ratio,mean,std,m,M = test_play(model, 1000)
#         print(f'ratio:{ratio*100:.1f}%, mean:{mean:.1f}, std:{std:.1f}, min:{m:.0f}, max:{M:.0f}')

In [None]:
# Load the pretrained SL policy network that serves as the RL starting point.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# map_location remaps the stored tensors onto the available device, so a
# checkpoint saved from a GPU session still loads on a CPU-only runtime.
# NOTE(review): the file holds a whole pickled model object, so the
# PolicyNetwork class must already be defined; recent PyTorch releases may
# additionally require weights_only=False for such checkpoints -- confirm
# against the installed version.
model = torch.load(
    '/kaggle/input/reversi-datasets/SL-PolicyNetwork-v3-checkpoint-5epoch-subdata99.pth',
    map_location=device,
)
model = model.to(device)

In [None]:
# Baseline check: play 1000 games with greedy move selection against the
# random-move opponent used by test_play, then print the win ratio and the
# mean / std / min / max of the final stone difference.
ratio,mean,std,m,M = test_play(model, 1000, greedy=True)
print(f'ratio:{ratio*100:.1f}%, mean:{mean:.1f}, std:{std:.1f}, min:{m:.0f}, max:{M:.0f}')

- prob.....ratio:91.2%, mean:23.6, std:16.1, min:-36, max:60  
- greedy...ratio:97.7%, mean:35.5, std:14.3, min:-34, max:64
- new......ratio:99.3%, mean:40.1, std:12.6, min:-28, max:64

In [None]:
# Settings
n_episode = 100  # games per set; also how often the opponent model is refreshed
n_set = 1  # number of sets
learning_rate = 0.0001  # step size passed to Adam below
gamma = 0.99  # discount factor applied to the return in the training loop

# Adam over all parameters of the loaded policy network.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
model_op_list = []  # pool of frozen snapshots of past policies, used as opponents


def _sample_action(net, board, track_grad):
    """Sample a legal move for the side to move from ``net``'s policy.

    The acting policy is the softmax of the network outputs restricted to
    the legal moves. Returns ``(move, log_prob)`` where ``log_prob`` is the
    log-probability of ``move`` under that same distribution; it carries
    gradient only when ``track_grad`` is True.
    """
    legal_moves = list(board.legal_moves)
    board_arr = torch.from_numpy(board_to_array(board)).unsqueeze(0).to(device)
    with torch.set_grad_enabled(track_grad):
        logits = net(board_arr)[0].cpu()[legal_moves]
        # log_softmax avoids log(0) when a probability underflows.
        log_probs = logits.log_softmax(dim=0)
    with torch.no_grad():
        prob = logits.softmax(dim=0).numpy()
    idx = np.random.choice(len(legal_moves), p=prob)
    return legal_moves[idx], log_probs[idx]


for i in range(n_set):
    # Refresh the opponent pool with a frozen copy of the current policy and
    # pick one snapshot to play against for this whole set.
    model.to(device)
    model_op_list.append(deepcopy(model).eval())
    model_op = random.choice(model_op_list)

    for n in tqdm(range(n_episode)):
        # Log-probabilities of the learner's own moves in this game.
        log_probs = []

        # Self-play: the learner moves when board.turn is truthy.
        board = Board()
        model.eval()
        while not board.is_game_over():
            if 64 in list(board.legal_moves):
                # 64 marks "no legal move": the side to move passes.
                board.move_pass()
            elif board.turn:
                move, log_p = _sample_action(model, board, track_grad=True)
                log_probs.append(log_p)
                board.move(move)
            else:
                # The opponent is frozen, so no graph is needed for it.
                move, _ = _sample_action(model_op, board, track_grad=False)
                board.move(move)

        # Outcome from the learner's side: +1 win, -1 loss, 0 draw.
        # (A drawn game must stay 0 and not fall through to the loss case.)
        margin = board.diff_num() if board.turn else -board.diff_num()
        if margin > 0:
            z = 1
        elif margin < 0:
            z = -1
        else:
            z = 0

        if not log_probs:
            # The learner never moved; there is nothing to learn from.
            continue

        # Only the learner's final move receives a non-zero reward.
        rewards = [0] * len(log_probs)
        rewards[-1] = z

        # REINFORCE update: loss = -sum_t G_t * log pi(a_t | s_t), with the
        # discounted return G_t accumulated backwards over the learner's moves.
        # Only the learner's trajectory is used: the opponent's probabilities
        # come from a frozen copy whose parameters the optimizer does not own,
        # so a loss built from them cannot update ``model``.
        model.train()
        optimizer.zero_grad()
        loss = 0.
        reward = 0.
        for log_p, r in zip(reversed(log_probs), reversed(rewards)):
            reward = gamma*reward + r
            loss = loss - log_p * reward
        loss.backward()
        optimizer.step()

    # model.cpu() moves the live model in place, so put it back on the
    # training device afterwards to keep it alongside the optimizer state.
    torch.save(model.cpu(), f'RL-PolicyNetwork-checkpoint-{i+1}.pth')
    model.to(device)
    ratio,mean,std,m,M = test_play(model, 1000, greedy=True)
    print(f'[{i+1}/{n_set}] ratio:{ratio*100:.1f}%, mean:{mean:.1f}, std:{std:.1f}, min:{m:.0f}, max:{M:.0f}')

In [None]:
# model = torch.load('/kaggle/working/RL-PolicyNetwork-checkpoint-1.pth')

In [None]:
# Demonstration game: the trained model (moving when board.turn is truthy,
# always picking its most probable legal move) plays the first opponent
# snapshot, which samples its moves. Before each model move the board and a
# heatmap of the model's softmax over all 64 outputs (in percent) are shown.
model.eval()
model.to(device)
# Put the network that actually plays the other side into eval mode.
opponent = model_op_list[0]
opponent.eval()
opponent.to(device)

board = Board()
# Pure inference: no autograd graph is needed for any move.
with torch.no_grad():
    while not board.is_game_over():
        legal_moves = list(board.legal_moves)
        if 64 in legal_moves:
            # 64 marks "no legal move": the side to move passes.
            board.move_pass()
            continue

        board_arr = torch.from_numpy(board_to_array(board)).unsqueeze(0).to(device)
        if board.turn:
            ## Model's turn
            output = model(board_arr)[0].cpu()
            prob = output[legal_moves].softmax(dim=0).numpy()
            ### Visualisation
            display(board)
            plt.figure(figsize=(3,3))
            p = output.softmax(0).numpy().reshape(8,8)*100
            sns.heatmap(p, cmap='gray_r', fmt='.0f', annot=True, cbar=False)
            plt.show()
            ### Action: greedy
            move = legal_moves[prob.argmax()]
        else:
            ## Opponent's turn
            output = opponent(board_arr)[0].cpu()
            prob = output[legal_moves].softmax(dim=0).numpy()
            ### Action: sampled
            move = np.random.choice(legal_moves, p=prob)
        board.move(move)
display(board)
board.diff_num()