# 概要
- このノートブックでは、ValueNetworkを学習させる。
- ネットワークの構造は以下のとおり。
    - 入力層：9チャネル
        - 黒石の位置(1)
        - 白石の位置(1)
        - 空白の位置(1)
        - 合法手の位置(1)
        - そこに打った場合、何個石を返せるか(1)
        - 隅の危険領域4マス×4隅をすべて1で埋める(1)
        - すべて1で埋める(1)
        - すべて0で埋める(1)
        - **手番情報：黒番ならすべて0で埋め、白番ならすべて1で埋める**(1)
    - 第1層：5x5のn_filters種類のフィルターとReLU関数
    - 第2-11層：3x3のn_filters種類のフィルターとReLU関数
    - 第12層：3x3のn_filters種類のフィルター
    - 第13層：1x1のn_filters種類のフィルター
    - 第14層：出力256個の全結合ネットワークとReLU関数
    - 第15層：出力1個の全結合ネットワークとtanh関数
- 学習データの作成方法は以下のとおり。（cf.AlphaGo解体新書p.171）
    - 1以上60以下の整数からランダムに数字を選択し、これをUとする。
    - SL-PolicyNetworkをU-1回使って、U-1手目まで局面を進める。
    - 次のU手目は合法手の中からランダムに選択し局面を進め、この局面をSとする。
    - 局面Sからは、RLポリシーネットワークを使って、終局まで手を進める。最終的な勝敗をzとする。
    - 組(S,z)を学習データとする。
- ※ただし、このノートブックの現在の実装では上記の手順は用いておらず、初手から終局まで合法手の中からランダムに手を選んで対局を進め、途中のすべての局面と最終的な勝敗の組を学習データとしている。

In [None]:
!python -m pip install --no-index --find-links=/kaggle/input/reversi-datasets/ creversi

In [None]:
# リバーシ用ライブラリ
from creversi import Board, move_rotate90, move_rotate180, move_rotate270, move_from_str, move_to_str
import creversi
# 基礎ライブラリ
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from copy import copy, deepcopy
import gc
import os
# 学習用ライブラリ
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split

In [None]:
def board_to_array(board):
    """
    Encode a board object as an (8, 8, 8) float32 ndarray (PolicyNetwork input).

    Planes, in order:
        0: black stones
        1: white stones
        2: empty squares
        3: legal-move squares
        4: number of stones each legal move would flip
        5: the 2x2 block at each of the four corners set to 1
        6: all ones
        7: all zeros
    """
    planes = np.zeros((8, 8, 8), dtype=np.float32)
    board.piece_planes(planes)
    if not board.turn:
        # Exchange the first two planes so that plane 0 is always black
        # (presumably piece_planes writes the side to move first -- confirm
        # against the creversi documentation).
        planes[[0, 1]] = planes[[1, 0]]

    # A square is empty unless exactly one colour occupies it.
    planes[2] = (planes[0] + planes[1]) != 1

    candidates = list(board.legal_moves)
    if candidates != [64]:  # [64] is the "pass" move: nothing to mark
        flips = np.zeros(64)
        for square in candidates:
            trial = copy(board)
            opponent_before = trial.opponent_piece_num()
            trial.move(square)
            # After the move the turn has changed, so piece_num() now counts
            # the stones of the side that was the opponent before the move.
            flips[square] = opponent_before - trial.piece_num()
        flips = flips.reshape(8, 8)
        planes[3] = flips > 0
        planes[4] = flips

    # Mark the 2x2 block in every corner of the board.
    corner_mask = np.zeros((8, 8))
    border = (slice(0, 2), slice(6, 8))
    for rows in border:
        for cols in border:
            corner_mask[rows, cols] = 1.
    planes[5] = corner_mask

    planes[6] = 1
    return planes

In [None]:
def board_to_array2(board):
    """
    Encode a board object as a (9, 8, 8) float32 ndarray (ValueNetwork input).

    Planes, in order:
        0: black stones
        1: white stones
        2: empty squares
        3: legal-move squares
        4: number of stones each legal move would flip
        5: the 2x2 block at each of the four corners set to 1
        6: all ones
        7: all zeros
        8: side to move (all zeros for black, all ones for white)
    """
    features = np.zeros((9, 8, 8), dtype=np.float32)
    board.piece_planes(features)
    white_to_move = not board.turn
    if white_to_move:
        # Exchange the first two planes so that plane 0 is always black
        # (presumably piece_planes writes the side to move first -- confirm
        # against the creversi documentation), then flag the turn plane.
        features[[0, 1]] = features[[1, 0]]
        features[8] = 1

    # A square is empty unless exactly one colour occupies it.
    occupancy = features[0] + features[1]
    features[2] = occupancy != 1

    move_list = list(board.legal_moves)
    if move_list != [64]:  # [64] is the "pass" move: nothing to mark
        flipped = np.zeros(64)
        for mv in move_list:
            scratch = copy(board)
            before = scratch.opponent_piece_num()
            scratch.move(mv)
            # After the move the turn has changed, so piece_num() now counts
            # the stones of the side that was the opponent before the move.
            flipped[mv] = before - scratch.piece_num()
        flipped = flipped.reshape(8, 8)
        features[3] = flipped > 0
        features[4] = flipped

    # Mark the 2x2 block in every corner of the board.
    corners = np.zeros((8, 8))
    corners[:2, :2] = 1.
    corners[:2, 6:] = 1.
    corners[6:, :2] = 1.
    corners[6:, 6:] = 1.
    features[5] = corners

    features[6] = 1
    return features

In [None]:
class ValueNetwork(nn.Module):
    """
    Convolutional value network: (N, 9, 8, 8) board encoding -> (N, 1) in [-1, 1].

    Architecture (with the default arguments):
        * input layer: 5x5 convolution (9 -> n_filters channels) + ReLU
        * n_hidden_layers x [3x3 convolution + ReLU]
        * one 3x3 convolution and one 1x1 convolution, both without activation
        * flatten -> Linear(256) + ReLU -> Linear(1) -> tanh

    Args:
        n_filters: number of channels used by every convolution (default 10).
        n_hidden_layers: number of 3x3 convolution + ReLU pairs (default 10).

    With the defaults the module order, and therefore the ``state_dict`` keys
    and the parameter initialisation order, are the same as in the previous
    hand-unrolled definition.
    """

    def __init__(self, n_filters=10, n_hidden_layers=10):
        super().__init__()
        self.input_layer = nn.Sequential(
            nn.Conv2d(9, n_filters, kernel_size=5, padding=2),
            nn.ReLU(),
        )

        layers = []
        for _ in range(n_hidden_layers):
            layers.append(nn.Conv2d(n_filters, n_filters, kernel_size=3, padding=1))
            layers.append(nn.ReLU())
        layers.append(nn.Conv2d(n_filters, n_filters, kernel_size=3, padding=1))
        # NOTE(review): padding=1 on a 1x1 convolution enlarges the 8x8 maps to
        # 10x10, which is why the first Linear layer takes n_filters * 10 * 10
        # inputs. This looks accidental, but it is kept as-is so that the
        # parameter shapes of existing checkpoints stay valid.
        layers.append(nn.Conv2d(n_filters, n_filters, kernel_size=1, padding=1))
        layers.append(nn.Flatten())
        self.hidden_layer = nn.Sequential(*layers)

        self.output_layer = nn.Sequential(
            nn.Linear(n_filters * 10 * 10, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
        )

    def forward(self, x):
        """Return the tanh-squashed value for a batch ``x`` of shape (N, 9, 8, 8)."""
        out = self.input_layer(x)
        out = self.hidden_layer(out)
        out = self.output_layer(out)
        return out.tanh()

In [None]:
def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) RNGs with ``seed``.

    Also stores the seed in the ``PYTHONHASHSEED`` environment variable and
    turns on cuDNN's deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True

# 学習

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

In [None]:
# N = 2500
# augmentation = True

# states = []
# results = []

# for n in tqdm(range(N)):
#     board = Board()
#     n_step = 0
#     while not board.is_game_over():
#         board_array = board_to_array2(board)
#         states.append(board_array)
#         if augmentation:
#             states.append(np.flip(board_array,axis=2).copy())
#             for k in range(1,4):
#                 board_array_rot = np.rot90(board_array, k=k, axes=(1,2)).copy()
#                 states.append(board_array_rot)
#                 states.append(np.flip(board_array_rot, axis=2).copy())
#             n_step += 8
#         else:
#             n_step += 1
#         legal_moves = list(board.legal_moves)
#         move = np.random.choice(legal_moves)
#         board.move(move)

#     result = board.diff_num() if board.turn else -board.diff_num()
# #     results += [result/64]*n_step
#     if result > 0:
#         results += [1]*n_step
#     elif result < 0:
#         results += [-1]*n_step
#     else:
#         results += [-1]*n_step

# states = np.array(states, dtype=np.float32)
# results = np.array(results, dtype=np.float32).reshape(-1,1)
        
# print(f"boards : {states.shape[0]}")

In [None]:
# n_epoch = 5
# n_batch = 256
# lr = 0.001
# N = states.shape[0]

# seed_everything(1234)
# model = ValueNetwork().to(device)
# optim = torch.optim.AdamW(model.parameters(),lr=lr)
# criterion = nn.HuberLoss()
# train_loss_list = []

In [None]:
# for epoch in range(n_epoch):
#     train_loss = 0.
#     np.random.seed(epoch)
#     random_idx = np.random.permutation(N)
#     for i in tqdm(range(N//n_batch)):
#         X_batch = torch.from_numpy(states[random_idx[n_batch*i:n_batch*(i+1)]]).to(device)
#         y_batch = torch.from_numpy(results[random_idx[n_batch*i:n_batch*(i+1)]]).to(device)
        
#         model.train()
#         optim.zero_grad()
#         output = model(X_batch)
#         loss = criterion(output, y_batch)
#         loss.backward()
#         optim.step()
#         train_loss += loss.item()
#     train_loss /= N//n_batch
#     train_loss_list.append(train_loss)
    
#     # 評価
#     print(f'Epoch:{epoch+1}/{n_epoch}, train loss:{train_loss:.5f}')
#     torch.save(model.cpu(), f'ValueNetwork-v1-checkpoint-{epoch+1}.pth')
#     model.to(device)
    
#     # この対局の形勢を判断
#     moves = "d3,e3,f2,e2,f5,c5,b6,e6,f6,c6,d6,c4,f3,f7,d7,e7,f4,b5,c3,g5,g6,b4,c7,d2,a6,a5,a3,a4,b3,d8,h6,h5,h4,g4,h3,g3,c2,f1,e1,d1,g2,g1,c1,b7,h1,b1,h2,a2,a8,a7,a1,b2,b8,c8,e8,g8,f8,g7,h8,h7"
#     moves = [move_from_str(move_str) for move_str in moves.split(',')]
#     # iPadアプリの評価と比較
#     v_app = [0,0,-6,0,-8,0,-19,0,-11,-9,-16,-11,-20,-6,-10,0,-8,0,-18,-9,-10,-8,-7,0,0,0,0,0,-9,0,0,0,6,12,11,12,0,5,0,0,5,15,12,30,31,44,36,54,51,59,44,44,40,40,24,24,24,32,32,32]
#     v_list = []
#     model.eval()
#     board = Board()
#     for move in moves:
#         v = model(torch.from_numpy(board_to_array2(board)).unsqueeze(0).to(device)).item()
#         v_list.append(v*64)
#         board.move(move)
#     plt.figure(figsize=(4,1))
#     plt.plot(v_list, c='red')
#     plt.plot(v_app, c='blue')
#     plt.ylim(-64,64)
#     plt.axhline(0, c='black', ls='--')
#     plt.title(np.corrcoef(v_list, v_app)[0,1])
#     plt.show()

In [None]:
# ---- Data generation ----
J = 10                # number of generate-then-train rounds
n_episode = 2500      # games played per round
augmentation = True   # also store the mirrored / rotated copies of each position

# ---- Optimisation ----
n_epoch = 10          # passes over each round's data
n_batch = 256         # mini-batch size
lr = 0.001            # AdamW learning rate

# ---- Model, optimiser and loss ----
# Seed first so that the network initialisation is reproducible.
seed_everything(1234)
model = ValueNetwork().to(device)
optim = torch.optim.AdamW(params=model.parameters(), lr=lr)
criterion = nn.HuberLoss()

In [None]:
def _eight_symmetries(board_array):
    """Return the 8 symmetric variants of a (C, 8, 8) array.

    Order: original, mirrored original, then for each of the three further
    rotations the rotated array followed by its mirror image.
    """
    variants = [board_array, np.flip(board_array, axis=2).copy()]
    for k in range(1, 4):
        rotated = np.rot90(board_array, k=k, axes=(1, 2)).copy()
        variants.append(rotated)
        variants.append(np.flip(rotated, axis=2).copy())
    return variants


def _evaluate_reference_game(model, moves):
    """Replay ``moves`` from the initial position and return the model's
    evaluation of every position before each move, scaled by 64."""
    model.eval()
    board = Board()
    values = []
    # Inference only: no autograd graph is needed here.
    with torch.no_grad():
        for move in moves:
            x = torch.from_numpy(board_to_array2(board)).unsqueeze(0).to(device)
            values.append(model(x).item() * 64)
            board.move(move)
    return values


# Reference game whose positions are evaluated after every epoch. It is parsed
# once here instead of once per epoch.
moves = "d3,e3,f2,e2,f5,c5,b6,e6,f6,c6,d6,c4,f3,f7,d7,e7,f4,b5,c3,g5,g6,b4,c7,d2,a6,a5,a3,a4,b3,d8,h6,h5,h4,g4,h3,g3,c2,f1,e1,d1,g2,g1,c1,b7,h1,b1,h2,a2,a8,a7,a1,b2,b8,c8,e8,g8,f8,g7,h8,h7"
moves = [move_from_str(move_str) for move_str in moves.split(',')]
# Evaluations of the same game taken from an iPad app, used as the baseline.
v_app = [0,0,-6,0,-8,0,-19,0,-11,-9,-16,-11,-20,-6,-10,0,-8,0,-18,-9,-10,-8,-7,0,0,0,0,0,-9,0,0,0,6,12,11,12,0,5,0,0,5,15,12,30,31,44,36,54,51,59,44,44,40,40,24,24,24,32,32,32]

for j in range(J):
    print(f'-----j={j+1}-----')

    # ---- Data generation: games played with uniformly random legal moves ----
    states = []
    results = []
    for n in tqdm(range(n_episode)):
        board = Board()
        n_step = 0
        while not board.is_game_over():
            board_array = board_to_array2(board)
            samples = _eight_symmetries(board_array) if augmentation else [board_array]
            states.extend(samples)
            n_step += len(samples)
            move = np.random.choice(list(board.legal_moves))
            board.move(move)
        # Final stone difference, sign-adjusted by the side to move
        # (presumably this yields black minus white -- confirm against the
        # creversi documentation for diff_num()).
        result = board.diff_num() if board.turn else -board.diff_num()
        # Label every position of the game with the outcome: +1 / -1 for a
        # decided game and 0 for a draw. (Draws used to be labelled -1, the
        # same as a loss.)
        results += [float(np.sign(result))] * n_step
    states = np.array(states, dtype=np.float32)
    results = np.array(results, dtype=np.float32).reshape(-1, 1)
    N = states.shape[0]
    print(f"N : {N}")

    # ---- Training ----
    n_iter = N // n_batch  # the last, incomplete batch is dropped
    for epoch in range(n_epoch):
        train_loss = 0.
        # A local generator gives the same permutation as seeding the global
        # NumPy RNG with `epoch`, but leaves the global stream (used for the
        # random self-play above) untouched.
        random_idx = np.random.RandomState(epoch).permutation(N)
        model.train()
        for i in tqdm(range(n_iter)):
            batch_idx = random_idx[n_batch*i:n_batch*(i+1)]
            X_batch = torch.from_numpy(states[batch_idx]).to(device)
            y_batch = torch.from_numpy(results[batch_idx]).to(device)

            optim.zero_grad()
            output = model(X_batch)
            loss = criterion(output, y_batch)
            loss.backward()
            optim.step()
            train_loss += loss.item()
        train_loss /= max(n_iter, 1)  # avoid dividing by zero on tiny datasets

        # ---- Checkpoint ----
        print(f'Epoch:{epoch+1}/{n_epoch}, train loss:{train_loss:.5f}')
        # model.cpu() moves the module in place, so it is moved back afterwards.
        torch.save(model.cpu(), f'ValueNetwork-v1-checkpoint-{j+1}-{epoch+1}.pth')
        model.to(device)

        # ---- Evaluate the reference game and compare with the app ----
        v_list = _evaluate_reference_game(model, moves)
        plt.figure(figsize=(4,1))
        plt.plot(v_list, c='red')
        plt.plot(v_app, c='blue')
        plt.ylim(-64,64)
        plt.axhline(0, c='black', ls='--')
        plt.title(np.corrcoef(v_list, v_app)[0,1])
        plt.show()

In [None]:
# moves = "d3,e3,f2,e2,f5,c5,b6,e6,f6,c6,d6,c4,f3,f7,d7,e7,f4,b5,c3,g5,g6,b4,c7,d2,a6,a5,a3,a4,b3,d8,h6,h5,h4,g4,h3,g3,c2,f1,e1,d1,g2,g1,c1,b7,h1,b1,h2,a2,a8,a7,a1,b2,b8,c8,e8,g8,f8,g7,h8,h7"
# moves = [move_from_str(move_str) for move_str in moves.split(',')]
# v_app = [0,0,-6,0,-8,0,-19,0,-11,-9,-16,-11,-20,-6,-10,0,-8,0,-18,-9,-10,-8,-7,0,0,0,0,0,-9,0,0,0,6,12,11,12,0,5,0,0,5,15,12,30,31,44,36,54,51,59,44,44,40,40,24,24,24,32,32,32]
# v_mean = np.zeros(len(moves))

# for j in range(J):
#     model = torch.load(f'ValueNetwork-v1-checkpoint-{j+1}-{n_epoch}.pth')
#     model.to(device)

#     # この対局の形勢を判断
#     v_list = []
#     model.eval()
#     board = Board()
#     for move in moves:
#         v = model(torch.from_numpy(board_to_array2(board)).unsqueeze(0).to(device)).item()
#         v_list.append(v*64)
#         board.move(move)
#     v_mean += np.array(v_list)

# plt.figure(figsize=(4,1))
# plt.plot(v_mean/J, c='red')
# plt.plot(v_app, c='blue')
# plt.ylim(-64,64)
# plt.axhline(0, c='black', ls='--')
# plt.title(np.corrcoef(v_list, v_app)[0,1])
# plt.show()