In [2]:
# environment:
# pip3 install torch
# Demo: checking five in a row near the edge of a gomoku board would index past
# the board, so the matrix is zero-padded first. np.pad(a, (4, 4)) adds 4 zeros
# on every side of both axes (the 2 x 2 input below becomes 10 x 10).
import numpy as np
a = np.array([[1, 2], [3, 4]])
np.pad(a,(4,4))
# print(a)

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 2, 0, 0, 0, 0],
       [0, 0, 0, 0, 3, 4, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [33]:
# Try the algorithm out on gomoku (five in a row).
# You can change this to another two-player game.
# The state tensor gets one extra channel that encodes the side to move.

# TODO 1. Train the nets on GPU. Done on 2023.1.7
# TODO 2. Implement a replay memory that stores trajectories for training the network. Done on 2023.1.11


from typing import Union, List
import numpy as np

BLACK, WHITE = 1, -1  # stone colours in move order: black circle 'O' moves first, white cross 'X' second

class State:
    '''A 9 x 9 gomoku (five in a row) position together with its move record.'''
    X, Y = 'ABCDEFGHI',  '123456789'  # row labels, column labels
    C = {0: '_', BLACK: 'O', WHITE: 'X'}  # cell value -> printed symbol

    def __init__(self):
        self.board = np.zeros((9, 9)) # (x, y): 0 = empty, BLACK / WHITE = stone
        self.color = BLACK  # side to move
        self.win_color = 0  # colour that completed five in a row, 0 while nobody has
        self.record = []  # actions played so far, in order

    def action2str(self, a:int):
        """Translate an action index (0-80) into its two-character label.

            '1' '2' '3' ... '9'
        'A'  0   1   2  ...  8
        'B'  9   10  11 ...  17
        'C'  18  19  20 ...  26
        ...             ...
        'I'  72  73  74 ...  80
        """
        return self.X[a // 9] + self.Y[a % 9]

    def str2action(self, s:str):
        """Translate a label such as 'B3' into its action index (0-80).

        Raises
        ------
            ValueError if `s` is not a row letter A-I followed by a column digit 1-9.
            (str.find would return -1 here and silently yield a wrong index.)
        """
        if len(s) != 2 or s[0] not in self.X or s[1] not in self.Y:
            raise ValueError('invalid move label: %r' % (s,))
        return self.X.index(s[0]) * 9 + self.Y.index(s[1])

    def record_string(self):
        """Join the recorded actions into one string.

        Returns
        -------
            The move labels separated by single spaces, so the result can be
            split() again and fed back to play().
        """
        return ' '.join(self.action2str(a) for a in self.record)

    def __str__(self):
        # Render the board: a header row of column labels, one line per board
        # row, and finally the move record.
        lines = ['   ' + ' '.join(self.Y)]
        for i in range(9):
            lines.append(self.X[i] + ' ' + ' '.join(self.C[int(v)] for v in self.board[i]))
        lines.append('record = ' + self.record_string())
        return '\n'.join(lines)

    def check_win(self, x:int, y:int):
        """Tell whether the stone at (x, y) is part of five or more stones of
        the side to move (self.color) on a row, column or diagonal.

        The unbroken run through (x, y) is counted in both directions along
        each of the four lines, stopping at the board edge, so no padded copy
        of the board is needed.
        """
        color = self.color
        if self.board[x, y] != color:
            return False
        rows, cols = self.board.shape
        for dx, dy in ((1, 0), (0, 1), (1, 1), (1, -1)):
            run = 1  # the stone at (x, y) itself
            for sign in (1, -1):
                i, j = x + sign * dx, y + sign * dy
                while 0 <= i < rows and 0 <= j < cols and self.board[i, j] == color:
                    run += 1
                    i, j = i + sign * dx, j + sign * dy
            if run >= 5:
                return True
        return False

    def play(self, action:Union[str, int]) -> 'State':
        # About the type hint: a class is only bound to its name after the whole
        # class body has been read, so the class refers to itself by a string.
        """State transition.

        Parameters
        ----------
            action : an action index 0-80, or a string of space-separated move
                     labels that are played one after another
        Returns
        -------
            self
        Raises
        ------
            ValueError for an index outside 0-80 or a malformed label.

        Occupied cells are not rejected: the stone is simply overwritten, so
        callers should choose from legal_actions().
        """
        # A label sequence reduces to repeated single integer moves.
        if isinstance(action, str):
            for astr in action.split(): # split on whitespace
                self.play(self.str2action(astr))
            return self

        # Negative indices would otherwise wrap around the board silently.
        if not 0 <= action < 9 * 9:
            raise ValueError('action out of range: %r' % (action,))

        x, y = action // 9, action % 9
        self.board[x, y] = self.color

        # Five in a row through the new stone decides the game.
        if self.check_win(x, y):
            self.win_color = self.color

        self.color = -self.color
        self.record.append(action)
        return self

    def terminal(self):
        # True once somebody has won or 81 moves have been recorded; used as
        # the self-play loop condition.
        return self.win_color != 0 or len(self.record) == 9 * 9

    def terminal_reward(self):
        # Result seen from the side to move: 1 if that side won, -1 if it lost,
        # 0 if nobody has won.
        return self.win_color if self.color == BLACK else -self.win_color

    def legal_actions(self) -> List[int]:
        # Indices of all empty cells, in ascending order.
        return np.flatnonzero(self.board == 0).tolist()

    def feature(self, to_cuda:bool = False):
        """Build the network input as a float32 array of three 9 x 9 planes:
        [side to move (constant plane), stones of the side to move, stones of the opponent].

        With to_cuda=True the array is returned as a CUDA tensor with a leading
        batch dimension instead.
        """
        now_mover = np.full(self.board.shape, self.color) # extra channel for the convolution
        s = np.stack([now_mover, self.board == self.color, self.board == -self.color]).astype(np.float32)
        if to_cuda:
            import torch  # local import: this cell does not import torch itself
            return torch.from_numpy(s).unsqueeze(0).cuda()
        return s

    def action_feature(self, action, to_cuda:bool = False):
        """Build the action input: a (1, 9, 9) float32 one-hot plane marking `action`.

        With to_cuda=True the array is returned as a CUDA tensor with a leading
        batch dimension instead.
        """
        a = np.zeros((1, 9, 9), dtype=np.float32)
        a[0, action // 9, action % 9] = 1
        if to_cuda:
            import torch  # local import: this cell does not import torch itself
            return torch.from_numpy(a).unsqueeze(0).cuda()
        return a

state = State().play('A2')
print(state)
print('input feature')
print(state.feature())
state = State().play('B2 A1 I2')
print(state)
print('input feature')
print(state.feature())

   1 2 3 4 5 6 7 8 9
A _ O _ _ _ _ _ _ _
B _ _ _ _ _ _ _ _ _
C _ _ _ _ _ _ _ _ _
D _ _ _ _ _ _ _ _ _
E _ _ _ _ _ _ _ _ _
F _ _ _ _ _ _ _ _ _
G _ _ _ _ _ _ _ _ _
H _ _ _ _ _ _ _ _ _
I _ _ _ _ _ _ _ _ _
record = A2
input feature
[[[-1. -1. -1. -1. -1. -1. -1. -1. -1.]
  [-1. -1. -1. -1. -1. -1. -1. -1. -1.]
  [-1. -1. -1. -1. -1. -1. -1. -1. -1.]
  [-1. -1. -1. -1. -1. -1. -1. -1. -1.]
  [-1. -1. -1. -1. -1. -1. -1. -1. -1.]
  [-1. -1. -1. -1. -1. -1. -1. -1. -1.]
  [-1. -1. -1. -1. -1. -1. -1. -1. -1.]
  [-1. -1. -1. -1. -1. -1. -1. -1. -1.]
  [-1. -1. -1. -1. -1. -1. -1. -1. -1.]]

 [[ 0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.]]

 [[ 0.  1.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0

In [24]:
# 定义组件网络Res&Conv

import torch
import torch.nn as nn
import torch.nn.functional as F

class Conv(nn.Module):
    """Bias-free, stride-1 2-D convolution with kernel_size // 2 padding,
    optionally followed by batch normalisation."""
    def __init__(self, filters0, filters1, kernel_size, bn=False):
        super().__init__()
        self.conv = nn.Conv2d(
            filters0,
            filters1,
            kernel_size,
            stride=1,
            padding=kernel_size//2,
            bias=False,
        )
        # self.bn stays None when batch norm is not requested
        self.bn = nn.BatchNorm2d(filters1) if bn else None

    def forward(self, x):
        out = self.conv(x)
        return out if self.bn is None else self.bn(out)

class ResidualBlock(nn.Module):
    """Residual unit: relu(x + bn(conv3x3(x))) with the same channel count in and out."""
    def __init__(self, filters):
        super().__init__()
        self.conv = Conv(filters, filters, kernel_size=3, bn=True)

    def forward(self, x):
        residual = self.conv(x)
        return F.relu(x + residual)

In [25]:
num_filters = 16  # channel count of the hidden representation and of every residual block
num_blocks = 4  # number of ResidualBlocks stacked in Representation and in Dynamics

class Representation(nn.Module):
    ''' Encoder that maps an observation to the inner abstract state '''
    def __init__(self, input_shape):
        super().__init__()
        self.input_shape = input_shape # (c, 9, 9)
        self.board_size = input_shape[1] * input_shape[2]
        # stem convolution: observation channels -> num_filters channels
        self.layer0 = Conv(input_shape[0], num_filters, 3, bn=True)
        self.blocks = nn.ModuleList()
        for _ in range(num_blocks):
            self.blocks.append(ResidualBlock(num_filters))

    def forward(self, x):
        hidden = F.relu(self.layer0(x))
        for res_block in self.blocks:
            hidden = res_block(hidden)
        return hidden # e.g. torch.Size([1, 16, 9, 9])

    def inference(self, x, pass_to_cpu:bool = True):
        '''Run forward() in eval mode without gradients.

        x must already be a 4-D tensor (batch dimension first). Returns the
        tensor itself when pass_to_cpu is False, otherwise the first batch
        element as a numpy array.
        '''
        self.eval()
        with torch.no_grad():
            rp = self(x)
        return rp.cpu().numpy()[0] if pass_to_cpu else rp

class Prediction(nn.Module):
    ''' Policy head and value head on top of the inner abstract state '''
    def __init__(self, action_shape):
        super().__init__()
        self.board_size = np.prod(action_shape[1:]) # 9 x 9 = 81
        self.action_size = action_shape[0] * self.board_size # 1 x 81 = 81

        # policy head: two 1x1 convolutions down to a single plane
        self.conv_p1 = Conv(num_filters, 4, 1, bn=True)
        self.conv_p2 = Conv(4, 1, 1)

        # value head: 1x1 convolution, then a linear layer to one scalar
        self.conv_v = Conv(num_filters, 4, 1, bn=True)
        self.fc_v = nn.Linear(self.board_size * 4, 1, bias=False)

    def forward(self, rp):
        # policy: [B, 16, 9, 9] -> [B, 4, 9, 9] -> [B, 1, 9, 9] -> [B, 81]
        policy_logits = self.conv_p2(F.relu(self.conv_p1(rp))).view(-1, self.action_size)
        # value: [B, 16, 9, 9] -> [B, 4, 9, 9] -> [B, 324] -> [B, 1]
        value_hidden = F.relu(self.conv_v(rp)).view(-1, self.board_size * 4)
        value_logit = self.fc_v(value_hidden)
        # softmax makes the policy a distribution; tanh keeps the value in -1 ~ 1
        return F.softmax(policy_logits, dim=-1), torch.tanh(value_logit)

    def inference(self, rp, pass_to_cpu:bool = True):
        '''Run forward() in eval mode without gradients.

        Returns the (p, v) tensors when pass_to_cpu is False, otherwise the
        first batch element's policy as a numpy vector and its value as a
        numpy scalar.
        '''
        self.eval()
        with torch.no_grad():
            policy, value = self(rp)
        if pass_to_cpu:
            return policy.cpu().numpy()[0], value.cpu().numpy()[0][0]
        return policy, value

class Dynamics(nn.Module):
    '''Transition model: (abstract state, action plane) -> next abstract state'''
    def __init__(self, rp_shape, act_shape):
        super().__init__()
        self.rp_shape = rp_shape
        # state and action planes are concatenated along the channel axis
        in_channels = rp_shape[0] + act_shape[0]
        self.layer0 = Conv(in_channels, num_filters, 3, bn=True)
        self.blocks = nn.ModuleList()
        for _ in range(num_blocks):
            self.blocks.append(ResidualBlock(num_filters))

    def forward(self, rp, a):
        hidden = self.layer0(torch.cat((rp, a), dim=1)) # e.g. [1, 17, 9, 9] -> [1, 16, 9, 9]
        for res_block in self.blocks:
            hidden = res_block(hidden)
        return hidden

    def inference(self, rp, a, pass_to_cpu:bool = True):
        '''Run forward() in eval mode without gradients.

        Returns the next-state tensor when pass_to_cpu is False, otherwise its
        first batch element as a numpy array.
        '''
        self.eval()
        with torch.no_grad():
            next_rp = self(rp, a)
        return next_rp.cpu().numpy()[0] if pass_to_cpu else next_rp

class Net(nn.Module):
    '''Whole net: representation, prediction and dynamics sub-networks.'''
    def __init__(self):
        super().__init__()
        state = State()
        input_shape = state.feature().shape # state (c, 9, 9)
        action_shape = state.action_feature(0).shape # action (1, 9, 9)
        rp_shape = (num_filters, *input_shape[1:]) # hidden space (16, 9, 9)

        self.representation = Representation(input_shape)
        self.prediction = Prediction(action_shape)
        self.dynamics = Dynamics(rp_shape, action_shape)

    def _to_batch(self, array):
        '''Wrap a numpy feature as a batch of one on the device holding this net's parameters.'''
        device = next(self.parameters()).device
        return torch.from_numpy(array).unsqueeze(0).to(device)

    def predict(self, state0, path):
        '''Predict p and v from original state and path.

        Parameters
        ----------
            state0 : the real position the prediction starts from
            path   : action indices applied one after another through the dynamics net
        Returns
        -------
            A list with len(path) + 1 entries; entry i is the (p, v) pair (numpy
            policy vector, numpy value) predicted after the first i actions.

        Inputs are moved to whatever device the net lives on, so the same code
        works for a net on CPU and for a net moved to a GPU with .cuda().
        '''
        rp = self.representation.inference(self._to_batch(state0.feature()), pass_to_cpu=False)
        outputs = [self.prediction.inference(rp, pass_to_cpu=True)]
        for action in path:
            a = self._to_batch(state0.action_feature(action))
            rp = self.dynamics.inference(rp, a, pass_to_cpu=False)
            outputs.append(self.prediction.inference(rp, pass_to_cpu=True))
        return outputs

In [26]:
# 给上面的三个网络做单元测试用
def show_net(net, state):
    '''Print the state, then the policy (p, scaled by 10000 and truncated to
    int, laid out like the board) and the value (v) the net predicts for it.'''
    print(state)
    policy, value = net.predict(state, [])[-1]
    board_dims = tuple(net.representation.input_shape[1:3])
    scaled_policy = (policy * 10000).astype(int)
    print('p = ')
    print(scaled_policy.reshape((-1,) + board_dims))
    print('v = ', value)
    print()

#  Outputs before training: policy and value of a freshly initialised net on
#  the empty board. Net().cuda() requires a CUDA device.
show_net(Net().cuda(), State())

   1 2 3 4 5 6 7 8 9
A _ _ _ _ _ _ _ _ _
B _ _ _ _ _ _ _ _ _
C _ _ _ _ _ _ _ _ _
D _ _ _ _ _ _ _ _ _
E _ _ _ _ _ _ _ _ _
F _ _ _ _ _ _ _ _ _
G _ _ _ _ _ _ _ _ _
H _ _ _ _ _ _ _ _ _
I _ _ _ _ _ _ _ _ _
record = 
p = 
[[[121 124 125 125 125 125 124 124 123]
  [119 122 123 123 123 123 124 124 124]
  [117 124 122 123 123 123 123 124 126]
  [117 124 122 123 123 123 124 125 127]
  [117 124 122 123 123 123 123 125 126]
  [117 124 122 123 123 123 124 125 127]
  [117 124 122 123 123 123 123 125 127]
  [117 124 123 123 123 123 123 125 127]
  [120 121 119 120 120 120 120 122 130]]]
v =  0.0064606424



In [27]:
# 实现蒙特卡洛树搜索MCTS

class Node:
    '''Search statistics of one abstract (or root) state'''
    def __init__(self, p, v):
        self.p = p  # prior policy over actions
        self.v = v  # predicted value of this state
        # per-action visit counts and accumulated q values, shaped like p
        self.n = np.zeros_like(p)
        self.q_sum = np.zeros_like(p)
        # prior for the overall statistics: one visit worth v / 2
        self.n_all = 1
        self.q_sum_all = v / 2

    def update(self, action, q_new):
        '''Record one more visit of `action` that returned the value `q_new`.'''
        # overall statistics
        self.q_sum_all += q_new
        self.n_all += 1

        # statistics of the chosen action
        self.q_sum[action] += q_new
        self.n[action] += 1

In [28]:
import time
import copy

class Tree:
    '''Monte Carlo tree search that expands nodes with the learned model (net).'''
    def __init__(self, net):
        self.net = net
        self.nodes = {}  # search key (built in search()) -> Node

    def _to_batch(self, array):
        '''Wrap a numpy feature as a batch of one on the device holding the net's parameters.'''
        device = next(self.net.parameters()).device
        return torch.from_numpy(array).unsqueeze(0).to(device)

    def search(self, state, path, rp, depth):
        '''Run one simulation step and return the value seen from the node's mover.

        Parameters
        ----------
            state : the real root position (never modified here)
            path  : actions taken below the root so far; extended in place
            rp    : abstract-state tensor for the current node, on the net's device
            depth : 0 at the root, +1 per recursion
        '''
        # A node is identified by the real move record plus the imagined path.
        key = state.record_string()
        if len(path) > 0:
            key += '|' + ' '.join(map(state.action2str, path))
        if key not in self.nodes:
            # Leaf: expand it with the prediction net and back its value up.
            p, v = self.net.prediction.inference(rp, pass_to_cpu = True)
            self.nodes[key] = Node(p, v)
            return v

        # State transition by an action selected from bandit
        node = self.nodes[key]
        p = node.p
        mask = np.zeros_like(p)
        if depth == 0:
            # Add noise to policy on the root node
            p = 0.75 * p + 0.25 * np.random.dirichlet([0.15] * len(p))
            # On the root node, we choose action only from legal actions
            mask[state.legal_actions()] = 1
            p *= mask
            p /= p.sum() + 1e-16

        n, q_sum = 1 + node.n, node.q_sum_all / node.n_all + node.q_sum
        # PUCB formula; "mask * 4" lifts every legal root action above all illegal ones
        ucb = q_sum / n + 2.0 * np.sqrt(node.n_all) * p / n + mask * 4
        best_action = np.argmax(ucb)

        # Search next state by recursively calling this function
        action_plane = self._to_batch(state.action_feature(best_action))
        rp_next = self.net.dynamics.inference(rp, action_plane, pass_to_cpu=False)
        path.append(best_action)
        q_new = -self.search(state, path, rp_next, depth + 1) # With the assumption of changing player by turn
        node.update(best_action, q_new)

        return q_new

    def think(self, state, num_simulations, temperature = 0, show=False):
        '''End point of MCTS: run the simulations and return the move distribution.

        Returns
        -------
            A probability vector over all actions, proportional to
            (visit count + 1) ** (1 / temperature). Occupied cells get
            probability 0, so sampling from the result never overwrites a
            stone. Only when the position has no legal action at all is the
            unmasked distribution returned.
        '''
        if show:
            print(state)
        start, prev_time = time.time(), 0
        # The root encoding does not change between simulations (inference
        # runs in eval mode without gradients), so compute it once.
        root_rp = self.net.representation.inference(self._to_batch(state.feature()), pass_to_cpu=False)
        for _ in range(num_simulations):
            self.search(state, [], root_rp, depth=0)

            # Display search result on every second
            if show:
                tmp_time = time.time() - start
                if int(tmp_time) > int(prev_time):
                    prev_time = tmp_time
                    root, pv = self.nodes[state.record_string()], self.pv(state)
                    if pv:  # empty until the root has at least one visited action
                        print('%.2f sec. best %s. q = %.4f. n = %d / %d. pv = %s'
                              % (tmp_time, state.action2str(pv[0]), root.q_sum[pv[0]] / root.n[pv[0]],
                                 root.n[pv[0]], root.n_all, ' '.join([state.action2str(a) for a in pv])))

        #  Return probability distribution weighted by the number of simulations
        root = self.nodes[state.record_string()]
        n = root.n + 1
        legal = state.legal_actions()
        if legal:
            # The +1 smoothing would otherwise give occupied cells a share.
            legal_mask = np.zeros_like(n)
            legal_mask[legal] = 1
            n = n * legal_mask
        n = (n / np.max(n)) ** (1 / (temperature + 1e-8))
        return n / n.sum() # teacher--MCTS

    def pv(self, state):
        '''Return principal variation (action sequence which is considered as the best).

        Works on a deep copy of `state` and follows the most visited legal
        action for as long as the resulting position has a node stored under
        its plain move record.
        '''
        s, pv_seq = copy.deepcopy(state), []
        while True:
            key = s.record_string()
            if key not in self.nodes or self.nodes[key].n.sum() == 0:
                break
            legal = s.legal_actions()
            if not legal:
                break
            visits = self.nodes[key].n
            # max() keeps the first action among ties, like a stable sort by -visits
            best_action = max(legal, key=lambda a: visits[a])
            pv_seq.append(best_action)
            s.play(best_action)
        return pv_seq

In [29]:
# Search with initialized (untrained) net

# 100 simulations from the empty board, printing progress about once per second.
tree = Tree(Net().cuda())
next_step_0 = tree.think(State(), 100, show=True)

# 200 simulations from a position where black already has four in a row (E4-E7);
# the returned move distribution is printed as a 9 x 9 grid.
tree = Tree(Net().cuda())
next_step_n = tree.think(State().play('E4 F5 E5 F6 E6 F7 E7'), 200, show=True)
print(next_step_n.reshape((9, 9)))

# Further positions that can be tried:
# tree = Tree(Net().cuda())
# tree.think(State().play('F4 D5 F5 D6 F6 D7 F7'), 200, show=True)

# tree = Tree(Net().cuda())
# tree.think(State().play('B2 A2 A3 C1'), 200, show=True)

   1 2 3 4 5 6 7 8 9
A _ _ _ _ _ _ _ _ _
B _ _ _ _ _ _ _ _ _
C _ _ _ _ _ _ _ _ _
D _ _ _ _ _ _ _ _ _
E _ _ _ _ _ _ _ _ _
F _ _ _ _ _ _ _ _ _
G _ _ _ _ _ _ _ _ _
H _ _ _ _ _ _ _ _ _
I _ _ _ _ _ _ _ _ _
record = 
   1 2 3 4 5 6 7 8 9
A _ _ _ _ _ _ _ _ _
B _ _ _ _ _ _ _ _ _
C _ _ _ _ _ _ _ _ _
D _ _ _ _ _ _ _ _ _
E _ _ _ O O O O _ _
F _ _ _ _ X X X _ _
G _ _ _ _ _ _ _ _ _
H _ _ _ _ _ _ _ _ _
I _ _ _ _ _ _ _ _ _
record = E4 F5 E5 F6 E6 F7 E7
1.00 sec. best A8. q = 0.0090. n = 4 / 185. pv = A8
[[0.         0.         0.         0.         0.         0.
  0.         0.11111111 0.        ]
 [0.         0.         0.         0.         0.         0.
  0.11111111 0.         0.11111111]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.11111111 0.         0.

In [30]:
# Training of neural net
from tqdm import tqdm
import torch.optim as optim

batch_size = 32  # training positions sampled per optimisation step in train()
num_steps = 100  # optimisation steps performed by one call to train()

def gen_target(ep, k):
    '''Sample one training example from an episode.

    ep is (record, reward, observations, action planes, policy targets).
    A turn is drawn at random; the observation at that turn is returned
    together with k + 1 consecutive actions, policy targets and value targets
    starting from it.
    '''
    record, reward, observations, actions, policies = ep[0], ep[1], ep[2], ep[3], ep[4]
    episode_len = len(record)
    turn_idx = np.random.randint(episode_len)

    ps, vs, ax = [], [], []
    for t in range(turn_idx, turn_idx + k + 1):
        if t >= episode_len:
            # Past the end of the game: an all-zero policy target (loss is 0)
            # and a randomly chosen one-hot action.
            p = np.zeros_like(policies[-1])
            template = actions[-1]
            a = np.zeros(np.prod(template.shape), dtype=np.float32)
            a[np.random.randint(len(a))] = 1
            a = a.reshape(template.shape)
        else:
            p, a = policies[t], actions[t]
        # the reward keeps its sign on even turns and is negated on odd turns
        value = reward if t % 2 == 0 else -reward
        vs.append([value])
        ps.append(p)
        ax.append(a)

    return observations[turn_idx], ax, ps, vs

def train(episodes, net, optimizer):
    '''Train the neural net for num_steps optimisation steps.

    Parameters
    ----------
        episodes  : non-empty sequence of episode tuples
                    (record, reward, features, action_features, p_targets):
                      record          - list of played actions
                      reward          - game result (0, 1 or -1)
                      features        - state.feature() for every turn
                      action_features - state.action_feature(action) for every turn,
                                        the action being sampled from p_targets
                      p_targets       - the MCTS distributions used as policy teacher
        net       : the Net to train; batches are moved to the device that
                    holds its parameters, so both CPU and GPU nets work
        optimizer : torch optimizer over net.parameters()
    Returns
    -------
        net (the same object, updated in place)
    Raises
    ------
        ValueError if episodes is empty.
    '''
    if len(episodes) == 0:
        raise ValueError('train() needs at least one episode')

    device = next(net.parameters()).device
    # Running totals are only printed, so plain Python floats are enough.
    p_loss_sum, v_loss_sum = 0.0, 0.0
    net.train()
    k = 3  # number of dynamics steps unrolled after the sampled position
    for _ in tqdm(range(num_steps)):
        batch = [gen_target(episodes[np.random.randint(len(episodes))], k) for _ in range(batch_size)]
        x, ax, p_target, v_target = zip(*batch)
        x = torch.from_numpy(np.array(x)).to(device)

        # Change the order of axis as [time step, batch, ...]
        ax = torch.from_numpy(np.array(ax)).transpose(0, 1).to(device)
        p_target = torch.from_numpy(np.array(p_target)).transpose(0, 1).to(device)
        v_target = torch.FloatTensor(np.array(v_target)).transpose(0, 1).to(device)

        # Compute losses for k (+ current) steps
        p_loss = torch.zeros((), dtype=torch.float32, device=device)
        v_loss = torch.zeros((), dtype=torch.float32, device=device)
        for t in range(k + 1):
            # Step 0 encodes the observation; later steps roll the dynamics
            # net forward with the action taken at the previous step.
            rp = net.representation(x) if t == 0 else net.dynamics(rp, ax[t - 1])
            p, v = net.prediction(rp)
            p_loss = p_loss + F.kl_div(torch.log(p), p_target[t], reduction='sum')
            v_loss = v_loss + torch.sum(((v_target[t] - v) ** 2) / 2)

        p_loss_sum += p_loss.item()
        v_loss_sum += v_loss.item()

        optimizer.zero_grad()
        (p_loss + v_loss).backward()
        optimizer.step()

    num_train_datum = num_steps * batch_size
    print('p_loss %f v_loss %f' % (p_loss_sum / num_train_datum, v_loss_sum / num_train_datum))
    return net

In [31]:
#  Battle against random agents

def vs_random(net, n=100):
    '''Play n games between the raw policy of `net` and a uniformly random
    player, the net moving first in every even-numbered game.

    Returns a dict that counts the results from the net's point of view:
    {0: draws, -1: losses, 1: wins}.
    '''
    results = dict.fromkeys((0, -1, 1), 0)
    for game_idx in range(n):
        net_to_move = game_idx % 2 == 0
        state = State()
        while not state.terminal():
            legal = state.legal_actions()
            if net_to_move:
                # greedy: the legal action with the highest policy probability
                p, _ = net.predict(state, [])[-1]
                action = max(legal, key=lambda a: p[a])
            else:
                action = np.random.choice(legal)
            state.play(action)
            net_to_move = not net_to_move
        # terminal_reward() is seen from the side to move; flip it when that
        # side is the random player
        outcome = state.terminal_reward()
        if not net_to_move:
            outcome = -outcome
        results[outcome] = results.get(outcome, 0) + 1
    return results

In [32]:
# Main algorithm of MuZero
from collections import deque

num_games = 50          # total number of self-play games
num_games_one_epoch = 10 # train the model once after this many self-play games
num_simulations = 40    # search simulations run by each tree.think() call (one call per move)

net = Net().cuda()
optimizer = optim.SGD(net.parameters(), lr=3e-4, weight_decay=3e-5, momentum=0.8)

# Display battle results as {-1: lose 0: draw 1: win} (for episode generated for training, 1 means that the first player won)
vs_random_sum = vs_random(net)
print('vs_random = ', sorted(vs_random_sum.items()))

# episodes = []
episodes = deque([], maxlen=20) # bounded queue that keeps only the 20 most recent self-play games

result_distribution = {1: 0, 0: 0, -1: 0}

for g in range(num_games):
    # Generate one episode
    record, p_targets, features, action_features = [], [], [], []
    state = State()
    # temperature using to make policy targets from search results
    temperature = 0.7

    tree = Tree(net) # one search tree is kept for the whole game

    while not state.terminal():
        # tree = Tree(net) # is a fresh search tree really needed for every move? could it be created outside the loop instead?
        p_target = tree.think(state, num_simulations, temperature)
        p_targets.append(p_target)
        features.append(state.feature())

        # Select action with generated distribution, and then make a transition by that action
        action = np.random.choice(np.arange(len(p_target)), p=p_target)
        record.append(action)
        action_features.append(state.action_feature(action))
        state.play(action)
        temperature *= 0.8 # decays per move; think() raises visit counts to 1/temperature, so targets sharpen as the game goes on (author's note: the purpose of this parameter is unclear?)

    # reward seen from the first turn player
    reward = state.terminal_reward() * (1 if len(record) % 2 == 0 else -1)
    result_distribution[reward] += 1
    episodes.append((record, reward, features, action_features, p_targets))

    if g % num_games_one_epoch == 0:
        print('game ', end='')
    print(g, ' ', end='')

    # Training of neural net
    if (g + 1) % num_games_one_epoch == 0:
        # Show the result distribution of generated episodes
        print('generated = ', sorted(result_distribution.items()))
        epi = list(episodes.copy())
        net = train(episodes=epi, net=net, optimizer=optimizer)
        vs_random_once = vs_random(net)
        print('vs_random = ', sorted(vs_random_once.items()), end='')
        for r, n in vs_random_once.items():
            vs_random_sum[r] += n
        print(' sum = ', sorted(vs_random_sum.items()))

print('finished')

vs_random =  [(-1, 11), (0, 0), (1, 89)]
game 0  1  2  3  4  5  6  7  8  9  generated =  [(-1, 6), (0, 0), (1, 4)]


100%|██████████| 100/100 [00:03<00:00, 30.40it/s]


p_loss 10.998181 v_loss 0.312893
vs_random =  [(-1, 13), (0, 0), (1, 87)] sum =  [(-1, 24), (0, 0), (1, 176)]
game 10  11  12  13  14  15  16  17  18  19  generated =  [(-1, 11), (0, 0), (1, 9)]


100%|██████████| 100/100 [00:03<00:00, 31.76it/s]


p_loss 9.672222 v_loss 0.310232
vs_random =  [(-1, 17), (0, 0), (1, 83)] sum =  [(-1, 41), (0, 0), (1, 259)]
game 20  21  22  23  24  25  26  27  28  29  generated =  [(-1, 16), (0, 0), (1, 14)]


100%|██████████| 100/100 [00:03<00:00, 30.30it/s]


p_loss 9.685742 v_loss 0.254981
vs_random =  [(-1, 8), (0, 0), (1, 92)] sum =  [(-1, 49), (0, 0), (1, 351)]
game 30  31  32  33  34  35  36  37  38  39  generated =  [(-1, 21), (0, 0), (1, 19)]


100%|██████████| 100/100 [00:03<00:00, 29.95it/s]


p_loss 9.035182 v_loss 0.295280
vs_random =  [(-1, 14), (0, 0), (1, 86)] sum =  [(-1, 63), (0, 0), (1, 437)]
game 40  41  42  43  44  45  46  47  48  49  generated =  [(-1, 27), (0, 0), (1, 23)]


100%|██████████| 100/100 [00:03<00:00, 32.62it/s]


p_loss 8.359749 v_loss 0.266758
vs_random =  [(-1, 12), (0, 0), (1, 88)] sum =  [(-1, 75), (0, 0), (1, 525)]
finished


In [34]:
# Search with trained net

tree = Tree(net)
state = State()

# state.play() mutates `state` in place; the following cells keep extending
# this same game, so they must be run in order and only once.
next_step = tree.think(state.play('E4 F5 E5 F6 E6 F7 E7'), 800, show=True)
print(next_step.reshape((9 , 9)))

   1 2 3 4 5 6 7 8 9
A _ _ _ _ _ _ _ _ _
B _ _ _ _ _ _ _ _ _
C _ _ _ _ _ _ _ _ _
D _ _ _ _ _ _ _ _ _
E _ _ _ O O O O _ _
F _ _ _ _ X X X _ _
G _ _ _ _ _ _ _ _ _
H _ _ _ _ _ _ _ _ _
I _ _ _ _ _ _ _ _ _
record = E4 F5 E5 F6 E6 F7 E7
1.00 sec. best A1. q = 0.3663. n = 6 / 176. pv = A1
2.00 sec. best F3. q = 0.4999. n = 7 / 323. pv = F3
3.00 sec. best F8. q = 0.4407. n = 10 / 467. pv = F8
4.00 sec. best F8. q = 0.4778. n = 14 / 603. pv = F8
5.00 sec. best F8. q = 0.5191. n = 19 / 740. pv = F8
[[0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [35]:
# White answers with F8; search again from the new position.
# The printed label below means "game already over: ".
next_step = tree.think(state.play('F8'), 800, show=True)
print('是否已经终局: ', state.terminal())
print(next_step.reshape((9 , 9)))

   1 2 3 4 5 6 7 8 9
A _ _ _ _ _ _ _ _ _
B _ _ _ _ _ _ _ _ _
C _ _ _ _ _ _ _ _ _
D _ _ _ _ _ _ _ _ _
E _ _ _ O O O O _ _
F _ _ _ _ X X X X _
G _ _ _ _ _ _ _ _ _
H _ _ _ _ _ _ _ _ _
I _ _ _ _ _ _ _ _ _
record = E4 F5 E5 F6 E6 F7 E7 F8
1.00 sec. best G8. q = 0.4899. n = 12 / 164. pv = G8
2.00 sec. best G9. q = 0.4696. n = 13 / 331. pv = G9
3.00 sec. best G9. q = 0.3521. n = 15 / 509. pv = G9
4.01 sec. best G9. q = 0.3521. n = 15 / 683. pv = G9
是否已经终局:  False
[[0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.5 0.5]
 [0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0. ]]


In [36]:
# Black plays E3 and completes E3-E7, so state.terminal() becomes True;
# think() still runs its search on the finished position.
# The printed label below means "game already over: ".
next_step = tree.think(state.play('E3'), 800, show=True)
print('是否已经终局: ', state.terminal())
print(next_step.reshape((9 , 9)))

   1 2 3 4 5 6 7 8 9
A _ _ _ _ _ _ _ _ _
B _ _ _ _ _ _ _ _ _
C _ _ _ _ _ _ _ _ _
D _ _ _ _ _ _ _ _ _
E _ _ O O O O O _ _
F _ _ _ _ X X X X _
G _ _ _ _ _ _ _ _ _
H _ _ _ _ _ _ _ _ _
I _ _ _ _ _ _ _ _ _
record = E4 F5 E5 F6 E6 F7 E7 F8 E3
1.00 sec. best F3. q = -0.3530. n = 25 / 191. pv = F3
2.00 sec. best F3. q = -0.3530. n = 25 / 369. pv = F3
3.00 sec. best F3. q = -0.3530. n = 25 / 552. pv = F3
4.00 sec. best F3. q = -0.3530. n = 25 / 728. pv = F3
是否已经终局:  True
[[0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]]
