In [44]:
# environment:
# pip3 install torch
import numpy as np
a = np.array([[1, 2], [3, 4]])
np.pad(a,(4,4))
# print(a)

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 2, 0, 0, 0, 0],
       [0, 0, 0, 0, 3, 4, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [45]:
# Implementation of simple game: 9 x 9 wuziqi
# You can change this to another two-player game.
# TODO 1. train Nets on GPU
# TODO 2. implement a memoryband storing trails for training network

import numpy as np

BLACK, WHITE = 1, -1  # first turn or second turn player

class State:
    '''Board implementation of 9 x 9 wuziqi'''
    X, Y = 'ABCDEFGHI',  '123456789'
    C = {0: '_', BLACK: 'O', WHITE: 'X'}

    def __init__(self):
        self.board = np.zeros((9, 9)) # (x, y)
        self.color = 1
        self.win_color = 0
        self.record = []

    def action2str(self, a):
        return self.X[a // 9] + self.Y[a % 9]

    def str2action(self, s):
        return self.X.find(s[0]) * 9 + self.Y.find(s[1])

    def record_string(self):
        return ' '.join([self.action2str(a) for a in self.record])

    def __str__(self):
        # output board.
        s = '   ' + ' '.join(self.Y) + '\n'
        for i in range(9):
            s += self.X[i] + ' ' + ' '.join([self.C[self.board[i, j]] for j in range(9)]) + '\n'
        s += 'record = ' + self.record_string()
        return s

    def check_win(self, x, y):
        # check whether 5 stones are on the line , pad zero around board by (4,4) then compute if sum to 5*self.color
        x_tmp, y_tmp = x + 4, y + 4
        boardex4 = np.pad(self.board,(4,4))
        for i in range(5):
            if sum(boardex4[x_tmp-4+i:x_tmp+1+i, y_tmp])==5*self.color:
                return True
            elif sum(boardex4[x_tmp, y_tmp-4+i:y_tmp+1+i])==5*self.color:
                return True
            elif boardex4[x_tmp+i-4,y_tmp+i-4]+boardex4[x_tmp+i-3,y_tmp+i-3]+boardex4[x_tmp+i-2,y_tmp+i-2]+\
                    boardex4[x_tmp+i-1,y_tmp+i-1]+boardex4[x_tmp+i,y_tmp+i]==5*self.color:
                return True
            elif boardex4[x_tmp+i-4,y_tmp-i+4]+boardex4[x_tmp+i-3,y_tmp-i+3]+boardex4[x_tmp+i-2,y_tmp-i+2]+\
                    boardex4[x_tmp+i-1,y_tmp-i+1]+boardex4[x_tmp+i,y_tmp-i]==5*self.color:
                return True
        return False

    def play(self, action):
        # state transition function
        # action is position inerger (0~80) or string representation of action sequence
        if isinstance(action, str):
            for astr in action.split():
                self.play(self.str2action(astr))
            return self

        x, y = action // 9, action % 9
        self.board[x, y] = self.color

        # check whether 5 stones are on the line
        if self.check_win(x , y):
            self.win_color = self.color

        self.color = -self.color
        self.record.append(action)
        return self

    def terminal(self):
        # terminal state check
        return self.win_color != 0 or len(self.record) == 9 * 9

    def terminal_reward(self):
        # terminal reward 
        return self.win_color if self.color == BLACK else -self.win_color

    def legal_actions(self):
        # list of legal actions on each state
        return [a for a in range(9 * 9) if self.board[a // 9, a % 9] == 0]

    def feature(self):
        # input tensor for neural net (state)
        return np.stack([self.board == self.color, self.board == -self.color]).astype(np.float32)

    def action_feature(self, action):
        # input tensor for neural net (action)
        a = np.zeros((1, 9, 9), dtype=np.float32)
        a[0, action // 9, action % 9] = 1
        return a

state = State().play('I9')
print(state)
print('input feature')
print(state.feature())
state = State().play('B2 A1 I2')
print('input feature')
print(state.feature())

   1 2 3 4 5 6 7 8 9
A _ _ _ _ _ _ _ _ _
B _ _ _ _ _ _ _ _ _
C _ _ _ _ _ _ _ _ _
D _ _ _ _ _ _ _ _ _
E _ _ _ _ _ _ _ _ _
F _ _ _ _ _ _ _ _ _
G _ _ _ _ _ _ _ _ _
H _ _ _ _ _ _ _ _ _
I _ _ _ _ _ _ _ _ O
record = I9
input feature
[[[0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0.]]

 [[0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 1.]]]
input feature
[[[1. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0.

In [46]:
# Small neural nets with PyTorch

import torch
import torch.nn as nn
import torch.nn.functional as F

class Conv(nn.Module):
    def __init__(self, filters0, filters1, kernel_size, bn=False):
        super().__init__()
        self.conv = nn.Conv2d(filters0, filters1, kernel_size, stride=1, padding=kernel_size//2, bias=False)
        self.bn = None
        if bn:
            self.bn = nn.BatchNorm2d(filters1)

    def forward(self, x):
        h = self.conv(x)
        if self.bn is not None:
            h = self.bn(h)
        return h

class ResidualBlock(nn.Module):
    def __init__(self, filters):
        super().__init__()
        self.conv = Conv(filters, filters, 3, True)

    def forward(self, x):
        return F.relu(x + (self.conv(x)))

In [47]:
num_filters = 16
num_blocks = 4

class Representation(nn.Module):
    ''' Conversion from observation to inner abstract state '''
    def __init__(self, input_shape):
        super().__init__()
        self.input_shape = input_shape
        self.board_size = self.input_shape[1] * self.input_shape[2]

        self.layer0 = Conv(self.input_shape[0], num_filters, 3, bn=True)
        self.blocks = nn.ModuleList([ResidualBlock(num_filters) for _ in range(num_blocks)])

    def forward(self, x):
        h = F.relu(self.layer0(x))
        for block in self.blocks:
            h = block(h)
        return h

    def inference(self, x):
        self.eval()
        with torch.no_grad():
            rp = self(torch.from_numpy(x).unsqueeze(0))
        return rp.cpu().numpy()[0]

class Prediction(nn.Module):
    ''' Policy and value prediction from inner abstract state '''
    def __init__(self, action_shape):
        super().__init__()
        self.board_size = np.prod(action_shape[1:])
        self.action_size = action_shape[0] * self.board_size

        self.conv_p1 = Conv(num_filters, 4, 1, bn=True)
        self.conv_p2 = Conv(4, 1, 1)

        self.conv_v = Conv(num_filters, 4, 1, bn=True)
        self.fc_v = nn.Linear(self.board_size * 4, 1, bias=False)

    def forward(self, rp):
        h_p = F.relu(self.conv_p1(rp))
        h_p = self.conv_p2(h_p).view(-1, self.action_size)

        h_v = F.relu(self.conv_v(rp))
        h_v = self.fc_v(h_v.view(-1, self.board_size * 4))

        # range of value is -1 ~ 1
        return F.softmax(h_p, dim=-1), torch.tanh(h_v)

    def inference(self, rp):
        self.eval()
        with torch.no_grad():
            p, v = self(torch.from_numpy(rp).unsqueeze(0))
        return p.cpu().numpy()[0], v.cpu().numpy()[0][0]

class Dynamics(nn.Module):
    '''Abstract state transition'''
    def __init__(self, rp_shape, act_shape):
        super().__init__()
        self.rp_shape = rp_shape
        self.layer0 = Conv(rp_shape[0] + act_shape[0], num_filters, 3, bn=True)
        self.blocks = nn.ModuleList([ResidualBlock(num_filters) for _ in range(num_blocks)])

    def forward(self, rp, a):
        h = torch.cat([rp, a], dim=1)
        h = self.layer0(h)
        for block in self.blocks:
            h = block(h)
        return h

    def inference(self, rp, a):
        self.eval()
        with torch.no_grad():
            rp = self(torch.from_numpy(rp).unsqueeze(0), torch.from_numpy(a).unsqueeze(0))
        return rp.cpu().numpy()[0]

class Net(nn.Module):
    '''Whole net'''
    def __init__(self):
        super().__init__()
        state = State()
        input_shape = state.feature().shape
        action_shape = state.action_feature(0).shape
        rp_shape = (num_filters, *input_shape[1:])

        self.representation = Representation(input_shape)
        self.prediction = Prediction(action_shape)
        self.dynamics = Dynamics(rp_shape, action_shape)

    def predict(self, state0, path):
        '''Predict p and v from original state and path'''
        outputs = []
        x = state0.feature()
        rp = self.representation.inference(x)
        outputs.append(self.prediction.inference(rp))
        for action in path:
            a = state0.action_feature(action)
            rp = self.dynamics.inference(rp, a)
            outputs.append(self.prediction.inference(rp))
        return outputs

In [48]:
def show_net(net, state):
    '''Display policy (p) and value (v)'''
    print(state)
    p, v = net.predict(state, [])[-1]
    print('p = ')
    print((p * 10000).astype(int).reshape((-1, *net.representation.input_shape[1:3])))
    print('v = ', v)
    print()

#  Outputs before training
show_net(Net(), State())

   1 2 3 4 5 6 7 8 9
A _ _ _ _ _ _ _ _ _
B _ _ _ _ _ _ _ _ _
C _ _ _ _ _ _ _ _ _
D _ _ _ _ _ _ _ _ _
E _ _ _ _ _ _ _ _ _
F _ _ _ _ _ _ _ _ _
G _ _ _ _ _ _ _ _ _
H _ _ _ _ _ _ _ _ _
I _ _ _ _ _ _ _ _ _
record = 
p = 
[[[12 12 12 12 12 12 12 12 12]
  [12 12 12 12 12 12 12 12 12]
  [12 12 12 12 12 12 12 12 12]
  [12 12 12 12 12 12 12 12 12]
  [12 12 12 12 12 12 12 12 12]
  [12 12 12 12 12 12 12 12 12]
  [12 12 12 12 12 12 12 12 12]
  [12 12 12 12 12 12 12 12 12]
  [12 12 12 12 12 12 12 12 12]]]
v =  0.0



In [49]:
# Implementation of Monte Carlo Tree Search

class Node:
    '''Search result of one abstract (or root) state'''
    def __init__(self, p, v):
        self.p, self.v = p, v
        self.n, self.q_sum = np.zeros_like(p), np.zeros_like(p)
        self.n_all, self.q_sum_all = 1, v / 2 # prior

    def update(self, action, q_new):
        # Update
        self.n[action] += 1
        self.q_sum[action] += q_new

        # Update overall stats
        self.n_all += 1
        self.q_sum_all += q_new

In [50]:
import time
import copy

class Tree:
    '''Monte Carlo Tree'''
    def __init__(self, net):
        self.net = net
        self.nodes = {}

    def search(self, state, path, rp, depth):
        # Return predicted value from new state
        key = state.record_string()
        if len(path) > 0:
            key += '|' + ' '.join(map(state.action2str, path))
        if key not in self.nodes:
            p, v = self.net.prediction.inference(rp)
            self.nodes[key] = Node(p, v)
            return v

        # State transition by an action selected from bandit
        node = self.nodes[key]
        p = node.p
        mask = np.zeros_like(p)
        if depth == 0:
            # Add noise to policy on the root node
            p = 0.75 * p + 0.25 * np.random.dirichlet([0.15] * len(p))
            # On the root node, we choose action only from legal actions
            mask[state.legal_actions()] = 1
            p *= mask
            p /= p.sum() + 1e-16

        n, q_sum = 1 + node.n, node.q_sum_all / node.n_all + node.q_sum
        ucb = q_sum / n + 2.0 * np.sqrt(node.n_all) * p / n + mask * 4 # PUCB formula
        best_action = np.argmax(ucb)

        # Search next state by recursively calling this function
        rp_next = self.net.dynamics.inference(rp, state.action_feature(best_action))
        path.append(best_action)
        q_new = -self.search(state, path, rp_next, depth + 1) # With the assumption of changing player by turn
        node.update(best_action, q_new)

        return q_new

    def think(self, state, num_simulations, temperature = 0, show=False):
        # End point of MCTS
        if show:
            print(state)
        start, prev_time = time.time(), 0
        for _ in range(num_simulations):
            self.search(state, [], self.net.representation.inference(state.feature()), depth=0)

            # Display search result on every second
            if show:
                tmp_time = time.time() - start
                if int(tmp_time) > int(prev_time):
                    prev_time = tmp_time
                    root, pv = self.nodes[state.record_string()], self.pv(state)
                    print('%.2f sec. best %s. q = %.4f. n = %d / %d. pv = %s'
                          % (tmp_time, state.action2str(pv[0]), root.q_sum[pv[0]] / root.n[pv[0]],
                             root.n[pv[0]], root.n_all, ' '.join([state.action2str(a) for a in pv])))

        #  Return probability distribution weighted by the number of simulations
        root = self.nodes[state.record_string()]
        n = root.n + 1
        n = (n / np.max(n)) ** (1 / (temperature + 1e-8))
        return n / n.sum()

    def pv(self, state):
        # Return principal variation (action sequence which is considered as the best)
        s, pv_seq = copy.deepcopy(state), []
        while True:
            key = s.record_string()
            if key not in self.nodes or self.nodes[key].n.sum() == 0:
                break
            best_action = sorted([(a, self.nodes[key].n[a]) for a in s.legal_actions()], key=lambda x: -x[1])[0][0]
            pv_seq.append(best_action)
            s.play(best_action)
        return pv_seq

In [51]:
# Search with initialized net

tree = Tree(Net())
tree.think(State(), 100, show=True)

tree = Tree(Net())
tree.think(State().play('E4 F5 E5 F6 E6 F7 E7'), 200, show=True)

tree = Tree(Net())
tree.think(State().play('F4 D5 F5 D6 F6 D7 F7'), 200, show=True)

tree = Tree(Net())
tree.think(State().play('B2 A2 A3 C1'), 200, show=True)

   1 2 3 4 5 6 7 8 9
A _ _ _ _ _ _ _ _ _
B _ _ _ _ _ _ _ _ _
C _ _ _ _ _ _ _ _ _
D _ _ _ _ _ _ _ _ _
E _ _ _ _ _ _ _ _ _
F _ _ _ _ _ _ _ _ _
G _ _ _ _ _ _ _ _ _
H _ _ _ _ _ _ _ _ _
I _ _ _ _ _ _ _ _ _
record = 
   1 2 3 4 5 6 7 8 9
A O O _ _ _ _ _ _ _
B _ _ _ _ _ _ _ _ _
C X X _ _ _ _ _ _ _
D _ _ _ _ _ _ _ _ _
E _ _ _ _ _ _ _ _ _
F _ _ _ _ _ _ _ _ _
G _ _ _ _ _ _ _ _ _
H _ _ _ _ _ _ _ _ _
I _ _ _ _ _ _ _ _ _
record = A1 C1 A2 C2
   1 2 3 4 5 6 7 8 9
A _ X O _ _ _ _ _ _
B _ O O _ _ _ _ _ _
C X _ _ _ _ _ _ _ _
D _ _ _ _ _ _ _ _ _
E _ _ _ _ _ _ _ _ _
F _ _ _ _ _ _ _ _ _
G _ _ _ _ _ _ _ _ _
H _ _ _ _ _ _ _ _ _
I _ _ _ _ _ _ _ _ _
record = B2 A2 A3 C1 B3
   1 2 3 4 5 6 7 8 9
A _ X O _ _ _ _ _ _
B _ O _ _ _ _ _ _ _
C X _ _ _ _ _ _ _ _
D _ _ _ _ _ _ _ _ _
E _ _ _ _ _ _ _ _ _
F _ _ _ _ _ _ _ _ _
G _ _ _ _ _ _ _ _ _
H _ _ _ _ _ _ _ _ _
I _ _ _ _ _ _ _ _ _
record = B2 A2 A3 C1


array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.14285715, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.14285715, 0.        , 0.14285715,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.14285715, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.14285715, 0.        , 0.        ,
       0.        , 0.        , 0.14285715, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.14285715, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [52]:
# Training of neural net
from tqdm import tqdm
import torch.optim as optim

batch_size = 32
num_steps = 100

def gen_target(ep, k):
    '''Generate inputs and targets for training'''
    # path, reward, observation, action, policy
    turn_idx = np.random.randint(len(ep[0]))
    ps, vs, ax = [], [], []
    for t in range(turn_idx, turn_idx + k + 1):
        if t < len(ep[0]):
            p = ep[4][t]
            a = ep[3][t]
        else: # state after finishing game
            # p is 0 (loss is 0)
            p = np.zeros_like(ep[4][-1])
            # random action selection
            a = np.zeros(np.prod(ep[3][-1].shape), dtype=np.float32)
            a[np.random.randint(len(a))] = 1
            a = a.reshape(ep[3][-1].shape)
        vs.append([ep[1] if t % 2 == 0 else -ep[1]])
        ps.append(p)
        ax.append(a)
        
    return ep[2][turn_idx], ax, ps, vs

def train(episodes, net, optimizer):
    '''Train neural net'''
    p_loss_sum, v_loss_sum = 0, 0
    net.train()
    k = 4
    for _ in tqdm(range(num_steps)):
        x, ax, p_target, v_target = zip(*[gen_target(episodes[np.random.randint(len(episodes))], k) for j in range(batch_size)])
        x = torch.from_numpy(np.array(x))
        ax = torch.from_numpy(np.array(ax))
        p_target = torch.from_numpy(np.array(p_target))
        v_target = torch.FloatTensor(np.array(v_target))

        # Change the order of axis as [time step, batch, ...]
        ax = torch.transpose(ax, 0, 1)
        p_target = torch.transpose(p_target, 0, 1)
        v_target = torch.transpose(v_target, 0, 1)

        # Compute losses for k (+ current) steps
        p_loss, v_loss = 0, 0
        for t in range(k + 1):
            rp = net.representation(x) if t == 0 else net.dynamics(rp, ax[t - 1])
            p, v = net.prediction(rp)
            p_loss += F.kl_div(torch.log(p), p_target[t], reduction='sum')
            v_loss += torch.sum(((v_target[t] - v) ** 2) / 2)

        p_loss_sum += p_loss.item()
        v_loss_sum += v_loss.item()

        optimizer.zero_grad()
        (p_loss + v_loss).backward()
        optimizer.step()

    num_train_datum = num_steps * batch_size
    print('p_loss %f v_loss %f' % (p_loss_sum / num_train_datum, v_loss_sum / num_train_datum))
    return net

In [53]:
#  Battle against random agents

def vs_random(net, n=100):
    results = {}
    for i in range(n):
        first_turn = i % 2 == 0
        turn = first_turn
        state = State()
        while not state.terminal():
            if turn:
                p, _ = net.predict(state, [])[-1]
                action = sorted([(a, p[a]) for a in state.legal_actions()], key=lambda x:-x[1])[0][0]
            else:
                action = np.random.choice(state.legal_actions())
            state.play(action)
            turn = not turn
        r = state.terminal_reward() if turn else -state.terminal_reward()
        results[r] = results.get(r, 0) + 1
    return results

In [54]:
# Main algorithm of MuZero

num_games = 100
num_games_one_epoch = 20
num_simulations = 40

net = Net()
optimizer = optim.SGD(net.parameters(), lr=3e-4, weight_decay=3e-5, momentum=0.93)

# Display battle results as {-1: lose 0: draw 1: win} (for episode generated for training, 1 means that the first player won)
vs_random_sum = vs_random(net)
print('vs_random = ', sorted(vs_random_sum.items()))

episodes = []
result_distribution = {1: 0, 0: 0, -1: 0}

for g in tqdm(range(num_games)):
    # Generate one episode
    record, p_targets, features, action_features = [], [], [], []
    state = State()
    # temperature using to make policy targets from search results
    temperature = 0.7

    while not state.terminal():
        tree = Tree(net)
        p_target = tree.think(state, num_simulations, temperature)
        p_targets.append(p_target)
        features.append(state.feature())

        # Select action with generated distribution, and then make a transition by that action
        action = np.random.choice(np.arange(len(p_target)), p=p_target)
        record.append(action)
        action_features.append(state.action_feature(action))
        state.play(action)
        temperature *= 0.8

    # reward seen from the first turn player
    reward = state.terminal_reward() * (1 if len(record) % 2 == 0 else -1)
    result_distribution[reward] += 1
    episodes.append((record, reward, features, action_features, p_targets))

    if g % num_games_one_epoch == 0:
        print('game ', end='')
    print(g, ' ', end='')

    # Training of neural net
    if (g + 1) % num_games_one_epoch == 0:
        # Show the result distributiuon of generated episodes
        print('generated = ', sorted(result_distribution.items()))
        print("episodes's lengh  :", len(episodes))
        net = train(episodes, net, optimizer)
        vs_random_once = vs_random(net)
        print('vs_random = ', sorted(vs_random_once.items()), end='')
        for r, n in vs_random_once.items():
            vs_random_sum[r] += n
        print(' sum = ', sorted(vs_random_sum.items()))

print('finished')

vs_random =  [(-1, 11), (1, 89)]


  1%|          | 1/100 [00:08<14:39,  8.88s/it]

game 0  

  2%|▏         | 2/100 [00:14<11:13,  6.87s/it]

1  

  3%|▎         | 3/100 [00:19<09:47,  6.06s/it]

2  

  4%|▍         | 4/100 [00:24<09:20,  5.83s/it]

3  

  5%|▌         | 5/100 [00:30<08:51,  5.60s/it]

4  

  6%|▌         | 6/100 [00:39<10:47,  6.89s/it]

5  

  7%|▋         | 7/100 [00:45<10:11,  6.58s/it]

6  

  8%|▊         | 8/100 [00:50<09:27,  6.17s/it]

7  

  9%|▉         | 9/100 [00:57<09:46,  6.45s/it]

8  

 10%|█         | 10/100 [01:04<09:54,  6.61s/it]

9  

 11%|█         | 11/100 [01:11<09:49,  6.62s/it]

10  

 12%|█▏        | 12/100 [01:14<08:10,  5.57s/it]

11  

 13%|█▎        | 13/100 [01:19<07:55,  5.46s/it]

12  

 14%|█▍        | 14/100 [01:25<07:53,  5.50s/it]

13  

 15%|█▌        | 15/100 [01:28<06:51,  4.84s/it]

14  

 16%|█▌        | 16/100 [01:37<08:34,  6.13s/it]

15  

 17%|█▋        | 17/100 [01:44<08:41,  6.28s/it]

16  

 18%|█▊        | 18/100 [01:48<07:35,  5.56s/it]

17  

 19%|█▉        | 19/100 [01:52<07:03,  5.23s/it]

18  19  generated =  [(-1, 12), (0, 0), (1, 8)]
episodes's lengh  : 20


100%|██████████| 100/100 [00:05<00:00, 18.01it/s]


p_loss 13.665549 v_loss 0.819992


 20%|██        | 20/100 [02:08<11:08,  8.35s/it]

vs_random =  [(-1, 6), (1, 94)] sum =  [(-1, 17), (1, 183)]


 21%|██        | 21/100 [02:13<09:45,  7.41s/it]

game 20  

 22%|██▏       | 22/100 [02:18<08:30,  6.55s/it]

21  

 23%|██▎       | 23/100 [02:22<07:20,  5.73s/it]

22  

 24%|██▍       | 24/100 [02:28<07:30,  5.93s/it]

23  

 25%|██▌       | 25/100 [02:35<07:46,  6.23s/it]

24  

 26%|██▌       | 26/100 [02:41<07:35,  6.16s/it]

25  

 27%|██▋       | 27/100 [02:45<06:35,  5.42s/it]

26  

 28%|██▊       | 28/100 [02:50<06:30,  5.42s/it]

27  

 29%|██▉       | 29/100 [02:55<06:15,  5.29s/it]

28  

 30%|███       | 30/100 [03:02<06:51,  5.88s/it]

29  

 31%|███       | 31/100 [03:09<07:13,  6.28s/it]

30  

 32%|███▏      | 32/100 [03:14<06:29,  5.73s/it]

31  

 33%|███▎      | 33/100 [03:18<05:50,  5.23s/it]

32  

 34%|███▍      | 34/100 [03:23<05:37,  5.12s/it]

33  

 35%|███▌      | 35/100 [03:30<06:15,  5.78s/it]

34  

 36%|███▌      | 36/100 [03:35<05:45,  5.40s/it]

35  

 37%|███▋      | 37/100 [03:43<06:28,  6.17s/it]

36  

 38%|███▊      | 38/100 [03:51<06:57,  6.74s/it]

37  

 39%|███▉      | 39/100 [03:56<06:25,  6.32s/it]

38  39  generated =  [(-1, 22), (0, 0), (1, 18)]
episodes's lengh  : 40


100%|██████████| 100/100 [00:05<00:00, 17.69it/s]


p_loss 13.192550 v_loss 0.762131


 40%|████      | 40/100 [04:09<08:21,  8.36s/it]

vs_random =  [(-1, 10), (1, 90)] sum =  [(-1, 27), (1, 273)]


 41%|████      | 41/100 [04:15<07:33,  7.69s/it]

game 40  

 42%|████▏     | 42/100 [04:20<06:26,  6.66s/it]

41  

 43%|████▎     | 43/100 [04:29<07:06,  7.49s/it]

42  

 44%|████▍     | 44/100 [04:33<06:09,  6.59s/it]

43  

 45%|████▌     | 45/100 [04:40<06:05,  6.64s/it]

44  

 46%|████▌     | 46/100 [04:45<05:25,  6.03s/it]

45  

 47%|████▋     | 47/100 [04:49<04:46,  5.40s/it]

46  

 48%|████▊     | 48/100 [04:57<05:28,  6.31s/it]

47  

 49%|████▉     | 49/100 [05:03<05:17,  6.23s/it]

48  

 50%|█████     | 50/100 [05:08<04:55,  5.91s/it]

49  

 51%|█████     | 51/100 [05:13<04:28,  5.49s/it]

50  

 52%|█████▏    | 52/100 [05:19<04:33,  5.71s/it]

51  

 53%|█████▎    | 53/100 [05:24<04:19,  5.52s/it]

52  

 54%|█████▍    | 54/100 [05:32<04:52,  6.35s/it]

53  

 55%|█████▌    | 55/100 [05:38<04:32,  6.06s/it]

54  

 56%|█████▌    | 56/100 [05:45<04:36,  6.29s/it]

55  

 57%|█████▋    | 57/100 [05:52<04:43,  6.59s/it]

56  

 58%|█████▊    | 58/100 [05:58<04:28,  6.38s/it]

57  

 59%|█████▉    | 59/100 [06:03<04:04,  5.95s/it]

58  59  generated =  [(-1, 33), (0, 0), (1, 27)]
episodes's lengh  : 60


100%|██████████| 100/100 [00:06<00:00, 15.86it/s]


p_loss 12.157385 v_loss 0.769862


 60%|██████    | 60/100 [06:17<05:42,  8.57s/it]

vs_random =  [(-1, 1), (1, 99)] sum =  [(-1, 28), (1, 372)]


 61%|██████    | 61/100 [06:27<05:46,  8.90s/it]

game 60  

 62%|██████▏   | 62/100 [06:33<04:57,  7.84s/it]

61  

 63%|██████▎   | 63/100 [06:39<04:36,  7.48s/it]

62  

 64%|██████▍   | 64/100 [06:46<04:27,  7.44s/it]

63  

 65%|██████▌   | 65/100 [06:53<04:14,  7.28s/it]

64  

 66%|██████▌   | 66/100 [06:58<03:35,  6.33s/it]

65  

 67%|██████▋   | 67/100 [07:02<03:14,  5.91s/it]

66  

 68%|██████▊   | 68/100 [07:08<03:08,  5.89s/it]

67  

 69%|██████▉   | 69/100 [07:14<03:04,  5.96s/it]

68  

 70%|███████   | 70/100 [07:20<02:56,  5.88s/it]

69  

 71%|███████   | 71/100 [07:23<02:27,  5.08s/it]

70  

 72%|███████▏  | 72/100 [07:29<02:23,  5.13s/it]

71  

 73%|███████▎  | 73/100 [07:35<02:27,  5.45s/it]

72  

 74%|███████▍  | 74/100 [07:40<02:21,  5.46s/it]

73  

 75%|███████▌  | 75/100 [07:44<02:00,  4.84s/it]

74  

 76%|███████▌  | 76/100 [07:50<02:03,  5.17s/it]

75  

 77%|███████▋  | 77/100 [07:57<02:12,  5.74s/it]

76  

 78%|███████▊  | 78/100 [08:01<01:55,  5.23s/it]

77  

 79%|███████▉  | 79/100 [08:04<01:39,  4.72s/it]

78  79  generated =  [(-1, 42), (0, 0), (1, 38)]
episodes's lengh  : 80


100%|██████████| 100/100 [00:05<00:00, 17.93it/s]


p_loss 11.929194 v_loss 0.613852


 80%|████████  | 80/100 [08:19<02:34,  7.74s/it]

vs_random =  [(-1, 1), (1, 99)] sum =  [(-1, 29), (1, 471)]


 81%|████████  | 81/100 [08:26<02:23,  7.53s/it]

game 80  

 82%|████████▏ | 82/100 [08:30<01:58,  6.57s/it]

81  

 83%|████████▎ | 83/100 [08:36<01:44,  6.17s/it]

82  

 84%|████████▍ | 84/100 [08:42<01:38,  6.18s/it]

83  

 85%|████████▌ | 85/100 [08:47<01:26,  5.78s/it]

84  

 86%|████████▌ | 86/100 [08:51<01:14,  5.31s/it]

85  

 87%|████████▋ | 87/100 [08:57<01:13,  5.64s/it]

86  

 88%|████████▊ | 88/100 [09:02<01:04,  5.35s/it]

87  

 89%|████████▉ | 89/100 [09:08<00:59,  5.42s/it]

88  

 90%|█████████ | 90/100 [09:14<00:56,  5.63s/it]

89  

 91%|█████████ | 91/100 [09:18<00:47,  5.25s/it]

90  

 92%|█████████▏| 92/100 [09:23<00:41,  5.24s/it]

91  

 93%|█████████▎| 93/100 [09:30<00:39,  5.66s/it]

92  

 94%|█████████▍| 94/100 [09:35<00:33,  5.59s/it]

93  

 95%|█████████▌| 95/100 [09:41<00:27,  5.53s/it]

94  

 96%|█████████▌| 96/100 [09:47<00:23,  5.84s/it]

95  

 97%|█████████▋| 97/100 [09:52<00:16,  5.41s/it]

96  

 98%|█████████▊| 98/100 [09:59<00:11,  5.90s/it]

97  

 99%|█████████▉| 99/100 [10:02<00:05,  5.25s/it]

98  99  generated =  [(-1, 52), (0, 0), (1, 48)]
episodes's lengh  : 100


100%|██████████| 100/100 [00:05<00:00, 18.15it/s]


p_loss 11.744327 v_loss 0.625600


100%|██████████| 100/100 [10:17<00:00,  6.17s/it]

vs_random =  [(-1, 4), (1, 96)] sum =  [(-1, 33), (1, 567)]
finished





In [58]:
# Search with trained net

tree = Tree(net)

tree.think(State().play('B2 A2 A3 C1'), 2000, show=True)

tree.think(State().play('E4 F5 E5 F6 E6 F7 E7'), 2000, show=True)

   1 2 3 4 5 6 7 8 9
A _ X O _ _ _ _ _ _
B _ O _ _ _ _ _ _ _
C X _ _ _ _ _ _ _ _
D _ _ _ _ _ _ _ _ _
E _ _ _ _ _ _ _ _ _
F _ _ _ _ _ _ _ _ _
G _ _ _ _ _ _ _ _ _
H _ _ _ _ _ _ _ _ _
I _ _ _ _ _ _ _ _ _
record = B2 A2 A3 C1
1.00 sec. best A8. q = -0.7342. n = 5 / 234. pv = A8
2.01 sec. best F6. q = -0.7389. n = 9 / 434. pv = F6
3.00 sec. best B1. q = -0.7648. n = 12 / 654. pv = B1
4.00 sec. best C6. q = -0.7613. n = 15 / 900. pv = C6
5.00 sec. best G4. q = -0.7769. n = 18 / 1144. pv = G4
6.00 sec. best H6. q = -0.7786. n = 23 / 1391. pv = H6
7.00 sec. best H6. q = -0.7860. n = 26 / 1643. pv = H6
8.00 sec. best C8. q = -0.7846. n = 29 / 1886. pv = C8
   1 2 3 4 5 6 7 8 9
A _ _ _ _ _ _ _ _ _
B _ _ _ _ _ _ _ _ _
C _ _ _ _ _ _ _ _ _
D _ _ _ _ _ _ _ _ _
E _ _ _ O O O O _ _
F _ _ _ _ X X X _ _
G _ _ _ _ _ _ _ _ _
H _ _ _ _ _ _ _ _ _
I _ _ _ _ _ _ _ _ _
record = E4 F5 E5 F6 E6 F7 E7
1.00 sec. best B7. q = -0.9232. n = 6 / 269. pv = B7
2.00 sec. best C1. q = -0.9304. n = 10 / 494. pv = C1
3.00 s

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)