# Define our policy

In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from math import *
import numpy as np

from ConnectN import ConnectN
# Board configuration shared by every cell below: a 6x6 board with N=4
# (presumably the number of pieces in a row needed to win -- confirm in ConnectN).
game_setting = {'size':(6,6), 'N':4}
# Module-level game instance; used below to build the policy and the move index list.
game = ConnectN(**game_setting)

class Policy(nn.Module):
    """Two-headed convolutional network for a single 6x6 board.

    The shared trunk is two bias-free convolutions.  On top of it sit
    * an action head producing a probability for each of the 36 cells, with
      occupied cells (absolute value 1 in the input) forced to probability 0;
    * a value head producing a scalar squashed into [-1, 1] by tanh.

    The layer sizes are hard-coded for a 6x6 input and ``forward`` reshapes the
    probabilities to ``(6, 6)``, so only one board per call is supported.
    """

    def __init__(self, game):
        """Build the layers.

        Args:
            game: accepted for interface compatibility; it is not read here.
        """
        super(Policy, self).__init__()

        # input = 6x6 board, 1 channel
        # 6x6x1 to 5x5x16 (2x2 kernel, stride 1, no padding)
        self.conv1 = nn.Conv2d(1, 16, kernel_size=2, stride=1, bias=False)
        # 5x5x16 to 3x3x32
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, bias=False)

        # number of features after flattening the 3x3x32 trunk output
        self.size = 3*3*32

        # the part for actions
        self.fc_action1 = nn.Linear(self.size, self.size//4)
        self.fc_action2 = nn.Linear(self.size//4, 36)

        # the part for the value function
        self.fc_value1 = nn.Linear(self.size, self.size//6)
        self.fc_value2 = nn.Linear(self.size//6, 1)
        self.tanh_value = nn.Tanh()

    def forward(self, x):
        """Evaluate one board.

        Args:
            x: tensor holding a single 6x6 board (36 elements once the
               batch/channel dimensions are dropped), e.g. shape (1, 1, 6, 6).

        Returns:
            (prob, value): ``prob`` is a (6, 6) tensor that is 0 on occupied
            cells and sums to 1 over the remaining ones; ``value`` is the
            tanh-squashed output of the value head.
        """
        trunk = F.leaky_relu(self.conv1(x))
        trunk = F.leaky_relu(self.conv2(trunk))
        trunk = trunk.view(-1, self.size)

        # action head
        logits = self.fc_action2(F.leaky_relu(self.fc_action1(trunk)))

        # A cell is unavailable when its absolute value is 1.  The mask is
        # derived from x, so it lives on the same device as the logits
        # (the previous torch.FloatTensor cast pinned it to the CPU).
        occupied = (torch.abs(x) == 1).reshape(-1, 36)

        # Softmax restricted to the available cells.  Setting occupied logits
        # to -inf *before* normalising means the internal max-shift is taken
        # over legal moves only, so a dominant logit on an occupied cell can
        # no longer underflow every legal move to 0 and produce 0/0 = NaN.
        masked_logits = logits.masked_fill(occupied, float('-inf'))
        prob = F.softmax(masked_logits.reshape(-1), dim=0)

        # value head
        value = self.tanh_value(self.fc_value2(F.leaky_relu(self.fc_value1(trunk))))
        return prob.view(6,6), value

# Run on the first CUDA device when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


# we use the adam optimizer with learning rate 1e-4 and weight decay 1e-4
# optim.SGD is also possible
import torch.optim as optim
policy = Policy(game).to(device)
optimizer = optim.Adam(policy.parameters(), lr=1.e-4, weight_decay=1.e-4)

# Build a flat list of board coordinates: np.indices puts the coordinate axis
# first, moveaxis moves it last, and the reshape flattens the board axes in
# row-major order, so entry k corresponds to element k of a flattened board.
indices=np.moveaxis(np.indices(game.size),0,-1)
indices=list(indices.reshape(-1, indices.shape[-1]))

def Policy_Player(game):
    """Pick a move by sampling directly from the policy network's output.

    Args:
        game: object exposing ``state`` (the board array) and ``player``.

    Returns:
        One entry of the module-level ``indices`` list (a board coordinate),
        drawn with the probabilities produced by the global ``policy``.
    """
    # Local import: the cell defining this function does not import `random`.
    import random

    # Create the input on whatever device the network's parameters live on,
    # so the call also works when the policy has been moved to the GPU.
    param_device = next(policy.parameters()).device

    # Multiplying by game.player presumably presents the board from the
    # point of view of the player to move -- confirm against ConnectN.
    frame = torch.tensor(game.state*game.player, dtype=torch.float,
                         device=param_device)
    board = frame.unsqueeze(0).unsqueeze(0)  # add batch and channel dims

    # Only a sample is needed here, so no autograd graph has to be built.
    with torch.no_grad():
        prob, _ = policy(board)

    # random.choices expects plain numeric weights, not a tensor.
    weights = prob.view(-1).tolist()
    return random.choices(indices, weights)[0]



In [15]:
%load_ext autoreload
%autoreload 2
import MCTS

from copy import deepcopy

def Policy_Player_MCTS(game, n_explore=3000, temperature=0.1):
    """Pick a move by running a tree search guided by the global ``policy``.

    Args:
        game: current game; a copy is handed to the search tree so the
            caller's object is not the one stored in the root node.
        n_explore: number of ``explore`` calls made on the root before a
            move is chosen (default 3000, the value previously hard-coded).
        temperature: forwarded to ``Node.next`` (default 0.1, as before).

    Returns:
        ``last_move`` of the game stored in the node selected by ``next``.
    """
    # Local import: the cell defining this function imports only `deepcopy`,
    # so `copy` would otherwise have to come from some other cell.
    from copy import copy

    mytree = MCTS.Node(copy(game))
    for _ in range(n_explore):
        mytree.explore(policy)

    # next() also returns search statistics; only the chosen node is needed.
    mytreenext, _ = mytree.next(temperature=temperature)

    return mytreenext.game.last_move

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
%load_ext autoreload
%autoreload 2
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.animation as animation
from copy import copy

from ConnectN import ConnectN
from Play import Play

import random

def Random_Player(game):
    """Return one of the game's currently available moves, chosen uniformly."""
    candidates = game.available_moves()
    return random.choice(candidates)



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Play a game of Advanced tic-tac-toe

In [23]:
% matplotlib notebook

# first player = red
# None = mouse-click human player
# player1 = AI allows a custom function AI(game) to dictate moves for a certain player
# i.e. player1 = Random_Player, player2 = Random_Player 
# pits two random players against each other
gameplay=Play(ConnectN(**game_setting), 
              player1=Policy_Player_MCTS, player2=Policy_Player_MCTS)



<IPython.core.display.Javascript object>

# Now try a more advanced 5-in-a-row game (Gomoku)

In [18]:
# Quick check: ask the MCTS player for its move on a freshly created board.
Policy_Player_MCTS(ConnectN(**game_setting))

(3, 5)

In [11]:
# initialize our alphazero agent and optimizer
# NOTE: this rebinds `game`, `policy` and `optimizer`, replacing the objects
# created in the first cell; the optimizer here uses lr=0.01 and
# weight_decay=1e-5 rather than the 1e-4 / 1e-4 used there.
game=ConnectN(**game_setting)
policy = Policy(game).to(device)
optimizer = optim.Adam(policy.parameters(), lr=.01, weight_decay=1.e-5)

In [20]:
# train our agent
#
# Each episode plays one game through the search tree, accumulates a loss
# term per move, and then performs a single optimizer step.

from collections import deque
import MCTS

episodes = 10000
# number of tree explorations performed before each move is chosen
explore_steps = 1000


import progressbar as pb
widget = ['training loop: ', pb.Percentage(), ' ', 
          pb.Bar(), ' ', pb.ETA() ]
timer = pb.ProgressBar(widgets=widget, maxval=episodes).start()

# Outcome of every finished game.  Kept as a plain list because the progress
# report below slices it with outcomes[-10:], which a deque does not support.
outcomes = []
# Loss of every episode, as a Python float.
policy_loss = []


for e in range(episodes):

    mytree = MCTS.Node(game)
    logterm = []
    vterm = []

    # play until the node reports an outcome
    while mytree.outcome is None:
        for _ in range(explore_steps):
            mytree.explore(policy)

        # remember who is to move before stepping to the child node
        current_player = mytree.game.player
        # p / nn_p are presumably the search-derived and network move
        # probabilities, v / nn_v the matching values -- confirm in MCTS.
        mytree, (v, nn_v, p, nn_p) = mytree.next()
        mytree.detach_mother()

        # -sum(p*log(nn_p) - p*log(p)) = sum(p*log(p/nn_p)), i.e. the KL
        # divergence from nn_p to p; entries with p == 0 contribute 0.
        loglist = torch.log(nn_p)*p
        # zeros_like keeps the fallback on the same dtype/device as p
        constant = torch.where(p>0, p*torch.log(p), torch.zeros_like(p))
        logterm.append(-torch.sum(loglist-constant))

        # multiply the network value by the player to move so that it can be
        # compared with the game outcome below
        vterm.append(nn_v*current_player)

    # we compute the "policy_loss" for computing gradient
    outcome = mytree.outcome
    outcomes.append(outcome)

    # squared value error plus the policy term, summed over all moves
    loss = torch.sum( (torch.stack(vterm)-outcome)**2 + torch.stack(logterm) )
    optimizer.zero_grad()
    loss.backward()
    policy_loss.append(float(loss))

    optimizer.step()

    if e%10==0:
        print("game: ",e+1, ", mean loss: {:3.2f}".format(np.mean(policy_loss[-20:])),
              ", recent outcomes: ", outcomes[-10:])
    # drop the reference to the loss tensor before the next episode
    del loss

    timer.update(e+1)

timer.finish()





training loop:   0% |                                   | ETA:  1 day, 19:48:47

game:  1 , mean loss: 11.71 , recent outcomes:  [1]


training loop:   0% |                                  | ETA:  2 days, 14:46:40

KeyboardInterrupt: 

In [30]:
# Save the whole module object (not just its state_dict) to disk.
# NOTE(review): loading a file saved this way presumably needs the Policy
# class to be available under the same name at load time -- confirm before
# relying on this file from another notebook.
torch.save(policy,'665-pie.policy')

  "type " + obj.__name__ + ". It won't be checked "
