# BERT

## import

In [1]:
!pip install git+https://github.com/huggingface/transformers

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-ssrgem3d
  Running command git clone -q https://github.com/huggingface/transformers /tmp/pip-req-build-ssrgem3d
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 12.5 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 44.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.1.1-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 6.3 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp

In [2]:
import pandas as pd
import numpy as np

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset

In [4]:
from transformers import BertConfig
from transformers import BertModel

## data

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
x_train_pd = pd.read_csv('/content/drive/MyDrive/reversi/dataset/train_boards.csv', index_col=0, header=0)
x_test_pd = pd.read_csv('/content/drive/MyDrive/reversi/dataset/test_boards.csv', index_col=0, header=0)
y_train_pd = pd.read_csv('/content/drive/MyDrive/reversi/dataset/train_labels.csv', index_col=0, header=0)
y_test_pd = pd.read_csv('/content/drive/MyDrive/reversi/dataset/test_labels.csv', index_col=0, header=0)

  mask |= (ar1 == a)


In [7]:
x_train_list = x_train_pd.values.tolist()
x_test_list = x_test_pd.values.tolist()

In [8]:
tokenized_x_train = []
tokenized_x_test = []

for i in x_train_list:
  #下駄を履かせる
  new_list = [(j+150) for j in i]
  tokenized_x_train.append([101] + new_list + [102])

for i in x_test_list:
  #下駄を履かせる
  new_list = [(j+150) for j in i]
  tokenized_x_test.append([101] + new_list + [102])

#attentionの作成
train_attention = torch.ones(len(tokenized_x_train), 66)
test_attention = torch.ones(len(tokenized_x_test), 66)

In [9]:
#次元削減
y_train = y_train_pd.values.squeeze()
y_test = y_test_pd.values.squeeze()

In [10]:
#pytorch用のDataset作成
class ClassifierDataset(Dataset):
    
    def __init__(self, x_data, attention, y_data):
        self.x_data = x_data
        self.attention = attention
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.x_data[index], self.attention[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.x_data)

In [11]:
#bertの入力はlongの型しか受け付けないので注意（ベクトルなので）
train_dataset = ClassifierDataset(torch.tensor(tokenized_x_train).long(), train_attention.long(), torch.from_numpy(y_train).long())
test_dataset = ClassifierDataset(torch.tensor(tokenized_x_test).long(), test_attention.long(), torch.from_numpy(y_test).long())

In [12]:
#DataLoader作成

BATCH_SIZE = 50

trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
testloader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

## model

In [13]:
#パラメータはbert将棋を参考、要改善
#vocab_sizeは入力の種類、hidden_sizeは出力の数

config = {
    'vocab_size': 152,
    'hidden_size': 768,
    'num_hidden_layers': 12,
    'num_attention_heads': 12,
    'intermediate_size': 3072,
    'hidden_act': 'gelu',
    'hidden_dropout_prob': 0.1,
    'attention_probs_dropout_prob': 0.1,
    'max_position_embeddings': 512, 
    'type_vocab_size': 1, 
    'initializer_range': 0.02,
}

config = BertConfig.from_dict(config)

In [14]:
#モデル作成
#sigmoidが必要かどうか
class BertNextAction(nn.Module):
  def __init__(self, model_config):
    super().__init__()
    self.bert = BertModel(model_config)
    self.layer_output = nn.Linear(768, 64)

    # self.tanh = nn.Tanh()
    # self.sigmoid = nn.Sigmoid()

#labelをどうするか
#学習と予測両方が行えるようにlabelsを設定する必要がある
  def forward(self, input_ids, attention_mask):
    x = self.bert(input_ids=input_ids, attention_mask=attention_mask)['last_hidden_state']
    x = self.layer_output(x).mean(axis=1)
    # x = self.sigmoid(x)
    return x

In [15]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [16]:
model_bert = BertNextAction(config).to(device)

In [17]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model_bert.parameters(), lr=0.001)

## train

In [24]:
def train_epoch(model, optimizer, criterion, dataloader, device):
    train_loss = 0
    model.train()

    for i, (boards, attention, labels) in enumerate(dataloader):

        boards, attention, labels = boards.to(device), attention.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(boards, attention)
        loss = criterion(outputs, labels)
        loss.backward()

        optimizer.step()

        train_loss += loss.item()

        # if i % 2000 == 1999:    # print every 2000 mini-batches
            # print(f'Epoch [{epoch+1}, {i+1}], train_Loss : {train_loss:.4f}')

    train_loss = train_loss / len(dataloader.dataset)
    return train_loss

In [25]:
def inference(model, optimizer, criterion, dataloader, devide):

    model.eval()
    test_loss=0
    preds = []


    with torch.no_grad():
        for j, (boards, attention, labels) in enumerate(dataloader):

            boards, attention, labels = boards.to(device), attention.to(device), labels.to(device)

            outputs = model(boards, attention)
            loss = criterion(outputs, labels)

            test_loss += loss.item()

    test_loss = test_loss / len(dataloader.dataset)
    return test_loss

In [26]:
def run(num_epochs, model, optimizer, criterion, trainloader, testloader, device):
    train_loss_list = []
    test_loss_list = []
    for epoch in range(num_epochs):
        train_loss = train_epoch(model, optimizer, criterion, trainloader, device)
        test_loss = inference(model, optimizer, criterion, testloader, device)

        print(f'Epoch [{epoch+1}], train_Loss : {train_loss:.4f}, val_Loss : {test_loss:.4f}')
        train_loss_list.append(train_loss)
        test_loss_list.append(test_loss)
    return train_loss_list, test_loss_list

In [None]:
train_loss_list, test_loss_list = run(2, model_bert, optimizer, criterion, trainloader, testloader, device)

In [None]:
for epoch in range(2):  # loop over the dataset multiple times

    running_loss = 0.0
    model_bert.train()
    for i, (inputs, attention, labels) in enumerate(trainloader):
        # get the inputs; data is a list of [inputs, labels]
        inputs, attention, labels = inputs.to(device), attention.to(device), labels.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model_bert(inputs, attention)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:    # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0

print('Finished Training')

[1,  2000] loss: 4.154
[1,  4000] loss: 4.104
[1,  6000] loss: 4.100
[1,  8000] loss: 4.098
[1, 10000] loss: 4.097
[1, 12000] loss: 4.096
[1, 14000] loss: 4.096
[1, 16000] loss: 4.096


In [None]:
torch.save(model.state_dict(), '/content/drive/MyDrive/reversi/sample_bert.pth')