# BERT model

Train a BERT model that classifies text into emoji labels.

TODO

- 前処理はどのステップでするべきか考える。DataSetに与えても良いが、推論時に困ることになるかも


## Parameters
Declare the parameters that are set by `papermill`.

In [None]:
# Default values for this run; the surrounding notebook text says these are set by papermill.
name = "bert-model"  # run name; used for the model output sub-directory and the TensorBoard log dir
data_dir = "data"  # directory that holds train.tsv and valid.tsv
output_dir = "output"  # root directory for saved models and TensorBoard runs
batch_size = 32
num_epochs = 10  # 3-5 epochs are enough for fine-tuning
learning_rate = 5e-5
max_grad_norm = 1.0  # maximum gradient norm used for gradient clipping
warmup_rate = 0.1  # fraction of the total training steps used for LR warmup; warmup is needed for stable convergence
tune_layer = "all"  # which layers to fine-tune: "all", "last_and_classifier" or "classifier"
seed = 1234
patience = 3  # early stopping: stop once the validation loss failed to improve more than `patience` epochs in a row

## Install dependent packages

This section installs the required packages. Versions are pinned for reproducibility.

In [None]:
! pip install transformers==2.5.1
! pip install matplotlib==3.2.0
! pip install pandas==1.0.1
! pip install mecab-python3==0.996.2
! pip install tqdm==4.43.0
! pip install tensorboard==2.1.1  # required by torch.utils.tensorboard

## Test library

Test all the libraries used in this notebook.

## Preprocess dataset

## Train model

Set seed.

In [None]:
import torch
import numpy as np
import random


def set_seed(seed):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    then puts cuDNN into deterministic mode with auto-tuning turned off.

    Args:
        seed: Integer seed handed to each generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,  # relevant when a GPU is used
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # cuDNN settings for run-to-run reproducibility.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

In [None]:
import transformers


def build_tokenizer():
    """Load the pretrained ``bert-base-japanese`` tokenizer."""
    pretrained_name = "bert-base-japanese"
    tokenizer_cls = transformers.BertJapaneseTokenizer
    return tokenizer_cls.from_pretrained(pretrained_name)


def build_model(num_labels):
    """Load pretrained ``bert-base-japanese`` as a sequence classifier.

    Args:
        num_labels: Number of target classes, forwarded to ``from_pretrained``.

    Returns:
        A ``transformers.BertForSequenceClassification`` instance.
    """
    return transformers.BertForSequenceClassification.from_pretrained(
        "bert-base-japanese",
        num_labels=num_labels,
    )

In [None]:
# A Dataset must define __getitem__ and __len__.

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes ``(text, label)`` pairs on access.

    Args:
        text_label_list: Indexable collection of ``(text, label)`` pairs.
        tokenizer: Object whose ``encode(text)`` returns the token ids.
    """

    def __init__(self, text_label_list, tokenizer):
        self._tokenizer = tokenizer
        self._text_label_list = text_label_list

    def __len__(self):
        """Return the number of samples."""
        samples = self._text_label_list
        return len(samples)

    def __getitem__(self, index):
        """Return ``(token_ids, label)`` for the sample at ``index``."""
        sample = self._text_label_list[index]
        text, label = sample
        return self._tokenizer.encode(text), label
    
    
class PaddingCollation:
    """Collate function that pads token-id sequences to a common length."""

    def __call__(self, ids_label_list):
        """Turn a list of ``(token_ids, label)`` samples into two tensors.

        Args:
            ids_label_list: e.g.
                ``[([2, 4609, 3], 1), ([2, 10350, 25746, 28450, 3], 0)]``.

        Returns:
            ``(ids, labels)``: a batch-first LongTensor of ids, padded to the
            longest sequence with ``pad_sequence``'s default value (0), and a
            LongTensor of labels.
        """
        sequences = []
        targets = []
        for sample in ids_label_list:
            sequences.append(torch.LongTensor(sample[0]))
            targets.append(sample[1])

        padded_ids = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True)
        return padded_ids, torch.LongTensor(targets)

    
class LabelMapper:
    """Convert between emoji label strings and the integer ids used in training.

    Call :meth:`build` before using the lookup methods; until then both
    mappings are empty.
    """

    def __init__(self):
        self._label_to_id = {}
        self._id_to_label = {}

    def build(self, labels):
        """Assign consecutive ids (0, 1, ...) to the distinct labels in sorted order.

        Args:
            labels: Iterable of hashable, mutually comparable labels;
                duplicates are allowed and collapsed.

        Returns:
            ``self``, so the call can be chained after the constructor.
        """
        unique_labels = sorted(set(labels))
        self._label_to_id = {label: index for index, label in enumerate(unique_labels)}
        # Key the reverse map by the integer id (the previous code used the
        # builtin ``id`` function as the key, so lookups by id always failed).
        self._id_to_label = dict(enumerate(unique_labels))
        return self

    def id(self, label):
        """Return the integer id of ``label``; raises ``KeyError`` if unknown."""
        return self._label_to_id[label]

    def label(self, id):
        """Return the label for integer ``id``; raises ``KeyError`` if unknown."""
        return self._id_to_label[id]

    def labels(self):
        """Return all known labels, ordered by their id."""
        return list(self._label_to_id)
    
    
def build_data_loader(fd, label_mapper, tokenizer, batch_size, shuffle):
    """Build a ``DataLoader`` from ``label<TAB>text`` lines.

    All lines are read from ``fd`` before this function returns, so the caller
    may close ``fd`` afterwards.

    Args:
        fd: Iterable of lines (e.g. an open text file), one sample per line.
        label_mapper: Object whose ``id(label_str)`` returns the integer label.
        tokenizer: Tokenizer handed to ``Dataset``.
        batch_size: Number of samples per batch.
        shuffle: Whether the loader reshuffles the samples.

    Returns:
        A ``torch.utils.data.DataLoader`` that yields padded id/label batches.

    Raises:
        ValueError: If a line contains no tab separator.
    """
    data = []
    for line_no, line in enumerate(fd, start=1):
        # Split on the first tab only so that text containing tabs stays intact.
        label_str, separator, text = line.strip("\n").partition("\t")
        if not separator:
            raise ValueError(
                "line {}: expected 'label<TAB>text', got {!r}".format(line_no, line)
            )
        data.append((text, label_mapper.id(label_str)))

    dataset = Dataset(data, tokenizer)
    return torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        collate_fn=PaddingCollation(),
        shuffle=shuffle,
    )

In [None]:
import tqdm
import os
import sys


def train_model(model_output_dir, net, tokenizer, dataloader_dict, optimizer, scheduler, writer, num_epochs, max_grad_norm, patience, device):
    """Fine-tune ``net`` with per-epoch validation, checkpointing and early stopping.

    Epoch 0 runs validation only, to record the performance before any
    training; epochs 1..``num_epochs`` each run a training pass followed by a
    validation pass. Whenever the validation loss improves, the model and the
    tokenizer are saved to ``<model_output_dir>/epoch-<n>``.

    Args:
        model_output_dir: Directory under which checkpoints are written.
        net: Model called as ``net(input_ids, attention_mask=..., labels=...)``
            returning ``(loss, logits)``; must provide ``save_pretrained``.
        tokenizer: Tokenizer saved next to each checkpoint.
        dataloader_dict: ``{"train": loader, "val": loader}`` yielding
            ``(input_ids, labels)`` batches.
        optimizer: Optimizer over the parameters of ``net``.
        scheduler: Learning-rate scheduler stepped once per batch, or a falsy
            value to disable scheduling.
        writer: TensorBoard-style writer providing ``add_scalars``.
        num_epochs: Number of training epochs.
        max_grad_norm: Maximum gradient norm used for clipping.
        patience: Training stops once the validation loss has failed to
            improve more than ``patience`` epochs in a row.
        device: Device the model and the batches are moved to.

    Returns:
        ``(best_model, best_metric, best_epoch)``. ``best_metric`` holds the
        ``"loss"``, ``"top1"`` and ``"top5"`` validation scores of the best
        epoch. NOTE: ``best_model`` is the same object as ``net``, so it
        carries the weights of the last epoch that ran; the weights of the
        best epoch are the ones saved on disk.
    """
    PHASE_TRAIN = "train"
    PHASE_VAL = "val"
    # Batches are padded with pad_sequence's default value 0; positions holding
    # that value are masked out of attention. Assumes no real token uses id 0
    # (0 is expected to be [PAD] in the BERT vocabulary) -- TODO confirm.
    PAD_ID = 0

    # Keep the best model.
    best_model = None
    best_epoch = 0
    best_metric = {"loss": float("infinity"), "top1": 0, "top5": 0}

    # Number of training iterations (optimizer steps) performed so far.
    num_iters = 0

    # Number of consecutive epochs without validation-loss improvement.
    num_patience = 0

    net.to(device)

    for epoch in range(num_epochs + 1):
        print("Epoch {}/{}".format(epoch, num_epochs))
        # Training pass followed by validation pass.
        for phase in [PHASE_TRAIN, PHASE_VAL]:
            is_train = phase == PHASE_TRAIN

            # Epoch 0 only measures the validation performance before training.
            if epoch == 0 and is_train:
                continue

            # Switch the network mode; this affects Dropout and similar layers.
            if is_train:
                net.train()
            else:
                net.eval()

            epoch_loss = 0
            epoch_corrects = 0
            epoch_topk_corrects = 0

            for inputs, labels in tqdm.tqdm(dataloader_dict[phase], disable=True):
                # Move the batch to the target device (GPU when available).
                inputs = inputs.to(device)
                labels = labels.to(device)
                attention_mask = (inputs != PAD_ID).long()

                optimizer.zero_grad()

                # Build the autograd graph only while training.
                with torch.set_grad_enabled(is_train):
                    # Passing labels makes the model return the loss as well.
                    loss, logits = net(inputs, attention_mask=attention_mask, labels=labels)
                    preds = logits.argmax(dim=1)
                    # Guard against tasks with fewer than five classes.
                    top_k = min(5, logits.size(1))
                    _, topk_preds = torch.topk(logits, k=top_k, dim=1)

                    if is_train:
                        loss.backward()
                        # Clip BEFORE the optimizer step; clipping afterwards
                        # has no effect on the update.
                        torch.nn.utils.clip_grad_norm_(net.parameters(), max_grad_norm)
                        optimizer.step()
                        num_iters += 1

                        if scheduler:
                            scheduler.step()

                batch_loss = loss.item()
                # Accumulate the loss weighted by the batch size.
                epoch_loss += batch_loss * inputs.size(0)
                # Accumulate the number of correct top-1 / top-k predictions.
                epoch_corrects += (preds == labels).sum().item()
                epoch_topk_corrects += (topk_preds == labels.unsqueeze(1)).any(dim=1).sum().item()

                # Per-iteration TensorBoard curves are drawn for training only.
                if is_train:
                    writer.add_scalars("train/loss", {PHASE_TRAIN: batch_loss}, num_iters)
                    # Read the learning rate from the optimizer so this also
                    # works when no scheduler is supplied.
                    current_lr = optimizer.param_groups[0]["lr"]
                    writer.add_scalars("train/lr", {PHASE_TRAIN: current_lr}, num_iters)

            num_samples = len(dataloader_dict[phase].dataset)
            epoch_loss = epoch_loss / num_samples
            epoch_acc = epoch_corrects / num_samples
            epoch_topk_acc = epoch_topk_corrects / num_samples

            print("phase {}, loss: {:.4f}, acc: {:.4f}, topk acc: {:.4f}".format(phase, epoch_loss, epoch_acc, epoch_topk_acc))

            if phase == PHASE_VAL:
                writer.add_scalars("train/loss", {PHASE_VAL: epoch_loss}, num_iters)
                writer.add_scalars("acc/top1", {PHASE_VAL: epoch_acc}, num_iters)
                writer.add_scalars("acc/top5", {PHASE_VAL: epoch_topk_acc}, num_iters)

                if best_metric["loss"] > epoch_loss:
                    best_model = net
                    best_metric = {"loss": epoch_loss, "top1": epoch_acc, "top5": epoch_topk_acc}
                    best_epoch = epoch
                    num_patience = 0
                    # Save the model and the tokenizer for this epoch.
                    print("Save model, epoch:", epoch)
                    save_dir = os.path.join(model_output_dir, "epoch-{}".format(epoch))
                    os.makedirs(save_dir, exist_ok=True)
                    net.save_pretrained(save_dir)
                    tokenizer.save_pretrained(save_dir)
                else:
                    num_patience += 1
                    print("Patience {}, epoch: {}".format(num_patience, epoch))

                # Early stopping.
                if num_patience > patience:
                    return best_model, best_metric, best_epoch
    return best_model, best_metric, best_epoch


In [None]:
# Setup train environment
from torch.utils.tensorboard import SummaryWriter


# Helpers that choose which layers are fine-tuned (by toggling requires_grad)
def tune_all(net):
    """Mark every parameter of ``net`` as trainable."""
    for weight in net.parameters():
        weight.requires_grad = True

        
def tune_classifier(net):
    """Freeze all of ``net`` except the parameters of ``net.classifier``."""
    # Freeze everything first ...
    for weight in net.parameters():
        weight.requires_grad = False
    # ... then re-enable gradients for the classifier only.
    for weight in net.classifier.parameters():
        weight.requires_grad = True

        
def tune_last_and_classifier(net):
    """Freeze ``net`` except its last encoder layer and its classifier.

    Only ``net.bert.encoder.layer[-1]`` and ``net.classifier`` stay trainable.
    """
    # Freeze everything first ...
    for weight in net.parameters():
        weight.requires_grad = False

    # ... then re-enable the last encoder layer ...
    last_layer = net.bert.encoder.layer[-1]
    for weight in last_layer.parameters():
        weight.requires_grad = True

    # ... and the classifier.
    for weight in net.classifier.parameters():
        weight.requires_grad = True

In [None]:
import os

def train():
    """Run the whole training pipeline using the notebook-level parameters.

    Reads ``train.tsv`` and ``valid.tsv`` from ``data_dir``, builds the
    tokenizer, label mapping, data loaders, model, optimizer and LR scheduler,
    then delegates to ``train_model``.

    Returns:
        The ``(best_model, best_metric, best_epoch)`` tuple from ``train_model``.

    Raises:
        KeyError: If ``tune_layer`` is not "all", "last_and_classifier" or
            "classifier".
    """
    print("Running", name)

    # Create the directory the model is saved to (including missing parents).
    model_output_dir = os.path.join(output_dir, name)
    os.makedirs(model_output_dir, exist_ok=True)

    # Fix the seed for reproducibility.
    set_seed(seed=seed)

    train_path = data_dir + "/train.tsv"
    valid_path = data_dir + "/valid.tsv"

    # Build the dataset. Files are opened as UTF-8 and closed right after being
    # read; build_data_loader consumes every line before it returns.
    tokenizer = build_tokenizer()
    with open(train_path, encoding="utf-8") as fd:
        label_mapper = LabelMapper().build([line.split("\t")[0] for line in fd])
    with open(train_path, encoding="utf-8") as fd:
        train_loader = build_data_loader(fd, label_mapper, tokenizer, batch_size=batch_size, shuffle=True)
    with open(valid_path, encoding="utf-8") as fd:
        valid_loader = build_data_loader(fd, label_mapper, tokenizer, batch_size=batch_size, shuffle=False)
    dataloader_dict = {"train": train_loader, "val": valid_loader}

    label_names = label_mapper.labels()
    print(len(label_names))
    print(label_names)

    # Select the function that decides which layers are tuned.
    tune_func = {
        "all": tune_all,
        "last_and_classifier": tune_last_and_classifier,
        "classifier": tune_classifier,
    }[tune_layer]

    # Build the model, optimizer and scheduler.
    model = build_model(num_labels=len(label_names))
    tune_func(model)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    total_steps = len(train_loader) * num_epochs
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(total_steps * warmup_rate),  # step counts are integers
        num_training_steps=total_steps,
    )

    writer = SummaryWriter(log_dir=output_dir + "/runs/" + name)
    try:
        return train_model(model_output_dir, model, tokenizer, dataloader_dict, optimizer, scheduler, writer, num_epochs, max_grad_norm, patience, device)
    finally:
        # Flush pending TensorBoard events even when training fails.
        writer.close()

In [None]:
# Run the training pipeline; the results stay in notebook globals for the evaluation cell.
best_model, best_metric, best_epoch = train()

## Evaluate model

In [None]:
# Report the epoch with the lowest validation loss and its validation scores.
print("Best model in epoch {}".format(best_epoch))
print("Scores: {}".format(best_metric))