# BERT model

Train BERT model for emoji.

## Install dependent packages

This section installs required package. Version should be specified for reproducibility.

In [1]:
! pip install transformers==2.5.1
! pip install matplotlib==3.2.0
! pip install pandas==1.0.1
! pip install mecab-python3==0.996.2
! pip install tqdm==4.43.0
! pip install tensorboard==2.1.1  # required by torch.utils.tensorboard
! pip install attrdict==2.0.1

Collecting transformers==2.5.1
[?25l  Downloading https://files.pythonhosted.org/packages/13/33/ffb67897a6985a7b7d8e5e7878c3628678f553634bd3836404fef06ef19b/transformers-2.5.1-py3-none-any.whl (499kB)
[K     |████████████████████████████████| 501kB 6.3MB/s eta 0:00:01
[?25hCollecting tokenizers==0.5.2 (from transformers==2.5.1)
[?25l  Downloading https://files.pythonhosted.org/packages/d1/3f/73c881ea4723e43c1e9acf317cf407fab3a278daab3a69c98dcac511c04f/tokenizers-0.5.2-cp36-cp36m-manylinux1_x86_64.whl (3.7MB)
[K     |████████████████████████████████| 3.7MB 10.7MB/s eta 0:00:01
[?25hCollecting boto3 (from transformers==2.5.1)
[?25l  Downloading https://files.pythonhosted.org/packages/2a/4f/3facbb42e8d07db1ef9b8cefb28dd1dbfcd52a8e32a0323d57f59b10e147/boto3-1.12.31-py2.py3-none-any.whl (128kB)
[K     |████████████████████████████████| 133kB 10.1MB/s eta 0:00:01
[?25hCollecting regex!=2019.12.17 (from transformers==2.5.1)
[?25l  Downloading https://files.pythonhosted.org/packages/

## Test library

Test your all the libraries used in this notebook.

## Define parameters
Declare parameters set by `papermill` .

In [2]:
name = "bert-model"
data_dir = "data_sample"
output_dir = "output"
batch_size = 32
num_epochs = 10  # 3-5 epoches are enough for fine-tuning
learning_rate = 5e-5
max_grad_norm = 1.0  # max grad norm needed for stable convergence
warmup_rate = 0.1  # max grad norm needed for stable convergence
tune_layer = "all"
seed = 1234
patience = 3  # if the validation metric is not improved in the count of patience, stop training

In [3]:
# Parameters
data_dir = "data"
tune_layer = "all"
warmup_rate = 0
name = "bert-tune_layer_all-warmup_rate_0"


Create an attribute object `param` from parameters, then delete parameter variables to clean this namespace.

In [4]:
import attrdict

params = attrdict.AttrDict({
    "name": name,
    "data_dir": data_dir,
    "output_dir": output_dir,
    "batch_size": batch_size,
    "num_epochs": num_epochs,
    "learning_rate": learning_rate,
    "max_grad_norm": max_grad_norm,
    "warmup_rate": warmup_rate,
    "tune_layer": tune_layer,
    "seed": seed,
    "patience": patience
})

del name
del data_dir
del output_dir
del batch_size
del num_epochs
del learning_rate
del max_grad_norm
del warmup_rate
del tune_layer
del seed
del patience

## Define preprocessor and tokenizer

In [5]:
import transformers


def build_tokenizer():
    return transformers.BertJapaneseTokenizer.from_pretrained("bert-base-japanese")

In [6]:
class LabelMapper:
    # 絵文字のラベルと学習時のIDの変換を行うクラス
    def __init__(self):
        pass
        
    def build(self, labels):
        id_ = 0
        label_to_id = dict()
        id_to_label = dict()
        
        for label in sorted(labels):
            if label in label_to_id:
                continue
            label_to_id[label] = id_
            id_to_label[id_] = label
            id_ += 1
        self._label_to_id = label_to_id
        self._id_to_label = id_to_label
        return self
        
    def id(self, label):
        return self._label_to_id[label]
    
    def label(self, id):
        return self._id_to_label[id]
    
    def labels(self):
        return list(self._label_to_id.keys())
    
    def save(self, path):
        path = os.path.join(path, "label.txt")
        with open(path, "w") as fd:
            for id_, label in self._id_to_label.items():
                print("{}\t{}".format(id_, label), file=fd)
                
    def load(self, path):
        label_to_id = dict()
        id_to_label = dict()
        
        path = os.path.join(path, "label.txt")        
        with open(path) as fd:
            for line in fd:
                id_, label = line.strip("\n").split("\t")
                id_ = int(id_)
                label_to_id[label] = id_
                id_to_label[id_] = label
        
        self._label_to_id = label_to_id
        self._id_to_label = id_to_label
        return self

## Define dataset

LabelMapper implements conversion between emoji labels and IDs.

In [7]:
import torch


# DataSetは __getitem__, __len__を定義する

class Dataset(torch.utils.data.Dataset):
    def __init__(self, text_label_list, tokenizer):
        self._text_label_list = text_label_list
        self._tokenizer = tokenizer
        
    def __len__(self):
        return len(self._text_label_list)
    
    def __getitem__(self, index):
        text, label = self._text_label_list[index]
        ids = self._tokenizer.encode(text)
        return ids, label
    
    
class PaddingCollation:
    def __call__(self, ids_label_list):
        # ids_list is like [([2, 4609, 3], 1), ([2, 10350, 25746, 28450, 3], 0)]
        ids_list = torch.nn.utils.rnn.pad_sequence([torch.LongTensor(x[0]) for x in ids_label_list], batch_first=True)
        label_list = torch.LongTensor([x[1] for x in ids_label_list])
        return ids_list, label_list


def build_data_loader(fd, label_mapper, tokenizer, batch_size, shuffle):
    data = []
    for line in fd:
        label_str, text = line.strip("\n").split("\t")
        label = label_mapper.id(label_str)
        data.append((text, label))
    dataset = Dataset(data, tokenizer)
    col = PaddingCollation()
    return torch.utils.data.DataLoader(dataset, batch_size=batch_size, collate_fn=col, shuffle=shuffle)

## Define model

In [8]:
def build_model(num_labels):
    #config = transformers.BertConfig.from_pretrained("bert-base-japanese", num_labels=num_labels)
    #model = transformers.BertForSequenceClassification.from_pretrained("bert-base-japanese", config=config)
    model = transformers.BertForSequenceClassification.from_pretrained("bert-base-japanese", num_labels=num_labels)
    return model

## Build and save vocabulary

In [9]:
import os

model_dir = os.path.join(params.output_dir, params.name)
if not os.path.exists(model_dir):
    os.mkdir(model_dir)

label_mapper = LabelMapper().build([x.split("\t")[0] for x in open(params.data_dir + "/train.tsv")])
label_mapper.save(model_dir)

label_mapper = LabelMapper().load(model_dir)

In [10]:
print(len(label_mapper.labels()))
print(label_mapper.labels())

85
[':OK_hand:', ':anger_symbol:', ':beaming_face_with_smiling_eyes:', ':beating_heart:', ':birthday_cake:', ':blue_heart:', ':cherry_blossom:', ':clapping_hands:', ':confetti_ball:', ':confounded_face:', ':crying_face:', ':dog_face:', ':double_exclamation_mark:', ':downcast_face_with_sweat:', ':drooling_face:', ':droplet:', ':exclamation_mark:', ':exclamation_question_mark:', ':eyes:', ':face_blowing_a_kiss:', ':face_savoring_food:', ':face_screaming_in_fear:', ':face_with_hand_over_mouth:', ':face_with_rolling_eyes:', ':face_with_tears_of_joy:', ':fire:', ':flexed_biceps:', ':flushed_face:', ':folded_hands:', ':folded_hands_light_skin_tone:', ':four_leaf_clover:', ':glowing_star:', ':green_heart:', ':grinning_face_with_big_eyes:', ':grinning_face_with_smiling_eyes:', ':grinning_face_with_sweat:', ':grinning_squinting_face:', ':growing_heart:', ':heart_suit:', ':heavy_heart_exclamation:', ':hugging_face:', ':index_pointing_up:', ':kissing_face_with_closed_eyes:', ':loudly_crying_face:

## Train and save model

In [11]:
import torch
import numpy as np
import random


def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # When use GPU
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [12]:
import tqdm
import os
import sys


def train_model(model_dir, net, dataloader_dict, train_config):
    PHASE_TRAIN = "train"
    PHASE_VAL = "val"
    
    # keep the best model
    best_model = None
    best_epoch = 0
    best_metric = {"loss": float("infinity"), "top1": 0, "top5": 0}
    
    # 学習イテレーションの回数を保持
    num_iters = 0
    
    # keep the count which the validation metric does not improved
    num_patience = 0
        
    net.to(train_config.device)
    
    for epoch in range(train_config.num_epochs+1):
        print("Epoch {}/{}".format(epoch, train_config.num_epochs))
        # 学習と検証のループ
        for phase in [PHASE_TRAIN, PHASE_VAL]:
            # フェーズによってネットワークのモードを変更する
            # Dropout等の挙動に影響あり
            if phase == PHASE_TRAIN:
                net.train()
            elif phase == PHASE_VAL:
                net.eval()
            else:
                raise Exception("got {} expected one of {}".format(phase, [PHASE_TRAIN, PHASE_VAL]))
                
            epoch_loss = 0
            epoch_corrects = 0
            epoch_topk_corrects = 0
            
            # 未学習時の検証性能を確かめる
            if epoch == 0 and phase == PHASE_TRAIN:
                continue
                
            for inputs, labels in tqdm.tqdm(dataloader_dict[phase], disable=True):
                # GPUが使える場合はGPUにデータを送る
                inputs = inputs.to(train_config.device)
                labels = labels.to(train_config.device)
                
                # Initialize optimizer
                if phase == PHASE_TRAIN:
                    train_config.optimizer.zero_grad()
                
                # set_grad_enabled(phrase=="train") で
                # 学習時のみ勾配計算できるようにグラフ作成する
                with torch.set_grad_enabled(phase==PHASE_TRAIN):
                    # labelsを指定することでlossを計算する
                    loss, logits = net(inputs, labels=labels)
                    _, preds = torch.max(logits, dim=1)
                    _, topk_preds = torch.topk(logits, k=5, dim=1)
                    
                    if phase == PHASE_TRAIN:
                        loss.backward()
                        torch.nn.utils.clip_grad_norm_(net.parameters(), train_config.max_grad_norm)

                        train_config.optimizer.step()
                        train_config.scheduler.step()
                        num_iters += 1

                    # epoch loss を更新
                    epoch_loss += loss.item() * inputs.size()[0]
                    # 正解数を更新
                    epoch_corrects += (preds == labels).sum().item()
                    epoch_topk_corrects += (topk_preds == labels.unsqueeze(1)).max(dim=1)[0].sum().item()

                    # TensorBoardへの描画を行う
                    # 学習時のみlossを描画
                    if train_config.writer and phase == PHASE_TRAIN:
                        train_config.writer.add_scalars("train/loss", {PHASE_TRAIN: loss.item()}, num_iters)
                        train_config.writer.add_scalars("train/lr", {PHASE_TRAIN: train_config.scheduler.get_lr()[0]}, num_iters)

            epoch_loss = epoch_loss / len(dataloader_dict[phase].dataset)
            epoch_acc = epoch_corrects / len(dataloader_dict[phase].dataset)
            epoch_topk_acc = epoch_topk_corrects / len(dataloader_dict[phase].dataset)

            print("phase {}, loss: {:.4f}, acc: {:.4f}, topk acc: {:.4f}".format(phase, epoch_loss, epoch_acc, epoch_topk_acc))
            
            if train_config.writer and phase == PHASE_VAL:
                train_config.writer.add_scalars("train/loss", {PHASE_VAL: epoch_loss}, num_iters)
                train_config.writer.add_scalars("acc/top1", {PHASE_VAL: epoch_acc}, num_iters)
                train_config.writer.add_scalars("acc/top5", {PHASE_VAL: epoch_topk_acc}, num_iters)

            if phase == PHASE_VAL:
                if best_metric["loss"] > epoch_loss:
                    best_model = net
                    best_metric = {"loss": epoch_loss, "top1": epoch_acc, "top5": epoch_topk_acc}
                    best_epoch = epoch
                    num_patience = 0

                    # save model
                    if model_dir:
                        print("Save model, epoch:", epoch)
                        net.save_pretrained(model_dir)
                else:
                    num_patience += 1
                    print("Patience {}, epoch: {}".format(num_patience, epoch))
                    
                if num_patience > train_config.patience:
                    return best_model, best_metric, best_epoch
    return best_model, best_metric, best_epoch

In [13]:
# Setup train environment
from torch.utils.tensorboard import SummaryWriter


# Optimizer
def tune_all(net):
    for name, param in net.named_parameters():
        param.requires_grad = True

        
def tune_classifier(net):
    for name, param in net.named_parameters():
        param.requires_grad = False
    for name, param in net.classifier.named_parameters():
        param.requires_grad = True

        
def tune_last_and_classifier(net):
    for name, param in net.named_parameters():
        param.requires_grad = False
    for name, param in net.bert.encoder.layer[-1].named_parameters():
        param.requires_grad = True
    for name, param in net.classifier.named_parameters():
        param.requires_grad = True

In [14]:
def train(params, model_dir, label_mapper):
    # Fix seed for reproducability
    set_seed(seed=params.seed)

    # Build dataset
    tokenizer = build_tokenizer()
    dataloader_dict = {
        "train": build_data_loader(open(params.data_dir + "/train.tsv"), label_mapper, tokenizer, batch_size=params.batch_size, shuffle=True),
        "val": build_data_loader(open(params.data_dir + "/valid.tsv"), label_mapper, tokenizer, batch_size=params.batch_size, shuffle=False),
    }

    # define function which layers to tune
    tune_func = {
        "all": tune_all,
        "last_and_classifier": tune_last_and_classifier,
        "classifier": tune_classifier,
    }[params.tune_layer]
    
    # build model
    net = build_model(num_labels=len(label_mapper.labels()))
    tune_func(net)
    
    # define config
    total_steps = len(dataloader_dict["train"]) * params.num_epochs
    optimizer = torch.optim.Adam(net.parameters(),  lr=params.learning_rate)

    train_config = attrdict.AttrDict({
        "optimizer": optimizer,
        "writer": SummaryWriter(log_dir=os.path.join(params.output_dir , "runs", params.name)),
        "device": torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
        "scheduler": transformers.get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=total_steps*params.warmup_rate,
            num_training_steps=total_steps
        ),
        "patience": params.patience,
        "max_grad_norm": params.max_grad_norm,
        "num_epochs": params.num_epochs,
    })
                                   
    return train_model(model_dir, net, dataloader_dict, train_config)

In [15]:
best_model, best_metric, best_epoch = train(params, model_dir, label_mapper)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=257706.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=361.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=445021143.0, style=ProgressStyle(descri…


Epoch 0/10
phase val, loss: 4.4780, acc: 0.0120, topk acc: 0.0578
Save model, epoch: 0
Epoch 1/10
phase train, loss: 3.7810, acc: 0.1236, topk acc: 0.3081
phase val, loss: 3.6645, acc: 0.1396, topk acc: 0.3438
Save model, epoch: 1
Epoch 2/10
phase train, loss: 3.5663, acc: 0.1566, topk acc: 0.3718
phase val, loss: 3.6367, acc: 0.1470, topk acc: 0.3526
Save model, epoch: 2
Epoch 3/10
phase train, loss: 3.3953, acc: 0.1824, topk acc: 0.4211
phase val, loss: 3.6583, acc: 0.1463, topk acc: 0.3564
Patience 1, epoch: 3
Epoch 4/10
phase train, loss: 3.1763, acc: 0.2183, topk acc: 0.4833
phase val, loss: 3.7127, acc: 0.1445, topk acc: 0.3512
Patience 2, epoch: 4
Epoch 5/10
phase train, loss: 2.9253, acc: 0.2636, topk acc: 0.5505
phase val, loss: 3.8851, acc: 0.1412, topk acc: 0.3446
Patience 3, epoch: 5
Epoch 6/10
phase train, loss: 2.6618, acc: 0.3167, topk acc: 0.6159
phase val, loss: 4.0814, acc: 0.1351, topk acc: 0.3356
Patience 4, epoch: 6


## Evaluate the best model

In [16]:
def evaluate(params, model_dir):
    # Load models
    tokenizer = build_tokenizer()
    label_mapper = LabelMapper().load(model_dir)
    net = transformers.BertForSequenceClassification.from_pretrained(model_dir)

    dataloader_dict = {
        "val": build_data_loader(open(params.data_dir + "/test.tsv"), label_mapper, tokenizer, batch_size=params.batch_size, shuffle=False),
    }
    # define config
    optimizer = torch.optim.Adam(net.parameters(),  lr=params.learning_rate)

    train_config = attrdict.AttrDict({
        "device": torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
        "writer": None,
        "patience": params.patience,
        "num_epochs": 0,
    })
                                   
    train_model(None, net, dataloader_dict, train_config)

In [17]:
evaluate(params, model_dir)

Epoch 0/0
phase val, loss: 3.6261, acc: 0.1466, topk acc: 0.3584
