# BERTを用いたスロットフィリング

## 準備

### パッケージのインストール

In [1]:
!pip install -q transformers==4.10.2 seqeval==1.2.2 datasets==1.12.1 numpy==1.19.5 scikit-learn==0.24.2

[K     |████████████████████████████████| 2.8 MB 5.3 MB/s 
[K     |████████████████████████████████| 43 kB 1.8 MB/s 
[K     |████████████████████████████████| 270 kB 39.1 MB/s 
[K     |████████████████████████████████| 22.3 MB 2.0 MB/s 
[K     |████████████████████████████████| 895 kB 37.8 MB/s 
[K     |████████████████████████████████| 3.3 MB 25.1 MB/s 
[K     |████████████████████████████████| 59 kB 6.5 MB/s 
[K     |████████████████████████████████| 596 kB 28.7 MB/s 
[K     |████████████████████████████████| 1.1 MB 35.1 MB/s 
[K     |████████████████████████████████| 56 kB 4.8 MB/s 
[K     |████████████████████████████████| 243 kB 42.4 MB/s 
[K     |████████████████████████████████| 132 kB 48.6 MB/s 
[K     |████████████████████████████████| 271 kB 46.2 MB/s 
[K     |████████████████████████████████| 192 kB 50.9 MB/s 
[K     |████████████████████████████████| 160 kB 49.9 MB/s 
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


### インポート

In [2]:
import json
from itertools import chain

import numpy as np
import torch
from datasets import load_dataset, load_metric
from seqeval.metrics import classification_report
from sklearn.model_selection import train_test_split
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification

### データのアップロード

データセットをアップロードします。ノートブックと同じ階層にDataフォルダがあり、その下にsnipsフォルダがあるので、その中の学習用データセット（train_PlayMusic_full.json）と検証用データセット（validate_PlayMusic.json）をアップロードしましょう。Colabでない場合はアップロードは不要で、データセットを読み込むときに正しいパスを指定します。

In [3]:
# Upload the dataset files through the Colab file picker. Outside Colab the
# `google.colab` package is not importable, so the upload step is skipped and
# the files are expected to already exist at the paths used when loading.
try:
    from google.colab import files
except ImportError:
    uploaded = {}
else:
    uploaded = files.upload()

Saving train_PlayMusic_full.json to train_PlayMusic_full.json
Saving validate_PlayMusic.json to validate_PlayMusic.json


### データの読み込み

In [4]:
def load_data(filename, intent="PlayMusic", encoding="iso-8859-2"):
    """Read a slot-filling JSON file and return tokens with BIO tags.

    The file is expected to hold a JSON object whose ``intent`` key maps to a
    list of utterances. Each utterance has a ``"data"`` list of chunks; every
    chunk has a ``"text"`` string and, when it is a slot value, an ``"entity"``
    name.

    Args:
        filename: Path of the JSON file to read.
        intent: Top-level key that holds the utterances.
        encoding: Text encoding used to open the file.

    Returns:
        A tuple ``(x, y)``. ``x`` is a list of sentences, each a list of
        whitespace-separated words. ``y`` is the parallel list of tag
        sequences: ``B-<entity>`` for the first word of an entity chunk,
        ``I-<entity>`` for its remaining words and ``O`` elsewhere.

    Raises:
        KeyError: If ``intent`` or one of the expected chunk keys is missing.
    """
    with open(filename, encoding=encoding) as f:
        datalist = json.load(f)

    x, y = [], []
    for data in datalist[intent]:
        sent, tags = [], []
        for phrase in data["data"]:
            words = phrase["text"].split()
            if not words:
                # A whitespace-only chunk contributes no word. Skipping it
                # avoids emitting a lone "B-" tag with no matching word,
                # which would shift every following tag by one position.
                continue
            if "entity" in phrase:
                label = phrase["entity"]
                labels = [f"B-{label}"] + [f"I-{label}"] * (len(words) - 1)
            else:
                labels = ["O"] * len(words)
            sent.extend(words)
            tags.extend(labels)
        x.append(sent)
        y.append(tags)
    return x, y

In [5]:
# File names of the two dataset files uploaded above.
train_file = "train_PlayMusic_full.json"
test_file = "validate_PlayMusic.json"

# The "validate" file is used as the held-out test set.
x_test, y_test = load_data(test_file)
x_train, y_train = load_data(train_file)

# Hold out 30% of the training sentences as the validation split; the fixed
# random_state keeps the split identical between runs.
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train,
    y_train,
    random_state=42,
    test_size=0.3,
)

## 前処理

In [6]:
from transformers import AutoTokenizer

# Name of the pretrained checkpoint; the same name is reused later when the
# token-classification model is loaded.
model_checkpoint = "bert-base-uncased"
# Tokenizer matching the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [7]:
# Collect every tag that occurs in the training labels. Sorting fixes the
# tag <-> id assignment: iterating a bare set of strings can give a different
# order after each kernel restart, which would change the ids between runs.
unique_tags = sorted(set(chain.from_iterable(y_train)))
# Forward and reverse lookup tables between tag strings and integer ids.
tag2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2tag = {idx: tag for tag, idx in tag2id.items()}

In [8]:
def _tokenize_split(sentences):
    """Tokenize one split of already word-split sentences.

    All splits share the same settings. The offset mapping is requested
    because `encode_tags` reads it to place the word-level labels.
    """
    return tokenizer(
        sentences,
        is_split_into_words=True,
        return_offsets_mapping=True,
        padding=True,
        truncation=True,
    )


train_encodings = _tokenize_split(x_train)
valid_encodings = _tokenize_split(x_valid)
test_encodings = _tokenize_split(x_test)

In [9]:
def encode_tags(tags, encodings, label_map=None):
    """Spread word-level tags over token positions, filling the rest with -100.

    Args:
        tags: List of tag sequences, one sequence of tag strings per sentence.
        encodings: Object exposing ``offset_mapping``: for each sentence, a
            sequence of ``(start, end)`` pairs, one per token position.
        label_map: Optional mapping from tag string to integer id. Defaults
            to the module-level ``tag2id``.

    Returns:
        A list with one list of ints per sentence, as long as that sentence's
        offset mapping. Positions whose offset starts at 0 and has a non-zero
        end receive the tag ids in order; all other positions hold -100.

    Raises:
        KeyError: If a tag is missing from the label map.
        ValueError: If a sentence has more labelled positions than tags.
    """
    if label_map is None:
        label_map = tag2id

    encoded_labels = []
    for doc_tags, doc_offset in zip(tags, encodings.offset_mapping):
        doc_labels = [label_map[tag] for tag in doc_tags]

        # Start with every position ignored (-100).
        doc_enc_labels = np.full(len(doc_offset), -100, dtype=int)
        arr_offset = np.asarray(doc_offset).reshape(-1, 2)

        # Positions whose offset starts at 0 and ends past 0 carry a label.
        # NOTE(review): presumably these are the first sub-token of each word,
        # with special/padding tokens at (0, 0) -- confirm for the tokenizer.
        first_subword = (arr_offset[:, 0] == 0) & (arr_offset[:, 1] != 0)
        n_slots = int(first_subword.sum())
        if n_slots > len(doc_labels):
            raise ValueError(
                f"found {n_slots} labelled positions but only "
                f"{len(doc_labels)} tags"
            )

        # If the tokenizer truncated the sentence, fewer slots than tags
        # remain; keep the leading tags instead of failing on a shape mismatch.
        doc_enc_labels[first_subword] = np.asarray(doc_labels[:n_slots], dtype=int)
        encoded_labels.append(doc_enc_labels.tolist())

    return encoded_labels

# Turn the word-level tags of every split into per-token label ids.
train_labels, valid_labels, test_labels = [
    encode_tags(split_tags, split_encodings)
    for split_tags, split_encodings in (
        (y_train, train_encodings),
        (y_valid, valid_encodings),
        (y_test, test_encodings),
    )
]

In [10]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with per-token labels."""

    def __init__(self, encodings, labels):
        """Keep references to the encodings mapping and the label lists."""
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        """Number of examples, taken from the label list."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx``: each encoding field as a tensor plus its labels."""
        item = {}
        for key, val in self.encodings.items():
            item[key] = torch.tensor(val[idx])
        # The labels are handed over as stored, without tensor conversion.
        item["labels"] = self.labels[idx]
        return item

# The offset mapping is only needed while encoding the tags; drop it so it is
# not passed to the model. The `None` default lets this cell be re-run after
# the key has already been removed.
train_encodings.pop("offset_mapping", None)
valid_encodings.pop("offset_mapping", None)
test_encodings.pop("offset_mapping", None)

# Wrap each split's encodings and labels for the Trainer.
train_dataset = Dataset(train_encodings, train_labels)
valid_dataset = Dataset(valid_encodings, valid_labels)
test_dataset = Dataset(test_encodings, test_labels)

## モデルの学習

「BERTを用いた固有表現認識」のノートブックと同様に、モデルを学習していきましょう。

In [11]:
# Load the pretrained encoder with a token-classification head sized to the
# tag set. The tag <-> id tables are stored in the model config so that saved
# checkpoints carry the real tag names rather than generic placeholders.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(unique_tags),
    id2label=id2tag,
    label2id=tag2id,
)

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

In [12]:
# One batch size shared by training and evaluation.
batch_size = 16

# Training configuration: checkpoints go to the "ner" directory and the model
# is evaluated at the end of every epoch.
args = TrainingArguments(
    output_dir="ner",
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
)

In [13]:
# Collator that assembles the dataset items into batches for the Trainer; it is
# built from the same tokenizer used for the encodings.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [14]:
# Load the "seqeval" metric; compute_metrics calls it to score the predictions.
metric = load_metric("seqeval")

Downloading:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

In [15]:
def compute_metrics(p):
    """Score predictions against gold labels with the loaded metric.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds per-class
            scores indexed as (example, position, class); ``labels`` holds the
            gold label ids, with -100 marking positions to ignore.

    Returns:
        Dict with the overall ``precision``, ``recall``, ``f1`` and
        ``accuracy`` reported by the metric.
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Keep only the positions that carry a real label (not -100).
        kept = [(pred, gold) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        true_predictions.append([id2tag[pred] for pred, _ in kept])
        true_labels.append([id2tag[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary

In [16]:
# Bring the model, training configuration, data splits, collator and metric
# function together in one Trainer.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [17]:
# Fine-tune the model; with the arguments above, the validation split is
# evaluated after every epoch.
trainer.train()

***** Running training *****
  Num examples = 1400
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 880
  sequence_length = torch.tensor(batch["input_ids"]).shape[1]
  batch = {k: torch.tensor(v, dtype=torch.int64) for k, v in batch.items()}


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.403633,0.746853,0.815267,0.779562,0.88035
2,No log,0.264691,0.805886,0.877863,0.840336,0.92867
3,No log,0.214484,0.875277,0.905344,0.890056,0.934192
4,No log,0.234857,0.879056,0.909924,0.894224,0.935343
5,No log,0.181712,0.889956,0.919847,0.904655,0.946848
6,0.302600,0.190838,0.899628,0.923664,0.911488,0.948688
7,0.302600,0.214045,0.903875,0.925954,0.914781,0.949379
8,0.302600,0.225731,0.907532,0.929008,0.918144,0.950299
9,0.302600,0.215377,0.906786,0.928244,0.91739,0.950299
10,0.302600,0.225742,0.90611,0.928244,0.917044,0.949839


***** Running Evaluation *****
  Num examples = 600
  Batch size = 16
***** Running Evaluation *****
  Num examples = 600
  Batch size = 16
***** Running Evaluation *****
  Num examples = 600
  Batch size = 16
***** Running Evaluation *****
  Num examples = 600
  Batch size = 16
***** Running Evaluation *****
  Num examples = 600
  Batch size = 16
Saving model checkpoint to ner/checkpoint-500
Configuration saved in ner/checkpoint-500/config.json
Model weights saved in ner/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ner/checkpoint-500/tokenizer_config.json
Special tokens file saved in ner/checkpoint-500/special_tokens_map.json
  sequence_length = torch.tensor(batch["input_ids"]).shape[1]
  batch = {k: torch.tensor(v, dtype=torch.int64) for k, v in batch.items()}
***** Running Evaluation *****
  Num examples = 600
  Batch size = 16
***** Running Evaluation *****
  Num examples = 600
  Batch size = 16
***** Running Evaluation *****
  Num examples = 600
  Batch size = 1

TrainOutput(global_step=880, training_loss=0.18142375187440352, metrics={'train_runtime': 249.314, 'train_samples_per_second': 56.154, 'train_steps_per_second': 3.53, 'total_flos': 200084268384000.0, 'train_loss': 0.18142375187440352, 'epoch': 10.0})

In [18]:
# Score the fine-tuned model on the held-out test split.
trainer.evaluate(test_dataset)

***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
  sequence_length = torch.tensor(batch["input_ids"]).shape[1]
  batch = {k: torch.tensor(v, dtype=torch.int64) for k, v in batch.items()}


{'epoch': 10.0,
 'eval_accuracy': 0.9448094612352168,
 'eval_f1': 0.9007263922518159,
 'eval_loss': 0.30720072984695435,
 'eval_precision': 0.8985507246376812,
 'eval_recall': 0.9029126213592233,
 'eval_runtime': 0.4776,
 'eval_samples_per_second': 209.397,
 'eval_steps_per_second': 14.658}

## Reference

- [Fine-tuning with custom datasets](https://huggingface.co/transformers/custom_datasets.html)