Fine-tuning LLM (BERT-base) для классификации токенов (задача определение именнованных сущностей)!

In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate

!apt install git-lfs

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━

In [None]:
from datasets import load_dataset
from tqdm.notebook import tqdm


raw_datasets = load_dataset("RCC-MSU/collection3")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/245k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/219k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9301 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2153 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1922 [00:00<?, ? examples/s]

In [None]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 9301
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 2153
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 1922
    })
})

In [None]:
raw_datasets["train"][0]["tokens"]

['Дополнение',
 ':',
 'Д',
 '.',
 'Медведев',
 'присвоил',
 'звания',
 'сотрудников',
 'полиции',
 'и',
 'переназначил',
 '14',
 'руководителей',
 'УВД',
 ',',
 'ГУВД',
 'и',
 'МВД',
 'по',
 'субъектам',
 'РФ',
 '.']

In [None]:
raw_datasets["train"][0]["ner_tags"]

[0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, 0, 3, 0, 0, 5, 0]

In [None]:
ner_feature = raw_datasets["train"].features["ner_tags"]
ner_feature

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)

In [None]:
label_names = ner_feature.feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

In [None]:
words = raw_datasets["train"][0]["tokens"]
labels = raw_datasets["train"][0]["ner_tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)

print(line1)
print(line2)

Дополнение : Д     .     Медведев присвоил звания сотрудников полиции и переназначил 14 руководителей УВД   , ГУВД  и МВД   по субъектам РФ    . 
O          O B-PER I-PER I-PER    O        O      O           O       O O            O  O             B-ORG O B-ORG O B-ORG O  O         B-LOC O 


In [None]:
from transformers import AutoTokenizer

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
tokenizer.is_fast

True

In [None]:
def batch_iterator(batch_size=1000):
    for i in tqdm(range(0, len(raw_datasets["train"]), batch_size)):
        strings = [" ".join(tokens) for tokens in raw_datasets["train"][i : i + batch_size]["tokens"]]
        yield strings

tokenizer.train_new_from_iterator(text_iterator=batch_iterator(), vocab_size=48_000)
tokenizer.save_pretrained("msu-bert-ner-tokenizer")

  0%|          | 0/10 [00:00<?, ?it/s]

('msu-bert-ner-tokenizer/tokenizer_config.json',
 'msu-bert-ner-tokenizer/special_tokens_map.json',
 'msu-bert-ner-tokenizer/vocab.txt',
 'msu-bert-ner-tokenizer/added_tokens.json',
 'msu-bert-ner-tokenizer/tokenizer.json')

In [None]:
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

['[CLS]',
 'Д',
 '##о',
 '##п',
 '##о',
 '##л',
 '##н',
 '##е',
 '##н',
 '##и',
 '##е',
 ':',
 'Д',
 '.',
 'М',
 '##е',
 '##д',
 '##в',
 '##е',
 '##д',
 '##е',
 '##в',
 'п',
 '##р',
 '##и',
 '##с',
 '##в',
 '##о',
 '##и',
 '##л',
 'з',
 '##в',
 '##а',
 '##н',
 '##и',
 '##я',
 'с',
 '##о',
 '##т',
 '##р',
 '##у',
 '##д',
 '##н',
 '##и',
 '##к',
 '##ов',
 'п',
 '##о',
 '##л',
 '##и',
 '##ц',
 '##и',
 '##и',
 'и',
 'п',
 '##е',
 '##р',
 '##е',
 '##на',
 '##з',
 '##на',
 '##ч',
 '##и',
 '##л',
 '14',
 'р',
 '##у',
 '##к',
 '##ов',
 '##о',
 '##д',
 '##и',
 '##т',
 '##е',
 '##л',
 '##е',
 '##й',
 'У',
 '##В',
 '##Д',
 ',',
 'Г',
 '##У',
 '##В',
 '##Д',
 'и',
 'М',
 '##В',
 '##Д',
 'п',
 '##о',
 'с',
 '##у',
 '##б',
 '##ъ',
 '##е',
 '##к',
 '##т',
 '##а',
 '##м',
 'Р',
 '##Ф',
 '.',
 '[SEP]']

Токенизатор BERT использует WordPice алгоритм для разбиения токенов на суб-токены. (например МВД -> М, ##В, ##Д).

In [None]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:

            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:

            new_labels.append(-100)
        else:
            label = labels[word_id]

            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [None]:
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, 0, 3, 0, 0, 5, 0]
[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 4, 0, 3, 4, 4, 4, 0, 3, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 0, -100]


In [None]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [None]:
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

Map:   0%|          | 0/9301 [00:00<?, ? examples/s]

Map:   0%|          | 0/2153 [00:00<?, ? examples/s]

Map:   0%|          | 0/1922 [00:00<?, ? examples/s]

In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 9301
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2153
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1922
    })
})

In [None]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

tensor([[-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            1,    2,    2,    2,    2,    2,    2,    2,    2,    2,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    3,    4,    4,    0,    3,    4,    4,
            4,    0,    3,    4,    4,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    5,    6,    0, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -

In [None]:
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 4, 0, 3, 4, 4, 4, 0, 3, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 0, -100]
[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 6, 6, 6, 6, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]


In [None]:
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [None]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    "Babelscape/wikineural-multilingual-ner",
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/709M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at Babelscape/wikineural-multilingual-ner and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([7]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([7, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model.config.num_labels

7

In [None]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=1c8f68847327dd9bf40db33667f5e30d417d40846f71d9f7819da6e4ac5908f6
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
import evaluate

metric = evaluate.load("seqeval")


import numpy as np


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [None]:
from transformers import TrainingArguments

args = TrainingArguments(
    "msu-bert-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
)

In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1412,0.088514,0.766995,0.834753,0.799441,0.973743
2,0.084,0.063732,0.821131,0.887077,0.852831,0.981313
3,0.0488,0.055956,0.832354,0.896569,0.863269,0.983461


TrainOutput(global_step=3489, training_loss=0.11824483371252452, metrics={'train_runtime': 1302.1722, 'train_samples_per_second': 21.428, 'train_steps_per_second': 2.679, 'total_flos': 2967281794940298.0, 'train_loss': 0.11824483371252452, 'epoch': 3.0})

In [None]:
inps = tokenizer("Владимир Путин созвал заседание в Москве", return_tensors="pt", padding=True)
inps

{'input_ids': tensor([[  101,   450, 28400, 10286, 28396, 17424, 28401, 17424, 20442,   462,
         28405, 28404, 17424, 17127,   492, 16948, 28398, 28394, 10286, 28400,
           482, 10286, 28403, 19692, 28396, 10286, 17127, 17424, 19692,   477,
           459, 16948, 28403, 28399, 28394, 19692,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
model.eval()
import torch

with torch.no_grad():
  out = model(**inps.to("cuda"))

In [None]:
out.logits.argmax(-1)[:, 1:-1]

tensor([[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 5, 6, 6, 6, 6, 6]], device='cuda:0')

In [None]:
results = []
example = "Владимир Путин созвал заседание в Москве"
inputs_with_offsets = inps = tokenizer("Владимир Путин созвал заседание в Москве", return_tensors="pt", padding=True, return_offsets_mapping=True)
tokens = inputs_with_offsets.tokens()
offsets = inputs_with_offsets["offset_mapping"].squeeze(0).tolist()
predictions = out.logits.argmax(-1)[0].tolist()
probabilities = torch.nn.functional.softmax(out.logits, dim=-1)[0].tolist()

groupped_labels = []
group_pos = 0
current_label = None

for idx, sub in zip(inps.word_ids()[1:-1], out.logits.argmax(-1)[0].tolist()[1:-1]):

  if id2label[sub] != "O":
    if idx == group_pos:
      current_label = id2label[sub][2:5]
      group_pos = idx
    else:
      groupped_labels.append(current_label)
      current_label = id2label[sub][2:5]
      group_pos = idx
  else:
    current_label = None
  print("current label:", current_label, "group_pos:", group_pos)
groupped_labels

current label: PER group_pos: 0
current label: PER group_pos: 0
current label: PER group_pos: 0
current label: PER group_pos: 0
current label: PER group_pos: 0
current label: PER group_pos: 0
current label: PER group_pos: 0
current label: PER group_pos: 0
current label: PER group_pos: 1
current label: PER group_pos: 1
current label: PER group_pos: 1
current label: PER group_pos: 1
current label: PER group_pos: 1
current label: None group_pos: 1
current label: None group_pos: 1
current label: None group_pos: 1
current label: None group_pos: 1
current label: None group_pos: 1
current label: None group_pos: 1
current label: None group_pos: 1
current label: None group_pos: 1
current label: None group_pos: 1
current label: None group_pos: 1
current label: None group_pos: 1
current label: None group_pos: 1
current label: None group_pos: 1
current label: None group_pos: 1
current label: None group_pos: 1
current label: None group_pos: 1
current label: LOC group_pos: 5
current label: LOC group

['PER', None]

In [1]:
# классы меткок в датасете
id2label

NameError: name 'id2label' is not defined

In [None]:
# обработка реальных значений!

import numpy as np

results = []
example = "Владимир Путин созвал заседание в Москве"
inputs_with_offsets = inps = tokenizer("Владимир Путин созвал заседание в Москве", return_tensors="pt", padding=True, return_offsets_mapping=True)
tokens = inputs_with_offsets.tokens()
offsets = inputs_with_offsets["offset_mapping"].squeeze(0).tolist()
predictions = out.logits.argmax(-1)[0].tolist()
probabilities = torch.nn.functional.softmax(out.logits, dim=-1)[0].tolist()

idx = 0
while idx < len(predictions):
    pred = predictions[idx]
    label = id2label[pred]
    if label != "O":
        
        label = label[2:]
        start, _ = offsets[idx]

        
        all_scores = []
        if idx < len(predictions) - 1:
          idx += 1
        while (
            idx < len(predictions)
            and id2label[predictions[idx]] == f"I-{label}"
        ):
            all_scores.append(probabilities[idx][pred])
            _, end = offsets[idx]
            idx += 1

       
        score = np.mean(all_scores).item()
        word = example[start:end]
        results.append(
            {
                "entity_group": label,
                "score": score,
                "word": word,
                "start": start,
                "end": end,
            }
        )
    idx += 1

results

[{'entity_group': 'PER',
  'score': 0.0003666538977995515,
  'word': 'Владимир Путин',
  'start': 0,
  'end': 14},
 {'entity_group': 'LOC',
  'score': 0.0005800946557428688,
  'word': 'Москве',
  'start': 34,
  'end': 40}]