In [None]:
!pip install transformers==4.37.2
!pip install peft==0.10.0
!pip install accelerate==0.28.0
!pip install corus
!pip install seqeval
# Download the Collection5 dataset
!wget http://www.labinform.ru/pub/named_entities/collection5.zip
# Extract the zip file
!unzip collection5.zip
from transformers import AutoTokenizer

from corus import load_ne5

# Load the Collection5 dataset
records = load_ne5('Collection5')

all_records = list(records)

tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")

In [2]:


def convert_to_bio_format(record):
    text = record.text
    spans = record.spans

    # Создаем словарь для маппинга позиций символов к типу сущности
    char_to_entity = {}
    for span in spans:
        for i in range(span.start, span.stop):
            # Маркируем первый символ как B-TYPE, остальные как I-TYPE
            if i == span.start:
                char_to_entity[i] = f"B-{span.type}"
            else:
                char_to_entity[i] = f"I-{span.type}"

    # Токенизация текста
    tokenized = tokenizer(text, return_offsets_mapping=True, truncation=True, return_tensors="pt")
    token_ids = tokenized["input_ids"][0]
    offsets = tokenized["offset_mapping"][0]

    # Сопоставляем токены с метками
    labels = []
    for token_idx, (start, end) in enumerate(offsets):
        # Пропускаем специальные токены [CLS], [SEP]
        if start == 0 and end == 0:
            labels.append("O")
            continue

        # Находим метку для токена
        token_label = "O"
        for char_idx in range(start, end):
            if char_idx in char_to_entity:
                # Берем метку первого символа токена, если он попадает в сущность
                token_label = char_to_entity[char_idx]
                break

        labels.append(token_label)

    # Получаем текстовые токены для удобства
    tokens = tokenizer.convert_ids_to_tokens(token_ids)

    return {
        "input_ids": token_ids.tolist(),
        "tokens": tokens,
        "labels": labels,
        "attention_mask": tokenized["attention_mask"][0].tolist()
    }

# Обработаем все записи
processed_records = [convert_to_bio_format(record) for record in all_records]

In [None]:
# 1. Получаем список всех уникальных меток
unique_labels = set()
for record in processed_records:
    unique_labels.update(record["labels"])

# Добавляем специальную метку для игнорируемых токенов при оценке
label_list = sorted(list(unique_labels))
print(f"Уникальные метки: {label_list}")

# 2. Разделяем на train/test
import random
random.seed(42)
random.shuffle(processed_records)

train_size = int(len(processed_records) * 0.8)
train_data = processed_records[:train_size]
test_data = processed_records[train_size:]

print(f"Размер обучающей выборки: {len(train_data)}")
print(f"Размер тестовой выборки: {len(test_data)}")

# 3. Подготовка модели
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification

# Создаем маппинги id <-> label
id2label = {i: label for i, label in enumerate(label_list)}
label2id = {label: i for i, label in enumerate(label_list)}

# Загружаем модель
model = AutoModelForTokenClassification.from_pretrained(
    "cointegrated/rubert-tiny2",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

# 4. Создаем датасет в формате, понятном для Hugging Face
import torch
from torch.utils.data import Dataset

# Исправляем класс NERDataset, чтобы обеспечить правильную обработку
class NERDataset(torch.utils.data.Dataset):
    def __init__(self, processed_data, label2id, max_length=256):
        self.data = processed_data
        self.label2id = label2id
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]

        # Убедимся, что данные имеют одинаковую длину
        input_ids = item["input_ids"]
        attention_mask = item["attention_mask"]
        labels = [self.label2id.get(label, -100) for label in item["labels"]]

        # Обрезаем или дополняем до max_length
        if len(input_ids) > self.max_length:
            input_ids = input_ids[:self.max_length]
            attention_mask = attention_mask[:self.max_length]
            labels = labels[:self.max_length]
        else:
            # Если длина меньше max_length, добавляем паддинг
            padding_length = self.max_length - len(input_ids)
            input_ids = input_ids + [tokenizer.pad_token_id] * padding_length
            attention_mask = attention_mask + [0] * padding_length
            labels = labels + [-100] * padding_length

        return {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
            "labels": torch.tensor(labels, dtype=torch.long)

        }

# Создаем датасеты с фиксированной длиной
train_dataset = NERDataset(train_data, label2id, max_length=256)
test_dataset = NERDataset(test_data, label2id, max_length=256)

# Создаем коллатор данных для батчирования
# Создаем коллатор данных для токенов
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    padding=True,
    max_length=256,
    return_tensors="pt"
)

In [4]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

def compute_metrics(eval_preds):
    predictions, labels = eval_preds
    predictions = np.argmax(predictions, axis=2)

    # Убираем игнорируемые токены из предсказаний и истинных меток
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        "classification_report": classification_report(true_labels, true_predictions)
    }

In [5]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

# 1. MLM предобучение
# Загружаем модель и токенизатор
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
mlm_model = AutoModelForMaskedLM.from_pretrained("cointegrated/rubert-tiny2")

# Создаем простой датасет для MLM
# Для MLM дообучения
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, texts, tokenizer, max_length=256):
        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt"
        )

    def __getitem__(self, idx):
        # Возвращаем словарь с тензорами для каждого элемента датасета
        return {
            key: val[idx] for key, val in self.encodings.items()
        }

    def __len__(self):
        return len(self.encodings.input_ids)

# Извлекаем тексты из обучающего набора
train_texts = [record.text for record in all_records[:train_size]]
mlm_dataset = TextDataset(train_texts, tokenizer)

# Подготавливаем коллатор данных для MLM
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)

# Настраиваем обучение MLM
mlm_training_args = TrainingArguments(
    output_dir="./mlm_pretrained",
    per_device_train_batch_size=8,
    num_train_epochs=20,
    learning_rate=2e-5,
    save_strategy="epoch",
    fp16=True
)

# Создаем тренер для MLM
mlm_trainer = Trainer(
    model=mlm_model,
    args=mlm_training_args,
    train_dataset=mlm_dataset,
    data_collator=data_collator
)

# Обучаем MLM модель
print("Начинаем MLM предобучение...")
mlm_trainer.train()
mlm_trainer.save_model("./mlm_pretrained")

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Начинаем MLM предобучение...


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdmitriimartynov1[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,3.0872
1000,2.9011
1500,2.8261
2000,2.7861


In [6]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
import numpy as np

# Проверка доступности CUDA
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Используемое устройство: {device}")

ner_model = AutoModelForTokenClassification.from_pretrained(
    "./mlm_pretrained",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=30,
    weight_decay=0.01,
    fp16=True,
    gradient_accumulation_steps=2,
    max_grad_norm=1.0,
    # Параметры для работы с early stopping
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    evaluation_strategy="epoch",
    save_strategy="epoch"
)

# Перемещаем модель на GPU, если доступна
ner_model = ner_model.to(device)
ner_data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    padding="max_length",
    max_length=256,
    return_tensors="pt"
)
# Создаем Trainer с добавлением callback'а для early stopping
trainer = Trainer(
    model=ner_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=ner_data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # Добавляем early stopping здесь
)

# Оценка метрик до дообучения
print("Метрики до дообучения:")
eval_results_before = trainer.evaluate()
print(eval_results_before)

# Дообучение модели
print("Начинаем дообучение модели...")
trainer.train()

# Оценка метрик после дообучения
print("Метрики после дообучения:")
eval_results_after = trainer.evaluate()
print(eval_results_after)

# Сохранение модели
trainer.save_model("./ner_model_finetuned")

Some weights of BertForTokenClassification were not initialized from the model checkpoint at ./mlm_pretrained and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Используемое устройство: cuda
Метрики до дообучения:


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Trainer is attempting to log a value of "              precision    recall  f1-score   support

    GEOPOLIT       0.03      0.22      0.05       606
         LOC       0.00      0.03      0.00       482
       MEDIA       0.00      0.08      0.01       271
         ORG       0.00      0.04      0.01      1129
         PER       0.00      0.01      0.00      1702

   micro avg       0.01      0.06      0.01      4190
   macro avg       0.01      0.08      0.01      4190
weighted avg       0.01      0.06      0.01      4190
" of type <class 'str'> for key "eval/classification_report" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 2.5408174991607666, 'eval_precision': 0.006138066641866397, 'eval_recall': 0.05513126491646778, 'eval_f1': 0.011046289211935728, 'eval_classification_report': '              precision    recall  f1-score   support\n\n    GEOPOLIT       0.03      0.22      0.05       606\n         LOC       0.00      0.03      0.00       482\n       MEDIA       0.00      0.08      0.01       271\n         ORG       0.00      0.04      0.01      1129\n         PER       0.00      0.01      0.00      1702\n\n   micro avg       0.01      0.06      0.01      4190\n   macro avg       0.01      0.08      0.01      4190\nweighted avg       0.01      0.06      0.01      4190\n', 'eval_runtime': 1.1178, 'eval_samples_per_second': 178.931, 'eval_steps_per_second': 22.366}
Начинаем дообучение модели...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Classification Report
1,No log,0.857735,0.003106,0.000239,0.000443,precision recall f1-score support  GEOPOLIT 0.00 0.00 0.00 606  LOC 0.00 0.00 0.00 482  MEDIA 0.00 0.00 0.00 271  ORG 0.00 0.00 0.00 1129  PER 0.00 0.00 0.00 1702  micro avg 0.00 0.00 0.00 4190  macro avg 0.00 0.00 0.00 4190 weighted avg 0.00 0.00 0.00 4190
2,No log,0.523799,0.46684,0.300716,0.365801,precision recall f1-score support  GEOPOLIT 1.00 0.00 0.01 606  LOC 0.00 0.00 0.00 482  MEDIA 0.00 0.00 0.00 271  ORG 0.02 0.00 0.00 1129  PER 0.48 0.74 0.58 1702  micro avg 0.47 0.30 0.37 4190  macro avg 0.30 0.15 0.12 4190 weighted avg 0.34 0.30 0.24 4190
3,No log,0.382305,0.541966,0.522434,0.532021,precision recall f1-score support  GEOPOLIT 0.89 0.55 0.68 606  LOC 0.22 0.01 0.02 482  MEDIA 0.00 0.00 0.00 271  ORG 0.27 0.28 0.28 1129  PER 0.63 0.90 0.74 1702  micro avg 0.54 0.52 0.53 4190  macro avg 0.40 0.35 0.34 4190 weighted avg 0.48 0.52 0.47 4190
4,No log,0.295318,0.606228,0.655131,0.629732,precision recall f1-score support  GEOPOLIT 0.83 0.80 0.81 606  LOC 0.32 0.13 0.19 482  MEDIA 0.00 0.00 0.00 271  ORG 0.38 0.55 0.45 1129  PER 0.74 0.93 0.82 1702  micro avg 0.61 0.66 0.63 4190  macro avg 0.45 0.48 0.46 4190 weighted avg 0.56 0.66 0.60 4190
5,No log,0.242678,0.652996,0.733413,0.690872,precision recall f1-score support  GEOPOLIT 0.78 0.86 0.82 606  LOC 0.60 0.42 0.50 482  MEDIA 0.28 0.04 0.07 271  ORG 0.45 0.65 0.53 1129  PER 0.80 0.94 0.86 1702  micro avg 0.65 0.73 0.69 4190  macro avg 0.58 0.58 0.56 4190 weighted avg 0.64 0.73 0.67 4190
6,No log,0.206958,0.686468,0.763962,0.723145,precision recall f1-score support  GEOPOLIT 0.83 0.87 0.85 606  LOC 0.64 0.50 0.56 482  MEDIA 0.56 0.16 0.25 271  ORG 0.48 0.69 0.57 1129  PER 0.82 0.95 0.88 1702  micro avg 0.69 0.76 0.72 4190  macro avg 0.67 0.63 0.62 4190 weighted avg 0.69 0.76 0.71 4190
7,No log,0.182955,0.712821,0.796181,0.752198,precision recall f1-score support  GEOPOLIT 0.85 0.88 0.87 606  LOC 0.66 0.63 0.65 482  MEDIA 0.65 0.27 0.39 271  ORG 0.51 0.71 0.59 1129  PER 0.85 0.95 0.90 1702  micro avg 0.71 0.80 0.75 4190  macro avg 0.71 0.69 0.68 4190 weighted avg 0.72 0.80 0.75 4190
8,No log,0.164679,0.732476,0.815513,0.771767,precision recall f1-score support  GEOPOLIT 0.87 0.88 0.88 606  LOC 0.65 0.69 0.67 482  MEDIA 0.74 0.39 0.51 271  ORG 0.54 0.73 0.62 1129  PER 0.86 0.95 0.91 1702  micro avg 0.73 0.82 0.77 4190  macro avg 0.73 0.73 0.72 4190 weighted avg 0.74 0.82 0.77 4190
9,No log,0.151196,0.747003,0.832936,0.787633,precision recall f1-score support  GEOPOLIT 0.89 0.88 0.89 606  LOC 0.68 0.73 0.70 482  MEDIA 0.80 0.52 0.63 271  ORG 0.54 0.74 0.63 1129  PER 0.88 0.96 0.92 1702  micro avg 0.75 0.83 0.79 4190  macro avg 0.76 0.76 0.75 4190 weighted avg 0.76 0.83 0.79 4190
10,0.431200,0.141487,0.762613,0.844153,0.801314,precision recall f1-score support  GEOPOLIT 0.89 0.89 0.89 606  LOC 0.70 0.79 0.74 482  MEDIA 0.79 0.58 0.67 271  ORG 0.57 0.74 0.64 1129  PER 0.89 0.96 0.92 1702  micro avg 0.76 0.84 0.80 4190  macro avg 0.77 0.79 0.77 4190 weighted avg 0.78 0.84 0.81 4190


  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "              precision    recall  f1-score   support

    GEOPOLIT       0.00      0.00      0.00       606
         LOC       0.00      0.00      0.00       482
       MEDIA       0.00      0.00      0.00       271
         ORG       0.00      0.00      0.00      1129
         PER       0.00      0.00      0.00      1702

   micro avg       0.00      0.00      0.00      4190
   macro avg       0.00      0.00      0.00      4190
weighted avg       0.00      0.00      0.00      4190
" of type <class 'str'> for key "eval/classification_report" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "              precision    recall  f1-score   support

    GEOPOLIT       1.00      0.00      0.01       606
         LOC       0.00      0.00      0.

Метрики после дообучения:


Trainer is attempting to log a value of "              precision    recall  f1-score   support

    GEOPOLIT       0.90      0.89      0.89       606
         LOC       0.76      0.86      0.81       482
       MEDIA       0.79      0.78      0.79       271
         ORG       0.68      0.80      0.74      1129
         PER       0.93      0.97      0.95      1702

   micro avg       0.82      0.89      0.85      4190
   macro avg       0.81      0.86      0.83      4190
weighted avg       0.83      0.89      0.86      4190
" of type <class 'str'> for key "eval/classification_report" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.09433083236217499, 'eval_precision': 0.8236206514513628, 'eval_recall': 0.8871121718377089, 'eval_f1': 0.8541882109617372, 'eval_classification_report': '              precision    recall  f1-score   support\n\n    GEOPOLIT       0.90      0.89      0.89       606\n         LOC       0.76      0.86      0.81       482\n       MEDIA       0.79      0.78      0.79       271\n         ORG       0.68      0.80      0.74      1129\n         PER       0.93      0.97      0.95      1702\n\n   micro avg       0.82      0.89      0.85      4190\n   macro avg       0.81      0.86      0.83      4190\nweighted avg       0.83      0.89      0.86      4190\n', 'eval_runtime': 0.8844, 'eval_samples_per_second': 226.138, 'eval_steps_per_second': 28.267, 'epoch': 30.0}


На основе проведенного эксперимента могу отметить, что подход с предварительным дообучением модели в режиме MLM (Masked Language Modeling) перед финальным дообучением на NER-задаче действительно оказался эффективным.

Предварительное MLM-дообучение помогает модели лучше адаптироваться к особенностям языка и домена в конкретном корпусе текстов. Это своего рода "акклиматизация" модели к специфике данных перед решением конкретной задачи распознавания именованных сущностей.

Такой двухэтапный подход обеспечивает:
1. Более быструю сходимость при последующем NER-дообучении
2. Потенциально лучшие финальные метрики при том же количестве эпох дообучения для NER
3. Более стабильные результаты на различных доменных данных

В данном случае, дообучение модели rubert-tiny2 сначала на MLM-задаче, а затем на NER-задаче позволило получить улучшение метрик по сравнению с прямым дообучением на NER-задаче.