In [1]:
# !rm -rf Collection5

In [3]:
%%capture --no-display

!wget https://github.com/yutkin/Lenta.Ru-News-Dataset/releases/download/v1.0/lenta-ru-news.csv.gz
!wget http://www.labinform.ru/pub/named_entities/collection5.zip

!unzip collection5.zip
!rm collection5.zip

In [4]:
%%capture --no-display

!pip install corus torch transformers seqeval numpy datasets hf_xet tqdm scikit-learn accelerate

In [10]:
def preprocess_data_word_level(examples, set_type):
    processed_data = []

    for example in tqdm(examples, desc=f'Processing {set_type}'):
        text = example.text
        spans = sorted(example.spans, key=lambda x: x.start)

        spans_dict = {}
        for span in spans:
            for i in range(span.start, span.stop):
                if i == span.start:
                    spans_dict[i] = (f'B-{span.type}', span.type)
                else:
                    spans_dict[i] = (f'I-{span.type}', span.type)

        tokens = []
        token_spans = []
        in_token = False
        start_pos = 0

        for i, char in enumerate(text):
            if char.isalnum() or char in "-'":
                if not in_token:
                    start_pos = i
                    in_token = True
            else:
                if in_token:
                    tokens.append(text[start_pos:i])
                    token_spans.append((start_pos, i))
                    in_token = False

                if not char.isspace():
                    tokens.append(char)
                    token_spans.append((i, i+1))

        if in_token:
            tokens.append(text[start_pos:])
            token_spans.append((start_pos, len(text)))

        labels = []
        for start, end in token_spans:
            found_entity = False

            if start in spans_dict:
                labels.append(spans_dict[start][0])
                found_entity = True
            else:
                for i in range(start+1, end):
                    if i in spans_dict:
                        tag_type = spans_dict[i][1]
                        labels.append(f'I-{tag_type}')
                        found_entity = True
                        break

            if not found_entity:
                labels.append('O')

        processed_data.append({'tokens': tokens, 'ner_tags': labels})

    return processed_data

def preprocess_data_word_level_dict(examples, set_type):
    processed_data = []

    for example in tqdm(examples, desc=f'Processing {set_type}'):
        text = example.text
        spans = sorted(example.spans, key=lambda x: x['start'])

        spans_dict = {}
        for span in spans:
            for i in range(span['start'], span['end']):
                if i == span['start']:
                    spans_dict[i] = (f'B-{span["type"]}', span["type"])
                else:
                    spans_dict[i] = (f'I-{span["type"]}', span["type"])

        tokens = []
        token_spans = []
        in_token = False
        start_pos = 0

        for i, char in enumerate(text):
            if char.isalnum() or char in "-'":
                if not in_token:
                    start_pos = i
                    in_token = True
            else:
                if in_token:
                    tokens.append(text[start_pos:i])
                    token_spans.append((start_pos, i))
                    in_token = False

                if not char.isspace():
                    tokens.append(char)
                    token_spans.append((i, i+1))

        if in_token:
            tokens.append(text[start_pos:])
            token_spans.append((start_pos, len(text)))

        labels = []
        for start, end in token_spans:
            found_entity = False

            if start in spans_dict:
                labels.append(spans_dict[start][0])
                found_entity = True
            else:
                for i in range(start+1, end):
                    if i in spans_dict:
                        tag_type = spans_dict[i][1]
                        labels.append(f'I-{tag_type}')
                        found_entity = True
                        break

            if not found_entity:
                labels.append('O')

        processed_data.append({'tokens': tokens, 'ner_tags': labels})

    return processed_data
    

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples['tokens'],
        truncation=True,
        is_split_into_words=True,
        padding='max_length',
        max_length=512,
        return_offsets_mapping=True
    )

    labels = []
    offset_mapping = tokenized_inputs.pop('offset_mapping')

    for i, (label, _) in enumerate(zip(examples['ner_tags'], offset_mapping)):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []

        previous_word_idx = None

        for idx, word_idx in enumerate(word_ids):
            if word_idx is None:
                label_ids.append(-100)
                continue

            if idx > 0 and word_idx == previous_word_idx:
                current_label = label[word_idx]
                if current_label.startswith('B-'):
                    current_label = 'I-' + current_label[2:]
                label_ids.append(tag2id.get(current_label, tag2id['O']))
            else:
                label_ids.append(tag2id.get(label[word_idx], tag2id['O']))

            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs['labels'] = labels
    return tokenized_inputs

def eval_model(model, dataset):
    trainer = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir='./results',
            per_device_eval_batch_size=16,
            disable_tqdm=False,
            logging_dir='./logs',
            report_to='none',
        ),
    )

    predictions, labels, _ = trainer.predict(dataset)
    predictions = np.argmax(predictions, axis=2)

    true_predictions = []
    true_labels = []

    for prediction, label in tqdm(zip(predictions, labels), desc='Processing preds', total=len(predictions)):
        sentence_preds = []
        sentence_labels = []

        for p, l in zip(prediction, label):
            if l != -100:
                sentence_preds.append(id2tag[p])
                sentence_labels.append(id2tag[l])

        true_predictions.append(sentence_preds)
        true_labels.append(sentence_labels)

    precision = precision_score(true_labels, true_predictions)
    recall = recall_score(true_labels, true_predictions)
    f1 = f1_score(true_labels, true_predictions)

    detailed_report = classification_report(true_labels, true_predictions)

    return {
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'report': detailed_report
    }

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    true_predictions = []
    true_labels = []

    for prediction, label in zip(preds, labels):
        sentence_preds = []
        sentence_labels = []

        for p, l in zip(prediction, label):
            if l != -100:
                sentence_preds.append(id2tag[p])
                sentence_labels.append(id2tag[l])

        true_predictions.append(sentence_preds)
        true_labels.append(sentence_labels)

    precision = precision_score(true_labels, true_predictions)
    recall = recall_score(true_labels, true_predictions)
    f1 = f1_score(true_labels, true_predictions)

    return {
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

def train_ner(model, tokenizer, train, test):
    training_args = TrainingArguments(
        output_dir='./results',
        eval_strategy='epoch',
        save_strategy='epoch',
        learning_rate=3e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=15,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model='f1',
        greater_is_better=True,
        report_to='none',
        disable_tqdm=False,
        warmup_ratio=0.1,
        fp16=torch.cuda.is_available(),
        seed=42,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train,
        eval_dataset=test,
        compute_metrics=compute_metrics,
        processing_class=tokenizer,
    )

    trainer.train()

    return model

- **preprocess_data_word_level**: Разбивает текст на слова (токены) и отмечает, какие из них являются частями именованных сущностей с помощью BIO-тегов;

- **tokenize_and_align_labels**: Токенизирует текст и корректирует метки NER так, чтобы они соответствовали токенам после разбиения текста;

- **eval_model**: Оценивает модель на тестовых данных;

- **compute_metrics**: Считает метрики (точность, полноту, F1) - нужно для обучения.

# №1. Обучите NER-модель
- Загрузите набор данных
- Разбейте набор данных train/test части
- Дообучите модель rubert-tiny2 на train-части корпуса для решения NER-задачи, сделайте замеры качества NER-метрик до и после дообучения

In [6]:
import torch
from transformers import (
    AutoTokenizer, AutoModelForTokenClassification, AutoModelForMaskedLM,
    DataCollatorForLanguageModeling, TrainingArguments, Trainer
)
from datasets import Dataset
from sklearn.model_selection import train_test_split
import numpy as np
from corus import load_ne5
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
from tqdm.auto import tqdm
import random

random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

tqdm.pandas()

DATA = './Collection5'
data = list(load_ne5(DATA))
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

print(f'Train: {len(train_data)}, test: {len(test_data)}')

unique_tags = set()
for item in data:
    for span in item.spans:
        unique_tags.add(span.type)

tag_names = list(unique_tags)
tag_names.sort()
tag_names.insert(0, 'O')

iob_tags = ['O']
for tag in tag_names:
    if tag != 'O':
        iob_tags.append(f'B-{tag}')
        iob_tags.append(f'I-{tag}')

tag2id = {tag: id for id, tag in enumerate(iob_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}

print(f'Found {len(tag_names)} unique tags: {", ".join(tag_names)}')
print(f'Created {len(iob_tags)} tags in IOB format: {", ".join(iob_tags)}')

train_processed = preprocess_data_word_level(train_data, 'train')
test_processed = preprocess_data_word_level(test_data, 'test')

all_tags = [tag for item in train_processed for tag in item['ner_tags']]
print('Tags distribution:')
for tag in sorted(set(all_tags)):
    print(f'{tag}: {all_tags.count(tag)} ({all_tags.count(tag)/len(all_tags)*100:.2f}%)')

train_dataset = Dataset.from_dict({
    'tokens': [item['tokens'] for item in train_processed],
    'ner_tags': [item['ner_tags'] for item in train_processed]
})

test_dataset = Dataset.from_dict({
    'tokens': [item['tokens'] for item in test_processed],
    'ner_tags': [item['ner_tags'] for item in test_processed]
})

tokenizer = AutoTokenizer.from_pretrained('cointegrated/rubert-tiny2')

train_tokenized = train_dataset.map(tokenize_and_align_labels, batched=True, desc='Tokenizing train')
test_tokenized = test_dataset.map(tokenize_and_align_labels, batched=True, desc='Tokenizing test')

model = AutoModelForTokenClassification.from_pretrained('cointegrated/rubert-tiny2', num_labels=len(tag2id))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device: {device}')
model.to(device)

print('\n===== Model before training =====')
pre_training_metrics = eval_model(model, test_tokenized)
print(f'F1-score: {pre_training_metrics["f1"]:.4f}')
print(f'Precision: {pre_training_metrics["precision"]:.4f}')
print(f'Recall: {pre_training_metrics["recall"]:.4f}')
print('\nClassification report:')
print(pre_training_metrics['report'])

model = train_ner(model, tokenizer, train_tokenized, test_tokenized)

print('\n===== Model after training =====')
vanilla_ner_metrics = eval_model(model, test_tokenized)
print(f'F1-score: {vanilla_ner_metrics["f1"]:.4f}')
print(f'Precision: {vanilla_ner_metrics["precision"]:.4f}')
print(f'Recall: {vanilla_ner_metrics["recall"]:.4f}')
print('\nClassification report:')
print(vanilla_ner_metrics['report'])

model.save_pretrained('./ner_model')
tokenizer.save_pretrained('./ner_model')
print('Model saved in directory ./ner_model')

print('\n===== Before and after training =====')
print(f'F1-score: {pre_training_metrics["f1"]:.4f} -> {vanilla_ner_metrics["f1"]:.4f} (change: {vanilla_ner_metrics["f1"] - pre_training_metrics["f1"]:.4f})')
print(f'Precision: {pre_training_metrics["precision"]:.4f} -> {vanilla_ner_metrics["precision"]:.4f} (change: {vanilla_ner_metrics["precision"] - pre_training_metrics["precision"]:.4f})')
print(f'Recall: {pre_training_metrics["recall"]:.4f} -> {vanilla_ner_metrics["recall"]:.4f} (change: {vanilla_ner_metrics["recall"] - pre_training_metrics["recall"]:.4f})')

  from .autonotebook import tqdm as notebook_tqdm


Train: 800, test: 200
Found 6 unique tags: O, GEOPOLIT, LOC, MEDIA, ORG, PER
Created 11 tags in IOB format: O, B-GEOPOLIT, I-GEOPOLIT, B-LOC, I-LOC, B-MEDIA, I-MEDIA, B-ORG, I-ORG, B-PER, I-PER


Processing train: 100%|██████████| 800/800 [00:00<00:00, 4255.07it/s]
Processing test: 100%|██████████| 200/200 [00:00<00:00, 4567.37it/s]


Tags distribution:
B-GEOPOLIT: 3226 (1.51%)
B-LOC: 2573 (1.20%)
B-MEDIA: 1202 (0.56%)
B-ORG: 5719 (2.67%)
B-PER: 8503 (3.97%)
I-GEOPOLIT: 180 (0.08%)
I-LOC: 1145 (0.53%)
I-MEDIA: 791 (0.37%)
I-ORG: 5330 (2.49%)
I-PER: 8403 (3.92%)
O: 177027 (82.68%)


Tokenizing train: 100%|██████████| 800/800 [00:00<00:00, 1284.44 examples/s]
Tokenizing test: 100%|██████████| 200/200 [00:00<00:00, 1343.15 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Device: cpu

===== Model before training =====


Processing preds: 100%|██████████| 200/200 [00:00<00:00, 14418.62it/s]


F1-score: 0.0055
Precision: 0.0030
Recall: 0.0304

Classification report:
           precision    recall  f1-score   support

    MEDIA       0.00      0.16      0.00       295
      LOC       0.00      0.03      0.01       548
      PER       0.00      0.00      0.00      2048
      ORG       0.01      0.06      0.02      1237
 GEOPOLIT       0.00      0.01      0.01       841

micro avg       0.00      0.03      0.01      4969
macro avg       0.00      0.03      0.01      4969



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,1.4748,1.209734,0.0,0.0,0.0
2,0.5692,0.515742,0.450498,0.291205,0.353746
3,0.4078,0.331394,0.558126,0.577782,0.567784
4,0.2963,0.243954,0.62884,0.700342,0.662668
5,0.2206,0.20061,0.661296,0.743409,0.699953
6,0.1967,0.175277,0.680798,0.769169,0.72229
7,0.165,0.157836,0.719199,0.795331,0.755352
8,0.1531,0.146546,0.738878,0.818877,0.776823
9,0.1377,0.138096,0.753145,0.831354,0.790319
10,0.1301,0.132323,0.761028,0.840209,0.798661



===== Model after training =====


Processing preds: 100%|██████████| 200/200 [00:00<00:00, 11913.27it/s]


F1-score: 0.8090
Precision: 0.7704
Recall: 0.8517

Classification report:
           precision    recall  f1-score   support

    MEDIA       0.70      0.60      0.64       295
      LOC       0.64      0.73      0.68       548
      PER       0.90      0.96      0.93      2048
      ORG       0.61      0.77      0.68      1237
 GEOPOLIT       0.85      0.86      0.86       841

micro avg       0.77      0.85      0.81      4969
macro avg       0.78      0.85      0.81      4969

Model saved in directory ./ner_model

===== Before and after training =====
F1-score: 0.0055 -> 0.8090 (change: 0.8035)
Precision: 0.0030 -> 0.7704 (change: 0.7674)
Recall: 0.0304 -> 0.8517 (change: 0.8213)


**Вывод**: Видно, что ванильный легкий rubert совсем не справляется со сравнительной простой NER-задачей, после дообучения картинка сильно поменялась, но на малых группах (LOC, MEDIA) все еще неидеально работает (кажется, тут как раз и поможет нагенерить синт. разметки).

Видно довольно высокую полноту, хорошо вычленяем все существующие сущности, но судя по точности иногда можем false-позитивить. В целом можно сказать, что даже в таком стейте модель уже готова решать некоторые практические задачи.

P.S. Возможно, побольше эпох в обучении добавят несколько сотых качества

# №2. Попробуйте улучшить качество модели следующими способами
0. Учить сразу на NER-задачу
1.  Предварительно дообучите на train-части в MLM режиме, а потом дообучите на NER-задачу
2.  Сгенерируйте синтетическую разметку* подходящего\*\*, на ваш взгляд, новостного корпуса большой и умной моделью для русскоязычного NER\*\*\*, а затем использовав ее для дообучения rubert-tiny2 вместе с основным набором данных

*прогоните датасет через NER-модель, получите ее предсказания и используйте их в качестве резметки

**Можно использовать уже знакомый вам датасет lenta-ru, объем данных лучше взять от 10_000 текстов

***Например, можно взять модель модель DeepPavlov ner_collection3_bert. Инструкция по запуску есть в документации

## Доучиваем с MLM и потом на NER

In [7]:
mlm_texts = [item.text for item in train_data]

mlm_dataset = Dataset.from_dict({'text': mlm_texts})

tokenizer = AutoTokenizer.from_pretrained('cointegrated/rubert-tiny2')

def tokenize_mlm(examples):
    return tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=512,
        return_special_tokens_mask=True
    )

mlm_tokenized = mlm_dataset.map(
    tokenize_mlm,
    batched=True,
    desc='Tokenizing MLM data'
)

mlm_model = AutoModelForMaskedLM.from_pretrained('cointegrated/rubert-tiny2')

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)

mlm_training_args = TrainingArguments(
    output_dir='./mlm_results',
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    save_steps=10_000,
    save_total_limit=2,
    learning_rate=3e-5,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),
    report_to='none',
    logging_steps=50,
    disable_tqdm=False,
)

mlm_trainer = Trainer(
    model=mlm_model,
    args=mlm_training_args,
    train_dataset=mlm_tokenized,
    data_collator=data_collator,
)

mlm_trainer.train()

mlm_model.save_pretrained('./mlm_pretrained')
tokenizer.save_pretrained('./mlm_pretrained')
print('MLM model saved to ./mlm_pretrained')

unique_tags = set()
for item in data:
    for span in item.spans:
        unique_tags.add(span.type)

tag_names = list(unique_tags)
tag_names.sort()
tag_names.insert(0, 'O')

iob_tags = ['O']
for tag in tag_names:
    if tag != 'O':
        iob_tags.append(f'B-{tag}')
        iob_tags.append(f'I-{tag}')

tag2id = {tag: id for id, tag in enumerate(iob_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}

print(f'Found {len(tag_names)} unique tags: {", ".join(tag_names)}')
print(f'Created {len(iob_tags)} tags in IOB format: {", ".join(iob_tags)}')

train_processed = preprocess_data_word_level(train_data, 'train')
test_processed = preprocess_data_word_level(test_data, 'test')

all_tags = [tag for item in train_processed for tag in item['ner_tags']]
print('Tags distribution:')
for tag in sorted(set(all_tags)):
    print(f'{tag}: {all_tags.count(tag)} ({all_tags.count(tag)/len(all_tags)*100:.2f}%)')

train_dataset = Dataset.from_dict({
    'tokens': [item['tokens'] for item in train_processed],
    'ner_tags': [item['ner_tags'] for item in train_processed]
})

test_dataset = Dataset.from_dict({
    'tokens': [item['tokens'] for item in test_processed],
    'ner_tags': [item['ner_tags'] for item in test_processed]
})

tokenizer = AutoTokenizer.from_pretrained('./mlm_pretrained')

train_tokenized = train_dataset.map(tokenize_and_align_labels, batched=True, desc='Tokenizing train')
test_tokenized = test_dataset.map(tokenize_and_align_labels, batched=True, desc='Tokenizing test')

model = AutoModelForTokenClassification.from_pretrained(
    './mlm_pretrained',
    num_labels=len(tag2id)
)

vanilla_ner = AutoModelForTokenClassification.from_pretrained(
    './ner_model',
    num_labels=len(tag2id)
)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device: {device}')
model.to(device)

train_ner(model, tokenizer, train_tokenized, test_tokenized)

print('\n===== Model after NER training =====')
mlm_ner_metrics = eval_model(model, test_tokenized)
print(f'F1-score: {mlm_ner_metrics["f1"]:.4f}')
print(f'Precision: {mlm_ner_metrics["precision"]:.4f}')
print(f'Recall: {mlm_ner_metrics["recall"]:.4f}')
print('\nClassification report:')
print(mlm_ner_metrics['report'])

model.save_pretrained('./mlm_ner_model')
tokenizer.save_pretrained('./mlm_ner_model')

print('\n===== MLM+NER vs initial model =====')
print(f'F1-score: {vanilla_ner_metrics["f1"]:.4f} -> {mlm_ner_metrics["f1"]:.4f} (change: {mlm_ner_metrics["f1"] - vanilla_ner_metrics["f1"]:.4f})')
print(f'Precision: {vanilla_ner_metrics["precision"]:.4f} -> {mlm_ner_metrics["precision"]:.4f} (change: {mlm_ner_metrics["precision"] - vanilla_ner_metrics["precision"]:.4f})')
print(f'Recall: {vanilla_ner_metrics["recall"]:.4f} -> {mlm_ner_metrics["recall"]:.4f} (change: {mlm_ner_metrics["recall"] - vanilla_ner_metrics["recall"]:.4f})')

Tokenizing MLM data: 100%|██████████| 800/800 [00:00<00:00, 3986.19 examples/s]


Step,Training Loss
50,3.2247
100,3.1266
150,3.1151
200,3.0743
250,3.0482


MLM model saved to ./mlm_pretrained
Found 6 unique tags: O, GEOPOLIT, LOC, MEDIA, ORG, PER
Created 11 tags in IOB format: O, B-GEOPOLIT, I-GEOPOLIT, B-LOC, I-LOC, B-MEDIA, I-MEDIA, B-ORG, I-ORG, B-PER, I-PER


Processing train: 100%|██████████| 800/800 [00:00<00:00, 4271.83it/s]
Processing test: 100%|██████████| 200/200 [00:00<00:00, 4800.21it/s]


Tags distribution:
B-GEOPOLIT: 3226 (1.51%)
B-LOC: 2573 (1.20%)
B-MEDIA: 1202 (0.56%)
B-ORG: 5719 (2.67%)
B-PER: 8503 (3.97%)
I-GEOPOLIT: 180 (0.08%)
I-LOC: 1145 (0.53%)
I-MEDIA: 791 (0.37%)
I-ORG: 5330 (2.49%)
I-PER: 8403 (3.92%)
O: 177027 (82.68%)


Tokenizing train: 100%|██████████| 800/800 [00:00<00:00, 1434.17 examples/s]
Tokenizing test: 100%|██████████| 200/200 [00:00<00:00, 1497.89 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at ./mlm_pretrained and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Device: cpu


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,1.389,1.14032,0.0,0.0,0.0
2,0.5233,0.472735,0.425505,0.339102,0.377422
3,0.3697,0.299159,0.596655,0.631717,0.613685
4,0.2707,0.222903,0.647038,0.718656,0.680969
5,0.2023,0.185736,0.672334,0.763936,0.715214
6,0.1817,0.16288,0.715416,0.800362,0.755509
7,0.1501,0.147158,0.744037,0.81606,0.778386
8,0.1407,0.137476,0.751138,0.830348,0.788759
9,0.1258,0.130232,0.757724,0.839002,0.796295
10,0.1203,0.125339,0.766612,0.847454,0.805009



===== Model after NER training =====


Processing preds: 100%|██████████| 200/200 [00:00<00:00, 15200.06it/s]


F1-score: 0.8144
Precision: 0.7755
Recall: 0.8573

Classification report:
           precision    recall  f1-score   support

    MEDIA       0.73      0.63      0.68       295
      LOC       0.66      0.78      0.72       548
      PER       0.90      0.96      0.93      2048
      ORG       0.61      0.77      0.68      1237
 GEOPOLIT       0.85      0.86      0.85       841

micro avg       0.78      0.86      0.81      4969
macro avg       0.78      0.86      0.82      4969


===== MLM+NER vs initial model =====
F1-score: 0.8090 -> 0.8144 (change: 0.0054)
Precision: 0.7704 -> 0.7755 (change: 0.0051)
Recall: 0.8517 -> 0.8573 (change: 0.0056)


**Вывод**: Прирост видим, но кажется, что не тот уровень эффекта (все еще не очень можем в точность), который хочется увидеть от сколь нибудь значительного изменения, посмотрим, как себя покажут синт. данные.

## Размечаем большой моделькой
- У меня не получилось добить ner_collection3_bert из DeepPavlov (ужасная поддержка зависимостей + веса просто не ставились и дебажить почти невозможно), взял NER-модель Сбера

In [13]:
from collections import namedtuple
from corus import load_lenta
import random

Example = namedtuple('Example', ['text', 'spans'])

def load_lenta_data(file_path='./lenta-ru-news.csv.gz', num_texts=10000):
    records = load_lenta(file_path)
    texts = []

    for record in records:
        if random.random() > 0.5:
            texts.append(record.text)
        if len(texts) == num_texts:
            break

    return texts

def gen_synth_labels(model_name='viktoroo/sberbank-rubert-base-collection3'):
    ner_model = AutoModelForTokenClassification.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    synth_data = []
    ner_model.eval()

    for text in tqdm(load_lenta_data(), desc='Synth labels'):
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)

        with torch.no_grad():
            outputs = ner_model(**inputs).logits

        predictions = torch.argmax(outputs, dim=-1)[0]
        tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

        spans = []
        current_span = None

        position = 0

        for token, idx in zip(tokens, predictions):
            label = ner_model.config.id2label[idx.item()]

            if token.startswith('##'):
                token = token[2:]

            token_len = len(token)

            if label == 'O':
                if current_span is not None:
                    spans.append(current_span)
                    current_span = None
            elif label.startswith('B-'):
                if current_span is not None:
                    spans.append(current_span)
                current_span = {
                    'start': position,
                    'end': position + token_len,
                    'type': label.split('-')[1]
                }
            elif label.startswith('I-') and current_span is not None:
                current_span['end'] = position + token_len

            position += token_len

        if current_span is not None:
            spans.append(current_span)

        synth_data.append(Example(text=text, spans=spans))

    return synth_data

synth_data = gen_synth_labels()

Synth labels:   8%|▊         | 836/10000 [04:49<52:53,  2.89it/s]  


KeyboardInterrupt: 

In [9]:
train_processed = preprocess_data_word_level(train_data, 'train')
synth_processed = preprocess_data_word_level_dict(synth_data, 'synth')
test_processed = preprocess_data_word_level(test_data, 'test')

train_processed += synth_processed

all_tags = [tag for item in train_processed for tag in item['ner_tags']]
print('Tags distribution:')
for tag in sorted(set(all_tags)):
    print(f'{tag}: {all_tags.count(tag)} ({all_tags.count(tag)/len(all_tags)*100:.2f}%)')

train_dataset = Dataset.from_dict({
    'tokens': [item['tokens'] for item in train_processed],
    'ner_tags': [item['ner_tags'] for item in train_processed]
})

test_dataset = Dataset.from_dict({
    'tokens': [item['tokens'] for item in test_processed],
    'ner_tags': [item['ner_tags'] for item in test_processed]
})

tokenizer = AutoTokenizer.from_pretrained('cointegrated/rubert-tiny2')

train_tokenized = train_dataset.map(tokenize_and_align_labels, batched=True, desc='Tokenizing train')
test_tokenized = test_dataset.map(tokenize_and_align_labels, batched=True, desc='Tokenizing test')

model = AutoModelForTokenClassification.from_pretrained('cointegrated/rubert-tiny2', num_labels=len(tag2id))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device: {device}')
model.to(device)

model = train_ner(model, tokenizer, train_tokenized, test_tokenized)

print('\n===== Model after training =====')
synth_ner_metrics = eval_model(model, test_tokenized)
print(f'F1-score: {synth_ner_metrics["f1"]:.4f}')
print(f'Precision: {synth_ner_metrics["precision"]:.4f}')
print(f'Recall: {synth_ner_metrics["recall"]:.4f}')
print('\nClassification report:')
print(synth_ner_metrics['report'])

Processing train: 100%|██████████| 800/800 [00:00<00:00, 4239.72it/s]
Processing synth: 100%|██████████| 10000/10000 [00:01<00:00, 6088.73it/s]
Processing test: 100%|██████████| 200/200 [00:00<00:00, 4676.16it/s]


Tags distribution:
B-GEOPOLIT: 3226 (0.14%)
B-LOC: 10471 (0.44%)
B-MEDIA: 1202 (0.05%)
B-ORG: 13634 (0.57%)
B-PER: 15645 (0.66%)
I-GEOPOLIT: 180 (0.01%)
I-LOC: 83306 (3.50%)
I-MEDIA: 791 (0.03%)
I-ORG: 107861 (4.53%)
I-PER: 101338 (4.25%)
O: 2044862 (85.83%)


Tokenizing train: 100%|██████████| 10800/10800 [00:06<00:00, 1695.05 examples/s]
Tokenizing test: 100%|██████████| 200/200 [00:00<00:00, 1508.48 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Device: cpu


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.5564,0.785912,0.223709,0.070638,0.107372
2,0.4608,0.402254,0.299857,0.380358,0.335344
3,0.4074,0.288933,0.500417,0.604347,0.547493
4,0.3693,0.239118,0.555592,0.681827,0.612271
5,0.3442,0.206246,0.600738,0.720668,0.655261
6,0.2975,0.188669,0.62673,0.747434,0.681781
7,0.3012,0.171863,0.650237,0.77259,0.706153
8,0.3078,0.167397,0.664824,0.774401,0.715441
9,0.2809,0.157829,0.67713,0.790099,0.729265
10,0.2664,0.155491,0.684329,0.792715,0.734545



===== Model after training =====


Processing preds: 100%|██████████| 200/200 [00:00<00:00, 14938.84it/s]


F1-score: 0.7536
Precision: 0.7025
Recall: 0.8126

Classification report:
           precision    recall  f1-score   support

    MEDIA       0.63      0.63      0.63       295
      LOC       0.61      0.72      0.66       548
      PER       0.80      0.90      0.85      2048
      ORG       0.56      0.72      0.63      1237
 GEOPOLIT       0.81      0.86      0.83       841

micro avg       0.70      0.81      0.75      4969
macro avg       0.71      0.81      0.76      4969



**Вывод**: Качество на тесте ниже моделей обученной с помощью MLM и ванильной версии. Выглядит так, что сейчас модель стала более устойчивой так как обучилась на большем пуле. В качестве финального решения я бы остановился на модели, которая обучилась на сгенеренной синт. разметке