In [None]:
!pip install evaluate
!pip install seqeval
#!pip install datasets
!pip install datasets==2.16.1 fsspec==2023.6.0



In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import load_dataset
import evaluate
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from torch.utils.data import Dataset, DataLoader
from transformers import BertConfig, BertModel, BertTokenizerFast, BertForSequenceClassification, TrainingArguments, Trainer, get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup, AutoTokenizer, AutoModelForTokenClassification, BertPreTrainedModel

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the Kinopoisk sentiment-classification dataset from the Hugging Face Hub.
# Later cells read the 'label' column of its 'train' and 'validation' splits.
ds = load_dataset("ai-forever/kinopoisk-sentiment-classification")
# The string literal below is disabled code kept for reference (it would print a local
# JSONL copy of the training data). Being the last expression of the cell, the string
# itself is echoed as the cell output.
"""
with open("/content/drive/MyDrive/Colab Notebooks/курсач/датасеты/сентимент/train (1).jsonl", 'r', encoding='utf-8') as f:
    fl = f.read()
print(fl)
"""

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/104 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/37.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.39M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.27M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

'\nwith open("/content/drive/MyDrive/Colab Notebooks/курсач/датасеты/сентимент/train (1).jsonl", \'r\', encoding=\'utf-8\') as f:\n    fl = f.read()\nprint(fl)\n'

Наш кастомный датасет — это просто текстовый файл, где каждое предложение отделено от другого пустой строкой, а лейблы записаны через таб после токенов. Сначала нужно распарсить этот датасет, затем написать для него класс и, наконец, собрать всё в dataloader.

In [None]:
def parse_dataset(filepath):
    """Parse an annotated, tab-separated file into per-document token and label lists.

    A line starting with ``# sent_id`` carries, as its fourth whitespace-separated
    field, an identifier of the form ``<sentence>_<text>``. Consecutive sentences
    that share the same ``<text>`` part are merged into one document. Lines starting
    with ``# text`` and blank lines are skipped. Every other line is split on tabs:
    column 1 is the token, the last column is its class and the second-to-last
    column is its slot.

    Args:
        filepath: Path to the UTF-8 encoded annotation file.

    Returns:
        A tuple ``(texts, slots, classes)`` of three parallel lists. Each element is
        the list of tokens / slot labels / class labels of one document, in file order.
    """
    texts, slots, classes = [], [], []
    current_text, current_slots, current_classes = [], [], []
    # None means "no document opened yet", so the first '# sent_id' line does not
    # flush an empty document (which would shift all documents against their labels).
    text_id = None

    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if line.startswith('# sent_id'):
                sent, text = line.split()[3].split('_')
                if text != text_id:
                    # Close the previous document. Tokens seen before any sent_id
                    # line are still kept as a document of their own.
                    if text_id is not None or current_text:
                        texts.append(current_text)
                        slots.append(current_slots)
                        classes.append(current_classes)
                        current_text, current_slots, current_classes = [], [], []
                    text_id = text
            elif line.startswith('# text') or not line:
                # Raw-text comment or sentence separator: nothing to collect.
                continue
            else:
                fields = line.split('\t')
                current_text.append(fields[1])
                current_classes.append(fields[-1])
                current_slots.append(fields[-2])

    # Flush the last document: nothing after it triggers the flush above.
    if current_text:
        texts.append(current_text)
        slots.append(current_slots)
        classes.append(current_classes)
    return texts, slots, classes

In [None]:
from collections import defaultdict

def extract_classes_and_slots(filepath1, filepath2=None):
    """Collect the semantic class and slot vocabularies from annotation files.

    Blank lines and lines starting with ``#`` are ignored. Every remaining line is
    split on tabs; column 10 is taken as the slot and column 11 as the class. Rows
    with fewer than 12 columns are skipped.

    Args:
        filepath1: Path to the first UTF-8 annotation file.
        filepath2: Optional path to a second file whose labels are merged in.

    Returns:
        A dict with keys ``'classes'`` and ``'slots'`` (sorted label lists with a
        single ``'PAD'`` entry appended last) and the lookup tables ``'class2idx'``,
        ``'idx2class'``, ``'slot2idx'`` and ``'idx2slot'``.
    """
    classes = set()
    slots = set()

    for path in (filepath1, filepath2):
        if path is None:
            continue
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                fields = line.split('\t')
                # Indices 10 and 11 are read below, so 12 columns are required.
                if len(fields) < 12:
                    continue
                slots.add(fields[10])
                classes.add(fields[11])

    # 'PAD' is reserved for padding positions: keep it unique and always last.
    classes.discard('PAD')
    slots.discard('PAD')
    classes = sorted(classes) + ['PAD']
    slots = sorted(slots) + ['PAD']

    class2idx = {cls: idx for idx, cls in enumerate(classes)}
    idx2class = {idx: cls for cls, idx in class2idx.items()}

    slot2idx = {slot: idx for idx, slot in enumerate(slots)}
    idx2slot = {idx: slot for slot, idx in slot2idx.items()}

    return {
        'classes': classes,
        'slots': slots,
        'class2idx': class2idx,
        'idx2class': idx2class,
        'slot2idx': slot2idx,
        'idx2slot': idx2slot
    }

In [None]:
class NERDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with per-position semantic labels.

    Every example is a pre-split list of words with one slot label and one class
    label per word, plus one integer label for the whole text. Indexing with an
    integer returns a dict of tensors; indexing with a slice returns a list of
    such dicts.

    Args:
        texts: List of documents, each a list of word strings.
        slots: List of per-word slot labels, parallel to ``texts``.
        classes: List of per-word class labels, parallel to ``texts``.
        labels: Per-document integer labels, indexed like ``texts``.
        tokenizer: Tokenizer called with ``is_split_into_words=True``.
        slot2id: Mapping from slot label to index; must contain ``'PAD'``.
        class2id: Mapping from class label to index; must contain ``'PAD'``.
        max_length: Fixed length every example is padded or truncated to.
    """

    def __init__(self, texts, slots, classes, labels, tokenizer, slot2id, class2id, max_length=128):
        self.texts = texts
        self.slots = slots
        self.classes = classes
        self.labels = labels
        self.tokenizer = tokenizer
        self.slot2id = slot2id
        self.class2id = class2id
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def align_tokens_and_labels(self, tokens, slots, classes):
        """Repeat each word's slot and class once per subword of that word.

        The result covers only the subwords of ``tokens``; it contains no entries
        for special tokens the tokenizer may add around the sequence.

        Returns:
            A tuple ``(aligned_slots, aligned_classes)`` of new lists.
        """
        aligned_slots = []
        aligned_classes = []
        for word_idx, word in enumerate(tokens):
            piece_count = len(self.tokenizer.tokenize(word))
            aligned_slots.extend([slots[word_idx]] * piece_count)
            aligned_classes.extend([classes[word_idx]] * piece_count)
        return aligned_slots, aligned_classes

    def _labels_for_encoding(self, encoding, tokens, slots, classes):
        """Return slot and class labels aligned position by position with ``encoding``."""
        word_ids = None
        word_ids_fn = getattr(encoding, "word_ids", None)
        if callable(word_ids_fn):
            try:
                word_ids = word_ids_fn(0)
            except ValueError:
                # Raised by tokenizers that do not track word indices.
                word_ids = None
        if word_ids is None:
            # Fallback: subword counts only, so labels are not shifted for any
            # special token the tokenizer puts in front of the sequence.
            return self.align_tokens_and_labels(tokens, slots, classes)
        # Positions without a source word (special tokens, padding) get 'PAD', so
        # every label sits at the same index as the subword it describes.
        aligned_slots = ['PAD' if w is None else slots[w] for w in word_ids]
        aligned_classes = ['PAD' if w is None else classes[w] for w in word_ids]
        return aligned_slots, aligned_classes

    def _fit_to_max_length(self, tags):
        """Truncate or right-pad ``tags`` with 'PAD' to exactly ``max_length`` items."""
        fitted = list(tags[:self.max_length])
        fitted.extend(['PAD'] * (self.max_length - len(fitted)))
        return fitted

    def _encode_example(self, i):
        """Tokenize document ``i`` and return its tensors together with its labels."""
        tokens = self.texts[i]
        label = self.labels[i]

        encoding = self.tokenizer(
            tokens,
            is_split_into_words=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        slot_tags, class_tags = self._labels_for_encoding(encoding, tokens, self.slots[i], self.classes[i])
        slots = [self.slot2id[tag] for tag in self._fit_to_max_length(slot_tags)]
        classes = [self.class2id[tag] for tag in self._fit_to_max_length(class_tags)]

        item = {}
        for key, value in encoding.items():
            # as_tensor reuses existing tensors (no copy-construct warning).
            value = torch.as_tensor(value)
            # return_tensors='pt' adds a leading batch axis of size 1; drop it.
            if value.dim() > 1 and value.size(0) == 1:
                value = value.squeeze(0)
            item[key] = value

        # Sanity checks use the dataset's own tokenizer, not a notebook-level global.
        assert torch.max(item["input_ids"]) < self.tokenizer.vocab_size, f"Bad input_ids: {item['input_ids']}"
        if "token_type_ids" in item:
            assert torch.max(item["token_type_ids"]) <= 1, f"token_type_ids invalid: {item['token_type_ids']}"
        assert all(0 <= x < len(self.slot2id) or x == -100 for x in slots), f"slots out of range: {slots}"
        assert all(0 <= x < len(self.class2id) or x == -100 for x in classes), f"classes out of range: {classes}"
        assert isinstance(label, int) and 0 <= label < 3, f"label out of range: {label}"

        item["slots"] = torch.tensor(slots)
        item["classes"] = torch.tensor(classes)
        item["label"] = torch.tensor(label)
        return item

    def slice_token(self, index):
        """Return the encoded examples selected by the slice ``index`` as a list."""
        start, stop, step = index.indices(len(self.texts))
        return [self._encode_example(i) for i in range(start, stop, step)]

    def __getitem__(self, idx):
        """Return one encoded example for an integer index, or a list for a slice.

        Raises:
            TypeError: If ``idx`` is neither a slice nor an integer-like object.
        """
        if isinstance(idx, slice):
            return self.slice_token(idx)
        try:
            # Accept any integer-like index, not only the built-in int.
            position = idx.__index__()
        except AttributeError:
            raise TypeError(
                f"NERDataset indices must be integers or slices, not {type(idx).__name__}"
            ) from None
        return self._encode_example(position)

In [None]:
"""
texts, slots, classes = parse_dataset('/content/drive/MyDrive/Colab Notebooks/курсач/датасеты/сентимент/sentiment_train_pred.conllu')
model_checkpoint = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
extracted = extract_classes_and_slots('/content/drive/MyDrive/Colab Notebooks/курсач/датасеты/сентимент/sentiment_train_pred.conllu')
dataset = NERDataset(texts, slots, classes, ds['train']['label'], tokenizer, extracted['slot2idx'], extracted['class2idx'], max_length=128)
"""

'\ntexts, slots, classes = parse_dataset(\'/content/drive/MyDrive/Colab Notebooks/курсач/датасеты/сентимент/sentiment_train_pred.conllu\')\nmodel_checkpoint = "bert-base-multilingual-cased"\ntokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\nextracted = extract_classes_and_slots(\'/content/drive/MyDrive/Colab Notebooks/курсач/датасеты/сентимент/sentiment_train_pred.conllu\')\ndataset = NERDataset(texts, slots, classes, ds[\'train\'][\'label\'], tokenizer, extracted[\'slot2idx\'], extracted[\'class2idx\'], max_length=128)\n'

In [None]:
def create_dataloader(path, labels, tokenizer, slots2id, classes2id, batch_size=16, max_length=128, max_examples=1000):
    """Parse an annotation file and wrap its encoded examples in a shuffled DataLoader.

    Args:
        path: Annotation file passed to ``parse_dataset``.
        labels: Per-document integer labels, matched to the parsed documents by position.
        tokenizer: Tokenizer handed to ``NERDataset``.
        slots2id: Mapping from slot label to index.
        classes2id: Mapping from class label to index.
        batch_size: Batch size of the returned loader.
        max_length: Sequence length every example is padded or truncated to.
        max_examples: Number of leading documents to encode up front (1000 by
            default). Pass ``None`` to use the whole dataset, encoded lazily.

    Returns:
        A ``DataLoader`` with ``shuffle=True`` over the selected examples.
    """
    import warnings

    texts, slots, classes = parse_dataset(path)
    if len(texts) != len(labels):
        # Labels are paired with documents purely by index, so a count mismatch
        # usually means the two sources are not aligned.
        warnings.warn(
            f"{path}: parsed {len(texts)} documents but received {len(labels)} labels; "
            "labels are matched to documents by position."
        )
    dataset = NERDataset(texts, slots, classes, labels, tokenizer, slots2id, classes2id, max_length=max_length)
    if max_examples is None:
        subset = dataset
    else:
        # Slicing encodes the selected examples once and returns them as a list.
        subset = dataset[0:max_examples]
    return DataLoader(subset, batch_size=batch_size, shuffle=True)

In [None]:
# Tokenizer used for every dataset in this notebook.
# NOTE(review): this is an English uncased checkpoint, while the decoded sample shown
# further down is Russian text broken into many small word pieces; the disabled cell
# above used "bert-base-multilingual-cased" - confirm the choice of checkpoint.
model_checkpoint = "google-bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Build the label vocabularies from both files, then the train/validation loaders.
extracted = extract_classes_and_slots('/content/drive/MyDrive/Colab Notebooks/курсач/датасеты/сентимент/sentiment_train_pred.conllu', '/content/drive/MyDrive/Colab Notebooks/курсач/датасеты/сентимент/sentiment_val_pred.conllu')
train_dataloader = create_dataloader('/content/drive/MyDrive/Colab Notebooks/курсач/датасеты/сентимент/sentiment_train_pred.conllu', ds['train']['label'], tokenizer, extracted['slot2idx'], extracted['class2idx'], batch_size=16)
val_dataloader = create_dataloader('/content/drive/MyDrive/Colab Notebooks/курсач/датасеты/сентимент/sentiment_val_pred.conllu', ds['validation']['label'], tokenizer, extracted['slot2idx'], extracted['class2idx'], batch_size=16)

# Sanity check: pull one batch and display it.
batch1 = next(iter(train_dataloader))
batch1

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

  result.append({key: torch.tensor(val) for key, val in encoding.items()})


{'input_ids': tensor([[  101,  1182,  1196,  ..., 18947, 22919,   102],
         [  101,  1182,  1201,  ...,  1196,  1183,   102],
         [  101,  1077,  1185,  ..., 23742, 29747,   102],
         ...,
         [  101,  1192, 10260,  ..., 18947,  1189,   102],
         [  101,  1182,  1192,  ..., 29113,  1195,   102],
         [  101,  1181, 29113,  ...,  1196, 18947,   102]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1]]),
 'slots': tensor([[126,   5,   5,  ...,  81,  81,  81],
         [126,  79,  79,  ...,  26,  26,  26],
         [126, 126, 126,  

Посмотрим, что у нас в батче

In [None]:
# Take one validation batch, print the ids and label of its first example,
# and decode those ids back to text as the cell output.
batch2 = next(iter(val_dataloader))
print(batch2['input_ids'][0], batch2['label'][0])
tokenizer.decode(batch2['input_ids'][0])

tensor([  101,  1192, 29748,  1182, 14150, 22919,  1188,  1184, 14150, 29743,
        29742, 10260, 29436, 10325, 29747, 23742,  1191, 29113,  1194, 16856,
        14150, 29742, 14150, 29436, 29743, 15290, 18947, 10325, 15290,  1077,
         1196, 10260, 29745, 14150, 29741, 14150,  1190, 29748, 29752, 29753,
        15290, 29741, 14150,  1199, 10325, 29436, 23742, 29745, 10260,  1090,
         1012,  1182, 29746, 15290, 29752, 10260, 22919, 29436, 15290, 18947,
        10325, 15290,  1193, 22919,  1192, 15290, 29741, 14150,  1001, 19701,
         1192, 15290, 14150, 29742, 18947, 14150, 29744, 19865, 29752, 18947,
        14150, 15290,  1012,  1192, 10260, 29752, 10260, 29436, 14150,  1181,
        29113, 29436, 14150,  1192, 15290, 29746, 29436, 14150, 29750, 14150,
        15290,  1010,  1192, 14150,  1194, 14150, 22919, 14150, 29745,  1182,
        29747, 15290,  1001, 19701,  1194, 14150, 29753, 29436, 14150,  1189,
        10260, 23925,  1188,  1195, 28995, 23742, 29753,   102])

'[CLS] ну вот и дождались мы продолжение « самого лучшего фильма ». впечатление от него # null неоднозначное. начало было неплохое, но потом все # null пошло как и раньш [SEP]'

In [None]:
class BertWithSemanticsForTokenClassification(BertPreTrainedModel):
    """BERT classifier that also consumes per-token semantic class and slot ids.

    Despite its name, the model emits one prediction per input sequence: the BERT
    pooled output is concatenated with the attention-masked mean of the semantic
    class embeddings and of the semantic slot embeddings, and the result is fed
    to a linear classifier.

    Args:
        config: BERT configuration used to build the encoder.
        num_labels: Number of output classes of the classifier.
        num_semantic_classes: Size of the semantic class vocabulary.
        num_semantic_slots: Size of the semantic slot vocabulary.
        semantic_emb_dim: Dimension of each semantic embedding.
    """

    def __init__(self, config, num_labels, num_semantic_classes, num_semantic_slots, semantic_emb_dim=128):
        super().__init__(config)
        # Kept on the instance so the loss always matches the classifier width,
        # even when config.num_labels holds a different value.
        self.num_labels = num_labels

        # Standard BERT encoder.
        self.bert = BertModel(config)

        # Embedding tables for the semantic classes and slots.
        self.semantic_class_embedding = nn.Embedding(num_semantic_classes, semantic_emb_dim)
        self.semantic_slot_embedding = nn.Embedding(num_semantic_slots, semantic_emb_dim)

        # Classifier input: BERT pooled output plus the two semantic averages.
        self.classifier = nn.Linear(config.hidden_size + 2 * semantic_emb_dim, num_labels)

        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # Initialise the weights.
        self.init_weights()

    def forward(self, input_ids=None, token_type_ids=None, attention_mask=None, slots=None, classes=None, labels=None):
        """Run the model and optionally compute the cross-entropy loss.

        ``slots`` and ``classes`` are index tensors expected to have the same
        ``(batch_size, seq_len)`` layout as ``input_ids``.

        Returns:
            ``(logits,)`` when ``labels`` is None, otherwise ``(loss, logits)``;
            ``logits`` has one row per sequence.
        """
        if attention_mask is None:
            # Without a mask, treat every position as a real token.
            attention_mask = torch.ones_like(input_ids)

        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = outputs.pooler_output

        # Trailing axis lets the mask broadcast over the embedding dimension.
        mask = attention_mask.unsqueeze(-1).float()

        # Per-token semantic embeddings with masked-out positions zeroed.
        semantic_class_embeds = self.semantic_class_embedding(classes) * mask
        semantic_slot_embeds = self.semantic_slot_embedding(slots) * mask

        # Mean over the sequence axis, counting only unmasked positions; the
        # clamp avoids dividing by zero for an all-masked row.
        lengths = mask.sum(dim=1).clamp(min=1e-6)
        class_embeds_avg = semantic_class_embeds.sum(dim=1) / lengths
        slot_embeds_avg = semantic_slot_embeds.sum(dim=1) / lengths

        combined_output = torch.cat([pooled_output, class_embeds_avg, slot_embeds_avg], dim=-1)
        # Regularise the classifier input (no-op in eval mode).
        combined_output = self.dropout(combined_output)

        logits = self.classifier(combined_output)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        output = (logits,)
        return ((loss,) + output) if loss is not None else output

In [None]:
def compute_metrics(p):
    """Compute accuracy and macro-averaged precision, recall and F1.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds the logits (or
            a tuple whose first element is the logits) and ``labels`` the gold
            class indices.

    Returns:
        A dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``.
    """
    predictions, labels = p

    # When several arrays are returned, the logits come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Logits -> predicted class indices.
    predictions = predictions.argmax(axis=-1)

    # zero_division=0 reports 0 for classes that were never predicted, without
    # emitting an undefined-metric warning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='macro', zero_division=0
    )
    acc = accuracy_score(labels, predictions)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [None]:
from transformers import AutoConfig

# Number of sentiment classes and sizes of the semantic vocabularies.
num_labels = len(set(ds['train']['label']))
num_semantic_classes = len(extracted['classes'])
num_semantic_slots = len(extracted['slots'])

config = AutoConfig.from_pretrained(
    "google-bert/bert-base-uncased",  # or a path to your own model
    num_labels=num_labels,
    num_semantic_classes=num_semantic_classes,
    num_semantic_slots=num_semantic_slots,
    semantic_emb_dim=128
)

# Load the pretrained encoder weights. Calling the constructor directly would build
# the network from `config` alone, i.e. with randomly initialised BERT weights.
# Positional and keyword arguments after the checkpoint name are forwarded to
# the model's __init__ because an explicit config object is supplied.
model = BertWithSemanticsForTokenClassification.from_pretrained(
    "google-bert/bert-base-uncased",
    num_labels,
    num_semantic_classes,
    num_semantic_slots,
    config=config,
    semantic_emb_dim=128,  # or any other size
)

In [None]:
# Use the first CUDA device when one is available, otherwise the CPU,
# and move the model there.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)

В библиотеке transformers циклы обучения уже написаны за нас: остаётся передать аргументы для обучения (их много) и запустить трейнер.

In [None]:
# Hyperparameters and bookkeeping options shared by every Trainer built below.
training_args = TrainingArguments(
    output_dir="./results",          # Output directory
    eval_strategy="epoch",    # Evaluate after each epoch
    learning_rate=5e-5,             # Learning rate
    per_device_train_batch_size=16, # Batch size for training
    per_device_eval_batch_size=16,  # Batch size for evaluation
    num_train_epochs=3,             # Number of epochs
    weight_decay=0.01,              # Strength of weight decay
    logging_dir="./logs",           # Directory for storing logs
    logging_steps=10,               # Log every 10 steps
    save_strategy="epoch",          # Save model after each epoch
    load_best_model_at_end=True,    # Load the best model after training
    metric_for_best_model="f1",     # Use F1 score to choose the best model
)

In [None]:
# инициализировали
# Build the Trainer. Only the datasets wrapped by the DataLoaders are passed on,
# not the DataLoaders themselves.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataloader.dataset,  # Training dataset
    eval_dataset=val_dataloader.dataset,   # Evaluation dataset
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [None]:
import os
# Debugging aid: ask CUDA to run kernel launches synchronously so that device-side
# errors surface at the call that caused them.
# NOTE(review): this cell runs after the model was already moved to the device;
# confirm the variable still takes effect when set this late in the session.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [None]:
# Fine-tune the model; the Trainer logs progress automatically.
trainer.train()

# Evaluate the fine-tuned model on the validation set.
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")

# Save the result.
trainer.save_model("./sentiment_semantics_model")

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.102,1.176739,0.33,0.165414,0.11,0.333333
2,1.0904,1.123147,0.324,0.163142,0.108,0.333333
3,1.0912,1.101539,0.324,0.163142,0.108,0.333333


[[-0.1685735   0.36423048  0.7699479 ]
 [-0.16941322  0.3621908   0.7617129 ]
 [-0.17047223  0.36304644  0.7636946 ]
 ...
 [-0.1722266   0.36338508  0.7625254 ]
 [-0.17026974  0.36518472  0.76081157]
 [-0.17048468  0.36284542  0.7626622 ]] [1 1 1 1 0 1 1 0 2 2 0 2 2 1 1 2 2 0 2 1 2 2 0 1 1 0 2 1 1 1 1 1 1 2 2 1 1
 1 0 0 0 2 0 0 1 0 2 0 0 0 0 1 0 1 2 0 0 1 2 2 0 1 0 2 2 2 2 1 0 2 2 1 0 2
 1 2 1 1 0 1 1 2 1 2 2 1 1 1 1 0 1 2 0 0 0 0 2 0 0 0 0 1 0 1 0 1 2 1 1 2 2
 2 2 1 0 2 0 0 2 1 0 2 0 2 2 0 2 1 1 0 0 1 0 1 2 0 2 2 1 0 1 0 1 0 2 2 2 2
 2 2 0 0 0 0 2 2 0 2 2 2 0 1 0 0 0 1 2 0 0 1 0 2 2 1 1 2 2 2 0 2 2 0 0 0 1
 1 1 1 2 1 1 1 0 1 2 0 0 1 2 2 2 0 1 2 0 0 2 0 1 0 2 1 0 0 1 1 1 1 1 0 1 0
 0 2 1 0 0 1 0 2 0 0 2 0 0 1 0 0 2 1 2 1 1 2 0 1 0 0 2 1 2 1 1 0 2 0 2 0 1
 0 2 0 0 2 2 2 0 1 0 2 2 0 0 1 0 1 1 1 1 1 2 0 2 1 0 2 2 1 2 2 0 0 0 1 2 2
 2 2 0 0 0 2 1 0 1 0 2 2 2 0 0 2 1 0 1 0 2 1 1 2 1 0 1 2 1 0 1 2 2 0 1 0 1
 2 1 0 2 0 1 2 2 2 0 1 2 1 0 1 0 2 1 1 0 0 1 1 2 2 0 2 1 0 2 2 1 1 1 0 0 0
 0 2 1 1 2

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[[0.13925992 0.57040185 0.17180698]
 [0.13985118 0.5685665  0.16253139]
 [0.13839284 0.56963515 0.16474131]
 ...
 [0.13684282 0.5697807  0.16352908]
 [0.13901047 0.57126427 0.16190846]
 [0.1385949  0.5693359  0.1635714 ]] [1 1 1 1 0 1 1 0 2 2 0 2 2 1 1 2 2 0 2 1 2 2 0 1 1 0 2 1 1 1 1 1 1 2 2 1 1
 1 0 0 0 2 0 0 1 0 2 0 0 0 0 1 0 1 2 0 0 1 2 2 0 1 0 2 2 2 2 1 0 2 2 1 0 2
 1 2 1 1 0 1 1 2 1 2 2 1 1 1 1 0 1 2 0 0 0 0 2 0 0 0 0 1 0 1 0 1 2 1 1 2 2
 2 2 1 0 2 0 0 2 1 0 2 0 2 2 0 2 1 1 0 0 1 0 1 2 0 2 2 1 0 1 0 1 0 2 2 2 2
 2 2 0 0 0 0 2 2 0 2 2 2 0 1 0 0 0 1 2 0 0 1 0 2 2 1 1 2 2 2 0 2 2 0 0 0 1
 1 1 1 2 1 1 1 0 1 2 0 0 1 2 2 2 0 1 2 0 0 2 0 1 0 2 1 0 0 1 1 1 1 1 0 1 0
 0 2 1 0 0 1 0 2 0 0 2 0 0 1 0 0 2 1 2 1 1 2 0 1 0 0 2 1 2 1 1 0 2 0 2 0 1
 0 2 0 0 2 2 2 0 1 0 2 2 0 0 1 0 1 1 1 1 1 2 0 2 1 0 2 2 1 2 2 0 0 0 1 2 2
 2 2 0 0 0 2 1 0 1 0 2 2 2 0 0 2 1 0 1 0 2 1 1 2 1 0 1 2 1 0 1 2 2 0 1 0 1
 2 1 0 2 0 1 2 2 2 0 1 2 1 0 1 0 2 1 1 0 0 1 1 2 2 0 2 1 0 2 2 1 1 1 0 0 0
 0 2 1 1 2 2 2 2 0 0 2 2 2 0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[[0.24137196 0.36998874 0.2728772 ]
 [0.24221617 0.36818597 0.26349035]
 [0.24075934 0.36921254 0.2657764 ]
 ...
 [0.23923141 0.36935574 0.26452592]
 [0.24134658 0.3709375  0.26281258]
 [0.24088378 0.36902258 0.26455912]] [1 1 1 1 0 1 1 0 2 2 0 2 2 1 1 2 2 0 2 1 2 2 0 1 1 0 2 1 1 1 1 1 1 2 2 1 1
 1 0 0 0 2 0 0 1 0 2 0 0 0 0 1 0 1 2 0 0 1 2 2 0 1 0 2 2 2 2 1 0 2 2 1 0 2
 1 2 1 1 0 1 1 2 1 2 2 1 1 1 1 0 1 2 0 0 0 0 2 0 0 0 0 1 0 1 0 1 2 1 1 2 2
 2 2 1 0 2 0 0 2 1 0 2 0 2 2 0 2 1 1 0 0 1 0 1 2 0 2 2 1 0 1 0 1 0 2 2 2 2
 2 2 0 0 0 0 2 2 0 2 2 2 0 1 0 0 0 1 2 0 0 1 0 2 2 1 1 2 2 2 0 2 2 0 0 0 1
 1 1 1 2 1 1 1 0 1 2 0 0 1 2 2 2 0 1 2 0 0 2 0 1 0 2 1 0 0 1 1 1 1 1 0 1 0
 0 2 1 0 0 1 0 2 0 0 2 0 0 1 0 0 2 1 2 1 1 2 0 1 0 0 2 1 2 1 1 0 2 0 2 0 1
 0 2 0 0 2 2 2 0 1 0 2 2 0 0 1 0 1 1 1 1 1 2 0 2 1 0 2 2 1 2 2 0 0 0 1 2 2
 2 2 0 0 0 2 1 0 1 0 2 2 2 0 0 2 1 0 1 0 2 1 1 2 1 0 1 2 1 0 1 2 2 0 1 0 1
 2 1 0 2 0 1 2 2 2 0 1 2 1 0 1 0 2 1 1 0 0 1 1 2 2 0 2 1 0 2 2 1 1 1 0 0 0
 0 2 1 1 2 2 2 2 0 0 2 2 2 0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.1074,1.169439,0.324,0.163142,0.108,0.333333
2,1.1024,1.119285,0.324,0.163142,0.108,0.333333
3,1.0959,1.101035,0.324,0.163142,0.108,0.333333


[[-0.18600917  0.6760667   0.41475788]
 [-0.18577115  0.67463297  0.40716   ]
 [-0.18698736  0.6756707   0.4093335 ]
 ...
 [-0.18877587  0.6759745   0.40813085]
 [-0.18681012  0.6775039   0.4064021 ]
 [-0.18676785  0.6752673   0.40814564]] [1 1 1 1 0 1 1 0 2 2 0 2 2 1 1 2 2 0 2 1 2 2 0 1 1 0 2 1 1 1 1 1 1 2 2 1 1
 1 0 0 0 2 0 0 1 0 2 0 0 0 0 1 0 1 2 0 0 1 2 2 0 1 0 2 2 2 2 1 0 2 2 1 0 2
 1 2 1 1 0 1 1 2 1 2 2 1 1 1 1 0 1 2 0 0 0 0 2 0 0 0 0 1 0 1 0 1 2 1 1 2 2
 2 2 1 0 2 0 0 2 1 0 2 0 2 2 0 2 1 1 0 0 1 0 1 2 0 2 2 1 0 1 0 1 0 2 2 2 2
 2 2 0 0 0 0 2 2 0 2 2 2 0 1 0 0 0 1 2 0 0 1 0 2 2 1 1 2 2 2 0 2 2 0 0 0 1
 1 1 1 2 1 1 1 0 1 2 0 0 1 2 2 2 0 1 2 0 0 2 0 1 0 2 1 0 0 1 1 1 1 1 0 1 0
 0 2 1 0 0 1 0 2 0 0 2 0 0 1 0 0 2 1 2 1 1 2 0 1 0 0 2 1 2 1 1 0 2 0 2 0 1
 0 2 0 0 2 2 2 0 1 0 2 2 0 0 1 0 1 1 1 1 1 2 0 2 1 0 2 2 1 2 2 0 0 0 1 2 2
 2 2 0 0 0 2 1 0 1 0 2 2 2 0 0 2 1 0 1 0 2 1 1 2 1 0 1 2 1 0 1 2 2 0 1 0 1
 2 1 0 2 0 1 2 2 2 0 1 2 1 0 1 0 2 1 1 0 0 1 1 2 2 0 2 1 0 2 2 1 1 1 0 0 0
 0 2 1 1 2

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[[0.1723811  0.5488929  0.16351181]
 [0.1738445  0.5477139  0.15464883]
 [0.17234133 0.54895055 0.15689531]
 ...
 [0.17070699 0.5491587  0.15563361]
 [0.17281905 0.5504735  0.15399666]
 [0.17262633 0.54848623 0.15573433]] [1 1 1 1 0 1 1 0 2 2 0 2 2 1 1 2 2 0 2 1 2 2 0 1 1 0 2 1 1 1 1 1 1 2 2 1 1
 1 0 0 0 2 0 0 1 0 2 0 0 0 0 1 0 1 2 0 0 1 2 2 0 1 0 2 2 2 2 1 0 2 2 1 0 2
 1 2 1 1 0 1 1 2 1 2 2 1 1 1 1 0 1 2 0 0 0 0 2 0 0 0 0 1 0 1 0 1 2 1 1 2 2
 2 2 1 0 2 0 0 2 1 0 2 0 2 2 0 2 1 1 0 0 1 0 1 2 0 2 2 1 0 1 0 1 0 2 2 2 2
 2 2 0 0 0 0 2 2 0 2 2 2 0 1 0 0 0 1 2 0 0 1 0 2 2 1 1 2 2 2 0 2 2 0 0 0 1
 1 1 1 2 1 1 1 0 1 2 0 0 1 2 2 2 0 1 2 0 0 2 0 1 0 2 1 0 0 1 1 1 1 1 0 1 0
 0 2 1 0 0 1 0 2 0 0 2 0 0 1 0 0 2 1 2 1 1 2 0 1 0 0 2 1 2 1 1 0 2 0 2 0 1
 0 2 0 0 2 2 2 0 1 0 2 2 0 0 1 0 1 1 1 1 1 2 0 2 1 0 2 2 1 2 2 0 0 0 1 2 2
 2 2 0 0 0 2 1 0 1 0 2 2 2 0 0 2 1 0 1 0 2 1 1 2 1 0 1 2 1 0 1 2 2 0 1 0 1
 2 1 0 2 0 1 2 2 2 0 1 2 1 0 1 0 2 1 1 0 0 1 1 2 2 0 2 1 0 2 2 1 1 1 0 0 0
 0 2 1 1 2 2 2 2 0 0 2 2 2 0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[[0.2522137  0.36768913 0.30414975]
 [0.25376093 0.36658973 0.29513842]
 [0.25223053 0.36779207 0.29743725]
 ...
 [0.25064164 0.36799282 0.29613283]
 [0.25273076 0.36938956 0.29442665]
 [0.25247362 0.36741135 0.29623085]] [1 1 1 1 0 1 1 0 2 2 0 2 2 1 1 2 2 0 2 1 2 2 0 1 1 0 2 1 1 1 1 1 1 2 2 1 1
 1 0 0 0 2 0 0 1 0 2 0 0 0 0 1 0 1 2 0 0 1 2 2 0 1 0 2 2 2 2 1 0 2 2 1 0 2
 1 2 1 1 0 1 1 2 1 2 2 1 1 1 1 0 1 2 0 0 0 0 2 0 0 0 0 1 0 1 0 1 2 1 1 2 2
 2 2 1 0 2 0 0 2 1 0 2 0 2 2 0 2 1 1 0 0 1 0 1 2 0 2 2 1 0 1 0 1 0 2 2 2 2
 2 2 0 0 0 0 2 2 0 2 2 2 0 1 0 0 0 1 2 0 0 1 0 2 2 1 1 2 2 2 0 2 2 0 0 0 1
 1 1 1 2 1 1 1 0 1 2 0 0 1 2 2 2 0 1 2 0 0 2 0 1 0 2 1 0 0 1 1 1 1 1 0 1 0
 0 2 1 0 0 1 0 2 0 0 2 0 0 1 0 0 2 1 2 1 1 2 0 1 0 0 2 1 2 1 1 0 2 0 2 0 1
 0 2 0 0 2 2 2 0 1 0 2 2 0 0 1 0 1 1 1 1 1 2 0 2 1 0 2 2 1 2 2 0 0 0 1 2 2
 2 2 0 0 0 2 1 0 1 0 2 2 2 0 0 2 1 0 1 0 2 1 1 2 1 0 1 2 1 0 1 2 2 0 1 0 1
 2 1 0 2 0 1 2 2 2 0 1 2 1 0 1 0 2 1 1 0 0 1 1 2 2 0 2 1 0 2 2 1 1 1 0 0 0
 0 2 1 1 2 2 2 2 0 0 2 2 2 0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[[-0.18600917  0.6760667   0.41475788]
 [-0.18577115  0.67463297  0.40716   ]
 [-0.18698736  0.6756707   0.4093335 ]
 ...
 [-0.18877587  0.6759745   0.40813085]
 [-0.18681012  0.6775039   0.4064021 ]
 [-0.18676785  0.6752673   0.40814564]] [1 1 1 1 0 1 1 0 2 2 0 2 2 1 1 2 2 0 2 1 2 2 0 1 1 0 2 1 1 1 1 1 1 2 2 1 1
 1 0 0 0 2 0 0 1 0 2 0 0 0 0 1 0 1 2 0 0 1 2 2 0 1 0 2 2 2 2 1 0 2 2 1 0 2
 1 2 1 1 0 1 1 2 1 2 2 1 1 1 1 0 1 2 0 0 0 0 2 0 0 0 0 1 0 1 0 1 2 1 1 2 2
 2 2 1 0 2 0 0 2 1 0 2 0 2 2 0 2 1 1 0 0 1 0 1 2 0 2 2 1 0 1 0 1 0 2 2 2 2
 2 2 0 0 0 0 2 2 0 2 2 2 0 1 0 0 0 1 2 0 0 1 0 2 2 1 1 2 2 2 0 2 2 0 0 0 1
 1 1 1 2 1 1 1 0 1 2 0 0 1 2 2 2 0 1 2 0 0 2 0 1 0 2 1 0 0 1 1 1 1 1 0 1 0
 0 2 1 0 0 1 0 2 0 0 2 0 0 1 0 0 2 1 2 1 1 2 0 1 0 0 2 1 2 1 1 0 2 0 2 0 1
 0 2 0 0 2 2 2 0 1 0 2 2 0 0 1 0 1 1 1 1 1 2 0 2 1 0 2 2 1 2 2 0 0 0 1 2 2
 2 2 0 0 0 2 1 0 1 0 2 2 2 0 0 2 1 0 1 0 2 1 1 2 1 0 1 2 1 0 1 2 2 0 1 0 1
 2 1 0 2 0 1 2 2 2 0 1 2 1 0 1 0 2 1 1 0 0 1 1 2 2 0 2 1 0 2 2 1 1 1 0 0 0
 0 2 1 1 2

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[[-0.18600917  0.6760667   0.41475788]
 [-0.18577115  0.67463297  0.40716   ]
 [-0.18698736  0.6756707   0.4093335 ]
 ...
 [-0.18877587  0.6759745   0.40813085]
 [-0.18681012  0.6775039   0.4064021 ]
 [-0.18676785  0.6752673   0.40814564]] [1 1 1 1 0 1 1 0 2 2 0 2 2 1 1 2 2 0 2 1 2 2 0 1 1 0 2 1 1 1 1 1 1 2 2 1 1
 1 0 0 0 2 0 0 1 0 2 0 0 0 0 1 0 1 2 0 0 1 2 2 0 1 0 2 2 2 2 1 0 2 2 1 0 2
 1 2 1 1 0 1 1 2 1 2 2 1 1 1 1 0 1 2 0 0 0 0 2 0 0 0 0 1 0 1 0 1 2 1 1 2 2
 2 2 1 0 2 0 0 2 1 0 2 0 2 2 0 2 1 1 0 0 1 0 1 2 0 2 2 1 0 1 0 1 0 2 2 2 2
 2 2 0 0 0 0 2 2 0 2 2 2 0 1 0 0 0 1 2 0 0 1 0 2 2 1 1 2 2 2 0 2 2 0 0 0 1
 1 1 1 2 1 1 1 0 1 2 0 0 1 2 2 2 0 1 2 0 0 2 0 1 0 2 1 0 0 1 1 1 1 1 0 1 0
 0 2 1 0 0 1 0 2 0 0 2 0 0 1 0 0 2 1 2 1 1 2 0 1 0 0 2 1 2 1 1 0 2 0 2 0 1
 0 2 0 0 2 2 2 0 1 0 2 2 0 0 1 0 1 1 1 1 1 2 0 2 1 0 2 2 1 2 2 0 0 0 1 2 2
 2 2 0 0 0 2 1 0 1 0 2 2 2 0 0 2 1 0 1 0 2 1 1 2 1 0 1 2 1 0 1 2 2 0 1 0 1
 2 1 0 2 0 1 2 2 2 0 1 2 1 0 1 0 2 1 1 0 0 1 1 2 2 0 2 1 0 2 2 1 1 1 0 0 0
 0 2 1 1 2

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Сделаем, так сказать, качественную оценку:

In [None]:
def predict(sentence, tokenizer, model, id2label=None):
    """Tag every whitespace-separated word of ``sentence`` with a token-level model.

    The prediction made for the first subword of a word is used for the whole word.

    Args:
        sentence: Raw text; it is split on whitespace into words.
        tokenizer: Tokenizer whose encodings provide ``word_ids()``.
        model: Model returning per-token logits of shape ``(1, seq_len, num_classes)``,
            either directly, as the first element of a tuple, or as a ``.logits``
            attribute.
        id2label: Optional mapping from class index to label. When omitted, a
            module-level ``id2label`` is used if defined, then
            ``model.config.id2label``; failing both, raw class indices are returned.

    Returns:
        A list of ``(word, label)`` pairs. Words dropped by truncation are omitted.

    Raises:
        ValueError: If the model does not return per-token logits.
    """
    words = sentence.split()
    tokens = tokenizer(
        words,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True
    )
    word_ids = tokens.word_ids()  # subword position -> index of its source word

    # Run on whatever device the model lives on instead of assuming CUDA.
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        device = torch.device("cpu")
    tokens = tokens.to(device)

    # Inference must not use dropout; restore the previous mode afterwards.
    was_training = getattr(model, "training", False)
    if was_training:
        model.eval()
    try:
        with torch.no_grad():
            outputs = model(**tokens)
    finally:
        if was_training:
            model.train()

    if hasattr(outputs, "logits"):
        logits = outputs.logits
    elif isinstance(outputs, (tuple, list)):
        logits = outputs[0]
    else:
        logits = outputs
    if logits.dim() != 3:
        raise ValueError(
            f"predict expects per-token logits of shape (1, seq_len, num_classes), got {tuple(logits.shape)}"
        )
    # Index the batch axis explicitly: squeeze() would also collapse a
    # one-token sequence into a scalar.
    predictions = logits.argmax(dim=-1)[0].tolist()

    label_map = id2label
    if label_map is None:
        label_map = globals().get("id2label")
    if label_map is None:
        label_map = getattr(getattr(model, "config", None), "id2label", None)

    # Keep one prediction per word: the one made for its first subword.
    aligned_predictions = []
    current_word_id = None
    for word_id, prediction in zip(word_ids, predictions):
        if word_id is not None and word_id != current_word_id:
            aligned_predictions.append(label_map[prediction] if label_map is not None else prediction)
            current_word_id = word_id

    return list(zip(words, aligned_predictions))


# Quick check.
# NOTE(review): the example sentence is not Russian, and `id2label` (read inside
# predict) is not defined in any cell visible here - this looks like a leftover
# from a token-tagging notebook; confirm it still applies to the model trained above.
example_sentence = "Eftir að hafa gegnt herskyldu í fyrri heimsstyrjöldinni hóf Hubble störf við stjörnuathugunarstöðina á Wilson - fjalli í Kaliforníu."
print(predict(example_sentence, tokenizer, model))


Использование готовых инструментов - хорошо, но иногда перед нами стоит задача модифицировать архитектуру модели, а то и вообще написать свою собственную с нуля, только используя эмбеддинги берта. Давайте перепишем архитектуру модели без использования автомодели трансформеров.

In [None]:
import torch.nn as nn
from transformers import BertModel

class Model(nn.Module):
    """Token-level classifier: a pretrained BERT encoder followed by a linear layer.

    Args:
        num_of_classes: Number of output classes predicted for every token.
        checkpoint: Name or path of the pretrained BERT checkpoint to load.
    """

    def __init__(self, num_of_classes, checkpoint="bert-base-multilingual-cased"):
        super().__init__()
        self.bert = BertModel.from_pretrained(checkpoint)
        # Read the width from the loaded encoder so other checkpoint sizes work too.
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_of_classes)

    def forward(self, input_ids, attention_mask, token_type_ids=None, labels=None):
        """Return per-token logits, or ``(loss, logits)`` when ``labels`` is given.

        ``labels`` must hold one class index per token position, because the
        logits are flattened to ``(batch_size * seq_len, num_classes)`` for the loss.
        """
        # Contextual token embeddings from BERT.
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        sequence_output = outputs.last_hidden_state  # (batch_size, seq_len, hidden_size)

        # The classifier itself.
        logits = self.classifier(sequence_output)  # (batch_size, seq_len, num_classes)

        # The loss is computed here so the model can be used with the transformers Trainer.
        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            # reshape (unlike view) also accepts non-contiguous tensors.
            logits_flat = logits.reshape(-1, logits.shape[-1])
            labels_flat = labels.reshape(-1)
            loss = loss_fn(logits_flat, labels_flat)

        return (loss, logits) if loss is not None else logits


Удостоверимся, что наша модель адекватно работает с датасетом. Батч возвращает нам словарь с ключами, который при распаковке как раз даст нам все то, что мы прописали в форварде:

In [None]:
# Instantiate the hand-written model and run it on one batch.
# NOTE(review): `labels` and `batch` are not defined in any cell visible here
# (earlier cells create `batch1` and `batch2`) - confirm where they come from.
model = Model(len(labels))
model(**batch)

In [None]:
# инициализируем повторно трейнер, но уже с новой самописной моделькой
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataloader.dataset,  # Training dataset
    eval_dataset=val_dataloader.dataset,   # Evaluation dataset
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Train the hand-written model, evaluate it, print the metrics and save it.
trainer.train()
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")
trainer.save_model("./ner_model")

Ну и проверим точно так же, как предыдущую версию:

In [None]:
# Same qualitative check as before, now with the hand-written model.
example_sentence = "Eftir að hafa gegnt herskyldu í fyrri heimsstyrjöldinni hóf Hubble störf við stjörnuathugunarstöðina á Wilson - fjalli í Kaliforníu."
print(predict(example_sentence, tokenizer, model))

Задание.

В текущей версии задачи мы берем, по сути, только предсказание модели для первого подслова в слове: но что, если остальные подслова могли бы тоже влиять? Попробуйте доработать код таким образом, чтобы в обучающем датасете каждому подслову слова приписывался тег всего слова, а при предсказании модель выбирала тег слова более обдуманно: например, при трех и более подсловах голосованием. Поэкспериментируйте!