# Домашнее задание № 7

## Задание 1 (4 балла)

Обучите 2 модели похожую по архитектуре на модель из ULMFit для задачи классификации текста (датасет - lenta_40k )
В моделях должно быть как минимум два рекуррентных слоя, а финальный вектор для классификации составляться из последнего состояния RNN (так делалось в семинаре), а также AveragePooling и MaxPooling из всех векторов последовательности (конкатенируйте последнее состояния и результаты пулинга). В первой модели используйте обычные слои, а во второй Bidirectional. Рассчитайте по классовую точность/полноту/f-меру для каждой из модели (результаты не должны быть совсем близкие к нулю после обучения на хотя бы нескольких эпохах).

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
import pandas as pd
import numpy as np
from string import punctuation
from sklearn.model_selection import train_test_split
from collections import Counter

In [3]:
!wget https://raw.githubusercontent.com/mannefedov/compling_nlp_hse_course/refs/heads/master/data/lenta_40k.csv.zip

--2026-02-01 16:16:24--  https://raw.githubusercontent.com/mannefedov/compling_nlp_hse_course/refs/heads/master/data/lenta_40k.csv.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 30658975 (29M) [application/zip]
Saving to: ‘lenta_40k.csv.zip’


2026-02-01 16:16:25 (81.4 MB/s) - ‘lenta_40k.csv.zip’ saved [30658975/30658975]



In [5]:
data = pd.read_csv('lenta_40k.csv.zip')

In [6]:
data.head()

Unnamed: 0,text,topic
0,Россия должна сотрудничать с Всемирным антидоп...,Спорт
1,Уголовный суд Кувейта 28 июня освободил под за...,Мир
2,Французский журнал Charlie Hebdo опубликовал н...,Интернет и СМИ
3,В Петербурге в доме № 53 по улице Лени Голиков...,Россия
4,"В московском аэропорту ""Домодедово"" задержан г...",Россия


In [7]:
MAX_LEN = 300

def preprocess(text):
    tokens = text.lower().split()
    tokens = [token.strip(punctuation) for token in tokens]
    return tokens


vocab = Counter()

for text in data.text:
    vocab.update(preprocess(text))

filtered_vocab = set()

for word in vocab:
    if vocab[word] > 30:
        filtered_vocab.add(word)

word2id = {'PAD':0, "UNK": 1}

for word in filtered_vocab:
    word2id[word] = len(word2id)
id2word = {i:word for word, i in word2id.items()}

In [8]:
def encode(tokens, vocab, max_len=None):
    if len(tokens) == 0:
        return [vocab["UNK"]]
    if max_len is None:
        return [vocab.get(tok, vocab["UNK"]) for tok in tokens]
    return [vocab.get(tok, vocab["UNK"]) for tok in tokens[:max_len]]

In [9]:
tokenized_texts = [preprocess(t) for t in data.text.values]
encoded_texts = [encode(t, word2id) for t in tokenized_texts]

In [10]:
lengths = [len(x) for x in encoded_texts]
print(np.percentile(lengths, [50, 75, 90, 95, 99, 100]))

[ 170.  217.  268.  304.  395. 1748.]


In [11]:
id2label = {i: label for i, label in enumerate(set(data.topic.values))}
label2id = {l: i for i, l in id2label.items()}
labels = np.array([label2id[label] for label in data.topic.values])
len(label2id)

19

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    tokenized_texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels
)

In [13]:
class ClassificationDataset(Dataset):
    def __init__(self, texts, labels, vocab, max_len=512):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        tokens = text
        encoded = encode(tokens, self.vocab, self.max_len)

        lengths = torch.tensor(len(encoded), dtype=torch.long)

        if len(encoded) < self.max_len:
            encoded = encoded + [self.vocab.get("PAD", 0)] * (self.max_len - len(encoded))

        return {
            'input_ids': torch.tensor(encoded, dtype=torch.long),
            'labels': torch.tensor(label, dtype=torch.long),
            'lengths': lengths
        }

train_dataset = ClassificationDataset(X_train, y_train, word2id, max_len=MAX_LEN)
test_dataset = ClassificationDataset(X_test, y_test, word2id, max_len=MAX_LEN)

In [14]:
def collate_fn(batch):
    input_ids = torch.stack([item['input_ids'] for item in batch])
    labels = torch.stack([item['labels'] for item in batch])
    lengths = torch.stack([item['lengths'] for item in batch])

    return {
            'input_ids': input_ids,
            'labels': labels,
            'lengths': lengths
        }

train_dataloader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn
)

test_dataloader = DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,
    collate_fn=collate_fn
)

In [15]:
class RNNClassifier(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, num_classes,
                 num_layers=2, bidirectional=False):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)

        self.rnn = nn.LSTM(emb_dim, hidden_dim, num_layers=num_layers,
                           batch_first=True, bidirectional=bidirectional)

        if bidirectional:
            # если bidirectional то еще умножаю на два
            # плюс так как в задание требуется AveragePooling и MaxPooling
            # то получаем всего 3 из трех векторов поэтому умножаем на три
            self.fc = nn.Linear(hidden_dim * 2 * 3, num_classes)
        else:
            self.fc = nn.Linear(hidden_dim * 3, num_classes)

    def forward(self, x, lengths):
        emb = self.embedding(x)

        packed = nn.utils.rnn.pack_padded_sequence(emb, lengths.cpu(), batch_first=True, enforce_sorted=False)

        packed_out, _ = self.rnn(packed)
        out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)

        last = out[torch.arange(out.size(0)), lengths - 1] # последнее состояние
        avg = out.sum(dim=1) / lengths.unsqueeze(1).float() # AveragePooling
        max_ = out.max(dim=1).values # MaxPooling

        features = torch.cat([last, avg, max_], dim=1) # конкатенация векторов
        logits = self.fc(features)

        return logits


In [36]:
from tqdm import tqdm

def train_epoch(model, dataloader, optimizer, criterion, device='cpu'):
    model.train()
    total_loss = 0

    for batch in tqdm(dataloader):
        x, y, lengths = batch['input_ids'].to(device), batch['labels'].to(device), batch['lengths'].to(device)

        optimizer.zero_grad()
        logits = model(x, lengths)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

In [44]:
from sklearn.metrics import classification_report

def evaluate(model, loader, device):
    model.eval()

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in loader:
            x, y, lengths = batch['input_ids'].to(device), batch['labels'], batch['lengths'].to(device)
            logits = model(x, lengths)
            preds = logits.argmax(dim=1).cpu().numpy()

            all_preds.extend(preds)
            all_labels.extend(y.numpy())

    return all_labels, all_preds

In [27]:
sample_batch = next(iter(train_dataloader))
sample_batch

{'input_ids': tensor([[ 2766, 18648, 18157,  ...,     0,     0,     0],
         [13988,  3145,     1,  ...,     0,     0,     0],
         [  718, 20163,     1,  ...,     0,     0,     0],
         ...,
         [ 7818,     1, 21827,  ...,     0,     0,     0],
         [    1,     1,  6130,  ...,     0,     0,     0],
         [  486,  7027,  1881,  ...,     0,     0,     0]]),
 'labels': tensor([16, 12,  3, 12,  8,  0, 12, 10, 15,  9, 18, 15,  7, 15,  4, 15, 15, 10,
         18,  4, 16, 15, 15,  8,  4, 15,  9,  9,  9,  9, 18,  3]),
 'lengths': tensor([213, 211, 134, 216, 163, 202, 178, 143, 227, 197, 122,  93, 178, 141,
         146, 255, 158, 151,  94, 194, 193, 141, 150, 286, 147, 136, 300, 112,
         133,  84, 162, 196])}

In [29]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
VOCAB_SIZE = len(id2word)
NUM_LABELS = len(id2label)
NUM_LAYERS = 2
IS_BIDIRECTIONAL = False
NUM_EPOCHS = 20

model = RNNClassifier(vocab_size=VOCAB_SIZE, emb_dim=100,
                      hidden_dim=128, num_classes=NUM_LABELS,
                      num_layers=NUM_LAYERS, bidirectional=IS_BIDIRECTIONAL).to(DEVICE)

optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

In [37]:
for epoch in range(NUM_EPOCHS):
    loss = train_epoch(model, train_dataloader, optimizer, criterion, DEVICE)
    print(f"Epoch {epoch+1}: loss={loss:.4f}")

100%|██████████| 1109/1109 [00:34<00:00, 32.52it/s]


Epoch 1: loss=0.6704


100%|██████████| 1109/1109 [00:33<00:00, 32.99it/s]


Epoch 2: loss=0.4823


100%|██████████| 1109/1109 [00:34<00:00, 32.39it/s]


Epoch 3: loss=0.3346


100%|██████████| 1109/1109 [00:34<00:00, 31.94it/s]


Epoch 4: loss=0.2146


100%|██████████| 1109/1109 [00:33<00:00, 33.04it/s]


Epoch 5: loss=0.1322


100%|██████████| 1109/1109 [00:33<00:00, 33.19it/s]


Epoch 6: loss=0.0788


100%|██████████| 1109/1109 [00:33<00:00, 33.58it/s]


Epoch 7: loss=0.0507


100%|██████████| 1109/1109 [00:33<00:00, 33.17it/s]


Epoch 8: loss=0.0394


100%|██████████| 1109/1109 [00:33<00:00, 33.20it/s]


Epoch 9: loss=0.0311


100%|██████████| 1109/1109 [00:33<00:00, 32.73it/s]


Epoch 10: loss=0.0330


100%|██████████| 1109/1109 [00:33<00:00, 32.64it/s]


Epoch 11: loss=0.0237


100%|██████████| 1109/1109 [00:33<00:00, 33.10it/s]


Epoch 12: loss=0.0232


100%|██████████| 1109/1109 [00:34<00:00, 32.50it/s]


Epoch 13: loss=0.0222


100%|██████████| 1109/1109 [00:33<00:00, 33.28it/s]


Epoch 14: loss=0.0137


100%|██████████| 1109/1109 [00:33<00:00, 33.05it/s]


Epoch 15: loss=0.0196


100%|██████████| 1109/1109 [00:33<00:00, 32.91it/s]


Epoch 16: loss=0.0179


100%|██████████| 1109/1109 [00:32<00:00, 33.65it/s]


Epoch 17: loss=0.0104


100%|██████████| 1109/1109 [00:33<00:00, 33.07it/s]


Epoch 18: loss=0.0161


100%|██████████| 1109/1109 [00:33<00:00, 32.96it/s]


Epoch 19: loss=0.0153


100%|██████████| 1109/1109 [00:33<00:00, 33.06it/s]

Epoch 20: loss=0.0059





In [46]:
labels, preds = evaluate(model, test_dataloader, device='cuda')
print(classification_report(labels, preds, digits=4))

              precision    recall  f1-score   support

           0     0.7681    0.7657    0.7669       636
           1     0.0526    0.0625    0.0571        16
           2     0.5319    0.2874    0.3731        87
           3     0.4230    0.4494    0.4358       336
           4     0.4853    0.3929    0.4342        84
           5     0.7375    0.6413    0.6860        92
           6     0.0000    0.0000    0.0000         1
           7     0.6322    0.4564    0.5301       241
           8     0.7240    0.8115    0.7652      1639
           9     0.8450    0.7786    0.8105       637
          10     0.9332    0.9499    0.9415       779
          11     0.5820    0.6371    0.6083       529
          12     0.8243    0.6880    0.7500       641
          14     0.0000    0.0000    0.0000         5
          15     0.7575    0.7599    0.7587      1924
          16     0.7782    0.7338    0.7554       263
          17     0.0000    0.0000    0.0000         8
          18     0.7845    

**Двунаправленная модель**

In [52]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
VOCAB_SIZE = len(id2word)
NUM_LABELS = len(id2label)
NUM_LAYERS = 2
IS_BIDIRECTIONAL = True
NUM_EPOCHS = 20

model = RNNClassifier(vocab_size=VOCAB_SIZE, emb_dim=100,
                      hidden_dim=128, num_classes=NUM_LABELS,
                      num_layers=NUM_LAYERS, bidirectional=IS_BIDIRECTIONAL).to(DEVICE)

optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

In [53]:
for epoch in range(NUM_EPOCHS):
    loss = train_epoch(model, train_dataloader, optimizer, criterion, DEVICE)
    print(f"Epoch {epoch+1}: loss={loss:.4f}")

100%|██████████| 1109/1109 [00:52<00:00, 21.26it/s]


Epoch 1: loss=1.4519


100%|██████████| 1109/1109 [00:49<00:00, 22.20it/s]


Epoch 2: loss=0.8048


100%|██████████| 1109/1109 [00:50<00:00, 22.17it/s]


Epoch 3: loss=0.5360


100%|██████████| 1109/1109 [00:49<00:00, 22.31it/s]


Epoch 4: loss=0.3329


100%|██████████| 1109/1109 [00:49<00:00, 22.46it/s]


Epoch 5: loss=0.1736


100%|██████████| 1109/1109 [00:49<00:00, 22.45it/s]


Epoch 6: loss=0.0803


100%|██████████| 1109/1109 [00:49<00:00, 22.50it/s]


Epoch 7: loss=0.0378


100%|██████████| 1109/1109 [00:49<00:00, 22.47it/s]


Epoch 8: loss=0.0251


100%|██████████| 1109/1109 [00:48<00:00, 22.67it/s]


Epoch 9: loss=0.0323


100%|██████████| 1109/1109 [00:50<00:00, 21.93it/s]


Epoch 10: loss=0.0252


100%|██████████| 1109/1109 [00:50<00:00, 22.03it/s]


Epoch 11: loss=0.0122


100%|██████████| 1109/1109 [00:56<00:00, 19.56it/s]


Epoch 12: loss=0.0175


100%|██████████| 1109/1109 [00:56<00:00, 19.52it/s]


Epoch 13: loss=0.0211


100%|██████████| 1109/1109 [00:56<00:00, 19.47it/s]


Epoch 14: loss=0.0189


100%|██████████| 1109/1109 [01:07<00:00, 16.32it/s]


Epoch 15: loss=0.0108


100%|██████████| 1109/1109 [00:56<00:00, 19.57it/s]


Epoch 16: loss=0.0092


100%|██████████| 1109/1109 [00:58<00:00, 18.90it/s]


Epoch 17: loss=0.0185


100%|██████████| 1109/1109 [00:55<00:00, 19.92it/s]


Epoch 18: loss=0.0100


100%|██████████| 1109/1109 [01:06<00:00, 16.57it/s]


Epoch 19: loss=0.0080


100%|██████████| 1109/1109 [00:58<00:00, 18.99it/s]

Epoch 20: loss=0.0120





In [54]:
labels, preds = evaluate(model, test_dataloader, device='cuda')
print(classification_report(labels, preds, digits=4))

              precision    recall  f1-score   support

           0     0.7702    0.7327    0.7510       636
           1     0.1111    0.0625    0.0800        16
           2     0.4306    0.3563    0.3899        87
           3     0.4559    0.4613    0.4586       336
           4     0.6429    0.3214    0.4286        84
           5     0.6790    0.5978    0.6358        92
           6     0.0000    0.0000    0.0000         1
           7     0.5989    0.4647    0.5234       241
           8     0.7497    0.7419    0.7458      1639
           9     0.7614    0.8367    0.7973       637
          10     0.9153    0.9435    0.9292       779
          11     0.5502    0.6730    0.6054       529
          12     0.7765    0.7535    0.7648       641
          14     0.0000    0.0000    0.0000         5
          15     0.7051    0.7978    0.7486      1924
          16     0.8571    0.6616    0.7468       263
          17     0.2500    0.1250    0.1667         8
          18     0.8803    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Задание 2 (6 баллов)


На данных википедии (wikiann) обучите и сравните 3 модели:  
1) модель в которой как минимум два рекуррентных слоя, причем один из них GRU, а другой LSTM
2) модель в которой как минимум 3 рекуррентных слоя идут друг за другом и при этом 2-ой и 3-й слои еще имеют residual connection к изначальным эмбедингам. Для того, чтобы сделать residual connection вам нужно будет использовать одинаковую размерность эмбедингов и количество unit'ов в RNN слоях, чтобы их можно было просуммировать
3) модель в которой будут и рекуррентные и сверточные слои (как минимум 2 rnn и как минимум 2 cnn слоя). В cnn слоях будьте аккуратны с укорачиванием последовательности и используйте паддинг



Сравните качество по метрикам (точность/полнота/f-мера). Также придумайте несколько сложных примеров и проверьте, какие сущности определяет каждая из моделей.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from collections import Counter
import numpy as np
from sklearn.metrics import classification_report, f1_score
from tqdm import tqdm

In [2]:
dataset = load_dataset("wikiann", "ru")

README.md: 0.00B [00:00, ?B/s]

ru/validation-00000-of-00001.parquet:   0%|          | 0.00/809k [00:00<?, ?B/s]

ru/test-00000-of-00001.parquet:   0%|          | 0.00/816k [00:00<?, ?B/s]

ru/train-00000-of-00001.parquet:   0%|          | 0.00/1.63M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [3]:
example = dataset['train'][0]
print(f"Tokens: {example['tokens']}")
print(f"NER tags: {example['ner_tags']}")

Tokens: ['Илизаров', ',', 'Гавриил', 'Абрамович']
NER tags: [1, 2, 2, 2]


In [5]:
vocab_counter = Counter()
for item in dataset['train']:
    vocab_counter.update(item['tokens'])

MIN_FREQ = 3
filtered_vocab = {word for word, count in vocab_counter.items() if count >= MIN_FREQ}


word2id = {'PAD': 0, 'UNK': 1}
for word in filtered_vocab:
    word2id[word] = len(word2id)

In [33]:
id2tag = {0: "O", 1: "B-PER", 2: "I-PER", 3: "B-ORG", 4: "I-ORG", 5: "B-LOC", 6: "I-LOC", 7: "PAD"}
tag_names = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
tag2id = {v: k for k, v in id2tag.items()}
print(f"Количество классов: {len(tag2id)}")

Количество классов: 8


In [11]:
class NERDataset(Dataset):
    def __init__(self, hf_dataset, word2id, tag2id, max_len=128):
        self.data = hf_dataset
        self.word2id = word2id
        self.tag2id = tag2id
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        tokens = item['tokens']
        tags = item['ner_tags']

        tokens = tokens[:self.max_len]
        tags = tags[:self.max_len]

        encoded = [self.word2id.get(tok, self.word2id['UNK']) for tok in tokens]

        length = len(encoded)

        if len(encoded) < self.max_len:
            pad_len = self.max_len - len(encoded)
            encoded = encoded + [self.word2id['PAD']] * pad_len
            tags = tags + [7] * pad_len

        return {
            'input_ids': torch.tensor(encoded, dtype=torch.long),
            'tags': torch.tensor(tags, dtype=torch.long),
            'lengths': torch.tensor(length, dtype=torch.long)
        }

In [12]:
def collate_fn(batch):
    input_ids = torch.stack([item['input_ids'] for item in batch])
    tags = torch.stack([item['tags'] for item in batch])
    lengths = torch.stack([item['lengths'] for item in batch])
    return {
            'input_ids': input_ids,
            'labels': tags,
            'lengths': lengths
        }

MAX_LEN = 128
train_dataset = NERDataset(dataset['train'], word2id, tag2id, MAX_LEN)
val_dataset = NERDataset(dataset['validation'], word2id, tag2id, MAX_LEN)
test_dataset = NERDataset(dataset['test'], word2id, tag2id, MAX_LEN)

BATCH_SIZE = 32
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

In [13]:
next(iter(train_dataloader))

{'input_ids': tensor([[   1,  516, 3123,  ...,    0,    0,    0],
         [   1, 1757,    1,  ...,    0,    0,    0],
         [3434,    1, 2011,  ...,    0,    0,    0],
         ...,
         [   1, 3035,    1,  ...,    0,    0,    0],
         [   1, 3035, 4634,  ...,    0,    0,    0],
         [1148, 1392, 2578,  ...,    0,    0,    0]]),
 'labels': tensor([[0, 0, 0,  ..., 7, 7, 7],
         [3, 4, 4,  ..., 7, 7, 7],
         [0, 5, 6,  ..., 7, 7, 7],
         ...,
         [1, 2, 2,  ..., 7, 7, 7],
         [1, 2, 2,  ..., 7, 7, 7],
         [0, 0, 0,  ..., 7, 7, 7]]),
 'lengths': tensor([ 8,  3,  6, 11,  9,  5,  7,  4,  5,  5,  4,  7,  8, 12,  6,  5,  4, 16,
          5,  6,  4,  6,  4,  5, 12,  7,  5,  7, 23,  3,  4,  6])}

**LSTMGRUModel**

In [62]:
class LSTMGRUModel(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, num_classes, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.dropout = nn.Dropout(dropout)

        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.gru = nn.GRU(hidden_dim * 2, hidden_dim, batch_first=True, bidirectional=True)

        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x, lengths):
        emb = self.embedding(x)
        emb = self.dropout(emb)

        lstm_out, _ = self.lstm(emb)
        gru_out, _ = self.gru(lstm_out)

        logits = self.fc(gru_out)

        return logits

**ResidualRNNModel**

In [63]:
class ResidualRNNModel(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, num_classes, dropout=0.3):
        super().__init__()
        assert emb_dim == hidden_dim, "emb_dim должен быть равен hidden_dim для residual!"

        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.dropout = nn.Dropout(dropout)

        self.rnn1 = nn.LSTM(emb_dim, hidden_dim, batch_first=True, bidirectional=False)
        self.rnn2 = nn.LSTM(hidden_dim, hidden_dim, batch_first=True, bidirectional=False)
        self.rnn3 = nn.LSTM(hidden_dim, hidden_dim, batch_first=True, bidirectional=False)

        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x, lengths):
        emb = self.embedding(x)
        emb = self.dropout(emb)

        out1, _ = self.rnn1(emb)
        out2, _ = self.rnn2(emb)
        out2 = out2 + emb

        out3 = out3 + emb

        logits = self.fc(out3)

        return logits

In [64]:
class CNNRNNModel(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, num_classes, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.dropout = nn.Dropout(dropout)

        self.conv1 = nn.Conv1d(emb_dim, hidden_dim, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, padding=1)

        self.relu = nn.ReLU()

        self.rnn1 = nn.LSTM(hidden_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.rnn2 = nn.GRU(hidden_dim * 2, hidden_dim, batch_first=True, bidirectional=True)

        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x, lengths):
        emb = self.embedding(x)
        emb = self.dropout(emb)

        emb_t = emb.transpose(1, 2)

        conv1_out = self.relu(self.conv1(emb_t))
        conv2_out = self.relu(self.conv2(conv1_out))

        conv_out = conv2_out.transpose(1, 2)

        rnn1_out, _ = self.rnn1(conv_out)

        rnn2_out, _ = self.rnn2(rnn1_out)

        logits = self.fc(rnn2_out)

        return logits

In [58]:
def train_epoch(model, dataloader, optimizer, criterion, device='cpu'):
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader, desc="TRAIN"):
        x = batch['input_ids'].to(device)
        y = batch['labels'].to(device)
        lengths = batch['lengths'].to(device)

        optimizer.zero_grad()
        logits = model(x, lengths).permute(0, 2, 1)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    return total_loss / len(dataloader)

In [59]:
def evaluate(model, dataloader, device):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc='EVAL'):
            x, y, lengths = batch['input_ids'].to(device), batch['labels'], batch['lengths'].to(device)
            logits = model(x, lengths)
            preds = torch.argmax(logits, dim=-1)

            for i in range(x.size(0)):
                length = lengths[i].item()
                all_preds.extend(preds[i, :length].cpu().numpy())
                all_labels.extend(y[i, :length].cpu().numpy())

    report = classification_report(
        all_labels, all_preds,
        digits=4,
        zero_division=0
    )

    f1 = f1_score(all_labels, all_preds, average='weighted', zero_division=0)

    return report, f1

In [40]:
def train_model(model, model_name, train_loader, val_loader, device, num_epochs=10):
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss(ignore_index=7)

    best_f1 = 0

    for epoch in range(num_epochs):
        print(f"Epoch {epoch+1}/{num_epochs}")

        train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
        print(f"Train Loss: {train_loss:.4f}")

        report, f1 = evaluate(model, val_loader, device)
        print(f"Validation F1: {f1:.4f}")

        if f1 > best_f1:
            best_f1 = f1
            torch.save(model.state_dict(), f'{model_name}_best.pt')
            print(f"BEST MODEL F1={f1:.4f}")
    return model

In [20]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
VOCAB_SIZE = len(word2id)
NUM_CLASSES = len(tag2id)
EMB_DIM = 128
HIDDEN_DIM = 128
NUM_EPOCHS = 7

In [34]:
model1 = LSTMGRUModel(VOCAB_SIZE, EMB_DIM, HIDDEN_DIM, NUM_CLASSES)
model1 = train_model(model1, "LSTM_GRU", train_dataloader, val_dataloader, DEVICE, NUM_EPOCHS)

Epoch 1/7


TRAIN: 100%|██████████| 625/625 [00:08<00:00, 69.78it/s]


Train Loss: 0.6564


EVAL: 100%|██████████| 313/313 [00:03<00:00, 102.50it/s]


Validation F1: 0.8445
BEST MODEL F1=0.8445
Epoch 2/7


TRAIN: 100%|██████████| 625/625 [00:07<00:00, 82.69it/s]


Train Loss: 0.4330


EVAL: 100%|██████████| 313/313 [00:02<00:00, 104.78it/s]


Validation F1: 0.8701
BEST MODEL F1=0.8701
Epoch 3/7


TRAIN: 100%|██████████| 625/625 [00:07<00:00, 81.74it/s]


Train Loss: 0.3600


EVAL: 100%|██████████| 313/313 [00:02<00:00, 105.32it/s]


Validation F1: 0.8783
BEST MODEL F1=0.8783
Epoch 4/7


TRAIN: 100%|██████████| 625/625 [00:08<00:00, 76.88it/s]


Train Loss: 0.3056


EVAL: 100%|██████████| 313/313 [00:05<00:00, 57.49it/s]


Validation F1: 0.8901
BEST MODEL F1=0.8901
Epoch 5/7


TRAIN: 100%|██████████| 625/625 [00:09<00:00, 65.47it/s]


Train Loss: 0.2681


EVAL: 100%|██████████| 313/313 [00:02<00:00, 104.36it/s]


Validation F1: 0.8937
BEST MODEL F1=0.8937
Epoch 6/7


TRAIN: 100%|██████████| 625/625 [00:09<00:00, 69.12it/s]


Train Loss: 0.2408


EVAL: 100%|██████████| 313/313 [00:03<00:00, 104.19it/s]


Validation F1: 0.8945
BEST MODEL F1=0.8945
Epoch 7/7


TRAIN: 100%|██████████| 625/625 [00:08<00:00, 72.42it/s]


Train Loss: 0.2171


EVAL: 100%|██████████| 313/313 [00:04<00:00, 71.47it/s]


Validation F1: 0.8964
BEST MODEL F1=0.8964


In [41]:
model2 = ResidualRNNModel(VOCAB_SIZE, EMB_DIM, HIDDEN_DIM, NUM_CLASSES)
model2 = train_model(model2, "Residual_RNN", train_dataloader, val_dataloader, DEVICE, NUM_EPOCHS)

Epoch 1/7


TRAIN: 100%|██████████| 625/625 [00:06<00:00, 100.36it/s]


Train Loss: 0.9458


EVAL: 100%|██████████| 313/313 [00:03<00:00, 83.24it/s]


Validation F1: 0.7449
BEST MODEL F1=0.7449
Epoch 2/7


TRAIN: 100%|██████████| 625/625 [00:06<00:00, 100.80it/s]


Train Loss: 0.6790


EVAL: 100%|██████████| 313/313 [00:03<00:00, 94.78it/s]


Validation F1: 0.7827
BEST MODEL F1=0.7827
Epoch 3/7


TRAIN: 100%|██████████| 625/625 [00:06<00:00, 91.31it/s] 


Train Loss: 0.5859


EVAL: 100%|██████████| 313/313 [00:02<00:00, 107.07it/s]


Validation F1: 0.8057
BEST MODEL F1=0.8057
Epoch 4/7


TRAIN: 100%|██████████| 625/625 [00:06<00:00, 92.18it/s] 


Train Loss: 0.5249


EVAL: 100%|██████████| 313/313 [00:02<00:00, 107.92it/s]


Validation F1: 0.8200
BEST MODEL F1=0.8200
Epoch 5/7


TRAIN: 100%|██████████| 625/625 [00:06<00:00, 92.46it/s]


Train Loss: 0.4786


EVAL: 100%|██████████| 313/313 [00:02<00:00, 106.89it/s]


Validation F1: 0.8288
BEST MODEL F1=0.8288
Epoch 6/7


TRAIN: 100%|██████████| 625/625 [00:06<00:00, 94.94it/s]


Train Loss: 0.4432


EVAL: 100%|██████████| 313/313 [00:03<00:00, 93.45it/s]


Validation F1: 0.8346
BEST MODEL F1=0.8346
Epoch 7/7


TRAIN: 100%|██████████| 625/625 [00:07<00:00, 82.81it/s]


Train Loss: 0.4181


EVAL: 100%|██████████| 313/313 [00:03<00:00, 85.75it/s]


Validation F1: 0.8364
BEST MODEL F1=0.8364


In [42]:
model3 = CNNRNNModel(VOCAB_SIZE, EMB_DIM, HIDDEN_DIM, NUM_CLASSES)
model3 = train_model(model3, "CNN_RNN", train_dataloader, val_dataloader, DEVICE, NUM_EPOCHS)

Epoch 1/7


TRAIN: 100%|██████████| 625/625 [00:11<00:00, 53.49it/s]


Train Loss: 0.7025


EVAL: 100%|██████████| 313/313 [00:05<00:00, 62.42it/s]


Validation F1: 0.8313
BEST MODEL F1=0.8313
Epoch 2/7


TRAIN: 100%|██████████| 625/625 [00:07<00:00, 79.12it/s]


Train Loss: 0.4823


EVAL: 100%|██████████| 313/313 [00:03<00:00, 80.21it/s]


Validation F1: 0.8504
BEST MODEL F1=0.8504
Epoch 3/7


TRAIN: 100%|██████████| 625/625 [00:12<00:00, 49.67it/s]


Train Loss: 0.4128


EVAL: 100%|██████████| 313/313 [00:07<00:00, 43.56it/s]


Validation F1: 0.8681
BEST MODEL F1=0.8681
Epoch 4/7


TRAIN: 100%|██████████| 625/625 [00:17<00:00, 35.87it/s]


Train Loss: 0.3657


EVAL: 100%|██████████| 313/313 [00:07<00:00, 44.49it/s]


Validation F1: 0.8787
BEST MODEL F1=0.8787
Epoch 5/7


TRAIN: 100%|██████████| 625/625 [00:15<00:00, 39.52it/s]


Train Loss: 0.3277


EVAL: 100%|██████████| 313/313 [00:03<00:00, 78.75it/s]


Validation F1: 0.8814
BEST MODEL F1=0.8814
Epoch 6/7


TRAIN: 100%|██████████| 625/625 [00:07<00:00, 78.39it/s]


Train Loss: 0.3017


EVAL: 100%|██████████| 313/313 [00:03<00:00, 81.65it/s]


Validation F1: 0.8878
BEST MODEL F1=0.8878
Epoch 7/7


TRAIN: 100%|██████████| 625/625 [00:07<00:00, 78.88it/s]


Train Loss: 0.2744


EVAL: 100%|██████████| 313/313 [00:03<00:00, 78.93it/s]


Validation F1: 0.8902
BEST MODEL F1=0.8902


In [47]:
tag2id

{'O': 0,
 'B-PER': 1,
 'I-PER': 2,
 'B-ORG': 3,
 'I-ORG': 4,
 'B-LOC': 5,
 'I-LOC': 6,
 'PAD': 7}

In [44]:
report_1, _ = evaluate(model1, test_dataloader, device='cuda')
print(report_1)

EVAL: 100%|██████████| 313/313 [00:04<00:00, 74.40it/s]


              precision    recall  f1-score   support

           0     0.9286    0.9711    0.9494     40499
           1     0.9227    0.8450    0.8821      3543
           2     0.9443    0.9125    0.9281      7544
           3     0.8082    0.6362    0.7120      4074
           4     0.8301    0.7930    0.8111      8008
           5     0.8025    0.7947    0.7986      4560
           6     0.7937    0.7846    0.7892      3060

    accuracy                         0.9002     71288
   macro avg     0.8614    0.8196    0.8386     71288
weighted avg     0.8982    0.9002    0.8982     71288



In [45]:
report_2, _ = evaluate(model2, test_dataloader, device='cuda')
print(report_2)

EVAL: 100%|██████████| 313/313 [00:06<00:00, 46.56it/s]


              precision    recall  f1-score   support

           0     0.8762    0.9533    0.9131     40499
           1     0.5203    0.8473    0.6447      3543
           2     0.9217    0.8828    0.9018      7544
           3     0.8046    0.3871    0.5227      4074
           4     0.8583    0.6718    0.7537      8008
           5     0.7476    0.5781    0.6520      4560
           6     0.8302    0.6712    0.7423      3060

    accuracy                         0.8405     71288
   macro avg     0.7941    0.7131    0.7329     71288
weighted avg     0.8470    0.8405    0.8343     71288



In [46]:
report_3, _ = evaluate(model3, test_dataloader, device='cuda')
print(report_3)

EVAL: 100%|██████████| 313/313 [00:05<00:00, 57.36it/s]


              precision    recall  f1-score   support

           0     0.9307    0.9663    0.9482     40499
           1     0.8367    0.8589    0.8476      3543
           2     0.8759    0.9275    0.9010      7544
           3     0.7917    0.6109    0.6897      4074
           4     0.8523    0.7551    0.8008      8008
           5     0.8244    0.7627    0.7923      4560
           6     0.7682    0.7915    0.7797      3060

    accuracy                         0.8923     71288
   macro avg     0.8400    0.8104    0.8227     71288
weighted avg     0.8897    0.8923    0.8896     71288



In [74]:
test_sentences = ['Её зовут Елена Белая, она работает в белой больнице недалеко от Белого дома.',
                  'Иван Иванович Сидоров открыл Сидоров банк в Иваново.',
                  'Она зашла в аптеку «Гиппократ», где встретила доктора Гиппократа']

In [93]:

def predict_tags(text, word2id, id2label, model, device='cpu'):
    tokens = [token for token in text.split()]
    ids = [word2id.get(t, 1) for t in tokens]

    model.eval()
    with torch.no_grad():
        logits = model(torch.LongTensor([ids]).to(DEVICE), torch.LongTensor([len(ids)]).to(DEVICE))
        preds = logits.argmax(2)[0].cpu().tolist()[:len(tokens)]

    return list(zip(tokens, [id2label[p] for p in preds]))

In [94]:
tag2id

{'O': 0,
 'B-PER': 1,
 'I-PER': 2,
 'B-ORG': 3,
 'I-ORG': 4,
 'B-LOC': 5,
 'I-LOC': 6,
 'PAD': 7}

In [95]:
for text in test_sentences:
    print(text)
    answer = predict_tags(text, word2id, id2tag, model1)
    for tag in answer:
        print(tag)
    print()

Её зовут Елена Белая, она работает в белой больнице недалеко от Белого дома.
('Её', 'O')
('зовут', 'O')
('Елена', 'O')
('Белая,', 'O')
('она', 'O')
('работает', 'O')
('в', 'O')
('белой', 'O')
('больнице', 'O')
('недалеко', 'O')
('от', 'O')
('Белого', 'O')
('дома.', 'I-ORG')

Иван Иванович Сидоров открыл Сидоров банк в Иваново.
('Иван', 'O')
('Иванович', 'O')
('Сидоров', 'O')
('открыл', 'O')
('Сидоров', 'O')
('банк', 'I-ORG')
('в', 'O')
('Иваново.', 'O')

Она зашла в аптеку «Гиппократ», где встретила доктора Гиппократа
('Она', 'O')
('зашла', 'O')
('в', 'O')
('аптеку', 'O')
('«Гиппократ»,', 'O')
('где', 'O')
('встретила', 'O')
('доктора', 'O')
('Гиппократа', 'I-ORG')



Не распознана Елена Белая: вероятно, модель не определила шаблон Фамилия/Имя
Белый дом отмечен как I‑ORG, но без начала (нет B‑ORG).

При этом в принципе наблюдается, что модель слишком часто и неверно выбирает ORG тег: Иваново ошибочно получили класс I‑ORG. В Сидоров банк, хотя начало неправильное и должно быть B-ORG, угадалась организация. То же самое с доктором Гиппократом.

In [96]:
for text in test_sentences:
    print(text)
    answer = predict_tags(text, word2id, id2tag, model2)
    for tag in answer:
        print(tag)
    print()

Её зовут Елена Белая, она работает в белой больнице недалеко от Белого дома.
('Её', 'O')
('зовут', 'O')
('Елена', 'O')
('Белая,', 'O')
('она', 'O')
('работает', 'O')
('в', 'O')
('белой', 'B-LOC')
('больнице', 'O')
('недалеко', 'O')
('от', 'O')
('Белого', 'O')
('дома.', 'O')

Иван Иванович Сидоров открыл Сидоров банк в Иваново.
('Иван', 'B-ORG')
('Иванович', 'I-PER')
('Сидоров', 'I-ORG')
('открыл', 'O')
('Сидоров', 'O')
('банк', 'I-ORG')
('в', 'O')
('Иваново.', 'B-LOC')

Она зашла в аптеку «Гиппократ», где встретила доктора Гиппократа
('Она', 'O')
('зашла', 'O')
('в', 'O')
('аптеку', 'B-LOC')
('«Гиппократ»,', 'O')
('где', 'O')
('встретила', 'O')
('доктора', 'O')
('Гиппократа', 'O')



Ошибки:

1) пропущено имя (Елена Белая — всё O).

2) "белой" ошибочно помечено как B‑LOC — модель спутала описание места с топонимом.

3) "аптеку" помечено как B‑LOC, что неверно (это не географический объект).

4) Иван – B-ORG, а Иванович уже верно PER

Названия (в кавычках и фамилия) — всё O.

Модель путает типы (LOC вместо ORG), не умеет объединять правильно многословные имена

In [97]:
for text in test_sentences:
    print(text)
    answer = predict_tags(text, word2id, id2tag, model3)
    for tag in answer:
        print(tag)
    print()

Её зовут Елена Белая, она работает в белой больнице недалеко от Белого дома.
('Её', 'O')
('зовут', 'O')
('Елена', 'O')
('Белая,', 'O')
('она', 'O')
('работает', 'O')
('в', 'O')
('белой', 'O')
('больнице', 'O')
('недалеко', 'O')
('от', 'O')
('Белого', 'B-ORG')
('дома.', 'I-ORG')

Иван Иванович Сидоров открыл Сидоров банк в Иваново.
('Иван', 'O')
('Иванович', 'I-PER')
('Сидоров', 'I-PER')
('открыл', 'O')
('Сидоров', 'O')
('банк', 'O')
('в', 'O')
('Иваново.', 'O')

Она зашла в аптеку «Гиппократ», где встретила доктора Гиппократа
('Она', 'O')
('зашла', 'O')
('в', 'O')
('аптеку', 'O')
('«Гиппократ»,', 'O')
('где', 'O')
('встретила', 'O')
('доктора', 'O')
('Гиппократа', 'O')



Белый дом – распознан верно, имя во втором частично правильно, хоть и без начала. С остальным модель не справилась.