In [1]:
# !wget get https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.bin.gz
import torch
torch.cuda.is_available()


True

In [None]:
# !git clone https://github.com/facebookresearch/fastText.git
# %cd fastText
# !pip install .
# !cp fasttext ../
# %cd ..


In [3]:
import fasttext
# !gunzip cc.ru.300.bin.gz


In [6]:
ft = fasttext.load_model('cc.ru.300.bin')


In [8]:
# fasttext.util.reduce_model(ft, 100)
ft.get_dimension()


300

In [9]:
# ft.save_model('cc.ru.100.bin')


In [12]:
import fasttext
import numpy as np
import torch

from sklearn.metrics import classification_report
from torch.utils.tensorboard import SummaryWriter
from tqdm.notebook import tqdm


In [13]:
# not used
# class TokenizerFastText(object):
#     def __init__(self, ft, tokenizer):
#         self.ft = ft
#         self.tokenizer = tokenizer
#     def __call__(self, sentences, max_length = 10, pad_to_max_length = False):
#         tokens = self.tokenizer.tokenize_sents(sentences)
#         if not pad_to_max_length:
#             max_length = min(max_length, max(map(len, tokens)))
#         tokens = [['[CLS]']+s+['[SEP]'] + ['[PAD]']*(max_length-len(s)) \
#                   if len(s) < max_length \
#                   else ['[CLS]']+s[:max_length]+['[SEP]'] \
#                   for s in tokens ]
#         vectors = [[self.ft.get_word_vector(w) for w in sent] for sent in tokens]
#         return torch.tensor(vectors)


In [14]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device


'cuda'

In [16]:
import warnings
warnings.filterwarnings("ignore")
!pip install nerus


Defaulting to user installation because normal site-packages is not writeable


In [18]:
import nerus as ns
from transformers import AutoTokenizer
from transformers import AutoModel

MAX_SENTENCE_LENGTH = 50

class NerusTokenizer:
    ids_to_word: dict
    word_next_id: int = 0
    tag_to_id: dict
    lstm_tokens: list[str] = ["[PAD]", "[CLS]", "[SEP]", "[UNK]"]
    def __init__(self, ft):
        self.tag_to_id = {}
        self.ids_to_word = {}
        for sys_id, sys_tag in enumerate(self.lstm_tokens):
            self.ids_to_word[sys_id] = sys_tag
            self.tag_to_id[sys_tag] = sys_id
        self.word_next_id = self.lstm_tokens.__len__()
        self.ft = ft

                                                                      # tokenized sentences   # tags strings
    def tokenize_sent_pos_tagging(self, sents: list[ns.NerusSent]) -> tuple[list[list[int]], list[list[str]]]:
        sentences_ids = []
        pos_tags_txts = []

        for sent in sents:
            sentence_id = [self.ft.get_word_vector("[CLS]")]
            pos_tag_txt = ["[CLS]"]
            for token in sent.tokens:
                word_text = token.text
                pos_tag   = token.tag

                word_id = self.ft.get_word_vector(word_text)
                sentence_id.append(word_id)
                pos_tag_txt.append(pos_tag)

            assert sentence_id.__len__() == pos_tag_txt.__len__()

            # separating and padding
            pad_id = self.ft.get_word_vector("[PAD]")
            sep_id = self.ft.get_word_vector("[SEP]")

            if len(sentence_id) < MAX_SENTENCE_LENGTH:
                sentence_id.append(sep_id)
                sentence_id.extend([pad_id]  * (MAX_SENTENCE_LENGTH - len(sentence_id)))
                pos_tag_txt.append("[SEP]")
                pos_tag_txt.extend(["[PAD]"] * (MAX_SENTENCE_LENGTH - len(pos_tag_txt)))
            else:
                sentence_id = sentence_id[:MAX_SENTENCE_LENGTH]
                sentence_id[-1] = sep_id
                pos_tag_txt = pos_tag_txt[:MAX_SENTENCE_LENGTH]
                pos_tag_txt[-1] = "[SEP]"

            assert sentence_id.__len__() == pos_tag_txt.__len__()

            sentences_ids.append(sentence_id)
            pos_tags_txts.append(pos_tag_txt)

        return sentences_ids, pos_tags_txts

    def expand_pos_tagging_vocab(self, sents: ns.NerusSent):
        new_tags = set()
        for sent in sents:
            for token in sent.tokens:
                new_tags.add(token.tag)
        new_tags = [tag for tag in new_tags if tag not in self.tag_to_id.keys()]
        for tag in new_tags:
            self.tag_to_id[tag] = self.tag_to_id.__len__()
            self.ids_to_word[self.tag_to_id[tag]] = tag

    def tokenize_pos_tags(self, text_tags: list[list[str]]) -> list[list[int]]:
        tags_id = []
        for sent_tags in text_tags:
            sent_tags_id = []
            for tag in sent_tags:
                if tag not in self.tag_to_id.keys():
                    msg = f"Bad tag when put ids on tags; TAG: {tag}"
                    raise RuntimeError(msg)
                sent_tags_id.append(self.tag_to_id[tag])
            tags_id.append(sent_tags_id)

        return tags_id


In [19]:
def train_on_batch(model, x_batch, y_batch, optimizer, loss_function):
    model.train()
    optimizer.zero_grad()

    output = model(x_batch.to(device))

    # need to adjust output sizes to evaluate loss_function on batch
    output  = output.reshape(-1, output.shape[2])
    y_batch = y_batch.reshape(-1)

    loss = loss_function(output, y_batch.to(model.device))
    loss.backward()

    optimizer.step()
    return loss.cpu().item()


In [20]:

def train_epoch(generator, model, loss_function, optimizer, callback = None):
    epoch_loss = 0
    total = 0
    for batch_of_x, batch_of_y in generator:

        batch_loss = train_on_batch(model, batch_of_x, batch_of_y, optimizer, loss_function)

        if callback is not None:
            with torch.no_grad():
                callback(model, batch_loss)

        epoch_loss += batch_loss*len(batch_of_x)
        total += len(batch_of_x)

    return epoch_loss/total


In [21]:
def trainer(count_of_epoch,
            batch_size,
            dataset,
            model,
            loss_function,
            optimizer,
            lr = 0.001,
            callback = None):

    optima = optimizer(model.parameters(), lr=lr)

    iterations = tqdm(range(count_of_epoch), desc='epoch', leave=False)
    iterations.set_postfix({'train epoch loss': np.nan})
    for it in iterations:
        batch_generator = tqdm(
            torch.utils.data.DataLoader(dataset=dataset,
                                        batch_size=batch_size,
                                        shuffle=True, pin_memory=True),
            leave=False, total=len(dataset)//batch_size+(len(dataset)%batch_size>0))
        epoch_loss = train_epoch (
                    generator=batch_generator,
                    model=model,
                    loss_function=loss_function,
                    optimizer=optima,
                    callback=callback)

        iterations.set_postfix({'train epoch loss': epoch_loss})


In [22]:
def evaluate_model_score(model, dataset, batch_size: int, loss_function, device = 'cpu'):
    batch_generator = torch.utils.data.DataLoader(dataset, batch_size=batch_size, pin_memory=True)

    pred = []
    real = []
    test_loss = 0
    total_samples = 0

    for x_batch, y_batch in batch_generator:
        output = model(x_batch.to(device))

        output_loss  = output.reshape(-1, output.shape[2])
        y_batch = y_batch.reshape(-1)

        #evaluate loss on batch
        batch_loss = loss_function(output_loss, y_batch.to(model.device))
        test_loss += batch_loss.cpu().item() * len(x_batch)
        total_samples += len(x_batch)

        # add prediction results for classification report
        output_report = torch.argmax(output, -1)
        pred.extend(torch.reshape(output_report, (-1,)).cpu().numpy())
        real.extend(y_batch.cpu().numpy())

    pred = np.array(pred)
    real = np.array(real)

    # need to mask and remove from real tags PAD, CLS, SEP, they are needed only to support LSTM architecture
    mask = np.isin(real, [0, 1, 2], invert=True)
    return classification_report(real[mask], pred[mask]), test_loss / total_samples


In [15]:
# !wget https://storage.yandexcloud.net/natasha-nerus/data/nerus_lenta.conllu.gz


--2025-04-21 02:01:49--  https://storage.yandexcloud.net/natasha-nerus/data/nerus_lenta.conllu.gz
Resolving storage.yandexcloud.net (storage.yandexcloud.net)... 213.180.193.243, 2a02:6b8::1d9
Connecting to storage.yandexcloud.net (storage.yandexcloud.net)|213.180.193.243|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1961465886 (1.8G) [application/octet-stream]
Saving to: ‘nerus_lenta.conllu.gz.4’


2025-04-21 02:05:31 (8.49 MB/s) - ‘nerus_lenta.conllu.gz.4’ saved [1961465886/1961465886]



In [56]:
from itertools import islice as head
from nerus import load_nerus

MAX_SAMPLES = 1000

dataset = head(load_nerus("nerus_lenta.conllu.gz"), MAX_SAMPLES)


In [57]:
from pprint import pprint
for i, sample in enumerate(dataset):
    for sent in sample.sents:
        print([i.text for i in sent.tokens])
        pprint([(i.id, i.text, i.tag) for i in sent.tokens])
        break
    break


['Вице-премьер', 'по', 'социальным', 'вопросам', 'Татьяна', 'Голикова', 'рассказала', ',', 'в', 'каких', 'регионах', 'России', 'зафиксирована', 'наиболее', 'высокая', 'смертность', 'от', 'рака', ',', 'сообщает', 'РИА', 'Новости', '.']
[('1', 'Вице-премьер', 'O'),
 ('2', 'по', 'O'),
 ('3', 'социальным', 'O'),
 ('4', 'вопросам', 'O'),
 ('5', 'Татьяна', 'B-PER'),
 ('6', 'Голикова', 'I-PER'),
 ('7', 'рассказала', 'O'),
 ('8', ',', 'O'),
 ('9', 'в', 'O'),
 ('10', 'каких', 'O'),
 ('11', 'регионах', 'O'),
 ('12', 'России', 'B-LOC'),
 ('13', 'зафиксирована', 'O'),
 ('14', 'наиболее', 'O'),
 ('15', 'высокая', 'O'),
 ('16', 'смертность', 'O'),
 ('17', 'от', 'O'),
 ('18', 'рака', 'O'),
 ('19', ',', 'O'),
 ('20', 'сообщает', 'O'),
 ('21', 'РИА', 'B-ORG'),
 ('22', 'Новости', 'I-ORG'),
 ('23', '.', 'O')]


In [31]:
nerus_tokenizer = NerusTokenizer(ft)

for sample in dataset:
    nerus_tokenizer.expand_pos_tagging_vocab(sample.sents)

print(nerus_tokenizer.tag_to_id)
print(nerus_tokenizer.ids_to_word)

dataset_words_id = []
dataset_tags_id = []
dataset = head(load_nerus("nerus_lenta.conllu.gz"), MAX_SAMPLES)
for i, sample in enumerate(dataset):
    words_id, tags = nerus_tokenizer.tokenize_sent_pos_tagging(sample.sents)
    tags_id = nerus_tokenizer.tokenize_pos_tags(tags)

    dataset_words_id.extend(words_id)
    dataset_tags_id.extend(tags_id)

print(nerus_tokenizer.ids_to_word)
print(nerus_tokenizer.tag_to_id)
print(f"Tokenized: {dataset_words_id[0][0]}")


{'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[UNK]': 3, 'B-LOC': 4, 'O': 5, 'I-PER': 6, 'B-PER': 7, 'B-ORG': 8, 'I-ORG': 9, 'I-LOC': 10}
{0: '[PAD]', 1: '[CLS]', 2: '[SEP]', 3: '[UNK]', 4: 'B-LOC', 5: 'O', 6: 'I-PER', 7: 'B-PER', 8: 'B-ORG', 9: 'I-ORG', 10: 'I-LOC'}
{0: '[PAD]', 1: '[CLS]', 2: '[SEP]', 3: '[UNK]', 4: 'B-LOC', 5: 'O', 6: 'I-PER', 7: 'B-PER', 8: 'B-ORG', 9: 'I-ORG', 10: 'I-LOC'}
{'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[UNK]': 3, 'B-LOC': 4, 'O': 5, 'I-PER': 6, 'B-PER': 7, 'B-ORG': 8, 'I-ORG': 9, 'I-LOC': 10}
Tokenized: [ 0.04495002 -0.00612148  0.0189565  -0.00339077 -0.03292686  0.08229555
  0.05068677  0.0485021   0.04732766 -0.03630793 -0.01181982 -0.03532731
  0.01627408  0.01166721  0.14996576  0.02862255 -0.03039438 -0.05383234
  0.06767236 -0.06728604 -0.06970047  0.04501471 -0.0427916   0.0008449
  0.0287628   0.07793065 -0.05218444  0.15594816 -0.09688261 -0.00739052
  0.03250176 -0.03479643  0.02306124 -0.03193174  0.13146067  0.11108675
  0.02468847  0.00468462 -0.0239

In [33]:
print(nerus_tokenizer.tag_to_id)
# print(nerus_tokenizer.id_)
#print(dataset_tags_id)


{'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[UNK]': 3, 'B-LOC': 4, 'O': 5, 'I-PER': 6, 'B-PER': 7, 'B-ORG': 8, 'I-ORG': 9, 'I-LOC': 10}


In [34]:
from sklearn.model_selection import train_test_split

tags_tensor  = torch.tensor(dataset_tags_id)
sents_tensor = torch.tensor(dataset_words_id)

words_train, words_test = train_test_split(sents_tensor, test_size=0.2, random_state=52)
tags_train, tags_test = train_test_split(tags_tensor, test_size=0.2, random_state=52)

train_dataset_pt = torch.utils.data.TensorDataset(words_train, tags_train.long())
test_dataset_pt = torch.utils.data.TensorDataset(words_test, tags_test.long())

print(tags_tensor.shape)
print(sents_tensor.shape)


torch.Size([11645, 50])
torch.Size([11645, 50, 300])


In [35]:
class RNNclassifier(torch.nn.Module):
    @property
    def device(self):
        return next(self.parameters()).device

    def __init__(
            self,
            output_dim,
            emb_dim=300,
            hidden_dim=10,
            num_layers=3,
            bidirectional=False,
            p=0.7,
            batchnorm=False):
        super(RNNclassifier, self).__init__()

        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.num_layers = num_layers
        self.biderect = bidirectional
        self.dropout = p
        self.batchnorm = batchnorm

        self.lstm = torch.nn.LSTM(
            emb_dim,
            hidden_dim,
            num_layers,
            bidirectional=bidirectional,
            batch_first=True,
            dropout=p)

        if self.batchnorm:
            self.normalization = torch.nn.BatchNorm1d(MAX_SENTENCE_LENGTH)

        self.linear = torch.nn.Linear(int(bidirectional + 1)*hidden_dim, output_dim)

    def forward(self, input):
        #print(input.size(-1))
        if input.size(-1) != 300:
          global err_cnt
          err_cnt += 1
          print(err_cnt)
          return torch.zeros(self.output_dim)
          #print(input.shape)
          #print(input)
        lstm_out, _ = self.lstm(input.float())
        if self.batchnorm:
            lstm_out = self.normalization(lstm_out)

        return self.linear(lstm_out)

    def __str__(self):
        return f"layer_size={self.hidden_dim}" +     \
               f"_layers_num={self.num_layers}" +    \
               f"_bidirect={int(self.biderect)}" +   \
               f"_dropout={self.dropout}" +          \
               f"_batchnorm={int(self.batchnorm)}"


In [36]:
class TensorboardCallback:
    def __init__(self, writer, test_dataset, loss_function, delimeter=100, batch_size=64):
        self.step = 0
        self.writer = writer
        self.delimeter = delimeter
        self.loss_function = loss_function
        self.batch_size = batch_size
        self.test_dataset = test_dataset

    def forward(self, model, loss):
        self.step += 1
        self.writer.add_scalar(f'LOSS/train/{model}', loss, self.step)

        if self.step % self.delimeter == 0:
            self.writer.add_graph(model, torch.zeros(1, MAX_SENTENCE_LENGTH, dtype=torch.long).to(model.device))

            model_metrics, model_loss = evaluate_model_score(model, self.test_dataset, self.batch_size, self.loss_function, model.device)

            self.writer.add_scalar(f'LOSS/test/{model}', model_loss, self.step)
            self.writer.add_text(f'METRICS/{model}', str(model_metrics), self.step)

    def __call__(self, model, loss):
        return self.forward(model, loss)


In [37]:
config = dict()
config['output_dim'] = nerus_tokenizer.tag_to_id.__len__()
config['emb_dim'] = 300
config['hidden_dim'] = 64
config['num_layers'] = 2
config['bidirectional'] = False
config['batchnorm'] = False
config['p'] = 0.2


In [41]:
err_cnt = 0

model = RNNclassifier(**config).to(device)

loss_function = torch.nn.CrossEntropyLoss(ignore_index=nerus_tokenizer.tag_to_id["[PAD]"])
optimizer = torch.optim.Adam

writer = SummaryWriter(log_dir=f'tensorboard/layer_width/testtrain')
nerus_callback = TensorboardCallback(writer, test_dataset_pt, loss_function)

trainer(count_of_epoch=10,
    batch_size=64,
    dataset=train_dataset_pt,
    model=model,
    loss_function=loss_function,
    optimizer=optimizer,
    lr=0.001,
    callback=nerus_callback)


epoch:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/146 [00:00<?, ?it/s]

1
2
3


  0%|          | 0/146 [00:00<?, ?it/s]

4
5
6


  0%|          | 0/146 [00:00<?, ?it/s]

7
8
9
10
11
12


  0%|          | 0/146 [00:00<?, ?it/s]

13
14
15


  0%|          | 0/146 [00:00<?, ?it/s]

16
17
18
19
20
21


  0%|          | 0/146 [00:00<?, ?it/s]

22
23
24


  0%|          | 0/146 [00:00<?, ?it/s]

25
26
27
28
29
30


  0%|          | 0/146 [00:00<?, ?it/s]

31
32
33


  0%|          | 0/146 [00:00<?, ?it/s]

34
35
36
37
38
39


  0%|          | 0/146 [00:00<?, ?it/s]

40
41
42


In [None]:
device


In [42]:
%load_ext tensorboard


In [43]:
%tensorboard --logdir tensorboard/layer_width/testtrain


In [59]:
model.eval()
test_data = ['[CLS]',
 'Вице-премьер', 'по', 'социальным', 'вопросам', 'Татьяна', 'Голикова', 'рассказала', ',', 'в', 'каких', 'регионах', 'России', 'зафиксирована', 'наиболее', 'высокая', 'смертность', 'от', 'рака', ',', 'сообщает', 'РИА', 'Новости', '.',

 '[SEP]']
tok_test_data = torch.tensor([ft.get_word_vector(i) for i in test_data])
print(tok_test_data.shape)
output = model(tok_test_data.to(device))
output = torch.argmax(output, dim=1)
print(output)
print(nerus_tokenizer.lstm_tokens)
print(nerus_tokenizer.ids_to_word)

from pprint import pprint
pprint([(w, nerus_tokenizer.ids_to_word[int(t)]) for w, t in zip(test_data, output)])
#print([nerus_tokenizer.ids_to_word[int(i)] for i in output]
#print([nerus_tokenizer.ids_to_word[int(i)] for i in output])

model.train()


torch.Size([25, 300])
tensor([1, 5, 5, 5, 5, 7, 6, 5, 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 8, 9, 5,
        2], device='cuda:0')
['[PAD]', '[CLS]', '[SEP]', '[UNK]']
{0: '[PAD]', 1: '[CLS]', 2: '[SEP]', 3: '[UNK]', 4: 'B-LOC', 5: 'O', 6: 'I-PER', 7: 'B-PER', 8: 'B-ORG', 9: 'I-ORG', 10: 'I-LOC'}
[('[CLS]', '[CLS]'),
 ('Вице-премьер', 'O'),
 ('по', 'O'),
 ('социальным', 'O'),
 ('вопросам', 'O'),
 ('Татьяна', 'B-PER'),
 ('Голикова', 'I-PER'),
 ('рассказала', 'O'),
 (',', 'O'),
 ('в', 'O'),
 ('каких', 'O'),
 ('регионах', 'O'),
 ('России', 'B-LOC'),
 ('зафиксирована', 'O'),
 ('наиболее', 'O'),
 ('высокая', 'O'),
 ('смертность', 'O'),
 ('от', 'O'),
 ('рака', 'O'),
 (',', 'O'),
 ('сообщает', 'O'),
 ('РИА', 'B-ORG'),
 ('Новости', 'I-ORG'),
 ('.', 'O'),
 ('[SEP]', '[SEP]')]


RNNclassifier(
  (lstm): LSTM(300, 64, num_layers=2, batch_first=True, dropout=0.2)
  (linear): Linear(in_features=64, out_features=11, bias=True)
)