In [1]:
import tqdm
import torch
import torch.nn as nn
import numpy as np
from matplotlib import pyplot as plt
import torchvision

In [4]:
!pip install nerus

^C
[31mERROR: Operation cancelled by user[0m[31m
[0m

Загрузим датасет NERUS через предложенный интерфейс

In [2]:
NEURUS = './nerus_lenta.conllu.gz'
from nerus import load_nerus
docs = load_nerus(NEURUS)

In [3]:
doc = next(docs)

Посмотрим как выглядит элемент датасета

In [5]:
doc.sents[0].tokens[0]

NerusToken(
    id='1',
    text='Вице-премьер',
    pos='NOUN',
    feats={'Animacy': 'Anim',
     'Case': 'Nom',
     'Gender': 'Masc',
     'Number': 'Sing'},
    head_id='7',
    rel='nsubj',
    tag='O'
)

Создадим класс словаря для pos_tags. Для каждого встретившегося нового тега используем следующий новый номер. Не забудем добавить технические теги для начала, конца и паддинга.

In [3]:
class PosesVocabulary:
    def __init__(self):
        self.idx2pos = {0: '[PAD]', 1: '[CLS]', 2: '[SEP]'}
        self.pos2idx = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2}
        self.pos_tags = ['ADJ', 'ADV', 'INTJ', 'NOUN', 'PROPN',
                         'VERB', 'ADP', 'AUX', 'CCONJ', 'DET', 'NUM',
                         'PART', 'PRON', 'SCONJ', 'PUNCT', 'SYM', 'X']

    def __len__(self):
        return len(self.idx2pos)

    def build_vocabulary(self):
        idx = 3
        for pos in self.pos_tags:
            self.idx2pos[idx] = pos
            self.pos2idx[pos] = idx
            idx += 1

    def numericalize(self, poses):
        return [self.pos2idx[pos] for pos in poses]

В целом тот же самый словарь делаем и для слов. Единственное для задание нужно уметь изменять размер словаря. Для этого мы используем кучу, в которй будем хранить чило раз, когда каждое слово встретилось. Соотвественно, если мы хотим сделать соварь из n слов, то берем n самых частых слов.

In [4]:
import heapq
class WordsVocabulary:
    def __init__(self, freq_threshold):
        self.idx2word = {0: '[PAD]', 1: '[CLS]', 2: '[SEP]', 3: '[UNK]'}
        self.word2idx = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[UNK]': 3}
        self.freq_threshold = freq_threshold
        self.frequencies = {}

    def __len__(self):
        return len(self.idx2word)

    def build_vocabulary(self, sents, voc_size = None):
        idx = len(self.word2idx)
        for sent in sents:
            for word in sent:
                if word not in self.frequencies:
                    self.frequencies[word] = 1
                else:
                    self.frequencies[word] += 1
        
        #Отличие здесь только в том, что с помощью кучи берем n самых встречаемых
        if voc_size == None:
            for sent in sents:
                for word in sent:
                    if self.frequencies[word] >= self.freq_threshold:
                        self.word2idx[word] = idx
                        self.idx2word[idx] = word
                        idx += 1
        else:
            words_sorted_by_frequencies = heapq.nlargest(voc_size - len(self.idx2word), self.frequencies, key=self.frequencies.get)
            for word in words_sorted_by_frequencies:
                self.word2idx[word] = idx
                self.idx2word[idx] = word
                idx += 1
    def rebuild_vocabulary(self, voc_size):
        #На случай если нужно поменять размер словаря
        self.idx2word = {0: '[PAD]', 1: '[CLS]', 2: '[SEP]', 3: '[UNK]'}
        self.word2idx = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[UNK]': 3}
        idx = len(self.word2idx)
        words_sorted_by_frequencies = heapq.nlargest(voc_size - len(self.idx2word), self.frequencies, key=self.frequencies.get)
        for word in words_sorted_by_frequencies:
            self.word2idx[word] = idx
            self.idx2word[idx] = word
            idx += 1
    def numericalize(self, tokens):
        return [self.word2idx[token] if token in self.word2idx else self.word2idx['[UNK]']
                for token in tokens]
        

Создадим класс dataset для данных из nerus. Все документы пердзагрузим в токенезироанныом виде, каждому токену сопостаим его тег. По номеру элемента датасета мы должны уметь выдавать предложенитя в приведеном виде, так чтобы все элементы имели одну длину, если нужно обрезать их до одной длины, если нужно их дополнить [PAD], что необхдимо для объединения их в батчи.

In [5]:
from torch.utils.data import DataLoader, Dataset
class Nerus(Dataset):
    def __init__(self, tokenizer,  n_docs=1000, filename=NEURUS, freq_threshold=1, pad_to_max_length = False, max_length = 20, voc_size = None):
        docs_generator = load_nerus(filename)
        docs = [next(docs_generator) for _ in range(n_docs)]
        # array of splitted sentences for each document from docs
        self.sents = []
        self.sents_poses = []
        for doc in docs:
            for sent in doc.sents:
                sents = []
                poses = []
                for token in sent.tokens:
                    sents+=tokenizer.tokenize(token.text)
                    
                    t = ['[PAD]'] * len(tokenizer.tokenize(token.text))
                    t[-1] = token.pos
                    poses +=t
                self.sents.append(sents)
                self.sents_poses.append(poses)
        
        # initialize vocabularies and build them
        self.words_vocab = WordsVocabulary(freq_threshold)
        self.words_vocab.build_vocabulary(self.sents, voc_size = voc_size)
        self.poses_vocab = PosesVocabulary()
        self.poses_vocab.build_vocabulary()
        self.pad_to_max_length =  pad_to_max_length
        self.max_length = max_length
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.sents)

    def __getitem__(self, idx):
        tokens = self.sents[idx]
        poses = self.sents_poses[idx]
        # tokens
        max_length = self.max_length
        if not self.pad_to_max_length:
            max_length = min(max_length, len(tokens))
        if len(self.words_vocab.numericalize(tokens)) < max_length :
            numericalized_tokens = [self.words_vocab.word2idx['[CLS]']]+ self.words_vocab.numericalize(tokens)+[self.words_vocab.word2idx['[SEP]']] + [self.words_vocab.word2idx['[PAD]']]*(max_length-len(tokens))
            numericalized_poses = [self.poses_vocab.pos2idx['[CLS]']]+ self.poses_vocab.numericalize(poses)+[self.poses_vocab.pos2idx['[SEP]']] + [self.poses_vocab.pos2idx['[PAD]']]*(max_length-len(tokens))          
        else :
            numericalized_tokens = [self.words_vocab.word2idx['[CLS]']]+ self.words_vocab.numericalize(tokens)[:max_length]+[self.words_vocab.word2idx['[SEP]']]
            numericalized_poses = [self.poses_vocab.pos2idx['[CLS]']]+ self.poses_vocab.numericalize(poses)[:max_length]+[self.poses_vocab.pos2idx['[SEP]']] 
        source = torch.tensor(numericalized_tokens)
        target = torch.tensor(numericalized_poses)
        
        return source, target

Токенайзер ипользуем предложенный в задании LaBSE.

In [6]:
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/LaBSE")

Инициализируем датасет в приведеном виде

In [7]:
dataset = Nerus(tokenizer, n_docs = 1000, max_length=100, pad_to_max_length=True, voc_size = 10000)

Посмотрим на полученный размер словаря

In [14]:
len(dataset.words_vocab.idx2word)

303641

Стандартные функции для обучения, примерно те же что и впредыдущем задании. Т.к. у нас все та же задача классификации.

In [8]:
def train_on_batch(model, x_batch, y_batch, optimizer, loss_function):
    model.train()
    optimizer.zero_grad()
    
    output = model(x_batch.to(model.device))
    num_class = output.shape[-1]
    loss = loss_function(output.reshape(-1, num_class), y_batch.to(model.device).reshape(-1))
    loss.backward()

    optimizer.step()
    return loss.cpu().item()

In [9]:

def train_epoch(train_generator, model, loss_function, optimizer, callback = None):
    epoch_loss = 0
    total = 0
    for it, (batch_of_x, batch_of_y) in enumerate(train_generator):
        batch_loss = train_on_batch(model, batch_of_x, batch_of_y, optimizer, loss_function)
        
        if callback is not None:
            with torch.no_grad():
                callback(model, batch_loss)
            
        epoch_loss += batch_loss*len(batch_of_x)
        total += len(batch_of_x)
    
    return epoch_loss/total

In [10]:
def trainer(count_of_epoch, 
            batch_size, 
            dataset,
            model, 
            loss_function,
            optimizer,
            lr = 0.001,
            callback = None):

    optima = optimizer(model.parameters(), lr=lr)
    
    iterations = tqdm.tqdm(range(count_of_epoch), desc='epoch')
    iterations.set_postfix({'train epoch loss': np.nan})
    epoch_loss = None
    for it in iterations:
        batch_generator = tqdm.tqdm(
            torch.utils.data.DataLoader(dataset=dataset, 
                                        batch_size=batch_size, 
                                        shuffle=True, pin_memory=True), 
            leave=False, total=len(dataset)//batch_size+(len(dataset)%batch_size>0), position=1)
        
        epoch_loss = train_epoch(train_generator=batch_generator, 
                    model=model, 
                    loss_function=loss_function, 
                    optimizer=optima, 
                    callback=callback)
        
        iterations.set_postfix({'train epoch loss': epoch_loss})
    return epoch_loss

Посмотрим какие токены имеют отдельные слова

In [145]:
tokens = tokenizer(['man',  'mankind','человек', 'сова', 'глаза', 'lamp','лампа','лемма', 'страна', 'вице-премьер', 'экспропреированный', 'телевезионный'], padding=True,
                    truncation=True, 
                    max_length=512, 
                    return_tensors='pt')

In [68]:
tokens

{'input_ids': tensor([[   101,  15351,    102,      0,      0,      0,      0],
        [   101, 415562,    102,      0,      0,      0,      0],
        [   101,  23047,    102,      0,      0,      0,      0],
        [   101,  15373,  16522,    102,      0,      0,      0],
        [   101,  64861,    102,      0,      0,      0,      0],
        [   101,  68097,    102,      0,      0,      0,      0],
        [   101, 238225,    102,      0,      0,      0,      0],
        [   101,  86379, 293807,    102,      0,      0,      0],
        [   101,  20563,    102,      0,      0,      0,      0],
        [   101,  74795,    118,  58084,    102,      0,      0],
        [   101,  88753, 500786, 396721, 346325,  16846,    102],
        [   101,  19102,  25964,  24931, 280944,    102,      0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0

Небольшое предложение

In [71]:
tokens1 = tokenizer(['Это телевезионный'], padding=True,
                    truncation=True, 
                    max_length=512, 
                    return_tensors='pt')

In [73]:
tokens1.word_ids()

[None, 0, 1, 1, 1, 1, None]

In [76]:
tokens1.tokens()

['[CLS]', 'Это', 'теле', '##ве', '##зи', '##онный', '[SEP]']

Как это соотносится с их номерами

In [127]:
tokenizer.convert_tokens_to_ids(tokens1)

[88753, 500786, 396721, 346325, 16846]

In [None]:
tokenizer

In [111]:
tokens

{'input_ids': tensor([[   101,  15351,    102,      0,      0,      0,      0],
        [   101, 415562,    102,      0,      0,      0,      0],
        [   101,  23047,    102,      0,      0,      0,      0],
        [   101,  15373,  16522,    102,      0,      0,      0],
        [   101,  64861,    102,      0,      0,      0,      0],
        [   101,  68097,    102,      0,      0,      0,      0],
        [   101, 238225,    102,      0,      0,      0,      0],
        [   101,  86379, 293807,    102,      0,      0,      0],
        [   101,  20563,    102,      0,      0,      0,      0],
        [   101,  74795,    118,  58084,    102,      0,      0],
        [   101,  88753, 500786, 396721, 346325,  16846,    102],
        [   101,  19102,  25964,  24931, 280944,    102,      0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0

В качетстве классификатора использкем LSTM, которому еа входе подают эмбеддинги токенов, а ее выход подается в один полносвязанный слой. Выход сети представлется из себя вероятности тегов для токенов.

In [11]:

class RNNclassifier(torch.nn.Module):
    @property
    def device(self):
        return next(self.parameters()).device
    def __init__(self, vocab_dim, output_dim, emb_dim = 10, hidden_dim = 10, 
                 num_layers = 3, bidirectional = False, p=0.7, batch_norm = False):
        super(RNNclassifier, self).__init__()
        self.embedding = torch.nn.Embedding(vocab_dim, emb_dim)
        self.encoder = torch.nn.LSTM(emb_dim, hidden_dim, num_layers, 
                                     bidirectional=bidirectional, 
                                     batch_first=True, dropout=p)
        self.batch_norm = batch_norm
        if (self.batch_norm): 
            self.bn = nn.BatchNorm1d(int(bidirectional + 1)*hidden_dim)
        self.linear = torch.nn.Linear(
            int(bidirectional + 1)*hidden_dim, 
            output_dim)
        self.logsoftmax = nn.LogSoftmax()
    def forward(self, input):
        input = self.embedding(input)
        lstm_out, _ = self.encoder(input)

        if (self.batch_norm): 
            lstm_out = torch.transpose(lstm_out, -2, -1)
            lstm_out = self.bn(lstm_out)
            lstm_out = torch.transpose(lstm_out, -2, -1)
        act = self.linear(lstm_out)
        return act

In [12]:
class Tokenizer(object):
    def __init__(self, word_to_ind, tokenizer):
        self.word_to_ind = word_to_ind
        self.tokenizer = tokenizer
    def __call__(self, sentences, max_length = 10, pad_to_max_length = False):
        tokens = self.tokenizer.tokenize_sents(sentences)
        if not pad_to_max_length:
            max_length = min(max_length, max(map(len, tokens)))
        tokens = [['[CLS]']+s+['[SEP]'] + ['[PAD]']*(max_length-len(s)) \
                  if len(s) < max_length \
                  else ['[CLS]']+s[:max_length]+['[SEP]'] \
                  for s in tokens ]
        ids = [[self.word_to_ind.get(w, self.word_to_ind['[UNK]']) for w in sent] for sent in tokens]
        return torch.tensor(ids)

Обучаем на GPU

In [13]:
device = torch.device("cuda")

Для начала иницализурем сет ьна случано подобранных гиперпараметрах

In [16]:
config = dict()
config['vocab_dim'] = len(dataset.words_vocab.word2idx)
config['output_dim'] = len(dataset.poses_vocab.pos2idx)
config['emb_dim'] = 100
config['hidden_dim'] = 30
config['num_layers'] = 4
config['bidirectional'] = False
config['p'] = 0.7
config['batch_norm'] = True

model = RNNclassifier(**config)
_ = model.to(device)

In [45]:
config['output_dim']

20

In [46]:
len(dataset.words_vocab.word2idx)

10000

Проверять результаты классифкации будем на Recall, Accuracy и их среднем гармоническом F1-score

In [31]:
from sklearn.metrics import classification_report

In [115]:
dataset[0][0]

tensor([   1, 5659,   12,  603,   19, 9033,  100, 2997, 2813, 1409, 5660,   17,
         831,    4,    6, 2638, 1840,   44,   18, 1656, 1657, 3648, 2127, 9034,
         904,  856, 1464,   23, 7025,    4,   62,  475,  495,    5,    2,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0])

In [116]:
dataset[0][1].shape

torch.Size([102])

In [117]:
dataset[0][2]

IndexError: tuple index out of range

Помотрим как справлется с классификацией на датасете необученная сеть

In [48]:
batch_generator = torch.utils.data.DataLoader(dataset=dataset, 
                                              batch_size=64, 
                                              pin_memory=True)
            
pred = []
real = []
model.eval()
for its, (x_batch, y_batch) in enumerate(batch_generator):
    x_batch = x_batch.to(device)
    with torch.no_grad():
        output = model(x_batch)
    pred.extend(torch.argmax(output, dim=-1).cpu().numpy().flatten().tolist())
    real.extend(y_batch.cpu().numpy().flatten().tolist())
real = np.array(real)
pred = np.array(pred)
print(classification_report(real, pred, sample_weight= 1.0 * (real != 0)))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00  952216.0
           1       0.00      0.00      0.00   11645.0
           2       0.00      0.00      0.00   11645.0
           3       0.00      0.00      0.00   19021.0
           4       0.00      0.00      0.00    5405.0
           5       0.00      0.00      0.00      12.0
           6       0.00      0.00      0.00   58047.0
           7       0.00      0.00      0.00   15256.0
           8       0.00      0.00      0.00   24394.0
           9       0.02      1.00      0.04   24138.0
          10       0.00      0.00      0.00    1415.0
          11       0.00      0.00      0.00    5005.0
          12       0.00      0.00      0.00    3271.0
          13       0.00      0.00      0.00    3987.0
          14       0.00      0.00      0.00    2537.0
          15       0.00      0.00      0.00    7244.0
          16       0.00      0.00      0.00    3608.0
          17       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Аналогично предыдущему заданию исаользем в качестве функции потреь кросс-энтропию, а в качестве оптимизатора Adam

In [14]:
loss_function = torch.nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam

Посмотрим на качество обучения на этих значениях гиперпараметров

In [56]:
trainer(count_of_epoch=20, 
        batch_size=64, 
        dataset=dataset,
        model=model, 
        loss_function=loss_function,
        optimizer = optimizer,
        lr=0.001,
        callback=None)

epoch: 100%|██████████| 20/20 [00:22<00:00,  1.15s/it, train epoch loss=0.525]


0.5246788402952929

In [166]:
batch_generator = torch.utils.data.DataLoader(dataset=dataset, 
                                              batch_size=64, 
                                              pin_memory=True)
            
pred = []
real = []
model.eval()
for its, (x_batch, y_batch) in enumerate(batch_generator):
    x_batch = x_batch.to(device)
    with torch.no_grad():
        output = model(x_batch)
    pred.extend(torch.argmax(output, dim=-1).cpu().numpy().flatten().tolist())
    real.extend(y_batch.cpu().numpy().flatten().tolist())
real = np.array(real)
pred = np.array(pred)
print(classification_report(real, pred, sample_weight= 1.0 * (real != 0)))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00  952216.0
           1       1.00      1.00      1.00   11645.0
           2       0.49      0.99      0.66   11645.0
           3       0.07      0.12      0.09   19021.0
           4       0.47      0.12      0.20    5405.0
           5       0.00      0.00      0.00      12.0
           6       0.39      0.97      0.55   58047.0
           7       0.09      0.03      0.04   15256.0
           8       0.34      0.76      0.47   24394.0
           9       0.92      0.12      0.22   24138.0
          10       0.00      0.00      0.00    1415.0
          11       0.64      0.57      0.60    5005.0
          12       0.00      0.00      0.00    3271.0
          13       0.00      0.00      0.00    3987.0
          14       0.33      0.05      0.09    2537.0
          15       0.12      0.51      0.19    7244.0
          16       0.39      0.84      0.53    3608.0
          17       0.04    

  _warn_prf(average, modifier, msg_start, len(result))


Как видим для некторых классов повысилась не только точность угадываения Accuracy, но и доля угадыннх меток относительно всего класса Recall. Но вот с более редко встречающимися классами проблема, для них резултаты не поменялись и модель их просто не выдает.

In [149]:
real[0:10]

array([1, 0, 0, ..., 0, 0, 0])

In [148]:
pred

array([1, 6, 6, ..., 6, 6, 6])

Попробуем перебрать структуру самой сети с помощью optuna как в прошлом задании.

In [15]:
import tensorflow as tf
from tensorboard.plugins.hparams import api as hp

2024-04-03 04:52:51.096005: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Опять логгирование при обучении вынесем в отдельный интерфейс

In [16]:
class callback():
    def __init__(self, writer, dataset, loss_function, metrics, hparams,  delimeter = 100, batch_size=64):
        self.step = 0
        self.writer = writer
        self.delimeter = delimeter
        self.loss_function = loss_function
        self.batch_size = batch_size
        self.hparams = hparams
        self.dataset = dataset
        self.metrics = metrics

    def forward(self, model, loss):
        self.step += 1
        with self.writer.as_default():
            hp.hparams(self.hparams)
            tf.summary.scalar('Loss', loss, self.step)
        
        if self.step % self.delimeter == 0:
            
            batch_generator = torch.utils.data.DataLoader(dataset = self.dataset, 
                                                          batch_size=self.batch_size)
            
            pred = []
            real = []
            test_loss = 0
            model.eval()
            
            for its, (x_batch, y_batch) in enumerate(batch_generator):
                x_batch = x_batch.to(device)
                with torch.no_grad():
                    output = model(x_batch)
                pred.extend(torch.argmax(output, dim=-1).cpu().numpy().flatten().tolist())
                real.extend(y_batch.cpu().numpy().flatten().tolist())
            real = np.array(real)
            pred = np.array(pred)

            test_loss = self.loss_function(real, pred)
            with self.writer.as_default():
                tf.summary.scalar(self.metrics, test_loss, self.step)

            x = x_batch[-10:]
          
    def __call__(self, model, loss):
        return self.forward(model, loss)

In [17]:
import optuna
import optuna_dashboard

Зададим сетку на которой будем перебирать гиперпараметры

In [18]:
HP_VOCAB_DIM = hp.HParam('vocab_dim', hp.IntInterval(1* 1000, 10* 1000))
HP_DROPOUT = hp.HParam('dropout', hp.RealInterval(0.5, 0.8))
HP_BATCH_NORM = hp.HParam('batch_norm', hp.Discrete([True, False]))
HP_HIDDEN_DIM = hp.HParam('hidden_dim', hp.IntInterval(5, 30))
HP_EMB_DIM = hp.HParam('emb_dim', hp.IntInterval(50, 100))
HP_NUM_LAYERS = hp.HParam('num_layers', hp.IntInterval(1, 4))
METRICS_NAME = "Accuracy"
DELIMETER = 100
IMAGE_SIZE = 28
EPOCHS = 100
BATCH_SIZE  = 64
COLORS = 1
MIN_CHANNELS = 4

Каждый набор гиперпараметром логируем

In [19]:
logs_dir = 'logs/hparam_tuning'
writer_hparam = tf.summary.create_file_writer('logs/hparam_tuning')
with writer_hparam.as_default():
  hp.hparams_config(
    hparams=[HP_VOCAB_DIM,
      HP_EMB_DIM,
      HP_HIDDEN_DIM,
      HP_NUM_LAYERS,
      HP_BATCH_NORM,
      HP_DROPOUT],
    metrics=[hp.Metric(METRICS_NAME , display_name='Accuracy')],
  )

2024-04-03 04:52:57.684607: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2d:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-04-03 04:52:57.695076: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2d:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-04-03 04:52:57.695111: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2d:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-04-03 04:52:57.697117: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2d:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-04-03 04:52:57.697141: I tensorflow/compile

Самая простая метрика которую можн овзять - это точность

In [20]:
import sklearn

In [21]:
def prob_accuracy(pred, real):
    return sklearn.metrics.accuracy_score(real, pred, sample_weight= 1.0 * (real != 0))

In [2]:
%load_ext tensorboard
%tensorboard --logdir logs/hparam_tuning

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 40153), started 0:00:03 ago. (Use '!kill 40153' to kill it.)

Запустим перебор гиперпараметров с помощью optina, обернув это в функцию, которая по параметрам выдает значение метрики

In [22]:
NUM_TRIALS = 30
def objective(trial):
    run_number = trial.number
    run_name = logs_dir + f'/run-{run_number}'
    print("Current run: " + run_name)
    vocab_dim =  trial.suggest_int('vocab_dim', 1, 10) * 1000
    dropout = trial.suggest_float('dropout', 0.5, 0.8)
    emb_dim = trial.suggest_int('emb_dim', 50, 100)
    hidden_dim = trial.suggest_int('hidden_dim', 5, 30)
    batch_norm = trial.suggest_categorical('batch_norm', [True, False])
    num_layers = trial.suggest_int('layers_count', 1, 4)
    writer = tf.summary.create_file_writer(run_name)

    hparams = {
      HP_VOCAB_DIM: vocab_dim,
      HP_EMB_DIM:  emb_dim,
      HP_HIDDEN_DIM: hidden_dim,
      HP_NUM_LAYERS: num_layers,
      HP_BATCH_NORM: batch_norm,
      HP_DROPOUT: dropout
    }
    dataset.words_vocab.rebuild_vocabulary(hparams[HP_VOCAB_DIM])
    config = dict()
    config['vocab_dim'] = len(dataset.words_vocab.word2idx)
    config['output_dim'] = len(dataset.poses_vocab.pos2idx)
    config['emb_dim'] = hparams[HP_EMB_DIM]
    config['hidden_dim'] = hparams[HP_HIDDEN_DIM]
    config['num_layers'] = hparams[HP_NUM_LAYERS]
    config['bidirectional'] = False
    config['p'] = hparams[HP_DROPOUT]
    config['batch_norm'] = hparams[HP_BATCH_NORM]
    device = torch.device("cuda")
    rnn = RNNclassifier(**config)
    rnn = rnn.to(device)
    loss = trainer(count_of_epoch=EPOCHS, 
        batch_size=BATCH_SIZE, 
        dataset=dataset,
        model=rnn, 
        loss_function=torch.nn.CrossEntropyLoss(ignore_index=0),
        optimizer = torch.optim.Adam,
        lr=0.001,
        callback=callback(writer, dataset, prob_accuracy, METRICS_NAME , hparams, DELIMETER))
    
    print(loss)
    return loss
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=NUM_TRIALS)

[I 2024-04-03 04:53:10,962] A new study created in memory with name: no-name-30b81933-83e1-4bf9-b28a-b9aea9726222


Current run: logs/hparam_tuning/run-0


epoch: 100%|██████████| 100/100 [04:05<00:00,  2.46s/it, train epoch loss=0.828]
[I 2024-04-03 04:57:16,692] Trial 0 finished with value: 0.828410155464007 and parameters: {'vocab_dim': 6, 'dropout': 0.6916383320313331, 'emb_dim': 97, 'hidden_dim': 15, 'batch_norm': False, 'layers_count': 3}. Best is trial 0 with value: 0.828410155464007.


0.828410155464007
Current run: logs/hparam_tuning/run-1


epoch: 100%|██████████| 100/100 [03:51<00:00,  2.32s/it, train epoch loss=0.241]
[I 2024-04-03 05:01:08,573] Trial 1 finished with value: 0.2414513362175715 and parameters: {'vocab_dim': 3, 'dropout': 0.7539793858558304, 'emb_dim': 72, 'hidden_dim': 15, 'batch_norm': False, 'layers_count': 1}. Best is trial 0 with value: 0.828410155464007.


0.2414513362175715
Current run: logs/hparam_tuning/run-2


epoch: 100%|██████████| 100/100 [04:13<00:00,  2.53s/it, train epoch loss=0.435]
[I 2024-04-03 05:05:22,075] Trial 2 finished with value: 0.4352338940199479 and parameters: {'vocab_dim': 10, 'dropout': 0.5858516834162082, 'emb_dim': 89, 'hidden_dim': 29, 'batch_norm': False, 'layers_count': 4}. Best is trial 0 with value: 0.828410155464007.


0.4352338940199479
Current run: logs/hparam_tuning/run-3


epoch: 100%|██████████| 100/100 [03:49<00:00,  2.29s/it, train epoch loss=0.515]
[I 2024-04-03 05:09:11,420] Trial 3 finished with value: 0.5149787447748393 and parameters: {'vocab_dim': 1, 'dropout': 0.6156701810242755, 'emb_dim': 95, 'hidden_dim': 9, 'batch_norm': False, 'layers_count': 1}. Best is trial 0 with value: 0.828410155464007.


0.5149787447748393
Current run: logs/hparam_tuning/run-4


epoch: 100%|██████████| 100/100 [04:08<00:00,  2.49s/it, train epoch loss=0.587]
[I 2024-04-03 05:13:20,196] Trial 4 finished with value: 0.5870874889524467 and parameters: {'vocab_dim': 6, 'dropout': 0.5009906004958044, 'emb_dim': 93, 'hidden_dim': 12, 'batch_norm': False, 'layers_count': 3}. Best is trial 0 with value: 0.828410155464007.


0.5870874889524467
Current run: logs/hparam_tuning/run-5


epoch: 100%|██████████| 100/100 [04:02<00:00,  2.43s/it, train epoch loss=0.0159]
[I 2024-04-03 05:17:22,749] Trial 5 finished with value: 0.015899894417800608 and parameters: {'vocab_dim': 8, 'dropout': 0.5543456065149572, 'emb_dim': 65, 'hidden_dim': 25, 'batch_norm': True, 'layers_count': 1}. Best is trial 0 with value: 0.828410155464007.


0.015899894417800608
Current run: logs/hparam_tuning/run-6


epoch: 100%|██████████| 100/100 [04:16<00:00,  2.57s/it, train epoch loss=0.0706]
[I 2024-04-03 05:21:39,720] Trial 6 finished with value: 0.07062117609664471 and parameters: {'vocab_dim': 4, 'dropout': 0.6541551157409462, 'emb_dim': 95, 'hidden_dim': 25, 'batch_norm': True, 'layers_count': 1}. Best is trial 0 with value: 0.828410155464007.


0.07062117609664471
Current run: logs/hparam_tuning/run-7


epoch: 100%|██████████| 100/100 [04:05<00:00,  2.46s/it, train epoch loss=0.983]
[I 2024-04-03 05:25:45,657] Trial 7 finished with value: 0.9825970407922465 and parameters: {'vocab_dim': 4, 'dropout': 0.6695602738922346, 'emb_dim': 100, 'hidden_dim': 9, 'batch_norm': True, 'layers_count': 4}. Best is trial 7 with value: 0.9825970407922465.


0.9825970407922465
Current run: logs/hparam_tuning/run-8


epoch: 100%|██████████| 100/100 [04:12<00:00,  2.53s/it, train epoch loss=0.0161]
[I 2024-04-03 05:29:58,593] Trial 8 finished with value: 0.0160606201100709 and parameters: {'vocab_dim': 9, 'dropout': 0.5927551164548851, 'emb_dim': 69, 'hidden_dim': 18, 'batch_norm': True, 'layers_count': 1}. Best is trial 7 with value: 0.9825970407922465.


0.0160606201100709
Current run: logs/hparam_tuning/run-9


epoch: 100%|██████████| 100/100 [04:10<00:00,  2.51s/it, train epoch loss=0.255]
[I 2024-04-03 05:34:09,539] Trial 9 finished with value: 0.2551578123920616 and parameters: {'vocab_dim': 3, 'dropout': 0.7012781375577928, 'emb_dim': 55, 'hidden_dim': 9, 'batch_norm': True, 'layers_count': 1}. Best is trial 7 with value: 0.9825970407922465.


0.2551578123920616
Current run: logs/hparam_tuning/run-10


epoch: 100%|██████████| 100/100 [04:07<00:00,  2.47s/it, train epoch loss=1.83]
[I 2024-04-03 05:38:16,953] Trial 10 finished with value: 1.8275855116498168 and parameters: {'vocab_dim': 1, 'dropout': 0.7763459654879962, 'emb_dim': 83, 'hidden_dim': 6, 'batch_norm': True, 'layers_count': 4}. Best is trial 10 with value: 1.8275855116498168.


1.8275855116498168
Current run: logs/hparam_tuning/run-11


epoch: 100%|██████████| 100/100 [04:04<00:00,  2.44s/it, train epoch loss=1.85]
[I 2024-04-03 05:42:21,286] Trial 11 finished with value: 1.8487955672078606 and parameters: {'vocab_dim': 1, 'dropout': 0.7991440144916659, 'emb_dim': 83, 'hidden_dim': 5, 'batch_norm': True, 'layers_count': 4}. Best is trial 11 with value: 1.8487955672078606.


1.8487955672078606
Current run: logs/hparam_tuning/run-12


epoch: 100%|██████████| 100/100 [04:06<00:00,  2.46s/it, train epoch loss=1.87]
[I 2024-04-03 05:46:27,643] Trial 12 finished with value: 1.8662073695234804 and parameters: {'vocab_dim': 1, 'dropout': 0.7953055750286426, 'emb_dim': 82, 'hidden_dim': 5, 'batch_norm': True, 'layers_count': 4}. Best is trial 12 with value: 1.8662073695234804.


1.8662073695234804
Current run: logs/hparam_tuning/run-13


epoch: 100%|██████████| 100/100 [04:02<00:00,  2.42s/it, train epoch loss=1.7]
[I 2024-04-03 05:50:30,007] Trial 13 finished with value: 1.704812552540944 and parameters: {'vocab_dim': 1, 'dropout': 0.7953682077702475, 'emb_dim': 81, 'hidden_dim': 5, 'batch_norm': True, 'layers_count': 3}. Best is trial 12 with value: 1.8662073695234804.


1.704812552540944
Current run: logs/hparam_tuning/run-14


epoch: 100%|██████████| 100/100 [04:06<00:00,  2.46s/it, train epoch loss=1.82]
[I 2024-04-03 05:54:36,073] Trial 14 finished with value: 1.8230235767446397 and parameters: {'vocab_dim': 2, 'dropout': 0.736559761377283, 'emb_dim': 80, 'hidden_dim': 5, 'batch_norm': True, 'layers_count': 4}. Best is trial 12 with value: 1.8662073695234804.


1.8230235767446397
Current run: logs/hparam_tuning/run-15


epoch: 100%|██████████| 100/100 [04:00<00:00,  2.40s/it, train epoch loss=0.368]
[I 2024-04-03 05:58:36,399] Trial 15 finished with value: 0.3679536295658839 and parameters: {'vocab_dim': 4, 'dropout': 0.718517046633522, 'emb_dim': 85, 'hidden_dim': 19, 'batch_norm': True, 'layers_count': 2}. Best is trial 12 with value: 1.8662073695234804.


0.3679536295658839
Current run: logs/hparam_tuning/run-16


epoch: 100%|██████████| 100/100 [03:58<00:00,  2.39s/it, train epoch loss=0.749]
[I 2024-04-03 06:02:35,056] Trial 16 finished with value: 0.7491966027412972 and parameters: {'vocab_dim': 2, 'dropout': 0.7958094874398419, 'emb_dim': 64, 'hidden_dim': 13, 'batch_norm': True, 'layers_count': 2}. Best is trial 12 with value: 1.8662073695234804.


0.7491966027412972
Current run: logs/hparam_tuning/run-17


epoch: 100%|██████████| 100/100 [04:12<00:00,  2.53s/it, train epoch loss=0.476]
[I 2024-04-03 06:06:47,652] Trial 17 finished with value: 0.47639568684284345 and parameters: {'vocab_dim': 7, 'dropout': 0.7484565631313602, 'emb_dim': 77, 'hidden_dim': 21, 'batch_norm': True, 'layers_count': 3}. Best is trial 12 with value: 1.8662073695234804.


0.47639568684284345
Current run: logs/hparam_tuning/run-18


epoch: 100%|██████████| 100/100 [04:06<00:00,  2.47s/it, train epoch loss=1.45]
[I 2024-04-03 06:10:54,476] Trial 18 finished with value: 1.4514003936391677 and parameters: {'vocab_dim': 2, 'dropout': 0.7658392965756634, 'emb_dim': 88, 'hidden_dim': 8, 'batch_norm': True, 'layers_count': 4}. Best is trial 12 with value: 1.8662073695234804.


1.4514003936391677
Current run: logs/hparam_tuning/run-19


epoch: 100%|██████████| 100/100 [04:09<00:00,  2.50s/it, train epoch loss=0.896]
[I 2024-04-03 06:15:04,363] Trial 19 finished with value: 0.8957599593109714 and parameters: {'vocab_dim': 5, 'dropout': 0.7201973739835417, 'emb_dim': 75, 'hidden_dim': 12, 'batch_norm': True, 'layers_count': 4}. Best is trial 12 with value: 1.8662073695234804.


0.8957599593109714
Current run: logs/hparam_tuning/run-20


epoch: 100%|██████████| 100/100 [03:59<00:00,  2.40s/it, train epoch loss=0.724]
[I 2024-04-03 06:19:04,341] Trial 20 finished with value: 0.7243506022766968 and parameters: {'vocab_dim': 3, 'dropout': 0.6312289664110131, 'emb_dim': 50, 'hidden_dim': 7, 'batch_norm': True, 'layers_count': 2}. Best is trial 12 with value: 1.8662073695234804.


0.7243506022766968
Current run: logs/hparam_tuning/run-21


epoch: 100%|██████████| 100/100 [04:06<00:00,  2.46s/it, train epoch loss=1.72]
[I 2024-04-03 06:23:10,470] Trial 21 finished with value: 1.7196658512511034 and parameters: {'vocab_dim': 1, 'dropout': 0.7822164854634648, 'emb_dim': 84, 'hidden_dim': 6, 'batch_norm': True, 'layers_count': 4}. Best is trial 12 with value: 1.8662073695234804.


1.7196658512511034
Current run: logs/hparam_tuning/run-22


epoch: 100%|██████████| 100/100 [04:04<00:00,  2.45s/it, train epoch loss=1.89]
[I 2024-04-03 06:27:15,240] Trial 22 finished with value: 1.8945633379775293 and parameters: {'vocab_dim': 1, 'dropout': 0.7748024153402618, 'emb_dim': 89, 'hidden_dim': 5, 'batch_norm': True, 'layers_count': 4}. Best is trial 22 with value: 1.8945633379775293.


1.8945633379775293
Current run: logs/hparam_tuning/run-23


epoch: 100%|██████████| 100/100 [04:02<00:00,  2.43s/it, train epoch loss=1.22]
[I 2024-04-03 06:31:17,876] Trial 23 finished with value: 1.222402932225199 and parameters: {'vocab_dim': 2, 'dropout': 0.7978478576158479, 'emb_dim': 91, 'hidden_dim': 10, 'batch_norm': True, 'layers_count': 3}. Best is trial 22 with value: 1.8945633379775293.


1.222402932225199
Current run: logs/hparam_tuning/run-24


epoch: 100%|██████████| 100/100 [04:03<00:00,  2.44s/it, train epoch loss=1.8]
[I 2024-04-03 06:35:21,798] Trial 24 finished with value: 1.7952855259320828 and parameters: {'vocab_dim': 1, 'dropout': 0.7663393497591658, 'emb_dim': 87, 'hidden_dim': 5, 'batch_norm': True, 'layers_count': 4}. Best is trial 22 with value: 1.8945633379775293.


1.7952855259320828
Current run: logs/hparam_tuning/run-25


epoch: 100%|██████████| 100/100 [04:04<00:00,  2.45s/it, train epoch loss=1]  
[I 2024-04-03 06:39:26,360] Trial 25 finished with value: 1.004807043392907 and parameters: {'vocab_dim': 3, 'dropout': 0.7352840920990291, 'emb_dim': 72, 'hidden_dim': 11, 'batch_norm': True, 'layers_count': 3}. Best is trial 22 with value: 1.8945633379775293.


1.004807043392907
Current run: logs/hparam_tuning/run-26


epoch: 100%|██████████| 100/100 [04:04<00:00,  2.44s/it, train epoch loss=1.59]
[I 2024-04-03 06:43:30,649] Trial 26 finished with value: 1.5909253212759658 and parameters: {'vocab_dim': 2, 'dropout': 0.7701997923456856, 'emb_dim': 78, 'hidden_dim': 7, 'batch_norm': False, 'layers_count': 4}. Best is trial 22 with value: 1.8945633379775293.


1.5909253212759658
Current run: logs/hparam_tuning/run-27


epoch: 100%|██████████| 100/100 [04:09<00:00,  2.49s/it, train epoch loss=0.771]
[I 2024-04-03 06:47:40,111] Trial 27 finished with value: 0.7714528085899844 and parameters: {'vocab_dim': 5, 'dropout': 0.6923369130355432, 'emb_dim': 90, 'hidden_dim': 14, 'batch_norm': True, 'layers_count': 4}. Best is trial 22 with value: 1.8945633379775293.


0.7714528085899844
Current run: logs/hparam_tuning/run-28


epoch: 100%|██████████| 100/100 [04:02<00:00,  2.43s/it, train epoch loss=1.42]
[I 2024-04-03 06:51:42,870] Trial 28 finished with value: 1.4228852876116145 and parameters: {'vocab_dim': 1, 'dropout': 0.7999343620622313, 'emb_dim': 80, 'hidden_dim': 8, 'batch_norm': True, 'layers_count': 3}. Best is trial 22 with value: 1.8945633379775293.


1.4228852876116145
Current run: logs/hparam_tuning/run-29


epoch: 100%|██████████| 100/100 [04:04<00:00,  2.45s/it, train epoch loss=1.26]
[I 2024-04-03 06:55:47,647] Trial 29 finished with value: 1.2612346751744166 and parameters: {'vocab_dim': 2, 'dropout': 0.6725142185466407, 'emb_dim': 99, 'hidden_dim': 21, 'batch_norm': False, 'layers_count': 4}. Best is trial 22 with value: 1.8945633379775293.


1.2612346751744166


Лучшие значение гиперпараметров, полученные optuna

In [24]:
study.best_params

{'vocab_dim': 1,
 'dropout': 0.7748024153402618,
 'emb_dim': 89,
 'hidden_dim': 5,
 'batch_norm': True,
 'layers_count': 4}

Лучшая метрика

In [25]:
study.best_value

1.8945633379775293

# Предварительные выводы по результатам перебора гиперпараметров
Рассмотрим график полученный в tensorboard на hpparams - parralel coordinates view.
Из него делается несколько выводов. 

Лучше себя показывет модель с небольшим по размеру слоем, зато с общем числом слоев $4$. 

Увеличение словаря ухудшило результаты обучения, лучше себя показал небольшой словарь, что довольно неожиданно, но возможно при увеличении размера словаря требуется соответсвенно увеличивать и сложность сети.

Для предотвращения перобучения требуется использовать больший $\text{dropout} = 0.7$.

При увеличении словаря требуется увеличивать и $\text{embedding}$.


Попробуем еще раз обучить с подобранными параметрами

In [27]:
config = dict()
dataset.words_vocab.rebuild_vocabulary(10000)
config['vocab_dim'] = len(dataset.words_vocab.word2idx)
config['output_dim'] = len(dataset.poses_vocab.pos2idx)
config['emb_dim'] = 90
config['hidden_dim'] = 30
config['num_layers'] = 1
config['bidirectional'] = False
config['p'] = 0
config['batch_norm'] = True
device = torch.device("cuda")
rnn = RNNclassifier(**config)
rnn = rnn.to(device)

In [29]:
trainer(count_of_epoch=20, 
        batch_size=64, 
        dataset=dataset,
        model=rnn, 
        loss_function=loss_function,
        optimizer = optimizer,
        lr=0.001,
        callback=None)

epoch: 100%|██████████| 20/20 [00:22<00:00,  1.12s/it, train epoch loss=0.068] 


0.06796524720726324

In [32]:
batch_generator = torch.utils.data.DataLoader(dataset=dataset, 
                                              batch_size=64, 
                                              pin_memory=True)
            
pred = []
real = []
rnn.eval()
for its, (x_batch, y_batch) in enumerate(batch_generator):
    x_batch = x_batch.to(device)
    with torch.no_grad():
        output = rnn(x_batch)
    pred.extend(torch.argmax(output, dim=-1).cpu().numpy().flatten().tolist())
    real.extend(y_batch.cpu().numpy().flatten().tolist())
real = np.array(real)
pred = np.array(pred)
print(classification_report(real, pred, sample_weight= 1.0 * (real != 0)))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       1.00      1.00      1.00   11645.0
           2       1.00      1.00      1.00   11645.0
           3       0.92      0.96      0.94   19021.0
           4       0.99      0.93      0.96    5405.0
           5       0.80      0.33      0.47      12.0
           6       0.98      0.97      0.98   58047.0
           7       0.98      0.97      0.98   15256.0
           8       0.97      0.98      0.97   24394.0
           9       1.00      1.00      1.00   24138.0
          10       0.95      0.97      0.96    1415.0
          11       0.99      0.99      0.99    5005.0
          12       0.93      0.91      0.92    3271.0
          13       0.96      0.95      0.95    3987.0
          14       0.98      0.97      0.97    2537.0
          15       0.98      0.95      0.96    7244.0
          16       0.95      0.99      0.97    3608.0
          17       1.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Результат нам ного лучше после обучения на подобранных параметрах. Теперь теги правильно выдаются не только для самых популярных классов. Предыдущей модели обучится на них мешала недостаточная сложность можели, из-за которых она не могла подстроится под маленькие классы

In [None]:
model

Проверим как модель пресказывает тэги для случайных предложений, а не из датасета

In [68]:
class Tokenizer(object):
    def __init__(self, word_to_ind, tokenizer):
        self.word_to_ind = word_to_ind
        self.tokenizer = tokenizer
    def __call__(self, sentences, max_length = 100, pad_to_max_length = False):
        tokens = self.tokenizer(sentences).tokens()
        print(tokens)
        if not pad_to_max_length:
            max_length = min(max_length, max(map(len, tokens)))
        tokens = tokens + ['[PAD]']*(max_length-len(tokens)) \
                  if len(tokens) < max_length \
                  else tokens
        print(tokens)
        ids = [self.word_to_ind.get(w, self.word_to_ind['[UNK]']) for w in tokens]
        return torch.tensor(ids)

Напишем сами простое предложение.

In [119]:
prob = "Вполне простое предложение"

Токенизируем его

In [120]:
tokenizer(prob).tokens()

['[CLS]', 'В', '##пол', '##не', 'простое', 'предложение', '[SEP]']

In [78]:
token = Tokenizer(dataset.words_vocab.word2idx, tokenizer)

Проверим, что мы не забыли добавить маркеры начала и конца 

In [121]:
tokenized_prob = token(prob)

['[CLS]', 'В', '##пол', '##не', 'простое', 'предложение', '[SEP]']
['[CLS]', 'В', '##пол', '##не', 'простое', 'предложение', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']


In [122]:
tokenized_prob

tensor([   1,   16,  206,  325,    3, 3234,    2,    0,    0,    0,    0])

In [123]:
tokenized_prob = tokenized_prob.unsqueeze(dim = 0)

In [124]:
rnn.cuda()

RNNclassifier(
  (embedding): Embedding(10000, 90)
  (encoder): LSTM(90, 30, batch_first=True)
  (bn): BatchNorm1d(30, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (linear): Linear(in_features=30, out_features=20, bias=True)
  (logsoftmax): LogSoftmax(dim=None)
)

In [126]:

tokenized_prob = tokenized_prob.to(rnn.device)

In [127]:
rnn.eval()
output = rnn(tokenized_prob)

Предскажем тэги с помощью модели, выберем намболее вероятныцй класс для каждого тега

In [128]:
pred = torch.argmax(output, dim=-1).cpu().numpy().flatten().tolist()

In [129]:
pred

[1, 9, 12, 6, 8, 6, 2, 7, 3, 3, 6]

In [130]:
res = []

In [131]:
for e in pred:
    res.append(dataset.poses_vocab.idx2pos[e])

In [132]:
res

['[CLS]',
 'ADP',
 'DET',
 'NOUN',
 'VERB',
 'NOUN',
 '[SEP]',
 'PROPN',
 'ADJ',
 'ADJ',
 'NOUN']

## Вывод
Перебор гиперпараметров для модели очень важен. Если взять недостаточно простую модель, как мы взяли вначале это ей не позолит обучиться под классы, т.к. она недостаторчно гибкая. Слишком сожная модель переобучитсья. Нормализация позволяет не переобучиться также. Причем увеличение числа слоев помогало обучаться модели, в то же время увеличение размера скрытого слоя приводило наоборот к ухудшению метрики. Лучше всего показала себя модель с небольшими по размеру слоями, но при этом соостоящая из 4 слоев. При этом увеличение рамера словаря не вело к улучшению метрик, намного лучше себя показывали модели с небольшим словаряем.