Dataset https://github.com/natasha/nerus

In [1]:
# !pip install navec
# !pip install razdel
# !pip install nerus

Embeddings https://github.com/natasha/navec

In [2]:
# !wget https://storage.yandexcloud.net/natasha-navec/packs/navec_news_v1_1B_250K_300d_100q.tar

In [3]:
import onnxruntime

In [4]:
from nerus import load_nerus
from tqdm import tqdm
from navec import Navec
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [6]:
docs = load_nerus('nerus_lenta.conllu.gz')

In [7]:
tokens = []
tags = []
sentence_limit = 200_000

In [8]:
%%time
for doc in tqdm(docs):
    for sent in doc.sents:
        token_sent = []
        tag_sent = []
        for token in sent.tokens:
            token_sent.append(token.text)
            tag_sent.append(token.tag)
        tokens.append(token_sent)
        tags.append(tag_sent)
    if len(tokens) > sentence_limit:
        break

16867it [00:27, 603.63it/s]

CPU times: user 27.2 s, sys: 560 ms, total: 27.7 s
Wall time: 28 s





In [9]:
len(tokens)

200010

In [10]:
label_to_entity = {
    0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'
}
entity_to_label = {value: key for key, value in label_to_entity.items()}

In [11]:
labels = []
for sent in tags:
    labels.append([entity_to_label[tag] for tag in sent])

# Dataset

In [12]:
class NerDataset(Dataset):
    def __init__(self,
                 tokens: list,
                 tags: list,
                 embedding_model,
                 entity_to_label: dict,
                 max_sent_len=80,
                 pad_token='<pad>',
                 ignoring_label=7
                ):
        self.embedding_model = embedding_model
        self.tokens = tokens
        self.tags = tags
        self.max_sent_len = max_sent_len
        self.entity_to_label=entity_to_label
        self.pad = pad_token
        self.ig_label = ignoring_label

    def __getitem__(self, index):        
        embeddings = np.array([
            self.embedding_model.get(token.lower(), self.embedding_model['<unk>']) 
            for token in self.tokens[index]
        ])
        labels = np.array([self.entity_to_label[tag] for tag in self.tags[index]])
        
        sent_len = len(embeddings)
        
        if sent_len < self.max_sent_len:
            to_pad = self.max_sent_len - sent_len
            embeddings = np.concatenate([embeddings, [self.embedding_model[self.pad] for _ in range(to_pad)]])
            labels = np.concatenate([labels, np.repeat(self.ig_label, to_pad)])
        else:
            embeddings = embeddings[:self.max_sent_len, :]
            labels = labels[:self.max_sent_len]
        
        if len(labels) < 80:
            print(index, labels)
        return torch.FloatTensor(embeddings), torch.LongTensor(labels)
    
    def __len__(self):
        return len(self.tokens)

In [13]:
navec_path = 'navec_news_v1_1B_250K_300d_100q.tar'
navec = Navec.load(navec_path)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    tokens,
    tags,
    random_state=42,
    test_size=0.3
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    random_state=42,
    test_size=0.2
)

In [15]:
train_dataset = NerDataset(
    embedding_model=navec,
    tokens=X_train,
    tags=y_train,
    entity_to_label=entity_to_label,
)

valid_dataset = NerDataset(
    embedding_model=navec,
    tokens=X_val,
    tags=y_val,
    entity_to_label=entity_to_label,
)

test_dataset = NerDataset(
    embedding_model=navec,
    tokens=X_test,
    tags=y_test,
    entity_to_label=entity_to_label,
)

In [16]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False)

In [17]:
X_train[38849]

['Ранее',
 'суд',
 'обязал',
 'главу',
 'Смоленска',
 'оплатить',
 'штраф',
 'в',
 'размере',
 '60',
 'тысяч',
 'рублей',
 'за',
 'нецелевое',
 'использование',
 'бюджетных',
 'областных',
 'средств',
 ',',
 'выделенных',
 'на',
 'благоустройство',
 '.']

In [18]:
y_train[38849]

['O',
 'O',
 'O',
 'O',
 'B-LOC',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [19]:
for batch in train_loader:
    print(batch[0].shape)
    print(batch[1].shape)
    break

torch.Size([32, 80, 300])
torch.Size([32, 80])


# Model

In [20]:
class BiLSTM(nn.Module):
    def __init__(self,
                 emb_size: int,
                 hidden_size: int,
                 out_size: int
                ):
        super().__init__()
        self.bilstm = nn.LSTM(
            emb_size, hidden_size, batch_first=True, bidirectional=True
        )

        self.fc = nn.Linear(2 * hidden_size, out_size)

    def forward(self, x):
        # x = [batch_size, sent_len, emb_dim]
        rnn_out, _ = self.bilstm(x)  # [batch_size, sent_len, hidden_size * 2]
        scores = self.fc(rnn_out)  # [batch_size, sent_len, out_size]
        return scores

In [21]:
model = BiLSTM(
    emb_size=300,
    hidden_size=128,
    out_size=len(entity_to_label) + 1
)

In [22]:
model(batch[0]).shape

torch.Size([32, 80, 8])

In [23]:
y_predict = model(batch[0])
y_predict.view(-1, y_predict.shape[2]).shape

torch.Size([2560, 8])

In [24]:
optimizer = optim.Adam(model.parameters(), lr=3e-4)
criterion = nn.CrossEntropyLoss(ignore_index=7)

In [25]:
def train(
    model: nn.Module,
    train_loader: DataLoader,
    valid_loader: DataLoader,
    criterion: nn.Module,
    optimizer: torch.optim.Optimizer,
    device: str,
    max_grad_norm: int = 2,
) -> (float, float):
    train_loss = 0.0
    val_loss = 0.0

    model.train()
    for batch in tqdm(train_loader):
        text = batch[0].to(device)
        labels = batch[1].view(-1).to(device)

        y_predict = model(text)
        loss = criterion(y_predict.view(-1, y_predict.shape[2]), labels)
        optimizer.zero_grad()
        train_loss += loss.item()
        loss.backward()

        if max_grad_norm is not None:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

        optimizer.step()

    train_loss /= len(train_loader)

    model.eval()
    y_true, y_pred, label_pred = [], [], []
    with torch.no_grad():
        for batch in tqdm(valid_loader):

            text = batch[0].to(device)
            labels = batch[1].view(-1).to(device)

            prediction = model(text)
            prediction = prediction.view(-1, prediction.shape[2])
            label_predict = torch.argmax(prediction, dim=1).view(-1)
            preds = F.softmax(prediction, dim=1)[:, 1]

            y_true += labels.cpu().detach().numpy().ravel().tolist()
            y_pred += preds.cpu().detach().numpy().ravel().tolist()
            label_pred += label_predict.cpu().detach().numpy().ravel().tolist()

            loss = criterion(prediction, labels)

            val_loss += loss.item()

    val_loss /= len(valid_loader)
    y_true = np.array(y_true)
    label_pred = np.array(label_pred)
    print(classification_report(y_true[y_true != 7], label_pred[y_true != 7]))

    return train_loss, val_loss

In [26]:
def test(model: nn.Module,
         test_data_loader: DataLoader,
         device: str,
         ):

    model.eval()
    y_true, y_pred, label_pred = [], [], []
    for batch in tqdm(test_data_loader):
        text = batch[0].to(device)
        labels = batch[1].view(-1).to(device)

        prediction = model(text)
        prediction = prediction.view(-1, prediction.shape[2])
        label_predict = torch.argmax(prediction, dim=1).view(-1)
        preds = F.softmax(prediction, dim=1)[:, 1]

        y_true += labels.cpu().detach().numpy().ravel().tolist()
        y_pred += preds.cpu().detach().numpy().ravel().tolist()
        label_pred += label_predict.cpu().detach().numpy().ravel().tolist()
        
    y_true = np.array(y_true)
    label_pred = np.array(label_pred)
    print(classification_report(y_true[y_true != 7], label_pred[y_true != 7]))

In [27]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [28]:
for epoch in range(2):
    train_loss, val_loss = train(
        model=model,
        train_loader=train_loader,
        valid_loader=valid_loader,
        criterion=criterion,
        optimizer=optimizer,
        device=device
    )
    print()
    print(f'Epoch: {epoch}, Training Loss: {train_loss}, Validation Loss: {val_loss}')

100%|██████████████████████████████████████████████████| 3501/3501 [04:53<00:00, 11.94it/s]
100%|████████████████████████████████████████████████████| 876/876 [00:39<00:00, 21.93it/s]


              precision    recall  f1-score   support

           0       0.98      0.99      0.99    459716
           1       0.86      0.81      0.83     10161
           2       0.87      0.87      0.87      6676
           3       0.78      0.72      0.75     11035
           4       0.72      0.66      0.69      8769
           5       0.90      0.90      0.90     12346
           6       0.84      0.69      0.75      1977

    accuracy                           0.97    510680
   macro avg       0.85      0.81      0.83    510680
weighted avg       0.97      0.97      0.97    510680


Epoch: 0, Training Loss: 0.1773051009155646, Validation Loss: 0.10343711142806702


100%|██████████████████████████████████████████████████| 3501/3501 [05:06<00:00, 11.42it/s]
100%|████████████████████████████████████████████████████| 876/876 [00:44<00:00, 19.55it/s]


              precision    recall  f1-score   support

           0       0.98      1.00      0.99    459716
           1       0.93      0.80      0.86     10161
           2       0.92      0.87      0.89      6676
           3       0.89      0.74      0.81     11035
           4       0.87      0.64      0.74      8769
           5       0.95      0.91      0.93     12346
           6       0.90      0.77      0.83      1977

    accuracy                           0.98    510680
   macro avg       0.92      0.82      0.87    510680
weighted avg       0.97      0.98      0.97    510680


Epoch: 1, Training Loss: 0.0847997877739372, Validation Loss: 0.07772123008502911


In [29]:
test(model, test_loader, device)

100%|██████████████████████████████████████████████████| 1876/1876 [01:34<00:00, 19.92it/s]


              precision    recall  f1-score   support

           0       0.98      1.00      0.99    978059
           1       0.93      0.80      0.86     21426
           2       0.92      0.87      0.89     14371
           3       0.89      0.74      0.81     23446
           4       0.88      0.65      0.75     18721
           5       0.95      0.91      0.93     25503
           6       0.88      0.79      0.83      3972

    accuracy                           0.98   1085498
   macro avg       0.92      0.82      0.87   1085498
weighted avg       0.97      0.98      0.97   1085498



# CRF

In [30]:
START_TAG = '<START>'
STOP_TAG = '<STOP>'
entity_to_label.update({START_TAG: 7, STOP_TAG: 8})

In [31]:
class BiLSTM_CRF(nn.Module):

    def __init__(self, tag_to_ix, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim // 2,
            num_layers=1,
            bidirectional=True,
            batch_first=True
        )

        # слой для преобразования векторного представления из LSTM в логиты сущности
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # матрица перехода transitions[i, j] из сущности j в сущность i
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size, device=self.device)
        )
        # устанавливаем следующие параметры для матрицы перехода
        # делаем невозможным переход в специальный токен старт 
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        # делаем невозможным переход из специального токена конец
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.randn(2, 1, self.hidden_dim // 2, device=self.device),
                torch.randn(2, 1, self.hidden_dim // 2, device=self.device))

    
    def _forward_alg(self, feats):
        # инициализируем массив с логитами сущностей
        init_alphas = torch.full((self.tagset_size,), -10000., device=self.device)
        init_alphas[self.tag_to_ix[START_TAG]] = 0.

        forward_var = [init_alphas,]

        # аналогично прямому ходу в HHM
        for idx, feat in enumerate(feats):
            forwards = torch.stack([forward_var[idx]] * feats.shape[1], dim=0)  # [feats.shape[1], feats.shape[1]]
            emmissions = feat.unsqueeze(0).transpose(0, 1)  # [feats.shape[1], 1]
            alphas_t = forwards + emmissions + self.transitions  # [feats.shape[1], feats.shape[1]]
            forward_var.append(torch.logsumexp(alphas_t, dim=1))

        terminal_var = forward_var[-1] + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = torch.logsumexp(terminal_var.unsqueeze(0), dim=1)[0]
        return alpha

    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        lstm_out, self.hidden = self.lstm(sentence.unsqueeze(0), self.hidden)
        lstm_feats = self.hidden2tag(lstm_out).squeeze(0)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # оценка реальной последовательности сущностей
        score = torch.zeros(1, device=self.device)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long, device=self.device), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score
    
    def _viterbi_decode(self, feats):
        # прямой и обратный ход алгоритма витерби
        backpointers = []

        init_vvars = torch.full((1, self.tagset_size,), -10000., device=self.device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        forward_var = [init_vvars,]
        for idx, feat in enumerate(feats):
            forwards = torch.stack([forward_var[idx]] * feats.shape[1], dim=0)  # [feats.shape[1], feats.shape[1]]
            forwards = forwards.squeeze()
            tags_var = forwards + self.transitions
            viterbivars_t, bptrs_t = torch.max(tags_var, dim=1)
            
            next_forward_var = feat.unsqueeze(0) + viterbivars_t.unsqueeze(0)
            forward_var.append(next_forward_var)
            
            backpointers.append(bptrs_t.tolist())
        
        terminal_var = forward_var[-1] + self.transitions[self.tag_to_ix[STOP_TAG]]  # [1, feats.shape[1]]
        best_tag_id = torch.argmax(terminal_var).tolist()
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)

        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):
        # метод для инференса (без _viterby_decode)
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        # score, tag_seq = self._viterbi_decode(lstm_feats)
        return lstm_feats, self.transitions
    
    def predict(self, sentence):
        # метод для получения предсказаний на валидации
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

    @property
    def device(self):
        return 'cpu'

In [32]:
EMBEDDING_DIM = 300
HIDDEN_DIM = 128

model = BiLSTM_CRF(entity_to_label, EMBEDDING_DIM, HIDDEN_DIM).to('cpu')
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-4)

In [33]:
x = batch[0]
x.shape

torch.Size([32, 80, 300])

In [34]:
feats = model._get_lstm_features(x[0])
feats[0].shape

torch.Size([9])

In [35]:
model._forward_alg(feats)

tensor(225.5877, grad_fn=<SelectBackward0>)

In [36]:
model._viterbi_decode(feats)

(tensor(159.6971, grad_fn=<SelectBackward0>),
 [4,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  1,
  2])

In [37]:
model._score_sentence(batch[0][0], batch[1][0])

tensor([-679996.6250], grad_fn=<AddBackward0>)

In [38]:
model.neg_log_likelihood(batch[0][0], batch[1][0])

tensor([680222.5000], grad_fn=<SubBackward0>)

In [39]:
IGNORE_LABEL = 7

In [40]:
n_epoch = 2

for epoch in range(n_epoch):
    train_loss = 0
    num_iter = 0
    model.train()
    for batch in tqdm(train_loader):
        num_iter += 1
        sentences, labels = batch[0].to(device), batch[1].to(device)
        optimizer.zero_grad()
        loss = 0
        for i in range(sentences.size(0)):

            sentence_in = sentences[i]
            targets = labels[i]
            index = (targets != IGNORE_LABEL)

            loss += model.neg_log_likelihood(sentence_in[index], targets[index])

        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    print(f"Train Loss: {train_loss / num_iter}")

100%|██████████████████████████████████████████████████| 3501/3501 [10:42<00:00,  5.45it/s]


Train Loss: 43.527854282288715


100%|██████████████████████████████████████████████████| 3501/3501 [10:38<00:00,  5.48it/s]

Train Loss: 28.936170242677857





In [41]:
targets

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 7, 7, 7,
        7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
        7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
        7, 7, 7, 7, 7, 7, 7, 7])

In [42]:
sentence_in

tensor([[ 0.3705, -0.7310, -0.4335,  ..., -0.5524, -0.2321, -0.1320],
        [ 0.0369,  0.0933,  0.0202,  ..., -0.1182, -0.0398,  0.0157],
        [-0.1782, -0.4132,  0.2546,  ..., -0.0679, -0.6165, -0.1322],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [43]:
lstm_feats, transitions = model(sentence_in)

In [44]:
def viterbi_decode(feats, transitions):
    tagset_size = len(transitions)
    # прямой и обратный ход алгоритма витерби
    backpointers = []

    init_vvars = np.full((1, tagset_size,), -10000.)
    init_vvars[0][entity_to_label[START_TAG]] = 0

    forward_var = [init_vvars,]
    for idx, feat in enumerate(feats):
        forwards = np.stack([forward_var[idx]] * feats.shape[1], axis=0)  # [feats.shape[1], feats.shape[1]]
        forwards = forwards.squeeze()
        tags_var = forwards + transitions
        viterbivars_t = np.max(tags_var, axis=1)
        bptrs_t = np.argmax(tags_var, axis=1)

        next_forward_var = feat[np.newaxis, :] + viterbivars_t[np.newaxis, :]
        forward_var.append(next_forward_var)

        backpointers.append(bptrs_t.tolist())

    terminal_var = forward_var[-1] + transitions[entity_to_label[STOP_TAG]]  # [1, feats.shape[1]]
    best_tag_id = np.argmax(terminal_var).tolist()
    path_score = terminal_var[0][best_tag_id]

    # Follow the back pointers to decode the best path.
    best_path = [best_tag_id]
    for bptrs_t in reversed(backpointers):
        best_tag_id = bptrs_t[best_tag_id]
        best_path.append(best_tag_id)

    # Pop off the start tag (we dont want to return that to the caller)
    start = best_path.pop()
    assert start == entity_to_label[START_TAG]
    best_path.reverse()
    return path_score, best_path

In [45]:
viterbi_decode(lstm_feats.detach().cpu().numpy(), transitions.detach().cpu().numpy())

(356.8219148516655,
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  3,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  0,
  0])

In [46]:
model.predict(sentence_in)

(tensor(356.1121, grad_fn=<SelectBackward0>),
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  3,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  0])

In [47]:
@torch.no_grad()
def evaluate(model, iterator):
    valid_loss = 0
    num_iter = 0
    valid_acc = 0
    num_words = 0
    labels_ = []
    predicts = []
    model.eval()
    for batch in tqdm(iterator, total=len(iterator)):
        num_iter += 1
        sentences, labels = batch[0].to(device), batch[1].to(device)
        with torch.no_grad():
            loss = 0
            for i in range(sentences.size(0)):
                sentence_in = sentences[i]
                targets = labels[i]
                index = (targets != IGNORE_LABEL)
                num_words += len(targets[index])

                loss += model.neg_log_likelihood(sentence_in[index], targets[index])
                _, prediction = model.predict(sentence_in[index])
                valid_acc += (targets[index] == prediction)
                labels_.extend(targets[index].cpu().detach().tolist())
                predicts.extend(prediction)
            valid_loss += loss.item()
            
    print(classification_report(labels_, predicts))
    return valid_loss / num_words, valid_acc / num_words

In [48]:
evaluate(model, valid_loader)

100%|████████████████████████████████████████████████████| 876/876 [01:52<00:00,  7.82it/s]


              precision    recall  f1-score   support

           0       0.99      0.99      0.99    459716
           1       0.94      0.83      0.88     10161
           2       0.90      0.88      0.89      6676
           3       0.84      0.81      0.82     11035
           4       0.64      0.84      0.73      8769
           5       0.96      0.92      0.94     12346
           6       0.90      0.80      0.85      1977

    accuracy                           0.97    510680
   macro avg       0.88      0.87      0.87    510680
weighted avg       0.98      0.97      0.97    510680



(0.05469495396723508, 0.0)

# ONNX

In [49]:
dummy_input = torch.randn(1, 80, 300)

In [50]:
model(dummy_input[0])[0].shape

torch.Size([80, 9])

In [51]:
torch.onnx.export(
    model,
    dummy_input[0],
    'ner.onnx',
    input_names=['input'],
    output_names=['output'],
    dynamic_axes={'input': {0: 'seq_len'}, 'output': {0: 'seq_len', 1: 'tagset_dim'}},
)



verbose: False, log level: Level.ERROR



In [52]:
onnx_model = onnxruntime.InferenceSession('ner.onnx')

In [53]:
onnx_model.get_inputs()

[<onnxruntime.capi.onnxruntime_pybind11_state.NodeArg at 0x1110eba30>]

In [54]:
model_input = {onnx_model.get_inputs()[0].name: np.random.randn(20, 300).astype(np.float32)}

In [55]:
model_output = onnx_model.run(None, model_input)

In [56]:
model_output[0].shape

(20, 9)

In [57]:
model_output

[array([[ 2.4997542e+00,  1.7855476e+00, -1.7697632e+00, -4.7236148e-01,
         -8.3279902e-01, -7.7576518e+00, -1.6556206e+00, -2.6764913e-04,
         -1.1226005e-03],
        [ 2.0506878e+00, -4.9079509e+00, -3.2761295e+00, -2.8434510e+00,
          2.2662246e+00, -3.5639086e+00,  4.6917143e+00,  5.2528240e-04,
         -3.7918118e-04],
        [ 3.5871267e+00, -5.9086881e+00, -5.7697926e+00, -9.7056150e-01,
         -8.2546610e-01, -5.2151737e+00, -1.5465969e+00,  2.3082382e-04,
         -8.8623993e-04],
        [ 1.2331828e+00,  1.9994755e-01, -2.0941973e+00,  5.4416053e-02,
          1.3272123e+00, -6.8608751e+00, -1.2029495e+00, -7.0299325e-04,
          6.7429013e-05],
        [ 2.8996274e+00,  6.1644837e-02, -1.4424642e+00,  7.5732328e-02,
         -3.4712012e+00, -2.0634651e+00, -5.6706367e+00,  9.5449784e-04,
          3.5029376e-04],
        [ 5.2019191e+00, -6.4175868e+00, -3.8840194e+00, -2.5222335e+00,
         -8.1722307e-01, -5.2342296e+00, -4.3930826e+00, -7.7542459

In [58]:
viterbi_decode(model_output[0], model_output[1])

(85.00238877534866,
 [0, 0, 0, 0, 0, 0, 5, 6, 6, 6, 6, 6, 0, 3, 4, 0, 0, 0, 0, 0])

# Test

In [59]:
test = 'В библиотеке имени Ленина разгорелся скандал с участием Сергея Миронова'
tokenized = test.lower().split()
embedded = np.array([navec.get(token, navec['<unk>']) for token in tokenized])

In [60]:
embedded.shape

(10, 300)

In [61]:
model_input = {onnx_model.get_inputs()[0].name: embedded.astype(np.float32)}

In [62]:
model_output = onnx_model.run(None, model_input)

In [63]:
model_output[0].shape

(10, 9)

In [64]:
_, labels = viterbi_decode(model_output[0], model_output[1])
tags = [label_to_entity[label] for label in labels]

In [65]:
dict(zip(tokenized, tags))

{'в': 'O',
 'библиотеке': 'O',
 'имени': 'O',
 'ленина': 'O',
 'разгорелся': 'O',
 'скандал': 'O',
 'с': 'O',
 'участием': 'O',
 'сергея': 'B-PER',
 'миронова': 'I-PER'}