In [112]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torchtext import datasets

import numpy as np
from sklearn.metrics import classification_report

import random

from gensim.models import FastText
from nltk.stem import PorterStemmer
from sklearn.metrics import accuracy_score, f1_score

In [113]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [114]:
train_data, _, test_data = datasets.UDPOS()
train_data = [d for d in train_data]
test_data = [d for d in test_data]

train_tokens = [ [w.lower() for w in d[0]] for d in train_data]
train_tags = [ d[1] for d in train_data]

test_tokens = [[w.lower() for w in d[0]] for d in test_data]
test_tags = [d[1] for d in test_data]

tag2num = { t:i for i, t in enumerate(np.unique([tag for tags in train_tags for tag in tags])) }

In [115]:
stemmer = PorterStemmer()

word_to_ix = {}
for tokens in train_tokens:
    for word in tokens:
        word = stemmer.stem(word)
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

word_to_ix["UNK"] =  len(word_to_ix)

In [116]:
max_len = 20
pad_inds = len(tag2num)

def prepare_sequence(seq, to_ix):
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(w) for w in seq]
    idxs = [to_ix[w] if w in to_ix else to_ix["UNK"] for w in stemmed_words ]
    return torch.tensor(idxs, dtype=torch.long)


def prepare_data_for_inner_embeddings(all_tokens, all_tags, word_to_ix, tag2num, max_len, pad_tags):
    all_tags = [np.array([tag2num[tag]  for tag in tags]) for tags in all_tags]
    
    all_tokens = [tokens[:max_len] for tokens in all_tokens]
    all_tags = [tags[:max_len] for tags in all_tags]
    
    all_ids = []
    for tokens in all_tokens:
        ids = prepare_sequence(tokens, word_to_ix)
        all_ids.append(ids)
        
    X_vecs = []
    Y_vecs = []

    for ids, tags in zip(all_ids, all_tags):
        X_vecs.append(torch.tensor(ids, dtype=torch.long))
        Y_vecs.append(torch.tensor(tags, dtype=torch.long))
        
    # в качестве заполнителя X используем новый индекс len(word_to_ix)
    X = pad_sequence(X_vecs, batch_first=True, padding_value=len(word_to_ix))

    # в качестве заполнителя Y используем pad_tags
    Y = pad_sequence(Y_vecs, batch_first=True, padding_value=pad_tags)
    
    return X, Y

X_train, Y_train = prepare_data_for_inner_embeddings(train_tokens, train_tags, word_to_ix, tag2num, max_len, pad_inds)

X_train.size(), Y_train.size()



(torch.Size([12543, 20]), torch.Size([12543, 20]))

In [117]:
X_test, Y_test = prepare_data_for_inner_embeddings(test_tokens, test_tags, word_to_ix, tag2num, max_len, pad_inds)

X_test.size(), Y_test.size()



(torch.Size([2077, 20]), torch.Size([2077, 20]))

In [118]:
print(X_train)

tensor([[    0,     1,     2,  ...,    14,    15,    11],
        [   23,    24,     6,  ...,    37, 12121, 12121],
        [   38,     3,    39,  ..., 12121, 12121, 12121],
        ...,
        [ 3083,    43,    28,  ...,   211,    29,    25],
        [   11,  4206,    13,  ...,    17,   368,    42],
        [  112,    28,   387,  ...,   132,    43,  1054]])


In [119]:
print(Y_train)

tensor([[11, 12, 11,  ...,  7,  1,  5],
        [12,  5,  7,  ..., 12, 17, 17],
        [11, 12,  0,  ..., 17, 17, 17],
        ...,
        [ 2, 10,  3,  ...,  2,  3,  5],
        [ 5,  7,  1,  ...,  1,  7, 10],
        [10,  3,  2,  ...,  7, 10,  2]])


In [120]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

bs = 128
data = TensorDataset(X_train, Y_train)
dataloader = DataLoader(data, sampler=SequentialSampler(data), batch_size=bs)

In [121]:
class BiLSTMPOSTagger(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, pad_idx):
        
        super().__init__()
        
        # padding_idx=pad_idx - это номер id "заполнителя". 
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=pad_idx)
        
        # добавляем в параметры модели то, что первая размерность отвечает за батчи - после этого мы сможем не использовать метод view в обработке (в forward)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout if n_layers > 1 else 0, batch_first = True)
        
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
    
    # меняем функцию, применяем эмбеддинг при обработке текста
    def forward(self, text):
        outputs, (hidden, cell) = self.lstm(self.embedding(text))
        predictions = self.fc(self.dropout(outputs))
        return predictions

In [122]:
def train_on_epoch(model, dataloader, optimizer):
    model.train()
    for batch in dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input, b_tags = batch
        
        model.zero_grad()
        outputs = model(b_input) 

        # здесь функция view нужна!
        # outputs = [batch size, sent len, out dim]
        outputs = outputs.view(-1, outputs.shape[-1])       
        # outputs = [batch size * sent len, out dim]

        # b_tags = [batch size, sent len]
        b_tags = b_tags.view(-1)
        # b_tags = [batch size * sent len]
        
        loss = criterion(outputs, b_tags)
        loss.backward()
        optimizer.step()


def predict_on_dataloader(model, dataloaded):
    model.eval()
        
    all_outputs = []
    all_tags = []
    for batch in dataloaded:
        batch = tuple(t.to(device) for t in batch)
        b_input, b_tags = batch
        outputs = model(b_input)  
        
        outputs = outputs.view(-1, outputs.shape[-1])       
        b_tags = b_tags.view(-1)

        all_outputs.append(outputs)
        all_tags.append(b_tags)

    all_outputs = torch.cat(all_outputs)
    all_tags = torch.cat(all_tags)
    
    return all_outputs, all_tags

In [123]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 
print(device)

cuda


In [124]:
INPUT_DIM = len(word_to_ix)+1
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
OUTPUT_DIM = len(tag2num)
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.25
PAD_IDX = len(word_to_ix)

model = BiLSTMPOSTagger(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT, PAD_IDX)
model.to(device)

criterion = nn.CrossEntropyLoss(ignore_index=pad_inds)
optimizer = optim.Adam(model.parameters())

In [125]:
epochs = 50
for e in range(epochs):
    train_on_epoch(model, dataloader, optimizer)    
    
    all_outputs, all_tags = predict_on_dataloader(model, dataloader)
    loss = criterion(all_outputs, all_tags).item()
    all_outputs = all_outputs.detach().cpu().numpy()
    all_tags = all_tags.detach().cpu().numpy()
    
    mask = all_tags != pad_inds
    loss = loss/len(all_tags[mask]) 
    all_tags = all_tags[mask]
    all_preds = np.argmax(all_outputs, axis=1)[mask]
    
    print(f"{e}:\tLoss {loss}, "
          f"accuracy: {accuracy_score(all_tags, all_preds)}, "
          f"f1-macro: {f1_score(all_tags, all_preds, average='macro')}")

0:	Loss 5.557459261468461e-06, accuracy: 0.7129221326346734, f1-macro: 0.5302582019515593
1:	Loss 3.632146791607471e-06, accuracy: 0.8054761157954944, f1-macro: 0.7014959554800455
2:	Loss 2.7419912671908238e-06, accuracy: 0.8531891732319784, f1-macro: 0.7650589188356605
3:	Loss 2.1752669233417644e-06, accuracy: 0.8841983554432123, f1-macro: 0.8106203037607905
4:	Loss 1.7598667550182433e-06, accuracy: 0.9075259691138213, f1-macro: 0.8493931588113233
5:	Loss 1.4480388077201148e-06, accuracy: 0.9242667453889063, f1-macro: 0.8761806433682885
6:	Loss 1.188654476183766e-06, accuracy: 0.9391932200471104, f1-macro: 0.9015598871403027
7:	Loss 9.759769824341272e-07, accuracy: 0.9508416514449836, f1-macro: 0.9203174809700494
8:	Loss 7.918361970935309e-07, accuracy: 0.9610570920742695, f1-macro: 0.9369362432244044
9:	Loss 6.472855765843729e-07, accuracy: 0.9684742030910779, f1-macro: 0.9454437652426189
10:	Loss 5.360378329265314e-07, accuracy: 0.9743599205397394, f1-macro: 0.9554123152100895
11:	L

In [129]:
def count_metrics(model, dataloader):
  y_pred, y_true = predict_on_dataloader(model, dataloader)

  y_pred = y_pred.detach().cpu().numpy()
  y_true = y_true.detach().cpu().numpy()

  mask = y_true != pad_inds
  y_true = y_true[mask]
  y_pred = np.argmax(y_pred, axis=1)[mask]

  print(classification_report(y_true, y_pred))

In [127]:
count_metrics(model, dataloader)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9962
           1       1.00      1.00      1.00     13578
           2       1.00      1.00      1.00      8547
           3       1.00      1.00      1.00     10404
           4       1.00      1.00      1.00      5202
           5       1.00      1.00      1.00     13014
           6       1.00      1.00      1.00       649
           7       1.00      1.00      1.00     27080
           8       1.00      1.00      1.00      3339
           9       1.00      1.00      1.00      4484
          10       1.00      1.00      1.00     15619
          11       1.00      1.00      1.00     10523
          12       1.00      1.00      1.00     16990
          13       1.00      1.00      1.00      3134
          14       1.00      1.00      1.00       484
          15       1.00      1.00      1.00     18849
          16       1.00      1.00      1.00       739

    accuracy              

In [128]:
data = TensorDataset(X_test, Y_test)
test_dataloader = DataLoader(data, sampler=SequentialSampler(data), batch_size=bs)
count_metrics(model, test_dataloader)

              precision    recall  f1-score   support

           0       0.77      0.85      0.81      1466
           1       0.91      0.97      0.93      1656
           2       0.84      0.87      0.85      1066
           3       0.97      0.98      0.97      1336
           4       0.99      0.99      0.99       599
           5       0.98      0.99      0.98      1607
           6       0.94      0.78      0.85       115
           7       0.83      0.87      0.85      3446
           8       0.82      0.70      0.76       448
           9       0.94      0.97      0.96       546
          10       0.98      0.98      0.98      1923
          11       0.76      0.68      0.72      1773
          12       0.99      0.99      0.99      2467
          13       0.92      0.78      0.84       330
          14       0.88      0.79      0.83        81
          15       0.91      0.89      0.90      2306
          16       0.56      0.16      0.25       114

    accuracy              