Морфологический анализатор на основе ансамбля сверточной и LSTM сетей!

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

from pymorphy2.tagset import OpencorporaTag
from pymorphy2 import MorphAnalyzer

from navec import Navec
from slovnet.model.emb import NavecEmbedding

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [2]:
navec = Navec.load("../SHEM/VECS/vectors.bin")

emb = NavecEmbedding(navec)

m = MorphAnalyzer()

  torch.from_numpy(navec.pq.indexes),


In [3]:
use_cuda = True
device = torch.device("cuda" if torch.cuda.is_available() and use_cuda else "cpu")
device

device(type='cuda')

In [4]:
PARTS_OF_SPEECH = [
    "UNK", 
    'PART',
    'ADP',
    'ADV',
    'INTJ',
    'ADJ',
    'DET',
    'PRON',
    'SCONJ',
    'PROPN',
    'CCONJ',
    'AUX',
    'VERB',
    'NOUN',
    'PUNCT'
]

ANIMACY = [
        'Unk',
        'Anim',  # одушевлённое
        'Inan',  # неодушевлённое
    ]

GENDERS = [
        'Unk',
        'Masc',  # мужской род
        'Fem',  # женский род
        'Neut',  # средний род
    ]

NUMBERS = [
        'Unk',
        'Sing',  # единственное число
        'Plur',  # множественное число
    ]

CASES = [
    'Unk',
    'Acc',
    'Nom',
    'Gen',
    'Dat',
    'Loc',
    'Ins',
    'Voc'
]

TENSES = [
        'Unk',
        'Pres',  # настоящее время
        'Past',  # прошедшее время
        'Fut',  # будущее время
    ]

speech_part_map = {str(s): i for i, s in enumerate(PARTS_OF_SPEECH)}
speech_part_len = len(PARTS_OF_SPEECH)

case_map = {str(s): i for i, s in enumerate(CASES)}
case_len = len(CASES)

number_map = {str(s): i for i, s in enumerate(NUMBERS)}
number_len = len(NUMBERS)

gender_map = {str(s): i for i, s in enumerate(GENDERS)}
gender_len = len(GENDERS)

tense_map = {str(s): i for i, s in enumerate(TENSES)}
tense_len = len(TENSES)

animacy_map = {str(s): i for i, s in enumerate(ANIMACY)}
animacy_len = len(ANIMACY)

In [5]:
out = speech_part_len + case_len + number_len + gender_len + tense_len + animacy_len

In [6]:
def analyze2tensor(analyzer_result):
    word, info = analyzer_result[0][0], analyzer_result[0][1]
    
#     pos_array = [0 for i in range(speech_part_len)]
#     pos_array[speech_part_map.get(info.POS, 0)] = 1
    
#     case_array = [0 for i in range(case_len)]
#     case_array[case_map.get(info.case, 0)] = 1
    
#     number_array = [0 for i in range(number_len)]
#     number_array[number_map.get(info.number, 0)] = 1
    
#     gender_array = [0 for i in range(gender_len)]
#     gender_array[gender_map.get(info.gender, 0)] = 1
    
#     tense_array = [0 for i in range(tense_len)]
#     tense_array[tense_map.get(info.tense, 0)] = 1
    
#     animacy_array = [0 for i in range(animacy_len)]
#     animacy_array[animacy_map.get(info.animacy, 0)] = 1
    
    try:
        word_index = navec.vocab[word]
    except KeyError:
        word_index = 500000
    finally:
        word_tensor = emb(torch.tensor([word_index])).squeeze(0)
    
#     return torch.cat((word_tensor, 
#                       torch.tensor(pos_array, dtype=torch.long), 
#                       torch.tensor(case_array, dtype=torch.long),
#                       torch.tensor(number_array, dtype=torch.long),
#                       torch.tensor(gender_array, dtype=torch.long),
#                       torch.tensor(tense_array, dtype=torch.long),
#                       torch.tensor(animacy_array, dtype=torch.long)
#                      ))
    return word_tensor

def parse2tensor(parse_result):
    word, info = parse_result[:1], parse_result[1:]
    
    pos_array = [0 for i in range(speech_part_len)]
    pos_array[speech_part_map.get(info[0], 0)] = 1
    
    case_array = [0 for i in range(case_len)]
    case_array[case_map.get(info[1], 0)] = 1
    
    number_array = [0 for i in range(number_len)]
    number_array[number_map.get(info[2], 0)] = 1
    
    gender_array = [0 for i in range(gender_len)]
    gender_array[gender_map.get(info[3], 0)] = 1
    
    tense_array = [0 for i in range(tense_len)]
    tense_array[tense_map.get(info[4], 0)] = 1
    
    animacy_array = [0 for i in range(animacy_len)]
    animacy_array[animacy_map.get(info[5], 0)] = 1
    
    try:
        word_index = navec.vocab[word]
    except KeyError:
        word_index = 500000
    finally:
        word_tensor = emb(torch.tensor([word_index])).squeeze(0)
    
    return torch.cat((torch.tensor(pos_array, dtype=torch.long), 
            torch.tensor(case_array, dtype=torch.long),
            torch.tensor(number_array, dtype=torch.long),
            torch.tensor(gender_array, dtype=torch.long),
            torch.tensor(tense_array, dtype=torch.long),
            torch.tensor(animacy_array, dtype=torch.long)
            ))

In [7]:
def prepare_data(path):
    sentences = []
    sentence = []
    i = 0
    with open(path, "r", encoding="utf-8") as file:
        for line in file:
            i += 1
            line = line.strip()
            if not line:
                sentences.append(sentence)
                sentence = []
            else:
                splited = line.split("\t")
                tags = splited[6].split("|")
                case = '_'
                number = '_'
                gender = '_'
                tense = '_'
                animacy = '_'
                for tag in tags:
                    if tag.startswith('Case='):
                        tag = tag.split("=")[1]
                        case = tag
                    elif tag.startswith('Number='):
                        tag = tag.split("=")[1]
                        number = tag
                    elif tag.startswith('Gender='):
                        tag = tag.split("=")[1]
                        gender = tag
                    elif tag.startswith('Tense='):
                        tag = tag.split("=")[1]
                        tense = tag
                    elif tag.startswith('Animacy='):
                        tag = tag.split("=")[1]
                        animacy = tag
                        
                sentence.append((splited[1], splited[5], 
                                 case, number, 
                                 gender, tense, animacy))
            
    return sentences
    
def vectorize_prepared_data(data):
    i = 0
    
    x = []
    y = []
    
    for sentence in tqdm(data, total=len(data)):
        i += 1
        inputs = []
        targets = []
        for j in range(len(sentence) - 1):
            analyzer_result = None
            if sentence[j][0] and sentence[j][1] in PARTS_OF_SPEECH:
                analyzer_result = m.parse(sentence[j][0])
                item = sentence[j]
            else:
                continue
                
#             if sentence[j + 1][0] and sentence[j + 1][1] in PARTS_OF_SPEECH:
#                 item = sentence[j + 1]
#             else:
#                 continue

            inputs.append(analyze2tensor(analyzer_result))
            targets.append(parse2tensor(item))
        if not inputs or not targets:
            continue
        
        x.append(inputs)
        y.append(targets)
        
        if i % 100000 == 0 and i > 0:
            print("Passed:", i)
    print("Total data:", i)
    return x, y

class MorphDataset(Dataset):
    def __init__(self, path, threshold=None):
        data = prepare_data(path)
        print("Data prepared!")
        print("*" * 50)
        inputs, targets = vectorize_prepared_data(data)
        if threshold:
            inputs = inputs[:threshold]
            targets = targets[:threshold]
        self.inputs = inputs
        self.targets = targets
    
    def __len__(self):
        return len(self.inputs)
    
    def __getitem__(self, idx):
        x = self.inputs[idx]
        y = self.targets[idx]
        x = torch.cat([t.unsqueeze(0) for t in x], dim=0).to(device)
        y = torch.cat([t.unsqueeze(0) for t in y], dim=0).to(device)
        return (x, y)

In [11]:
train_loader = DataLoader(MorphDataset("labeled_syntagrus.train"), batch_size=1, shuffle=False)
test_loader = DataLoader(MorphDataset("labeled_syntagrus.test"), batch_size=1, shuffle=False)

Data prepared!
**************************************************


  0%|          | 0/49469 [00:00<?, ?it/s]

Total data: 49469
Data prepared!
**************************************************


  0%|          | 0/12420 [00:00<?, ?it/s]

Total data: 12420


In [12]:
# i = 0
# for batch in train_loader:
#     print(batch[0].shape, batch[1].shape)
#     for i in range(batch[0].size(1)):
#         print(batch[0][:, i].shape, batch[1][:, i].shape)
#     if i == 2:
#         break
#     i += 1
len(test_loader)

12416

In [9]:
class ContextLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, kernel_size=8, window_size=4):
        super().__init__()
        self.gru = nn.LSTMCell(input_size, hidden_size)
        self.hidden = hidden_size
        self.conv = nn.Sequential(
                                nn.Conv1d(in_channels=1, out_channels=1, kernel_size=8),
                                nn.GELU(),
                                nn.MaxPool1d(kernel_size=4),
                                nn.Conv1d(in_channels=1, out_channels=1, kernel_size=8),
                                nn.GELU(),
                                nn.MaxPool1d(kernel_size=4)
                                )
        self.softmax = nn.Softmax(dim=-1)
        self.linear = nn.Sequential(
            nn.Linear((((hidden_size - 8 + 1) // 4) - 8 + 1) // 4, output_size),
            nn.GELU()
            )
        self.pos = nn.Linear(speech_part_len, speech_part_len)
        self.case = nn.Linear(case_len, case_len)
        self.number = nn.Linear(number_len, number_len)
        self.gender = nn.Linear(gender_len, gender_len)
        self.tense = nn.Linear(tense_len, tense_len)
        self.animacy = nn.Linear(animacy_len, animacy_len)
        
    def forward(self, hx, cx, x):
        hx, cx = self.gru(x, (hx, cx))
        y = self.linear(self.conv(hx))
        pos = self.pos(y[:, :speech_part_len])
        case = self.case(y[:, speech_part_len: speech_part_len + case_len])
        number = self.number(y[:, speech_part_len + case_len: speech_part_len + case_len + number_len])
        gender = self.gender(y[:, speech_part_len + case_len + number_len: speech_part_len + case_len + number_len + gender_len])
        tense = self.tense(y[:, speech_part_len + case_len + number_len + gender_len: speech_part_len + case_len + number_len + gender_len + tense_len])
        animacy = self.animacy(y[:, speech_part_len + case_len + number_len + gender_len + tense_len: ])
        return hx, cx, pos, case, number, gender, tense, animacy
    
    def initHidden(self):
        return torch.zeros(1, self.hidden).to(device), torch.zeros(1, self.hidden).to(device)
        
        
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, kernel_size=8, window_size=4):
        super().__init__()
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.dropout = nn.Dropout(0.3)
        self.i2o = nn.Linear(hidden_size, hidden_size)
        self.hidden = hidden_size
        self.conv = nn.Sequential(
                                nn.Conv1d(in_channels=1, out_channels=1, kernel_size=8),
                                nn.GELU(),
                                nn.MaxPool1d(kernel_size=4),
                                nn.Conv1d(in_channels=1, out_channels=1, kernel_size=8),
                                nn.GELU(),
                                nn.MaxPool1d(kernel_size=4)
                                    )
        self.softmax = nn.Softmax(dim=-1)
        
        self.pos = nn.Linear(speech_part_len, speech_part_len)
        self.case = nn.Linear(case_len, case_len)
        self.number = nn.Linear(number_len, number_len)
        self.gender = nn.Linear(gender_len, gender_len)
        self.tense = nn.Linear(tense_len, tense_len)
        self.animacy = nn.Linear(animacy_len, animacy_len)
    
    def forward(self, h_prev, x):
        combined = torch.cat([h_prev, x], dim = 1) # concatenate x and h
        h = torch.tanh(self.dropout(self.i2h(combined)))
        y = self.i2o(h)
        y = self.conv(y)
        pos = self.softmax(self.pos(y[:, :speech_part_len]))
        case = self.softmax(self.case(y[:, speech_part_len: speech_part_len + case_len]))
        number = self.softmax(self.number(y[:, speech_part_len + case_len: speech_part_len + case_len + number_len]))
        gender = self.softmax(self.gender(y[:, speech_part_len + case_len + number_len: speech_part_len + case_len + number_len + gender_len]))
        tense = self.softmax(self.tense(y[:, speech_part_len + case_len + number_len + gender_len: speech_part_len + case_len + number_len + gender_len + tense_len]))
        animacy = self.softmax(self.animacy(y[:, speech_part_len + case_len + number_len + gender_len + tense_len: ]))
        return h, pos, case, number, gender, tense, animacy
    
    def initHidden(self):
        return torch.zeros(1, self.hidden).to(device)

In [14]:
p = speech_part_len
c = case_len
n = number_len
g = gender_len
t = tense_len
a = animacy_len

In [15]:
def accuracy(pred, y):
    correct = (torch.softmax(pred, dim=1).argmax(-1) == y.argmax(-1)).sum()
    return correct / y.size(0)

def train(model, iterator, optimizer, criterion):
    model.train()
    seq_len = len(iterator)
    lp, lc, ln, lg, lt, la = [], [], [], [], [], []
    ap, ac, an, ag, at, aa = [], [], [], [], [], []
    epoch_pos_acc = 0
    epoch_case_acc = 0
    epoch_number_acc = 0
    epoch_gender_acc = 0
    epoch_tense_acc = 0
    epoch_animacy_acc = 0
    
    epoch_pos_loss = 0
    epoch_loss = 0
    for i, batch in enumerate(iterator):
        optimizer.zero_grad()
        i += 1
        
        loss = 0
        loss_pos = 0
        loss_case = 0
        loss_number = 0
        loss_gender = 0
        loss_tense = 0
        loss_animacy = 0
        
        pos_acc = 0
        case_acc = 0
        number_acc = 0
        gender_acc = 0
        tense_acc = 0
        animacy_acc = 0
        
        hx, cx = model.initHidden()
        inp, out = batch
        for j in range(inp.size(1)):
            hx, cx, pos, case, number, gender, tense, animacy = model(hx, cx, inp[:, j])
            loss_pos += criterion(pos, out[:, j, : p].float())
            loss_case += criterion(case, out[:, j, p: p + c].float())
            loss_number += criterion(number, out[:, j, p + c: p + c + n].float())
            loss_gender += criterion(gender, out[:, j, p + c + n: p + c + n + g].float())
            loss_tense += criterion(tense, out[:, j, p + c + n + g: p + c + n + g + t].float())
            loss_animacy += criterion(animacy, out[:, j, p + c + n + g + t:].float())
            
            pos_acc += accuracy(pos, out[:, j, : p])
            case_acc += accuracy(case, out[:, j, p: p + c])
            number_acc += accuracy(number, out[:, j, p + c: p + c + n])
            gender_acc += accuracy(gender, out[:, j, p + c + n: p + c + n + g])
            tense_acc += accuracy(tense, out[:, j, p + c + n + g: p + c + n + g + t])
            animacy_acc += accuracy(animacy, out[:, j, p + c + n + g + t:])
        
        lp.append(loss_pos.item())
        lc.append(loss_case.item())
        ln.append(loss_number.item())
        lg.append(loss_gender.item())
        lt.append(loss_tense.item())
        la.append(loss_animacy.item())
        
        ap.append(pos_acc / inp.size(1))
        ac.append(case_acc / inp.size(1))
        an.append(number_acc / inp.size(1))
        ag.append(gender_acc / inp.size(1))
        at.append(tense_acc / inp.size(1))
        aa.append(animacy_acc / inp.size(1))
        
        epoch_pos_acc += (pos_acc / inp.size(1))
        epoch_case_acc += (case_acc / inp.size(1))
        epoch_number_acc += (number_acc / inp.size(1))
        epoch_gender_acc += (gender_acc / inp.size(1))
        epoch_tense_acc += (tense_acc / inp.size(1))
        epoch_animacy_acc += (animacy_acc / inp.size(1))
        
        epoch_pos_loss += loss_pos.item()
        loss = loss_pos + loss_case + loss_number + loss_gender + loss_tense + loss_animacy
        epoch_loss += loss.item()
        loss.backward()
        optimizer.step()
        
        if i % 10000 == 0:
            print("_" * 90)
            print(f"| passed: {i}/{seq_len} | loss: {(epoch_loss / i):7.4f} | loss pos: {(epoch_pos_loss / i):7.4f} |"
                  f" pos_acc: {(epoch_pos_acc / i):7.4f} | case_acc: {(epoch_case_acc / i):7.4f} | "
                  f"number_acc: {(epoch_number_acc / i):7.4f} | tense_acc: {(epoch_tense_acc / i):7.4f} | "
                  f"pos_acc per last sentence: {((pos_acc / inp.size(1))):7.4f}")
    return epoch_loss / seq_len, lp, lc, ln, lg, lt, la, ap, ac, an, ag, at, aa

In [16]:
def evaluate(model, iterator, criterion):
    model.eval()
    seq_len = len(iterator)
    
    epoch_tense_loss = 0
    epoch_gender_loss = 0
    epoch_number_loss = 0
    epoch_case_loss = 0
    epoch_pos_loss = 0
    epoch_loss = 0
    i = 0
    
    acc = 0
    epoch_pos_acc = 0
    epoch_case_acc = 0
    epoch_number_acc = 0
    epoch_gender_acc = 0
    epoch_tense_acc = 0
    epoch_animacy_acc = 0
    
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            i += 1
            
            loss = 0
            loss_pos = 0
            loss_case = 0
            loss_number = 0
            loss_gender = 0
            loss_tense = 0
            loss_animacy = 0
            
            pos_acc = 0
            case_acc = 0
            number_acc = 0
            gender_acc = 0
            tense_acc = 0
            animacy_acc = 0
            
            hx, cx = model.initHidden()
            inp, out = batch
            for j in range(inp.size(1)):
                hx, cx, pos, case, number, gender, tense, animacy = model(hx, cx, inp[:, j])
                loss_pos += criterion(pos, out[:, j, : p].float())
                loss_case += criterion(case, out[:, j, p: p + c].float())
                loss_number += criterion(number, out[:, j, p + c: p + c + n].float())
                loss_gender += criterion(gender, out[:, j, p + c + n: p + c + n + g].float())
                loss_tense += criterion(tense, out[:, j, p + c + n + g: p + c + n + g + t].float())
                loss_animacy += criterion(animacy, out[:, j, p + c + n + g + t:].float())
                
                pos_acc += accuracy(pos, out[:, j, : p].float())
                case_acc += accuracy(case, out[:, j, p: p + c].float())
                number_acc += accuracy(number, out[:, j, p + c: p + c + n].float())
                gender_acc += accuracy(gender, out[:, j, p + c + n: p + c + n + g].float())
                tense_acc += accuracy(tense, out[:, j, p + c + n + g: p + c + n + g + t].float())
                animacy_acc += accuracy(animacy, out[:, j, p + c + n + g + t:].float())
            acc += ((pos_acc + case_acc + number_acc + gender_acc + tense_acc + animacy_acc) / (6 * inp.size(1)))      
            loss += (loss_pos + loss_case + loss_number + loss_gender + loss_tense + loss_animacy)
            epoch_loss += loss.item()
            
            epoch_pos_acc += (pos_acc / inp.size(1))
            epoch_case_acc += (case_acc / inp.size(1))
            
            epoch_pos_loss += loss_pos.item()
            epoch_tense_loss += loss_tense.item()
            epoch_gender_loss += loss_gender.item()
            epoch_number_loss += loss_number.item()
            epoch_case_loss += loss_case.item()
            
            if i % 2000 == 0:
                print("_" * 100)
                print(f"| passed: {i}/{seq_len} | loss: {(epoch_loss / i):7.4f} | loss pos: {(epoch_pos_loss / i):7.4f} | "
                      f"loss case: {(epoch_case_loss / i):7.4f} | loss number: {(epoch_number_loss / i):7.4f} | "
                      f"loss gender: {(epoch_gender_loss / i):7.4f} | loss tense: {(epoch_tense_loss / i):7.4f} | "
                      f"pos accuracy: {(pos_acc / inp.size(1)):6.3f} | case accuracy: {(case_acc / inp.size(1)):6.3f} |")
        
    return epoch_loss / seq_len, acc / seq_len, epoch_pos_acc / seq_len, epoch_case_acc / seq_len

In [10]:
INPUT = 300
HIDDEN = 1024
OUTPUT = out
EPOCH = 4

model = ContextLSTM(INPUT, HIDDEN, OUTPUT).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.0003)
criterion = nn.CrossEntropyLoss().to(device)

In [21]:
print("model parameters:", sum([p.numel() for p in model.parameters() if p.requires_grad]))

model parameters: 5433984


In [22]:
lpt, lct, lnt, lgt, ltt, lat, apt, act, ant, agt, att, aat = [], [], [], [], [], [], [], [], [], [], [], []
for epoch in range(1, EPOCH + 1):
    train_loss, lp, lc, ln, lg, lt, la, ap, ac, an, ag, at, aa = train(model, train_loader, optimizer, criterion)
    lpt += lp
    lct += lc
    lnt += ln
    lgt += lg
    ltt += lt
    lat += la
    apt += ap
    act += ac
    ant += an
    agt += ag
    att += at
    aat += aa
    print("=" * 100)
    print(f"| epoch: {epoch}/{EPOCH} | train loss: {train_loss} |")
    eval_loss, acc, pos_acc, case_acc = evaluate(model, test_loader, criterion)
    print(f"| epoch: {epoch}/{EPOCH} | eval loss: {eval_loss} | average acc: {acc} | pos acc: {pos_acc:6.3f} | case_acc: {case_acc:6.3f}")

__________________________________________________________________________________________
| passed: 10000/49449 | loss: 59.7522 | loss pos: 16.3279 | pos_acc:  0.6616 | case_acc:  0.7321 | number_acc:  0.8311 | tense_acc:  0.9071 | pos_acc per last sentence:  0.8667
__________________________________________________________________________________________
| passed: 20000/49449 | loss: 44.3961 | loss pos: 11.9552 | pos_acc:  0.7549 | case_acc:  0.8092 | number_acc:  0.8799 | tense_acc:  0.9351 | pos_acc per last sentence:  0.8667
__________________________________________________________________________________________
| passed: 30000/49449 | loss: 37.3520 | loss pos:  9.8948 | pos_acc:  0.7990 | case_acc:  0.8419 | number_acc:  0.9000 | tense_acc:  0.9473 | pos_acc per last sentence:  1.0000
__________________________________________________________________________________________
| passed: 40000/49449 | loss: 33.2877 | loss pos:  8.7093 | pos_acc:  0.8240 | case_acc:  0.8610 | number

____________________________________________________________________________________________________
| passed: 8000/12416 | loss: 14.5552 | loss pos:  3.5271 | loss case:  3.0538 | loss number:  1.9340 | loss gender:  2.9395 | loss tense:  0.9201 | pos accuracy:  1.000 | case accuracy:  0.875 |
____________________________________________________________________________________________________
| passed: 10000/12416 | loss: 14.5405 | loss pos:  3.5193 | loss case:  3.0519 | loss number:  1.9325 | loss gender:  2.9412 | loss tense:  0.9301 | pos accuracy:  0.833 | case accuracy:  0.944 |
____________________________________________________________________________________________________
| passed: 12000/12416 | loss: 14.5288 | loss pos:  3.5154 | loss case:  3.0625 | loss number:  1.9269 | loss gender:  2.9339 | loss tense:  0.9257 | pos accuracy:  0.926 | case accuracy:  0.889 |
| epoch: 3/4 | eval loss: 14.544349717120697 | average acc: 0.9509199261665344 | pos acc:  0.929 | case_acc:  

In [23]:
# ЧИСЛО ЭПОХ СДЕЛАЙ 2 И ПОСТАВЬ ШУДЛЕР БЛЯТЬ НАКОНЕЦ!
torch.save(model.state_dict(), "lstm-cnn-morph-predictor-345d-768h-adam-3e5-epoch-4-update-2.pt")

In [24]:
from datetime import datetime
datetime.utcnow()

datetime.datetime(2023, 3, 21, 22, 0, 51, 883543)

In [25]:
str(datetime(2023, 3, 18, 13, 11, 53, 942337))

'2023-03-18 13:11:53.942337'

In [26]:
def convertItems(seq: list) -> list:
    return [i.item() for i in seq]

df = pd.DataFrame({
    "pos_loss": lpt,
    "case_loss": lct,
    "number_loss": lnt,
    "gender_loss": lgt,
    "tense_loss": ltt,
    "animacy_loss": lat,
    "pos_acc": convertItems(apt),
    "case_acc": convertItems(act),
    "number_acc": convertItems(ant),
    "gender_acc": convertItems(agt),
    "tense_acc": convertItems(att),
    "animacy_acc": convertItems(aat)
})
df

Unnamed: 0,pos_loss,case_loss,number_loss,gender_loss,tense_loss,animacy_loss,pos_acc,case_acc,number_acc,gender_acc,tense_acc,animacy_acc
0,53.976917,43.010426,21.555592,29.460543,26.514055,18.933142,0.000000,0.150000,0.450000,0.200000,0.950000,0.650000
1,45.329891,36.633713,20.301250,27.392374,22.378302,15.239198,0.176471,0.000000,0.000000,0.000000,0.882353,0.823529
2,83.570099,66.340607,36.786621,49.101803,40.914780,28.274738,0.000000,0.096774,0.032258,0.032258,0.903226,0.741935
3,16.222179,12.931485,6.657172,9.612161,8.047033,4.933577,0.166667,0.000000,0.333333,0.000000,0.833333,1.000000
4,2.487713,2.145606,0.932404,1.696517,1.310476,1.129375,0.000000,0.000000,1.000000,0.000000,1.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
197791,4.637361,9.195228,3.203994,3.361206,0.063446,5.811224,0.875000,0.750000,0.937500,0.875000,1.000000,0.812500
197792,5.044485,2.067928,1.678415,1.355302,0.626490,3.489312,0.933333,0.966667,1.000000,1.000000,1.000000,0.933333
197793,9.469631,5.550805,4.595307,3.069457,0.243415,6.804484,0.863636,0.909091,0.954545,0.954545,1.000000,0.863636
197794,0.467668,0.627315,0.540148,0.374201,0.181452,0.200805,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000


In [27]:
df.to_csv("lstm-cnn-result-v2.csv")

In [None]:
eval_loss, acc, pos_acc, case_acc = evaluate(model, test_loader, criterion)
print(f"| eval loss: {eval_loss:6.3f} | acc: {acc:6.3f} | pos_acc: {pos_acc:6.3f} | case_acc: {case_acc:6.3f} |")

In [11]:
import time

model.load_state_dict(torch.load("lstm-cnn-morph-predictor-345d-768h-adam-3e5-epoch-4-update-2.pt"))

speech_part_unmap = {i: str(s) for i, s in enumerate(PARTS_OF_SPEECH)}
speech_part_len = len(PARTS_OF_SPEECH)

case_unmap = {i: str(s) for i, s in enumerate(CASES)}
case_len = len(CASES)

number_unmap = {i: str(s) for i, s in enumerate(NUMBERS)}
number_len = len(NUMBERS)

gender_unmap = {i: str(s) for i, s in enumerate(GENDERS)}
gender_len = len(GENDERS)

tense_unmap = {i: str(s) for i, s in enumerate(TENSES)}
tense_len = len(TENSES)

animacy_unmap = {i: str(s) for i, s in enumerate(ANIMACY)}
animacy_len = len(ANIMACY)

def clockit(func):
    def wrapper(*args, **kwargs):
        start = time.time()
        func(*args, **kwargs)
        print("Elapsed time:", time.time() - start)
    return wrapper

def sample(model, word, hx=torch.zeros(1, model.hidden), cx=torch.zeros(1, model.hidden)):
    m = MorphAnalyzer()
    inp = analyze2tensor(m.parse(word)).unsqueeze(0).to(device)
    hx, cx, pos, case, number, gender, tense, animacy = model(hx, cx, inp)
    pos = speech_part_unmap[torch.softmax(pos.float(), dim=-1).argmax(-1).item()]
    case = case_unmap[torch.softmax(case.float(), dim=-1).argmax(-1).item()]
    number = number_unmap[torch.softmax(number.float(), dim=-1).argmax(-1).item()]
    gender = gender_unmap[torch.softmax(gender.float(), dim=-1).argmax(-1).item()]
    tense = tense_unmap[torch.softmax(tense.float(), dim=-1).argmax(-1).item()]
    animacy = animacy_unmap[torch.softmax(animacy.float(), dim=-1).argmax(-1).item()]
    return hx, cx, pos, case, number, gender, tense, animacy

@clockit
def parse_sentence(sent: str, model):
    sent = sent.split()
    hx, cx = torch.zeros(1, model.hidden).to(device), torch.zeros(1, model.hidden).to(device)
    for word in sent:
        hx, cx, pos, case, number, gender, tense, animacy = sample(model, word, hx, cx)
        print("Слово:", word, pos, case, number, gender, tense, animacy)

In [22]:
parse_sentence("веселый молочник", model)

Слово: веселый ADJ Nom Sing Masc Unk Unk
Слово: молочник NOUN Nom Sing Masc Unk Anim
Elapsed time: 0.19352126121520996


In [21]:
from pymorphy2 import MorphAnalyzer
from string import punctuation


m = MorphAnalyzer()

tokens = 0
biased = 0

with open("labeled_syntagrus.test", "r", encoding="utf-8") as file:
    for line in file.readlines():
        if line.strip() != "":
            token = line.lower().split("\t")[1]
            if token in punctuation:
                tokens += 1
                continue
            if len(m.parse(token)) > 1 and len(set([x.tag.POS for x in m.parse(token)])) > 1:
                biased += 1
                tokens += 1
            else:
                tokens += 1
print(tokens, biased)

221082 54808
