In [1]:
import torch
import torch.nn as nn
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch import Tensor
import functools
import os
import numpy as np

import warnings
import glob
import io
from joblib import delayed 

from cacher import root, file_cached, mem_cached, clear_cache

from corus import load_lenta2
from navec import Navec
from razdel import tokenize, sentenize

# from data_loader import download_file

from slovnet.model.emb import NavecEmbedding


# from torchmetrics.functional.classification import binary_accuracy

import random

In [41]:
import pandas as pd
from collections import defaultdict
from tqdm import tqdm
import math

@mem_cached('read_dataset')
def read_dataset(filename, *args, **kwargs):
    return pd.read_csv(filename, *args, **kwargs)

@mem_cached('words_forms_map')
@file_cached('words_forms_map')
def get_words_forms_map():
    words_forms = read_dataset("dataset/russian3 - words_forms.csv")

    words_forms_map = defaultdict(lambda: [])

    for _, row in tqdm(words_forms.iterrows(), total=len(words_forms)):
        words_forms_map[row['form_bare']].append((row['word_id'], form_type_to_id[row['form_type']]))

    return dict(words_forms_map)

def get_words_dataset():
    return read_dataset('dataset/russian3 - words.csv', low_memory=False, index_col=0, usecols=["id", "bare", "type", "rank"])

def get_nouns():
    return read_dataset("dataset/russian3 - nouns.csv", index_col=0, usecols=['word_id', 'gender', 'animate', 'indeclinable', 'sg_only', 'pl_only'])

def get_adjectuvies():
    return read_dataset('dataset/russian3 - adjectives.csv', low_memory=False, index_col=0)
    
def get_verbs():
    return read_dataset('dataset/russian3 - verbs.csv', index_col=0, usecols=['word_id', 'aspect'])

def reverse_dict(d, priority_for_duplicates=[]):
    res = {val: key for key, val in d.items()}
    for key in priority_for_duplicates:
        res[d[key]] = key
    return res

def EnumFeature(name, values):
    by_name = dict(
            [('unk', 0)] +
            [('nan', 0)] +
            [(val, i + 1) for i, val in enumerate(sorted(values))]
        )
    by_id = reverse_dict(by_name, priority_for_duplicates=['unk'])

    def to_str(id):
        return name + "=" + by_id[id]

    return type(name, (object, ), {
        "by_name": by_name,
        "_by_id": by_id,
        "to_str": to_str
    })


Formset(read_dataset("dataset/russian3 - words_forms.csv").form_type)
class FormType:
    _by_id = dict( 
        [(0, 'unk')] +
        [(i + 1, form_type) for i, form_type in enumerate(sorted()]
    )
    by_name = reverse_dict(_by_id)

    @staticmethod
    def by_id(id):
        return FormType._by_id[id]


class WordType:
    UNKNOWN = 0
    ADJECTIVE = 1
    ADVERB = 2
    EXPRESSION = 3
    NOUN = 4
    OTHER = 5
    VERB = 6
    by_name = {'nan': UNKNOWN, 'unk': UNKNOWN,
        'adjective': ADJECTIVE, 'adverb': ADVERB, 'expression':EXPRESSION, 
        'noun': NOUN, 'other': OTHER, 'verb': VERB}
    by_id = reverse_dict(by_name, priority_for_duplicates=['unk'])
    @staticmethod
    def to_str(id):
        return 'word_type=' + WordType.by_id[id]

class VerbAspect:
    UNKNOWN = 0
    IMPERFECTIVE = 1
    PERFECTIVE = 2
    BOTH = 3
    by_name = {
        'unk': UNKNOWN,
        'imperfective': IMPERFECTIVE,
        'perfective': PERFECTIVE,
        'both': BOTH
    }
    by_id = reverse_dict(by_name)
    @staticmethod
    def to_str(id):
        return 'aspect=' + VerbAspect.by_id[id]

class NounGender:
    UNKNOWN = 0
    F = 1
    M = 2
    N = 3
    PL = 4
    BOTH = 5
    by_name = {
        'unk': UNKNOWN,
        'nan': UNKNOWN,
        'f': F,
        'm': M,
        'n': N,
        'pl': PL,
        'both': BOTH
    }
    by_id = reverse_dict(by_name, priority_for_duplicates=['unk'])
    @staticmethod
    def to_str(id):
        return 'gender=' + NounGender.by_id[id]

class BoolFeature:
    UNKNOWN = 0
    FALSE = 1
    TRUE = 2

    def __init__(self, name):
        self.name = name

    def by_val(self, val):
        if val == 0:
            return BoolFeature.FALSE
        elif val == 1:
            return BoolFeature.TRUE
        return BoolFeature.UNKNOWN
    
    def to_str(self, bool_feature):
        if bool_feature == BoolFeature.FALSE:
            return self.name + "=0"
        elif bool_feature == BoolFeature.TRUE:
            return self.name + "=1"
        return self.name + "=unk"


NounAnimate = BoolFeature('noun_animate')
NounIndeclinable = BoolFeature('noun_indeclinable')
NounSGOnly = BoolFeature('noun_sg_only')
NounPLOnly = BoolFeature('noun_pl_only')
AdjectiveIncomparable = BoolFeature('adjective_incomparable')

word_form_by_name = get_words_forms_map()
words_by_id = {x[0]: (x[1], x[2], WordType.by_name[str(x[3])]) for x in get_words_dataset().itertuples()}
words_by_name = {x[1]: (x[0], x[2], WordType.by_name[str(x[3])]) for x in get_words_dataset().itertuples()}
adjectives_by_id = {x[0]: AdjectiveIncomparable.by_val(x[1]) for x in get_adjectuvies().itertuples() if not math.isnan(x[1])}
verbs_by_id = {x[0]: VerbAspect.by_name[x[1]] for x in get_verbs().itertuples() if str(x[1]) != 'nan'}

#'word_id', 'gender', 'animate', 'indeclinable', 'sg_only', 'pl_only'
nouns_by_id = {x[0]: (NounGender.by_name[str(x[1])], NounAnimate.by_val(x[2]), 
        NounIndeclinable.by_val(x[3]), NounSGOnly.by_val(x[4]), NounPLOnly.by_val(x[5])) 
        for x in get_nouns().itertuples()} 


In [77]:
word_form_by_name.get('кота', (0, 0))

[(1291, 40), (1291, 38)]

In [44]:
from collections import namedtuple


StaticFeatures = namedtuple('StaticFeatures', 
    ['word_id', 'bare', 'incomparable', 'aspect', 'gender', 
            'animate', 'indeclinable', 'sg_only', 'pl_only'])

MorphFeatures = namedtuple('MorphFeatures', 
        ['form_type_id', 'word_type'])

WordFeatures = namedtuple('WordFeatures',
        ['word', 'morph_features', 'static_features'])

class StaticFeatures(StaticFeatures):
    def __repr__(self) -> str:
        features = []
        for feature, caster in [(self.bare, str),
                  (self.incomparable, AdjectiveIncomparable.by_id),
                  (self.aspect, VerbAspect.by_id),
                  (self.animate, NounAnimate.by_id),
                  (self.indeclinable, NounIndeclinable.by_id),
                  (self.sg_only, NounSGOnly.by_id),
                  (self.pl_only, NounPLOnly.by_id)]:
            s = caster(feature)
            if 'unk' not in s:
                features.append(s)
        return f"S({', '.join(features)})"
        # return f"S(word_id={self.word_id}, bare={self.bare}, " +\
        #     f"{AdjectiveIncomparable.by_id(self.incomparable)}, " +\
        #     f"aspect={VerbAspect.by_id[self.aspect]}, {NounAnimate.by_id(self.animate)}, "+\
        #     f"{NounIndeclinable.by_id(self.animate)}, " +\
        #     f"{NounSGOnly.by_id(self.indeclinable)}, {NounPLOnly.by_id(self.pl_only)})" 

    def __str__(self) -> str:
        return self.__repr__()

class MorphFeatures(MorphFeatures):
    def __repr__(self) -> str:
        return f"M(form_type_id={FormType.by_id(self.form_type_id)}, " +\
                f"word_type={WordType.by_id(self.word_type)})" 

    def __str__(self) -> str:
        return self.__repr__()

class WordFeatures(WordFeatures):
    def __repr__(self) -> str:
        return f"W(word={self.word}, " +\
                f"{self.morph_features}, {self.static_features})" 
    def __str__(self) -> str:
        return self.__repr__()


def extract_word_features(word):
    variants = []
    for word_id, form_type_id in word_form_by_name.get(word, [(0, 0)]):
        bare, rank, word_type = words_by_id.get(word_id, ('unk', 100000, WordType.UNKNOWN))
        variants.append((word_id, form_type_id, bare, rank, word_type))
    
    # take first by rank
    word_id, form_type_id, bare, rank, word_type = min(variants, key=lambda x: x[3])
    
    incomparable = AdjectiveIncomparable.UNKNOWN
    if word_type == WordType.ADJECTIVE:
        incomparable = adjectives_by_id.get(word_id, incomparable)

    aspect = VerbAspect.UNKNOWN
    if word_type == WordType.VERB:
        aspect = verbs_by_id.get(word_id, aspect)

    gender, animate, indeclinable, sg_only, pl_only = NounGender.UNKNOWN, NounAnimate.UNKNOWN, \
            NounIndeclinable.UNKNOWN, NounSGOnly.UNKNOWN, NounPLOnly.UNKNOWN
    if word_type == WordType.NOUN:
        gender, animate, indeclinable, sg_only, pl_only = nouns_by_id.get(word_id, 
                (gender, animate, indeclinable, sg_only, pl_only))

    return WordFeatures(word, 
        MorphFeatures(form_type_id, word_type), 
        StaticFeatures(word_id, bare, incomparable, aspect, gender, 
            animate, indeclinable, sg_only, pl_only))

extract_word_features('не')

W(word=не, M(form_type_id=ru_base, word_type=other), S(не))

In [21]:
word_form_by_name['не']

[(3, 31)]

In [29]:
russian_letters = set('йцукенгшщзхъфывапролджэячсмитьбюё')
for sentence in sentenize('токены, не являющиеся словами, игнорируются, т.е. в биграмму могут входить, например, слова, разделённые запятой. Тип B: никакие токены не игнорируются, но из списка исключаются цепочки, где хотя бы один токен не является словом.'):
    for token in tokenize(sentence.text):
        token = token.text
        if len(russian_letters & set(token)) > 0:
            print(extract_word_features(token))

WordFeatures(word=токены, word_id=0, form_type_id=unk, bare=unk, word_type=unk, adjective_incomparable=unk, aspect=unk, noun_animate=unk, noun_indeclinable=unk, noun_sg_only=unk, noun_pl_only=unk)
WordFeatures(word=не, word_id=3, form_type_id=ru_base, bare=не, word_type=other, adjective_incomparable=unk, aspect=unk, noun_animate=unk, noun_indeclinable=unk, noun_sg_only=unk, noun_pl_only=unk)
WordFeatures(word=являющиеся, word_id=60842, form_type_id=ru_adj_pl_acc, bare=являющийся, word_type=adjective, adjective_incomparable=unk, aspect=unk, noun_animate=unk, noun_indeclinable=unk, noun_sg_only=unk, noun_pl_only=unk)
WordFeatures(word=словами, word_id=104, form_type_id=ru_noun_pl_inst, bare=слово, word_type=noun, adjective_incomparable=unk, aspect=unk, noun_animate=0, noun_indeclinable=0, noun_sg_only=0, noun_pl_only=0)
WordFeatures(word=игнорируются, word_id=0, form_type_id=unk, bare=unk, word_type=unk, adjective_incomparable=unk, aspect=unk, noun_animate=unk, noun_indeclinable=unk, nou

In [48]:
verbs = read_dataset('dataset/russian3 - verbs.csv', low_memory=False, index_col=0)
verbs

Unnamed: 0_level_0,aspect,partner
word_id,Unnamed: 1_level_1,Unnamed: 2_level_1
9,imperfective,
28,perfective,говорить;ска'зывать
32,imperfective,смочь
50,imperfective,узнать
53,imperfective,сказать;поговорить
...,...,...
90952,perfective,засо'вывать
90955,imperfective,
91007,perfective,выпаса'ть
91008,imperfective,вы'пасти


In [16]:
words.loc[60722]

position                                NaN
bare                              увидевший
accented                         уви'девший
derived_from_word_id                    NaN
rank                                    NaN
disabled                                  0
audio                                   NaN
usage_en                                NaN
usage_de                                NaN
number_value                            NaN
type                              adjective
level                                   NaN
created_at              2021-08-12 16:07:39
Name: 60722, dtype: object

In [15]:
words.loc[159]

position                                                              NaN
bare                                                              увидеть
accented                                                         уви'деть
derived_from_word_id                                                  NaN
rank                                                                143.0
disabled                                                                0
audio                   https://openrussian.org/audio-shtooka/увидеть.mp3
usage_en                                                              NaN
usage_de                                                       кого? что?
number_value                                                          NaN
type                                                                 verb
level                                                                  B1
created_at                                            2020-01-01 00:00:00
Name: 159, dtype: object

In [6]:
len(set(words_forms.form_bare))

1010298

In [7]:
len(words_forms)

1693184

In [2]:
navec_model_path = download_file("navec_hudlit_v1_12B_500K_300d_100q.tar",
        "https://storage.yandexcloud.net/natasha-navec/packs/navec_hudlit_v1_12B_500K_300d_100q.tar")
navec = Navec.load(navec_model_path)

In [3]:
dd = list(tokenize())
dd

TypeError: __call__() missing 1 required positional argument: 'text'

In [25]:
def word2vec(word):
    if word == "":
        return navec['<pad>']
    res = navec.get(word.lower())
    if res is None:
        return navec['<unk>']
    return res


# def word2torch(word):
#     id = word2id(torch)
#     return navec[id]

def text2dataset(text):
    dataset_x = []
    dataset_y = []
    tokens = list(tokenize(text))
    for i in range(len(tokens)):
        if tokens[i].text == "." or tokens[i].text == ",":
            left = tokens[i - 1].text if i - 1 >= 0 else ""
            right = tokens[i + 1].text if i + 1 < len(tokens) else ""

            if left not in navec.vocab or right not in navec.vocab:
                continue

            dataset_x.append([word2vec(left), word2vec(right)])
            dataset_y.append([1])
        else:
            if random.random() < 0.8:
                continue

            left = tokens[i - 1].text if i - 1 >= 0 else ""
            right = tokens[i].text

            if left not in navec.vocab or right not in navec.vocab:
                continue

            dataset_x.append([word2vec(left), word2vec(right)])
            dataset_y.append([0])

    return torch.tensor(dataset_x), torch.tensor(dataset_y)

text2dataset("Год, тест. Тест. Тест")

(tensor([]), tensor([]))

In [26]:
lenta_path = download_file("lenta-ru-news.csv.gz", 
        "https://github.com/yutkin/Lenta.Ru-News-Dataset/releases/download/v1.1/lenta-ru-news.csv.bz2")
    
dataset_x, dataset_y = torch.Tensor(), torch.Tensor()

records = load_lenta2(lenta_path)
for record in records:
    x, y = text2dataset(record.text)
    dataset_x = torch.cat((dataset_x, x))
    dataset_y = torch.cat((dataset_y, y))
    print(dataset_x.shape[0])
    if dataset_x.shape[0] > 100000:
        break

dataset_x.shape, dataset_y.shape, sum(dataset_y) / dataset_y.shape[0]

24
32
82
96
115
153
172
205
243
296
340
357
375
404
412
469
513
569
592
613
651
678
700
732
750
799
853
881
897
934
959
978
1010
1047
1068
1128
1155
1198
1252
1276
1318
1374
1394
1410
1467
1480
1517
1541
1564
1596
1636
1664
1676
1687
1745
1776
1827
1862
1884
1909
1942
1964
1995
2022
2053
2063
2113
2139
2159
2195
2213
2226
2243
2265
2339
2358
2382
2395
2422
2430
2440
2466
2479
2501
2530
2581
2610
2646
2667
2683
2696
2719
2723
2747
2764
2795
2802
2832
2857
2909
2924
2947
2981
3004
3043
3066
3078
3102
3134
3151
3184
3196
3264
3294
3333
3336
3367
3424
3451
3458
3466
3485
3488
3517
3567
3591
3654
3737
3762
3798
3830
3853
3871
3895
3932
3964
3988
4018
4028
4079
4108
4124
4148
4192
4207
4291
4307
4325
4350
4371
4396
4404
4417
4450
4467
4493
4518
4539
4554
4596
4669
4723
4753
4782
4806
4817
4847
4883
4912
4965
5009
5029
5076
5079
5119
5135
5155
5167
5175
5210
5233
5299
5323
5362
5391
5401
5419
5448
5476
5480
5514
5562
5608
5624
5691
5784
5801
5831
5859
5888
5926
5988
6012
6061
6093
6166
6226
6

(torch.Size([100020, 2, 300]), torch.Size([100020, 1]), tensor([0.3097]))

In [8]:
output.shape, dataset_y_cuda.shape

(torch.Size([5001, 1]), torch.Size([5001, 1]))

In [36]:
model = torch.nn.Sequential(
    torch.nn.Flatten(1),
    torch.nn.Linear(600, 100),
    torch.nn.Dropout(0.5),
    # nn.BatchNorm1d(100),
    torch.nn.Tanh(),
    # torch.nn.Linear(200, 100),
    # torch.nn.ReLU(),
    torch.nn.Linear(100, 1),
    torch.nn.Sigmoid()
)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# loss_fn = torch.nn.MSELoss()
loss_fn = torch.nn.BCELoss()

model.cuda()
dataset_x_cuda = dataset_x.cuda()
dataset_y_cuda = dataset_y.cuda()

dataset_x_cuda_test = dataset_x_cuda[:1000]
dataset_y_cuda_test = dataset_y_cuda[:1000]
dataset_x_cuda_train = dataset_x_cuda[1000:]
dataset_y_cuda_train = dataset_y_cuda[1000:]

        # total_len = self.x.shape[0]
        # if test_count is not None:
        #     train_len = int(total_len - test_count)
        # else:
        #     train_len = int(0.9 * total_len)
        # test_len = int(total_len - train_len)
        # x_train, x_test = torch.utils.data.random_split(self.x, [train_len, test_len])

# model.cpu()
# dataset_x_cuda = dataset_x
# dataset_y_cuda = dataset_y


for i in range(10000):
    optimizer.zero_grad()
    output = model(dataset_x_cuda_train)
    loss = loss_fn(output, dataset_y_cuda_train)
    loss.backward()
    optimizer.step()
    print(binary_accuracy(output, dataset_y_cuda_train).item(), 
            binary_accuracy(model(dataset_x_cuda_test), dataset_y_cuda_test).item())
    # print(torch.nn.L1Loss()(output, dataset_y_cuda).item())


0.45697838068008423 0.6570000052452087
0.6486568450927734 0.6800000071525574
0.6779741644859314 0.6919999718666077
0.6838012337684631 0.6949999928474426
0.6888002157211304 0.699999988079071
0.6971924901008606 0.7170000076293945
0.7079681158065796 0.7229999899864197
0.7177135944366455 0.734000027179718
0.7299838662147522 0.7350000143051147
0.7359321117401123 0.7450000047683716
0.7450212240219116 0.7490000128746033
0.7539588212966919 0.7559999823570251
0.7573318481445312 0.75
0.7593819499015808 0.753000020980835
0.7623914480209351 0.7490000128746033
0.7640880346298218 0.7509999871253967
0.7648353576660156 0.7580000162124634
0.7662896513938904 0.7559999823570251
0.768501341342926 0.7590000033378601
0.7705715894699097 0.7620000243186951
0.7729953527450562 0.7649999856948853
0.7745909690856934 0.7730000019073486
0.7775802612304688 0.7720000147819519
0.779852569103241 0.7730000019073486
0.7813774943351746 0.7760000228881836
0.7833771109580994 0.7799999713897705
0.7844172716140747 0.777999997

KeyboardInterrupt: 

In [22]:
torch.metrics

AttributeError: module 'torch' has no attribute 'metrics'

In [21]:
torch.nn.L1Loss()(output > 0.5, dataset_y_cuda > 0.5)

RuntimeError: Subtraction, the `-` operator, with two bool tensors is not supported. Use the `^` or `logical_xor()` operator instead.

  return F.l1_loss(input, target, reduction=self.reduction)


tensor(0.5007, grad_fn=<L1LossBackward0>)

NameError: name 'slovnet' is not defined

In [None]:
def words2ids(words):
    return [navec.vocab[_] for _ in words]

