In [1]:
import pandas as pd
import numpy as np

from tg.common import DataBundle
from tg.grammar_ru.common import Loc

Building vocab

In [2]:
# Load the 'punct/9kk' bundle from the project's bundles directory.
# Later cells read its `src` frame (the `word` and `word_id` columns).
db = DataBundle.load(Loc.bundles_path/'punct/9kk')

In [3]:
import typing as tp

import pandas as pd
import numpy as np
import pymorphy2
from nltk.corpus import stopwords

from tg.common import DataBundle

In [4]:
class VocabBuilder:
    """Builds a frequency-ranked vocabulary of normalized words.

    Every word is replaced by the ``normal_form`` of its first pymorphy2
    parse, the normal forms are counted, and the most frequent ones receive
    consecutive ids starting at 1. Id 0 is reserved for the ``'UNK'`` entry.
    """

    def __init__(self, lang='russian'):
        """
        Args:
            lang: Language name handed to ``stopwords.words`` when
                ``build_vocab`` is called with ``filter_stop_words=True``.
        """
        self._morph_analyzer = pymorphy2.MorphAnalyzer()
        # Stored so build_vocab can fetch the stop-word list lazily, only
        # when filtering is actually requested.
        self._lang = lang
        # word -> normal form; shared across calls so each distinct word is
        # parsed only once.
        self._cached_words = {}

    def build_vocab(self, db: "DataBundle", vocab_size=10_000, filter_stop_words=False) -> pd.DataFrame:
        """Build the vocabulary frame from the ``word`` column of ``db.src``.

        Args:
            db: Bundle whose ``src`` frame has a ``word`` column.
            vocab_size: Maximum number of distinct normal forms to keep
                (the ``'UNK'`` row is added on top of these).
            filter_stop_words: When true, drop normal forms found in the
                stop-word list for the language given to the constructor.

        Returns:
            Frame indexed by ``word`` with columns ``count`` and ``vocab_id``,
            sorted by ``vocab_id``.
        """
        normalized = pd.Series(self.get_normalized_words(db.src.word))
        if filter_stop_words:
            stop_words = stopwords.words(self._lang)
            normalized = normalized[~normalized.isin(stop_words)]

        # value_counts() orders by descending frequency, so a positional
        # slice takes the most frequent entries.
        taken_words = normalized.value_counts().iloc[:vocab_size]

        return self._convert_to_vocab_frame(taken_words)

    def _convert_to_vocab_frame(self, taken_words: pd.Series) -> pd.DataFrame:
        """Turn a word -> count series into the vocabulary frame.

        Words keep the order of ``taken_words`` and get ids 1..N; an extra
        ``'UNK'`` row with count 0 and id 0 is placed first.
        """
        vocab_df = taken_words.to_frame(name='count')
        vocab_df['vocab_id'] = np.arange(1, len(taken_words) + 1)
        vocab_df.index.name = 'word'

        # Row values follow the column order: [count, vocab_id].
        vocab_df.loc['UNK'] = [0, 0]

        return vocab_df.sort_values('vocab_id')

    def get_normal_form(self, word: str) -> str:
        """Return the normal form of the first pymorphy2 parse of ``word``."""
        return self._morph_analyzer.parse(word)[0].normal_form

    def get_normalized_words(self, words: tp.Iterable[str]) -> tp.Sequence[str]:
        """Map each word to its normal form, preserving order and length.

        Results are memoized in ``self._cached_words`` so repeated words are
        not re-parsed.
        """
        cache = self._cached_words
        normalized_words = []
        for word in words:
            if word not in cache:
                cache[word] = self.get_normal_form(word)
            normalized_words.append(cache[word])

        return normalized_words


In [5]:
# Build the vocabulary from the loaded bundle with the builder's defaults
# (vocab_size=10_000, stop words not filtered).
vocab_builder = VocabBuilder()
vocab_df = vocab_builder.build_vocab(db)

In [6]:
vocab_df

Unnamed: 0_level_0,count,vocab_id
word,Unnamed: 1_level_1,Unnamed: 2_level_1
UNK,0,0
",",870752,1
.,366886,2
в,272738,3
и,205288,4
...,...,...
бабочка,57,9996
солженицын,57,9997
очнуться,57,9998
беляев,57,9999


In [7]:
# Persist the vocabulary twice: next to the bundle, and as a copy in the
# current working directory (the relative path VocabFeaturizer is given later).
vocab_df.to_parquet(Loc.bundles_path/'punct/9kk/word_to_vocab.parquet')
vocab_df.to_parquet('word_to_vocab.parquet')

Vocab and Navec featurizers

In [8]:
from tg.grammar_ru.features import SimpleFeaturizer

In [9]:
class VocabFeaturizer(SimpleFeaturizer):
    """Maps every word of a bundle to an id from a saved vocabulary frame.

    The vocabulary is read from a parquet file indexed by ``word`` with a
    ``vocab_id`` column. Words are matched by the ``normal_form`` of their
    first pymorphy2 parse; words missing from the vocabulary get id 0.
    """

    def __init__(self, path_to_vocab):
        """
        Args:
            path_to_vocab: Path of the parquet file holding the vocabulary.
        """
        super().__init__('vocab')
        self.path_to_vocab = path_to_vocab
        self.morpy_analyzer = pymorphy2.MorphAnalyzer()

    def _featurize_inner(self, db):
        """Return a frame indexed by ``word_id`` with ``word`` and ``vocab_id``.

        Side effect: writes a ``normalized_word`` column into ``db.src``.
        """
        vocab_df = pd.read_parquet(self.path_to_vocab)

        db.src['normalized_word'] = self._get_normalized_words(db.src.word)
        # Left join keeps every source row; `right_on='word'` refers to the
        # vocabulary's index level, so the source `word` column is untouched.
        merged = db.src.merge(vocab_df, how='left', left_on='normalized_word', right_on='word')
        result = merged[['word_id', 'word', 'vocab_id']].copy()
        # Unmatched rows come back as NaN; 0 is the id given to out-of-vocabulary words.
        result['vocab_id'] = result['vocab_id'].fillna(0).astype(int)

        return result.set_index('word_id', drop=True)

    def _get_normal_form(self, word):
        """Return the normal form of the first pymorphy2 parse of ``word``."""
        return self.morpy_analyzer.parse(word)[0].normal_form

    def _get_normalized_words(self, words):
        """Map each item of ``words`` to its normal form, preserving order.

        Iterates the ``words`` argument itself, so the method does not depend
        on any notebook-level variable. A per-call cache avoids re-parsing
        repeated words.
        """
        cached_words = {}
        normalized_words = []
        for word in words:
            if word not in cached_words:
                cached_words[word] = self._get_normal_form(word)
            normalized_words.append(cached_words[word])

        return normalized_words

In [10]:
class NavecFeaturizer(SimpleFeaturizer):
    """Maps every word of a bundle to its id in a Navec vocabulary.

    Words are looked up by the ``normal_form`` of their first pymorphy2
    parse; words absent from the Navec vocabulary get ``vocab.unk_id``.
    """

    def __init__(self, navec):
        """
        Args:
            navec: Loaded Navec object; only its ``vocab`` (``get`` and
                ``unk_id``) is used here.
        """
        # Initialize the base featurizer the same way VocabFeaturizer does
        # (a single name argument); the original skipped this call.
        # NOTE(review): the name 'navec' is chosen here — confirm it suits SimpleFeaturizer.
        super().__init__('navec')
        self.navec = navec
        self.morpy_analyzer = pymorphy2.MorphAnalyzer()

    def _featurize_inner(self, db):
        """Return a frame indexed by ``word_id`` with ``word`` and ``navec_id``.

        Side effect: writes a ``normalized_word`` column into ``db.src``.
        """
        db.src['normalized_word'] = self._get_normalized_words(db.src.word)
        # Use the instance's own model rather than a notebook-level `navec`,
        # so the class works regardless of cell execution order.
        vocab = self.navec.vocab
        word_ids = [vocab.get(word, vocab.unk_id) for word in db.src.normalized_word]
        result = db.src[['word_id', 'word']].copy()
        result['navec_id'] = word_ids
        result['navec_id'] = result['navec_id'].astype(int)

        return result.set_index('word_id', drop=True)

    def _get_normal_form(self, word):
        """Return the normal form of the first pymorphy2 parse of ``word``."""
        return self.morpy_analyzer.parse(word)[0].normal_form

    def _get_normalized_words(self, words):
        """Map each item of ``words`` to its normal form, preserving order.

        Iterates the ``words`` argument itself, so the method does not depend
        on any notebook-level variable. A per-call cache avoids re-parsing
        repeated words.
        """
        cached_words = {}
        normalized_words = []
        for word in words:
            if word not in cached_words:
                cached_words[word] = self._get_normal_form(word)
            normalized_words.append(cached_words[word])

        return normalized_words

In [12]:
# Point the featurizer at the working-directory copy of the vocabulary saved above.
vocab_featurizer = VocabFeaturizer('word_to_vocab.parquet')

In [13]:
# Call the featurizer's inner method directly on the whole bundle.
# Note: this also adds a `normalized_word` column to db.src as a side effect.
featurized = vocab_featurizer._featurize_inner(db)

In [14]:
# Store the word_id -> vocab_id mapping next to the bundle.
featurized.to_parquet(Loc.bundles_path/'punct/9kk/sample_to_vocab.parquet')

In [15]:
from navec import Navec


# Load the Navec archive; the path is relative, so the .tar file must be
# present in the current working directory.
navec = Navec.load('navec_hudlit_v1_12B_500K_300d_100q.tar')

In [16]:
# Featurize the bundle with Navec ids.
# Note: `featurized` is rebound here and no longer refers to the vocab-id
# frame produced by the VocabFeaturizer cell.
navec_featurizer = NavecFeaturizer(navec)
featurized = navec_featurizer._featurize_inner(db)
featurized

Unnamed: 0_level_0,word,navec_id
word_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,По,302187
1,словам,403005
2,Шварца,484700
3,",",500000
4,все,74009
...,...,...
9514098,разговоре,361422
9514099,с,383451
9514100,The,12631
9514101,Guardian,5383


In [17]:
# Store the word_id -> navec_id mapping next to the bundle.
featurized.to_parquet(Loc.bundles_path/'punct/9kk/sample_to_navec.parquet')