In [3]:
import numpy as np
import pandas as pd
from tg.grammar_ru.common import Loc
from collections import defaultdict

import re

from tg.common import DataBundle
from tg.common.ml.batched_training import train_display_test_split
from tg.grammar_ru.features import PyMorphyFeaturizer

from tg.grammar_ru.corpus import ITransfuseSelector
from nltk.stem import SnowballStemmer
from tg.projects.agreement.bundles_tools import _print_thrown


In [4]:
# Endings of first-declension nouns, e.g. "дядей", "землёй".
# Note: printed texts probably replace "ё" with "е".
first_declension_ends = {"а", "я", "ы", "и", "е", "у", "ю", "ой", "ёй", "ей"}
# second_declension_ends = set("а я ы и е у ю ой ёй ей ою ".split())
POSSIBLE_ENDINGS = first_declension_ends

# Stable (sorted) ordering of the endings and the ending -> class-number lookup.
ends_list = sorted(first_declension_ends)
num_by_end = dict(zip(ends_list, range(len(ends_list))))


def _extract_ending(word: str):
    for possible_ending in POSSIBLE_ENDINGS:
        if word.lower().endswith(possible_ending):
            return possible_ending
    return np.nan


class NounAgreementTrainIndexBuilder(ITransfuseSelector):
    """Marks nouns with a recognised ending as training targets.

    ``select`` annotates a corpus frame with ``is_target``,
    ``declension_type`` and ``label`` columns; ``build_index_from_src``
    turns such an annotated frame into a sample index.
    """

    def __init__(self):
        self.pmf = PyMorphyFeaturizer()
        # Endings the normal form of a noun must have for the noun to be
        # kept; only the keys are consulted inside this class.
        self.norm_endings_nums = {e: i for i, e in enumerate(['я', 'а'])}

    def _extract_norm_ending(self, word_in_norm_form: str):
        """Return the key of ``norm_endings_nums`` the word ends with, else NaN.

        The word is lower-cased before matching.
        """
        lowered = word_in_norm_form.lower()
        for possible_ending in self.norm_endings_nums:
            if lowered.endswith(possible_ending):
                return possible_ending
        return np.nan

    def select(self, source, df, toc_row):
        """Annotate ``df`` in place and return it wrapped in a list.

        Rows whose pymorphy ``POS`` is ``"NOUN"`` become targets when
        ``_extract_ending`` finds an ending on the word form and the normal
        form ends with one of the keys of ``norm_endings_nums``. Targets get
        ``is_target=True``, ``declension_type=1`` and ``label`` set to the
        ``num_by_end`` number of their ending. Every other row gets
        ``is_target=False``, ``declension_type=-1`` and ``label=-1``.

        Words of the discarded nouns are handed to ``_print_thrown`` together
        with the path ``Loc.temp_path / "noun_undefined_ending.txt"``.

        Args:
            source: Not used by this implementation.
            df: Frame with a ``word`` column. NOTE(review): its index is
                assumed to match the index of the ``pymorphy`` frame produced
                by the featurizer -- confirm.
            toc_row: Not used by this implementation.

        Returns:
            A one-element list containing the annotated ``df``.
        """
        db = DataBundle(src=df)
        self.pmf.featurize(db)
        # NaN cells are turned into the string 'nan' so that the string
        # methods applied to ``normal_form`` below do not fail on floats.
        morphed = db.data_frames['pymorphy'].replace({np.nan: 'nan'})

        nouns = df[morphed.POS == "NOUN"].copy()
        df['is_target'] = False
        df['declension_type'] = -1

        nouns['ending'] = nouns.word.apply(_extract_ending)
        nouns['norm_ending'] = (morphed.loc[nouns.index].normal_form
                                .apply(self._extract_norm_ending))

        undefined_ending_mask = (nouns.norm_ending.isnull() |
                                 nouns.ending.isnull())
        thrown = list(set(nouns[undefined_ending_mask].word))

        # Explicit copy: columns are assigned on the filtered frame below,
        # which on a plain slice raises SettingWithCopyWarning.
        nouns = nouns[~undefined_ending_mask].copy()
        nouns['declension_type'] = 1

        nouns['label'] = nouns.ending.map(num_by_end)
        unlabeled_mask = nouns.label.isnull()
        thrown.extend(nouns[unlabeled_mask].word)
        nouns = nouns[~unlabeled_mask]

        df.loc[nouns.index, 'declension_type'] = nouns['declension_type']
        df['declension_type'] = df['declension_type'].astype(int)
        df['label'] = -1
        df.loc[nouns.index, 'label'] = nouns.label
        # Keep ``label`` integral, consistently with ``declension_type``.
        df['label'] = df['label'].astype(int)
        df.loc[nouns.index, 'is_target'] = True
        _print_thrown(thrown, Loc.temp_path / "noun_undefined_ending.txt")
        return [df]

    @staticmethod
    def build_index_from_src(src_df):
        """Build the sample index from a frame annotated by ``select``.

        Keeps the rows where ``is_target`` is true and the columns
        ``word_id``, ``sentence_id``, ``declension_type`` and ``label``,
        renumbers them under an index named ``sample_id`` and adds a
        ``split`` column holding the result of ``train_display_test_split``.
        """
        index_columns = ['word_id', 'sentence_id', 'declension_type', 'label']
        df = src_df.loc[src_df.is_target, index_columns].copy()
        df = df.reset_index(drop=True)
        df.index.name = 'sample_id'
        df['split'] = train_display_test_split(df)
        return df


In [5]:
from tg.grammar_ru.corpus.corpus_reader import CorpusReader

# Run the selector on the first frame of the balanced corpus.
selector = NounAgreementTrainIndexBuilder()
source = Loc.corpus_path / 'prepare/balanced/books&pub_60K_balanced_feat.zip'
reader = CorpusReader(source)

# Table of contents of the corpus; the row at position
# `frame_index_in_corpus` is passed along with the first frame.
toc = reader.get_toc()
frame_index_in_corpus = 0
frame = reader.get_frames().first()
toc_row = toc.iloc[frame_index_in_corpus].to_dict()
frame_index_in_corpus += 1

dfs = selector.select(source, frame, toc_row)
            


In [9]:
# Number of rows flagged as targets in the annotated frame.
dfs[0]['is_target'].sum()

2414

In [10]:
# Total number of rows in the annotated frame.
dfs[0].shape[0]

49971