In [1]:
import numpy as np
import pandas as pd
from tg.grammar_ru.common import Loc

import re

from tg.common import DataBundle
from tg.common.ml.batched_training import train_display_test_split
from tg.grammar_ru.features import PyMorphyFeaturizer

from tg.grammar_ru.corpus import ITransfuseSelector
from nltk.stem import SnowballStemmer
from pymystem3 import Mystem

# mystem = Mystem()

# Ending inventories for three adjective declension patterns. The set names look
# like English glosses of sample adjectives (presumably "новый", "хороший" and
# "большой") -- TODO confirm with the author.
new = {
    'ый', 'ая', 'ое', 'ые',
    'ого', 'ой', 'ому', 'ую',
    'ым', 'ою', 'ыми', 'ом', 'ых',
}

good = {
    'ий', 'ая', 'яя', 'ее', 'ие',
    'его', 'ей', 'ему', 'ую', 'юю',
    'им', 'ими', 'ем', 'их',
}

big = {
    'ой', 'ая', 'ое', 'ие',
    'ого', 'ому', 'ую',
    'им', 'ою', 'ими', 'ом', 'их',
}

# Every ending that may be assigned as a label: the union of the three sets above.
POSSIBLE_ENDINGS = new | good | big


# def _get_poses_by_sentence(sentence: str):
#     # NOTE: краткие прилагательные mystem'ом отмечаются как прилагательные. e.g. Хороша, плох.
#     res = []
#     for word_info in mystem.analyze(sentence):
#         if 'analysis' not in word_info or not word_info["analysis"]:
#             continue
#         res.append(
#             (word_info["text"],
#              re.split(
#                  ',|=', word_info["analysis"][0]["gr"])[0])
#         )
#     return res


# def _set_mystem_pos(df):
#     df['pos_mystem'] = np.nan
#     for sent_id, tokens_group in df.groupby("sentence_id"):
#         sentence = ' '.join(tokens_group.word)
#         poses = _get_poses_by_sentence(sentence)
#         if not poses:
#             continue
#         j = 0
#         for i in tokens_group.index:  # zip with gaps
#             word, pos = poses[j]
#             if df.at[i, 'word'] == word:
#                 df.at[i, 'pos_mystem'] = pos
#                 j += 1
#                 if j == len(poses):
#                     break


def _extract_ending(word: str):
    """Return the entry of ``POSSIBLE_ENDINGS`` that ``word`` ends with.

    The comparison is case-insensitive: the word is lower-cased before matching.

    Args:
        word: a single token.

    Returns:
        The matching ending (a lower-case string from ``POSSIBLE_ENDINGS``),
        or ``np.nan`` when the word ends with none of them.
    """
    # Lower-case once instead of on every loop iteration.
    lowered = word.lower()
    # As currently defined, no ending in POSSIBLE_ENDINGS is a suffix of another
    # one, so at most one candidate can match and the (arbitrary) iteration
    # order of the set does not influence the result.
    # TODO: a lookup of the last 2-3 characters would avoid the linear scan.
    for possible_ending in POSSIBLE_ENDINGS:
        if lowered.endswith(possible_ending):
            return possible_ending
    return np.nan


class AdjAgreementTrainIndexBuilder(ITransfuseSelector):
    """Marks full-form adjectives in a token frame and labels them by their ending.

    ``select`` adds three columns to the frame it receives:

    * ``is_target`` -- True for adjectives whose endings were recognised;
    * ``declension_type`` -- position of the normal-form ending in
      ``['ый', 'ий', 'ой']``, or -1 for non-target rows;
    * ``label`` -- position of the word's ending in the sorted
      ``POSSIBLE_ENDINGS``, or -1 for non-target rows.
    """

    def __init__(self):
        self.pmf = PyMorphyFeaturizer()
        # Not used by any method of this class; kept so that existing attribute
        # access from outside keeps working.
        self.snowball = SnowballStemmer(language="russian")
        # Normal-form ending -> declension type number.
        self.norm_endings_nums = {
            e: i for i, e in enumerate(['ый', 'ий', 'ой'])}
        # Word ending -> label number (alphabetical order of the endings).
        self.endings_nums = {
            e: i for i, e in enumerate(sorted(POSSIBLE_ENDINGS))}

    def _extract_norm_ending(self, word_in_norm_form: str):
        """Return which of the normal-form endings the word ends with, or NaN.

        The check is case-insensitive and tries the keys of
        ``self.norm_endings_nums`` in insertion order.
        """
        lowered = word_in_norm_form.lower()
        for possible_ending in self.norm_endings_nums:
            if lowered.endswith(possible_ending):
                return possible_ending
        return np.nan

    def select(self, source, df, toc_row):  # ~build_train_index
        """Annotate ``df`` in place with adjective targets and return ``[df]``.

        Args:
            source: not read by this implementation.
            df: token frame; must have a ``word`` column. It is modified in
                place (columns ``is_target``, ``declension_type``, ``label``).
            toc_row: not read by this implementation.

        Returns:
            A one-element list holding the same ``df`` object.

        Side effects:
            Appends every adjective whose ending or normal-form ending could
            not be determined to ``Loc.temp_path / "undefined_ending.txt"``.
        """
        db = DataBundle(src=df)
        self.pmf.featurize(db)  # stores its result under the 'pymorphy' key
        # Turn NaN cells into the string 'nan' so that the string helpers below
        # never receive a float. Non-inplace: the featurizer's frame is left
        # untouched.
        morphed = db.data_frames['pymorphy'].replace({np.nan: 'nan'})

        # Snapshot of the adjective rows, taken before the new columns are added.
        adjectives = df[morphed.POS == "ADJF"].copy()

        df['is_target'] = False
        df['declension_type'] = -1

        adjectives['ending'] = adjectives.word.apply(_extract_ending)
        morphed_adjectives = morphed.loc[adjectives.index]
        adjectives['norm_ending'] = (morphed_adjectives.normal_form
                                     .apply(self._extract_norm_ending))

        undefined_ending_mask = (adjectives.norm_ending.isnull() |
                                 adjectives.ending.isnull())
        # The words are Cyrillic, so the encoding is fixed explicitly instead
        # of depending on the platform's locale default.
        with open(Loc.temp_path / "undefined_ending.txt", "a",
                  encoding="utf-8") as myfile:
            myfile.writelines(
                f'{w}\n' for w in adjectives[undefined_ending_mask].word)

        # NOTE: words whose ending could not be determined are dropped,
        # e.g. "волчий".
        adjectives = adjectives[~undefined_ending_mask]

        # Every remaining value is a key of the corresponding dict, so ``map``
        # is a plain lookup and yields integers directly.
        df.loc[adjectives.index, 'declension_type'] = (
            adjectives.norm_ending.map(self.norm_endings_nums))
        df['declension_type'] = df['declension_type'].astype(int)

        df['label'] = -1
        df.loc[adjectives.index, 'label'] = (
            adjectives.ending.map(self.endings_nums))
        df['label'] = df['label'].astype(int)

        df.loc[adjectives.index, 'is_target'] = True
        return [df]

    @staticmethod
    def build_index_from_src(src_df):
        """Build the sample index from a frame annotated by ``select``.

        Keeps only the ``is_target`` rows with the columns ``word_id``,
        ``sentence_id`` and ``label``, renumbers them from zero as
        ``sample_id`` and adds a ``split`` column produced by
        ``train_display_test_split``.
        """
        index_df = src_df.loc[
            src_df.is_target, ['word_id', 'sentence_id', 'label']].copy()
        index_df = index_df.reset_index(drop=True)
        index_df.index.name = 'sample_id'
        index_df['split'] = train_display_test_split(index_df)
        return index_df


In [6]:
from tg.grammar_ru import CorpusReader

# Selector that annotates adjectives in each corpus frame.
builder = AdjAgreementTrainIndexBuilder()

# Corpus archive to read, and its first frame for quick experiments.
corpus_file = Loc.corpus_path / 'prepare/balanced/books&pub_60K_balanced_feat.zip'
reader = CorpusReader(corpus_file)
df = reader.get_frames().first()

In [9]:
# `select` never reads its `source` and `toc_row` arguments, so pass explicit
# placeholders rather than IPython's `_` (the last cell output), whose value
# depends on which cell happened to run before this one.
# Note: `select` annotates `df` in place, so `ind` is the same object as `df`.
ind = builder.select(None, df, None)[0]
# ind[ind.is_target]

Unnamed: 0,word_id,sentence_id,word_index,paragraph_id,word_tail,word,word_type,word_length,file_id,corpus_id,original_corpus_id,original_word_id,original_sentence_id,original_paragraph_id,updated,is_target,declension_type,label
14,14,0,14,0,1,другие,ru,6,45f6d5b5-3e14-4346-b848-003eed141143,books&pub_60K_balanced_feat.zip,books.base.zip,88,4,1,False,True,2,6
15,15,0,15,0,1,русскоязычные,ru,13,45f6d5b5-3e14-4346-b848-003eed141143,books&pub_60K_balanced_feat.zip,books.base.zip,89,4,1,False,True,0,18
20,20,1,0,1,1,Приятного,ru,9,45f6d5b5-3e14-4346-b848-003eed141143,books&pub_60K_balanced_feat.zip,books.base.zip,122,7,3,False,True,0,11
96,96,78,8,76,1,злосчастный,ru,11,45f6d5b5-3e14-4346-b848-003eed141143,books&pub_60K_balanced_feat.zip,books.base.zip,10754,10715,10711,False,True,0,19
104,104,78,16,76,1,неприятных,ru,10,45f6d5b5-3e14-4346-b848-003eed141143,books&pub_60K_balanced_feat.zip,books.base.zip,10762,10715,10711,False,True,0,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50162,50162,49522,21,49517,1,чужие,ru,5,45f6d5b5-3e14-4346-b848-003eed141143,books&pub_60K_balanced_feat.zip,books.base.zip,445672,443264,443170,False,True,2,6
50183,50183,49523,2,49518,1,двойной,ru,7,45f6d5b5-3e14-4346-b848-003eed141143,books&pub_60K_balanced_feat.zip,books.base.zip,445813,443274,443178,False,True,2,13
50186,50186,49523,5,49518,1,старый,ru,6,45f6d5b5-3e14-4346-b848-003eed141143,books&pub_60K_balanced_feat.zip,books.base.zip,445816,443274,443178,False,True,0,19
50240,50240,49525,25,49519,1,Темного,ru,7,45f6d5b5-3e14-4346-b848-003eed141143,books&pub_60K_balanced_feat.zip,books.base.zip,445928,443280,443180,False,True,0,11


In [5]:
# Run the selector over every frame of the corpus.
# The loop variable is deliberately not called `df`, so the frame loaded for
# the single-frame experiments is not overwritten; `None` replaces IPython's
# `_` as the placeholder for the two arguments `select` does not read.
for frame in reader.get_frames():
    builder.select(None, frame, None)

KeyboardInterrupt: 