In [1]:
import pandas as pd
from tg.grammar_ru.common import Loc
from tg.grammar_ru.ml.corpus import CorpusReader
import re
import numpy as np

# Loading data

In [2]:
# Name of the corpus archive; it is resolved relative to Loc.corpus_path.
corpus_name = 'lenta.base.zip'
reader = CorpusReader(Loc.corpus_path/corpus_name)

In [3]:
# Read the corpus frames; the result is consumed with .first() in the next cell.
frames = reader.read_frames()

In [4]:
# Only the first frame of the corpus is used for the experiments in this notebook.
frame = frames.first()

In [5]:
# Preview the first rows to see which columns a corpus frame carries.
frame.head(10)

Unnamed: 0,word_id,sentence_id,word_index,paragraph_id,word_tail,word,word_type,word_length,file_id,corpus_id
0,0,0,0,0,1,Американские,ru,12,bbd53cdb-f9c2-4ee9-8a0e-0238e52253d8,lenta.base.zip
1,1,0,1,0,1,войска,ru,6,bbd53cdb-f9c2-4ee9-8a0e-0238e52253d8,lenta.base.zip
2,2,0,2,0,1,продвигаются,ru,12,bbd53cdb-f9c2-4ee9-8a0e-0238e52253d8,lenta.base.zip
3,3,0,3,0,1,к,ru,1,bbd53cdb-f9c2-4ee9-8a0e-0238e52253d8,lenta.base.zip
4,4,0,4,0,1,центру,ru,6,bbd53cdb-f9c2-4ee9-8a0e-0238e52253d8,lenta.base.zip
5,5,0,5,0,1,города,ru,6,bbd53cdb-f9c2-4ee9-8a0e-0238e52253d8,lenta.base.zip
6,6,0,6,0,0,Неджеф,ru,6,bbd53cdb-f9c2-4ee9-8a0e-0238e52253d8,lenta.base.zip
7,7,0,7,0,1,",",punct,1,bbd53cdb-f9c2-4ee9-8a0e-0238e52253d8,lenta.base.zip
8,8,0,8,0,1,сообщает,ru,8,bbd53cdb-f9c2-4ee9-8a0e-0238e52253d8,lenta.base.zip
9,9,0,9,0,1,агентство,ru,9,bbd53cdb-f9c2-4ee9-8a0e-0238e52253d8,lenta.base.zip


# Extracting words

In [6]:
def get_df_with_single_n(df):
    """Return the rows of ``df`` whose ``word`` contains a single 'н'.

    A word qualifies when it has an 'н' flanked on both sides by characters
    other than 'н', and no 'нн' occurs later in the word. Because a neighbour
    is required on each side, a word-initial or word-final 'н' does not
    qualify on its own.

    Args:
        df: DataFrame with a string ``word`` column.

    Returns:
        The matching subset of ``df``; the original index is preserved.
    """
    single_n_regex = r'[^н]н[^н](?!.*?нн)'

    # Filter the frame that was passed in (not a notebook-level global), so
    # the function works for any frame. na=False treats missing words as
    # non-matching, which keeps the mask strictly boolean.
    has_single_n = df['word'].str.contains(single_n_regex, na=False)

    return df[has_single_n]

def get_df_with_double_n(df):
    """Return the rows of ``df`` whose ``word`` contains a final 'нн'.

    A word qualifies when it contains 'нн' that is not followed, at a distance
    of at least one character, by another 'н'.

    Args:
        df: DataFrame with a string ``word`` column.

    Returns:
        The matching subset of ``df``; the original index is preserved.
    """
    double_n_regex = r'нн(?!.+?н)'

    # Filter the frame that was passed in (not a notebook-level global), so
    # the function works for any frame. na=False treats missing words as
    # non-matching, which keeps the mask strictly boolean.
    has_double_n = df['word'].str.contains(double_n_regex, na=False)

    return df[has_double_n]

In [7]:
def replace_double_n_to_single_and_lemmatize(nlp, df):
    """Collapse the last 'нн' of every word to 'н' and return the lemmas.

    Args:
        nlp: Callable that takes a text and returns an iterable of tokens,
            each exposing a ``lemma_`` attribute (presumably a spaCy
            pipeline; confirm against the caller).
        df: DataFrame with a string ``word`` column; every word is expected
            to contain 'нн'.

    Returns:
        Set of lemmas produced by ``nlp`` for the rewritten words.
    """
    # Reversing, replacing the first occurrence, and reversing back rewrites
    # the LAST 'нн' of each word. The pattern is a literal, so regex=False;
    # n is passed by keyword to make the "replace once" intent explicit.
    words_with_single_n = (
        df['word']
        .str[::-1]
        .str.replace('нн', 'н', n=1, regex=False)
        .str[::-1]
    )

    # The words are joined with '. ' before being passed to nlp in one call.
    return {token.lemma_ for token in nlp('. '.join(words_with_single_n))}

In [8]:
def extract_words_with_lemmatization(nlp, df):
    """Find lemmas that exist in both a single-'н' and a double-'н' spelling.

    The single-'н' words of ``df`` are lemmatized as they are; the double-'н'
    words are first rewritten to a single 'н' and then lemmatized. The result
    is the set of lemmas present in both groups.

    Args:
        nlp: Callable returning tokens with a ``lemma_`` attribute.
        df: DataFrame with a ``word`` column.

    Returns:
        Set of lemmas common to both groups.
    """
    single_n_rows = get_df_with_single_n(df)
    double_n_rows = get_df_with_double_n(df)

    collapsed_lemmas = replace_double_n_to_single_and_lemmatize(nlp, double_n_rows)

    single_n_text = '. '.join(single_n_rows['word'])
    single_n_lemmas = set()
    for token in nlp(single_n_text):
        single_n_lemmas.add(token.lemma_)

    # This is an intersection: only lemmas seen in both groups are kept.
    return single_n_lemmas.intersection(collapsed_lemmas)

In [21]:
# Re-import the bundle module so that edits to it are picked up without
# restarting the kernel, then rebind build_dictionary to the fresh version.
# NOTE(review): this cell carries execution count 21, i.e. it was run out of
# order, and the next cell imports build_dictionary again; confirm which of
# the two is meant to stay.
from importlib import reload
from tg.grammar_ru.ml.tasks.n_nn import bundle
build_dictionary = reload(bundle).build_dictionary

In [6]:
from tg.grammar_ru.ml.tasks.n_nn.bundle import build_dictionary

In [7]:
# build_dictionary receives a list of frames; here only the one loaded frame.
# NOTE(review): `words` is reassigned from a JSON file in a later cell, so
# this value is overwritten there.
words = build_dictionary([frame])

In [13]:
# Words with a single 'н': an 'н' flanked by non-'н' characters, with no 'нн'
# later in the word.
frame[frame.word.str.contains(r'[^н]н[^н](?!.*?нн)')].word

0         Американские
4               центру
9            агентство
14        командование
18         Специальные
              ...     
184984      подписание
184985      соглашения
184987     возвращению
184989       мигрантов
184992      транзитный
Name: word, Length: 35888, dtype: object

In [14]:
# Flip the spelling of every word: words matching the single-'н' pattern get
# their last 'н' doubled, all other words get their last 'нн' collapsed
# (a no-op for words without 'нн'). Reversing, replacing once, and reversing
# back is what targets the LAST occurrence.
# The patterns are literals, so regex=False is passed explicitly (the default
# differs between pandas versions), and n is passed by keyword.
replaced = np.where(
    frame.word.str.contains(r'[^н]н[^н](?!.*?нн)'),
    frame.word.str[::-1].str.replace('н', 'нн', n=1, regex=False).str[::-1],
    frame.word.str[::-1].str.replace('нн', 'н', n=1, regex=False).str[::-1]
)

# Index builder

In [19]:
# Re-import index_builders so that edits to the module are picked up without
# restarting the kernel, then rebind NNnIndexBuilder to the fresh class.
# NOTE(review): the execution count (19) is out of order relative to the
# neighbouring cells, and the next cell imports NNnIndexBuilder again.
from importlib import reload
from tg.grammar_ru.ml.tasks.train_index_builder import index_builders
NNnIndexBuilder = reload(index_builders).NNnIndexBuilder

In [15]:
from tg.grammar_ru.ml.tasks.train_index_builder.index_builders import TsaIndexBuilder, NNnIndexBuilder

In [16]:
# Copy the frame so that the is_target column added below does not touch `frame`.
df = frame.copy()

In [17]:
# Load `words` from a JSON file on disk.
# NOTE(review): the path is absolute and machine-specific, so this cell fails
# on any other machine; consider a path relative to the repository.
# NOTE(review): this overwrites the `words` produced by build_dictionary in an
# earlier cell.
from yo_fluq_ds import FileIO
words = FileIO.read_json('/home/alabai/studies/grammar_ru/grammar_ru/research/n_nn/words.json')

In [20]:
# The builder is constructed from the word list loaded above.
n_nn_index_builder = NNnIndexBuilder(words)

In [21]:
# Store the builder's targets as a column; later cells use it as a boolean mask.
# NOTE(review): _get_targets is private by naming convention (leading underscore).
df['is_target'] = n_nn_index_builder._get_targets(df)

In [23]:
# Presumably the positive (unmodified spelling) samples — TODO confirm against
# NNnIndexBuilder. _build_positive is private by naming convention.
positive = n_nn_index_builder._build_positive(df)

In [30]:
# Inspect the words of the rows flagged as targets.
positive[positive.is_target].word

20          подразделения
23                окраины
26        военизированных
39            применением
47                    дом
               ...       
184939         президента
184968           заключил
184984         подписание
184987        возвращению
184988         незаконных
Name: word, Length: 16789, dtype: object

In [35]:
# Build the negative sample from a copy of the positive one: non-target rows
# keep their word, target rows get their spelling flipped. Words matching the
# single-'н' pattern get their last 'н' doubled; all other target words get
# their last 'нн' collapsed, which leaves words without 'нн' unchanged.
# Reversing, replacing once, and reversing back targets the LAST occurrence.
# The patterns are literals, so regex=False is passed explicitly (the default
# differs between pandas versions), and n is passed by keyword.
negative = positive.copy()
negative['word'] = np.where(
    ~negative.is_target,
    negative.word,
    np.where(
        negative.word.str.contains(r'[^н]н[^н](?!.*?нн)'),
        negative.word.str[::-1].str.replace('н', 'нн', n=1, regex=False).str[::-1],
        negative.word.str[::-1].str.replace('нн', 'н', n=1, regex=False).str[::-1]
    )
)

In [36]:
# Inspect the rewritten words of the target rows.
negative[negative.is_target].word

20        подразделенния
23              окраинны
26        военизированых
39          примененнием
47                   дом
               ...      
184939       президеннта
184968          заключил
184984       подписанние
184987      возвращеннию
184988         незаконых
Name: word, Length: 16789, dtype: object