In [14]:
import pandas as pd
from tg.grammar_ru.common import Loc
from tg.grammar_ru.ml.corpus import CorpusReader
import re
import numpy as np
import zipfile
import io

# Loading data

In [2]:
reader = CorpusReader(Loc.corpus_path/'lenta.base.zip')

In [3]:
frames = reader.get_frames()

In [4]:
df = frames.first()

# Чтобы что бы

In [44]:
def get_chto_by(df):
    """Select the rows that form the two-word spelling 'что' + 'бы'.

    A pair is recognised when a token equal to 'что' is directly followed,
    inside the same sentence (same ``sentence_id``, ``word_index + 1``), by a
    token equal to 'бы'. The comparison is exact, i.e. case-sensitive.

    Parameters
    ----------
    df : pd.DataFrame
        Token frame with at least the columns ``word``, ``word_id``,
        ``sentence_id`` and ``word_index``.

    Returns
    -------
    pd.DataFrame
        The 'что' rows and the 'бы' rows of every detected pair, sorted by
        ``word_id``.
    """
    chto_rows = df.loc[df['word'] == 'что']

    # Coordinates of the token that immediately follows each 'что'.
    following_keys = chto_rows[['sentence_id', 'word_index']].assign(
        word_index=lambda keys: keys['word_index'] + 1
    )
    followers = df.merge(following_keys, on=['sentence_id', 'word_index'], how='inner')

    by_rows = followers.loc[followers['word'] == 'бы']

    # The 'что' of a pair is located through word_id: it is the id of 'бы' minus one.
    chto_before_by = chto_rows.merge(by_rows['word_id'] - 1, on=['word_id'], how='inner')

    return pd.concat([chto_before_by, by_rows]).sort_values(['word_id'])

In [384]:
chto_by = get_chto_by(df)
chto_by

Unnamed: 0,word_id,sentence_id,word_index,paragraph_id,word_tail,word,word_type,word_length,original_word_id,original_sentence_id,original_paragraph_id,updated,file_id,corpus_id,original_corpus_id,is_target
0,241295,197321,22,195180,1,что,ru,3,46295,2321,180,False,092b10a4-1328-4cd6-bae6-0367fd502a19,lenta.base.zip,lenta.base.zip,True
438,241296,197321,23,195180,1,бы,ru,2,46296,2321,180,False,092b10a4-1328-4cd6-bae6-0367fd502a19,lenta.base.zip,lenta.base.zip,False


In [385]:
def get_targets(df):
    """Mark the tokens that are targets of the 'чтобы' / 'что бы' task.

    A token is a target when it is either

    * the one-word spelling 'чтобы', or
    * a 'что' whose next token (``word_id + 1``) is 'бы'.

    Both spellings are matched case-insensitively, so a sentence-initial
    'Что бы' is detected in the same way as 'Чтобы'.

    Parameters
    ----------
    df : pd.DataFrame
        Token frame with the columns ``word_id`` and ``word``. Adjacency is
        decided from ``word_id`` alone, exactly as before.

    Returns
    -------
    numpy.ndarray
        Boolean array aligned with the rows of ``df``.
    """
    # Lower-case once so that every comparison below is case-insensitive.
    words = df.set_index('word_id')['word'].str.lower()

    is_chtoby = words == 'чтобы'
    is_chto = words == 'что'

    # A 'что' belongs to the split spelling when the id right after it is a 'бы'.
    by_ids = words.index[(words == 'бы').to_numpy()]
    followed_by_by = words.index.isin(by_ids - 1)

    targets = is_chtoby | (is_chto & followed_by_by)

    return targets.values

## Negative sample building

In [386]:
import re
import itertools

In [387]:
sentence = 'что бы то ни было, чтобы это решить.'

In [490]:
# Matches either spelling of the target: the split form 'что бы' or the
# joined form 'чтобы', with an upper- or lower-case first letter.
target_regex = r'[чЧ]то бы|[чЧ]тобы'
# NOTE(review): with re.IGNORECASE the explicit [чЧ] class is redundant here;
# it only matters for callers that apply the pattern without that flag.
re.findall(target_regex, sentence, re.IGNORECASE)

['Чтобы']

In [468]:
def get_alternatives(word):
    """Return both spellings for a matched target, the given one first.

    Parameters
    ----------
    word : str
        Either 'чтобы' or 'что бы', in any letter case.

    Returns
    -------
    list of str
        ``[word, other_spelling]`` where ``other_spelling`` is ``word`` with
        the space inserted (for 'чтобы') or removed (for 'что бы'). The letter
        case of every character is preserved.

    Raises
    ------
    ValueError
        If ``word`` is neither of the two spellings.
    """
    normalized = word.lower()

    if normalized == 'чтобы':
        # Insert the space after 'что' in the original string, so the casing of
        # every letter survives (e.g. 'ЧТОБЫ' -> 'ЧТО БЫ').
        return [word, word[:3] + ' ' + word[3:]]

    if normalized == 'что бы':
        # Drop the separating space (index 3) and keep the rest untouched.
        return [word, word[:3] + word[4:]]

    raise ValueError(
        f"expected 'чтобы' or 'что бы' (in any letter case), got {word!r}"
    )

In [464]:
sentence = 'Чтобы принять участие в конкурсе, необходимо зарегистрироваться на сайте axeapollo.com. '

In [494]:
def get_word_combinations(sentence, target_regex):
    """Enumerate every re-spelling of the targets found in ``sentence``.

    Each match of ``target_regex`` (searched case-insensitively) may stay as it
    is or be swapped for its alternative spelling. All resulting tuples are
    returned except the one that reproduces the sentence's own spelling.

    Returns
    -------
    set of tuple of str
        One tuple per combination; each tuple holds one spelling per match, in
        the order the matches occur in the sentence.
    """
    found = re.findall(target_regex, sentence, re.IGNORECASE)
    options_per_match = [get_alternatives(match) for match in found]

    every_combination = set(itertools.product(*options_per_match))
    # The unchanged spelling is the positive sample, not a negative one.
    every_combination.discard(tuple(found))

    return every_combination

In [495]:
get_word_combinations(sentence, target_regex)

{('Что бы',)}

In [496]:
re.split(target_regex, sentence)

['',
 ' принять участие в конкурсе, необходимо зарегистрироваться на сайте axeapollo.com. ']

In [497]:
def join_sentence_parts(sentence_parts, words_combination, starts_with_target):
    """Interleave sentence fragments with target spellings into one string.

    Parameters
    ----------
    sentence_parts : sequence of str
        Text fragments that lie between (and around) the targets.
    words_combination : sequence of str
        The spelling to put at each target position.
    starts_with_target : bool
        When true the result begins with a target word, otherwise with a
        sentence fragment.

    Returns
    -------
    str
        The reassembled sentence.
    """
    slots = [''] * (len(words_combination) + len(sentence_parts))

    # Whichever kind comes first occupies the even slots, the other the odd ones.
    word_offset, part_offset = (0, 1) if starts_with_target else (1, 0)
    slots[word_offset::2] = words_combination
    slots[part_offset::2] = sentence_parts

    return ''.join(slots)

In [498]:
def build_negative_samples_from_sentence(sentence, target_regex):
    """Build all mis-spelled variants of ``sentence``.

    The sentence is cut at every match of ``target_regex`` and reassembled once
    for each spelling combination returned by ``get_word_combinations``.

    Parameters
    ----------
    sentence : str
        Source sentence text.
    target_regex : str
        Pattern that matches the target spellings; applied case-insensitively.

    Returns
    -------
    list of str
        One rebuilt sentence per spelling combination (empty when the sentence
        contains no target).
    """
    # `flags` has to be passed by keyword: the third positional parameter of
    # re.split is `maxsplit`, so a positional re.IGNORECASE would silently cap
    # the number of splits at 2 and leave the match case-sensitive.
    sentence_parts = re.split(target_regex, sentence, flags=re.IGNORECASE)

    starts_with_target = sentence_parts[0] == ''
    if starts_with_target:
        # Drop only the leading empty piece. Other empty pieces (between two
        # adjacent targets, or after a sentence-final target) must stay so that
        # fragments and target words keep alternating one-to-one.
        sentence_parts = sentence_parts[1:]

    return [
        join_sentence_parts(sentence_parts, combination, starts_with_target)
        for combination in get_word_combinations(sentence, target_regex)
    ]

In [499]:
build_negative_samples_from_sentence(sentence, target_regex)

['Что бы принять участие в конкурсе, необходимо зарегистрироваться на сайте axeapollo.com. ']

In [500]:
# NOTE(review): `sentences_df` is not defined at notebook level in any visible
# cell (it only exists as a local inside build_negative_sample_from_positive),
# so this cell appears to depend on leftover kernel state.
negative = sentences_df.apply(build_negative_samples_from_sentence, target_regex=target_regex).explode()

In [518]:
from tg.grammar_ru.common.separator import Separator


def build_negative_sample_from_positive(positive_sample: pd.DataFrame) -> pd.DataFrame:
    """Turn a frame of correctly spelled sentences into a frame of negative samples.

    Sentences are rendered back to text, every target is re-spelled with
    ``build_negative_samples_from_sentence`` (using the notebook-level
    ``target_regex``), and the resulting text is tokenised again with
    ``Separator.separate_string``.

    Parameters
    ----------
    positive_sample : pd.DataFrame
        Token frame with the columns ``word``, ``word_tail`` and
        ``sentence_id``.

    Returns
    -------
    pd.DataFrame
        Token frame of the negative sentences with two extra columns:
        ``is_target`` (from ``ChtobyIndexBuilder()._get_targets``) and
        ``label`` (always 1).
    """
    # Re-attach the whitespace after each token.
    # presumably word_tail is the number of spaces following the word — TODO confirm
    trailing_spaces = pd.Series(' ', index=positive_sample.index) * positive_sample.word_tail
    printable = positive_sample.assign(word_print=positive_sample.word + trailing_spaces)

    # Summing strings concatenates them, giving one text per sentence.
    sentence_texts = printable.groupby('sentence_id').word_print.sum()

    corrupted_sentences = sentence_texts.apply(
        build_negative_samples_from_sentence,
        target_regex=target_regex,
    ).explode()

    negative = Separator.separate_string(corrupted_sentences.str.cat(sep=' '))
    negative['is_target'] = ChtobyIndexBuilder()._get_targets(negative)
    negative['label'] = 1

    return negative

In [519]:
negative_sample = build_negative_sample_from_positive(positive)

In [512]:
negative_sample[negative_sample['is_target'] & (negative_sample['word'] == 'чтобы')]

Unnamed: 0,word_id,sentence_id,word_index,paragraph_id,word_tail,word,word_type,word_length,is_target,label
808,808,27,22,0,1,чтобы,ru,5,True,1
1110,1110,40,9,0,1,чтобы,ru,5,True,1
1176,1176,41,14,0,1,чтобы,ru,5,True,1
4008,4008,144,10,0,1,чтобы,ru,5,True,1
4040,4040,145,15,0,1,чтобы,ru,5,True,1


## Building index

In [67]:
import numpy as np
from tg.grammar_ru.ml.tasks.train_index_builder.index_builders import ChtobyIndexBuilder

In [130]:
import importlib
from tg.grammar_ru.ml.tasks.train_index_builder import index_builders

ChtobyIndexBuilder = importlib.reload(index_builders).ChtobyIndexBuilder

In [68]:
builder = ChtobyIndexBuilder()

In [132]:
index = builder.build_train_index(df)

In [133]:
positive, negative = index

## Filtering sentences

In [19]:
from tg.grammar_ru.ml.tasks.train_index_builder.sentence_filterer import ChtobyFilterer

In [21]:
from importlib import reload
from tg.grammar_ru.ml.tasks.train_index_builder import sentence_filterer
ChtobyFilterer = reload(sentence_filterer).ChtobyFilterer

In [None]:
filterer = ChtobyFilterer()
filtered_df = filterer.get_filtered_df(df)
filtered_df

# Balancing

In [2]:
from tg.grammar_ru.common import Loc

In [8]:
corpuses = [
    Loc.corpus_path/'lenta.base.zip',
    Loc.corpus_path/'proza.base.zip',
]

In [9]:
from tg.grammar_ru.ml.tasks.train_index_builder.sentence_filterer import ChtobyFilterer
from tg.grammar_ru.ml.corpus import CorpusBuilder

In [None]:
# Run every source corpus through ChtobyFilterer and write one filtered corpus
# per source.
for corpus_path in corpuses:
    # Corpus short name is the file name up to the first dot,
    # e.g. 'lenta.base.zip' -> 'lenta'.
    corpus_name = corpus_path.name.split('.')[0]
    # NOTE(review): the output goes to 'chtoby/filtered_<name>' (no '.zip'
    # suffix), while the later `filtered_corpuses` cell reads
    # 'chtoby/prepare/filtered/filtered_<name>.zip' — confirm whether the files
    # were moved/renamed by hand or one of the two paths is out of date.
    filtered_path = Loc.bundles_path/f'chtoby/filtered_{corpus_name}'
    print(corpus_path)
    print(filtered_path)

    # presumably `select` decides which parts of the corpus are kept — verify
    # against ChtobyFilterer.
    CorpusBuilder.transfuse_corpus(
        [corpus_path],
        filtered_path,
        selector=ChtobyFilterer().select
    )

In [11]:
from tg.grammar_ru.ml.corpus import CorpusReader, CorpusBuilder, BucketCorpusBalancer
from tg.grammar_ru.ml.corpus.corpus_reader import read_data

In [12]:
def get_sentences_count(corpus_path):
    """Return the total number of sentences stored in a corpus.

    The corpus is read frame by frame with ``read_data``; within each frame the
    sentences are counted as the number of ``sentence_id`` groups, and the
    per-frame counts are added up.
    """
    total = 0
    for frame in read_data(corpus_path):
        total += len(frame.groupby("sentence_id"))
    return total

In [15]:
filtered_corpuses = [
    Loc.bundles_path/'chtoby/prepare/filtered/filtered_lenta.zip',
    Loc.bundles_path/'chtoby/prepare/filtered/filtered_proza.zip',
]

In [16]:
for filtered_corpus in filtered_corpuses:
    print(filtered_corpus.name)
    print(get_sentences_count(filtered_corpus))

filtered_lenta.zip


100%|██████████| 67/67 [00:05<00:00, 13.06it/s]


129677
filtered_proza.zip


100%|██████████| 11/11 [00:00<00:00, 13.83it/s]

27186





In [23]:
from pathlib import Path
import pandas as pd

In [21]:
# NOTE(review): this path is relative to the kernel's current working
# directory, unlike the other paths in this notebook, which are rooted at
# Loc.bundles_path / Loc.corpus_path — the location depends on where the
# notebook was started.
bucket_path: Path = Path.cwd() / 'chtoby/buckets.parquet'
# Builds the bucket table for the filtered corpora; the result is read back
# from `bucket_path` with pd.read_parquet in the following cells.
BucketCorpusBalancer.build_buckets_frame(filtered_corpuses, bucket_path)

100%|██████████| 78/78 [00:17<00:00,  4.40it/s]


In [24]:
pd.read_parquet(bucket_path)

Unnamed: 0_level_0,sentences,bucket_size
bucket,Unnamed: 1_level_1,Unnamed: 2_level_1
filtered_lenta.zip/1,"[1684107, 1949513, 2022972, 2161642, 2592794]",5
filtered_lenta.zip/2,"[30, 50, 60, 63, 65, 66, 95, 152, 5007, 5027, ...",5399
filtered_lenta.zip/3,"[0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 13, 14, 15, 16...",100378
filtered_lenta.zip/4,"[3, 11, 12, 20, 21, 31, 39, 40, 44, 52, 54, 55...",23809
filtered_lenta.zip/5,"[12990, 19207, 74205, 88661, 105936, 114223, 1...",86
filtered_proza.zip/1,"[4677, 11804, 31764, 72125, 79288, 88905, 1123...",56
filtered_proza.zip/2,"[231, 326, 327, 328, 330, 423, 424, 425, 579, ...",5728
filtered_proza.zip/3,"[0, 30, 31, 32, 103, 104, 105, 106, 107, 230, ...",18915
filtered_proza.zip/4,"[233, 427, 581, 727, 773, 875, 1283, 1285, 128...",2446
filtered_proza.zip/5,"[210095, 218482, 300175, 302454, 302839, 30319...",41


In [25]:
bucket_numbers = [2, 3, 4]
bucket_limit = 2400

In [2]:
balanced_path = Loc.bundles_path/'chtoby/prepare/balanced/balanced.zip'

In [None]:
import math


# Configure the balancer from the bucket table written earlier.
# NOTE(review): `bucket_numbers` ([2, 3, 4]) is defined in an earlier cell but
# is not passed here or used anywhere else in the visible cells — confirm
# whether the balancer was meant to be restricted to those buckets.
balancer = BucketCorpusBalancer(
    buckets=pd.read_parquet(bucket_path), 
    log_base=math.e,
    bucket_limit=bucket_limit,
)

# Copy the filtered corpora into a single corpus at `balanced_path`, with
# `balancer.select` as the selector.
CorpusBuilder.transfuse_corpus(
    sources=filtered_corpuses,
    destination=balanced_path,
    selector=balancer.select
)

# Proza to interformat

In [10]:
import os
from pathlib import Path


for idx, filename in enumerate(Path(Loc.data_cache_path/'processed/proza/').iterdir(), 1):
    print(filename)
    #filename.rename(f'{idx}.md')

In [3]:
CorpusBuilder.convert_interformat_folder_to_corpus(
    Loc.corpus_path/'proza.base.zip',
    Loc.data_cache_path/'processed/proza',
    ['']
)

  0%|          | 0/63 [00:00<?, ?it/s]

# Training

In [12]:
from pathlib import Path

from tg.grammar_ru.common import Loc
from research.common import run_training


bundle_path = Loc.bundles_path/f'chtoby/toy'

In [13]:
result = run_training.run_local(bundle_path)

2022-12-16 10:03:44.332556+00:00 INFO: Training starts. Info: {'name': 'TSAG-'}
2022-12-16 10:03:44.333942+00:00 INFO: Ensuring/loading bundle. Bundle before:
{'slovnet': {'shape': (699701, 17), 'index_name': 'word_id'}, 'syntax_fixes': {'shape': (699701, 4), 'index_name': 'word_id'}, 'syntax_stats': {'shape': (699701, 6), 'index_name': 'word_id'}, 'index': {'shape': (26537, 5), 'index_name': 'sample_id'}, 'syntax_closure': {'shape': (1861619, 4), 'index_name': 'entry_id'}, 'src': {'shape': (699701, 18), 'index_name': None}, 'pymorphy': {'shape': (699701, 16), 'index_name': 'word_id'}}
2022-12-16 10:03:44.337217+00:00 INFO: Bundle loaded
{'slovnet': {'shape': (699701, 17), 'index_name': 'word_id', 'columns': ['POS', 'Case', 'Number', 'Person', 'Polarity', '...'], 'index': [0, 1, 2, 3, 4, '...']}, 'syntax_fixes': {'shape': (699701, 4), 'index_name': 'word_id', 'columns': ['syntax_parent_id', 'root', 'cycle_status', 'correct_root'], 'index': [0, 1, 2, 3, 4, '...']}, 'syntax_stats': {'sha

ValueError: Network type ContextualNetworkType.Plain is not recognized

# Debug

In [30]:
reader = CorpusReader(Loc.bundles_path/'chtoby/prepare/balanced/balanced.zip')

In [31]:
frames = reader.get_frames()

In [32]:
frame = frames.first()

In [66]:
frame.head()

Unnamed: 0,word_id,sentence_id,word_index,paragraph_id,word_tail,word,word_type,word_length,file_id,corpus_id,is_target,original_word_id,original_sentence_id,original_paragraph_id,updated,original_corpus_id
0,0,0,0,0,0,"""",punct,1,05e87080-f026-4b0d-a850-76cdd9b668ee,balanced.zip,False,809,30,23,False,filtered_lenta.zip
1,1,0,1,0,1,Вам,ru,3,05e87080-f026-4b0d-a850-76cdd9b668ee,balanced.zip,False,810,30,23,False,filtered_lenta.zip
2,2,0,2,0,1,не,ru,2,05e87080-f026-4b0d-a850-76cdd9b668ee,balanced.zip,False,811,30,23,False,filtered_lenta.zip
3,3,0,3,0,1,построить,ru,9,05e87080-f026-4b0d-a850-76cdd9b668ee,balanced.zip,False,812,30,23,False,filtered_lenta.zip
4,4,0,4,0,1,тюрьмы,ru,6,05e87080-f026-4b0d-a850-76cdd9b668ee,balanced.zip,False,813,30,23,False,filtered_lenta.zip


In [32]:
from importlib import reload

In [16]:
from tg.grammar_ru.ml.tasks.train_index_builder.index_builders import ChtobyIndexBuilder
from tg.grammar_ru.ml.corpus import CorpusBuilder

In [19]:
from tg.grammar_ru.common import Separator, DataBundle
from tg.grammar_ru.ml.tasks.train_index_builder.sentence_filterer import ChtobyFilterer

In [33]:
from tg.grammar_ru.ml.tasks.train_index_builder import sentence_filterer


ChtobyFilterer = reload(sentence_filterer).ChtobyFilterer

In [39]:
db = Separator.build_bundle(
    'Чтобы приготовить суп. Нужно купить продукты. Что бы мне сделать. С Новым Годом!. Для того, чтобы. Во что бы мне поиграть.'
)
filterer = ChtobyFilterer()

In [40]:
db

{'src': {'shape': (28, 8), 'index_name': None}}

In [41]:
frame = db.data_frames['src']
frame

Unnamed: 0,word_id,sentence_id,word_index,paragraph_id,word_tail,word,word_type,word_length
0,0,0,0,0,1,Чтобы,ru,5
1,1,0,1,0,1,приготовить,ru,11
2,2,0,2,0,0,суп,ru,3
3,3,0,3,0,1,.,punct,1
4,4,1,0,0,1,Нужно,ru,5
5,5,1,1,0,1,купить,ru,6
6,6,1,2,0,0,продукты,ru,8
7,7,1,3,0,1,.,punct,1
8,8,2,0,0,1,Что,ru,3
9,9,2,1,0,1,бы,ru,2


In [47]:
filtered_words = filterer.get_filtered_df(frame)

In [48]:
exprected_words = filtered_words['word']

In [49]:
list(exprected_words)

['Чтобы',
 'приготовить',
 'суп',
 '.',
 'Что',
 'бы',
 'мне',
 'сделать',
 '.',
 'Для',
 'того',
 ',',
 'чтобы',
 '.',
 'Во',
 'что',
 'бы',
 'мне',
 'поиграть',
 '.']

In [56]:
db = Separator.build_bundle(
    '''Чтобы приготовить суп. 
    Что бы мне сделать. Для того, чтобы. 
    Во что бы мне поиграть, чтобы развлечься.'''
)

In [57]:
frame = db.data_frames['src']

In [58]:
frame

Unnamed: 0,word_id,sentence_id,word_index,paragraph_id,word_tail,word,word_type,word_length
0,0,0,0,0,1,Чтобы,ru,5
1,1,0,1,0,1,приготовить,ru,11
2,2,0,2,0,0,суп,ru,3
3,3,0,3,0,1,.,punct,1
4,4,1,0,1,1,Что,ru,3
5,5,1,1,1,1,бы,ru,2
6,6,1,2,1,1,мне,ru,3
7,7,1,3,1,0,сделать,ru,7
8,8,1,4,1,1,.,punct,1
9,9,2,0,1,1,Для,ru,3


In [59]:
from tg.grammar_ru.ml.tasks.train_index_builder.negative_sampler import ChtobyNegativeSampler

In [60]:
sampler = ChtobyNegativeSampler()

In [62]:
negative_samples = sampler.build_negative_sample_from_positive(frame)

In [63]:
expected_words = list(negative_samples['word'])

In [64]:
expected_words

['Что',
 'бы',
 'приготовить',
 'суп',
 '.',
 'Чтобы',
 'мне',
 'сделать',
 '.',
 'Для',
 'того',
 ',',
 'что',
 'бы',
 '.',
 'Во',
 'чтобы',
 'мне',
 'поиграть',
 ',',
 'что',
 'бы',
 'развлечься',
 '.',
 'Во',
 'что',
 'бы',
 'мне',
 'поиграть',
 ',',
 'что',
 'бы',
 'развлечься',
 '.',
 'Во',
 'чтобы',
 'мне',
 'поиграть',
 ',',
 'чтобы',
 'развлечься',
 '.']