In [1]:
import os
import sys
# Put the notebook's parent directory on the import path (once), presumably so
# the project-local modules imported in the next cell can be resolved.
parent_dir = os.path.abspath(os.pardir)
already_importable = parent_dir in sys.path
if not already_importable:
    sys.path.append(parent_dir)


In [2]:
import json
import spacy
from pathlib import Path

from corpus_utils import *
from segmentation import *
from simple_structure import *
from dataset_utils import *


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Seed forwarded as `random_state` to every build_dataset(...) call below.
RANDOM_STATE = 42

# Input (serialized corpora) and output (generated datasets) directories,
# both relative to the notebook's working directory.
CORPORA_DIR = Path('..') / 'corpora'
OUTPUT_DIR = Path('..') / 'datasets'


In [4]:
def _load_corpus(path):
    """Read one corpus JSON file and deserialize it into a `Corpus`.

    The file is opened explicitly as UTF-8 so that decoding does not depend on
    the platform's locale encoding (the corpora contain Cyrillic text).
    NOTE(review): assumes the corpus files were written as UTF-8 (or pure
    ASCII-escaped JSON) - confirm against the code that produces them.

    Args:
        path: Path to the JSON file.

    Returns:
        The object returned by `Corpus.from_dict` for the parsed JSON.
    """
    with open(path, encoding='utf-8') as corpus_file:
        return Corpus.from_dict(json.load(corpus_file))


corpus_sci_comm = _load_corpus(CORPORA_DIR / 'corpus_sci_comm.json')
corpus_sci_corpus = _load_corpus(CORPORA_DIR / 'corpus_sci_corpus.json')

# Flatten each corpus separately, then merge the two flattened corpora into the
# single corpus every later cell works on.
flat_corpus_sci_comm = Corpus.flatten(corpus_sci_comm)
flat_corpus_sci_corpus = Corpus.flatten(corpus_sci_corpus)
full_corpus = Corpus.merge(flat_corpus_sci_comm, flat_corpus_sci_corpus)


'https://osf.io/qu4e2/download/ipl/#abc47ef1-8c9b-41ed-9c20-c13b124eb71a'
Excluding project "ISK" from text "Искусственный интеллект-3"
'https://osf.io/qu4e2/download/ipl/#c8099f8b-6c88-4717-9ebb-f0bde7938ddf'
Excluding project "ISK" from text "Искусственный интеллект-6"


In [5]:
# Request GPU execution from spaCy; this is done before the pipeline is loaded.
spacy.prefer_gpu()
# Shared spaCy pipeline (Russian model) used by segment_into_sentences below.
nlp = spacy.load('ru_core_news_lg')
# NOTE(review): 'senter' is enabled here, but segment_into_sentences() runs the
# pipeline with only 'tok2vec' and 'parser' selected, so sentence boundaries
# presumably come from the parser and this call may be unnecessary - confirm.
nlp.enable_pipe('senter')

def project_with_most_comments(text):
    """Pick the project of `text` that covers the most distinct records.

    Each project is scored by the number of distinct `record` values found
    among its `statements`; the highest-scoring project is returned. On a tie
    the earliest project in `text.projects` wins (built-in `max` semantics).

    Args:
        text: Object exposing `projects`, an iterable of objects that each
            expose `statements` whose items have a hashable `record`.

    Returns:
        The project with the largest number of distinct records.

    Raises:
        ValueError: If `text.projects` is empty.
    """
    def distinct_record_count(project):
        return len(set(statement.record for statement in project.statements))

    return max(text.projects, key=distinct_record_count)

def segment_into_sentences(paragraphs):
    """Split each paragraph into sentences and return their character spans.

    The notebook-level `nlp` pipeline is run with only the 'tok2vec' and
    'parser' pipes selected for the duration of the call.

    Args:
        paragraphs: Iterable of paragraph strings fed to `nlp.pipe`.

    Returns:
        A list with one entry per paragraph; each entry is a list of
        `(start_char, end_char)` tuples, one per sentence of that paragraph's
        Doc.
    """
    with nlp.select_pipes(enable=['tok2vec', 'parser']):
        spans_per_paragraph = []
        for doc in nlp.pipe(paragraphs):
            sentence_spans = [(sent.start_char, sent.end_char) for sent in doc.sents]
            spans_per_paragraph.append(sentence_spans)
        return spans_per_paragraph

def intersect_three_quarters_stmt(_, stmt_span, frag_span):
    """Tell whether a fragment covers at least three quarters of a statement.

    Args:
        _: Ignored; present only to fit the signature the caller expects.
        stmt_span: `(start, end)` of the statement.
        frag_span: `(start, end)` of the candidate fragment.

    Returns:
        True when the overlap between the two spans is at least 3/4 of the
        statement's length, False otherwise. Disjoint spans yield a negative
        overlap and therefore False (unless the statement length makes the
        threshold equally negative).
    """
    stmt_start, stmt_end = stmt_span[0], stmt_span[1]
    frag_start, frag_end = frag_span[0], frag_span[1]

    overlap = min(frag_end, stmt_end) - max(frag_start, stmt_start)
    required = 3/4 * (stmt_end - stmt_start)
    return overlap >= required

# One annotation project per text: the one covering the most distinct records.
all_projects = list(map(project_with_most_comments, full_corpus.texts))
# Sentence-level segmentation of every text, produced by segment_texts with
# segment_into_sentences as the segmenter callback.
all_segmentations = segment_texts(full_corpus.texts, segment_into_sentences)
# Pair each chosen project with its text's segmentation and build a structure,
# matching statements to fragments with the three-quarters overlap rule.
all_structures = [
    build_simple_structure(project, segmentation, intersect_three_quarters_stmt)
    for project, segmentation in zip(all_projects, all_segmentations)
]


In [6]:
# Keyword arguments shared by every build_dataset(...) call in this notebook:
# the eval/test sizes and the seed from the configuration cell.
dataset_kwargs = dict(
    eval_size=1/10,
    test_size=1/10,
    random_state=RANDOM_STATE,
)

link_classification_dir = OUTPUT_DIR / 'link_classification'


def _add_role_prefixes(row):
    """Prefix every fragment column of a link-pair row with its role label.

    The prefixes are Russian and translate to "Before the premise",
    "Premise", "After the premise", "Before the conclusion", "Conclusion"
    and "After the conclusion". The `label` column is passed through as is.

    Args:
        row: Mapping with the keys `fragment1_left`, `fragment1`,
            `fragment1_right`, `fragment2_left`, `fragment2`,
            `fragment2_right` and `label`.

    Returns:
        A dict with the same keys, holding the prefixed texts and the label.
    """
    return {
        'fragment1_left':   f'Перед посылкой: {row["fragment1_left"]}',
        'fragment1':        f'Посылка: {row["fragment1"]}',
        'fragment1_right':  f'После посылки: {row["fragment1_right"]}',
        'fragment2_left':   f'Перед заключением: {row["fragment2_left"]}',
        'fragment2':        f'Заключение: {row["fragment2"]}',
        'fragment2_right':  f'После заключения: {row["fragment2_right"]}',
        'label': row['label'],
    }


# Build and save one link-classification dataset per context length. The built
# datasets are kept so the labelled variant below can reuse the ctx_len=1 one
# instead of building it a second time with identical arguments.
link_classification_datasets = {}
for ctx_len in [0, 1, 2]:
    link_classification_dataset = build_dataset(all_structures, link_pairs_from_simple_structures, ctx_len=ctx_len, **dataset_kwargs)
    link_classification_dataset.save_to_disk(link_classification_dir / f'{ctx_len}ctx', max_shard_size='50MB')
    link_classification_datasets[ctx_len] = link_classification_dataset

# Labelled variant of the 1-context dataset: same rows as '1ctx', with every
# fragment text prefixed by its role.
link_classification_dataset_1ctx_label = link_classification_datasets[1].map(_add_role_prefixes)
link_classification_dataset_1ctx_label.save_to_disk(link_classification_dir / '1ctx_label', max_shard_size='50MB')


Saving the dataset (1/1 shards): 100%|██████████| 15467/15467 [00:00<00:00, 1928801.21 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1499/1499 [00:00<00:00, 125359.13 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1428/1428 [00:00<00:00, 356919.50 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 15467/15467 [00:00<00:00, 1530030.66 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1499/1499 [00:00<?, ? examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1428/1428 [00:00<00:00, 322135.54 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 15467/15467 [00:00<00:00, 1214605.60 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1499/1499 [00:00<00:00, 6441866.49 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1428/1428 [00:00<00:00, 165482.29 examples/s]
Map: 100%|██████████| 15467/15467 [00:01<00:00, 13440.07 examples/s]
Map: 100%|██████████| 1499/1499 [00:00<00:00, 13031.59 examples/s]
Ma

In [7]:
# For each context length, save two similarity-learning datasets: the plain
# triplets, and the plain triplets merged with the transitive triplets.
similarity_learning_dir = OUTPUT_DIR / 'similarity_learning'
for ctx_len in (0, 1):
    triplet_dataset = build_dataset(
        all_structures,
        triplets_from_simple_structures,
        ctx_len=ctx_len,
        **dataset_kwargs,
    )
    transitive_only_dataset = build_dataset(
        all_structures,
        transitive_triplets_from_simple_structures,
        ctx_len=ctx_len,
        **dataset_kwargs,
    )
    triplet_dataset_transitive = merge_datasets(triplet_dataset, transitive_only_dataset)

    triplet_dataset.save_to_disk(similarity_learning_dir / f'{ctx_len}ctx', max_shard_size='50MB')
    triplet_dataset_transitive.save_to_disk(similarity_learning_dir / f'{ctx_len}ctx_transitive', max_shard_size='50MB')


Saving the dataset (1/1 shards): 100%|██████████| 24143/24143 [00:00<00:00, 1451779.64 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 2134/2134 [00:00<00:00, 533284.36 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 2129/2129 [00:00<00:00, 657270.22 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 34359/34359 [00:00<00:00, 2755805.47 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 3168/3168 [00:00<00:00, 191579.27 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 3088/3088 [00:00<?, ? examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 24143/24143 [00:00<00:00, 724099.06 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 2134/2134 [00:00<00:00, 1526111.63 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 2129/2129 [00:00<?, ? examples/s]
Saving the dataset (2/2 shards): 100%|██████████| 34359/34359 [00:00<00:00, 1046725.29 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 3

In [8]:
# Build and save one random-pairs dataset per context length.
random_pairs_dir = OUTPUT_DIR / 'random_pairs'
for ctx_len in (0, 1):
    random_pairs_dataset = build_dataset(
        all_structures,
        random_pairs_from_simple_structures,
        ctx_len=ctx_len,
        **dataset_kwargs,
    )
    random_pairs_dataset.save_to_disk(random_pairs_dir / f'{ctx_len}ctx', max_shard_size='50MB')


Saving the dataset (2/2 shards): 100%|██████████| 118188/118188 [00:00<00:00, 1941154.09 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 16204/16204 [00:00<00:00, 1096449.23 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 14825/14825 [00:00<00:00, 7397163.55 examples/s]
Saving the dataset (3/3 shards): 100%|██████████| 118188/118188 [00:00<00:00, 1464303.15 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 16204/16204 [00:00<00:00, 1091063.09 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 14825/14825 [00:00<00:00, 29372015.49 examples/s]
