In [1]:
%load_ext autoreload
%autoreload 2

<div class="alert alert-block alert-warning">
<b>Note:</b> in the original RuRSTreebank dataset, some deprecated symbols occur (>, <, &, etc.), breaking the XML parser, as well as EDUs with punctuation marks at the beginning (this happens when brackets and dots/commas are separated by a space in the original text). The latest version of the corpus (at the time of this notebook's latest commit) has been corrected and dumped in <b>corpus/RuRsTreebank_full_corrected.zip</b>
</div>

## Read RS3 files into isanlp.DiscourseUnit annotations
input:
 - corpus with .rs3 files
output:
 - ``binarized_trees/file_filename_PART.du``  - pickled isanlp DiscourseUnit with tree number PART from the original *.rs3 file

#### 1. Split dataset files into separate trees

In [2]:
import os
import xml
import xml.etree.ElementTree as ET
from xml.sax.saxutils import escape

import numpy as np
from scipy.sparse.csgraph import connected_components


class RS3ForestSplitter:
    """Split one ``.rs3`` file that holds several RST trees into one file per tree.

    Every child of ``<body>`` is linked to its ``parent``; each connected
    component of that graph is written, together with the original
    ``<header>``, to ``<output_dir>/<name>_part_<N>.rs3``.
    """

    def __call__(self, filename: str, output_dir: str):
        """Split ``filename`` and write the resulting trees into ``output_dir``.

        Args:
            filename: path to the source ``.rs3``/``.rst`` file.
            output_dir: directory receiving the per-tree files; it is not
                created here, so it has to exist already.

        A tree that cannot be serialized or written is reported on stdout and
        skipped; the remaining trees are still processed.
        """
        output_filename = os.path.basename(filename)
        output_filename = output_filename.replace(".rst", "").replace(".rs3", "")

        # Single pass over the file: keep the serialized header and the body nodes.
        header = ''
        nodes = []
        for _, elem in ET.iterparse(filename, events=('end', )):
            if elem.tag == 'header':
                header = ET.tostring(elem).decode('utf-8')
            elif elem.tag == 'body':
                nodes.extend(elem)

        if not nodes:
            # Nothing to split (and an empty id list has no maximum).
            return

        # [[id1, parent1], [id2, parent2], ...]; a node without a parent points to itself.
        pairs = []
        for node in nodes:
            node_id = int(node.get('id'))
            parent_id = node.get('parent')
            pairs.append((node_id, int(parent_id) if parent_id else node_id))

        # Symmetric adjacency matrix over ids 1..max_id (ids are 1-based).
        max_id = max(max(pair) for pair in pairs)
        adj_matrix = np.zeros((max_id, max_id))
        for node_id, parent_id in pairs:
            adj_matrix[node_id - 1, parent_id - 1] = 1
            adj_matrix[parent_id - 1, node_id - 1] = 1
            adj_matrix[node_id - 1, node_id - 1] = 1
            adj_matrix[parent_id - 1, parent_id - 1] = 1

        # labels[i] is the component (tree) number of the node with id i + 1.
        # Ids that do not occur in the body form components of their own, so
        # the tree numbers used in the output file names may have gaps.
        _, labels = connected_components(adj_matrix)

        # Group the nodes by tree, keeping document order inside each tree.
        trees_body = dict()
        for node in nodes:
            tree_number = int(labels[int(node.get('id')) - 1])
            trees_body.setdefault(tree_number, []).append(node)

        # Write the results
        for tree_number, elements in trees_body.items():
            target = os.path.join(output_dir, f'{output_filename}_part_{tree_number}.rs3')
            try:
                # Render completely before opening the file, so that a failing
                # tree does not leave a truncated .rs3 behind.
                content = self._render_tree(header, elements)
                with open(target, 'w', encoding='utf-8') as f:
                    f.write(content)
            except Exception as error:
                print(f"Skip tree {tree_number} in file {filename}: {error!r}")

    def _render_tree(self, header, elements):
        """Return the text of a standalone ``.rs3`` document for one tree."""
        chunks = ['<rst>\n', header, '<body>\n']
        for element in elements:
            _id = element.get('id')
            _type = element.get('type')        # rendered as "None" when the attribute is absent
            _par = element.get('parent')
            parent = f'parent="{_par}"' if _par else ''
            _relname = element.get('relname')  # rendered as "None" when the attribute is absent
            chunks.append(f'\t\t<{element.tag} id="{_id}" type="{_type}" {parent} relname="{_relname}"')
            if element.tag == 'segment':
                # The parser has unescaped &amp;/&lt;/&gt;, so escape them again
                # to keep the written file well-formed XML.
                chunks.append(f'>{escape(self.debracket_text(element.text))}</segment>\n')
            else:
                chunks.append('/>\n')
        chunks.append('\t</body>\n')
        chunks.append('</rst>')
        return ''.join(chunks)

    @staticmethod
    def debracket_text(text):
        """Replace round brackets with the ``-LB-``/``-RB-`` placeholders."""
        return text.replace(')', '-RB-').replace('(', '-LB-')

In [None]:
splitter = RS3ForestSplitter()

In [None]:
! ls corpus/RuRsTreebank_full_6/news1/news1_rs3/* | head -2

In [None]:
import os, sys
import glob
from tqdm import tqdm

# Split every multi-tree *.rst file of the news and blog genres into
# per-tree *.rs3 files collected in a single output directory.
output_dir = 'corpus_rs3'
os.makedirs(output_dir, exist_ok=True)

for genre in ('news1', 'news2', 'blogs'):
    # Sorted so that the processing order does not depend on the file system.
    genre_files = sorted(glob.glob(f'corpus/RuRsTreebank_full_6/{genre}/{genre}_rs3/*.rst'))
    for filename in tqdm(genre_files, desc=genre):
        splitter(filename, output_dir)

In [None]:
! ls -laht $output_dir/*.rs3 | wc -l  # Overall number of trees in news+blogs

#### 2. Convert them all to *.dis files

Using https://github.com/rst-workbench/rst-converter-service

In [None]:
import glob

# Convert every per-tree *.rs3 file in input_dir into a *.dis file in output_dir.
input_dir = 'corpus_rs3'
output_dir = 'corpus_dis'
if not os.path.isdir(output_dir):
    os.mkdir(output_dir)

for file in glob.glob(os.path.join(input_dir, '*.rs3')):
    # Same base name as the input, with the extension swapped to .dis.
    output_file = os.path.join(output_dir, file.split('/')[-1].replace('.rs3', '.dis'))
    # POST the file to the converter service and redirect the response body into output_file.
    # NOTE(review): the response is not checked here, so a failed conversion would
    # presumably still leave a (broken) .dis file behind - confirm and clean up if needed.
    ! curl -XPOST echistova:5000/convert/rs3/dis -F input=@$file > $output_file

In [None]:
! ls -lath $output_dir/*.dis | wc -l

#### 3. Collect DiscourseUnit annotations for isanlp library

In [None]:
! pip install -U git+https://github.com/tchewik/dis2du.git

In [None]:
import glob
import pickle
import os
from tqdm import tqdm
from dis2du.read_dis import read_dis

In [None]:
# from dis2du.tree import RSTTree
# from dis2du.convert2isanlp import convert2isanlp


# def read_dis(filename):
#     rst = RSTTree(filename)
#     rst.build()
#     return convert2isanlp(rst)


In [None]:
# Parse every *.dis file and pickle the resulting tree as <name>.du.
input_dir = 'corpus_dis'
output_dir = 'corpus_du'
os.makedirs(output_dir, exist_ok=True)

failed = []  # paths of the *.dis files that could not be converted
for file in tqdm(sorted(glob.glob(os.path.join(input_dir, '*.dis')))):

    try:
        tree = read_dis(file)
        output_file = file.split('/')[-1].replace('.dis', '.du')
        with open(os.path.join(output_dir, output_file), 'wb') as f:
            pickle.dump(tree, f)
    except Exception as e:
        # Best effort: report which file failed and why, then carry on.
        print(f'{file}: {e}')
        failed.append(file)

In [None]:
len(failed)

Align trees with the original texts (collect ``start`` and ``end`` for each node). Otherwise, we can't define paragraph boundaries.

In [None]:
import os
from utils.file_reading import prepare_text
import pandas as pd

def align_du2text(tree, text, offset=0):
    """Annotate every node of ``tree`` with its character span in ``text``.

    The node text is normalized with ``prepare_text`` and stripped, then
    located in ``text``; ``start``/``end`` are stored on the node and the
    children of non-elementary nodes are processed recursively.

    Args:
        tree: discourse unit with ``text``, ``relation`` and, for
            non-elementary units, ``left``/``right`` children.
        text: the document text the tree should be aligned with.
        offset: position from which the search starts. Callers normally leave
            the default; the recursion uses it to look for children inside the
            span of their parent, so that a snippet occurring several times in
            the document is not always mapped to its first occurrence.

    Returns:
        The same ``tree`` object. ``start`` is -1 when the node text was not
        found; ``end`` is then ``len(tree.text) - 1`` and not meaningful.
    """
    tree.text = prepare_text(tree.text).strip()

    start = text.find(tree.text, max(offset, 0))
    if start == -1 and offset > 0:
        # Not found inside the expected span: fall back to a whole-document search.
        start = text.find(tree.text)
    tree.start = start
    tree.end = tree.start + len(tree.text)

    if tree.relation != 'elementary':
        # The left child is searched from the start of this node (when known) ...
        child_offset = max(tree.start, 0)
        tree.left = align_du2text(tree.left, text, child_offset)
        # ... and the right child from the end of the left one.
        if tree.left.start != -1:
            child_offset = max(child_offset, tree.left.end)
        tree.right = align_du2text(tree.right, text, child_offset)
    return tree

In [None]:
all_pairs = []

In [None]:
# Align every pickled tree of the news1 texts with its source text, in place.
# NOTE(review): extr_pairs is defined in a later cell of this notebook and is
# called here before the tree is aligned - confirm that this order is intended.
for file in tqdm(glob.glob('corpus/RuRsTreebank_full_6/news1/news1_txt/*.txt')):
    with open(file, 'r') as f:
        text = prepare_text(f.read().strip())
    filename = file.split('/')[-1].replace('.txt', '')

    for du_filename in sorted(glob.glob(os.path.join('corpus_du/', filename + '_part_*'))):
        with open(du_filename, 'rb') as f:
            tree = pickle.load(f)
        all_pairs += extr_pairs(tree)
        tree = align_du2text(tree, text)
        # Close the handle explicitly so the aligned tree is flushed to disk.
        with open(du_filename, 'wb') as f:
            pickle.dump(tree, f)

In [None]:
# Align every pickled tree of the news2 texts with its source text, in place.
# NOTE(review): extr_pairs is defined in a later cell of this notebook and is
# called here before the tree is aligned - confirm that this order is intended.
for file in tqdm(glob.glob('corpus/RuRsTreebank_full_6/news2/news2_txt/*.txt')):
    with open(file, 'r') as f:
        text = prepare_text(f.read().strip())
    filename = file.split('/')[-1].replace('.txt', '')

    for du_filename in sorted(glob.glob(os.path.join('corpus_du/', filename + '_part_*'))):
        with open(du_filename, 'rb') as f:
            tree = pickle.load(f)
        all_pairs += extr_pairs(tree)
        tree = align_du2text(tree, text)
        # Close the handle explicitly so the aligned tree is flushed to disk.
        with open(du_filename, 'wb') as f:
            pickle.dump(tree, f)

In [None]:
# def extr_pairs(tree):
#     pp = []
#     if tree.left:
#         pp.append([tree.left.text, tree.right.text, 
#                    tree.left.start, tree.right.start,
#                    tree.relation, tree.nuclearity])
#         pp += extr_pairs(tree.left)
#         pp += extr_pairs(tree.right)
#     return pp

In [None]:
# Align every pickled tree of the blog texts with its source text, in place.
# NOTE(review): extr_pairs is defined in a later cell of this notebook and is
# called here before the tree is aligned - confirm that this order is intended.
for file in tqdm(glob.glob('corpus/RuRsTreebank_full_6/blogs/blogs_txt/*.txt')):
    with open(file, 'r') as f:
        text = prepare_text(f.read().strip())
    filename = file.split('/')[-1].replace('.txt', '')

    for du_filename in sorted(glob.glob(os.path.join('corpus_du/', filename + '_part_*'))):
        with open(du_filename, 'rb') as f:
            tree = pickle.load(f)
        all_pairs += extr_pairs(tree)
        tree = align_du2text(tree, text)
        # Close the handle explicitly so the aligned tree is flushed to disk.
        with open(du_filename, 'wb') as f:
            pickle.dump(tree, f)

In [None]:
pairs = pd.DataFrame(all_pairs, columns=['snippet_x', 'snippet_y', 'loc_x', 'loc_y', 'category_id', 'order'])

In [None]:
txt = 'Он страдает сам, он заставляет окружающих'

In [None]:
pairs[pairs.snippet_x.str.contains(txt)]

In [None]:
pairs[pairs.snippet_y.str.contains(txt)]

In [None]:
! ls corpus/RuRsTreebank_full_6/news1/news1_txt/* | head -2

In [None]:
# from isanlp.annotation_rst import ForestExporter

# exp = ForestExporter()
# exp([tree], 'some_tree.rs3')

#### 5. Collect data for train and evaluation 
output:
 - ``data/file.edus``  - text file with edus from .rs3 - each line contains one edu
 - ``data/file.json``  - json file with du-pairs from gold trees. 
 keys: ``['snippet_x', 'snippet_y', 'loc_x', 'loc_y', 'category_id', 'order', 'filename']``

In [None]:
def extr_pairs(tree):
    """Collect one record per non-elementary node of ``tree``, in pre-order.

    Each record is ``[left text, right text, left start, right start,
    relation, nuclearity]``. A node without a left child yields nothing.
    """
    if not tree.left:
        return []

    left_unit, right_unit = tree.left, tree.right
    record = [left_unit.text, right_unit.text,
              left_unit.start, right_unit.start,
              tree.relation, tree.nuclearity]
    # The node itself first, then everything below the left and the right child.
    return [record] + extr_pairs(left_unit) + extr_pairs(right_unit)

def extr_edus(tree):
    """Return the texts of the elementary units of ``tree``, left to right."""
    if tree.relation != 'elementary':
        # Inner node: the leaves of the left subtree come before those of the right one.
        return extr_edus(tree.left) + extr_edus(tree.right)
    return [tree.text]

In [None]:
# ! rm -r data && mkdir data

In [None]:
import re

# Collect the gold pairs of all trees and write the EDUs of every document to
# data/<document>.edus (one EDU per line, parts in numeric order).
all_pairs = []
os.makedirs('data', exist_ok=True)
started_edus = set()  # .edus files already (re)created during this run

for orig_filename in glob.glob(os.path.join('corpus_du/', '*part_0.du')):
    # Only the trailing part number becomes a wildcard; a '_0' elsewhere in
    # the path must stay untouched.
    part_pattern = orig_filename[:-len('0.du')] + '*.du'
    for du_filename in sorted(glob.glob(part_pattern),
                              key=lambda x: int(re.findall(r"(\d+)", x)[-1])):
        with open(du_filename, 'rb') as f:
            tree = pickle.load(f)
        tree_pairs = extr_pairs(tree)
        # Document name = file name without the '.du' extension and the
        # '_part_<N>' suffix (works for names with any number of underscores).
        filename = du_filename.split('/')[-1].replace('.du', '')
        filename = re.sub(r'_part_\d+$', '', filename)
        if tree_pairs:
            all_pairs += [pair + [filename] for pair in tree_pairs]

        edus = extr_edus(tree)
        edus_path = os.path.join('data/', filename + '.edus')
        # Truncate on the first write of this run and append afterwards, so
        # that re-running the cell does not duplicate the EDUs.
        mode = 'a' if edus_path in started_edus else 'w'
        started_edus.add(edus_path)
        with open(edus_path, mode) as f:
            f.write('\n'.join(edus))
            f.write('\n')

In [None]:
len(all_pairs)

In [None]:
pairs = pd.DataFrame(all_pairs, columns=['snippet_x', 'snippet_y', 'loc_x', 'loc_y', 'category_id', 'order', 'filename'])

In [None]:
pairs.head()

In [None]:
pairs = pairs.drop_duplicates()

In [None]:
pairs.shape

In [None]:
pairs[pairs.loc_x == -1].shape

In [None]:
# Keep only the pairs whose two snippets were both located in the text
# (a location of -1 marks a snippet that was not found).
pairs = pairs[(pairs.loc_x != -1) & (pairs.loc_y != -1)]

In [None]:
pairs.shape

In [None]:
pairs['category_id'].value_counts(normalize=False)

In [None]:
pairs['relation'] = pairs['category_id'] + '_' + pairs['order']

In [None]:
pairs.relation.value_counts()

In [None]:
pairs[pairs.relation == 'background_NN'].head()

In [None]:
pairs.reset_index().to_feather('data/all_pairs.fth')

In [None]:
pairs

In [None]:
pairs.dropna().shape

In [None]:
pairs[pairs.filename == 'blogs_65'].category_id.value_counts()

## Annotate the texts with isanlp 
output:
 - file.annot.pkl  # morphology, syntax, semantics to use with isanlp

In [1]:
from isanlp import PipelineCommon
from isanlp.processor_remote import ProcessorRemote
from isanlp.ru.converter_mystem_to_ud import ConverterMystemToUd
from isanlp.ru.processor_mystem import ProcessorMystem
from isanlp.processor_razdel import ProcessorRazdel
# from isanlp.processor_deeppavlov_syntax import ProcessorDeeppavlovSyntax

# Address of the remote processor used for lemmas, syntax and UD part-of-speech tags.
# (A first assignment of port '3200' was immediately overwritten by '3134' and
# therefore never used; only the effective value is kept.)
host_udpipe = 'papertext'
port_udpipe = '3134'

# Each entry is (processor, names of its inputs, {processor output: annotation key}).
ppl = PipelineCommon([
    # Tokenization and sentence splitting of the raw text.
    (ProcessorRazdel(), ['text'],
    {'tokens': 'tokens',
     'sentences': 'sentences'}),
    # Remote processor; its 'postag' output is stored under 'ud_postag'.
    (ProcessorRemote(host_udpipe, port_udpipe, '0'),
     ['tokens', 'sentences'],
     {'lemma': 'lemma',
      'syntax_dep_tree': 'syntax_dep_tree',
      'postag': 'ud_postag'}),
    # Mystem tags, stored under 'postag' ...
    (ProcessorMystem(delay_init=False),
     ['tokens', 'sentences'],
     {'postag': 'postag'}),
    # ... which the converter turns into 'morph' and a replaced 'postag'.
    (ConverterMystemToUd(),
     ['postag'],
     {'morph': 'morph',
      'postag': 'postag'}),
])

In [2]:
ppl('Мама мыла раму.')

{'text': 'Мама мыла раму.',
 'tokens': [<isanlp.annotation.Token at 0x7f0285582450>,
  <isanlp.annotation.Token at 0x7f02855825d0>,
  <isanlp.annotation.Token at 0x7f0285582210>,
  <isanlp.annotation.Token at 0x7f02855822d0>],
 'sentences': [<isanlp.annotation.Sentence at 0x7f02855aeb50>],
 'lemma': [['мама', 'мыть', 'рама', '.']],
 'syntax_dep_tree': [[<isanlp.annotation.WordSynt at 0x7f02855ab490>,
   <isanlp.annotation.WordSynt at 0x7f02855ab510>,
   <isanlp.annotation.WordSynt at 0x7f02855ab550>,
   <isanlp.annotation.WordSynt at 0x7f02855ab610>]],
 'ud_postag': [['NOUN', 'VERB', 'NOUN', 'PUNCT']],
 'postag': [['NOUN', 'VERB', 'NOUN', '']],
 'morph': [[{'fPOS': 'NOUN',
    'Gender': 'Fem',
    'Animacy': 'Anim',
    'Case': 'Nom',
    'Number': 'Sing'},
   {'fPOS': 'VERB',
    'Aspect': 'Imp',
    'Valency': 'TR',
    'Tense': 'Past',
    'Number': 'Sing',
    'VerbForm': 'Fin',
    'Gender': 'Fem'},
   {'fPOS': 'NOUN',
    'Gender': 'Fem',
    'Animacy': 'Inan',
    'Case': 'Acc',

In [3]:
import glob
import os
import pickle

from tqdm.autonotebook import tqdm
from utils.file_reading import _prepare_text as prepare_text

# Annotate every news/blog text with the pipeline and pickle the result to
# data/<name>.annot.pkl.
directories = ['corpus/RuRsTreebank_full_6/blogs/blogs_txt/',
               'corpus/RuRsTreebank_full_6/news1/news1_txt/',
               'corpus/RuRsTreebank_full_6/news2/news2_txt/'
               ]

# True re-annotates everything; set to False to skip texts whose annotation
# file already exists.
overwrite_existing = True

for path in directories:
    print('analyze path:', path)
    for file in tqdm(glob.glob(f'{path}*.txt')):
        filename = file.split('/')[-1].replace('.txt', '.annot.pkl')
        annot_path = os.path.join('data', filename)
        if not overwrite_existing and os.path.isfile(annot_path):
            continue

        with open(file, 'r') as f:
            text = prepare_text(f.read().strip())
        try:
            annot = ppl(text)
            with open(annot_path, 'wb') as f:
                pickle.dump(annot, f)
        except Exception as error:
            # Best effort: report the failing file with the reason and carry on.
            # KeyboardInterrupt is deliberately not caught, so the loop can be stopped.
            print(filename, repr(error))

  """


analyze path: corpus/RuRsTreebank_full_6/blogs/blogs_txt/


  0%|          | 0/104 [00:00<?, ?it/s]

analyze path: corpus/RuRsTreebank_full_6/news1/news1_txt/


  0%|          | 0/79 [00:00<?, ?it/s]

analyze path: corpus/RuRsTreebank_full_6/news2/news2_txt/


  0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
! ls -laht data/*.pkl | wc -l

In [None]:
! ls -laht data/*.edus | wc -l

(Optional) parse science texts

In [None]:
import glob
import os
import pickle

from tqdm.autonotebook import tqdm
from utils.file_reading import _prepare_text as prepare_text

# Annotate every science text with the pipeline and pickle the result to
# data/<name>.annot.pkl.
directories = ['corpus/RuRsTreebank_full_6/sci_comp/sci_comp_txt/',
               'corpus/RuRsTreebank_full_6/sci_ling/sci_ling_txt/',
               ]

for path in directories:
    print('analyze path:', path)
    for file in tqdm(glob.glob(f'{path}*.txt')):
        with open(file, 'r') as f:
            text = f.read()
        # A line break preceded by two spaces becomes the '#####' marker; every
        # remaining line break becomes a single space.
        # NOTE(review): presumably '  \n' marks a paragraph boundary in these files - confirm.
        text = text.replace('  \n', '#####').replace('\n', ' ')
        text = prepare_text(text)
        annot = ppl(text)
        filename = file.split('/')[-1].replace('.txt', '.annot.pkl')
        with open(os.path.join('data', filename), 'wb') as f:
            pickle.dump(annot, f)


## Gold trees
### Extract features 
output:
 - models/tf_idf/pipeline.pkl  # is used in default feature extraction
 - file.gold.pkl  # dataset with extracted default features for gold trees

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
import glob
import pickle
import numpy as np
import pandas as pd
import nltk

from utils.file_reading import read_annotation


import os

input_dir = 'data/'

# os.path has no mkdir(); os.makedirs creates 'models' and 'models/tf_idf' in
# one call and does nothing when they already exist.
os.makedirs('models/tf_idf', exist_ok=True)

# One document per annotation file: the lower-cased token texts.
corpus = []
for file in glob.glob(os.path.join(input_dir, "*.annot.pkl")):
    tokens = read_annotation(file.replace('.annot.pkl', ''))['tokens']
    corpus.append([token.text.lower() for token in tokens])


from utils.count_vectorizer import MyCountVectorizer
# The documents are already tokenized, so the vectorizer is given the dummy
# tokenizer/preprocessor of MyCountVectorizer.
count_vect = MyCountVectorizer(ngram_range=(1, 2), tokenizer=MyCountVectorizer.dummy, preprocessor=MyCountVectorizer.dummy)

svd = TruncatedSVD(n_components=25,
                   tol=0.0,
                   n_iter=7,
                   random_state=42)

pipeline = Pipeline([
    ('vect', count_vect),
    ('svd', svd)
])

pipeline.fit(corpus)
with open('models/tf_idf/pipeline.pkl', 'wb') as f:
    pickle.dump(pipeline, f)

In [None]:
# %%bash

# python -c "import nltk; nltk.download('stopwords')"
# pip install dostoevsky
# dostoevsky download fasttext-social-network-model

In [None]:
! cp ../isanlp_rst/utils/features_processor_variables.py utils/features_processor_variables.py

In [None]:
# ! pip install "scikit_learn==0.22.2.post1"

In [5]:
import pandas as pd
import numpy as np
from utils.print_tree import printBTree

import sys
# Make the parent directories importable. Each entry is added at most once,
# so re-running the cell does not keep growing sys.path.
for extra_path in ('../', '../../'):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

from _isanlp_rst.src.isanlp_rst.features_processor_default import FeaturesProcessor

features_processor = FeaturesProcessor(model_dir_path='models', verbose=0)



In [6]:
import glob
import pandas as pd
import pickle
from tqdm.autonotebook import tqdm
from utils.file_reading import read_gold, read_annotation

# Load the table of gold pairs.
# NOTE(review): presumably this reads the data/all_pairs.fth file written earlier
# in this notebook - confirm against utils.file_reading.read_gold.
table = read_gold('data/all_pairs')
# Drop the rows whose left snippet is empty, then those whose right snippet is empty
# (two separate steps: the second length check only runs on the surviving rows).
table = table[table.snippet_x.map(len) > 0]
table = table[table.snippet_y.map(len) > 0]

In [7]:
table[table.snippet_x.str.contains('И так. Поправим ACL domains следующим')]

Unnamed: 0,index,snippet_x,snippet_y,loc_x,loc_y,category_id,order,filename,relation
5711,5734,И так. Поправим ACL domains следующим образом:...,"Смотрим, что происходило в FreeSWITCH: начинае...",3330,3571,comparison,NN,blogs_43,comparison_NN


In [8]:
table.category_id.value_counts()

joint                        5389
elaboration                  4842
attribution                  1959
contrast                     1867
cause                        1564
evaluation                   1321
condition                    1256
preparation                  1088
sequence                     1039
same-unit                     844
purpose                       810
background                    494
evidence                      442
comparison                    415
solutionhood                  370
cause-effect                  341
concession                    296
interpretation-evaluation     159
restatement                   138
effect                         56
conclusion                      6
span                            2
antithesis                      2
interpretation                  1
Name: category_id, dtype: int64

In [9]:
table[table.snippet_x.str.contains('И так. Поправим ACL domains следующим')]

Unnamed: 0,index,snippet_x,snippet_y,loc_x,loc_y,category_id,order,filename,relation
5711,5734,И так. Поправим ACL domains следующим образом:...,"Смотрим, что происходило в FreeSWITCH: начинае...",3330,3571,comparison,NN,blogs_43,comparison_NN


In [10]:
# Extract the default features for the gold pairs of every document and store
# them next to the annotations, where the following cell looks for them
# ('data/*.gold.pkl').
for filename, df in tqdm(table.groupby('filename')):
    annot = read_annotation(os.path.join('data', filename))
    features = features_processor(df,
                                  annot['text'], annot['tokens'],
                                  annot['sentences'], annot['lemma'],
                                  annot['morph'], annot['ud_postag'],
                                  annot['syntax_dep_tree'])
    features.to_pickle(os.path.join('data', filename + '.gold.pkl'))

  0%|          | 0/233 [00:00<?, ?it/s]

Unable to locate first snippet >>> [['Био-тоник с пребиотиками Идеал Chocolatte для проблемной кожи Тоник изготовлен на основе гидролатов сосны, иссопа и шалфея. В него включен комплекс лизатов лакто/бифидо/пропионовых бактерий. Все. Состав максимально прост и лаконичен. Производитель сообщает, что возможно выпадение осадка.'
  'IMG' 59 58 297 293]
 ['Тоник находится в бутылке из плотного пластика. Распылитель хороший, мелкодисперсный. Тоник водичковой текстуры, прозрачный, но в нем плавают микрочастички, похожие на частички растений. Выпадает небольшой осадок. Всегда встряхиваю перед применением. Орошаю им лицо из распылителя сразу после умывания или протираю ватным диском. Запах практически отсутствует.'
  'IMG' 173 58 1088 293]
 ['\u200bГель-крем для лица БИОАКТИВ с пребиотиком Chocolatte Самым интересным продуктом марки для меня стал гель-крем для лица Био-актив с 5% содержанием пребиотика Biolin P.'
  'IMG' 333 58 1965 293]
 ['Гель находится в бутылочке из плотного пластика, дозат

In [13]:
# Re-attach the gold labels from `table` to every stored feature file, in place.
for pklfile in tqdm(glob.glob('data/*.gold.pkl')):
    features = pd.read_pickle(pklfile)
    # Join the features with the gold table on the pair of snippets.
    # NOTE(review): the join uses the whole table, not only the rows of this file -
    # identical snippet pairs in several documents would produce extra rows; verify.
    merge = pd.merge(features, table, on=['snippet_x', 'snippet_y'])
    # The '_y' columns are the ones coming from `table`; they only exist when
    # `features` already has columns with the same names - TODO confirm.
    # NOTE(review): the assignments below align `merge` and `features` by index, so they
    # are only correct if both have the same row order and index - verify.
    features['category_id'] = merge.category_id_y
    features['order'] = merge.order_y
    features['filename'] = merge.filename_y
    # Remove the leftover 'level_0' column if present.
    if 'level_0' in features.keys():
        features = features.drop(columns=['level_0'])
    features.to_pickle(pklfile)

  0%|          | 0/233 [00:00<?, ?it/s]