## Binary structure classification used in tree building: Step 1. Negative samples generation

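Generate negative samples for every annotated document, extract their features, and build the train/dev/test sets. Negative samples of file ``filename.rs3`` are saved as ``filename.json.neg`` (their features as ``filename.neg.features``).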

Output:
 - ``data/*.json.neg``, ``data/*.neg.features``
 - ``data_structure/*``

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import glob
import os
import pickle
import sys

sys.path.append('../')
sys.path.append('../../')
sys.path.append('../../../')

import numpy as np
import pandas as pd
from isanlp.annotation_rst import DiscourseUnit
from _isanlp_rst.src.isanlp_rst.rst_tree_predictor import RSTTreePredictor, GoldTreePredictor
from tqdm import tqdm_notebook as tqdm
from utils.evaluation import extr_pairs, extr_pairs_forest
from utils.file_reading import *
from utils.print_tree import printBTree

In [3]:
class RandomNegativeGenerator(object):
    """Build negative ("no relation") snippet pairs for a single document.

    Candidates are pairs of spans that sit next to each other in the EDU
    sequence (EDU/EDU, EDU/gold snippet, gold snippet/gold snippet) and are
    not paired, in either order, in the gold relation table.
    """

    def __call__(self, edus, corpus, annot_text, filename=None):
        """Return the negative pairs of one document as a DataFrame.

        Args:
            edus: sequence of EDU strings (assumed to be in document order,
                since neighbouring indices are treated as adjacent spans).
            corpus: gold relations DataFrame with ``snippet_x`` and
                ``snippet_y`` columns.
            annot_text: document text in which the snippets are located.
            filename: value stored in the ``filename`` column.  When omitted,
                a module-level variable called ``filename`` is used if one
                exists (the notebook loop defines it), otherwise ``None``.

        Returns:
            DataFrame with columns ``filename``, ``snippet_x``, ``snippet_y``,
            ``relation`` (always ``False``), ``loc_x`` and ``loc_y``.  The
            locations are ``str.find`` offsets (``-1`` when not found);
            ``loc_y`` is searched for strictly after ``loc_x``.  The columns
            are present even when no negative pair was produced.
        """
        if filename is None:
            # Backward compatibility: the original code read this name as a global.
            filename = globals().get('filename')

        rows = []
        for snippet_x, snippet_y, relation in self.create_training_set(edus, corpus):
            loc_x = annot_text.find(snippet_x)
            loc_y = annot_text.find(snippet_y, loc_x + 1)
            rows.append((filename, snippet_x, snippet_y, relation, loc_x, loc_y))

        # Building the frame directly (instead of DataFrame.apply) guarantees
        # that loc_x/loc_y exist for an empty result as well.
        return pd.DataFrame(rows, columns=['filename', 'snippet_x', 'snippet_y',
                                           'relation', 'loc_x', 'loc_y'])

    def __name__(self):
        return 'RandomNegativeGenerator'

    def create_training_set(self, edus, gold):
        """Collect unique ``(left, right, False)`` triples of adjacent, unrelated spans.

        The result is sorted so that repeated runs produce the same order.
        """
        training_set = []

        # Gold snippets together with the EDU indices they cover; a gold row
        # is used only when both of its snippets could be mapped to EDUs.
        snippet_cache = []
        for e in gold.index:
            snippet_x = gold.loc[e, 'snippet_x']
            cache_x = self.extract_snippet_ids(snippet_x, edus)

            snippet_y = gold.loc[e, 'snippet_y']
            cache_y = self.extract_snippet_ids(snippet_y, edus)

            if cache_x and cache_y:
                snippet_cache.append((cache_x, snippet_x))
                snippet_cache.append((cache_y, snippet_y))

        # 1. Neighbouring EDUs that are not related in gold.
        for left, right in zip(edus[:-1], edus[1:]):
            if not self.check_snippet_pair_in_dataset(gold, left, right):
                training_set.append((left, right, False))

        # 2. Every gold snippet against the EDU right before / right after it.
        for i in gold.index:
            training_set += self.extract_negative_samples_for_snippet(gold, edus, gold.loc[i, 'snippet_x'])
            training_set += self.extract_negative_samples_for_snippet(gold, edus, gold.loc[i, 'snippet_y'])

        # 3. Pairs of gold snippets whose EDU ranges touch each other.
        for i, (cache_i, snippet_i) in enumerate(snippet_cache):
            for cache_j, snippet_j in snippet_cache[i:]:
                if cache_i[-1] + 1 == cache_j[0]:
                    if not self.check_snippet_pair_in_dataset(gold, snippet_i, snippet_j):
                        training_set.append((snippet_i, snippet_j, False))

                if cache_j[-1] + 1 == cache_i[0]:
                    if not self.check_snippet_pair_in_dataset(gold, snippet_j, snippet_i):
                        training_set.append((snippet_j, snippet_i, False))

        return sorted(set(training_set))

    def extract_snippet_ids(self, snippet, edus):
        """Return the indices of all EDUs that occur as a substring of ``snippet``."""
        return [edu_nm for edu_nm, edu in enumerate(edus) if (edu in snippet)]

    def check_snippet_pair_in_dataset(self, dataset, snippet_left, snippet_right):
        """Tell whether the two snippets form a row of ``dataset`` in either order."""
        forward = (dataset.snippet_x == snippet_left) & (dataset.snippet_y == snippet_right)
        backward = (dataset.snippet_y == snippet_left) & (dataset.snippet_x == snippet_right)
        return bool((forward | backward).any())

    def extract_negative_samples_for_snippet(self, gold, edus, snippet):
        """Pair ``snippet`` with the EDU before and the EDU after it, skipping gold pairs."""
        training_set = []

        snippet_ids = self.extract_snippet_ids(snippet, edus)

        if not snippet_ids:
            return []

        if snippet_ids[0] > 0:
            previous_edu = edus[snippet_ids[0] - 1]
            if not self.check_snippet_pair_in_dataset(gold, snippet, previous_edu):
                training_set.append((previous_edu, snippet, False))

        if snippet_ids[-1] < len(edus) - 1:
            next_edu = edus[snippet_ids[-1] + 1]
            if not self.check_snippet_pair_in_dataset(gold, snippet, next_edu):
                training_set.append((snippet, next_edu, False))

        return training_set

In [4]:
import sys
import numpy as np


class GreedyNegativeGenerator:
    """Inverse greedy parser based on the gold tree predictor.

    Adjacent nodes are merged bottom-up according to the scores returned by
    ``GoldTreePredictor``.  Every adjacent pair that receives a score of
    exactly zero along the way is recorded as a ``'no_relation'`` unit.
    """

    def __init__(self):
        # Merging goes on only while some adjacent pair scores above this value.
        self.forest_threshold = 0.01
        # Forwarded unchanged to ``predict_pair_proba``.
        self._same_sentence_bonus = 0

    def __call__(self, edus, corpus,
                 annot_text, annot_tokens,
                 annot_sentences,
                 annot_lemma, annot_morph, annot_postag,
                 annot_syntax_dep_tree):
        """Return the list of zero-scored adjacent pairs as ``DiscourseUnit`` objects.

        Args:
            edus: list of leaf ``DiscourseUnit`` objects; their ``id``,
                ``start`` and ``end`` attributes are read here.
            corpus: gold data handed to ``GoldTreePredictor``.
            annot_*: document annotations, passed through positionally to the
                predictor's feature extraction.

        Returns:
            List of ``DiscourseUnit`` with ``relation='no_relation'``; empty
            when ``edus`` is empty.
        """
        annotation = (annot_text, annot_tokens,
                      annot_sentences,
                      annot_lemma, annot_morph, annot_postag,
                      annot_syntax_dep_tree)

        negative_nodes = []

        def add_negative(left, right, proba):
            negative_nodes.append(self._negative_unit(left, right, proba, annot_text))

        self.tree_predictor = GoldTreePredictor(corpus)
        if len(edus) == 0:
            # Nothing to pair up; avoids an IndexError on edus[-1] below.
            return negative_nodes

        nodes = edus
        max_id = edus[-1].id

        # initialize scores: one entry per adjacent pair (nodes[i], nodes[i + 1])
        features = self.tree_predictor.initialize_features(nodes, *annotation)
        scores = self._score(features)

        for i, score in enumerate(scores):
            if score == 0:
                add_negative(nodes[i], nodes[i + 1], score)

        while len(nodes) > 2 and any(score > self.forest_threshold for score in scores):
            # select two nodes to merge: position of the best pair in the list
            j = int(np.argmax(np.array(scores)))

            # make the new node by merging node[j] + node[j+1]
            relation, nuclearity = self.tree_predictor.predict_label(features.iloc[j]).split('_')

            max_id += 1
            merged = DiscourseUnit(
                id=max_id,
                left=nodes[j],
                right=nodes[j + 1],
                relation=relation,
                nuclearity=nuclearity,
                proba=scores[j],
                text=annot_text[nodes[j].start:nodes[j + 1].end].strip()
            )

            # modify the node list
            nodes = nodes[:j] + [merged] + nodes[j + 2:]

            # modify the scores list: only pairs touching the merged node change
            if j == 0:
                # merged node is first: re-score (merged, right neighbour)
                _features = self.tree_predictor.extract_features(nodes[j], nodes[j + 1], *annotation)
                _scores = self._score(_features)
                scores = _scores + scores[j + 2:]
                features = pd.concat([_features, features.iloc[j + 2:]])

                if _scores[0] == 0:
                    add_negative(nodes[j], nodes[j + 1], _scores[0])

            elif j + 1 < len(nodes):
                # merged node is in the middle: re-score both neighbouring pairs
                _features = self.tree_predictor.initialize_features(
                    [nodes[j - 1], nodes[j], nodes[j + 1]], *annotation)
                _scores = self._score(_features)
                features = pd.concat([features.iloc[:j - 1], _features, features.iloc[j + 2:]])
                scores = scores[:j - 1] + _scores + scores[j + 2:]

                if _scores[0] == 0:
                    add_negative(nodes[j - 1], nodes[j], _scores[0])

                if _scores[1] == 0:
                    add_negative(nodes[j], nodes[j + 1], _scores[1])

            else:
                # merged node is last: re-score (left neighbour, merged)
                _features = self.tree_predictor.extract_features(nodes[j - 1], nodes[j], *annotation)
                _scores = self._score(_features)
                scores = scores[:j - 1] + _scores
                features = pd.concat([features.iloc[:j - 1], _features])

                if _scores[0] == 0:
                    # The original passed the whole ``_scores`` list as proba here.
                    add_negative(nodes[j - 1], nodes[j], _scores[0])

        return negative_nodes

    def _score(self, features):
        """Score adjacent pairs with the predictor using the configured bonus."""
        return self.tree_predictor.predict_pair_proba(features,
                                                      _same_sentence_bonus=self._same_sentence_bonus)

    @staticmethod
    def _negative_unit(left, right, proba, annot_text):
        """Wrap two adjacent nodes into an id-less ``'no_relation'`` unit."""
        return DiscourseUnit(
            id=None,
            left=left,
            right=right,
            relation='no_relation',
            nuclearity='NN',
            proba=proba,
            text=annot_text[left.start:right.end].strip()
        )

    def __name__(self):
        return 'GreedyNegativeGenerator'


### Make negative samples, save them

In [5]:
gen1 = RandomNegativeGenerator()
# gen2 = GreedyNegativeGenerator()

for gold_path in tqdm(glob.glob('./data/*.gold.pkl')):
    # Keep this as a module-level variable named `filename`: the generator
    # call below does not receive it as an argument.
    filename = gold_path.replace('.gold.pkl', '')
    df = read_gold(filename, features=True)
    edus = read_edus(filename)
    annot = read_annotation(filename)

    tmp = gen1(edus, df, annot['text'])

    # --- Disabled alternative: negatives from the greedy generator (gen2) ---
#     tmp1 = gen1(edus, df, annot['text'])
    
#     _edus = []
#     last_end = 0
#     last_id = 0
#     for max_id in range(len(edus)):
#         start = len(annot['text'][:last_end]) + annot['text'][last_end:].find(edus[max_id])
#         end = start + len(edus[max_id])
#         temp = DiscourseUnit(
#             id=max_id + last_id,
#             left=None,
#             right=None,
#             relation='edu',
#             start=start,
#             end=end,
#             orig_text=annot['text'],
#             proba=1.,
#             )
#         _edus.append(temp)
#         last_end = end
#         last_id += 1

#     tmp = gen2(_edus, df, annot['text'],
#                annot['tokens'], annot['sentences'],
#                annot['lemma'], annot['morph'],
#                annot['postag'], annot['syntax_dep_tree'])
    
#     tmp = pd.DataFrame(extr_pairs_forest(tmp, annot['text'], locations=True), 
#                        columns=['snippet_x', 'snippet_y', 
#                                 'category_id', 'order', 
#                                 'loc_x', 'loc_y'])
    
#     tmp = tmp[tmp.category_id == 'no_relation']
#     tmp = tmp.drop(columns=['order', 'category_id'])
#     tmp['filename'] = filename
#     tmp['relation'] = False
    
#     tmp = pd.concat([tmp, tmp1])

    # Keep pairs whose left snippet was found and precedes the right one.
    left_before_right = tmp.loc_x < tmp.loc_y
    left_was_found = tmp.loc_x > -1
    candidates = tmp[left_before_right & left_was_found]

    # Merge with gold; anything that is not literally False counts as a relation.
    merged = pd.concat([df, candidates])
    merged['relation'] = merged.relation.map(lambda value: not (value == False))

    # Sorting related rows first makes drop_duplicates keep them over negatives.
    merged = merged.sort_values('relation', ascending=False)
    merged = merged.drop_duplicates(['filename', 'snippet_x', 'snippet_y'])

    negatives = merged[merged.relation == False]
    negatives = negatives.drop_duplicates(['snippet_x', 'snippet_y']).reset_index()
    negatives[['filename', 'snippet_x', 'snippet_y', 'relation', 'loc_x', 'loc_y']].to_json(
        filename + '.json.neg')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


  0%|          | 0/233 [00:00<?, ?it/s]

### Extract features

In [6]:
%%time
from _isanlp_rst.src.isanlp_rst.features_processor_default import FeaturesProcessor

# Instantiate the feature extractor once; the next cell calls it for every
# document (construction took ~37 s in the recorded run below).
features_processor = FeaturesProcessor(model_dir_path='models', verbose=0)

CPU times: user 36.8 s, sys: 534 ms, total: 37.4 s
Wall time: 37.1 s




In [7]:
from tqdm import tqdm_notebook as tqdm

MAX_LEN = 10000

# Order in which the annotation layers are handed to the features processor.
ANNOTATION_KEYS = ('text', 'tokens', 'sentences', 'lemma',
                   'morph', 'postag', 'syntax_dep_tree')

for neg_path in tqdm(glob.glob("data/*.json.neg")):
    filename = neg_path.replace('.json.neg', '')

    # Load the saved negatives and drop pairs with an empty side.
    df = read_negative(filename).drop(columns=['loc_y'])
    has_left = df.snippet_x.str.len() > 0
    has_right = df.snippet_y.str.len() > 0
    df = df[has_left & has_right]

    annotation = read_annotation(filename)
    layers = [annotation[key] for key in ANNOTATION_KEYS]

    result = features_processor(df, *layers)

    # Filters are applied one after another, exactly as before: the length
    # checks only ever see rows that passed the previous filter.
    result = result[result.is_broken == False]
    for tokens_column in ('tokens_x', 'tokens_y'):
        result = result[result[tokens_column].map(len) < MAX_LEN]

    result.to_pickle(filename + '.neg.features')

  0%|          | 0/233 [00:00<?, ?it/s]

Unable to locate first snippet >>> [['и запутались,' 'а ведь это далеко не все' 955 955 5443 5443]
 ['для проверки,'
  'что описано в 12.2.6.9 Runtime Semantics: PropertуDefinitionEvaluation.'
  913 913 5186 5186]
 ['AST-ановитесь!'
  'На мой субъективный взгляд, по большей мере спецификация это предписание для интерпретатора EcmaScript, держать такой же в собственной голове есть дело тяжелое и неблагодарное.'
  1414 1414 7913 7913]
 ['для ранее созданных тестов,' 'с помощью инструмента ASTEхplorer.' 1483
  1483 8408 8409]
 ['и ставим точку.' 'Весь этот путь был проделан не зря,' 1045 1045 5975
  5975]
 ['с помощью метода bind,'
  'что описано в разделе 19.2.3.2 Function.prototуpe.bind,' 1361 1361
  7628 7628]
 ['Вывод' 'Как мы выяснили функция, будучи анонимной, может иметь имя,'
  1495 1495 8471 8471]
 ['Вывод'
  'Как мы выяснили функция, будучи анонимной, может иметь имя, поскольку одновременно является также и объектом, что есть следствие мультипарадигмальной природы языка JavaScri

### Make train/dev/test splits

In [8]:
from utils.train_test_split import split_train_dev_test

# Three collections of document paths; the loading cell below strips '.edus'
# from each entry, so these are presumably paths to *.edus files.
train, dev, test = split_train_dev_test('./data')

news in train: 0.5344827586206896,	in dev: 0.6470588235294118,	in test: 0.6086956521739131
ling in train: 0.0,	in dev: 0.0,	in test: 0.0
comp in train: 0.0,	in dev: 0.0,	in test: 0.0
blog in train: 0.43103448275862066,	in dev: 0.5294117647058824,	in test: 0.4782608695652174


In [9]:
import pandas as pd
from utils.file_reading import read_gold


# Seed used later when the prepared samples are shuffled.
random_state = 45


def _load_split(files):
    """Load gold and negative feature tables for ``files`` into one DataFrame.

    For every entry the ``.edus`` suffix is removed to obtain the document
    base name; gold rows are labelled ``relation = 1`` and negative rows
    ``relation = 0``.

    Raises:
        ValueError: from ``pd.concat`` when ``files`` is empty.
    """
    frames = []
    for file in tqdm(files):
        base_name = file.replace('.edus', '')

        gold = read_gold(base_name, features=True)
        gold['relation'] = 1
        frames.append(gold)

        negative = read_negative(base_name, features=True)
        negative['relation'] = 0
        frames.append(negative)

    return pd.concat(frames)


train_samples = _load_split(train)
dev_samples = _load_split(dev)
test_samples = _load_split(test)

  0%|          | 0/168 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

In [10]:
import os
from utils.prepare_sequence import _prepare_sequence


def correct_samples(row):
    """Move punctuation sitting at a snippet boundary to where it belongs.

    * A leading ``,`` ``.`` ``!`` or ``?`` is removed from ``snippet_x``.
    * A leading ``,`` or ``.`` of ``snippet_y`` is appended to ``snippet_x``
      and removed from ``snippet_y``.

    Empty snippets are left as they are (the original indexed ``[0]`` and
    raised ``IndexError`` on them).

    Args:
        row: Series with string fields ``snippet_x`` and ``snippet_y``.

    Returns:
        A corrected copy of ``row``; the input is not modified.
    """
    # Work on a copy: mutating the object handed over by DataFrame.apply is unsafe.
    row = row.copy()

    # str.startswith with a tuple is False for '' instead of raising.
    if row.snippet_x.startswith((',', '.', '!', '?')):
        row['snippet_x'] = row.snippet_x[1:].strip()

    if row.snippet_y.startswith((',', '.')):
        row['snippet_x'] = row.snippet_x + row.snippet_y[0]
        row['snippet_y'] = row.snippet_y[1:].strip()

    return row

def prepare_data(data, max_len=100):
    """Filter, normalise, de-duplicate and shuffle a table of sample pairs.

    Args:
        data: DataFrame with ``tokens_x``, ``tokens_y`` and ``relation`` columns.
        max_len: pairs where either side has ``max_len`` or more tokens are dropped.

    Returns:
        A new DataFrame with ``snippet_x``/``snippet_y`` rebuilt from the
        tokens and passed through ``_prepare_sequence``, duplicates removed,
        rows shuffled with the module-level ``random_state`` and the index
        reset.  The input frame is not modified.
    """
    data = data[data.tokens_x.map(len) < max_len]
    # Explicit copy: the column assignments below would otherwise write to a
    # filtered slice of the caller's frame (SettingWithCopyWarning).
    data = data[data.tokens_y.map(len) < max_len].copy()

    # Rebuild the snippets from tokens so both sides are tokenised uniformly.
    data['snippet_x'] = data.tokens_x.map(' '.join)
    data['snippet_y'] = data.tokens_y.map(' '.join)

    data = data.apply(correct_samples, axis=1)

    data = data[data.snippet_x.map(len) > 0]
    data = data[data.snippet_y.map(len) > 0]

    data['snippet_x'] = data.snippet_x.map(_prepare_sequence)
    data['snippet_y'] = data.snippet_y.map(_prepare_sequence)

    # Ascending sort + keep='last': for a duplicated pair the row with the
    # largest ``relation`` value is the one that is kept.
    data = data.sort_values(['relation'], ascending=True).drop_duplicates(['snippet_x', 'snippet_y'], keep='last')
    data = data.sample(frac=1, random_state=random_state).reset_index(drop=True)

    return data


train_samples = prepare_data(train_samples)
dev_samples = prepare_data(dev_samples)
test_samples = prepare_data(test_samples)

OUT_PATH = 'data_structure'
# os.path has no mkdir(); the original call raised AttributeError whenever the
# directory was missing.  makedirs(..., exist_ok=True) creates it if needed.
os.makedirs(OUT_PATH, exist_ok=True)

train_samples.to_pickle(os.path.join(OUT_PATH, 'train_samples.pkl'))
dev_samples.to_pickle(os.path.join(OUT_PATH, 'dev_samples.pkl'))
test_samples.to_pickle(os.path.join(OUT_PATH, 'test_samples.pkl'))

In [11]:
# Sanity check: show the last rows of the training set ordered by the left snippet.
train_samples[['snippet_x', 'snippet_y', 'relation', 'filename']].sort_values('snippet_x').tail()

Unnamed: 0,snippet_x,snippet_y,relation,filename
6658,﻿Новость : Занятия в Воскресной школе,"В каждое воскресение , утром , прихожане Свято...",0,./data/news2_2
19489,﻿Новость : Занятия в Воскресной школе,"В каждое воскресение , утром , прихожане Свято...",0,./data/news2_2
41053,﻿Новость : Занятия в Воскресной школе,"В каждое воскресение , утром , прихожане Свято...",0,./data/news2_2
41443,😄️Игорь Морской😄️🇷🇺 ( @ 812 Hocke у ) April 18...,Что-то не очень-то разобрались,0,./data/blogs_88
16720,😄️Игорь Морской😄️🇷🇺 ( @ 812 Hocke у ) April 18...,Что-то не очень-то разобрались ! Город в полно...,0,./data/blogs_88


In [12]:
# Same ordering as above, but print the untruncated values of the very last row.
train_samples[['snippet_x', 'snippet_y', 'relation', 'filename']].sort_values('snippet_x').tail(1).values

array([['😄️Игорь Морской😄️🇷🇺 ( @ 812 Hocke у ) April 18, 2019',
        'Что-то не очень-то разобрались ! Город в полном запустении находится . Каналы и реки в парковках , трамваи в пробках стоят , поля стихийных паркингов , пылища и грязища . Это ползучее уничтожение города , люди даже не понимают , что что-то идет не так . Одно дерево спилили , другое кронировали , на третьей площади сделали паркинг , вокруг четвертой забор , пятую реку забросили и вот уже вместо культурной Северной Столицы куда полстраны приезжает на белые ночи , у нас фиг знает что',
        0, './data/blogs_88']], dtype=object)

In [13]:
# Length of the left snippet in whitespace-separated tokens.
train_samples['len_x'] = train_samples.snippet_x.map(lambda row: len(row.split()))

In [14]:
# Class balance of the training set (0 = negative pair, 1 = gold pair).
train_samples.relation.value_counts()

0    30972
1    16400
Name: relation, dtype: int64